kodit 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (118)
  1. kodit/_version.py +2 -2
  2. kodit/application/__init__.py +1 -0
  3. kodit/application/commands/__init__.py +1 -0
  4. kodit/application/commands/snippet_commands.py +22 -0
  5. kodit/application/services/__init__.py +1 -0
  6. kodit/application/services/indexing_application_service.py +363 -0
  7. kodit/application/services/snippet_application_service.py +143 -0
  8. kodit/cli.py +105 -82
  9. kodit/database.py +0 -22
  10. kodit/domain/__init__.py +1 -0
  11. kodit/{source/source_models.py → domain/entities.py} +88 -19
  12. kodit/domain/enums.py +9 -0
  13. kodit/domain/interfaces.py +27 -0
  14. kodit/domain/repositories.py +95 -0
  15. kodit/domain/services/__init__.py +1 -0
  16. kodit/domain/services/bm25_service.py +124 -0
  17. kodit/domain/services/embedding_service.py +155 -0
  18. kodit/domain/services/enrichment_service.py +48 -0
  19. kodit/domain/services/ignore_service.py +45 -0
  20. kodit/domain/services/indexing_service.py +203 -0
  21. kodit/domain/services/snippet_extraction_service.py +89 -0
  22. kodit/domain/services/source_service.py +83 -0
  23. kodit/domain/value_objects.py +215 -0
  24. kodit/infrastructure/__init__.py +1 -0
  25. kodit/infrastructure/bm25/__init__.py +1 -0
  26. kodit/infrastructure/bm25/bm25_factory.py +28 -0
  27. kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
  28. kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
  29. kodit/infrastructure/cloning/__init__.py +1 -0
  30. kodit/infrastructure/cloning/folder/__init__.py +1 -0
  31. kodit/infrastructure/cloning/folder/factory.py +119 -0
  32. kodit/infrastructure/cloning/folder/working_copy.py +38 -0
  33. kodit/infrastructure/cloning/git/__init__.py +1 -0
  34. kodit/infrastructure/cloning/git/factory.py +133 -0
  35. kodit/infrastructure/cloning/git/working_copy.py +32 -0
  36. kodit/infrastructure/cloning/metadata.py +127 -0
  37. kodit/infrastructure/embedding/__init__.py +1 -0
  38. kodit/infrastructure/embedding/embedding_factory.py +87 -0
  39. kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
  40. kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
  41. kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
  42. kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
  43. kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
  44. kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
  45. kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +98 -32
  46. kodit/infrastructure/enrichment/__init__.py +1 -0
  47. kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
  48. kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
  49. kodit/infrastructure/enrichment/local_enrichment_provider.py +115 -0
  50. kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
  51. kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
  52. kodit/infrastructure/git/__init__.py +1 -0
  53. kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
  54. kodit/infrastructure/ignore/__init__.py +1 -0
  55. kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
  56. kodit/infrastructure/indexing/__init__.py +1 -0
  57. kodit/infrastructure/indexing/fusion_service.py +55 -0
  58. kodit/infrastructure/indexing/index_repository.py +296 -0
  59. kodit/infrastructure/indexing/indexing_factory.py +111 -0
  60. kodit/infrastructure/snippet_extraction/__init__.py +1 -0
  61. kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
  62. kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
  63. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
  64. kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
  65. kodit/infrastructure/sqlalchemy/__init__.py +1 -0
  66. kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
  67. kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
  68. kodit/infrastructure/sqlalchemy/repository.py +121 -0
  69. kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
  70. kodit/infrastructure/ui/__init__.py +1 -0
  71. kodit/infrastructure/ui/progress.py +127 -0
  72. kodit/{util → infrastructure/ui}/spinner.py +19 -4
  73. kodit/mcp.py +50 -28
  74. kodit/migrations/env.py +1 -4
  75. kodit/reporting.py +78 -0
  76. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
  77. kodit-0.2.5.dist-info/RECORD +99 -0
  78. kodit/bm25/__init__.py +0 -1
  79. kodit/bm25/keyword_search_factory.py +0 -17
  80. kodit/bm25/keyword_search_service.py +0 -34
  81. kodit/embedding/__init__.py +0 -1
  82. kodit/embedding/embedding_factory.py +0 -63
  83. kodit/embedding/embedding_models.py +0 -28
  84. kodit/embedding/embedding_provider/__init__.py +0 -1
  85. kodit/embedding/embedding_provider/embedding_provider.py +0 -64
  86. kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -77
  87. kodit/embedding/embedding_provider/local_embedding_provider.py +0 -64
  88. kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -77
  89. kodit/embedding/local_vector_search_service.py +0 -54
  90. kodit/embedding/vector_search_service.py +0 -38
  91. kodit/enrichment/__init__.py +0 -1
  92. kodit/enrichment/enrichment_provider/__init__.py +0 -1
  93. kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -16
  94. kodit/enrichment/enrichment_provider/local_enrichment_provider.py +0 -92
  95. kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -81
  96. kodit/enrichment/enrichment_service.py +0 -33
  97. kodit/indexing/__init__.py +0 -1
  98. kodit/indexing/fusion.py +0 -67
  99. kodit/indexing/indexing_models.py +0 -43
  100. kodit/indexing/indexing_repository.py +0 -216
  101. kodit/indexing/indexing_service.py +0 -338
  102. kodit/snippets/__init__.py +0 -1
  103. kodit/snippets/languages/__init__.py +0 -53
  104. kodit/snippets/snippets.py +0 -50
  105. kodit/source/__init__.py +0 -1
  106. kodit/source/source_factories.py +0 -356
  107. kodit/source/source_repository.py +0 -169
  108. kodit/source/source_service.py +0 -150
  109. kodit/util/__init__.py +0 -1
  110. kodit-0.2.3.dist-info/RECORD +0 -71
  111. /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
  112. /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
  113. /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
  114. /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
  115. /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
  116. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
  117. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
  118. {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,43 +0,0 @@
1
- """Index models for managing code indexes.
2
-
3
- This module defines the SQLAlchemy models used for storing and managing code indexes,
4
- including files and snippets. It provides the data structures for tracking indexed
5
- files and their content.
6
- """
7
-
8
- from sqlalchemy import ForeignKey, UnicodeText
9
- from sqlalchemy.orm import Mapped, mapped_column
10
-
11
- from kodit.database import Base, CommonMixin
12
-
13
-
14
class Index(Base, CommonMixin):
    """ORM model for a code index; each index belongs to exactly one source."""

    __tablename__ = "indexes"

    # Foreign key to the owning source. The unique constraint means a source
    # can have at most one index.
    source_id: Mapped[int] = mapped_column(
        ForeignKey("sources.id"),
        index=True,
        unique=True,
    )

    def __init__(self, source_id: int) -> None:
        """Create an index row bound to the given source.

        Args:
            source_id: Primary key of the source this index covers.

        """
        super().__init__()
        self.source_id = source_id
27
-
28
-
29
class Snippet(Base, CommonMixin):
    """ORM model for a single code snippet extracted from an indexed file."""

    __tablename__ = "snippets"

    # File the snippet was extracted from.
    file_id: Mapped[int] = mapped_column(ForeignKey("files.id"), index=True)
    # Index the snippet belongs to.
    index_id: Mapped[int] = mapped_column(ForeignKey("indexes.id"), index=True)
    # Snippet text; defaults to the empty string.
    content: Mapped[str] = mapped_column(UnicodeText, default="")

    def __init__(self, file_id: int, index_id: int, content: str) -> None:
        """Create a snippet row.

        Args:
            file_id: Primary key of the file the snippet came from.
            index_id: Primary key of the index the snippet belongs to.
            content: The snippet text.

        """
        super().__init__()
        self.file_id, self.index_id, self.content = file_id, index_id, content
@@ -1,216 +0,0 @@
1
- """Repository for managing code indexes and their associated files and snippets.
2
-
3
- This module provides the IndexRepository class which handles all database operations
4
- related to code indexes, including creating indexes, managing files and snippets,
5
- and retrieving index information with their associated metadata.
6
- """
7
-
8
- from datetime import UTC, datetime
9
- from typing import TypeVar
10
-
11
- from sqlalchemy import delete, func, select
12
- from sqlalchemy.ext.asyncio import AsyncSession
13
-
14
- from kodit.embedding.embedding_models import Embedding
15
- from kodit.indexing.indexing_models import Index, Snippet
16
- from kodit.source.source_models import File, Source
17
-
18
- T = TypeVar("T")
19
-
20
-
21
class IndexRepository:
    """Repository for managing code indexes and their associated data.

    Wraps an async SQLAlchemy session and exposes the queries needed to create
    indexes, look them up, and manage the snippets and embeddings attached to
    them. Every mutating method commits the session before returning.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize the index repository.

        Args:
            session: The SQLAlchemy async session to use for database operations.

        """
        self.session = session

    async def create(self, source_id: int) -> Index:
        """Create and persist a new index for a source.

        Args:
            source_id: The ID of the source to create an index for.

        Returns:
            The newly created Index instance.

        """
        index = Index(source_id=source_id)
        self.session.add(index)
        await self.session.commit()
        return index

    async def get_by_id(self, index_id: int) -> Index | None:
        """Get an index by its ID.

        Args:
            index_id: The ID of the index to retrieve.

        Returns:
            The Index instance if found, None otherwise.

        """
        query = select(Index).where(Index.id == index_id)
        result = await self.session.execute(query)
        return result.scalar_one_or_none()

    async def get_by_source_id(self, source_id: int) -> Index | None:
        """Get an index by its source ID.

        Args:
            source_id: The ID of the source to retrieve an index for.

        Returns:
            The Index instance if found, None otherwise.

        """
        query = select(Index).where(Index.source_id == source_id)
        result = await self.session.execute(query)
        return result.scalar_one_or_none()

    async def files_for_index(self, index_id: int) -> list[File]:
        """Get all files for an index.

        Files are reached through the index's source: File -> Source -> Index.

        Args:
            index_id: The ID of the index to get files for.

        Returns:
            A list of File instances.

        """
        query = (
            select(File)
            .join(Source, File.source_id == Source.id)
            .join(Index, Index.source_id == Source.id)
            .where(Index.id == index_id)
        )
        result = await self.session.execute(query)
        return list(result.scalars())

    async def list_indexes(self) -> list[tuple[Index, Source]]:
        """List all indexes together with their source.

        Returns:
            A list of (index, source) rows, one per index.

        """
        # Inner join on purpose: a FULL OUTER JOIN would also return sources
        # that have no index as (None, source) rows, which violates the return
        # type and breaks callers that read index.id.
        query = select(Index, Source).join(Source, Index.source_id == Source.id)
        result = await self.session.execute(query)
        return list(result.tuples())

    async def num_snippets_for_index(self, index_id: int) -> int:
        """Get the number of snippets for an index.

        Args:
            index_id: The ID of the index to count snippets for.

        Returns:
            The number of snippets stored for the index.

        """
        query = select(func.count()).where(Snippet.index_id == index_id)
        result = await self.session.execute(query)
        return result.scalar_one()

    async def update_index_timestamp(self, index: Index) -> None:
        """Set the updated_at timestamp of an index to the current UTC time.

        Args:
            index: The Index instance to update.

        """
        index.updated_at = datetime.now(UTC)
        await self.session.commit()

    async def add_snippet(self, snippet: Snippet) -> None:
        """Add a snippet to the session and commit.

        Passing an instance that is already persistent writes its pending
        changes instead of inserting a new row.

        Args:
            snippet: The Snippet instance to add.

        """
        self.session.add(snippet)
        await self.session.commit()

    async def delete_all_snippets(self, index_id: int) -> None:
        """Delete all snippets for an index, along with their embeddings.

        Args:
            index_id: The ID of the index to delete snippets for.

        """
        # Embeddings reference snippets, so remove them first. A single DELETE
        # with an IN-subquery replaces one DELETE per snippet.
        snippet_ids = select(Snippet.id).where(Snippet.index_id == index_id)
        await self.session.execute(
            delete(Embedding).where(Embedding.snippet_id.in_(snippet_ids))
        )

        # Now delete the snippets themselves
        await self.session.execute(
            delete(Snippet).where(Snippet.index_id == index_id)
        )
        await self.session.commit()

    async def get_snippets_for_index(self, index_id: int) -> list[Snippet]:
        """Get all snippets for an index, in no particular order.

        Args:
            index_id: The ID of the index to get snippets for.

        Returns:
            A list of Snippet instances.

        """
        query = select(Snippet).where(Snippet.index_id == index_id)
        result = await self.session.execute(query)
        return list(result.scalars())

    async def get_all_snippets(self, index_id: int) -> list[Snippet]:
        """Get all snippets for an index, ordered by snippet ID.

        Args:
            index_id: The ID of the index to get snippets for.

        Returns:
            A list of Snippet instances sorted by ascending ID.

        """
        query = select(Snippet).where(Snippet.index_id == index_id).order_by(Snippet.id)
        result = await self.session.execute(query)
        return list(result.scalars())

    async def add_embedding(self, embedding: Embedding) -> None:
        """Add a new embedding to the database.

        Args:
            embedding: The Embedding instance to add.

        """
        self.session.add(embedding)
        await self.session.commit()

    async def list_snippets_by_ids(self, ids: list[int]) -> list[tuple[File, Snippet]]:
        """List snippets, with their files, by snippet ID.

        Args:
            ids: Snippet IDs to look up. Duplicates are allowed and produce
                duplicate entries in the result.

        Returns:
            A list of (file, snippet) tuples in the same order as the input IDs.

        Raises:
            ValueError: If any requested ID does not exist.

        """
        if not ids:
            return []

        query = (
            select(Snippet, File)
            .where(Snippet.id.in_(ids))
            .join(File, Snippet.file_id == File.id)
        )
        rows = await self.session.execute(query)

        # Create a dictionary for O(1) lookup of results by ID
        id_to_result = {snippet.id: (file, snippet) for snippet, file in rows.all()}

        # Look for IDs that are actually missing. Comparing lengths would
        # wrongly reject inputs that merely contain duplicate IDs.
        missing_ids = [snippet_id for snippet_id in ids if snippet_id not in id_to_result]
        if missing_ids:
            msg = f"Some IDs are not present: {missing_ids}"
            raise ValueError(msg)

        # Rebuild the list in the same order that it was passed in
        return [id_to_result[i] for i in ids]
@@ -1,338 +0,0 @@
1
- """Index service for managing code indexes.
2
-
3
- This module provides the IndexService class which handles the business logic for
4
- creating, listing, and running code indexes. It orchestrates the interaction between the
5
- file system, database operations (via IndexRepository), and provides a clean API for
6
- index management.
7
- """
8
-
9
- from datetime import datetime
10
- from pathlib import Path
11
-
12
- import pydantic
13
- import structlog
14
- from tqdm.asyncio import tqdm
15
-
16
- from kodit.bm25.keyword_search_service import (
17
- BM25Document,
18
- BM25Result,
19
- KeywordSearchProvider,
20
- )
21
- from kodit.embedding.vector_search_service import (
22
- VectorSearchRequest,
23
- VectorSearchService,
24
- )
25
- from kodit.enrichment.enrichment_service import EnrichmentService
26
- from kodit.indexing.fusion import FusionRequest, reciprocal_rank_fusion
27
- from kodit.indexing.indexing_models import Snippet
28
- from kodit.indexing.indexing_repository import IndexRepository
29
- from kodit.log import log_event
30
- from kodit.snippets.snippets import SnippetService
31
- from kodit.source.source_service import SourceService
32
- from kodit.util.spinner import Spinner
33
-
34
- # List of MIME types that are blacklisted from being indexed
35
- MIME_BLACKLIST = ["unknown/unknown"]
36
-
37
-
38
class IndexView(pydantic.BaseModel):
    """Read-only view of an index, as handed to callers of the service.

    Carries the public facts about an index so that callers never touch the
    underlying database models.
    """

    # Primary key of the index.
    id: int
    # When the index row was created.
    created_at: datetime
    # When the index was last updated, if that is known.
    updated_at: datetime | None = None
    # URI of the indexed source, if known.
    source: str | None = None
    # Number of snippets currently stored for the index.
    num_snippets: int
50
-
51
-
52
class SearchRequest(pydantic.BaseModel):
    """Parameters of a search; every query field is optional."""

    # Natural-language query, matched against the semantic text index.
    text_query: str | None = None
    # Code query, matched against the semantic code index.
    code_query: str | None = None
    # Keywords, matched against the keyword (BM25) index.
    keywords: list[str] | None = None
    # Maximum number of results to return.
    top_k: int = 10
59
-
60
-
61
class SearchResult(pydantic.BaseModel):
    """One search hit: a snippet plus the URI of the file it came from."""

    # Primary key of the matching snippet.
    id: int
    # URI of the file that contains the snippet.
    uri: str
    # Stored content of the snippet.
    content: str
    # Scores the snippet received from the individual rankings before fusion.
    original_scores: list[float]
72
-
73
-
74
class IndexService:
    """Service for managing code indexes.

    This service handles the business logic for creating, listing, running and
    searching code indexes. It coordinates the repository, the keyword and
    vector search backends, and the enrichment service.
    """

    def __init__(  # noqa: PLR0913
        self,
        repository: IndexRepository,
        source_service: SourceService,
        keyword_search_provider: KeywordSearchProvider,
        code_search_service: VectorSearchService,
        text_search_service: VectorSearchService,
        enrichment_service: EnrichmentService,
    ) -> None:
        """Initialize the index service.

        Args:
            repository: The repository instance to use for database operations.
            source_service: The source service used to look up sources.
            keyword_search_provider: Backend for keyword (BM25) indexing and
                retrieval.
            code_search_service: Vector search over raw snippet content.
            text_search_service: Vector search over enriched snippet text.
            enrichment_service: Service that produces the enriched text for
                each snippet.

        """
        self.repository = repository
        self.source_service = source_service
        self.snippet_service = SnippetService()
        self.log = structlog.get_logger(__name__)
        self.keyword_search_provider = keyword_search_provider
        self.code_search_service = code_search_service
        self.text_search_service = text_search_service
        self.enrichment_service = enrichment_service

    async def create(self, source_id: int) -> IndexView:
        """Create an index for a source, or return the one it already has.

        Args:
            source_id: The ID of the source to create an index for.

        Returns:
            An IndexView describing the new or pre-existing index.

        """
        log_event("kodit.index.create")

        # Look up the source.
        # NOTE(review): presumably source_service.get raises for an unknown
        # source; confirm against SourceService.
        source = await self.source_service.get(source_id)

        # Reuse the existing index if there is one
        index = await self.repository.get_by_source_id(source.id)
        if not index:
            index = await self.repository.create(source.id)
        return IndexView(
            id=index.id,
            created_at=index.created_at,
            num_snippets=await self.repository.num_snippets_for_index(index.id),
            source=source.uri,
        )

    async def list_indexes(self) -> list[IndexView]:
        """List all available indexes with their details.

        Returns:
            One IndexView per index, including its snippet count.

        """
        rows = await self.repository.list_indexes()

        # Transform database results into DTOs. The repository may join
        # indexes to sources with an outer join, in which case a source
        # without an index arrives as (None, source); skip those rows instead
        # of failing on index.id.
        views: list[IndexView] = []
        for index, source in rows:
            if index is None:
                continue
            views.append(
                IndexView(
                    id=index.id,
                    created_at=index.created_at,
                    updated_at=index.updated_at,
                    num_snippets=await self.repository.num_snippets_for_index(
                        index.id
                    )
                    or 0,
                    source=source.uri,
                )
            )

        # Help Kodit by measuring how much people are using indexes
        log_event(
            "kodit.index.list",
            {
                "num_indexes": len(views),
                "num_snippets": sum(view.num_snippets for view in views),
            },
        )

        return views

    async def run(self, index_id: int) -> None:
        """Run the indexing process for a specific index.

        Deletes the index's existing snippets, re-extracts snippets from its
        files, rebuilds the keyword, code and text search indexes, stores the
        enriched snippet content, and finally bumps the index timestamp.

        Args:
            index_id: The ID of the index to run.

        Raises:
            ValueError: If no index with the given ID exists.

        """
        log_event("kodit.index.run")

        # Get and validate index
        index = await self.repository.get_by_id(index_id)
        if not index:
            msg = f"Index not found: {index_id}"
            raise ValueError(msg)

        # Delete old snippets so we don't duplicate. In the future should probably check
        # which files have changed and only change those.
        await self.repository.delete_all_snippets(index.id)

        # Create snippets for supported file types
        self.log.info("Creating snippets for files", index_id=index.id)
        await self._create_snippets(index.id)

        snippets = await self.repository.get_all_snippets(index.id)

        self.log.info("Creating keyword index")
        with Spinner():
            await self.keyword_search_provider.index(
                [
                    BM25Document(snippet_id=snippet.id, text=snippet.content)
                    for snippet in snippets
                ]
            )

        self.log.info("Creating semantic code index")
        with Spinner():
            await self.code_search_service.index(
                [
                    VectorSearchRequest(snippet.id, snippet.content)
                    for snippet in snippets
                ]
            )

        self.log.info("Enriching snippets", num_snippets=len(snippets))
        enriched_contents = await self.enrichment_service.enrich(
            [snippet.content for snippet in snippets]
        )

        self.log.info("Creating semantic text index")
        with Spinner():
            await self.text_search_service.index(
                [
                    VectorSearchRequest(snippet.id, enriched_content)
                    for snippet, enriched_content in zip(
                        snippets, enriched_contents, strict=True
                    )
                ]
            )
            # Add the enriched text back to the snippets and write to the database
            for snippet, enriched_content in zip(
                snippets, enriched_contents, strict=True
            ):
                snippet.content = (
                    enriched_content + "\n\n```\n" + snippet.content + "\n```"
                )
                await self.repository.add_snippet(snippet)

        # Update index timestamp
        await self.repository.update_index_timestamp(index)

    async def search(self, request: SearchRequest) -> list[SearchResult]:
        """Search for relevant snippets.

        Runs whichever of the keyword, code and text searches the request asks
        for and merges their rankings with reciprocal rank fusion.

        Args:
            request: The search parameters.

        Returns:
            At most request.top_k results in fused rank order, or an empty
            list when the request contains no query at all.

        """
        log_event("kodit.index.search")

        fusion_list: list[list[FusionRequest]] = []
        if request.keywords:
            # Gather results for each keyword.
            # NOTE(review): the hits of all keywords are concatenated into a
            # single ranking rather than fused per keyword; confirm this is
            # what reciprocal_rank_fusion expects.
            result_ids: list[BM25Result] = []
            for keyword in request.keywords:
                results = await self.keyword_search_provider.retrieve(
                    keyword, request.top_k
                )
                result_ids.extend(results)

            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
            )

        # Semantic search over raw snippet content
        if request.code_query:
            code_hits = await self.code_search_service.retrieve(
                request.code_query, top_k=request.top_k
            )
            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in code_hits]
            )

        # Semantic search over enriched snippet text
        if request.text_query:
            text_hits = await self.text_search_service.retrieve(
                request.text_query, top_k=request.top_k
            )
            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in text_hits]
            )

        if not fusion_list:
            return []

        # Combine all results together with RRF if required
        final_results = reciprocal_rank_fusion(
            rankings=fusion_list,
            k=60,
        )

        # Only keep top_k results
        final_results = final_results[: request.top_k]

        # Get snippets from database (up to top_k)
        search_results = await self.repository.list_snippets_by_ids(
            [x.id for x in final_results]
        )

        return [
            SearchResult(
                id=snippet.id,
                uri=file.uri,
                content=snippet.content,
                original_scores=fr.original_scores,
            )
            for (file, snippet), fr in zip(search_results, final_results, strict=True)
        ]

    async def _create_snippets(
        self,
        index_id: int,
    ) -> None:
        """Extract and store snippets for every supported file of an index.

        Files with a blacklisted MIME type are skipped, as are files for which
        snippet extraction raises ValueError.

        Args:
            index_id: The ID of the index to create snippets for.

        """
        files = await self.repository.files_for_index(index_id)
        if not files:
            self.log.warning("No files to create snippets for")
            return

        for file in tqdm(files, total=len(files), leave=False):
            # Skip unsupported file types
            if file.mime_type in MIME_BLACKLIST:
                self.log.debug("Skipping mime type", mime_type=file.mime_type)
                continue

            # Create snippet from file content
            try:
                snippets = self.snippet_service.snippets_for_file(
                    Path(file.cloned_path)
                )
            except ValueError as e:
                self.log.debug("Skipping file", file=file.cloned_path, error=e)
                continue

            for snippet in snippets:
                s = Snippet(
                    index_id=index_id,
                    file_id=file.id,
                    content=snippet.text,
                )
                await self.repository.add_snippet(s)
@@ -1 +0,0 @@
1
- """Extract method snippets from source code."""
@@ -1,53 +0,0 @@
1
- """Detect the language of a file."""
2
-
3
- from pathlib import Path
4
- from typing import cast
5
-
6
- from tree_sitter_language_pack import SupportedLanguage
7
-
8
- # Mapping of file extensions to programming languages
9
LANGUAGE_MAP: dict[str, str] = {
    # JavaScript/TypeScript
    "js": "javascript",
    "jsx": "javascript",
    "ts": "typescript",
    "tsx": "typescript",
    # Python
    "py": "python",
    # Rust
    "rs": "rust",
    # Go
    "go": "go",
    # C/C++
    "cpp": "cpp",
    "hpp": "cpp",
    "c": "c",
    "h": "c",
    # C#
    "cs": "csharp",
    # Ruby
    "rb": "ruby",
    # Java
    "java": "java",
    # PHP
    "php": "php",
    # Swift
    "swift": "swift",
    # Kotlin
    "kt": "kotlin",
}


def detect_language(file_path: Path) -> "SupportedLanguage":
    """Detect the language of a file from its extension.

    The final suffix of the path is lower-cased, stripped of its leading dot,
    and looked up in LANGUAGE_MAP.

    Args:
        file_path: Path of the file to classify. Only its suffix is inspected;
            the file is not opened.

    Returns:
        The language name mapped to the file's suffix.

    Raises:
        ValueError: If the suffix (including an empty one) is not in
            LANGUAGE_MAP.

    """
    suffix = file_path.suffix.removeprefix(".").lower()
    lang = LANGUAGE_MAP.get(suffix)
    if lang is None:
        msg = f"Unsupported language for file suffix: {suffix}"
        raise ValueError(msg)

    # typing.cast is a no-op at runtime and cannot raise, so no error handling
    # is needed around it; it only narrows the type for the checker.
    return cast("SupportedLanguage", lang)
@@ -1,50 +0,0 @@
1
- """Generate snippets from a file."""
2
-
3
- from dataclasses import dataclass
4
- from pathlib import Path
5
-
6
- from kodit.snippets.languages import detect_language
7
- from kodit.snippets.method_snippets import MethodSnippets
8
-
9
-
10
@dataclass
class Snippet:
    """Plain value object holding the text of one extracted code snippet."""

    # The snippet's source text.
    text: str
15
-
16
-
17
class SnippetService:
    """Factory for generating snippets from a file.

    This is required because there's going to be multiple ways to generate snippets.
    """

    def __init__(self) -> None:
        """Initialize the snippet factory.

        Query files are looked up in the ``languages`` directory next to this
        module.
        """
        self.language_dir = Path(__file__).parent / "languages"

    def snippets_for_file(self, file_path: Path) -> list[Snippet]:
        """Generate snippets from a file.

        Args:
            file_path: Path of the source file to extract snippets from.

        Returns:
            The extracted snippets, excluding any whose text is empty or
            whitespace-only.

        Raises:
            ValueError: If the language cannot be detected, if no query file
                can be read for the detected language, or if the source file
                cannot be read.

        """
        language = detect_language(file_path)

        # A language may be detectable without having a query file shipped for
        # it; report that as an unsupported language. Only I/O failures are
        # translated so that programming errors are not masked.
        query_path = self.language_dir / f"{language}.scm"
        try:
            query = query_path.read_text(encoding="utf-8")
        except OSError as e:
            msg = f"Unsupported language: {file_path}"
            raise ValueError(msg) from e

        method_analyser = MethodSnippets(language, query)

        try:
            file_bytes = file_path.read_bytes()
        except OSError as e:
            msg = f"Failed to read file: {file_path}"
            raise ValueError(msg) from e

        # Wrap each extracted snippet, dropping any that are empty
        return [
            Snippet(text=text)
            for text in method_analyser.extract(file_bytes)
            if text.strip()
        ]