kodit 0.2.8__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (37)
  1. kodit/_version.py +2 -2
  2. kodit/app.py +36 -1
  3. kodit/application/factories/__init__.py +1 -0
  4. kodit/application/factories/code_indexing_factory.py +119 -0
  5. kodit/application/services/{indexing_application_service.py → code_indexing_application_service.py} +159 -198
  6. kodit/cli.py +214 -62
  7. kodit/config.py +40 -3
  8. kodit/domain/entities.py +7 -5
  9. kodit/domain/repositories.py +33 -0
  10. kodit/domain/services/bm25_service.py +14 -17
  11. kodit/domain/services/embedding_service.py +10 -14
  12. kodit/domain/services/snippet_service.py +198 -0
  13. kodit/domain/value_objects.py +301 -21
  14. kodit/infrastructure/bm25/local_bm25_repository.py +20 -12
  15. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +31 -11
  16. kodit/infrastructure/cloning/metadata.py +1 -0
  17. kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +14 -25
  18. kodit/infrastructure/embedding/local_vector_search_repository.py +26 -38
  19. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +50 -35
  20. kodit/infrastructure/enrichment/enrichment_factory.py +1 -1
  21. kodit/infrastructure/indexing/auto_indexing_service.py +84 -0
  22. kodit/infrastructure/indexing/indexing_factory.py +8 -91
  23. kodit/infrastructure/indexing/snippet_domain_service_factory.py +37 -0
  24. kodit/infrastructure/snippet_extraction/languages/java.scm +12 -0
  25. kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +3 -31
  26. kodit/infrastructure/sqlalchemy/embedding_repository.py +14 -3
  27. kodit/infrastructure/sqlalchemy/snippet_repository.py +174 -2
  28. kodit/mcp.py +61 -49
  29. {kodit-0.2.8.dist-info → kodit-0.3.0.dist-info}/METADATA +1 -1
  30. {kodit-0.2.8.dist-info → kodit-0.3.0.dist-info}/RECORD +33 -31
  31. kodit/application/commands/__init__.py +0 -1
  32. kodit/application/commands/snippet_commands.py +0 -22
  33. kodit/application/services/snippet_application_service.py +0 -149
  34. kodit/infrastructure/enrichment/legacy_enrichment_models.py +0 -42
  35. {kodit-0.2.8.dist-info → kodit-0.3.0.dist-info}/WHEEL +0 -0
  36. {kodit-0.2.8.dist-info → kodit-0.3.0.dist-info}/entry_points.txt +0 -0
  37. {kodit-0.2.8.dist-info → kodit-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,12 +1,10 @@
1
- """Application service for indexing operations."""
1
+ """Unified application service for code indexing operations."""
2
+
3
+ from dataclasses import replace
2
4
 
3
5
  import structlog
4
6
  from sqlalchemy.ext.asyncio import AsyncSession
5
7
 
6
- from kodit.application.commands.snippet_commands import CreateIndexSnippetsCommand
7
- from kodit.application.services.snippet_application_service import (
8
- SnippetApplicationService,
9
- )
10
8
  from kodit.domain.entities import Snippet
11
9
  from kodit.domain.enums import SnippetExtractionStrategy
12
10
  from kodit.domain.errors import EmptySourceError
@@ -15,106 +13,72 @@ from kodit.domain.services.bm25_service import BM25DomainService
15
13
  from kodit.domain.services.embedding_service import EmbeddingDomainService
16
14
  from kodit.domain.services.enrichment_service import EnrichmentDomainService
17
15
  from kodit.domain.services.indexing_service import IndexingDomainService
16
+ from kodit.domain.services.snippet_service import SnippetDomainService
18
17
  from kodit.domain.services.source_service import SourceService
19
18
  from kodit.domain.value_objects import (
20
- BM25Document,
21
- BM25IndexRequest,
22
- BM25SearchRequest,
23
- BM25SearchResult,
19
+ Document,
24
20
  EnrichmentIndexRequest,
25
21
  EnrichmentRequest,
26
22
  FusionRequest,
27
23
  IndexCreateRequest,
24
+ IndexRequest,
28
25
  IndexView,
29
26
  MultiSearchRequest,
30
27
  MultiSearchResult,
31
- VectorIndexRequest,
32
- VectorSearchQueryRequest,
33
- VectorSearchRequest,
28
+ SearchRequest,
29
+ SearchResult,
30
+ SnippetListItem,
34
31
  )
35
32
  from kodit.log import log_event
36
33
  from kodit.reporting import Reporter
37
34
 
38
35
 
39
- class IndexingApplicationService:
40
- """Application service for indexing operations.
41
-
42
- This service orchestrates the business logic for creating, listing, and running
43
- code indexes. It coordinates between domain services and provides a clean API
44
- for index management.
45
- """
36
+ class CodeIndexingApplicationService:
37
+ """Unified application service for all code indexing operations."""
46
38
 
47
39
  def __init__( # noqa: PLR0913
48
40
  self,
49
41
  indexing_domain_service: IndexingDomainService,
42
+ snippet_domain_service: SnippetDomainService,
50
43
  source_service: SourceService,
51
44
  bm25_service: BM25DomainService,
52
45
  code_search_service: EmbeddingDomainService,
53
46
  text_search_service: EmbeddingDomainService,
54
47
  enrichment_service: EnrichmentDomainService,
55
- snippet_application_service: SnippetApplicationService,
56
48
  session: AsyncSession,
57
49
  ) -> None:
58
- """Initialize the indexing application service.
59
-
60
- Args:
61
- indexing_domain_service: The indexing domain service.
62
- source_service: The source service for source validation.
63
- bm25_service: The BM25 domain service for keyword search.
64
- code_search_service: The code search domain service.
65
- text_search_service: The text search domain service.
66
- enrichment_service: The enrichment domain service.
67
- snippet_application_service: The snippet application service.
68
- session: The database session for transaction management.
69
-
70
- """
50
+ """Initialize the code indexing application service."""
71
51
  self.indexing_domain_service = indexing_domain_service
52
+ self.snippet_domain_service = snippet_domain_service
72
53
  self.source_service = source_service
73
- self.snippet_application_service = snippet_application_service
74
- self.session = session
75
- self.log = structlog.get_logger(__name__)
76
54
  self.bm25_service = bm25_service
77
55
  self.code_search_service = code_search_service
78
56
  self.text_search_service = text_search_service
79
57
  self.enrichment_service = enrichment_service
58
+ self.session = session
59
+ self.log = structlog.get_logger(__name__)
80
60
 
81
61
  async def create_index(self, source_id: int) -> IndexView:
82
- """Create a new index for a source.
83
-
84
- Args:
85
- source_id: The ID of the source to create an index for.
86
-
87
- Returns:
88
- An IndexView representing the newly created index.
89
-
90
- Raises:
91
- ValueError: If the source doesn't exist.
92
-
93
- """
62
+ """Create a new index for a source."""
94
63
  log_event("kodit.index.create")
95
64
 
96
- # Check if the source exists
65
+ # Validate source exists
97
66
  source = await self.source_service.get(source_id)
98
67
 
99
- # Create the index
68
+ # Create index
100
69
  request = IndexCreateRequest(source_id=source.id)
101
70
  index_view = await self.indexing_domain_service.create_index(request)
102
71
 
103
- # Commit the index creation
72
+ # Single transaction commit
104
73
  await self.session.commit()
105
74
 
106
75
  return index_view
107
76
 
108
77
  async def list_indexes(self) -> list[IndexView]:
109
- """List all available indexes with their details.
110
-
111
- Returns:
112
- A list of IndexView objects containing information about each index.
113
-
114
- """
78
+ """List all available indexes with their details."""
115
79
  indexes = await self.indexing_domain_service.list_indexes()
116
80
 
117
- # Help Kodit by measuring how much people are using indexes
81
+ # Telemetry
118
82
  log_event(
119
83
  "kodit.index.list",
120
84
  {
@@ -128,112 +92,184 @@ class IndexingApplicationService:
128
92
  async def run_index(
129
93
  self, index_id: int, progress_callback: ProgressCallback | None = None
130
94
  ) -> None:
131
- """Run the indexing process for a specific index.
132
-
133
- Args:
134
- index_id: The ID of the index to run.
135
- progress_callback: Optional progress callback for reporting progress.
136
-
137
- Raises:
138
- ValueError: If the index doesn't exist or no indexable snippets are found.
139
-
140
- """
95
+ """Run the complete indexing process for a specific index."""
141
96
  log_event("kodit.index.run")
142
97
 
143
- # Get and validate index
98
+ # Validate index
144
99
  index = await self.indexing_domain_service.get_index(index_id)
145
100
  if not index:
146
101
  msg = f"Index not found: {index_id}"
147
102
  raise ValueError(msg)
148
103
 
149
- # Delete old snippets so we don't duplicate
150
- await self.indexing_domain_service.delete_all_snippets(index.id)
151
- # Commit the deletion
152
- await self.session.commit()
104
+ # Delete old snippets to make way for reindexing
105
+ # In the future we will only reindex snippets that have changed
106
+ await self.snippet_domain_service.delete_snippets_for_index(index.id)
153
107
 
154
- # Create snippets for supported file types using the snippet application service
155
- # (snippet_application_service handles its own commits)
108
+ # Extract and create snippets (domain service handles progress)
156
109
  self.log.info("Creating snippets for files", index_id=index.id)
157
- command = CreateIndexSnippetsCommand(
158
- index_id=index.id, strategy=SnippetExtractionStrategy.METHOD_BASED
159
- )
160
- await self.snippet_application_service.create_snippets_for_index(
161
- command, progress_callback
110
+ snippets = await self.snippet_domain_service.extract_and_create_snippets(
111
+ index_id=index.id,
112
+ strategy=SnippetExtractionStrategy.METHOD_BASED,
113
+ progress_callback=progress_callback,
162
114
  )
163
115
 
164
- snippets = await self.indexing_domain_service.get_snippets_for_index(index.id)
165
-
166
116
  # Check if any snippets were extracted
167
117
  if not snippets:
168
118
  msg = f"No indexable snippets found for index {index.id}"
169
119
  raise EmptySourceError(msg)
170
120
 
121
+ # Commit snippets to ensure they have IDs for indexing
122
+ await self.session.commit()
123
+
171
124
  # Create BM25 index
172
125
  self.log.info("Creating keyword index")
173
- reporter = Reporter(self.log, progress_callback)
174
- await reporter.start("bm25_index", len(snippets), "Creating keyword index...")
175
126
  await self._create_bm25_index(snippets, progress_callback)
176
- await reporter.done("bm25_index", "Keyword index created")
177
127
 
178
128
  # Create code embeddings
179
129
  self.log.info("Creating semantic code index")
180
- reporter = Reporter(self.log, progress_callback)
181
- await reporter.start(
182
- "code_embeddings", len(snippets), "Creating code embeddings..."
183
- )
184
130
  await self._create_code_embeddings(snippets, progress_callback)
185
- await reporter.done("code_embeddings")
186
131
 
187
132
  # Enrich snippets
188
133
  self.log.info("Enriching snippets", num_snippets=len(snippets))
189
- reporter = Reporter(self.log, progress_callback)
190
- await reporter.start("enrichment", len(snippets), "Enriching snippets...")
191
134
  await self._enrich_snippets(snippets, progress_callback)
192
- await reporter.done("enrichment")
193
135
 
194
- # Create text embeddings
136
+ # Get refreshed snippets after enrichment
137
+ snippets = await self.snippet_domain_service.get_snippets_for_index(index.id)
138
+
139
+ # Create text embeddings (on enriched content)
195
140
  self.log.info("Creating semantic text index")
196
- reporter = Reporter(self.log, progress_callback)
197
- await reporter.start(
198
- "text_embeddings", len(snippets), "Creating text embeddings..."
199
- )
200
141
  await self._create_text_embeddings(snippets, progress_callback)
201
- await reporter.done("text_embeddings")
202
142
 
203
143
  # Update index timestamp
204
144
  await self.indexing_domain_service.update_index_timestamp(index.id)
205
- # Commit the timestamp update
145
+
146
+ # Final commit for the work done after the earlier snippet commit
206
147
  await self.session.commit()
207
148
 
149
+ async def search(self, request: MultiSearchRequest) -> list[MultiSearchResult]:
150
+ """Search for relevant snippets across all indexes."""
151
+ log_event("kodit.index.search")
152
+
153
+ # Apply filters if provided
154
+ filtered_snippet_ids: list[int] | None = None
155
+ if request.filters:
156
+ # Use domain service for filtering
157
+ prefilter_request = replace(request, top_k=None)
158
+ snippet_results = await self.snippet_domain_service.search_snippets(
159
+ prefilter_request
160
+ )
161
+ filtered_snippet_ids = [snippet.id for snippet in snippet_results]
162
+
163
+ # Gather results from different search modes
164
+ fusion_list: list[list[FusionRequest]] = []
165
+
166
+ # Keyword search
167
+ if request.keywords:
168
+ result_ids: list[SearchResult] = []
169
+ for keyword in request.keywords:
170
+ results = await self.bm25_service.search(
171
+ SearchRequest(
172
+ query=keyword,
173
+ top_k=request.top_k,
174
+ snippet_ids=filtered_snippet_ids,
175
+ )
176
+ )
177
+ result_ids.extend(results)
178
+
179
+ fusion_list.append(
180
+ [FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
181
+ )
182
+
183
+ # Semantic code search
184
+ if request.code_query:
185
+ query_results = await self.code_search_service.search(
186
+ SearchRequest(
187
+ query=request.code_query,
188
+ top_k=request.top_k,
189
+ snippet_ids=filtered_snippet_ids,
190
+ )
191
+ )
192
+ fusion_list.append(
193
+ [FusionRequest(id=x.snippet_id, score=x.score) for x in query_results]
194
+ )
195
+
196
+ # Semantic text search
197
+ if request.text_query:
198
+ query_results = await self.text_search_service.search(
199
+ SearchRequest(
200
+ query=request.text_query,
201
+ top_k=request.top_k,
202
+ snippet_ids=filtered_snippet_ids,
203
+ )
204
+ )
205
+ fusion_list.append(
206
+ [FusionRequest(id=x.snippet_id, score=x.score) for x in query_results]
207
+ )
208
+
209
+ if len(fusion_list) == 0:
210
+ return []
211
+
212
+ # Fusion ranking
213
+ final_results = self.indexing_domain_service.perform_fusion(
214
+ rankings=fusion_list,
215
+ k=60, # This is a parameter in the RRF algorithm, not top_k
216
+ )
217
+
218
+ # Keep only top_k results
219
+ final_results = final_results[: request.top_k]
220
+
221
+ # Get snippet details
222
+ search_results = await self.indexing_domain_service.get_snippets_by_ids(
223
+ [x.id for x in final_results]
224
+ )
225
+
226
+ return [
227
+ MultiSearchResult(
228
+ id=snippet["id"],
229
+ uri=file["uri"],
230
+ content=snippet["content"],
231
+ original_scores=fr.original_scores,
232
+ )
233
+ for (file, snippet), fr in zip(search_results, final_results, strict=True)
234
+ ]
235
+
236
+ async def list_snippets(
237
+ self, file_path: str | None = None, source_uri: str | None = None
238
+ ) -> list[SnippetListItem]:
239
+ """List snippets with optional filtering."""
240
+ log_event("kodit.index.list_snippets")
241
+ return await self.snippet_domain_service.list_snippets(file_path, source_uri)
242
+
208
243
  async def _create_bm25_index(
209
244
  self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
210
245
  ) -> None:
211
- """Create BM25 keyword index."""
212
246
  reporter = Reporter(self.log, progress_callback)
213
247
  await reporter.start("bm25_index", len(snippets), "Creating keyword index...")
248
+
214
249
  await self.bm25_service.index_documents(
215
- BM25IndexRequest(
250
+ IndexRequest(
216
251
  documents=[
217
- BM25Document(snippet_id=snippet.id, text=snippet.content)
252
+ Document(snippet_id=snippet.id, text=snippet.content)
218
253
  for snippet in snippets
219
254
  ]
220
255
  )
221
256
  )
257
+
222
258
  await reporter.done("bm25_index", "Keyword index created")
223
259
 
224
260
  async def _create_code_embeddings(
225
261
  self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
226
262
  ) -> None:
227
- """Create code embeddings."""
228
263
  reporter = Reporter(self.log, progress_callback)
229
264
  await reporter.start(
230
265
  "code_embeddings", len(snippets), "Creating code embeddings..."
231
266
  )
267
+
232
268
  processed = 0
233
269
  async for result in self.code_search_service.index_documents(
234
- VectorIndexRequest(
270
+ IndexRequest(
235
271
  documents=[
236
- VectorSearchRequest(snippet.id, snippet.content)
272
+ Document(snippet_id=snippet.id, text=snippet.content)
237
273
  for snippet in snippets
238
274
  ]
239
275
  )
@@ -245,15 +281,15 @@ class IndexingApplicationService:
245
281
  len(snippets),
246
282
  "Creating code embeddings...",
247
283
  )
284
+
248
285
  await reporter.done("code_embeddings")
249
286
 
250
287
  async def _enrich_snippets(
251
288
  self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
252
289
  ) -> None:
253
- """Enrich snippets with additional context."""
254
290
  reporter = Reporter(self.log, progress_callback)
255
291
  await reporter.start("enrichment", len(snippets), "Enriching snippets...")
256
- enriched_contents = []
292
+
257
293
  enrichment_request = EnrichmentIndexRequest(
258
294
  requests=[
259
295
  EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
@@ -265,44 +301,38 @@ class IndexingApplicationService:
265
301
  async for result in self.enrichment_service.enrich_documents(
266
302
  enrichment_request
267
303
  ):
268
- # Find the snippet by ID
269
- snippet = next(s for s in snippets if s.id == result.snippet_id)
270
- if snippet:
271
- # Update the content in the local entity for subsequent processing
272
- enriched_content = result.text + "\n\n```\n" + snippet.content + "\n```"
273
- snippet.content = enriched_content
274
-
275
- # UPDATE the existing snippet entity instead of creating a new one
276
- # This follows DDD principles and avoids duplicates
277
- await self.indexing_domain_service.update_snippet_content(
278
- snippet.id, enriched_content
279
- )
280
- enriched_contents.append(result)
304
+ # Update snippet content through domain service
305
+ enriched_content = (
306
+ result.text
307
+ + "\n\n```\n"
308
+ + next(s.content for s in snippets if s.id == result.snippet_id)
309
+ + "\n```"
310
+ )
311
+
312
+ await self.snippet_domain_service.update_snippet_content(
313
+ result.snippet_id, enriched_content
314
+ )
281
315
 
282
316
  processed += 1
283
317
  await reporter.step(
284
318
  "enrichment", processed, len(snippets), "Enriching snippets..."
285
319
  )
286
320
 
287
- # Commit all snippet content updates as a single transaction
288
- if enriched_contents:
289
- await self.session.commit()
290
-
291
321
  await reporter.done("enrichment")
292
322
 
293
323
  async def _create_text_embeddings(
294
324
  self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
295
325
  ) -> None:
296
- """Create text embeddings."""
297
326
  reporter = Reporter(self.log, progress_callback)
298
327
  await reporter.start(
299
328
  "text_embeddings", len(snippets), "Creating text embeddings..."
300
329
  )
330
+
301
331
  processed = 0
302
332
  async for result in self.text_search_service.index_documents(
303
- VectorIndexRequest(
333
+ IndexRequest(
304
334
  documents=[
305
- VectorSearchRequest(snippet.id, snippet.content)
335
+ Document(snippet_id=snippet.id, text=snippet.content)
306
336
  for snippet in snippets
307
337
  ]
308
338
  )
@@ -314,74 +344,5 @@ class IndexingApplicationService:
314
344
  len(snippets),
315
345
  "Creating text embeddings...",
316
346
  )
317
- await reporter.done("text_embeddings")
318
-
319
- async def search(self, request: MultiSearchRequest) -> list[MultiSearchResult]:
320
- """Search for relevant data.
321
-
322
- Args:
323
- request: The search request.
324
347
 
325
- Returns:
326
- A list of search results.
327
-
328
- """
329
- log_event("kodit.index.search")
330
-
331
- fusion_list: list[list[FusionRequest]] = []
332
- if request.keywords:
333
- # Gather results for each keyword
334
- result_ids: list[BM25SearchResult] = []
335
- for keyword in request.keywords:
336
- results = await self.bm25_service.search(
337
- BM25SearchRequest(query=keyword, top_k=request.top_k)
338
- )
339
- result_ids.extend(results)
340
-
341
- fusion_list.append(
342
- [FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
343
- )
344
-
345
- # Compute embedding for semantic query
346
- if request.code_query:
347
- query_embedding = await self.code_search_service.search(
348
- VectorSearchQueryRequest(query=request.code_query, top_k=request.top_k)
349
- )
350
- fusion_list.append(
351
- [FusionRequest(id=x.snippet_id, score=x.score) for x in query_embedding]
352
- )
353
-
354
- if request.text_query:
355
- query_embedding = await self.text_search_service.search(
356
- VectorSearchQueryRequest(query=request.text_query, top_k=request.top_k)
357
- )
358
- fusion_list.append(
359
- [FusionRequest(id=x.snippet_id, score=x.score) for x in query_embedding]
360
- )
361
-
362
- if len(fusion_list) == 0:
363
- return []
364
-
365
- # Combine all results together with RRF if required
366
- final_results = self.indexing_domain_service.perform_fusion(
367
- rankings=fusion_list,
368
- k=60,
369
- )
370
-
371
- # Only keep top_k results
372
- final_results = final_results[: request.top_k]
373
-
374
- # Get snippets from database (up to top_k)
375
- search_results = await self.indexing_domain_service.get_snippets_by_ids(
376
- [x.id for x in final_results]
377
- )
378
-
379
- return [
380
- MultiSearchResult(
381
- id=snippet["id"],
382
- uri=file["uri"],
383
- content=snippet["content"],
384
- original_scores=fr.original_scores,
385
- )
386
- for (file, snippet), fr in zip(search_results, final_results, strict=True)
387
- ]
348
+ await reporter.done("text_embeddings")