kodit 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (100) hide show
  1. kodit/_version.py +2 -2
  2. kodit/app.py +59 -24
  3. kodit/application/factories/reporting_factory.py +16 -7
  4. kodit/application/factories/server_factory.py +311 -0
  5. kodit/application/services/code_search_application_service.py +144 -0
  6. kodit/application/services/commit_indexing_application_service.py +543 -0
  7. kodit/application/services/indexing_worker_service.py +13 -46
  8. kodit/application/services/queue_service.py +24 -3
  9. kodit/application/services/reporting.py +70 -54
  10. kodit/application/services/sync_scheduler.py +15 -31
  11. kodit/cli.py +2 -763
  12. kodit/cli_utils.py +2 -9
  13. kodit/config.py +3 -96
  14. kodit/database.py +38 -1
  15. kodit/domain/entities/__init__.py +276 -0
  16. kodit/domain/entities/git.py +190 -0
  17. kodit/domain/factories/__init__.py +1 -0
  18. kodit/domain/factories/git_repo_factory.py +76 -0
  19. kodit/domain/protocols.py +270 -46
  20. kodit/domain/services/bm25_service.py +5 -1
  21. kodit/domain/services/embedding_service.py +3 -0
  22. kodit/domain/services/git_repository_service.py +429 -0
  23. kodit/domain/services/git_service.py +300 -0
  24. kodit/domain/services/task_status_query_service.py +19 -0
  25. kodit/domain/value_objects.py +113 -147
  26. kodit/infrastructure/api/client/__init__.py +0 -2
  27. kodit/infrastructure/api/v1/__init__.py +0 -4
  28. kodit/infrastructure/api/v1/dependencies.py +105 -44
  29. kodit/infrastructure/api/v1/routers/__init__.py +0 -6
  30. kodit/infrastructure/api/v1/routers/commits.py +271 -0
  31. kodit/infrastructure/api/v1/routers/queue.py +2 -2
  32. kodit/infrastructure/api/v1/routers/repositories.py +282 -0
  33. kodit/infrastructure/api/v1/routers/search.py +31 -14
  34. kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
  35. kodit/infrastructure/api/v1/schemas/commit.py +96 -0
  36. kodit/infrastructure/api/v1/schemas/context.py +2 -0
  37. kodit/infrastructure/api/v1/schemas/repository.py +128 -0
  38. kodit/infrastructure/api/v1/schemas/search.py +12 -9
  39. kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
  40. kodit/infrastructure/api/v1/schemas/tag.py +31 -0
  41. kodit/infrastructure/api/v1/schemas/task_status.py +41 -0
  42. kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
  43. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
  44. kodit/infrastructure/cloning/git/git_python_adaptor.py +467 -0
  45. kodit/infrastructure/cloning/git/working_copy.py +10 -3
  46. kodit/infrastructure/embedding/embedding_factory.py +3 -2
  47. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  48. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
  49. kodit/infrastructure/enrichment/litellm_enrichment_provider.py +19 -26
  50. kodit/infrastructure/enrichment/local_enrichment_provider.py +41 -30
  51. kodit/infrastructure/indexing/fusion_service.py +1 -1
  52. kodit/infrastructure/mappers/git_mapper.py +193 -0
  53. kodit/infrastructure/mappers/snippet_mapper.py +106 -0
  54. kodit/infrastructure/mappers/task_mapper.py +5 -44
  55. kodit/infrastructure/mappers/task_status_mapper.py +85 -0
  56. kodit/infrastructure/reporting/db_progress.py +23 -0
  57. kodit/infrastructure/reporting/log_progress.py +13 -38
  58. kodit/infrastructure/reporting/telemetry_progress.py +21 -0
  59. kodit/infrastructure/slicing/slicer.py +32 -31
  60. kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
  61. kodit/infrastructure/sqlalchemy/entities.py +428 -131
  62. kodit/infrastructure/sqlalchemy/git_branch_repository.py +263 -0
  63. kodit/infrastructure/sqlalchemy/git_commit_repository.py +337 -0
  64. kodit/infrastructure/sqlalchemy/git_repository.py +252 -0
  65. kodit/infrastructure/sqlalchemy/git_tag_repository.py +257 -0
  66. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +484 -0
  67. kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
  68. kodit/infrastructure/sqlalchemy/task_status_repository.py +91 -0
  69. kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
  70. kodit/mcp.py +12 -26
  71. kodit/migrations/env.py +1 -1
  72. kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
  73. kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
  74. kodit/migrations/versions/b9cd1c3fd762_add_task_status.py +77 -0
  75. kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
  76. kodit/py.typed +0 -0
  77. kodit/utils/dump_openapi.py +7 -4
  78. kodit/utils/path_utils.py +29 -0
  79. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/METADATA +3 -3
  80. kodit-0.5.0.dist-info/RECORD +137 -0
  81. kodit/application/factories/code_indexing_factory.py +0 -193
  82. kodit/application/services/auto_indexing_service.py +0 -103
  83. kodit/application/services/code_indexing_application_service.py +0 -393
  84. kodit/domain/entities.py +0 -323
  85. kodit/domain/services/index_query_service.py +0 -70
  86. kodit/domain/services/index_service.py +0 -267
  87. kodit/infrastructure/api/client/index_client.py +0 -57
  88. kodit/infrastructure/api/v1/routers/indexes.py +0 -119
  89. kodit/infrastructure/api/v1/schemas/index.py +0 -101
  90. kodit/infrastructure/bm25/bm25_factory.py +0 -28
  91. kodit/infrastructure/cloning/__init__.py +0 -1
  92. kodit/infrastructure/cloning/metadata.py +0 -98
  93. kodit/infrastructure/mappers/index_mapper.py +0 -345
  94. kodit/infrastructure/reporting/tdqm_progress.py +0 -73
  95. kodit/infrastructure/slicing/language_detection_service.py +0 -18
  96. kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
  97. kodit-0.4.2.dist-info/RECORD +0 -119
  98. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/WHEEL +0 -0
  99. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/entry_points.txt +0 -0
  100. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,393 +0,0 @@
1
- """Unified application service for code indexing operations."""
2
-
3
- from dataclasses import replace
4
- from datetime import UTC, datetime
5
-
6
- import structlog
7
- from sqlalchemy.ext.asyncio import AsyncSession
8
-
9
- from kodit.application.services.reporting import (
10
- OperationType,
11
- ProgressTracker,
12
- )
13
- from kodit.domain.entities import Index, Snippet
14
- from kodit.domain.protocols import IndexRepository
15
- from kodit.domain.services.bm25_service import BM25DomainService
16
- from kodit.domain.services.embedding_service import EmbeddingDomainService
17
- from kodit.domain.services.enrichment_service import EnrichmentDomainService
18
- from kodit.domain.services.index_query_service import IndexQueryService
19
- from kodit.domain.services.index_service import IndexDomainService
20
- from kodit.domain.value_objects import (
21
- Document,
22
- FusionRequest,
23
- IndexRequest,
24
- MultiSearchRequest,
25
- MultiSearchResult,
26
- SearchRequest,
27
- SearchResult,
28
- SnippetSearchFilters,
29
- )
30
- from kodit.log import log_event
31
-
32
-
33
class CodeIndexingApplicationService:
    """Unified application service for all code indexing operations.

    Coordinates index creation, the multi-step indexing pipeline, combined
    keyword/semantic search with fusion ranking, snippet listing and index
    deletion on top of the injected domain services and repository.
    """

    def __init__(  # noqa: PLR0913
        self,
        indexing_domain_service: IndexDomainService,
        index_repository: IndexRepository,
        index_query_service: IndexQueryService,
        bm25_service: BM25DomainService,
        code_search_service: EmbeddingDomainService,
        text_search_service: EmbeddingDomainService,
        enrichment_service: EnrichmentDomainService,
        session: AsyncSession,
        operation: ProgressTracker,
    ) -> None:
        """Initialize the code indexing application service.

        All collaborators are stored as-is; no I/O happens here.
        """
        self.index_domain_service = indexing_domain_service
        self.index_repository = index_repository
        self.index_query_service = index_query_service
        self.bm25_service = bm25_service
        self.code_search_service = code_search_service
        self.text_search_service = text_search_service
        self.enrichment_service = enrichment_service
        self.session = session
        self.operation = operation
        self.log = structlog.get_logger(__name__)

    async def does_index_exist(self, uri: str) -> bool:
        """Check if an index exists for a source.

        The URI is sanitized by the domain service before the repository
        lookup, so equivalent spellings of the same source match.
        """
        sanitized_uri, _ = self.index_domain_service.sanitize_uri(uri)
        existing_index = await self.index_repository.get_by_uri(sanitized_uri)
        return existing_index is not None

    async def create_index_from_uri(self, uri: str) -> Index:
        """Create a new index for a source, or return the existing one.

        The working copy is only prepared when no index exists yet for the
        sanitized URI. A newly created index is committed before returning.
        """
        log_event("kodit.index.create")
        with self.operation.create_child(OperationType.CREATE_INDEX.value) as operation:
            # Check if index already exists
            sanitized_uri, _ = self.index_domain_service.sanitize_uri(uri)
            self.log.info("Creating index from URI", uri=str(sanitized_uri))
            existing_index = await self.index_repository.get_by_uri(sanitized_uri)
            if existing_index:
                self.log.debug(
                    "Index already exists",
                    uri=str(sanitized_uri),
                    index_id=existing_index.id,
                )
                return existing_index

            # Only prepare working copy if we need to create a new index
            self.log.info("Preparing working copy", uri=str(sanitized_uri))
            working_copy = await self.index_domain_service.prepare_index(uri, operation)

            # Create new index
            self.log.info("Creating index", uri=str(sanitized_uri))
            index = await self.index_repository.create(sanitized_uri, working_copy)
            await self.session.commit()
            return index

    async def run_index(self, index: Index) -> None:
        """Run the complete indexing process for a specific index.

        Steps, each reported as a child operation: refresh the working copy,
        delete snippets of changed files, extract snippets, build the BM25
        index, create code embeddings, enrich snippets, create text
        embeddings, update the index timestamp and clear file processing
        statuses. The run stops early when there are no changed files or no
        snippets after extraction.

        Raises:
            ValueError: If the index has no ID, or cannot be re-read after
                snippet extraction.

        """
        # Create a new operation
        with self.operation.create_child(OperationType.RUN_INDEX.value) as operation:
            # TODO(philwinder): Move this into a reporter # noqa: TD003, FIX002
            log_event("kodit.index.run")

            if not index or not index.id:
                msg = f"Index has no ID: {index}"
                raise ValueError(msg)

            # Refresh working copy
            with operation.create_child("Refresh working copy") as step:
                index.source.working_copy = (
                    await self.index_domain_service.refresh_working_copy(
                        index.source.working_copy, step
                    )
                )
                if not index.source.working_copy.changed_files():
                    self.log.info("No new changes to index", index_id=index.id)
                    step.skip("No new changes to index")
                    return

            # Delete the old snippets from the files that have changed
            with operation.create_child("Delete old snippets") as step:
                await self.index_repository.delete_snippets_by_file_ids(
                    [
                        file.id
                        for file in index.source.working_copy.changed_files()
                        if file.id
                    ]
                )

            # Extract and create snippets (domain service handles progress)
            with operation.create_child("Extract snippets") as step:
                index = await self.index_domain_service.extract_snippets_from_index(
                    index=index, step=step
                )
                await self.index_repository.update(index)

                # Refresh index to get snippets with IDs, required for subsequent
                # steps
                flushed_index = await self.index_repository.get(index.id)
                if not flushed_index:
                    msg = f"Index {index.id} not found after snippet extraction"
                    raise ValueError(msg)
                index = flushed_index
                if not index.snippets:
                    self.log.info(
                        "No snippets to index after extraction", index_id=index.id
                    )
                    step.skip("No snippets to index after extraction")
                    return

            # Create BM25 index
            self.log.info("Creating keyword index")
            with operation.create_child("Create BM25 index") as step:
                await self._create_bm25_index(index.snippets)

            # Create code embeddings
            with operation.create_child("Create code embeddings") as step:
                await self._create_code_embeddings(index.snippets, step)

            # Enrich snippets
            with operation.create_child("Enrich snippets") as step:
                enriched_snippets = (
                    await self.index_domain_service.enrich_snippets_in_index(
                        snippets=index.snippets,
                        reporting_step=step,
                    )
                )
                # Update snippets in repository
                await self.index_repository.update_snippets(index.id, enriched_snippets)

            # Create text embeddings (on enriched content)
            with operation.create_child("Create text embeddings") as step:
                await self._create_text_embeddings(enriched_snippets, step)

            # Update index timestamp
            with operation.create_child("Update index timestamp") as step:
                await self.index_repository.update_index_timestamp(index.id)

            # After indexing, clear the file processing statuses
            with operation.create_child("Clear file processing statuses") as step:
                index.source.working_copy.clear_file_processing_statuses()
                await self.index_repository.update(index)

    async def search(self, request: MultiSearchRequest) -> list[MultiSearchResult]:
        """Search for relevant snippets across all indexes.

        Runs every search mode present on the request (keywords, code query,
        text query), fuses the rankings and returns at most ``request.top_k``
        results. When filters are given, candidate snippet IDs are resolved
        first and passed to each search mode.

        Returns:
            The fused results, best first; empty when no search mode was
            requested or the filters match no snippet.

        """
        log_event("kodit.index.search")

        # Apply filters if provided
        filtered_snippet_ids: list[int] | None = None
        if request.filters:
            # Use domain service for filtering (use large top_k for pre-filtering)
            prefilter_request = replace(request, top_k=10000)
            snippet_results = await self.index_query_service.search_snippets(
                prefilter_request
            )
            filtered_snippet_ids = [
                snippet.snippet.id for snippet in snippet_results if snippet.snippet.id
            ]
            if not filtered_snippet_ids:
                # Nothing passes the filters, so nothing can be returned. Stop
                # here rather than handing an empty ID list to the backends,
                # which might interpret it as "no restriction".
                return []

        # Gather results from different search modes
        fusion_list: list[list[FusionRequest]] = []

        # Keyword search: one BM25 query per keyword, merged into one ranking
        if request.keywords:
            result_ids: list[SearchResult] = []
            for keyword in request.keywords:
                results = await self.bm25_service.search(
                    SearchRequest(
                        query=keyword,
                        top_k=request.top_k,
                        snippet_ids=filtered_snippet_ids,
                    )
                )
                result_ids.extend(results)

            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
            )

        # Semantic code search
        if request.code_query:
            query_results = await self.code_search_service.search(
                SearchRequest(
                    query=request.code_query,
                    top_k=request.top_k,
                    snippet_ids=filtered_snippet_ids,
                )
            )
            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in query_results]
            )

        # Semantic text search
        if request.text_query:
            query_results = await self.text_search_service.search(
                SearchRequest(
                    query=request.text_query,
                    top_k=request.top_k,
                    snippet_ids=filtered_snippet_ids,
                )
            )
            fusion_list.append(
                [FusionRequest(id=x.snippet_id, score=x.score) for x in query_results]
            )

        if not fusion_list:
            return []

        # Fusion ranking
        final_results = await self.index_query_service.perform_fusion(
            rankings=fusion_list,
            k=60,  # This is a parameter in the RRF algorithm, not top_k
        )

        # Keep only top_k results
        final_results = final_results[: request.top_k]

        # Get snippet details
        search_results = await self.index_query_service.get_snippets_by_ids(
            [x.id for x in final_results]
        )

        # Create a mapping from snippet ID to search result to handle cases where
        # some snippet IDs don't exist (e.g., with vectorchord inconsistencies)
        snippet_map = {
            result.snippet.id: result
            for result in search_results
            if result.snippet.id is not None
        }

        # Preserve the fused order; drop IDs we could not load snippets for
        return [
            self._to_multi_search_result(snippet_map[fr.id], fr.original_scores)
            for fr in final_results
            if fr.id in snippet_map
        ]

    async def list_snippets(
        self, file_path: str | None = None, source_uri: str | None = None
    ) -> list[MultiSearchResult]:
        """List snippets with optional filtering.

        Args:
            file_path: Passed as the ``file_path`` snippet filter.
            source_uri: Passed as the ``source_repo`` snippet filter.

        Returns:
            One result per matching snippet, each with a single score of 0.0
            because no ranking is performed.

        """
        log_event("kodit.index.list_snippets")
        snippet_results = await self.index_query_service.search_snippets(
            request=MultiSearchRequest(
                filters=SnippetSearchFilters(
                    file_path=file_path,
                    source_repo=source_uri,
                )
            ),
        )
        return [
            self._to_multi_search_result(result, [0.0]) for result in snippet_results
        ]

    @staticmethod
    def _to_multi_search_result(
        result,  # noqa: ANN001 - result type is not imported by this module
        original_scores: list[float],
    ) -> MultiSearchResult:
        """Convert a snippet-with-context result into a ``MultiSearchResult``.

        ``result`` must expose ``snippet``, ``source``, ``file`` and
        ``authors``, as returned by the index query service.
        """
        working_copy = result.source.working_copy
        return MultiSearchResult(
            id=result.snippet.id or 0,
            content=result.snippet.original_text(),
            original_scores=original_scores,
            # Enhanced fields
            source_uri=str(working_copy.remote_uri),
            relative_path=str(
                result.file.as_path().relative_to(working_copy.cloned_path)
            ),
            language=MultiSearchResult.detect_language_from_extension(
                result.file.extension()
            ),
            authors=[author.name for author in result.authors],
            created_at=result.snippet.created_at or datetime.now(UTC),
            # Summary from snippet entity
            summary=result.snippet.summary_text(),
        )

    @staticmethod
    def _snippet_documents(snippets: list[Snippet]) -> list[Document]:
        """Build documents from the original text of snippets that have an ID."""
        return [
            Document(snippet_id=snippet.id, text=snippet.original_text())
            for snippet in snippets
            if snippet.id
        ]

    # FUTURE: BM25 index enriched content too
    async def _create_bm25_index(self, snippets: list[Snippet]) -> None:
        """Index the original text of the given snippets for keyword search."""
        await self.bm25_service.index_documents(
            IndexRequest(documents=self._snippet_documents(snippets))
        )

    async def _create_code_embeddings(
        self, snippets: list[Snippet], reporting_step: ProgressTracker
    ) -> None:
        """Create code embeddings for the snippets, reporting progress."""
        documents = self._snippet_documents(snippets)
        # Report against what is actually indexed: snippets without an ID are
        # skipped, so counting them would leave the step short of its total.
        reporting_step.set_total(len(documents))
        processed = 0
        async for result in self.code_search_service.index_documents(
            IndexRequest(documents=documents)
        ):
            processed += len(result)
            reporting_step.set_current(processed)

    async def _create_text_embeddings(
        self, snippets: list[Snippet], reporting_step: ProgressTracker
    ) -> None:
        """Create text embeddings from snippet summaries, reporting progress.

        Snippets without an ID, without summary content, or with a blank
        summary are skipped. The step is skipped if nothing remains.
        """
        # Only create text embeddings for snippets that have summary content
        documents_with_summaries = []
        for snippet in snippets:
            if snippet.id:
                try:
                    summary_text = snippet.summary_text()
                    if summary_text.strip():  # Only add if summary is not empty
                        documents_with_summaries.append(
                            Document(snippet_id=snippet.id, text=summary_text)
                        )
                except ValueError:
                    # Skip snippets without summary content
                    continue

        if not documents_with_summaries:
            reporting_step.skip("No snippets with summaries to create text embeddings")
            return

        reporting_step.set_total(len(documents_with_summaries))
        processed = 0
        async for result in self.text_search_service.index_documents(
            IndexRequest(documents=documents_with_summaries)
        ):
            processed += len(result)
            reporting_step.set_current(processed)

    async def delete_index(self, index: Index) -> None:
        """Delete an index from the domain and the database, then commit."""
        # Delete the index from the domain
        await self.index_domain_service.delete_index(index)

        # Delete index from the database
        await self.index_repository.delete(index)
        await self.session.commit()