kodit 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (95) hide show
  1. kodit/_version.py +2 -2
  2. kodit/app.py +53 -23
  3. kodit/application/factories/reporting_factory.py +6 -2
  4. kodit/application/factories/server_factory.py +311 -0
  5. kodit/application/services/code_search_application_service.py +144 -0
  6. kodit/application/services/commit_indexing_application_service.py +543 -0
  7. kodit/application/services/indexing_worker_service.py +13 -44
  8. kodit/application/services/queue_service.py +24 -3
  9. kodit/application/services/reporting.py +0 -2
  10. kodit/application/services/sync_scheduler.py +15 -31
  11. kodit/cli.py +2 -753
  12. kodit/cli_utils.py +2 -9
  13. kodit/config.py +1 -94
  14. kodit/database.py +38 -1
  15. kodit/domain/{entities.py → entities/__init__.py} +50 -195
  16. kodit/domain/entities/git.py +190 -0
  17. kodit/domain/factories/__init__.py +1 -0
  18. kodit/domain/factories/git_repo_factory.py +76 -0
  19. kodit/domain/protocols.py +263 -64
  20. kodit/domain/services/bm25_service.py +5 -1
  21. kodit/domain/services/embedding_service.py +3 -0
  22. kodit/domain/services/git_repository_service.py +429 -0
  23. kodit/domain/services/git_service.py +300 -0
  24. kodit/domain/services/task_status_query_service.py +2 -2
  25. kodit/domain/value_objects.py +83 -114
  26. kodit/infrastructure/api/client/__init__.py +0 -2
  27. kodit/infrastructure/api/v1/__init__.py +0 -4
  28. kodit/infrastructure/api/v1/dependencies.py +92 -46
  29. kodit/infrastructure/api/v1/routers/__init__.py +0 -6
  30. kodit/infrastructure/api/v1/routers/commits.py +271 -0
  31. kodit/infrastructure/api/v1/routers/queue.py +2 -2
  32. kodit/infrastructure/api/v1/routers/repositories.py +282 -0
  33. kodit/infrastructure/api/v1/routers/search.py +31 -14
  34. kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
  35. kodit/infrastructure/api/v1/schemas/commit.py +96 -0
  36. kodit/infrastructure/api/v1/schemas/context.py +2 -0
  37. kodit/infrastructure/api/v1/schemas/repository.py +128 -0
  38. kodit/infrastructure/api/v1/schemas/search.py +12 -9
  39. kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
  40. kodit/infrastructure/api/v1/schemas/tag.py +31 -0
  41. kodit/infrastructure/api/v1/schemas/task_status.py +2 -0
  42. kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
  43. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
  44. kodit/infrastructure/cloning/git/git_python_adaptor.py +467 -0
  45. kodit/infrastructure/cloning/git/working_copy.py +1 -1
  46. kodit/infrastructure/embedding/embedding_factory.py +3 -2
  47. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  48. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
  49. kodit/infrastructure/enrichment/litellm_enrichment_provider.py +19 -26
  50. kodit/infrastructure/indexing/fusion_service.py +1 -1
  51. kodit/infrastructure/mappers/git_mapper.py +193 -0
  52. kodit/infrastructure/mappers/snippet_mapper.py +106 -0
  53. kodit/infrastructure/mappers/task_mapper.py +5 -44
  54. kodit/infrastructure/reporting/log_progress.py +8 -5
  55. kodit/infrastructure/reporting/telemetry_progress.py +21 -0
  56. kodit/infrastructure/slicing/slicer.py +32 -31
  57. kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
  58. kodit/infrastructure/sqlalchemy/entities.py +394 -158
  59. kodit/infrastructure/sqlalchemy/git_branch_repository.py +263 -0
  60. kodit/infrastructure/sqlalchemy/git_commit_repository.py +337 -0
  61. kodit/infrastructure/sqlalchemy/git_repository.py +252 -0
  62. kodit/infrastructure/sqlalchemy/git_tag_repository.py +257 -0
  63. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +484 -0
  64. kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
  65. kodit/infrastructure/sqlalchemy/task_status_repository.py +24 -12
  66. kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
  67. kodit/mcp.py +12 -30
  68. kodit/migrations/env.py +1 -0
  69. kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
  70. kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
  71. kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
  72. kodit/py.typed +0 -0
  73. kodit/utils/dump_openapi.py +7 -4
  74. kodit/utils/path_utils.py +29 -0
  75. {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/METADATA +3 -3
  76. kodit-0.5.0.dist-info/RECORD +137 -0
  77. kodit/application/factories/code_indexing_factory.py +0 -195
  78. kodit/application/services/auto_indexing_service.py +0 -99
  79. kodit/application/services/code_indexing_application_service.py +0 -410
  80. kodit/domain/services/index_query_service.py +0 -70
  81. kodit/domain/services/index_service.py +0 -269
  82. kodit/infrastructure/api/client/index_client.py +0 -57
  83. kodit/infrastructure/api/v1/routers/indexes.py +0 -164
  84. kodit/infrastructure/api/v1/schemas/index.py +0 -101
  85. kodit/infrastructure/bm25/bm25_factory.py +0 -28
  86. kodit/infrastructure/cloning/__init__.py +0 -1
  87. kodit/infrastructure/cloning/metadata.py +0 -98
  88. kodit/infrastructure/mappers/index_mapper.py +0 -345
  89. kodit/infrastructure/reporting/tdqm_progress.py +0 -38
  90. kodit/infrastructure/slicing/language_detection_service.py +0 -18
  91. kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
  92. kodit-0.4.3.dist-info/RECORD +0 -125
  93. {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/WHEEL +0 -0
  94. {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/entry_points.txt +0 -0
  95. {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,543 @@
1
+ """Application services for commit indexing operations."""
2
+
3
+ from collections import defaultdict
4
+ from pathlib import Path
5
+
6
+ import structlog
7
+ from pydantic import AnyUrl
8
+
9
+ from kodit.application.services.queue_service import QueueService
10
+ from kodit.application.services.reporting import ProgressTracker
11
+ from kodit.domain.entities import Task
12
+ from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2
13
+ from kodit.domain.factories.git_repo_factory import GitRepoFactory
14
+ from kodit.domain.protocols import (
15
+ GitBranchRepository,
16
+ GitCommitRepository,
17
+ GitRepoRepository,
18
+ GitTagRepository,
19
+ SnippetRepositoryV2,
20
+ )
21
+ from kodit.domain.services.bm25_service import BM25DomainService
22
+ from kodit.domain.services.embedding_service import EmbeddingDomainService
23
+ from kodit.domain.services.enrichment_service import EnrichmentDomainService
24
+ from kodit.domain.services.git_repository_service import (
25
+ GitRepositoryScanner,
26
+ RepositoryCloner,
27
+ )
28
+ from kodit.domain.value_objects import (
29
+ DeleteRequest,
30
+ Document,
31
+ Enrichment,
32
+ EnrichmentIndexRequest,
33
+ EnrichmentRequest,
34
+ EnrichmentType,
35
+ IndexRequest,
36
+ LanguageMapping,
37
+ PrescribedOperations,
38
+ QueuePriority,
39
+ TaskOperation,
40
+ TrackableType,
41
+ )
42
+ from kodit.infrastructure.slicing.slicer import Slicer
43
+ from kodit.infrastructure.sqlalchemy.embedding_repository import (
44
+ SqlAlchemyEmbeddingRepository,
45
+ )
46
+ from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
47
+
48
+
49
+ class CommitIndexingApplicationService:
50
+ """Application service for commit indexing operations."""
51
+
52
def __init__(  # noqa: PLR0913
    self,
    snippet_v2_repository: SnippetRepositoryV2,  # noqa: ARG002
    repo_repository: GitRepoRepository,
    git_commit_repository: GitCommitRepository,
    git_branch_repository: GitBranchRepository,
    git_tag_repository: GitTagRepository,
    operation: ProgressTracker,
    scanner: GitRepositoryScanner,
    cloner: RepositoryCloner,
    snippet_repository: SnippetRepositoryV2,
    slicer: Slicer,
    queue: QueueService,
    bm25_service: BM25DomainService,
    code_search_service: EmbeddingDomainService,
    text_search_service: EmbeddingDomainService,
    enrichment_service: EnrichmentDomainService,
    embedding_repository: SqlAlchemyEmbeddingRepository,
) -> None:
    """Initialize the commit indexing application service.

    Args:
        snippet_v2_repository: Kept in the signature so existing call sites
            keep working. It is not stored: ``snippet_repository`` is the
            instance that ends up on ``self.snippet_repository``.
        repo_repository: Repository for Git repository records.
        git_commit_repository: Repository for commits of a Git repository.
        git_branch_repository: Repository for branches of a Git repository.
        git_tag_repository: Repository for tags of a Git repository.
        operation: Progress tracker; each task creates a child of it.
        scanner: Scans a cloned repository; its ``git_adapter`` is also used
            to check out commits and list their files.
        cloner: Clones a repository from its remote URI.
        snippet_repository: Repository used to load, save and delete snippets.
        slicer: Stored as ``self.slicer``.
        queue: Queue used to enqueue follow-up tasks.
        bm25_service: Service used to index and delete BM25 documents.
        code_search_service: Embedding service fed with snippet content.
        text_search_service: Embedding service fed with snippet summaries.
        enrichment_service: Service that produces snippet summaries.
        embedding_repository: Repository used to look up and delete stored
            embeddings.

    """
    # NOTE: the original assigned ``self.snippet_repository`` twice, first from
    # ``snippet_v2_repository`` and then from ``snippet_repository``. Only the
    # second assignment ever took effect, so the dead store has been removed.
    self.snippet_repository = snippet_repository
    self.repo_repository = repo_repository
    self.git_commit_repository = git_commit_repository
    self.git_branch_repository = git_branch_repository
    self.git_tag_repository = git_tag_repository
    self.operation = operation
    self.scanner = scanner
    self.cloner = cloner
    self.slicer = slicer
    self.queue = queue
    self.bm25_service = bm25_service
    self.code_search_service = code_search_service
    self.text_search_service = text_search_service
    self.enrichment_service = enrichment_service
    self.embedding_repository = embedding_repository
    self._log = structlog.get_logger(__name__)
98
+
99
async def create_git_repository(self, remote_uri: AnyUrl) -> GitRepo:
    """Register a repository by remote URI and queue its initial processing.

    Args:
        remote_uri: Remote URI of the Git repository to register.

    Returns:
        The repository as returned by the repository store after saving.

    """
    tracker = self.operation.create_child(
        TaskOperation.CREATE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
    )
    async with tracker:
        unsaved = GitRepoFactory.create_from_remote_uri(remote_uri)
        saved = await self.repo_repository.save(unsaved)

        # The saved entity's ID is what the queued tasks use to find the repo.
        await self.queue.enqueue_tasks(
            tasks=PrescribedOperations.CREATE_NEW_REPOSITORY,
            base_priority=QueuePriority.USER_INITIATED,
            payload={"repository_id": saved.id},
        )
        return saved
113
+
114
async def delete_git_repository(self, repo_id: int) -> bool:
    """Delete a repository and everything stored for it.

    Args:
        repo_id: ID of the repository to delete.

    Returns:
        ``True`` when the repository existed and the deletion ran,
        ``False`` when no repository was found for ``repo_id``.

    """
    existing = await self.repo_repository.get_by_id(repo_id)
    if existing:
        # Delegate to the full deletion routine so dependent data goes too.
        await self.process_delete_repo(repo_id)
        return True
    return False
123
+
124
+ # TODO(Phil): Make this polymorphic
125
async def run_task(self, task: Task) -> None:  # noqa: PLR0912, C901
    """Run a queued task by dispatching on its operation type.

    Repository operations need ``repository_id`` in the task payload; commit
    operations need both ``repository_id`` and ``commit_sha``.

    Args:
        task: The task to run.

    Raises:
        ValueError: If a required payload entry is missing or empty, or the
            task type is not one of the handled operations.

    """
    if task.type.is_repository_operation():
        # ``.get`` (not ``[]``) so a missing key reaches the ValueError below
        # instead of escaping as a KeyError.
        repo_id = task.payload.get("repository_id")
        if not repo_id:
            raise ValueError("Repository ID is required")
        if task.type == TaskOperation.CLONE_REPOSITORY:
            await self.process_clone_repo(repo_id)
        elif task.type == TaskOperation.SCAN_REPOSITORY:
            await self.process_scan_repo(repo_id)
        elif task.type == TaskOperation.DELETE_REPOSITORY:
            await self.process_delete_repo(repo_id)
        else:
            raise ValueError(f"Unknown task type: {task.type}")
    elif task.type.is_commit_operation():
        repository_id = task.payload.get("repository_id")
        if not repository_id:
            raise ValueError("Repository ID is required")
        commit_sha = task.payload.get("commit_sha")
        if not commit_sha:
            raise ValueError("Commit SHA is required")
        if task.type == TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT:
            await self.process_snippets_for_commit(repository_id, commit_sha)
        elif task.type == TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT:
            await self.process_bm25_index(repository_id, commit_sha)
        elif task.type == TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT:
            await self.process_code_embeddings(repository_id, commit_sha)
        elif task.type == TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT:
            await self.process_enrich(repository_id, commit_sha)
        elif task.type == TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT:
            await self.process_summary_embeddings(repository_id, commit_sha)
        else:
            raise ValueError(f"Unknown task type: {task.type}")
    else:
        raise ValueError(f"Unknown task type: {task.type}")
160
+
161
async def process_clone_repo(self, repository_id: int) -> None:
    """Clone a repository and record where the clone lives.

    Args:
        repository_id: ID of the repository to clone.

    Raises:
        ValueError: If no repository exists for ``repository_id``.

    """
    async with self.operation.create_child(
        TaskOperation.CLONE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ):
        repo = await self.repo_repository.get_by_id(repository_id)
        # Same not-found handling as process_delete_repo, so a missing repo
        # gives a clear error rather than an AttributeError on None.
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")

        repo.cloned_path = await self.cloner.clone_repository(repo.remote_uri)
        await self.repo_repository.save(repo)
171
+
172
async def process_scan_repo(self, repository_id: int) -> None:
    """Scan a cloned repository, persist its metadata and queue indexing.

    Saves the scan result on the repository, stores the scanned commits,
    branches and tags, then enqueues the commit-indexing tasks for the head
    commit of the repository's tracking branch.

    Args:
        repository_id: ID of the repository to scan.

    Raises:
        ValueError: If the repository does not exist, has never been cloned,
            has no tracking branch, or its tracking branch has no head
            commit SHA.

    """
    async with self.operation.create_child(
        TaskOperation.SCAN_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        await step.set_total(6)
        repo = await self.repo_repository.get_by_id(repository_id)
        # Same not-found handling as process_delete_repo.
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")
        if not repo.cloned_path:
            raise ValueError(f"Repository {repository_id} has never been cloned")

        # Scan the repository to get all metadata
        await step.set_current(0, "Scanning repository")
        scan_result = await self.scanner.scan_repository(repo.cloned_path)

        # Update repo with scan result (this sets num_commits, num_branches, etc.)
        await step.set_current(1, "Updating repository with scan result")
        repo.update_with_scan_result(scan_result)
        await self.repo_repository.save(repo)

        # Save commits, branches, and tags to their dedicated repositories
        await step.set_current(2, "Saving commits")
        if scan_result.all_commits:
            await self.git_commit_repository.save_bulk(
                scan_result.all_commits, repository_id
            )

        await step.set_current(3, "Saving branches")
        if scan_result.branches:
            await self.git_branch_repository.save_bulk(
                scan_result.branches, repository_id
            )

        await step.set_current(4, "Saving tags")
        if scan_result.all_tags:
            await self.git_tag_repository.save_bulk(
                scan_result.all_tags, repository_id
            )

        await step.set_current(5, "Enqueuing commit indexing tasks")
        if not repo.tracking_branch:
            raise ValueError(f"Repository {repository_id} has no tracking branch")
        commit_sha = repo.tracking_branch.head_commit.commit_sha
        if not commit_sha:
            raise ValueError(f"Repository {repository_id} has no head commit")

        await self.queue.enqueue_tasks(
            tasks=PrescribedOperations.INDEX_COMMIT,
            base_priority=QueuePriority.USER_INITIATED,
            payload={"commit_sha": commit_sha, "repository_id": repository_id},
        )
224
+
225
async def process_delete_repo(self, repository_id: int) -> None:
    """Delete a repository together with its dependent data.

    Removal order: BM25 documents and embeddings of the repository's
    snippets, snippet associations per commit, branches, tags, commits, and
    finally the repository record itself.

    Args:
        repository_id: ID of the repository to delete.

    Raises:
        ValueError: If no repository exists for ``repository_id``.

    """
    async with self.operation.create_child(
        TaskOperation.DELETE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ):
        repo = await self.repo_repository.get_by_id(repository_id)
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")

        # Get all commit SHAs for this repository first (needed for cleanup)
        commits = await self.git_commit_repository.get_by_repo_id(repository_id)
        commit_shas = [commit.commit_sha for commit in commits]

        # Step 1: Collect the snippet IDs associated with these commits FIRST
        # (before deleting the associations). A dict is used as an ordered set
        # so a snippet returned for several commits is only cleaned up once.
        unique_snippet_ids: dict = {}
        for commit_sha in commit_shas:
            snippets = await self.snippet_repository.get_snippets_for_commit(
                commit_sha
            )
            unique_snippet_ids.update(
                dict.fromkeys(snippet.id for snippet in snippets if snippet.id)
            )
        all_snippet_ids = list(unique_snippet_ids)

        # Step 2: Delete from BM25 and embedding indices
        if all_snippet_ids:
            # Convert to strings as DeleteRequest expects list[str]
            snippet_id_strings = [
                str(snippet_id) for snippet_id in all_snippet_ids
            ]
            delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
            await self.bm25_service.delete_documents(delete_request)

            # Delete embeddings for each snippet
            for snippet_id in all_snippet_ids:
                await self.embedding_repository.delete_embeddings_by_snippet_id(
                    snippet_id
                )

        # Step 3: Delete snippet associations for all commits
        for commit_sha in commit_shas:
            await self.snippet_repository.delete_snippets_for_commit(commit_sha)

        # Step 4: Delete branches (they reference commits via head_commit_sha)
        await self.git_branch_repository.delete_by_repo_id(repository_id)

        # Step 5: Delete tags (they reference commits via target_commit_sha)
        await self.git_tag_repository.delete_by_repo_id(repository_id)

        # Step 6: Delete commits and their files
        await self.git_commit_repository.delete_by_repo_id(repository_id)

        # Step 7: Finally delete the repository
        await self.repo_repository.delete(repo.sanitized_remote_uri)
282
+
283
async def process_snippets_for_commit(
    self, repository_id: int, commit_sha: str
) -> None:
    """Extract code snippets from the files of one commit and save them.

    Skips when snippets are already stored for ``commit_sha``. Otherwise the
    commit is checked out in the repository's clone, its files are listed via
    the scanner's Git adapter, grouped by language (looked up from the file
    extension), sliced per language, de-duplicated by snippet SHA and saved
    against the commit.

    Args:
        repository_id: ID of the repository that owns the commit.
        commit_sha: SHA of the commit to extract snippets for.

    Raises:
        ValueError: If the repository or the commit cannot be found, or the
            repository has never been cloned.

    """
    async with self.operation.create_child(
        operation=TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        # Have we already processed this commit? If yes, skip.
        if await self.snippet_repository.get_snippets_for_commit(commit_sha):
            await step.skip("All snippets already extracted for commit")
            return

        commit = await self.git_commit_repository.get_by_sha(commit_sha)
        if commit is None:
            raise ValueError(f"Commit {commit_sha} not found")

        # Load files on demand for snippet extraction (performance optimization)
        # Instead of using commit.files (which may be empty), load files directly
        repo = await self.repo_repository.get_by_id(repository_id)
        # Same not-found handling as process_delete_repo.
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")
        if not repo.cloned_path:
            raise ValueError(f"Repository {repository_id} has never been cloned")

        # Ensure we're on the specific commit for file access
        await self.scanner.git_adapter.checkout_commit(repo.cloned_path, commit_sha)

        # Get files directly from Git adapter for this specific commit
        files_data = await self.scanner.git_adapter.get_commit_files(
            repo.cloned_path, commit_sha
        )

        # Create GitFile entities with absolute paths for the slicer
        files = [
            GitFile(
                created_at=file_data.get("created_at", commit.date),
                blob_sha=file_data["blob_sha"],
                # Absolute path so the slicer can read the file from the clone
                path=str(repo.cloned_path / file_data["path"]),
                mime_type=file_data.get("mime_type", "application/octet-stream"),
                size=file_data.get("size", 0),
                extension=Path(file_data["path"]).suffix.lstrip("."),
            )
            for file_data in files_data
        ]

        # Group files by language in one pass. Each extension is resolved
        # (and, if unsupported, logged) only once; ``None`` marks an extension
        # with no language mapping. Iterating the file list rather than a set
        # of extensions keeps the grouping order deterministic.
        language_by_extension: dict[str, str | None] = {}
        lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
        for file in files:
            ext = file.extension
            if ext not in language_by_extension:
                try:
                    language_by_extension[ext] = (
                        LanguageMapping.get_language_for_extension(ext)
                    )
                except ValueError as e:
                    self._log.debug("Skipping", error=str(e))
                    language_by_extension[ext] = None
            lang = language_by_extension[ext]
            if lang is not None:
                lang_files_map[lang].append(file)

        # Extract snippets
        # NOTE(review): a fresh Slicer is created here although one is injected
        # as ``self.slicer`` — confirm whether the injected instance should be
        # used instead.
        all_snippets: list[SnippetV2] = []
        slicer = Slicer()
        await step.set_total(len(lang_files_map))
        for i, (lang, lang_files) in enumerate(lang_files_map.items()):
            await step.set_current(i, f"Extracting snippets for {lang}")
            snippets = slicer.extract_snippets_from_git_files(
                lang_files, language=lang
            )
            all_snippets.extend(snippets)

        # Deduplicate snippets by SHA before saving to prevent constraint
        # violations; for equal SHAs the last snippet seen wins.
        unique_snippets: dict[str, SnippetV2] = {
            snippet.sha: snippet for snippet in all_snippets
        }
        deduplicated_snippets = list(unique_snippets.values())

        commit_short = commit.commit_sha[:8]
        self._log.info(
            f"Extracted {len(all_snippets)} snippets, "
            f"deduplicated to {len(deduplicated_snippets)} for {commit_short}"
        )
        await self.snippet_repository.save_snippets(
            commit.commit_sha, deduplicated_snippets
        )
372
+
373
async def process_bm25_index(self, repository_id: int, commit_sha: str) -> None:
    """Build the BM25 keyword index for the snippets of one commit.

    Args:
        repository_id: ID of the repository, used for progress tracking.
        commit_sha: SHA of the commit whose snippets are indexed.

    """
    tracker = self.operation.create_child(
        TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    )
    async with tracker:
        commit_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )

        # Snippets without an ID are left out of the index request.
        documents = []
        for item in commit_snippets:
            if not item.id:
                continue
            documents.append(Document(snippet_id=item.id, text=item.content))

        await self.bm25_service.index_documents(IndexRequest(documents=documents))
391
+
392
async def process_code_embeddings(
    self, repository_id: int, commit_sha: str
) -> None:
    """Create code embeddings for snippets of a commit that lack them.

    Args:
        repository_id: ID of the repository, used for progress tracking.
        commit_sha: SHA of the commit whose snippets are embedded.

    """
    tracker = self.operation.create_child(
        TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    )
    async with tracker as step:
        commit_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )
        pending = await self._new_snippets_for_type(
            commit_snippets, EmbeddingType.CODE
        )
        if not pending:
            await step.skip("All snippets already have code embeddings")
            return

        await step.set_total(len(pending))
        done = 0

        # Snippets without an ID are left out of the embedding request.
        documents = []
        for item in pending:
            if item.id:
                documents.append(Document(snippet_id=item.id, text=item.content))

        request = IndexRequest(documents=documents)
        async for batch in self.code_search_service.index_documents(request):
            done += len(batch)
            await step.set_current(done, "Creating code embeddings for commit")
424
+
425
async def process_enrich(self, repository_id: int, commit_sha: str) -> None:
    """Add a summary enrichment to every snippet of a commit that lacks one.

    Each snippet is saved as soon as its summary arrives, so progress is
    persisted incrementally.

    Args:
        repository_id: ID of the repository, used for progress tracking.
        commit_sha: SHA of the commit whose snippets are enriched.

    """
    async with self.operation.create_child(
        TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        all_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )

        # Find snippets without a summary enrichment.
        # ``any`` replaces the earlier ``not next(<generator>)``: that form
        # raised StopIteration (a RuntimeError once inside a coroutine) for a
        # snippet that had enrichments but none of type SUMMARIZATION.
        # ``or ()`` covers a snippet with no enrichments at all.
        snippets_without_summary = [
            snippet
            for snippet in all_snippets
            if not any(
                enrichment.type == EnrichmentType.SUMMARIZATION
                for enrichment in snippet.enrichments or ()
            )
        ]
        if not snippets_without_summary:
            await step.skip("All snippets already have a summary enrichment")
            return

        # Enrich snippets
        await step.set_total(len(snippets_without_summary))
        snippet_map = {
            snippet.id: snippet
            for snippet in snippets_without_summary
            if snippet.id
        }

        enrichment_request = EnrichmentIndexRequest(
            requests=[
                EnrichmentRequest(snippet_id=snippet_id, text=snippet.content)
                for snippet_id, snippet in snippet_map.items()
            ]
        )

        processed = 0
        async for result in self.enrichment_service.enrich_documents(
            enrichment_request
        ):
            snippet = snippet_map[result.snippet_id]
            snippet.enrichments.append(
                Enrichment(type=EnrichmentType.SUMMARIZATION, content=result.text)
            )

            await self.snippet_repository.save_snippets(commit_sha, [snippet])

            processed += 1
            await step.set_current(processed, "Enriching snippets for commit")
479
+
480
async def process_summary_embeddings(
    self, repository_id: int, commit_sha: str
) -> None:
    """Create text embeddings from the summaries of a commit's snippets.

    Only snippets that have no text embedding yet and that carry a non-empty
    summary enrichment are embedded.

    Args:
        repository_id: ID of the repository, used for progress tracking.
        commit_sha: SHA of the commit whose snippet summaries are embedded.

    """
    async with self.operation.create_child(
        TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)

        new_snippets = await self._new_snippets_for_type(
            snippets, EmbeddingType.TEXT
        )
        if not new_snippets:
            await step.skip("All snippets already have text embeddings")
            return

        def _summary_from_enrichments(enrichments: list[Enrichment]) -> str:
            # The "" default keeps ``next`` from raising StopIteration when
            # there are enrichments but none of them is a summary.
            return next(
                (
                    enrichment.content
                    for enrichment in enrichments or ()
                    if enrichment.type == EnrichmentType.SUMMARIZATION
                ),
                "",
            )

        # Build the map from ``new_snippets`` (not every snippet of the
        # commit) so existing embeddings are not recomputed, and leave out
        # snippets with no summary text, which have nothing to embed.
        snippet_summary_map = {}
        for snippet in new_snippets:
            if not snippet.id:
                continue
            summary = _summary_from_enrichments(snippet.enrichments)
            if summary:
                snippet_summary_map[snippet.id] = summary
        if not snippet_summary_map:
            await step.skip("No snippets with summaries to create text embeddings")
            return

        # The total reflects the documents actually sent for embedding.
        await step.set_total(len(snippet_summary_map))
        processed = 0

        documents_with_summaries = [
            Document(snippet_id=snippet_id, text=snippet_summary)
            for snippet_id, snippet_summary in snippet_summary_map.items()
        ]
        async for result in self.text_search_service.index_documents(
            IndexRequest(documents=documents_with_summaries)
        ):
            processed += len(result)
            await step.set_current(processed, "Creating text embeddings for commit")
528
+
529
async def _new_snippets_for_type(
    self, all_snippets: list[SnippetV2], embedding_type: EmbeddingType
) -> list[SnippetV2]:
    """Return the snippets that have no stored embedding of the given type.

    Args:
        all_snippets: Candidate snippets.
        embedding_type: Embedding type to look up.

    Returns:
        The snippets from ``all_snippets``, in their original order, whose ID
        matches no embedding returned by the embedding repository.

    """
    candidate_ids = [candidate.id for candidate in all_snippets]
    stored = await self.embedding_repository.list_embeddings_by_snippet_ids_and_type(
        candidate_ids, embedding_type
    )
    # Only membership matters, so the IDs of stored embeddings go into a set.
    embedded_ids = {record.snippet_id for record in stored}

    missing = []
    for candidate in all_snippets:
        if candidate.id not in embedded_ids:
            missing.append(candidate)
    return missing
@@ -8,14 +8,11 @@ from datetime import UTC, datetime
8
8
  import structlog
9
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
10
 
11
- from kodit.application.factories.code_indexing_factory import (
12
- create_code_indexing_application_service,
13
- )
14
11
  from kodit.application.factories.reporting_factory import create_noop_operation
12
+ from kodit.application.factories.server_factory import ServerFactory
15
13
  from kodit.application.services.reporting import ProgressTracker
16
14
  from kodit.config import AppContext
17
15
  from kodit.domain.entities import Task
18
- from kodit.domain.value_objects import TaskType
19
16
  from kodit.infrastructure.sqlalchemy.task_repository import create_task_repository
20
17
 
21
18
 
@@ -30,10 +27,12 @@ class IndexingWorkerService:
30
27
  self,
31
28
  app_context: AppContext,
32
29
  session_factory: Callable[[], AsyncSession],
30
+ server_factory: ServerFactory,
33
31
  ) -> None:
34
32
  """Initialize the indexing worker service."""
35
33
  self.app_context = app_context
36
34
  self.session_factory = session_factory
35
+ self.server_factory = server_factory
37
36
  self._worker_task: asyncio.Task | None = None
38
37
  self._shutdown_event = asyncio.Event()
39
38
  self.task_repository = create_task_repository(session_factory)
@@ -45,7 +44,7 @@ class IndexingWorkerService:
45
44
  self._running = True
46
45
 
47
46
  # Start single worker task
48
- self._worker_task = asyncio.create_task(self._worker_loop(operation))
47
+ self._worker_task = asyncio.create_task(self._worker_loop())
49
48
 
50
49
  self.log.info(
51
50
  "Indexing worker started",
@@ -63,18 +62,20 @@ class IndexingWorkerService:
63
62
 
64
63
  self.log.info("Indexing worker stopped")
65
64
 
66
- async def _worker_loop(self, operation: ProgressTracker) -> None:
65
+ async def _worker_loop(self) -> None:
67
66
  self.log.debug("Worker loop started")
68
67
 
69
68
  while not self._shutdown_event.is_set():
70
69
  try:
71
70
  async with self.session_factory() as session:
72
- task = await self.task_repository.take()
71
+ task = await self.task_repository.next()
73
72
  await session.commit()
74
73
 
75
74
  # If there's a task, process it in a new thread
76
75
  if task:
77
- await self._process_task(task, operation)
76
+ await self._process_task(task)
77
+ # Only remove the task if it was processed successfully
78
+ await self.task_repository.remove(task)
78
79
  continue
79
80
 
80
81
  # If no task, sleep for a bit
@@ -90,8 +91,8 @@ class IndexingWorkerService:
90
91
 
91
92
  self.log.info("Worker loop stopped")
92
93
 
93
- async def _process_task(self, task: Task, operation: ProgressTracker) -> None:
94
- """Process a single task."""
94
+ async def _process_task(self, task: Task) -> None:
95
+ """Process a task based on its type."""
95
96
  self.log.info(
96
97
  "Processing task",
97
98
  task_id=task.id,
@@ -105,16 +106,8 @@ class IndexingWorkerService:
105
106
  asyncio.set_event_loop(loop)
106
107
 
107
108
  try:
108
- # Process based on task type (currently only INDEX_UPDATE is supported)
109
- if task.type is TaskType.INDEX_UPDATE:
110
- await self._process_index_update(task, operation)
111
- else:
112
- self.log.warning(
113
- "Unknown task type",
114
- task_id=task.id,
115
- task_type=task.type,
116
- )
117
- return
109
+ commit_service = self.server_factory.commit_indexing_application_service()
110
+ await commit_service.run_task(task)
118
111
  finally:
119
112
  loop.close()
120
113
 
@@ -124,27 +117,3 @@ class IndexingWorkerService:
124
117
  task_id=task.id,
125
118
  duration_seconds=duration,
126
119
  )
127
-
128
- async def _process_index_update(
129
- self, task: Task, operation: ProgressTracker
130
- ) -> None:
131
- """Process index update/sync task."""
132
- index_id = task.payload.get("index_id")
133
- if not index_id:
134
- raise ValueError("Missing index_id in task payload")
135
-
136
- # Create a fresh database connection for this thread's event loop
137
- db = await self.app_context.new_db(run_migrations=True)
138
- try:
139
- service = create_code_indexing_application_service(
140
- app_context=self.app_context,
141
- session_factory=self.session_factory,
142
- operation=operation,
143
- )
144
- index = await service.index_repository.get(index_id)
145
- if not index:
146
- raise ValueError(f"Index not found: {index_id}")
147
-
148
- await service.run_index(index)
149
- finally:
150
- await db.close()