kodit-0.5.4-py3-none-any.whl → kodit-0.5.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (54)
  1. kodit/_version.py +2 -2
  2. kodit/application/factories/server_factory.py +54 -32
  3. kodit/application/services/code_search_application_service.py +89 -12
  4. kodit/application/services/commit_indexing_application_service.py +314 -195
  5. kodit/application/services/enrichment_query_service.py +274 -43
  6. kodit/application/services/indexing_worker_service.py +1 -1
  7. kodit/application/services/queue_service.py +15 -10
  8. kodit/application/services/sync_scheduler.py +2 -1
  9. kodit/domain/enrichments/architecture/architecture.py +1 -1
  10. kodit/domain/enrichments/architecture/physical/physical.py +1 -1
  11. kodit/domain/enrichments/development/development.py +1 -1
  12. kodit/domain/enrichments/development/snippet/snippet.py +12 -5
  13. kodit/domain/enrichments/enrichment.py +31 -4
  14. kodit/domain/enrichments/usage/api_docs.py +1 -1
  15. kodit/domain/enrichments/usage/usage.py +1 -1
  16. kodit/domain/entities/git.py +30 -25
  17. kodit/domain/factories/git_repo_factory.py +20 -5
  18. kodit/domain/protocols.py +56 -125
  19. kodit/domain/services/embedding_service.py +14 -16
  20. kodit/domain/services/git_repository_service.py +60 -38
  21. kodit/domain/services/git_service.py +18 -11
  22. kodit/domain/tracking/resolution_service.py +6 -16
  23. kodit/domain/value_objects.py +2 -9
  24. kodit/infrastructure/api/v1/dependencies.py +12 -3
  25. kodit/infrastructure/api/v1/query_params.py +27 -0
  26. kodit/infrastructure/api/v1/routers/commits.py +91 -85
  27. kodit/infrastructure/api/v1/routers/repositories.py +53 -37
  28. kodit/infrastructure/api/v1/routers/search.py +1 -1
  29. kodit/infrastructure/api/v1/schemas/enrichment.py +14 -0
  30. kodit/infrastructure/api/v1/schemas/repository.py +1 -1
  31. kodit/infrastructure/slicing/api_doc_extractor.py +0 -2
  32. kodit/infrastructure/sqlalchemy/embedding_repository.py +44 -34
  33. kodit/infrastructure/sqlalchemy/enrichment_association_repository.py +73 -0
  34. kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +116 -97
  35. kodit/infrastructure/sqlalchemy/entities.py +12 -116
  36. kodit/infrastructure/sqlalchemy/git_branch_repository.py +52 -244
  37. kodit/infrastructure/sqlalchemy/git_commit_repository.py +35 -324
  38. kodit/infrastructure/sqlalchemy/git_file_repository.py +70 -0
  39. kodit/infrastructure/sqlalchemy/git_repository.py +60 -230
  40. kodit/infrastructure/sqlalchemy/git_tag_repository.py +53 -240
  41. kodit/infrastructure/sqlalchemy/query.py +331 -0
  42. kodit/infrastructure/sqlalchemy/repository.py +203 -0
  43. kodit/infrastructure/sqlalchemy/task_repository.py +79 -58
  44. kodit/infrastructure/sqlalchemy/task_status_repository.py +45 -52
  45. kodit/migrations/versions/4b1a3b2c8fa5_refactor_git_tracking.py +190 -0
  46. {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/METADATA +1 -1
  47. {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/RECORD +50 -48
  48. kodit/infrastructure/mappers/enrichment_mapper.py +0 -83
  49. kodit/infrastructure/mappers/git_mapper.py +0 -193
  50. kodit/infrastructure/mappers/snippet_mapper.py +0 -104
  51. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +0 -479
  52. {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/WHEEL +0 -0
  53. {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/entry_points.txt +0 -0
  54. {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/licenses/LICENSE +0 -0
@@ -2,30 +2,45 @@
2
2
 
3
3
  from collections import defaultdict
4
4
  from pathlib import Path
5
+ from typing import TYPE_CHECKING
5
6
 
6
7
  import structlog
7
8
  from pydantic import AnyUrl
8
9
 
9
10
  from kodit.application.services.queue_service import QueueService
10
11
  from kodit.application.services.reporting import ProgressTracker
12
+
13
+ if TYPE_CHECKING:
14
+ from kodit.application.services.enrichment_query_service import (
15
+ EnrichmentQueryService,
16
+ )
11
17
  from kodit.domain.enrichments.architecture.physical.physical import (
12
18
  PhysicalArchitectureEnrichment,
13
19
  )
20
+ from kodit.domain.enrichments.development.snippet.snippet import (
21
+ SnippetEnrichment,
22
+ SnippetEnrichmentSummary,
23
+ )
14
24
  from kodit.domain.enrichments.enricher import Enricher
25
+ from kodit.domain.enrichments.enrichment import (
26
+ CommitEnrichmentAssociation,
27
+ EnrichmentAssociation,
28
+ EnrichmentV2,
29
+ )
15
30
  from kodit.domain.enrichments.request import (
16
31
  EnrichmentRequest as GenericEnrichmentRequest,
17
32
  )
18
- from kodit.domain.enrichments.usage.api_docs import ENRICHMENT_SUBTYPE_API_DOCS
19
- from kodit.domain.enrichments.usage.usage import ENRICHMENT_TYPE_USAGE
20
33
  from kodit.domain.entities import Task
21
- from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2
34
+ from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2, TrackingType
22
35
  from kodit.domain.factories.git_repo_factory import GitRepoFactory
23
36
  from kodit.domain.protocols import (
37
+ EnrichmentAssociationRepository,
38
+ EnrichmentV2Repository,
24
39
  GitBranchRepository,
25
40
  GitCommitRepository,
41
+ GitFileRepository,
26
42
  GitRepoRepository,
27
43
  GitTagRepository,
28
- SnippetRepositoryV2,
29
44
  )
30
45
  from kodit.domain.services.bm25_service import BM25DomainService
31
46
  from kodit.domain.services.embedding_service import EmbeddingDomainService
@@ -41,8 +56,6 @@ from kodit.domain.services.physical_architecture_service import (
41
56
  from kodit.domain.value_objects import (
42
57
  DeleteRequest,
43
58
  Document,
44
- Enrichment,
45
- EnrichmentType,
46
59
  IndexRequest,
47
60
  LanguageMapping,
48
61
  PrescribedOperations,
@@ -52,13 +65,17 @@ from kodit.domain.value_objects import (
52
65
  )
53
66
  from kodit.infrastructure.slicing.api_doc_extractor import APIDocExtractor
54
67
  from kodit.infrastructure.slicing.slicer import Slicer
68
+ from kodit.infrastructure.sqlalchemy import entities as db_entities
55
69
  from kodit.infrastructure.sqlalchemy.embedding_repository import (
56
70
  SqlAlchemyEmbeddingRepository,
57
71
  )
58
- from kodit.infrastructure.sqlalchemy.enrichment_v2_repository import (
59
- EnrichmentV2Repository,
60
- )
61
72
  from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
73
+ from kodit.infrastructure.sqlalchemy.query import (
74
+ EnrichmentAssociationQueryBuilder,
75
+ FilterOperator,
76
+ GitFileQueryBuilder,
77
+ QueryBuilder,
78
+ )
62
79
 
63
80
  SUMMARIZATION_SYSTEM_PROMPT = """
64
81
  You are a professional software developer. You will be given a snippet of code.
@@ -71,15 +88,14 @@ class CommitIndexingApplicationService:
71
88
 
72
89
  def __init__( # noqa: PLR0913
73
90
  self,
74
- snippet_v2_repository: SnippetRepositoryV2,
75
91
  repo_repository: GitRepoRepository,
76
92
  git_commit_repository: GitCommitRepository,
93
+ git_file_repository: GitFileRepository,
77
94
  git_branch_repository: GitBranchRepository,
78
95
  git_tag_repository: GitTagRepository,
79
96
  operation: ProgressTracker,
80
97
  scanner: GitRepositoryScanner,
81
98
  cloner: RepositoryCloner,
82
- snippet_repository: SnippetRepositoryV2,
83
99
  slicer: Slicer,
84
100
  queue: QueueService,
85
101
  bm25_service: BM25DomainService,
@@ -87,28 +103,20 @@ class CommitIndexingApplicationService:
87
103
  text_search_service: EmbeddingDomainService,
88
104
  embedding_repository: SqlAlchemyEmbeddingRepository,
89
105
  architecture_service: PhysicalArchitectureService,
90
- enrichment_v2_repository: EnrichmentV2Repository,
91
106
  enricher_service: Enricher,
107
+ enrichment_v2_repository: EnrichmentV2Repository,
108
+ enrichment_association_repository: EnrichmentAssociationRepository,
109
+ enrichment_query_service: "EnrichmentQueryService",
92
110
  ) -> None:
93
- """Initialize the commit indexing application service.
94
-
95
- Args:
96
- commit_index_repository: Repository for commit index data.
97
- snippet_v2_repository: Repository for snippet data.
98
- repo_repository: Repository for Git repository data.
99
- domain_indexer: Domain service for indexing operations.
100
- operation: Progress tracker for reporting operations.
101
-
102
- """
103
- self.snippet_repository = snippet_v2_repository
111
+ """Initialize the commit indexing application service."""
104
112
  self.repo_repository = repo_repository
105
113
  self.git_commit_repository = git_commit_repository
114
+ self.git_file_repository = git_file_repository
106
115
  self.git_branch_repository = git_branch_repository
107
116
  self.git_tag_repository = git_tag_repository
108
117
  self.operation = operation
109
118
  self.scanner = scanner
110
119
  self.cloner = cloner
111
- self.snippet_repository = snippet_repository
112
120
  self.slicer = slicer
113
121
  self.queue = queue
114
122
  self.bm25_service = bm25_service
@@ -117,7 +125,9 @@ class CommitIndexingApplicationService:
117
125
  self.embedding_repository = embedding_repository
118
126
  self.architecture_service = architecture_service
119
127
  self.enrichment_v2_repository = enrichment_v2_repository
128
+ self.enrichment_association_repository = enrichment_association_repository
120
129
  self.enricher_service = enricher_service
130
+ self.enrichment_query_service = enrichment_query_service
121
131
  self._log = structlog.get_logger(__name__)
122
132
 
123
133
  async def create_git_repository(self, remote_uri: AnyUrl) -> GitRepo:
@@ -137,7 +147,7 @@ class CommitIndexingApplicationService:
137
147
 
138
148
  async def delete_git_repository(self, repo_id: int) -> bool:
139
149
  """Delete a Git repository by ID."""
140
- repo = await self.repo_repository.get_by_id(repo_id)
150
+ repo = await self.repo_repository.get(repo_id)
141
151
  if not repo:
142
152
  return False
143
153
 
@@ -193,7 +203,7 @@ class CommitIndexingApplicationService:
193
203
  trackable_type=TrackableType.KODIT_REPOSITORY,
194
204
  trackable_id=repository_id,
195
205
  ):
196
- repo = await self.repo_repository.get_by_id(repository_id)
206
+ repo = await self.repo_repository.get(repository_id)
197
207
  repo.cloned_path = await self.cloner.clone_repository(repo.remote_uri)
198
208
  await self.repo_repository.save(repo)
199
209
 
@@ -204,45 +214,57 @@ class CommitIndexingApplicationService:
204
214
  trackable_type=TrackableType.KODIT_REPOSITORY,
205
215
  trackable_id=repository_id,
206
216
  ) as step:
207
- await step.set_total(6)
208
- repo = await self.repo_repository.get_by_id(repository_id)
217
+ await step.set_total(7)
218
+ repo = await self.repo_repository.get(repository_id)
209
219
  if not repo.cloned_path:
210
220
  raise ValueError(f"Repository {repository_id} has never been cloned")
211
221
 
212
222
  # Scan the repository to get all metadata
213
223
  await step.set_current(0, "Scanning repository")
214
- scan_result = await self.scanner.scan_repository(repo.cloned_path)
224
+ scan_result = await self.scanner.scan_repository(
225
+ repo.cloned_path, repository_id
226
+ )
215
227
 
216
228
  # Update repo with scan result (this sets num_commits, num_branches, etc.)
217
229
  await step.set_current(1, "Updating repository with scan result")
218
230
  repo.update_with_scan_result(scan_result)
219
231
  await self.repo_repository.save(repo)
220
232
 
221
- # Save commits, branches, and tags to their dedicated repositories
222
233
  await step.set_current(2, "Saving commits")
223
- if scan_result.all_commits:
224
- await self.git_commit_repository.save_bulk(
225
- scan_result.all_commits, repository_id
226
- )
234
+ await self.git_commit_repository.save_bulk(scan_result.all_commits)
235
+
236
+ await step.set_current(3, "Saving files")
237
+ await self.git_file_repository.save_bulk(scan_result.all_files)
227
238
 
228
- await step.set_current(3, "Saving branches")
239
+ await step.set_current(4, "Saving branches")
229
240
  if scan_result.branches:
230
241
  await self.git_branch_repository.save_bulk(
231
- scan_result.branches, repository_id
242
+ scan_result.branches,
232
243
  )
233
244
 
234
- await step.set_current(4, "Saving tags")
245
+ await step.set_current(5, "Saving tags")
235
246
  if scan_result.all_tags:
236
247
  await self.git_tag_repository.save_bulk(
237
- scan_result.all_tags, repository_id
248
+ scan_result.all_tags,
238
249
  )
239
250
 
240
- await step.set_current(5, "Enqueuing commit indexing tasks")
241
- if not repo.tracking_branch:
251
+ await step.set_current(6, "Enqueuing commit indexing tasks")
252
+ if not repo.tracking_config.name:
242
253
  raise ValueError(f"Repository {repository_id} has no tracking branch")
243
- commit_sha = repo.tracking_branch.head_commit.commit_sha
244
- if not commit_sha:
245
- raise ValueError(f"Repository {repository_id} has no head commit")
254
+ if repo.tracking_config.type == TrackingType.BRANCH.value:
255
+ branch = await self.git_branch_repository.get_by_name(
256
+ repo.tracking_config.name, repository_id
257
+ )
258
+ commit_sha = branch.head_commit_sha
259
+ elif repo.tracking_config.type == TrackingType.TAG.value:
260
+ tag = await self.git_tag_repository.get_by_name(
261
+ repo.tracking_config.name, repository_id
262
+ )
263
+ commit_sha = tag.target_commit_sha
264
+ elif repo.tracking_config.type == TrackingType.COMMIT_SHA.value:
265
+ commit_sha = repo.tracking_config.name
266
+ else:
267
+ raise ValueError(f"Unknown tracking type: {repo.tracking_config.type}")
246
268
 
247
269
  await self.queue.enqueue_tasks(
248
270
  tasks=PrescribedOperations.INDEX_COMMIT,
@@ -250,6 +272,74 @@ class CommitIndexingApplicationService:
250
272
  payload={"commit_sha": commit_sha, "repository_id": repository_id},
251
273
  )
252
274
 
275
+ async def _delete_snippet_enrichments_for_commits(
276
+ self, commit_shas: list[str]
277
+ ) -> None:
278
+ """Delete snippet enrichments and their indices for commits."""
279
+ # Get all snippet enrichment IDs for these commits
280
+ all_snippet_enrichment_ids = []
281
+ for commit_sha in commit_shas:
282
+ snippet_enrichments = (
283
+ await self.enrichment_query_service.get_all_snippets_for_commit(
284
+ commit_sha
285
+ )
286
+ )
287
+ enrichment_ids = [
288
+ enrichment.id for enrichment in snippet_enrichments if enrichment.id
289
+ ]
290
+ all_snippet_enrichment_ids.extend(enrichment_ids)
291
+
292
+ if not all_snippet_enrichment_ids:
293
+ return
294
+
295
+ # Delete from BM25 and embedding indices
296
+ snippet_id_strings = [str(sid) for sid in all_snippet_enrichment_ids]
297
+ delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
298
+ await self.bm25_service.delete_documents(delete_request)
299
+
300
+ for snippet_id in all_snippet_enrichment_ids:
301
+ await self.embedding_repository.delete_embeddings_by_snippet_id(
302
+ str(snippet_id)
303
+ )
304
+
305
+ # Delete enrichment associations for snippets
306
+ await self.enrichment_association_repository.delete_by_query(
307
+ QueryBuilder()
308
+ .filter("entity_type", FilterOperator.EQ, "snippet_v2")
309
+ .filter("entity_id", FilterOperator.IN, snippet_id_strings)
310
+ )
311
+
312
+ # Delete the enrichments themselves
313
+ await self.enrichment_v2_repository.delete_by_query(
314
+ QueryBuilder().filter("id", FilterOperator.IN, all_snippet_enrichment_ids)
315
+ )
316
+
317
+ async def _delete_commit_enrichments(self, commit_shas: list[str]) -> None:
318
+ """Delete commit-level enrichments for commits."""
319
+ existing_enrichment_associations = (
320
+ await self.enrichment_association_repository.find(
321
+ QueryBuilder()
322
+ .filter(
323
+ "entity_type",
324
+ FilterOperator.EQ,
325
+ db_entities.GitCommit.__tablename__,
326
+ )
327
+ .filter("entity_id", FilterOperator.IN, commit_shas)
328
+ )
329
+ )
330
+ enrichment_ids = [a.enrichment_id for a in existing_enrichment_associations]
331
+ if not enrichment_ids:
332
+ return
333
+
334
+ # Delete associations first
335
+ await self.enrichment_association_repository.delete_by_query(
336
+ QueryBuilder().filter("enrichment_id", FilterOperator.IN, enrichment_ids)
337
+ )
338
+ # Then delete enrichments
339
+ await self.enrichment_v2_repository.delete_by_query(
340
+ QueryBuilder().filter("id", FilterOperator.IN, enrichment_ids)
341
+ )
342
+
253
343
  async def process_delete_repo(self, repository_id: int) -> None:
254
344
  """Delete a repository."""
255
345
  async with self.operation.create_child(
@@ -257,61 +347,34 @@ class CommitIndexingApplicationService:
257
347
  trackable_type=TrackableType.KODIT_REPOSITORY,
258
348
  trackable_id=repository_id,
259
349
  ):
260
- repo = await self.repo_repository.get_by_id(repository_id)
350
+ repo = await self.repo_repository.get(repository_id)
261
351
  if not repo:
262
352
  raise ValueError(f"Repository {repository_id} not found")
263
353
 
264
- # Get all commit SHAs for this repository first (needed for cleanup)
265
- commits = await self.git_commit_repository.get_by_repo_id(repository_id)
354
+ # Get all commit SHAs for this repository
355
+ commits = await self.git_commit_repository.find(
356
+ QueryBuilder().filter("repo_id", FilterOperator.EQ, repository_id)
357
+ )
266
358
  commit_shas = [commit.commit_sha for commit in commits]
267
359
 
268
- # Step 1: Get all snippet IDs that are associated with these commits FIRST
269
- # (before deleting the associations)
270
- all_snippet_ids = []
360
+ # Delete all enrichments and their indices
271
361
  if commit_shas:
272
- for commit_sha in commit_shas:
273
- snippets = await self.snippet_repository.get_snippets_for_commit(
274
- commit_sha
275
- )
276
- all_snippet_ids.extend(
277
- [snippet.id for snippet in snippets if snippet.id]
278
- )
362
+ await self._delete_snippet_enrichments_for_commits(commit_shas)
363
+ await self._delete_commit_enrichments(commit_shas)
279
364
 
280
- # Step 2: Delete from BM25 and embedding indices
281
- if all_snippet_ids:
282
- # Convert to strings as DeleteRequest expects list[str]
283
- snippet_id_strings = [str(snippet_id) for snippet_id in all_snippet_ids]
284
- delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
285
- await self.bm25_service.delete_documents(delete_request)
286
-
287
- # Delete embeddings for each snippet
288
- for snippet_id in all_snippet_ids:
289
- await self.embedding_repository.delete_embeddings_by_snippet_id(
290
- snippet_id
291
- )
292
-
293
- # Step 3: Delete enrichments for all commits
294
- if commit_shas:
295
- await self.enrichment_v2_repository.bulk_delete_enrichments(
296
- entity_type="git_commit",
297
- entity_ids=commit_shas,
298
- )
299
-
300
- # Step 4: Delete snippet associations for all commits
301
- for commit_sha in commit_shas:
302
- await self.snippet_repository.delete_snippets_for_commit(commit_sha)
303
-
304
- # Step 5: Delete branches (they reference commits via head_commit_sha)
365
+ # Delete branches, tags, files, commits, and repository
305
366
  await self.git_branch_repository.delete_by_repo_id(repository_id)
306
-
307
- # Step 6: Delete tags (they reference commits via target_commit_sha)
308
367
  await self.git_tag_repository.delete_by_repo_id(repository_id)
309
368
 
310
- # Step 7: Delete commits and their files
311
- await self.git_commit_repository.delete_by_repo_id(repository_id)
369
+ for commit_sha in commit_shas:
370
+ await self.git_file_repository.delete_by_commit_sha(commit_sha)
371
+
372
+ await self.git_commit_repository.delete_by_query(
373
+ QueryBuilder().filter("repo_id", FilterOperator.EQ, repository_id)
374
+ )
312
375
 
313
- # Step 8: Finally delete the repository
314
- await self.repo_repository.delete(repo.sanitized_remote_uri)
376
+ if repo.id:
377
+ await self.repo_repository.delete(repo)
315
378
 
316
379
  async def process_snippets_for_commit(
317
380
  self, repository_id: int, commit_sha: str
@@ -322,16 +385,16 @@ class CommitIndexingApplicationService:
322
385
  trackable_type=TrackableType.KODIT_REPOSITORY,
323
386
  trackable_id=repository_id,
324
387
  ) as step:
325
- # Have we already processed this commit? If yes, skip.
326
- if await self.snippet_repository.get_snippets_for_commit(commit_sha):
327
- await step.skip("All snippets already extracted for commit")
388
+ # Find existing snippet enrichments for this commit
389
+ if await self.enrichment_query_service.has_snippets_for_commit(commit_sha):
390
+ await step.skip("Snippets already extracted for commit")
328
391
  return
329
392
 
330
- commit = await self.git_commit_repository.get_by_sha(commit_sha)
393
+ commit = await self.git_commit_repository.get(commit_sha)
331
394
 
332
395
  # Load files on demand for snippet extraction (performance optimization)
333
396
  # Instead of using commit.files (which may be empty), load files directly
334
- repo = await self.repo_repository.get_by_id(repository_id)
397
+ repo = await self.repo_repository.get(repository_id)
335
398
  if not repo.cloned_path:
336
399
  raise ValueError(f"Repository {repository_id} has never been cloned")
337
400
 
@@ -350,6 +413,7 @@ class CommitIndexingApplicationService:
350
413
  absolute_path = str(repo.cloned_path / file_data["path"])
351
414
 
352
415
  git_file = GitFile(
416
+ commit_sha=commit.commit_sha,
353
417
  created_at=file_data.get("created_at", commit.date),
354
418
  blob_sha=file_data["blob_sha"],
355
419
  path=absolute_path, # Use absolute path for file reading
@@ -395,8 +459,27 @@ class CommitIndexingApplicationService:
395
459
  f"Extracted {len(all_snippets)} snippets, "
396
460
  f"deduplicated to {len(deduplicated_snippets)} for {commit_short}"
397
461
  )
398
- await self.snippet_repository.save_snippets(
399
- commit.commit_sha, deduplicated_snippets
462
+
463
+ saved_enrichments = await self.enrichment_v2_repository.save_bulk(
464
+ [
465
+ SnippetEnrichment(content=snippet.content)
466
+ for snippet in deduplicated_snippets
467
+ ]
468
+ )
469
+ saved_associations = await self.enrichment_association_repository.save_bulk(
470
+ [
471
+ EnrichmentAssociation(
472
+ enrichment_id=enrichment.id,
473
+ entity_type=db_entities.GitCommit.__tablename__,
474
+ entity_id=commit_sha,
475
+ )
476
+ for enrichment in saved_enrichments
477
+ if enrichment.id
478
+ ]
479
+ )
480
+ self._log.info(
481
+ f"Saved {len(saved_enrichments)} snippet enrichments and "
482
+ f"{len(saved_associations)} associations for commit {commit_sha}"
400
483
  )
401
484
 
402
485
  async def process_bm25_index(self, repository_id: int, commit_sha: str) -> None:
@@ -406,13 +489,16 @@ class CommitIndexingApplicationService:
406
489
  trackable_type=TrackableType.KODIT_REPOSITORY,
407
490
  trackable_id=repository_id,
408
491
  ):
409
- snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)
410
-
492
+ existing_enrichments = (
493
+ await self.enrichment_query_service.get_all_snippets_for_commit(
494
+ commit_sha
495
+ )
496
+ )
411
497
  await self.bm25_service.index_documents(
412
498
  IndexRequest(
413
499
  documents=[
414
- Document(snippet_id=snippet.id, text=snippet.content)
415
- for snippet in snippets
500
+ Document(snippet_id=str(snippet.id), text=snippet.content)
501
+ for snippet in existing_enrichments
416
502
  if snippet.id
417
503
  ]
418
504
  )
@@ -427,12 +513,14 @@ class CommitIndexingApplicationService:
427
513
  trackable_type=TrackableType.KODIT_REPOSITORY,
428
514
  trackable_id=repository_id,
429
515
  ) as step:
430
- all_snippets = await self.snippet_repository.get_snippets_for_commit(
431
- commit_sha
516
+ existing_enrichments = (
517
+ await self.enrichment_query_service.get_all_snippets_for_commit(
518
+ commit_sha
519
+ )
432
520
  )
433
521
 
434
522
  new_snippets = await self._new_snippets_for_type(
435
- all_snippets, EmbeddingType.CODE
523
+ existing_enrichments, EmbeddingType.CODE
436
524
  )
437
525
  if not new_snippets:
438
526
  await step.skip("All snippets already have code embeddings")
@@ -441,7 +529,7 @@ class CommitIndexingApplicationService:
441
529
  await step.set_total(len(new_snippets))
442
530
  processed = 0
443
531
  documents = [
444
- Document(snippet_id=snippet.id, text=snippet.content)
532
+ Document(snippet_id=str(snippet.id), text=snippet.content)
445
533
  for snippet in new_snippets
446
534
  if snippet.id
447
535
  ]
@@ -458,36 +546,28 @@ class CommitIndexingApplicationService:
458
546
  trackable_type=TrackableType.KODIT_REPOSITORY,
459
547
  trackable_id=repository_id,
460
548
  ) as step:
461
- all_snippets = await self.snippet_repository.get_snippets_for_commit(
462
- commit_sha
463
- )
549
+ if await self.enrichment_query_service.has_summaries_for_commit(commit_sha):
550
+ await step.skip("Summary enrichments already exist for commit")
551
+ return
464
552
 
465
- # Find snippets without a summary enrichment
466
- snippets_without_summary = [
467
- snippet
468
- for snippet in all_snippets
469
- if not snippet.enrichments
470
- or not next(
471
- enrichment
472
- for enrichment in snippet.enrichments
473
- if enrichment.type == EnrichmentType.SUMMARIZATION
553
+ all_snippets = (
554
+ await self.enrichment_query_service.get_all_snippets_for_commit(
555
+ commit_sha
474
556
  )
475
- ]
476
- if not snippets_without_summary:
477
- await step.skip("All snippets already have a summary enrichment")
557
+ )
558
+ if not all_snippets:
559
+ await step.skip("No snippets to enrich")
478
560
  return
479
561
 
480
562
  # Enrich snippets
481
- await step.set_total(len(snippets_without_summary))
563
+ await step.set_total(len(all_snippets))
482
564
  snippet_map = {
483
- snippet.id: snippet
484
- for snippet in snippets_without_summary
485
- if snippet.id
565
+ str(snippet.id): snippet for snippet in all_snippets if snippet.id
486
566
  }
487
567
 
488
568
  enrichment_requests = [
489
569
  GenericEnrichmentRequest(
490
- id=snippet_id,
570
+ id=str(snippet_id),
491
571
  text=snippet.content,
492
572
  system_prompt=SUMMARIZATION_SYSTEM_PROMPT,
493
573
  )
@@ -497,12 +577,27 @@ class CommitIndexingApplicationService:
497
577
  processed = 0
498
578
  async for result in self.enricher_service.enrich(enrichment_requests):
499
579
  snippet = snippet_map[result.id]
500
- snippet.enrichments.append(
501
- Enrichment(type=EnrichmentType.SUMMARIZATION, content=result.text)
580
+ db_summary = await self.enrichment_v2_repository.save(
581
+ SnippetEnrichmentSummary(content=result.text)
582
+ )
583
+ if not db_summary.id:
584
+ raise ValueError(
585
+ f"Failed to save snippet enrichment for commit {commit_sha}"
586
+ )
587
+ await self.enrichment_association_repository.save(
588
+ EnrichmentAssociation(
589
+ enrichment_id=db_summary.id,
590
+ entity_type=db_entities.EnrichmentV2.__tablename__,
591
+ entity_id=str(snippet.id),
592
+ )
593
+ )
594
+ await self.enrichment_association_repository.save(
595
+ EnrichmentAssociation(
596
+ enrichment_id=db_summary.id,
597
+ entity_type=db_entities.GitCommit.__tablename__,
598
+ entity_id=commit_sha,
599
+ )
502
600
  )
503
-
504
- await self.snippet_repository.save_snippets(commit_sha, [snippet])
505
-
506
601
  processed += 1
507
602
  await step.set_current(processed, "Enriching snippets for commit")
508
603
 
@@ -515,40 +610,61 @@ class CommitIndexingApplicationService:
515
610
  trackable_type=TrackableType.KODIT_REPOSITORY,
516
611
  trackable_id=repository_id,
517
612
  ) as step:
518
- snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)
519
-
520
- new_snippets = await self._new_snippets_for_type(
521
- snippets, EmbeddingType.TEXT
613
+ # Get all snippet enrichments for this commit
614
+ all_snippet_enrichments = (
615
+ await self.enrichment_query_service.get_all_snippets_for_commit(
616
+ commit_sha
617
+ )
522
618
  )
523
- if not new_snippets:
524
- await step.skip("All snippets already have text embeddings")
619
+ if not all_snippet_enrichments:
620
+ await step.skip("No snippets to create summary embeddings")
525
621
  return
526
622
 
527
- await step.set_total(len(new_snippets))
528
- processed = 0
623
+ # Get summary enrichments that point to these snippet enrichments
624
+ query = EnrichmentAssociationQueryBuilder.for_enrichment_associations(
625
+ entity_type=db_entities.EnrichmentV2.__tablename__,
626
+ entity_ids=[
627
+ str(snippet.id) for snippet in all_snippet_enrichments if snippet.id
628
+ ],
629
+ )
630
+ summary_enrichment_associations = (
631
+ await self.enrichment_association_repository.find(query)
632
+ )
633
+
634
+ if not summary_enrichment_associations:
635
+ await step.skip("No summary enrichments found for snippets")
636
+ return
529
637
 
530
- def _summary_from_enrichments(enrichments: list[Enrichment]) -> str:
531
- if not enrichments:
532
- return ""
533
- return next(
534
- enrichment.content
535
- for enrichment in enrichments
536
- if enrichment.type == EnrichmentType.SUMMARIZATION
638
+ # Get the actual summary enrichments
639
+ summary_enrichments = await self.enrichment_v2_repository.find(
640
+ QueryBuilder().filter(
641
+ "id",
642
+ FilterOperator.IN,
643
+ [
644
+ association.enrichment_id
645
+ for association in summary_enrichment_associations
646
+ ],
537
647
  )
648
+ )
538
649
 
539
- snippet_summary_map = {
540
- snippet.id: _summary_from_enrichments(snippet.enrichments)
541
- for snippet in snippets
542
- if snippet.id
543
- }
544
- if len(snippet_summary_map) == 0:
545
- await step.skip("No snippets with summaries to create text embeddings")
650
+ # Check if embeddings already exist for these summaries
651
+ new_summaries = await self._new_snippets_for_type(
652
+ summary_enrichments, EmbeddingType.TEXT
653
+ )
654
+ if not new_summaries:
655
+ await step.skip("All snippets already have text embeddings")
546
656
  return
547
657
 
658
+ await step.set_total(len(new_summaries))
659
+ processed = 0
660
+
661
+ # Create documents from the summary enrichments
548
662
  documents_with_summaries = [
549
- Document(snippet_id=snippet_id, text=snippet_summary)
550
- for snippet_id, snippet_summary in snippet_summary_map.items()
663
+ Document(snippet_id=str(summary.id), text=summary.content)
664
+ for summary in new_summaries
665
+ if summary.id
551
666
  ]
667
+
552
668
  async for result in self.text_search_service.index_documents(
553
669
  IndexRequest(documents=documents_with_summaries)
554
670
  ):
@@ -567,23 +683,14 @@ class CommitIndexingApplicationService:
567
683
  await step.set_total(3)
568
684
 
569
685
  # Check if architecture enrichment already exists for this commit
570
- enrichment_repo = self.enrichment_v2_repository
571
- existing_enrichments = await enrichment_repo.enrichments_for_entity_type(
572
- entity_type="git_commit",
573
- entity_ids=[commit_sha],
574
- )
575
-
576
- # Check if architecture enrichment already exists
577
- has_architecture = any(
578
- enrichment.type == "architecture" for enrichment in existing_enrichments
579
- )
580
-
581
- if has_architecture:
686
+ if await self.enrichment_query_service.has_architecture_for_commit(
687
+ commit_sha
688
+ ):
582
689
  await step.skip("Architecture enrichment already exists for commit")
583
690
  return
584
691
 
585
692
  # Get repository path
586
- repo = await self.repo_repository.get_by_id(repository_id)
693
+ repo = await self.repo_repository.get(repository_id)
587
694
  if not repo.cloned_path:
588
695
  raise ValueError(f"Repository {repository_id} has never been cloned")
589
696
 
@@ -610,13 +717,20 @@ class CommitIndexingApplicationService:
610
717
  enriched_content = response.text
611
718
 
612
719
  # Create and save architecture enrichment with enriched content
613
- architecture_enrichment = PhysicalArchitectureEnrichment(
614
- entity_id=commit_sha,
615
- content=enriched_content,
720
+ enrichment = await self.enrichment_v2_repository.save(
721
+ PhysicalArchitectureEnrichment(
722
+ content=enriched_content,
723
+ )
616
724
  )
617
-
618
- await self.enrichment_v2_repository.bulk_save_enrichments(
619
- [architecture_enrichment]
725
+ if not enrichment or not enrichment.id:
726
+ raise ValueError(
727
+ f"Failed to save architecture enrichment for commit {commit_sha}"
728
+ )
729
+ await self.enrichment_association_repository.save(
730
+ CommitEnrichmentAssociation(
731
+ enrichment_id=enrichment.id,
732
+ entity_id=commit_sha,
733
+ )
620
734
  )
621
735
 
622
736
  await step.set_current(3, "Architecture enrichment completed")
@@ -629,34 +743,26 @@ class CommitIndexingApplicationService:
629
743
  trackable_id=repository_id,
630
744
  ) as step:
631
745
  # Check if API docs already exist for this commit
632
- existing_enrichments = (
633
- await self.enrichment_v2_repository.enrichments_for_entity_type(
634
- entity_type="git_commit",
635
- entity_ids=[commit_sha],
636
- )
637
- )
638
-
639
- has_api_docs = any(
640
- e.type == ENRICHMENT_TYPE_USAGE
641
- and e.subtype == ENRICHMENT_SUBTYPE_API_DOCS
642
- for e in existing_enrichments
643
- )
644
-
645
- if has_api_docs:
746
+ if await self.enrichment_query_service.has_api_docs_for_commit(commit_sha):
646
747
  await step.skip("API docs already exist for commit")
647
748
  return
648
749
 
649
750
  # Get repository for metadata
650
- repo = await self.repo_repository.get_by_id(repository_id)
751
+ repo = await self.repo_repository.get(repository_id)
651
752
  if not repo:
652
753
  raise ValueError(f"Repository {repository_id} not found")
653
754
  str(repo.sanitized_remote_uri)
654
755
 
655
- commit = await self.git_commit_repository.get_by_sha(commit_sha)
756
+ files = await self.git_file_repository.find(
757
+ GitFileQueryBuilder().for_commit_sha(commit_sha)
758
+ )
759
+ if not files:
760
+ await step.skip("No files to extract API docs from")
761
+ return
656
762
 
657
763
  # Group files by language
658
764
  lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
659
- for file in commit.files:
765
+ for file in files:
660
766
  try:
661
767
  lang = LanguageMapping.get_language_for_extension(file.extension)
662
768
  except ValueError:
@@ -670,28 +776,41 @@ class CommitIndexingApplicationService:
670
776
  for i, (lang, lang_files) in enumerate(lang_files_map.items()):
671
777
  await step.set_current(i, f"Extracting API docs for {lang}")
672
778
  enrichments = extractor.extract_api_docs(
673
- lang_files,
674
- lang,
675
- commit_sha,
779
+ files=lang_files,
780
+ language=lang,
676
781
  include_private=False,
677
782
  )
678
783
  all_enrichments.extend(enrichments)
679
784
 
680
785
  # Save all enrichments
681
786
  if all_enrichments:
682
- await self.enrichment_v2_repository.bulk_save_enrichments(
683
- all_enrichments
787
+ saved_enrichments = await self.enrichment_v2_repository.save_bulk(
788
+ all_enrichments # type: ignore[arg-type]
789
+ )
790
+ await self.enrichment_association_repository.save_bulk(
791
+ [
792
+ CommitEnrichmentAssociation(
793
+ enrichment_id=enrichment.id,
794
+ entity_id=commit_sha,
795
+ )
796
+ for enrichment in saved_enrichments
797
+ if enrichment.id
798
+ ]
684
799
  )
685
800
 
686
801
  async def _new_snippets_for_type(
687
- self, all_snippets: list[SnippetV2], embedding_type: EmbeddingType
688
- ) -> list[SnippetV2]:
802
+ self, all_snippets: list[EnrichmentV2], embedding_type: EmbeddingType
803
+ ) -> list[EnrichmentV2]:
689
804
  """Get new snippets for a given type."""
690
805
  existing_embeddings = (
691
806
  await self.embedding_repository.list_embeddings_by_snippet_ids_and_type(
692
- [s.id for s in all_snippets], embedding_type
807
+ [str(s.id) for s in all_snippets], embedding_type
693
808
  )
694
809
  )
810
+ # TODO(Phil): Can't do this incrementally yet because like the API, we don't
811
+ # have a unified embedding repository
812
+ if existing_embeddings:
813
+ return []
695
814
  existing_embeddings_by_snippet_id = {
696
815
  embedding.snippet_id: embedding for embedding in existing_embeddings
697
816
  }