kodit 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (64)
  1. kodit/_version.py +2 -2
  2. kodit/app.py +2 -0
  3. kodit/application/factories/server_factory.py +58 -32
  4. kodit/application/services/code_search_application_service.py +89 -12
  5. kodit/application/services/commit_indexing_application_service.py +527 -195
  6. kodit/application/services/enrichment_query_service.py +311 -43
  7. kodit/application/services/indexing_worker_service.py +1 -1
  8. kodit/application/services/queue_service.py +15 -10
  9. kodit/application/services/sync_scheduler.py +2 -1
  10. kodit/domain/enrichments/architecture/architecture.py +1 -1
  11. kodit/domain/enrichments/architecture/database_schema/__init__.py +1 -0
  12. kodit/domain/enrichments/architecture/database_schema/database_schema.py +17 -0
  13. kodit/domain/enrichments/architecture/physical/physical.py +1 -1
  14. kodit/domain/enrichments/development/development.py +1 -1
  15. kodit/domain/enrichments/development/snippet/snippet.py +12 -5
  16. kodit/domain/enrichments/enrichment.py +31 -4
  17. kodit/domain/enrichments/history/__init__.py +1 -0
  18. kodit/domain/enrichments/history/commit_description/__init__.py +1 -0
  19. kodit/domain/enrichments/history/commit_description/commit_description.py +17 -0
  20. kodit/domain/enrichments/history/history.py +18 -0
  21. kodit/domain/enrichments/usage/api_docs.py +1 -1
  22. kodit/domain/enrichments/usage/usage.py +1 -1
  23. kodit/domain/entities/git.py +30 -25
  24. kodit/domain/factories/git_repo_factory.py +20 -5
  25. kodit/domain/protocols.py +60 -125
  26. kodit/domain/services/embedding_service.py +14 -16
  27. kodit/domain/services/git_repository_service.py +60 -38
  28. kodit/domain/services/git_service.py +18 -11
  29. kodit/domain/tracking/resolution_service.py +6 -16
  30. kodit/domain/value_objects.py +6 -9
  31. kodit/infrastructure/api/v1/dependencies.py +12 -3
  32. kodit/infrastructure/api/v1/query_params.py +27 -0
  33. kodit/infrastructure/api/v1/routers/commits.py +91 -85
  34. kodit/infrastructure/api/v1/routers/repositories.py +53 -37
  35. kodit/infrastructure/api/v1/routers/search.py +1 -1
  36. kodit/infrastructure/api/v1/schemas/enrichment.py +14 -0
  37. kodit/infrastructure/api/v1/schemas/repository.py +1 -1
  38. kodit/infrastructure/cloning/git/git_python_adaptor.py +41 -0
  39. kodit/infrastructure/database_schema/__init__.py +1 -0
  40. kodit/infrastructure/database_schema/database_schema_detector.py +268 -0
  41. kodit/infrastructure/slicing/api_doc_extractor.py +0 -2
  42. kodit/infrastructure/sqlalchemy/embedding_repository.py +44 -34
  43. kodit/infrastructure/sqlalchemy/enrichment_association_repository.py +73 -0
  44. kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +145 -97
  45. kodit/infrastructure/sqlalchemy/entities.py +12 -116
  46. kodit/infrastructure/sqlalchemy/git_branch_repository.py +52 -244
  47. kodit/infrastructure/sqlalchemy/git_commit_repository.py +35 -324
  48. kodit/infrastructure/sqlalchemy/git_file_repository.py +70 -0
  49. kodit/infrastructure/sqlalchemy/git_repository.py +60 -230
  50. kodit/infrastructure/sqlalchemy/git_tag_repository.py +53 -240
  51. kodit/infrastructure/sqlalchemy/query.py +331 -0
  52. kodit/infrastructure/sqlalchemy/repository.py +203 -0
  53. kodit/infrastructure/sqlalchemy/task_repository.py +79 -58
  54. kodit/infrastructure/sqlalchemy/task_status_repository.py +45 -52
  55. kodit/migrations/versions/4b1a3b2c8fa5_refactor_git_tracking.py +190 -0
  56. {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/METADATA +1 -1
  57. {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/RECORD +60 -50
  58. kodit/infrastructure/mappers/enrichment_mapper.py +0 -83
  59. kodit/infrastructure/mappers/git_mapper.py +0 -193
  60. kodit/infrastructure/mappers/snippet_mapper.py +0 -104
  61. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +0 -479
  62. {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/WHEEL +0 -0
  63. {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/entry_points.txt +0 -0
  64. {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/licenses/LICENSE +0 -0
@@ -2,30 +2,51 @@
2
2
 
3
3
  from collections import defaultdict
4
4
  from pathlib import Path
5
+ from typing import TYPE_CHECKING
5
6
 
6
7
  import structlog
7
8
  from pydantic import AnyUrl
8
9
 
9
10
  from kodit.application.services.queue_service import QueueService
10
11
  from kodit.application.services.reporting import ProgressTracker
12
+
13
+ if TYPE_CHECKING:
14
+ from kodit.application.services.enrichment_query_service import (
15
+ EnrichmentQueryService,
16
+ )
17
+ from kodit.domain.enrichments.architecture.database_schema.database_schema import (
18
+ DatabaseSchemaEnrichment,
19
+ )
11
20
  from kodit.domain.enrichments.architecture.physical.physical import (
12
21
  PhysicalArchitectureEnrichment,
13
22
  )
23
+ from kodit.domain.enrichments.development.snippet.snippet import (
24
+ SnippetEnrichment,
25
+ SnippetEnrichmentSummary,
26
+ )
14
27
  from kodit.domain.enrichments.enricher import Enricher
28
+ from kodit.domain.enrichments.enrichment import (
29
+ CommitEnrichmentAssociation,
30
+ EnrichmentAssociation,
31
+ EnrichmentV2,
32
+ )
33
+ from kodit.domain.enrichments.history.commit_description.commit_description import (
34
+ CommitDescriptionEnrichment,
35
+ )
15
36
  from kodit.domain.enrichments.request import (
16
37
  EnrichmentRequest as GenericEnrichmentRequest,
17
38
  )
18
- from kodit.domain.enrichments.usage.api_docs import ENRICHMENT_SUBTYPE_API_DOCS
19
- from kodit.domain.enrichments.usage.usage import ENRICHMENT_TYPE_USAGE
20
39
  from kodit.domain.entities import Task
21
- from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2
40
+ from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2, TrackingType
22
41
  from kodit.domain.factories.git_repo_factory import GitRepoFactory
23
42
  from kodit.domain.protocols import (
43
+ EnrichmentAssociationRepository,
44
+ EnrichmentV2Repository,
24
45
  GitBranchRepository,
25
46
  GitCommitRepository,
47
+ GitFileRepository,
26
48
  GitRepoRepository,
27
49
  GitTagRepository,
28
- SnippetRepositoryV2,
29
50
  )
30
51
  from kodit.domain.services.bm25_service import BM25DomainService
31
52
  from kodit.domain.services.embedding_service import EmbeddingDomainService
@@ -41,8 +62,6 @@ from kodit.domain.services.physical_architecture_service import (
41
62
  from kodit.domain.value_objects import (
42
63
  DeleteRequest,
43
64
  Document,
44
- Enrichment,
45
- EnrichmentType,
46
65
  IndexRequest,
47
66
  LanguageMapping,
48
67
  PrescribedOperations,
@@ -50,36 +69,109 @@ from kodit.domain.value_objects import (
50
69
  TaskOperation,
51
70
  TrackableType,
52
71
  )
72
+ from kodit.infrastructure.database_schema.database_schema_detector import (
73
+ DatabaseSchemaDetector,
74
+ )
53
75
  from kodit.infrastructure.slicing.api_doc_extractor import APIDocExtractor
54
76
  from kodit.infrastructure.slicing.slicer import Slicer
77
+ from kodit.infrastructure.sqlalchemy import entities as db_entities
55
78
  from kodit.infrastructure.sqlalchemy.embedding_repository import (
56
79
  SqlAlchemyEmbeddingRepository,
57
80
  )
58
- from kodit.infrastructure.sqlalchemy.enrichment_v2_repository import (
59
- EnrichmentV2Repository,
60
- )
61
81
  from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
82
+ from kodit.infrastructure.sqlalchemy.query import (
83
+ EnrichmentAssociationQueryBuilder,
84
+ FilterOperator,
85
+ GitFileQueryBuilder,
86
+ QueryBuilder,
87
+ )
62
88
 
63
89
  SUMMARIZATION_SYSTEM_PROMPT = """
64
90
  You are a professional software developer. You will be given a snippet of code.
65
91
  Please provide a concise explanation of the code.
66
92
  """
67
93
 
94
+ COMMIT_DESCRIPTION_SYSTEM_PROMPT = """
95
+ You are a professional software developer. You will be given a git commit diff.
96
+ Please provide a concise description of what changes were made and why.
97
+ """
98
+
99
+ DATABASE_SCHEMA_SYSTEM_PROMPT = """
100
+ You are an expert database architect and documentation specialist.
101
+ Your task is to create clear, visual documentation of database schemas.
102
+ """
103
+
104
+ DATABASE_SCHEMA_TASK_PROMPT = """
105
+ You will be provided with a database schema discovery report.
106
+ Please create comprehensive database schema documentation.
107
+
108
+ <schema_report>
109
+ {schema_report}
110
+ </schema_report>
111
+
112
+ **Return the following:**
113
+
114
+ ## Entity List
115
+
116
+ For each table/entity, write one line:
117
+ - **[Table Name]**: [brief description of what it stores]
118
+
119
+ ## Mermaid ERD
120
+
121
+ Create a Mermaid Entity Relationship Diagram showing:
122
+ - All entities (tables)
123
+ - Key relationships between entities (if apparent from names or common patterns)
124
+ - Use standard ERD notation
125
+
126
+ Example format:
127
+ ```mermaid
128
+ erDiagram
129
+ User ||--o{{ Order : places
130
+ User {{
131
+ int id PK
132
+ string email
133
+ string name
134
+ }}
135
+ Order {{
136
+ int id PK
137
+ int user_id FK
138
+ datetime created_at
139
+ }}
140
+ ```
141
+
142
+ If specific field details aren't available, show just the entity boxes and
143
+ relationships.
144
+
145
+ ## Key Observations
146
+
147
+ Answer these questions in 1-2 sentences each:
148
+ 1. What is the primary data model pattern (e.g., user-centric,
149
+ event-sourced, multi-tenant)?
150
+ 2. What migration strategy is being used?
151
+ 3. Are there any notable database design patterns or concerns?
152
+
153
+ ## Rules:
154
+ - Be concise and focus on the high-level structure
155
+ - Infer reasonable relationships from table names when explicit information
156
+ isn't available
157
+ - If no database schema is found, state that clearly
158
+ - Keep entity descriptions to 10 words or less
159
+ """
160
+
68
161
 
69
162
  class CommitIndexingApplicationService:
70
163
  """Application service for commit indexing operations."""
71
164
 
72
165
  def __init__( # noqa: PLR0913
73
166
  self,
74
- snippet_v2_repository: SnippetRepositoryV2,
75
167
  repo_repository: GitRepoRepository,
76
168
  git_commit_repository: GitCommitRepository,
169
+ git_file_repository: GitFileRepository,
77
170
  git_branch_repository: GitBranchRepository,
78
171
  git_tag_repository: GitTagRepository,
79
172
  operation: ProgressTracker,
80
173
  scanner: GitRepositoryScanner,
81
174
  cloner: RepositoryCloner,
82
- snippet_repository: SnippetRepositoryV2,
83
175
  slicer: Slicer,
84
176
  queue: QueueService,
85
177
  bm25_service: BM25DomainService,
@@ -87,28 +179,21 @@ class CommitIndexingApplicationService:
87
179
  text_search_service: EmbeddingDomainService,
88
180
  embedding_repository: SqlAlchemyEmbeddingRepository,
89
181
  architecture_service: PhysicalArchitectureService,
90
- enrichment_v2_repository: EnrichmentV2Repository,
182
+ database_schema_detector: DatabaseSchemaDetector,
91
183
  enricher_service: Enricher,
184
+ enrichment_v2_repository: EnrichmentV2Repository,
185
+ enrichment_association_repository: EnrichmentAssociationRepository,
186
+ enrichment_query_service: "EnrichmentQueryService",
92
187
  ) -> None:
93
- """Initialize the commit indexing application service.
94
-
95
- Args:
96
- commit_index_repository: Repository for commit index data.
97
- snippet_v2_repository: Repository for snippet data.
98
- repo_repository: Repository for Git repository data.
99
- domain_indexer: Domain service for indexing operations.
100
- operation: Progress tracker for reporting operations.
101
-
102
- """
103
- self.snippet_repository = snippet_v2_repository
188
+ """Initialize the commit indexing application service."""
104
189
  self.repo_repository = repo_repository
105
190
  self.git_commit_repository = git_commit_repository
191
+ self.git_file_repository = git_file_repository
106
192
  self.git_branch_repository = git_branch_repository
107
193
  self.git_tag_repository = git_tag_repository
108
194
  self.operation = operation
109
195
  self.scanner = scanner
110
196
  self.cloner = cloner
111
- self.snippet_repository = snippet_repository
112
197
  self.slicer = slicer
113
198
  self.queue = queue
114
199
  self.bm25_service = bm25_service
@@ -116,8 +201,11 @@ class CommitIndexingApplicationService:
116
201
  self.text_search_service = text_search_service
117
202
  self.embedding_repository = embedding_repository
118
203
  self.architecture_service = architecture_service
204
+ self.database_schema_detector = database_schema_detector
119
205
  self.enrichment_v2_repository = enrichment_v2_repository
206
+ self.enrichment_association_repository = enrichment_association_repository
120
207
  self.enricher_service = enricher_service
208
+ self.enrichment_query_service = enrichment_query_service
121
209
  self._log = structlog.get_logger(__name__)
122
210
 
123
211
  async def create_git_repository(self, remote_uri: AnyUrl) -> GitRepo:
@@ -137,7 +225,7 @@ class CommitIndexingApplicationService:
137
225
 
138
226
  async def delete_git_repository(self, repo_id: int) -> bool:
139
227
  """Delete a Git repository by ID."""
140
- repo = await self.repo_repository.get_by_id(repo_id)
228
+ repo = await self.repo_repository.get(repo_id)
141
229
  if not repo:
142
230
  return False
143
231
 
@@ -181,6 +269,10 @@ class CommitIndexingApplicationService:
181
269
  await self.process_architecture_discovery(repository_id, commit_sha)
182
270
  elif task.type == TaskOperation.CREATE_PUBLIC_API_DOCS_FOR_COMMIT:
183
271
  await self.process_api_docs(repository_id, commit_sha)
272
+ elif task.type == TaskOperation.CREATE_COMMIT_DESCRIPTION_FOR_COMMIT:
273
+ await self.process_commit_description(repository_id, commit_sha)
274
+ elif task.type == TaskOperation.CREATE_DATABASE_SCHEMA_FOR_COMMIT:
275
+ await self.process_database_schema(repository_id, commit_sha)
184
276
  else:
185
277
  raise ValueError(f"Unknown task type: {task.type}")
186
278
  else:
@@ -193,7 +285,7 @@ class CommitIndexingApplicationService:
193
285
  trackable_type=TrackableType.KODIT_REPOSITORY,
194
286
  trackable_id=repository_id,
195
287
  ):
196
- repo = await self.repo_repository.get_by_id(repository_id)
288
+ repo = await self.repo_repository.get(repository_id)
197
289
  repo.cloned_path = await self.cloner.clone_repository(repo.remote_uri)
198
290
  await self.repo_repository.save(repo)
199
291
 
@@ -204,45 +296,57 @@ class CommitIndexingApplicationService:
204
296
  trackable_type=TrackableType.KODIT_REPOSITORY,
205
297
  trackable_id=repository_id,
206
298
  ) as step:
207
- await step.set_total(6)
208
- repo = await self.repo_repository.get_by_id(repository_id)
299
+ await step.set_total(7)
300
+ repo = await self.repo_repository.get(repository_id)
209
301
  if not repo.cloned_path:
210
302
  raise ValueError(f"Repository {repository_id} has never been cloned")
211
303
 
212
304
  # Scan the repository to get all metadata
213
305
  await step.set_current(0, "Scanning repository")
214
- scan_result = await self.scanner.scan_repository(repo.cloned_path)
306
+ scan_result = await self.scanner.scan_repository(
307
+ repo.cloned_path, repository_id
308
+ )
215
309
 
216
310
  # Update repo with scan result (this sets num_commits, num_branches, etc.)
217
311
  await step.set_current(1, "Updating repository with scan result")
218
312
  repo.update_with_scan_result(scan_result)
219
313
  await self.repo_repository.save(repo)
220
314
 
221
- # Save commits, branches, and tags to their dedicated repositories
222
315
  await step.set_current(2, "Saving commits")
223
- if scan_result.all_commits:
224
- await self.git_commit_repository.save_bulk(
225
- scan_result.all_commits, repository_id
226
- )
316
+ await self.git_commit_repository.save_bulk(scan_result.all_commits)
227
317
 
228
- await step.set_current(3, "Saving branches")
318
+ await step.set_current(3, "Saving files")
319
+ await self.git_file_repository.save_bulk(scan_result.all_files)
320
+
321
+ await step.set_current(4, "Saving branches")
229
322
  if scan_result.branches:
230
323
  await self.git_branch_repository.save_bulk(
231
- scan_result.branches, repository_id
324
+ scan_result.branches,
232
325
  )
233
326
 
234
- await step.set_current(4, "Saving tags")
327
+ await step.set_current(5, "Saving tags")
235
328
  if scan_result.all_tags:
236
329
  await self.git_tag_repository.save_bulk(
237
- scan_result.all_tags, repository_id
330
+ scan_result.all_tags,
238
331
  )
239
332
 
240
- await step.set_current(5, "Enqueuing commit indexing tasks")
241
- if not repo.tracking_branch:
333
+ await step.set_current(6, "Enqueuing commit indexing tasks")
334
+ if not repo.tracking_config.name:
242
335
  raise ValueError(f"Repository {repository_id} has no tracking branch")
243
- commit_sha = repo.tracking_branch.head_commit.commit_sha
244
- if not commit_sha:
245
- raise ValueError(f"Repository {repository_id} has no head commit")
336
+ if repo.tracking_config.type == TrackingType.BRANCH.value:
337
+ branch = await self.git_branch_repository.get_by_name(
338
+ repo.tracking_config.name, repository_id
339
+ )
340
+ commit_sha = branch.head_commit_sha
341
+ elif repo.tracking_config.type == TrackingType.TAG.value:
342
+ tag = await self.git_tag_repository.get_by_name(
343
+ repo.tracking_config.name, repository_id
344
+ )
345
+ commit_sha = tag.target_commit_sha
346
+ elif repo.tracking_config.type == TrackingType.COMMIT_SHA.value:
347
+ commit_sha = repo.tracking_config.name
348
+ else:
349
+ raise ValueError(f"Unknown tracking type: {repo.tracking_config.type}")
246
350
 
247
351
  await self.queue.enqueue_tasks(
248
352
  tasks=PrescribedOperations.INDEX_COMMIT,
@@ -250,6 +354,74 @@ class CommitIndexingApplicationService:
250
354
  payload={"commit_sha": commit_sha, "repository_id": repository_id},
251
355
  )
252
356
 
357
+ async def _delete_snippet_enrichments_for_commits(
358
+ self, commit_shas: list[str]
359
+ ) -> None:
360
+ """Delete snippet enrichments and their indices for commits."""
361
+ # Get all snippet enrichment IDs for these commits
362
+ all_snippet_enrichment_ids = []
363
+ for commit_sha in commit_shas:
364
+ snippet_enrichments = (
365
+ await self.enrichment_query_service.get_all_snippets_for_commit(
366
+ commit_sha
367
+ )
368
+ )
369
+ enrichment_ids = [
370
+ enrichment.id for enrichment in snippet_enrichments if enrichment.id
371
+ ]
372
+ all_snippet_enrichment_ids.extend(enrichment_ids)
373
+
374
+ if not all_snippet_enrichment_ids:
375
+ return
376
+
377
+ # Delete from BM25 and embedding indices
378
+ snippet_id_strings = [str(sid) for sid in all_snippet_enrichment_ids]
379
+ delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
380
+ await self.bm25_service.delete_documents(delete_request)
381
+
382
+ for snippet_id in all_snippet_enrichment_ids:
383
+ await self.embedding_repository.delete_embeddings_by_snippet_id(
384
+ str(snippet_id)
385
+ )
386
+
387
+ # Delete enrichment associations for snippets
388
+ await self.enrichment_association_repository.delete_by_query(
389
+ QueryBuilder()
390
+ .filter("entity_type", FilterOperator.EQ, "snippet_v2")
391
+ .filter("entity_id", FilterOperator.IN, snippet_id_strings)
392
+ )
393
+
394
+ # Delete the enrichments themselves
395
+ await self.enrichment_v2_repository.delete_by_query(
396
+ QueryBuilder().filter("id", FilterOperator.IN, all_snippet_enrichment_ids)
397
+ )
398
+
399
+ async def _delete_commit_enrichments(self, commit_shas: list[str]) -> None:
400
+ """Delete commit-level enrichments for commits."""
401
+ existing_enrichment_associations = (
402
+ await self.enrichment_association_repository.find(
403
+ QueryBuilder()
404
+ .filter(
405
+ "entity_type",
406
+ FilterOperator.EQ,
407
+ db_entities.GitCommit.__tablename__,
408
+ )
409
+ .filter("entity_id", FilterOperator.IN, commit_shas)
410
+ )
411
+ )
412
+ enrichment_ids = [a.enrichment_id for a in existing_enrichment_associations]
413
+ if not enrichment_ids:
414
+ return
415
+
416
+ # Delete associations first
417
+ await self.enrichment_association_repository.delete_by_query(
418
+ QueryBuilder().filter("enrichment_id", FilterOperator.IN, enrichment_ids)
419
+ )
420
+ # Then delete enrichments
421
+ await self.enrichment_v2_repository.delete_by_query(
422
+ QueryBuilder().filter("id", FilterOperator.IN, enrichment_ids)
423
+ )
424
+
253
425
  async def process_delete_repo(self, repository_id: int) -> None:
254
426
  """Delete a repository."""
255
427
  async with self.operation.create_child(
@@ -257,61 +429,34 @@ class CommitIndexingApplicationService:
257
429
  trackable_type=TrackableType.KODIT_REPOSITORY,
258
430
  trackable_id=repository_id,
259
431
  ):
260
- repo = await self.repo_repository.get_by_id(repository_id)
432
+ repo = await self.repo_repository.get(repository_id)
261
433
  if not repo:
262
434
  raise ValueError(f"Repository {repository_id} not found")
263
435
 
264
- # Get all commit SHAs for this repository first (needed for cleanup)
265
- commits = await self.git_commit_repository.get_by_repo_id(repository_id)
436
+ # Get all commit SHAs for this repository
437
+ commits = await self.git_commit_repository.find(
438
+ QueryBuilder().filter("repo_id", FilterOperator.EQ, repository_id)
439
+ )
266
440
  commit_shas = [commit.commit_sha for commit in commits]
267
441
 
268
- # Step 1: Get all snippet IDs that are associated with these commits FIRST
269
- # (before deleting the associations)
270
- all_snippet_ids = []
442
+ # Delete all enrichments and their indices
271
443
  if commit_shas:
272
- for commit_sha in commit_shas:
273
- snippets = await self.snippet_repository.get_snippets_for_commit(
274
- commit_sha
275
- )
276
- all_snippet_ids.extend(
277
- [snippet.id for snippet in snippets if snippet.id]
278
- )
444
+ await self._delete_snippet_enrichments_for_commits(commit_shas)
445
+ await self._delete_commit_enrichments(commit_shas)
279
446
 
280
- # Step 2: Delete from BM25 and embedding indices
281
- if all_snippet_ids:
282
- # Convert to strings as DeleteRequest expects list[str]
283
- snippet_id_strings = [str(snippet_id) for snippet_id in all_snippet_ids]
284
- delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
285
- await self.bm25_service.delete_documents(delete_request)
286
-
287
- # Delete embeddings for each snippet
288
- for snippet_id in all_snippet_ids:
289
- await self.embedding_repository.delete_embeddings_by_snippet_id(
290
- snippet_id
291
- )
292
-
293
- # Step 3: Delete enrichments for all commits
294
- if commit_shas:
295
- await self.enrichment_v2_repository.bulk_delete_enrichments(
296
- entity_type="git_commit",
297
- entity_ids=commit_shas,
298
- )
299
-
300
- # Step 4: Delete snippet associations for all commits
301
- for commit_sha in commit_shas:
302
- await self.snippet_repository.delete_snippets_for_commit(commit_sha)
303
-
304
- # Step 5: Delete branches (they reference commits via head_commit_sha)
447
+ # Delete branches, tags, files, commits, and repository
305
448
  await self.git_branch_repository.delete_by_repo_id(repository_id)
306
-
307
- # Step 6: Delete tags (they reference commits via target_commit_sha)
308
449
  await self.git_tag_repository.delete_by_repo_id(repository_id)
309
450
 
310
- # Step 7: Delete commits and their files
311
- await self.git_commit_repository.delete_by_repo_id(repository_id)
451
+ for commit_sha in commit_shas:
452
+ await self.git_file_repository.delete_by_commit_sha(commit_sha)
453
+
454
+ await self.git_commit_repository.delete_by_query(
455
+ QueryBuilder().filter("repo_id", FilterOperator.EQ, repository_id)
456
+ )
312
457
 
313
- # Step 8: Finally delete the repository
314
- await self.repo_repository.delete(repo.sanitized_remote_uri)
458
+ if repo.id:
459
+ await self.repo_repository.delete(repo)
315
460
 
316
461
  async def process_snippets_for_commit(
317
462
  self, repository_id: int, commit_sha: str
@@ -322,16 +467,16 @@ class CommitIndexingApplicationService:
322
467
  trackable_type=TrackableType.KODIT_REPOSITORY,
323
468
  trackable_id=repository_id,
324
469
  ) as step:
325
- # Have we already processed this commit? If yes, skip.
326
- if await self.snippet_repository.get_snippets_for_commit(commit_sha):
327
- await step.skip("All snippets already extracted for commit")
470
+ # Find existing snippet enrichments for this commit
471
+ if await self.enrichment_query_service.has_snippets_for_commit(commit_sha):
472
+ await step.skip("Snippets already extracted for commit")
328
473
  return
329
474
 
330
- commit = await self.git_commit_repository.get_by_sha(commit_sha)
475
+ commit = await self.git_commit_repository.get(commit_sha)
331
476
 
332
477
  # Load files on demand for snippet extraction (performance optimization)
333
478
  # Instead of using commit.files (which may be empty), load files directly
334
- repo = await self.repo_repository.get_by_id(repository_id)
479
+ repo = await self.repo_repository.get(repository_id)
335
480
  if not repo.cloned_path:
336
481
  raise ValueError(f"Repository {repository_id} has never been cloned")
337
482
 
@@ -350,6 +495,7 @@ class CommitIndexingApplicationService:
350
495
  absolute_path = str(repo.cloned_path / file_data["path"])
351
496
 
352
497
  git_file = GitFile(
498
+ commit_sha=commit.commit_sha,
353
499
  created_at=file_data.get("created_at", commit.date),
354
500
  blob_sha=file_data["blob_sha"],
355
501
  path=absolute_path, # Use absolute path for file reading
@@ -395,8 +541,27 @@ class CommitIndexingApplicationService:
395
541
  f"Extracted {len(all_snippets)} snippets, "
396
542
  f"deduplicated to {len(deduplicated_snippets)} for {commit_short}"
397
543
  )
398
- await self.snippet_repository.save_snippets(
399
- commit.commit_sha, deduplicated_snippets
544
+
545
+ saved_enrichments = await self.enrichment_v2_repository.save_bulk(
546
+ [
547
+ SnippetEnrichment(content=snippet.content)
548
+ for snippet in deduplicated_snippets
549
+ ]
550
+ )
551
+ saved_associations = await self.enrichment_association_repository.save_bulk(
552
+ [
553
+ EnrichmentAssociation(
554
+ enrichment_id=enrichment.id,
555
+ entity_type=db_entities.GitCommit.__tablename__,
556
+ entity_id=commit_sha,
557
+ )
558
+ for enrichment in saved_enrichments
559
+ if enrichment.id
560
+ ]
561
+ )
562
+ self._log.info(
563
+ f"Saved {len(saved_enrichments)} snippet enrichments and "
564
+ f"{len(saved_associations)} associations for commit {commit_sha}"
400
565
  )
401
566
 
402
567
  async def process_bm25_index(self, repository_id: int, commit_sha: str) -> None:
@@ -406,13 +571,16 @@ class CommitIndexingApplicationService:
406
571
  trackable_type=TrackableType.KODIT_REPOSITORY,
407
572
  trackable_id=repository_id,
408
573
  ):
409
- snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)
410
-
574
+ existing_enrichments = (
575
+ await self.enrichment_query_service.get_all_snippets_for_commit(
576
+ commit_sha
577
+ )
578
+ )
411
579
  await self.bm25_service.index_documents(
412
580
  IndexRequest(
413
581
  documents=[
414
- Document(snippet_id=snippet.id, text=snippet.content)
415
- for snippet in snippets
582
+ Document(snippet_id=str(snippet.id), text=snippet.content)
583
+ for snippet in existing_enrichments
416
584
  if snippet.id
417
585
  ]
418
586
  )
@@ -427,12 +595,14 @@ class CommitIndexingApplicationService:
427
595
  trackable_type=TrackableType.KODIT_REPOSITORY,
428
596
  trackable_id=repository_id,
429
597
  ) as step:
430
- all_snippets = await self.snippet_repository.get_snippets_for_commit(
431
- commit_sha
598
+ existing_enrichments = (
599
+ await self.enrichment_query_service.get_all_snippets_for_commit(
600
+ commit_sha
601
+ )
432
602
  )
433
603
 
434
604
  new_snippets = await self._new_snippets_for_type(
435
- all_snippets, EmbeddingType.CODE
605
+ existing_enrichments, EmbeddingType.CODE
436
606
  )
437
607
  if not new_snippets:
438
608
  await step.skip("All snippets already have code embeddings")
@@ -441,7 +611,7 @@ class CommitIndexingApplicationService:
441
611
  await step.set_total(len(new_snippets))
442
612
  processed = 0
443
613
  documents = [
444
- Document(snippet_id=snippet.id, text=snippet.content)
614
+ Document(snippet_id=str(snippet.id), text=snippet.content)
445
615
  for snippet in new_snippets
446
616
  if snippet.id
447
617
  ]
@@ -458,36 +628,28 @@ class CommitIndexingApplicationService:
458
628
  trackable_type=TrackableType.KODIT_REPOSITORY,
459
629
  trackable_id=repository_id,
460
630
  ) as step:
461
- all_snippets = await self.snippet_repository.get_snippets_for_commit(
462
- commit_sha
463
- )
631
+ if await self.enrichment_query_service.has_summaries_for_commit(commit_sha):
632
+ await step.skip("Summary enrichments already exist for commit")
633
+ return
464
634
 
465
- # Find snippets without a summary enrichment
466
- snippets_without_summary = [
467
- snippet
468
- for snippet in all_snippets
469
- if not snippet.enrichments
470
- or not next(
471
- enrichment
472
- for enrichment in snippet.enrichments
473
- if enrichment.type == EnrichmentType.SUMMARIZATION
635
+ all_snippets = (
636
+ await self.enrichment_query_service.get_all_snippets_for_commit(
637
+ commit_sha
474
638
  )
475
- ]
476
- if not snippets_without_summary:
477
- await step.skip("All snippets already have a summary enrichment")
639
+ )
640
+ if not all_snippets:
641
+ await step.skip("No snippets to enrich")
478
642
  return
479
643
 
480
644
  # Enrich snippets
481
- await step.set_total(len(snippets_without_summary))
645
+ await step.set_total(len(all_snippets))
482
646
  snippet_map = {
483
- snippet.id: snippet
484
- for snippet in snippets_without_summary
485
- if snippet.id
647
+ str(snippet.id): snippet for snippet in all_snippets if snippet.id
486
648
  }
487
649
 
488
650
  enrichment_requests = [
489
651
  GenericEnrichmentRequest(
490
- id=snippet_id,
652
+ id=str(snippet_id),
491
653
  text=snippet.content,
492
654
  system_prompt=SUMMARIZATION_SYSTEM_PROMPT,
493
655
  )
@@ -497,12 +659,27 @@ class CommitIndexingApplicationService:
497
659
  processed = 0
498
660
  async for result in self.enricher_service.enrich(enrichment_requests):
499
661
  snippet = snippet_map[result.id]
500
- snippet.enrichments.append(
501
- Enrichment(type=EnrichmentType.SUMMARIZATION, content=result.text)
662
+ db_summary = await self.enrichment_v2_repository.save(
663
+ SnippetEnrichmentSummary(content=result.text)
664
+ )
665
+ if not db_summary.id:
666
+ raise ValueError(
667
+ f"Failed to save snippet enrichment for commit {commit_sha}"
668
+ )
669
+ await self.enrichment_association_repository.save(
670
+ EnrichmentAssociation(
671
+ enrichment_id=db_summary.id,
672
+ entity_type=db_entities.EnrichmentV2.__tablename__,
673
+ entity_id=str(snippet.id),
674
+ )
675
+ )
676
+ await self.enrichment_association_repository.save(
677
+ EnrichmentAssociation(
678
+ enrichment_id=db_summary.id,
679
+ entity_type=db_entities.GitCommit.__tablename__,
680
+ entity_id=commit_sha,
681
+ )
502
682
  )
503
-
504
- await self.snippet_repository.save_snippets(commit_sha, [snippet])
505
-
506
683
  processed += 1
507
684
  await step.set_current(processed, "Enriching snippets for commit")
508
685
 
@@ -515,40 +692,61 @@ class CommitIndexingApplicationService:
515
692
  trackable_type=TrackableType.KODIT_REPOSITORY,
516
693
  trackable_id=repository_id,
517
694
  ) as step:
518
- snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)
519
-
520
- new_snippets = await self._new_snippets_for_type(
521
- snippets, EmbeddingType.TEXT
695
+ # Get all snippet enrichments for this commit
696
+ all_snippet_enrichments = (
697
+ await self.enrichment_query_service.get_all_snippets_for_commit(
698
+ commit_sha
699
+ )
522
700
  )
523
- if not new_snippets:
524
- await step.skip("All snippets already have text embeddings")
701
+ if not all_snippet_enrichments:
702
+ await step.skip("No snippets to create summary embeddings")
525
703
  return
526
704
 
527
- await step.set_total(len(new_snippets))
528
- processed = 0
705
+ # Get summary enrichments that point to these snippet enrichments
706
+ query = EnrichmentAssociationQueryBuilder.for_enrichment_associations(
707
+ entity_type=db_entities.EnrichmentV2.__tablename__,
708
+ entity_ids=[
709
+ str(snippet.id) for snippet in all_snippet_enrichments if snippet.id
710
+ ],
711
+ )
712
+ summary_enrichment_associations = (
713
+ await self.enrichment_association_repository.find(query)
714
+ )
715
+
716
+ if not summary_enrichment_associations:
717
+ await step.skip("No summary enrichments found for snippets")
718
+ return
529
719
 
530
- def _summary_from_enrichments(enrichments: list[Enrichment]) -> str:
531
- if not enrichments:
532
- return ""
533
- return next(
534
- enrichment.content
535
- for enrichment in enrichments
536
- if enrichment.type == EnrichmentType.SUMMARIZATION
720
+ # Get the actual summary enrichments
721
+ summary_enrichments = await self.enrichment_v2_repository.find(
722
+ QueryBuilder().filter(
723
+ "id",
724
+ FilterOperator.IN,
725
+ [
726
+ association.enrichment_id
727
+ for association in summary_enrichment_associations
728
+ ],
537
729
  )
730
+ )
538
731
 
539
- snippet_summary_map = {
540
- snippet.id: _summary_from_enrichments(snippet.enrichments)
541
- for snippet in snippets
542
- if snippet.id
543
- }
544
- if len(snippet_summary_map) == 0:
545
- await step.skip("No snippets with summaries to create text embeddings")
732
+ # Check if embeddings already exist for these summaries
733
+ new_summaries = await self._new_snippets_for_type(
734
+ summary_enrichments, EmbeddingType.TEXT
735
+ )
736
+ if not new_summaries:
737
+ await step.skip("All snippets already have text embeddings")
546
738
  return
547
739
 
740
+ await step.set_total(len(new_summaries))
741
+ processed = 0
742
+
743
+ # Create documents from the summary enrichments
548
744
  documents_with_summaries = [
549
- Document(snippet_id=snippet_id, text=snippet_summary)
550
- for snippet_id, snippet_summary in snippet_summary_map.items()
745
+ Document(snippet_id=str(summary.id), text=summary.content)
746
+ for summary in new_summaries
747
+ if summary.id
551
748
  ]
749
+
552
750
  async for result in self.text_search_service.index_documents(
553
751
  IndexRequest(documents=documents_with_summaries)
554
752
  ):
@@ -567,23 +765,14 @@ class CommitIndexingApplicationService:
567
765
  await step.set_total(3)
568
766
 
569
767
  # Check if architecture enrichment already exists for this commit
570
- enrichment_repo = self.enrichment_v2_repository
571
- existing_enrichments = await enrichment_repo.enrichments_for_entity_type(
572
- entity_type="git_commit",
573
- entity_ids=[commit_sha],
574
- )
575
-
576
- # Check if architecture enrichment already exists
577
- has_architecture = any(
578
- enrichment.type == "architecture" for enrichment in existing_enrichments
579
- )
580
-
581
- if has_architecture:
768
+ if await self.enrichment_query_service.has_architecture_for_commit(
769
+ commit_sha
770
+ ):
582
771
  await step.skip("Architecture enrichment already exists for commit")
583
772
  return
584
773
 
585
774
  # Get repository path
586
- repo = await self.repo_repository.get_by_id(repository_id)
775
+ repo = await self.repo_repository.get(repository_id)
587
776
  if not repo.cloned_path:
588
777
  raise ValueError(f"Repository {repository_id} has never been cloned")
589
778
 
@@ -610,13 +799,20 @@ class CommitIndexingApplicationService:
610
799
  enriched_content = response.text
611
800
 
612
801
  # Create and save architecture enrichment with enriched content
613
- architecture_enrichment = PhysicalArchitectureEnrichment(
614
- entity_id=commit_sha,
615
- content=enriched_content,
802
+ enrichment = await self.enrichment_v2_repository.save(
803
+ PhysicalArchitectureEnrichment(
804
+ content=enriched_content,
805
+ )
616
806
  )
617
-
618
- await self.enrichment_v2_repository.bulk_save_enrichments(
619
- [architecture_enrichment]
807
+ if not enrichment or not enrichment.id:
808
+ raise ValueError(
809
+ f"Failed to save architecture enrichment for commit {commit_sha}"
810
+ )
811
+ await self.enrichment_association_repository.save(
812
+ CommitEnrichmentAssociation(
813
+ enrichment_id=enrichment.id,
814
+ entity_id=commit_sha,
815
+ )
620
816
  )
621
817
 
622
818
  await step.set_current(3, "Architecture enrichment completed")
@@ -629,34 +825,26 @@ class CommitIndexingApplicationService:
629
825
  trackable_id=repository_id,
630
826
  ) as step:
631
827
  # Check if API docs already exist for this commit
632
- existing_enrichments = (
633
- await self.enrichment_v2_repository.enrichments_for_entity_type(
634
- entity_type="git_commit",
635
- entity_ids=[commit_sha],
636
- )
637
- )
638
-
639
- has_api_docs = any(
640
- e.type == ENRICHMENT_TYPE_USAGE
641
- and e.subtype == ENRICHMENT_SUBTYPE_API_DOCS
642
- for e in existing_enrichments
643
- )
644
-
645
- if has_api_docs:
828
+ if await self.enrichment_query_service.has_api_docs_for_commit(commit_sha):
646
829
  await step.skip("API docs already exist for commit")
647
830
  return
648
831
 
649
832
  # Get repository for metadata
650
- repo = await self.repo_repository.get_by_id(repository_id)
833
+ repo = await self.repo_repository.get(repository_id)
651
834
  if not repo:
652
835
  raise ValueError(f"Repository {repository_id} not found")
653
836
  str(repo.sanitized_remote_uri)
654
837
 
655
- commit = await self.git_commit_repository.get_by_sha(commit_sha)
838
+ files = await self.git_file_repository.find(
839
+ GitFileQueryBuilder().for_commit_sha(commit_sha)
840
+ )
841
+ if not files:
842
+ await step.skip("No files to extract API docs from")
843
+ return
656
844
 
657
845
  # Group files by language
658
846
  lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
659
- for file in commit.files:
847
+ for file in files:
660
848
  try:
661
849
  lang = LanguageMapping.get_language_for_extension(file.extension)
662
850
  except ValueError:
@@ -670,28 +858,172 @@ class CommitIndexingApplicationService:
670
858
  for i, (lang, lang_files) in enumerate(lang_files_map.items()):
671
859
  await step.set_current(i, f"Extracting API docs for {lang}")
672
860
  enrichments = extractor.extract_api_docs(
673
- lang_files,
674
- lang,
675
- commit_sha,
861
+ files=lang_files,
862
+ language=lang,
676
863
  include_private=False,
677
864
  )
678
865
  all_enrichments.extend(enrichments)
679
866
 
680
867
  # Save all enrichments
681
868
  if all_enrichments:
682
- await self.enrichment_v2_repository.bulk_save_enrichments(
683
- all_enrichments
869
+ saved_enrichments = await self.enrichment_v2_repository.save_bulk(
870
+ all_enrichments # type: ignore[arg-type]
871
+ )
872
+ await self.enrichment_association_repository.save_bulk(
873
+ [
874
+ CommitEnrichmentAssociation(
875
+ enrichment_id=enrichment.id,
876
+ entity_id=commit_sha,
877
+ )
878
+ for enrichment in saved_enrichments
879
+ if enrichment.id
880
+ ]
684
881
  )
685
882
 
883
+ async def process_commit_description(
884
+ self, repository_id: int, commit_sha: str
885
+ ) -> None:
886
+ """Handle COMMIT_DESCRIPTION task - generate commit descriptions."""
887
+ async with self.operation.create_child(
888
+ TaskOperation.CREATE_COMMIT_DESCRIPTION_FOR_COMMIT,
889
+ trackable_type=TrackableType.KODIT_REPOSITORY,
890
+ trackable_id=repository_id,
891
+ ) as step:
892
+ # Check if commit description already exists for this commit
893
+ if await self.enrichment_query_service.has_commit_description_for_commit(
894
+ commit_sha
895
+ ):
896
+ await step.skip("Commit description already exists for commit")
897
+ return
898
+
899
+ # Get repository path
900
+ repo = await self.repo_repository.get(repository_id)
901
+ if not repo.cloned_path:
902
+ raise ValueError(f"Repository {repository_id} has never been cloned")
903
+
904
+ await step.set_total(3)
905
+ await step.set_current(1, "Getting commit diff")
906
+
907
+ # Get the diff for this commit
908
+ diff = await self.scanner.git_adapter.get_commit_diff(
909
+ repo.cloned_path, commit_sha
910
+ )
911
+
912
+ if not diff or len(diff.strip()) == 0:
913
+ await step.skip("No diff found for commit")
914
+ return
915
+
916
+ await step.set_current(2, "Enriching commit description with LLM")
917
+
918
+ # Enrich the diff through the enricher
919
+ enrichment_request = GenericEnrichmentRequest(
920
+ id=commit_sha,
921
+ text=diff,
922
+ system_prompt=COMMIT_DESCRIPTION_SYSTEM_PROMPT,
923
+ )
924
+
925
+ enriched_content = ""
926
+ async for response in self.enricher_service.enrich([enrichment_request]):
927
+ enriched_content = response.text
928
+
929
+ # Create and save commit description enrichment
930
+ enrichment = await self.enrichment_v2_repository.save(
931
+ CommitDescriptionEnrichment(
932
+ content=enriched_content,
933
+ )
934
+ )
935
+ if not enrichment or not enrichment.id:
936
+ raise ValueError(
937
+ f"Failed to save commit description enrichment for commit "
938
+ f"{commit_sha}"
939
+ )
940
+ await self.enrichment_association_repository.save(
941
+ CommitEnrichmentAssociation(
942
+ enrichment_id=enrichment.id,
943
+ entity_id=commit_sha,
944
+ )
945
+ )
946
+
947
+ await step.set_current(3, "Commit description enrichment completed")
948
+
949
+ async def process_database_schema(
950
+ self, repository_id: int, commit_sha: str
951
+ ) -> None:
952
+ """Handle DATABASE_SCHEMA task - discover and document database schemas."""
953
+ async with self.operation.create_child(
954
+ TaskOperation.CREATE_DATABASE_SCHEMA_FOR_COMMIT,
955
+ trackable_type=TrackableType.KODIT_REPOSITORY,
956
+ trackable_id=repository_id,
957
+ ) as step:
958
+ # Check if database schema already exists for this commit
959
+ if await self.enrichment_query_service.has_database_schema_for_commit(
960
+ commit_sha
961
+ ):
962
+ await step.skip("Database schema already exists for commit")
963
+ return
964
+
965
+ # Get repository path
966
+ repo = await self.repo_repository.get(repository_id)
967
+ if not repo.cloned_path:
968
+ raise ValueError(f"Repository {repository_id} has never been cloned")
969
+
970
+ await step.set_total(3)
971
+ await step.set_current(1, "Discovering database schemas")
972
+
973
+ # Discover database schemas
974
+ schema_report = await self.database_schema_detector.discover_schemas(
975
+ repo.cloned_path
976
+ )
977
+
978
+ if "No database schemas detected" in schema_report:
979
+ await step.skip("No database schemas found in repository")
980
+ return
981
+
982
+ await step.set_current(2, "Enriching schema documentation with LLM")
983
+
984
+ # Enrich the schema report through the enricher
985
+ enrichment_request = GenericEnrichmentRequest(
986
+ id=commit_sha,
987
+ text=DATABASE_SCHEMA_TASK_PROMPT.format(schema_report=schema_report),
988
+ system_prompt=DATABASE_SCHEMA_SYSTEM_PROMPT,
989
+ )
990
+
991
+ enriched_content = ""
992
+ async for response in self.enricher_service.enrich([enrichment_request]):
993
+ enriched_content = response.text
994
+
995
+ # Create and save database schema enrichment
996
+ enrichment = await self.enrichment_v2_repository.save(
997
+ DatabaseSchemaEnrichment(
998
+ content=enriched_content,
999
+ )
1000
+ )
1001
+ if not enrichment or not enrichment.id:
1002
+ raise ValueError(
1003
+ f"Failed to save database schema enrichment for commit {commit_sha}"
1004
+ )
1005
+ await self.enrichment_association_repository.save(
1006
+ CommitEnrichmentAssociation(
1007
+ enrichment_id=enrichment.id,
1008
+ entity_id=commit_sha,
1009
+ )
1010
+ )
1011
+
1012
+ await step.set_current(3, "Database schema enrichment completed")
1013
+
686
1014
  async def _new_snippets_for_type(
687
- self, all_snippets: list[SnippetV2], embedding_type: EmbeddingType
688
- ) -> list[SnippetV2]:
1015
+ self, all_snippets: list[EnrichmentV2], embedding_type: EmbeddingType
1016
+ ) -> list[EnrichmentV2]:
689
1017
  """Get new snippets for a given type."""
690
1018
  existing_embeddings = (
691
1019
  await self.embedding_repository.list_embeddings_by_snippet_ids_and_type(
692
- [s.id for s in all_snippets], embedding_type
1020
+ [str(s.id) for s in all_snippets], embedding_type
693
1021
  )
694
1022
  )
1023
+ # TODO(Phil): Can't do this incrementally yet because like the API, we don't
1024
+ # have a unified embedding repository
1025
+ if existing_embeddings:
1026
+ return []
695
1027
  existing_embeddings_by_snippet_id = {
696
1028
  embedding.snippet_id: embedding for embedding in existing_embeddings
697
1029
  }