kodit 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (135) hide show
  1. kodit/_version.py +2 -2
  2. kodit/app.py +51 -23
  3. kodit/application/factories/reporting_factory.py +6 -2
  4. kodit/application/factories/server_factory.py +353 -0
  5. kodit/application/services/code_search_application_service.py +144 -0
  6. kodit/application/services/commit_indexing_application_service.py +700 -0
  7. kodit/application/services/indexing_worker_service.py +13 -44
  8. kodit/application/services/queue_service.py +24 -3
  9. kodit/application/services/reporting.py +0 -2
  10. kodit/application/services/sync_scheduler.py +15 -31
  11. kodit/cli.py +2 -753
  12. kodit/cli_utils.py +2 -9
  13. kodit/config.py +4 -97
  14. kodit/database.py +38 -1
  15. kodit/domain/enrichments/__init__.py +1 -0
  16. kodit/domain/enrichments/architecture/__init__.py +1 -0
  17. kodit/domain/enrichments/architecture/architecture.py +20 -0
  18. kodit/domain/enrichments/architecture/physical/__init__.py +1 -0
  19. kodit/domain/enrichments/architecture/physical/discovery_notes.py +14 -0
  20. kodit/domain/enrichments/architecture/physical/formatter.py +11 -0
  21. kodit/domain/enrichments/architecture/physical/physical.py +17 -0
  22. kodit/domain/enrichments/development/__init__.py +1 -0
  23. kodit/domain/enrichments/development/development.py +18 -0
  24. kodit/domain/enrichments/development/snippet/__init__.py +1 -0
  25. kodit/domain/enrichments/development/snippet/snippet.py +21 -0
  26. kodit/domain/enrichments/enricher.py +17 -0
  27. kodit/domain/enrichments/enrichment.py +39 -0
  28. kodit/domain/enrichments/request.py +12 -0
  29. kodit/domain/enrichments/response.py +11 -0
  30. kodit/domain/enrichments/usage/__init__.py +1 -0
  31. kodit/domain/enrichments/usage/api_docs.py +19 -0
  32. kodit/domain/enrichments/usage/usage.py +18 -0
  33. kodit/domain/{entities.py → entities/__init__.py} +50 -195
  34. kodit/domain/entities/git.py +190 -0
  35. kodit/domain/factories/__init__.py +1 -0
  36. kodit/domain/factories/git_repo_factory.py +76 -0
  37. kodit/domain/protocols.py +264 -64
  38. kodit/domain/services/bm25_service.py +5 -1
  39. kodit/domain/services/embedding_service.py +3 -0
  40. kodit/domain/services/enrichment_service.py +9 -30
  41. kodit/domain/services/git_repository_service.py +429 -0
  42. kodit/domain/services/git_service.py +300 -0
  43. kodit/domain/services/physical_architecture_service.py +182 -0
  44. kodit/domain/services/task_status_query_service.py +2 -2
  45. kodit/domain/value_objects.py +87 -135
  46. kodit/infrastructure/api/client/__init__.py +0 -2
  47. kodit/infrastructure/api/v1/__init__.py +0 -4
  48. kodit/infrastructure/api/v1/dependencies.py +92 -46
  49. kodit/infrastructure/api/v1/routers/__init__.py +0 -6
  50. kodit/infrastructure/api/v1/routers/commits.py +352 -0
  51. kodit/infrastructure/api/v1/routers/queue.py +2 -2
  52. kodit/infrastructure/api/v1/routers/repositories.py +282 -0
  53. kodit/infrastructure/api/v1/routers/search.py +31 -14
  54. kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
  55. kodit/infrastructure/api/v1/schemas/commit.py +96 -0
  56. kodit/infrastructure/api/v1/schemas/context.py +2 -0
  57. kodit/infrastructure/api/v1/schemas/enrichment.py +29 -0
  58. kodit/infrastructure/api/v1/schemas/repository.py +128 -0
  59. kodit/infrastructure/api/v1/schemas/search.py +12 -9
  60. kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
  61. kodit/infrastructure/api/v1/schemas/tag.py +31 -0
  62. kodit/infrastructure/api/v1/schemas/task_status.py +2 -0
  63. kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
  64. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
  65. kodit/infrastructure/cloning/git/git_python_adaptor.py +534 -0
  66. kodit/infrastructure/cloning/git/working_copy.py +1 -1
  67. kodit/infrastructure/embedding/embedding_factory.py +3 -2
  68. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  69. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
  70. kodit/infrastructure/enricher/__init__.py +1 -0
  71. kodit/infrastructure/enricher/enricher_factory.py +53 -0
  72. kodit/infrastructure/{enrichment/litellm_enrichment_provider.py → enricher/litellm_enricher.py} +36 -56
  73. kodit/infrastructure/{enrichment/local_enrichment_provider.py → enricher/local_enricher.py} +19 -24
  74. kodit/infrastructure/enricher/null_enricher.py +36 -0
  75. kodit/infrastructure/indexing/fusion_service.py +1 -1
  76. kodit/infrastructure/mappers/enrichment_mapper.py +83 -0
  77. kodit/infrastructure/mappers/git_mapper.py +193 -0
  78. kodit/infrastructure/mappers/snippet_mapper.py +104 -0
  79. kodit/infrastructure/mappers/task_mapper.py +5 -44
  80. kodit/infrastructure/physical_architecture/__init__.py +1 -0
  81. kodit/infrastructure/physical_architecture/detectors/__init__.py +1 -0
  82. kodit/infrastructure/physical_architecture/detectors/docker_compose_detector.py +336 -0
  83. kodit/infrastructure/physical_architecture/formatters/__init__.py +1 -0
  84. kodit/infrastructure/physical_architecture/formatters/narrative_formatter.py +149 -0
  85. kodit/infrastructure/reporting/log_progress.py +8 -5
  86. kodit/infrastructure/reporting/telemetry_progress.py +21 -0
  87. kodit/infrastructure/slicing/api_doc_extractor.py +836 -0
  88. kodit/infrastructure/slicing/ast_analyzer.py +1128 -0
  89. kodit/infrastructure/slicing/slicer.py +87 -421
  90. kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
  91. kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +118 -0
  92. kodit/infrastructure/sqlalchemy/entities.py +402 -158
  93. kodit/infrastructure/sqlalchemy/git_branch_repository.py +274 -0
  94. kodit/infrastructure/sqlalchemy/git_commit_repository.py +346 -0
  95. kodit/infrastructure/sqlalchemy/git_repository.py +262 -0
  96. kodit/infrastructure/sqlalchemy/git_tag_repository.py +268 -0
  97. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +479 -0
  98. kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
  99. kodit/infrastructure/sqlalchemy/task_status_repository.py +24 -12
  100. kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
  101. kodit/mcp.py +12 -30
  102. kodit/migrations/env.py +1 -0
  103. kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
  104. kodit/migrations/versions/19f8c7faf8b9_add_generic_enrichment_type.py +260 -0
  105. kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
  106. kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
  107. kodit/py.typed +0 -0
  108. kodit/utils/dump_config.py +361 -0
  109. kodit/utils/dump_openapi.py +6 -4
  110. kodit/utils/path_utils.py +29 -0
  111. {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/METADATA +3 -3
  112. kodit-0.5.1.dist-info/RECORD +168 -0
  113. kodit/application/factories/code_indexing_factory.py +0 -195
  114. kodit/application/services/auto_indexing_service.py +0 -99
  115. kodit/application/services/code_indexing_application_service.py +0 -410
  116. kodit/domain/services/index_query_service.py +0 -70
  117. kodit/domain/services/index_service.py +0 -269
  118. kodit/infrastructure/api/client/index_client.py +0 -57
  119. kodit/infrastructure/api/v1/routers/indexes.py +0 -164
  120. kodit/infrastructure/api/v1/schemas/index.py +0 -101
  121. kodit/infrastructure/bm25/bm25_factory.py +0 -28
  122. kodit/infrastructure/cloning/__init__.py +0 -1
  123. kodit/infrastructure/cloning/metadata.py +0 -98
  124. kodit/infrastructure/enrichment/__init__.py +0 -1
  125. kodit/infrastructure/enrichment/enrichment_factory.py +0 -52
  126. kodit/infrastructure/enrichment/null_enrichment_provider.py +0 -19
  127. kodit/infrastructure/mappers/index_mapper.py +0 -345
  128. kodit/infrastructure/reporting/tdqm_progress.py +0 -38
  129. kodit/infrastructure/slicing/language_detection_service.py +0 -18
  130. kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
  131. kodit-0.4.3.dist-info/RECORD +0 -125
  132. /kodit/infrastructure/{enrichment → enricher}/utils.py +0 -0
  133. {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/WHEEL +0 -0
  134. {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/entry_points.txt +0 -0
  135. {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,700 @@
1
+ """Application services for commit indexing operations."""
2
+
3
+ from collections import defaultdict
4
+ from pathlib import Path
5
+
6
+ import structlog
7
+ from pydantic import AnyUrl
8
+
9
+ from kodit.application.services.queue_service import QueueService
10
+ from kodit.application.services.reporting import ProgressTracker
11
+ from kodit.domain.enrichments.architecture.physical.physical import (
12
+ PhysicalArchitectureEnrichment,
13
+ )
14
+ from kodit.domain.enrichments.enricher import Enricher
15
+ from kodit.domain.enrichments.request import (
16
+ EnrichmentRequest as GenericEnrichmentRequest,
17
+ )
18
+ from kodit.domain.enrichments.usage.api_docs import ENRICHMENT_SUBTYPE_API_DOCS
19
+ from kodit.domain.enrichments.usage.usage import ENRICHMENT_TYPE_USAGE
20
+ from kodit.domain.entities import Task
21
+ from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2
22
+ from kodit.domain.factories.git_repo_factory import GitRepoFactory
23
+ from kodit.domain.protocols import (
24
+ GitBranchRepository,
25
+ GitCommitRepository,
26
+ GitRepoRepository,
27
+ GitTagRepository,
28
+ SnippetRepositoryV2,
29
+ )
30
+ from kodit.domain.services.bm25_service import BM25DomainService
31
+ from kodit.domain.services.embedding_service import EmbeddingDomainService
32
+ from kodit.domain.services.git_repository_service import (
33
+ GitRepositoryScanner,
34
+ RepositoryCloner,
35
+ )
36
+ from kodit.domain.services.physical_architecture_service import (
37
+ ARCHITECTURE_ENRICHMENT_SYSTEM_PROMPT,
38
+ ARCHITECTURE_ENRICHMENT_TASK_PROMPT,
39
+ PhysicalArchitectureService,
40
+ )
41
+ from kodit.domain.value_objects import (
42
+ DeleteRequest,
43
+ Document,
44
+ Enrichment,
45
+ EnrichmentType,
46
+ IndexRequest,
47
+ LanguageMapping,
48
+ PrescribedOperations,
49
+ QueuePriority,
50
+ TaskOperation,
51
+ TrackableType,
52
+ )
53
+ from kodit.infrastructure.slicing.api_doc_extractor import APIDocExtractor
54
+ from kodit.infrastructure.slicing.slicer import Slicer
55
+ from kodit.infrastructure.sqlalchemy.embedding_repository import (
56
+ SqlAlchemyEmbeddingRepository,
57
+ )
58
+ from kodit.infrastructure.sqlalchemy.enrichment_v2_repository import (
59
+ EnrichmentV2Repository,
60
+ )
61
+ from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
62
+
63
# System prompt attached to every snippet summarization request that
# CommitIndexingApplicationService.process_enrich sends to the enricher.
SUMMARIZATION_SYSTEM_PROMPT = """
You are a professional software developer. You will be given a snippet of code.
Please provide a concise explanation of the code.
"""
67
+
68
+
69
+ class CommitIndexingApplicationService:
70
+ """Application service for commit indexing operations."""
71
+
72
def __init__(  # noqa: PLR0913
    self,
    snippet_v2_repository: SnippetRepositoryV2,  # noqa: ARG002
    repo_repository: GitRepoRepository,
    git_commit_repository: GitCommitRepository,
    git_branch_repository: GitBranchRepository,
    git_tag_repository: GitTagRepository,
    operation: ProgressTracker,
    scanner: GitRepositoryScanner,
    cloner: RepositoryCloner,
    snippet_repository: SnippetRepositoryV2,
    slicer: Slicer,
    queue: QueueService,
    bm25_service: BM25DomainService,
    code_search_service: EmbeddingDomainService,
    text_search_service: EmbeddingDomainService,
    embedding_repository: SqlAlchemyEmbeddingRepository,
    architecture_service: PhysicalArchitectureService,
    enrichment_v2_repository: EnrichmentV2Repository,
    enricher_service: Enricher,
) -> None:
    """Initialize the commit indexing application service.

    Args:
        snippet_v2_repository: Kept in the signature for backward
            compatibility only. It is not stored; ``snippet_repository`` is
            the snippet store this service uses.
        repo_repository: Store for Git repository entities.
        git_commit_repository: Store for the commits of a repository.
        git_branch_repository: Store for the branches of a repository.
        git_tag_repository: Store for the tags of a repository.
        operation: Progress tracker; every ``process_*`` method opens a
            child step on it.
        scanner: Scans a cloned repository and exposes the Git adapter used
            to read the files of a commit.
        cloner: Clones a repository from its remote URI.
        snippet_repository: Store used to read, save and delete snippets.
        slicer: Snippet slicer, stored as ``self.slicer``.
        queue: Queue used to enqueue follow-up tasks.
        bm25_service: Keyword index service (index and delete documents).
        code_search_service: Embedding service used for snippet code.
        text_search_service: Embedding service used for snippet summaries.
        embedding_repository: Store used to look up and delete embeddings.
        architecture_service: Discovers the physical architecture of a
            cloned repository.
        enrichment_v2_repository: Store for commit-level enrichments.
        enricher_service: Enricher that turns enrichment requests into text.

    """
    # The original code first stored ``snippet_v2_repository`` under
    # ``self.snippet_repository`` and then overwrote it with
    # ``snippet_repository``; only the final assignment is kept.
    self.snippet_repository = snippet_repository
    self.repo_repository = repo_repository
    self.git_commit_repository = git_commit_repository
    self.git_branch_repository = git_branch_repository
    self.git_tag_repository = git_tag_repository
    self.operation = operation
    self.scanner = scanner
    self.cloner = cloner
    self.slicer = slicer
    self.queue = queue
    self.bm25_service = bm25_service
    self.code_search_service = code_search_service
    self.text_search_service = text_search_service
    self.embedding_repository = embedding_repository
    self.architecture_service = architecture_service
    self.enrichment_v2_repository = enrichment_v2_repository
    self.enricher_service = enricher_service
    self._log = structlog.get_logger(__name__)
122
+
123
async def create_git_repository(self, remote_uri: AnyUrl) -> GitRepo:
    """Register a repository for ``remote_uri`` and queue its initial tasks.

    The repository entity is built from the URI and saved. The
    ``PrescribedOperations.CREATE_NEW_REPOSITORY`` tasks are then enqueued at
    user-initiated priority, carrying the saved repository's id as payload.

    Returns:
        The repository object returned by the repository store's ``save``.

    """
    progress = self.operation.create_child(
        TaskOperation.CREATE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
    )
    async with progress:
        saved_repo = await self.repo_repository.save(
            GitRepoFactory.create_from_remote_uri(remote_uri)
        )
        task_payload = {"repository_id": saved_repo.id}
        await self.queue.enqueue_tasks(
            tasks=PrescribedOperations.CREATE_NEW_REPOSITORY,
            base_priority=QueuePriority.USER_INITIATED,
            payload=task_payload,
        )
        return saved_repo
137
+
138
async def delete_git_repository(self, repo_id: int) -> bool:
    """Delete the repository identified by ``repo_id``.

    Returns:
        ``False`` when the lookup yields no repository (nothing is deleted),
        ``True`` once ``process_delete_repo`` has completed.

    """
    existing = await self.repo_repository.get_by_id(repo_id)
    if existing:
        # Delegate to the full deletion routine so that dependent data is
        # removed together with the repository itself.
        await self.process_delete_repo(repo_id)
        return True
    return False
147
+
148
+ # TODO(Phil): Make this polymorphic
149
async def run_task(self, task: Task) -> None:  # noqa: PLR0912, C901
    """Dispatch a queued task to the matching ``process_*`` handler.

    Repository operations need ``repository_id`` in the task payload; commit
    operations need both ``repository_id`` and ``commit_sha``.

    Raises:
        ValueError: If a required payload value is missing or empty, or if
            the task type has no handler.

    """
    payload = task.payload
    operation = task.type

    if operation.is_repository_operation():
        # ``.get`` lets a payload without the key reach the explicit
        # ValueError below instead of surfacing as a bare KeyError.
        repo_id = payload.get("repository_id")
        if not repo_id:
            raise ValueError("Repository ID is required")
        if operation == TaskOperation.CLONE_REPOSITORY:
            await self.process_clone_repo(repo_id)
        elif operation == TaskOperation.SCAN_REPOSITORY:
            await self.process_scan_repo(repo_id)
        elif operation == TaskOperation.DELETE_REPOSITORY:
            await self.process_delete_repo(repo_id)
        else:
            raise ValueError(f"Unknown task type: {task.type}")
    elif operation.is_commit_operation():
        repository_id = payload.get("repository_id")
        if not repository_id:
            raise ValueError("Repository ID is required")
        commit_sha = payload.get("commit_sha")
        if not commit_sha:
            raise ValueError("Commit SHA is required")
        if operation == TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT:
            await self.process_snippets_for_commit(repository_id, commit_sha)
        elif operation == TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT:
            await self.process_bm25_index(repository_id, commit_sha)
        elif operation == TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT:
            await self.process_code_embeddings(repository_id, commit_sha)
        elif operation == TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT:
            await self.process_enrich(repository_id, commit_sha)
        elif operation == TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT:
            await self.process_summary_embeddings(repository_id, commit_sha)
        elif operation == TaskOperation.CREATE_ARCHITECTURE_ENRICHMENT_FOR_COMMIT:
            await self.process_architecture_discovery(repository_id, commit_sha)
        elif operation == TaskOperation.CREATE_PUBLIC_API_DOCS_FOR_COMMIT:
            await self.process_api_docs(repository_id, commit_sha)
        else:
            raise ValueError(f"Unknown task type: {task.type}")
    else:
        raise ValueError(f"Unknown task type: {task.type}")
188
+
189
async def process_clone_repo(self, repository_id: int) -> None:
    """Clone the repository's remote and persist the resulting local path.

    Raises:
        ValueError: If no repository exists for ``repository_id``.

    """
    async with self.operation.create_child(
        TaskOperation.CLONE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ):
        repo = await self.repo_repository.get_by_id(repository_id)
        # Other methods of this service treat a falsy lookup result as
        # "not found"; fail the same way here rather than with an
        # AttributeError on the next line.
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")
        repo.cloned_path = await self.cloner.clone_repository(repo.remote_uri)
        await self.repo_repository.save(repo)
199
+
200
async def process_scan_repo(self, repository_id: int) -> None:
    """Scan a cloned repository, store its Git metadata and queue indexing.

    The scan result is applied to the repository entity, and its commits,
    branches and tags are saved to their dedicated stores. The
    ``PrescribedOperations.INDEX_COMMIT`` tasks are then enqueued for the
    head commit of the tracking branch.

    Raises:
        ValueError: If the repository does not exist, has never been cloned,
            has no tracking branch, or has no head commit SHA.

    """
    async with self.operation.create_child(
        TaskOperation.SCAN_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        await step.set_total(6)
        repo = await self.repo_repository.get_by_id(repository_id)
        # Same "not found" handling as the other methods of this service,
        # instead of an AttributeError on ``repo.cloned_path``.
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")
        if not repo.cloned_path:
            raise ValueError(f"Repository {repository_id} has never been cloned")

        # Scan the repository to get all metadata
        await step.set_current(0, "Scanning repository")
        scan_result = await self.scanner.scan_repository(repo.cloned_path)

        # Update repo with scan result (this sets num_commits, num_branches, etc.)
        await step.set_current(1, "Updating repository with scan result")
        repo.update_with_scan_result(scan_result)
        await self.repo_repository.save(repo)

        # Save commits, branches, and tags to their dedicated repositories
        await step.set_current(2, "Saving commits")
        if scan_result.all_commits:
            await self.git_commit_repository.save_bulk(
                scan_result.all_commits, repository_id
            )

        await step.set_current(3, "Saving branches")
        if scan_result.branches:
            await self.git_branch_repository.save_bulk(
                scan_result.branches, repository_id
            )

        await step.set_current(4, "Saving tags")
        if scan_result.all_tags:
            await self.git_tag_repository.save_bulk(
                scan_result.all_tags, repository_id
            )

        await step.set_current(5, "Enqueuing commit indexing tasks")
        if not repo.tracking_branch:
            raise ValueError(f"Repository {repository_id} has no tracking branch")
        commit_sha = repo.tracking_branch.head_commit.commit_sha
        if not commit_sha:
            raise ValueError(f"Repository {repository_id} has no head commit")

        await self.queue.enqueue_tasks(
            tasks=PrescribedOperations.INDEX_COMMIT,
            base_priority=QueuePriority.USER_INITIATED,
            payload={"commit_sha": commit_sha, "repository_id": repository_id},
        )
252
+
253
async def process_delete_repo(self, repository_id: int) -> None:
    """Remove a repository together with everything indexed for it.

    Deletion order: BM25 documents and embeddings of the commits' snippets,
    commit-level enrichments, snippet associations, branches, tags, commits,
    and finally the repository itself.

    Raises:
        ValueError: If no repository exists for ``repository_id``.

    """
    progress = self.operation.create_child(
        TaskOperation.DELETE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    )
    async with progress:
        repo = await self.repo_repository.get_by_id(repository_id)
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")

        # Commit SHAs are the key for all per-commit cleanup below.
        repo_commits = await self.git_commit_repository.get_by_repo_id(
            repository_id
        )
        commit_shas = [item.commit_sha for item in repo_commits]

        # Collect snippet ids while the commit associations still exist.
        snippet_ids = []
        for sha in commit_shas:
            commit_snippets = await self.snippet_repository.get_snippets_for_commit(
                sha
            )
            snippet_ids += [item.id for item in commit_snippets if item.id]

        if snippet_ids:
            # DeleteRequest expects the ids as strings.
            await self.bm25_service.delete_documents(
                DeleteRequest(snippet_ids=[str(item) for item in snippet_ids])
            )
            for item in snippet_ids:
                await self.embedding_repository.delete_embeddings_by_snippet_id(
                    item
                )

        if commit_shas:
            await self.enrichment_v2_repository.bulk_delete_enrichments(
                entity_type="git_commit",
                entity_ids=commit_shas,
            )

        for sha in commit_shas:
            await self.snippet_repository.delete_snippets_for_commit(sha)

        # Branches and tags reference commits, so they go before the commits.
        await self.git_branch_repository.delete_by_repo_id(repository_id)
        await self.git_tag_repository.delete_by_repo_id(repository_id)
        await self.git_commit_repository.delete_by_repo_id(repository_id)

        # The repository row itself is removed last.
        await self.repo_repository.delete(repo.sanitized_remote_uri)
315
+
316
async def process_snippets_for_commit(
    self, repository_id: int, commit_sha: str
) -> None:
    """Extract, deduplicate and store the snippets of one commit.

    The step is skipped when snippets are already stored for ``commit_sha``.
    Otherwise the commit's files are read through the scanner's Git adapter,
    grouped by language, sliced into snippets, deduplicated by snippet SHA
    and saved.

    Raises:
        ValueError: If the repository does not exist or has never been
            cloned.

    """
    async with self.operation.create_child(
        operation=TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        # Have we already processed this commit? If yes, skip.
        if await self.snippet_repository.get_snippets_for_commit(commit_sha):
            await step.skip("All snippets already extracted for commit")
            return

        commit = await self.git_commit_repository.get_by_sha(commit_sha)

        # Files are loaded on demand from the clone rather than taken from
        # commit.files (which may be empty).
        repo = await self.repo_repository.get_by_id(repository_id)
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")
        if not repo.cloned_path:
            raise ValueError(f"Repository {repository_id} has never been cloned")

        files_data = await self.scanner.git_adapter.get_commit_file_data(
            repo.cloned_path, commit_sha
        )

        # Build GitFile entities with absolute paths so the slicer can read
        # the file contents from the clone.
        files = [
            GitFile(
                created_at=file_data.get("created_at", commit.date),
                blob_sha=file_data["blob_sha"],
                path=str(repo.cloned_path / file_data["path"]),
                mime_type=file_data.get("mime_type", "application/octet-stream"),
                size=file_data.get("size", 0),
                extension=Path(file_data["path"]).suffix.lstrip("."),
            )
            for file_data in files_data
        ]

        # Group files per extension once, then map each extension to its
        # language. Iterating the extensions in sorted order keeps the
        # language order (and thus the dedup winner below) deterministic.
        files_by_extension: dict[str, list[GitFile]] = defaultdict(list)
        for file in files:
            files_by_extension[file.extension].append(file)

        lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
        for ext in sorted(files_by_extension):
            try:
                lang = LanguageMapping.get_language_for_extension(ext)
            except ValueError as e:
                # Extensions without a language mapping are skipped.
                self._log.debug("Skipping", error=str(e))
                continue
            lang_files_map[lang].extend(files_by_extension[ext])

        # Extract snippets
        # NOTE(review): a fresh Slicer is built here although one is injected
        # as self.slicer; kept as-is because equivalence of the two cannot be
        # confirmed from this file.
        all_snippets: list[SnippetV2] = []
        slicer = Slicer()
        await step.set_total(len(lang_files_map))
        for i, (lang, lang_files) in enumerate(lang_files_map.items()):
            await step.set_current(i, f"Extracting snippets for {lang}")
            all_snippets.extend(
                slicer.extract_snippets_from_git_files(lang_files, language=lang)
            )

        # Deduplicate snippets by SHA before saving to prevent constraint
        # violations; for equal SHAs the last extracted snippet wins.
        unique_snippets: dict[str, SnippetV2] = {
            snippet.sha: snippet for snippet in all_snippets
        }
        deduplicated_snippets = list(unique_snippets.values())

        commit_short = commit.commit_sha[:8]
        self._log.info(
            f"Extracted {len(all_snippets)} snippets, "
            f"deduplicated to {len(deduplicated_snippets)} for {commit_short}"
        )
        await self.snippet_repository.save_snippets(
            commit.commit_sha, deduplicated_snippets
        )
401
+
402
async def process_bm25_index(self, repository_id: int, commit_sha: str) -> None:
    """Build the BM25 keyword index for the snippets of one commit.

    Every stored snippet of ``commit_sha`` that has an id is handed to the
    BM25 service as a document made of the snippet id and its content.
    """
    progress = self.operation.create_child(
        TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    )
    async with progress:
        commit_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )

        # Snippets without an id are left out of the index request.
        documents = []
        for item in commit_snippets:
            if item.id:
                documents.append(Document(snippet_id=item.id, text=item.content))

        await self.bm25_service.index_documents(IndexRequest(documents=documents))
420
+
421
async def process_code_embeddings(
    self, repository_id: int, commit_sha: str
) -> None:
    """Create code embeddings for the snippets of one commit.

    Only snippets that ``_new_snippets_for_type`` reports as lacking a CODE
    embedding are indexed; the step is skipped when there are none.
    Progress advances by the size of each result batch the embedding service
    yields.
    """
    progress = self.operation.create_child(
        TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    )
    async with progress as step:
        commit_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )
        pending = await self._new_snippets_for_type(
            commit_snippets, EmbeddingType.CODE
        )
        if not pending:
            await step.skip("All snippets already have code embeddings")
            return

        await step.set_total(len(pending))
        request = IndexRequest(
            documents=[
                Document(snippet_id=item.id, text=item.content)
                for item in pending
                if item.id
            ]
        )

        done = 0
        async for batch in self.code_search_service.index_documents(request):
            done += len(batch)
            await step.set_current(done, "Creating code embeddings for commit")
453
+
454
async def process_enrich(self, repository_id: int, commit_sha: str) -> None:
    """Add a summary enrichment to every snippet of a commit that lacks one.

    Snippets that already carry a ``SUMMARIZATION`` enrichment are left
    alone; the step is skipped when none need one. Each enricher result is
    appended to its snippet and that snippet is saved immediately.
    """
    async with self.operation.create_child(
        TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        all_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )

        # Find snippets without a summary enrichment. ``any`` is used on
        # purpose: ``next()`` without a default raises StopIteration for a
        # snippet that has enrichments but no summary, which surfaces as a
        # RuntimeError inside a coroutine.
        snippets_without_summary = [
            snippet
            for snippet in all_snippets
            if not any(
                enrichment.type == EnrichmentType.SUMMARIZATION
                for enrichment in snippet.enrichments or []
            )
        ]
        if not snippets_without_summary:
            await step.skip("All snippets already have a summary enrichment")
            return

        # Enrich snippets
        await step.set_total(len(snippets_without_summary))
        # Results are matched back to snippets by id, so snippets without an
        # id cannot be enriched and are left out.
        snippet_map = {
            snippet.id: snippet
            for snippet in snippets_without_summary
            if snippet.id
        }

        enrichment_requests = [
            GenericEnrichmentRequest(
                id=snippet_id,
                text=snippet.content,
                system_prompt=SUMMARIZATION_SYSTEM_PROMPT,
            )
            for snippet_id, snippet in snippet_map.items()
        ]

        processed = 0
        async for result in self.enricher_service.enrich(enrichment_requests):
            snippet = snippet_map[result.id]
            snippet.enrichments.append(
                Enrichment(type=EnrichmentType.SUMMARIZATION, content=result.text)
            )

            # Saved per result, so finished summaries are stored even if a
            # later request fails.
            await self.snippet_repository.save_snippets(commit_sha, [snippet])

            processed += 1
            await step.set_current(processed, "Enriching snippets for commit")
508
+
509
async def process_summary_embeddings(
    self, repository_id: int, commit_sha: str
) -> None:
    """Create text embeddings from the summaries of a commit's snippets.

    Only snippets that ``_new_snippets_for_type`` reports as lacking a TEXT
    embedding, and that have a non-empty ``SUMMARIZATION`` enrichment, are
    indexed. The step is skipped when no such snippet exists.
    """
    async with self.operation.create_child(
        TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)

        new_snippets = await self._new_snippets_for_type(
            snippets, EmbeddingType.TEXT
        )
        if not new_snippets:
            await step.skip("All snippets already have text embeddings")
            return

        def _summary_from_enrichments(enrichments: list[Enrichment]) -> str:
            # The "" default covers both "no enrichments" and "enrichments
            # but no summary"; without it ``next`` raises StopIteration.
            return next(
                (
                    enrichment.content
                    for enrichment in enrichments or []
                    if enrichment.type == EnrichmentType.SUMMARIZATION
                ),
                "",
            )

        # Built from new_snippets only, so snippets that already have a text
        # embedding are not indexed a second time. Snippets without an id or
        # without a summary are dropped.
        snippet_summary_map: dict = {}
        for snippet in new_snippets:
            if not snippet.id:
                continue
            summary = _summary_from_enrichments(snippet.enrichments)
            if summary:
                snippet_summary_map[snippet.id] = summary

        if not snippet_summary_map:
            await step.skip("No snippets with summaries to create text embeddings")
            return

        # The total is the number of documents actually sent for embedding.
        await step.set_total(len(snippet_summary_map))
        processed = 0

        documents_with_summaries = [
            Document(snippet_id=snippet_id, text=snippet_summary)
            for snippet_id, snippet_summary in snippet_summary_map.items()
        ]
        async for result in self.text_search_service.index_documents(
            IndexRequest(documents=documents_with_summaries)
        ):
            processed += len(result)
            await step.set_current(processed, "Creating text embeddings for commit")
557
+
558
async def process_architecture_discovery(
    self, repository_id: int, commit_sha: str
) -> None:
    """Discover the physical architecture of a commit and store it.

    The step is skipped when an enrichment of type ``"architecture"``
    already exists for the commit. Otherwise the architecture narrative is
    discovered from the clone, passed through the enricher, and saved as a
    ``PhysicalArchitectureEnrichment`` for ``commit_sha``.

    Raises:
        ValueError: If the repository does not exist or has never been
            cloned.

    """
    async with self.operation.create_child(
        TaskOperation.CREATE_ARCHITECTURE_ENRICHMENT_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        await step.set_total(3)

        # Check if architecture enrichment already exists for this commit
        enrichment_repo = self.enrichment_v2_repository
        existing_enrichments = await enrichment_repo.enrichments_for_entity_type(
            entity_type="git_commit",
            entity_ids=[commit_sha],
        )
        has_architecture = any(
            enrichment.type == "architecture" for enrichment in existing_enrichments
        )
        if has_architecture:
            await step.skip("Architecture enrichment already exists for commit")
            return

        # Get repository path
        repo = await self.repo_repository.get_by_id(repository_id)
        # Same "not found" handling as the other methods of this service,
        # instead of an AttributeError on ``repo.cloned_path``.
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")
        if not repo.cloned_path:
            raise ValueError(f"Repository {repository_id} has never been cloned")

        await step.set_current(1, "Discovering physical architecture")

        # Discover architecture
        architecture_narrative = (
            await self.architecture_service.discover_architecture(repo.cloned_path)
        )

        await step.set_current(2, "Enriching architecture notes with LLM")

        # Enrich the architecture narrative through the enricher
        enrichment_request = GenericEnrichmentRequest(
            id=commit_sha,
            text=ARCHITECTURE_ENRICHMENT_TASK_PROMPT.format(
                architecture_narrative=architecture_narrative,
            ),
            system_prompt=ARCHITECTURE_ENRICHMENT_SYSTEM_PROMPT,
        )

        # A single request is sent; the text of the last response is kept.
        # If the enricher yields nothing the content stays empty.
        enriched_content = ""
        async for response in self.enricher_service.enrich([enrichment_request]):
            enriched_content = response.text

        # Create and save architecture enrichment with enriched content
        architecture_enrichment = PhysicalArchitectureEnrichment(
            entity_id=commit_sha,
            content=enriched_content,
        )

        await self.enrichment_v2_repository.bulk_save_enrichments(
            [architecture_enrichment]
        )

        await step.set_current(3, "Architecture enrichment completed")
623
+
624
async def process_api_docs(self, repository_id: int, commit_sha: str) -> None:
    """Generate public API documentation enrichments for one commit.

    The step is skipped when a usage enrichment with the API-docs subtype
    already exists for the commit. Otherwise the commit's files are grouped
    by language, API docs are extracted per language (private members
    excluded), and the resulting enrichments are saved in bulk.

    Raises:
        ValueError: If no repository exists for ``repository_id``.

    """
    async with self.operation.create_child(
        TaskOperation.CREATE_PUBLIC_API_DOCS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        # Check if API docs already exist for this commit
        existing_enrichments = (
            await self.enrichment_v2_repository.enrichments_for_entity_type(
                entity_type="git_commit",
                entity_ids=[commit_sha],
            )
        )

        has_api_docs = any(
            e.type == ENRICHMENT_TYPE_USAGE
            and e.subtype == ENRICHMENT_SUBTYPE_API_DOCS
            for e in existing_enrichments
        )

        if has_api_docs:
            await step.skip("API docs already exist for commit")
            return

        # The repository is only looked up to verify that it exists. The
        # original also evaluated ``str(repo.sanitized_remote_uri)`` and
        # discarded the result; that no-op statement is removed.
        repo = await self.repo_repository.get_by_id(repository_id)
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")

        commit = await self.git_commit_repository.get_by_sha(commit_sha)

        # Group files by language; files whose extension has no language
        # mapping are ignored.
        lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
        for file in commit.files:
            try:
                lang = LanguageMapping.get_language_for_extension(file.extension)
            except ValueError:
                continue
            lang_files_map[lang].append(file)

        all_enrichments = []
        extractor = APIDocExtractor()

        await step.set_total(len(lang_files_map))
        for i, (lang, lang_files) in enumerate(lang_files_map.items()):
            await step.set_current(i, f"Extracting API docs for {lang}")
            enrichments = extractor.extract_api_docs(
                lang_files,
                lang,
                commit_sha,
                include_private=False,
            )
            all_enrichments.extend(enrichments)

        # Save all enrichments
        if all_enrichments:
            await self.enrichment_v2_repository.bulk_save_enrichments(
                all_enrichments
            )
685
+
686
async def _new_snippets_for_type(
    self, all_snippets: list[SnippetV2], embedding_type: EmbeddingType
) -> list[SnippetV2]:
    """Return the snippets that have no embedding of ``embedding_type`` yet.

    The embedding store is queried once with the ids of all given snippets;
    a snippet is kept when its id is not among the ``snippet_id`` values of
    the embeddings returned. Input order is preserved.
    """
    candidate_ids = [item.id for item in all_snippets]
    found = await self.embedding_repository.list_embeddings_by_snippet_ids_and_type(
        candidate_ids, embedding_type
    )
    already_embedded = {row.snippet_id for row in found}
    return [item for item in all_snippets if item.id not in already_embedded]