kodit 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/factories/server_factory.py +54 -32
- kodit/application/services/code_search_application_service.py +89 -12
- kodit/application/services/commit_indexing_application_service.py +314 -195
- kodit/application/services/enrichment_query_service.py +274 -43
- kodit/application/services/indexing_worker_service.py +1 -1
- kodit/application/services/queue_service.py +15 -10
- kodit/application/services/sync_scheduler.py +2 -1
- kodit/domain/enrichments/architecture/architecture.py +1 -1
- kodit/domain/enrichments/architecture/physical/physical.py +1 -1
- kodit/domain/enrichments/development/development.py +1 -1
- kodit/domain/enrichments/development/snippet/snippet.py +12 -5
- kodit/domain/enrichments/enrichment.py +31 -4
- kodit/domain/enrichments/usage/api_docs.py +1 -1
- kodit/domain/enrichments/usage/usage.py +1 -1
- kodit/domain/entities/git.py +30 -25
- kodit/domain/factories/git_repo_factory.py +20 -5
- kodit/domain/protocols.py +56 -125
- kodit/domain/services/embedding_service.py +14 -16
- kodit/domain/services/git_repository_service.py +60 -38
- kodit/domain/services/git_service.py +18 -11
- kodit/domain/tracking/resolution_service.py +6 -16
- kodit/domain/value_objects.py +2 -9
- kodit/infrastructure/api/v1/dependencies.py +12 -3
- kodit/infrastructure/api/v1/query_params.py +27 -0
- kodit/infrastructure/api/v1/routers/commits.py +91 -85
- kodit/infrastructure/api/v1/routers/repositories.py +53 -37
- kodit/infrastructure/api/v1/routers/search.py +1 -1
- kodit/infrastructure/api/v1/schemas/enrichment.py +14 -0
- kodit/infrastructure/api/v1/schemas/repository.py +1 -1
- kodit/infrastructure/slicing/api_doc_extractor.py +0 -2
- kodit/infrastructure/sqlalchemy/embedding_repository.py +44 -34
- kodit/infrastructure/sqlalchemy/enrichment_association_repository.py +73 -0
- kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +116 -97
- kodit/infrastructure/sqlalchemy/entities.py +12 -116
- kodit/infrastructure/sqlalchemy/git_branch_repository.py +52 -244
- kodit/infrastructure/sqlalchemy/git_commit_repository.py +35 -324
- kodit/infrastructure/sqlalchemy/git_file_repository.py +70 -0
- kodit/infrastructure/sqlalchemy/git_repository.py +60 -230
- kodit/infrastructure/sqlalchemy/git_tag_repository.py +53 -240
- kodit/infrastructure/sqlalchemy/query.py +331 -0
- kodit/infrastructure/sqlalchemy/repository.py +203 -0
- kodit/infrastructure/sqlalchemy/task_repository.py +79 -58
- kodit/infrastructure/sqlalchemy/task_status_repository.py +45 -52
- kodit/migrations/versions/4b1a3b2c8fa5_refactor_git_tracking.py +190 -0
- {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/METADATA +1 -1
- {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/RECORD +50 -48
- kodit/infrastructure/mappers/enrichment_mapper.py +0 -83
- kodit/infrastructure/mappers/git_mapper.py +0 -193
- kodit/infrastructure/mappers/snippet_mapper.py +0 -104
- kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +0 -479
- {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/WHEEL +0 -0
- {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/entry_points.txt +0 -0
- {kodit-0.5.4.dist-info → kodit-0.5.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -2,30 +2,45 @@
|
|
|
2
2
|
|
|
3
3
|
from collections import defaultdict
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
5
6
|
|
|
6
7
|
import structlog
|
|
7
8
|
from pydantic import AnyUrl
|
|
8
9
|
|
|
9
10
|
from kodit.application.services.queue_service import QueueService
|
|
10
11
|
from kodit.application.services.reporting import ProgressTracker
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from kodit.application.services.enrichment_query_service import (
|
|
15
|
+
EnrichmentQueryService,
|
|
16
|
+
)
|
|
11
17
|
from kodit.domain.enrichments.architecture.physical.physical import (
|
|
12
18
|
PhysicalArchitectureEnrichment,
|
|
13
19
|
)
|
|
20
|
+
from kodit.domain.enrichments.development.snippet.snippet import (
|
|
21
|
+
SnippetEnrichment,
|
|
22
|
+
SnippetEnrichmentSummary,
|
|
23
|
+
)
|
|
14
24
|
from kodit.domain.enrichments.enricher import Enricher
|
|
25
|
+
from kodit.domain.enrichments.enrichment import (
|
|
26
|
+
CommitEnrichmentAssociation,
|
|
27
|
+
EnrichmentAssociation,
|
|
28
|
+
EnrichmentV2,
|
|
29
|
+
)
|
|
15
30
|
from kodit.domain.enrichments.request import (
|
|
16
31
|
EnrichmentRequest as GenericEnrichmentRequest,
|
|
17
32
|
)
|
|
18
|
-
from kodit.domain.enrichments.usage.api_docs import ENRICHMENT_SUBTYPE_API_DOCS
|
|
19
|
-
from kodit.domain.enrichments.usage.usage import ENRICHMENT_TYPE_USAGE
|
|
20
33
|
from kodit.domain.entities import Task
|
|
21
|
-
from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2
|
|
34
|
+
from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2, TrackingType
|
|
22
35
|
from kodit.domain.factories.git_repo_factory import GitRepoFactory
|
|
23
36
|
from kodit.domain.protocols import (
|
|
37
|
+
EnrichmentAssociationRepository,
|
|
38
|
+
EnrichmentV2Repository,
|
|
24
39
|
GitBranchRepository,
|
|
25
40
|
GitCommitRepository,
|
|
41
|
+
GitFileRepository,
|
|
26
42
|
GitRepoRepository,
|
|
27
43
|
GitTagRepository,
|
|
28
|
-
SnippetRepositoryV2,
|
|
29
44
|
)
|
|
30
45
|
from kodit.domain.services.bm25_service import BM25DomainService
|
|
31
46
|
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
@@ -41,8 +56,6 @@ from kodit.domain.services.physical_architecture_service import (
|
|
|
41
56
|
from kodit.domain.value_objects import (
|
|
42
57
|
DeleteRequest,
|
|
43
58
|
Document,
|
|
44
|
-
Enrichment,
|
|
45
|
-
EnrichmentType,
|
|
46
59
|
IndexRequest,
|
|
47
60
|
LanguageMapping,
|
|
48
61
|
PrescribedOperations,
|
|
@@ -52,13 +65,17 @@ from kodit.domain.value_objects import (
|
|
|
52
65
|
)
|
|
53
66
|
from kodit.infrastructure.slicing.api_doc_extractor import APIDocExtractor
|
|
54
67
|
from kodit.infrastructure.slicing.slicer import Slicer
|
|
68
|
+
from kodit.infrastructure.sqlalchemy import entities as db_entities
|
|
55
69
|
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
56
70
|
SqlAlchemyEmbeddingRepository,
|
|
57
71
|
)
|
|
58
|
-
from kodit.infrastructure.sqlalchemy.enrichment_v2_repository import (
|
|
59
|
-
EnrichmentV2Repository,
|
|
60
|
-
)
|
|
61
72
|
from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
|
|
73
|
+
from kodit.infrastructure.sqlalchemy.query import (
|
|
74
|
+
EnrichmentAssociationQueryBuilder,
|
|
75
|
+
FilterOperator,
|
|
76
|
+
GitFileQueryBuilder,
|
|
77
|
+
QueryBuilder,
|
|
78
|
+
)
|
|
62
79
|
|
|
63
80
|
SUMMARIZATION_SYSTEM_PROMPT = """
|
|
64
81
|
You are a professional software developer. You will be given a snippet of code.
|
|
@@ -71,15 +88,14 @@ class CommitIndexingApplicationService:
|
|
|
71
88
|
|
|
72
89
|
def __init__( # noqa: PLR0913
|
|
73
90
|
self,
|
|
74
|
-
snippet_v2_repository: SnippetRepositoryV2,
|
|
75
91
|
repo_repository: GitRepoRepository,
|
|
76
92
|
git_commit_repository: GitCommitRepository,
|
|
93
|
+
git_file_repository: GitFileRepository,
|
|
77
94
|
git_branch_repository: GitBranchRepository,
|
|
78
95
|
git_tag_repository: GitTagRepository,
|
|
79
96
|
operation: ProgressTracker,
|
|
80
97
|
scanner: GitRepositoryScanner,
|
|
81
98
|
cloner: RepositoryCloner,
|
|
82
|
-
snippet_repository: SnippetRepositoryV2,
|
|
83
99
|
slicer: Slicer,
|
|
84
100
|
queue: QueueService,
|
|
85
101
|
bm25_service: BM25DomainService,
|
|
@@ -87,28 +103,20 @@ class CommitIndexingApplicationService:
|
|
|
87
103
|
text_search_service: EmbeddingDomainService,
|
|
88
104
|
embedding_repository: SqlAlchemyEmbeddingRepository,
|
|
89
105
|
architecture_service: PhysicalArchitectureService,
|
|
90
|
-
enrichment_v2_repository: EnrichmentV2Repository,
|
|
91
106
|
enricher_service: Enricher,
|
|
107
|
+
enrichment_v2_repository: EnrichmentV2Repository,
|
|
108
|
+
enrichment_association_repository: EnrichmentAssociationRepository,
|
|
109
|
+
enrichment_query_service: "EnrichmentQueryService",
|
|
92
110
|
) -> None:
|
|
93
|
-
"""Initialize the commit indexing application service.
|
|
94
|
-
|
|
95
|
-
Args:
|
|
96
|
-
commit_index_repository: Repository for commit index data.
|
|
97
|
-
snippet_v2_repository: Repository for snippet data.
|
|
98
|
-
repo_repository: Repository for Git repository data.
|
|
99
|
-
domain_indexer: Domain service for indexing operations.
|
|
100
|
-
operation: Progress tracker for reporting operations.
|
|
101
|
-
|
|
102
|
-
"""
|
|
103
|
-
self.snippet_repository = snippet_v2_repository
|
|
111
|
+
"""Initialize the commit indexing application service."""
|
|
104
112
|
self.repo_repository = repo_repository
|
|
105
113
|
self.git_commit_repository = git_commit_repository
|
|
114
|
+
self.git_file_repository = git_file_repository
|
|
106
115
|
self.git_branch_repository = git_branch_repository
|
|
107
116
|
self.git_tag_repository = git_tag_repository
|
|
108
117
|
self.operation = operation
|
|
109
118
|
self.scanner = scanner
|
|
110
119
|
self.cloner = cloner
|
|
111
|
-
self.snippet_repository = snippet_repository
|
|
112
120
|
self.slicer = slicer
|
|
113
121
|
self.queue = queue
|
|
114
122
|
self.bm25_service = bm25_service
|
|
@@ -117,7 +125,9 @@ class CommitIndexingApplicationService:
|
|
|
117
125
|
self.embedding_repository = embedding_repository
|
|
118
126
|
self.architecture_service = architecture_service
|
|
119
127
|
self.enrichment_v2_repository = enrichment_v2_repository
|
|
128
|
+
self.enrichment_association_repository = enrichment_association_repository
|
|
120
129
|
self.enricher_service = enricher_service
|
|
130
|
+
self.enrichment_query_service = enrichment_query_service
|
|
121
131
|
self._log = structlog.get_logger(__name__)
|
|
122
132
|
|
|
123
133
|
async def create_git_repository(self, remote_uri: AnyUrl) -> GitRepo:
|
|
@@ -137,7 +147,7 @@ class CommitIndexingApplicationService:
|
|
|
137
147
|
|
|
138
148
|
async def delete_git_repository(self, repo_id: int) -> bool:
|
|
139
149
|
"""Delete a Git repository by ID."""
|
|
140
|
-
repo = await self.repo_repository.
|
|
150
|
+
repo = await self.repo_repository.get(repo_id)
|
|
141
151
|
if not repo:
|
|
142
152
|
return False
|
|
143
153
|
|
|
@@ -193,7 +203,7 @@ class CommitIndexingApplicationService:
|
|
|
193
203
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
194
204
|
trackable_id=repository_id,
|
|
195
205
|
):
|
|
196
|
-
repo = await self.repo_repository.
|
|
206
|
+
repo = await self.repo_repository.get(repository_id)
|
|
197
207
|
repo.cloned_path = await self.cloner.clone_repository(repo.remote_uri)
|
|
198
208
|
await self.repo_repository.save(repo)
|
|
199
209
|
|
|
@@ -204,45 +214,57 @@ class CommitIndexingApplicationService:
|
|
|
204
214
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
205
215
|
trackable_id=repository_id,
|
|
206
216
|
) as step:
|
|
207
|
-
await step.set_total(
|
|
208
|
-
repo = await self.repo_repository.
|
|
217
|
+
await step.set_total(7)
|
|
218
|
+
repo = await self.repo_repository.get(repository_id)
|
|
209
219
|
if not repo.cloned_path:
|
|
210
220
|
raise ValueError(f"Repository {repository_id} has never been cloned")
|
|
211
221
|
|
|
212
222
|
# Scan the repository to get all metadata
|
|
213
223
|
await step.set_current(0, "Scanning repository")
|
|
214
|
-
scan_result = await self.scanner.scan_repository(
|
|
224
|
+
scan_result = await self.scanner.scan_repository(
|
|
225
|
+
repo.cloned_path, repository_id
|
|
226
|
+
)
|
|
215
227
|
|
|
216
228
|
# Update repo with scan result (this sets num_commits, num_branches, etc.)
|
|
217
229
|
await step.set_current(1, "Updating repository with scan result")
|
|
218
230
|
repo.update_with_scan_result(scan_result)
|
|
219
231
|
await self.repo_repository.save(repo)
|
|
220
232
|
|
|
221
|
-
# Save commits, branches, and tags to their dedicated repositories
|
|
222
233
|
await step.set_current(2, "Saving commits")
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
234
|
+
await self.git_commit_repository.save_bulk(scan_result.all_commits)
|
|
235
|
+
|
|
236
|
+
await step.set_current(3, "Saving files")
|
|
237
|
+
await self.git_file_repository.save_bulk(scan_result.all_files)
|
|
227
238
|
|
|
228
|
-
await step.set_current(
|
|
239
|
+
await step.set_current(4, "Saving branches")
|
|
229
240
|
if scan_result.branches:
|
|
230
241
|
await self.git_branch_repository.save_bulk(
|
|
231
|
-
scan_result.branches,
|
|
242
|
+
scan_result.branches,
|
|
232
243
|
)
|
|
233
244
|
|
|
234
|
-
await step.set_current(
|
|
245
|
+
await step.set_current(5, "Saving tags")
|
|
235
246
|
if scan_result.all_tags:
|
|
236
247
|
await self.git_tag_repository.save_bulk(
|
|
237
|
-
scan_result.all_tags,
|
|
248
|
+
scan_result.all_tags,
|
|
238
249
|
)
|
|
239
250
|
|
|
240
|
-
await step.set_current(
|
|
241
|
-
if not repo.
|
|
251
|
+
await step.set_current(6, "Enqueuing commit indexing tasks")
|
|
252
|
+
if not repo.tracking_config.name:
|
|
242
253
|
raise ValueError(f"Repository {repository_id} has no tracking branch")
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
254
|
+
if repo.tracking_config.type == TrackingType.BRANCH.value:
|
|
255
|
+
branch = await self.git_branch_repository.get_by_name(
|
|
256
|
+
repo.tracking_config.name, repository_id
|
|
257
|
+
)
|
|
258
|
+
commit_sha = branch.head_commit_sha
|
|
259
|
+
elif repo.tracking_config.type == TrackingType.TAG.value:
|
|
260
|
+
tag = await self.git_tag_repository.get_by_name(
|
|
261
|
+
repo.tracking_config.name, repository_id
|
|
262
|
+
)
|
|
263
|
+
commit_sha = tag.target_commit_sha
|
|
264
|
+
elif repo.tracking_config.type == TrackingType.COMMIT_SHA.value:
|
|
265
|
+
commit_sha = repo.tracking_config.name
|
|
266
|
+
else:
|
|
267
|
+
raise ValueError(f"Unknown tracking type: {repo.tracking_config.type}")
|
|
246
268
|
|
|
247
269
|
await self.queue.enqueue_tasks(
|
|
248
270
|
tasks=PrescribedOperations.INDEX_COMMIT,
|
|
@@ -250,6 +272,74 @@ class CommitIndexingApplicationService:
|
|
|
250
272
|
payload={"commit_sha": commit_sha, "repository_id": repository_id},
|
|
251
273
|
)
|
|
252
274
|
|
|
275
|
+
async def _delete_snippet_enrichments_for_commits(
|
|
276
|
+
self, commit_shas: list[str]
|
|
277
|
+
) -> None:
|
|
278
|
+
"""Delete snippet enrichments and their indices for commits."""
|
|
279
|
+
# Get all snippet enrichment IDs for these commits
|
|
280
|
+
all_snippet_enrichment_ids = []
|
|
281
|
+
for commit_sha in commit_shas:
|
|
282
|
+
snippet_enrichments = (
|
|
283
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
284
|
+
commit_sha
|
|
285
|
+
)
|
|
286
|
+
)
|
|
287
|
+
enrichment_ids = [
|
|
288
|
+
enrichment.id for enrichment in snippet_enrichments if enrichment.id
|
|
289
|
+
]
|
|
290
|
+
all_snippet_enrichment_ids.extend(enrichment_ids)
|
|
291
|
+
|
|
292
|
+
if not all_snippet_enrichment_ids:
|
|
293
|
+
return
|
|
294
|
+
|
|
295
|
+
# Delete from BM25 and embedding indices
|
|
296
|
+
snippet_id_strings = [str(sid) for sid in all_snippet_enrichment_ids]
|
|
297
|
+
delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
|
|
298
|
+
await self.bm25_service.delete_documents(delete_request)
|
|
299
|
+
|
|
300
|
+
for snippet_id in all_snippet_enrichment_ids:
|
|
301
|
+
await self.embedding_repository.delete_embeddings_by_snippet_id(
|
|
302
|
+
str(snippet_id)
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
# Delete enrichment associations for snippets
|
|
306
|
+
await self.enrichment_association_repository.delete_by_query(
|
|
307
|
+
QueryBuilder()
|
|
308
|
+
.filter("entity_type", FilterOperator.EQ, "snippet_v2")
|
|
309
|
+
.filter("entity_id", FilterOperator.IN, snippet_id_strings)
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
# Delete the enrichments themselves
|
|
313
|
+
await self.enrichment_v2_repository.delete_by_query(
|
|
314
|
+
QueryBuilder().filter("id", FilterOperator.IN, all_snippet_enrichment_ids)
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
async def _delete_commit_enrichments(self, commit_shas: list[str]) -> None:
|
|
318
|
+
"""Delete commit-level enrichments for commits."""
|
|
319
|
+
existing_enrichment_associations = (
|
|
320
|
+
await self.enrichment_association_repository.find(
|
|
321
|
+
QueryBuilder()
|
|
322
|
+
.filter(
|
|
323
|
+
"entity_type",
|
|
324
|
+
FilterOperator.EQ,
|
|
325
|
+
db_entities.GitCommit.__tablename__,
|
|
326
|
+
)
|
|
327
|
+
.filter("entity_id", FilterOperator.IN, commit_shas)
|
|
328
|
+
)
|
|
329
|
+
)
|
|
330
|
+
enrichment_ids = [a.enrichment_id for a in existing_enrichment_associations]
|
|
331
|
+
if not enrichment_ids:
|
|
332
|
+
return
|
|
333
|
+
|
|
334
|
+
# Delete associations first
|
|
335
|
+
await self.enrichment_association_repository.delete_by_query(
|
|
336
|
+
QueryBuilder().filter("enrichment_id", FilterOperator.IN, enrichment_ids)
|
|
337
|
+
)
|
|
338
|
+
# Then delete enrichments
|
|
339
|
+
await self.enrichment_v2_repository.delete_by_query(
|
|
340
|
+
QueryBuilder().filter("id", FilterOperator.IN, enrichment_ids)
|
|
341
|
+
)
|
|
342
|
+
|
|
253
343
|
async def process_delete_repo(self, repository_id: int) -> None:
|
|
254
344
|
"""Delete a repository."""
|
|
255
345
|
async with self.operation.create_child(
|
|
@@ -257,61 +347,34 @@ class CommitIndexingApplicationService:
|
|
|
257
347
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
258
348
|
trackable_id=repository_id,
|
|
259
349
|
):
|
|
260
|
-
repo = await self.repo_repository.
|
|
350
|
+
repo = await self.repo_repository.get(repository_id)
|
|
261
351
|
if not repo:
|
|
262
352
|
raise ValueError(f"Repository {repository_id} not found")
|
|
263
353
|
|
|
264
|
-
# Get all commit SHAs for this repository
|
|
265
|
-
commits = await self.git_commit_repository.
|
|
354
|
+
# Get all commit SHAs for this repository
|
|
355
|
+
commits = await self.git_commit_repository.find(
|
|
356
|
+
QueryBuilder().filter("repo_id", FilterOperator.EQ, repository_id)
|
|
357
|
+
)
|
|
266
358
|
commit_shas = [commit.commit_sha for commit in commits]
|
|
267
359
|
|
|
268
|
-
#
|
|
269
|
-
# (before deleting the associations)
|
|
270
|
-
all_snippet_ids = []
|
|
360
|
+
# Delete all enrichments and their indices
|
|
271
361
|
if commit_shas:
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
commit_sha
|
|
275
|
-
)
|
|
276
|
-
all_snippet_ids.extend(
|
|
277
|
-
[snippet.id for snippet in snippets if snippet.id]
|
|
278
|
-
)
|
|
362
|
+
await self._delete_snippet_enrichments_for_commits(commit_shas)
|
|
363
|
+
await self._delete_commit_enrichments(commit_shas)
|
|
279
364
|
|
|
280
|
-
#
|
|
281
|
-
if all_snippet_ids:
|
|
282
|
-
# Convert to strings as DeleteRequest expects list[str]
|
|
283
|
-
snippet_id_strings = [str(snippet_id) for snippet_id in all_snippet_ids]
|
|
284
|
-
delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
|
|
285
|
-
await self.bm25_service.delete_documents(delete_request)
|
|
286
|
-
|
|
287
|
-
# Delete embeddings for each snippet
|
|
288
|
-
for snippet_id in all_snippet_ids:
|
|
289
|
-
await self.embedding_repository.delete_embeddings_by_snippet_id(
|
|
290
|
-
snippet_id
|
|
291
|
-
)
|
|
292
|
-
|
|
293
|
-
# Step 3: Delete enrichments for all commits
|
|
294
|
-
if commit_shas:
|
|
295
|
-
await self.enrichment_v2_repository.bulk_delete_enrichments(
|
|
296
|
-
entity_type="git_commit",
|
|
297
|
-
entity_ids=commit_shas,
|
|
298
|
-
)
|
|
299
|
-
|
|
300
|
-
# Step 4: Delete snippet associations for all commits
|
|
301
|
-
for commit_sha in commit_shas:
|
|
302
|
-
await self.snippet_repository.delete_snippets_for_commit(commit_sha)
|
|
303
|
-
|
|
304
|
-
# Step 5: Delete branches (they reference commits via head_commit_sha)
|
|
365
|
+
# Delete branches, tags, files, commits, and repository
|
|
305
366
|
await self.git_branch_repository.delete_by_repo_id(repository_id)
|
|
306
|
-
|
|
307
|
-
# Step 6: Delete tags (they reference commits via target_commit_sha)
|
|
308
367
|
await self.git_tag_repository.delete_by_repo_id(repository_id)
|
|
309
368
|
|
|
310
|
-
|
|
311
|
-
|
|
369
|
+
for commit_sha in commit_shas:
|
|
370
|
+
await self.git_file_repository.delete_by_commit_sha(commit_sha)
|
|
371
|
+
|
|
372
|
+
await self.git_commit_repository.delete_by_query(
|
|
373
|
+
QueryBuilder().filter("repo_id", FilterOperator.EQ, repository_id)
|
|
374
|
+
)
|
|
312
375
|
|
|
313
|
-
|
|
314
|
-
|
|
376
|
+
if repo.id:
|
|
377
|
+
await self.repo_repository.delete(repo)
|
|
315
378
|
|
|
316
379
|
async def process_snippets_for_commit(
|
|
317
380
|
self, repository_id: int, commit_sha: str
|
|
@@ -322,16 +385,16 @@ class CommitIndexingApplicationService:
|
|
|
322
385
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
323
386
|
trackable_id=repository_id,
|
|
324
387
|
) as step:
|
|
325
|
-
#
|
|
326
|
-
if await self.
|
|
327
|
-
await step.skip("
|
|
388
|
+
# Find existing snippet enrichments for this commit
|
|
389
|
+
if await self.enrichment_query_service.has_snippets_for_commit(commit_sha):
|
|
390
|
+
await step.skip("Snippets already extracted for commit")
|
|
328
391
|
return
|
|
329
392
|
|
|
330
|
-
commit = await self.git_commit_repository.
|
|
393
|
+
commit = await self.git_commit_repository.get(commit_sha)
|
|
331
394
|
|
|
332
395
|
# Load files on demand for snippet extraction (performance optimization)
|
|
333
396
|
# Instead of using commit.files (which may be empty), load files directly
|
|
334
|
-
repo = await self.repo_repository.
|
|
397
|
+
repo = await self.repo_repository.get(repository_id)
|
|
335
398
|
if not repo.cloned_path:
|
|
336
399
|
raise ValueError(f"Repository {repository_id} has never been cloned")
|
|
337
400
|
|
|
@@ -350,6 +413,7 @@ class CommitIndexingApplicationService:
|
|
|
350
413
|
absolute_path = str(repo.cloned_path / file_data["path"])
|
|
351
414
|
|
|
352
415
|
git_file = GitFile(
|
|
416
|
+
commit_sha=commit.commit_sha,
|
|
353
417
|
created_at=file_data.get("created_at", commit.date),
|
|
354
418
|
blob_sha=file_data["blob_sha"],
|
|
355
419
|
path=absolute_path, # Use absolute path for file reading
|
|
@@ -395,8 +459,27 @@ class CommitIndexingApplicationService:
|
|
|
395
459
|
f"Extracted {len(all_snippets)} snippets, "
|
|
396
460
|
f"deduplicated to {len(deduplicated_snippets)} for {commit_short}"
|
|
397
461
|
)
|
|
398
|
-
|
|
399
|
-
|
|
462
|
+
|
|
463
|
+
saved_enrichments = await self.enrichment_v2_repository.save_bulk(
|
|
464
|
+
[
|
|
465
|
+
SnippetEnrichment(content=snippet.content)
|
|
466
|
+
for snippet in deduplicated_snippets
|
|
467
|
+
]
|
|
468
|
+
)
|
|
469
|
+
saved_associations = await self.enrichment_association_repository.save_bulk(
|
|
470
|
+
[
|
|
471
|
+
EnrichmentAssociation(
|
|
472
|
+
enrichment_id=enrichment.id,
|
|
473
|
+
entity_type=db_entities.GitCommit.__tablename__,
|
|
474
|
+
entity_id=commit_sha,
|
|
475
|
+
)
|
|
476
|
+
for enrichment in saved_enrichments
|
|
477
|
+
if enrichment.id
|
|
478
|
+
]
|
|
479
|
+
)
|
|
480
|
+
self._log.info(
|
|
481
|
+
f"Saved {len(saved_enrichments)} snippet enrichments and "
|
|
482
|
+
f"{len(saved_associations)} associations for commit {commit_sha}"
|
|
400
483
|
)
|
|
401
484
|
|
|
402
485
|
async def process_bm25_index(self, repository_id: int, commit_sha: str) -> None:
|
|
@@ -406,13 +489,16 @@ class CommitIndexingApplicationService:
|
|
|
406
489
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
407
490
|
trackable_id=repository_id,
|
|
408
491
|
):
|
|
409
|
-
|
|
410
|
-
|
|
492
|
+
existing_enrichments = (
|
|
493
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
494
|
+
commit_sha
|
|
495
|
+
)
|
|
496
|
+
)
|
|
411
497
|
await self.bm25_service.index_documents(
|
|
412
498
|
IndexRequest(
|
|
413
499
|
documents=[
|
|
414
|
-
Document(snippet_id=snippet.id, text=snippet.content)
|
|
415
|
-
for snippet in
|
|
500
|
+
Document(snippet_id=str(snippet.id), text=snippet.content)
|
|
501
|
+
for snippet in existing_enrichments
|
|
416
502
|
if snippet.id
|
|
417
503
|
]
|
|
418
504
|
)
|
|
@@ -427,12 +513,14 @@ class CommitIndexingApplicationService:
|
|
|
427
513
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
428
514
|
trackable_id=repository_id,
|
|
429
515
|
) as step:
|
|
430
|
-
|
|
431
|
-
|
|
516
|
+
existing_enrichments = (
|
|
517
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
518
|
+
commit_sha
|
|
519
|
+
)
|
|
432
520
|
)
|
|
433
521
|
|
|
434
522
|
new_snippets = await self._new_snippets_for_type(
|
|
435
|
-
|
|
523
|
+
existing_enrichments, EmbeddingType.CODE
|
|
436
524
|
)
|
|
437
525
|
if not new_snippets:
|
|
438
526
|
await step.skip("All snippets already have code embeddings")
|
|
@@ -441,7 +529,7 @@ class CommitIndexingApplicationService:
|
|
|
441
529
|
await step.set_total(len(new_snippets))
|
|
442
530
|
processed = 0
|
|
443
531
|
documents = [
|
|
444
|
-
Document(snippet_id=snippet.id, text=snippet.content)
|
|
532
|
+
Document(snippet_id=str(snippet.id), text=snippet.content)
|
|
445
533
|
for snippet in new_snippets
|
|
446
534
|
if snippet.id
|
|
447
535
|
]
|
|
@@ -458,36 +546,28 @@ class CommitIndexingApplicationService:
|
|
|
458
546
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
459
547
|
trackable_id=repository_id,
|
|
460
548
|
) as step:
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
549
|
+
if await self.enrichment_query_service.has_summaries_for_commit(commit_sha):
|
|
550
|
+
await step.skip("Summary enrichments already exist for commit")
|
|
551
|
+
return
|
|
464
552
|
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
for snippet in all_snippets
|
|
469
|
-
if not snippet.enrichments
|
|
470
|
-
or not next(
|
|
471
|
-
enrichment
|
|
472
|
-
for enrichment in snippet.enrichments
|
|
473
|
-
if enrichment.type == EnrichmentType.SUMMARIZATION
|
|
553
|
+
all_snippets = (
|
|
554
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
555
|
+
commit_sha
|
|
474
556
|
)
|
|
475
|
-
|
|
476
|
-
if not
|
|
477
|
-
await step.skip("
|
|
557
|
+
)
|
|
558
|
+
if not all_snippets:
|
|
559
|
+
await step.skip("No snippets to enrich")
|
|
478
560
|
return
|
|
479
561
|
|
|
480
562
|
# Enrich snippets
|
|
481
|
-
await step.set_total(len(
|
|
563
|
+
await step.set_total(len(all_snippets))
|
|
482
564
|
snippet_map = {
|
|
483
|
-
snippet.id: snippet
|
|
484
|
-
for snippet in snippets_without_summary
|
|
485
|
-
if snippet.id
|
|
565
|
+
str(snippet.id): snippet for snippet in all_snippets if snippet.id
|
|
486
566
|
}
|
|
487
567
|
|
|
488
568
|
enrichment_requests = [
|
|
489
569
|
GenericEnrichmentRequest(
|
|
490
|
-
id=snippet_id,
|
|
570
|
+
id=str(snippet_id),
|
|
491
571
|
text=snippet.content,
|
|
492
572
|
system_prompt=SUMMARIZATION_SYSTEM_PROMPT,
|
|
493
573
|
)
|
|
@@ -497,12 +577,27 @@ class CommitIndexingApplicationService:
|
|
|
497
577
|
processed = 0
|
|
498
578
|
async for result in self.enricher_service.enrich(enrichment_requests):
|
|
499
579
|
snippet = snippet_map[result.id]
|
|
500
|
-
|
|
501
|
-
|
|
580
|
+
db_summary = await self.enrichment_v2_repository.save(
|
|
581
|
+
SnippetEnrichmentSummary(content=result.text)
|
|
582
|
+
)
|
|
583
|
+
if not db_summary.id:
|
|
584
|
+
raise ValueError(
|
|
585
|
+
f"Failed to save snippet enrichment for commit {commit_sha}"
|
|
586
|
+
)
|
|
587
|
+
await self.enrichment_association_repository.save(
|
|
588
|
+
EnrichmentAssociation(
|
|
589
|
+
enrichment_id=db_summary.id,
|
|
590
|
+
entity_type=db_entities.EnrichmentV2.__tablename__,
|
|
591
|
+
entity_id=str(snippet.id),
|
|
592
|
+
)
|
|
593
|
+
)
|
|
594
|
+
await self.enrichment_association_repository.save(
|
|
595
|
+
EnrichmentAssociation(
|
|
596
|
+
enrichment_id=db_summary.id,
|
|
597
|
+
entity_type=db_entities.GitCommit.__tablename__,
|
|
598
|
+
entity_id=commit_sha,
|
|
599
|
+
)
|
|
502
600
|
)
|
|
503
|
-
|
|
504
|
-
await self.snippet_repository.save_snippets(commit_sha, [snippet])
|
|
505
|
-
|
|
506
601
|
processed += 1
|
|
507
602
|
await step.set_current(processed, "Enriching snippets for commit")
|
|
508
603
|
|
|
@@ -515,40 +610,61 @@ class CommitIndexingApplicationService:
|
|
|
515
610
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
516
611
|
trackable_id=repository_id,
|
|
517
612
|
) as step:
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
613
|
+
# Get all snippet enrichments for this commit
|
|
614
|
+
all_snippet_enrichments = (
|
|
615
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
616
|
+
commit_sha
|
|
617
|
+
)
|
|
522
618
|
)
|
|
523
|
-
if not
|
|
524
|
-
await step.skip("
|
|
619
|
+
if not all_snippet_enrichments:
|
|
620
|
+
await step.skip("No snippets to create summary embeddings")
|
|
525
621
|
return
|
|
526
622
|
|
|
527
|
-
|
|
528
|
-
|
|
623
|
+
# Get summary enrichments that point to these snippet enrichments
|
|
624
|
+
query = EnrichmentAssociationQueryBuilder.for_enrichment_associations(
|
|
625
|
+
entity_type=db_entities.EnrichmentV2.__tablename__,
|
|
626
|
+
entity_ids=[
|
|
627
|
+
str(snippet.id) for snippet in all_snippet_enrichments if snippet.id
|
|
628
|
+
],
|
|
629
|
+
)
|
|
630
|
+
summary_enrichment_associations = (
|
|
631
|
+
await self.enrichment_association_repository.find(query)
|
|
632
|
+
)
|
|
633
|
+
|
|
634
|
+
if not summary_enrichment_associations:
|
|
635
|
+
await step.skip("No summary enrichments found for snippets")
|
|
636
|
+
return
|
|
529
637
|
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
638
|
+
# Get the actual summary enrichments
|
|
639
|
+
summary_enrichments = await self.enrichment_v2_repository.find(
|
|
640
|
+
QueryBuilder().filter(
|
|
641
|
+
"id",
|
|
642
|
+
FilterOperator.IN,
|
|
643
|
+
[
|
|
644
|
+
association.enrichment_id
|
|
645
|
+
for association in summary_enrichment_associations
|
|
646
|
+
],
|
|
537
647
|
)
|
|
648
|
+
)
|
|
538
649
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
await step.skip("No snippets with summaries to create text embeddings")
|
|
650
|
+
# Check if embeddings already exist for these summaries
|
|
651
|
+
new_summaries = await self._new_snippets_for_type(
|
|
652
|
+
summary_enrichments, EmbeddingType.TEXT
|
|
653
|
+
)
|
|
654
|
+
if not new_summaries:
|
|
655
|
+
await step.skip("All snippets already have text embeddings")
|
|
546
656
|
return
|
|
547
657
|
|
|
658
|
+
await step.set_total(len(new_summaries))
|
|
659
|
+
processed = 0
|
|
660
|
+
|
|
661
|
+
# Create documents from the summary enrichments
|
|
548
662
|
documents_with_summaries = [
|
|
549
|
-
Document(snippet_id=
|
|
550
|
-
for
|
|
663
|
+
Document(snippet_id=str(summary.id), text=summary.content)
|
|
664
|
+
for summary in new_summaries
|
|
665
|
+
if summary.id
|
|
551
666
|
]
|
|
667
|
+
|
|
552
668
|
async for result in self.text_search_service.index_documents(
|
|
553
669
|
IndexRequest(documents=documents_with_summaries)
|
|
554
670
|
):
|
|
@@ -567,23 +683,14 @@ class CommitIndexingApplicationService:
|
|
|
567
683
|
await step.set_total(3)
|
|
568
684
|
|
|
569
685
|
# Check if architecture enrichment already exists for this commit
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
entity_ids=[commit_sha],
|
|
574
|
-
)
|
|
575
|
-
|
|
576
|
-
# Check if architecture enrichment already exists
|
|
577
|
-
has_architecture = any(
|
|
578
|
-
enrichment.type == "architecture" for enrichment in existing_enrichments
|
|
579
|
-
)
|
|
580
|
-
|
|
581
|
-
if has_architecture:
|
|
686
|
+
if await self.enrichment_query_service.has_architecture_for_commit(
|
|
687
|
+
commit_sha
|
|
688
|
+
):
|
|
582
689
|
await step.skip("Architecture enrichment already exists for commit")
|
|
583
690
|
return
|
|
584
691
|
|
|
585
692
|
# Get repository path
|
|
586
|
-
repo = await self.repo_repository.
|
|
693
|
+
repo = await self.repo_repository.get(repository_id)
|
|
587
694
|
if not repo.cloned_path:
|
|
588
695
|
raise ValueError(f"Repository {repository_id} has never been cloned")
|
|
589
696
|
|
|
@@ -610,13 +717,20 @@ class CommitIndexingApplicationService:
|
|
|
610
717
|
enriched_content = response.text
|
|
611
718
|
|
|
612
719
|
# Create and save architecture enrichment with enriched content
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
720
|
+
enrichment = await self.enrichment_v2_repository.save(
|
|
721
|
+
PhysicalArchitectureEnrichment(
|
|
722
|
+
content=enriched_content,
|
|
723
|
+
)
|
|
616
724
|
)
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
725
|
+
if not enrichment or not enrichment.id:
|
|
726
|
+
raise ValueError(
|
|
727
|
+
f"Failed to save architecture enrichment for commit {commit_sha}"
|
|
728
|
+
)
|
|
729
|
+
await self.enrichment_association_repository.save(
|
|
730
|
+
CommitEnrichmentAssociation(
|
|
731
|
+
enrichment_id=enrichment.id,
|
|
732
|
+
entity_id=commit_sha,
|
|
733
|
+
)
|
|
620
734
|
)
|
|
621
735
|
|
|
622
736
|
await step.set_current(3, "Architecture enrichment completed")
|
|
@@ -629,34 +743,26 @@ class CommitIndexingApplicationService:
|
|
|
629
743
|
trackable_id=repository_id,
|
|
630
744
|
) as step:
|
|
631
745
|
# Check if API docs already exist for this commit
|
|
632
|
-
|
|
633
|
-
await self.enrichment_v2_repository.enrichments_for_entity_type(
|
|
634
|
-
entity_type="git_commit",
|
|
635
|
-
entity_ids=[commit_sha],
|
|
636
|
-
)
|
|
637
|
-
)
|
|
638
|
-
|
|
639
|
-
has_api_docs = any(
|
|
640
|
-
e.type == ENRICHMENT_TYPE_USAGE
|
|
641
|
-
and e.subtype == ENRICHMENT_SUBTYPE_API_DOCS
|
|
642
|
-
for e in existing_enrichments
|
|
643
|
-
)
|
|
644
|
-
|
|
645
|
-
if has_api_docs:
|
|
746
|
+
if await self.enrichment_query_service.has_api_docs_for_commit(commit_sha):
|
|
646
747
|
await step.skip("API docs already exist for commit")
|
|
647
748
|
return
|
|
648
749
|
|
|
649
750
|
# Get repository for metadata
|
|
650
|
-
repo = await self.repo_repository.
|
|
751
|
+
repo = await self.repo_repository.get(repository_id)
|
|
651
752
|
if not repo:
|
|
652
753
|
raise ValueError(f"Repository {repository_id} not found")
|
|
653
754
|
str(repo.sanitized_remote_uri)
|
|
654
755
|
|
|
655
|
-
|
|
756
|
+
files = await self.git_file_repository.find(
|
|
757
|
+
GitFileQueryBuilder().for_commit_sha(commit_sha)
|
|
758
|
+
)
|
|
759
|
+
if not files:
|
|
760
|
+
await step.skip("No files to extract API docs from")
|
|
761
|
+
return
|
|
656
762
|
|
|
657
763
|
# Group files by language
|
|
658
764
|
lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
|
|
659
|
-
for file in
|
|
765
|
+
for file in files:
|
|
660
766
|
try:
|
|
661
767
|
lang = LanguageMapping.get_language_for_extension(file.extension)
|
|
662
768
|
except ValueError:
|
|
@@ -670,28 +776,41 @@ class CommitIndexingApplicationService:
|
|
|
670
776
|
for i, (lang, lang_files) in enumerate(lang_files_map.items()):
|
|
671
777
|
await step.set_current(i, f"Extracting API docs for {lang}")
|
|
672
778
|
enrichments = extractor.extract_api_docs(
|
|
673
|
-
lang_files,
|
|
674
|
-
lang,
|
|
675
|
-
commit_sha,
|
|
779
|
+
files=lang_files,
|
|
780
|
+
language=lang,
|
|
676
781
|
include_private=False,
|
|
677
782
|
)
|
|
678
783
|
all_enrichments.extend(enrichments)
|
|
679
784
|
|
|
680
785
|
# Save all enrichments
|
|
681
786
|
if all_enrichments:
|
|
682
|
-
await self.enrichment_v2_repository.
|
|
683
|
-
all_enrichments
|
|
787
|
+
saved_enrichments = await self.enrichment_v2_repository.save_bulk(
|
|
788
|
+
all_enrichments # type: ignore[arg-type]
|
|
789
|
+
)
|
|
790
|
+
await self.enrichment_association_repository.save_bulk(
|
|
791
|
+
[
|
|
792
|
+
CommitEnrichmentAssociation(
|
|
793
|
+
enrichment_id=enrichment.id,
|
|
794
|
+
entity_id=commit_sha,
|
|
795
|
+
)
|
|
796
|
+
for enrichment in saved_enrichments
|
|
797
|
+
if enrichment.id
|
|
798
|
+
]
|
|
684
799
|
)
|
|
685
800
|
|
|
686
801
|
async def _new_snippets_for_type(
|
|
687
|
-
self, all_snippets: list[
|
|
688
|
-
) -> list[
|
|
802
|
+
self, all_snippets: list[EnrichmentV2], embedding_type: EmbeddingType
|
|
803
|
+
) -> list[EnrichmentV2]:
|
|
689
804
|
"""Get new snippets for a given type."""
|
|
690
805
|
existing_embeddings = (
|
|
691
806
|
await self.embedding_repository.list_embeddings_by_snippet_ids_and_type(
|
|
692
|
-
[s.id for s in all_snippets], embedding_type
|
|
807
|
+
[str(s.id) for s in all_snippets], embedding_type
|
|
693
808
|
)
|
|
694
809
|
)
|
|
810
|
+
# TODO(Phil): Can't do this incrementally yet because like the API, we don't
|
|
811
|
+
# have a unified embedding repository
|
|
812
|
+
if existing_embeddings:
|
|
813
|
+
return []
|
|
695
814
|
existing_embeddings_by_snippet_id = {
|
|
696
815
|
embedding.snippet_id: embedding for embedding in existing_embeddings
|
|
697
816
|
}
|