kodit 0.5.4 (py3-none-any.whl) → 0.5.6 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/app.py +2 -0
- kodit/application/factories/server_factory.py +58 -32
- kodit/application/services/code_search_application_service.py +89 -12
- kodit/application/services/commit_indexing_application_service.py +527 -195
- kodit/application/services/enrichment_query_service.py +311 -43
- kodit/application/services/indexing_worker_service.py +1 -1
- kodit/application/services/queue_service.py +15 -10
- kodit/application/services/sync_scheduler.py +2 -1
- kodit/domain/enrichments/architecture/architecture.py +1 -1
- kodit/domain/enrichments/architecture/database_schema/__init__.py +1 -0
- kodit/domain/enrichments/architecture/database_schema/database_schema.py +17 -0
- kodit/domain/enrichments/architecture/physical/physical.py +1 -1
- kodit/domain/enrichments/development/development.py +1 -1
- kodit/domain/enrichments/development/snippet/snippet.py +12 -5
- kodit/domain/enrichments/enrichment.py +31 -4
- kodit/domain/enrichments/history/__init__.py +1 -0
- kodit/domain/enrichments/history/commit_description/__init__.py +1 -0
- kodit/domain/enrichments/history/commit_description/commit_description.py +17 -0
- kodit/domain/enrichments/history/history.py +18 -0
- kodit/domain/enrichments/usage/api_docs.py +1 -1
- kodit/domain/enrichments/usage/usage.py +1 -1
- kodit/domain/entities/git.py +30 -25
- kodit/domain/factories/git_repo_factory.py +20 -5
- kodit/domain/protocols.py +60 -125
- kodit/domain/services/embedding_service.py +14 -16
- kodit/domain/services/git_repository_service.py +60 -38
- kodit/domain/services/git_service.py +18 -11
- kodit/domain/tracking/resolution_service.py +6 -16
- kodit/domain/value_objects.py +6 -9
- kodit/infrastructure/api/v1/dependencies.py +12 -3
- kodit/infrastructure/api/v1/query_params.py +27 -0
- kodit/infrastructure/api/v1/routers/commits.py +91 -85
- kodit/infrastructure/api/v1/routers/repositories.py +53 -37
- kodit/infrastructure/api/v1/routers/search.py +1 -1
- kodit/infrastructure/api/v1/schemas/enrichment.py +14 -0
- kodit/infrastructure/api/v1/schemas/repository.py +1 -1
- kodit/infrastructure/cloning/git/git_python_adaptor.py +41 -0
- kodit/infrastructure/database_schema/__init__.py +1 -0
- kodit/infrastructure/database_schema/database_schema_detector.py +268 -0
- kodit/infrastructure/slicing/api_doc_extractor.py +0 -2
- kodit/infrastructure/sqlalchemy/embedding_repository.py +44 -34
- kodit/infrastructure/sqlalchemy/enrichment_association_repository.py +73 -0
- kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +145 -97
- kodit/infrastructure/sqlalchemy/entities.py +12 -116
- kodit/infrastructure/sqlalchemy/git_branch_repository.py +52 -244
- kodit/infrastructure/sqlalchemy/git_commit_repository.py +35 -324
- kodit/infrastructure/sqlalchemy/git_file_repository.py +70 -0
- kodit/infrastructure/sqlalchemy/git_repository.py +60 -230
- kodit/infrastructure/sqlalchemy/git_tag_repository.py +53 -240
- kodit/infrastructure/sqlalchemy/query.py +331 -0
- kodit/infrastructure/sqlalchemy/repository.py +203 -0
- kodit/infrastructure/sqlalchemy/task_repository.py +79 -58
- kodit/infrastructure/sqlalchemy/task_status_repository.py +45 -52
- kodit/migrations/versions/4b1a3b2c8fa5_refactor_git_tracking.py +190 -0
- {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/METADATA +1 -1
- {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/RECORD +60 -50
- kodit/infrastructure/mappers/enrichment_mapper.py +0 -83
- kodit/infrastructure/mappers/git_mapper.py +0 -193
- kodit/infrastructure/mappers/snippet_mapper.py +0 -104
- kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +0 -479
- {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/WHEEL +0 -0
- {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/entry_points.txt +0 -0
- {kodit-0.5.4.dist-info → kodit-0.5.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -2,30 +2,51 @@
|
|
|
2
2
|
|
|
3
3
|
from collections import defaultdict
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
5
6
|
|
|
6
7
|
import structlog
|
|
7
8
|
from pydantic import AnyUrl
|
|
8
9
|
|
|
9
10
|
from kodit.application.services.queue_service import QueueService
|
|
10
11
|
from kodit.application.services.reporting import ProgressTracker
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from kodit.application.services.enrichment_query_service import (
|
|
15
|
+
EnrichmentQueryService,
|
|
16
|
+
)
|
|
17
|
+
from kodit.domain.enrichments.architecture.database_schema.database_schema import (
|
|
18
|
+
DatabaseSchemaEnrichment,
|
|
19
|
+
)
|
|
11
20
|
from kodit.domain.enrichments.architecture.physical.physical import (
|
|
12
21
|
PhysicalArchitectureEnrichment,
|
|
13
22
|
)
|
|
23
|
+
from kodit.domain.enrichments.development.snippet.snippet import (
|
|
24
|
+
SnippetEnrichment,
|
|
25
|
+
SnippetEnrichmentSummary,
|
|
26
|
+
)
|
|
14
27
|
from kodit.domain.enrichments.enricher import Enricher
|
|
28
|
+
from kodit.domain.enrichments.enrichment import (
|
|
29
|
+
CommitEnrichmentAssociation,
|
|
30
|
+
EnrichmentAssociation,
|
|
31
|
+
EnrichmentV2,
|
|
32
|
+
)
|
|
33
|
+
from kodit.domain.enrichments.history.commit_description.commit_description import (
|
|
34
|
+
CommitDescriptionEnrichment,
|
|
35
|
+
)
|
|
15
36
|
from kodit.domain.enrichments.request import (
|
|
16
37
|
EnrichmentRequest as GenericEnrichmentRequest,
|
|
17
38
|
)
|
|
18
|
-
from kodit.domain.enrichments.usage.api_docs import ENRICHMENT_SUBTYPE_API_DOCS
|
|
19
|
-
from kodit.domain.enrichments.usage.usage import ENRICHMENT_TYPE_USAGE
|
|
20
39
|
from kodit.domain.entities import Task
|
|
21
|
-
from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2
|
|
40
|
+
from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2, TrackingType
|
|
22
41
|
from kodit.domain.factories.git_repo_factory import GitRepoFactory
|
|
23
42
|
from kodit.domain.protocols import (
|
|
43
|
+
EnrichmentAssociationRepository,
|
|
44
|
+
EnrichmentV2Repository,
|
|
24
45
|
GitBranchRepository,
|
|
25
46
|
GitCommitRepository,
|
|
47
|
+
GitFileRepository,
|
|
26
48
|
GitRepoRepository,
|
|
27
49
|
GitTagRepository,
|
|
28
|
-
SnippetRepositoryV2,
|
|
29
50
|
)
|
|
30
51
|
from kodit.domain.services.bm25_service import BM25DomainService
|
|
31
52
|
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
@@ -41,8 +62,6 @@ from kodit.domain.services.physical_architecture_service import (
|
|
|
41
62
|
from kodit.domain.value_objects import (
|
|
42
63
|
DeleteRequest,
|
|
43
64
|
Document,
|
|
44
|
-
Enrichment,
|
|
45
|
-
EnrichmentType,
|
|
46
65
|
IndexRequest,
|
|
47
66
|
LanguageMapping,
|
|
48
67
|
PrescribedOperations,
|
|
@@ -50,36 +69,109 @@ from kodit.domain.value_objects import (
|
|
|
50
69
|
TaskOperation,
|
|
51
70
|
TrackableType,
|
|
52
71
|
)
|
|
72
|
+
from kodit.infrastructure.database_schema.database_schema_detector import (
|
|
73
|
+
DatabaseSchemaDetector,
|
|
74
|
+
)
|
|
53
75
|
from kodit.infrastructure.slicing.api_doc_extractor import APIDocExtractor
|
|
54
76
|
from kodit.infrastructure.slicing.slicer import Slicer
|
|
77
|
+
from kodit.infrastructure.sqlalchemy import entities as db_entities
|
|
55
78
|
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
56
79
|
SqlAlchemyEmbeddingRepository,
|
|
57
80
|
)
|
|
58
|
-
from kodit.infrastructure.sqlalchemy.enrichment_v2_repository import (
|
|
59
|
-
EnrichmentV2Repository,
|
|
60
|
-
)
|
|
61
81
|
from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
|
|
82
|
+
from kodit.infrastructure.sqlalchemy.query import (
|
|
83
|
+
EnrichmentAssociationQueryBuilder,
|
|
84
|
+
FilterOperator,
|
|
85
|
+
GitFileQueryBuilder,
|
|
86
|
+
QueryBuilder,
|
|
87
|
+
)
|
|
62
88
|
|
|
63
89
|
SUMMARIZATION_SYSTEM_PROMPT = """
|
|
64
90
|
You are a professional software developer. You will be given a snippet of code.
|
|
65
91
|
Please provide a concise explanation of the code.
|
|
66
92
|
"""
|
|
67
93
|
|
|
94
|
+
COMMIT_DESCRIPTION_SYSTEM_PROMPT = """
|
|
95
|
+
You are a professional software developer. You will be given a git commit diff.
|
|
96
|
+
Please provide a concise description of what changes were made and why.
|
|
97
|
+
"""
|
|
98
|
+
|
|
99
|
+
DATABASE_SCHEMA_SYSTEM_PROMPT = """
|
|
100
|
+
You are an expert database architect and documentation specialist.
|
|
101
|
+
Your task is to create clear, visual documentation of database schemas.
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
DATABASE_SCHEMA_TASK_PROMPT = """
|
|
105
|
+
You will be provided with a database schema discovery report.
|
|
106
|
+
Please create comprehensive database schema documentation.
|
|
107
|
+
|
|
108
|
+
<schema_report>
|
|
109
|
+
{schema_report}
|
|
110
|
+
</schema_report>
|
|
111
|
+
|
|
112
|
+
**Return the following:**
|
|
113
|
+
|
|
114
|
+
## Entity List
|
|
115
|
+
|
|
116
|
+
For each table/entity, write one line:
|
|
117
|
+
- **[Table Name]**: [brief description of what it stores]
|
|
118
|
+
|
|
119
|
+
## Mermaid ERD
|
|
120
|
+
|
|
121
|
+
Create a Mermaid Entity Relationship Diagram showing:
|
|
122
|
+
- All entities (tables)
|
|
123
|
+
- Key relationships between entities (if apparent from names or common patterns)
|
|
124
|
+
- Use standard ERD notation
|
|
125
|
+
|
|
126
|
+
Example format:
|
|
127
|
+
```mermaid
|
|
128
|
+
erDiagram
|
|
129
|
+
User ||--o{{ Order : places
|
|
130
|
+
User {{
|
|
131
|
+
int id PK
|
|
132
|
+
string email
|
|
133
|
+
string name
|
|
134
|
+
}}
|
|
135
|
+
Order {{
|
|
136
|
+
int id PK
|
|
137
|
+
int user_id FK
|
|
138
|
+
datetime created_at
|
|
139
|
+
}}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
If specific field details aren't available, show just the entity boxes and
|
|
143
|
+
relationships.
|
|
144
|
+
|
|
145
|
+
## Key Observations
|
|
146
|
+
|
|
147
|
+
Answer these questions in 1-2 sentences each:
|
|
148
|
+
1. What is the primary data model pattern (e.g., user-centric,
|
|
149
|
+
event-sourced, multi-tenant)?
|
|
150
|
+
2. What migration strategy is being used?
|
|
151
|
+
3. Are there any notable database design patterns or concerns?
|
|
152
|
+
|
|
153
|
+
## Rules:
|
|
154
|
+
- Be concise and focus on the high-level structure
|
|
155
|
+
- Infer reasonable relationships from table names when explicit information
|
|
156
|
+
isn't available
|
|
157
|
+
- If no database schema is found, state that clearly
|
|
158
|
+
- Keep entity descriptions to 10 words or less
|
|
159
|
+
"""
|
|
160
|
+
|
|
68
161
|
|
|
69
162
|
class CommitIndexingApplicationService:
|
|
70
163
|
"""Application service for commit indexing operations."""
|
|
71
164
|
|
|
72
165
|
def __init__( # noqa: PLR0913
|
|
73
166
|
self,
|
|
74
|
-
snippet_v2_repository: SnippetRepositoryV2,
|
|
75
167
|
repo_repository: GitRepoRepository,
|
|
76
168
|
git_commit_repository: GitCommitRepository,
|
|
169
|
+
git_file_repository: GitFileRepository,
|
|
77
170
|
git_branch_repository: GitBranchRepository,
|
|
78
171
|
git_tag_repository: GitTagRepository,
|
|
79
172
|
operation: ProgressTracker,
|
|
80
173
|
scanner: GitRepositoryScanner,
|
|
81
174
|
cloner: RepositoryCloner,
|
|
82
|
-
snippet_repository: SnippetRepositoryV2,
|
|
83
175
|
slicer: Slicer,
|
|
84
176
|
queue: QueueService,
|
|
85
177
|
bm25_service: BM25DomainService,
|
|
@@ -87,28 +179,21 @@ class CommitIndexingApplicationService:
|
|
|
87
179
|
text_search_service: EmbeddingDomainService,
|
|
88
180
|
embedding_repository: SqlAlchemyEmbeddingRepository,
|
|
89
181
|
architecture_service: PhysicalArchitectureService,
|
|
90
|
-
|
|
182
|
+
database_schema_detector: DatabaseSchemaDetector,
|
|
91
183
|
enricher_service: Enricher,
|
|
184
|
+
enrichment_v2_repository: EnrichmentV2Repository,
|
|
185
|
+
enrichment_association_repository: EnrichmentAssociationRepository,
|
|
186
|
+
enrichment_query_service: "EnrichmentQueryService",
|
|
92
187
|
) -> None:
|
|
93
|
-
"""Initialize the commit indexing application service.
|
|
94
|
-
|
|
95
|
-
Args:
|
|
96
|
-
commit_index_repository: Repository for commit index data.
|
|
97
|
-
snippet_v2_repository: Repository for snippet data.
|
|
98
|
-
repo_repository: Repository for Git repository data.
|
|
99
|
-
domain_indexer: Domain service for indexing operations.
|
|
100
|
-
operation: Progress tracker for reporting operations.
|
|
101
|
-
|
|
102
|
-
"""
|
|
103
|
-
self.snippet_repository = snippet_v2_repository
|
|
188
|
+
"""Initialize the commit indexing application service."""
|
|
104
189
|
self.repo_repository = repo_repository
|
|
105
190
|
self.git_commit_repository = git_commit_repository
|
|
191
|
+
self.git_file_repository = git_file_repository
|
|
106
192
|
self.git_branch_repository = git_branch_repository
|
|
107
193
|
self.git_tag_repository = git_tag_repository
|
|
108
194
|
self.operation = operation
|
|
109
195
|
self.scanner = scanner
|
|
110
196
|
self.cloner = cloner
|
|
111
|
-
self.snippet_repository = snippet_repository
|
|
112
197
|
self.slicer = slicer
|
|
113
198
|
self.queue = queue
|
|
114
199
|
self.bm25_service = bm25_service
|
|
@@ -116,8 +201,11 @@ class CommitIndexingApplicationService:
|
|
|
116
201
|
self.text_search_service = text_search_service
|
|
117
202
|
self.embedding_repository = embedding_repository
|
|
118
203
|
self.architecture_service = architecture_service
|
|
204
|
+
self.database_schema_detector = database_schema_detector
|
|
119
205
|
self.enrichment_v2_repository = enrichment_v2_repository
|
|
206
|
+
self.enrichment_association_repository = enrichment_association_repository
|
|
120
207
|
self.enricher_service = enricher_service
|
|
208
|
+
self.enrichment_query_service = enrichment_query_service
|
|
121
209
|
self._log = structlog.get_logger(__name__)
|
|
122
210
|
|
|
123
211
|
async def create_git_repository(self, remote_uri: AnyUrl) -> GitRepo:
|
|
@@ -137,7 +225,7 @@ class CommitIndexingApplicationService:
|
|
|
137
225
|
|
|
138
226
|
async def delete_git_repository(self, repo_id: int) -> bool:
|
|
139
227
|
"""Delete a Git repository by ID."""
|
|
140
|
-
repo = await self.repo_repository.
|
|
228
|
+
repo = await self.repo_repository.get(repo_id)
|
|
141
229
|
if not repo:
|
|
142
230
|
return False
|
|
143
231
|
|
|
@@ -181,6 +269,10 @@ class CommitIndexingApplicationService:
|
|
|
181
269
|
await self.process_architecture_discovery(repository_id, commit_sha)
|
|
182
270
|
elif task.type == TaskOperation.CREATE_PUBLIC_API_DOCS_FOR_COMMIT:
|
|
183
271
|
await self.process_api_docs(repository_id, commit_sha)
|
|
272
|
+
elif task.type == TaskOperation.CREATE_COMMIT_DESCRIPTION_FOR_COMMIT:
|
|
273
|
+
await self.process_commit_description(repository_id, commit_sha)
|
|
274
|
+
elif task.type == TaskOperation.CREATE_DATABASE_SCHEMA_FOR_COMMIT:
|
|
275
|
+
await self.process_database_schema(repository_id, commit_sha)
|
|
184
276
|
else:
|
|
185
277
|
raise ValueError(f"Unknown task type: {task.type}")
|
|
186
278
|
else:
|
|
@@ -193,7 +285,7 @@ class CommitIndexingApplicationService:
|
|
|
193
285
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
194
286
|
trackable_id=repository_id,
|
|
195
287
|
):
|
|
196
|
-
repo = await self.repo_repository.
|
|
288
|
+
repo = await self.repo_repository.get(repository_id)
|
|
197
289
|
repo.cloned_path = await self.cloner.clone_repository(repo.remote_uri)
|
|
198
290
|
await self.repo_repository.save(repo)
|
|
199
291
|
|
|
@@ -204,45 +296,57 @@ class CommitIndexingApplicationService:
|
|
|
204
296
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
205
297
|
trackable_id=repository_id,
|
|
206
298
|
) as step:
|
|
207
|
-
await step.set_total(
|
|
208
|
-
repo = await self.repo_repository.
|
|
299
|
+
await step.set_total(7)
|
|
300
|
+
repo = await self.repo_repository.get(repository_id)
|
|
209
301
|
if not repo.cloned_path:
|
|
210
302
|
raise ValueError(f"Repository {repository_id} has never been cloned")
|
|
211
303
|
|
|
212
304
|
# Scan the repository to get all metadata
|
|
213
305
|
await step.set_current(0, "Scanning repository")
|
|
214
|
-
scan_result = await self.scanner.scan_repository(
|
|
306
|
+
scan_result = await self.scanner.scan_repository(
|
|
307
|
+
repo.cloned_path, repository_id
|
|
308
|
+
)
|
|
215
309
|
|
|
216
310
|
# Update repo with scan result (this sets num_commits, num_branches, etc.)
|
|
217
311
|
await step.set_current(1, "Updating repository with scan result")
|
|
218
312
|
repo.update_with_scan_result(scan_result)
|
|
219
313
|
await self.repo_repository.save(repo)
|
|
220
314
|
|
|
221
|
-
# Save commits, branches, and tags to their dedicated repositories
|
|
222
315
|
await step.set_current(2, "Saving commits")
|
|
223
|
-
|
|
224
|
-
await self.git_commit_repository.save_bulk(
|
|
225
|
-
scan_result.all_commits, repository_id
|
|
226
|
-
)
|
|
316
|
+
await self.git_commit_repository.save_bulk(scan_result.all_commits)
|
|
227
317
|
|
|
228
|
-
await step.set_current(3, "Saving
|
|
318
|
+
await step.set_current(3, "Saving files")
|
|
319
|
+
await self.git_file_repository.save_bulk(scan_result.all_files)
|
|
320
|
+
|
|
321
|
+
await step.set_current(4, "Saving branches")
|
|
229
322
|
if scan_result.branches:
|
|
230
323
|
await self.git_branch_repository.save_bulk(
|
|
231
|
-
scan_result.branches,
|
|
324
|
+
scan_result.branches,
|
|
232
325
|
)
|
|
233
326
|
|
|
234
|
-
await step.set_current(
|
|
327
|
+
await step.set_current(5, "Saving tags")
|
|
235
328
|
if scan_result.all_tags:
|
|
236
329
|
await self.git_tag_repository.save_bulk(
|
|
237
|
-
scan_result.all_tags,
|
|
330
|
+
scan_result.all_tags,
|
|
238
331
|
)
|
|
239
332
|
|
|
240
|
-
await step.set_current(
|
|
241
|
-
if not repo.
|
|
333
|
+
await step.set_current(6, "Enqueuing commit indexing tasks")
|
|
334
|
+
if not repo.tracking_config.name:
|
|
242
335
|
raise ValueError(f"Repository {repository_id} has no tracking branch")
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
336
|
+
if repo.tracking_config.type == TrackingType.BRANCH.value:
|
|
337
|
+
branch = await self.git_branch_repository.get_by_name(
|
|
338
|
+
repo.tracking_config.name, repository_id
|
|
339
|
+
)
|
|
340
|
+
commit_sha = branch.head_commit_sha
|
|
341
|
+
elif repo.tracking_config.type == TrackingType.TAG.value:
|
|
342
|
+
tag = await self.git_tag_repository.get_by_name(
|
|
343
|
+
repo.tracking_config.name, repository_id
|
|
344
|
+
)
|
|
345
|
+
commit_sha = tag.target_commit_sha
|
|
346
|
+
elif repo.tracking_config.type == TrackingType.COMMIT_SHA.value:
|
|
347
|
+
commit_sha = repo.tracking_config.name
|
|
348
|
+
else:
|
|
349
|
+
raise ValueError(f"Unknown tracking type: {repo.tracking_config.type}")
|
|
246
350
|
|
|
247
351
|
await self.queue.enqueue_tasks(
|
|
248
352
|
tasks=PrescribedOperations.INDEX_COMMIT,
|
|
@@ -250,6 +354,74 @@ class CommitIndexingApplicationService:
|
|
|
250
354
|
payload={"commit_sha": commit_sha, "repository_id": repository_id},
|
|
251
355
|
)
|
|
252
356
|
|
|
357
|
+
async def _delete_snippet_enrichments_for_commits(
|
|
358
|
+
self, commit_shas: list[str]
|
|
359
|
+
) -> None:
|
|
360
|
+
"""Delete snippet enrichments and their indices for commits."""
|
|
361
|
+
# Get all snippet enrichment IDs for these commits
|
|
362
|
+
all_snippet_enrichment_ids = []
|
|
363
|
+
for commit_sha in commit_shas:
|
|
364
|
+
snippet_enrichments = (
|
|
365
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
366
|
+
commit_sha
|
|
367
|
+
)
|
|
368
|
+
)
|
|
369
|
+
enrichment_ids = [
|
|
370
|
+
enrichment.id for enrichment in snippet_enrichments if enrichment.id
|
|
371
|
+
]
|
|
372
|
+
all_snippet_enrichment_ids.extend(enrichment_ids)
|
|
373
|
+
|
|
374
|
+
if not all_snippet_enrichment_ids:
|
|
375
|
+
return
|
|
376
|
+
|
|
377
|
+
# Delete from BM25 and embedding indices
|
|
378
|
+
snippet_id_strings = [str(sid) for sid in all_snippet_enrichment_ids]
|
|
379
|
+
delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
|
|
380
|
+
await self.bm25_service.delete_documents(delete_request)
|
|
381
|
+
|
|
382
|
+
for snippet_id in all_snippet_enrichment_ids:
|
|
383
|
+
await self.embedding_repository.delete_embeddings_by_snippet_id(
|
|
384
|
+
str(snippet_id)
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
# Delete enrichment associations for snippets
|
|
388
|
+
await self.enrichment_association_repository.delete_by_query(
|
|
389
|
+
QueryBuilder()
|
|
390
|
+
.filter("entity_type", FilterOperator.EQ, "snippet_v2")
|
|
391
|
+
.filter("entity_id", FilterOperator.IN, snippet_id_strings)
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
# Delete the enrichments themselves
|
|
395
|
+
await self.enrichment_v2_repository.delete_by_query(
|
|
396
|
+
QueryBuilder().filter("id", FilterOperator.IN, all_snippet_enrichment_ids)
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
async def _delete_commit_enrichments(self, commit_shas: list[str]) -> None:
|
|
400
|
+
"""Delete commit-level enrichments for commits."""
|
|
401
|
+
existing_enrichment_associations = (
|
|
402
|
+
await self.enrichment_association_repository.find(
|
|
403
|
+
QueryBuilder()
|
|
404
|
+
.filter(
|
|
405
|
+
"entity_type",
|
|
406
|
+
FilterOperator.EQ,
|
|
407
|
+
db_entities.GitCommit.__tablename__,
|
|
408
|
+
)
|
|
409
|
+
.filter("entity_id", FilterOperator.IN, commit_shas)
|
|
410
|
+
)
|
|
411
|
+
)
|
|
412
|
+
enrichment_ids = [a.enrichment_id for a in existing_enrichment_associations]
|
|
413
|
+
if not enrichment_ids:
|
|
414
|
+
return
|
|
415
|
+
|
|
416
|
+
# Delete associations first
|
|
417
|
+
await self.enrichment_association_repository.delete_by_query(
|
|
418
|
+
QueryBuilder().filter("enrichment_id", FilterOperator.IN, enrichment_ids)
|
|
419
|
+
)
|
|
420
|
+
# Then delete enrichments
|
|
421
|
+
await self.enrichment_v2_repository.delete_by_query(
|
|
422
|
+
QueryBuilder().filter("id", FilterOperator.IN, enrichment_ids)
|
|
423
|
+
)
|
|
424
|
+
|
|
253
425
|
async def process_delete_repo(self, repository_id: int) -> None:
|
|
254
426
|
"""Delete a repository."""
|
|
255
427
|
async with self.operation.create_child(
|
|
@@ -257,61 +429,34 @@ class CommitIndexingApplicationService:
|
|
|
257
429
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
258
430
|
trackable_id=repository_id,
|
|
259
431
|
):
|
|
260
|
-
repo = await self.repo_repository.
|
|
432
|
+
repo = await self.repo_repository.get(repository_id)
|
|
261
433
|
if not repo:
|
|
262
434
|
raise ValueError(f"Repository {repository_id} not found")
|
|
263
435
|
|
|
264
|
-
# Get all commit SHAs for this repository
|
|
265
|
-
commits = await self.git_commit_repository.
|
|
436
|
+
# Get all commit SHAs for this repository
|
|
437
|
+
commits = await self.git_commit_repository.find(
|
|
438
|
+
QueryBuilder().filter("repo_id", FilterOperator.EQ, repository_id)
|
|
439
|
+
)
|
|
266
440
|
commit_shas = [commit.commit_sha for commit in commits]
|
|
267
441
|
|
|
268
|
-
#
|
|
269
|
-
# (before deleting the associations)
|
|
270
|
-
all_snippet_ids = []
|
|
442
|
+
# Delete all enrichments and their indices
|
|
271
443
|
if commit_shas:
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
commit_sha
|
|
275
|
-
)
|
|
276
|
-
all_snippet_ids.extend(
|
|
277
|
-
[snippet.id for snippet in snippets if snippet.id]
|
|
278
|
-
)
|
|
444
|
+
await self._delete_snippet_enrichments_for_commits(commit_shas)
|
|
445
|
+
await self._delete_commit_enrichments(commit_shas)
|
|
279
446
|
|
|
280
|
-
#
|
|
281
|
-
if all_snippet_ids:
|
|
282
|
-
# Convert to strings as DeleteRequest expects list[str]
|
|
283
|
-
snippet_id_strings = [str(snippet_id) for snippet_id in all_snippet_ids]
|
|
284
|
-
delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
|
|
285
|
-
await self.bm25_service.delete_documents(delete_request)
|
|
286
|
-
|
|
287
|
-
# Delete embeddings for each snippet
|
|
288
|
-
for snippet_id in all_snippet_ids:
|
|
289
|
-
await self.embedding_repository.delete_embeddings_by_snippet_id(
|
|
290
|
-
snippet_id
|
|
291
|
-
)
|
|
292
|
-
|
|
293
|
-
# Step 3: Delete enrichments for all commits
|
|
294
|
-
if commit_shas:
|
|
295
|
-
await self.enrichment_v2_repository.bulk_delete_enrichments(
|
|
296
|
-
entity_type="git_commit",
|
|
297
|
-
entity_ids=commit_shas,
|
|
298
|
-
)
|
|
299
|
-
|
|
300
|
-
# Step 4: Delete snippet associations for all commits
|
|
301
|
-
for commit_sha in commit_shas:
|
|
302
|
-
await self.snippet_repository.delete_snippets_for_commit(commit_sha)
|
|
303
|
-
|
|
304
|
-
# Step 5: Delete branches (they reference commits via head_commit_sha)
|
|
447
|
+
# Delete branches, tags, files, commits, and repository
|
|
305
448
|
await self.git_branch_repository.delete_by_repo_id(repository_id)
|
|
306
|
-
|
|
307
|
-
# Step 6: Delete tags (they reference commits via target_commit_sha)
|
|
308
449
|
await self.git_tag_repository.delete_by_repo_id(repository_id)
|
|
309
450
|
|
|
310
|
-
|
|
311
|
-
|
|
451
|
+
for commit_sha in commit_shas:
|
|
452
|
+
await self.git_file_repository.delete_by_commit_sha(commit_sha)
|
|
453
|
+
|
|
454
|
+
await self.git_commit_repository.delete_by_query(
|
|
455
|
+
QueryBuilder().filter("repo_id", FilterOperator.EQ, repository_id)
|
|
456
|
+
)
|
|
312
457
|
|
|
313
|
-
|
|
314
|
-
|
|
458
|
+
if repo.id:
|
|
459
|
+
await self.repo_repository.delete(repo)
|
|
315
460
|
|
|
316
461
|
async def process_snippets_for_commit(
|
|
317
462
|
self, repository_id: int, commit_sha: str
|
|
@@ -322,16 +467,16 @@ class CommitIndexingApplicationService:
|
|
|
322
467
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
323
468
|
trackable_id=repository_id,
|
|
324
469
|
) as step:
|
|
325
|
-
#
|
|
326
|
-
if await self.
|
|
327
|
-
await step.skip("
|
|
470
|
+
# Find existing snippet enrichments for this commit
|
|
471
|
+
if await self.enrichment_query_service.has_snippets_for_commit(commit_sha):
|
|
472
|
+
await step.skip("Snippets already extracted for commit")
|
|
328
473
|
return
|
|
329
474
|
|
|
330
|
-
commit = await self.git_commit_repository.
|
|
475
|
+
commit = await self.git_commit_repository.get(commit_sha)
|
|
331
476
|
|
|
332
477
|
# Load files on demand for snippet extraction (performance optimization)
|
|
333
478
|
# Instead of using commit.files (which may be empty), load files directly
|
|
334
|
-
repo = await self.repo_repository.
|
|
479
|
+
repo = await self.repo_repository.get(repository_id)
|
|
335
480
|
if not repo.cloned_path:
|
|
336
481
|
raise ValueError(f"Repository {repository_id} has never been cloned")
|
|
337
482
|
|
|
@@ -350,6 +495,7 @@ class CommitIndexingApplicationService:
|
|
|
350
495
|
absolute_path = str(repo.cloned_path / file_data["path"])
|
|
351
496
|
|
|
352
497
|
git_file = GitFile(
|
|
498
|
+
commit_sha=commit.commit_sha,
|
|
353
499
|
created_at=file_data.get("created_at", commit.date),
|
|
354
500
|
blob_sha=file_data["blob_sha"],
|
|
355
501
|
path=absolute_path, # Use absolute path for file reading
|
|
@@ -395,8 +541,27 @@ class CommitIndexingApplicationService:
|
|
|
395
541
|
f"Extracted {len(all_snippets)} snippets, "
|
|
396
542
|
f"deduplicated to {len(deduplicated_snippets)} for {commit_short}"
|
|
397
543
|
)
|
|
398
|
-
|
|
399
|
-
|
|
544
|
+
|
|
545
|
+
saved_enrichments = await self.enrichment_v2_repository.save_bulk(
|
|
546
|
+
[
|
|
547
|
+
SnippetEnrichment(content=snippet.content)
|
|
548
|
+
for snippet in deduplicated_snippets
|
|
549
|
+
]
|
|
550
|
+
)
|
|
551
|
+
saved_associations = await self.enrichment_association_repository.save_bulk(
|
|
552
|
+
[
|
|
553
|
+
EnrichmentAssociation(
|
|
554
|
+
enrichment_id=enrichment.id,
|
|
555
|
+
entity_type=db_entities.GitCommit.__tablename__,
|
|
556
|
+
entity_id=commit_sha,
|
|
557
|
+
)
|
|
558
|
+
for enrichment in saved_enrichments
|
|
559
|
+
if enrichment.id
|
|
560
|
+
]
|
|
561
|
+
)
|
|
562
|
+
self._log.info(
|
|
563
|
+
f"Saved {len(saved_enrichments)} snippet enrichments and "
|
|
564
|
+
f"{len(saved_associations)} associations for commit {commit_sha}"
|
|
400
565
|
)
|
|
401
566
|
|
|
402
567
|
async def process_bm25_index(self, repository_id: int, commit_sha: str) -> None:
|
|
@@ -406,13 +571,16 @@ class CommitIndexingApplicationService:
|
|
|
406
571
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
407
572
|
trackable_id=repository_id,
|
|
408
573
|
):
|
|
409
|
-
|
|
410
|
-
|
|
574
|
+
existing_enrichments = (
|
|
575
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
576
|
+
commit_sha
|
|
577
|
+
)
|
|
578
|
+
)
|
|
411
579
|
await self.bm25_service.index_documents(
|
|
412
580
|
IndexRequest(
|
|
413
581
|
documents=[
|
|
414
|
-
Document(snippet_id=snippet.id, text=snippet.content)
|
|
415
|
-
for snippet in
|
|
582
|
+
Document(snippet_id=str(snippet.id), text=snippet.content)
|
|
583
|
+
for snippet in existing_enrichments
|
|
416
584
|
if snippet.id
|
|
417
585
|
]
|
|
418
586
|
)
|
|
@@ -427,12 +595,14 @@ class CommitIndexingApplicationService:
|
|
|
427
595
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
428
596
|
trackable_id=repository_id,
|
|
429
597
|
) as step:
|
|
430
|
-
|
|
431
|
-
|
|
598
|
+
existing_enrichments = (
|
|
599
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
600
|
+
commit_sha
|
|
601
|
+
)
|
|
432
602
|
)
|
|
433
603
|
|
|
434
604
|
new_snippets = await self._new_snippets_for_type(
|
|
435
|
-
|
|
605
|
+
existing_enrichments, EmbeddingType.CODE
|
|
436
606
|
)
|
|
437
607
|
if not new_snippets:
|
|
438
608
|
await step.skip("All snippets already have code embeddings")
|
|
@@ -441,7 +611,7 @@ class CommitIndexingApplicationService:
|
|
|
441
611
|
await step.set_total(len(new_snippets))
|
|
442
612
|
processed = 0
|
|
443
613
|
documents = [
|
|
444
|
-
Document(snippet_id=snippet.id, text=snippet.content)
|
|
614
|
+
Document(snippet_id=str(snippet.id), text=snippet.content)
|
|
445
615
|
for snippet in new_snippets
|
|
446
616
|
if snippet.id
|
|
447
617
|
]
|
|
@@ -458,36 +628,28 @@ class CommitIndexingApplicationService:
|
|
|
458
628
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
459
629
|
trackable_id=repository_id,
|
|
460
630
|
) as step:
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
631
|
+
if await self.enrichment_query_service.has_summaries_for_commit(commit_sha):
|
|
632
|
+
await step.skip("Summary enrichments already exist for commit")
|
|
633
|
+
return
|
|
464
634
|
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
for snippet in all_snippets
|
|
469
|
-
if not snippet.enrichments
|
|
470
|
-
or not next(
|
|
471
|
-
enrichment
|
|
472
|
-
for enrichment in snippet.enrichments
|
|
473
|
-
if enrichment.type == EnrichmentType.SUMMARIZATION
|
|
635
|
+
all_snippets = (
|
|
636
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
637
|
+
commit_sha
|
|
474
638
|
)
|
|
475
|
-
|
|
476
|
-
if not
|
|
477
|
-
await step.skip("
|
|
639
|
+
)
|
|
640
|
+
if not all_snippets:
|
|
641
|
+
await step.skip("No snippets to enrich")
|
|
478
642
|
return
|
|
479
643
|
|
|
480
644
|
# Enrich snippets
|
|
481
|
-
await step.set_total(len(
|
|
645
|
+
await step.set_total(len(all_snippets))
|
|
482
646
|
snippet_map = {
|
|
483
|
-
snippet.id: snippet
|
|
484
|
-
for snippet in snippets_without_summary
|
|
485
|
-
if snippet.id
|
|
647
|
+
str(snippet.id): snippet for snippet in all_snippets if snippet.id
|
|
486
648
|
}
|
|
487
649
|
|
|
488
650
|
enrichment_requests = [
|
|
489
651
|
GenericEnrichmentRequest(
|
|
490
|
-
id=snippet_id,
|
|
652
|
+
id=str(snippet_id),
|
|
491
653
|
text=snippet.content,
|
|
492
654
|
system_prompt=SUMMARIZATION_SYSTEM_PROMPT,
|
|
493
655
|
)
|
|
@@ -497,12 +659,27 @@ class CommitIndexingApplicationService:
|
|
|
497
659
|
processed = 0
|
|
498
660
|
async for result in self.enricher_service.enrich(enrichment_requests):
|
|
499
661
|
snippet = snippet_map[result.id]
|
|
500
|
-
|
|
501
|
-
|
|
662
|
+
db_summary = await self.enrichment_v2_repository.save(
|
|
663
|
+
SnippetEnrichmentSummary(content=result.text)
|
|
664
|
+
)
|
|
665
|
+
if not db_summary.id:
|
|
666
|
+
raise ValueError(
|
|
667
|
+
f"Failed to save snippet enrichment for commit {commit_sha}"
|
|
668
|
+
)
|
|
669
|
+
await self.enrichment_association_repository.save(
|
|
670
|
+
EnrichmentAssociation(
|
|
671
|
+
enrichment_id=db_summary.id,
|
|
672
|
+
entity_type=db_entities.EnrichmentV2.__tablename__,
|
|
673
|
+
entity_id=str(snippet.id),
|
|
674
|
+
)
|
|
675
|
+
)
|
|
676
|
+
await self.enrichment_association_repository.save(
|
|
677
|
+
EnrichmentAssociation(
|
|
678
|
+
enrichment_id=db_summary.id,
|
|
679
|
+
entity_type=db_entities.GitCommit.__tablename__,
|
|
680
|
+
entity_id=commit_sha,
|
|
681
|
+
)
|
|
502
682
|
)
|
|
503
|
-
|
|
504
|
-
await self.snippet_repository.save_snippets(commit_sha, [snippet])
|
|
505
|
-
|
|
506
683
|
processed += 1
|
|
507
684
|
await step.set_current(processed, "Enriching snippets for commit")
|
|
508
685
|
|
|
@@ -515,40 +692,61 @@ class CommitIndexingApplicationService:
|
|
|
515
692
|
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
516
693
|
trackable_id=repository_id,
|
|
517
694
|
) as step:
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
695
|
+
# Get all snippet enrichments for this commit
|
|
696
|
+
all_snippet_enrichments = (
|
|
697
|
+
await self.enrichment_query_service.get_all_snippets_for_commit(
|
|
698
|
+
commit_sha
|
|
699
|
+
)
|
|
522
700
|
)
|
|
523
|
-
if not
|
|
524
|
-
await step.skip("
|
|
701
|
+
if not all_snippet_enrichments:
|
|
702
|
+
await step.skip("No snippets to create summary embeddings")
|
|
525
703
|
return
|
|
526
704
|
|
|
527
|
-
|
|
528
|
-
|
|
705
|
+
# Get summary enrichments that point to these snippet enrichments
|
|
706
|
+
query = EnrichmentAssociationQueryBuilder.for_enrichment_associations(
|
|
707
|
+
entity_type=db_entities.EnrichmentV2.__tablename__,
|
|
708
|
+
entity_ids=[
|
|
709
|
+
str(snippet.id) for snippet in all_snippet_enrichments if snippet.id
|
|
710
|
+
],
|
|
711
|
+
)
|
|
712
|
+
summary_enrichment_associations = (
|
|
713
|
+
await self.enrichment_association_repository.find(query)
|
|
714
|
+
)
|
|
715
|
+
|
|
716
|
+
if not summary_enrichment_associations:
|
|
717
|
+
await step.skip("No summary enrichments found for snippets")
|
|
718
|
+
return
|
|
529
719
|
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
720
|
+
# Get the actual summary enrichments
|
|
721
|
+
summary_enrichments = await self.enrichment_v2_repository.find(
|
|
722
|
+
QueryBuilder().filter(
|
|
723
|
+
"id",
|
|
724
|
+
FilterOperator.IN,
|
|
725
|
+
[
|
|
726
|
+
association.enrichment_id
|
|
727
|
+
for association in summary_enrichment_associations
|
|
728
|
+
],
|
|
537
729
|
)
|
|
730
|
+
)
|
|
538
731
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
await step.skip("No snippets with summaries to create text embeddings")
|
|
732
|
+
# Check if embeddings already exist for these summaries
|
|
733
|
+
new_summaries = await self._new_snippets_for_type(
|
|
734
|
+
summary_enrichments, EmbeddingType.TEXT
|
|
735
|
+
)
|
|
736
|
+
if not new_summaries:
|
|
737
|
+
await step.skip("All snippets already have text embeddings")
|
|
546
738
|
return
|
|
547
739
|
|
|
740
|
+
await step.set_total(len(new_summaries))
|
|
741
|
+
processed = 0
|
|
742
|
+
|
|
743
|
+
# Create documents from the summary enrichments
|
|
548
744
|
documents_with_summaries = [
|
|
549
|
-
Document(snippet_id=
|
|
550
|
-
for
|
|
745
|
+
Document(snippet_id=str(summary.id), text=summary.content)
|
|
746
|
+
for summary in new_summaries
|
|
747
|
+
if summary.id
|
|
551
748
|
]
|
|
749
|
+
|
|
552
750
|
async for result in self.text_search_service.index_documents(
|
|
553
751
|
IndexRequest(documents=documents_with_summaries)
|
|
554
752
|
):
|
|
@@ -567,23 +765,14 @@ class CommitIndexingApplicationService:
|
|
|
567
765
|
await step.set_total(3)
|
|
568
766
|
|
|
569
767
|
# Check if architecture enrichment already exists for this commit
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
entity_ids=[commit_sha],
|
|
574
|
-
)
|
|
575
|
-
|
|
576
|
-
# Check if architecture enrichment already exists
|
|
577
|
-
has_architecture = any(
|
|
578
|
-
enrichment.type == "architecture" for enrichment in existing_enrichments
|
|
579
|
-
)
|
|
580
|
-
|
|
581
|
-
if has_architecture:
|
|
768
|
+
if await self.enrichment_query_service.has_architecture_for_commit(
|
|
769
|
+
commit_sha
|
|
770
|
+
):
|
|
582
771
|
await step.skip("Architecture enrichment already exists for commit")
|
|
583
772
|
return
|
|
584
773
|
|
|
585
774
|
# Get repository path
|
|
586
|
-
repo = await self.repo_repository.
|
|
775
|
+
repo = await self.repo_repository.get(repository_id)
|
|
587
776
|
if not repo.cloned_path:
|
|
588
777
|
raise ValueError(f"Repository {repository_id} has never been cloned")
|
|
589
778
|
|
|
@@ -610,13 +799,20 @@ class CommitIndexingApplicationService:
|
|
|
610
799
|
enriched_content = response.text
|
|
611
800
|
|
|
612
801
|
# Create and save architecture enrichment with enriched content
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
802
|
+
enrichment = await self.enrichment_v2_repository.save(
|
|
803
|
+
PhysicalArchitectureEnrichment(
|
|
804
|
+
content=enriched_content,
|
|
805
|
+
)
|
|
616
806
|
)
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
807
|
+
if not enrichment or not enrichment.id:
|
|
808
|
+
raise ValueError(
|
|
809
|
+
f"Failed to save architecture enrichment for commit {commit_sha}"
|
|
810
|
+
)
|
|
811
|
+
await self.enrichment_association_repository.save(
|
|
812
|
+
CommitEnrichmentAssociation(
|
|
813
|
+
enrichment_id=enrichment.id,
|
|
814
|
+
entity_id=commit_sha,
|
|
815
|
+
)
|
|
620
816
|
)
|
|
621
817
|
|
|
622
818
|
await step.set_current(3, "Architecture enrichment completed")
|
|
@@ -629,34 +825,26 @@ class CommitIndexingApplicationService:
|
|
|
629
825
|
trackable_id=repository_id,
|
|
630
826
|
) as step:
|
|
631
827
|
# Check if API docs already exist for this commit
|
|
632
|
-
|
|
633
|
-
await self.enrichment_v2_repository.enrichments_for_entity_type(
|
|
634
|
-
entity_type="git_commit",
|
|
635
|
-
entity_ids=[commit_sha],
|
|
636
|
-
)
|
|
637
|
-
)
|
|
638
|
-
|
|
639
|
-
has_api_docs = any(
|
|
640
|
-
e.type == ENRICHMENT_TYPE_USAGE
|
|
641
|
-
and e.subtype == ENRICHMENT_SUBTYPE_API_DOCS
|
|
642
|
-
for e in existing_enrichments
|
|
643
|
-
)
|
|
644
|
-
|
|
645
|
-
if has_api_docs:
|
|
828
|
+
if await self.enrichment_query_service.has_api_docs_for_commit(commit_sha):
|
|
646
829
|
await step.skip("API docs already exist for commit")
|
|
647
830
|
return
|
|
648
831
|
|
|
649
832
|
# Get repository for metadata
|
|
650
|
-
repo = await self.repo_repository.
|
|
833
|
+
repo = await self.repo_repository.get(repository_id)
|
|
651
834
|
if not repo:
|
|
652
835
|
raise ValueError(f"Repository {repository_id} not found")
|
|
653
836
|
str(repo.sanitized_remote_uri)
|
|
654
837
|
|
|
655
|
-
|
|
838
|
+
files = await self.git_file_repository.find(
|
|
839
|
+
GitFileQueryBuilder().for_commit_sha(commit_sha)
|
|
840
|
+
)
|
|
841
|
+
if not files:
|
|
842
|
+
await step.skip("No files to extract API docs from")
|
|
843
|
+
return
|
|
656
844
|
|
|
657
845
|
# Group files by language
|
|
658
846
|
lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
|
|
659
|
-
for file in
|
|
847
|
+
for file in files:
|
|
660
848
|
try:
|
|
661
849
|
lang = LanguageMapping.get_language_for_extension(file.extension)
|
|
662
850
|
except ValueError:
|
|
@@ -670,28 +858,172 @@ class CommitIndexingApplicationService:
|
|
|
670
858
|
for i, (lang, lang_files) in enumerate(lang_files_map.items()):
|
|
671
859
|
await step.set_current(i, f"Extracting API docs for {lang}")
|
|
672
860
|
enrichments = extractor.extract_api_docs(
|
|
673
|
-
lang_files,
|
|
674
|
-
lang,
|
|
675
|
-
commit_sha,
|
|
861
|
+
files=lang_files,
|
|
862
|
+
language=lang,
|
|
676
863
|
include_private=False,
|
|
677
864
|
)
|
|
678
865
|
all_enrichments.extend(enrichments)
|
|
679
866
|
|
|
680
867
|
# Save all enrichments
|
|
681
868
|
if all_enrichments:
|
|
682
|
-
await self.enrichment_v2_repository.
|
|
683
|
-
all_enrichments
|
|
869
|
+
saved_enrichments = await self.enrichment_v2_repository.save_bulk(
|
|
870
|
+
all_enrichments # type: ignore[arg-type]
|
|
871
|
+
)
|
|
872
|
+
await self.enrichment_association_repository.save_bulk(
|
|
873
|
+
[
|
|
874
|
+
CommitEnrichmentAssociation(
|
|
875
|
+
enrichment_id=enrichment.id,
|
|
876
|
+
entity_id=commit_sha,
|
|
877
|
+
)
|
|
878
|
+
for enrichment in saved_enrichments
|
|
879
|
+
if enrichment.id
|
|
880
|
+
]
|
|
684
881
|
)
|
|
685
882
|
|
|
883
|
+
async def process_commit_description(
|
|
884
|
+
self, repository_id: int, commit_sha: str
|
|
885
|
+
) -> None:
|
|
886
|
+
"""Handle COMMIT_DESCRIPTION task - generate commit descriptions."""
|
|
887
|
+
async with self.operation.create_child(
|
|
888
|
+
TaskOperation.CREATE_COMMIT_DESCRIPTION_FOR_COMMIT,
|
|
889
|
+
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
890
|
+
trackable_id=repository_id,
|
|
891
|
+
) as step:
|
|
892
|
+
# Check if commit description already exists for this commit
|
|
893
|
+
if await self.enrichment_query_service.has_commit_description_for_commit(
|
|
894
|
+
commit_sha
|
|
895
|
+
):
|
|
896
|
+
await step.skip("Commit description already exists for commit")
|
|
897
|
+
return
|
|
898
|
+
|
|
899
|
+
# Get repository path
|
|
900
|
+
repo = await self.repo_repository.get(repository_id)
|
|
901
|
+
if not repo.cloned_path:
|
|
902
|
+
raise ValueError(f"Repository {repository_id} has never been cloned")
|
|
903
|
+
|
|
904
|
+
await step.set_total(3)
|
|
905
|
+
await step.set_current(1, "Getting commit diff")
|
|
906
|
+
|
|
907
|
+
# Get the diff for this commit
|
|
908
|
+
diff = await self.scanner.git_adapter.get_commit_diff(
|
|
909
|
+
repo.cloned_path, commit_sha
|
|
910
|
+
)
|
|
911
|
+
|
|
912
|
+
if not diff or len(diff.strip()) == 0:
|
|
913
|
+
await step.skip("No diff found for commit")
|
|
914
|
+
return
|
|
915
|
+
|
|
916
|
+
await step.set_current(2, "Enriching commit description with LLM")
|
|
917
|
+
|
|
918
|
+
# Enrich the diff through the enricher
|
|
919
|
+
enrichment_request = GenericEnrichmentRequest(
|
|
920
|
+
id=commit_sha,
|
|
921
|
+
text=diff,
|
|
922
|
+
system_prompt=COMMIT_DESCRIPTION_SYSTEM_PROMPT,
|
|
923
|
+
)
|
|
924
|
+
|
|
925
|
+
enriched_content = ""
|
|
926
|
+
async for response in self.enricher_service.enrich([enrichment_request]):
|
|
927
|
+
enriched_content = response.text
|
|
928
|
+
|
|
929
|
+
# Create and save commit description enrichment
|
|
930
|
+
enrichment = await self.enrichment_v2_repository.save(
|
|
931
|
+
CommitDescriptionEnrichment(
|
|
932
|
+
content=enriched_content,
|
|
933
|
+
)
|
|
934
|
+
)
|
|
935
|
+
if not enrichment or not enrichment.id:
|
|
936
|
+
raise ValueError(
|
|
937
|
+
f"Failed to save commit description enrichment for commit "
|
|
938
|
+
f"{commit_sha}"
|
|
939
|
+
)
|
|
940
|
+
await self.enrichment_association_repository.save(
|
|
941
|
+
CommitEnrichmentAssociation(
|
|
942
|
+
enrichment_id=enrichment.id,
|
|
943
|
+
entity_id=commit_sha,
|
|
944
|
+
)
|
|
945
|
+
)
|
|
946
|
+
|
|
947
|
+
await step.set_current(3, "Commit description enrichment completed")
|
|
948
|
+
|
|
949
|
+
async def process_database_schema(
|
|
950
|
+
self, repository_id: int, commit_sha: str
|
|
951
|
+
) -> None:
|
|
952
|
+
"""Handle DATABASE_SCHEMA task - discover and document database schemas."""
|
|
953
|
+
async with self.operation.create_child(
|
|
954
|
+
TaskOperation.CREATE_DATABASE_SCHEMA_FOR_COMMIT,
|
|
955
|
+
trackable_type=TrackableType.KODIT_REPOSITORY,
|
|
956
|
+
trackable_id=repository_id,
|
|
957
|
+
) as step:
|
|
958
|
+
# Check if database schema already exists for this commit
|
|
959
|
+
if await self.enrichment_query_service.has_database_schema_for_commit(
|
|
960
|
+
commit_sha
|
|
961
|
+
):
|
|
962
|
+
await step.skip("Database schema already exists for commit")
|
|
963
|
+
return
|
|
964
|
+
|
|
965
|
+
# Get repository path
|
|
966
|
+
repo = await self.repo_repository.get(repository_id)
|
|
967
|
+
if not repo.cloned_path:
|
|
968
|
+
raise ValueError(f"Repository {repository_id} has never been cloned")
|
|
969
|
+
|
|
970
|
+
await step.set_total(3)
|
|
971
|
+
await step.set_current(1, "Discovering database schemas")
|
|
972
|
+
|
|
973
|
+
# Discover database schemas
|
|
974
|
+
schema_report = await self.database_schema_detector.discover_schemas(
|
|
975
|
+
repo.cloned_path
|
|
976
|
+
)
|
|
977
|
+
|
|
978
|
+
if "No database schemas detected" in schema_report:
|
|
979
|
+
await step.skip("No database schemas found in repository")
|
|
980
|
+
return
|
|
981
|
+
|
|
982
|
+
await step.set_current(2, "Enriching schema documentation with LLM")
|
|
983
|
+
|
|
984
|
+
# Enrich the schema report through the enricher
|
|
985
|
+
enrichment_request = GenericEnrichmentRequest(
|
|
986
|
+
id=commit_sha,
|
|
987
|
+
text=DATABASE_SCHEMA_TASK_PROMPT.format(schema_report=schema_report),
|
|
988
|
+
system_prompt=DATABASE_SCHEMA_SYSTEM_PROMPT,
|
|
989
|
+
)
|
|
990
|
+
|
|
991
|
+
enriched_content = ""
|
|
992
|
+
async for response in self.enricher_service.enrich([enrichment_request]):
|
|
993
|
+
enriched_content = response.text
|
|
994
|
+
|
|
995
|
+
# Create and save database schema enrichment
|
|
996
|
+
enrichment = await self.enrichment_v2_repository.save(
|
|
997
|
+
DatabaseSchemaEnrichment(
|
|
998
|
+
content=enriched_content,
|
|
999
|
+
)
|
|
1000
|
+
)
|
|
1001
|
+
if not enrichment or not enrichment.id:
|
|
1002
|
+
raise ValueError(
|
|
1003
|
+
f"Failed to save database schema enrichment for commit {commit_sha}"
|
|
1004
|
+
)
|
|
1005
|
+
await self.enrichment_association_repository.save(
|
|
1006
|
+
CommitEnrichmentAssociation(
|
|
1007
|
+
enrichment_id=enrichment.id,
|
|
1008
|
+
entity_id=commit_sha,
|
|
1009
|
+
)
|
|
1010
|
+
)
|
|
1011
|
+
|
|
1012
|
+
await step.set_current(3, "Database schema enrichment completed")
|
|
1013
|
+
|
|
686
1014
|
async def _new_snippets_for_type(
|
|
687
|
-
self, all_snippets: list[
|
|
688
|
-
) -> list[
|
|
1015
|
+
self, all_snippets: list[EnrichmentV2], embedding_type: EmbeddingType
|
|
1016
|
+
) -> list[EnrichmentV2]:
|
|
689
1017
|
"""Get new snippets for a given type."""
|
|
690
1018
|
existing_embeddings = (
|
|
691
1019
|
await self.embedding_repository.list_embeddings_by_snippet_ids_and_type(
|
|
692
|
-
[s.id for s in all_snippets], embedding_type
|
|
1020
|
+
[str(s.id) for s in all_snippets], embedding_type
|
|
693
1021
|
)
|
|
694
1022
|
)
|
|
1023
|
+
# TODO(Phil): Can't do this incrementally yet because like the API, we don't
|
|
1024
|
+
# have a unified embedding repository
|
|
1025
|
+
if existing_embeddings:
|
|
1026
|
+
return []
|
|
695
1027
|
existing_embeddings_by_snippet_id = {
|
|
696
1028
|
embedding.snippet_id: embedding for embedding in existing_embeddings
|
|
697
1029
|
}
|