kodit 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/app.py +53 -23
- kodit/application/factories/reporting_factory.py +6 -2
- kodit/application/factories/server_factory.py +311 -0
- kodit/application/services/code_search_application_service.py +144 -0
- kodit/application/services/commit_indexing_application_service.py +543 -0
- kodit/application/services/indexing_worker_service.py +13 -44
- kodit/application/services/queue_service.py +24 -3
- kodit/application/services/reporting.py +0 -2
- kodit/application/services/sync_scheduler.py +15 -31
- kodit/cli.py +2 -753
- kodit/cli_utils.py +2 -9
- kodit/config.py +1 -94
- kodit/database.py +38 -1
- kodit/domain/{entities.py → entities/__init__.py} +50 -195
- kodit/domain/entities/git.py +190 -0
- kodit/domain/factories/__init__.py +1 -0
- kodit/domain/factories/git_repo_factory.py +76 -0
- kodit/domain/protocols.py +263 -64
- kodit/domain/services/bm25_service.py +5 -1
- kodit/domain/services/embedding_service.py +3 -0
- kodit/domain/services/git_repository_service.py +429 -0
- kodit/domain/services/git_service.py +300 -0
- kodit/domain/services/task_status_query_service.py +2 -2
- kodit/domain/value_objects.py +83 -114
- kodit/infrastructure/api/client/__init__.py +0 -2
- kodit/infrastructure/api/v1/__init__.py +0 -4
- kodit/infrastructure/api/v1/dependencies.py +92 -46
- kodit/infrastructure/api/v1/routers/__init__.py +0 -6
- kodit/infrastructure/api/v1/routers/commits.py +271 -0
- kodit/infrastructure/api/v1/routers/queue.py +2 -2
- kodit/infrastructure/api/v1/routers/repositories.py +282 -0
- kodit/infrastructure/api/v1/routers/search.py +31 -14
- kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
- kodit/infrastructure/api/v1/schemas/commit.py +96 -0
- kodit/infrastructure/api/v1/schemas/context.py +2 -0
- kodit/infrastructure/api/v1/schemas/repository.py +128 -0
- kodit/infrastructure/api/v1/schemas/search.py +12 -9
- kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
- kodit/infrastructure/api/v1/schemas/tag.py +31 -0
- kodit/infrastructure/api/v1/schemas/task_status.py +2 -0
- kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
- kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
- kodit/infrastructure/cloning/git/git_python_adaptor.py +467 -0
- kodit/infrastructure/cloning/git/working_copy.py +1 -1
- kodit/infrastructure/embedding/embedding_factory.py +3 -2
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
- kodit/infrastructure/enrichment/litellm_enrichment_provider.py +19 -26
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/git_mapper.py +193 -0
- kodit/infrastructure/mappers/snippet_mapper.py +106 -0
- kodit/infrastructure/mappers/task_mapper.py +5 -44
- kodit/infrastructure/reporting/log_progress.py +8 -5
- kodit/infrastructure/reporting/telemetry_progress.py +21 -0
- kodit/infrastructure/slicing/slicer.py +32 -31
- kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
- kodit/infrastructure/sqlalchemy/entities.py +394 -158
- kodit/infrastructure/sqlalchemy/git_branch_repository.py +263 -0
- kodit/infrastructure/sqlalchemy/git_commit_repository.py +337 -0
- kodit/infrastructure/sqlalchemy/git_repository.py +252 -0
- kodit/infrastructure/sqlalchemy/git_tag_repository.py +257 -0
- kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +484 -0
- kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
- kodit/infrastructure/sqlalchemy/task_status_repository.py +24 -12
- kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
- kodit/mcp.py +12 -30
- kodit/migrations/env.py +1 -0
- kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
- kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
- kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
- kodit/py.typed +0 -0
- kodit/utils/dump_openapi.py +7 -4
- kodit/utils/path_utils.py +29 -0
- {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/METADATA +3 -3
- kodit-0.5.0.dist-info/RECORD +137 -0
- kodit/application/factories/code_indexing_factory.py +0 -195
- kodit/application/services/auto_indexing_service.py +0 -99
- kodit/application/services/code_indexing_application_service.py +0 -410
- kodit/domain/services/index_query_service.py +0 -70
- kodit/domain/services/index_service.py +0 -269
- kodit/infrastructure/api/client/index_client.py +0 -57
- kodit/infrastructure/api/v1/routers/indexes.py +0 -164
- kodit/infrastructure/api/v1/schemas/index.py +0 -101
- kodit/infrastructure/bm25/bm25_factory.py +0 -28
- kodit/infrastructure/cloning/__init__.py +0 -1
- kodit/infrastructure/cloning/metadata.py +0 -98
- kodit/infrastructure/mappers/index_mapper.py +0 -345
- kodit/infrastructure/reporting/tdqm_progress.py +0 -38
- kodit/infrastructure/slicing/language_detection_service.py +0 -18
- kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
- kodit-0.4.3.dist-info/RECORD +0 -125
- {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/WHEEL +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/entry_points.txt +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,543 @@
|
|
|
1
|
+
"""Application services for commit indexing operations."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import structlog
|
|
7
|
+
from pydantic import AnyUrl
|
|
8
|
+
|
|
9
|
+
from kodit.application.services.queue_service import QueueService
|
|
10
|
+
from kodit.application.services.reporting import ProgressTracker
|
|
11
|
+
from kodit.domain.entities import Task
|
|
12
|
+
from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2
|
|
13
|
+
from kodit.domain.factories.git_repo_factory import GitRepoFactory
|
|
14
|
+
from kodit.domain.protocols import (
|
|
15
|
+
GitBranchRepository,
|
|
16
|
+
GitCommitRepository,
|
|
17
|
+
GitRepoRepository,
|
|
18
|
+
GitTagRepository,
|
|
19
|
+
SnippetRepositoryV2,
|
|
20
|
+
)
|
|
21
|
+
from kodit.domain.services.bm25_service import BM25DomainService
|
|
22
|
+
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
23
|
+
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
24
|
+
from kodit.domain.services.git_repository_service import (
|
|
25
|
+
GitRepositoryScanner,
|
|
26
|
+
RepositoryCloner,
|
|
27
|
+
)
|
|
28
|
+
from kodit.domain.value_objects import (
|
|
29
|
+
DeleteRequest,
|
|
30
|
+
Document,
|
|
31
|
+
Enrichment,
|
|
32
|
+
EnrichmentIndexRequest,
|
|
33
|
+
EnrichmentRequest,
|
|
34
|
+
EnrichmentType,
|
|
35
|
+
IndexRequest,
|
|
36
|
+
LanguageMapping,
|
|
37
|
+
PrescribedOperations,
|
|
38
|
+
QueuePriority,
|
|
39
|
+
TaskOperation,
|
|
40
|
+
TrackableType,
|
|
41
|
+
)
|
|
42
|
+
from kodit.infrastructure.slicing.slicer import Slicer
|
|
43
|
+
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
44
|
+
SqlAlchemyEmbeddingRepository,
|
|
45
|
+
)
|
|
46
|
+
from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class CommitIndexingApplicationService:
    """Application service for commit indexing operations."""

    def __init__(  # noqa: PLR0913
        self,
        snippet_v2_repository: SnippetRepositoryV2,
        repo_repository: GitRepoRepository,
        git_commit_repository: GitCommitRepository,
        git_branch_repository: GitBranchRepository,
        git_tag_repository: GitTagRepository,
        operation: ProgressTracker,
        scanner: GitRepositoryScanner,
        cloner: RepositoryCloner,
        snippet_repository: SnippetRepositoryV2,
        slicer: Slicer,
        queue: QueueService,
        bm25_service: BM25DomainService,
        code_search_service: EmbeddingDomainService,
        text_search_service: EmbeddingDomainService,
        enrichment_service: EnrichmentDomainService,
        embedding_repository: SqlAlchemyEmbeddingRepository,
    ) -> None:
        """Initialize the commit indexing application service.

        Args:
            snippet_v2_repository: Repository for snippet data. NOTE(review):
                historically this was assigned to ``self.snippet_repository``
                and then immediately overwritten by ``snippet_repository``, so
                it is effectively unused; the parameter is kept for
                backward-compatible call sites. Pass the same repository to
                both parameters.
            repo_repository: Repository for Git repository data.
            git_commit_repository: Repository for Git commit data.
            git_branch_repository: Repository for Git branch data.
            git_tag_repository: Repository for Git tag data.
            operation: Progress tracker for reporting operations.
            scanner: Scanner used to read repository metadata.
            cloner: Service that clones remote repositories.
            snippet_repository: Repository used for all snippet persistence.
            slicer: Snippet extraction service.
            queue: Task queue service.
            bm25_service: Keyword (BM25) index service.
            code_search_service: Embedding service for code documents.
            text_search_service: Embedding service for summary text.
            enrichment_service: Snippet summarization service.
            embedding_repository: Repository for stored embeddings.

        """
        # The original double assignment of self.snippet_repository (first
        # snippet_v2_repository, then snippet_repository) is collapsed to the
        # one that actually took effect.
        self.snippet_repository = snippet_repository
        self.repo_repository = repo_repository
        self.git_commit_repository = git_commit_repository
        self.git_branch_repository = git_branch_repository
        self.git_tag_repository = git_tag_repository
        self.operation = operation
        self.scanner = scanner
        self.cloner = cloner
        self.slicer = slicer
        self.queue = queue
        self.bm25_service = bm25_service
        self.code_search_service = code_search_service
        self.text_search_service = text_search_service
        self.enrichment_service = enrichment_service
        self.embedding_repository = embedding_repository
        self._log = structlog.get_logger(__name__)
|
|
98
|
+
|
|
99
|
+
async def create_git_repository(self, remote_uri: AnyUrl) -> GitRepo:
    """Create a new Git repository."""
    async with self.operation.create_child(
        TaskOperation.CREATE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
    ):
        # Build the domain entity from the remote URI, persist it, then
        # queue the prescribed indexing pipeline for the new repository.
        new_repo = GitRepoFactory.create_from_remote_uri(remote_uri)
        saved_repo = await self.repo_repository.save(new_repo)
        await self.queue.enqueue_tasks(
            tasks=PrescribedOperations.CREATE_NEW_REPOSITORY,
            base_priority=QueuePriority.USER_INITIATED,
            payload={"repository_id": saved_repo.id},
        )
        return saved_repo
|
|
113
|
+
|
|
114
|
+
async def delete_git_repository(self, repo_id: int) -> bool:
    """Delete a Git repository by ID.

    Returns True on success, False when the repository does not exist.
    """
    existing = await self.repo_repository.get_by_id(repo_id)
    if not existing:
        return False
    # Delegate to the full deletion process, which removes snippets,
    # search indices, branches, tags and commits in dependency order.
    await self.process_delete_repo(repo_id)
    return True
|
|
123
|
+
|
|
124
|
+
# TODO(Phil): Make this polymorphic
async def run_task(self, task: Task) -> None:  # noqa: PLR0912, C901
    """Run a task.

    Dispatches to the repository-level or commit-level handler based on
    the task's operation type.

    Raises:
        ValueError: If a required payload entry is missing or the task
            type is not recognized.

    """
    if task.type.is_repository_operation():
        # Use .get() so a missing key raises the intended ValueError
        # below instead of an opaque KeyError.
        repo_id = task.payload.get("repository_id")
        if not repo_id:
            raise ValueError("Repository ID is required")
        if task.type == TaskOperation.CLONE_REPOSITORY:
            await self.process_clone_repo(repo_id)
        elif task.type == TaskOperation.SCAN_REPOSITORY:
            await self.process_scan_repo(repo_id)
        elif task.type == TaskOperation.DELETE_REPOSITORY:
            await self.process_delete_repo(repo_id)
        else:
            raise ValueError(f"Unknown task type: {task.type}")
    elif task.type.is_commit_operation():
        repository_id = task.payload.get("repository_id")
        if not repository_id:
            raise ValueError("Repository ID is required")
        commit_sha = task.payload.get("commit_sha")
        if not commit_sha:
            raise ValueError("Commit SHA is required")
        if task.type == TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT:
            await self.process_snippets_for_commit(repository_id, commit_sha)
        elif task.type == TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT:
            await self.process_bm25_index(repository_id, commit_sha)
        elif task.type == TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT:
            await self.process_code_embeddings(repository_id, commit_sha)
        elif task.type == TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT:
            await self.process_enrich(repository_id, commit_sha)
        elif task.type == TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT:
            await self.process_summary_embeddings(repository_id, commit_sha)
        else:
            raise ValueError(f"Unknown task type: {task.type}")
    else:
        raise ValueError(f"Unknown task type: {task.type}")
|
|
160
|
+
|
|
161
|
+
async def process_clone_repo(self, repository_id: int) -> None:
    """Clone a repository.

    Raises:
        ValueError: If the repository does not exist.

    """
    async with self.operation.create_child(
        TaskOperation.CLONE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ):
        repo = await self.repo_repository.get_by_id(repository_id)
        # get_by_id may return None (see delete_git_repository); fail with
        # a clear error instead of an AttributeError on repo.cloned_path.
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")
        repo.cloned_path = await self.cloner.clone_repository(repo.remote_uri)
        await self.repo_repository.save(repo)
|
|
171
|
+
|
|
172
|
+
async def process_scan_repo(self, repository_id: int) -> None:
    """Scan a repository.

    Reads all metadata from the cloned working copy, persists commits,
    branches and tags, and enqueues indexing for the tracking branch's
    head commit.

    Raises:
        ValueError: If the repository does not exist, was never cloned,
            or has no tracking branch / head commit.

    """
    async with self.operation.create_child(
        TaskOperation.SCAN_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        await step.set_total(6)
        repo = await self.repo_repository.get_by_id(repository_id)
        # Guard: get_by_id may return None; raise a clear error rather
        # than an AttributeError on repo.cloned_path.
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")
        if not repo.cloned_path:
            raise ValueError(f"Repository {repository_id} has never been cloned")

        # Scan the repository to get all metadata
        await step.set_current(0, "Scanning repository")
        scan_result = await self.scanner.scan_repository(repo.cloned_path)

        # Update repo with scan result (this sets num_commits, num_branches, etc.)
        await step.set_current(1, "Updating repository with scan result")
        repo.update_with_scan_result(scan_result)
        await self.repo_repository.save(repo)

        # Save commits, branches, and tags to their dedicated repositories
        await step.set_current(2, "Saving commits")
        if scan_result.all_commits:
            await self.git_commit_repository.save_bulk(
                scan_result.all_commits, repository_id
            )

        await step.set_current(3, "Saving branches")
        if scan_result.branches:
            await self.git_branch_repository.save_bulk(
                scan_result.branches, repository_id
            )

        await step.set_current(4, "Saving tags")
        if scan_result.all_tags:
            await self.git_tag_repository.save_bulk(
                scan_result.all_tags, repository_id
            )

        await step.set_current(5, "Enqueuing commit indexing tasks")
        if not repo.tracking_branch:
            raise ValueError(f"Repository {repository_id} has no tracking branch")
        commit_sha = repo.tracking_branch.head_commit.commit_sha
        if not commit_sha:
            raise ValueError(f"Repository {repository_id} has no head commit")

        await self.queue.enqueue_tasks(
            tasks=PrescribedOperations.INDEX_COMMIT,
            base_priority=QueuePriority.USER_INITIATED,
            payload={"commit_sha": commit_sha, "repository_id": repository_id},
        )
|
|
224
|
+
|
|
225
|
+
async def process_delete_repo(self, repository_id: int) -> None:
    """Delete a repository.

    Removes all derived data before the repository row itself. The step
    order matters: snippet IDs must be collected before the commit ->
    snippet associations are deleted, and branches/tags must go before
    commits because they reference commits by SHA (per the comments on
    steps 4 and 5 below).

    Raises:
        ValueError: If the repository does not exist.
    """
    async with self.operation.create_child(
        TaskOperation.DELETE_REPOSITORY,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ):
        repo = await self.repo_repository.get_by_id(repository_id)
        if not repo:
            raise ValueError(f"Repository {repository_id} not found")

        # Get all commit SHAs for this repository first (needed for cleanup)
        commits = await self.git_commit_repository.get_by_repo_id(repository_id)
        commit_shas = [commit.commit_sha for commit in commits]

        # Step 1: Get all snippet IDs that are associated with these commits FIRST
        # (before deleting the associations)
        all_snippet_ids = []
        if commit_shas:
            for commit_sha in commit_shas:
                snippets = await self.snippet_repository.get_snippets_for_commit(
                    commit_sha
                )
                all_snippet_ids.extend([
                    snippet.id for snippet in snippets if snippet.id
                ])

        # Step 2: Delete from BM25 and embedding indices
        if all_snippet_ids:
            # Convert to strings as DeleteRequest expects list[str]
            snippet_id_strings = [
                str(snippet_id) for snippet_id in all_snippet_ids
            ]
            delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
            await self.bm25_service.delete_documents(delete_request)

            # Delete embeddings for each snippet
            for snippet_id in all_snippet_ids:
                await self.embedding_repository.delete_embeddings_by_snippet_id(
                    snippet_id
                )

        # Step 3: Delete snippet associations for all commits
        for commit_sha in commit_shas:
            await self.snippet_repository.delete_snippets_for_commit(commit_sha)

        # Step 4: Delete branches (they reference commits via head_commit_sha)
        await self.git_branch_repository.delete_by_repo_id(repository_id)

        # Step 5: Delete tags (they reference commits via target_commit_sha)
        await self.git_tag_repository.delete_by_repo_id(repository_id)

        # Step 6: Delete commits and their files
        await self.git_commit_repository.delete_by_repo_id(repository_id)

        # Step 7: Finally delete the repository
        await self.repo_repository.delete(repo.sanitized_remote_uri)
|
|
282
|
+
|
|
283
|
+
async def process_snippets_for_commit(
    self, repository_id: int, commit_sha: str
) -> None:
    """Generate snippets for a repository.

    Checks out the given commit in the cloned working copy, extracts
    snippets per supported language, deduplicates them by content SHA,
    and persists them against the commit.

    Raises:
        ValueError: If the commit is unknown or the repository was never
            cloned.

    """
    async with self.operation.create_child(
        operation=TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        # Have we already processed this commit? If yes, skip.
        if await self.snippet_repository.get_snippets_for_commit(commit_sha):
            await step.skip("All snippets already extracted for commit")
            return

        commit = await self.git_commit_repository.get_by_sha(commit_sha)
        # Guard: fail clearly rather than with an AttributeError below.
        if not commit:
            raise ValueError(f"Commit {commit_sha} not found")

        # Load files on demand for snippet extraction (performance optimization)
        # Instead of using commit.files (which may be empty), load files directly
        repo = await self.repo_repository.get_by_id(repository_id)
        if not repo or not repo.cloned_path:
            raise ValueError(f"Repository {repository_id} has never been cloned")

        # Ensure we're on the specific commit for file access
        await self.scanner.git_adapter.checkout_commit(repo.cloned_path, commit_sha)

        # Get files directly from Git adapter for this specific commit
        files_data = await self.scanner.git_adapter.get_commit_files(
            repo.cloned_path, commit_sha
        )

        # Create GitFile entities with absolute paths for the slicer
        files = []
        for file_data in files_data:
            # Extract extension from file path
            file_path = Path(file_data["path"])
            extension = file_path.suffix.lstrip(".")

            # Create absolute path for the slicer to read
            absolute_path = str(repo.cloned_path / file_data["path"])

            git_file = GitFile(
                created_at=file_data.get("created_at", commit.date),
                blob_sha=file_data["blob_sha"],
                path=absolute_path,  # Use absolute path for file reading
                mime_type=file_data.get("mime_type", "application/octet-stream"),
                size=file_data.get("size", 0),
                extension=extension,
            )
            files.append(git_file)

        # Create a set of languages to extract snippets for
        extensions = {file.extension for file in files}
        lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
        for ext in extensions:
            try:
                lang = LanguageMapping.get_language_for_extension(ext)
                lang_files_map[lang].extend(
                    file for file in files if file.extension == ext
                )
            except ValueError as e:
                self._log.debug("Skipping", error=str(e))
                continue

        # Extract snippets using the injected slicer (the original
        # constructed a throwaway Slicer() and ignored self.slicer).
        all_snippets: list[SnippetV2] = []
        await step.set_total(len(lang_files_map.keys()))
        for i, (lang, lang_files) in enumerate(lang_files_map.items()):
            await step.set_current(i, f"Extracting snippets for {lang}")
            snippets = self.slicer.extract_snippets_from_git_files(
                lang_files, language=lang
            )
            all_snippets.extend(snippets)

        # Deduplicate snippets by SHA before saving to prevent constraint violations
        unique_snippets: dict[str, SnippetV2] = {}
        for snippet in all_snippets:
            unique_snippets[snippet.sha] = snippet

        deduplicated_snippets = list(unique_snippets.values())

        commit_short = commit.commit_sha[:8]
        self._log.info(
            f"Extracted {len(all_snippets)} snippets, "
            f"deduplicated to {len(deduplicated_snippets)} for {commit_short}"
        )
        await self.snippet_repository.save_snippets(
            commit.commit_sha, deduplicated_snippets
        )
|
|
372
|
+
|
|
373
|
+
async def process_bm25_index(self, repository_id: int, commit_sha: str) -> None:
    """Handle BM25_INDEX task - create keyword index."""
    async with self.operation.create_child(
        TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ):
        commit_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )
        # Only snippets that have been assigned an ID can be indexed.
        documents = [
            Document(snippet_id=item.id, text=item.content)
            for item in commit_snippets
            if item.id
        ]
        await self.bm25_service.index_documents(IndexRequest(documents=documents))
|
|
391
|
+
|
|
392
|
+
async def process_code_embeddings(
    self, repository_id: int, commit_sha: str
) -> None:
    """Handle CODE_EMBEDDINGS task - create code embeddings."""
    async with self.operation.create_child(
        TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        commit_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )

        # Only embed snippets that do not yet have a CODE embedding.
        pending = await self._new_snippets_for_type(
            commit_snippets, EmbeddingType.CODE
        )
        if not pending:
            await step.skip("All snippets already have code embeddings")
            return

        await step.set_total(len(pending))
        docs = [
            Document(snippet_id=item.id, text=item.content)
            for item in pending
            if item.id
        ]
        done = 0
        # index_documents yields batches; report progress per batch.
        async for batch in self.code_search_service.index_documents(
            IndexRequest(documents=docs)
        ):
            done += len(batch)
            await step.set_current(done, "Creating code embeddings for commit")
|
|
424
|
+
|
|
425
|
+
async def process_enrich(self, repository_id: int, commit_sha: str) -> None:
    """Handle ENRICH task - enrich snippets and create text embeddings.

    Summarizes every snippet of the commit that does not already carry a
    SUMMARIZATION enrichment, persisting each snippet as its summary
    arrives.
    """
    async with self.operation.create_child(
        TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        all_snippets = await self.snippet_repository.get_snippets_for_commit(
            commit_sha
        )

        # Find snippets without a summary enrichment. Use any() here: the
        # original used next() without a default, which raised
        # StopIteration for snippets whose enrichments contained no
        # SUMMARIZATION entry.
        snippets_without_summary = [
            snippet
            for snippet in all_snippets
            if not any(
                enrichment.type == EnrichmentType.SUMMARIZATION
                for enrichment in snippet.enrichments
            )
        ]
        if not snippets_without_summary:
            await step.skip("All snippets already have a summary enrichment")
            return

        # Enrich snippets
        await step.set_total(len(snippets_without_summary))
        snippet_map = {
            snippet.id: snippet
            for snippet in snippets_without_summary
            if snippet.id
        }

        enrichment_request = EnrichmentIndexRequest(
            requests=[
                EnrichmentRequest(snippet_id=snippet_id, text=snippet.content)
                for snippet_id, snippet in snippet_map.items()
            ]
        )

        processed = 0
        async for result in self.enrichment_service.enrich_documents(
            enrichment_request
        ):
            snippet = snippet_map[result.snippet_id]
            snippet.enrichments.append(
                Enrichment(type=EnrichmentType.SUMMARIZATION, content=result.text)
            )

            # Persist incrementally so a crash loses at most one summary.
            await self.snippet_repository.save_snippets(commit_sha, [snippet])

            processed += 1
            await step.set_current(processed, "Enriching snippets for commit")
|
|
479
|
+
|
|
480
|
+
async def process_summary_embeddings(
    self, repository_id: int, commit_sha: str
) -> None:
    """Handle SUMMARY_EMBEDDINGS task - create summary embeddings.

    Embeds the SUMMARIZATION enrichment text of each snippet that does
    not yet have a TEXT embedding.
    """
    async with self.operation.create_child(
        TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT,
        trackable_type=TrackableType.KODIT_REPOSITORY,
        trackable_id=repository_id,
    ) as step:
        snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)

        new_snippets = await self._new_snippets_for_type(
            snippets, EmbeddingType.TEXT
        )
        if not new_snippets:
            await step.skip("All snippets already have text embeddings")
            return

        await step.set_total(len(new_snippets))
        processed = 0

        def _summary_from_enrichments(enrichments: list[Enrichment]) -> str:
            # next() with a default: the original raised StopIteration
            # for a non-empty enrichment list with no SUMMARIZATION entry.
            return next(
                (
                    enrichment.content
                    for enrichment in enrichments
                    if enrichment.type == EnrichmentType.SUMMARIZATION
                ),
                "",
            )

        # Skip snippets with no (or empty) summary — embedding empty text
        # is pointless and the skip message below assumes this filter.
        snippet_summary_map = {
            snippet.id: summary
            for snippet in snippets
            if snippet.id
            and (summary := _summary_from_enrichments(snippet.enrichments))
        }
        if len(snippet_summary_map) == 0:
            await step.skip("No snippets with summaries to create text embeddings")
            return

        documents_with_summaries = [
            Document(snippet_id=snippet_id, text=snippet_summary)
            for snippet_id, snippet_summary in snippet_summary_map.items()
        ]
        async for result in self.text_search_service.index_documents(
            IndexRequest(documents=documents_with_summaries)
        ):
            processed += len(result)
            await step.set_current(processed, "Creating text embeddings for commit")
|
|
528
|
+
|
|
529
|
+
async def _new_snippets_for_type(
    self, all_snippets: list[SnippetV2], embedding_type: EmbeddingType
) -> list[SnippetV2]:
    """Get new snippets for a given type."""
    candidate_ids = [snippet.id for snippet in all_snippets]
    embeddings = (
        await self.embedding_repository.list_embeddings_by_snippet_ids_and_type(
            candidate_ids, embedding_type
        )
    )
    # Set membership is all we need; the original built an id -> embedding
    # dict but only ever tested key presence.
    already_embedded = {embedding.snippet_id for embedding in embeddings}
    return [
        snippet for snippet in all_snippets if snippet.id not in already_embedded
    ]
|
|
@@ -8,14 +8,11 @@ from datetime import UTC, datetime
|
|
|
8
8
|
import structlog
|
|
9
9
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
10
|
|
|
11
|
-
from kodit.application.factories.code_indexing_factory import (
|
|
12
|
-
create_code_indexing_application_service,
|
|
13
|
-
)
|
|
14
11
|
from kodit.application.factories.reporting_factory import create_noop_operation
|
|
12
|
+
from kodit.application.factories.server_factory import ServerFactory
|
|
15
13
|
from kodit.application.services.reporting import ProgressTracker
|
|
16
14
|
from kodit.config import AppContext
|
|
17
15
|
from kodit.domain.entities import Task
|
|
18
|
-
from kodit.domain.value_objects import TaskType
|
|
19
16
|
from kodit.infrastructure.sqlalchemy.task_repository import create_task_repository
|
|
20
17
|
|
|
21
18
|
|
|
@@ -30,10 +27,12 @@ class IndexingWorkerService:
|
|
|
30
27
|
self,
|
|
31
28
|
app_context: AppContext,
|
|
32
29
|
session_factory: Callable[[], AsyncSession],
|
|
30
|
+
server_factory: ServerFactory,
|
|
33
31
|
) -> None:
|
|
34
32
|
"""Initialize the indexing worker service."""
|
|
35
33
|
self.app_context = app_context
|
|
36
34
|
self.session_factory = session_factory
|
|
35
|
+
self.server_factory = server_factory
|
|
37
36
|
self._worker_task: asyncio.Task | None = None
|
|
38
37
|
self._shutdown_event = asyncio.Event()
|
|
39
38
|
self.task_repository = create_task_repository(session_factory)
|
|
@@ -45,7 +44,7 @@ class IndexingWorkerService:
|
|
|
45
44
|
self._running = True
|
|
46
45
|
|
|
47
46
|
# Start single worker task
|
|
48
|
-
self._worker_task = asyncio.create_task(self._worker_loop(
|
|
47
|
+
self._worker_task = asyncio.create_task(self._worker_loop())
|
|
49
48
|
|
|
50
49
|
self.log.info(
|
|
51
50
|
"Indexing worker started",
|
|
@@ -63,18 +62,20 @@ class IndexingWorkerService:
|
|
|
63
62
|
|
|
64
63
|
self.log.info("Indexing worker stopped")
|
|
65
64
|
|
|
66
|
-
async def _worker_loop(self
|
|
65
|
+
async def _worker_loop(self) -> None:
|
|
67
66
|
self.log.debug("Worker loop started")
|
|
68
67
|
|
|
69
68
|
while not self._shutdown_event.is_set():
|
|
70
69
|
try:
|
|
71
70
|
async with self.session_factory() as session:
|
|
72
|
-
task = await self.task_repository.
|
|
71
|
+
task = await self.task_repository.next()
|
|
73
72
|
await session.commit()
|
|
74
73
|
|
|
75
74
|
# If there's a task, process it in a new thread
|
|
76
75
|
if task:
|
|
77
|
-
await self._process_task(task
|
|
76
|
+
await self._process_task(task)
|
|
77
|
+
# Only remove the task if it was processed successfully
|
|
78
|
+
await self.task_repository.remove(task)
|
|
78
79
|
continue
|
|
79
80
|
|
|
80
81
|
# If no task, sleep for a bit
|
|
@@ -90,8 +91,8 @@ class IndexingWorkerService:
|
|
|
90
91
|
|
|
91
92
|
self.log.info("Worker loop stopped")
|
|
92
93
|
|
|
93
|
-
async def _process_task(self, task: Task
|
|
94
|
-
"""Process a
|
|
94
|
+
async def _process_task(self, task: Task) -> None:
|
|
95
|
+
"""Process a task based on its type."""
|
|
95
96
|
self.log.info(
|
|
96
97
|
"Processing task",
|
|
97
98
|
task_id=task.id,
|
|
@@ -105,16 +106,8 @@ class IndexingWorkerService:
|
|
|
105
106
|
asyncio.set_event_loop(loop)
|
|
106
107
|
|
|
107
108
|
try:
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
await self._process_index_update(task, operation)
|
|
111
|
-
else:
|
|
112
|
-
self.log.warning(
|
|
113
|
-
"Unknown task type",
|
|
114
|
-
task_id=task.id,
|
|
115
|
-
task_type=task.type,
|
|
116
|
-
)
|
|
117
|
-
return
|
|
109
|
+
commit_service = self.server_factory.commit_indexing_application_service()
|
|
110
|
+
await commit_service.run_task(task)
|
|
118
111
|
finally:
|
|
119
112
|
loop.close()
|
|
120
113
|
|
|
@@ -124,27 +117,3 @@ class IndexingWorkerService:
|
|
|
124
117
|
task_id=task.id,
|
|
125
118
|
duration_seconds=duration,
|
|
126
119
|
)
|
|
127
|
-
|
|
128
|
-
async def _process_index_update(
|
|
129
|
-
self, task: Task, operation: ProgressTracker
|
|
130
|
-
) -> None:
|
|
131
|
-
"""Process index update/sync task."""
|
|
132
|
-
index_id = task.payload.get("index_id")
|
|
133
|
-
if not index_id:
|
|
134
|
-
raise ValueError("Missing index_id in task payload")
|
|
135
|
-
|
|
136
|
-
# Create a fresh database connection for this thread's event loop
|
|
137
|
-
db = await self.app_context.new_db(run_migrations=True)
|
|
138
|
-
try:
|
|
139
|
-
service = create_code_indexing_application_service(
|
|
140
|
-
app_context=self.app_context,
|
|
141
|
-
session_factory=self.session_factory,
|
|
142
|
-
operation=operation,
|
|
143
|
-
)
|
|
144
|
-
index = await service.index_repository.get(index_id)
|
|
145
|
-
if not index:
|
|
146
|
-
raise ValueError(f"Index not found: {index_id}")
|
|
147
|
-
|
|
148
|
-
await service.run_index(index)
|
|
149
|
-
finally:
|
|
150
|
-
await db.close()
|