kodit 0.4.3-py3-none-any.whl → 0.5.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of kodit has been flagged as potentially problematic.
- kodit/_version.py +2 -2
- kodit/app.py +51 -23
- kodit/application/factories/reporting_factory.py +6 -2
- kodit/application/factories/server_factory.py +353 -0
- kodit/application/services/code_search_application_service.py +144 -0
- kodit/application/services/commit_indexing_application_service.py +700 -0
- kodit/application/services/indexing_worker_service.py +13 -44
- kodit/application/services/queue_service.py +24 -3
- kodit/application/services/reporting.py +0 -2
- kodit/application/services/sync_scheduler.py +15 -31
- kodit/cli.py +2 -753
- kodit/cli_utils.py +2 -9
- kodit/config.py +4 -97
- kodit/database.py +38 -1
- kodit/domain/enrichments/__init__.py +1 -0
- kodit/domain/enrichments/architecture/__init__.py +1 -0
- kodit/domain/enrichments/architecture/architecture.py +20 -0
- kodit/domain/enrichments/architecture/physical/__init__.py +1 -0
- kodit/domain/enrichments/architecture/physical/discovery_notes.py +14 -0
- kodit/domain/enrichments/architecture/physical/formatter.py +11 -0
- kodit/domain/enrichments/architecture/physical/physical.py +17 -0
- kodit/domain/enrichments/development/__init__.py +1 -0
- kodit/domain/enrichments/development/development.py +18 -0
- kodit/domain/enrichments/development/snippet/__init__.py +1 -0
- kodit/domain/enrichments/development/snippet/snippet.py +21 -0
- kodit/domain/enrichments/enricher.py +17 -0
- kodit/domain/enrichments/enrichment.py +39 -0
- kodit/domain/enrichments/request.py +12 -0
- kodit/domain/enrichments/response.py +11 -0
- kodit/domain/enrichments/usage/__init__.py +1 -0
- kodit/domain/enrichments/usage/api_docs.py +19 -0
- kodit/domain/enrichments/usage/usage.py +18 -0
- kodit/domain/{entities.py → entities/__init__.py} +50 -195
- kodit/domain/entities/git.py +190 -0
- kodit/domain/factories/__init__.py +1 -0
- kodit/domain/factories/git_repo_factory.py +76 -0
- kodit/domain/protocols.py +264 -64
- kodit/domain/services/bm25_service.py +5 -1
- kodit/domain/services/embedding_service.py +3 -0
- kodit/domain/services/enrichment_service.py +9 -30
- kodit/domain/services/git_repository_service.py +429 -0
- kodit/domain/services/git_service.py +300 -0
- kodit/domain/services/physical_architecture_service.py +182 -0
- kodit/domain/services/task_status_query_service.py +2 -2
- kodit/domain/value_objects.py +87 -135
- kodit/infrastructure/api/client/__init__.py +0 -2
- kodit/infrastructure/api/v1/__init__.py +0 -4
- kodit/infrastructure/api/v1/dependencies.py +92 -46
- kodit/infrastructure/api/v1/routers/__init__.py +0 -6
- kodit/infrastructure/api/v1/routers/commits.py +352 -0
- kodit/infrastructure/api/v1/routers/queue.py +2 -2
- kodit/infrastructure/api/v1/routers/repositories.py +282 -0
- kodit/infrastructure/api/v1/routers/search.py +31 -14
- kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
- kodit/infrastructure/api/v1/schemas/commit.py +96 -0
- kodit/infrastructure/api/v1/schemas/context.py +2 -0
- kodit/infrastructure/api/v1/schemas/enrichment.py +29 -0
- kodit/infrastructure/api/v1/schemas/repository.py +128 -0
- kodit/infrastructure/api/v1/schemas/search.py +12 -9
- kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
- kodit/infrastructure/api/v1/schemas/tag.py +31 -0
- kodit/infrastructure/api/v1/schemas/task_status.py +2 -0
- kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
- kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
- kodit/infrastructure/cloning/git/git_python_adaptor.py +534 -0
- kodit/infrastructure/cloning/git/working_copy.py +1 -1
- kodit/infrastructure/embedding/embedding_factory.py +3 -2
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
- kodit/infrastructure/enricher/__init__.py +1 -0
- kodit/infrastructure/enricher/enricher_factory.py +53 -0
- kodit/infrastructure/{enrichment/litellm_enrichment_provider.py → enricher/litellm_enricher.py} +36 -56
- kodit/infrastructure/{enrichment/local_enrichment_provider.py → enricher/local_enricher.py} +19 -24
- kodit/infrastructure/enricher/null_enricher.py +36 -0
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/enrichment_mapper.py +83 -0
- kodit/infrastructure/mappers/git_mapper.py +193 -0
- kodit/infrastructure/mappers/snippet_mapper.py +104 -0
- kodit/infrastructure/mappers/task_mapper.py +5 -44
- kodit/infrastructure/physical_architecture/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/detectors/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/detectors/docker_compose_detector.py +336 -0
- kodit/infrastructure/physical_architecture/formatters/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/formatters/narrative_formatter.py +149 -0
- kodit/infrastructure/reporting/log_progress.py +8 -5
- kodit/infrastructure/reporting/telemetry_progress.py +21 -0
- kodit/infrastructure/slicing/api_doc_extractor.py +836 -0
- kodit/infrastructure/slicing/ast_analyzer.py +1128 -0
- kodit/infrastructure/slicing/slicer.py +87 -421
- kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
- kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +118 -0
- kodit/infrastructure/sqlalchemy/entities.py +402 -158
- kodit/infrastructure/sqlalchemy/git_branch_repository.py +274 -0
- kodit/infrastructure/sqlalchemy/git_commit_repository.py +346 -0
- kodit/infrastructure/sqlalchemy/git_repository.py +262 -0
- kodit/infrastructure/sqlalchemy/git_tag_repository.py +268 -0
- kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +479 -0
- kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
- kodit/infrastructure/sqlalchemy/task_status_repository.py +24 -12
- kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
- kodit/mcp.py +12 -30
- kodit/migrations/env.py +1 -0
- kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
- kodit/migrations/versions/19f8c7faf8b9_add_generic_enrichment_type.py +260 -0
- kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
- kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
- kodit/py.typed +0 -0
- kodit/utils/dump_config.py +361 -0
- kodit/utils/dump_openapi.py +6 -4
- kodit/utils/path_utils.py +29 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/METADATA +3 -3
- kodit-0.5.1.dist-info/RECORD +168 -0
- kodit/application/factories/code_indexing_factory.py +0 -195
- kodit/application/services/auto_indexing_service.py +0 -99
- kodit/application/services/code_indexing_application_service.py +0 -410
- kodit/domain/services/index_query_service.py +0 -70
- kodit/domain/services/index_service.py +0 -269
- kodit/infrastructure/api/client/index_client.py +0 -57
- kodit/infrastructure/api/v1/routers/indexes.py +0 -164
- kodit/infrastructure/api/v1/schemas/index.py +0 -101
- kodit/infrastructure/bm25/bm25_factory.py +0 -28
- kodit/infrastructure/cloning/__init__.py +0 -1
- kodit/infrastructure/cloning/metadata.py +0 -98
- kodit/infrastructure/enrichment/__init__.py +0 -1
- kodit/infrastructure/enrichment/enrichment_factory.py +0 -52
- kodit/infrastructure/enrichment/null_enrichment_provider.py +0 -19
- kodit/infrastructure/mappers/index_mapper.py +0 -345
- kodit/infrastructure/reporting/tdqm_progress.py +0 -38
- kodit/infrastructure/slicing/language_detection_service.py +0 -18
- kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
- kodit-0.4.3.dist-info/RECORD +0 -125
- /kodit/infrastructure/{enrichment → enricher}/utils.py +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/WHEEL +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/entry_points.txt +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/licenses/LICENSE +0 -0

kodit/application/services/commit_indexing_application_service.py (new file)
@@ -0,0 +1,700 @@
+"""Application services for commit indexing operations."""
+
+from collections import defaultdict
+from pathlib import Path
+
+import structlog
+from pydantic import AnyUrl
+
+from kodit.application.services.queue_service import QueueService
+from kodit.application.services.reporting import ProgressTracker
+from kodit.domain.enrichments.architecture.physical.physical import (
+    PhysicalArchitectureEnrichment,
+)
+from kodit.domain.enrichments.enricher import Enricher
+from kodit.domain.enrichments.request import (
+    EnrichmentRequest as GenericEnrichmentRequest,
+)
+from kodit.domain.enrichments.usage.api_docs import ENRICHMENT_SUBTYPE_API_DOCS
+from kodit.domain.enrichments.usage.usage import ENRICHMENT_TYPE_USAGE
+from kodit.domain.entities import Task
+from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2
+from kodit.domain.factories.git_repo_factory import GitRepoFactory
+from kodit.domain.protocols import (
+    GitBranchRepository,
+    GitCommitRepository,
+    GitRepoRepository,
+    GitTagRepository,
+    SnippetRepositoryV2,
+)
+from kodit.domain.services.bm25_service import BM25DomainService
+from kodit.domain.services.embedding_service import EmbeddingDomainService
+from kodit.domain.services.git_repository_service import (
+    GitRepositoryScanner,
+    RepositoryCloner,
+)
+from kodit.domain.services.physical_architecture_service import (
+    ARCHITECTURE_ENRICHMENT_SYSTEM_PROMPT,
+    ARCHITECTURE_ENRICHMENT_TASK_PROMPT,
+    PhysicalArchitectureService,
+)
+from kodit.domain.value_objects import (
+    DeleteRequest,
+    Document,
+    Enrichment,
+    EnrichmentType,
+    IndexRequest,
+    LanguageMapping,
+    PrescribedOperations,
+    QueuePriority,
+    TaskOperation,
+    TrackableType,
+)
+from kodit.infrastructure.slicing.api_doc_extractor import APIDocExtractor
+from kodit.infrastructure.slicing.slicer import Slicer
+from kodit.infrastructure.sqlalchemy.embedding_repository import (
+    SqlAlchemyEmbeddingRepository,
+)
+from kodit.infrastructure.sqlalchemy.enrichment_v2_repository import (
+    EnrichmentV2Repository,
+)
+from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
+
+SUMMARIZATION_SYSTEM_PROMPT = """
+You are a professional software developer. You will be given a snippet of code.
+Please provide a concise explanation of the code.
+"""
+
+
+class CommitIndexingApplicationService:
+    """Application service for commit indexing operations."""
+
+    def __init__(  # noqa: PLR0913
+        self,
+        snippet_v2_repository: SnippetRepositoryV2,
+        repo_repository: GitRepoRepository,
+        git_commit_repository: GitCommitRepository,
+        git_branch_repository: GitBranchRepository,
+        git_tag_repository: GitTagRepository,
+        operation: ProgressTracker,
+        scanner: GitRepositoryScanner,
+        cloner: RepositoryCloner,
+        snippet_repository: SnippetRepositoryV2,
+        slicer: Slicer,
+        queue: QueueService,
+        bm25_service: BM25DomainService,
+        code_search_service: EmbeddingDomainService,
+        text_search_service: EmbeddingDomainService,
+        embedding_repository: SqlAlchemyEmbeddingRepository,
+        architecture_service: PhysicalArchitectureService,
+        enrichment_v2_repository: EnrichmentV2Repository,
+        enricher_service: Enricher,
+    ) -> None:
+        """Initialize the commit indexing application service.
+
+        Args:
+            commit_index_repository: Repository for commit index data.
+            snippet_v2_repository: Repository for snippet data.
+            repo_repository: Repository for Git repository data.
+            domain_indexer: Domain service for indexing operations.
+            operation: Progress tracker for reporting operations.
+
+        """
+        self.snippet_repository = snippet_v2_repository
+        self.repo_repository = repo_repository
+        self.git_commit_repository = git_commit_repository
+        self.git_branch_repository = git_branch_repository
+        self.git_tag_repository = git_tag_repository
+        self.operation = operation
+        self.scanner = scanner
+        self.cloner = cloner
+        self.snippet_repository = snippet_repository
+        self.slicer = slicer
+        self.queue = queue
+        self.bm25_service = bm25_service
+        self.code_search_service = code_search_service
+        self.text_search_service = text_search_service
+        self.embedding_repository = embedding_repository
+        self.architecture_service = architecture_service
+        self.enrichment_v2_repository = enrichment_v2_repository
+        self.enricher_service = enricher_service
+        self._log = structlog.get_logger(__name__)
+
+    async def create_git_repository(self, remote_uri: AnyUrl) -> GitRepo:
+        """Create a new Git repository."""
+        async with self.operation.create_child(
+            TaskOperation.CREATE_REPOSITORY,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+        ):
+            repo = GitRepoFactory.create_from_remote_uri(remote_uri)
+            repo = await self.repo_repository.save(repo)
+            await self.queue.enqueue_tasks(
+                tasks=PrescribedOperations.CREATE_NEW_REPOSITORY,
+                base_priority=QueuePriority.USER_INITIATED,
+                payload={"repository_id": repo.id},
+            )
+            return repo
+
+    async def delete_git_repository(self, repo_id: int) -> bool:
+        """Delete a Git repository by ID."""
+        repo = await self.repo_repository.get_by_id(repo_id)
+        if not repo:
+            return False
+
+        # Use the proper deletion process that handles all dependencies
+        await self.process_delete_repo(repo_id)
+        return True
+
+    # TODO(Phil): Make this polymorphic
+    async def run_task(self, task: Task) -> None:  # noqa: PLR0912, C901
+        """Run a task."""
+        if task.type.is_repository_operation():
+            repo_id = task.payload["repository_id"]
+            if not repo_id:
+                raise ValueError("Repository ID is required")
+            if task.type == TaskOperation.CLONE_REPOSITORY:
+                await self.process_clone_repo(repo_id)
+            elif task.type == TaskOperation.SCAN_REPOSITORY:
+                await self.process_scan_repo(repo_id)
+            elif task.type == TaskOperation.DELETE_REPOSITORY:
+                await self.process_delete_repo(repo_id)
+            else:
+                raise ValueError(f"Unknown task type: {task.type}")
+        elif task.type.is_commit_operation():
+            repository_id = task.payload["repository_id"]
+            if not repository_id:
+                raise ValueError("Repository ID is required")
+            commit_sha = task.payload["commit_sha"]
+            if not commit_sha:
+                raise ValueError("Commit SHA is required")
+            if task.type == TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT:
+                await self.process_snippets_for_commit(repository_id, commit_sha)
+            elif task.type == TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT:
+                await self.process_bm25_index(repository_id, commit_sha)
+            elif task.type == TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT:
+                await self.process_code_embeddings(repository_id, commit_sha)
+            elif task.type == TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT:
+                await self.process_enrich(repository_id, commit_sha)
+            elif task.type == TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT:
+                await self.process_summary_embeddings(repository_id, commit_sha)
+            elif task.type == TaskOperation.CREATE_ARCHITECTURE_ENRICHMENT_FOR_COMMIT:
+                await self.process_architecture_discovery(repository_id, commit_sha)
+            elif task.type == TaskOperation.CREATE_PUBLIC_API_DOCS_FOR_COMMIT:
+                await self.process_api_docs(repository_id, commit_sha)
+            else:
+                raise ValueError(f"Unknown task type: {task.type}")
+        else:
+            raise ValueError(f"Unknown task type: {task.type}")
+
+    async def process_clone_repo(self, repository_id: int) -> None:
+        """Clone a repository."""
+        async with self.operation.create_child(
+            TaskOperation.CLONE_REPOSITORY,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ):
+            repo = await self.repo_repository.get_by_id(repository_id)
+            repo.cloned_path = await self.cloner.clone_repository(repo.remote_uri)
+            await self.repo_repository.save(repo)
+
+    async def process_scan_repo(self, repository_id: int) -> None:
+        """Scan a repository."""
+        async with self.operation.create_child(
+            TaskOperation.SCAN_REPOSITORY,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ) as step:
+            await step.set_total(6)
+            repo = await self.repo_repository.get_by_id(repository_id)
+            if not repo.cloned_path:
+                raise ValueError(f"Repository {repository_id} has never been cloned")
+
+            # Scan the repository to get all metadata
+            await step.set_current(0, "Scanning repository")
+            scan_result = await self.scanner.scan_repository(repo.cloned_path)
+
+            # Update repo with scan result (this sets num_commits, num_branches, etc.)
+            await step.set_current(1, "Updating repository with scan result")
+            repo.update_with_scan_result(scan_result)
+            await self.repo_repository.save(repo)
+
+            # Save commits, branches, and tags to their dedicated repositories
+            await step.set_current(2, "Saving commits")
+            if scan_result.all_commits:
+                await self.git_commit_repository.save_bulk(
+                    scan_result.all_commits, repository_id
+                )
+
+            await step.set_current(3, "Saving branches")
+            if scan_result.branches:
+                await self.git_branch_repository.save_bulk(
+                    scan_result.branches, repository_id
+                )
+
+            await step.set_current(4, "Saving tags")
+            if scan_result.all_tags:
+                await self.git_tag_repository.save_bulk(
+                    scan_result.all_tags, repository_id
+                )
+
+            await step.set_current(5, "Enqueuing commit indexing tasks")
+            if not repo.tracking_branch:
+                raise ValueError(f"Repository {repository_id} has no tracking branch")
+            commit_sha = repo.tracking_branch.head_commit.commit_sha
+            if not commit_sha:
+                raise ValueError(f"Repository {repository_id} has no head commit")
+
+            await self.queue.enqueue_tasks(
+                tasks=PrescribedOperations.INDEX_COMMIT,
+                base_priority=QueuePriority.USER_INITIATED,
+                payload={"commit_sha": commit_sha, "repository_id": repository_id},
+            )
+
+    async def process_delete_repo(self, repository_id: int) -> None:
+        """Delete a repository."""
+        async with self.operation.create_child(
+            TaskOperation.DELETE_REPOSITORY,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ):
+            repo = await self.repo_repository.get_by_id(repository_id)
+            if not repo:
+                raise ValueError(f"Repository {repository_id} not found")
+
+            # Get all commit SHAs for this repository first (needed for cleanup)
+            commits = await self.git_commit_repository.get_by_repo_id(repository_id)
+            commit_shas = [commit.commit_sha for commit in commits]
+
+            # Step 1: Get all snippet IDs that are associated with these commits FIRST
+            # (before deleting the associations)
+            all_snippet_ids = []
+            if commit_shas:
+                for commit_sha in commit_shas:
+                    snippets = await self.snippet_repository.get_snippets_for_commit(
+                        commit_sha
+                    )
+                    all_snippet_ids.extend(
+                        [snippet.id for snippet in snippets if snippet.id]
+                    )
+
+            # Step 2: Delete from BM25 and embedding indices
+            if all_snippet_ids:
+                # Convert to strings as DeleteRequest expects list[str]
+                snippet_id_strings = [str(snippet_id) for snippet_id in all_snippet_ids]
+                delete_request = DeleteRequest(snippet_ids=snippet_id_strings)
+                await self.bm25_service.delete_documents(delete_request)
+
+                # Delete embeddings for each snippet
+                for snippet_id in all_snippet_ids:
+                    await self.embedding_repository.delete_embeddings_by_snippet_id(
+                        snippet_id
+                    )
+
+            # Step 3: Delete enrichments for all commits
+            if commit_shas:
+                await self.enrichment_v2_repository.bulk_delete_enrichments(
+                    entity_type="git_commit",
+                    entity_ids=commit_shas,
+                )
+
+            # Step 4: Delete snippet associations for all commits
+            for commit_sha in commit_shas:
+                await self.snippet_repository.delete_snippets_for_commit(commit_sha)
+
+            # Step 5: Delete branches (they reference commits via head_commit_sha)
+            await self.git_branch_repository.delete_by_repo_id(repository_id)
+
+            # Step 6: Delete tags (they reference commits via target_commit_sha)
+            await self.git_tag_repository.delete_by_repo_id(repository_id)
+
+            # Step 7: Delete commits and their files
+            await self.git_commit_repository.delete_by_repo_id(repository_id)
+
+            # Step 8: Finally delete the repository
+            await self.repo_repository.delete(repo.sanitized_remote_uri)
+
+    async def process_snippets_for_commit(
+        self, repository_id: int, commit_sha: str
+    ) -> None:
+        """Generate snippets for a repository."""
+        async with self.operation.create_child(
+            operation=TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ) as step:
+            # Have we already processed this commit? If yes, skip.
+            if await self.snippet_repository.get_snippets_for_commit(commit_sha):
+                await step.skip("All snippets already extracted for commit")
+                return
+
+            commit = await self.git_commit_repository.get_by_sha(commit_sha)
+
+            # Load files on demand for snippet extraction (performance optimization)
+            # Instead of using commit.files (which may be empty), load files directly
+            repo = await self.repo_repository.get_by_id(repository_id)
+            if not repo.cloned_path:
+                raise ValueError(f"Repository {repository_id} has never been cloned")
+
+            files_data = await self.scanner.git_adapter.get_commit_file_data(
+                repo.cloned_path, commit_sha
+            )
+
+            # Create GitFile entities with absolute paths for the slicer
+            files = []
+            for file_data in files_data:
+                # Extract extension from file path
+                file_path = Path(file_data["path"])
+                extension = file_path.suffix.lstrip(".")
+
+                # Create absolute path for the slicer to read
+                absolute_path = str(repo.cloned_path / file_data["path"])
+
+                git_file = GitFile(
+                    created_at=file_data.get("created_at", commit.date),
+                    blob_sha=file_data["blob_sha"],
+                    path=absolute_path,  # Use absolute path for file reading
+                    mime_type=file_data.get("mime_type", "application/octet-stream"),
+                    size=file_data.get("size", 0),
+                    extension=extension,
+                )
+                files.append(git_file)
+
+            # Create a set of languages to extract snippets for
+            extensions = {file.extension for file in files}
+            lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
+            for ext in extensions:
+                try:
+                    lang = LanguageMapping.get_language_for_extension(ext)
+                    lang_files_map[lang].extend(
+                        file for file in files if file.extension == ext
+                    )
+                except ValueError as e:
+                    self._log.debug("Skipping", error=str(e))
+                    continue
+
+            # Extract snippets
+            all_snippets: list[SnippetV2] = []
+            slicer = Slicer()
+            await step.set_total(len(lang_files_map.keys()))
+            for i, (lang, lang_files) in enumerate(lang_files_map.items()):
+                await step.set_current(i, f"Extracting snippets for {lang}")
+                snippets = slicer.extract_snippets_from_git_files(
+                    lang_files, language=lang
+                )
+                all_snippets.extend(snippets)
+
+            # Deduplicate snippets by SHA before saving to prevent constraint violations
+            unique_snippets: dict[str, SnippetV2] = {}
+            for snippet in all_snippets:
+                unique_snippets[snippet.sha] = snippet
+
+            deduplicated_snippets = list(unique_snippets.values())
+
+            commit_short = commit.commit_sha[:8]
+            self._log.info(
+                f"Extracted {len(all_snippets)} snippets, "
+                f"deduplicated to {len(deduplicated_snippets)} for {commit_short}"
+            )
+            await self.snippet_repository.save_snippets(
+                commit.commit_sha, deduplicated_snippets
+            )
+
+    async def process_bm25_index(self, repository_id: int, commit_sha: str) -> None:
+        """Handle BM25_INDEX task - create keyword index."""
+        async with self.operation.create_child(
+            TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ):
+            snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)
+
+            await self.bm25_service.index_documents(
+                IndexRequest(
+                    documents=[
+                        Document(snippet_id=snippet.id, text=snippet.content)
+                        for snippet in snippets
+                        if snippet.id
+                    ]
+                )
+            )
+
+    async def process_code_embeddings(
+        self, repository_id: int, commit_sha: str
+    ) -> None:
+        """Handle CODE_EMBEDDINGS task - create code embeddings."""
+        async with self.operation.create_child(
+            TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ) as step:
+            all_snippets = await self.snippet_repository.get_snippets_for_commit(
+                commit_sha
+            )
+
+            new_snippets = await self._new_snippets_for_type(
+                all_snippets, EmbeddingType.CODE
+            )
+            if not new_snippets:
+                await step.skip("All snippets already have code embeddings")
+                return
+
+            await step.set_total(len(new_snippets))
+            processed = 0
+            documents = [
+                Document(snippet_id=snippet.id, text=snippet.content)
+                for snippet in new_snippets
+                if snippet.id
+            ]
+            async for result in self.code_search_service.index_documents(
+                IndexRequest(documents=documents)
+            ):
+                processed += len(result)
+                await step.set_current(processed, "Creating code embeddings for commit")
+
+    async def process_enrich(self, repository_id: int, commit_sha: str) -> None:
+        """Handle ENRICH task - enrich snippets and create text embeddings."""
+        async with self.operation.create_child(
+            TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ) as step:
+            all_snippets = await self.snippet_repository.get_snippets_for_commit(
+                commit_sha
+            )
+
+            # Find snippets without a summary enrichment
+            snippets_without_summary = [
+                snippet
+                for snippet in all_snippets
+                if not snippet.enrichments
+                or not next(
+                    enrichment
+                    for enrichment in snippet.enrichments
+                    if enrichment.type == EnrichmentType.SUMMARIZATION
+                )
+            ]
+            if not snippets_without_summary:
+                await step.skip("All snippets already have a summary enrichment")
+                return
+
+            # Enrich snippets
+            await step.set_total(len(snippets_without_summary))
+            snippet_map = {
+                snippet.id: snippet
+                for snippet in snippets_without_summary
+                if snippet.id
+            }
+
+            enrichment_requests = [
+                GenericEnrichmentRequest(
+                    id=snippet_id,
+                    text=snippet.content,
+                    system_prompt=SUMMARIZATION_SYSTEM_PROMPT,
+                )
+                for snippet_id, snippet in snippet_map.items()
+            ]
+
+            processed = 0
+            async for result in self.enricher_service.enrich(enrichment_requests):
+                snippet = snippet_map[result.id]
+                snippet.enrichments.append(
+                    Enrichment(type=EnrichmentType.SUMMARIZATION, content=result.text)
+                )
+
+                await self.snippet_repository.save_snippets(commit_sha, [snippet])
+
+                processed += 1
+                await step.set_current(processed, "Enriching snippets for commit")
+
+    async def process_summary_embeddings(
+        self, repository_id: int, commit_sha: str
+    ) -> None:
+        """Handle SUMMARY_EMBEDDINGS task - create summary embeddings."""
+        async with self.operation.create_child(
+            TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ) as step:
+            snippets = await self.snippet_repository.get_snippets_for_commit(commit_sha)
+
+            new_snippets = await self._new_snippets_for_type(
+                snippets, EmbeddingType.TEXT
+            )
+            if not new_snippets:
+                await step.skip("All snippets already have text embeddings")
+                return
+
+            await step.set_total(len(new_snippets))
+            processed = 0
+
+            def _summary_from_enrichments(enrichments: list[Enrichment]) -> str:
+                if not enrichments:
+                    return ""
+                return next(
+                    enrichment.content
+                    for enrichment in enrichments
+                    if enrichment.type == EnrichmentType.SUMMARIZATION
+                )
+
+            snippet_summary_map = {
+                snippet.id: _summary_from_enrichments(snippet.enrichments)
+                for snippet in snippets
+                if snippet.id
+            }
+            if len(snippet_summary_map) == 0:
+                await step.skip("No snippets with summaries to create text embeddings")
+                return
+
+            documents_with_summaries = [
+                Document(snippet_id=snippet_id, text=snippet_summary)
+                for snippet_id, snippet_summary in snippet_summary_map.items()
+            ]
+            async for result in self.text_search_service.index_documents(
+                IndexRequest(documents=documents_with_summaries)
+            ):
+                processed += len(result)
+                await step.set_current(processed, "Creating text embeddings for commit")
+
+    async def process_architecture_discovery(
+        self, repository_id: int, commit_sha: str
+    ) -> None:
+        """Handle ARCHITECTURE_DISCOVERY task - discover physical architecture."""
+        async with self.operation.create_child(
+            TaskOperation.CREATE_ARCHITECTURE_ENRICHMENT_FOR_COMMIT,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ) as step:
+            await step.set_total(3)
+
+            # Check if architecture enrichment already exists for this commit
+            enrichment_repo = self.enrichment_v2_repository
+            existing_enrichments = await enrichment_repo.enrichments_for_entity_type(
+                entity_type="git_commit",
+                entity_ids=[commit_sha],
+            )
+
+            # Check if architecture enrichment already exists
+            has_architecture = any(
+                enrichment.type == "architecture" for enrichment in existing_enrichments
+            )
+
+            if has_architecture:
+                await step.skip("Architecture enrichment already exists for commit")
+                return
+
+            # Get repository path
+            repo = await self.repo_repository.get_by_id(repository_id)
+            if not repo.cloned_path:
+                raise ValueError(f"Repository {repository_id} has never been cloned")
+
+            await step.set_current(1, "Discovering physical architecture")
+
+            # Discover architecture
+            architecture_narrative = (
+                await self.architecture_service.discover_architecture(repo.cloned_path)
+            )
+
+            await step.set_current(2, "Enriching architecture notes with LLM")
+
+            # Enrich the architecture narrative through the enricher
+            enrichment_request = GenericEnrichmentRequest(
+                id=commit_sha,
+                text=ARCHITECTURE_ENRICHMENT_TASK_PROMPT.format(
+                    architecture_narrative=architecture_narrative,
+                ),
+                system_prompt=ARCHITECTURE_ENRICHMENT_SYSTEM_PROMPT,
+            )
+
+            enriched_content = ""
+            async for response in self.enricher_service.enrich([enrichment_request]):
+                enriched_content = response.text
+
+            # Create and save architecture enrichment with enriched content
+            architecture_enrichment = PhysicalArchitectureEnrichment(
+                entity_id=commit_sha,
+                content=enriched_content,
+            )
+
+            await self.enrichment_v2_repository.bulk_save_enrichments(
+                [architecture_enrichment]
+            )
+
+            await step.set_current(3, "Architecture enrichment completed")
+
+    async def process_api_docs(self, repository_id: int, commit_sha: str) -> None:
+        """Handle API_DOCS task - generate API documentation."""
+        async with self.operation.create_child(
+            TaskOperation.CREATE_PUBLIC_API_DOCS_FOR_COMMIT,
+            trackable_type=TrackableType.KODIT_REPOSITORY,
+            trackable_id=repository_id,
+        ) as step:
+            # Check if API docs already exist for this commit
+            existing_enrichments = (
+                await self.enrichment_v2_repository.enrichments_for_entity_type(
+                    entity_type="git_commit",
+                    entity_ids=[commit_sha],
+                )
+            )
+
+            has_api_docs = any(
+                e.type == ENRICHMENT_TYPE_USAGE
+                and e.subtype == ENRICHMENT_SUBTYPE_API_DOCS
+                for e in existing_enrichments
+            )
+
+            if has_api_docs:
+                await step.skip("API docs already exist for commit")
+                return
+
+            # Get repository for metadata
+            repo = await self.repo_repository.get_by_id(repository_id)
+            if not repo:
+                raise ValueError(f"Repository {repository_id} not found")
+            str(repo.sanitized_remote_uri)
+
+            commit = await self.git_commit_repository.get_by_sha(commit_sha)
+
+            # Group files by language
+            lang_files_map: dict[str, list[GitFile]] = defaultdict(list)
+            for file in commit.files:
+                try:
+                    lang = LanguageMapping.get_language_for_extension(file.extension)
+                except ValueError:
+                    continue
+                lang_files_map[lang].append(file)
+
+            all_enrichments = []
+            extractor = APIDocExtractor()
+
+            await step.set_total(len(lang_files_map))
+            for i, (lang, lang_files) in enumerate(lang_files_map.items()):
+                await step.set_current(i, f"Extracting API docs for {lang}")
+                enrichments = extractor.extract_api_docs(
+                    lang_files,
+                    lang,
+                    commit_sha,
+                    include_private=False,
+                )
+                all_enrichments.extend(enrichments)
+
+            # Save all enrichments
+            if all_enrichments:
+                await self.enrichment_v2_repository.bulk_save_enrichments(
+                    all_enrichments
+                )
+
+    async def _new_snippets_for_type(
+        self, all_snippets: list[SnippetV2], embedding_type: EmbeddingType
+    ) -> list[SnippetV2]:
+        """Get new snippets for a given type."""
+        existing_embeddings = (
+            await self.embedding_repository.list_embeddings_by_snippet_ids_and_type(
+                [s.id for s in all_snippets], embedding_type
+            )
+        )
+        existing_embeddings_by_snippet_id = {
+            embedding.snippet_id: embedding for embedding in existing_embeddings
+        }
+        return [
+            s for s in all_snippets if s.id not in existing_embeddings_by_snippet_id
+        ]
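
For orientation when reading the hunk above: the new CommitIndexingApplicationService is queue-driven. create_git_repository saves the repository and enqueues PrescribedOperations.CREATE_NEW_REPOSITORY, and run_task later routes each queued Task to the matching process_* handler. The sketch below illustrates only that dispatch pattern; the Task and TaskOperation definitions here are simplified stand-ins written for this example, not kodit's actual entities or queue wiring.

# Minimal, self-contained sketch of the task-dispatch pattern used by
# CommitIndexingApplicationService.run_task. All types here are
# simplified stand-ins, not kodit's real entities.
import asyncio
from dataclasses import dataclass, field
from enum import Enum, auto


class TaskOperation(Enum):
    CLONE_REPOSITORY = auto()
    SCAN_REPOSITORY = auto()
    EXTRACT_SNIPPETS_FOR_COMMIT = auto()


@dataclass
class Task:
    type: TaskOperation
    payload: dict = field(default_factory=dict)


class Worker:
    """Routes queued tasks to per-operation handlers, as run_task does."""

    async def run_task(self, task: Task) -> None:
        handlers = {
            TaskOperation.CLONE_REPOSITORY: self.process_clone_repo,
            TaskOperation.SCAN_REPOSITORY: self.process_scan_repo,
            TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT: self.process_snippets,
        }
        handler = handlers.get(task.type)
        if handler is None:
            raise ValueError(f"Unknown task type: {task.type}")
        await handler(task.payload)

    async def process_clone_repo(self, payload: dict) -> None:
        print("cloning repository", payload["repository_id"])

    async def process_scan_repo(self, payload: dict) -> None:
        print("scanning repository", payload["repository_id"])

    async def process_snippets(self, payload: dict) -> None:
        print("extracting snippets for", payload["commit_sha"])


if __name__ == "__main__":
    asyncio.run(
        Worker().run_task(Task(TaskOperation.CLONE_REPOSITORY, {"repository_id": 1}))
    )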