kodit 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/app.py +51 -23
- kodit/application/factories/reporting_factory.py +6 -2
- kodit/application/factories/server_factory.py +353 -0
- kodit/application/services/code_search_application_service.py +144 -0
- kodit/application/services/commit_indexing_application_service.py +700 -0
- kodit/application/services/indexing_worker_service.py +13 -44
- kodit/application/services/queue_service.py +24 -3
- kodit/application/services/reporting.py +0 -2
- kodit/application/services/sync_scheduler.py +15 -31
- kodit/cli.py +2 -753
- kodit/cli_utils.py +2 -9
- kodit/config.py +4 -97
- kodit/database.py +38 -1
- kodit/domain/enrichments/__init__.py +1 -0
- kodit/domain/enrichments/architecture/__init__.py +1 -0
- kodit/domain/enrichments/architecture/architecture.py +20 -0
- kodit/domain/enrichments/architecture/physical/__init__.py +1 -0
- kodit/domain/enrichments/architecture/physical/discovery_notes.py +14 -0
- kodit/domain/enrichments/architecture/physical/formatter.py +11 -0
- kodit/domain/enrichments/architecture/physical/physical.py +17 -0
- kodit/domain/enrichments/development/__init__.py +1 -0
- kodit/domain/enrichments/development/development.py +18 -0
- kodit/domain/enrichments/development/snippet/__init__.py +1 -0
- kodit/domain/enrichments/development/snippet/snippet.py +21 -0
- kodit/domain/enrichments/enricher.py +17 -0
- kodit/domain/enrichments/enrichment.py +39 -0
- kodit/domain/enrichments/request.py +12 -0
- kodit/domain/enrichments/response.py +11 -0
- kodit/domain/enrichments/usage/__init__.py +1 -0
- kodit/domain/enrichments/usage/api_docs.py +19 -0
- kodit/domain/enrichments/usage/usage.py +18 -0
- kodit/domain/{entities.py → entities/__init__.py} +50 -195
- kodit/domain/entities/git.py +190 -0
- kodit/domain/factories/__init__.py +1 -0
- kodit/domain/factories/git_repo_factory.py +76 -0
- kodit/domain/protocols.py +264 -64
- kodit/domain/services/bm25_service.py +5 -1
- kodit/domain/services/embedding_service.py +3 -0
- kodit/domain/services/enrichment_service.py +9 -30
- kodit/domain/services/git_repository_service.py +429 -0
- kodit/domain/services/git_service.py +300 -0
- kodit/domain/services/physical_architecture_service.py +182 -0
- kodit/domain/services/task_status_query_service.py +2 -2
- kodit/domain/value_objects.py +87 -135
- kodit/infrastructure/api/client/__init__.py +0 -2
- kodit/infrastructure/api/v1/__init__.py +0 -4
- kodit/infrastructure/api/v1/dependencies.py +92 -46
- kodit/infrastructure/api/v1/routers/__init__.py +0 -6
- kodit/infrastructure/api/v1/routers/commits.py +352 -0
- kodit/infrastructure/api/v1/routers/queue.py +2 -2
- kodit/infrastructure/api/v1/routers/repositories.py +282 -0
- kodit/infrastructure/api/v1/routers/search.py +31 -14
- kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
- kodit/infrastructure/api/v1/schemas/commit.py +96 -0
- kodit/infrastructure/api/v1/schemas/context.py +2 -0
- kodit/infrastructure/api/v1/schemas/enrichment.py +29 -0
- kodit/infrastructure/api/v1/schemas/repository.py +128 -0
- kodit/infrastructure/api/v1/schemas/search.py +12 -9
- kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
- kodit/infrastructure/api/v1/schemas/tag.py +31 -0
- kodit/infrastructure/api/v1/schemas/task_status.py +2 -0
- kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
- kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
- kodit/infrastructure/cloning/git/git_python_adaptor.py +534 -0
- kodit/infrastructure/cloning/git/working_copy.py +1 -1
- kodit/infrastructure/embedding/embedding_factory.py +3 -2
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
- kodit/infrastructure/enricher/__init__.py +1 -0
- kodit/infrastructure/enricher/enricher_factory.py +53 -0
- kodit/infrastructure/{enrichment/litellm_enrichment_provider.py → enricher/litellm_enricher.py} +36 -56
- kodit/infrastructure/{enrichment/local_enrichment_provider.py → enricher/local_enricher.py} +19 -24
- kodit/infrastructure/enricher/null_enricher.py +36 -0
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/enrichment_mapper.py +83 -0
- kodit/infrastructure/mappers/git_mapper.py +193 -0
- kodit/infrastructure/mappers/snippet_mapper.py +104 -0
- kodit/infrastructure/mappers/task_mapper.py +5 -44
- kodit/infrastructure/physical_architecture/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/detectors/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/detectors/docker_compose_detector.py +336 -0
- kodit/infrastructure/physical_architecture/formatters/__init__.py +1 -0
- kodit/infrastructure/physical_architecture/formatters/narrative_formatter.py +149 -0
- kodit/infrastructure/reporting/log_progress.py +8 -5
- kodit/infrastructure/reporting/telemetry_progress.py +21 -0
- kodit/infrastructure/slicing/api_doc_extractor.py +836 -0
- kodit/infrastructure/slicing/ast_analyzer.py +1128 -0
- kodit/infrastructure/slicing/slicer.py +87 -421
- kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
- kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +118 -0
- kodit/infrastructure/sqlalchemy/entities.py +402 -158
- kodit/infrastructure/sqlalchemy/git_branch_repository.py +274 -0
- kodit/infrastructure/sqlalchemy/git_commit_repository.py +346 -0
- kodit/infrastructure/sqlalchemy/git_repository.py +262 -0
- kodit/infrastructure/sqlalchemy/git_tag_repository.py +268 -0
- kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +479 -0
- kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
- kodit/infrastructure/sqlalchemy/task_status_repository.py +24 -12
- kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
- kodit/mcp.py +12 -30
- kodit/migrations/env.py +1 -0
- kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
- kodit/migrations/versions/19f8c7faf8b9_add_generic_enrichment_type.py +260 -0
- kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
- kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
- kodit/py.typed +0 -0
- kodit/utils/dump_config.py +361 -0
- kodit/utils/dump_openapi.py +6 -4
- kodit/utils/path_utils.py +29 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/METADATA +3 -3
- kodit-0.5.1.dist-info/RECORD +168 -0
- kodit/application/factories/code_indexing_factory.py +0 -195
- kodit/application/services/auto_indexing_service.py +0 -99
- kodit/application/services/code_indexing_application_service.py +0 -410
- kodit/domain/services/index_query_service.py +0 -70
- kodit/domain/services/index_service.py +0 -269
- kodit/infrastructure/api/client/index_client.py +0 -57
- kodit/infrastructure/api/v1/routers/indexes.py +0 -164
- kodit/infrastructure/api/v1/schemas/index.py +0 -101
- kodit/infrastructure/bm25/bm25_factory.py +0 -28
- kodit/infrastructure/cloning/__init__.py +0 -1
- kodit/infrastructure/cloning/metadata.py +0 -98
- kodit/infrastructure/enrichment/__init__.py +0 -1
- kodit/infrastructure/enrichment/enrichment_factory.py +0 -52
- kodit/infrastructure/enrichment/null_enrichment_provider.py +0 -19
- kodit/infrastructure/mappers/index_mapper.py +0 -345
- kodit/infrastructure/reporting/tdqm_progress.py +0 -38
- kodit/infrastructure/slicing/language_detection_service.py +0 -18
- kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
- kodit-0.4.3.dist-info/RECORD +0 -125
- /kodit/infrastructure/{enrichment → enricher}/utils.py +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/WHEEL +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/entry_points.txt +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,410 +0,0 @@
|
|
|
1
|
-
"""Unified application service for code indexing operations."""
|
|
2
|
-
|
|
3
|
-
from dataclasses import replace
|
|
4
|
-
from datetime import UTC, datetime
|
|
5
|
-
|
|
6
|
-
import structlog
|
|
7
|
-
|
|
8
|
-
from kodit.application.services.reporting import (
|
|
9
|
-
ProgressTracker,
|
|
10
|
-
TaskOperation,
|
|
11
|
-
)
|
|
12
|
-
from kodit.domain.entities import Index, Snippet
|
|
13
|
-
from kodit.domain.protocols import IndexRepository
|
|
14
|
-
from kodit.domain.services.bm25_service import BM25DomainService
|
|
15
|
-
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
16
|
-
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
17
|
-
from kodit.domain.services.index_query_service import IndexQueryService
|
|
18
|
-
from kodit.domain.services.index_service import IndexDomainService
|
|
19
|
-
from kodit.domain.value_objects import (
|
|
20
|
-
Document,
|
|
21
|
-
FusionRequest,
|
|
22
|
-
IndexRequest,
|
|
23
|
-
MultiSearchRequest,
|
|
24
|
-
MultiSearchResult,
|
|
25
|
-
SearchRequest,
|
|
26
|
-
SearchResult,
|
|
27
|
-
SnippetSearchFilters,
|
|
28
|
-
TrackableType,
|
|
29
|
-
)
|
|
30
|
-
from kodit.log import log_event
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class CodeIndexingApplicationService:
|
|
34
|
-
"""Unified application service for all code indexing operations."""
|
|
35
|
-
|
|
36
|
-
def __init__( # noqa: PLR0913
|
|
37
|
-
self,
|
|
38
|
-
indexing_domain_service: IndexDomainService,
|
|
39
|
-
index_repository: IndexRepository,
|
|
40
|
-
index_query_service: IndexQueryService,
|
|
41
|
-
bm25_service: BM25DomainService,
|
|
42
|
-
code_search_service: EmbeddingDomainService,
|
|
43
|
-
text_search_service: EmbeddingDomainService,
|
|
44
|
-
enrichment_service: EnrichmentDomainService,
|
|
45
|
-
operation: ProgressTracker,
|
|
46
|
-
) -> None:
|
|
47
|
-
"""Initialize the code indexing application service."""
|
|
48
|
-
self.index_domain_service = indexing_domain_service
|
|
49
|
-
self.index_repository = index_repository
|
|
50
|
-
self.index_query_service = index_query_service
|
|
51
|
-
self.bm25_service = bm25_service
|
|
52
|
-
self.code_search_service = code_search_service
|
|
53
|
-
self.text_search_service = text_search_service
|
|
54
|
-
self.enrichment_service = enrichment_service
|
|
55
|
-
self.operation = operation
|
|
56
|
-
self.log = structlog.get_logger(__name__)
|
|
57
|
-
|
|
58
|
-
async def does_index_exist(self, uri: str) -> bool:
|
|
59
|
-
"""Check if an index exists for a source."""
|
|
60
|
-
# Check if index already exists
|
|
61
|
-
sanitized_uri, _ = self.index_domain_service.sanitize_uri(uri)
|
|
62
|
-
existing_index = await self.index_repository.get_by_uri(sanitized_uri)
|
|
63
|
-
return existing_index is not None
|
|
64
|
-
|
|
65
|
-
async def create_index_from_uri(self, uri: str) -> Index:
|
|
66
|
-
"""Create a new index for a source."""
|
|
67
|
-
log_event("kodit.index.create")
|
|
68
|
-
async with self.operation.create_child(TaskOperation.CREATE_INDEX) as operation:
|
|
69
|
-
# Check if index already exists
|
|
70
|
-
sanitized_uri, _ = self.index_domain_service.sanitize_uri(uri)
|
|
71
|
-
self.log.info("Creating index from URI", uri=str(sanitized_uri))
|
|
72
|
-
existing_index = await self.index_repository.get_by_uri(sanitized_uri)
|
|
73
|
-
if existing_index:
|
|
74
|
-
self.log.debug(
|
|
75
|
-
"Index already exists",
|
|
76
|
-
uri=str(sanitized_uri),
|
|
77
|
-
index_id=existing_index.id,
|
|
78
|
-
)
|
|
79
|
-
return existing_index
|
|
80
|
-
|
|
81
|
-
# Only prepare working copy if we need to create a new index
|
|
82
|
-
self.log.info("Preparing working copy", uri=str(sanitized_uri))
|
|
83
|
-
working_copy = await self.index_domain_service.prepare_index(uri, operation)
|
|
84
|
-
|
|
85
|
-
# Create new index
|
|
86
|
-
self.log.info("Creating index", uri=str(sanitized_uri))
|
|
87
|
-
return await self.index_repository.create(sanitized_uri, working_copy)
|
|
88
|
-
|
|
89
|
-
async def run_index(self, index: Index) -> None:
|
|
90
|
-
"""Run the complete indexing process for a specific index."""
|
|
91
|
-
# Create a new operation
|
|
92
|
-
async with self.operation.create_child(
|
|
93
|
-
TaskOperation.RUN_INDEX,
|
|
94
|
-
trackable_type=TrackableType.INDEX,
|
|
95
|
-
trackable_id=index.id,
|
|
96
|
-
) as operation:
|
|
97
|
-
# TODO(philwinder): Move this into a reporter # noqa: TD003, FIX002
|
|
98
|
-
log_event("kodit.index.run")
|
|
99
|
-
|
|
100
|
-
if not index or not index.id:
|
|
101
|
-
msg = f"Index has no ID: {index}"
|
|
102
|
-
raise ValueError(msg)
|
|
103
|
-
|
|
104
|
-
# Refresh working copy
|
|
105
|
-
async with operation.create_child(
|
|
106
|
-
TaskOperation.REFRESH_WORKING_COPY
|
|
107
|
-
) as step:
|
|
108
|
-
index.source.working_copy = (
|
|
109
|
-
await self.index_domain_service.refresh_working_copy(
|
|
110
|
-
index.source.working_copy, step
|
|
111
|
-
)
|
|
112
|
-
)
|
|
113
|
-
if len(index.source.working_copy.changed_files()) == 0:
|
|
114
|
-
self.log.info("No new changes to index", index_id=index.id)
|
|
115
|
-
await step.skip("No new changes to index")
|
|
116
|
-
return
|
|
117
|
-
|
|
118
|
-
# Delete the old snippets from the files that have changed
|
|
119
|
-
async with operation.create_child(
|
|
120
|
-
TaskOperation.DELETE_OLD_SNIPPETS
|
|
121
|
-
) as step:
|
|
122
|
-
await self.index_repository.delete_snippets_by_file_ids(
|
|
123
|
-
[
|
|
124
|
-
file.id
|
|
125
|
-
for file in index.source.working_copy.changed_files()
|
|
126
|
-
if file.id
|
|
127
|
-
]
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
# Extract and create snippets (domain service handles progress)
|
|
131
|
-
async with operation.create_child(TaskOperation.EXTRACT_SNIPPETS) as step:
|
|
132
|
-
index = await self.index_domain_service.extract_snippets_from_index(
|
|
133
|
-
index=index, step=step
|
|
134
|
-
)
|
|
135
|
-
await self.index_repository.update(index)
|
|
136
|
-
|
|
137
|
-
# Refresh index to get snippets with IDs, required for subsequent steps
|
|
138
|
-
flushed_index = await self.index_repository.get(index.id)
|
|
139
|
-
if not flushed_index:
|
|
140
|
-
msg = f"Index {index.id} not found after snippet extraction"
|
|
141
|
-
raise ValueError(msg)
|
|
142
|
-
index = flushed_index
|
|
143
|
-
if len(index.snippets) == 0:
|
|
144
|
-
self.log.info(
|
|
145
|
-
"No snippets to index after extraction", index_id=index.id
|
|
146
|
-
)
|
|
147
|
-
await step.skip("No snippets to index after extraction")
|
|
148
|
-
return
|
|
149
|
-
|
|
150
|
-
# Create BM25 index
|
|
151
|
-
self.log.info("Creating keyword index")
|
|
152
|
-
async with operation.create_child(TaskOperation.CREATE_BM25_INDEX) as step:
|
|
153
|
-
await self._create_bm25_index(index.snippets)
|
|
154
|
-
|
|
155
|
-
# Create code embeddings
|
|
156
|
-
async with operation.create_child(
|
|
157
|
-
TaskOperation.CREATE_CODE_EMBEDDINGS
|
|
158
|
-
) as step:
|
|
159
|
-
await self._create_code_embeddings(index.snippets, step)
|
|
160
|
-
|
|
161
|
-
# Enrich snippets
|
|
162
|
-
async with operation.create_child(TaskOperation.ENRICH_SNIPPETS) as step:
|
|
163
|
-
enriched_snippets = (
|
|
164
|
-
await self.index_domain_service.enrich_snippets_in_index(
|
|
165
|
-
snippets=index.snippets,
|
|
166
|
-
reporting_step=step,
|
|
167
|
-
)
|
|
168
|
-
)
|
|
169
|
-
# Update snippets in repository
|
|
170
|
-
await self.index_repository.update_snippets(index.id, enriched_snippets)
|
|
171
|
-
|
|
172
|
-
# Create text embeddings (on enriched content)
|
|
173
|
-
async with operation.create_child(
|
|
174
|
-
TaskOperation.CREATE_TEXT_EMBEDDINGS
|
|
175
|
-
) as step:
|
|
176
|
-
await self._create_text_embeddings(enriched_snippets, step)
|
|
177
|
-
|
|
178
|
-
# Update index timestamp
|
|
179
|
-
async with operation.create_child(
|
|
180
|
-
TaskOperation.UPDATE_INDEX_TIMESTAMP
|
|
181
|
-
) as step:
|
|
182
|
-
await self.index_repository.update_index_timestamp(index.id)
|
|
183
|
-
|
|
184
|
-
# After indexing, clear the file processing statuses
|
|
185
|
-
async with operation.create_child(
|
|
186
|
-
TaskOperation.CLEAR_FILE_PROCESSING_STATUSES
|
|
187
|
-
) as step:
|
|
188
|
-
index.source.working_copy.clear_file_processing_statuses()
|
|
189
|
-
await self.index_repository.update(index)
|
|
190
|
-
|
|
191
|
-
async def search(self, request: MultiSearchRequest) -> list[MultiSearchResult]:
|
|
192
|
-
"""Search for relevant snippets across all indexes."""
|
|
193
|
-
log_event("kodit.index.search")
|
|
194
|
-
|
|
195
|
-
# Apply filters if provided
|
|
196
|
-
filtered_snippet_ids: list[int] | None = None
|
|
197
|
-
if request.filters:
|
|
198
|
-
# Use domain service for filtering (use large top_k for pre-filtering)
|
|
199
|
-
prefilter_request = replace(request, top_k=10000)
|
|
200
|
-
snippet_results = await self.index_query_service.search_snippets(
|
|
201
|
-
prefilter_request
|
|
202
|
-
)
|
|
203
|
-
filtered_snippet_ids = [
|
|
204
|
-
snippet.snippet.id for snippet in snippet_results if snippet.snippet.id
|
|
205
|
-
]
|
|
206
|
-
|
|
207
|
-
# Gather results from different search modes
|
|
208
|
-
fusion_list: list[list[FusionRequest]] = []
|
|
209
|
-
|
|
210
|
-
# Keyword search
|
|
211
|
-
if request.keywords:
|
|
212
|
-
result_ids: list[SearchResult] = []
|
|
213
|
-
for keyword in request.keywords:
|
|
214
|
-
results = await self.bm25_service.search(
|
|
215
|
-
SearchRequest(
|
|
216
|
-
query=keyword,
|
|
217
|
-
top_k=request.top_k,
|
|
218
|
-
snippet_ids=filtered_snippet_ids,
|
|
219
|
-
)
|
|
220
|
-
)
|
|
221
|
-
result_ids.extend(results)
|
|
222
|
-
|
|
223
|
-
fusion_list.append(
|
|
224
|
-
[FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
|
|
225
|
-
)
|
|
226
|
-
|
|
227
|
-
# Semantic code search
|
|
228
|
-
if request.code_query:
|
|
229
|
-
query_results = await self.code_search_service.search(
|
|
230
|
-
SearchRequest(
|
|
231
|
-
query=request.code_query,
|
|
232
|
-
top_k=request.top_k,
|
|
233
|
-
snippet_ids=filtered_snippet_ids,
|
|
234
|
-
)
|
|
235
|
-
)
|
|
236
|
-
fusion_list.append(
|
|
237
|
-
[FusionRequest(id=x.snippet_id, score=x.score) for x in query_results]
|
|
238
|
-
)
|
|
239
|
-
|
|
240
|
-
# Semantic text search
|
|
241
|
-
if request.text_query:
|
|
242
|
-
query_results = await self.text_search_service.search(
|
|
243
|
-
SearchRequest(
|
|
244
|
-
query=request.text_query,
|
|
245
|
-
top_k=request.top_k,
|
|
246
|
-
snippet_ids=filtered_snippet_ids,
|
|
247
|
-
)
|
|
248
|
-
)
|
|
249
|
-
fusion_list.append(
|
|
250
|
-
[FusionRequest(id=x.snippet_id, score=x.score) for x in query_results]
|
|
251
|
-
)
|
|
252
|
-
|
|
253
|
-
if len(fusion_list) == 0:
|
|
254
|
-
return []
|
|
255
|
-
|
|
256
|
-
# Fusion ranking
|
|
257
|
-
final_results = await self.index_query_service.perform_fusion(
|
|
258
|
-
rankings=fusion_list,
|
|
259
|
-
k=60, # This is a parameter in the RRF algorithm, not top_k
|
|
260
|
-
)
|
|
261
|
-
|
|
262
|
-
# Keep only top_k results
|
|
263
|
-
final_results = final_results[: request.top_k]
|
|
264
|
-
|
|
265
|
-
# Get snippet details
|
|
266
|
-
search_results = await self.index_query_service.get_snippets_by_ids(
|
|
267
|
-
[x.id for x in final_results]
|
|
268
|
-
)
|
|
269
|
-
|
|
270
|
-
# Create a mapping from snippet ID to search result to handle cases where
|
|
271
|
-
# some snippet IDs don't exist (e.g., with vectorchord inconsistencies)
|
|
272
|
-
snippet_map = {
|
|
273
|
-
result.snippet.id: result
|
|
274
|
-
for result in search_results
|
|
275
|
-
if result.snippet.id is not None
|
|
276
|
-
}
|
|
277
|
-
|
|
278
|
-
# Filter final_results to only include IDs that we actually found snippets for
|
|
279
|
-
valid_final_results = [fr for fr in final_results if fr.id in snippet_map]
|
|
280
|
-
|
|
281
|
-
return [
|
|
282
|
-
MultiSearchResult(
|
|
283
|
-
id=snippet_map[fr.id].snippet.id or 0,
|
|
284
|
-
content=snippet_map[fr.id].snippet.original_text(),
|
|
285
|
-
original_scores=fr.original_scores,
|
|
286
|
-
# Enhanced fields
|
|
287
|
-
source_uri=str(snippet_map[fr.id].source.working_copy.remote_uri),
|
|
288
|
-
relative_path=str(
|
|
289
|
-
snippet_map[fr.id]
|
|
290
|
-
.file.as_path()
|
|
291
|
-
.relative_to(snippet_map[fr.id].source.working_copy.cloned_path)
|
|
292
|
-
),
|
|
293
|
-
language=MultiSearchResult.detect_language_from_extension(
|
|
294
|
-
snippet_map[fr.id].file.extension()
|
|
295
|
-
),
|
|
296
|
-
authors=[author.name for author in snippet_map[fr.id].authors],
|
|
297
|
-
created_at=snippet_map[fr.id].snippet.created_at or datetime.now(UTC),
|
|
298
|
-
# Summary from snippet entity
|
|
299
|
-
summary=snippet_map[fr.id].snippet.summary_text(),
|
|
300
|
-
)
|
|
301
|
-
for fr in valid_final_results
|
|
302
|
-
]
|
|
303
|
-
|
|
304
|
-
async def list_snippets(
|
|
305
|
-
self, file_path: str | None = None, source_uri: str | None = None
|
|
306
|
-
) -> list[MultiSearchResult]:
|
|
307
|
-
"""List snippets with optional filtering."""
|
|
308
|
-
log_event("kodit.index.list_snippets")
|
|
309
|
-
snippet_results = await self.index_query_service.search_snippets(
|
|
310
|
-
request=MultiSearchRequest(
|
|
311
|
-
filters=SnippetSearchFilters(
|
|
312
|
-
file_path=file_path,
|
|
313
|
-
source_repo=source_uri,
|
|
314
|
-
)
|
|
315
|
-
),
|
|
316
|
-
)
|
|
317
|
-
return [
|
|
318
|
-
MultiSearchResult(
|
|
319
|
-
id=result.snippet.id or 0,
|
|
320
|
-
content=result.snippet.original_text(),
|
|
321
|
-
original_scores=[0.0],
|
|
322
|
-
# Enhanced fields
|
|
323
|
-
source_uri=str(result.source.working_copy.remote_uri),
|
|
324
|
-
relative_path=str(
|
|
325
|
-
result.file.as_path().relative_to(
|
|
326
|
-
result.source.working_copy.cloned_path
|
|
327
|
-
)
|
|
328
|
-
),
|
|
329
|
-
language=MultiSearchResult.detect_language_from_extension(
|
|
330
|
-
result.file.extension()
|
|
331
|
-
),
|
|
332
|
-
authors=[author.name for author in result.authors],
|
|
333
|
-
created_at=result.snippet.created_at or datetime.now(UTC),
|
|
334
|
-
# Summary from snippet entity
|
|
335
|
-
summary=result.snippet.summary_text(),
|
|
336
|
-
)
|
|
337
|
-
for result in snippet_results
|
|
338
|
-
]
|
|
339
|
-
|
|
340
|
-
# FUTURE: BM25 index enriched content too
|
|
341
|
-
async def _create_bm25_index(self, snippets: list[Snippet]) -> None:
|
|
342
|
-
await self.bm25_service.index_documents(
|
|
343
|
-
IndexRequest(
|
|
344
|
-
documents=[
|
|
345
|
-
Document(snippet_id=snippet.id, text=snippet.original_text())
|
|
346
|
-
for snippet in snippets
|
|
347
|
-
if snippet.id
|
|
348
|
-
]
|
|
349
|
-
)
|
|
350
|
-
)
|
|
351
|
-
|
|
352
|
-
async def _create_code_embeddings(
|
|
353
|
-
self, snippets: list[Snippet], reporting_step: ProgressTracker
|
|
354
|
-
) -> None:
|
|
355
|
-
await reporting_step.set_total(len(snippets))
|
|
356
|
-
processed = 0
|
|
357
|
-
async for result in self.code_search_service.index_documents(
|
|
358
|
-
IndexRequest(
|
|
359
|
-
documents=[
|
|
360
|
-
Document(snippet_id=snippet.id, text=snippet.original_text())
|
|
361
|
-
for snippet in snippets
|
|
362
|
-
if snippet.id
|
|
363
|
-
]
|
|
364
|
-
)
|
|
365
|
-
):
|
|
366
|
-
processed += len(result)
|
|
367
|
-
await reporting_step.set_current(
|
|
368
|
-
processed, f"Creating code embeddings for {processed} snippets"
|
|
369
|
-
)
|
|
370
|
-
|
|
371
|
-
async def _create_text_embeddings(
|
|
372
|
-
self, snippets: list[Snippet], reporting_step: ProgressTracker
|
|
373
|
-
) -> None:
|
|
374
|
-
# Only create text embeddings for snippets that have summary content
|
|
375
|
-
documents_with_summaries = []
|
|
376
|
-
for snippet in snippets:
|
|
377
|
-
if snippet.id:
|
|
378
|
-
try:
|
|
379
|
-
summary_text = snippet.summary_text()
|
|
380
|
-
if summary_text.strip(): # Only add if summary is not empty
|
|
381
|
-
documents_with_summaries.append(
|
|
382
|
-
Document(snippet_id=snippet.id, text=summary_text)
|
|
383
|
-
)
|
|
384
|
-
except ValueError:
|
|
385
|
-
# Skip snippets without summary content
|
|
386
|
-
continue
|
|
387
|
-
|
|
388
|
-
if not documents_with_summaries:
|
|
389
|
-
await reporting_step.skip(
|
|
390
|
-
"No snippets with summaries to create text embeddings"
|
|
391
|
-
)
|
|
392
|
-
return
|
|
393
|
-
|
|
394
|
-
await reporting_step.set_total(len(documents_with_summaries))
|
|
395
|
-
processed = 0
|
|
396
|
-
async for result in self.text_search_service.index_documents(
|
|
397
|
-
IndexRequest(documents=documents_with_summaries)
|
|
398
|
-
):
|
|
399
|
-
processed += len(result)
|
|
400
|
-
await reporting_step.set_current(
|
|
401
|
-
processed, f"Creating text embeddings for {processed} snippets"
|
|
402
|
-
)
|
|
403
|
-
|
|
404
|
-
async def delete_index(self, index: Index) -> None:
|
|
405
|
-
"""Delete an index."""
|
|
406
|
-
# Delete the index from the domain
|
|
407
|
-
await self.index_domain_service.delete_index(index)
|
|
408
|
-
|
|
409
|
-
# Delete index from the database
|
|
410
|
-
await self.index_repository.delete(index)
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
"""Index query service."""
|
|
2
|
-
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
|
-
|
|
5
|
-
from kodit.domain.entities import Index, SnippetWithContext
|
|
6
|
-
from kodit.domain.protocols import IndexRepository
|
|
7
|
-
from kodit.domain.value_objects import (
|
|
8
|
-
FusionRequest,
|
|
9
|
-
FusionResult,
|
|
10
|
-
MultiSearchRequest,
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class FusionService(ABC):
|
|
15
|
-
"""Abstract fusion service interface."""
|
|
16
|
-
|
|
17
|
-
@abstractmethod
|
|
18
|
-
def reciprocal_rank_fusion(
|
|
19
|
-
self, rankings: list[list[FusionRequest]], k: float = 60
|
|
20
|
-
) -> list[FusionResult]:
|
|
21
|
-
"""Perform reciprocal rank fusion on search results."""
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class IndexQueryService:
|
|
25
|
-
"""Index query service."""
|
|
26
|
-
|
|
27
|
-
def __init__(
|
|
28
|
-
self,
|
|
29
|
-
index_repository: IndexRepository,
|
|
30
|
-
fusion_service: FusionService,
|
|
31
|
-
) -> None:
|
|
32
|
-
"""Initialize the index query service."""
|
|
33
|
-
self.index_repository = index_repository
|
|
34
|
-
self.fusion_service = fusion_service
|
|
35
|
-
|
|
36
|
-
async def get_index_by_id(self, index_id: int) -> Index | None:
|
|
37
|
-
"""Get an index by its ID."""
|
|
38
|
-
return await self.index_repository.get(index_id)
|
|
39
|
-
|
|
40
|
-
async def list_indexes(self) -> list[Index]:
|
|
41
|
-
"""List all indexes."""
|
|
42
|
-
return await self.index_repository.all()
|
|
43
|
-
|
|
44
|
-
async def search_snippets(
|
|
45
|
-
self, request: MultiSearchRequest
|
|
46
|
-
) -> list[SnippetWithContext]:
|
|
47
|
-
"""Search snippets with filters.
|
|
48
|
-
|
|
49
|
-
Args:
|
|
50
|
-
request: The search request containing filters
|
|
51
|
-
|
|
52
|
-
Returns:
|
|
53
|
-
List of matching snippet items with context
|
|
54
|
-
|
|
55
|
-
"""
|
|
56
|
-
return list(await self.index_repository.search(request))
|
|
57
|
-
|
|
58
|
-
async def perform_fusion(
|
|
59
|
-
self, rankings: list[list[FusionRequest]], k: float = 60
|
|
60
|
-
) -> list[FusionResult]:
|
|
61
|
-
"""Perform reciprocal rank fusion on search results."""
|
|
62
|
-
return self.fusion_service.reciprocal_rank_fusion(rankings, k)
|
|
63
|
-
|
|
64
|
-
async def get_snippets_by_ids(self, ids: list[int]) -> list[SnippetWithContext]:
|
|
65
|
-
"""Get snippets by their IDs."""
|
|
66
|
-
snippets = await self.index_repository.get_snippets_by_ids(ids)
|
|
67
|
-
|
|
68
|
-
# Return snippets in the same order as the ids
|
|
69
|
-
snippets.sort(key=lambda x: ids.index(x.snippet.id or 0))
|
|
70
|
-
return snippets
|