kodit 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/__init__.py +1 -0
- kodit/application/commands/__init__.py +1 -0
- kodit/application/commands/snippet_commands.py +22 -0
- kodit/application/services/__init__.py +1 -0
- kodit/application/services/indexing_application_service.py +363 -0
- kodit/application/services/snippet_application_service.py +143 -0
- kodit/cli.py +105 -82
- kodit/database.py +0 -22
- kodit/domain/__init__.py +1 -0
- kodit/{source/source_models.py → domain/entities.py} +88 -19
- kodit/domain/enums.py +9 -0
- kodit/domain/interfaces.py +27 -0
- kodit/domain/repositories.py +95 -0
- kodit/domain/services/__init__.py +1 -0
- kodit/domain/services/bm25_service.py +124 -0
- kodit/domain/services/embedding_service.py +155 -0
- kodit/domain/services/enrichment_service.py +48 -0
- kodit/domain/services/ignore_service.py +45 -0
- kodit/domain/services/indexing_service.py +203 -0
- kodit/domain/services/snippet_extraction_service.py +89 -0
- kodit/domain/services/source_service.py +83 -0
- kodit/domain/value_objects.py +215 -0
- kodit/infrastructure/__init__.py +1 -0
- kodit/infrastructure/bm25/__init__.py +1 -0
- kodit/infrastructure/bm25/bm25_factory.py +28 -0
- kodit/{bm25/local_bm25.py → infrastructure/bm25/local_bm25_repository.py} +33 -22
- kodit/{bm25/vectorchord_bm25.py → infrastructure/bm25/vectorchord_bm25_repository.py} +40 -35
- kodit/infrastructure/cloning/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/__init__.py +1 -0
- kodit/infrastructure/cloning/folder/factory.py +119 -0
- kodit/infrastructure/cloning/folder/working_copy.py +38 -0
- kodit/infrastructure/cloning/git/__init__.py +1 -0
- kodit/infrastructure/cloning/git/factory.py +133 -0
- kodit/infrastructure/cloning/git/working_copy.py +32 -0
- kodit/infrastructure/cloning/metadata.py +127 -0
- kodit/infrastructure/embedding/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_factory.py +87 -0
- kodit/infrastructure/embedding/embedding_providers/__init__.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/batching.py +93 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +79 -0
- kodit/infrastructure/embedding/embedding_providers/local_embedding_provider.py +129 -0
- kodit/infrastructure/embedding/embedding_providers/openai_embedding_provider.py +113 -0
- kodit/infrastructure/embedding/local_vector_search_repository.py +114 -0
- kodit/{embedding/vectorchord_vector_search_service.py → infrastructure/embedding/vectorchord_vector_search_repository.py} +98 -32
- kodit/infrastructure/enrichment/__init__.py +1 -0
- kodit/{enrichment → infrastructure/enrichment}/enrichment_factory.py +28 -12
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +42 -0
- kodit/infrastructure/enrichment/local_enrichment_provider.py +115 -0
- kodit/infrastructure/enrichment/null_enrichment_provider.py +25 -0
- kodit/infrastructure/enrichment/openai_enrichment_provider.py +89 -0
- kodit/infrastructure/git/__init__.py +1 -0
- kodit/{source/git.py → infrastructure/git/git_utils.py} +10 -2
- kodit/infrastructure/ignore/__init__.py +1 -0
- kodit/{source/ignore.py → infrastructure/ignore/ignore_pattern_provider.py} +23 -6
- kodit/infrastructure/indexing/__init__.py +1 -0
- kodit/infrastructure/indexing/fusion_service.py +55 -0
- kodit/infrastructure/indexing/index_repository.py +296 -0
- kodit/infrastructure/indexing/indexing_factory.py +111 -0
- kodit/infrastructure/snippet_extraction/__init__.py +1 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +39 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +95 -0
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +45 -0
- kodit/{snippets/method_snippets.py → infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py} +123 -61
- kodit/infrastructure/sqlalchemy/__init__.py +1 -0
- kodit/{embedding → infrastructure/sqlalchemy}/embedding_repository.py +40 -24
- kodit/infrastructure/sqlalchemy/file_repository.py +73 -0
- kodit/infrastructure/sqlalchemy/repository.py +121 -0
- kodit/infrastructure/sqlalchemy/snippet_repository.py +75 -0
- kodit/infrastructure/ui/__init__.py +1 -0
- kodit/infrastructure/ui/progress.py +127 -0
- kodit/{util → infrastructure/ui}/spinner.py +19 -4
- kodit/mcp.py +50 -28
- kodit/migrations/env.py +1 -4
- kodit/reporting.py +78 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/METADATA +1 -1
- kodit-0.2.5.dist-info/RECORD +99 -0
- kodit/bm25/__init__.py +0 -1
- kodit/bm25/keyword_search_factory.py +0 -17
- kodit/bm25/keyword_search_service.py +0 -34
- kodit/embedding/__init__.py +0 -1
- kodit/embedding/embedding_factory.py +0 -63
- kodit/embedding/embedding_models.py +0 -28
- kodit/embedding/embedding_provider/__init__.py +0 -1
- kodit/embedding/embedding_provider/embedding_provider.py +0 -64
- kodit/embedding/embedding_provider/hash_embedding_provider.py +0 -77
- kodit/embedding/embedding_provider/local_embedding_provider.py +0 -64
- kodit/embedding/embedding_provider/openai_embedding_provider.py +0 -77
- kodit/embedding/local_vector_search_service.py +0 -54
- kodit/embedding/vector_search_service.py +0 -38
- kodit/enrichment/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/__init__.py +0 -1
- kodit/enrichment/enrichment_provider/enrichment_provider.py +0 -16
- kodit/enrichment/enrichment_provider/local_enrichment_provider.py +0 -92
- kodit/enrichment/enrichment_provider/openai_enrichment_provider.py +0 -81
- kodit/enrichment/enrichment_service.py +0 -33
- kodit/indexing/__init__.py +0 -1
- kodit/indexing/fusion.py +0 -67
- kodit/indexing/indexing_models.py +0 -43
- kodit/indexing/indexing_repository.py +0 -216
- kodit/indexing/indexing_service.py +0 -338
- kodit/snippets/__init__.py +0 -1
- kodit/snippets/languages/__init__.py +0 -53
- kodit/snippets/snippets.py +0 -50
- kodit/source/__init__.py +0 -1
- kodit/source/source_factories.py +0 -356
- kodit/source/source_repository.py +0 -169
- kodit/source/source_service.py +0 -150
- kodit/util/__init__.py +0 -1
- kodit-0.2.3.dist-info/RECORD +0 -71
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/csharp.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/go.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/javascript.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/python.scm +0 -0
- /kodit/{snippets → infrastructure/snippet_extraction}/languages/typescript.scm +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/WHEEL +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.3.dist-info → kodit-0.2.5.dist-info}/licenses/LICENSE +0 -0
kodit/_version.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Application layer for Kodit."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Application commands for Kodit."""
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Application commands for snippet operations."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from kodit.domain.enums import SnippetExtractionStrategy
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class ExtractSnippetsCommand:
    """Input for the "extract snippets from one file" use case.

    A plain data holder: it performs no validation and has no behavior of
    its own.
    """

    # Path of the file to extract snippets from.
    file_path: Path
    # Extraction strategy to apply; METHOD_BASED unless the caller overrides it.
    strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class CreateIndexSnippetsCommand:
    """Input for the "create snippets for a whole index" use case.

    A plain data holder: it performs no validation and has no behavior of
    its own.
    """

    # Identifier of the index whose files should be turned into snippets.
    index_id: int
    # Extraction strategy to apply; METHOD_BASED unless the caller overrides it.
    strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Application services for Kodit."""
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
"""Application service for indexing operations."""
|
|
2
|
+
|
|
3
|
+
import structlog
|
|
4
|
+
|
|
5
|
+
from kodit.application.commands.snippet_commands import CreateIndexSnippetsCommand
|
|
6
|
+
from kodit.application.services.snippet_application_service import (
|
|
7
|
+
SnippetApplicationService,
|
|
8
|
+
)
|
|
9
|
+
from kodit.domain.entities import Snippet
|
|
10
|
+
from kodit.domain.enums import SnippetExtractionStrategy
|
|
11
|
+
from kodit.domain.interfaces import ProgressCallback
|
|
12
|
+
from kodit.domain.services.bm25_service import BM25DomainService
|
|
13
|
+
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
14
|
+
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
15
|
+
from kodit.domain.services.indexing_service import IndexingDomainService
|
|
16
|
+
from kodit.domain.services.source_service import SourceService
|
|
17
|
+
from kodit.domain.value_objects import (
|
|
18
|
+
BM25Document,
|
|
19
|
+
BM25IndexRequest,
|
|
20
|
+
BM25SearchRequest,
|
|
21
|
+
BM25SearchResult,
|
|
22
|
+
EnrichmentIndexRequest,
|
|
23
|
+
EnrichmentRequest,
|
|
24
|
+
FusionRequest,
|
|
25
|
+
IndexCreateRequest,
|
|
26
|
+
IndexView,
|
|
27
|
+
MultiSearchRequest,
|
|
28
|
+
MultiSearchResult,
|
|
29
|
+
VectorIndexRequest,
|
|
30
|
+
VectorSearchQueryRequest,
|
|
31
|
+
VectorSearchRequest,
|
|
32
|
+
)
|
|
33
|
+
from kodit.log import log_event
|
|
34
|
+
from kodit.reporting import Reporter
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class IndexingApplicationService:
|
|
38
|
+
"""Application service for indexing operations.
|
|
39
|
+
|
|
40
|
+
This service orchestrates the business logic for creating, listing, and running
|
|
41
|
+
code indexes. It coordinates between domain services and provides a clean API
|
|
42
|
+
for index management.
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
def __init__(  # noqa: PLR0913
    self,
    indexing_domain_service: IndexingDomainService,
    source_service: SourceService,
    bm25_service: BM25DomainService,
    code_search_service: EmbeddingDomainService,
    text_search_service: EmbeddingDomainService,
    enrichment_service: EnrichmentDomainService,
    snippet_application_service: SnippetApplicationService,
) -> None:
    """Store the collaborators this service orchestrates.

    Args:
        indexing_domain_service: Domain service used to create, list and
            update indexes and to read/update their snippets.
        source_service: Service used to look up a source by ID.
        bm25_service: Domain service used for keyword indexing and search.
        code_search_service: Embedding service used to index and query
            snippet content.
        text_search_service: Embedding service used to index and query
            snippet content after enrichment.
        enrichment_service: Domain service that produces enrichment text
            for snippets.
        snippet_application_service: Application service that creates the
            snippets for an index.

    """
    self.log = structlog.get_logger(__name__)

    # Index and source management.
    self.indexing_domain_service = indexing_domain_service
    self.source_service = source_service
    self.snippet_application_service = snippet_application_service

    # Search back-ends and enrichment.
    self.bm25_service = bm25_service
    self.code_search_service = code_search_service
    self.text_search_service = text_search_service
    self.enrichment_service = enrichment_service
|
|
75
|
+
|
|
76
|
+
async def create_index(self, source_id: int) -> IndexView:
    """Create an index for an existing source.

    Args:
        source_id: ID of the source the new index belongs to.

    Returns:
        The IndexView returned by the indexing domain service.

    Raises:
        ValueError: Documented by the original code for a missing source;
            the error itself comes from ``source_service.get`` and simply
            propagates from here (NOTE(review): confirm the exact type).

    """
    log_event("kodit.index.create")

    # Resolve the source first: the index is created from the ID of the
    # object the source service hands back, not from the raw argument.
    source = await self.source_service.get(source_id)

    return await self.indexing_domain_service.create_index(
        IndexCreateRequest(source_id=source.id)
    )
|
|
97
|
+
|
|
98
|
+
async def list_indexes(self) -> list[IndexView]:
    """Return every index known to the indexing domain service.

    Also records a ``kodit.index.list`` usage event carrying the number of
    indexes and the total number of snippets across them.

    Returns:
        The list of IndexView objects, unmodified.

    """
    indexes = await self.indexing_domain_service.list_indexes()

    # Usage telemetry: how many indexes exist and how many snippets they hold.
    snippet_total = sum(index.num_snippets for index in indexes)
    usage = {
        "num_indexes": len(indexes),
        "num_snippets": snippet_total,
    }
    log_event("kodit.index.list", usage)

    return indexes
|
|
117
|
+
|
|
118
|
+
async def run_index(
    self, index_id: int, progress_callback: ProgressCallback | None = None
) -> None:
    """Rebuild an index from scratch.

    Steps, in order: delete the index's existing snippets, re-create them
    from its files, build the keyword (BM25) index, build code embeddings,
    enrich the snippets, build text embeddings, and finally update the
    index timestamp.

    Args:
        index_id: The ID of the index to run.
        progress_callback: Optional callback that is handed to every stage
            so it can report its own progress.

    Raises:
        ValueError: If no index with ``index_id`` exists.

    """
    log_event("kodit.index.run")

    # Get and validate the index.
    index = await self.indexing_domain_service.get_index(index_id)
    if not index:
        msg = f"Index not found: {index_id}"
        raise ValueError(msg)

    # Delete old snippets so a re-run does not duplicate them.
    await self.indexing_domain_service.delete_all_snippets(index.id)

    # Re-create the snippets through the snippet application service.
    self.log.info("Creating snippets for files", index_id=index.id)
    command = CreateIndexSnippetsCommand(
        index_id=index.id, strategy=SnippetExtractionStrategy.METHOD_BASED
    )
    await self.snippet_application_service.create_snippets_for_index(
        command, progress_callback
    )

    snippets = await self.indexing_domain_service.get_snippets_for_index(index.id)

    # Every stage helper below reports its own start/step/done progress
    # events, so they are not repeated here (doing so made each stage
    # announce its start and its completion twice).
    self.log.info("Creating keyword index")
    await self._create_bm25_index(snippets, progress_callback)

    self.log.info("Creating semantic code index")
    await self._create_code_embeddings(snippets, progress_callback)

    # Enrichment rewrites ``snippet.content`` in place, so it must run after
    # the code embeddings and before the text embeddings, which are built
    # from the enriched content.
    self.log.info("Enriching snippets", num_snippets=len(snippets))
    await self._enrich_snippets(snippets, progress_callback)

    self.log.info("Creating semantic text index")
    await self._create_text_embeddings(snippets, progress_callback)

    # Update index timestamp.
    await self.indexing_domain_service.update_index_timestamp(index.id)
|
|
187
|
+
|
|
188
|
+
async def _create_bm25_index(
    self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
) -> None:
    """Index the given snippets for keyword (BM25) search.

    Args:
        snippets: Snippets whose ``id`` and ``content`` are indexed.
        progress_callback: Optional callback that receives the start and
            done events of the ``bm25_index`` operation.

    """
    reporter = Reporter(self.log, progress_callback)
    await reporter.start("bm25_index", len(snippets), "Creating keyword index...")

    # One BM25 document per snippet, submitted as a single request.
    documents = [
        BM25Document(snippet_id=item.id, text=item.content) for item in snippets
    ]
    await self.bm25_service.index_documents(BM25IndexRequest(documents=documents))

    await reporter.done("bm25_index", "Keyword index created")
|
|
203
|
+
|
|
204
|
+
async def _create_code_embeddings(
    self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
) -> None:
    """Embed snippet content with the code search service.

    Args:
        snippets: Snippets whose ``id`` and ``content`` are embedded.
        progress_callback: Optional callback that receives the progress
            events of the ``code_embeddings`` operation.

    """
    operation = "code_embeddings"
    message = "Creating code embeddings..."
    total = len(snippets)

    reporter = Reporter(self.log, progress_callback)
    await reporter.start(operation, total, message)

    index_request = VectorIndexRequest(
        documents=[VectorSearchRequest(item.id, item.content) for item in snippets]
    )

    # The service yields results in groups; progress advances by the size of
    # each yielded group.
    completed = 0
    async for batch in self.code_search_service.index_documents(index_request):
        completed += len(batch)
        await reporter.step(operation, completed, total, message)

    await reporter.done(operation)
|
|
229
|
+
|
|
230
|
+
async def _enrich_snippets(
    self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
) -> None:
    """Prepend enrichment text to each snippet and persist the result.

    For every enrichment result, the matching snippet's content becomes
    ``<enrichment text>``, a blank line, then the previous content wrapped
    in a fenced code block. The snippet objects in ``snippets`` are updated
    in place (later stages read the new content) and the new content is
    also written through the indexing domain service.

    Args:
        snippets: Snippets to enrich; mutated in place.
        progress_callback: Optional callback that receives the progress
            events of the ``enrichment`` operation.

    """
    reporter = Reporter(self.log, progress_callback)
    await reporter.start("enrichment", len(snippets), "Enriching snippets...")

    # Build the id -> snippet lookup once instead of scanning the list for
    # every result. ``setdefault`` keeps the first snippet for an id, which
    # is the one a linear scan would have found.
    snippets_by_id: dict = {}
    for snippet in snippets:
        snippets_by_id.setdefault(snippet.id, snippet)

    enrichment_request = EnrichmentIndexRequest(
        requests=[
            EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
            for snippet in snippets
        ]
    )

    processed = 0
    async for result in self.enrichment_service.enrich_documents(
        enrichment_request
    ):
        # A result for an unknown id is skipped rather than looked up with a
        # bare ``next(...)``: a StopIteration escaping a coroutine is turned
        # into a RuntimeError and would abort the whole indexing run.
        snippet = snippets_by_id.get(result.snippet_id)
        if snippet is None:
            self.log.warning(
                "Enrichment result for unknown snippet",
                snippet_id=result.snippet_id,
            )
        else:
            # Update the content in the local entity for subsequent processing.
            enriched_content = result.text + "\n\n```\n" + snippet.content + "\n```"
            snippet.content = enriched_content

            # Update the existing snippet instead of creating a new one so
            # that no duplicates are produced.
            await self.indexing_domain_service.update_snippet_content(
                snippet.id, enriched_content
            )

        processed += 1
        await reporter.step(
            "enrichment", processed, len(snippets), "Enriching snippets..."
        )

    await reporter.done("enrichment")
|
|
268
|
+
|
|
269
|
+
async def _create_text_embeddings(
    self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
) -> None:
    """Embed snippet content with the text search service.

    Args:
        snippets: Snippets whose ``id`` and current ``content`` are embedded.
        progress_callback: Optional callback that receives the progress
            events of the ``text_embeddings`` operation.

    """
    operation = "text_embeddings"
    message = "Creating text embeddings..."
    total = len(snippets)

    reporter = Reporter(self.log, progress_callback)
    await reporter.start(operation, total, message)

    index_request = VectorIndexRequest(
        documents=[VectorSearchRequest(item.id, item.content) for item in snippets]
    )

    # The service yields results in groups; progress advances by the size of
    # each yielded group.
    completed = 0
    async for batch in self.text_search_service.index_documents(index_request):
        completed += len(batch)
        await reporter.step(operation, completed, total, message)

    await reporter.done(operation)
|
|
294
|
+
|
|
295
|
+
async def search(self, request: MultiSearchRequest) -> list[MultiSearchResult]:
    """Run keyword and/or semantic searches and fuse their rankings.

    Each populated part of the request (``keywords``, ``code_query``,
    ``text_query``) contributes one ranking. The rankings are fused, cut to
    ``request.top_k`` and resolved to snippets.

    Args:
        request: The search request.

    Returns:
        One MultiSearchResult per fused hit, or an empty list when the
        request contains no keywords and no queries.

    """
    log_event("kodit.index.search")

    rankings: list[list[FusionRequest]] = []

    if request.keywords:
        # The hits of all keywords are concatenated into a single ranking.
        keyword_hits: list[BM25SearchResult] = []
        for keyword in request.keywords:
            hits = await self.bm25_service.search(
                BM25SearchRequest(query=keyword, top_k=request.top_k)
            )
            keyword_hits.extend(hits)
        rankings.append(
            [FusionRequest(id=hit.snippet_id, score=hit.score) for hit in keyword_hits]
        )

    if request.code_query:
        code_hits = await self.code_search_service.search(
            VectorSearchQueryRequest(query=request.code_query, top_k=request.top_k)
        )
        rankings.append(
            [FusionRequest(id=hit.snippet_id, score=hit.score) for hit in code_hits]
        )

    if request.text_query:
        text_hits = await self.text_search_service.search(
            VectorSearchQueryRequest(query=request.text_query, top_k=request.top_k)
        )
        rankings.append(
            [FusionRequest(id=hit.snippet_id, score=hit.score) for hit in text_hits]
        )

    # Nothing was searched, so there is nothing to fuse.
    if not rankings:
        return []

    # Fuse all rankings into one list and keep only the best top_k entries.
    fused = self.indexing_domain_service.perform_fusion(
        rankings=rankings,
        k=60,
    )
    top_results = fused[: request.top_k]

    # Load the snippets (and their files) behind the surviving IDs.
    rows = await self.indexing_domain_service.get_snippets_by_ids(
        [fused_result.id for fused_result in top_results]
    )

    # NOTE(review): the pairing below assumes get_snippets_by_ids returns
    # exactly one (file, snippet) row per ID, in the order requested;
    # strict=True raises ValueError if the counts differ - confirm the order.
    results: list[MultiSearchResult] = []
    for (file, snippet), fused_result in zip(rows, top_results, strict=True):
        results.append(
            MultiSearchResult(
                id=snippet["id"],
                uri=file["uri"],
                content=snippet["content"],
                original_scores=fused_result.original_scores,
            )
        )
    return results
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Application service for snippet operations."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import structlog
|
|
7
|
+
|
|
8
|
+
from kodit.application.commands.snippet_commands import (
|
|
9
|
+
CreateIndexSnippetsCommand,
|
|
10
|
+
ExtractSnippetsCommand,
|
|
11
|
+
)
|
|
12
|
+
from kodit.domain.entities import Snippet
|
|
13
|
+
from kodit.domain.enums import SnippetExtractionStrategy
|
|
14
|
+
from kodit.domain.interfaces import ProgressCallback
|
|
15
|
+
from kodit.domain.repositories import FileRepository, SnippetRepository
|
|
16
|
+
from kodit.domain.services.snippet_extraction_service import (
|
|
17
|
+
SnippetExtractionDomainService,
|
|
18
|
+
)
|
|
19
|
+
from kodit.domain.value_objects import SnippetExtractionRequest
|
|
20
|
+
from kodit.reporting import Reporter
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SnippetApplicationService:
|
|
24
|
+
"""Application service for snippet operations."""
|
|
25
|
+
|
|
26
|
+
def __init__(
    self,
    snippet_extraction_service: SnippetExtractionDomainService,
    snippet_repository: SnippetRepository,
    file_repository: FileRepository,
) -> None:
    """Store the collaborators used by the snippet use cases.

    Args:
        snippet_extraction_service: Domain service that extracts snippets
            from a file.
        snippet_repository: Repository used to save snippets.
        file_repository: Repository used to list the files of an index.

    """
    self.log = structlog.get_logger(__name__)
    self.file_repository = file_repository
    self.snippet_repository = snippet_repository
    self.snippet_extraction_service = snippet_extraction_service
|
|
44
|
+
|
|
45
|
+
async def extract_snippets_from_file(
    self, command: ExtractSnippetsCommand
) -> list[Snippet]:
    """Extract the snippets of a single file.

    Args:
        command: Names the file and the extraction strategy.

    Returns:
        One unsaved Snippet per extracted text. ``file_id`` and ``index_id``
        are placeholders (0); the caller is expected to set the real IDs.

    """
    extraction = await self.snippet_extraction_service.extract_snippets(
        SnippetExtractionRequest(command.file_path, command.strategy)
    )

    # Wrap each extracted text in a Snippet entity; IDs will be set later.
    extracted: list[Snippet] = []
    for snippet_text in extraction.snippets:
        extracted.append(Snippet(file_id=0, index_id=0, content=snippet_text))
    return extracted
|
|
67
|
+
|
|
68
|
+
def _should_process_file(self, file: Any) -> bool:
|
|
69
|
+
"""Check if a file should be processed for snippet extraction.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
file: The file to check
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
True if the file should be processed
|
|
76
|
+
|
|
77
|
+
"""
|
|
78
|
+
# Skip unsupported file types
|
|
79
|
+
mime_blacklist = ["unknown/unknown"]
|
|
80
|
+
return file.mime_type not in mime_blacklist
|
|
81
|
+
|
|
82
|
+
async def _extract_snippets_from_file(
    self, file: Any, strategy: SnippetExtractionStrategy
) -> list[str]:
    """Return the snippet texts extracted from one indexed file.

    Args:
        file: Object exposing a ``cloned_path`` attribute that locates the
            file to read.
        strategy: Extraction strategy to apply.

    Returns:
        The content of each extracted snippet, in extraction order.

    """
    extracted = await self.extract_snippets_from_file(
        ExtractSnippetsCommand(file_path=Path(file.cloned_path), strategy=strategy)
    )

    # Only the text is needed here; the caller builds the real entities.
    contents: list[str] = []
    for item in extracted:
        contents.append(item.content)
    return contents
|
|
92
|
+
|
|
93
|
+
async def create_snippets_for_index(
    self,
    command: CreateIndexSnippetsCommand,
    progress_callback: ProgressCallback | None = None,
) -> None:
    """Create and save snippets for every file of an index.

    Files whose MIME type is not supported are skipped. A file whose
    processing raises ``OSError`` or ``ValueError`` is logged at debug level
    and skipped; snippets already saved for that file are left as they are.

    Args:
        command: Names the index and the extraction strategy.
        progress_callback: Optional callback that receives the progress
            events of the ``create_snippets`` operation.

    """
    files = await self.file_repository.get_files_for_index(command.index_id)
    total = len(files)

    reporter = Reporter(self.log, progress_callback)
    await reporter.start("create_snippets", total, "Creating snippets from files...")

    for i, file in enumerate(files, 1):
        try:
            if self._should_process_file(file):
                snippet_contents = await self._extract_snippets_from_file(
                    file, command.strategy
                )
                for snippet_content in snippet_contents:
                    snippet = Snippet(
                        file_id=file.id,
                        index_id=command.index_id,
                        content=snippet_content,
                    )
                    await self.snippet_repository.save(snippet)
        except (OSError, ValueError) as e:
            # Best effort: one unreadable or unparsable file must not stop
            # the rest of the index from being processed.
            self.log.debug(
                "Skipping file",
                file=file.cloned_path,
                error=str(e),
            )

        # Report progress for every file, including skipped and failed ones.
        # Previously those files bypassed this call, so progress stalled and
        # could finish short of ``total``.
        await reporter.step(
            "create_snippets",
            current=i,
            total=total,
            message=f"Processing {file.cloned_path}...",
        )

    await reporter.done("create_snippets")
|