kodit 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/factories/__init__.py +1 -0
- kodit/application/factories/code_indexing_factory.py +119 -0
- kodit/application/services/{indexing_application_service.py → code_indexing_application_service.py} +159 -198
- kodit/cli.py +199 -62
- kodit/domain/entities.py +7 -5
- kodit/domain/repositories.py +33 -0
- kodit/domain/services/bm25_service.py +14 -17
- kodit/domain/services/embedding_service.py +10 -14
- kodit/domain/services/snippet_service.py +198 -0
- kodit/domain/value_objects.py +301 -21
- kodit/infrastructure/bm25/local_bm25_repository.py +20 -12
- kodit/infrastructure/bm25/vectorchord_bm25_repository.py +31 -11
- kodit/infrastructure/cloning/metadata.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +14 -25
- kodit/infrastructure/embedding/local_vector_search_repository.py +26 -38
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +50 -35
- kodit/infrastructure/enrichment/enrichment_factory.py +1 -1
- kodit/infrastructure/indexing/indexing_factory.py +8 -91
- kodit/infrastructure/indexing/snippet_domain_service_factory.py +37 -0
- kodit/infrastructure/snippet_extraction/languages/java.scm +12 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +3 -31
- kodit/infrastructure/sqlalchemy/embedding_repository.py +14 -3
- kodit/infrastructure/sqlalchemy/snippet_repository.py +174 -2
- kodit/mcp.py +61 -49
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/METADATA +1 -1
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/RECORD +30 -29
- kodit/application/commands/__init__.py +0 -1
- kodit/application/commands/snippet_commands.py +0 -22
- kodit/application/services/snippet_application_service.py +0 -149
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +0 -42
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/WHEEL +0 -0
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/licenses/LICENSE +0 -0
|
@@ -8,10 +8,10 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
8
8
|
|
|
9
9
|
from kodit.domain.services.bm25_service import BM25Repository
|
|
10
10
|
from kodit.domain.value_objects import (
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
11
|
+
DeleteRequest,
|
|
12
|
+
IndexRequest,
|
|
13
|
+
SearchRequest,
|
|
14
|
+
SearchResult,
|
|
15
15
|
)
|
|
16
16
|
|
|
17
17
|
TABLE_NAME = "vectorchord_bm25_documents"
|
|
@@ -80,6 +80,17 @@ SEARCH_QUERY = f"""
|
|
|
80
80
|
ORDER BY bm25_score
|
|
81
81
|
LIMIT :limit
|
|
82
82
|
""" # noqa: S608
|
|
83
|
+
SEARCH_QUERY_WITH_FILTER = f"""
|
|
84
|
+
SELECT
|
|
85
|
+
snippet_id,
|
|
86
|
+
embedding <&>
|
|
87
|
+
to_bm25query('{INDEX_NAME}', tokenize(:query_text, '{TOKENIZER_NAME}'))
|
|
88
|
+
AS bm25_score
|
|
89
|
+
FROM {TABLE_NAME}
|
|
90
|
+
WHERE snippet_id = ANY(:snippet_ids)
|
|
91
|
+
ORDER BY bm25_score
|
|
92
|
+
LIMIT :limit
|
|
93
|
+
""" # noqa: S608
|
|
83
94
|
DELETE_QUERY = f"""
|
|
84
95
|
DELETE FROM {TABLE_NAME}
|
|
85
96
|
WHERE snippet_id IN :snippet_ids
|
|
@@ -146,7 +157,7 @@ class VectorChordBM25Repository(BM25Repository):
|
|
|
146
157
|
"""Commit the session."""
|
|
147
158
|
await self.__session.commit()
|
|
148
159
|
|
|
149
|
-
async def index_documents(self, request:
|
|
160
|
+
async def index_documents(self, request: IndexRequest) -> None:
|
|
150
161
|
"""Index documents for BM25 search."""
|
|
151
162
|
# Filter out any documents that don't have a snippet_id or text
|
|
152
163
|
valid_documents = [
|
|
@@ -172,27 +183,36 @@ class VectorChordBM25Repository(BM25Repository):
|
|
|
172
183
|
await self._execute(text(UPDATE_QUERY))
|
|
173
184
|
await self._commit()
|
|
174
185
|
|
|
175
|
-
async def search(self, request:
|
|
186
|
+
async def search(self, request: SearchRequest) -> list[SearchResult]:
|
|
176
187
|
"""Search documents using BM25."""
|
|
177
188
|
if not request.query or request.query == "":
|
|
178
189
|
return []
|
|
179
190
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
191
|
+
if request.snippet_ids is not None:
|
|
192
|
+
sql = text(SEARCH_QUERY_WITH_FILTER).bindparams(
|
|
193
|
+
query_text=request.query,
|
|
194
|
+
limit=request.top_k,
|
|
195
|
+
snippet_ids=request.snippet_ids,
|
|
196
|
+
)
|
|
197
|
+
else:
|
|
198
|
+
sql = text(SEARCH_QUERY).bindparams(
|
|
199
|
+
query_text=request.query,
|
|
200
|
+
limit=request.top_k,
|
|
201
|
+
)
|
|
202
|
+
|
|
183
203
|
try:
|
|
184
204
|
result = await self._execute(sql)
|
|
185
205
|
rows = result.mappings().all()
|
|
186
206
|
|
|
187
207
|
return [
|
|
188
|
-
|
|
208
|
+
SearchResult(snippet_id=row["snippet_id"], score=row["bm25_score"])
|
|
189
209
|
for row in rows
|
|
190
210
|
]
|
|
191
211
|
except Exception as e:
|
|
192
212
|
msg = f"Error during BM25 search: {e}"
|
|
193
213
|
raise RuntimeError(msg) from e
|
|
194
214
|
|
|
195
|
-
async def delete_documents(self, request:
|
|
215
|
+
async def delete_documents(self, request: DeleteRequest) -> None:
|
|
196
216
|
"""Delete documents from the index."""
|
|
197
217
|
await self._execute(
|
|
198
218
|
text(DELETE_QUERY).bindparams(bindparam("snippet_ids", expanding=True)),
|
|
@@ -26,40 +26,29 @@ class HashEmbeddingProvider(EmbeddingProvider):
|
|
|
26
26
|
self.embedding_size = embedding_size
|
|
27
27
|
self.log = structlog.get_logger(__name__)
|
|
28
28
|
|
|
29
|
-
def embed(
|
|
29
|
+
async def embed(
|
|
30
30
|
self, data: list[EmbeddingRequest]
|
|
31
31
|
) -> AsyncGenerator[list[EmbeddingResponse], None]:
|
|
32
32
|
"""Embed a list of strings using a simple hash-based approach."""
|
|
33
33
|
if not data:
|
|
34
|
-
|
|
35
|
-
async def empty_generator() -> AsyncGenerator[
|
|
36
|
-
list[EmbeddingResponse], None
|
|
37
|
-
]:
|
|
38
|
-
if False:
|
|
39
|
-
yield []
|
|
40
|
-
|
|
41
|
-
return empty_generator()
|
|
34
|
+
yield []
|
|
42
35
|
|
|
43
36
|
# Process in batches
|
|
44
37
|
batch_size = 10
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
EmbeddingResponse(
|
|
56
|
-
snippet_id=request.snippet_id, embedding=embedding
|
|
57
|
-
)
|
|
38
|
+
for i in range(0, len(data), batch_size):
|
|
39
|
+
batch = data[i : i + batch_size]
|
|
40
|
+
responses = []
|
|
41
|
+
|
|
42
|
+
for request in batch:
|
|
43
|
+
# Generate a deterministic embedding based on the text
|
|
44
|
+
embedding = self._generate_embedding(request.text)
|
|
45
|
+
responses.append(
|
|
46
|
+
EmbeddingResponse(
|
|
47
|
+
snippet_id=request.snippet_id, embedding=embedding
|
|
58
48
|
)
|
|
49
|
+
)
|
|
59
50
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
return _embed_batches()
|
|
51
|
+
yield responses
|
|
63
52
|
|
|
64
53
|
def _generate_embedding(self, text: str) -> list[float]:
|
|
65
54
|
"""Generate a deterministic embedding for the given text."""
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
from collections.abc import AsyncGenerator
|
|
4
4
|
|
|
5
5
|
import structlog
|
|
6
|
-
import tiktoken
|
|
7
6
|
|
|
8
7
|
from kodit.domain.entities import Embedding, EmbeddingType
|
|
9
8
|
from kodit.domain.services.embedding_service import (
|
|
@@ -12,10 +11,10 @@ from kodit.domain.services.embedding_service import (
|
|
|
12
11
|
)
|
|
13
12
|
from kodit.domain.value_objects import (
|
|
14
13
|
EmbeddingRequest,
|
|
14
|
+
IndexRequest,
|
|
15
15
|
IndexResult,
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
VectorSearchResult,
|
|
16
|
+
SearchRequest,
|
|
17
|
+
SearchResult,
|
|
19
18
|
)
|
|
20
19
|
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
21
20
|
SqlAlchemyEmbeddingRepository,
|
|
@@ -27,35 +26,29 @@ class LocalVectorSearchRepository(VectorSearchRepository):
|
|
|
27
26
|
|
|
28
27
|
def __init__(
|
|
29
28
|
self,
|
|
30
|
-
embedding_repository: SqlAlchemyEmbeddingRepository,
|
|
31
29
|
embedding_provider: EmbeddingProvider,
|
|
32
|
-
|
|
30
|
+
embedding_repository: SqlAlchemyEmbeddingRepository,
|
|
31
|
+
embedding_type: EmbeddingType,
|
|
33
32
|
) -> None:
|
|
34
33
|
"""Initialize the local vector search repository.
|
|
35
34
|
|
|
36
35
|
Args:
|
|
37
|
-
embedding_repository: The SQLAlchemy embedding repository
|
|
38
36
|
embedding_provider: The embedding provider for generating embeddings
|
|
37
|
+
embedding_repository: The embedding repository for persistence
|
|
39
38
|
embedding_type: The type of embedding to use
|
|
40
39
|
|
|
41
40
|
"""
|
|
42
|
-
self.log = structlog.get_logger(__name__)
|
|
43
|
-
self.embedding_repository = embedding_repository
|
|
44
41
|
self.embedding_provider = embedding_provider
|
|
45
|
-
self.
|
|
42
|
+
self.embedding_repository = embedding_repository
|
|
46
43
|
self.embedding_type = embedding_type
|
|
44
|
+
self.log = structlog.get_logger(__name__)
|
|
47
45
|
|
|
48
|
-
def index_documents(
|
|
49
|
-
self, request:
|
|
46
|
+
async def index_documents(
|
|
47
|
+
self, request: IndexRequest
|
|
50
48
|
) -> AsyncGenerator[list[IndexResult], None]:
|
|
51
49
|
"""Index documents for vector search."""
|
|
52
|
-
if not request.documents:
|
|
53
|
-
|
|
54
|
-
async def empty_generator() -> AsyncGenerator[list[IndexResult], None]:
|
|
55
|
-
if False:
|
|
56
|
-
yield []
|
|
57
|
-
|
|
58
|
-
return empty_generator()
|
|
50
|
+
if not request.documents or len(request.documents) == 0:
|
|
51
|
+
yield []
|
|
59
52
|
|
|
60
53
|
# Convert to embedding requests
|
|
61
54
|
requests = [
|
|
@@ -63,25 +56,20 @@ class LocalVectorSearchRepository(VectorSearchRepository):
|
|
|
63
56
|
for doc in request.documents
|
|
64
57
|
]
|
|
65
58
|
|
|
66
|
-
async
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
type=self.embedding_type,
|
|
75
|
-
)
|
|
59
|
+
async for batch in self.embedding_provider.embed(requests):
|
|
60
|
+
results = []
|
|
61
|
+
for result in batch:
|
|
62
|
+
await self.embedding_repository.create_embedding(
|
|
63
|
+
Embedding(
|
|
64
|
+
snippet_id=result.snippet_id,
|
|
65
|
+
embedding=result.embedding,
|
|
66
|
+
type=self.embedding_type,
|
|
76
67
|
)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
return _index_batches()
|
|
68
|
+
)
|
|
69
|
+
results.append(IndexResult(snippet_id=result.snippet_id))
|
|
70
|
+
yield results
|
|
81
71
|
|
|
82
|
-
async def search(
|
|
83
|
-
self, request: VectorSearchQueryRequest
|
|
84
|
-
) -> list[VectorSearchResult]:
|
|
72
|
+
async def search(self, request: SearchRequest) -> list[SearchResult]:
|
|
85
73
|
"""Search documents using vector similarity."""
|
|
86
74
|
# Build a single-item request and collect its embedding
|
|
87
75
|
req = EmbeddingRequest(snippet_id=0, text=request.query)
|
|
@@ -95,10 +83,10 @@ class LocalVectorSearchRepository(VectorSearchRepository):
|
|
|
95
83
|
return []
|
|
96
84
|
|
|
97
85
|
results = await self.embedding_repository.list_semantic_results(
|
|
98
|
-
self.embedding_type, embedding_vec, request.top_k
|
|
86
|
+
self.embedding_type, embedding_vec, request.top_k, request.snippet_ids
|
|
99
87
|
)
|
|
100
88
|
return [
|
|
101
|
-
|
|
89
|
+
SearchResult(snippet_id=snippet_id, score=score)
|
|
102
90
|
for snippet_id, score in results
|
|
103
91
|
]
|
|
104
92
|
|
|
@@ -14,10 +14,10 @@ from kodit.domain.services.embedding_service import (
|
|
|
14
14
|
)
|
|
15
15
|
from kodit.domain.value_objects import (
|
|
16
16
|
EmbeddingRequest,
|
|
17
|
+
IndexRequest,
|
|
17
18
|
IndexResult,
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
VectorSearchResult,
|
|
19
|
+
SearchRequest,
|
|
20
|
+
SearchResult,
|
|
21
21
|
)
|
|
22
22
|
|
|
23
23
|
# SQL Queries
|
|
@@ -59,6 +59,15 @@ ORDER BY score ASC
|
|
|
59
59
|
LIMIT :top_k;
|
|
60
60
|
"""
|
|
61
61
|
|
|
62
|
+
# Filtered search query with snippet_ids
|
|
63
|
+
SEARCH_QUERY_WITH_FILTER = """
|
|
64
|
+
SELECT snippet_id, embedding <=> :query as score
|
|
65
|
+
FROM {TABLE_NAME}
|
|
66
|
+
WHERE snippet_id = ANY(:snippet_ids)
|
|
67
|
+
ORDER BY score ASC
|
|
68
|
+
LIMIT :top_k;
|
|
69
|
+
"""
|
|
70
|
+
|
|
62
71
|
CHECK_VCHORD_EMBEDDING_EXISTS = """
|
|
63
72
|
SELECT EXISTS(SELECT 1 FROM {TABLE_NAME} WHERE snippet_id = :snippet_id)
|
|
64
73
|
"""
|
|
@@ -156,17 +165,12 @@ class VectorChordVectorSearchRepository(VectorSearchRepository):
|
|
|
156
165
|
"""Commit the session."""
|
|
157
166
|
await self._session.commit()
|
|
158
167
|
|
|
159
|
-
def index_documents(
|
|
160
|
-
self, request:
|
|
168
|
+
async def index_documents(
|
|
169
|
+
self, request: IndexRequest
|
|
161
170
|
) -> AsyncGenerator[list[IndexResult], None]:
|
|
162
171
|
"""Index documents for vector search."""
|
|
163
172
|
if not request.documents:
|
|
164
|
-
|
|
165
|
-
async def empty_generator() -> AsyncGenerator[list[IndexResult], None]:
|
|
166
|
-
if False:
|
|
167
|
-
yield []
|
|
168
|
-
|
|
169
|
-
return empty_generator()
|
|
173
|
+
yield []
|
|
170
174
|
|
|
171
175
|
# Convert to embedding requests
|
|
172
176
|
requests = [
|
|
@@ -174,27 +178,25 @@ class VectorChordVectorSearchRepository(VectorSearchRepository):
|
|
|
174
178
|
for doc in request.documents
|
|
175
179
|
]
|
|
176
180
|
|
|
177
|
-
async
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
yield [IndexResult(snippet_id=result.snippet_id) for result in batch]
|
|
191
|
-
|
|
192
|
-
return _index_batches()
|
|
181
|
+
async for batch in self.embedding_provider.embed(requests):
|
|
182
|
+
await self._execute(
|
|
183
|
+
text(INSERT_QUERY.format(TABLE_NAME=self.table_name)),
|
|
184
|
+
[
|
|
185
|
+
{
|
|
186
|
+
"snippet_id": result.snippet_id,
|
|
187
|
+
"embedding": str(result.embedding),
|
|
188
|
+
}
|
|
189
|
+
for result in batch
|
|
190
|
+
],
|
|
191
|
+
)
|
|
192
|
+
await self._commit()
|
|
193
|
+
yield [IndexResult(snippet_id=result.snippet_id) for result in batch]
|
|
193
194
|
|
|
194
|
-
async def search(
|
|
195
|
-
self, request: VectorSearchQueryRequest
|
|
196
|
-
) -> list[VectorSearchResult]:
|
|
195
|
+
async def search(self, request: SearchRequest) -> list[SearchResult]:
|
|
197
196
|
"""Search documents using vector similarity."""
|
|
197
|
+
if not request.query or not request.query.strip():
|
|
198
|
+
return []
|
|
199
|
+
|
|
198
200
|
req = EmbeddingRequest(snippet_id=0, text=request.query)
|
|
199
201
|
embedding_vec: list[float] | None = None
|
|
200
202
|
async for batch in self.embedding_provider.embed([req]):
|
|
@@ -204,14 +206,27 @@ class VectorChordVectorSearchRepository(VectorSearchRepository):
|
|
|
204
206
|
|
|
205
207
|
if not embedding_vec:
|
|
206
208
|
return []
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
209
|
+
|
|
210
|
+
# Use filtered query if snippet_ids are provided
|
|
211
|
+
if request.snippet_ids is not None:
|
|
212
|
+
result = await self._execute(
|
|
213
|
+
text(SEARCH_QUERY_WITH_FILTER.format(TABLE_NAME=self.table_name)),
|
|
214
|
+
{
|
|
215
|
+
"query": str(embedding_vec),
|
|
216
|
+
"top_k": request.top_k,
|
|
217
|
+
"snippet_ids": request.snippet_ids,
|
|
218
|
+
},
|
|
219
|
+
)
|
|
220
|
+
else:
|
|
221
|
+
result = await self._execute(
|
|
222
|
+
text(SEARCH_QUERY.format(TABLE_NAME=self.table_name)),
|
|
223
|
+
{"query": str(embedding_vec), "top_k": request.top_k},
|
|
224
|
+
)
|
|
225
|
+
|
|
211
226
|
rows = result.mappings().all()
|
|
212
227
|
|
|
213
228
|
return [
|
|
214
|
-
|
|
229
|
+
SearchResult(snippet_id=row["snippet_id"], score=row["score"])
|
|
215
230
|
for row in rows
|
|
216
231
|
]
|
|
217
232
|
|
|
@@ -24,7 +24,7 @@ def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
|
|
|
24
24
|
return app_context.enrichment_endpoint or app_context.default_endpoint or None
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
def
|
|
27
|
+
def enrichment_domain_service_factory(
|
|
28
28
|
app_context: AppContext,
|
|
29
29
|
) -> EnrichmentDomainService:
|
|
30
30
|
"""Create an enrichment domain service.
|
|
@@ -1,113 +1,30 @@
|
|
|
1
1
|
"""Factory for creating indexing services."""
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
3
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
6
4
|
|
|
7
|
-
from kodit.application.services.indexing_application_service import (
|
|
8
|
-
IndexingApplicationService,
|
|
9
|
-
)
|
|
10
|
-
from kodit.application.services.snippet_application_service import (
|
|
11
|
-
SnippetApplicationService,
|
|
12
|
-
)
|
|
13
|
-
from kodit.domain.services.bm25_service import BM25DomainService
|
|
14
5
|
from kodit.domain.services.indexing_service import IndexingDomainService
|
|
15
|
-
from kodit.domain.services.source_service import SourceService
|
|
16
|
-
from kodit.infrastructure.bm25.bm25_factory import bm25_repository_factory
|
|
17
|
-
from kodit.infrastructure.embedding.embedding_factory import (
|
|
18
|
-
embedding_domain_service_factory,
|
|
19
|
-
)
|
|
20
|
-
from kodit.infrastructure.enrichment.enrichment_factory import (
|
|
21
|
-
create_enrichment_domain_service,
|
|
22
|
-
)
|
|
23
6
|
from kodit.infrastructure.indexing.fusion_service import ReciprocalRankFusionService
|
|
24
7
|
from kodit.infrastructure.indexing.index_repository import SQLAlchemyIndexRepository
|
|
25
|
-
from kodit.infrastructure.snippet_extraction.snippet_extraction_factory import (
|
|
26
|
-
create_snippet_extraction_domain_service,
|
|
27
|
-
)
|
|
28
|
-
from kodit.infrastructure.sqlalchemy.file_repository import (
|
|
29
|
-
SqlAlchemyFileRepository,
|
|
30
|
-
)
|
|
31
|
-
from kodit.infrastructure.sqlalchemy.snippet_repository import (
|
|
32
|
-
SqlAlchemySnippetRepository,
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def create_snippet_application_service(
|
|
37
|
-
session: AsyncSession,
|
|
38
|
-
) -> SnippetApplicationService:
|
|
39
|
-
"""Create a snippet application service with all dependencies."""
|
|
40
|
-
# Create domain service
|
|
41
|
-
snippet_extraction_service = create_snippet_extraction_domain_service()
|
|
42
|
-
|
|
43
|
-
# Create repositories
|
|
44
|
-
snippet_repository = SqlAlchemySnippetRepository(session)
|
|
45
|
-
file_repository = SqlAlchemyFileRepository(session)
|
|
46
8
|
|
|
47
|
-
# Create application service
|
|
48
|
-
from kodit.application.services.snippet_application_service import (
|
|
49
|
-
SnippetApplicationService,
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
return SnippetApplicationService(
|
|
53
|
-
snippet_extraction_service=snippet_extraction_service,
|
|
54
|
-
snippet_repository=snippet_repository,
|
|
55
|
-
file_repository=file_repository,
|
|
56
|
-
session=session,
|
|
57
|
-
)
|
|
58
9
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
"""Create an indexing domain service.
|
|
10
|
+
def indexing_domain_service_factory(session: AsyncSession) -> IndexingDomainService:
|
|
11
|
+
"""Create an indexing domain service with all dependencies.
|
|
62
12
|
|
|
63
13
|
Args:
|
|
64
|
-
session:
|
|
14
|
+
session: SQLAlchemy session
|
|
65
15
|
|
|
66
16
|
Returns:
|
|
67
|
-
|
|
17
|
+
Configured indexing domain service
|
|
68
18
|
|
|
69
19
|
"""
|
|
20
|
+
# Create repositories
|
|
70
21
|
index_repository = SQLAlchemyIndexRepository(session)
|
|
22
|
+
|
|
23
|
+
# Create fusion service
|
|
71
24
|
fusion_service = ReciprocalRankFusionService()
|
|
72
25
|
|
|
26
|
+
# Create domain service
|
|
73
27
|
return IndexingDomainService(
|
|
74
28
|
index_repository=index_repository,
|
|
75
29
|
fusion_service=fusion_service,
|
|
76
30
|
)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def create_indexing_application_service(
|
|
80
|
-
app_context: Any,
|
|
81
|
-
session: AsyncSession,
|
|
82
|
-
source_service: SourceService,
|
|
83
|
-
snippet_application_service: SnippetApplicationService,
|
|
84
|
-
) -> IndexingApplicationService:
|
|
85
|
-
"""Create an indexing application service.
|
|
86
|
-
|
|
87
|
-
Args:
|
|
88
|
-
app_context: The application context.
|
|
89
|
-
session: The database session.
|
|
90
|
-
source_service: The source service.
|
|
91
|
-
snippet_application_service: The snippet application service.
|
|
92
|
-
|
|
93
|
-
Returns:
|
|
94
|
-
An indexing application service instance.
|
|
95
|
-
|
|
96
|
-
"""
|
|
97
|
-
# Create domain services
|
|
98
|
-
indexing_domain_service = create_indexing_domain_service(session)
|
|
99
|
-
bm25_service = BM25DomainService(bm25_repository_factory(app_context, session))
|
|
100
|
-
code_search_service = embedding_domain_service_factory("code", app_context, session)
|
|
101
|
-
text_search_service = embedding_domain_service_factory("text", app_context, session)
|
|
102
|
-
enrichment_service = create_enrichment_domain_service(app_context)
|
|
103
|
-
|
|
104
|
-
return IndexingApplicationService(
|
|
105
|
-
indexing_domain_service=indexing_domain_service,
|
|
106
|
-
source_service=source_service,
|
|
107
|
-
bm25_service=bm25_service,
|
|
108
|
-
code_search_service=code_search_service,
|
|
109
|
-
text_search_service=text_search_service,
|
|
110
|
-
enrichment_service=enrichment_service,
|
|
111
|
-
snippet_application_service=snippet_application_service,
|
|
112
|
-
session=session,
|
|
113
|
-
)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Factory for creating snippet domain service."""
|
|
2
|
+
|
|
3
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
4
|
+
|
|
5
|
+
from kodit.domain.services.snippet_service import SnippetDomainService
|
|
6
|
+
from kodit.infrastructure.snippet_extraction.snippet_extraction_factory import (
|
|
7
|
+
create_snippet_extraction_domain_service,
|
|
8
|
+
)
|
|
9
|
+
from kodit.infrastructure.sqlalchemy.file_repository import SqlAlchemyFileRepository
|
|
10
|
+
from kodit.infrastructure.sqlalchemy.snippet_repository import (
|
|
11
|
+
SqlAlchemySnippetRepository,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def snippet_domain_service_factory(session: AsyncSession) -> SnippetDomainService:
|
|
16
|
+
"""Create a snippet domain service with all dependencies.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
session: The database session
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
Configured snippet domain service
|
|
23
|
+
|
|
24
|
+
"""
|
|
25
|
+
# Create domain service for snippet extraction
|
|
26
|
+
snippet_extraction_service = create_snippet_extraction_domain_service()
|
|
27
|
+
|
|
28
|
+
# Create repositories
|
|
29
|
+
snippet_repository = SqlAlchemySnippetRepository(session)
|
|
30
|
+
file_repository = SqlAlchemyFileRepository(session)
|
|
31
|
+
|
|
32
|
+
# Create and return the domain service
|
|
33
|
+
return SnippetDomainService(
|
|
34
|
+
snippet_extraction_service=snippet_extraction_service,
|
|
35
|
+
snippet_repository=snippet_repository,
|
|
36
|
+
file_repository=file_repository,
|
|
37
|
+
)
|
|
@@ -9,6 +9,7 @@ from kodit.domain.repositories import FileRepository, SnippetRepository
|
|
|
9
9
|
from kodit.domain.services.snippet_extraction_service import (
|
|
10
10
|
SnippetExtractionDomainService,
|
|
11
11
|
)
|
|
12
|
+
from kodit.domain.value_objects import LanguageMapping
|
|
12
13
|
from kodit.infrastructure.snippet_extraction.language_detection_service import (
|
|
13
14
|
FileSystemLanguageDetectionService,
|
|
14
15
|
)
|
|
@@ -31,37 +32,8 @@ def create_snippet_extraction_domain_service() -> SnippetExtractionDomainService
|
|
|
31
32
|
Configured snippet extraction domain service
|
|
32
33
|
|
|
33
34
|
"""
|
|
34
|
-
#
|
|
35
|
-
language_map =
|
|
36
|
-
# JavaScript/TypeScript
|
|
37
|
-
"js": "javascript",
|
|
38
|
-
"jsx": "javascript",
|
|
39
|
-
"ts": "typescript",
|
|
40
|
-
"tsx": "typescript",
|
|
41
|
-
# Python
|
|
42
|
-
"py": "python",
|
|
43
|
-
# Rust
|
|
44
|
-
"rs": "rust",
|
|
45
|
-
# Go
|
|
46
|
-
"go": "go",
|
|
47
|
-
# C/C++
|
|
48
|
-
"cpp": "cpp",
|
|
49
|
-
"hpp": "cpp",
|
|
50
|
-
"c": "c",
|
|
51
|
-
"h": "c",
|
|
52
|
-
# C#
|
|
53
|
-
"cs": "csharp",
|
|
54
|
-
# Ruby
|
|
55
|
-
"rb": "ruby",
|
|
56
|
-
# Java
|
|
57
|
-
"java": "java",
|
|
58
|
-
# PHP
|
|
59
|
-
"php": "php",
|
|
60
|
-
# Swift
|
|
61
|
-
"swift": "swift",
|
|
62
|
-
# Kotlin
|
|
63
|
-
"kt": "kotlin",
|
|
64
|
-
}
|
|
35
|
+
# Use the unified language mapping from the domain layer
|
|
36
|
+
language_map = LanguageMapping.get_extension_to_language_map()
|
|
65
37
|
|
|
66
38
|
# Create infrastructure services
|
|
67
39
|
language_detector = FileSystemLanguageDetectionService(language_map)
|