kodit 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/factories/__init__.py +1 -0
- kodit/application/factories/code_indexing_factory.py +119 -0
- kodit/application/services/{indexing_application_service.py → code_indexing_application_service.py} +159 -198
- kodit/cli.py +199 -62
- kodit/domain/entities.py +7 -5
- kodit/domain/repositories.py +33 -0
- kodit/domain/services/bm25_service.py +14 -17
- kodit/domain/services/embedding_service.py +10 -14
- kodit/domain/services/snippet_service.py +198 -0
- kodit/domain/value_objects.py +301 -21
- kodit/infrastructure/bm25/local_bm25_repository.py +20 -12
- kodit/infrastructure/bm25/vectorchord_bm25_repository.py +31 -11
- kodit/infrastructure/cloning/metadata.py +1 -0
- kodit/infrastructure/embedding/embedding_providers/hash_embedding_provider.py +14 -25
- kodit/infrastructure/embedding/local_vector_search_repository.py +26 -38
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +50 -35
- kodit/infrastructure/enrichment/enrichment_factory.py +1 -1
- kodit/infrastructure/indexing/indexing_factory.py +8 -91
- kodit/infrastructure/indexing/snippet_domain_service_factory.py +37 -0
- kodit/infrastructure/snippet_extraction/languages/java.scm +12 -0
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +3 -31
- kodit/infrastructure/sqlalchemy/embedding_repository.py +14 -3
- kodit/infrastructure/sqlalchemy/snippet_repository.py +174 -2
- kodit/mcp.py +61 -49
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/METADATA +1 -1
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/RECORD +30 -29
- kodit/application/commands/__init__.py +0 -1
- kodit/application/commands/snippet_commands.py +0 -22
- kodit/application/services/snippet_application_service.py +0 -149
- kodit/infrastructure/enrichment/legacy_enrichment_models.py +0 -42
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/WHEEL +0 -0
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/entry_points.txt +0 -0
- {kodit-0.2.8.dist-info → kodit-0.2.9.dist-info}/licenses/LICENSE +0 -0
kodit/_version.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Application factories package."""
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Factory for creating the unified code indexing application service."""
|
|
2
|
+
|
|
3
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
4
|
+
|
|
5
|
+
from kodit.application.services.code_indexing_application_service import (
|
|
6
|
+
CodeIndexingApplicationService,
|
|
7
|
+
)
|
|
8
|
+
from kodit.config import AppContext
|
|
9
|
+
from kodit.domain.entities import EmbeddingType
|
|
10
|
+
from kodit.domain.services.bm25_service import BM25DomainService
|
|
11
|
+
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
12
|
+
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
13
|
+
from kodit.domain.services.source_service import SourceService
|
|
14
|
+
from kodit.infrastructure.bm25.bm25_factory import bm25_repository_factory
|
|
15
|
+
from kodit.infrastructure.embedding.embedding_factory import (
|
|
16
|
+
embedding_domain_service_factory,
|
|
17
|
+
)
|
|
18
|
+
from kodit.infrastructure.embedding.embedding_providers import (
|
|
19
|
+
hash_embedding_provider,
|
|
20
|
+
)
|
|
21
|
+
from kodit.infrastructure.embedding.local_vector_search_repository import (
|
|
22
|
+
LocalVectorSearchRepository,
|
|
23
|
+
)
|
|
24
|
+
from kodit.infrastructure.enrichment.enrichment_factory import (
|
|
25
|
+
enrichment_domain_service_factory,
|
|
26
|
+
)
|
|
27
|
+
from kodit.infrastructure.enrichment.null_enrichment_provider import (
|
|
28
|
+
NullEnrichmentProvider,
|
|
29
|
+
)
|
|
30
|
+
from kodit.infrastructure.indexing.indexing_factory import (
|
|
31
|
+
indexing_domain_service_factory,
|
|
32
|
+
)
|
|
33
|
+
from kodit.infrastructure.indexing.snippet_domain_service_factory import (
|
|
34
|
+
snippet_domain_service_factory,
|
|
35
|
+
)
|
|
36
|
+
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
37
|
+
SqlAlchemyEmbeddingRepository,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def create_code_indexing_application_service(
|
|
42
|
+
app_context: AppContext,
|
|
43
|
+
session: AsyncSession,
|
|
44
|
+
source_service: SourceService,
|
|
45
|
+
) -> CodeIndexingApplicationService:
|
|
46
|
+
"""Create a unified code indexing application service with all dependencies."""
|
|
47
|
+
# Create domain services
|
|
48
|
+
indexing_domain_service = indexing_domain_service_factory(session)
|
|
49
|
+
snippet_domain_service = snippet_domain_service_factory(session)
|
|
50
|
+
bm25_service = BM25DomainService(bm25_repository_factory(app_context, session))
|
|
51
|
+
code_search_service = embedding_domain_service_factory("code", app_context, session)
|
|
52
|
+
text_search_service = embedding_domain_service_factory("text", app_context, session)
|
|
53
|
+
enrichment_service = enrichment_domain_service_factory(app_context)
|
|
54
|
+
|
|
55
|
+
# Create and return the unified application service
|
|
56
|
+
return CodeIndexingApplicationService(
|
|
57
|
+
indexing_domain_service=indexing_domain_service,
|
|
58
|
+
snippet_domain_service=snippet_domain_service,
|
|
59
|
+
source_service=source_service,
|
|
60
|
+
bm25_service=bm25_service,
|
|
61
|
+
code_search_service=code_search_service,
|
|
62
|
+
text_search_service=text_search_service,
|
|
63
|
+
enrichment_service=enrichment_service,
|
|
64
|
+
session=session,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def create_fast_test_code_indexing_application_service(
|
|
69
|
+
app_context: AppContext,
|
|
70
|
+
session: AsyncSession,
|
|
71
|
+
source_service: SourceService,
|
|
72
|
+
) -> CodeIndexingApplicationService:
|
|
73
|
+
"""Create a fast test version of CodeIndexingApplicationService."""
|
|
74
|
+
# Create domain services
|
|
75
|
+
indexing_domain_service = indexing_domain_service_factory(session)
|
|
76
|
+
snippet_domain_service = snippet_domain_service_factory(session)
|
|
77
|
+
bm25_service = BM25DomainService(bm25_repository_factory(app_context, session))
|
|
78
|
+
|
|
79
|
+
# Create fast embedding services using HashEmbeddingProvider
|
|
80
|
+
embedding_repository = SqlAlchemyEmbeddingRepository(session=session)
|
|
81
|
+
|
|
82
|
+
# Fast code search service
|
|
83
|
+
code_search_repository = LocalVectorSearchRepository(
|
|
84
|
+
embedding_repository=embedding_repository,
|
|
85
|
+
embedding_provider=hash_embedding_provider.HashEmbeddingProvider(),
|
|
86
|
+
embedding_type=EmbeddingType.CODE,
|
|
87
|
+
)
|
|
88
|
+
code_search_service = EmbeddingDomainService(
|
|
89
|
+
embedding_provider=hash_embedding_provider.HashEmbeddingProvider(),
|
|
90
|
+
vector_search_repository=code_search_repository,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# Fast text search service
|
|
94
|
+
text_search_repository = LocalVectorSearchRepository(
|
|
95
|
+
embedding_repository=embedding_repository,
|
|
96
|
+
embedding_provider=hash_embedding_provider.HashEmbeddingProvider(),
|
|
97
|
+
embedding_type=EmbeddingType.TEXT,
|
|
98
|
+
)
|
|
99
|
+
text_search_service = EmbeddingDomainService(
|
|
100
|
+
embedding_provider=hash_embedding_provider.HashEmbeddingProvider(),
|
|
101
|
+
vector_search_repository=text_search_repository,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# Fast enrichment service using NullEnrichmentProvider
|
|
105
|
+
enrichment_service = EnrichmentDomainService(
|
|
106
|
+
enrichment_provider=NullEnrichmentProvider()
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Create and return the unified application service
|
|
110
|
+
return CodeIndexingApplicationService(
|
|
111
|
+
indexing_domain_service=indexing_domain_service,
|
|
112
|
+
snippet_domain_service=snippet_domain_service,
|
|
113
|
+
source_service=source_service,
|
|
114
|
+
bm25_service=bm25_service,
|
|
115
|
+
code_search_service=code_search_service,
|
|
116
|
+
text_search_service=text_search_service,
|
|
117
|
+
enrichment_service=enrichment_service,
|
|
118
|
+
session=session,
|
|
119
|
+
)
|
kodit/application/services/{indexing_application_service.py → code_indexing_application_service.py}
RENAMED
|
@@ -1,12 +1,10 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Unified application service for code indexing operations."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import replace
|
|
2
4
|
|
|
3
5
|
import structlog
|
|
4
6
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
7
|
|
|
6
|
-
from kodit.application.commands.snippet_commands import CreateIndexSnippetsCommand
|
|
7
|
-
from kodit.application.services.snippet_application_service import (
|
|
8
|
-
SnippetApplicationService,
|
|
9
|
-
)
|
|
10
8
|
from kodit.domain.entities import Snippet
|
|
11
9
|
from kodit.domain.enums import SnippetExtractionStrategy
|
|
12
10
|
from kodit.domain.errors import EmptySourceError
|
|
@@ -15,106 +13,72 @@ from kodit.domain.services.bm25_service import BM25DomainService
|
|
|
15
13
|
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
16
14
|
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
17
15
|
from kodit.domain.services.indexing_service import IndexingDomainService
|
|
16
|
+
from kodit.domain.services.snippet_service import SnippetDomainService
|
|
18
17
|
from kodit.domain.services.source_service import SourceService
|
|
19
18
|
from kodit.domain.value_objects import (
|
|
20
|
-
|
|
21
|
-
BM25IndexRequest,
|
|
22
|
-
BM25SearchRequest,
|
|
23
|
-
BM25SearchResult,
|
|
19
|
+
Document,
|
|
24
20
|
EnrichmentIndexRequest,
|
|
25
21
|
EnrichmentRequest,
|
|
26
22
|
FusionRequest,
|
|
27
23
|
IndexCreateRequest,
|
|
24
|
+
IndexRequest,
|
|
28
25
|
IndexView,
|
|
29
26
|
MultiSearchRequest,
|
|
30
27
|
MultiSearchResult,
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
28
|
+
SearchRequest,
|
|
29
|
+
SearchResult,
|
|
30
|
+
SnippetListItem,
|
|
34
31
|
)
|
|
35
32
|
from kodit.log import log_event
|
|
36
33
|
from kodit.reporting import Reporter
|
|
37
34
|
|
|
38
35
|
|
|
39
|
-
class IndexingApplicationService:
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
This service orchestrates the business logic for creating, listing, and running
|
|
43
|
-
code indexes. It coordinates between domain services and provides a clean API
|
|
44
|
-
for index management.
|
|
45
|
-
"""
|
|
36
|
+
class CodeIndexingApplicationService:
|
|
37
|
+
"""Unified application service for all code indexing operations."""
|
|
46
38
|
|
|
47
39
|
def __init__( # noqa: PLR0913
|
|
48
40
|
self,
|
|
49
41
|
indexing_domain_service: IndexingDomainService,
|
|
42
|
+
snippet_domain_service: SnippetDomainService,
|
|
50
43
|
source_service: SourceService,
|
|
51
44
|
bm25_service: BM25DomainService,
|
|
52
45
|
code_search_service: EmbeddingDomainService,
|
|
53
46
|
text_search_service: EmbeddingDomainService,
|
|
54
47
|
enrichment_service: EnrichmentDomainService,
|
|
55
|
-
snippet_application_service: SnippetApplicationService,
|
|
56
48
|
session: AsyncSession,
|
|
57
49
|
) -> None:
|
|
58
|
-
"""Initialize the indexing application service.
|
|
59
|
-
|
|
60
|
-
Args:
|
|
61
|
-
indexing_domain_service: The indexing domain service.
|
|
62
|
-
source_service: The source service for source validation.
|
|
63
|
-
bm25_service: The BM25 domain service for keyword search.
|
|
64
|
-
code_search_service: The code search domain service.
|
|
65
|
-
text_search_service: The text search domain service.
|
|
66
|
-
enrichment_service: The enrichment domain service.
|
|
67
|
-
snippet_application_service: The snippet application service.
|
|
68
|
-
session: The database session for transaction management.
|
|
69
|
-
|
|
70
|
-
"""
|
|
50
|
+
"""Initialize the code indexing application service."""
|
|
71
51
|
self.indexing_domain_service = indexing_domain_service
|
|
52
|
+
self.snippet_domain_service = snippet_domain_service
|
|
72
53
|
self.source_service = source_service
|
|
73
|
-
self.snippet_application_service = snippet_application_service
|
|
74
|
-
self.session = session
|
|
75
|
-
self.log = structlog.get_logger(__name__)
|
|
76
54
|
self.bm25_service = bm25_service
|
|
77
55
|
self.code_search_service = code_search_service
|
|
78
56
|
self.text_search_service = text_search_service
|
|
79
57
|
self.enrichment_service = enrichment_service
|
|
58
|
+
self.session = session
|
|
59
|
+
self.log = structlog.get_logger(__name__)
|
|
80
60
|
|
|
81
61
|
async def create_index(self, source_id: int) -> IndexView:
|
|
82
|
-
"""Create a new index for a source.
|
|
83
|
-
|
|
84
|
-
Args:
|
|
85
|
-
source_id: The ID of the source to create an index for.
|
|
86
|
-
|
|
87
|
-
Returns:
|
|
88
|
-
An IndexView representing the newly created index.
|
|
89
|
-
|
|
90
|
-
Raises:
|
|
91
|
-
ValueError: If the source doesn't exist.
|
|
92
|
-
|
|
93
|
-
"""
|
|
62
|
+
"""Create a new index for a source."""
|
|
94
63
|
log_event("kodit.index.create")
|
|
95
64
|
|
|
96
|
-
#
|
|
65
|
+
# Validate source exists
|
|
97
66
|
source = await self.source_service.get(source_id)
|
|
98
67
|
|
|
99
|
-
# Create
|
|
68
|
+
# Create index
|
|
100
69
|
request = IndexCreateRequest(source_id=source.id)
|
|
101
70
|
index_view = await self.indexing_domain_service.create_index(request)
|
|
102
71
|
|
|
103
|
-
#
|
|
72
|
+
# Single transaction commit
|
|
104
73
|
await self.session.commit()
|
|
105
74
|
|
|
106
75
|
return index_view
|
|
107
76
|
|
|
108
77
|
async def list_indexes(self) -> list[IndexView]:
|
|
109
|
-
"""List all available indexes with their details.
|
|
110
|
-
|
|
111
|
-
Returns:
|
|
112
|
-
A list of IndexView objects containing information about each index.
|
|
113
|
-
|
|
114
|
-
"""
|
|
78
|
+
"""List all available indexes with their details."""
|
|
115
79
|
indexes = await self.indexing_domain_service.list_indexes()
|
|
116
80
|
|
|
117
|
-
#
|
|
81
|
+
# Telemetry
|
|
118
82
|
log_event(
|
|
119
83
|
"kodit.index.list",
|
|
120
84
|
{
|
|
@@ -128,112 +92,184 @@ class IndexingApplicationService:
|
|
|
128
92
|
async def run_index(
|
|
129
93
|
self, index_id: int, progress_callback: ProgressCallback | None = None
|
|
130
94
|
) -> None:
|
|
131
|
-
"""Run the indexing process for a specific index.
|
|
132
|
-
|
|
133
|
-
Args:
|
|
134
|
-
index_id: The ID of the index to run.
|
|
135
|
-
progress_callback: Optional progress callback for reporting progress.
|
|
136
|
-
|
|
137
|
-
Raises:
|
|
138
|
-
ValueError: If the index doesn't exist or no indexable snippets are found.
|
|
139
|
-
|
|
140
|
-
"""
|
|
95
|
+
"""Run the complete indexing process for a specific index."""
|
|
141
96
|
log_event("kodit.index.run")
|
|
142
97
|
|
|
143
|
-
#
|
|
98
|
+
# Validate index
|
|
144
99
|
index = await self.indexing_domain_service.get_index(index_id)
|
|
145
100
|
if not index:
|
|
146
101
|
msg = f"Index not found: {index_id}"
|
|
147
102
|
raise ValueError(msg)
|
|
148
103
|
|
|
149
|
-
# Delete old snippets
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
await self.session.commit()
|
|
104
|
+
# Delete old snippets to make way for reindexing
|
|
105
|
+
# In the future we will only reindex snippets that have changed
|
|
106
|
+
await self.snippet_domain_service.delete_snippets_for_index(index.id)
|
|
153
107
|
|
|
154
|
-
#
|
|
155
|
-
# (snippet_application_service handles its own commits)
|
|
108
|
+
# Extract and create snippets (domain service handles progress)
|
|
156
109
|
self.log.info("Creating snippets for files", index_id=index.id)
|
|
157
|
-
|
|
158
|
-
index_id=index.id,
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
command, progress_callback
|
|
110
|
+
snippets = await self.snippet_domain_service.extract_and_create_snippets(
|
|
111
|
+
index_id=index.id,
|
|
112
|
+
strategy=SnippetExtractionStrategy.METHOD_BASED,
|
|
113
|
+
progress_callback=progress_callback,
|
|
162
114
|
)
|
|
163
115
|
|
|
164
|
-
snippets = await self.indexing_domain_service.get_snippets_for_index(index.id)
|
|
165
|
-
|
|
166
116
|
# Check if any snippets were extracted
|
|
167
117
|
if not snippets:
|
|
168
118
|
msg = f"No indexable snippets found for index {index.id}"
|
|
169
119
|
raise EmptySourceError(msg)
|
|
170
120
|
|
|
121
|
+
# Commit snippets to ensure they have IDs for indexing
|
|
122
|
+
await self.session.commit()
|
|
123
|
+
|
|
171
124
|
# Create BM25 index
|
|
172
125
|
self.log.info("Creating keyword index")
|
|
173
|
-
reporter = Reporter(self.log, progress_callback)
|
|
174
|
-
await reporter.start("bm25_index", len(snippets), "Creating keyword index...")
|
|
175
126
|
await self._create_bm25_index(snippets, progress_callback)
|
|
176
|
-
await reporter.done("bm25_index", "Keyword index created")
|
|
177
127
|
|
|
178
128
|
# Create code embeddings
|
|
179
129
|
self.log.info("Creating semantic code index")
|
|
180
|
-
reporter = Reporter(self.log, progress_callback)
|
|
181
|
-
await reporter.start(
|
|
182
|
-
"code_embeddings", len(snippets), "Creating code embeddings..."
|
|
183
|
-
)
|
|
184
130
|
await self._create_code_embeddings(snippets, progress_callback)
|
|
185
|
-
await reporter.done("code_embeddings")
|
|
186
131
|
|
|
187
132
|
# Enrich snippets
|
|
188
133
|
self.log.info("Enriching snippets", num_snippets=len(snippets))
|
|
189
|
-
reporter = Reporter(self.log, progress_callback)
|
|
190
|
-
await reporter.start("enrichment", len(snippets), "Enriching snippets...")
|
|
191
134
|
await self._enrich_snippets(snippets, progress_callback)
|
|
192
|
-
await reporter.done("enrichment")
|
|
193
135
|
|
|
194
|
-
#
|
|
136
|
+
# Get refreshed snippets after enrichment
|
|
137
|
+
snippets = await self.snippet_domain_service.get_snippets_for_index(index.id)
|
|
138
|
+
|
|
139
|
+
# Create text embeddings (on enriched content)
|
|
195
140
|
self.log.info("Creating semantic text index")
|
|
196
|
-
reporter = Reporter(self.log, progress_callback)
|
|
197
|
-
await reporter.start(
|
|
198
|
-
"text_embeddings", len(snippets), "Creating text embeddings..."
|
|
199
|
-
)
|
|
200
141
|
await self._create_text_embeddings(snippets, progress_callback)
|
|
201
|
-
await reporter.done("text_embeddings")
|
|
202
142
|
|
|
203
143
|
# Update index timestamp
|
|
204
144
|
await self.indexing_domain_service.update_index_timestamp(index.id)
|
|
205
|
-
|
|
145
|
+
|
|
146
|
+
# Single transaction commit for the entire operation
|
|
206
147
|
await self.session.commit()
|
|
207
148
|
|
|
149
|
+
async def search(self, request: MultiSearchRequest) -> list[MultiSearchResult]:
|
|
150
|
+
"""Search for relevant snippets across all indexes."""
|
|
151
|
+
log_event("kodit.index.search")
|
|
152
|
+
|
|
153
|
+
# Apply filters if provided
|
|
154
|
+
filtered_snippet_ids: list[int] | None = None
|
|
155
|
+
if request.filters:
|
|
156
|
+
# Use domain service for filtering
|
|
157
|
+
prefilter_request = replace(request, top_k=None)
|
|
158
|
+
snippet_results = await self.snippet_domain_service.search_snippets(
|
|
159
|
+
prefilter_request
|
|
160
|
+
)
|
|
161
|
+
filtered_snippet_ids = [snippet.id for snippet in snippet_results]
|
|
162
|
+
|
|
163
|
+
# Gather results from different search modes
|
|
164
|
+
fusion_list: list[list[FusionRequest]] = []
|
|
165
|
+
|
|
166
|
+
# Keyword search
|
|
167
|
+
if request.keywords:
|
|
168
|
+
result_ids: list[SearchResult] = []
|
|
169
|
+
for keyword in request.keywords:
|
|
170
|
+
results = await self.bm25_service.search(
|
|
171
|
+
SearchRequest(
|
|
172
|
+
query=keyword,
|
|
173
|
+
top_k=request.top_k,
|
|
174
|
+
snippet_ids=filtered_snippet_ids,
|
|
175
|
+
)
|
|
176
|
+
)
|
|
177
|
+
result_ids.extend(results)
|
|
178
|
+
|
|
179
|
+
fusion_list.append(
|
|
180
|
+
[FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Semantic code search
|
|
184
|
+
if request.code_query:
|
|
185
|
+
query_results = await self.code_search_service.search(
|
|
186
|
+
SearchRequest(
|
|
187
|
+
query=request.code_query,
|
|
188
|
+
top_k=request.top_k,
|
|
189
|
+
snippet_ids=filtered_snippet_ids,
|
|
190
|
+
)
|
|
191
|
+
)
|
|
192
|
+
fusion_list.append(
|
|
193
|
+
[FusionRequest(id=x.snippet_id, score=x.score) for x in query_results]
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
# Semantic text search
|
|
197
|
+
if request.text_query:
|
|
198
|
+
query_results = await self.text_search_service.search(
|
|
199
|
+
SearchRequest(
|
|
200
|
+
query=request.text_query,
|
|
201
|
+
top_k=request.top_k,
|
|
202
|
+
snippet_ids=filtered_snippet_ids,
|
|
203
|
+
)
|
|
204
|
+
)
|
|
205
|
+
fusion_list.append(
|
|
206
|
+
[FusionRequest(id=x.snippet_id, score=x.score) for x in query_results]
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
if len(fusion_list) == 0:
|
|
210
|
+
return []
|
|
211
|
+
|
|
212
|
+
# Fusion ranking
|
|
213
|
+
final_results = self.indexing_domain_service.perform_fusion(
|
|
214
|
+
rankings=fusion_list,
|
|
215
|
+
k=60, # This is a parameter in the RRF algorithm, not top_k
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
# Keep only top_k results
|
|
219
|
+
final_results = final_results[: request.top_k]
|
|
220
|
+
|
|
221
|
+
# Get snippet details
|
|
222
|
+
search_results = await self.indexing_domain_service.get_snippets_by_ids(
|
|
223
|
+
[x.id for x in final_results]
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
return [
|
|
227
|
+
MultiSearchResult(
|
|
228
|
+
id=snippet["id"],
|
|
229
|
+
uri=file["uri"],
|
|
230
|
+
content=snippet["content"],
|
|
231
|
+
original_scores=fr.original_scores,
|
|
232
|
+
)
|
|
233
|
+
for (file, snippet), fr in zip(search_results, final_results, strict=True)
|
|
234
|
+
]
|
|
235
|
+
|
|
236
|
+
async def list_snippets(
|
|
237
|
+
self, file_path: str | None = None, source_uri: str | None = None
|
|
238
|
+
) -> list[SnippetListItem]:
|
|
239
|
+
"""List snippets with optional filtering."""
|
|
240
|
+
log_event("kodit.index.list_snippets")
|
|
241
|
+
return await self.snippet_domain_service.list_snippets(file_path, source_uri)
|
|
242
|
+
|
|
208
243
|
async def _create_bm25_index(
|
|
209
244
|
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
210
245
|
) -> None:
|
|
211
|
-
"""Create BM25 keyword index."""
|
|
212
246
|
reporter = Reporter(self.log, progress_callback)
|
|
213
247
|
await reporter.start("bm25_index", len(snippets), "Creating keyword index...")
|
|
248
|
+
|
|
214
249
|
await self.bm25_service.index_documents(
|
|
215
|
-
|
|
250
|
+
IndexRequest(
|
|
216
251
|
documents=[
|
|
217
|
-
|
|
252
|
+
Document(snippet_id=snippet.id, text=snippet.content)
|
|
218
253
|
for snippet in snippets
|
|
219
254
|
]
|
|
220
255
|
)
|
|
221
256
|
)
|
|
257
|
+
|
|
222
258
|
await reporter.done("bm25_index", "Keyword index created")
|
|
223
259
|
|
|
224
260
|
async def _create_code_embeddings(
|
|
225
261
|
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
226
262
|
) -> None:
|
|
227
|
-
"""Create code embeddings."""
|
|
228
263
|
reporter = Reporter(self.log, progress_callback)
|
|
229
264
|
await reporter.start(
|
|
230
265
|
"code_embeddings", len(snippets), "Creating code embeddings..."
|
|
231
266
|
)
|
|
267
|
+
|
|
232
268
|
processed = 0
|
|
233
269
|
async for result in self.code_search_service.index_documents(
|
|
234
|
-
|
|
270
|
+
IndexRequest(
|
|
235
271
|
documents=[
|
|
236
|
-
|
|
272
|
+
Document(snippet_id=snippet.id, text=snippet.content)
|
|
237
273
|
for snippet in snippets
|
|
238
274
|
]
|
|
239
275
|
)
|
|
@@ -245,15 +281,15 @@ class IndexingApplicationService:
|
|
|
245
281
|
len(snippets),
|
|
246
282
|
"Creating code embeddings...",
|
|
247
283
|
)
|
|
284
|
+
|
|
248
285
|
await reporter.done("code_embeddings")
|
|
249
286
|
|
|
250
287
|
async def _enrich_snippets(
|
|
251
288
|
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
252
289
|
) -> None:
|
|
253
|
-
"""Enrich snippets with additional context."""
|
|
254
290
|
reporter = Reporter(self.log, progress_callback)
|
|
255
291
|
await reporter.start("enrichment", len(snippets), "Enriching snippets...")
|
|
256
|
-
|
|
292
|
+
|
|
257
293
|
enrichment_request = EnrichmentIndexRequest(
|
|
258
294
|
requests=[
|
|
259
295
|
EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
|
|
@@ -265,44 +301,38 @@ class IndexingApplicationService:
|
|
|
265
301
|
async for result in self.enrichment_service.enrich_documents(
|
|
266
302
|
enrichment_request
|
|
267
303
|
):
|
|
268
|
-
#
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
)
|
|
280
|
-
enriched_contents.append(result)
|
|
304
|
+
# Update snippet content through domain service
|
|
305
|
+
enriched_content = (
|
|
306
|
+
result.text
|
|
307
|
+
+ "\n\n```\n"
|
|
308
|
+
+ next(s.content for s in snippets if s.id == result.snippet_id)
|
|
309
|
+
+ "\n```"
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
await self.snippet_domain_service.update_snippet_content(
|
|
313
|
+
result.snippet_id, enriched_content
|
|
314
|
+
)
|
|
281
315
|
|
|
282
316
|
processed += 1
|
|
283
317
|
await reporter.step(
|
|
284
318
|
"enrichment", processed, len(snippets), "Enriching snippets..."
|
|
285
319
|
)
|
|
286
320
|
|
|
287
|
-
# Commit all snippet content updates as a single transaction
|
|
288
|
-
if enriched_contents:
|
|
289
|
-
await self.session.commit()
|
|
290
|
-
|
|
291
321
|
await reporter.done("enrichment")
|
|
292
322
|
|
|
293
323
|
async def _create_text_embeddings(
|
|
294
324
|
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
295
325
|
) -> None:
|
|
296
|
-
"""Create text embeddings."""
|
|
297
326
|
reporter = Reporter(self.log, progress_callback)
|
|
298
327
|
await reporter.start(
|
|
299
328
|
"text_embeddings", len(snippets), "Creating text embeddings..."
|
|
300
329
|
)
|
|
330
|
+
|
|
301
331
|
processed = 0
|
|
302
332
|
async for result in self.text_search_service.index_documents(
|
|
303
|
-
|
|
333
|
+
IndexRequest(
|
|
304
334
|
documents=[
|
|
305
|
-
|
|
335
|
+
Document(snippet_id=snippet.id, text=snippet.content)
|
|
306
336
|
for snippet in snippets
|
|
307
337
|
]
|
|
308
338
|
)
|
|
@@ -314,74 +344,5 @@ class IndexingApplicationService:
|
|
|
314
344
|
len(snippets),
|
|
315
345
|
"Creating text embeddings...",
|
|
316
346
|
)
|
|
317
|
-
await reporter.done("text_embeddings")
|
|
318
|
-
|
|
319
|
-
async def search(self, request: MultiSearchRequest) -> list[MultiSearchResult]:
|
|
320
|
-
"""Search for relevant data.
|
|
321
|
-
|
|
322
|
-
Args:
|
|
323
|
-
request: The search request.
|
|
324
347
|
|
|
325
|
-
|
|
326
|
-
A list of search results.
|
|
327
|
-
|
|
328
|
-
"""
|
|
329
|
-
log_event("kodit.index.search")
|
|
330
|
-
|
|
331
|
-
fusion_list: list[list[FusionRequest]] = []
|
|
332
|
-
if request.keywords:
|
|
333
|
-
# Gather results for each keyword
|
|
334
|
-
result_ids: list[BM25SearchResult] = []
|
|
335
|
-
for keyword in request.keywords:
|
|
336
|
-
results = await self.bm25_service.search(
|
|
337
|
-
BM25SearchRequest(query=keyword, top_k=request.top_k)
|
|
338
|
-
)
|
|
339
|
-
result_ids.extend(results)
|
|
340
|
-
|
|
341
|
-
fusion_list.append(
|
|
342
|
-
[FusionRequest(id=x.snippet_id, score=x.score) for x in result_ids]
|
|
343
|
-
)
|
|
344
|
-
|
|
345
|
-
# Compute embedding for semantic query
|
|
346
|
-
if request.code_query:
|
|
347
|
-
query_embedding = await self.code_search_service.search(
|
|
348
|
-
VectorSearchQueryRequest(query=request.code_query, top_k=request.top_k)
|
|
349
|
-
)
|
|
350
|
-
fusion_list.append(
|
|
351
|
-
[FusionRequest(id=x.snippet_id, score=x.score) for x in query_embedding]
|
|
352
|
-
)
|
|
353
|
-
|
|
354
|
-
if request.text_query:
|
|
355
|
-
query_embedding = await self.text_search_service.search(
|
|
356
|
-
VectorSearchQueryRequest(query=request.text_query, top_k=request.top_k)
|
|
357
|
-
)
|
|
358
|
-
fusion_list.append(
|
|
359
|
-
[FusionRequest(id=x.snippet_id, score=x.score) for x in query_embedding]
|
|
360
|
-
)
|
|
361
|
-
|
|
362
|
-
if len(fusion_list) == 0:
|
|
363
|
-
return []
|
|
364
|
-
|
|
365
|
-
# Combine all results together with RFF if required
|
|
366
|
-
final_results = self.indexing_domain_service.perform_fusion(
|
|
367
|
-
rankings=fusion_list,
|
|
368
|
-
k=60,
|
|
369
|
-
)
|
|
370
|
-
|
|
371
|
-
# Only keep top_k results
|
|
372
|
-
final_results = final_results[: request.top_k]
|
|
373
|
-
|
|
374
|
-
# Get snippets from database (up to top_k)
|
|
375
|
-
search_results = await self.indexing_domain_service.get_snippets_by_ids(
|
|
376
|
-
[x.id for x in final_results]
|
|
377
|
-
)
|
|
378
|
-
|
|
379
|
-
return [
|
|
380
|
-
MultiSearchResult(
|
|
381
|
-
id=snippet["id"],
|
|
382
|
-
uri=file["uri"],
|
|
383
|
-
content=snippet["content"],
|
|
384
|
-
original_scores=fr.original_scores,
|
|
385
|
-
)
|
|
386
|
-
for (file, snippet), fr in zip(search_results, final_results, strict=True)
|
|
387
|
-
]
|
|
348
|
+
await reporter.done("text_embeddings")
|