kodit 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/factories/code_indexing_factory.py +77 -28
- kodit/application/services/code_indexing_application_service.py +148 -119
- kodit/cli.py +49 -52
- kodit/domain/entities.py +268 -189
- kodit/domain/protocols.py +61 -0
- kodit/domain/services/embedding_service.py +1 -1
- kodit/domain/services/index_query_service.py +66 -0
- kodit/domain/services/index_service.py +323 -0
- kodit/domain/value_objects.py +225 -92
- kodit/infrastructure/cloning/git/working_copy.py +17 -8
- kodit/infrastructure/cloning/metadata.py +37 -67
- kodit/infrastructure/embedding/embedding_factory.py +1 -1
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
- kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
- kodit/infrastructure/git/git_utils.py +1 -63
- kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
- kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/__init__.py +1 -0
- kodit/infrastructure/mappers/index_mapper.py +344 -0
- kodit/infrastructure/snippet_extraction/factories.py +13 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +1 -1
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -1
- kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +1 -1
- kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/entities.py +203 -0
- kodit/infrastructure/sqlalchemy/file_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/index_repository.py +550 -0
- kodit/log.py +4 -1
- kodit/mcp.py +1 -13
- kodit/migrations/env.py +1 -1
- kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +34 -0
- kodit/migrations/versions/4552eb3f23ce_add_summary.py +34 -0
- kodit/utils/__init__.py +1 -0
- kodit/utils/path_utils.py +54 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/METADATA +1 -1
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/RECORD +42 -45
- kodit/domain/enums.py +0 -9
- kodit/domain/repositories.py +0 -128
- kodit/domain/services/ignore_service.py +0 -45
- kodit/domain/services/indexing_service.py +0 -204
- kodit/domain/services/snippet_extraction_service.py +0 -89
- kodit/domain/services/snippet_service.py +0 -211
- kodit/domain/services/source_service.py +0 -85
- kodit/infrastructure/cloning/folder/__init__.py +0 -1
- kodit/infrastructure/cloning/folder/factory.py +0 -128
- kodit/infrastructure/cloning/folder/working_copy.py +0 -38
- kodit/infrastructure/cloning/git/factory.py +0 -153
- kodit/infrastructure/indexing/index_repository.py +0 -273
- kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
- kodit/infrastructure/sqlalchemy/repository.py +0 -133
- kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -251
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/WHEEL +0 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/entry_points.txt +0 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/licenses/LICENSE +0 -0
kodit/_version.py
CHANGED
|
@@ -6,17 +6,20 @@ from kodit.application.services.code_indexing_application_service import (
|
|
|
6
6
|
CodeIndexingApplicationService,
|
|
7
7
|
)
|
|
8
8
|
from kodit.config import AppContext
|
|
9
|
-
from kodit.domain.entities import EmbeddingType
|
|
10
9
|
from kodit.domain.services.bm25_service import BM25DomainService
|
|
11
10
|
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
12
11
|
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
13
|
-
from kodit.domain.services.
|
|
12
|
+
from kodit.domain.services.index_query_service import IndexQueryService
|
|
13
|
+
from kodit.domain.services.index_service import (
|
|
14
|
+
IndexDomainService,
|
|
15
|
+
)
|
|
16
|
+
from kodit.domain.value_objects import LanguageMapping, SnippetExtractionStrategy
|
|
14
17
|
from kodit.infrastructure.bm25.bm25_factory import bm25_repository_factory
|
|
15
18
|
from kodit.infrastructure.embedding.embedding_factory import (
|
|
16
19
|
embedding_domain_service_factory,
|
|
17
20
|
)
|
|
18
|
-
from kodit.infrastructure.embedding.embedding_providers import (
|
|
19
|
-
|
|
21
|
+
from kodit.infrastructure.embedding.embedding_providers.hash_embedding_provider import (
|
|
22
|
+
HashEmbeddingProvider,
|
|
20
23
|
)
|
|
21
24
|
from kodit.infrastructure.embedding.local_vector_search_repository import (
|
|
22
25
|
LocalVectorSearchRepository,
|
|
@@ -27,36 +30,63 @@ from kodit.infrastructure.enrichment.enrichment_factory import (
|
|
|
27
30
|
from kodit.infrastructure.enrichment.null_enrichment_provider import (
|
|
28
31
|
NullEnrichmentProvider,
|
|
29
32
|
)
|
|
30
|
-
from kodit.infrastructure.indexing.
|
|
31
|
-
|
|
33
|
+
from kodit.infrastructure.indexing.fusion_service import ReciprocalRankFusionService
|
|
34
|
+
from kodit.infrastructure.snippet_extraction.factories import (
|
|
35
|
+
create_snippet_query_provider,
|
|
36
|
+
)
|
|
37
|
+
from kodit.infrastructure.snippet_extraction.language_detection_service import (
|
|
38
|
+
FileSystemLanguageDetectionService,
|
|
32
39
|
)
|
|
33
|
-
from kodit.infrastructure.
|
|
34
|
-
|
|
40
|
+
from kodit.infrastructure.snippet_extraction.tree_sitter_snippet_extractor import (
|
|
41
|
+
TreeSitterSnippetExtractor,
|
|
35
42
|
)
|
|
36
43
|
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
37
44
|
SqlAlchemyEmbeddingRepository,
|
|
38
45
|
)
|
|
46
|
+
from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
|
|
47
|
+
from kodit.infrastructure.sqlalchemy.index_repository import SqlAlchemyIndexRepository
|
|
39
48
|
|
|
40
49
|
|
|
41
50
|
def create_code_indexing_application_service(
|
|
42
51
|
app_context: AppContext,
|
|
43
52
|
session: AsyncSession,
|
|
44
|
-
source_service: SourceService,
|
|
45
53
|
) -> CodeIndexingApplicationService:
|
|
46
54
|
"""Create a unified code indexing application service with all dependencies."""
|
|
47
55
|
# Create domain services
|
|
48
|
-
indexing_domain_service = indexing_domain_service_factory(session)
|
|
49
|
-
snippet_domain_service = snippet_domain_service_factory(session)
|
|
50
56
|
bm25_service = BM25DomainService(bm25_repository_factory(app_context, session))
|
|
51
57
|
code_search_service = embedding_domain_service_factory("code", app_context, session)
|
|
52
58
|
text_search_service = embedding_domain_service_factory("text", app_context, session)
|
|
53
59
|
enrichment_service = enrichment_domain_service_factory(app_context)
|
|
60
|
+
index_repository = SqlAlchemyIndexRepository(session=session)
|
|
61
|
+
# Use the unified language mapping from the domain layer
|
|
62
|
+
language_map = LanguageMapping.get_extension_to_language_map()
|
|
63
|
+
|
|
64
|
+
# Create infrastructure services
|
|
65
|
+
language_detector = FileSystemLanguageDetectionService(language_map)
|
|
66
|
+
query_provider = create_snippet_query_provider()
|
|
67
|
+
|
|
68
|
+
# Create snippet extractors
|
|
69
|
+
method_extractor = TreeSitterSnippetExtractor(query_provider)
|
|
70
|
+
|
|
71
|
+
snippet_extractors = {
|
|
72
|
+
SnippetExtractionStrategy.METHOD_BASED: method_extractor,
|
|
73
|
+
}
|
|
74
|
+
index_domain_service = IndexDomainService(
|
|
75
|
+
language_detector=language_detector,
|
|
76
|
+
snippet_extractors=snippet_extractors,
|
|
77
|
+
enrichment_service=enrichment_service,
|
|
78
|
+
clone_dir=app_context.get_clone_dir(),
|
|
79
|
+
)
|
|
80
|
+
index_query_service = IndexQueryService(
|
|
81
|
+
index_repository=index_repository,
|
|
82
|
+
fusion_service=ReciprocalRankFusionService(),
|
|
83
|
+
)
|
|
54
84
|
|
|
55
85
|
# Create and return the unified application service
|
|
56
86
|
return CodeIndexingApplicationService(
|
|
57
|
-
indexing_domain_service=
|
|
58
|
-
|
|
59
|
-
|
|
87
|
+
indexing_domain_service=index_domain_service,
|
|
88
|
+
index_repository=index_repository,
|
|
89
|
+
index_query_service=index_query_service,
|
|
60
90
|
bm25_service=bm25_service,
|
|
61
91
|
code_search_service=code_search_service,
|
|
62
92
|
text_search_service=text_search_service,
|
|
@@ -68,36 +98,30 @@ def create_code_indexing_application_service(
|
|
|
68
98
|
def create_fast_test_code_indexing_application_service(
|
|
69
99
|
app_context: AppContext,
|
|
70
100
|
session: AsyncSession,
|
|
71
|
-
source_service: SourceService,
|
|
72
101
|
) -> CodeIndexingApplicationService:
|
|
73
|
-
"""Create a fast test
|
|
102
|
+
"""Create a fast test code indexing application service."""
|
|
74
103
|
# Create domain services
|
|
75
|
-
indexing_domain_service = indexing_domain_service_factory(session)
|
|
76
|
-
snippet_domain_service = snippet_domain_service_factory(session)
|
|
77
104
|
bm25_service = BM25DomainService(bm25_repository_factory(app_context, session))
|
|
78
|
-
|
|
79
|
-
# Create fast embedding services using HashEmbeddingProvider
|
|
80
105
|
embedding_repository = SqlAlchemyEmbeddingRepository(session=session)
|
|
81
106
|
|
|
82
|
-
# Fast code search service
|
|
83
107
|
code_search_repository = LocalVectorSearchRepository(
|
|
84
108
|
embedding_repository=embedding_repository,
|
|
85
|
-
embedding_provider=
|
|
109
|
+
embedding_provider=HashEmbeddingProvider(),
|
|
86
110
|
embedding_type=EmbeddingType.CODE,
|
|
87
111
|
)
|
|
88
112
|
code_search_service = EmbeddingDomainService(
|
|
89
|
-
embedding_provider=
|
|
113
|
+
embedding_provider=HashEmbeddingProvider(),
|
|
90
114
|
vector_search_repository=code_search_repository,
|
|
91
115
|
)
|
|
92
116
|
|
|
93
117
|
# Fast text search service
|
|
94
118
|
text_search_repository = LocalVectorSearchRepository(
|
|
95
119
|
embedding_repository=embedding_repository,
|
|
96
|
-
embedding_provider=
|
|
120
|
+
embedding_provider=HashEmbeddingProvider(),
|
|
97
121
|
embedding_type=EmbeddingType.TEXT,
|
|
98
122
|
)
|
|
99
123
|
text_search_service = EmbeddingDomainService(
|
|
100
|
-
embedding_provider=
|
|
124
|
+
embedding_provider=HashEmbeddingProvider(),
|
|
101
125
|
vector_search_repository=text_search_repository,
|
|
102
126
|
)
|
|
103
127
|
|
|
@@ -106,11 +130,36 @@ def create_fast_test_code_indexing_application_service(
|
|
|
106
130
|
enrichment_provider=NullEnrichmentProvider()
|
|
107
131
|
)
|
|
108
132
|
|
|
133
|
+
index_repository = SqlAlchemyIndexRepository(session=session)
|
|
134
|
+
# Use the unified language mapping from the domain layer
|
|
135
|
+
language_map = LanguageMapping.get_extension_to_language_map()
|
|
136
|
+
|
|
137
|
+
# Create infrastructure services
|
|
138
|
+
language_detector = FileSystemLanguageDetectionService(language_map)
|
|
139
|
+
query_provider = create_snippet_query_provider()
|
|
140
|
+
|
|
141
|
+
# Create snippet extractors
|
|
142
|
+
method_extractor = TreeSitterSnippetExtractor(query_provider)
|
|
143
|
+
|
|
144
|
+
snippet_extractors = {
|
|
145
|
+
SnippetExtractionStrategy.METHOD_BASED: method_extractor,
|
|
146
|
+
}
|
|
147
|
+
index_domain_service = IndexDomainService(
|
|
148
|
+
language_detector=language_detector,
|
|
149
|
+
snippet_extractors=snippet_extractors,
|
|
150
|
+
enrichment_service=enrichment_service,
|
|
151
|
+
clone_dir=app_context.get_clone_dir(),
|
|
152
|
+
)
|
|
153
|
+
index_query_service = IndexQueryService(
|
|
154
|
+
index_repository=index_repository,
|
|
155
|
+
fusion_service=ReciprocalRankFusionService(),
|
|
156
|
+
)
|
|
157
|
+
|
|
109
158
|
# Create and return the unified application service
|
|
110
159
|
return CodeIndexingApplicationService(
|
|
111
|
-
indexing_domain_service=
|
|
112
|
-
|
|
113
|
-
|
|
160
|
+
indexing_domain_service=index_domain_service,
|
|
161
|
+
index_repository=index_repository,
|
|
162
|
+
index_query_service=index_query_service,
|
|
114
163
|
bm25_service=bm25_service,
|
|
115
164
|
code_search_service=code_search_service,
|
|
116
165
|
text_search_service=text_search_service,
|
|
@@ -1,32 +1,28 @@
|
|
|
1
1
|
"""Unified application service for code indexing operations."""
|
|
2
2
|
|
|
3
3
|
from dataclasses import replace
|
|
4
|
+
from datetime import UTC, datetime
|
|
4
5
|
|
|
5
6
|
import structlog
|
|
6
7
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
7
8
|
|
|
8
|
-
from kodit.domain.entities import Snippet
|
|
9
|
-
from kodit.domain.enums import SnippetExtractionStrategy
|
|
10
|
-
from kodit.domain.errors import EmptySourceError
|
|
9
|
+
from kodit.domain.entities import Index, Snippet
|
|
11
10
|
from kodit.domain.interfaces import ProgressCallback
|
|
11
|
+
from kodit.domain.protocols import IndexRepository
|
|
12
12
|
from kodit.domain.services.bm25_service import BM25DomainService
|
|
13
13
|
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
14
14
|
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
15
|
-
from kodit.domain.services.
|
|
16
|
-
from kodit.domain.services.
|
|
17
|
-
from kodit.domain.services.source_service import SourceService
|
|
15
|
+
from kodit.domain.services.index_query_service import IndexQueryService
|
|
16
|
+
from kodit.domain.services.index_service import IndexDomainService
|
|
18
17
|
from kodit.domain.value_objects import (
|
|
19
18
|
Document,
|
|
20
|
-
EnrichmentIndexRequest,
|
|
21
|
-
EnrichmentRequest,
|
|
22
19
|
FusionRequest,
|
|
23
|
-
IndexCreateRequest,
|
|
24
20
|
IndexRequest,
|
|
25
|
-
IndexView,
|
|
26
21
|
MultiSearchRequest,
|
|
27
22
|
MultiSearchResult,
|
|
28
23
|
SearchRequest,
|
|
29
24
|
SearchResult,
|
|
25
|
+
SnippetSearchFilters,
|
|
30
26
|
)
|
|
31
27
|
from kodit.log import log_event
|
|
32
28
|
from kodit.reporting import Reporter
|
|
@@ -37,9 +33,9 @@ class CodeIndexingApplicationService:
|
|
|
37
33
|
|
|
38
34
|
def __init__( # noqa: PLR0913
|
|
39
35
|
self,
|
|
40
|
-
indexing_domain_service:
|
|
41
|
-
|
|
42
|
-
|
|
36
|
+
indexing_domain_service: IndexDomainService,
|
|
37
|
+
index_repository: IndexRepository,
|
|
38
|
+
index_query_service: IndexQueryService,
|
|
43
39
|
bm25_service: BM25DomainService,
|
|
44
40
|
code_search_service: EmbeddingDomainService,
|
|
45
41
|
text_search_service: EmbeddingDomainService,
|
|
@@ -47,9 +43,9 @@ class CodeIndexingApplicationService:
|
|
|
47
43
|
session: AsyncSession,
|
|
48
44
|
) -> None:
|
|
49
45
|
"""Initialize the code indexing application service."""
|
|
50
|
-
self.
|
|
51
|
-
self.
|
|
52
|
-
self.
|
|
46
|
+
self.index_domain_service = indexing_domain_service
|
|
47
|
+
self.index_repository = index_repository
|
|
48
|
+
self.index_query_service = index_query_service
|
|
53
49
|
self.bm25_service = bm25_service
|
|
54
50
|
self.code_search_service = code_search_service
|
|
55
51
|
self.text_search_service = text_search_service
|
|
@@ -57,90 +53,96 @@ class CodeIndexingApplicationService:
|
|
|
57
53
|
self.session = session
|
|
58
54
|
self.log = structlog.get_logger(__name__)
|
|
59
55
|
|
|
60
|
-
async def
|
|
56
|
+
async def create_index_from_uri(
|
|
57
|
+
self, uri: str, progress_callback: ProgressCallback | None = None
|
|
58
|
+
) -> Index:
|
|
61
59
|
"""Create a new index for a source."""
|
|
62
60
|
log_event("kodit.index.create")
|
|
63
61
|
|
|
64
|
-
#
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
return index_view
|
|
75
|
-
|
|
76
|
-
async def list_indexes(self) -> list[IndexView]:
|
|
77
|
-
"""List all available indexes with their details."""
|
|
78
|
-
indexes = await self.indexing_domain_service.list_indexes()
|
|
62
|
+
# Check if index already exists
|
|
63
|
+
sanitized_uri, _ = self.index_domain_service.sanitize_uri(uri)
|
|
64
|
+
existing_index = await self.index_repository.get_by_uri(sanitized_uri)
|
|
65
|
+
if existing_index:
|
|
66
|
+
self.log.debug(
|
|
67
|
+
"Index already exists",
|
|
68
|
+
uri=str(sanitized_uri),
|
|
69
|
+
index_id=existing_index.id,
|
|
70
|
+
)
|
|
71
|
+
return existing_index
|
|
79
72
|
|
|
80
|
-
#
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
{
|
|
84
|
-
"num_indexes": len(indexes),
|
|
85
|
-
"num_snippets": sum([index.num_snippets for index in indexes]),
|
|
86
|
-
},
|
|
73
|
+
# Only prepare working copy if we need to create a new index
|
|
74
|
+
working_copy = await self.index_domain_service.prepare_index(
|
|
75
|
+
uri, progress_callback
|
|
87
76
|
)
|
|
88
77
|
|
|
89
|
-
|
|
78
|
+
# Create new index
|
|
79
|
+
index = await self.index_repository.create(sanitized_uri, working_copy)
|
|
80
|
+
await self.session.commit()
|
|
81
|
+
return index
|
|
90
82
|
|
|
91
83
|
async def run_index(
|
|
92
|
-
self,
|
|
84
|
+
self, index: Index, progress_callback: ProgressCallback | None = None
|
|
93
85
|
) -> None:
|
|
94
86
|
"""Run the complete indexing process for a specific index."""
|
|
95
87
|
log_event("kodit.index.run")
|
|
96
88
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
if not index:
|
|
100
|
-
msg = f"Index not found: {index_id}"
|
|
89
|
+
if not index or not index.id:
|
|
90
|
+
msg = f"Index has no ID: {index}"
|
|
101
91
|
raise ValueError(msg)
|
|
102
92
|
|
|
103
|
-
#
|
|
104
|
-
|
|
105
|
-
|
|
93
|
+
# Refresh working copy
|
|
94
|
+
index.source.working_copy = (
|
|
95
|
+
await self.index_domain_service.refresh_working_copy(
|
|
96
|
+
index.source.working_copy
|
|
97
|
+
)
|
|
98
|
+
)
|
|
99
|
+
if len(index.source.working_copy.changed_files()) == 0:
|
|
100
|
+
self.log.info("No new changes to index", index_id=index.id)
|
|
101
|
+
return
|
|
106
102
|
|
|
107
103
|
# Extract and create snippets (domain service handles progress)
|
|
108
104
|
self.log.info("Creating snippets for files", index_id=index.id)
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
strategy=SnippetExtractionStrategy.METHOD_BASED,
|
|
112
|
-
progress_callback=progress_callback,
|
|
105
|
+
index = await self.index_domain_service.extract_snippets_from_index(
|
|
106
|
+
index=index, progress_callback=progress_callback
|
|
113
107
|
)
|
|
114
108
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
msg = f"No indexable snippets found for index {index.id}"
|
|
118
|
-
raise EmptySourceError(msg)
|
|
109
|
+
await self.index_repository.update(index)
|
|
110
|
+
await self.session.flush()
|
|
119
111
|
|
|
120
|
-
#
|
|
121
|
-
await self.
|
|
112
|
+
# Refresh index to get snippets with IDs, required as a ref for subsequent steps
|
|
113
|
+
flushed_index = await self.index_repository.get(index.id)
|
|
114
|
+
if not flushed_index:
|
|
115
|
+
msg = f"Index {index.id} not found after snippet extraction"
|
|
116
|
+
raise ValueError(msg)
|
|
117
|
+
index = flushed_index
|
|
122
118
|
|
|
123
119
|
# Create BM25 index
|
|
124
120
|
self.log.info("Creating keyword index")
|
|
125
|
-
await self._create_bm25_index(snippets, progress_callback)
|
|
121
|
+
await self._create_bm25_index(index.snippets, progress_callback)
|
|
126
122
|
|
|
127
123
|
# Create code embeddings
|
|
128
124
|
self.log.info("Creating semantic code index")
|
|
129
|
-
await self._create_code_embeddings(snippets, progress_callback)
|
|
125
|
+
await self._create_code_embeddings(index.snippets, progress_callback)
|
|
130
126
|
|
|
131
127
|
# Enrich snippets
|
|
132
|
-
self.log.info("Enriching snippets", num_snippets=len(snippets))
|
|
133
|
-
await self.
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
snippets
|
|
128
|
+
self.log.info("Enriching snippets", num_snippets=len(index.snippets))
|
|
129
|
+
enriched_snippets = await self.index_domain_service.enrich_snippets_in_index(
|
|
130
|
+
snippets=index.snippets, progress_callback=progress_callback
|
|
131
|
+
)
|
|
132
|
+
# Update snippets in repository
|
|
133
|
+
await self.index_repository.update_snippets(index.id, enriched_snippets)
|
|
137
134
|
|
|
138
135
|
# Create text embeddings (on enriched content)
|
|
139
136
|
self.log.info("Creating semantic text index")
|
|
140
|
-
await self._create_text_embeddings(
|
|
137
|
+
await self._create_text_embeddings(enriched_snippets, progress_callback)
|
|
141
138
|
|
|
142
139
|
# Update index timestamp
|
|
143
|
-
await self.
|
|
140
|
+
await self.index_repository.update_index_timestamp(index.id)
|
|
141
|
+
|
|
142
|
+
# Now that all file dependencies have been captured, enact the file processing
|
|
143
|
+
# statuses
|
|
144
|
+
index.source.working_copy.clear_file_processing_statuses()
|
|
145
|
+
await self.index_repository.update(index)
|
|
144
146
|
|
|
145
147
|
# Single transaction commit for the entire operation
|
|
146
148
|
await self.session.commit()
|
|
@@ -154,10 +156,12 @@ class CodeIndexingApplicationService:
|
|
|
154
156
|
if request.filters:
|
|
155
157
|
# Use domain service for filtering
|
|
156
158
|
prefilter_request = replace(request, top_k=None)
|
|
157
|
-
snippet_results = await self.
|
|
159
|
+
snippet_results = await self.index_query_service.search_snippets(
|
|
158
160
|
prefilter_request
|
|
159
161
|
)
|
|
160
|
-
filtered_snippet_ids = [
|
|
162
|
+
filtered_snippet_ids = [
|
|
163
|
+
snippet.snippet.id for snippet in snippet_results if snippet.snippet.id
|
|
164
|
+
]
|
|
161
165
|
|
|
162
166
|
# Gather results from different search modes
|
|
163
167
|
fusion_list: list[list[FusionRequest]] = []
|
|
@@ -209,7 +213,7 @@ class CodeIndexingApplicationService:
|
|
|
209
213
|
return []
|
|
210
214
|
|
|
211
215
|
# Fusion ranking
|
|
212
|
-
final_results = self.
|
|
216
|
+
final_results = await self.index_query_service.perform_fusion(
|
|
213
217
|
rankings=fusion_list,
|
|
214
218
|
k=60, # This is a parameter in the RRF algorithm, not top_k
|
|
215
219
|
)
|
|
@@ -218,16 +222,29 @@ class CodeIndexingApplicationService:
|
|
|
218
222
|
final_results = final_results[: request.top_k]
|
|
219
223
|
|
|
220
224
|
# Get snippet details
|
|
221
|
-
search_results = await self.
|
|
225
|
+
search_results = await self.index_query_service.get_snippets_by_ids(
|
|
222
226
|
[x.id for x in final_results]
|
|
223
227
|
)
|
|
224
228
|
|
|
225
229
|
return [
|
|
226
230
|
MultiSearchResult(
|
|
227
|
-
id=result.snippet.id,
|
|
228
|
-
|
|
229
|
-
content=result.snippet.content,
|
|
231
|
+
id=result.snippet.id or 0,
|
|
232
|
+
content=result.snippet.original_text(),
|
|
230
233
|
original_scores=fr.original_scores,
|
|
234
|
+
# Enhanced fields
|
|
235
|
+
source_uri=str(result.source.working_copy.remote_uri),
|
|
236
|
+
relative_path=str(
|
|
237
|
+
result.file.as_path().relative_to(
|
|
238
|
+
result.source.working_copy.cloned_path
|
|
239
|
+
)
|
|
240
|
+
),
|
|
241
|
+
language=MultiSearchResult.detect_language_from_extension(
|
|
242
|
+
result.file.extension()
|
|
243
|
+
),
|
|
244
|
+
authors=[author.name for author in result.authors],
|
|
245
|
+
created_at=result.snippet.created_at or datetime.now(UTC),
|
|
246
|
+
# Summary from snippet entity
|
|
247
|
+
summary=result.snippet.summary_text(),
|
|
231
248
|
)
|
|
232
249
|
for result, fr in zip(search_results, final_results, strict=True)
|
|
233
250
|
]
|
|
@@ -237,19 +254,53 @@ class CodeIndexingApplicationService:
|
|
|
237
254
|
) -> list[MultiSearchResult]:
|
|
238
255
|
"""List snippets with optional filtering."""
|
|
239
256
|
log_event("kodit.index.list_snippets")
|
|
240
|
-
|
|
257
|
+
snippet_results = await self.index_query_service.search_snippets(
|
|
258
|
+
request=MultiSearchRequest(
|
|
259
|
+
filters=SnippetSearchFilters(
|
|
260
|
+
file_path=file_path,
|
|
261
|
+
source_repo=source_uri,
|
|
262
|
+
)
|
|
263
|
+
),
|
|
264
|
+
)
|
|
265
|
+
return [
|
|
266
|
+
MultiSearchResult(
|
|
267
|
+
id=result.snippet.id or 0,
|
|
268
|
+
content=result.snippet.original_text(),
|
|
269
|
+
original_scores=[0.0],
|
|
270
|
+
# Enhanced fields
|
|
271
|
+
source_uri=str(result.source.working_copy.remote_uri),
|
|
272
|
+
relative_path=str(
|
|
273
|
+
result.file.as_path().relative_to(
|
|
274
|
+
result.source.working_copy.cloned_path
|
|
275
|
+
)
|
|
276
|
+
),
|
|
277
|
+
language=MultiSearchResult.detect_language_from_extension(
|
|
278
|
+
result.file.extension()
|
|
279
|
+
),
|
|
280
|
+
authors=[author.name for author in result.authors],
|
|
281
|
+
created_at=result.snippet.created_at or datetime.now(UTC),
|
|
282
|
+
# Summary from snippet entity
|
|
283
|
+
summary=result.snippet.summary_text(),
|
|
284
|
+
)
|
|
285
|
+
for result in snippet_results
|
|
286
|
+
]
|
|
241
287
|
|
|
288
|
+
# FUTURE: BM25 index enriched content too
|
|
242
289
|
async def _create_bm25_index(
|
|
243
290
|
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
244
291
|
) -> None:
|
|
245
292
|
reporter = Reporter(self.log, progress_callback)
|
|
246
293
|
await reporter.start("bm25_index", len(snippets), "Creating keyword index...")
|
|
247
294
|
|
|
295
|
+
for _snippet in snippets:
|
|
296
|
+
pass
|
|
297
|
+
|
|
248
298
|
await self.bm25_service.index_documents(
|
|
249
299
|
IndexRequest(
|
|
250
300
|
documents=[
|
|
251
|
-
Document(snippet_id=snippet.id, text=snippet.
|
|
301
|
+
Document(snippet_id=snippet.id, text=snippet.original_text())
|
|
252
302
|
for snippet in snippets
|
|
303
|
+
if snippet.id
|
|
253
304
|
]
|
|
254
305
|
)
|
|
255
306
|
)
|
|
@@ -268,8 +319,9 @@ class CodeIndexingApplicationService:
|
|
|
268
319
|
async for result in self.code_search_service.index_documents(
|
|
269
320
|
IndexRequest(
|
|
270
321
|
documents=[
|
|
271
|
-
Document(snippet_id=snippet.id, text=snippet.
|
|
322
|
+
Document(snippet_id=snippet.id, text=snippet.original_text())
|
|
272
323
|
for snippet in snippets
|
|
324
|
+
if snippet.id
|
|
273
325
|
]
|
|
274
326
|
)
|
|
275
327
|
):
|
|
@@ -283,42 +335,6 @@ class CodeIndexingApplicationService:
|
|
|
283
335
|
|
|
284
336
|
await reporter.done("code_embeddings")
|
|
285
337
|
|
|
286
|
-
async def _enrich_snippets(
|
|
287
|
-
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
288
|
-
) -> None:
|
|
289
|
-
reporter = Reporter(self.log, progress_callback)
|
|
290
|
-
await reporter.start("enrichment", len(snippets), "Enriching snippets...")
|
|
291
|
-
|
|
292
|
-
enrichment_request = EnrichmentIndexRequest(
|
|
293
|
-
requests=[
|
|
294
|
-
EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
|
|
295
|
-
for snippet in snippets
|
|
296
|
-
]
|
|
297
|
-
)
|
|
298
|
-
|
|
299
|
-
processed = 0
|
|
300
|
-
async for result in self.enrichment_service.enrich_documents(
|
|
301
|
-
enrichment_request
|
|
302
|
-
):
|
|
303
|
-
# Update snippet content through domain service
|
|
304
|
-
enriched_content = (
|
|
305
|
-
result.text
|
|
306
|
-
+ "\n\n```\n"
|
|
307
|
-
+ next(s.content for s in snippets if s.id == result.snippet_id)
|
|
308
|
-
+ "\n```"
|
|
309
|
-
)
|
|
310
|
-
|
|
311
|
-
await self.snippet_domain_service.update_snippet_content(
|
|
312
|
-
result.snippet_id, enriched_content
|
|
313
|
-
)
|
|
314
|
-
|
|
315
|
-
processed += 1
|
|
316
|
-
await reporter.step(
|
|
317
|
-
"enrichment", processed, len(snippets), "Enriching snippets..."
|
|
318
|
-
)
|
|
319
|
-
|
|
320
|
-
await reporter.done("enrichment")
|
|
321
|
-
|
|
322
338
|
async def _create_text_embeddings(
|
|
323
339
|
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
324
340
|
) -> None:
|
|
@@ -327,14 +343,27 @@ class CodeIndexingApplicationService:
|
|
|
327
343
|
"text_embeddings", len(snippets), "Creating text embeddings..."
|
|
328
344
|
)
|
|
329
345
|
|
|
346
|
+
# Only create text embeddings for snippets that have summary content
|
|
347
|
+
documents_with_summaries = []
|
|
348
|
+
for snippet in snippets:
|
|
349
|
+
if snippet.id:
|
|
350
|
+
try:
|
|
351
|
+
summary_text = snippet.summary_text()
|
|
352
|
+
if summary_text.strip(): # Only add if summary is not empty
|
|
353
|
+
documents_with_summaries.append(
|
|
354
|
+
Document(snippet_id=snippet.id, text=summary_text)
|
|
355
|
+
)
|
|
356
|
+
except ValueError:
|
|
357
|
+
# Skip snippets without summary content
|
|
358
|
+
continue
|
|
359
|
+
|
|
360
|
+
if not documents_with_summaries:
|
|
361
|
+
await reporter.done("text_embeddings", "No summaries to index")
|
|
362
|
+
return
|
|
363
|
+
|
|
330
364
|
processed = 0
|
|
331
365
|
async for result in self.text_search_service.index_documents(
|
|
332
|
-
IndexRequest(
|
|
333
|
-
documents=[
|
|
334
|
-
Document(snippet_id=snippet.id, text=snippet.content)
|
|
335
|
-
for snippet in snippets
|
|
336
|
-
]
|
|
337
|
-
)
|
|
366
|
+
IndexRequest(documents=documents_with_summaries)
|
|
338
367
|
):
|
|
339
368
|
processed += len(result)
|
|
340
369
|
await reporter.step(
|