kodit 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/factories/code_indexing_factory.py +56 -29
- kodit/application/services/code_indexing_application_service.py +152 -118
- kodit/cli.py +14 -41
- kodit/domain/entities.py +268 -197
- kodit/domain/protocols.py +61 -0
- kodit/domain/services/embedding_service.py +1 -1
- kodit/domain/services/index_query_service.py +66 -0
- kodit/domain/services/index_service.py +282 -0
- kodit/domain/value_objects.py +143 -65
- kodit/infrastructure/cloning/git/working_copy.py +17 -8
- kodit/infrastructure/cloning/metadata.py +37 -67
- kodit/infrastructure/embedding/embedding_factory.py +1 -1
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
- kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
- kodit/infrastructure/git/git_utils.py +1 -63
- kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
- kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/__init__.py +1 -0
- kodit/infrastructure/mappers/index_mapper.py +344 -0
- kodit/infrastructure/slicing/__init__.py +1 -0
- kodit/infrastructure/slicing/language_detection_service.py +18 -0
- kodit/infrastructure/slicing/slicer.py +894 -0
- kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/entities.py +203 -0
- kodit/infrastructure/sqlalchemy/index_repository.py +579 -0
- kodit/mcp.py +0 -7
- kodit/migrations/env.py +1 -1
- kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +36 -0
- kodit/migrations/versions/4552eb3f23ce_add_summary.py +4 -4
- kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +24 -16
- kodit/migrations/versions/85155663351e_initial.py +64 -48
- kodit/migrations/versions/c3f5137d30f5_index_all_the_things.py +20 -14
- kodit/utils/__init__.py +1 -0
- kodit/utils/path_utils.py +54 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/METADATA +9 -4
- kodit-0.3.4.dist-info/RECORD +89 -0
- kodit/domain/enums.py +0 -9
- kodit/domain/repositories.py +0 -128
- kodit/domain/services/ignore_service.py +0 -45
- kodit/domain/services/indexing_service.py +0 -204
- kodit/domain/services/snippet_extraction_service.py +0 -89
- kodit/domain/services/snippet_service.py +0 -215
- kodit/domain/services/source_service.py +0 -85
- kodit/infrastructure/cloning/folder/__init__.py +0 -1
- kodit/infrastructure/cloning/folder/factory.py +0 -128
- kodit/infrastructure/cloning/folder/working_copy.py +0 -38
- kodit/infrastructure/cloning/git/factory.py +0 -153
- kodit/infrastructure/indexing/index_repository.py +0 -286
- kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
- kodit/infrastructure/snippet_extraction/__init__.py +0 -1
- kodit/infrastructure/snippet_extraction/language_detection_service.py +0 -39
- kodit/infrastructure/snippet_extraction/languages/csharp.scm +0 -12
- kodit/infrastructure/snippet_extraction/languages/go.scm +0 -26
- kodit/infrastructure/snippet_extraction/languages/java.scm +0 -12
- kodit/infrastructure/snippet_extraction/languages/javascript.scm +0 -24
- kodit/infrastructure/snippet_extraction/languages/python.scm +0 -22
- kodit/infrastructure/snippet_extraction/languages/typescript.scm +0 -25
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +0 -67
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -45
- kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +0 -182
- kodit/infrastructure/sqlalchemy/file_repository.py +0 -78
- kodit/infrastructure/sqlalchemy/repository.py +0 -133
- kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -259
- kodit-0.3.2.dist-info/RECORD +0 -103
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/WHEEL +0 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/entry_points.txt +0 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/licenses/LICENSE +0 -0
kodit/_version.py
CHANGED
|
@@ -6,17 +6,20 @@ from kodit.application.services.code_indexing_application_service import (
|
|
|
6
6
|
CodeIndexingApplicationService,
|
|
7
7
|
)
|
|
8
8
|
from kodit.config import AppContext
|
|
9
|
-
from kodit.domain.entities import EmbeddingType
|
|
10
9
|
from kodit.domain.services.bm25_service import BM25DomainService
|
|
11
10
|
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
12
11
|
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
13
|
-
from kodit.domain.services.
|
|
12
|
+
from kodit.domain.services.index_query_service import IndexQueryService
|
|
13
|
+
from kodit.domain.services.index_service import (
|
|
14
|
+
IndexDomainService,
|
|
15
|
+
)
|
|
16
|
+
from kodit.domain.value_objects import LanguageMapping
|
|
14
17
|
from kodit.infrastructure.bm25.bm25_factory import bm25_repository_factory
|
|
15
18
|
from kodit.infrastructure.embedding.embedding_factory import (
|
|
16
19
|
embedding_domain_service_factory,
|
|
17
20
|
)
|
|
18
|
-
from kodit.infrastructure.embedding.embedding_providers import (
|
|
19
|
-
|
|
21
|
+
from kodit.infrastructure.embedding.embedding_providers.hash_embedding_provider import (
|
|
22
|
+
HashEmbeddingProvider,
|
|
20
23
|
)
|
|
21
24
|
from kodit.infrastructure.embedding.local_vector_search_repository import (
|
|
22
25
|
LocalVectorSearchRepository,
|
|
@@ -27,36 +30,49 @@ from kodit.infrastructure.enrichment.enrichment_factory import (
|
|
|
27
30
|
from kodit.infrastructure.enrichment.null_enrichment_provider import (
|
|
28
31
|
NullEnrichmentProvider,
|
|
29
32
|
)
|
|
30
|
-
from kodit.infrastructure.indexing.
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
from kodit.infrastructure.indexing.snippet_domain_service_factory import (
|
|
34
|
-
snippet_domain_service_factory,
|
|
33
|
+
from kodit.infrastructure.indexing.fusion_service import ReciprocalRankFusionService
|
|
34
|
+
from kodit.infrastructure.slicing.language_detection_service import (
|
|
35
|
+
FileSystemLanguageDetectionService,
|
|
35
36
|
)
|
|
36
37
|
from kodit.infrastructure.sqlalchemy.embedding_repository import (
|
|
37
38
|
SqlAlchemyEmbeddingRepository,
|
|
38
39
|
)
|
|
40
|
+
from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
|
|
41
|
+
from kodit.infrastructure.sqlalchemy.index_repository import SqlAlchemyIndexRepository
|
|
39
42
|
|
|
40
43
|
|
|
41
44
|
def create_code_indexing_application_service(
|
|
42
45
|
app_context: AppContext,
|
|
43
46
|
session: AsyncSession,
|
|
44
|
-
source_service: SourceService,
|
|
45
47
|
) -> CodeIndexingApplicationService:
|
|
46
48
|
"""Create a unified code indexing application service with all dependencies."""
|
|
47
49
|
# Create domain services
|
|
48
|
-
indexing_domain_service = indexing_domain_service_factory(session)
|
|
49
|
-
snippet_domain_service = snippet_domain_service_factory(session)
|
|
50
50
|
bm25_service = BM25DomainService(bm25_repository_factory(app_context, session))
|
|
51
51
|
code_search_service = embedding_domain_service_factory("code", app_context, session)
|
|
52
52
|
text_search_service = embedding_domain_service_factory("text", app_context, session)
|
|
53
53
|
enrichment_service = enrichment_domain_service_factory(app_context)
|
|
54
|
+
index_repository = SqlAlchemyIndexRepository(session=session)
|
|
55
|
+
# Use the unified language mapping from the domain layer
|
|
56
|
+
language_map = LanguageMapping.get_extension_to_language_map()
|
|
57
|
+
|
|
58
|
+
# Create infrastructure services
|
|
59
|
+
language_detector = FileSystemLanguageDetectionService(language_map)
|
|
60
|
+
|
|
61
|
+
index_domain_service = IndexDomainService(
|
|
62
|
+
language_detector=language_detector,
|
|
63
|
+
enrichment_service=enrichment_service,
|
|
64
|
+
clone_dir=app_context.get_clone_dir(),
|
|
65
|
+
)
|
|
66
|
+
index_query_service = IndexQueryService(
|
|
67
|
+
index_repository=index_repository,
|
|
68
|
+
fusion_service=ReciprocalRankFusionService(),
|
|
69
|
+
)
|
|
54
70
|
|
|
55
71
|
# Create and return the unified application service
|
|
56
72
|
return CodeIndexingApplicationService(
|
|
57
|
-
indexing_domain_service=
|
|
58
|
-
|
|
59
|
-
|
|
73
|
+
indexing_domain_service=index_domain_service,
|
|
74
|
+
index_repository=index_repository,
|
|
75
|
+
index_query_service=index_query_service,
|
|
60
76
|
bm25_service=bm25_service,
|
|
61
77
|
code_search_service=code_search_service,
|
|
62
78
|
text_search_service=text_search_service,
|
|
@@ -68,36 +84,30 @@ def create_code_indexing_application_service(
|
|
|
68
84
|
def create_fast_test_code_indexing_application_service(
|
|
69
85
|
app_context: AppContext,
|
|
70
86
|
session: AsyncSession,
|
|
71
|
-
source_service: SourceService,
|
|
72
87
|
) -> CodeIndexingApplicationService:
|
|
73
|
-
"""Create a fast test
|
|
88
|
+
"""Create a fast test code indexing application service."""
|
|
74
89
|
# Create domain services
|
|
75
|
-
indexing_domain_service = indexing_domain_service_factory(session)
|
|
76
|
-
snippet_domain_service = snippet_domain_service_factory(session)
|
|
77
90
|
bm25_service = BM25DomainService(bm25_repository_factory(app_context, session))
|
|
78
|
-
|
|
79
|
-
# Create fast embedding services using HashEmbeddingProvider
|
|
80
91
|
embedding_repository = SqlAlchemyEmbeddingRepository(session=session)
|
|
81
92
|
|
|
82
|
-
# Fast code search service
|
|
83
93
|
code_search_repository = LocalVectorSearchRepository(
|
|
84
94
|
embedding_repository=embedding_repository,
|
|
85
|
-
embedding_provider=
|
|
95
|
+
embedding_provider=HashEmbeddingProvider(),
|
|
86
96
|
embedding_type=EmbeddingType.CODE,
|
|
87
97
|
)
|
|
88
98
|
code_search_service = EmbeddingDomainService(
|
|
89
|
-
embedding_provider=
|
|
99
|
+
embedding_provider=HashEmbeddingProvider(),
|
|
90
100
|
vector_search_repository=code_search_repository,
|
|
91
101
|
)
|
|
92
102
|
|
|
93
103
|
# Fast text search service
|
|
94
104
|
text_search_repository = LocalVectorSearchRepository(
|
|
95
105
|
embedding_repository=embedding_repository,
|
|
96
|
-
embedding_provider=
|
|
106
|
+
embedding_provider=HashEmbeddingProvider(),
|
|
97
107
|
embedding_type=EmbeddingType.TEXT,
|
|
98
108
|
)
|
|
99
109
|
text_search_service = EmbeddingDomainService(
|
|
100
|
-
embedding_provider=
|
|
110
|
+
embedding_provider=HashEmbeddingProvider(),
|
|
101
111
|
vector_search_repository=text_search_repository,
|
|
102
112
|
)
|
|
103
113
|
|
|
@@ -106,11 +116,28 @@ def create_fast_test_code_indexing_application_service(
|
|
|
106
116
|
enrichment_provider=NullEnrichmentProvider()
|
|
107
117
|
)
|
|
108
118
|
|
|
119
|
+
index_repository = SqlAlchemyIndexRepository(session=session)
|
|
120
|
+
# Use the unified language mapping from the domain layer
|
|
121
|
+
language_map = LanguageMapping.get_extension_to_language_map()
|
|
122
|
+
|
|
123
|
+
# Create infrastructure services
|
|
124
|
+
language_detector = FileSystemLanguageDetectionService(language_map)
|
|
125
|
+
|
|
126
|
+
index_domain_service = IndexDomainService(
|
|
127
|
+
language_detector=language_detector,
|
|
128
|
+
enrichment_service=enrichment_service,
|
|
129
|
+
clone_dir=app_context.get_clone_dir(),
|
|
130
|
+
)
|
|
131
|
+
index_query_service = IndexQueryService(
|
|
132
|
+
index_repository=index_repository,
|
|
133
|
+
fusion_service=ReciprocalRankFusionService(),
|
|
134
|
+
)
|
|
135
|
+
|
|
109
136
|
# Create and return the unified application service
|
|
110
137
|
return CodeIndexingApplicationService(
|
|
111
|
-
indexing_domain_service=
|
|
112
|
-
|
|
113
|
-
|
|
138
|
+
indexing_domain_service=index_domain_service,
|
|
139
|
+
index_repository=index_repository,
|
|
140
|
+
index_query_service=index_query_service,
|
|
114
141
|
bm25_service=bm25_service,
|
|
115
142
|
code_search_service=code_search_service,
|
|
116
143
|
text_search_service=text_search_service,
|
|
@@ -1,32 +1,28 @@
|
|
|
1
1
|
"""Unified application service for code indexing operations."""
|
|
2
2
|
|
|
3
3
|
from dataclasses import replace
|
|
4
|
+
from datetime import UTC, datetime
|
|
4
5
|
|
|
5
6
|
import structlog
|
|
6
7
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
7
8
|
|
|
8
|
-
from kodit.domain.entities import Snippet
|
|
9
|
-
from kodit.domain.enums import SnippetExtractionStrategy
|
|
10
|
-
from kodit.domain.errors import EmptySourceError
|
|
9
|
+
from kodit.domain.entities import Index, Snippet
|
|
11
10
|
from kodit.domain.interfaces import ProgressCallback
|
|
11
|
+
from kodit.domain.protocols import IndexRepository
|
|
12
12
|
from kodit.domain.services.bm25_service import BM25DomainService
|
|
13
13
|
from kodit.domain.services.embedding_service import EmbeddingDomainService
|
|
14
14
|
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
15
|
-
from kodit.domain.services.
|
|
16
|
-
from kodit.domain.services.
|
|
17
|
-
from kodit.domain.services.source_service import SourceService
|
|
15
|
+
from kodit.domain.services.index_query_service import IndexQueryService
|
|
16
|
+
from kodit.domain.services.index_service import IndexDomainService
|
|
18
17
|
from kodit.domain.value_objects import (
|
|
19
18
|
Document,
|
|
20
|
-
EnrichmentIndexRequest,
|
|
21
|
-
EnrichmentRequest,
|
|
22
19
|
FusionRequest,
|
|
23
|
-
IndexCreateRequest,
|
|
24
20
|
IndexRequest,
|
|
25
|
-
IndexView,
|
|
26
21
|
MultiSearchRequest,
|
|
27
22
|
MultiSearchResult,
|
|
28
23
|
SearchRequest,
|
|
29
24
|
SearchResult,
|
|
25
|
+
SnippetSearchFilters,
|
|
30
26
|
)
|
|
31
27
|
from kodit.log import log_event
|
|
32
28
|
from kodit.reporting import Reporter
|
|
@@ -37,9 +33,9 @@ class CodeIndexingApplicationService:
|
|
|
37
33
|
|
|
38
34
|
def __init__( # noqa: PLR0913
|
|
39
35
|
self,
|
|
40
|
-
indexing_domain_service:
|
|
41
|
-
|
|
42
|
-
|
|
36
|
+
indexing_domain_service: IndexDomainService,
|
|
37
|
+
index_repository: IndexRepository,
|
|
38
|
+
index_query_service: IndexQueryService,
|
|
43
39
|
bm25_service: BM25DomainService,
|
|
44
40
|
code_search_service: EmbeddingDomainService,
|
|
45
41
|
text_search_service: EmbeddingDomainService,
|
|
@@ -47,9 +43,9 @@ class CodeIndexingApplicationService:
|
|
|
47
43
|
session: AsyncSession,
|
|
48
44
|
) -> None:
|
|
49
45
|
"""Initialize the code indexing application service."""
|
|
50
|
-
self.
|
|
51
|
-
self.
|
|
52
|
-
self.
|
|
46
|
+
self.index_domain_service = indexing_domain_service
|
|
47
|
+
self.index_repository = index_repository
|
|
48
|
+
self.index_query_service = index_query_service
|
|
53
49
|
self.bm25_service = bm25_service
|
|
54
50
|
self.code_search_service = code_search_service
|
|
55
51
|
self.text_search_service = text_search_service
|
|
@@ -57,90 +53,104 @@ class CodeIndexingApplicationService:
|
|
|
57
53
|
self.session = session
|
|
58
54
|
self.log = structlog.get_logger(__name__)
|
|
59
55
|
|
|
60
|
-
async def
|
|
56
|
+
async def create_index_from_uri(
|
|
57
|
+
self, uri: str, progress_callback: ProgressCallback | None = None
|
|
58
|
+
) -> Index:
|
|
61
59
|
"""Create a new index for a source."""
|
|
62
60
|
log_event("kodit.index.create")
|
|
63
61
|
|
|
64
|
-
#
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
return index_view
|
|
75
|
-
|
|
76
|
-
async def list_indexes(self) -> list[IndexView]:
|
|
77
|
-
"""List all available indexes with their details."""
|
|
78
|
-
indexes = await self.indexing_domain_service.list_indexes()
|
|
62
|
+
# Check if index already exists
|
|
63
|
+
sanitized_uri, _ = self.index_domain_service.sanitize_uri(uri)
|
|
64
|
+
existing_index = await self.index_repository.get_by_uri(sanitized_uri)
|
|
65
|
+
if existing_index:
|
|
66
|
+
self.log.debug(
|
|
67
|
+
"Index already exists",
|
|
68
|
+
uri=str(sanitized_uri),
|
|
69
|
+
index_id=existing_index.id,
|
|
70
|
+
)
|
|
71
|
+
return existing_index
|
|
79
72
|
|
|
80
|
-
#
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
{
|
|
84
|
-
"num_indexes": len(indexes),
|
|
85
|
-
"num_snippets": sum([index.num_snippets for index in indexes]),
|
|
86
|
-
},
|
|
73
|
+
# Only prepare working copy if we need to create a new index
|
|
74
|
+
working_copy = await self.index_domain_service.prepare_index(
|
|
75
|
+
uri, progress_callback
|
|
87
76
|
)
|
|
88
77
|
|
|
89
|
-
|
|
78
|
+
# Create new index
|
|
79
|
+
index = await self.index_repository.create(sanitized_uri, working_copy)
|
|
80
|
+
await self.session.commit()
|
|
81
|
+
return index
|
|
90
82
|
|
|
91
83
|
async def run_index(
|
|
92
|
-
self,
|
|
84
|
+
self, index: Index, progress_callback: ProgressCallback | None = None
|
|
93
85
|
) -> None:
|
|
94
86
|
"""Run the complete indexing process for a specific index."""
|
|
95
87
|
log_event("kodit.index.run")
|
|
96
88
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
if not index:
|
|
100
|
-
msg = f"Index not found: {index_id}"
|
|
89
|
+
if not index or not index.id:
|
|
90
|
+
msg = f"Index has no ID: {index}"
|
|
101
91
|
raise ValueError(msg)
|
|
102
92
|
|
|
103
|
-
#
|
|
104
|
-
|
|
105
|
-
|
|
93
|
+
# Refresh working copy
|
|
94
|
+
index.source.working_copy = (
|
|
95
|
+
await self.index_domain_service.refresh_working_copy(
|
|
96
|
+
index.source.working_copy
|
|
97
|
+
)
|
|
98
|
+
)
|
|
99
|
+
if len(index.source.working_copy.changed_files()) == 0:
|
|
100
|
+
self.log.info("No new changes to index", index_id=index.id)
|
|
101
|
+
return
|
|
102
|
+
|
|
103
|
+
# Delete the old snippets from the files that have changed
|
|
104
|
+
await self.index_repository.delete_snippets_by_file_ids(
|
|
105
|
+
[file.id for file in index.source.working_copy.changed_files() if file.id]
|
|
106
|
+
)
|
|
106
107
|
|
|
107
108
|
# Extract and create snippets (domain service handles progress)
|
|
108
109
|
self.log.info("Creating snippets for files", index_id=index.id)
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
strategy=SnippetExtractionStrategy.METHOD_BASED,
|
|
112
|
-
progress_callback=progress_callback,
|
|
110
|
+
index = await self.index_domain_service.extract_snippets_from_index(
|
|
111
|
+
index=index, progress_callback=progress_callback
|
|
113
112
|
)
|
|
114
113
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
msg = f"No indexable snippets found for index {index.id}"
|
|
118
|
-
raise EmptySourceError(msg)
|
|
114
|
+
await self.index_repository.update(index)
|
|
115
|
+
await self.session.flush()
|
|
119
116
|
|
|
120
|
-
#
|
|
121
|
-
await self.
|
|
117
|
+
# Refresh index to get snippets with IDs, required as a ref for subsequent steps
|
|
118
|
+
flushed_index = await self.index_repository.get(index.id)
|
|
119
|
+
if not flushed_index:
|
|
120
|
+
msg = f"Index {index.id} not found after snippet extraction"
|
|
121
|
+
raise ValueError(msg)
|
|
122
|
+
index = flushed_index
|
|
123
|
+
if len(index.snippets) == 0:
|
|
124
|
+
self.log.info("No snippets to index after extraction", index_id=index.id)
|
|
125
|
+
return
|
|
122
126
|
|
|
123
127
|
# Create BM25 index
|
|
124
128
|
self.log.info("Creating keyword index")
|
|
125
|
-
await self._create_bm25_index(snippets, progress_callback)
|
|
129
|
+
await self._create_bm25_index(index.snippets, progress_callback)
|
|
126
130
|
|
|
127
131
|
# Create code embeddings
|
|
128
132
|
self.log.info("Creating semantic code index")
|
|
129
|
-
await self._create_code_embeddings(snippets, progress_callback)
|
|
133
|
+
await self._create_code_embeddings(index.snippets, progress_callback)
|
|
130
134
|
|
|
131
135
|
# Enrich snippets
|
|
132
|
-
self.log.info("Enriching snippets", num_snippets=len(snippets))
|
|
133
|
-
await self.
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
snippets
|
|
136
|
+
self.log.info("Enriching snippets", num_snippets=len(index.snippets))
|
|
137
|
+
enriched_snippets = await self.index_domain_service.enrich_snippets_in_index(
|
|
138
|
+
snippets=index.snippets, progress_callback=progress_callback
|
|
139
|
+
)
|
|
140
|
+
# Update snippets in repository
|
|
141
|
+
await self.index_repository.update_snippets(index.id, enriched_snippets)
|
|
137
142
|
|
|
138
143
|
# Create text embeddings (on enriched content)
|
|
139
144
|
self.log.info("Creating semantic text index")
|
|
140
|
-
await self._create_text_embeddings(
|
|
145
|
+
await self._create_text_embeddings(enriched_snippets, progress_callback)
|
|
141
146
|
|
|
142
147
|
# Update index timestamp
|
|
143
|
-
await self.
|
|
148
|
+
await self.index_repository.update_index_timestamp(index.id)
|
|
149
|
+
|
|
150
|
+
# Now that all file dependencies have been captured, enact the file processing
|
|
151
|
+
# statuses
|
|
152
|
+
index.source.working_copy.clear_file_processing_statuses()
|
|
153
|
+
await self.index_repository.update(index)
|
|
144
154
|
|
|
145
155
|
# Single transaction commit for the entire operation
|
|
146
156
|
await self.session.commit()
|
|
@@ -152,12 +162,14 @@ class CodeIndexingApplicationService:
|
|
|
152
162
|
# Apply filters if provided
|
|
153
163
|
filtered_snippet_ids: list[int] | None = None
|
|
154
164
|
if request.filters:
|
|
155
|
-
# Use domain service for filtering
|
|
156
|
-
prefilter_request = replace(request, top_k=
|
|
157
|
-
snippet_results = await self.
|
|
165
|
+
# Use domain service for filtering (use large top_k for pre-filtering)
|
|
166
|
+
prefilter_request = replace(request, top_k=10000)
|
|
167
|
+
snippet_results = await self.index_query_service.search_snippets(
|
|
158
168
|
prefilter_request
|
|
159
169
|
)
|
|
160
|
-
filtered_snippet_ids = [
|
|
170
|
+
filtered_snippet_ids = [
|
|
171
|
+
snippet.snippet.id for snippet in snippet_results if snippet.snippet.id
|
|
172
|
+
]
|
|
161
173
|
|
|
162
174
|
# Gather results from different search modes
|
|
163
175
|
fusion_list: list[list[FusionRequest]] = []
|
|
@@ -209,7 +221,7 @@ class CodeIndexingApplicationService:
|
|
|
209
221
|
return []
|
|
210
222
|
|
|
211
223
|
# Fusion ranking
|
|
212
|
-
final_results = self.
|
|
224
|
+
final_results = await self.index_query_service.perform_fusion(
|
|
213
225
|
rankings=fusion_list,
|
|
214
226
|
k=60, # This is a parameter in the RRF algorithm, not top_k
|
|
215
227
|
)
|
|
@@ -218,27 +230,29 @@ class CodeIndexingApplicationService:
|
|
|
218
230
|
final_results = final_results[: request.top_k]
|
|
219
231
|
|
|
220
232
|
# Get snippet details
|
|
221
|
-
search_results = await self.
|
|
233
|
+
search_results = await self.index_query_service.get_snippets_by_ids(
|
|
222
234
|
[x.id for x in final_results]
|
|
223
235
|
)
|
|
224
236
|
|
|
225
237
|
return [
|
|
226
238
|
MultiSearchResult(
|
|
227
|
-
id=result.snippet.id,
|
|
228
|
-
content=result.snippet.
|
|
239
|
+
id=result.snippet.id or 0,
|
|
240
|
+
content=result.snippet.original_text(),
|
|
229
241
|
original_scores=fr.original_scores,
|
|
230
242
|
# Enhanced fields
|
|
231
|
-
source_uri=result.source.
|
|
232
|
-
relative_path=
|
|
233
|
-
result.file.
|
|
243
|
+
source_uri=str(result.source.working_copy.remote_uri),
|
|
244
|
+
relative_path=str(
|
|
245
|
+
result.file.as_path().relative_to(
|
|
246
|
+
result.source.working_copy.cloned_path
|
|
247
|
+
)
|
|
234
248
|
),
|
|
235
249
|
language=MultiSearchResult.detect_language_from_extension(
|
|
236
|
-
result.file.extension
|
|
250
|
+
result.file.extension()
|
|
237
251
|
),
|
|
238
252
|
authors=[author.name for author in result.authors],
|
|
239
|
-
created_at=result.snippet.created_at,
|
|
253
|
+
created_at=result.snippet.created_at or datetime.now(UTC),
|
|
240
254
|
# Summary from snippet entity
|
|
241
|
-
summary=result.snippet.
|
|
255
|
+
summary=result.snippet.summary_text(),
|
|
242
256
|
)
|
|
243
257
|
for result, fr in zip(search_results, final_results, strict=True)
|
|
244
258
|
]
|
|
@@ -248,19 +262,53 @@ class CodeIndexingApplicationService:
|
|
|
248
262
|
) -> list[MultiSearchResult]:
|
|
249
263
|
"""List snippets with optional filtering."""
|
|
250
264
|
log_event("kodit.index.list_snippets")
|
|
251
|
-
|
|
265
|
+
snippet_results = await self.index_query_service.search_snippets(
|
|
266
|
+
request=MultiSearchRequest(
|
|
267
|
+
filters=SnippetSearchFilters(
|
|
268
|
+
file_path=file_path,
|
|
269
|
+
source_repo=source_uri,
|
|
270
|
+
)
|
|
271
|
+
),
|
|
272
|
+
)
|
|
273
|
+
return [
|
|
274
|
+
MultiSearchResult(
|
|
275
|
+
id=result.snippet.id or 0,
|
|
276
|
+
content=result.snippet.original_text(),
|
|
277
|
+
original_scores=[0.0],
|
|
278
|
+
# Enhanced fields
|
|
279
|
+
source_uri=str(result.source.working_copy.remote_uri),
|
|
280
|
+
relative_path=str(
|
|
281
|
+
result.file.as_path().relative_to(
|
|
282
|
+
result.source.working_copy.cloned_path
|
|
283
|
+
)
|
|
284
|
+
),
|
|
285
|
+
language=MultiSearchResult.detect_language_from_extension(
|
|
286
|
+
result.file.extension()
|
|
287
|
+
),
|
|
288
|
+
authors=[author.name for author in result.authors],
|
|
289
|
+
created_at=result.snippet.created_at or datetime.now(UTC),
|
|
290
|
+
# Summary from snippet entity
|
|
291
|
+
summary=result.snippet.summary_text(),
|
|
292
|
+
)
|
|
293
|
+
for result in snippet_results
|
|
294
|
+
]
|
|
252
295
|
|
|
296
|
+
# FUTURE: BM25 index enriched content too
|
|
253
297
|
async def _create_bm25_index(
|
|
254
298
|
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
255
299
|
) -> None:
|
|
256
300
|
reporter = Reporter(self.log, progress_callback)
|
|
257
301
|
await reporter.start("bm25_index", len(snippets), "Creating keyword index...")
|
|
258
302
|
|
|
303
|
+
for _snippet in snippets:
|
|
304
|
+
pass
|
|
305
|
+
|
|
259
306
|
await self.bm25_service.index_documents(
|
|
260
307
|
IndexRequest(
|
|
261
308
|
documents=[
|
|
262
|
-
Document(snippet_id=snippet.id, text=snippet.
|
|
309
|
+
Document(snippet_id=snippet.id, text=snippet.original_text())
|
|
263
310
|
for snippet in snippets
|
|
311
|
+
if snippet.id
|
|
264
312
|
]
|
|
265
313
|
)
|
|
266
314
|
)
|
|
@@ -279,8 +327,9 @@ class CodeIndexingApplicationService:
|
|
|
279
327
|
async for result in self.code_search_service.index_documents(
|
|
280
328
|
IndexRequest(
|
|
281
329
|
documents=[
|
|
282
|
-
Document(snippet_id=snippet.id, text=snippet.
|
|
330
|
+
Document(snippet_id=snippet.id, text=snippet.original_text())
|
|
283
331
|
for snippet in snippets
|
|
332
|
+
if snippet.id
|
|
284
333
|
]
|
|
285
334
|
)
|
|
286
335
|
):
|
|
@@ -294,34 +343,6 @@ class CodeIndexingApplicationService:
|
|
|
294
343
|
|
|
295
344
|
await reporter.done("code_embeddings")
|
|
296
345
|
|
|
297
|
-
async def _enrich_snippets(
|
|
298
|
-
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
299
|
-
) -> None:
|
|
300
|
-
reporter = Reporter(self.log, progress_callback)
|
|
301
|
-
await reporter.start("enrichment", len(snippets), "Enriching snippets...")
|
|
302
|
-
|
|
303
|
-
enrichment_request = EnrichmentIndexRequest(
|
|
304
|
-
requests=[
|
|
305
|
-
EnrichmentRequest(snippet_id=snippet.id, text=snippet.content)
|
|
306
|
-
for snippet in snippets
|
|
307
|
-
]
|
|
308
|
-
)
|
|
309
|
-
|
|
310
|
-
processed = 0
|
|
311
|
-
async for result in self.enrichment_service.enrich_documents(
|
|
312
|
-
enrichment_request
|
|
313
|
-
):
|
|
314
|
-
await self.snippet_domain_service.update_snippet_summary(
|
|
315
|
-
result.snippet_id, result.text
|
|
316
|
-
)
|
|
317
|
-
|
|
318
|
-
processed += 1
|
|
319
|
-
await reporter.step(
|
|
320
|
-
"enrichment", processed, len(snippets), "Enriching snippets..."
|
|
321
|
-
)
|
|
322
|
-
|
|
323
|
-
await reporter.done("enrichment")
|
|
324
|
-
|
|
325
346
|
async def _create_text_embeddings(
|
|
326
347
|
self, snippets: list[Snippet], progress_callback: ProgressCallback | None = None
|
|
327
348
|
) -> None:
|
|
@@ -330,14 +351,27 @@ class CodeIndexingApplicationService:
|
|
|
330
351
|
"text_embeddings", len(snippets), "Creating text embeddings..."
|
|
331
352
|
)
|
|
332
353
|
|
|
354
|
+
# Only create text embeddings for snippets that have summary content
|
|
355
|
+
documents_with_summaries = []
|
|
356
|
+
for snippet in snippets:
|
|
357
|
+
if snippet.id:
|
|
358
|
+
try:
|
|
359
|
+
summary_text = snippet.summary_text()
|
|
360
|
+
if summary_text.strip(): # Only add if summary is not empty
|
|
361
|
+
documents_with_summaries.append(
|
|
362
|
+
Document(snippet_id=snippet.id, text=summary_text)
|
|
363
|
+
)
|
|
364
|
+
except ValueError:
|
|
365
|
+
# Skip snippets without summary content
|
|
366
|
+
continue
|
|
367
|
+
|
|
368
|
+
if not documents_with_summaries:
|
|
369
|
+
await reporter.done("text_embeddings", "No summaries to index")
|
|
370
|
+
return
|
|
371
|
+
|
|
333
372
|
processed = 0
|
|
334
373
|
async for result in self.text_search_service.index_documents(
|
|
335
|
-
IndexRequest(
|
|
336
|
-
documents=[
|
|
337
|
-
Document(snippet_id=snippet.id, text=snippet.content)
|
|
338
|
-
for snippet in snippets
|
|
339
|
-
]
|
|
340
|
-
)
|
|
374
|
+
IndexRequest(documents=documents_with_summaries)
|
|
341
375
|
):
|
|
342
376
|
processed += len(result)
|
|
343
377
|
await reporter.step(
|