kodit 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodit might be problematic. Click here for more details.
- kodit/_version.py +2 -2
- kodit/application/factories/code_indexing_factory.py +56 -29
- kodit/application/services/code_indexing_application_service.py +152 -118
- kodit/cli.py +14 -41
- kodit/domain/entities.py +268 -197
- kodit/domain/protocols.py +61 -0
- kodit/domain/services/embedding_service.py +1 -1
- kodit/domain/services/index_query_service.py +66 -0
- kodit/domain/services/index_service.py +282 -0
- kodit/domain/value_objects.py +143 -65
- kodit/infrastructure/cloning/git/working_copy.py +17 -8
- kodit/infrastructure/cloning/metadata.py +37 -67
- kodit/infrastructure/embedding/embedding_factory.py +1 -1
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
- kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
- kodit/infrastructure/git/git_utils.py +1 -63
- kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
- kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/__init__.py +1 -0
- kodit/infrastructure/mappers/index_mapper.py +344 -0
- kodit/infrastructure/slicing/__init__.py +1 -0
- kodit/infrastructure/slicing/language_detection_service.py +18 -0
- kodit/infrastructure/slicing/slicer.py +894 -0
- kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/entities.py +203 -0
- kodit/infrastructure/sqlalchemy/index_repository.py +579 -0
- kodit/mcp.py +0 -7
- kodit/migrations/env.py +1 -1
- kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +36 -0
- kodit/migrations/versions/4552eb3f23ce_add_summary.py +4 -4
- kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +24 -16
- kodit/migrations/versions/85155663351e_initial.py +64 -48
- kodit/migrations/versions/c3f5137d30f5_index_all_the_things.py +20 -14
- kodit/utils/__init__.py +1 -0
- kodit/utils/path_utils.py +54 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/METADATA +9 -4
- kodit-0.3.4.dist-info/RECORD +89 -0
- kodit/domain/enums.py +0 -9
- kodit/domain/repositories.py +0 -128
- kodit/domain/services/ignore_service.py +0 -45
- kodit/domain/services/indexing_service.py +0 -204
- kodit/domain/services/snippet_extraction_service.py +0 -89
- kodit/domain/services/snippet_service.py +0 -215
- kodit/domain/services/source_service.py +0 -85
- kodit/infrastructure/cloning/folder/__init__.py +0 -1
- kodit/infrastructure/cloning/folder/factory.py +0 -128
- kodit/infrastructure/cloning/folder/working_copy.py +0 -38
- kodit/infrastructure/cloning/git/factory.py +0 -153
- kodit/infrastructure/indexing/index_repository.py +0 -286
- kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
- kodit/infrastructure/snippet_extraction/__init__.py +0 -1
- kodit/infrastructure/snippet_extraction/language_detection_service.py +0 -39
- kodit/infrastructure/snippet_extraction/languages/csharp.scm +0 -12
- kodit/infrastructure/snippet_extraction/languages/go.scm +0 -26
- kodit/infrastructure/snippet_extraction/languages/java.scm +0 -12
- kodit/infrastructure/snippet_extraction/languages/javascript.scm +0 -24
- kodit/infrastructure/snippet_extraction/languages/python.scm +0 -22
- kodit/infrastructure/snippet_extraction/languages/typescript.scm +0 -25
- kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +0 -67
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -45
- kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +0 -182
- kodit/infrastructure/sqlalchemy/file_repository.py +0 -78
- kodit/infrastructure/sqlalchemy/repository.py +0 -133
- kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -259
- kodit-0.3.2.dist-info/RECORD +0 -103
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/WHEEL +0 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/entry_points.txt +0 -0
- {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
4
|
from collections.abc import AsyncGenerator, Sequence
|
|
5
5
|
|
|
6
|
-
from kodit.domain.entities import EmbeddingType
|
|
7
6
|
from kodit.domain.value_objects import (
|
|
8
7
|
EmbeddingRequest,
|
|
9
8
|
EmbeddingResponse,
|
|
@@ -12,6 +11,7 @@ from kodit.domain.value_objects import (
|
|
|
12
11
|
SearchRequest,
|
|
13
12
|
SearchResult,
|
|
14
13
|
)
|
|
14
|
+
from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class EmbeddingProvider(ABC):
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Index query service."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
from kodit.domain.entities import Index, SnippetWithContext
|
|
6
|
+
from kodit.domain.protocols import IndexRepository
|
|
7
|
+
from kodit.domain.value_objects import (
|
|
8
|
+
FusionRequest,
|
|
9
|
+
FusionResult,
|
|
10
|
+
MultiSearchRequest,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FusionService(ABC):
    """Interface for services that merge several rankings into one result list."""

    @abstractmethod
    def reciprocal_rank_fusion(
        self, rankings: list[list[FusionRequest]], k: float = 60
    ) -> list[FusionResult]:
        """Fuse the given rankings with reciprocal rank fusion.

        Args:
            rankings: The ranked lists of fusion requests to merge.
            k: Constant handed to the fusion algorithm; defaults to 60.

        Returns:
            The fused results.

        """
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class IndexQueryService:
    """Query-side facade that delegates to an index repository and a fusion service."""

    def __init__(
        self,
        index_repository: IndexRepository,
        fusion_service: FusionService,
    ) -> None:
        """Keep references to the repository and fusion service used by queries."""
        self.index_repository = index_repository
        self.fusion_service = fusion_service

    async def get_index_by_id(self, index_id: int) -> Index | None:
        """Look up one index in the repository by its ID."""
        index = await self.index_repository.get(index_id)
        return index

    async def list_indexes(self) -> list[Index]:
        """Return every index the repository knows about."""
        indexes = await self.index_repository.all()
        return indexes

    async def search_snippets(
        self, request: MultiSearchRequest
    ) -> list[SnippetWithContext]:
        """Run a filtered snippet search through the repository.

        Args:
            request: The search request containing filters

        Returns:
            The repository's matches, materialised into a new list.

        """
        matches = await self.index_repository.search(request)
        return list(matches)

    async def perform_fusion(
        self, rankings: list[list[FusionRequest]], k: float = 60
    ) -> list[FusionResult]:
        """Delegate reciprocal rank fusion to the configured fusion service."""
        fused = self.fusion_service.reciprocal_rank_fusion(rankings, k)
        return fused

    async def get_snippets_by_ids(self, ids: list[int]) -> list[SnippetWithContext]:
        """Fetch the snippets with the given IDs from the repository."""
        snippets = await self.index_repository.get_snippets_by_ids(ids)
        return snippets
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""Pure domain service for Index aggregate operations."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import structlog
|
|
7
|
+
from pydantic import AnyUrl
|
|
8
|
+
|
|
9
|
+
import kodit.domain.entities as domain_entities
|
|
10
|
+
from kodit.domain.interfaces import ProgressCallback
|
|
11
|
+
from kodit.domain.services.enrichment_service import EnrichmentDomainService
|
|
12
|
+
from kodit.domain.value_objects import (
|
|
13
|
+
EnrichmentIndexRequest,
|
|
14
|
+
EnrichmentRequest,
|
|
15
|
+
LanguageMapping,
|
|
16
|
+
)
|
|
17
|
+
from kodit.infrastructure.cloning.git.working_copy import GitWorkingCopyProvider
|
|
18
|
+
from kodit.infrastructure.cloning.metadata import FileMetadataExtractor
|
|
19
|
+
from kodit.infrastructure.git.git_utils import is_valid_clone_target
|
|
20
|
+
from kodit.infrastructure.ignore.ignore_pattern_provider import GitIgnorePatternProvider
|
|
21
|
+
from kodit.infrastructure.slicing.slicer import Slicer
|
|
22
|
+
from kodit.reporting import Reporter
|
|
23
|
+
from kodit.utils.path_utils import path_from_uri
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LanguageDetectionService(ABC):
    """Interface that language detection implementations must satisfy."""

    @abstractmethod
    async def detect_language(self, file_path: Path) -> str:
        """Return the programming language detected for ``file_path``.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            The detected language as a string.

        """
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class IndexDomainService:
    """Pure domain service for Index aggregate operations.

    This service handles the full lifecycle of code indexing:
    - Creating indexes for source repositories
    - Cloning and processing source files
    - Extracting and enriching code snippets
    - Managing the complete Index aggregate
    """

    def __init__(
        self,
        language_detector: LanguageDetectionService,
        enrichment_service: EnrichmentDomainService,
        clone_dir: Path,
    ) -> None:
        """Initialize the index domain service.

        Args:
            language_detector: Language detection service, stored on the instance.
            enrichment_service: Service used to generate snippet summaries.
            clone_dir: Directory handed to ``GitWorkingCopyProvider`` for clones.

        """
        self._clone_dir = clone_dir
        self._language_detector = language_detector
        self._enrichment_service = enrichment_service
        self.log = structlog.get_logger(__name__)

    async def prepare_index(
        self,
        uri_or_path_like: str,  # Must include user/pass, etc
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.WorkingCopy:
        """Prepare an index by resolving the source and creating a working copy.

        Args:
            uri_or_path_like: Local directory path or git-clonable URI. For git
                sources it must include any credentials needed for cloning.
            progress_callback: Optional callback passed to the progress reporter.

        Returns:
            A working copy with the sanitized URI, the local path, the detected
            source type and an empty file list.

        Raises:
            ValueError: If the source is neither a local directory nor a valid
                clone target.

        """
        sanitized_uri, source_type = self.sanitize_uri(uri_or_path_like)
        reporter = Reporter(self.log, progress_callback)
        self.log.info("Preparing source", uri=str(sanitized_uri))

        if source_type == domain_entities.SourceType.FOLDER:
            await reporter.start("prepare_index", 1, "Scanning source...")
            local_path = path_from_uri(str(sanitized_uri))
        elif source_type == domain_entities.SourceType.GIT:
            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
            await reporter.start("prepare_index", 1, "Cloning source...")
            # Clone with the caller's original string rather than the sanitized
            # URI: the original is the one that must carry user/pass.
            local_path = await git_working_copy_provider.prepare(uri_or_path_like)
        else:
            raise ValueError(f"Unsupported source: {uri_or_path_like}")

        # Report completion exactly once, whichever branch ran.
        await reporter.done("prepare_index")

        return domain_entities.WorkingCopy(
            remote_uri=sanitized_uri,
            cloned_path=local_path,
            source_type=source_type,
            files=[],
        )

    async def extract_snippets_from_index(
        self,
        index: domain_entities.Index,
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.Index:
        """Extract code snippets from the changed files in the index.

        Existing snippets for the changed files are deleted first, then the
        slicer is run once per distinct language found among those files.

        Args:
            index: The index whose working copy is sliced; mutated in place.
            progress_callback: Optional callback passed to the progress reporter.

        Returns:
            The same ``index`` object with the new snippets appended.

        """
        file_count = len(index.source.working_copy.files)

        self.log.info(
            "Extracting snippets",
            index_id=index.id,
            file_count=file_count,
        )

        # Only create snippets for files that have been added or modified
        files = index.source.working_copy.changed_files()
        index.delete_snippets_for_files(files)

        # Collect each language once. Several extensions can resolve to the
        # same language; slicing that language twice would duplicate snippets.
        languages: list[str] = []
        for ext in {file.extension() for file in files}:
            try:
                language = LanguageMapping.get_language_for_extension(ext)
            except ValueError as e:
                self.log.info("Skipping", error=str(e))
                continue
            if language not in languages:
                languages.append(language)

        total_steps = len(files) * len(languages)
        reporter = Reporter(self.log, progress_callback)
        await reporter.start(
            "extract_snippets",
            total_steps,
            "Extracting code snippets...",
        )
        # Calculate snippets for each language
        slicer = Slicer()
        for i, language in enumerate(languages):
            await reporter.step(
                "extract_snippets",
                len(files) * (i + 1),
                total_steps,
                "Extracting code snippets...",
            )
            index.snippets.extend(slicer.extract_snippets(files, language=language))

        await reporter.done("extract_snippets")
        return index

    async def enrich_snippets_in_index(
        self,
        snippets: list[domain_entities.Snippet],
        progress_callback: ProgressCallback | None = None,
    ) -> list[domain_entities.Snippet]:
        """Enrich snippets with AI-generated summaries.

        Args:
            snippets: Snippets to enrich. Snippets without a truthy ``id`` are
                not sent for enrichment.
            progress_callback: Optional callback passed to the progress reporter.

        Returns:
            The input unchanged when it is empty; otherwise the snippets that
            had an ID, each updated with the summaries produced for it.

        """
        if not snippets:
            return snippets

        # Enrichment results are matched back by ID, so only snippets with an
        # ID can take part.
        snippet_map = {snippet.id: snippet for snippet in snippets if snippet.id}
        # Measure progress against the snippets actually submitted so the
        # reported total is reachable.
        total = len(snippet_map)

        reporter = Reporter(self.log, progress_callback)
        await reporter.start("enrichment", total, "Enriching snippets...")

        enrichment_request = EnrichmentIndexRequest(
            requests=[
                EnrichmentRequest(snippet_id=snippet_id, text=snippet.original_text())
                for snippet_id, snippet in snippet_map.items()
            ]
        )

        processed = 0
        async for result in self._enrichment_service.enrich_documents(
            enrichment_request
        ):
            snippet_map[result.snippet_id].add_summary(result.text)

            processed += 1
            await reporter.step(
                "enrichment", processed, total, "Enriching snippets..."
            )

        await reporter.done("enrichment")
        return list(snippet_map.values())

    def sanitize_uri(
        self, uri_or_path_like: str
    ) -> tuple[AnyUrl, domain_entities.SourceType]:
        """Convert a URI or path-like string to a URI and classify the source.

        Args:
            uri_or_path_like: Local directory path or git-clonable URI.

        Returns:
            The sanitized URI together with the detected source type.

        Raises:
            ValueError: If the input is neither an existing directory nor a
                valid clone target.

        """
        # First, check if it's a local directory (more reliable than git check)
        if Path(uri_or_path_like).is_dir():
            return (
                domain_entities.WorkingCopy.sanitize_local_path(uri_or_path_like),
                domain_entities.SourceType.FOLDER,
            )

        # Then check if it's git-clonable
        if is_valid_clone_target(uri_or_path_like):
            return (
                domain_entities.WorkingCopy.sanitize_git_url(uri_or_path_like),
                domain_entities.SourceType.GIT,
            )

        raise ValueError(f"Unsupported source: {uri_or_path_like}")

    async def refresh_working_copy(
        self,
        working_copy: domain_entities.WorkingCopy,
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.WorkingCopy:
        """Refresh the working copy and flag deleted, new and modified files.

        Git sources are synced first. The files currently on disk are then
        compared with the files recorded on the working copy: vanished files
        are marked ``DELETED``, unseen files are appended, and files whose
        SHA-256 changed are marked ``MODIFIED``.

        Args:
            working_copy: The working copy to refresh; mutated in place.
            progress_callback: Optional callback passed to the progress reporter.

        Returns:
            The same ``working_copy`` object.

        """
        metadata_extractor = FileMetadataExtractor(working_copy.source_type)
        reporter = Reporter(self.log, progress_callback)

        if working_copy.source_type == domain_entities.SourceType.GIT:
            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
            await git_working_copy_provider.sync(str(working_copy.remote_uri))

        # Build each path set once and reuse it for all comparisons.
        current_paths = set(
            working_copy.list_filesystem_paths(
                GitIgnorePatternProvider(working_copy.cloned_path)
            )
        )
        previous_files_map = {file.as_path(): file for file in working_copy.files}
        previous_paths = set(previous_files_map)

        # Calculate different sets of files
        deleted_file_paths = previous_paths - current_paths
        new_file_paths = current_paths - previous_paths
        # Candidates only: paths known before and still present. The hash
        # comparison below decides which of them really changed.
        modified_file_paths = current_paths & previous_paths
        num_files_to_process = (
            len(deleted_file_paths) + len(new_file_paths) + len(modified_file_paths)
        )
        self.log.info(
            "Refreshing working copy",
            num_deleted=len(deleted_file_paths),
            num_new=len(new_file_paths),
            num_modified=len(modified_file_paths),
            num_total_changes=num_files_to_process,
            num_dirty=len(working_copy.dirty_files()),
        )

        # Setup reporter
        processed = 0
        await reporter.start(
            "refresh_working_copy", num_files_to_process, "Refreshing working copy..."
        )

        # First check to see if any files have been deleted
        for file_path in deleted_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"Deleted {file_path.name}",
            )
            previous_files_map[
                file_path
            ].file_processing_status = domain_entities.FileProcessingStatus.DELETED

        # Then check to see if there are any new files
        for file_path in new_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"New {file_path.name}",
            )
            try:
                working_copy.files.append(
                    await metadata_extractor.extract(file_path=file_path)
                )
            except (OSError, ValueError) as e:
                self.log.debug("Skipping file", file=str(file_path), error=str(e))
                continue

        # Finally check if there are any modified files
        for file_path in modified_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"Modified {file_path.name}",
            )
            try:
                previous_file = previous_files_map[file_path]
                new_file = await metadata_extractor.extract(file_path=file_path)
                if previous_file.sha256 != new_file.sha256:
                    previous_file.file_processing_status = (
                        domain_entities.FileProcessingStatus.MODIFIED
                    )
            except (OSError, ValueError) as e:
                self.log.info("Skipping file", file=str(file_path), error=str(e))
                continue

        return working_copy
|
kodit/domain/value_objects.py
CHANGED
|
@@ -1,41 +1,137 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Pure domain value objects and DTOs."""
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
from dataclasses import dataclass
|
|
5
5
|
from datetime import datetime
|
|
6
|
-
from enum import Enum
|
|
6
|
+
from enum import Enum, IntEnum
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import
|
|
8
|
+
from typing import ClassVar
|
|
9
9
|
|
|
10
|
-
from
|
|
11
|
-
from sqlalchemy.orm import Mapped, mapped_column
|
|
10
|
+
from pydantic import BaseModel
|
|
12
11
|
|
|
13
|
-
from kodit.domain.entities import Author, Base, File, Snippet, Source
|
|
14
|
-
from kodit.domain.enums import SnippetExtractionStrategy
|
|
15
12
|
|
|
13
|
+
class SourceType(IntEnum):
    """Integer-valued enumeration of source kinds."""

    # Explicit integer values; members compare equal to these ints.
    UNKNOWN = 0
    FOLDER = 1
    GIT = 2
|
|
19
19
|
|
|
20
|
-
BM25 = "bm25"
|
|
21
|
-
VECTOR = "vector"
|
|
22
|
-
HYBRID = "hybrid"
|
|
23
20
|
|
|
21
|
+
class SnippetContentType(IntEnum):
    """Integer-valued enumeration of snippet content kinds."""

    # Explicit integer values; members compare equal to these ints.
    UNKNOWN = 0
    ORIGINAL = 1
    SUMMARY = 2
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class SnippetContent(BaseModel):
    """Domain value object holding one piece of snippet content."""

    # Which kind of content this is (see SnippetContentType).
    type: SnippetContentType
    # The content text itself.
    value: str
    # Language label for the content.
    language: str
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class SnippetSearchResult(BaseModel):
    """Domain result object returned for a single snippet search hit."""

    # Required fields.
    snippet_id: int
    content: str
    summary: str
    score: float
    file_path: Path

    # Optional fields with defaults.
    language: str | None = None
    authors: list[str] = []
|
|
31
47
|
|
|
32
48
|
|
|
33
|
-
@dataclass
|
|
34
|
-
class
|
|
35
|
-
"""
|
|
49
|
+
@dataclass(frozen=True)
class LanguageExtensions:
    """Value object for language to file extension mappings."""

    language: str
    extensions: list[str]

    # Single source of truth for both the supported-language list and the
    # extension lookup, so the two cannot drift apart. Annotated as ClassVar
    # so the dataclass machinery does not treat it as an instance field.
    # Insertion order defines the order of get_supported_languages().
    _EXTENSIONS_BY_LANGUAGE: ClassVar[dict[str, tuple[str, ...]]] = {
        "python": (".py", ".pyw", ".pyi"),
        "javascript": (".js", ".jsx", ".mjs"),
        "typescript": (".ts", ".tsx"),
        "java": (".java",),
        "c": (".c", ".h"),
        "cpp": (".cpp", ".cc", ".cxx", ".hpp", ".hxx"),
        "csharp": (".cs",),
        "go": (".go",),
        "rust": (".rs",),
        "php": (".php",),
        "ruby": (".rb",),
        "swift": (".swift",),
        "kotlin": (".kt", ".kts"),
        "scala": (".scala", ".sc"),
        "r": (".r", ".R"),
        "sql": (".sql",),
        "html": (".html", ".htm"),
        "css": (".css", ".scss", ".sass", ".less"),
        "json": (".json",),
        "yaml": (".yaml", ".yml"),
        "xml": (".xml",),
        "markdown": (".md", ".markdown"),
        "shell": (".sh", ".bash", ".zsh", ".fish"),
    }

    @classmethod
    def get_supported_languages(cls) -> list[str]:
        """Get all supported programming languages.

        Returns:
            A new list of lowercase language names.

        """
        return list(cls._EXTENSIONS_BY_LANGUAGE)

    @classmethod
    def get_extensions_for_language(cls, language: str) -> list[str]:
        """Get file extensions for a given language.

        Args:
            language: Language name; matched case-insensitively.

        Returns:
            A new list of extensions (each with a leading dot), or an empty
            list when the language is not supported.

        """
        # Copy into a fresh list so callers cannot mutate the shared table.
        return list(cls._EXTENSIONS_BY_LANGUAGE.get(language.lower(), ()))

    @classmethod
    def is_supported_language(cls, language: str) -> bool:
        """Check if a language is supported (case-insensitive)."""
        return language.lower() in cls._EXTENSIONS_BY_LANGUAGE

    @classmethod
    def get_extensions_or_fallback(cls, language: str) -> list[str]:
        """Get extensions for language or return language as extension if not found.

        Args:
            language: Language name; matched case-insensitively.

        Returns:
            The known extensions for a supported language, otherwise a
            one-element list containing the lowercased input.

        """
        language_lower = language.lower()
        if cls.is_supported_language(language_lower):
            return cls.get_extensions_for_language(language_lower)
        return [language_lower]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class SearchType(Enum):
    """Enumeration of the kinds of search that can be performed."""

    # Members carry lowercase string values.
    BM25 = "bm25"
    VECTOR = "vector"
    HYBRID = "hybrid"
|
|
39
135
|
|
|
40
136
|
|
|
41
137
|
@dataclass
|
|
@@ -101,6 +197,7 @@ class SnippetSearchFilters:
|
|
|
101
197
|
created_after: datetime | None = None
|
|
102
198
|
created_before: datetime | None = None
|
|
103
199
|
source_repo: str | None = None
|
|
200
|
+
file_path: str | None = None
|
|
104
201
|
|
|
105
202
|
@classmethod
|
|
106
203
|
def from_cli_params(
|
|
@@ -357,16 +454,6 @@ class IndexView:
|
|
|
357
454
|
source: str | None = None
|
|
358
455
|
|
|
359
456
|
|
|
360
|
-
@dataclass
|
|
361
|
-
class SnippetWithContext:
|
|
362
|
-
"""Domain model for snippet with associated context information."""
|
|
363
|
-
|
|
364
|
-
source: Source
|
|
365
|
-
file: File
|
|
366
|
-
authors: list[Author]
|
|
367
|
-
snippet: Snippet
|
|
368
|
-
|
|
369
|
-
|
|
370
457
|
class LanguageMapping:
|
|
371
458
|
"""Value object for language-to-extension mappings.
|
|
372
459
|
|
|
@@ -536,38 +623,29 @@ class LanguageMapping:
|
|
|
536
623
|
return [language_lower]
|
|
537
624
|
|
|
538
625
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
class
|
|
559
|
-
"""
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
document_metadata: Mapped[dict[str, Any] | None] = mapped_column(
|
|
566
|
-
JSON, nullable=True
|
|
567
|
-
)
|
|
568
|
-
created_at: Mapped[datetime] = mapped_column(
|
|
569
|
-
DateTime(timezone=True), nullable=False
|
|
570
|
-
)
|
|
571
|
-
updated_at: Mapped[datetime] = mapped_column(
|
|
572
|
-
DateTime(timezone=True), nullable=False
|
|
573
|
-
)
|
|
626
|
+
class SnippetQuery(BaseModel):
    """Domain query object describing one snippet search."""

    # The query text (required).
    text: str
    # Search strategy; hybrid unless stated otherwise.
    search_type: SearchType = SearchType.HYBRID
    # Filters applied to the search; defaults to an unconfigured filter set.
    filters: SnippetSearchFilters = SnippetSearchFilters()
    # Result limit; defaults to 10.
    top_k: int = 10
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
class FileProcessingStatus(IntEnum):
    """Integer-valued enumeration of file processing states."""

    # Explicit integer values; members compare equal to these ints.
    CLEAN = 0
    ADDED = 1
    MODIFIED = 2
    DELETED = 3
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
@dataclass
class FunctionDefinition:
    """Cached record describing one function definition."""

    # Function name and its qualified form.
    name: str
    qualified_name: str
    # Start and end positions of the definition.
    # NOTE(review): presumably byte offsets into the source text - confirm
    # against the code that creates these records.
    start_byte: int
    end_byte: int
|
|
@@ -6,7 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
import git
|
|
7
7
|
import structlog
|
|
8
8
|
|
|
9
|
-
from kodit.
|
|
9
|
+
from kodit.domain.entities import WorkingCopy
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class GitWorkingCopyProvider:
|
|
@@ -17,15 +17,17 @@ class GitWorkingCopyProvider:
|
|
|
17
17
|
self.clone_dir = clone_dir
|
|
18
18
|
self.log = structlog.get_logger(__name__)
|
|
19
19
|
|
|
20
|
+
def get_clone_path(self, uri: str) -> Path:
    """Return the directory under ``clone_dir`` used for the repository at ``uri``.

    The directory name is ``repo-`` followed by the first 16 hex characters
    of the SHA-256 digest of the sanitized URI, so the same sanitized URI
    always maps to the same directory.
    """
    sanitized = str(WorkingCopy.sanitize_git_url(uri))
    digest = hashlib.sha256(sanitized.encode("utf-8")).hexdigest()
    return self.clone_dir / f"repo-{digest[:16]}"
|
|
26
|
+
|
|
20
27
|
async def prepare(self, uri: str) -> Path:
|
|
21
28
|
"""Prepare a Git working copy."""
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
# Use a repeatable, short sha256 hash of the sanitized URI for the directory
|
|
26
|
-
dir_hash = hashlib.sha256(sanitized_uri.encode("utf-8")).hexdigest()[:16]
|
|
27
|
-
dir_name = f"repo-{dir_hash}"
|
|
28
|
-
clone_path = self.clone_dir / dir_name
|
|
29
|
+
sanitized_uri = WorkingCopy.sanitize_git_url(uri)
|
|
30
|
+
clone_path = self.get_clone_path(uri)
|
|
29
31
|
clone_path.mkdir(parents=True, exist_ok=True)
|
|
30
32
|
|
|
31
33
|
try:
|
|
@@ -41,3 +43,10 @@ class GitWorkingCopyProvider:
|
|
|
41
43
|
self.log.info("Repository already exists, reusing...", uri=sanitized_uri)
|
|
42
44
|
|
|
43
45
|
return clone_path
|
|
46
|
+
|
|
47
|
+
async def sync(self, uri: str) -> Path:
    """Pull from ``origin`` into the existing clone for ``uri``.

    Returns:
        The path of the clone that was refreshed.

    """
    target = self.get_clone_path(uri)
    # Open the existing clone and pull from its configured origin remote.
    git.Repo(target).remotes.origin.pull()
    return target
|