kodit-0.3.1-py3-none-any.whl → kodit-0.3.3-py3-none-any.whl
This diff compares the contents of two publicly released versions of the kodit package, as published to its public registry.
- kodit/_version.py +2 -2
- kodit/application/factories/code_indexing_factory.py +77 -28
- kodit/application/services/code_indexing_application_service.py +148 -119
- kodit/cli.py +49 -52
- kodit/domain/entities.py +268 -189
- kodit/domain/protocols.py +61 -0
- kodit/domain/services/embedding_service.py +1 -1
- kodit/domain/services/index_query_service.py +66 -0
- kodit/domain/services/index_service.py +323 -0
- kodit/domain/value_objects.py +225 -92
- kodit/infrastructure/cloning/git/working_copy.py +17 -8
- kodit/infrastructure/cloning/metadata.py +37 -67
- kodit/infrastructure/embedding/embedding_factory.py +1 -1
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
- kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
- kodit/infrastructure/git/git_utils.py +1 -63
- kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
- kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/__init__.py +1 -0
- kodit/infrastructure/mappers/index_mapper.py +344 -0
- kodit/infrastructure/snippet_extraction/factories.py +13 -0
- kodit/infrastructure/snippet_extraction/language_detection_service.py +1 -1
- kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -1
- kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +1 -1
- kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/entities.py +203 -0
- kodit/infrastructure/sqlalchemy/file_repository.py +1 -1
- kodit/infrastructure/sqlalchemy/index_repository.py +550 -0
- kodit/log.py +4 -1
- kodit/mcp.py +1 -13
- kodit/migrations/env.py +1 -1
- kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +34 -0
- kodit/migrations/versions/4552eb3f23ce_add_summary.py +34 -0
- kodit/utils/__init__.py +1 -0
- kodit/utils/path_utils.py +54 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/METADATA +1 -1
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/RECORD +42 -45
- kodit/domain/enums.py +0 -9
- kodit/domain/repositories.py +0 -128
- kodit/domain/services/ignore_service.py +0 -45
- kodit/domain/services/indexing_service.py +0 -204
- kodit/domain/services/snippet_extraction_service.py +0 -89
- kodit/domain/services/snippet_service.py +0 -211
- kodit/domain/services/source_service.py +0 -85
- kodit/infrastructure/cloning/folder/__init__.py +0 -1
- kodit/infrastructure/cloning/folder/factory.py +0 -128
- kodit/infrastructure/cloning/folder/working_copy.py +0 -38
- kodit/infrastructure/cloning/git/factory.py +0 -153
- kodit/infrastructure/indexing/index_repository.py +0 -273
- kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
- kodit/infrastructure/sqlalchemy/repository.py +0 -133
- kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -251
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/WHEEL +0 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/entry_points.txt +0 -0
- {kodit-0.3.1.dist-info → kodit-0.3.3.dist-info}/licenses/LICENSE +0 -0
kodit/domain/services/index_service.py (ADDED)

@@ -0,0 +1,323 @@
+"""Pure domain service for Index aggregate operations."""
+
+from abc import ABC, abstractmethod
+from collections.abc import Mapping
+from pathlib import Path
+
+import structlog
+from pydantic import AnyUrl
+
+import kodit.domain.entities as domain_entities
+from kodit.domain.interfaces import ProgressCallback
+from kodit.domain.services.enrichment_service import EnrichmentDomainService
+from kodit.domain.value_objects import (
+    EnrichmentIndexRequest,
+    EnrichmentRequest,
+    SnippetExtractionRequest,
+    SnippetExtractionResult,
+    SnippetExtractionStrategy,
+)
+from kodit.infrastructure.cloning.git.working_copy import GitWorkingCopyProvider
+from kodit.infrastructure.cloning.metadata import FileMetadataExtractor
+from kodit.infrastructure.git.git_utils import is_valid_clone_target
+from kodit.infrastructure.ignore.ignore_pattern_provider import GitIgnorePatternProvider
+from kodit.reporting import Reporter
+from kodit.utils.path_utils import path_from_uri
+
+
+class LanguageDetectionService(ABC):
+    """Abstract interface for language detection service."""
+
+    @abstractmethod
+    async def detect_language(self, file_path: Path) -> str:
+        """Detect the programming language of a file."""
+
+
+class SnippetExtractor(ABC):
+    """Abstract interface for snippet extraction."""
+
+    @abstractmethod
+    async def extract(self, file_path: Path, language: str) -> list[str]:
+        """Extract snippets from a file."""
+
+
+class IndexDomainService:
+    """Pure domain service for Index aggregate operations.
+
+    This service handles the full lifecycle of code indexing:
+    - Creating indexes for source repositories
+    - Cloning and processing source files
+    - Extracting and enriching code snippets
+    - Managing the complete Index aggregate
+    """
+
+    def __init__(
+        self,
+        language_detector: LanguageDetectionService,
+        snippet_extractors: Mapping[SnippetExtractionStrategy, SnippetExtractor],
+        enrichment_service: EnrichmentDomainService,
+        clone_dir: Path,
+    ) -> None:
+        """Initialize the index domain service."""
+        self._clone_dir = clone_dir
+        self._language_detector = language_detector
+        self._snippet_extractors = snippet_extractors
+        self._enrichment_service = enrichment_service
+        self.log = structlog.get_logger(__name__)
+
+    async def prepare_index(
+        self,
+        uri_or_path_like: str,  # Must include user/pass, etc
+        progress_callback: ProgressCallback | None = None,
+    ) -> domain_entities.WorkingCopy:
+        """Prepare an index by scanning files and creating working copy."""
+        sanitized_uri, source_type = self.sanitize_uri(uri_or_path_like)
+        reporter = Reporter(self.log, progress_callback)
+        self.log.info("Preparing source", uri=str(sanitized_uri))
+
+        if source_type == domain_entities.SourceType.FOLDER:
+            await reporter.start("prepare_index", 1, "Scanning source...")
+            local_path = path_from_uri(str(sanitized_uri))
+        elif source_type == domain_entities.SourceType.GIT:
+            source_type = domain_entities.SourceType.GIT
+            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
+            await reporter.start("prepare_index", 1, "Cloning source...")
+            local_path = await git_working_copy_provider.prepare(uri_or_path_like)
+            await reporter.done("prepare_index")
+        else:
+            raise ValueError(f"Unsupported source: {uri_or_path_like}")
+
+        await reporter.done("prepare_index")
+
+        return domain_entities.WorkingCopy(
+            remote_uri=sanitized_uri,
+            cloned_path=local_path,
+            source_type=source_type,
+            files=[],
+        )
+
+    async def extract_snippets_from_index(
+        self,
+        index: domain_entities.Index,
+        strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED,
+        progress_callback: ProgressCallback | None = None,
+    ) -> domain_entities.Index:
+        """Extract code snippets from files in the index."""
+        file_count = len(index.source.working_copy.files)
+
+        self.log.info(
+            "Extracting snippets",
+            index_id=index.id,
+            file_count=file_count,
+            strategy=strategy.value,
+        )
+
+        # Only create snippets for files that have been added or modified
+        files = index.source.working_copy.changed_files()
+        index.delete_snippets_for_files(files)
+
+        reporter = Reporter(self.log, progress_callback)
+        await reporter.start(
+            "extract_snippets", len(files), "Extracting code snippets..."
+        )
+
+        new_snippets = []
+        for i, domain_file in enumerate(files, 1):
+            try:
+                # Extract snippets from file
+                request = SnippetExtractionRequest(
+                    file_path=domain_file.as_path(), strategy=strategy
+                )
+                result = await self._extract_snippets(request)
+                for snippet_text in result.snippets:
+                    snippet = domain_entities.Snippet(
+                        derives_from=[domain_file],
+                    )
+                    snippet.add_original_content(snippet_text, result.language)
+                    new_snippets.append(snippet)
+
+            except (OSError, ValueError) as e:
+                self.log.debug(
+                    "Skipping file for snippet extraction",
+                    file_uri=str(domain_file.uri),
+                    error=str(e),
+                )
+                continue
+
+            await reporter.step(
+                "extract_snippets", i, len(files), f"Processed {domain_file.uri.path}"
+            )
+
+        index.snippets.extend(new_snippets)
+        await reporter.done("extract_snippets")
+        return index
+
+    async def enrich_snippets_in_index(
+        self,
+        snippets: list[domain_entities.Snippet],
+        progress_callback: ProgressCallback | None = None,
+    ) -> list[domain_entities.Snippet]:
+        """Enrich snippets with AI-generated summaries."""
+        if not snippets or len(snippets) == 0:
+            return snippets
+
+        reporter = Reporter(self.log, progress_callback)
+        await reporter.start("enrichment", len(snippets), "Enriching snippets...")
+
+        snippet_map = {snippet.id: snippet for snippet in snippets if snippet.id}
+
+        enrichment_request = EnrichmentIndexRequest(
+            requests=[
+                EnrichmentRequest(snippet_id=snippet_id, text=snippet.original_text())
+                for snippet_id, snippet in snippet_map.items()
+            ]
+        )
+
+        processed = 0
+        async for result in self._enrichment_service.enrich_documents(
+            enrichment_request
+        ):
+            snippet_map[result.snippet_id].add_summary(result.text)
+
+            processed += 1
+            await reporter.step(
+                "enrichment", processed, len(snippets), "Enriching snippets..."
+            )
+
+        await reporter.done("enrichment")
+        return list(snippet_map.values())
+
+    async def _extract_snippets(
+        self, request: SnippetExtractionRequest
+    ) -> SnippetExtractionResult:
+        # Domain logic: validate file exists
+        if not request.file_path.exists():
+            raise ValueError(f"File does not exist: {request.file_path}")
+
+        # Domain logic: detect language
+        language = await self._language_detector.detect_language(request.file_path)
+
+        # Domain logic: choose strategy and extractor
+        if request.strategy not in self._snippet_extractors:
+            raise ValueError(f"Unsupported extraction strategy: {request.strategy}")
+
+        extractor = self._snippet_extractors[request.strategy]
+        snippets = await extractor.extract(request.file_path, language)
+
+        # Domain logic: filter out empty snippets
+        filtered_snippets = [snippet for snippet in snippets if snippet.strip()]
+
+        return SnippetExtractionResult(snippets=filtered_snippets, language=language)
+
+    def sanitize_uri(
+        self, uri_or_path_like: str
+    ) -> tuple[AnyUrl, domain_entities.SourceType]:
+        """Convert a URI or path-like string to a URI."""
+        # First, check if it's a local directory (more reliable than git check)
+        if Path(uri_or_path_like).is_dir():
+            return (
+                domain_entities.WorkingCopy.sanitize_local_path(uri_or_path_like),
+                domain_entities.SourceType.FOLDER,
+            )
+
+        # Then check if it's git-clonable
+        if is_valid_clone_target(uri_or_path_like):
+            return (
+                domain_entities.WorkingCopy.sanitize_git_url(uri_or_path_like),
+                domain_entities.SourceType.GIT,
+            )
+
+        raise ValueError(f"Unsupported source: {uri_or_path_like}")
+
+    async def refresh_working_copy(
+        self,
+        working_copy: domain_entities.WorkingCopy,
+        progress_callback: ProgressCallback | None = None,
+    ) -> domain_entities.WorkingCopy:
+        """Refresh the working copy."""
+        metadata_extractor = FileMetadataExtractor(working_copy.source_type)
+        reporter = Reporter(self.log, progress_callback)
+
+        if working_copy.source_type == domain_entities.SourceType.GIT:
+            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
+            await git_working_copy_provider.sync(str(working_copy.remote_uri))
+
+        current_file_paths = working_copy.list_filesystem_paths(
+            GitIgnorePatternProvider(working_copy.cloned_path)
+        )
+
+        previous_files_map = {file.as_path(): file for file in working_copy.files}
+
+        # Calculate different sets of files
+        deleted_file_paths = set(previous_files_map.keys()) - set(current_file_paths)
+        new_file_paths = set(current_file_paths) - set(previous_files_map.keys())
+        modified_file_paths = set(current_file_paths) & set(previous_files_map.keys())
+        num_files_to_process = (
+            len(deleted_file_paths) + len(new_file_paths) + len(modified_file_paths)
+        )
+        self.log.info(
+            "Refreshing working copy",
+            num_deleted=len(deleted_file_paths),
+            num_new=len(new_file_paths),
+            num_modified=len(modified_file_paths),
+            num_total_changes=num_files_to_process,
+            num_dirty=len(working_copy.dirty_files()),
+        )
+
+        # Setup reporter
+        processed = 0
+        await reporter.start(
+            "refresh_working_copy", num_files_to_process, "Refreshing working copy..."
+        )
+
+        # First check to see if any files have been deleted
+        for file_path in deleted_file_paths:
+            processed += 1
+            await reporter.step(
+                "refresh_working_copy",
+                processed,
+                num_files_to_process,
+                f"Deleted {file_path.name}",
+            )
+            previous_files_map[
+                file_path
+            ].file_processing_status = domain_entities.FileProcessingStatus.DELETED
+
+        # Then check to see if there are any new files
+        for file_path in new_file_paths:
+            processed += 1
+            await reporter.step(
+                "refresh_working_copy",
+                processed,
+                num_files_to_process,
+                f"New {file_path.name}",
+            )
+            try:
+                working_copy.files.append(
+                    await metadata_extractor.extract(file_path=file_path)
+                )
+            except (OSError, ValueError) as e:
+                self.log.info("Skipping file", file=str(file_path), error=str(e))
+                continue
+
+        # Finally check if there are any modified files
+        for file_path in modified_file_paths:
+            processed += 1
+            await reporter.step(
+                "refresh_working_copy",
+                processed,
+                num_files_to_process,
+                f"Modified {file_path.name}",
+            )
+            try:
+                previous_file = previous_files_map[file_path]
+                new_file = await metadata_extractor.extract(file_path=file_path)
+                if previous_file.sha256 != new_file.sha256:
+                    previous_file.file_processing_status = (
+                        domain_entities.FileProcessingStatus.MODIFIED
+                    )
+            except (OSError, ValueError) as e:
+                self.log.info("Skipping file", file=str(file_path), error=str(e))
+                continue
+
+        return working_copy
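The refresh pass above classifies files with plain set arithmetic over the previously indexed paths and the paths currently on disk, and only treats an intersecting path as modified when its sha256 digest changes. A minimal standalone sketch of that classification, with invented paths and hashes (not code from the package):

from pathlib import Path

# path -> sha256 digest, as recorded at the last index and as found on disk now
previous = {Path("a.py"): "sha-a", Path("b.py"): "sha-b"}
current = {Path("b.py"): "sha-b2", Path("c.py"): "sha-c"}

deleted = previous.keys() - current.keys()     # a.py -> FileProcessingStatus.DELETED
added = current.keys() - previous.keys()       # c.py -> new File entity via metadata extraction
candidates = previous.keys() & current.keys()  # b.py -> MODIFIED only if the digest changed
modified = {path for path in candidates if previous[path] != current[path]}

assert deleted == {Path("a.py")}
assert added == {Path("c.py")}
assert modified == {Path("b.py")}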
kodit/domain/value_objects.py (CHANGED)

@@ -1,16 +1,129 @@
-"""
+"""Pure domain value objects and DTOs."""

+import json
 from dataclasses import dataclass
 from datetime import datetime
-from enum import Enum
+from enum import Enum, IntEnum
 from pathlib import Path
-from typing import
+from typing import ClassVar

-from
-from sqlalchemy.orm import Mapped, mapped_column
+from pydantic import BaseModel

-
-
+
+class SourceType(IntEnum):
+    """The type of source."""
+
+    UNKNOWN = 0
+    FOLDER = 1
+    GIT = 2
+
+
+class SnippetContentType(IntEnum):
+    """Type of snippet content."""
+
+    UNKNOWN = 0
+    ORIGINAL = 1
+    SUMMARY = 2
+
+
+class SnippetContent(BaseModel):
+    """Snippet content domain value object."""
+
+    type: SnippetContentType
+    value: str
+    language: str
+
+
+class SnippetSearchResult(BaseModel):
+    """Domain result object for snippet searches."""
+
+    snippet_id: int
+    content: str
+    summary: str
+    score: float
+    file_path: Path
+    language: str | None = None
+    authors: list[str] = []
+
+
+@dataclass(frozen=True)
+class LanguageExtensions:
+    """Value object for language to file extension mappings."""
+
+    language: str
+    extensions: list[str]
+
+    @classmethod
+    def get_supported_languages(cls) -> list[str]:
+        """Get all supported programming languages."""
+        return [
+            "python",
+            "javascript",
+            "typescript",
+            "java",
+            "c",
+            "cpp",
+            "csharp",
+            "go",
+            "rust",
+            "php",
+            "ruby",
+            "swift",
+            "kotlin",
+            "scala",
+            "r",
+            "sql",
+            "html",
+            "css",
+            "json",
+            "yaml",
+            "xml",
+            "markdown",
+            "shell",
+        ]
+
+    @classmethod
+    def get_extensions_for_language(cls, language: str) -> list[str]:
+        """Get file extensions for a given language."""
+        language_map = {
+            "python": [".py", ".pyw", ".pyi"],
+            "javascript": [".js", ".jsx", ".mjs"],
+            "typescript": [".ts", ".tsx"],
+            "java": [".java"],
+            "c": [".c", ".h"],
+            "cpp": [".cpp", ".cc", ".cxx", ".hpp", ".hxx"],
+            "csharp": [".cs"],
+            "go": [".go"],
+            "rust": [".rs"],
+            "php": [".php"],
+            "ruby": [".rb"],
+            "swift": [".swift"],
+            "kotlin": [".kt", ".kts"],
+            "scala": [".scala", ".sc"],
+            "r": [".r", ".R"],
+            "sql": [".sql"],
+            "html": [".html", ".htm"],
+            "css": [".css", ".scss", ".sass", ".less"],
+            "json": [".json"],
+            "yaml": [".yaml", ".yml"],
+            "xml": [".xml"],
+            "markdown": [".md", ".markdown"],
+            "shell": [".sh", ".bash", ".zsh", ".fish"],
+        }
+        return language_map.get(language.lower(), [])
+
+    @classmethod
+    def is_supported_language(cls, language: str) -> bool:
+        """Check if a language is supported."""
+        return language.lower() in cls.get_supported_languages()
+
+    @classmethod
+    def get_extensions_or_fallback(cls, language: str) -> list[str]:
+        """Get extensions for language or return language as extension if not found."""
+        language_lower = language.lower()
+        if cls.is_supported_language(language_lower):
+            return cls.get_extensions_for_language(language_lower)
+        return [language_lower]


 class SearchType(Enum):

@@ -21,14 +134,6 @@ class SearchType(Enum):
     HYBRID = "hybrid"


-@dataclass
-class SnippetExtractionRequest:
-    """Domain model for snippet extraction request."""
-
-    file_path: Path
-    strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED
-
-
 @dataclass
 class SnippetExtractionResult:
     """Domain model for snippet extraction result."""

@@ -100,6 +205,7 @@ class SnippetSearchFilters:
     created_after: datetime | None = None
     created_before: datetime | None = None
     source_repo: str | None = None
+    file_path: str | None = None

     @classmethod
     def from_cli_params(

@@ -175,25 +281,90 @@ class MultiSearchRequest:

 @dataclass
 class MultiSearchResult:
-    """
+    """Enhanced search result with comprehensive snippet metadata."""

     id: int
-    uri: str
     content: str
     original_scores: list[float]
+    source_uri: str
+    relative_path: str
+    language: str
+    authors: list[str]
+    created_at: datetime
+    summary: str

     def __str__(self) -> str:
-        """Return formatted string representation
+        """Return enhanced formatted string representation."""
         lines = [
-            "
-            f"
-            f"
-            self.
-            "
-            "",
+            "---",
+            f"id: {self.id}",
+            f"source: {self.source_uri}",
+            f"path: {self.relative_path}",
+            f"lang: {self.language}",
+            f"created: {self.created_at.isoformat()}",
+            f"authors: {', '.join(self.authors)}",
+            f"scores: {self.original_scores}",
+            "---",
+            f"{self.summary}\n",
+            f"```{self.language}",
+            f"{self.content}",
+            "```\n",
         ]
         return "\n".join(lines)

+    def to_json(self) -> str:
+        """Return LLM-optimized JSON representation following the compact schema."""
+        json_obj = {
+            "id": self.id,
+            "source": self.source_uri,
+            "path": self.relative_path,
+            "lang": self.language.lower(),
+            "created": self.created_at.isoformat() if self.created_at else "",
+            "author": ", ".join(self.authors),
+            "score": self.original_scores,
+            "code": self.content,
+            "summary": self.summary,
+        }
+
+        return json.dumps(json_obj, separators=(",", ":"))
+
+    @classmethod
+    def to_jsonlines(cls, results: list["MultiSearchResult"]) -> str:
+        """Convert multiple MultiSearchResult objects to JSON Lines format.
+
+        Args:
+            results: List of MultiSearchResult objects
+            include_summary: Whether to include summary fields
+
+        Returns:
+            JSON Lines string (one JSON object per line)
+
+        """
+        return "\n".join(result.to_json() for result in results)
+
+    @classmethod
+    def to_string(cls, results: list["MultiSearchResult"]) -> str:
+        """Convert multiple MultiSearchResult objects to a string."""
+        return "\n\n".join(str(result) for result in results)
+
+    @staticmethod
+    def calculate_relative_path(file_path: str, source_path: str) -> str:
+        """Calculate relative path from source root."""
+        try:
+            return str(Path(file_path).relative_to(Path(source_path)))
+        except ValueError:
+            # If file_path is not relative to source_path, return the file name
+            return Path(file_path).name
+
+    @staticmethod
+    def detect_language_from_extension(extension: str) -> str:
+        """Detect programming language from file extension."""
+        try:
+            return LanguageMapping.get_language_for_extension(extension).title()
+        except ValueError:
+            # Unknown extension, return a default
+            return "Unknown"
+

 @dataclass
 class FusionRequest:

@@ -291,39 +462,6 @@ class IndexView:
     source: str | None = None


-@dataclass
-class SnippetListItem:
-    """Domain model for snippet list item with file information."""
-
-    id: int
-    file_path: str
-    content: str
-    source_uri: str
-
-
-@dataclass
-class FileInfo:
-    """Domain model for file information."""
-
-    uri: str
-
-
-@dataclass
-class SnippetInfo:
-    """Domain model for snippet information."""
-
-    id: int
-    content: str
-
-
-@dataclass
-class SnippetWithFile:
-    """Domain model for snippet with associated file information."""
-
-    file: FileInfo
-    snippet: SnippetInfo
-
-
 class LanguageMapping:
     """Value object for language-to-extension mappings.

@@ -493,38 +631,33 @@ class LanguageMapping:
         return [language_lower]


-[... 30 removed lines not preserved in this rendering ...]
-            DateTime(timezone=True), nullable=False
-        )
-    updated_at: Mapped[datetime] = mapped_column(
-        DateTime(timezone=True), nullable=False
-    )
+class SnippetQuery(BaseModel):
+    """Domain query object for snippet searches."""
+
+    text: str
+    search_type: SearchType = SearchType.HYBRID
+    filters: SnippetSearchFilters = SnippetSearchFilters()
+    top_k: int = 10
+
+
+class SnippetExtractionStrategy(str, Enum):
+    """Different strategies for extracting snippets from files."""
+
+    METHOD_BASED = "method_based"
+
+
+@dataclass
+class SnippetExtractionRequest:
+    """Domain model for snippet extraction request."""
+
+    file_path: Path
+    strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED
+
+
+class FileProcessingStatus(IntEnum):
+    """File processing status."""
+
+    CLEAN = 0
+    ADDED = 1
+    MODIFIED = 2
+    DELETED = 3