kodit 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic; see the registry's advisory page for more details.

Files changed (55)
  1. kodit/_version.py +2 -2
  2. kodit/application/factories/code_indexing_factory.py +77 -28
  3. kodit/application/services/code_indexing_application_service.py +142 -116
  4. kodit/cli.py +14 -41
  5. kodit/domain/entities.py +268 -197
  6. kodit/domain/protocols.py +61 -0
  7. kodit/domain/services/embedding_service.py +1 -1
  8. kodit/domain/services/index_query_service.py +66 -0
  9. kodit/domain/services/index_service.py +323 -0
  10. kodit/domain/value_objects.py +150 -60
  11. kodit/infrastructure/cloning/git/working_copy.py +17 -8
  12. kodit/infrastructure/cloning/metadata.py +37 -67
  13. kodit/infrastructure/embedding/embedding_factory.py +1 -1
  14. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  15. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
  16. kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
  17. kodit/infrastructure/git/git_utils.py +1 -63
  18. kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
  19. kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
  20. kodit/infrastructure/indexing/fusion_service.py +1 -1
  21. kodit/infrastructure/mappers/__init__.py +1 -0
  22. kodit/infrastructure/mappers/index_mapper.py +344 -0
  23. kodit/infrastructure/snippet_extraction/factories.py +13 -0
  24. kodit/infrastructure/snippet_extraction/language_detection_service.py +1 -1
  25. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -1
  26. kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +1 -1
  27. kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
  28. kodit/infrastructure/sqlalchemy/entities.py +203 -0
  29. kodit/infrastructure/sqlalchemy/file_repository.py +1 -1
  30. kodit/infrastructure/sqlalchemy/index_repository.py +550 -0
  31. kodit/mcp.py +0 -7
  32. kodit/migrations/env.py +1 -1
  33. kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +34 -0
  34. kodit/utils/__init__.py +1 -0
  35. kodit/utils/path_utils.py +54 -0
  36. {kodit-0.3.2.dist-info → kodit-0.3.3.dist-info}/METADATA +1 -1
  37. {kodit-0.3.2.dist-info → kodit-0.3.3.dist-info}/RECORD +40 -44
  38. kodit/domain/enums.py +0 -9
  39. kodit/domain/repositories.py +0 -128
  40. kodit/domain/services/ignore_service.py +0 -45
  41. kodit/domain/services/indexing_service.py +0 -204
  42. kodit/domain/services/snippet_extraction_service.py +0 -89
  43. kodit/domain/services/snippet_service.py +0 -215
  44. kodit/domain/services/source_service.py +0 -85
  45. kodit/infrastructure/cloning/folder/__init__.py +0 -1
  46. kodit/infrastructure/cloning/folder/factory.py +0 -128
  47. kodit/infrastructure/cloning/folder/working_copy.py +0 -38
  48. kodit/infrastructure/cloning/git/factory.py +0 -153
  49. kodit/infrastructure/indexing/index_repository.py +0 -286
  50. kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
  51. kodit/infrastructure/sqlalchemy/repository.py +0 -133
  52. kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -259
  53. {kodit-0.3.2.dist-info → kodit-0.3.3.dist-info}/WHEEL +0 -0
  54. {kodit-0.3.2.dist-info → kodit-0.3.3.dist-info}/entry_points.txt +0 -0
  55. {kodit-0.3.2.dist-info → kodit-0.3.3.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,6 @@
3
3
  from abc import ABC, abstractmethod
4
4
  from collections.abc import AsyncGenerator, Sequence
5
5
 
6
- from kodit.domain.entities import EmbeddingType
7
6
  from kodit.domain.value_objects import (
8
7
  EmbeddingRequest,
9
8
  EmbeddingResponse,
@@ -12,6 +11,7 @@ from kodit.domain.value_objects import (
12
11
  SearchRequest,
13
12
  SearchResult,
14
13
  )
14
+ from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
15
15
 
16
16
 
17
17
  class EmbeddingProvider(ABC):
@@ -0,0 +1,66 @@
1
+ """Index query service."""
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ from kodit.domain.entities import Index, SnippetWithContext
6
+ from kodit.domain.protocols import IndexRepository
7
+ from kodit.domain.value_objects import (
8
+ FusionRequest,
9
+ FusionResult,
10
+ MultiSearchRequest,
11
+ )
12
+
13
+
14
class FusionService(ABC):
    """Contract for services that merge several result rankings into one."""

    @abstractmethod
    def reciprocal_rank_fusion(
        self, rankings: list[list[FusionRequest]], k: float = 60
    ) -> list[FusionResult]:
        """Fuse *rankings* into a single ordering via reciprocal rank fusion.

        The smoothing constant *k* dampens the contribution of low ranks.
        """
22
+
23
+
24
class IndexQueryService:
    """Read-side service for querying indexes and their snippets."""

    def __init__(
        self,
        index_repository: IndexRepository,
        fusion_service: FusionService,
    ) -> None:
        """Store the repository and fusion collaborators."""
        self.index_repository = index_repository
        self.fusion_service = fusion_service

    async def get_index_by_id(self, index_id: int) -> Index | None:
        """Look up a single index by primary key; None when absent."""
        return await self.index_repository.get(index_id)

    async def list_indexes(self) -> list[Index]:
        """Return every index known to the repository."""
        return await self.index_repository.all()

    async def search_snippets(
        self, request: MultiSearchRequest
    ) -> list[SnippetWithContext]:
        """Search snippets with filters.

        Args:
            request: The search request containing filters

        Returns:
            List of matching snippet items with context

        """
        matches = await self.index_repository.search(request)
        return list(matches)

    async def perform_fusion(
        self, rankings: list[list[FusionRequest]], k: float = 60
    ) -> list[FusionResult]:
        """Delegate reciprocal rank fusion to the configured fusion service."""
        return self.fusion_service.reciprocal_rank_fusion(rankings, k)

    async def get_snippets_by_ids(self, ids: list[int]) -> list[SnippetWithContext]:
        """Fetch the snippets whose IDs appear in *ids*."""
        return await self.index_repository.get_snippets_by_ids(ids)
@@ -0,0 +1,323 @@
1
+ """Pure domain service for Index aggregate operations."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Mapping
5
+ from pathlib import Path
6
+
7
+ import structlog
8
+ from pydantic import AnyUrl
9
+
10
+ import kodit.domain.entities as domain_entities
11
+ from kodit.domain.interfaces import ProgressCallback
12
+ from kodit.domain.services.enrichment_service import EnrichmentDomainService
13
+ from kodit.domain.value_objects import (
14
+ EnrichmentIndexRequest,
15
+ EnrichmentRequest,
16
+ SnippetExtractionRequest,
17
+ SnippetExtractionResult,
18
+ SnippetExtractionStrategy,
19
+ )
20
+ from kodit.infrastructure.cloning.git.working_copy import GitWorkingCopyProvider
21
+ from kodit.infrastructure.cloning.metadata import FileMetadataExtractor
22
+ from kodit.infrastructure.git.git_utils import is_valid_clone_target
23
+ from kodit.infrastructure.ignore.ignore_pattern_provider import GitIgnorePatternProvider
24
+ from kodit.reporting import Reporter
25
+ from kodit.utils.path_utils import path_from_uri
26
+
27
+
28
class LanguageDetectionService(ABC):
    """Contract for services that identify a file's programming language."""

    @abstractmethod
    async def detect_language(self, file_path: Path) -> str:
        """Return the programming language name for the file at *file_path*."""
34
+
35
+
36
class SnippetExtractor(ABC):
    """Contract for strategies that carve code snippets out of a file."""

    @abstractmethod
    async def extract(self, file_path: Path, language: str) -> list[str]:
        """Return the snippet texts found in *file_path* for *language*."""
42
+
43
+
44
class IndexDomainService:
    """Pure domain service for Index aggregate operations.

    This service handles the full lifecycle of code indexing:
    - Creating indexes for source repositories
    - Cloning and processing source files
    - Extracting and enriching code snippets
    - Managing the complete Index aggregate
    """

    def __init__(
        self,
        language_detector: LanguageDetectionService,
        snippet_extractors: Mapping[SnippetExtractionStrategy, SnippetExtractor],
        enrichment_service: EnrichmentDomainService,
        clone_dir: Path,
    ) -> None:
        """Initialize the index domain service.

        Args:
            language_detector: Detects a file's programming language.
            snippet_extractors: One extractor per supported strategy.
            enrichment_service: Produces AI summaries for snippets.
            clone_dir: Root directory for git working copies.

        """
        self._clone_dir = clone_dir
        self._language_detector = language_detector
        self._snippet_extractors = snippet_extractors
        self._enrichment_service = enrichment_service
        self.log = structlog.get_logger(__name__)

    async def prepare_index(
        self,
        uri_or_path_like: str,  # Must include user/pass, etc
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.WorkingCopy:
        """Prepare an index by scanning files and creating working copy.

        Returns a WorkingCopy with an empty file list; call
        refresh_working_copy afterwards to populate it.

        Raises:
            ValueError: If the target is neither a local folder nor a
                git-clonable URI.

        """
        sanitized_uri, source_type = self.sanitize_uri(uri_or_path_like)
        reporter = Reporter(self.log, progress_callback)
        self.log.info("Preparing source", uri=str(sanitized_uri))

        if source_type == domain_entities.SourceType.FOLDER:
            await reporter.start("prepare_index", 1, "Scanning source...")
            local_path = path_from_uri(str(sanitized_uri))
        elif source_type == domain_entities.SourceType.GIT:
            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
            await reporter.start("prepare_index", 1, "Cloning source...")
            # Clone with the raw URI so embedded credentials still work;
            # only the sanitized URI is stored on the working copy.
            local_path = await git_working_copy_provider.prepare(uri_or_path_like)
        else:
            raise ValueError(f"Unsupported source: {uri_or_path_like}")

        # Report completion exactly once. (Previously the GIT branch also
        # called done() inside the elif, emitting a duplicate completion, and
        # redundantly re-assigned source_type to GIT.)
        await reporter.done("prepare_index")

        return domain_entities.WorkingCopy(
            remote_uri=sanitized_uri,
            cloned_path=local_path,
            source_type=source_type,
            files=[],
        )

    async def extract_snippets_from_index(
        self,
        index: domain_entities.Index,
        strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED,
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.Index:
        """Extract code snippets from files in the index.

        Only added/modified files are processed; their stale snippets are
        dropped first. Files that cannot be read or parsed are skipped.
        """
        file_count = len(index.source.working_copy.files)

        self.log.info(
            "Extracting snippets",
            index_id=index.id,
            file_count=file_count,
            strategy=strategy.value,
        )

        # Only create snippets for files that have been added or modified
        files = index.source.working_copy.changed_files()
        index.delete_snippets_for_files(files)

        reporter = Reporter(self.log, progress_callback)
        await reporter.start(
            "extract_snippets", len(files), "Extracting code snippets..."
        )

        new_snippets = []
        for i, domain_file in enumerate(files, 1):
            try:
                # Extract snippets from file
                request = SnippetExtractionRequest(
                    file_path=domain_file.as_path(), strategy=strategy
                )
                result = await self._extract_snippets(request)
                for snippet_text in result.snippets:
                    snippet = domain_entities.Snippet(
                        derives_from=[domain_file],
                    )
                    snippet.add_original_content(snippet_text, result.language)
                    new_snippets.append(snippet)

            except (OSError, ValueError) as e:
                # Unreadable/unsupported files are skipped, not fatal.
                self.log.debug(
                    "Skipping file for snippet extraction",
                    file_uri=str(domain_file.uri),
                    error=str(e),
                )
                continue

            await reporter.step(
                "extract_snippets", i, len(files), f"Processed {domain_file.uri.path}"
            )

        index.snippets.extend(new_snippets)
        await reporter.done("extract_snippets")
        return index

    async def enrich_snippets_in_index(
        self,
        snippets: list[domain_entities.Snippet],
        progress_callback: ProgressCallback | None = None,
    ) -> list[domain_entities.Snippet]:
        """Enrich snippets with AI-generated summaries.

        Snippets without an ID are excluded from the result — only
        persisted snippets can be matched back to enrichment responses.
        """
        # Simplified: "not snippets" already covers the empty-list case.
        if not snippets:
            return snippets

        reporter = Reporter(self.log, progress_callback)
        await reporter.start("enrichment", len(snippets), "Enriching snippets...")

        snippet_map = {snippet.id: snippet for snippet in snippets if snippet.id}

        enrichment_request = EnrichmentIndexRequest(
            requests=[
                EnrichmentRequest(snippet_id=snippet_id, text=snippet.original_text())
                for snippet_id, snippet in snippet_map.items()
            ]
        )

        processed = 0
        async for result in self._enrichment_service.enrich_documents(
            enrichment_request
        ):
            snippet_map[result.snippet_id].add_summary(result.text)

            processed += 1
            await reporter.step(
                "enrichment", processed, len(snippets), "Enriching snippets..."
            )

        await reporter.done("enrichment")
        return list(snippet_map.values())

    async def _extract_snippets(
        self, request: SnippetExtractionRequest
    ) -> SnippetExtractionResult:
        """Detect language, run the strategy's extractor, drop empty snippets.

        Raises:
            ValueError: If the file does not exist or the strategy has no
                registered extractor.

        """
        # Domain logic: validate file exists
        if not request.file_path.exists():
            raise ValueError(f"File does not exist: {request.file_path}")

        # Domain logic: detect language
        language = await self._language_detector.detect_language(request.file_path)

        # Domain logic: choose strategy and extractor
        if request.strategy not in self._snippet_extractors:
            raise ValueError(f"Unsupported extraction strategy: {request.strategy}")

        extractor = self._snippet_extractors[request.strategy]
        snippets = await extractor.extract(request.file_path, language)

        # Domain logic: filter out empty snippets
        filtered_snippets = [snippet for snippet in snippets if snippet.strip()]

        return SnippetExtractionResult(snippets=filtered_snippets, language=language)

    def sanitize_uri(
        self, uri_or_path_like: str
    ) -> tuple[AnyUrl, domain_entities.SourceType]:
        """Convert a URI or path-like string to a (sanitized URI, type) pair.

        Raises:
            ValueError: If the input is neither a directory nor git-clonable.

        """
        # First, check if it's a local directory (more reliable than git check)
        if Path(uri_or_path_like).is_dir():
            return (
                domain_entities.WorkingCopy.sanitize_local_path(uri_or_path_like),
                domain_entities.SourceType.FOLDER,
            )

        # Then check if it's git-clonable
        if is_valid_clone_target(uri_or_path_like):
            return (
                domain_entities.WorkingCopy.sanitize_git_url(uri_or_path_like),
                domain_entities.SourceType.GIT,
            )

        raise ValueError(f"Unsupported source: {uri_or_path_like}")

    async def refresh_working_copy(
        self,
        working_copy: domain_entities.WorkingCopy,
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.WorkingCopy:
        """Refresh the working copy.

        Pulls the latest content (for git sources), then reconciles the
        on-disk file list against the previously recorded files, flagging
        each as deleted, new, or modified (by sha256 comparison).
        """
        metadata_extractor = FileMetadataExtractor(working_copy.source_type)
        reporter = Reporter(self.log, progress_callback)

        if working_copy.source_type == domain_entities.SourceType.GIT:
            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
            await git_working_copy_provider.sync(str(working_copy.remote_uri))

        current_file_paths = working_copy.list_filesystem_paths(
            GitIgnorePatternProvider(working_copy.cloned_path)
        )

        previous_files_map = {file.as_path(): file for file in working_copy.files}

        # Calculate different sets of files
        deleted_file_paths = set(previous_files_map.keys()) - set(current_file_paths)
        new_file_paths = set(current_file_paths) - set(previous_files_map.keys())
        modified_file_paths = set(current_file_paths) & set(previous_files_map.keys())
        num_files_to_process = (
            len(deleted_file_paths) + len(new_file_paths) + len(modified_file_paths)
        )
        self.log.info(
            "Refreshing working copy",
            num_deleted=len(deleted_file_paths),
            num_new=len(new_file_paths),
            num_modified=len(modified_file_paths),
            num_total_changes=num_files_to_process,
            num_dirty=len(working_copy.dirty_files()),
        )

        # Setup reporter
        processed = 0
        await reporter.start(
            "refresh_working_copy", num_files_to_process, "Refreshing working copy..."
        )

        # First check to see if any files have been deleted
        for file_path in deleted_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"Deleted {file_path.name}",
            )
            previous_files_map[
                file_path
            ].file_processing_status = domain_entities.FileProcessingStatus.DELETED

        # Then check to see if there are any new files
        for file_path in new_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"New {file_path.name}",
            )
            try:
                working_copy.files.append(
                    await metadata_extractor.extract(file_path=file_path)
                )
            except (OSError, ValueError) as e:
                self.log.info("Skipping file", file=str(file_path), error=str(e))
                continue

        # Finally check if there are any modified files
        for file_path in modified_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"Modified {file_path.name}",
            )
            try:
                previous_file = previous_files_map[file_path]
                new_file = await metadata_extractor.extract(file_path=file_path)
                # Only a content-hash change marks the file as modified.
                if previous_file.sha256 != new_file.sha256:
                    previous_file.file_processing_status = (
                        domain_entities.FileProcessingStatus.MODIFIED
                    )
            except (OSError, ValueError) as e:
                self.log.info("Skipping file", file=str(file_path), error=str(e))
                continue

        return working_copy
@@ -1,17 +1,129 @@
1
- """Domain value objects and DTOs."""
1
+ """Pure domain value objects and DTOs."""
2
2
 
3
3
  import json
4
4
  from dataclasses import dataclass
5
5
  from datetime import datetime
6
- from enum import Enum
6
+ from enum import Enum, IntEnum
7
7
  from pathlib import Path
8
- from typing import Any, ClassVar
8
+ from typing import ClassVar
9
9
 
10
- from sqlalchemy import JSON, DateTime, Integer, Text
11
- from sqlalchemy.orm import Mapped, mapped_column
10
+ from pydantic import BaseModel
12
11
 
13
- from kodit.domain.entities import Author, Base, File, Snippet, Source
14
- from kodit.domain.enums import SnippetExtractionStrategy
12
+
13
class SourceType(IntEnum):
    """Kind of origin a working copy was created from."""

    UNKNOWN = 0  # origin not determined
    FOLDER = 1   # local directory on disk
    GIT = 2      # cloned git repository
19
+
20
+
21
class SnippetContentType(IntEnum):
    """Distinguishes the kinds of content stored for a snippet."""

    UNKNOWN = 0   # content kind not set
    ORIGINAL = 1  # verbatim source code
    SUMMARY = 2   # generated summary text
27
+
28
+
29
class SnippetContent(BaseModel):
    """One piece of content attached to a snippet (original code or summary)."""

    # Whether this is the original text or a derived summary.
    type: SnippetContentType
    # The content itself.
    value: str
    # Programming language the content is written in.
    language: str
35
+
36
+
37
class SnippetSearchResult(BaseModel):
    """A single snippet hit returned from a search, with score and provenance."""

    snippet_id: int
    content: str
    summary: str
    score: float
    file_path: Path
    language: str | None = None
    # pydantic copies mutable defaults per instance, so [] is safe here.
    authors: list[str] = []
47
+
48
+
49
@dataclass(frozen=True)
class LanguageExtensions:
    """Value object for language to file extension mappings."""

    language: str
    extensions: list[str]

    # Canonical language -> extensions table. The keys double as the list of
    # supported languages, so the two can never drift apart. Previously the
    # dict was rebuilt on every get_extensions_for_language() call and the
    # supported-language list was duplicated by hand.
    _LANGUAGE_MAP: ClassVar[dict[str, list[str]]] = {
        "python": [".py", ".pyw", ".pyi"],
        "javascript": [".js", ".jsx", ".mjs"],
        "typescript": [".ts", ".tsx"],
        "java": [".java"],
        "c": [".c", ".h"],
        "cpp": [".cpp", ".cc", ".cxx", ".hpp", ".hxx"],
        "csharp": [".cs"],
        "go": [".go"],
        "rust": [".rs"],
        "php": [".php"],
        "ruby": [".rb"],
        "swift": [".swift"],
        "kotlin": [".kt", ".kts"],
        "scala": [".scala", ".sc"],
        "r": [".r", ".R"],
        "sql": [".sql"],
        "html": [".html", ".htm"],
        "css": [".css", ".scss", ".sass", ".less"],
        "json": [".json"],
        "yaml": [".yaml", ".yml"],
        "xml": [".xml"],
        "markdown": [".md", ".markdown"],
        "shell": [".sh", ".bash", ".zsh", ".fish"],
    }

    @classmethod
    def get_supported_languages(cls) -> list[str]:
        """Get all supported programming languages."""
        # Dict preserves insertion order, matching the original listing.
        return list(cls._LANGUAGE_MAP)

    @classmethod
    def get_extensions_for_language(cls, language: str) -> list[str]:
        """Get file extensions for a given language (empty list if unknown)."""
        # Return a copy so callers cannot mutate the shared table.
        return list(cls._LANGUAGE_MAP.get(language.lower(), []))

    @classmethod
    def is_supported_language(cls, language: str) -> bool:
        """Check if a language is supported."""
        return language.lower() in cls._LANGUAGE_MAP

    @classmethod
    def get_extensions_or_fallback(cls, language: str) -> list[str]:
        """Get extensions for language or return language as extension if not found."""
        language_lower = language.lower()
        if cls.is_supported_language(language_lower):
            return cls.get_extensions_for_language(language_lower)
        return [language_lower]
15
127
 
16
128
 
17
129
  class SearchType(Enum):
@@ -22,14 +134,6 @@ class SearchType(Enum):
22
134
  HYBRID = "hybrid"
23
135
 
24
136
 
25
- @dataclass
26
- class SnippetExtractionRequest:
27
- """Domain model for snippet extraction request."""
28
-
29
- file_path: Path
30
- strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED
31
-
32
-
33
137
  @dataclass
34
138
  class SnippetExtractionResult:
35
139
  """Domain model for snippet extraction result."""
@@ -101,6 +205,7 @@ class SnippetSearchFilters:
101
205
  created_after: datetime | None = None
102
206
  created_before: datetime | None = None
103
207
  source_repo: str | None = None
208
+ file_path: str | None = None
104
209
 
105
210
  @classmethod
106
211
  def from_cli_params(
@@ -357,16 +462,6 @@ class IndexView:
357
462
  source: str | None = None
358
463
 
359
464
 
360
- @dataclass
361
- class SnippetWithContext:
362
- """Domain model for snippet with associated context information."""
363
-
364
- source: Source
365
- file: File
366
- authors: list[Author]
367
- snippet: Snippet
368
-
369
-
370
465
  class LanguageMapping:
371
466
  """Value object for language-to-extension mappings.
372
467
 
@@ -536,38 +631,33 @@ class LanguageMapping:
536
631
  return [language_lower]
537
632
 
538
633
 
539
- # Database models for value objects
540
- class BM25DocumentModel(Base):
541
- """BM25 document model."""
542
-
543
- __tablename__ = "bm25_documents"
544
-
545
- id: Mapped[int] = mapped_column(Integer, primary_key=True)
546
- content: Mapped[str] = mapped_column(Text, nullable=False)
547
- document_metadata: Mapped[dict[str, Any] | None] = mapped_column(
548
- JSON, nullable=True
549
- )
550
- created_at: Mapped[datetime] = mapped_column(
551
- DateTime(timezone=True), nullable=False
552
- )
553
- updated_at: Mapped[datetime] = mapped_column(
554
- DateTime(timezone=True), nullable=False
555
- )
556
-
557
-
558
- class VectorDocumentModel(Base):
559
- """Vector document model."""
560
-
561
- __tablename__ = "vector_documents"
562
-
563
- id: Mapped[int] = mapped_column(Integer, primary_key=True)
564
- content: Mapped[str] = mapped_column(Text, nullable=False)
565
- document_metadata: Mapped[dict[str, Any] | None] = mapped_column(
566
- JSON, nullable=True
567
- )
568
- created_at: Mapped[datetime] = mapped_column(
569
- DateTime(timezone=True), nullable=False
570
- )
571
- updated_at: Mapped[datetime] = mapped_column(
572
- DateTime(timezone=True), nullable=False
573
- )
634
class SnippetQuery(BaseModel):
    """Parameters for a snippet search: query text, mode, filters and limit."""

    text: str
    search_type: SearchType = SearchType.HYBRID
    # pydantic copies this default per instance, so sharing is not an issue.
    filters: SnippetSearchFilters = SnippetSearchFilters()
    top_k: int = 10
641
+
642
+
643
class SnippetExtractionStrategy(str, Enum):
    """Strategies available for carving snippets out of source files."""

    # Split files along method/function boundaries.
    METHOD_BASED = "method_based"
647
+
648
+
649
@dataclass
class SnippetExtractionRequest:
    """Domain model describing one snippet-extraction job."""

    # File to extract snippets from.
    file_path: Path
    # Extraction algorithm to apply; defaults to method-level extraction.
    strategy: SnippetExtractionStrategy = SnippetExtractionStrategy.METHOD_BASED
655
+
656
+
657
class FileProcessingStatus(IntEnum):
    """Lifecycle status of a file within a working copy."""

    CLEAN = 0     # unchanged since the last refresh
    ADDED = 1     # newly discovered file
    MODIFIED = 2  # content hash differs from the recorded one
    DELETED = 3   # no longer present on disk
@@ -6,7 +6,7 @@ from pathlib import Path
6
6
  import git
7
7
  import structlog
8
8
 
9
- from kodit.infrastructure.git.git_utils import sanitize_git_url
9
+ from kodit.domain.entities import WorkingCopy
10
10
 
11
11
 
12
12
  class GitWorkingCopyProvider:
@@ -17,15 +17,17 @@ class GitWorkingCopyProvider:
17
17
  self.clone_dir = clone_dir
18
18
  self.log = structlog.get_logger(__name__)
19
19
 
20
+ def get_clone_path(self, uri: str) -> Path:
21
+ """Get the clone path for a Git working copy."""
22
+ sanitized_uri = WorkingCopy.sanitize_git_url(uri)
23
+ dir_hash = hashlib.sha256(str(sanitized_uri).encode("utf-8")).hexdigest()[:16]
24
+ dir_name = f"repo-{dir_hash}"
25
+ return self.clone_dir / dir_name
26
+
20
27
  async def prepare(self, uri: str) -> Path:
21
28
  """Prepare a Git working copy."""
22
- # Sanitize the URI for directory name to prevent credential leaks
23
- sanitized_uri = sanitize_git_url(uri)
24
-
25
- # Use a repeatable, short sha256 hash of the sanitized URI for the directory
26
- dir_hash = hashlib.sha256(sanitized_uri.encode("utf-8")).hexdigest()[:16]
27
- dir_name = f"repo-{dir_hash}"
28
- clone_path = self.clone_dir / dir_name
29
+ sanitized_uri = WorkingCopy.sanitize_git_url(uri)
30
+ clone_path = self.get_clone_path(uri)
29
31
  clone_path.mkdir(parents=True, exist_ok=True)
30
32
 
31
33
  try:
@@ -41,3 +43,10 @@ class GitWorkingCopyProvider:
41
43
  self.log.info("Repository already exists, reusing...", uri=sanitized_uri)
42
44
 
43
45
  return clone_path
46
+
47
+ async def sync(self, uri: str) -> Path:
48
+ """Refresh a Git working copy."""
49
+ clone_path = self.get_clone_path(uri)
50
+ repo = git.Repo(clone_path)
51
+ repo.remotes.origin.pull()
52
+ return clone_path