kodit 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kodit has been flagged as possibly problematic; see the registry's advisory page for details.

Files changed (70):
  1. kodit/_version.py +2 -2
  2. kodit/application/factories/code_indexing_factory.py +56 -29
  3. kodit/application/services/code_indexing_application_service.py +152 -118
  4. kodit/cli.py +14 -41
  5. kodit/domain/entities.py +268 -197
  6. kodit/domain/protocols.py +61 -0
  7. kodit/domain/services/embedding_service.py +1 -1
  8. kodit/domain/services/index_query_service.py +66 -0
  9. kodit/domain/services/index_service.py +282 -0
  10. kodit/domain/value_objects.py +143 -65
  11. kodit/infrastructure/cloning/git/working_copy.py +17 -8
  12. kodit/infrastructure/cloning/metadata.py +37 -67
  13. kodit/infrastructure/embedding/embedding_factory.py +1 -1
  14. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  15. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +1 -1
  16. kodit/infrastructure/enrichment/null_enrichment_provider.py +4 -10
  17. kodit/infrastructure/git/git_utils.py +1 -63
  18. kodit/infrastructure/ignore/ignore_pattern_provider.py +1 -2
  19. kodit/infrastructure/indexing/auto_indexing_service.py +2 -12
  20. kodit/infrastructure/indexing/fusion_service.py +1 -1
  21. kodit/infrastructure/mappers/__init__.py +1 -0
  22. kodit/infrastructure/mappers/index_mapper.py +344 -0
  23. kodit/infrastructure/slicing/__init__.py +1 -0
  24. kodit/infrastructure/slicing/language_detection_service.py +18 -0
  25. kodit/infrastructure/slicing/slicer.py +894 -0
  26. kodit/infrastructure/sqlalchemy/embedding_repository.py +1 -1
  27. kodit/infrastructure/sqlalchemy/entities.py +203 -0
  28. kodit/infrastructure/sqlalchemy/index_repository.py +579 -0
  29. kodit/mcp.py +0 -7
  30. kodit/migrations/env.py +1 -1
  31. kodit/migrations/versions/4073b33f9436_add_file_processing_flag.py +36 -0
  32. kodit/migrations/versions/4552eb3f23ce_add_summary.py +4 -4
  33. kodit/migrations/versions/7c3bbc2ab32b_add_embeddings_table.py +24 -16
  34. kodit/migrations/versions/85155663351e_initial.py +64 -48
  35. kodit/migrations/versions/c3f5137d30f5_index_all_the_things.py +20 -14
  36. kodit/utils/__init__.py +1 -0
  37. kodit/utils/path_utils.py +54 -0
  38. {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/METADATA +9 -4
  39. kodit-0.3.4.dist-info/RECORD +89 -0
  40. kodit/domain/enums.py +0 -9
  41. kodit/domain/repositories.py +0 -128
  42. kodit/domain/services/ignore_service.py +0 -45
  43. kodit/domain/services/indexing_service.py +0 -204
  44. kodit/domain/services/snippet_extraction_service.py +0 -89
  45. kodit/domain/services/snippet_service.py +0 -215
  46. kodit/domain/services/source_service.py +0 -85
  47. kodit/infrastructure/cloning/folder/__init__.py +0 -1
  48. kodit/infrastructure/cloning/folder/factory.py +0 -128
  49. kodit/infrastructure/cloning/folder/working_copy.py +0 -38
  50. kodit/infrastructure/cloning/git/factory.py +0 -153
  51. kodit/infrastructure/indexing/index_repository.py +0 -286
  52. kodit/infrastructure/indexing/snippet_domain_service_factory.py +0 -37
  53. kodit/infrastructure/snippet_extraction/__init__.py +0 -1
  54. kodit/infrastructure/snippet_extraction/language_detection_service.py +0 -39
  55. kodit/infrastructure/snippet_extraction/languages/csharp.scm +0 -12
  56. kodit/infrastructure/snippet_extraction/languages/go.scm +0 -26
  57. kodit/infrastructure/snippet_extraction/languages/java.scm +0 -12
  58. kodit/infrastructure/snippet_extraction/languages/javascript.scm +0 -24
  59. kodit/infrastructure/snippet_extraction/languages/python.scm +0 -22
  60. kodit/infrastructure/snippet_extraction/languages/typescript.scm +0 -25
  61. kodit/infrastructure/snippet_extraction/snippet_extraction_factory.py +0 -67
  62. kodit/infrastructure/snippet_extraction/snippet_query_provider.py +0 -45
  63. kodit/infrastructure/snippet_extraction/tree_sitter_snippet_extractor.py +0 -182
  64. kodit/infrastructure/sqlalchemy/file_repository.py +0 -78
  65. kodit/infrastructure/sqlalchemy/repository.py +0 -133
  66. kodit/infrastructure/sqlalchemy/snippet_repository.py +0 -259
  67. kodit-0.3.2.dist-info/RECORD +0 -103
  68. {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/WHEEL +0 -0
  69. {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/entry_points.txt +0 -0
  70. {kodit-0.3.2.dist-info → kodit-0.3.4.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,6 @@
3
3
  from abc import ABC, abstractmethod
4
4
  from collections.abc import AsyncGenerator, Sequence
5
5
 
6
- from kodit.domain.entities import EmbeddingType
7
6
  from kodit.domain.value_objects import (
8
7
  EmbeddingRequest,
9
8
  EmbeddingResponse,
@@ -12,6 +11,7 @@ from kodit.domain.value_objects import (
12
11
  SearchRequest,
13
12
  SearchResult,
14
13
  )
14
+ from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
15
15
 
16
16
 
17
17
  class EmbeddingProvider(ABC):
@@ -0,0 +1,66 @@
1
+ """Index query service."""
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ from kodit.domain.entities import Index, SnippetWithContext
6
+ from kodit.domain.protocols import IndexRepository
7
+ from kodit.domain.value_objects import (
8
+ FusionRequest,
9
+ FusionResult,
10
+ MultiSearchRequest,
11
+ )
12
+
13
+
14
class FusionService(ABC):
    """Abstract fusion service interface.

    Implementations combine several independently ranked result lists
    (e.g. one per search strategy) into a single fused ranking.
    """

    @abstractmethod
    def reciprocal_rank_fusion(
        self, rankings: list[list[FusionRequest]], k: float = 60
    ) -> list[FusionResult]:
        """Perform reciprocal rank fusion on search results.

        Args:
            rankings: One ranked list of fusion requests per source ranking.
            k: Rank-smoothing constant; larger values flatten the influence
                of top positions (60 is the conventional RRF default).

        Returns:
            Fused results — ordering/scoring semantics are defined by the
            implementation.

        """
22
+
23
+
24
class IndexQueryService:
    """Read-side service for querying indexes and their snippets.

    Thin facade over an ``IndexRepository`` for lookups plus a
    ``FusionService`` for combining ranked results.
    """

    def __init__(
        self,
        index_repository: IndexRepository,
        fusion_service: FusionService,
    ) -> None:
        """Store the repository and fusion collaborators."""
        self.index_repository = index_repository
        self.fusion_service = fusion_service

    async def get_index_by_id(self, index_id: int) -> Index | None:
        """Return the index with the given ID, or ``None`` if absent."""
        return await self.index_repository.get(index_id)

    async def list_indexes(self) -> list[Index]:
        """Return every known index."""
        return await self.index_repository.all()

    async def search_snippets(
        self, request: MultiSearchRequest
    ) -> list[SnippetWithContext]:
        """Search snippets with filters.

        Args:
            request: The search request containing filters

        Returns:
            List of matching snippet items with context

        """
        matches = await self.index_repository.search(request)
        return list(matches)

    async def perform_fusion(
        self, rankings: list[list[FusionRequest]], k: float = 60
    ) -> list[FusionResult]:
        """Perform reciprocal rank fusion on search results."""
        # Delegation only — the fusion algorithm lives in the fusion service.
        return self.fusion_service.reciprocal_rank_fusion(rankings, k)

    async def get_snippets_by_ids(self, ids: list[int]) -> list[SnippetWithContext]:
        """Get snippets by their IDs."""
        return await self.index_repository.get_snippets_by_ids(ids)
@@ -0,0 +1,282 @@
1
+ """Pure domain service for Index aggregate operations."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+
6
+ import structlog
7
+ from pydantic import AnyUrl
8
+
9
+ import kodit.domain.entities as domain_entities
10
+ from kodit.domain.interfaces import ProgressCallback
11
+ from kodit.domain.services.enrichment_service import EnrichmentDomainService
12
+ from kodit.domain.value_objects import (
13
+ EnrichmentIndexRequest,
14
+ EnrichmentRequest,
15
+ LanguageMapping,
16
+ )
17
+ from kodit.infrastructure.cloning.git.working_copy import GitWorkingCopyProvider
18
+ from kodit.infrastructure.cloning.metadata import FileMetadataExtractor
19
+ from kodit.infrastructure.git.git_utils import is_valid_clone_target
20
+ from kodit.infrastructure.ignore.ignore_pattern_provider import GitIgnorePatternProvider
21
+ from kodit.infrastructure.slicing.slicer import Slicer
22
+ from kodit.reporting import Reporter
23
+ from kodit.utils.path_utils import path_from_uri
24
+
25
+
26
class LanguageDetectionService(ABC):
    """Abstract interface for language detection service."""

    @abstractmethod
    async def detect_language(self, file_path: Path) -> str:
        """Detect the programming language of a file.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            The detected language name — the exact vocabulary is
            implementation-defined; presumably it matches the names used by
            ``LanguageMapping`` (confirm against implementations).

        """
32
+
33
+
34
class IndexDomainService:
    """Pure domain service for Index aggregate operations.

    This service handles the full lifecycle of code indexing:
    - Creating indexes for source repositories
    - Cloning and processing source files
    - Extracting and enriching code snippets
    - Managing the complete Index aggregate
    """

    def __init__(
        self,
        language_detector: LanguageDetectionService,
        enrichment_service: EnrichmentDomainService,
        clone_dir: Path,
    ) -> None:
        """Initialize the index domain service.

        Args:
            language_detector: Detects a file's programming language.
            enrichment_service: Produces summaries for snippets.
            clone_dir: Root directory under which git sources are cloned.

        """
        self._clone_dir = clone_dir
        self._language_detector = language_detector
        self._enrichment_service = enrichment_service
        self.log = structlog.get_logger(__name__)

    async def prepare_index(
        self,
        uri_or_path_like: str,  # Must include user/pass, etc
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.WorkingCopy:
        """Prepare an index by resolving the source and creating a working copy.

        Raises:
            ValueError: If the input is neither a local directory nor a valid
                git clone target.

        """
        sanitized_uri, source_type = self.sanitize_uri(uri_or_path_like)
        reporter = Reporter(self.log, progress_callback)
        self.log.info("Preparing source", uri=str(sanitized_uri))

        if source_type == domain_entities.SourceType.FOLDER:
            await reporter.start("prepare_index", 1, "Scanning source...")
            local_path = path_from_uri(str(sanitized_uri))
        elif source_type == domain_entities.SourceType.GIT:
            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
            await reporter.start("prepare_index", 1, "Cloning source...")
            # Clone with the original (possibly credentialed) URI; only the
            # sanitized form is stored on the working copy below.
            local_path = await git_working_copy_provider.prepare(uri_or_path_like)
        else:
            raise ValueError(f"Unsupported source: {uri_or_path_like}")

        # Single completion signal for every branch (previously the GIT
        # branch reported "done" twice).
        await reporter.done("prepare_index")

        return domain_entities.WorkingCopy(
            remote_uri=sanitized_uri,
            cloned_path=local_path,
            source_type=source_type,
            files=[],
        )

    async def extract_snippets_from_index(
        self,
        index: domain_entities.Index,
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.Index:
        """Extract code snippets from files in the index."""
        file_count = len(index.source.working_copy.files)

        self.log.info(
            "Extracting snippets",
            index_id=index.id,
            file_count=file_count,
        )

        # Only create snippets for files that have been added or modified;
        # stale snippets for those files are removed first.
        files = index.source.working_copy.changed_files()
        index.delete_snippets_for_files(files)

        # Create a set of languages to extract snippets for.
        extensions = {file.extension() for file in files}
        languages = []
        for ext in extensions:
            try:
                languages.append(LanguageMapping.get_language_for_extension(ext))
            except ValueError as e:
                # Unsupported extension: skip it rather than fail the index.
                self.log.info("Skipping", error=str(e))
                continue

        reporter = Reporter(self.log, progress_callback)
        total = len(files) * len(languages)
        await reporter.start(
            "extract_snippets",
            total,
            "Extracting code snippets...",
        )
        # Calculate snippets for each language.
        slicer = Slicer()
        for i, language in enumerate(languages):
            s = slicer.extract_snippets(files, language=language)
            index.snippets.extend(s)
            # Report progress after the language has actually been processed
            # (previously the step was reported before the work was done).
            await reporter.step(
                "extract_snippets",
                len(files) * (i + 1),
                total,
                "Extracting code snippets...",
            )

        await reporter.done("extract_snippets")
        return index

    async def enrich_snippets_in_index(
        self,
        snippets: list[domain_entities.Snippet],
        progress_callback: ProgressCallback | None = None,
    ) -> list[domain_entities.Snippet]:
        """Enrich snippets with AI-generated summaries.

        Returns:
            The enriched snippets. NOTE(review): snippets without an ID are
            filtered out of the return value — confirm callers expect this.

        """
        if not snippets:
            return snippets

        reporter = Reporter(self.log, progress_callback)
        await reporter.start("enrichment", len(snippets), "Enriching snippets...")

        # Only snippets with an ID can be matched back to enrichment results.
        snippet_map = {snippet.id: snippet for snippet in snippets if snippet.id}

        enrichment_request = EnrichmentIndexRequest(
            requests=[
                EnrichmentRequest(snippet_id=snippet_id, text=snippet.original_text())
                for snippet_id, snippet in snippet_map.items()
            ]
        )

        processed = 0
        async for result in self._enrichment_service.enrich_documents(
            enrichment_request
        ):
            snippet_map[result.snippet_id].add_summary(result.text)

            processed += 1
            await reporter.step(
                "enrichment", processed, len(snippets), "Enriching snippets..."
            )

        await reporter.done("enrichment")
        return list(snippet_map.values())

    def sanitize_uri(
        self, uri_or_path_like: str
    ) -> tuple[AnyUrl, domain_entities.SourceType]:
        """Convert a URI or path-like string to a (sanitized URI, type) pair.

        Raises:
            ValueError: If the input is neither a local directory nor a valid
                git clone target.

        """
        # First, check if it's a local directory (more reliable than git
        # check — a path can also look like a clone target).
        if Path(uri_or_path_like).is_dir():
            return (
                domain_entities.WorkingCopy.sanitize_local_path(uri_or_path_like),
                domain_entities.SourceType.FOLDER,
            )

        # Then check if it's git-clonable.
        if is_valid_clone_target(uri_or_path_like):
            return (
                domain_entities.WorkingCopy.sanitize_git_url(uri_or_path_like),
                domain_entities.SourceType.GIT,
            )

        raise ValueError(f"Unsupported source: {uri_or_path_like}")

    async def refresh_working_copy(
        self,
        working_copy: domain_entities.WorkingCopy,
        progress_callback: ProgressCallback | None = None,
    ) -> domain_entities.WorkingCopy:
        """Refresh the working copy, marking files added/modified/deleted."""
        metadata_extractor = FileMetadataExtractor(working_copy.source_type)
        reporter = Reporter(self.log, progress_callback)

        if working_copy.source_type == domain_entities.SourceType.GIT:
            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
            await git_working_copy_provider.sync(str(working_copy.remote_uri))

        # Build each set once instead of re-wrapping in set() per operation.
        current_file_paths = set(
            working_copy.list_filesystem_paths(
                GitIgnorePatternProvider(working_copy.cloned_path)
            )
        )
        previous_files_map = {file.as_path(): file for file in working_copy.files}
        previous_file_paths = set(previous_files_map)

        # Partition into deleted / new / possibly-modified files.
        deleted_file_paths = previous_file_paths - current_file_paths
        new_file_paths = current_file_paths - previous_file_paths
        modified_file_paths = current_file_paths & previous_file_paths
        num_files_to_process = (
            len(deleted_file_paths) + len(new_file_paths) + len(modified_file_paths)
        )
        self.log.info(
            "Refreshing working copy",
            num_deleted=len(deleted_file_paths),
            num_new=len(new_file_paths),
            num_modified=len(modified_file_paths),
            num_total_changes=num_files_to_process,
            num_dirty=len(working_copy.dirty_files()),
        )

        # Setup reporter
        processed = 0
        await reporter.start(
            "refresh_working_copy", num_files_to_process, "Refreshing working copy..."
        )

        # First check to see if any files have been deleted
        for file_path in deleted_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"Deleted {file_path.name}",
            )
            previous_files_map[
                file_path
            ].file_processing_status = domain_entities.FileProcessingStatus.DELETED

        # Then check to see if there are any new files
        for file_path in new_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"New {file_path.name}",
            )
            try:
                working_copy.files.append(
                    await metadata_extractor.extract(file_path=file_path)
                )
            except (OSError, ValueError) as e:
                # Unreadable/unsupported file: skip rather than abort refresh.
                self.log.debug("Skipping file", file=str(file_path), error=str(e))
                continue

        # Finally, re-hash files present in both snapshots to detect changes.
        for file_path in modified_file_paths:
            processed += 1
            await reporter.step(
                "refresh_working_copy",
                processed,
                num_files_to_process,
                f"Modified {file_path.name}",
            )
            try:
                previous_file = previous_files_map[file_path]
                new_file = await metadata_extractor.extract(file_path=file_path)
                if previous_file.sha256 != new_file.sha256:
                    previous_file.file_processing_status = (
                        domain_entities.FileProcessingStatus.MODIFIED
                    )
            except (OSError, ValueError) as e:
                self.log.info("Skipping file", file=str(file_path), error=str(e))
                continue

        return working_copy
@@ -1,41 +1,137 @@
1
- """Domain value objects and DTOs."""
1
+ """Pure domain value objects and DTOs."""
2
2
 
3
3
  import json
4
4
  from dataclasses import dataclass
5
5
  from datetime import datetime
6
- from enum import Enum
6
+ from enum import Enum, IntEnum
7
7
  from pathlib import Path
8
- from typing import Any, ClassVar
8
+ from typing import ClassVar
9
9
 
10
- from sqlalchemy import JSON, DateTime, Integer, Text
11
- from sqlalchemy.orm import Mapped, mapped_column
10
+ from pydantic import BaseModel
12
11
 
13
- from kodit.domain.entities import Author, Base, File, Snippet, Source
14
- from kodit.domain.enums import SnippetExtractionStrategy
15
12
 
13
class SourceType(IntEnum):
    """The type of source.

    IntEnum so members compare/serialize as plain integers; do not renumber
    existing members.
    """

    UNKNOWN = 0  # not yet determined / default
    FOLDER = 1  # plain local directory
    GIT = 2  # cloned git repository
19
19
 
20
- BM25 = "bm25"
21
- VECTOR = "vector"
22
- HYBRID = "hybrid"
23
20
 
21
class SnippetContentType(IntEnum):
    """Type of snippet content.

    IntEnum so members compare/serialize as plain integers; do not renumber
    existing members.
    """

    UNKNOWN = 0  # default / unclassified
    ORIGINAL = 1  # the snippet's raw source text
    SUMMARY = 2  # a generated summary of the snippet
27
+
28
+
29
class SnippetContent(BaseModel):
    """Snippet content domain value object."""

    # Whether this is original source text or a generated summary.
    type: SnippetContentType
    # The content text itself.
    value: str
    # Language label for the content — presumably a name like "python";
    # confirm vocabulary against producers.
    language: str
35
+
36
+
37
class SnippetSearchResult(BaseModel):
    """Domain result object for snippet searches."""

    snippet_id: int
    content: str
    summary: str
    # Relevance score — presumably higher is better; confirm against the
    # fusion/search implementation.
    score: float
    file_path: Path
    language: str | None = None
    # NOTE(review): mutable default — pydantic models copy field defaults per
    # instance so this is not shared, but confirm; a plain class would need
    # a default factory here.
    authors: list[str] = []
31
47
 
32
48
 
33
- @dataclass
34
- class SnippetExtractionResult:
35
- """Domain model for snippet extraction result."""
49
@dataclass(frozen=True)
class LanguageExtensions:
    """Value object for language to file extension mappings."""

    language: str
    extensions: list[str]

    # Single source of truth for supported languages and their extensions.
    # (Previously the supported-language list was duplicated separately from
    # this map and the two could drift out of sync.)
    _LANGUAGE_MAP: ClassVar[dict[str, list[str]]] = {
        "python": [".py", ".pyw", ".pyi"],
        "javascript": [".js", ".jsx", ".mjs"],
        "typescript": [".ts", ".tsx"],
        "java": [".java"],
        "c": [".c", ".h"],
        "cpp": [".cpp", ".cc", ".cxx", ".hpp", ".hxx"],
        "csharp": [".cs"],
        "go": [".go"],
        "rust": [".rs"],
        "php": [".php"],
        "ruby": [".rb"],
        "swift": [".swift"],
        "kotlin": [".kt", ".kts"],
        "scala": [".scala", ".sc"],
        "r": [".r", ".R"],
        "sql": [".sql"],
        "html": [".html", ".htm"],
        "css": [".css", ".scss", ".sass", ".less"],
        "json": [".json"],
        "yaml": [".yaml", ".yml"],
        "xml": [".xml"],
        "markdown": [".md", ".markdown"],
        "shell": [".sh", ".bash", ".zsh", ".fish"],
    }

    @classmethod
    def get_supported_languages(cls) -> list[str]:
        """Get all supported programming languages."""
        # Dict preserves insertion order, so the list order is stable.
        return list(cls._LANGUAGE_MAP)

    @classmethod
    def get_extensions_for_language(cls, language: str) -> list[str]:
        """Get file extensions for a given language (empty if unsupported)."""
        # Return a copy so callers cannot mutate the shared class-level map.
        return list(cls._LANGUAGE_MAP.get(language.lower(), []))

    @classmethod
    def is_supported_language(cls, language: str) -> bool:
        """Check if a language is supported."""
        return language.lower() in cls._LANGUAGE_MAP

    @classmethod
    def get_extensions_or_fallback(cls, language: str) -> list[str]:
        """Get extensions for language or return language as extension if not found."""
        language_lower = language.lower()
        if cls.is_supported_language(language_lower):
            return cls.get_extensions_for_language(language_lower)
        return [language_lower]
127
+
128
+
129
class SearchType(Enum):
    """Type of search to perform."""

    BM25 = "bm25"  # keyword / lexical ranking
    VECTOR = "vector"  # embedding similarity search
    HYBRID = "hybrid"  # combine both result sets
39
135
 
40
136
 
41
137
  @dataclass
@@ -101,6 +197,7 @@ class SnippetSearchFilters:
101
197
  created_after: datetime | None = None
102
198
  created_before: datetime | None = None
103
199
  source_repo: str | None = None
200
+ file_path: str | None = None
104
201
 
105
202
  @classmethod
106
203
  def from_cli_params(
@@ -357,16 +454,6 @@ class IndexView:
357
454
  source: str | None = None
358
455
 
359
456
 
360
- @dataclass
361
- class SnippetWithContext:
362
- """Domain model for snippet with associated context information."""
363
-
364
- source: Source
365
- file: File
366
- authors: list[Author]
367
- snippet: Snippet
368
-
369
-
370
457
  class LanguageMapping:
371
458
  """Value object for language-to-extension mappings.
372
459
 
@@ -536,38 +623,29 @@ class LanguageMapping:
536
623
  return [language_lower]
537
624
 
538
625
 
539
- # Database models for value objects
540
- class BM25DocumentModel(Base):
541
- """BM25 document model."""
542
-
543
- __tablename__ = "bm25_documents"
544
-
545
- id: Mapped[int] = mapped_column(Integer, primary_key=True)
546
- content: Mapped[str] = mapped_column(Text, nullable=False)
547
- document_metadata: Mapped[dict[str, Any] | None] = mapped_column(
548
- JSON, nullable=True
549
- )
550
- created_at: Mapped[datetime] = mapped_column(
551
- DateTime(timezone=True), nullable=False
552
- )
553
- updated_at: Mapped[datetime] = mapped_column(
554
- DateTime(timezone=True), nullable=False
555
- )
556
-
557
-
558
- class VectorDocumentModel(Base):
559
- """Vector document model."""
560
-
561
- __tablename__ = "vector_documents"
562
-
563
- id: Mapped[int] = mapped_column(Integer, primary_key=True)
564
- content: Mapped[str] = mapped_column(Text, nullable=False)
565
- document_metadata: Mapped[dict[str, Any] | None] = mapped_column(
566
- JSON, nullable=True
567
- )
568
- created_at: Mapped[datetime] = mapped_column(
569
- DateTime(timezone=True), nullable=False
570
- )
571
- updated_at: Mapped[datetime] = mapped_column(
572
- DateTime(timezone=True), nullable=False
573
- )
626
class SnippetQuery(BaseModel):
    """Domain query object for snippet searches."""

    # Free-text query string.
    text: str
    search_type: SearchType = SearchType.HYBRID
    # NOTE(review): single default instance shared by the class — pydantic
    # copies field defaults per model instance so this should be safe, but
    # confirm; otherwise prefer Field(default_factory=SnippetSearchFilters).
    filters: SnippetSearchFilters = SnippetSearchFilters()
    # Maximum number of results to return.
    top_k: int = 10
633
+
634
+
635
class FileProcessingStatus(IntEnum):
    """File processing status.

    IntEnum so members serialize as plain integers; do not renumber
    existing members.
    """

    CLEAN = 0  # unchanged since last refresh
    ADDED = 1  # newly discovered file
    MODIFIED = 2  # content changed (different hash)
    DELETED = 3  # no longer present on disk
642
+
643
+
644
@dataclass
class FunctionDefinition:
    """Cached function definition.

    Offsets appear to be byte positions into the source file (not character
    indices) — TODO confirm against the slicer that produces them.
    """

    # Bare function name.
    name: str
    # Fully qualified name — presumably module/class-prefixed; confirm format.
    qualified_name: str
    start_byte: int
    end_byte: int
@@ -6,7 +6,7 @@ from pathlib import Path
6
6
  import git
7
7
  import structlog
8
8
 
9
- from kodit.infrastructure.git.git_utils import sanitize_git_url
9
+ from kodit.domain.entities import WorkingCopy
10
10
 
11
11
 
12
12
  class GitWorkingCopyProvider:
@@ -17,15 +17,17 @@ class GitWorkingCopyProvider:
17
17
  self.clone_dir = clone_dir
18
18
  self.log = structlog.get_logger(__name__)
19
19
 
20
def get_clone_path(self, uri: str) -> Path:
    """Derive the deterministic on-disk location for *uri*'s clone.

    The URL is sanitized first so credentials never leak into directory
    names; the path is a stable function of the sanitized URL.
    """
    safe_url = str(WorkingCopy.sanitize_git_url(uri))
    # Short, repeatable sha256 prefix keeps directory names compact.
    digest = hashlib.sha256(safe_url.encode("utf-8")).hexdigest()
    return self.clone_dir / f"repo-{digest[:16]}"
26
+
20
27
  async def prepare(self, uri: str) -> Path:
21
28
  """Prepare a Git working copy."""
22
- # Sanitize the URI for directory name to prevent credential leaks
23
- sanitized_uri = sanitize_git_url(uri)
24
-
25
- # Use a repeatable, short sha256 hash of the sanitized URI for the directory
26
- dir_hash = hashlib.sha256(sanitized_uri.encode("utf-8")).hexdigest()[:16]
27
- dir_name = f"repo-{dir_hash}"
28
- clone_path = self.clone_dir / dir_name
29
+ sanitized_uri = WorkingCopy.sanitize_git_url(uri)
30
+ clone_path = self.get_clone_path(uri)
29
31
  clone_path.mkdir(parents=True, exist_ok=True)
30
32
 
31
33
  try:
@@ -41,3 +43,10 @@ class GitWorkingCopyProvider:
41
43
  self.log.info("Repository already exists, reusing...", uri=sanitized_uri)
42
44
 
43
45
  return clone_path
46
+
47
async def sync(self, uri: str) -> Path:
    """Refresh a Git working copy.

    Pulls the latest changes from the ``origin`` remote into the existing
    clone and returns its path.
    """
    clone_path = self.get_clone_path(uri)
    # Assumes prepare() has already cloned the repo — git.Repo raises if the
    # directory is missing or not a repository; TODO confirm callers' order.
    repo = git.Repo(clone_path)
    repo.remotes.origin.pull()
    return clone_path