kodit 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kodit might be problematic.

Files changed (100)
  1. kodit/_version.py +2 -2
  2. kodit/app.py +59 -24
  3. kodit/application/factories/reporting_factory.py +16 -7
  4. kodit/application/factories/server_factory.py +311 -0
  5. kodit/application/services/code_search_application_service.py +144 -0
  6. kodit/application/services/commit_indexing_application_service.py +543 -0
  7. kodit/application/services/indexing_worker_service.py +13 -46
  8. kodit/application/services/queue_service.py +24 -3
  9. kodit/application/services/reporting.py +70 -54
  10. kodit/application/services/sync_scheduler.py +15 -31
  11. kodit/cli.py +2 -763
  12. kodit/cli_utils.py +2 -9
  13. kodit/config.py +3 -96
  14. kodit/database.py +38 -1
  15. kodit/domain/entities/__init__.py +276 -0
  16. kodit/domain/entities/git.py +190 -0
  17. kodit/domain/factories/__init__.py +1 -0
  18. kodit/domain/factories/git_repo_factory.py +76 -0
  19. kodit/domain/protocols.py +270 -46
  20. kodit/domain/services/bm25_service.py +5 -1
  21. kodit/domain/services/embedding_service.py +3 -0
  22. kodit/domain/services/git_repository_service.py +429 -0
  23. kodit/domain/services/git_service.py +300 -0
  24. kodit/domain/services/task_status_query_service.py +19 -0
  25. kodit/domain/value_objects.py +113 -147
  26. kodit/infrastructure/api/client/__init__.py +0 -2
  27. kodit/infrastructure/api/v1/__init__.py +0 -4
  28. kodit/infrastructure/api/v1/dependencies.py +105 -44
  29. kodit/infrastructure/api/v1/routers/__init__.py +0 -6
  30. kodit/infrastructure/api/v1/routers/commits.py +271 -0
  31. kodit/infrastructure/api/v1/routers/queue.py +2 -2
  32. kodit/infrastructure/api/v1/routers/repositories.py +282 -0
  33. kodit/infrastructure/api/v1/routers/search.py +31 -14
  34. kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
  35. kodit/infrastructure/api/v1/schemas/commit.py +96 -0
  36. kodit/infrastructure/api/v1/schemas/context.py +2 -0
  37. kodit/infrastructure/api/v1/schemas/repository.py +128 -0
  38. kodit/infrastructure/api/v1/schemas/search.py +12 -9
  39. kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
  40. kodit/infrastructure/api/v1/schemas/tag.py +31 -0
  41. kodit/infrastructure/api/v1/schemas/task_status.py +41 -0
  42. kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
  43. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
  44. kodit/infrastructure/cloning/git/git_python_adaptor.py +467 -0
  45. kodit/infrastructure/cloning/git/working_copy.py +10 -3
  46. kodit/infrastructure/embedding/embedding_factory.py +3 -2
  47. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  48. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
  49. kodit/infrastructure/enrichment/litellm_enrichment_provider.py +19 -26
  50. kodit/infrastructure/enrichment/local_enrichment_provider.py +41 -30
  51. kodit/infrastructure/indexing/fusion_service.py +1 -1
  52. kodit/infrastructure/mappers/git_mapper.py +193 -0
  53. kodit/infrastructure/mappers/snippet_mapper.py +106 -0
  54. kodit/infrastructure/mappers/task_mapper.py +5 -44
  55. kodit/infrastructure/mappers/task_status_mapper.py +85 -0
  56. kodit/infrastructure/reporting/db_progress.py +23 -0
  57. kodit/infrastructure/reporting/log_progress.py +13 -38
  58. kodit/infrastructure/reporting/telemetry_progress.py +21 -0
  59. kodit/infrastructure/slicing/slicer.py +32 -31
  60. kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
  61. kodit/infrastructure/sqlalchemy/entities.py +428 -131
  62. kodit/infrastructure/sqlalchemy/git_branch_repository.py +263 -0
  63. kodit/infrastructure/sqlalchemy/git_commit_repository.py +337 -0
  64. kodit/infrastructure/sqlalchemy/git_repository.py +252 -0
  65. kodit/infrastructure/sqlalchemy/git_tag_repository.py +257 -0
  66. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +484 -0
  67. kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
  68. kodit/infrastructure/sqlalchemy/task_status_repository.py +91 -0
  69. kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
  70. kodit/mcp.py +12 -26
  71. kodit/migrations/env.py +1 -1
  72. kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
  73. kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
  74. kodit/migrations/versions/b9cd1c3fd762_add_task_status.py +77 -0
  75. kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
  76. kodit/py.typed +0 -0
  77. kodit/utils/dump_openapi.py +7 -4
  78. kodit/utils/path_utils.py +29 -0
  79. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/METADATA +3 -3
  80. kodit-0.5.0.dist-info/RECORD +137 -0
  81. kodit/application/factories/code_indexing_factory.py +0 -193
  82. kodit/application/services/auto_indexing_service.py +0 -103
  83. kodit/application/services/code_indexing_application_service.py +0 -393
  84. kodit/domain/entities.py +0 -323
  85. kodit/domain/services/index_query_service.py +0 -70
  86. kodit/domain/services/index_service.py +0 -267
  87. kodit/infrastructure/api/client/index_client.py +0 -57
  88. kodit/infrastructure/api/v1/routers/indexes.py +0 -119
  89. kodit/infrastructure/api/v1/schemas/index.py +0 -101
  90. kodit/infrastructure/bm25/bm25_factory.py +0 -28
  91. kodit/infrastructure/cloning/__init__.py +0 -1
  92. kodit/infrastructure/cloning/metadata.py +0 -98
  93. kodit/infrastructure/mappers/index_mapper.py +0 -345
  94. kodit/infrastructure/reporting/tdqm_progress.py +0 -73
  95. kodit/infrastructure/slicing/language_detection_service.py +0 -18
  96. kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
  97. kodit-0.4.2.dist-info/RECORD +0 -119
  98. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/WHEEL +0 -0
  99. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/entry_points.txt +0 -0
  100. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/licenses/LICENSE +0 -0
kodit/domain/entities.py DELETED
@@ -1,323 +0,0 @@
-"""Pure domain entities using Pydantic."""
-
-import shutil
-from dataclasses import dataclass
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Protocol
-from urllib.parse import urlparse, urlunparse
-
-from pydantic import AnyUrl, BaseModel
-
-from kodit.domain.value_objects import (
-    FileProcessingStatus,
-    QueuePriority,
-    SnippetContent,
-    SnippetContentType,
-    SourceType,
-    TaskType,
-)
-from kodit.utils.path_utils import path_from_uri
-
-
-class IgnorePatternProvider(Protocol):
-    """Protocol for ignore pattern providers."""
-
-    def should_ignore(self, path: Path) -> bool:
-        """Check if a path should be ignored."""
-        ...
-
-
-class Author(BaseModel):
-    """Author domain entity."""
-
-    id: int | None = None
-    name: str
-    email: str
-
-
-class File(BaseModel):
-    """File domain entity."""
-
-    id: int | None = None  # Is populated by repository
-    created_at: datetime | None = None  # Is populated by repository
-    updated_at: datetime | None = None  # Is populated by repository
-    uri: AnyUrl
-    sha256: str
-    authors: list[Author]
-    mime_type: str
-    file_processing_status: FileProcessingStatus
-
-    def as_path(self) -> Path:
-        """Return the file as a path."""
-        return path_from_uri(str(self.uri))
-
-    def extension(self) -> str:
-        """Return the file extension."""
-        return Path(self.as_path()).suffix.lstrip(".")
-
-
-class WorkingCopy(BaseModel):
-    """Working copy value object representing cloned source location."""
-
-    created_at: datetime | None = None  # Is populated by repository
-    updated_at: datetime | None = None  # Is populated by repository
-    remote_uri: AnyUrl
-    cloned_path: Path
-    source_type: SourceType
-    files: list[File]
-
-    @classmethod
-    def sanitize_local_path(cls, path: str) -> AnyUrl:
-        """Sanitize a local path."""
-        return AnyUrl(Path(path).resolve().absolute().as_uri())
-
-    @classmethod
-    def sanitize_git_url(cls, url: str) -> AnyUrl:
-        """Remove credentials from a git URL while preserving the rest of the URL.
-
-        This function handles various git URL formats:
-        - HTTPS URLs with username:password@host
-        - HTTPS URLs with username@host (no password)
-        - SSH URLs (left unchanged)
-        - File URLs (left unchanged)
-
-        Args:
-            url: The git URL that may contain credentials.
-
-        Returns:
-            The sanitized URL with credentials removed.
-
-        Examples:
-            >>> sanitize_git_url("https://phil:token@dev.azure.com/org/project/_git/repo")
-            "https://dev.azure.com/org/project/_git/repo"
-            >>> sanitize_git_url("https://username@github.com/user/repo.git")
-            "https://github.com/user/repo.git"
-            >>> sanitize_git_url("git@github.com:user/repo.git")
-            "ssh://git@github.com/user/repo.git"
-
-        """
-        # Handle SSH URLs (they don't have credentials in the URL format)
-        if url.startswith("git@"):
-            # Convert git@host:path to ssh://git@host/path format for AnyUrl
-            # This maintains the same semantic meaning while making it a valid URL
-            if ":" in url and not url.startswith("ssh://"):
-                host_path = url[4:]  # Remove "git@"
-                if ":" in host_path:
-                    host, path = host_path.split(":", 1)
-                    ssh_url = f"ssh://git@{host}/{path}"
-                    return AnyUrl(ssh_url)
-            return AnyUrl(url)
-        if url.startswith("ssh://"):
-            return AnyUrl(url)
-
-        # Handle file URLs
-        if url.startswith("file://"):
-            return AnyUrl(url)
-
-        try:
-            # Parse the URL
-            parsed = urlparse(url)
-
-            # If there are no credentials, return the URL as-is
-            if not parsed.username:
-                return AnyUrl(url)
-
-            # Reconstruct the URL without credentials
-            # scheme, netloc (without username/password), path, params, query, fragment
-            sanitized_netloc = parsed.hostname
-            if parsed.port:
-                sanitized_netloc = f"{parsed.hostname}:{parsed.port}"
-
-            return AnyUrl(
-                urlunparse(
-                    (
-                        parsed.scheme,
-                        sanitized_netloc,
-                        parsed.path,
-                        parsed.params,
-                        parsed.query,
-                        parsed.fragment,
-                    )
-                )
-            )
-
-        except Exception as e:
-            raise ValueError(f"Invalid URL: {url}") from e
-
-    def modified_or_deleted_files(self) -> list[File]:
-        """Return the modified or deleted files."""
-        return [
-            file
-            for file in self.files
-            if file.file_processing_status
-            in (FileProcessingStatus.MODIFIED, FileProcessingStatus.DELETED)
-        ]
-
-    def list_filesystem_paths(
-        self, ignore_provider: IgnorePatternProvider
-    ) -> list[Path]:
-        """List the filesystem paths of the files in the working copy."""
-        if not self.cloned_path.exists():
-            raise ValueError(f"Cloned path does not exist: {self.cloned_path}")
-
-        return [
-            f
-            for f in self.cloned_path.rglob("*")
-            if f.is_file() and not ignore_provider.should_ignore(f)
-        ]
-
-    def dirty_files(self) -> list[File]:
-        """Return the dirty files."""
-        return [
-            file
-            for file in self.files
-            if file.file_processing_status
-            in (FileProcessingStatus.MODIFIED, FileProcessingStatus.ADDED)
-        ]
-
-    def changed_files(self) -> list[File]:
-        """Return the changed files."""
-        return [
-            file
-            for file in self.files
-            if file.file_processing_status != FileProcessingStatus.CLEAN
-        ]
-
-    def clear_file_processing_statuses(self) -> None:
-        """Clear the file processing statuses."""
-        # First remove any files that are marked for deletion
-        self.files = [
-            file
-            for file in self.files
-            if file.file_processing_status != FileProcessingStatus.DELETED
-        ]
-        # Then clear the statuses for the remaining files
-        for file in self.files:
-            file.file_processing_status = FileProcessingStatus.CLEAN
-
-    def delete(self) -> None:
-        """Delete the working copy."""
-        shutil.rmtree(self.cloned_path)
-
-
-class Source(BaseModel):
-    """Source domain entity."""
-
-    id: int | None = None  # Is populated by repository
-    created_at: datetime | None = None  # Is populated by repository
-    updated_at: datetime | None = None  # Is populated by repository
-    working_copy: WorkingCopy
-
-
-class Snippet(BaseModel):
-    """Snippet domain entity."""
-
-    id: int | None = None  # Is populated by repository
-    created_at: datetime | None = None  # Is populated by repository
-    updated_at: datetime | None = None  # Is populated by repository
-    derives_from: list[File]
-    original_content: SnippetContent | None = None
-    summary_content: SnippetContent | None = None
-
-    def original_text(self) -> str:
-        """Return the original content of the snippet."""
-        if self.original_content is None:
-            return ""
-        return self.original_content.value
-
-    def summary_text(self) -> str:
-        """Return the summary content of the snippet."""
-        if self.summary_content is None:
-            return ""
-        return self.summary_content.value
-
-    def add_original_content(self, content: str, language: str) -> None:
-        """Add an original content to the snippet."""
-        self.original_content = SnippetContent(
-            type=SnippetContentType.ORIGINAL,
-            value=content,
-            language=language,
-        )
-
-    def add_summary(self, summary: str) -> None:
-        """Add a summary to the snippet."""
-        self.summary_content = SnippetContent(
-            type=SnippetContentType.SUMMARY,
-            value=summary,
-            language="markdown",
-        )
-
-
-class Index(BaseModel):
-    """Index domain entity."""
-
-    id: int
-    created_at: datetime
-    updated_at: datetime
-    source: Source
-    snippets: list[Snippet]
-
-    def delete_snippets_for_files(self, files: list[File]) -> None:
-        """Delete the snippets that derive from a list of files."""
-        self.snippets = [
-            snippet
-            for snippet in self.snippets
-            if not any(file in snippet.derives_from for file in files)
-        ]
-
-
-# FUTURE: Remove this type, use the domain to get the required information.
-@dataclass(frozen=True)
-class SnippetWithContext:
-    """Domain model for snippet with associated context information."""
-
-    source: Source
-    file: File
-    authors: list[Author]
-    snippet: Snippet
-
-
-class Task(BaseModel):
-    """Represents an item in the queue waiting to be processed.
-
-    If the item exists, that means it is in the queue and waiting to be processed. There
-    is no status associated.
-    """
-
-    id: str  # Is a unique key to deduplicate items in the queue
-    type: TaskType  # Task type
-    priority: int  # Priority (higher number = higher priority)
-    payload: dict[str, Any]  # Task-specific data
-
-    created_at: datetime | None = None  # Is populated by repository
-    updated_at: datetime | None = None  # Is populated by repository
-
-    @staticmethod
-    def create(task_type: TaskType, priority: int, payload: dict[str, Any]) -> "Task":
-        """Create a task."""
-        return Task(
-            id=Task._create_id(task_type, payload),
-            type=task_type,
-            priority=priority,
-            payload=payload,
-        )
-
-    @staticmethod
-    def _create_id(task_type: TaskType, payload: dict[str, Any]) -> str:
-        """Create a unique id for a task."""
-        if task_type == TaskType.INDEX_UPDATE:
-            return str(payload["index_id"])
-
-        raise ValueError(f"Unknown task type: {task_type}")
-
-    @staticmethod
-    def create_index_update_task(
-        index_id: int, priority: QueuePriority = QueuePriority.USER_INITIATED
-    ) -> "Task":
-        """Create an index update task."""
-        return Task.create(
-            task_type=TaskType.INDEX_UPDATE,
-            priority=priority.value,
-            payload={"index_id": index_id},
-        )
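
The credential-stripping contract documented on the removed WorkingCopy.sanitize_git_url can be exercised against the 0.4.2 API with a minimal sketch. The expected strings are taken from the docstring examples above; the exact rendering of the returned AnyUrl may vary with the installed pydantic version:

    # Runs against kodit 0.4.2 only; kodit/domain/entities.py is deleted in 0.5.0.
    from kodit.domain.entities import WorkingCopy

    assert str(WorkingCopy.sanitize_git_url(
        "https://phil:token@dev.azure.com/org/project/_git/repo"
    )) == "https://dev.azure.com/org/project/_git/repo"
    assert str(WorkingCopy.sanitize_git_url(
        "git@github.com:user/repo.git"
    )) == "ssh://git@github.com/user/repo.git"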

kodit/domain/services/index_query_service.py DELETED
@@ -1,70 +0,0 @@
-"""Index query service."""
-
-from abc import ABC, abstractmethod
-
-from kodit.domain.entities import Index, SnippetWithContext
-from kodit.domain.protocols import IndexRepository
-from kodit.domain.value_objects import (
-    FusionRequest,
-    FusionResult,
-    MultiSearchRequest,
-)
-
-
-class FusionService(ABC):
-    """Abstract fusion service interface."""
-
-    @abstractmethod
-    def reciprocal_rank_fusion(
-        self, rankings: list[list[FusionRequest]], k: float = 60
-    ) -> list[FusionResult]:
-        """Perform reciprocal rank fusion on search results."""
-
-
-class IndexQueryService:
-    """Index query service."""
-
-    def __init__(
-        self,
-        index_repository: IndexRepository,
-        fusion_service: FusionService,
-    ) -> None:
-        """Initialize the index query service."""
-        self.index_repository = index_repository
-        self.fusion_service = fusion_service
-
-    async def get_index_by_id(self, index_id: int) -> Index | None:
-        """Get an index by its ID."""
-        return await self.index_repository.get(index_id)
-
-    async def list_indexes(self) -> list[Index]:
-        """List all indexes."""
-        return await self.index_repository.all()
-
-    async def search_snippets(
-        self, request: MultiSearchRequest
-    ) -> list[SnippetWithContext]:
-        """Search snippets with filters.
-
-        Args:
-            request: The search request containing filters
-
-        Returns:
-            List of matching snippet items with context
-
-        """
-        return list(await self.index_repository.search(request))
-
-    async def perform_fusion(
-        self, rankings: list[list[FusionRequest]], k: float = 60
-    ) -> list[FusionResult]:
-        """Perform reciprocal rank fusion on search results."""
-        return self.fusion_service.reciprocal_rank_fusion(rankings, k)
-
-    async def get_snippets_by_ids(self, ids: list[int]) -> list[SnippetWithContext]:
-        """Get snippets by their IDs."""
-        snippets = await self.index_repository.get_snippets_by_ids(ids)
-
-        # Return snippets in the same order as the ids
-        snippets.sort(key=lambda x: ids.index(x.snippet.id or 0))
-        return snippets
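
FusionService above only declares the interface; the concrete implementation ships in kodit/infrastructure/indexing/fusion_service.py (touched +1 -1 in this release). As a reference for the formula the method name points at, here is a standalone reciprocal-rank-fusion sketch over hypothetical integer document ids, not the package's own code:

    from collections import defaultdict

    def reciprocal_rank_fusion(rankings: list[list[int]], k: float = 60) -> list[int]:
        # Each ranking is best-first; a document's fused score is sum(1 / (k + rank)).
        scores: dict[int, float] = defaultdict(float)
        for ranking in rankings:
            for rank, doc_id in enumerate(ranking, start=1):
                scores[doc_id] += 1.0 / (k + rank)
        return sorted(scores, key=scores.__getitem__, reverse=True)

    # e.g. fusing a BM25 ranking with a vector-search ranking:
    print(reciprocal_rank_fusion([[1, 2, 3], [3, 1, 4]]))  # [1, 3, 2, 4]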

kodit/domain/services/index_service.py DELETED
@@ -1,267 +0,0 @@
-"""Pure domain service for Index aggregate operations."""
-
-from abc import ABC, abstractmethod
-from collections import defaultdict
-from pathlib import Path
-
-import structlog
-from pydantic import AnyUrl
-
-import kodit.domain.entities as domain_entities
-from kodit.application.factories.reporting_factory import create_noop_operation
-from kodit.application.services.reporting import ProgressTracker
-from kodit.domain.services.enrichment_service import EnrichmentDomainService
-from kodit.domain.value_objects import (
-    EnrichmentIndexRequest,
-    EnrichmentRequest,
-    FileProcessingStatus,
-    LanguageMapping,
-)
-from kodit.infrastructure.cloning.git.working_copy import GitWorkingCopyProvider
-from kodit.infrastructure.cloning.metadata import FileMetadataExtractor
-from kodit.infrastructure.git.git_utils import is_valid_clone_target
-from kodit.infrastructure.ignore.ignore_pattern_provider import GitIgnorePatternProvider
-from kodit.infrastructure.slicing.slicer import Slicer
-from kodit.utils.path_utils import path_from_uri
-
-
-class LanguageDetectionService(ABC):
-    """Abstract interface for language detection service."""
-
-    @abstractmethod
-    async def detect_language(self, file_path: Path) -> str:
-        """Detect the programming language of a file."""
-
-
-class IndexDomainService:
-    """Pure domain service for Index aggregate operations.
-
-    This service handles the full lifecycle of code indexing:
-    - Creating indexes for source repositories
-    - Cloning and processing source files
-    - Extracting and enriching code snippets
-    - Managing the complete Index aggregate
-    """
-
-    def __init__(
-        self,
-        language_detector: LanguageDetectionService,
-        enrichment_service: EnrichmentDomainService,
-        clone_dir: Path,
-    ) -> None:
-        """Initialize the index domain service."""
-        self._clone_dir = clone_dir
-        self._language_detector = language_detector
-        self._enrichment_service = enrichment_service
-        self.log = structlog.get_logger(__name__)
-
-    async def prepare_index(
-        self,
-        uri_or_path_like: str,  # Must include user/pass, etc
-        step: ProgressTracker | None = None,
-    ) -> domain_entities.WorkingCopy:
-        """Prepare an index by scanning files and creating working copy."""
-        step = step or create_noop_operation()
-        self.log.info("Preparing index")
-        sanitized_uri, source_type = self.sanitize_uri(uri_or_path_like)
-        self.log.info("Preparing source", uri=str(sanitized_uri))
-
-        if source_type == domain_entities.SourceType.FOLDER:
-            local_path = path_from_uri(str(sanitized_uri))
-        elif source_type == domain_entities.SourceType.GIT:
-            source_type = domain_entities.SourceType.GIT
-            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
-            local_path = await git_working_copy_provider.prepare(uri_or_path_like, step)
-        else:
-            raise ValueError(f"Unsupported source: {uri_or_path_like}")
-
-        return domain_entities.WorkingCopy(
-            remote_uri=sanitized_uri,
-            cloned_path=local_path,
-            source_type=source_type,
-            files=[],
-        )
-
-    async def extract_snippets_from_index(
-        self,
-        index: domain_entities.Index,
-        step: ProgressTracker | None = None,
-    ) -> domain_entities.Index:
-        """Extract code snippets from files in the index."""
-        step = step or create_noop_operation()
-        file_count = len(index.source.working_copy.files)
-
-        self.log.info(
-            "Extracting snippets",
-            index_id=index.id,
-            file_count=file_count,
-        )
-
-        # Only create snippets for files that have been added or modified
-        files = index.source.working_copy.changed_files()
-        index.delete_snippets_for_files(files)
-
-        # Filter out deleted files - they don't exist on disk anymore
-        files = [
-            f for f in files if f.file_processing_status != FileProcessingStatus.DELETED
-        ]
-
-        # Create a set of languages to extract snippets for
-        extensions = {file.extension() for file in files}
-        lang_files_map: dict[str, list[domain_entities.File]] = defaultdict(list)
-        for ext in extensions:
-            try:
-                lang = LanguageMapping.get_language_for_extension(ext)
-                lang_files_map[lang].extend(
-                    file for file in files if file.extension() == ext
-                )
-            except ValueError as e:
-                self.log.debug("Skipping", error=str(e))
-                continue
-
-        self.log.info(
-            "Languages to process",
-            languages=lang_files_map.keys(),
-        )
-
-        # Calculate snippets for each language
-        slicer = Slicer()
-        step.set_total(len(lang_files_map.keys()))
-        for i, (lang, lang_files) in enumerate(lang_files_map.items()):
-            step.set_current(i)
-            s = slicer.extract_snippets(lang_files, language=lang)
-            index.snippets.extend(s)
-
-        return index
-
-    async def enrich_snippets_in_index(
-        self,
-        snippets: list[domain_entities.Snippet],
-        reporting_step: ProgressTracker | None = None,
-    ) -> list[domain_entities.Snippet]:
-        """Enrich snippets with AI-generated summaries."""
-        reporting_step = reporting_step or create_noop_operation()
-        if not snippets or len(snippets) == 0:
-            reporting_step.skip("No snippets to enrich")
-            return snippets
-
-        reporting_step.set_total(len(snippets))
-        snippet_map = {snippet.id: snippet for snippet in snippets if snippet.id}
-
-        enrichment_request = EnrichmentIndexRequest(
-            requests=[
-                EnrichmentRequest(snippet_id=snippet_id, text=snippet.original_text())
-                for snippet_id, snippet in snippet_map.items()
-            ]
-        )
-
-        processed = 0
-        async for result in self._enrichment_service.enrich_documents(
-            enrichment_request
-        ):
-            snippet_map[result.snippet_id].add_summary(result.text)
-
-            processed += 1
-            reporting_step.set_current(processed)
-
-        return list(snippet_map.values())
-
-    def sanitize_uri(
-        self, uri_or_path_like: str
-    ) -> tuple[AnyUrl, domain_entities.SourceType]:
-        """Convert a URI or path-like string to a URI."""
-        # First, check if it's a local directory (more reliable than git check)
-        if Path(uri_or_path_like).is_dir():
-            return (
-                domain_entities.WorkingCopy.sanitize_local_path(uri_or_path_like),
-                domain_entities.SourceType.FOLDER,
-            )
-
-        # Then check if it's git-clonable
-        if is_valid_clone_target(uri_or_path_like):
-            return (
-                domain_entities.WorkingCopy.sanitize_git_url(uri_or_path_like),
-                domain_entities.SourceType.GIT,
-            )
-
-        raise ValueError(f"Unsupported source: {uri_or_path_like}")
-
-    async def refresh_working_copy(
-        self,
-        working_copy: domain_entities.WorkingCopy,
-        step: ProgressTracker | None = None,
-    ) -> domain_entities.WorkingCopy:
-        """Refresh the working copy."""
-        step = step or create_noop_operation()
-        metadata_extractor = FileMetadataExtractor(working_copy.source_type)
-        if working_copy.source_type == domain_entities.SourceType.GIT:
-            git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
-            await git_working_copy_provider.sync(str(working_copy.remote_uri), step)
-
-        current_file_paths = working_copy.list_filesystem_paths(
-            GitIgnorePatternProvider(working_copy.cloned_path)
-        )
-
-        previous_files_map = {file.as_path(): file for file in working_copy.files}
-
-        # Calculate different sets of files
-        deleted_file_paths = set(previous_files_map.keys()) - set(current_file_paths)
-        new_file_paths = set(current_file_paths) - set(previous_files_map.keys())
-        modified_file_paths = set(current_file_paths) & set(previous_files_map.keys())
-        num_files_to_process = (
-            len(deleted_file_paths) + len(new_file_paths) + len(modified_file_paths)
-        )
-        self.log.info(
-            "Refreshing working copy",
-            num_deleted=len(deleted_file_paths),
-            num_new=len(new_file_paths),
-            num_modified=len(modified_file_paths),
-            num_total_changes=num_files_to_process,
-            num_dirty=len(working_copy.dirty_files()),
-        )
-
-        # Setup reporter
-        processed = 0
-        step.set_total(num_files_to_process)
-
-        # First check to see if any files have been deleted
-        for file_path in deleted_file_paths:
-            processed += 1
-            step.set_current(processed)
-            previous_files_map[
-                file_path
-            ].file_processing_status = domain_entities.FileProcessingStatus.DELETED
-
-        # Then check to see if there are any new files
-        for file_path in new_file_paths:
-            processed += 1
-            step.set_current(processed)
-            try:
-                working_copy.files.append(
-                    await metadata_extractor.extract(file_path=file_path)
-                )
-            except (OSError, ValueError) as e:
-                self.log.debug("Skipping file", file=str(file_path), error=str(e))
-                continue
-
-        # Finally check if there are any modified files
-        for file_path in modified_file_paths:
-            processed += 1
-            step.set_current(processed)
-            try:
-                previous_file = previous_files_map[file_path]
-                new_file = await metadata_extractor.extract(file_path=file_path)
-                if previous_file.sha256 != new_file.sha256:
-                    previous_file.file_processing_status = (
-                        domain_entities.FileProcessingStatus.MODIFIED
-                    )
-            except (OSError, ValueError) as e:
-                self.log.info("Skipping file", file=str(file_path), error=str(e))
-                continue
-
-        return working_copy
-
-    async def delete_index(self, index: domain_entities.Index) -> None:
-        """Delete an index."""
-        # Delete the working copy
-        index.source.working_copy.delete()
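
The removed refresh_working_copy classifies files with plain set algebra over the previous and current path sets, then flags intersection members whose sha256 changed. Reduced to a self-contained sketch, with hypothetical paths and hash strings standing in for the File entities:

    previous = {"a.py": "hash-1", "b.py": "hash-2"}   # files the index knew about
    current = {"b.py": "hash-2b", "c.py": "hash-3"}   # files now on disk

    deleted = previous.keys() - current.keys()        # {'a.py'}
    new = current.keys() - previous.keys()            # {'c.py'}
    candidates = previous.keys() & current.keys()     # {'b.py'}
    modified = {p for p in candidates if previous[p] != current[p]}

    print(sorted(deleted), sorted(new), sorted(modified))  # ['a.py'] ['c.py'] ['b.py']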