kodit 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (52) hide show
  1. kodit/_version.py +2 -2
  2. kodit/app.py +9 -2
  3. kodit/application/factories/code_indexing_factory.py +62 -13
  4. kodit/application/factories/reporting_factory.py +32 -0
  5. kodit/application/services/auto_indexing_service.py +41 -33
  6. kodit/application/services/code_indexing_application_service.py +137 -138
  7. kodit/application/services/indexing_worker_service.py +26 -30
  8. kodit/application/services/queue_service.py +12 -14
  9. kodit/application/services/reporting.py +104 -0
  10. kodit/application/services/sync_scheduler.py +21 -20
  11. kodit/cli.py +71 -85
  12. kodit/config.py +26 -3
  13. kodit/database.py +2 -1
  14. kodit/domain/entities.py +99 -1
  15. kodit/domain/protocols.py +34 -1
  16. kodit/domain/services/bm25_service.py +1 -6
  17. kodit/domain/services/index_service.py +23 -57
  18. kodit/domain/services/task_status_query_service.py +19 -0
  19. kodit/domain/value_objects.py +53 -8
  20. kodit/infrastructure/api/v1/dependencies.py +40 -12
  21. kodit/infrastructure/api/v1/routers/indexes.py +45 -0
  22. kodit/infrastructure/api/v1/schemas/task_status.py +39 -0
  23. kodit/infrastructure/cloning/git/working_copy.py +43 -7
  24. kodit/infrastructure/embedding/embedding_factory.py +8 -3
  25. kodit/infrastructure/embedding/embedding_providers/litellm_embedding_provider.py +48 -55
  26. kodit/infrastructure/enrichment/local_enrichment_provider.py +41 -30
  27. kodit/infrastructure/git/git_utils.py +3 -2
  28. kodit/infrastructure/mappers/index_mapper.py +1 -0
  29. kodit/infrastructure/mappers/task_status_mapper.py +85 -0
  30. kodit/infrastructure/reporting/__init__.py +1 -0
  31. kodit/infrastructure/reporting/db_progress.py +23 -0
  32. kodit/infrastructure/reporting/log_progress.py +37 -0
  33. kodit/infrastructure/reporting/tdqm_progress.py +38 -0
  34. kodit/infrastructure/sqlalchemy/embedding_repository.py +47 -68
  35. kodit/infrastructure/sqlalchemy/entities.py +89 -2
  36. kodit/infrastructure/sqlalchemy/index_repository.py +274 -236
  37. kodit/infrastructure/sqlalchemy/task_repository.py +55 -39
  38. kodit/infrastructure/sqlalchemy/task_status_repository.py +79 -0
  39. kodit/infrastructure/sqlalchemy/unit_of_work.py +59 -0
  40. kodit/mcp.py +15 -3
  41. kodit/migrations/env.py +0 -1
  42. kodit/migrations/versions/b9cd1c3fd762_add_task_status.py +77 -0
  43. {kodit-0.4.1.dist-info → kodit-0.4.3.dist-info}/METADATA +1 -1
  44. {kodit-0.4.1.dist-info → kodit-0.4.3.dist-info}/RECORD +47 -40
  45. kodit/domain/interfaces.py +0 -27
  46. kodit/infrastructure/ui/__init__.py +0 -1
  47. kodit/infrastructure/ui/progress.py +0 -170
  48. kodit/infrastructure/ui/spinner.py +0 -74
  49. kodit/reporting.py +0 -78
  50. {kodit-0.4.1.dist-info → kodit-0.4.3.dist-info}/WHEEL +0 -0
  51. {kodit-0.4.1.dist-info → kodit-0.4.3.dist-info}/entry_points.txt +0 -0
  52. {kodit-0.4.1.dist-info → kodit-0.4.3.dist-info}/licenses/LICENSE +0 -0
@@ -8,7 +8,8 @@ import structlog
8
8
  from pydantic import AnyUrl
9
9
 
10
10
  import kodit.domain.entities as domain_entities
11
- from kodit.domain.interfaces import ProgressCallback
11
+ from kodit.application.factories.reporting_factory import create_noop_operation
12
+ from kodit.application.services.reporting import ProgressTracker
12
13
  from kodit.domain.services.enrichment_service import EnrichmentDomainService
13
14
  from kodit.domain.value_objects import (
14
15
  EnrichmentIndexRequest,
@@ -21,7 +22,6 @@ from kodit.infrastructure.cloning.metadata import FileMetadataExtractor
21
22
  from kodit.infrastructure.git.git_utils import is_valid_clone_target
22
23
  from kodit.infrastructure.ignore.ignore_pattern_provider import GitIgnorePatternProvider
23
24
  from kodit.infrastructure.slicing.slicer import Slicer
24
- from kodit.reporting import Reporter
25
25
  from kodit.utils.path_utils import path_from_uri
26
26
 
27
27
 
@@ -58,27 +58,23 @@ class IndexDomainService:
58
58
  async def prepare_index(
59
59
  self,
60
60
  uri_or_path_like: str, # Must include user/pass, etc
61
- progress_callback: ProgressCallback | None = None,
61
+ step: ProgressTracker | None = None,
62
62
  ) -> domain_entities.WorkingCopy:
63
63
  """Prepare an index by scanning files and creating working copy."""
64
+ step = step or create_noop_operation()
65
+ self.log.info("Preparing index")
64
66
  sanitized_uri, source_type = self.sanitize_uri(uri_or_path_like)
65
- reporter = Reporter(self.log, progress_callback)
66
67
  self.log.info("Preparing source", uri=str(sanitized_uri))
67
68
 
68
69
  if source_type == domain_entities.SourceType.FOLDER:
69
- await reporter.start("prepare_index", 1, "Scanning source...")
70
70
  local_path = path_from_uri(str(sanitized_uri))
71
71
  elif source_type == domain_entities.SourceType.GIT:
72
72
  source_type = domain_entities.SourceType.GIT
73
73
  git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
74
- await reporter.start("prepare_index", 1, "Cloning source...")
75
- local_path = await git_working_copy_provider.prepare(uri_or_path_like)
76
- await reporter.done("prepare_index")
74
+ local_path = await git_working_copy_provider.prepare(uri_or_path_like, step)
77
75
  else:
78
76
  raise ValueError(f"Unsupported source: {uri_or_path_like}")
79
77
 
80
- await reporter.done("prepare_index")
81
-
82
78
  return domain_entities.WorkingCopy(
83
79
  remote_uri=sanitized_uri,
84
80
  cloned_path=local_path,
@@ -89,9 +85,10 @@ class IndexDomainService:
89
85
  async def extract_snippets_from_index(
90
86
  self,
91
87
  index: domain_entities.Index,
92
- progress_callback: ProgressCallback | None = None,
88
+ step: ProgressTracker | None = None,
93
89
  ) -> domain_entities.Index:
94
90
  """Extract code snippets from files in the index."""
91
+ step = step or create_noop_operation()
95
92
  file_count = len(index.source.working_copy.files)
96
93
 
97
94
  self.log.info(
@@ -127,40 +124,28 @@ class IndexDomainService:
127
124
  languages=lang_files_map.keys(),
128
125
  )
129
126
 
130
- reporter = Reporter(self.log, progress_callback)
131
- await reporter.start(
132
- "extract_snippets",
133
- len(lang_files_map.keys()),
134
- "Extracting code snippets...",
135
- )
136
-
137
127
  # Calculate snippets for each language
138
128
  slicer = Slicer()
129
+ await step.set_total(len(lang_files_map.keys()))
139
130
  for i, (lang, lang_files) in enumerate(lang_files_map.items()):
140
- await reporter.step(
141
- "extract_snippets",
142
- i,
143
- len(lang_files_map.keys()),
144
- f"Extracting code snippets for {lang}...",
145
- )
131
+ await step.set_current(i, f"Extracting snippets for {lang}")
146
132
  s = slicer.extract_snippets(lang_files, language=lang)
147
133
  index.snippets.extend(s)
148
134
 
149
- await reporter.done("extract_snippets")
150
135
  return index
151
136
 
152
137
  async def enrich_snippets_in_index(
153
138
  self,
154
139
  snippets: list[domain_entities.Snippet],
155
- progress_callback: ProgressCallback | None = None,
140
+ reporting_step: ProgressTracker | None = None,
156
141
  ) -> list[domain_entities.Snippet]:
157
142
  """Enrich snippets with AI-generated summaries."""
143
+ reporting_step = reporting_step or create_noop_operation()
158
144
  if not snippets or len(snippets) == 0:
145
+ await reporting_step.skip("No snippets to enrich")
159
146
  return snippets
160
147
 
161
- reporter = Reporter(self.log, progress_callback)
162
- await reporter.start("enrichment", len(snippets), "Enriching snippets...")
163
-
148
+ await reporting_step.set_total(len(snippets))
164
149
  snippet_map = {snippet.id: snippet for snippet in snippets if snippet.id}
165
150
 
166
151
  enrichment_request = EnrichmentIndexRequest(
@@ -177,11 +162,10 @@ class IndexDomainService:
177
162
  snippet_map[result.snippet_id].add_summary(result.text)
178
163
 
179
164
  processed += 1
180
- await reporter.step(
181
- "enrichment", processed, len(snippets), "Enriching snippets..."
165
+ await reporting_step.set_current(
166
+ processed, f"Enriching snippets for {processed} snippets"
182
167
  )
183
168
 
184
- await reporter.done("enrichment")
185
169
  return list(snippet_map.values())
186
170
 
187
171
  def sanitize_uri(
@@ -207,15 +191,14 @@ class IndexDomainService:
207
191
  async def refresh_working_copy(
208
192
  self,
209
193
  working_copy: domain_entities.WorkingCopy,
210
- progress_callback: ProgressCallback | None = None,
194
+ step: ProgressTracker | None = None,
211
195
  ) -> domain_entities.WorkingCopy:
212
196
  """Refresh the working copy."""
197
+ step = step or create_noop_operation()
213
198
  metadata_extractor = FileMetadataExtractor(working_copy.source_type)
214
- reporter = Reporter(self.log, progress_callback)
215
-
216
199
  if working_copy.source_type == domain_entities.SourceType.GIT:
217
200
  git_working_copy_provider = GitWorkingCopyProvider(self._clone_dir)
218
- await git_working_copy_provider.sync(str(working_copy.remote_uri))
201
+ await git_working_copy_provider.sync(str(working_copy.remote_uri), step)
219
202
 
220
203
  current_file_paths = working_copy.list_filesystem_paths(
221
204
  GitIgnorePatternProvider(working_copy.cloned_path)
@@ -241,19 +224,12 @@ class IndexDomainService:
241
224
 
242
225
  # Setup reporter
243
226
  processed = 0
244
- await reporter.start(
245
- "refresh_working_copy", num_files_to_process, "Refreshing working copy..."
246
- )
227
+ await step.set_total(num_files_to_process)
247
228
 
248
229
  # First check to see if any files have been deleted
249
230
  for file_path in deleted_file_paths:
250
231
  processed += 1
251
- await reporter.step(
252
- "refresh_working_copy",
253
- processed,
254
- num_files_to_process,
255
- f"Deleted {file_path.name}",
256
- )
232
+ await step.set_current(processed, f"Deleting file {file_path}")
257
233
  previous_files_map[
258
234
  file_path
259
235
  ].file_processing_status = domain_entities.FileProcessingStatus.DELETED
@@ -261,12 +237,7 @@ class IndexDomainService:
261
237
  # Then check to see if there are any new files
262
238
  for file_path in new_file_paths:
263
239
  processed += 1
264
- await reporter.step(
265
- "refresh_working_copy",
266
- processed,
267
- num_files_to_process,
268
- f"New {file_path.name}",
269
- )
240
+ await step.set_current(processed, f"Adding new file {file_path}")
270
241
  try:
271
242
  working_copy.files.append(
272
243
  await metadata_extractor.extract(file_path=file_path)
@@ -278,12 +249,7 @@ class IndexDomainService:
278
249
  # Finally check if there are any modified files
279
250
  for file_path in modified_file_paths:
280
251
  processed += 1
281
- await reporter.step(
282
- "refresh_working_copy",
283
- processed,
284
- num_files_to_process,
285
- f"Modified {file_path.name}",
286
- )
252
+ await step.set_current(processed, f"Modifying file {file_path}")
287
253
  try:
288
254
  previous_file = previous_files_map[file_path]
289
255
  new_file = await metadata_extractor.extract(file_path=file_path)
@@ -0,0 +1,19 @@
1
+ """Domain service for querying task status."""
2
+
3
+ from kodit.domain.entities import TaskStatus
4
+ from kodit.domain.protocols import TaskStatusRepository
5
+ from kodit.domain.value_objects import TrackableType
6
+
7
+
8
+ class TaskStatusQueryService:
9
+ """Query service for task status information."""
10
+
11
+ def __init__(self, repository: TaskStatusRepository) -> None:
12
+ """Initialize the task status query service."""
13
+ self._repository = repository
14
+
15
+ async def get_index_status(self, index_id: int) -> list[TaskStatus]:
16
+ """Get the status of tasks for a specific index."""
17
+ return await self._repository.load_with_hierarchy(
18
+ trackable_type=TrackableType.INDEX.value, trackable_id=index_id
19
+ )
@@ -3,7 +3,7 @@
3
3
  import json
4
4
  from dataclasses import dataclass
5
5
  from datetime import datetime
6
- from enum import Enum, IntEnum
6
+ from enum import Enum, IntEnum, StrEnum
7
7
  from pathlib import Path
8
8
  from typing import ClassVar
9
9
 
@@ -390,18 +390,18 @@ class IndexRunRequest:
390
390
 
391
391
 
392
392
  @dataclass
393
- class ProgressEvent:
394
- """Domain model for progress events."""
393
+ class ProgressState:
394
+ """Progress state."""
395
395
 
396
- operation: str
397
- current: int
398
- total: int
399
- message: str | None = None
396
+ current: int = 0
397
+ total: int = 0
398
+ operation: str = ""
399
+ message: str = ""
400
400
 
401
401
  @property
402
402
  def percentage(self) -> float:
403
403
  """Calculate the percentage of completion."""
404
- return (self.current / self.total * 100) if self.total > 0 else 0.0
404
+ return (self.current / self.total) * 100 if self.total > 0 else 0.0
405
405
 
406
406
 
407
407
  @dataclass
@@ -662,3 +662,48 @@ class QueuePriority(IntEnum):
662
662
 
663
663
  BACKGROUND = 10
664
664
  USER_INITIATED = 50
665
+
666
+
667
+ # Reporting value objects
668
+
669
+
670
+ class ReportingState(StrEnum):
671
+ """Reporting state."""
672
+
673
+ STARTED = "started"
674
+ IN_PROGRESS = "in_progress"
675
+ COMPLETED = "completed"
676
+ FAILED = "failed"
677
+ SKIPPED = "skipped"
678
+
679
+ @staticmethod
680
+ def is_terminal(state: "ReportingState") -> bool:
681
+ """Check if a state is completed."""
682
+ return state in [
683
+ ReportingState.COMPLETED,
684
+ ReportingState.FAILED,
685
+ ReportingState.SKIPPED,
686
+ ]
687
+
688
+
689
+ class TrackableType(StrEnum):
690
+ """Trackable type."""
691
+
692
+ INDEX = "indexes"
693
+
694
+
695
+ class TaskOperation(StrEnum):
696
+ """Task operation."""
697
+
698
+ ROOT = "kodit.root"
699
+ CREATE_INDEX = "kodit.index.create"
700
+ RUN_INDEX = "kodit.index.run"
701
+ REFRESH_WORKING_COPY = "kodit.index.run.refresh_working_copy"
702
+ DELETE_OLD_SNIPPETS = "kodit.index.run.delete_old_snippets"
703
+ EXTRACT_SNIPPETS = "kodit.index.run.extract_snippets"
704
+ CREATE_BM25_INDEX = "kodit.index.run.create_bm25_index"
705
+ CREATE_CODE_EMBEDDINGS = "kodit.index.run.create_code_embeddings"
706
+ ENRICH_SNIPPETS = "kodit.index.run.enrich_snippets"
707
+ CREATE_TEXT_EMBEDDINGS = "kodit.index.run.create_text_embeddings"
708
+ UPDATE_INDEX_TIMESTAMP = "kodit.index.run.update_index_timestamp"
709
+ CLEAR_FILE_PROCESSING_STATUSES = "kodit.index.run.clear_file_processing_statuses"
@@ -1,13 +1,13 @@
1
1
  """FastAPI dependencies for the REST API."""
2
2
 
3
- from collections.abc import AsyncGenerator
3
+ from collections.abc import AsyncGenerator, Callable
4
4
  from typing import Annotated, cast
5
5
 
6
6
  from fastapi import Depends, Request
7
7
  from sqlalchemy.ext.asyncio import AsyncSession
8
8
 
9
9
  from kodit.application.factories.code_indexing_factory import (
10
- create_code_indexing_application_service,
10
+ create_server_code_indexing_application_service,
11
11
  )
12
12
  from kodit.application.services.code_indexing_application_service import (
13
13
  CodeIndexingApplicationService,
@@ -15,8 +15,12 @@ from kodit.application.services.code_indexing_application_service import (
15
15
  from kodit.application.services.queue_service import QueueService
16
16
  from kodit.config import AppContext
17
17
  from kodit.domain.services.index_query_service import IndexQueryService
18
+ from kodit.domain.services.task_status_query_service import TaskStatusQueryService
18
19
  from kodit.infrastructure.indexing.fusion_service import ReciprocalRankFusionService
19
- from kodit.infrastructure.sqlalchemy.index_repository import SqlAlchemyIndexRepository
20
+ from kodit.infrastructure.sqlalchemy.index_repository import create_index_repository
21
+ from kodit.infrastructure.sqlalchemy.task_status_repository import (
22
+ create_task_status_repository,
23
+ )
20
24
 
21
25
 
22
26
  def get_app_context(request: Request) -> AppContext:
@@ -42,12 +46,25 @@ async def get_db_session(
42
46
  DBSessionDep = Annotated[AsyncSession, Depends(get_db_session)]
43
47
 
44
48
 
49
+ async def get_db_session_factory(
50
+ app_context: AppContextDep,
51
+ ) -> AsyncGenerator[Callable[[], AsyncSession], None]:
52
+ """Get database session dependency."""
53
+ db = await app_context.get_db()
54
+ yield db.session_factory
55
+
56
+
57
+ DBSessionFactoryDep = Annotated[
58
+ Callable[[], AsyncSession], Depends(get_db_session_factory)
59
+ ]
60
+
61
+
45
62
  async def get_index_query_service(
46
- session: DBSessionDep,
63
+ session_factory: DBSessionFactoryDep,
47
64
  ) -> IndexQueryService:
48
65
  """Get index query service dependency."""
49
66
  return IndexQueryService(
50
- index_repository=SqlAlchemyIndexRepository(session=session),
67
+ index_repository=create_index_repository(session_factory=session_factory),
51
68
  fusion_service=ReciprocalRankFusionService(),
52
69
  )
53
70
 
@@ -57,13 +74,10 @@ IndexQueryServiceDep = Annotated[IndexQueryService, Depends(get_index_query_serv
57
74
 
58
75
  async def get_indexing_app_service(
59
76
  app_context: AppContextDep,
60
- session: DBSessionDep,
77
+ session_factory: DBSessionFactoryDep,
61
78
  ) -> CodeIndexingApplicationService:
62
79
  """Get indexing application service dependency."""
63
- return create_code_indexing_application_service(
64
- app_context=app_context,
65
- session=session,
66
- )
80
+ return create_server_code_indexing_application_service(app_context, session_factory)
67
81
 
68
82
 
69
83
  IndexingAppServiceDep = Annotated[
@@ -72,12 +86,26 @@ IndexingAppServiceDep = Annotated[
72
86
 
73
87
 
74
88
  async def get_queue_service(
75
- session: DBSessionDep,
89
+ session_factory: DBSessionFactoryDep,
76
90
  ) -> QueueService:
77
91
  """Get queue service dependency."""
78
92
  return QueueService(
79
- session=session,
93
+ session_factory=session_factory,
80
94
  )
81
95
 
82
96
 
83
97
  QueueServiceDep = Annotated[QueueService, Depends(get_queue_service)]
98
+
99
+
100
+ async def get_task_status_query_service(
101
+ session_factory: DBSessionFactoryDep,
102
+ ) -> TaskStatusQueryService:
103
+ """Get task status query service dependency."""
104
+ return TaskStatusQueryService(
105
+ repository=create_task_status_repository(session_factory=session_factory)
106
+ )
107
+
108
+
109
+ TaskStatusQueryServiceDep = Annotated[
110
+ TaskStatusQueryService, Depends(get_task_status_query_service)
111
+ ]
@@ -9,6 +9,7 @@ from kodit.infrastructure.api.v1.dependencies import (
9
9
  IndexingAppServiceDep,
10
10
  IndexQueryServiceDep,
11
11
  QueueServiceDep,
12
+ TaskStatusQueryServiceDep,
12
13
  )
13
14
  from kodit.infrastructure.api.v1.schemas.index import (
14
15
  IndexAttributes,
@@ -18,6 +19,11 @@ from kodit.infrastructure.api.v1.schemas.index import (
18
19
  IndexListResponse,
19
20
  IndexResponse,
20
21
  )
22
+ from kodit.infrastructure.api.v1.schemas.task_status import (
23
+ TaskStatusAttributes,
24
+ TaskStatusData,
25
+ TaskStatusListResponse,
26
+ )
21
27
 
22
28
  router = APIRouter(
23
29
  prefix="/api/v1/indexes",
@@ -103,6 +109,45 @@ async def get_index(
103
109
  )
104
110
 
105
111
 
112
+ @router.get(
113
+ "/{index_id}/status",
114
+ responses={404: {"description": "Index not found"}},
115
+ )
116
+ async def get_index_status(
117
+ index_id: int,
118
+ query_service: IndexQueryServiceDep,
119
+ status_service: TaskStatusQueryServiceDep,
120
+ ) -> TaskStatusListResponse:
121
+ """Get the status of tasks for an index."""
122
+ # Verify the index exists
123
+ index = await query_service.get_index_by_id(index_id)
124
+ if not index:
125
+ raise HTTPException(status_code=404, detail="Index not found")
126
+
127
+ # Get all task statuses for this index
128
+ progress_trackers = await status_service.get_index_status(index_id)
129
+
130
+ # Convert progress trackers to API response format
131
+ task_statuses = []
132
+ for _i, status in enumerate(progress_trackers):
133
+ task_statuses.append(
134
+ TaskStatusData(
135
+ id=status.id,
136
+ attributes=TaskStatusAttributes(
137
+ step=status.operation,
138
+ state=status.state,
139
+ progress=status.completion_percent,
140
+ total=status.total,
141
+ current=status.current,
142
+ created_at=status.created_at,
143
+ updated_at=status.updated_at,
144
+ ),
145
+ )
146
+ )
147
+
148
+ return TaskStatusListResponse(data=task_statuses)
149
+
150
+
106
151
  @router.delete(
107
152
  "/{index_id}", status_code=204, responses={404: {"description": "Index not found"}}
108
153
  )
@@ -0,0 +1,39 @@
1
+ """JSON:API schemas for task status operations."""
2
+
3
+ from datetime import datetime
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
+ class TaskStatusAttributes(BaseModel):
9
+ """Task status attributes for JSON:API responses."""
10
+
11
+ step: str = Field(..., description="Name of the task/operation")
12
+ state: str = Field(..., description="Current state of the task")
13
+ progress: float = Field(
14
+ default=0.0, ge=0.0, le=100.0, description="Progress percentage (0-100)"
15
+ )
16
+ total: int = Field(default=0, description="Total number of items to process")
17
+ current: int = Field(default=0, description="Current number of items processed")
18
+ created_at: datetime | None = Field(default=None, description="Task start time")
19
+ updated_at: datetime | None = Field(default=None, description="Last update time")
20
+
21
+
22
+ class TaskStatusData(BaseModel):
23
+ """Task status data for JSON:API responses."""
24
+
25
+ type: str = "task_status"
26
+ id: str
27
+ attributes: TaskStatusAttributes
28
+
29
+
30
+ class TaskStatusResponse(BaseModel):
31
+ """JSON:API response for single task status."""
32
+
33
+ data: TaskStatusData
34
+
35
+
36
+ class TaskStatusListResponse(BaseModel):
37
+ """JSON:API response for task status list."""
38
+
39
+ data: list[TaskStatusData]
@@ -1,5 +1,6 @@
1
1
  """Working copy provider for git-based sources."""
2
2
 
3
+ import asyncio
3
4
  import hashlib
4
5
  import shutil
5
6
  from pathlib import Path
@@ -7,6 +8,8 @@ from pathlib import Path
7
8
  import git
8
9
  import structlog
9
10
 
11
+ from kodit.application.factories.reporting_factory import create_noop_operation
12
+ from kodit.application.services.reporting import ProgressTracker
10
13
  from kodit.domain.entities import WorkingCopy
11
14
 
12
15
 
@@ -25,18 +28,48 @@ class GitWorkingCopyProvider:
25
28
  dir_name = f"repo-{dir_hash}"
26
29
  return self.clone_dir / dir_name
27
30
 
28
- async def prepare(self, uri: str) -> Path:
31
+ async def prepare(
32
+ self,
33
+ uri: str,
34
+ step: ProgressTracker | None = None,
35
+ ) -> Path:
29
36
  """Prepare a Git working copy."""
37
+ step = step or create_noop_operation()
30
38
  sanitized_uri = WorkingCopy.sanitize_git_url(uri)
31
39
  clone_path = self.get_clone_path(uri)
32
40
  clone_path.mkdir(parents=True, exist_ok=True)
33
41
 
42
+ step_record = []
43
+ await step.set_total(12)
44
+
45
+ def _clone_progress_callback(
46
+ a: int, _: str | float | None, __: str | float | None, _d: str
47
+ ) -> None:
48
+ if a not in step_record:
49
+ step_record.append(a)
50
+
51
+ # Git reports a really weird format. This is a quick hack to get some
52
+ # progress.
53
+ # Normally this would fail because the loop is already running,
54
+ # but in this case, this callback is called by some git sub-thread.
55
+ asyncio.run(
56
+ step.set_current(
57
+ len(step_record), f"Cloning repository ({step_record[-1]})"
58
+ )
59
+ )
60
+
34
61
  try:
35
62
  self.log.info(
36
63
  "Cloning repository", uri=sanitized_uri, clone_path=str(clone_path)
37
64
  )
38
65
  # Use the original URI for cloning (with credentials if present)
39
- git.Repo.clone_from(uri, clone_path)
66
+ options = ["--depth=1", "--single-branch"]
67
+ git.Repo.clone_from(
68
+ uri,
69
+ clone_path,
70
+ progress=_clone_progress_callback,
71
+ multi_options=options,
72
+ )
40
73
  except git.GitCommandError as e:
41
74
  if "already exists and is not an empty directory" not in str(e):
42
75
  msg = f"Failed to clone repository: {e}"
@@ -45,8 +78,9 @@ class GitWorkingCopyProvider:
45
78
 
46
79
  return clone_path
47
80
 
48
- async def sync(self, uri: str) -> Path:
81
+ async def sync(self, uri: str, step: ProgressTracker | None = None) -> Path:
49
82
  """Refresh a Git working copy."""
83
+ step = step or create_noop_operation()
50
84
  clone_path = self.get_clone_path(uri)
51
85
 
52
86
  # Check if the clone directory exists and is a valid Git repository
@@ -54,9 +88,10 @@ class GitWorkingCopyProvider:
54
88
  self.log.info(
55
89
  "Clone directory does not exist or is not a Git repository, "
56
90
  "preparing...",
57
- uri=uri, clone_path=str(clone_path)
91
+ uri=uri,
92
+ clone_path=str(clone_path),
58
93
  )
59
- return await self.prepare(uri)
94
+ return await self.prepare(uri, step)
60
95
 
61
96
  try:
62
97
  repo = git.Repo(clone_path)
@@ -64,10 +99,11 @@ class GitWorkingCopyProvider:
64
99
  except git.InvalidGitRepositoryError:
65
100
  self.log.warning(
66
101
  "Invalid Git repository found, re-cloning...",
67
- uri=uri, clone_path=str(clone_path)
102
+ uri=uri,
103
+ clone_path=str(clone_path),
68
104
  )
69
105
  # Remove the invalid directory and re-clone
70
106
  shutil.rmtree(clone_path)
71
- return await self.prepare(uri)
107
+ return await self.prepare(uri, step)
72
108
 
73
109
  return clone_path
@@ -1,5 +1,7 @@
1
1
  """Factory for creating embedding services with DDD architecture."""
2
2
 
3
+ from collections.abc import Callable
4
+
3
5
  import structlog
4
6
  from sqlalchemy.ext.asyncio import AsyncSession
5
7
 
@@ -24,7 +26,7 @@ from kodit.infrastructure.embedding.vectorchord_vector_search_repository import
24
26
  VectorChordVectorSearchRepository,
25
27
  )
26
28
  from kodit.infrastructure.sqlalchemy.embedding_repository import (
27
- SqlAlchemyEmbeddingRepository,
29
+ create_embedding_repository,
28
30
  )
29
31
  from kodit.infrastructure.sqlalchemy.entities import EmbeddingType
30
32
  from kodit.log import log_event
@@ -36,12 +38,15 @@ def _get_endpoint_configuration(app_context: AppContext) -> Endpoint | None:
36
38
 
37
39
 
38
40
  def embedding_domain_service_factory(
39
- task_name: TaskName, app_context: AppContext, session: AsyncSession
41
+ task_name: TaskName,
42
+ app_context: AppContext,
43
+ session: AsyncSession,
44
+ session_factory: Callable[[], AsyncSession],
40
45
  ) -> EmbeddingDomainService:
41
46
  """Create an embedding domain service."""
42
47
  structlog.get_logger(__name__)
43
48
  # Create embedding repository
44
- embedding_repository = SqlAlchemyEmbeddingRepository(session=session)
49
+ embedding_repository = create_embedding_repository(session_factory=session_factory)
45
50
 
46
51
  # Create embedding provider
47
52
  embedding_provider: EmbeddingProvider | None = None