kodit 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kodit/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.5.6'
32
- __version_tuple__ = version_tuple = (0, 5, 6)
31
+ __version__ = version = '0.5.7'
32
+ __version_tuple__ = version_tuple = (0, 5, 7)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -37,7 +37,13 @@ from kodit.domain.enrichments.request import (
37
37
  EnrichmentRequest as GenericEnrichmentRequest,
38
38
  )
39
39
  from kodit.domain.entities import Task
40
- from kodit.domain.entities.git import GitFile, GitRepo, SnippetV2, TrackingType
40
+ from kodit.domain.entities.git import (
41
+ GitCommit,
42
+ GitFile,
43
+ GitRepo,
44
+ SnippetV2,
45
+ TrackingType,
46
+ )
41
47
  from kodit.domain.factories.git_repo_factory import GitRepoFactory
42
48
  from kodit.domain.protocols import (
43
49
  EnrichmentAssociationRepository,
@@ -278,6 +284,57 @@ class CommitIndexingApplicationService:
278
284
  else:
279
285
  raise ValueError(f"Unknown task type: {task.type}")
280
286
 
287
+ async def _process_files_in_batches(
288
+ self, cloned_path: Path, all_commits: list[GitCommit], batch_size: int = 100
289
+ ) -> int:
290
+ """Process file metadata for all commits in batches to avoid memory exhaustion.
291
+
292
+ This loads file metadata (paths, sizes, blob SHAs) in batches and saves them
293
+ incrementally to avoid holding millions of file objects in memory.
294
+
295
+ Args:
296
+ cloned_path: Path to the cloned repository
297
+ all_commits: List of all commits from scan
298
+ batch_size: Number of commits to process at once (default 100)
299
+
300
+ Returns:
301
+ Total number of files processed
302
+
303
+ """
304
+ total_files = 0
305
+ commit_shas = [commit.commit_sha for commit in all_commits]
306
+ total_batches = (len(commit_shas) + batch_size - 1) // batch_size
307
+
308
+ self._log.info(
309
+ f"Processing files for {len(commit_shas)} commits "
310
+ f"in {total_batches} batches"
311
+ )
312
+
313
+ # Process commits in batches
314
+ for i in range(0, len(commit_shas), batch_size):
315
+ batch = commit_shas[i : i + batch_size]
316
+ batch_num = i // batch_size + 1
317
+
318
+ self._log.debug(
319
+ f"Processing batch {batch_num}/{total_batches} ({len(batch)} commits)"
320
+ )
321
+
322
+ # Get file metadata for this batch of commits
323
+ files = await self.scanner.process_files_for_commits_batch(
324
+ cloned_path, batch
325
+ )
326
+
327
+ # Save file metadata to database immediately
328
+ if files:
329
+ await self.git_file_repository.save_bulk(files)
330
+ total_files += len(files)
331
+ self._log.debug(
332
+ f"Batch {batch_num}: Saved {len(files)} files "
333
+ f"(total so far: {total_files})"
334
+ )
335
+
336
+ return total_files
337
+
281
338
  async def process_clone_repo(self, repository_id: int) -> None:
282
339
  """Clone a repository."""
283
340
  async with self.operation.create_child(
@@ -315,8 +372,11 @@ class CommitIndexingApplicationService:
315
372
  await step.set_current(2, "Saving commits")
316
373
  await self.git_commit_repository.save_bulk(scan_result.all_commits)
317
374
 
318
- await step.set_current(3, "Saving files")
319
- await self.git_file_repository.save_bulk(scan_result.all_files)
375
+ await step.set_current(3, "Processing and saving files in batches")
376
+ total_files = await self._process_files_in_batches(
377
+ repo.cloned_path, scan_result.all_commits
378
+ )
379
+ self._log.info(f"Processed and saved {total_files} total files")
320
380
 
321
381
  await step.set_current(4, "Saving branches")
322
382
  if scan_result.branches:
kodit/domain/protocols.py CHANGED
@@ -4,6 +4,8 @@ from abc import ABC, abstractmethod
4
4
  from pathlib import Path
5
5
  from typing import Any, Protocol, TypeVar
6
6
 
7
+ from git import Repo
8
+
7
9
  from kodit.domain.enrichments.enrichment import EnrichmentAssociation, EnrichmentV2
8
10
  from kodit.domain.entities import (
9
11
  Task,
@@ -163,9 +165,16 @@ class GitAdapter(ABC):
163
165
 
164
166
  @abstractmethod
165
167
  async def get_commit_files(
166
- self, local_path: Path, commit_sha: str
168
+ self, local_path: Path, commit_sha: str, repo: Repo
167
169
  ) -> list[dict[str, Any]]:
168
- """Get all files in a specific commit from the git tree."""
170
+ """Get all files in a specific commit from the git tree.
171
+
172
+ Args:
173
+ local_path: Path to the repository
174
+ commit_sha: SHA of the commit to get files for
175
+ repo: Repo object to reuse (avoids creating new Repo per commit)
176
+
177
+ """
169
178
 
170
179
  @abstractmethod
171
180
  async def get_commit_file_data(
@@ -1,6 +1,5 @@
1
1
  """Domain services for Git repository scanning and cloning operations."""
2
2
 
3
- import asyncio
4
3
  import shutil
5
4
  from dataclasses import dataclass
6
5
  from datetime import UTC, datetime
@@ -66,51 +65,11 @@ class GitRepositoryScanner:
66
65
  tags = await self._process_tags(cloned_path, commit_cache, repo_id)
67
66
  self._log.info(f"Found {len(tags)} tags")
68
67
 
69
- all_files = await self._process_files(cloned_path, commit_cache)
70
- self._log.info(f"Found {len(all_files)} files")
68
+ # Don't load all files into memory - return empty list
69
+ # Files will be processed in batches by the application service
70
+ self._log.info("Deferring file processing to avoid memory exhaustion")
71
71
 
72
- return self._create_scan_result(branches, commit_cache, tags, all_files)
73
-
74
- async def _process_commits_concurrently(
75
- self,
76
- cloned_path: Path,
77
- commits_batch: list[tuple[str, dict[str, Any]]],
78
- ) -> dict[str, GitCommit]:
79
- """Process a batch of commits concurrently."""
80
- batch_cache = {}
81
-
82
- async def process_single_commit(
83
- commit_sha: str, commit_data: dict[str, Any]
84
- ) -> tuple[str, GitCommit | None]:
85
- git_commit = await self._create_git_commit_from_data(
86
- cloned_path, commit_data
87
- )
88
- return commit_sha, git_commit
89
-
90
- # Process commits concurrently in smaller batches
91
- semaphore = asyncio.Semaphore(50) # Limit concurrent operations
92
-
93
- async def bounded_process(
94
- item: tuple[str, dict[str, Any]],
95
- ) -> tuple[str, GitCommit | None]:
96
- async with semaphore:
97
- return await process_single_commit(item[0], item[1])
98
-
99
- # Process all commits concurrently
100
- results = await asyncio.gather(
101
- *[bounded_process(item) for item in commits_batch],
102
- return_exceptions=True,
103
- )
104
-
105
- # Collect successful results
106
- for result in results:
107
- if isinstance(result, tuple):
108
- # Type narrowing: result is now tuple[str, GitCommit | None]
109
- commit_sha, git_commit = result
110
- if git_commit is not None:
111
- batch_cache[commit_sha] = git_commit
112
-
113
- return batch_cache
72
+ return self._create_scan_result(branches, commit_cache, tags, [], cloned_path)
114
73
 
115
74
  async def _process_branches_bulk(
116
75
  self,
@@ -167,30 +126,6 @@ class GitRepositoryScanner:
167
126
 
168
127
  return branches, commit_cache
169
128
 
170
- async def _create_git_commit_from_data(
171
- self, cloned_path: Path, commit_data: dict[str, Any], repo_id: int | None = None
172
- ) -> GitCommit | None:
173
- """Create GitCommit from pre-fetched commit data."""
174
- commit_sha = commit_data["sha"]
175
-
176
- # Get files for this commit
177
- files_data = await self.git_adapter.get_commit_files(cloned_path, commit_sha)
178
- self._create_git_files(cloned_path, files_data, commit_sha)
179
- author = self._format_author_from_data(commit_data)
180
-
181
- # Cache datetime creation
182
- created_at = datetime.now(UTC)
183
-
184
- return GitCommit(
185
- created_at=created_at,
186
- commit_sha=commit_sha,
187
- repo_id=repo_id or 0, # Use 0 as default if not provided
188
- date=commit_data["date"],
189
- message=commit_data["message"],
190
- parent_commit_sha=commit_data["parent_sha"],
191
- author=author,
192
- )
193
-
194
129
  def _format_author_from_data(self, commit_data: dict[str, Any]) -> str:
195
130
  """Format author string from commit data."""
196
131
  author_name = commit_data.get("author_name", "")
@@ -376,17 +311,18 @@ class GitRepositoryScanner:
376
311
  branches: list[GitBranch],
377
312
  commit_cache: dict[str, GitCommit],
378
313
  tags: list[GitTag],
379
- all_files: list[GitFile],
314
+ all_files: list[GitFile], # noqa: ARG002
315
+ cloned_path: Path | None = None, # noqa: ARG002
380
316
  ) -> RepositoryScanResult:
381
317
  """Create final scan result."""
382
- # Files are loaded on-demand for performance, so total_files is 0 during scan
318
+ # Files list is empty to avoid memory issues - will be processed in batches
383
319
  scan_result = RepositoryScanResult(
384
320
  branches=branches,
385
321
  all_commits=list(commit_cache.values()),
386
322
  scan_timestamp=datetime.now(UTC),
387
- total_files_across_commits=len(all_files),
323
+ total_files_across_commits=0, # Will be updated after batch processing
388
324
  all_tags=tags,
389
- all_files=all_files,
325
+ all_files=[], # Empty - processed in batches to avoid memory exhaustion
390
326
  )
391
327
 
392
328
  self._log.info(
@@ -395,16 +331,35 @@ class GitRepositoryScanner:
395
331
  )
396
332
  return scan_result
397
333
 
398
- async def _process_files(
399
- self, cloned_path: Path, commit_cache: dict[str, GitCommit]
334
+ async def process_files_for_commits_batch(
335
+ self, cloned_path: Path, commit_shas: list[str]
400
336
  ) -> list[GitFile]:
401
- """Process files for a commit."""
337
+ """Process files for a batch of commits.
338
+
339
+ This allows the application service to process files in batches
340
+ to avoid loading millions of files into memory at once.
341
+
342
+ CRITICAL: Reuses a single Repo object to avoid creating 32K+ Repo instances
343
+ which would consume massive memory (1-2 MB each).
344
+ """
345
+ from git import Repo
346
+
347
+ # Open repo once and reuse for all commits in this batch
348
+ repo = Repo(cloned_path)
402
349
  files = []
403
- for commit_sha in commit_cache:
404
- files_data = await self.git_adapter.get_commit_files(
405
- cloned_path, commit_sha
406
- )
407
- files.extend(self._create_git_files(cloned_path, files_data, commit_sha))
350
+
351
+ try:
352
+ for commit_sha in commit_shas:
353
+ files_data = await self.git_adapter.get_commit_files(
354
+ cloned_path, commit_sha, repo=repo
355
+ )
356
+ files.extend(
357
+ self._create_git_files(cloned_path, files_data, commit_sha)
358
+ )
359
+ finally:
360
+ # Explicitly close the repo to free resources
361
+ repo.close()
362
+
408
363
  return files
409
364
 
410
365
 
@@ -346,14 +346,22 @@ class GitPythonAdapter(GitAdapter):
346
346
  )
347
347
 
348
348
  async def get_commit_files(
349
- self, local_path: Path, commit_sha: str
349
+ self, local_path: Path, commit_sha: str, repo: Repo
350
350
  ) -> list[dict[str, Any]]:
351
- """Get all files in a specific commit from the git tree."""
351
+ """Get all files in a specific commit from the git tree.
352
+
353
+ Args:
354
+ local_path: Path to the repository
355
+ commit_sha: SHA of the commit to get files for
356
+ repo: Repo object to reuse (avoids creating new Repo per commit)
357
+
358
+ """
352
359
 
353
360
  def _get_files() -> list[dict[str, Any]]:
354
361
  try:
355
- repo = Repo(local_path)
356
- commit = repo.commit(commit_sha)
362
+ # Use the provided repo object
363
+ _repo = repo
364
+ commit = _repo.commit(commit_sha)
357
365
 
358
366
  files = []
359
367
 
@@ -395,7 +403,11 @@ class GitPythonAdapter(GitAdapter):
395
403
  """Get file metadata for a commit, with files checked out to disk."""
396
404
  await self._checkout_commit(local_path, commit_sha)
397
405
  try:
398
- return await self.get_commit_files(local_path, commit_sha)
406
+ repo = Repo(local_path)
407
+ try:
408
+ return await self.get_commit_files(local_path, commit_sha, repo)
409
+ finally:
410
+ repo.close()
399
411
  finally:
400
412
  await self.restore_to_branch(local_path, "main")
401
413
 
@@ -570,6 +582,4 @@ class GitPythonAdapter(GitAdapter):
570
582
  else:
571
583
  return diff_text
572
584
 
573
- return await asyncio.get_event_loop().run_in_executor(
574
- self.executor, _get_diff
575
- )
585
+ return await asyncio.get_event_loop().run_in_executor(self.executor, _get_diff)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kodit
3
- Version: 0.5.6
3
+ Version: 0.5.7
4
4
  Summary: Code indexing for better AI code generation
5
5
  Project-URL: Homepage, https://docs.helixml.tech/kodit/
6
6
  Project-URL: Documentation, https://docs.helixml.tech/kodit/
@@ -1,6 +1,6 @@
1
1
  kodit/.gitignore,sha256=ztkjgRwL9Uud1OEi36hGQeDGk3OLK1NfDEO8YqGYy8o,11
2
2
  kodit/__init__.py,sha256=aEKHYninUq1yh6jaNfvJBYg-6fenpN132nJt1UU6Jxs,59
3
- kodit/_version.py,sha256=cK44j3gESPDerYNsAlInZd36FaFcP1dYcSF1jeRDXF8,704
3
+ kodit/_version.py,sha256=NvV7p6eu_Rli4DWHJnEcpyTUiImNPPDyoDonzzIsNwA,704
4
4
  kodit/app.py,sha256=7WxSQcktnpYBmjO1skIjMeBu55rVVRf4lotBEq55pAM,5846
5
5
  kodit/cli.py,sha256=QSTXIUDxZo3anIONY-grZi9_VSehWoS8QoVJZyOmWPQ,3086
6
6
  kodit/cli_utils.py,sha256=umkvt4kWNapk6db6RGz6bmn7oxgDpsW2Vo09MZ37OGg,2430
@@ -16,7 +16,7 @@ kodit/application/factories/reporting_factory.py,sha256=3IpRiAw_olM69db-jbDAtjyG
16
16
  kodit/application/factories/server_factory.py,sha256=dr0X_zQRUlEybtGZ3NS-kkwTU-K96u2D1Qw5xhWkd88,17409
17
17
  kodit/application/services/__init__.py,sha256=p5UQNw-H5sxQvs5Etfte93B3cJ1kKW6DNxK34uFvU1E,38
18
18
  kodit/application/services/code_search_application_service.py,sha256=ceyv5TTN-jvlOFOupGa9XwfTTraLNN2GU55kFeulVXY,7763
19
- kodit/application/services/commit_indexing_application_service.py,sha256=CIJdBdAIdere6fEJ1KytQgDu0jNXoDiE748aFcEiOmQ,42631
19
+ kodit/application/services/commit_indexing_application_service.py,sha256=uRYPkVbiqu1V9bORjQu2yoylskLgCz55vYJ1pODjm94,44690
20
20
  kodit/application/services/enrichment_query_service.py,sha256=RMVze-DzS5zAki1iC96Kid7tbg-nHSv0z8eqPsiURqc,15002
21
21
  kodit/application/services/indexing_worker_service.py,sha256=59cZthlzViOVrAWEoZqUTCfLzxx2OO_FOGdM3pYf9Mc,4065
22
22
  kodit/application/services/queue_service.py,sha256=pIHTS8M65FzAhZH5kn54BTiZ43sCbsALYdCFTz9wdqE,2692
@@ -24,7 +24,7 @@ kodit/application/services/reporting.py,sha256=cwe-S-UpSOE6xSAEhoD1hi4hSWk1bW3YR
24
24
  kodit/application/services/sync_scheduler.py,sha256=hVT3dlmvfbqXKOV_KU5ZQ5gEKBGPJTlvJcF9gP2ZHQM,2853
25
25
  kodit/domain/__init__.py,sha256=TCpg4Xx-oF4mKV91lo4iXqMEfBT1OoRSYnbG-zVWolA,66
26
26
  kodit/domain/errors.py,sha256=yIsgCjM_yOFIg8l7l-t7jM8pgeAX4cfPq0owf7iz3DA,106
27
- kodit/domain/protocols.py,sha256=KxTHnYbECvMoXdbvXNHVF-l-OEr2kzQsoxlVXbatgwg,7814
27
+ kodit/domain/protocols.py,sha256=Q6blYD79Tn5LQyNEAioTuPPIdZYXDf46kVpAW2EG2jY,8056
28
28
  kodit/domain/value_objects.py,sha256=FW0sTMtcl0Q1qej7vzEg7-Gsv86Z01IbPrDdudsgU3g,18097
29
29
  kodit/domain/enrichments/__init__.py,sha256=UpQMnMEHqaK3u3K-eJZOC28kfBPHALLAjFMdyYBXSPE,33
30
30
  kodit/domain/enrichments/enricher.py,sha256=jnZ5X9RmZA8Acy-RBS2TbEoBg9QSm8AgleqwS9h5WlY,512
@@ -57,7 +57,7 @@ kodit/domain/factories/git_repo_factory.py,sha256=EdeQo4HsBi2hVeVvnSnYtFdR3yGVZQ
57
57
  kodit/domain/services/__init__.py,sha256=Q1GhCK_PqKHYwYE4tkwDz5BIyXkJngLBBOHhzvX8nzo,42
58
58
  kodit/domain/services/bm25_service.py,sha256=-E5k0td2Ucs25qygWkJlY0fl7ZckOUe5xZnKYff3hF8,3631
59
59
  kodit/domain/services/embedding_service.py,sha256=CEcQ2E9XvOcjKNCJEw5soYUNMHJ5LCJGyXzPCl75CPc,4812
60
- kodit/domain/services/git_repository_service.py,sha256=suIBmiBG9OcXUFrw1uiYRidS9yvFEekZU8H-tsY0zs0,16545
60
+ kodit/domain/services/git_repository_service.py,sha256=KtwYF3XKBeNbAHbi-sEdMJ-1jGRy7rmWMZkPpCrh9fw,14980
61
61
  kodit/domain/services/git_service.py,sha256=Lr7kPnnBEa_fWfGA9jpffMK7wcfxQ0wfXgynsbSKSzg,11661
62
62
  kodit/domain/services/physical_architecture_service.py,sha256=0YgoAvbUxT_VwgIh_prftSYnil_XIqNPSoP0g37eIt4,7209
63
63
  kodit/domain/services/task_status_query_service.py,sha256=rI93pTMHeycigQryCWkimXSDzRqx_nJOr07UzPAacPE,736
@@ -95,7 +95,7 @@ kodit/infrastructure/bm25/__init__.py,sha256=DmGbrEO34FOJy4e685BbyxLA7gPW1eqs2gA
95
95
  kodit/infrastructure/bm25/local_bm25_repository.py,sha256=YE3pUkPS5n1JNu6oSM_HRBOXM8U04HiY8dMMZCf9CMQ,5197
96
96
  kodit/infrastructure/bm25/vectorchord_bm25_repository.py,sha256=LjbUPj4nPMb9pdEudThUbZTmQjhxvpN314EzKGpXfi0,8621
97
97
  kodit/infrastructure/cloning/git/__init__.py,sha256=20ePcp0qE6BuLsjsv4KYB1DzKhMIMsPXwEqIEZtjTJs,34
98
- kodit/infrastructure/cloning/git/git_python_adaptor.py,sha256=GCBdswC5txbt8_TWbMEbhvRvlmOvPCLwb6qBlx7b3XU,21594
98
+ kodit/infrastructure/cloning/git/git_python_adaptor.py,sha256=kiiXrjSqdSYT_c_migWff1WEVlJT8JRlgo5m_9T4rrM,21942
99
99
  kodit/infrastructure/cloning/git/working_copy.py,sha256=sPKQN-A1gDVV_QJISNNP4PqxRWxyj5owv5tvWfXMl44,3909
100
100
  kodit/infrastructure/database_schema/__init__.py,sha256=jgejYX70fjV69zCuOBiNw3oCQlCKYzxTkjnUUUU7DY0,48
101
101
  kodit/infrastructure/database_schema/database_schema_detector.py,sha256=zXU7HqrZU4_EYckloKDbH0gZvZ3_TJG5-Bd5PAkEkXc,10167
@@ -177,8 +177,8 @@ kodit/utils/dump_config.py,sha256=dd5uPgqh6ATk02Zt59t2JFKR9X17YWjHudV0nE8VktE,11
177
177
  kodit/utils/dump_openapi.py,sha256=EasYOnnpeabwb_sTKQUBrrOLHjPcOFQ7Zx0YKpx9fmM,1239
178
178
  kodit/utils/generate_api_paths.py,sha256=TMtx9v55podDfUmiWaHgJHLtEWLV2sLL-5ejGFMPzAo,3569
179
179
  kodit/utils/path_utils.py,sha256=UB_81rx7Y1G1jalVv2PX8miwaprBbcqEdtoQ3hPT3kU,2451
180
- kodit-0.5.6.dist-info/METADATA,sha256=7G7BfnWWbmDkwSSiKribmesiUY5NlzjcxwqsZiTJjUw,7703
181
- kodit-0.5.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
182
- kodit-0.5.6.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
183
- kodit-0.5.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
184
- kodit-0.5.6.dist-info/RECORD,,
180
+ kodit-0.5.7.dist-info/METADATA,sha256=Yi8IGWrrk1FLgnC5GiqmBc8V3bJcWz8Fl29-nM8CkcE,7703
181
+ kodit-0.5.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
182
+ kodit-0.5.7.dist-info/entry_points.txt,sha256=hoTn-1aKyTItjnY91fnO-rV5uaWQLQ-Vi7V5et2IbHY,40
183
+ kodit-0.5.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
184
+ kodit-0.5.7.dist-info/RECORD,,
File without changes