kodit 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (135) hide show
  1. kodit/_version.py +2 -2
  2. kodit/app.py +51 -23
  3. kodit/application/factories/reporting_factory.py +6 -2
  4. kodit/application/factories/server_factory.py +353 -0
  5. kodit/application/services/code_search_application_service.py +144 -0
  6. kodit/application/services/commit_indexing_application_service.py +700 -0
  7. kodit/application/services/indexing_worker_service.py +13 -44
  8. kodit/application/services/queue_service.py +24 -3
  9. kodit/application/services/reporting.py +0 -2
  10. kodit/application/services/sync_scheduler.py +15 -31
  11. kodit/cli.py +2 -753
  12. kodit/cli_utils.py +2 -9
  13. kodit/config.py +4 -97
  14. kodit/database.py +38 -1
  15. kodit/domain/enrichments/__init__.py +1 -0
  16. kodit/domain/enrichments/architecture/__init__.py +1 -0
  17. kodit/domain/enrichments/architecture/architecture.py +20 -0
  18. kodit/domain/enrichments/architecture/physical/__init__.py +1 -0
  19. kodit/domain/enrichments/architecture/physical/discovery_notes.py +14 -0
  20. kodit/domain/enrichments/architecture/physical/formatter.py +11 -0
  21. kodit/domain/enrichments/architecture/physical/physical.py +17 -0
  22. kodit/domain/enrichments/development/__init__.py +1 -0
  23. kodit/domain/enrichments/development/development.py +18 -0
  24. kodit/domain/enrichments/development/snippet/__init__.py +1 -0
  25. kodit/domain/enrichments/development/snippet/snippet.py +21 -0
  26. kodit/domain/enrichments/enricher.py +17 -0
  27. kodit/domain/enrichments/enrichment.py +39 -0
  28. kodit/domain/enrichments/request.py +12 -0
  29. kodit/domain/enrichments/response.py +11 -0
  30. kodit/domain/enrichments/usage/__init__.py +1 -0
  31. kodit/domain/enrichments/usage/api_docs.py +19 -0
  32. kodit/domain/enrichments/usage/usage.py +18 -0
  33. kodit/domain/{entities.py → entities/__init__.py} +50 -195
  34. kodit/domain/entities/git.py +190 -0
  35. kodit/domain/factories/__init__.py +1 -0
  36. kodit/domain/factories/git_repo_factory.py +76 -0
  37. kodit/domain/protocols.py +264 -64
  38. kodit/domain/services/bm25_service.py +5 -1
  39. kodit/domain/services/embedding_service.py +3 -0
  40. kodit/domain/services/enrichment_service.py +9 -30
  41. kodit/domain/services/git_repository_service.py +429 -0
  42. kodit/domain/services/git_service.py +300 -0
  43. kodit/domain/services/physical_architecture_service.py +182 -0
  44. kodit/domain/services/task_status_query_service.py +2 -2
  45. kodit/domain/value_objects.py +87 -135
  46. kodit/infrastructure/api/client/__init__.py +0 -2
  47. kodit/infrastructure/api/v1/__init__.py +0 -4
  48. kodit/infrastructure/api/v1/dependencies.py +92 -46
  49. kodit/infrastructure/api/v1/routers/__init__.py +0 -6
  50. kodit/infrastructure/api/v1/routers/commits.py +352 -0
  51. kodit/infrastructure/api/v1/routers/queue.py +2 -2
  52. kodit/infrastructure/api/v1/routers/repositories.py +282 -0
  53. kodit/infrastructure/api/v1/routers/search.py +31 -14
  54. kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
  55. kodit/infrastructure/api/v1/schemas/commit.py +96 -0
  56. kodit/infrastructure/api/v1/schemas/context.py +2 -0
  57. kodit/infrastructure/api/v1/schemas/enrichment.py +29 -0
  58. kodit/infrastructure/api/v1/schemas/repository.py +128 -0
  59. kodit/infrastructure/api/v1/schemas/search.py +12 -9
  60. kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
  61. kodit/infrastructure/api/v1/schemas/tag.py +31 -0
  62. kodit/infrastructure/api/v1/schemas/task_status.py +2 -0
  63. kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
  64. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
  65. kodit/infrastructure/cloning/git/git_python_adaptor.py +534 -0
  66. kodit/infrastructure/cloning/git/working_copy.py +1 -1
  67. kodit/infrastructure/embedding/embedding_factory.py +3 -2
  68. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  69. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
  70. kodit/infrastructure/enricher/__init__.py +1 -0
  71. kodit/infrastructure/enricher/enricher_factory.py +53 -0
  72. kodit/infrastructure/{enrichment/litellm_enrichment_provider.py → enricher/litellm_enricher.py} +36 -56
  73. kodit/infrastructure/{enrichment/local_enrichment_provider.py → enricher/local_enricher.py} +19 -24
  74. kodit/infrastructure/enricher/null_enricher.py +36 -0
  75. kodit/infrastructure/indexing/fusion_service.py +1 -1
  76. kodit/infrastructure/mappers/enrichment_mapper.py +83 -0
  77. kodit/infrastructure/mappers/git_mapper.py +193 -0
  78. kodit/infrastructure/mappers/snippet_mapper.py +104 -0
  79. kodit/infrastructure/mappers/task_mapper.py +5 -44
  80. kodit/infrastructure/physical_architecture/__init__.py +1 -0
  81. kodit/infrastructure/physical_architecture/detectors/__init__.py +1 -0
  82. kodit/infrastructure/physical_architecture/detectors/docker_compose_detector.py +336 -0
  83. kodit/infrastructure/physical_architecture/formatters/__init__.py +1 -0
  84. kodit/infrastructure/physical_architecture/formatters/narrative_formatter.py +149 -0
  85. kodit/infrastructure/reporting/log_progress.py +8 -5
  86. kodit/infrastructure/reporting/telemetry_progress.py +21 -0
  87. kodit/infrastructure/slicing/api_doc_extractor.py +836 -0
  88. kodit/infrastructure/slicing/ast_analyzer.py +1128 -0
  89. kodit/infrastructure/slicing/slicer.py +87 -421
  90. kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
  91. kodit/infrastructure/sqlalchemy/enrichment_v2_repository.py +118 -0
  92. kodit/infrastructure/sqlalchemy/entities.py +402 -158
  93. kodit/infrastructure/sqlalchemy/git_branch_repository.py +274 -0
  94. kodit/infrastructure/sqlalchemy/git_commit_repository.py +346 -0
  95. kodit/infrastructure/sqlalchemy/git_repository.py +262 -0
  96. kodit/infrastructure/sqlalchemy/git_tag_repository.py +268 -0
  97. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +479 -0
  98. kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
  99. kodit/infrastructure/sqlalchemy/task_status_repository.py +24 -12
  100. kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
  101. kodit/mcp.py +12 -30
  102. kodit/migrations/env.py +1 -0
  103. kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
  104. kodit/migrations/versions/19f8c7faf8b9_add_generic_enrichment_type.py +260 -0
  105. kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
  106. kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
  107. kodit/py.typed +0 -0
  108. kodit/utils/dump_config.py +361 -0
  109. kodit/utils/dump_openapi.py +6 -4
  110. kodit/utils/path_utils.py +29 -0
  111. {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/METADATA +3 -3
  112. kodit-0.5.1.dist-info/RECORD +168 -0
  113. kodit/application/factories/code_indexing_factory.py +0 -195
  114. kodit/application/services/auto_indexing_service.py +0 -99
  115. kodit/application/services/code_indexing_application_service.py +0 -410
  116. kodit/domain/services/index_query_service.py +0 -70
  117. kodit/domain/services/index_service.py +0 -269
  118. kodit/infrastructure/api/client/index_client.py +0 -57
  119. kodit/infrastructure/api/v1/routers/indexes.py +0 -164
  120. kodit/infrastructure/api/v1/schemas/index.py +0 -101
  121. kodit/infrastructure/bm25/bm25_factory.py +0 -28
  122. kodit/infrastructure/cloning/__init__.py +0 -1
  123. kodit/infrastructure/cloning/metadata.py +0 -98
  124. kodit/infrastructure/enrichment/__init__.py +0 -1
  125. kodit/infrastructure/enrichment/enrichment_factory.py +0 -52
  126. kodit/infrastructure/enrichment/null_enrichment_provider.py +0 -19
  127. kodit/infrastructure/mappers/index_mapper.py +0 -345
  128. kodit/infrastructure/reporting/tdqm_progress.py +0 -38
  129. kodit/infrastructure/slicing/language_detection_service.py +0 -18
  130. kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
  131. kodit-0.4.3.dist-info/RECORD +0 -125
  132. /kodit/infrastructure/{enrichment → enricher}/utils.py +0 -0
  133. {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/WHEEL +0 -0
  134. {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/entry_points.txt +0 -0
  135. {kodit-0.4.3.dist-info → kodit-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,274 @@
1
+ """SQLAlchemy implementation of GitBranchRepository."""
2
+
3
+ from collections.abc import Callable
4
+
5
+ from sqlalchemy import delete, func, insert, select
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from kodit.domain.entities.git import GitBranch, GitCommit
9
+ from kodit.domain.protocols import GitBranchRepository
10
+ from kodit.infrastructure.sqlalchemy import entities as db_entities
11
+ from kodit.infrastructure.sqlalchemy.unit_of_work import SqlAlchemyUnitOfWork
12
+
13
+
14
def create_git_branch_repository(
    session_factory: Callable[[], AsyncSession],
) -> GitBranchRepository:
    """Build the SQLAlchemy-backed git branch repository.

    Args:
        session_factory: Zero-argument callable returning an ``AsyncSession``.

    Returns:
        A ``SqlAlchemyGitBranchRepository`` bound to ``session_factory``.

    """
    repository: GitBranchRepository = SqlAlchemyGitBranchRepository(
        session_factory=session_factory
    )
    return repository
19
+
20
+
21
class SqlAlchemyGitBranchRepository(GitBranchRepository):
    """SQLAlchemy implementation of GitBranchRepository.

    Every public method opens its own ``SqlAlchemyUnitOfWork`` from the
    injected session factory, so each call is an independent unit of work.
    """

    # Upper bound on values per ``IN (...)`` clause / multi-row ``INSERT`` so
    # statements stay below database bind-parameter limits.
    _CHUNK_SIZE = 1000

    def __init__(self, session_factory: Callable[[], AsyncSession]) -> None:
        """Initialize the repository.

        Args:
            session_factory: Zero-argument callable returning an
                ``AsyncSession``.

        """
        self.session_factory = session_factory

    @staticmethod
    def _to_domain_commit(
        db_commit: db_entities.GitCommit,
        db_files: list[db_entities.GitCommitFile],
    ) -> GitCommit:
        """Map a commit row plus its file rows to a domain ``GitCommit``."""
        # Imported locally: this module's top-level import only brings in
        # GitBranch and GitCommit.
        from kodit.domain.entities.git import GitFile

        return GitCommit(
            commit_sha=db_commit.commit_sha,
            date=db_commit.date,
            message=db_commit.message,
            parent_commit_sha=db_commit.parent_commit_sha,
            files=[
                GitFile(
                    blob_sha=db_file.blob_sha,
                    path=db_file.path,
                    mime_type=db_file.mime_type,
                    size=db_file.size,
                    extension=db_file.extension,
                    created_at=db_file.created_at,
                )
                for db_file in db_files
            ],
            author=db_commit.author,
            created_at=db_commit.created_at,
            updated_at=db_commit.updated_at,
        )

    @staticmethod
    def _to_domain_branch(
        db_branch: db_entities.GitBranch, head_commit: GitCommit
    ) -> GitBranch:
        """Map a branch row and its resolved head commit to a ``GitBranch``."""
        return GitBranch(
            repo_id=db_branch.repo_id,
            name=db_branch.name,
            head_commit=head_commit,
            created_at=db_branch.created_at,
            updated_at=db_branch.updated_at,
        )

    async def get_by_name(self, branch_name: str, repo_id: int) -> GitBranch:
        """Get a branch by name and repository ID.

        Args:
            branch_name: Name of the branch to load.
            repo_id: ID of the repository the branch belongs to.

        Returns:
            The branch with its head commit and that commit's files.

        Raises:
            ValueError: If the branch, or the commit its head points at, is
                not stored.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            branch_stmt = select(db_entities.GitBranch).where(
                db_entities.GitBranch.name == branch_name,
                db_entities.GitBranch.repo_id == repo_id,
            )
            db_branch = await session.scalar(branch_stmt)
            if not db_branch:
                raise ValueError(f"Branch {branch_name} not found in repo {repo_id}")

            head_sha = db_branch.head_commit_sha
            commit_stmt = select(db_entities.GitCommit).where(
                db_entities.GitCommit.commit_sha == head_sha
            )
            db_commit = await session.scalar(commit_stmt)
            if not db_commit:
                raise ValueError(f"Head commit {db_branch.head_commit_sha} not found")

            files_stmt = select(db_entities.GitCommitFile).where(
                db_entities.GitCommitFile.commit_sha == head_sha
            )
            db_files = list((await session.scalars(files_stmt)).all())

            return self._to_domain_branch(
                db_branch, self._to_domain_commit(db_commit, db_files)
            )

    async def get_by_repo_id(self, repo_id: int) -> list[GitBranch]:
        """Get all branches for a repository.

        Branches whose head commit row cannot be found are silently omitted
        from the result.

        Args:
            repo_id: ID of the repository.

        Returns:
            Domain branches, each with its head commit and files; an empty
            list when the repository has no branches.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            branches_stmt = select(db_entities.GitBranch).where(
                db_entities.GitBranch.repo_id == repo_id
            )
            db_branches = (await session.scalars(branches_stmt)).all()
            if not db_branches:
                return []

            # De-duplicate head SHAs (order preserved): several branches may
            # share a head commit, and a SHA landing in two different chunks
            # would otherwise have its files collected twice.
            head_shas = list(
                dict.fromkeys(branch.head_commit_sha for branch in db_branches)
            )

            commits_by_sha: dict[str, db_entities.GitCommit] = {}
            files_by_sha: dict[str, list[db_entities.GitCommitFile]] = {}
            for start in range(0, len(head_shas), self._CHUNK_SIZE):
                sha_chunk = head_shas[start : start + self._CHUNK_SIZE]

                commits_stmt = select(db_entities.GitCommit).where(
                    db_entities.GitCommit.commit_sha.in_(sha_chunk)
                )
                for db_commit in (await session.scalars(commits_stmt)).all():
                    commits_by_sha[db_commit.commit_sha] = db_commit

                files_stmt = select(db_entities.GitCommitFile).where(
                    db_entities.GitCommitFile.commit_sha.in_(sha_chunk)
                )
                for db_file in (await session.scalars(files_stmt)).all():
                    files_by_sha.setdefault(db_file.commit_sha, []).append(db_file)

            domain_branches: list[GitBranch] = []
            for db_branch in db_branches:
                head_row = commits_by_sha.get(db_branch.head_commit_sha)
                if not head_row:
                    # Head commit is not stored; skip this branch.
                    continue
                head_commit = self._to_domain_commit(
                    head_row, files_by_sha.get(db_branch.head_commit_sha, [])
                )
                domain_branches.append(self._to_domain_branch(db_branch, head_commit))

            return domain_branches

    async def save(self, branch: GitBranch, repo_id: int) -> GitBranch:
        """Save a branch to a repository.

        Updates the stored head commit SHA (and ``updated_at`` when the
        domain object carries one) if the branch exists; otherwise adds a new
        row.

        Args:
            branch: Branch to persist. Its ``repo_id`` attribute is
                overwritten with ``repo_id``.
            repo_id: ID of the owning repository.

        Returns:
            The same ``branch`` object that was passed in.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            # The caller's domain object is stamped with the owning repo.
            branch.repo_id = repo_id

            # Identity lookup on the (repo_id, name) key.
            existing_branch = await session.get(
                db_entities.GitBranch, (repo_id, branch.name)
            )

            if existing_branch:
                existing_branch.head_commit_sha = branch.head_commit.commit_sha
                if branch.updated_at:
                    existing_branch.updated_at = branch.updated_at
            else:
                session.add(
                    db_entities.GitBranch(
                        repo_id=repo_id,
                        name=branch.name,
                        head_commit_sha=branch.head_commit.commit_sha,
                    )
                )

            return branch

    async def save_bulk(self, branches: list[GitBranch], repo_id: int) -> None:
        """Bulk save branches to a repository.

        Existing branches get their head commit SHA updated when it differs;
        branches not yet stored are inserted in chunks.

        Args:
            branches: Branches to persist; an empty list is a no-op.
            repo_id: ID of the owning repository.

        """
        if not branches:
            return

        # Index incoming branches by name so matching a stored row is a dict
        # lookup instead of a scan over every incoming branch.
        incoming_by_name: dict[str, list[GitBranch]] = {}
        for branch in branches:
            incoming_by_name.setdefault(branch.name, []).append(branch)
        names = list(incoming_by_name)

        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            # Load stored branches in chunks, like the other bulk queries.
            existing_branches: list[db_entities.GitBranch] = []
            for start in range(0, len(names), self._CHUNK_SIZE):
                existing_stmt = select(db_entities.GitBranch).where(
                    db_entities.GitBranch.repo_id == repo_id,
                    db_entities.GitBranch.name.in_(
                        names[start : start + self._CHUNK_SIZE]
                    ),
                )
                existing_branches.extend((await session.scalars(existing_stmt)).all())
            existing_branch_names = {row.name for row in existing_branches}

            # Apply the first incoming head SHA that differs from the stored
            # one for each existing branch.
            for existing_branch in existing_branches:
                for candidate in incoming_by_name.get(existing_branch.name, ()):
                    new_sha = candidate.head_commit.commit_sha
                    if existing_branch.head_commit_sha != new_sha:
                        existing_branch.head_commit_sha = new_sha
                        break

            new_branches_data = [
                {
                    "repo_id": repo_id,
                    "name": branch.name,
                    "head_commit_sha": branch.head_commit.commit_sha,
                }
                for branch in branches
                if branch.name not in existing_branch_names
            ]

            # Insert in chunks to avoid parameter limits.
            for start in range(0, len(new_branches_data), self._CHUNK_SIZE):
                rows = new_branches_data[start : start + self._CHUNK_SIZE]
                await session.execute(insert(db_entities.GitBranch).values(rows))

    async def exists(self, branch_name: str, repo_id: int) -> bool:
        """Check if a branch exists.

        Args:
            branch_name: Name of the branch.
            repo_id: ID of the repository.

        Returns:
            True when a branch with this name is stored for the repository.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = select(db_entities.GitBranch.name).where(
                db_entities.GitBranch.name == branch_name,
                db_entities.GitBranch.repo_id == repo_id,
            )
            return (await session.scalar(stmt)) is not None

    async def delete_by_repo_id(self, repo_id: int) -> None:
        """Delete all branches for a repository.

        Args:
            repo_id: ID of the repository whose branches are removed.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            del_branches_stmt = delete(db_entities.GitBranch).where(
                db_entities.GitBranch.repo_id == repo_id
            )
            await session.execute(del_branches_stmt)

    async def count_by_repo_id(self, repo_id: int) -> int:
        """Count the number of branches for a repository.

        Args:
            repo_id: ID of the repository.

        Returns:
            Number of stored branches; 0 when the query yields no value.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = (
                select(func.count())
                .select_from(db_entities.GitBranch)
                .where(db_entities.GitBranch.repo_id == repo_id)
            )
            result = await session.scalar(stmt)
            return result or 0
@@ -0,0 +1,346 @@
1
+ """SQLAlchemy implementation of GitCommitRepository."""
2
+
3
+ from collections.abc import Callable
4
+
5
+ from sqlalchemy import delete, func, insert, select
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from kodit.domain.entities.git import GitCommit, GitFile
9
+ from kodit.domain.protocols import GitCommitRepository
10
+ from kodit.infrastructure.sqlalchemy import entities as db_entities
11
+ from kodit.infrastructure.sqlalchemy.unit_of_work import SqlAlchemyUnitOfWork
12
+
13
+
14
def create_git_commit_repository(
    session_factory: Callable[[], AsyncSession],
) -> GitCommitRepository:
    """Build the SQLAlchemy-backed git commit repository.

    Args:
        session_factory: Zero-argument callable returning an ``AsyncSession``.

    Returns:
        A ``SqlAlchemyGitCommitRepository`` bound to ``session_factory``.

    """
    repository: GitCommitRepository = SqlAlchemyGitCommitRepository(
        session_factory=session_factory
    )
    return repository
19
+
20
+
21
class SqlAlchemyGitCommitRepository(GitCommitRepository):
    """SQLAlchemy implementation of GitCommitRepository.

    Every public method opens its own ``SqlAlchemyUnitOfWork`` from the
    injected session factory, so each call is an independent unit of work.
    """

    # Upper bound on values per ``IN (...)`` clause / multi-row ``INSERT`` so
    # statements stay below database bind-parameter limits.
    _CHUNK_SIZE = 1000

    def __init__(self, session_factory: Callable[[], AsyncSession]) -> None:
        """Initialize the repository.

        Args:
            session_factory: Zero-argument callable returning an
                ``AsyncSession``.

        """
        self.session_factory = session_factory

    @staticmethod
    def _to_domain_file(db_file: db_entities.GitCommitFile) -> GitFile:
        """Map a commit-file row to a domain ``GitFile``."""
        return GitFile(
            blob_sha=db_file.blob_sha,
            path=db_file.path,
            mime_type=db_file.mime_type,
            size=db_file.size,
            extension=db_file.extension,
            created_at=db_file.created_at,
        )

    @staticmethod
    def _to_domain_commit(
        db_commit: db_entities.GitCommit, files: list[GitFile]
    ) -> GitCommit:
        """Map a commit row and its already-mapped files to a ``GitCommit``."""
        return GitCommit(
            commit_sha=db_commit.commit_sha,
            date=db_commit.date,
            message=db_commit.message,
            parent_commit_sha=db_commit.parent_commit_sha,
            files=files,
            author=db_commit.author,
            # Timestamps are carried over, matching how the branch repository
            # builds head commits from the same row type.
            created_at=db_commit.created_at,
            updated_at=db_commit.updated_at,
        )

    @staticmethod
    def _file_row(commit_sha: str, file: GitFile) -> dict[str, object]:
        """Build the insert payload for one file of one commit."""
        return {
            "commit_sha": commit_sha,
            "path": file.path,
            "blob_sha": file.blob_sha,
            "extension": file.extension,
            "mime_type": file.mime_type,
            "size": file.size,
            "created_at": file.created_at,
        }

    async def _insert_in_chunks(
        self, session: AsyncSession, entity: type, rows: list[dict[str, object]]
    ) -> None:
        """Insert ``rows`` into ``entity`` in ``_CHUNK_SIZE``-sized batches."""
        for start in range(0, len(rows), self._CHUNK_SIZE):
            batch = rows[start : start + self._CHUNK_SIZE]
            await session.execute(insert(entity).values(batch))

    async def get_by_sha(self, commit_sha: str) -> GitCommit:
        """Get a commit by its SHA.

        Args:
            commit_sha: SHA of the commit to load.

        Returns:
            The commit together with its stored files.

        Raises:
            ValueError: If no commit with this SHA is stored.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = select(db_entities.GitCommit).where(
                db_entities.GitCommit.commit_sha == commit_sha
            )
            db_commit = await session.scalar(stmt)
            if not db_commit:
                raise ValueError(f"Commit with SHA {commit_sha} not found")

            files_stmt = select(db_entities.GitCommitFile).where(
                db_entities.GitCommitFile.commit_sha == commit_sha
            )
            db_files = (await session.scalars(files_stmt)).all()

            return self._to_domain_commit(
                db_commit, [self._to_domain_file(db_file) for db_file in db_files]
            )

    async def get_by_repo_id(self, repo_id: int) -> list[GitCommit]:
        """Get all commits for a repository.

        Args:
            repo_id: ID of the repository.

        Returns:
            Domain commits with their files; an empty list when the
            repository has no commits.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            commits_stmt = select(db_entities.GitCommit).where(
                db_entities.GitCommit.repo_id == repo_id
            )
            db_commits = (await session.scalars(commits_stmt)).all()
            if not db_commits:
                return []

            commit_shas = [db_commit.commit_sha for db_commit in db_commits]

            # Load files in chunks to avoid parameter limits, grouping them
            # by the commit they belong to as they arrive.
            files_by_commit: dict[str, list[GitFile]] = {}
            for start in range(0, len(commit_shas), self._CHUNK_SIZE):
                files_stmt = select(db_entities.GitCommitFile).where(
                    db_entities.GitCommitFile.commit_sha.in_(
                        commit_shas[start : start + self._CHUNK_SIZE]
                    )
                )
                for db_file in (await session.scalars(files_stmt)).all():
                    files_by_commit.setdefault(db_file.commit_sha, []).append(
                        self._to_domain_file(db_file)
                    )

            return [
                self._to_domain_commit(
                    db_commit, files_by_commit.get(db_commit.commit_sha, [])
                )
                for db_commit in db_commits
            ]

    async def save(self, commit: GitCommit, repo_id: int) -> GitCommit:
        """Save a commit to a repository.

        A commit that is already stored is left untouched (its files are not
        re-saved either).

        Args:
            commit: Commit to persist.
            repo_id: ID of the owning repository.

        Returns:
            The same ``commit`` object that was passed in.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            existing_commit = await session.get(
                db_entities.GitCommit, commit.commit_sha
            )
            if existing_commit:
                return commit

            session.add(
                db_entities.GitCommit(
                    commit_sha=commit.commit_sha,
                    repo_id=repo_id,
                    date=commit.date,
                    message=commit.message,
                    parent_commit_sha=commit.parent_commit_sha,
                    author=commit.author,
                )
            )
            # Flush so the commit row is written before its file rows.
            await session.flush()

            await self._save_commit_files(session, commit)

            return commit

    async def save_bulk(self, commits: list[GitCommit], repo_id: int) -> None:
        """Bulk save commits to a repository.

        Only commits whose SHA is not stored yet are inserted, together with
        their files.

        Args:
            commits: Commits to persist; an empty list is a no-op.
            repo_id: ID of the owning repository.

        """
        if not commits:
            return

        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            commit_shas = [commit.commit_sha for commit in commits]

            # Find already-stored SHAs (chunked to avoid parameter limits).
            existing_commit_shas: set[str] = set()
            for start in range(0, len(commit_shas), self._CHUNK_SIZE):
                existing_stmt = select(db_entities.GitCommit.commit_sha).where(
                    db_entities.GitCommit.commit_sha.in_(
                        commit_shas[start : start + self._CHUNK_SIZE]
                    )
                )
                existing_commit_shas.update((await session.scalars(existing_stmt)).all())

            new_commits = [
                commit
                for commit in commits
                if commit.commit_sha not in existing_commit_shas
            ]
            if not new_commits:
                return

            new_commits_data: list[dict[str, object]] = [
                {
                    "commit_sha": commit.commit_sha,
                    "repo_id": repo_id,
                    "date": commit.date,
                    "message": commit.message,
                    "parent_commit_sha": commit.parent_commit_sha,
                    "author": commit.author,
                }
                for commit in new_commits
            ]
            await self._insert_in_chunks(
                session, db_entities.GitCommit, new_commits_data
            )

            await self._save_commits_files_bulk(session, new_commits)

    async def exists(self, commit_sha: str) -> bool:
        """Check if a commit exists.

        Args:
            commit_sha: SHA of the commit.

        Returns:
            True when a commit with this SHA is stored.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = select(db_entities.GitCommit.commit_sha).where(
                db_entities.GitCommit.commit_sha == commit_sha
            )
            return (await session.scalar(stmt)) is not None

    async def delete_by_repo_id(self, repo_id: int) -> None:
        """Delete all commits for a repository.

        Dependent rows are removed first: snippet-file associations, then
        commit files, then the commits themselves.

        Args:
            repo_id: ID of the repository whose commits are removed.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            commit_shas_stmt = select(db_entities.GitCommit.commit_sha).where(
                db_entities.GitCommit.repo_id == repo_id
            )
            commit_shas = list((await session.scalars(commit_shas_stmt)).all())
            sha_chunks = [
                commit_shas[start : start + self._CHUNK_SIZE]
                for start in range(0, len(commit_shas), self._CHUNK_SIZE)
            ]

            # Snippet file associations first (they reference commit files).
            # One statement per chunk of SHAs instead of one per commit.
            for sha_chunk in sha_chunks:
                await session.execute(
                    delete(db_entities.SnippetV2File).where(
                        db_entities.SnippetV2File.commit_sha.in_(sha_chunk)
                    )
                )

            # Commit files second (foreign key constraint).
            for sha_chunk in sha_chunks:
                await session.execute(
                    delete(db_entities.GitCommitFile).where(
                        db_entities.GitCommitFile.commit_sha.in_(sha_chunk)
                    )
                )

            await session.execute(
                delete(db_entities.GitCommit).where(
                    db_entities.GitCommit.repo_id == repo_id
                )
            )

    async def count_by_repo_id(self, repo_id: int) -> int:
        """Count the number of commits for a repository.

        Args:
            repo_id: ID of the repository.

        Returns:
            Number of stored commits; 0 when the query yields no value.

        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = (
                select(func.count())
                .select_from(db_entities.GitCommit)
                .where(db_entities.GitCommit.repo_id == repo_id)
            )
            result = await session.scalar(stmt)
            return result or 0

    async def _save_commit_files(
        self, session: AsyncSession, commit: GitCommit
    ) -> None:
        """Save files for a single commit.

        Files already stored for the commit (matched by path) are skipped.

        Args:
            session: Session of the surrounding unit of work.
            commit: Commit whose files are persisted.

        """
        if not commit.files:
            return

        existing_files_stmt = select(
            db_entities.GitCommitFile.commit_sha,
            db_entities.GitCommitFile.path,
        ).where(db_entities.GitCommitFile.commit_sha == commit.commit_sha)
        # Unpack result rows into plain tuples so membership tests compare
        # tuple to tuple, as the bulk helper does.
        existing_file_keys = {
            (sha, path) for sha, path in await session.execute(existing_files_stmt)
        }

        new_files = [
            self._file_row(commit.commit_sha, file)
            for file in commit.files
            if (commit.commit_sha, file.path) not in existing_file_keys
        ]
        await self._insert_in_chunks(session, db_entities.GitCommitFile, new_files)

    async def _save_commits_files_bulk(
        self, session: AsyncSession, commits: list[GitCommit]
    ) -> None:
        """Bulk save files for multiple commits.

        Args:
            session: Session of the surrounding unit of work.
            commits: Commits whose files are persisted; files already stored
                under the same (commit SHA, path) are skipped.

        """
        all_file_identifiers = [
            (commit.commit_sha, file.path)
            for commit in commits
            for file in commit.files
        ]
        if not all_file_identifiers:
            return

        existing_file_keys = await self._get_existing_file_keys_bulk(
            session, all_file_identifiers
        )

        new_files = [
            self._file_row(commit.commit_sha, file)
            for commit in commits
            for file in commit.files
            if (commit.commit_sha, file.path) not in existing_file_keys
        ]
        await self._insert_in_chunks(session, db_entities.GitCommitFile, new_files)

    async def _get_existing_file_keys_bulk(
        self, session: AsyncSession, file_identifiers: list[tuple[str, str]]
    ) -> set[tuple[str, str]]:
        """Get existing file keys in chunks to avoid SQL parameter limits.

        Each chunk is matched with two independent ``IN`` filters (SHAs and
        paths), so the result may include stored keys that were not
        requested; it never includes a key that is not stored. Callers only
        use it for membership tests.

        Args:
            session: Session of the surrounding unit of work.
            file_identifiers: ``(commit_sha, path)`` pairs to look up.

        Returns:
            Set of ``(commit_sha, path)`` pairs found in storage.

        """
        existing_file_keys: set[tuple[str, str]] = set()

        for start in range(0, len(file_identifiers), self._CHUNK_SIZE):
            chunk = file_identifiers[start : start + self._CHUNK_SIZE]
            # De-duplicate so repeated SHAs/paths do not waste bind parameters.
            commit_shas = list(dict.fromkeys(sha for sha, _ in chunk))
            paths = list(dict.fromkeys(path for _, path in chunk))

            existing_files_stmt = select(
                db_entities.GitCommitFile.commit_sha,
                db_entities.GitCommitFile.path,
            ).where(
                db_entities.GitCommitFile.commit_sha.in_(commit_shas),
                db_entities.GitCommitFile.path.in_(paths),
            )

            for commit_sha, path in await session.execute(existing_files_stmt):
                existing_file_keys.add((commit_sha, path))

        return existing_file_keys