kodit 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodit might be problematic. Click here for more details.

Files changed (100) hide show
  1. kodit/_version.py +2 -2
  2. kodit/app.py +59 -24
  3. kodit/application/factories/reporting_factory.py +16 -7
  4. kodit/application/factories/server_factory.py +311 -0
  5. kodit/application/services/code_search_application_service.py +144 -0
  6. kodit/application/services/commit_indexing_application_service.py +543 -0
  7. kodit/application/services/indexing_worker_service.py +13 -46
  8. kodit/application/services/queue_service.py +24 -3
  9. kodit/application/services/reporting.py +70 -54
  10. kodit/application/services/sync_scheduler.py +15 -31
  11. kodit/cli.py +2 -763
  12. kodit/cli_utils.py +2 -9
  13. kodit/config.py +3 -96
  14. kodit/database.py +38 -1
  15. kodit/domain/entities/__init__.py +276 -0
  16. kodit/domain/entities/git.py +190 -0
  17. kodit/domain/factories/__init__.py +1 -0
  18. kodit/domain/factories/git_repo_factory.py +76 -0
  19. kodit/domain/protocols.py +270 -46
  20. kodit/domain/services/bm25_service.py +5 -1
  21. kodit/domain/services/embedding_service.py +3 -0
  22. kodit/domain/services/git_repository_service.py +429 -0
  23. kodit/domain/services/git_service.py +300 -0
  24. kodit/domain/services/task_status_query_service.py +19 -0
  25. kodit/domain/value_objects.py +113 -147
  26. kodit/infrastructure/api/client/__init__.py +0 -2
  27. kodit/infrastructure/api/v1/__init__.py +0 -4
  28. kodit/infrastructure/api/v1/dependencies.py +105 -44
  29. kodit/infrastructure/api/v1/routers/__init__.py +0 -6
  30. kodit/infrastructure/api/v1/routers/commits.py +271 -0
  31. kodit/infrastructure/api/v1/routers/queue.py +2 -2
  32. kodit/infrastructure/api/v1/routers/repositories.py +282 -0
  33. kodit/infrastructure/api/v1/routers/search.py +31 -14
  34. kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
  35. kodit/infrastructure/api/v1/schemas/commit.py +96 -0
  36. kodit/infrastructure/api/v1/schemas/context.py +2 -0
  37. kodit/infrastructure/api/v1/schemas/repository.py +128 -0
  38. kodit/infrastructure/api/v1/schemas/search.py +12 -9
  39. kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
  40. kodit/infrastructure/api/v1/schemas/tag.py +31 -0
  41. kodit/infrastructure/api/v1/schemas/task_status.py +41 -0
  42. kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
  43. kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
  44. kodit/infrastructure/cloning/git/git_python_adaptor.py +467 -0
  45. kodit/infrastructure/cloning/git/working_copy.py +10 -3
  46. kodit/infrastructure/embedding/embedding_factory.py +3 -2
  47. kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
  48. kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
  49. kodit/infrastructure/enrichment/litellm_enrichment_provider.py +19 -26
  50. kodit/infrastructure/enrichment/local_enrichment_provider.py +41 -30
  51. kodit/infrastructure/indexing/fusion_service.py +1 -1
  52. kodit/infrastructure/mappers/git_mapper.py +193 -0
  53. kodit/infrastructure/mappers/snippet_mapper.py +106 -0
  54. kodit/infrastructure/mappers/task_mapper.py +5 -44
  55. kodit/infrastructure/mappers/task_status_mapper.py +85 -0
  56. kodit/infrastructure/reporting/db_progress.py +23 -0
  57. kodit/infrastructure/reporting/log_progress.py +13 -38
  58. kodit/infrastructure/reporting/telemetry_progress.py +21 -0
  59. kodit/infrastructure/slicing/slicer.py +32 -31
  60. kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
  61. kodit/infrastructure/sqlalchemy/entities.py +428 -131
  62. kodit/infrastructure/sqlalchemy/git_branch_repository.py +263 -0
  63. kodit/infrastructure/sqlalchemy/git_commit_repository.py +337 -0
  64. kodit/infrastructure/sqlalchemy/git_repository.py +252 -0
  65. kodit/infrastructure/sqlalchemy/git_tag_repository.py +257 -0
  66. kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +484 -0
  67. kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
  68. kodit/infrastructure/sqlalchemy/task_status_repository.py +91 -0
  69. kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
  70. kodit/mcp.py +12 -26
  71. kodit/migrations/env.py +1 -1
  72. kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
  73. kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
  74. kodit/migrations/versions/b9cd1c3fd762_add_task_status.py +77 -0
  75. kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
  76. kodit/py.typed +0 -0
  77. kodit/utils/dump_openapi.py +7 -4
  78. kodit/utils/path_utils.py +29 -0
  79. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/METADATA +3 -3
  80. kodit-0.5.0.dist-info/RECORD +137 -0
  81. kodit/application/factories/code_indexing_factory.py +0 -193
  82. kodit/application/services/auto_indexing_service.py +0 -103
  83. kodit/application/services/code_indexing_application_service.py +0 -393
  84. kodit/domain/entities.py +0 -323
  85. kodit/domain/services/index_query_service.py +0 -70
  86. kodit/domain/services/index_service.py +0 -267
  87. kodit/infrastructure/api/client/index_client.py +0 -57
  88. kodit/infrastructure/api/v1/routers/indexes.py +0 -119
  89. kodit/infrastructure/api/v1/schemas/index.py +0 -101
  90. kodit/infrastructure/bm25/bm25_factory.py +0 -28
  91. kodit/infrastructure/cloning/__init__.py +0 -1
  92. kodit/infrastructure/cloning/metadata.py +0 -98
  93. kodit/infrastructure/mappers/index_mapper.py +0 -345
  94. kodit/infrastructure/reporting/tdqm_progress.py +0 -73
  95. kodit/infrastructure/slicing/language_detection_service.py +0 -18
  96. kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
  97. kodit-0.4.2.dist-info/RECORD +0 -119
  98. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/WHEEL +0 -0
  99. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/entry_points.txt +0 -0
  100. {kodit-0.4.2.dist-info → kodit-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,263 @@
1
+ """SQLAlchemy implementation of GitBranchRepository."""
2
+
3
+ from collections.abc import Callable
4
+
5
+ from sqlalchemy import delete, func, insert, select
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from kodit.domain.entities.git import GitBranch, GitCommit
9
+ from kodit.domain.protocols import GitBranchRepository
10
+ from kodit.infrastructure.sqlalchemy import entities as db_entities
11
+ from kodit.infrastructure.sqlalchemy.unit_of_work import SqlAlchemyUnitOfWork
12
+
13
+
14
def create_git_branch_repository(
    session_factory: Callable[[], AsyncSession],
) -> GitBranchRepository:
    """Build the SQLAlchemy-backed git branch repository.

    Args:
        session_factory: Zero-argument callable that returns an ``AsyncSession``;
            it is handed unchanged to the repository.

    Returns:
        A ``SqlAlchemyGitBranchRepository`` bound to ``session_factory``.
    """
    repository = SqlAlchemyGitBranchRepository(session_factory=session_factory)
    return repository
19
+
20
+
21
class SqlAlchemyGitBranchRepository(GitBranchRepository):
    """SQLAlchemy implementation of GitBranchRepository.

    Each public method opens its own ``SqlAlchemyUnitOfWork`` on a session
    obtained from the injected session factory.
    """

    # Maximum number of rows sent in a single bulk INSERT statement, so the
    # number of bind parameters per statement stays bounded.
    _INSERT_CHUNK_SIZE = 1000

    def __init__(self, session_factory: Callable[[], AsyncSession]) -> None:
        """Initialize the repository.

        Args:
            session_factory: Zero-argument callable returning an ``AsyncSession``.
        """
        self.session_factory = session_factory

    async def get_by_name(self, branch_name: str, repo_id: int) -> GitBranch:
        """Get a branch by name and repository ID.

        Args:
            branch_name: Name of the branch to load.
            repo_id: ID of the repository the branch belongs to.

        Returns:
            The branch together with its head commit and that commit's files.

        Raises:
            ValueError: If the branch row, or the commit row its head points
                at, cannot be found.
        """
        # GitFile is not imported at module level in this file, so it is
        # imported where it is needed.
        from kodit.domain.entities.git import GitFile

        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            branch_stmt = select(db_entities.GitBranch).where(
                db_entities.GitBranch.name == branch_name,
                db_entities.GitBranch.repo_id == repo_id,
            )
            db_branch = await session.scalar(branch_stmt)
            if not db_branch:
                raise ValueError(f"Branch {branch_name} not found in repo {repo_id}")

            commit_stmt = select(db_entities.GitCommit).where(
                db_entities.GitCommit.commit_sha == db_branch.head_commit_sha
            )
            db_commit = await session.scalar(commit_stmt)
            if not db_commit:
                raise ValueError(f"Head commit {db_branch.head_commit_sha} not found")

            files_stmt = select(db_entities.GitCommitFile).where(
                db_entities.GitCommitFile.commit_sha == db_branch.head_commit_sha
            )
            db_files = (await session.scalars(files_stmt)).all()

            domain_files = [
                GitFile(
                    blob_sha=db_file.blob_sha,
                    path=db_file.path,
                    mime_type=db_file.mime_type,
                    size=db_file.size,
                    extension=db_file.extension,
                    created_at=db_file.created_at,
                )
                for db_file in db_files
            ]

            head_commit = GitCommit(
                commit_sha=db_commit.commit_sha,
                date=db_commit.date,
                message=db_commit.message,
                parent_commit_sha=db_commit.parent_commit_sha,
                files=domain_files,
                author=db_commit.author,
                created_at=db_commit.created_at,
                updated_at=db_commit.updated_at,
            )

            return GitBranch(
                repo_id=db_branch.repo_id,
                name=db_branch.name,
                head_commit=head_commit,
                created_at=db_branch.created_at,
                updated_at=db_branch.updated_at,
            )

    async def get_by_repo_id(self, repo_id: int) -> list[GitBranch]:
        """Get all branches for a repository.

        Args:
            repo_id: ID of the repository whose branches are loaded.

        Returns:
            One ``GitBranch`` per stored branch whose head commit row exists,
            each carrying its head commit and that commit's files. Branches
            whose head commit row is missing are left out (unlike
            ``get_by_name``, which raises in that situation). An empty list
            is returned when the repository has no branches.
        """
        from kodit.domain.entities.git import GitFile

        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            branches_stmt = select(db_entities.GitBranch).where(
                db_entities.GitBranch.repo_id == repo_id
            )
            db_branches = (await session.scalars(branches_stmt)).all()
            if not db_branches:
                return []

            # Several branches can point at the same commit; de-duplicate so
            # the IN (...) lists below carry each SHA only once.
            head_shas = list({db_branch.head_commit_sha for db_branch in db_branches})

            commits_stmt = select(db_entities.GitCommit).where(
                db_entities.GitCommit.commit_sha.in_(head_shas)
            )
            db_commits = (await session.scalars(commits_stmt)).all()

            files_stmt = select(db_entities.GitCommitFile).where(
                db_entities.GitCommitFile.commit_sha.in_(head_shas)
            )
            db_files = (await session.scalars(files_stmt)).all()

            # Group the domain files by the commit they belong to.
            files_by_commit: dict[str, list[GitFile]] = {}
            for db_file in db_files:
                files_by_commit.setdefault(db_file.commit_sha, []).append(
                    GitFile(
                        blob_sha=db_file.blob_sha,
                        path=db_file.path,
                        mime_type=db_file.mime_type,
                        size=db_file.size,
                        extension=db_file.extension,
                        created_at=db_file.created_at,
                    )
                )

            commits_by_sha = {commit.commit_sha: commit for commit in db_commits}

            domain_branches = []
            for db_branch in db_branches:
                db_commit = commits_by_sha.get(db_branch.head_commit_sha)
                if not db_commit:
                    # No commit row for this head: skip the branch.
                    continue

                head_commit = GitCommit(
                    commit_sha=db_commit.commit_sha,
                    date=db_commit.date,
                    message=db_commit.message,
                    parent_commit_sha=db_commit.parent_commit_sha,
                    files=files_by_commit.get(db_branch.head_commit_sha, []),
                    author=db_commit.author,
                    created_at=db_commit.created_at,
                    updated_at=db_commit.updated_at,
                )
                domain_branches.append(
                    GitBranch(
                        repo_id=db_branch.repo_id,
                        name=db_branch.name,
                        head_commit=head_commit,
                        created_at=db_branch.created_at,
                        updated_at=db_branch.updated_at,
                    )
                )

            return domain_branches

    async def save(self, branch: GitBranch, repo_id: int) -> GitBranch:
        """Save a branch to a repository.

        If a row for ``(repo_id, branch.name)`` already exists its head commit
        SHA is updated (and ``updated_at`` too, when the branch carries one);
        otherwise a new row is added.

        Args:
            branch: Branch to persist. Its ``repo_id`` is overwritten with
                the ``repo_id`` argument.
            repo_id: ID of the repository the branch is stored under.

        Returns:
            The same ``branch`` object that was passed in.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            branch.repo_id = repo_id

            existing_branch = await session.get(
                db_entities.GitBranch, (repo_id, branch.name)
            )

            if existing_branch:
                existing_branch.head_commit_sha = branch.head_commit.commit_sha
                if branch.updated_at:
                    existing_branch.updated_at = branch.updated_at
            else:
                session.add(
                    db_entities.GitBranch(
                        repo_id=repo_id,
                        name=branch.name,
                        head_commit_sha=branch.head_commit.commit_sha,
                    )
                )

            return branch

    async def save_bulk(self, branches: list[GitBranch], repo_id: int) -> None:
        """Bulk save branches to a repository.

        Branches that already have a row get their head commit SHA updated
        when it changed; all other branches are inserted in chunks.

        Args:
            branches: Branches to persist. If the same name occurs more than
                once, the last occurrence is the one that is stored.
            repo_id: ID of the repository the branches are stored under.
        """
        if not branches:
            return

        # Index the input by name once so each stored row is matched with a
        # dictionary lookup instead of a scan over the whole input list.
        incoming_by_name = {branch.name: branch for branch in branches}

        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            existing_branches_stmt = select(db_entities.GitBranch).where(
                db_entities.GitBranch.repo_id == repo_id,
                db_entities.GitBranch.name.in_(list(incoming_by_name)),
            )
            existing_branches = (await session.scalars(existing_branches_stmt)).all()
            existing_branch_names = {branch.name for branch in existing_branches}

            # Move the head of rows that already exist.
            for existing_branch in existing_branches:
                new_head_sha = incoming_by_name[existing_branch.name].head_commit.commit_sha
                if existing_branch.head_commit_sha != new_head_sha:
                    existing_branch.head_commit_sha = new_head_sha

            new_branches_data = [
                {
                    "repo_id": repo_id,
                    "name": name,
                    "head_commit_sha": branch.head_commit.commit_sha,
                }
                for name, branch in incoming_by_name.items()
                if name not in existing_branch_names
            ]

            # Insert in chunks to stay below bind-parameter limits.
            chunk_size = self._INSERT_CHUNK_SIZE
            for start in range(0, len(new_branches_data), chunk_size):
                chunk = new_branches_data[start : start + chunk_size]
                await session.execute(insert(db_entities.GitBranch).values(chunk))

    async def exists(self, branch_name: str, repo_id: int) -> bool:
        """Check if a branch exists.

        Args:
            branch_name: Name of the branch to look for.
            repo_id: ID of the repository to look in.

        Returns:
            ``True`` when a matching branch row is found, ``False`` otherwise.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = select(db_entities.GitBranch.name).where(
                db_entities.GitBranch.name == branch_name,
                db_entities.GitBranch.repo_id == repo_id,
            )
            result = await session.scalar(stmt)
            return result is not None

    async def delete_by_repo_id(self, repo_id: int) -> None:
        """Delete all branches for a repository.

        Args:
            repo_id: ID of the repository whose branch rows are deleted.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            del_branches_stmt = delete(db_entities.GitBranch).where(
                db_entities.GitBranch.repo_id == repo_id
            )
            await session.execute(del_branches_stmt)

    async def count_by_repo_id(self, repo_id: int) -> int:
        """Count the number of branches for a repository.

        Args:
            repo_id: ID of the repository whose branches are counted.

        Returns:
            The number of branch rows, or ``0`` when the query yields nothing.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = (
                select(func.count())
                .select_from(db_entities.GitBranch)
                .where(db_entities.GitBranch.repo_id == repo_id)
            )
            result = await session.scalar(stmt)
            return result or 0
@@ -0,0 +1,337 @@
1
+ """SQLAlchemy implementation of GitCommitRepository."""
2
+
3
+ from collections.abc import Callable
4
+
5
+ from sqlalchemy import delete, func, insert, select
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from kodit.domain.entities.git import GitCommit, GitFile
9
+ from kodit.domain.protocols import GitCommitRepository
10
+ from kodit.infrastructure.sqlalchemy import entities as db_entities
11
+ from kodit.infrastructure.sqlalchemy.unit_of_work import SqlAlchemyUnitOfWork
12
+
13
+
14
def create_git_commit_repository(
    session_factory: Callable[[], AsyncSession],
) -> GitCommitRepository:
    """Build the SQLAlchemy-backed git commit repository.

    Args:
        session_factory: Zero-argument callable that returns an ``AsyncSession``;
            it is handed unchanged to the repository.

    Returns:
        A ``SqlAlchemyGitCommitRepository`` bound to ``session_factory``.
    """
    repository = SqlAlchemyGitCommitRepository(session_factory=session_factory)
    return repository
19
+
20
+
21
class SqlAlchemyGitCommitRepository(GitCommitRepository):
    """SQLAlchemy implementation of GitCommitRepository.

    Each public method opens its own ``SqlAlchemyUnitOfWork`` on a session
    obtained from the injected session factory.
    """

    # Maximum number of items (rows or lookup keys) handled per statement, so
    # the number of bind parameters per statement stays bounded.
    _CHUNK_SIZE = 1000

    def __init__(self, session_factory: Callable[[], AsyncSession]) -> None:
        """Initialize the repository.

        Args:
            session_factory: Zero-argument callable returning an ``AsyncSession``.
        """
        self.session_factory = session_factory

    @staticmethod
    def _to_domain_file(db_file: db_entities.GitCommitFile) -> GitFile:
        """Map a commit-file row to a domain ``GitFile``."""
        return GitFile(
            blob_sha=db_file.blob_sha,
            path=db_file.path,
            mime_type=db_file.mime_type,
            size=db_file.size,
            extension=db_file.extension,
            created_at=db_file.created_at,
        )

    @staticmethod
    def _to_domain_commit(
        db_commit: db_entities.GitCommit, files: list[GitFile]
    ) -> GitCommit:
        """Map a commit row plus its already-mapped files to a ``GitCommit``.

        The row's ``created_at`` / ``updated_at`` are carried over as well, so
        commits loaded here match the head commits built by the branch
        repository from the same rows.
        """
        return GitCommit(
            commit_sha=db_commit.commit_sha,
            date=db_commit.date,
            message=db_commit.message,
            parent_commit_sha=db_commit.parent_commit_sha,
            files=files,
            author=db_commit.author,
            created_at=db_commit.created_at,
            updated_at=db_commit.updated_at,
        )

    @staticmethod
    def _file_row(commit_sha: str, file: GitFile) -> dict[str, object]:
        """Build the INSERT values for one file of the given commit."""
        return {
            "commit_sha": commit_sha,
            "path": file.path,
            "blob_sha": file.blob_sha,
            "extension": file.extension,
            "mime_type": file.mime_type,
            "size": file.size,
            "created_at": file.created_at,
        }

    async def _insert_file_rows(
        self, session: AsyncSession, rows: list[dict[str, object]]
    ) -> None:
        """Insert commit-file rows in chunks to stay below parameter limits."""
        for start in range(0, len(rows), self._CHUNK_SIZE):
            chunk = rows[start : start + self._CHUNK_SIZE]
            await session.execute(insert(db_entities.GitCommitFile).values(chunk))

    async def get_by_sha(self, commit_sha: str) -> GitCommit:
        """Get a commit by its SHA.

        Args:
            commit_sha: SHA of the commit to load.

        Returns:
            The commit together with its files.

        Raises:
            ValueError: If no commit row with this SHA exists.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = select(db_entities.GitCommit).where(
                db_entities.GitCommit.commit_sha == commit_sha
            )
            db_commit = await session.scalar(stmt)
            if not db_commit:
                raise ValueError(f"Commit with SHA {commit_sha} not found")

            files_stmt = select(db_entities.GitCommitFile).where(
                db_entities.GitCommitFile.commit_sha == commit_sha
            )
            db_files = (await session.scalars(files_stmt)).all()

            domain_files = [self._to_domain_file(db_file) for db_file in db_files]
            return self._to_domain_commit(db_commit, domain_files)

    async def get_by_repo_id(self, repo_id: int) -> list[GitCommit]:
        """Get all commits for a repository.

        Args:
            repo_id: ID of the repository whose commits are loaded.

        Returns:
            Every stored commit of the repository with its files, or an empty
            list when the repository has no commits.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            commits_stmt = select(db_entities.GitCommit).where(
                db_entities.GitCommit.repo_id == repo_id
            )
            db_commits = (await session.scalars(commits_stmt)).all()
            if not db_commits:
                return []

            # Select the files through a subquery on the repository's commits
            # rather than binding every SHA as its own parameter; a large
            # history would otherwise exceed driver bind-parameter limits.
            repo_commit_shas = select(db_entities.GitCommit.commit_sha).where(
                db_entities.GitCommit.repo_id == repo_id
            )
            files_stmt = select(db_entities.GitCommitFile).where(
                db_entities.GitCommitFile.commit_sha.in_(repo_commit_shas)
            )
            db_files = (await session.scalars(files_stmt)).all()

            # Group the domain files by the commit they belong to.
            files_by_commit: dict[str, list[GitFile]] = {}
            for db_file in db_files:
                files_by_commit.setdefault(db_file.commit_sha, []).append(
                    self._to_domain_file(db_file)
                )

            return [
                self._to_domain_commit(
                    db_commit, files_by_commit.get(db_commit.commit_sha, [])
                )
                for db_commit in db_commits
            ]

    async def save(self, commit: GitCommit, repo_id: int) -> GitCommit:
        """Save a commit to a repository.

        The commit row is created only when no row with the same SHA exists;
        files of the commit that are not stored yet are inserted either way.

        Args:
            commit: Commit to persist.
            repo_id: ID of the repository the commit is stored under.

        Returns:
            The same ``commit`` object that was passed in.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            existing_commit = await session.get(
                db_entities.GitCommit, commit.commit_sha
            )

            if not existing_commit:
                session.add(
                    db_entities.GitCommit(
                        commit_sha=commit.commit_sha,
                        repo_id=repo_id,
                        date=commit.date,
                        message=commit.message,
                        parent_commit_sha=commit.parent_commit_sha,
                        author=commit.author,
                    )
                )
                # Flush so the commit row is written before its file rows.
                await session.flush()

            await self._save_commit_files(session, commit)

            return commit

    async def save_bulk(self, commits: list[GitCommit], repo_id: int) -> None:
        """Bulk save commits to a repository.

        Commits whose SHA is already stored are skipped; the remaining commits
        and their files are inserted in chunks.

        Args:
            commits: Commits to persist. If a SHA occurs more than once only
                its first occurrence is stored.
            repo_id: ID of the repository the commits are stored under.
        """
        if not commits:
            return

        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            commit_shas = [commit.commit_sha for commit in commits]

            # Look up already-stored SHAs chunk by chunk; one IN (...) list
            # over a whole history can exceed bind-parameter limits.
            known_shas: set[str] = set()
            for start in range(0, len(commit_shas), self._CHUNK_SIZE):
                sha_chunk = commit_shas[start : start + self._CHUNK_SIZE]
                existing_commits_stmt = select(db_entities.GitCommit.commit_sha).where(
                    db_entities.GitCommit.commit_sha.in_(sha_chunk)
                )
                known_shas.update((await session.scalars(existing_commits_stmt)).all())

            new_commits_data = []
            new_commits_objects = []
            for commit in commits:
                if commit.commit_sha in known_shas:
                    continue
                # Remember the SHA so a repeated entry in the input does not
                # produce a second row with the same key.
                known_shas.add(commit.commit_sha)
                new_commits_data.append(
                    {
                        "commit_sha": commit.commit_sha,
                        "repo_id": repo_id,
                        "date": commit.date,
                        "message": commit.message,
                        "parent_commit_sha": commit.parent_commit_sha,
                        "author": commit.author,
                    }
                )
                new_commits_objects.append(commit)

            if new_commits_data:
                for start in range(0, len(new_commits_data), self._CHUNK_SIZE):
                    chunk = new_commits_data[start : start + self._CHUNK_SIZE]
                    await session.execute(insert(db_entities.GitCommit).values(chunk))

                await self._save_commits_files_bulk(session, new_commits_objects)

    async def exists(self, commit_sha: str) -> bool:
        """Check if a commit exists.

        Args:
            commit_sha: SHA of the commit to look for.

        Returns:
            ``True`` when a commit row with this SHA is found, else ``False``.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = select(db_entities.GitCommit.commit_sha).where(
                db_entities.GitCommit.commit_sha == commit_sha
            )
            result = await session.scalar(stmt)
            return result is not None

    async def delete_by_repo_id(self, repo_id: int) -> None:
        """Delete all commits for a repository.

        Rows are removed in dependency order: snippet-file associations, then
        commit files, then the commits themselves.

        Args:
            repo_id: ID of the repository whose commits are deleted.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            # One DELETE per table, scoped by a subquery on the repository's
            # commits, instead of one DELETE per commit and table.
            repo_commit_shas = select(db_entities.GitCommit.commit_sha).where(
                db_entities.GitCommit.repo_id == repo_id
            )

            # Snippet file associations go first (they reference commit
            # files), commit files second (foreign key constraint).
            for dependent in (db_entities.SnippetV2File, db_entities.GitCommitFile):
                del_dependent_stmt = (
                    delete(dependent)
                    .where(dependent.commit_sha.in_(repo_commit_shas))
                    # The criteria contain a subquery, which cannot be
                    # evaluated against in-session objects; skip that step.
                    .execution_options(synchronize_session=False)
                )
                await session.execute(del_dependent_stmt)

            del_commits_stmt = delete(db_entities.GitCommit).where(
                db_entities.GitCommit.repo_id == repo_id
            )
            await session.execute(del_commits_stmt)

    async def count_by_repo_id(self, repo_id: int) -> int:
        """Count the number of commits for a repository.

        Args:
            repo_id: ID of the repository whose commits are counted.

        Returns:
            The number of commit rows, or ``0`` when the query yields nothing.
        """
        async with SqlAlchemyUnitOfWork(self.session_factory) as session:
            stmt = (
                select(func.count())
                .select_from(db_entities.GitCommit)
                .where(db_entities.GitCommit.repo_id == repo_id)
            )
            result = await session.scalar(stmt)
            return result or 0

    async def _save_commit_files(
        self, session: AsyncSession, commit: GitCommit
    ) -> None:
        """Save files for a single commit.

        Only files whose ``(commit_sha, path)`` pair is not stored yet are
        inserted.

        Args:
            session: Session of the surrounding unit of work.
            commit: Commit whose files are persisted.
        """
        if not commit.files:
            return

        existing_files_stmt = select(
            db_entities.GitCommitFile.commit_sha,
            db_entities.GitCommitFile.path,
        ).where(db_entities.GitCommitFile.commit_sha == commit.commit_sha)
        existing_rows = await session.execute(existing_files_stmt)
        # Unpack the result rows into plain tuples so the membership test
        # below compares tuples with tuples.
        existing_file_keys = {(sha, path) for sha, path in existing_rows}

        new_files = [
            self._file_row(commit.commit_sha, file)
            for file in commit.files
            if (commit.commit_sha, file.path) not in existing_file_keys
        ]
        await self._insert_file_rows(session, new_files)

    async def _save_commits_files_bulk(
        self, session: AsyncSession, commits: list[GitCommit]
    ) -> None:
        """Bulk save files for multiple commits.

        Only files whose ``(commit_sha, path)`` pair is not stored yet are
        inserted.

        Args:
            session: Session of the surrounding unit of work.
            commits: Commits whose files are persisted.
        """
        all_file_identifiers = [
            (commit.commit_sha, file.path)
            for commit in commits
            for file in commit.files
        ]
        if not all_file_identifiers:
            return

        existing_file_keys = await self._get_existing_file_keys_bulk(
            session, all_file_identifiers
        )

        new_files = [
            self._file_row(commit.commit_sha, file)
            for commit in commits
            for file in commit.files
            if (commit.commit_sha, file.path) not in existing_file_keys
        ]
        await self._insert_file_rows(session, new_files)

    async def _get_existing_file_keys_bulk(
        self, session: AsyncSession, file_identifiers: list[tuple[str, str]]
    ) -> set[tuple[str, str]]:
        """Get existing file keys in chunks to avoid SQL parameter limits.

        Args:
            session: Session of the surrounding unit of work.
            file_identifiers: ``(commit_sha, path)`` pairs to look up.

        Returns:
            The ``(commit_sha, path)`` pairs of the stored rows matched by the
            chunked queries. Each query filters SHA and path independently, so
            the result may also contain stored pairs that were not asked for;
            callers test membership with their own exact keys.
        """
        existing_file_keys: set[tuple[str, str]] = set()

        for start in range(0, len(file_identifiers), self._CHUNK_SIZE):
            chunk = file_identifiers[start : start + self._CHUNK_SIZE]
            # Many files share a commit SHA (and paths repeat across commits);
            # bind each distinct value only once.
            commit_shas = list({sha for sha, _ in chunk})
            paths = list({path for _, path in chunk})

            existing_files_stmt = select(
                db_entities.GitCommitFile.commit_sha,
                db_entities.GitCommitFile.path,
            ).where(
                db_entities.GitCommitFile.commit_sha.in_(commit_shas),
                db_entities.GitCommitFile.path.in_(paths),
            )

            chunk_existing = await session.execute(existing_files_stmt)
            existing_file_keys.update((sha, path) for sha, path in chunk_existing)

        return existing_file_keys