kodit 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between package versions that have been publicly released to a supported registry, as they appear in that registry. It is provided for informational purposes only.
Potentially problematic release: this version of kodit might be problematic.
- kodit/_version.py +2 -2
- kodit/app.py +53 -23
- kodit/application/factories/reporting_factory.py +6 -2
- kodit/application/factories/server_factory.py +311 -0
- kodit/application/services/code_search_application_service.py +144 -0
- kodit/application/services/commit_indexing_application_service.py +543 -0
- kodit/application/services/indexing_worker_service.py +13 -44
- kodit/application/services/queue_service.py +24 -3
- kodit/application/services/reporting.py +0 -2
- kodit/application/services/sync_scheduler.py +15 -31
- kodit/cli.py +2 -753
- kodit/cli_utils.py +2 -9
- kodit/config.py +1 -94
- kodit/database.py +38 -1
- kodit/domain/{entities.py → entities/__init__.py} +50 -195
- kodit/domain/entities/git.py +190 -0
- kodit/domain/factories/__init__.py +1 -0
- kodit/domain/factories/git_repo_factory.py +76 -0
- kodit/domain/protocols.py +263 -64
- kodit/domain/services/bm25_service.py +5 -1
- kodit/domain/services/embedding_service.py +3 -0
- kodit/domain/services/git_repository_service.py +429 -0
- kodit/domain/services/git_service.py +300 -0
- kodit/domain/services/task_status_query_service.py +2 -2
- kodit/domain/value_objects.py +83 -114
- kodit/infrastructure/api/client/__init__.py +0 -2
- kodit/infrastructure/api/v1/__init__.py +0 -4
- kodit/infrastructure/api/v1/dependencies.py +92 -46
- kodit/infrastructure/api/v1/routers/__init__.py +0 -6
- kodit/infrastructure/api/v1/routers/commits.py +271 -0
- kodit/infrastructure/api/v1/routers/queue.py +2 -2
- kodit/infrastructure/api/v1/routers/repositories.py +282 -0
- kodit/infrastructure/api/v1/routers/search.py +31 -14
- kodit/infrastructure/api/v1/schemas/__init__.py +0 -24
- kodit/infrastructure/api/v1/schemas/commit.py +96 -0
- kodit/infrastructure/api/v1/schemas/context.py +2 -0
- kodit/infrastructure/api/v1/schemas/repository.py +128 -0
- kodit/infrastructure/api/v1/schemas/search.py +12 -9
- kodit/infrastructure/api/v1/schemas/snippet.py +58 -0
- kodit/infrastructure/api/v1/schemas/tag.py +31 -0
- kodit/infrastructure/api/v1/schemas/task_status.py +2 -0
- kodit/infrastructure/bm25/local_bm25_repository.py +16 -4
- kodit/infrastructure/bm25/vectorchord_bm25_repository.py +68 -52
- kodit/infrastructure/cloning/git/git_python_adaptor.py +467 -0
- kodit/infrastructure/cloning/git/working_copy.py +1 -1
- kodit/infrastructure/embedding/embedding_factory.py +3 -2
- kodit/infrastructure/embedding/local_vector_search_repository.py +1 -1
- kodit/infrastructure/embedding/vectorchord_vector_search_repository.py +111 -84
- kodit/infrastructure/enrichment/litellm_enrichment_provider.py +19 -26
- kodit/infrastructure/indexing/fusion_service.py +1 -1
- kodit/infrastructure/mappers/git_mapper.py +193 -0
- kodit/infrastructure/mappers/snippet_mapper.py +106 -0
- kodit/infrastructure/mappers/task_mapper.py +5 -44
- kodit/infrastructure/reporting/log_progress.py +8 -5
- kodit/infrastructure/reporting/telemetry_progress.py +21 -0
- kodit/infrastructure/slicing/slicer.py +32 -31
- kodit/infrastructure/sqlalchemy/embedding_repository.py +43 -23
- kodit/infrastructure/sqlalchemy/entities.py +394 -158
- kodit/infrastructure/sqlalchemy/git_branch_repository.py +263 -0
- kodit/infrastructure/sqlalchemy/git_commit_repository.py +337 -0
- kodit/infrastructure/sqlalchemy/git_repository.py +252 -0
- kodit/infrastructure/sqlalchemy/git_tag_repository.py +257 -0
- kodit/infrastructure/sqlalchemy/snippet_v2_repository.py +484 -0
- kodit/infrastructure/sqlalchemy/task_repository.py +29 -23
- kodit/infrastructure/sqlalchemy/task_status_repository.py +24 -12
- kodit/infrastructure/sqlalchemy/unit_of_work.py +10 -14
- kodit/mcp.py +12 -30
- kodit/migrations/env.py +1 -0
- kodit/migrations/versions/04b80f802e0c_foreign_key_review.py +100 -0
- kodit/migrations/versions/7f15f878c3a1_add_new_git_entities.py +690 -0
- kodit/migrations/versions/f9e5ef5e688f_add_git_commits_number.py +43 -0
- kodit/py.typed +0 -0
- kodit/utils/dump_openapi.py +7 -4
- kodit/utils/path_utils.py +29 -0
- {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/METADATA +3 -3
- kodit-0.5.0.dist-info/RECORD +137 -0
- kodit/application/factories/code_indexing_factory.py +0 -195
- kodit/application/services/auto_indexing_service.py +0 -99
- kodit/application/services/code_indexing_application_service.py +0 -410
- kodit/domain/services/index_query_service.py +0 -70
- kodit/domain/services/index_service.py +0 -269
- kodit/infrastructure/api/client/index_client.py +0 -57
- kodit/infrastructure/api/v1/routers/indexes.py +0 -164
- kodit/infrastructure/api/v1/schemas/index.py +0 -101
- kodit/infrastructure/bm25/bm25_factory.py +0 -28
- kodit/infrastructure/cloning/__init__.py +0 -1
- kodit/infrastructure/cloning/metadata.py +0 -98
- kodit/infrastructure/mappers/index_mapper.py +0 -345
- kodit/infrastructure/reporting/tdqm_progress.py +0 -38
- kodit/infrastructure/slicing/language_detection_service.py +0 -18
- kodit/infrastructure/sqlalchemy/index_repository.py +0 -646
- kodit-0.4.3.dist-info/RECORD +0 -125
- {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/WHEEL +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/entry_points.txt +0 -0
- {kodit-0.4.3.dist-info → kodit-0.5.0.dist-info}/licenses/LICENSE +0 -0
kodit/domain/services/git_service.py
ADDED
@@ -0,0 +1,300 @@
+"""Service for git operations."""
+
+import asyncio
+import hashlib
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import git
+import structlog
+from git import InvalidGitRepositoryError, Repo
+from pydantic import AnyUrl
+
+from kodit.application.factories.reporting_factory import create_noop_operation
+from kodit.application.services.reporting import ProgressTracker
+from kodit.domain.entities import WorkingCopy
+from kodit.domain.entities.git import (
+    GitBranch,
+    GitCommit,
+    GitFile,
+    GitRepo,
+    GitTag,
+)
+from kodit.domain.factories.git_repo_factory import GitRepoFactory
+
+if TYPE_CHECKING:
+    from git.objects import Commit
+
+
+class GitService:
+    """Service for git operations."""
+
+    def __init__(self, clone_dir: Path) -> None:
+        """Initialize the git service."""
+        self.clone_dir = clone_dir
+        self.log = structlog.get_logger(__name__)
+
+    def get_clone_path(self, uri: str) -> Path:
+        """Get the clone path for a Git working copy."""
+        sanitized_uri = WorkingCopy.sanitize_git_url(uri)
+        dir_hash = hashlib.sha256(str(sanitized_uri).encode("utf-8")).hexdigest()[:16]
+        dir_name = f"repo-{dir_hash}"
+        return self.clone_dir / dir_name
+
+    async def clone_and_extract_repo_info(
+        self, uri: str, step: ProgressTracker | None = None
+    ) -> GitRepo:
+        """Clone repository and extract complete git repository information."""
+        step = step or create_noop_operation()
+        # Verify the clone path doesn't already exist
+        clone_path = self.get_clone_path(uri)
+        if clone_path.exists():
+            raise ValueError(f"Clone path already exists: {clone_path}")
+        sanitized_uri = WorkingCopy.sanitize_git_url(uri)
+        clone_path.mkdir(parents=True, exist_ok=True)
+
+        step_record = []
+        await step.set_total(12)
+
+        def _clone_progress_callback(
+            a: int, _: str | float | None, __: str | float | None, _d: str
+        ) -> None:
+            if a not in step_record:
+                step_record.append(a)
+
+            # Git reports a really weird format. This is a quick hack to get some
+            # progress.
+            # Normally this would fail because the loop is already running,
+            # but in this case, this callback is called by some git sub-thread.
+            asyncio.run(
+                step.set_current(
+                    len(step_record), f"Cloning repository ({step_record[-1]})"
+                )
+            )
+
+        try:
+            self.log.info(
+                "Cloning repository", uri=sanitized_uri, clone_path=str(clone_path)
+            )
+            # Use the original URI for cloning (with credentials if present)
+            options = ["--depth=1", "--single-branch"]
+            git.Repo.clone_from(
+                uri,
+                clone_path,
+                progress=_clone_progress_callback,
+                multi_options=options,
+            )
+        except git.GitCommandError as e:
+            if "already exists and is not an empty directory" not in str(e):
+                msg = f"Failed to clone repository: {e}"
+                raise ValueError(msg) from e
+            self.log.info("Repository already exists, reusing...", uri=sanitized_uri)
+
+        # Extract git repository information from cloned path
+        # Convert original URI to AnyUrl for GitRepo
+        from pydantic import AnyUrl
+
+        original_uri = AnyUrl(uri)
+        return self.get_repo_info_from_path(clone_path, original_uri, sanitized_uri)
+
+    def get_repo_info_from_path(
+        self, repo_path: Path, remote_uri: AnyUrl, sanitized_remote_uri: AnyUrl
+    ) -> GitRepo:
+        """Extract complete git repository information from a local path."""
+        try:
+            repo = Repo(repo_path)
+        except InvalidGitRepositoryError as e:
+            raise ValueError(f"Path is not a git repository: {repo_path}") from e
+
+        # Get all branches with their commit histories
+        branches = self._get_all_branches(repo)
+
+        # Count commits for num_commits field (managed by GitCommitRepository)
+        all_commits = self._get_all_commits(repo)
+        num_commits = len(all_commits)
+
+        # Get all tags
+        all_tags = self._get_all_tags(repo)
+
+        # Get current branch as tracking branch
+        try:
+            current_branch = repo.active_branch
+            tracking_branch = next(
+                (b for b in branches if b.name == current_branch.name),
+                branches[0] if branches else None,
+            )
+        except (AttributeError, TypeError):
+            # Handle detached HEAD state or other branch access issues
+            tracking_branch = branches[0] if branches else None
+
+        if tracking_branch is None:
+            raise ValueError("No branches found in repository")
+
+        return GitRepoFactory.create_from_path_scan(
+            remote_uri=remote_uri,
+            sanitized_remote_uri=sanitized_remote_uri,
+            repo_path=repo_path,
+            tracking_branch=tracking_branch,
+            last_scanned_at=datetime.now(UTC),
+            num_commits=num_commits,
+            num_branches=len(branches),
+            num_tags=len(all_tags),
+        )
+
+    def get_commit_history(
+        self, repo_path: Path, branch_name: str, limit: int = 100
+    ) -> list[GitCommit]:
+        """Get commit history for a specific branch."""
+        try:
+            repo = Repo(repo_path)
+
+            # Get the branch reference
+            branch_ref = None
+            for branch in repo.branches:
+                if branch.name == branch_name:
+                    branch_ref = branch
+                    break
+
+            if branch_ref is None:
+                return []
+
+            # Get commit history for the branch
+            commits = []
+            for commit in repo.iter_commits(branch_ref, max_count=limit):
+                try:
+                    git_commit = self._convert_commit(repo, commit)
+                    commits.append(git_commit)
+                except Exception:  # noqa: BLE001, S112
+                    # Skip commits we can't process
+                    continue
+
+        except (InvalidGitRepositoryError, Exception):
+            return []
+        else:
+            return commits
+
+    def _get_all_branches(self, repo: Repo) -> list[GitBranch]:
+        """Get all branches with their commit histories."""
+        branches = []
+
+        for branch in repo.branches:
+            try:
+                # Get head commit for this branch
+                head_commit = self._convert_commit(repo, branch.commit)
+                branches.append(GitBranch(name=branch.name, head_commit=head_commit))
+            except Exception:  # noqa: BLE001, S112
+                # Skip branches that can't be accessed
+                continue
+
+        return branches
+
+    def _get_all_commits(self, repo: Repo) -> list[GitCommit]:
+        """Get all unique commits across all branches."""
+        commit_cache = {}  # Use SHA as key to avoid duplicates
+
+        # Get all commits from all branches
+        for branch in repo.branches:
+            try:
+                # Traverse the entire commit history for this branch
+                for commit in repo.iter_commits(branch):
+                    if commit.hexsha not in commit_cache:
+                        domain_commit = self._convert_commit(repo, commit)
+                        commit_cache[commit.hexsha] = domain_commit
+            except Exception:  # noqa: BLE001, S112
+                # Skip branches that can't be accessed
+                continue
+
+        return list(commit_cache.values())
+
+    def _get_all_tags(self, repo: Repo) -> list[GitTag]:
+        """Get all tags in the repository."""
+        all_commits = self._get_all_commits(repo)
+        all_commits_map = {commit.commit_sha: commit for commit in all_commits}
+        tags = []
+        try:
+            for tag_ref in repo.tags:
+                try:
+                    # Get the commit that the tag points to
+                    target_commit = tag_ref.commit
+
+                    tag = GitTag(
+                        created_at=datetime.now(UTC),
+                        name=tag_ref.name,
+                        target_commit=all_commits_map[target_commit.hexsha],
+                    )
+                    tags.append(tag)
+                except Exception:  # noqa: BLE001, S112
+                    # Skip tags that can't be processed
+                    continue
+        except Exception:  # noqa: BLE001
+            # If we can't get tags, return empty list
+            return []
+
+        return tags
+
+    def _convert_commit(self, repo: Repo, commit: "Commit") -> GitCommit:
+        """Convert a GitPython commit object to domain GitCommit."""
+        # Convert timestamp to datetime
+        commit_date = datetime.fromtimestamp(commit.committed_date, tz=UTC)
+
+        # Get parent commit SHA (first parent if merge commit)
+        parent_sha = commit.parents[0].hexsha if commit.parents else ""
+
+        # Get files changed in this commit
+        files = self._get_commit_files(repo, commit)
+
+        # Format author string from name and email
+        author_name = str(commit.author.name) if commit.author.name else ""
+        author_email = str(commit.author.email) if commit.author.email else ""
+        if author_name and author_email:
+            author = f"{author_name} <{author_email}>"
+        else:
+            author = author_name or "Unknown"
+
+        return GitCommit(
+            commit_sha=commit.hexsha,
+            date=commit_date,
+            message=str(commit.message).strip(),
+            parent_commit_sha=parent_sha,
+            files=files,
+            author=author,
+        )
+
+    def _get_commit_files(self, repo: Repo, commit: "Commit") -> list[GitFile]:
+        """Get files changed in a specific commit."""
+        try:
+            files = []
+
+            # Get files changed in this commit
+            if commit.parents:
+                # Compare with first parent to get changed files
+                changed_files = commit.parents[0].diff(commit)
+            else:
+                # Initial commit - get all files
+                changed_files = commit.diff(None)
+
+            for diff_item in changed_files:
+                # Handle both a_path and b_path (for renames/moves)
+                file_path = diff_item.b_path or diff_item.a_path
+                if file_path and diff_item.b_blob:
+                    try:
+                        blob = diff_item.b_blob
+                        file_entity = GitFile(
+                            created_at=datetime.now(UTC),
+                            blob_sha=blob.hexsha,
+                            path=str(Path(repo.working_dir) / file_path),
+                            mime_type="application/octet-stream",  # Default
+                            size=blob.size,
+                            extension=GitFile.extension_from_path(file_path),
+                        )
+                        files.append(file_entity)
+                    except Exception:  # noqa: BLE001, S112
+                        # Skip files we can't process
+                        continue
+
+        except Exception:  # noqa: BLE001
+            # If we can't get files for this commit, return empty list
+            return []
+        else:
+            return files
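As a usage sketch (not part of the diff above), the new GitService might be driven along these lines; the clone directory, repository URL, and branch name are illustrative placeholders, and error handling is omitted.

import asyncio
from pathlib import Path

from kodit.domain.services.git_service import GitService


async def main() -> None:
    # Placeholder clone directory and repository URL.
    service = GitService(clone_dir=Path("/tmp/kodit-clones"))
    uri = "https://github.com/example/project.git"

    # Performs a shallow, single-branch clone, then scans branches, commits and tags.
    # Raises ValueError if the computed clone path already exists.
    repo = await service.clone_and_extract_repo_info(uri)
    print(repo)

    # Commit history for one branch of the already-cloned working copy.
    commits = service.get_commit_history(
        service.get_clone_path(uri), branch_name="main", limit=10
    )
    print(len(commits))


asyncio.run(main())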
kodit/domain/services/task_status_query_service.py
CHANGED
@@ -12,8 +12,8 @@ class TaskStatusQueryService:
         """Initialize the task status query service."""
         self._repository = repository

-    async def get_index_status(self,
+    async def get_index_status(self, repo_id: int) -> list[TaskStatus]:
         """Get the status of tasks for a specific index."""
         return await self._repository.load_with_hierarchy(
-            trackable_type=TrackableType.
+            trackable_type=TrackableType.KODIT_REPOSITORY.value, trackable_id=repo_id
         )
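A hedged sketch of the updated call site (illustrative only): the repository argument is whatever TaskStatus repository the service is normally constructed with, which this diff does not show.

from kodit.domain.services.task_status_query_service import TaskStatusQueryService


async def print_repo_status(task_status_repository, repo_id: int) -> None:
    # The query now takes a repository id rather than an index id.
    service = TaskStatusQueryService(task_status_repository)
    for status in await service.get_index_status(repo_id):
        print(status)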
kodit/domain/value_objects.py
CHANGED
@@ -1,6 +1,5 @@
 """Pure domain value objects and DTOs."""

-import json
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum, IntEnum, StrEnum
@@ -18,12 +17,27 @@ class SourceType(IntEnum):
     GIT = 2


-class SnippetContentType(
+class SnippetContentType(StrEnum):
     """Type of snippet content."""

-    UNKNOWN =
-    ORIGINAL =
-    SUMMARY =
+    UNKNOWN = "unknown"
+    ORIGINAL = "original"
+    SUMMARY = "summary"
+
+
+class EnrichmentType(StrEnum):
+    """Type of enrichment."""
+
+    UNKNOWN = "unknown"
+    SUMMARIZATION = "summarization"
+
+
+@dataclass(frozen=True)
+class Enrichment:
+    """Enrichment domain value object."""
+
+    type: EnrichmentType
+    content: str


 class SnippetContent(BaseModel):
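As an illustrative aside (not part of the diff), the new Enrichment value object added above can be constructed roughly as follows; the summary text is a made-up placeholder.

from kodit.domain.value_objects import Enrichment, EnrichmentType

# Enrichment is a frozen dataclass, so instances are immutable value objects.
summary = Enrichment(
    type=EnrichmentType.SUMMARIZATION,
    content="Summarises what the snippet does.",  # placeholder text
)
print(summary.type, len(summary.content))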
@@ -31,7 +45,6 @@ class SnippetContent(BaseModel):

     type: SnippetContentType
     value: str
-    language: str


 class SnippetSearchResult(BaseModel):
@@ -138,7 +151,7 @@ class SearchType(Enum):
 class Document:
     """Generic document model for indexing."""

-    snippet_id:
+    snippet_id: str
     text: str


@@ -146,7 +159,7 @@
 class DocumentSearchResult:
     """Generic document search result model."""

-    snippet_id:
+    snippet_id: str
     score: float


@@ -154,7 +167,7 @@
 class SearchResult:
     """Generic search result model."""

-    snippet_id:
+    snippet_id: str
     score: float


@@ -171,21 +184,21 @@ class SearchRequest:

     query: str
     top_k: int = 10
-    snippet_ids: list[
+    snippet_ids: list[str] | None = None


 @dataclass
 class DeleteRequest:
     """Generic deletion request."""

-    snippet_ids: list[
+    snippet_ids: list[str]


 @dataclass
 class IndexResult:
     """Generic indexing result."""

-    snippet_id:
+    snippet_id: str


 @dataclass(frozen=True)
@@ -271,98 +284,11 @@ class MultiSearchRequest:
     filters: SnippetSearchFilters | None = None


-@dataclass
-class MultiSearchResult:
-    """Enhanced search result with comprehensive snippet metadata."""
-
-    id: int
-    content: str
-    original_scores: list[float]
-    source_uri: str
-    relative_path: str
-    language: str
-    authors: list[str]
-    created_at: datetime
-    summary: str
-
-    def __str__(self) -> str:
-        """Return enhanced formatted string representation."""
-        lines = [
-            "---",
-            f"id: {self.id}",
-            f"source: {self.source_uri}",
-            f"path: {self.relative_path}",
-            f"lang: {self.language}",
-            f"created: {self.created_at.isoformat()}",
-            f"authors: {', '.join(self.authors)}",
-            f"scores: {self.original_scores}",
-            "---",
-            f"{self.summary}\n",
-            f"```{self.language}",
-            f"{self.content}",
-            "```\n",
-        ]
-        return "\n".join(lines)
-
-    def to_json(self) -> str:
-        """Return LLM-optimized JSON representation following the compact schema."""
-        json_obj = {
-            "id": self.id,
-            "source": self.source_uri,
-            "path": self.relative_path,
-            "lang": self.language.lower(),
-            "created": self.created_at.isoformat() if self.created_at else "",
-            "author": ", ".join(self.authors),
-            "score": self.original_scores,
-            "code": self.content,
-            "summary": self.summary,
-        }
-
-        return json.dumps(json_obj, separators=(",", ":"))
-
-    @classmethod
-    def to_jsonlines(cls, results: list["MultiSearchResult"]) -> str:
-        """Convert multiple MultiSearchResult objects to JSON Lines format.
-
-        Args:
-            results: List of MultiSearchResult objects
-            include_summary: Whether to include summary fields
-
-        Returns:
-            JSON Lines string (one JSON object per line)
-
-        """
-        return "\n".join(result.to_json() for result in results)
-
-    @classmethod
-    def to_string(cls, results: list["MultiSearchResult"]) -> str:
-        """Convert multiple MultiSearchResult objects to a string."""
-        return "\n\n".join(str(result) for result in results)
-
-    @staticmethod
-    def calculate_relative_path(file_path: str, source_path: str) -> str:
-        """Calculate relative path from source root."""
-        try:
-            return str(Path(file_path).relative_to(Path(source_path)))
-        except ValueError:
-            # If file_path is not relative to source_path, return the file name
-            return Path(file_path).name
-
-    @staticmethod
-    def detect_language_from_extension(extension: str) -> str:
-        """Detect programming language from file extension."""
-        try:
-            return LanguageMapping.get_language_for_extension(extension).title()
-        except ValueError:
-            # Unknown extension, return a default
-            return "Unknown"
-
-
 @dataclass
 class FusionRequest:
     """Domain model for fusion request."""

-    id:
+    id: str
     score: float


@@ -370,7 +296,7 @@ class FusionRequest:
 class FusionResult:
     """Domain model for fusion result."""

-    id:
+    id: str
     score: float
     original_scores: list[float]

@@ -408,7 +334,7 @@ class ProgressState:
 class EmbeddingRequest:
     """Domain model for embedding request."""

-    snippet_id:
+    snippet_id: str
     text: str


@@ -416,7 +342,7 @@ class EmbeddingRequest:
 class EmbeddingResponse:
     """Domain model for embedding response."""

-    snippet_id:
+    snippet_id: str
     embedding: list[float]


@@ -424,7 +350,7 @@ class EmbeddingResponse:
 class EnrichmentRequest:
     """Domain model for enrichment request."""

-    snippet_id:
+    snippet_id: str
     text: str


@@ -432,7 +358,7 @@ class EnrichmentRequest:
 class EnrichmentResponse:
     """Domain model for enrichment response."""

-    snippet_id:
+    snippet_id: str
     text: str


@@ -651,22 +577,14 @@ class FunctionDefinition:
     end_byte: int


-class TaskType(Enum):
-    """Task type."""
-
-    INDEX_UPDATE = 1
-
-
 class QueuePriority(IntEnum):
     """Queue priority."""

     BACKGROUND = 10
+    NORMAL = 20
     USER_INITIATED = 50


-# Reporting value objects
-
-
 class ReportingState(StrEnum):
     """Reporting state."""

@@ -690,6 +608,8 @@ class TrackableType(StrEnum):
     """Trackable type."""

     INDEX = "indexes"
+    KODIT_REPOSITORY = "kodit.repository"
+    KODIT_COMMIT = "kodit.commit"


 class TaskOperation(StrEnum):
@@ -707,3 +627,52 @@ class TaskOperation(StrEnum):
     CREATE_TEXT_EMBEDDINGS = "kodit.index.run.create_text_embeddings"
     UPDATE_INDEX_TIMESTAMP = "kodit.index.run.update_index_timestamp"
     CLEAR_FILE_PROCESSING_STATUSES = "kodit.index.run.clear_file_processing_statuses"
+
+    # New commit-based workflow
+    KODIT_REPOSITORY = "kodit.repository"
+    CREATE_REPOSITORY = "kodit.repository.create"
+    DELETE_REPOSITORY = "kodit.repository.delete"
+    CLONE_REPOSITORY = "kodit.repository.clone"
+    SCAN_REPOSITORY = "kodit.repository.scan"
+    KODIT_COMMIT = "kodit.commit"
+    EXTRACT_SNIPPETS_FOR_COMMIT = "kodit.commit.extract_snippets"
+    CREATE_BM25_INDEX_FOR_COMMIT = "kodit.commit.create_bm25_index"
+    CREATE_CODE_EMBEDDINGS_FOR_COMMIT = "kodit.commit.create_code_embeddings"
+    CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT = "kodit.commit.create_summary_enrichment"
+    CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT = "kodit.commit.create_summary_embeddings"
+
+    def is_repository_operation(self) -> bool:
+        """Check if the task operation is a repository operation."""
+        return self.startswith("kodit.repository.")
+
+    def is_commit_operation(self) -> bool:
+        """Check if the task operation is a commit operation."""
+        return self.startswith("kodit.commit.")
+
+
+class PrescribedOperations:
+    """Prescribed common operations."""
+
+    CREATE_NEW_REPOSITORY: ClassVar[list[TaskOperation]] = [
+        TaskOperation.CLONE_REPOSITORY,
+        TaskOperation.SCAN_REPOSITORY,
+    ]
+    INDEX_COMMIT: ClassVar[list[TaskOperation]] = [
+        TaskOperation.EXTRACT_SNIPPETS_FOR_COMMIT,
+        TaskOperation.CREATE_BM25_INDEX_FOR_COMMIT,
+        TaskOperation.CREATE_CODE_EMBEDDINGS_FOR_COMMIT,
+        TaskOperation.CREATE_SUMMARY_ENRICHMENT_FOR_COMMIT,
+        TaskOperation.CREATE_SUMMARY_EMBEDDINGS_FOR_COMMIT,
+    ]
+    SYNC_REPOSITORY: ClassVar[list[TaskOperation]] = [
+        TaskOperation.SCAN_REPOSITORY,
+    ]
+
+
+class IndexStatus(StrEnum):
+    """Status of commit indexing."""
+
+    PENDING = "pending"
+    IN_PROGRESS = "in_progress"
+    COMPLETED = "completed"
+    FAILED = "failed"
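As a small illustration (not part of the diff), the new operation helpers and prescribed operation lists shown above can be exercised like this; the printed output is only indicative.

from kodit.domain.value_objects import PrescribedOperations, TaskOperation

# The prescribed list groups the per-commit indexing steps in order.
for op in PrescribedOperations.INDEX_COMMIT:
    assert op.is_commit_operation()
    print(op.value)

# The helpers split the namespace between repository- and commit-level work.
print(TaskOperation.CLONE_REPOSITORY.is_repository_operation())  # True
print(TaskOperation.CLONE_REPOSITORY.is_commit_operation())  # False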
kodit/infrastructure/api/client/__init__.py
CHANGED
@@ -2,13 +2,11 @@

 from .base import BaseAPIClient
 from .exceptions import AuthenticationError, KoditAPIError
-from .index_client import IndexClient
 from .search_client import SearchClient

 __all__ = [
     "AuthenticationError",
     "BaseAPIClient",
-    "IndexClient",
     "KoditAPIError",
     "SearchClient",
 ]