coderay 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderay/__init__.py +1 -0
- coderay/chunking/__init__.py +0 -0
- coderay/chunking/chunker.py +127 -0
- coderay/chunking/registry.py +190 -0
- coderay/cli/__init__.py +3 -0
- coderay/cli/commands.py +475 -0
- coderay/core/__init__.py +0 -0
- coderay/core/config.py +73 -0
- coderay/core/lock.py +36 -0
- coderay/core/models.py +71 -0
- coderay/core/timing.py +45 -0
- coderay/core/utils.py +35 -0
- coderay/embedding/__init__.py +0 -0
- coderay/embedding/base.py +60 -0
- coderay/embedding/local.py +68 -0
- coderay/embedding/openai.py +87 -0
- coderay/graph/__init__.py +19 -0
- coderay/graph/builder.py +128 -0
- coderay/graph/code_graph.py +311 -0
- coderay/graph/extractor.py +315 -0
- coderay/mcp_server/__init__.py +0 -0
- coderay/mcp_server/server.py +178 -0
- coderay/pipeline/__init__.py +0 -0
- coderay/pipeline/indexer.py +417 -0
- coderay/pipeline/watcher.py +318 -0
- coderay/retrieval/__init__.py +3 -0
- coderay/retrieval/boosting.py +80 -0
- coderay/retrieval/search.py +121 -0
- coderay/skeleton/__init__.py +0 -0
- coderay/skeleton/extractor.py +140 -0
- coderay/state/__init__.py +8 -0
- coderay/state/machine.py +242 -0
- coderay/state/version.py +47 -0
- coderay/storage/__init__.py +0 -0
- coderay/storage/lancedb.py +268 -0
- coderay/vcs/__init__.py +0 -0
- coderay/vcs/git.py +193 -0
- coderay-1.0.0.dist-info/METADATA +145 -0
- coderay-1.0.0.dist-info/RECORD +42 -0
- coderay-1.0.0.dist-info/WHEEL +5 -0
- coderay-1.0.0.dist-info/entry_points.txt +3 -0
- coderay-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from mcp.server.fastmcp import FastMCP
|
|
9
|
+
|
|
10
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# FastMCP server instance; the @mcp.tool() functions in this module
# register themselves on it and main() runs it.
mcp = FastMCP("coderay")

# Index directory used when a caller does not supply one.
DEFAULT_INDEX_DIR = ".index"

# Per-process caches, keyed by the resolved (absolute) index directory.
_retrieval_cache: dict[Path, Any] = {}
_state_machine_cache: dict[Path, Any] = {}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _resolve_index_dir(index_dir: str | None = None) -> Path:
|
|
21
|
+
"""Resolve the index directory to an absolute path."""
|
|
22
|
+
return Path(index_dir or DEFAULT_INDEX_DIR).resolve()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_retrieval(index_dir: str | None = None):
    """Return the Retrieval object for an index directory, creating it once.

    Instances are stored in ``_retrieval_cache`` under the resolved index
    path, so repeated calls for the same directory share one object.
    """
    key = _resolve_index_dir(index_dir)
    if key in _retrieval_cache:
        return _retrieval_cache[key]

    # Imported only on a cache miss, as in the rest of this module.
    from coderay.retrieval.search import Retrieval

    instance = Retrieval(key)
    _retrieval_cache[key] = instance
    return instance
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _load_graph(index_dir: str | None = None):
    """Load the code graph stored under the index directory.

    Returns whatever ``load_graph`` returns for the resolved directory;
    callers in this module treat ``None`` as "no graph available".
    """
    from coderay.graph.builder import load_graph

    graph_dir = _resolve_index_dir(index_dir)
    return load_graph(graph_dir)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _get_state_machine(index_dir: str | None = None):
    """Return the StateMachine for an index directory, creating it once.

    Instances are stored in ``_state_machine_cache`` under the resolved
    index path, so repeated calls for the same directory share one object.
    """
    key = _resolve_index_dir(index_dir)
    if key in _state_machine_cache:
        return _state_machine_cache[key]

    # Imported only on a cache miss, as in the rest of this module.
    from coderay.state.machine import StateMachine

    machine = StateMachine(key)
    _state_machine_cache[key] = machine
    return machine
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _load_state(index_dir: str | None = None):
    """Return the ``current_state`` of the cached state machine.

    Callers in this module treat ``None`` as "no index state available".
    """
    machine = _get_state_machine(index_dir)
    return machine.current_state
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# MCP tool. Returns a JSON string: either {"error": ...} or an object with
# "score_type", "score_description" and "results".
@mcp.tool()
def semantic_search(
    query: str,
    top_k: int = 10,
    path_prefix: str | None = None,
    language: str | None = None,
    index_dir: str | None = None,
) -> str:
    """Search code by meaning."""
    engine = _get_retrieval(index_dir)
    index_state = _load_state(index_dir)
    if index_state is None:
        failure = {"error": "No index state found. Run 'coderay build' first."}
        return json.dumps(failure)

    filters = {"top_k": top_k, "path_prefix": path_prefix, "language": language}
    try:
        hits = engine.search(query, index_state, **filters)
    except RuntimeError as exc:
        # Only RuntimeError is reported as a JSON error; anything else
        # propagates to the caller.
        return json.dumps({"error": str(exc)})

    # The first hit decides the reported score type; no hits means "cosine".
    score_type = "cosine"
    if hits:
        score_type = hits[0].get("score_type", "cosine")

    if score_type == "cosine":
        description = "cosine similarity (0-1, higher = more similar)"
    else:
        description = (
            "RRF rank fusion (higher = more relevant, scale differs from cosine)"
        )

    payload = {
        "score_type": score_type,
        "score_description": description,
        "results": hits,
    }
    # default=str stringifies any value json cannot encode natively.
    return json.dumps(payload, default=str)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# MCP tool. Returns the result of extract_skeleton for the file, or a JSON
# {"error": ...} string when the path is not an existing regular file.
@mcp.tool()
def get_file_skeleton(file_path: str) -> str:
    """Get the API surface of a file (signatures, no bodies)."""
    from coderay.skeleton.extractor import extract_skeleton

    target = Path(file_path)
    if target.is_file():
        # Undecodable bytes are replaced rather than raising.
        source_text = target.read_text(encoding="utf-8", errors="replace")
        return extract_skeleton(target, source_text)
    return json.dumps({"error": f"File not found: {file_path}"})
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
_STATIC_ANALYSIS_NOTE = (
|
|
108
|
+
"Based on static analysis of source code. Calls through dependency "
|
|
109
|
+
"injection, interfaces, dynamic dispatch (getattr), decorators, or "
|
|
110
|
+
"framework routing may not be detected."
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# MCP tool. Returns a JSON string: either {"error": ...} when no graph is
# loaded, or {"results": [...], "note": ...}.
@mcp.tool()
def get_impact_radius(
    node_id: str,
    max_depth: int = 3,
    index_dir: str | None = None,
) -> str:
    """Analyze the blast radius of changing a function or module."""
    code_graph = _load_graph(index_dir)
    if code_graph is None:
        return json.dumps({"error": "No graph found. Run 'coderay build' first."})

    affected = code_graph.get_impact_radius(node_id, depth=max_depth)
    serialized = [node.to_dict() for node in affected]
    payload = {"results": serialized, "note": _STATIC_ANALYSIS_NOTE}
    return json.dumps(payload)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# MCP tool. Returns a JSON string: {"status": "no_index", ...} when no state
# is available, otherwise a report with status, branch, last_commit,
# chunk_count, schema_version and has_store.
@mcp.tool()
def index_status(index_dir: str | None = None) -> str:
    """Check health and status of the semantic index."""
    meta = _load_state(index_dir)
    if meta is None:
        return json.dumps({"status": "no_index", "message": "No index found."})

    from coderay.core.config import get_embedding_dimensions, load_config
    from coderay.state.version import read_index_version
    from coderay.storage.lancedb import index_exists as idx_exists

    resolved = _resolve_index_dir(index_dir)
    store_present = idx_exists(resolved)
    if store_present:
        from coderay.storage.lancedb import Store

        # The store is only opened when it exists on disk.
        dimensions = get_embedding_dimensions(load_config(resolved))
        total_chunks = Store(resolved, dimensions=dimensions).chunk_count()
    else:
        total_chunks = 0

    report = {
        "status": meta.state.value,
        "branch": meta.branch,
        "last_commit": meta.last_commit,
        "chunk_count": total_chunks,
        "schema_version": read_index_version(resolved),
        "has_store": store_present,
    }
    # default=str stringifies any value json cannot encode natively.
    return json.dumps(report, default=str)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def main():
    """Entry point for the coderay-mcp command."""
    import sys

    # "--sse" anywhere on the command line selects the SSE transport;
    # otherwise the server runs over stdio.
    transport = "sse" if "--sse" in sys.argv else "stdio"
    mcp.run(transport=transport)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from coderay.chunking.chunker import chunk_file
|
|
9
|
+
from coderay.core.config import get_embedding_dimensions, load_config
|
|
10
|
+
from coderay.core.timing import timed, timed_phase
|
|
11
|
+
from coderay.core.utils import files_with_changed_content, hash_content, read_from_path
|
|
12
|
+
from coderay.embedding.base import Embedder, load_embedder_from_config
|
|
13
|
+
from coderay.graph.builder import build_and_save_graph
|
|
14
|
+
from coderay.state.machine import IndexMeta, StateMachine
|
|
15
|
+
from coderay.state.version import check_index_version, write_index_version
|
|
16
|
+
from coderay.storage.lancedb import Store, index_exists
|
|
17
|
+
from coderay.vcs.git import Git
|
|
18
|
+
|
|
19
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Number of files handed to the pipeline per batch; progress is saved after
# each batch so an interrupted run can be resumed.
RESUME_BATCH_SIZE = 200

# Defaults for the Indexer constructor.
DEFAULT_REPO_ROOT = "."
DEFAULT_INDEX_DIR = ".index"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class IndexResult:
    """Counters reported by an index build or update."""

    # Each counter defaults to zero; callers set only the ones that apply.
    cached: int = 0
    updated: int = 0
    removed: int = 0

    def __str__(self) -> str:
        """Render the three counters as a one-line summary."""
        summary = f"Cached: {self.cached}, Updated: {self.updated},"
        summary += f" Removed: {self.removed} chunks"
        return summary
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class Indexer:
    """Builds and maintains the semantic index for a repository.

    Combines git-based file discovery, chunking, embedding, the chunk store,
    the persisted run state, and the code graph.
    """

    def __init__(
        self,
        repo_root: str | Path = DEFAULT_REPO_ROOT,
        index_dir: str | Path = DEFAULT_INDEX_DIR,
        config: dict[str, Any] | None = None,
        embedder: Embedder | None = None,
    ) -> None:
        """Initialize the indexer.

        Args:
            repo_root: Root of the repository to index.
            index_dir: Directory holding the store, state, and graph.
            config: Configuration mapping; loaded from ``index_dir`` when
                falsy (``None`` or empty).
            embedder: Embedder to use; built from the config when falsy.
        """
        self._repo_root = Path(repo_root)
        self._index_dir = Path(index_dir)
        self._config = config or load_config(self._index_dir)
        self._git = Git(self._repo_root)
        self._state = StateMachine(self._index_dir)
        self._embedder = embedder or load_embedder_from_config(self._config)
        self._store = Store(
            self._index_dir, dimensions=get_embedding_dimensions(self._config)
        )
        check_index_version(self._index_dir)

    @property
    def config(self) -> dict[str, Any]:
        """Current config (embedder, index settings)."""
        return self._config

    @property
    def repo_root(self) -> Path:
        """Repository root path."""
        return self._repo_root

    @property
    def index_dir(self) -> Path:
        """Index directory path."""
        return self._index_dir

    @property
    def current_state(self) -> IndexMeta | None:
        """Current index meta state; None if no run has completed."""
        return self._state.current_state

    @timed("full_build")
    def build_full(self) -> IndexResult:
        """Full rebuild: discover, chunk, embed, and store all source files.

        Delegates to :meth:`update_incremental` when git reports a branch
        switch relative to the saved state. Resumes an interrupted run when
        the state machine reports one in progress with partial progress;
        otherwise clears the store and starts from scratch.

        Returns:
            Counters describing the run.
        """
        current = self._state.current_state
        last_branch = current.branch if current is not None else None
        if self._git.is_branch_switched(last_branch=last_branch):
            return self.update_incremental()

        current_run = current.current_run if current else None
        saved_paths = current_run.paths_to_process if current_run else []
        processed_count = current_run.processed_count if current_run else 0

        # The branch-switch case has already returned above, so resuming
        # depends only on the state machine.
        can_resume = self._state.is_in_progress and self._state.has_partial_progress

        if can_resume:
            paths_remaining = saved_paths[processed_count:]
            if not paths_remaining:
                # Every saved path was already processed; just close the run.
                self._state.finish(
                    last_commit=self._git.get_head_commit(),
                    branch=self._git.get_current_branch(),
                )
                write_index_version(self._index_dir)
                self._refresh_graph()
                return IndexResult(cached=len(self._state.file_hashes))
            paths_to_process = paths_remaining
            rel_paths = saved_paths
        else:
            self._state.set_incomplete()
            self._store.clear()

            py_files = self._git.discover_files()
            if not py_files:
                # NOTE(review): this returns without calling finish(), after
                # the store was cleared - confirm that is intended.
                logger.warning("No source files found under %s", self._repo_root)
                return IndexResult(cached=len(self._state.file_hashes))

            rel_paths = [str(p.relative_to(self._repo_root)) for p in py_files]
            paths_to_process = rel_paths
            processed_count = 0
            self._state.start(
                branch=self._git.get_current_branch(),
                last_commit=self._git.get_head_commit(),
            )

        # start_offset keeps the saved progress relative to the full path
        # list, so a second interruption resumes after the files handled in
        # both runs instead of restarting the count from zero.
        all_path_hashes = self._run_batch_loop(
            rel_paths=paths_to_process,
            full_rel_paths=rel_paths,
            start_offset=processed_count,
        )

        if can_resume:
            # Keep hashes recorded before the interruption and add new ones.
            existing = self._state.file_hashes.copy()
            existing.update(all_path_hashes)
            self._state.file_hashes = existing
        else:
            self._state.file_hashes = all_path_hashes

        self._state.finish(
            last_commit=self._git.get_head_commit(),
            branch=self._git.get_current_branch(),
        )
        write_index_version(self._index_dir)
        self._refresh_graph()
        return IndexResult(updated=len(self._state.file_hashes))

    def update_incremental(self) -> IndexResult:
        """Incremental update: re-index only changed, added, or deleted files.

        Returns:
            Counters describing the run.
        """
        self._state.set_incomplete()

        current = self._state.current_state
        state_branch = current.branch if current else None
        active_branch = self._git.get_current_branch()

        if self._git.is_branch_switched(last_branch=state_branch):
            logger.info(
                "Branch switched %s -> %s; syncing index",
                state_branch,
                active_branch,
            )
            return self._sync_after_branch_switch()

        to_add, to_remove = self._git.get_files_to_index(
            last_commit=current.last_commit if current else None
        )
        if to_remove:
            self._store.delete_by_paths(paths=to_remove)

        # Remove deleted hashes from state
        file_hashes = self._state.file_hashes.copy()
        for path in to_remove:
            file_hashes.pop(path, None)
        # Idempotent if to_remove is empty
        self._state.file_hashes = file_hashes

        # Check what files are changed
        changed_files = files_with_changed_content(
            repo=self._repo_root, paths=to_add, file_hashes=file_hashes
        )

        if not changed_files and not to_remove:
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            self._refresh_graph()
            logger.info("Nothing to update")
            return IndexResult(cached=len(self._state.file_hashes))

        return self._update(
            paths_to_add=changed_files,
            file_hashes=file_hashes,
        )

    def _sync_after_branch_switch(self) -> IndexResult:
        """Sync index to current branch after a switch. Returns IndexResult."""
        file_hashes = self._state.file_hashes.copy()
        py_files = self._git.discover_files()

        if not py_files and file_hashes:
            # No source files were discovered: drop everything indexed so far.
            self._store.delete_by_paths(list(file_hashes))
            index_result = IndexResult(removed=len(file_hashes))
            file_hashes.clear()

            self._state.file_hashes = file_hashes
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            self._refresh_graph()
            return index_result

        rel_paths_current = {str(p.relative_to(self._repo_root)) for p in py_files}
        # Deleted files on current branch
        to_remove = [p for p in file_hashes if p not in rel_paths_current]
        if to_remove:
            self._store.delete_by_paths(to_remove)
            for p in to_remove:
                file_hashes.pop(p, None)

        changed_files = files_with_changed_content(
            repo=self._repo_root, paths=py_files, file_hashes=file_hashes
        )

        if not changed_files and not to_remove:
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            self._refresh_graph()
            logger.info("Branch switch: index already in sync (no changes)")
            return IndexResult(cached=len(self._state.file_hashes))

        return self._update(
            paths_to_add=changed_files,
            file_hashes=file_hashes,
        )

    def _run_batch_loop(
        self,
        rel_paths: list[str],
        full_rel_paths: list[str],
        start_offset: int = 0,
    ) -> dict[str, str]:
        """Run the pipeline in batches and save progress for resume.

        Args:
            rel_paths: Paths to process in this call.
            full_rel_paths: Complete path list of the run, passed along with
                every progress save.
            start_offset: Number of entries of ``full_rel_paths`` already
                processed before this call (non-zero when resuming), so the
                saved ``processed_count`` is relative to ``full_rel_paths``.

        Returns:
            Mapping of path to content hash, merged over all batches.
        """
        all_path_hashes: dict[str, str] = {}

        for i in range(0, len(rel_paths), RESUME_BATCH_SIZE):
            batch = rel_paths[i : i + RESUME_BATCH_SIZE]
            all_path_hashes.update(self._run_pipeline(rel_paths=batch))
            self._state.save_progress(
                full_rel_paths=full_rel_paths,
                processed_count=start_offset + i + len(batch),
            )

        return all_path_hashes

    def _update(
        self,
        paths_to_add: list[Path],
        file_hashes: dict[str, str],
    ) -> IndexResult:
        """Run pipeline over paths, update hashes and state, then finish.

        Args:
            paths_to_add: Files to (re-)index; made relative to the repo root.
            file_hashes: Hash mapping to update in place and store as state.

        Returns:
            ``IndexResult`` whose ``updated`` is the number of hashed files.
        """
        rel_paths = [str(p.relative_to(self._repo_root)) for p in paths_to_add]
        self._state.start(
            branch=self._git.get_current_branch(),
            last_commit=self._git.get_head_commit(),
        )

        batch_hashes = self._run_batch_loop(
            rel_paths=rel_paths,
            full_rel_paths=rel_paths,
        )
        file_hashes.update(batch_hashes)
        self._state.file_hashes = file_hashes
        self._state.finish(
            last_commit=self._git.get_head_commit(),
            branch=self._git.get_current_branch(),
        )
        self._refresh_graph(changed_paths=rel_paths)
        return IndexResult(updated=len(batch_hashes))

    @timed("pipeline")
    def _run_pipeline(
        self,
        rel_paths: list[str],
    ) -> dict[str, str]:
        """Chunk, embed, and store the given files.

        Paths that are not files or cannot be read are logged and skipped.

        Returns:
            Mapping of path to content hash for every file that was read.
        """
        files_content: list[tuple[str, str]] = []

        with timed_phase("read"):
            for p in rel_paths:
                path = self._repo_root / p if not Path(p).is_absolute() else Path(p)
                if not path.is_file():
                    logger.warning("Skip (not a file): %s", p)
                    continue
                try:
                    content = read_from_path(path)
                    files_content.append((p, content))
                except Exception as e:
                    # Best effort: one unreadable file must not abort a batch.
                    logger.warning("Skip (read failed) %s: %s", p, e)

        if not files_content:
            return {}

        path_hashes = {p: hash_content(content) for p, content in files_content}

        # Delete existing chunks for these paths before inserting new ones.
        paths_to_replace = list({p for p, _ in files_content})
        self._store.delete_by_paths(paths_to_replace)

        with timed_phase("chunking"):
            chunks = []
            for p, content in files_content:
                chunks.extend(chunk_file(p, content))

        if not chunks:
            logger.info("Pipeline done: 0 chunks in %d files", len(files_content))
            return path_hashes

        texts = [c.content for c in chunks]
        with timed_phase("embedding"):
            embeddings = self._embedder.embed(texts)

        with timed_phase("storing"):
            self._store.insert_chunks(chunks, embeddings)

        logger.info(
            "Pipeline done: %d chunks in %d files", len(chunks), len(files_content)
        )
        return path_hashes

    def _refresh_graph(self, changed_paths: list[str] | None = None) -> None:
        """Rebuild and save the code graph, logging warnings on failure.

        Args:
            changed_paths: Passed through to ``build_and_save_graph``.
        """
        try:
            build_and_save_graph(
                self._repo_root,
                self._index_dir,
                changed_paths=changed_paths,
            )
        except Exception as e:
            # A failed graph refresh is logged but does not fail the index run.
            logger.warning("Graph refresh failed: %s", e)

    def update_paths(
        self,
        changed: list[str],
        removed: list[str] | None = None,
    ) -> IndexResult:
        """Update the index for explicit file paths (used by the file watcher).

        Args:
            changed: Paths, joined onto the repo root, to re-index if their
                content changed.
            removed: Paths whose chunks and hashes are dropped.

        Returns:
            Counters describing the run.
        """
        self._state.set_incomplete()
        file_hashes = self._state.file_hashes.copy()

        removed = removed or []
        if removed:
            self._store.delete_by_paths(removed)
            for p in removed:
                file_hashes.pop(p, None)
            self._state.file_hashes = file_hashes

        if not changed:
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            if removed:
                self._refresh_graph()
            return IndexResult(removed=len(removed))

        paths_to_add = [self._repo_root / p for p in changed]
        existing = files_with_changed_content(
            repo=self._repo_root, paths=paths_to_add, file_hashes=file_hashes
        )

        if not existing and not removed:
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            return IndexResult(cached=len(file_hashes))

        if existing:
            result = self._update(
                paths_to_add=existing,
                file_hashes=file_hashes,
            )
            result.removed = len(removed)
            return result

        # Only removals happened: close the run and rebuild the graph.
        self._state.finish(
            last_commit=self._git.get_head_commit(),
            branch=self._git.get_current_branch(),
        )
        self._refresh_graph()
        return IndexResult(removed=len(removed))

    def maintain(self) -> dict[str, Any]:
        """Run store maintenance (compact and cleanup).

        Returns an empty dict when no index exists at ``index_dir``.
        """
        if not index_exists(self._index_dir):
            return {}
        return self._store.maintain()

    def index_exists(self) -> bool:
        """Return True if the index exists at index_dir."""
        # Inside a method the bare name resolves to the module-level
        # index_exists imported from the storage module, not this method.
        return index_exists(self._index_dir)

    def error(self, exc: str) -> None:
        """Mark the current run as errored with the given exception message."""
        self._state.set_errored(exc=exc)
|