coderay 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,178 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from mcp.server.fastmcp import FastMCP
9
+
10
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# FastMCP server instance; the tool functions below register on it through
# the @mcp.tool() decorator and main() starts it with mcp.run().
mcp = FastMCP("coderay")

# Index directory used by _resolve_index_dir when a caller passes no index_dir.
DEFAULT_INDEX_DIR = ".index"

# In-process caches keyed by the resolved (absolute) index directory, filled
# lazily by _get_retrieval and _get_state_machine. Entries are never evicted
# anywhere in this module.
_retrieval_cache: dict[Path, Any] = {}
_state_machine_cache: dict[Path, Any] = {}
18
+
19
+
20
+ def _resolve_index_dir(index_dir: str | None = None) -> Path:
21
+ """Resolve the index directory to an absolute path."""
22
+ return Path(index_dir or DEFAULT_INDEX_DIR).resolve()
23
+
24
+
25
def _get_retrieval(index_dir: str | None = None):
    """Fetch the Retrieval object for an index directory, creating it once.

    Instances are memoized in ``_retrieval_cache`` under the resolved
    directory path, so repeated calls for the same directory share one object.
    """
    cache_key = _resolve_index_dir(index_dir)
    if cache_key in _retrieval_cache:
        return _retrieval_cache[cache_key]

    # Imported lazily, only on a cache miss.
    from coderay.retrieval.search import Retrieval

    instance = Retrieval(cache_key)
    _retrieval_cache[cache_key] = instance
    return instance
33
+
34
+
35
def _load_graph(index_dir: str | None = None):
    """Read the code graph stored under the index directory.

    Callers in this module treat a ``None`` result as "no graph available".
    """
    from coderay.graph.builder import load_graph

    resolved_dir = _resolve_index_dir(index_dir)
    return load_graph(resolved_dir)
40
+
41
+
42
def _get_state_machine(index_dir: str | None = None):
    """Fetch the StateMachine for an index directory, creating it once.

    Instances are memoized in ``_state_machine_cache`` under the resolved
    directory path.
    """
    cache_key = _resolve_index_dir(index_dir)
    if cache_key in _state_machine_cache:
        return _state_machine_cache[cache_key]

    # Imported lazily, only on a cache miss.
    from coderay.state.machine import StateMachine

    machine = StateMachine(cache_key)
    _state_machine_cache[cache_key] = machine
    return machine
50
+
51
+
52
def _load_state(index_dir: str | None = None):
    """Return ``current_state`` of the cached StateMachine for ``index_dir``.

    Callers in this module treat ``None`` as "no index has been built yet".
    """
    machine = _get_state_machine(index_dir)
    return machine.current_state
55
+
56
+
57
@mcp.tool()
def semantic_search(
    query: str,
    top_k: int = 10,
    path_prefix: str | None = None,
    language: str | None = None,
    index_dir: str | None = None,
) -> str:
    """Search code by meaning."""
    # NOTE(review): the docstring above is deliberately left verbatim; the
    # @mcp.tool() decorator may publish it to clients as the tool description.
    #
    # Returns a JSON string: either {"error": ...} or an object with
    # "score_type", "score_description" and "results".
    searcher = _get_retrieval(index_dir)
    index_state = _load_state(index_dir)
    if index_state is None:
        return json.dumps({"error": "No index state found. Run 'coderay build' first."})

    try:
        hits = searcher.search(
            query,
            index_state,
            top_k=top_k,
            path_prefix=path_prefix,
            language=language,
        )
    except RuntimeError as exc:
        # Search failures of this type are reported to the client, not raised.
        return json.dumps({"error": str(exc)})

    # The first hit decides the score type; no hits means "cosine".
    score_type = "cosine"
    if hits:
        score_type = hits[0].get("score_type", "cosine")

    if score_type == "cosine":
        description = "cosine similarity (0-1, higher = more similar)"
    else:
        description = (
            "RRF rank fusion (higher = more relevant, scale differs from cosine)"
        )

    payload = {
        "score_type": score_type,
        "score_description": description,
        "results": hits,
    }
    # default=str stringifies any value json cannot encode natively.
    return json.dumps(payload, default=str)
93
+
94
+
95
@mcp.tool()
def get_file_skeleton(file_path: str) -> str:
    """Get the API surface of a file (signatures, no bodies)."""
    # NOTE(review): docstring kept verbatim; @mcp.tool() may expose it to
    # clients as the tool description.
    from coderay.skeleton.extractor import extract_skeleton

    source_path = Path(file_path)
    if source_path.is_file():
        # Undecodable bytes are replaced rather than raising.
        text = source_path.read_text(encoding="utf-8", errors="replace")
        return extract_skeleton(source_path, text)

    # Missing paths and non-files are reported as a JSON error object.
    return json.dumps({"error": f"File not found: {file_path}"})
105
+
106
+
107
# Caveat attached as the "note" field of get_impact_radius responses, telling
# the client which kinds of calls the graph may have missed.
_STATIC_ANALYSIS_NOTE = (
    "Based on static analysis of source code. Calls through dependency "
    "injection, interfaces, dynamic dispatch (getattr), decorators, or "
    "framework routing may not be detected."
)
112
+
113
+
114
@mcp.tool()
def get_impact_radius(
    node_id: str,
    max_depth: int = 3,
    index_dir: str | None = None,
) -> str:
    """Analyze the blast radius of changing a function or module."""
    # NOTE(review): docstring kept verbatim; @mcp.tool() may expose it to
    # clients as the tool description.
    graph = _load_graph(index_dir)
    if graph is None:
        return json.dumps({"error": "No graph found. Run 'coderay build' first."})

    affected = graph.get_impact_radius(node_id, depth=max_depth)
    serialized = []
    for node in affected:
        serialized.append(node.to_dict())

    # The note warns the client about the limits of static analysis.
    payload = {"results": serialized, "note": _STATIC_ANALYSIS_NOTE}
    return json.dumps(payload)
131
+
132
+
133
@mcp.tool()
def index_status(index_dir: str | None = None) -> str:
    """Check health and status of the semantic index."""
    # NOTE(review): docstring kept verbatim; @mcp.tool() may expose it to
    # clients as the tool description.
    index_state = _load_state(index_dir)
    if index_state is None:
        return json.dumps({"status": "no_index", "message": "No index found."})

    # Project imports are deferred until a state is known to exist.
    from coderay.core.config import get_embedding_dimensions, load_config
    from coderay.state.version import read_index_version
    from coderay.storage.lancedb import index_exists as idx_exists

    resolved_dir = _resolve_index_dir(index_dir)
    store_present = idx_exists(resolved_dir)
    if store_present:
        from coderay.storage.lancedb import Store

        cfg = load_config(resolved_dir)
        dims = get_embedding_dimensions(cfg)
        chunk_count = Store(resolved_dir, dimensions=dims).chunk_count()
    else:
        # No store on disk: report zero chunks without opening one.
        chunk_count = 0

    report = {
        "status": index_state.state.value,
        "branch": index_state.branch,
        "last_commit": index_state.last_commit,
        "chunk_count": chunk_count,
        "schema_version": read_index_version(resolved_dir),
        "has_store": store_present,
    }
    return json.dumps(report, default=str)
165
+
166
+
167
def main():
    """Start the MCP server (entry point for the coderay-mcp command).

    Uses the "sse" transport when ``--sse`` appears anywhere in ``sys.argv``,
    otherwise "stdio".
    """
    import sys

    transport = "sse" if "--sse" in sys.argv else "stdio"
    mcp.run(transport=transport)
175
+
176
+
177
# Allow running this module directly as a script; importing it has no effect here.
if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,417 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from coderay.chunking.chunker import chunk_file
9
+ from coderay.core.config import get_embedding_dimensions, load_config
10
+ from coderay.core.timing import timed, timed_phase
11
+ from coderay.core.utils import files_with_changed_content, hash_content, read_from_path
12
+ from coderay.embedding.base import Embedder, load_embedder_from_config
13
+ from coderay.graph.builder import build_and_save_graph
14
+ from coderay.state.machine import IndexMeta, StateMachine
15
+ from coderay.state.version import check_index_version, write_index_version
16
+ from coderay.storage.lancedb import Store, index_exists
17
+ from coderay.vcs.git import Git
18
+
19
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Number of files handed to one pipeline run by Indexer._run_batch_loop;
# progress is saved to the state machine after each batch.
RESUME_BATCH_SIZE = 200
# Defaults for Indexer.__init__'s repo_root and index_dir parameters.
DEFAULT_REPO_ROOT = "."
DEFAULT_INDEX_DIR = ".index"
24
+
25
+
26
@dataclass
class IndexResult:
    """Counters summarizing what an index build or update did."""

    # Count reported when nothing needed re-indexing.
    cached: int = 0
    # Count of entries (re)indexed by the run.
    updated: int = 0
    # Count of entries dropped from the index.
    removed: int = 0

    def __str__(self) -> str:
        """Render the three counters as a one-line summary."""
        parts = (
            f"Cached: {self.cached}",
            f"Updated: {self.updated}",
            f"Removed: {self.removed} chunks",
        )
        return ", ".join(parts)
39
+
40
+
41
class Indexer:
    """Builds and maintains the semantic index for a repository.

    Ties together file discovery (``Git``), persisted run state
    (``StateMachine``), the embedder and the chunk store (``Store``).
    Entry points are ``build_full``, ``update_incremental`` and
    ``update_paths``.
    """

    def __init__(
        self,
        repo_root: str | Path = DEFAULT_REPO_ROOT,
        index_dir: str | Path = DEFAULT_INDEX_DIR,
        config: dict[str, Any] | None = None,
        embedder: Embedder | None = None,
    ) -> None:
        """Initialize the indexer.

        Args:
            repo_root: Root directory of the repository to index.
            index_dir: Directory that holds the index and its state.
            config: Configuration mapping; loaded from ``index_dir`` with
                ``load_config`` when omitted or empty.
            embedder: Embedder to use; built from the config when omitted.
        """
        self._repo_root = Path(repo_root)
        self._index_dir = Path(index_dir)
        self._config = config or load_config(self._index_dir)
        self._git = Git(self._repo_root)
        self._state = StateMachine(self._index_dir)
        self._embedder = embedder or load_embedder_from_config(self._config)
        self._store = Store(
            self._index_dir, dimensions=get_embedding_dimensions(self._config)
        )
        # NOTE(review): presumably validates the on-disk index version --
        # confirm against coderay.state.version.
        check_index_version(self._index_dir)

    @property
    def config(self) -> dict[str, Any]:
        """Current config (embedder, index settings)."""
        return self._config

    @property
    def repo_root(self) -> Path:
        """Repository root path."""
        return self._repo_root

    @property
    def index_dir(self) -> Path:
        """Index directory path."""
        return self._index_dir

    @property
    def current_state(self) -> IndexMeta | None:
        """Current index meta state; None if no run has completed."""
        return self._state.current_state

    @timed("full_build")
    def build_full(self) -> IndexResult:
        """Full rebuild: discover, chunk, embed, and store all source files.

        If the branch changed since the recorded state, this delegates to
        ``update_incremental``. If a previous run is in progress with partial
        progress, the build resumes from the saved position instead of
        starting over.

        Returns:
            An ``IndexResult``; ``updated`` is the number of tracked file
            hashes after a build, ``cached`` is used when nothing was run.
        """
        current = self._state.current_state
        last_branch = current.branch if current is not None else None
        if self._git.is_branch_switched(last_branch=last_branch):
            return self.update_incremental()

        current_run = current.current_run if current else None
        saved_paths = current_run.paths_to_process if current_run else []
        processed_count = current_run.processed_count if current_run else 0

        # The branch-switched case already returned above, so only the state
        # machine decides whether a resume is possible.
        can_resume = self._state.is_in_progress and self._state.has_partial_progress

        if can_resume:
            paths_remaining = saved_paths[processed_count:]
            if not paths_remaining:
                # Everything was processed before the interruption; just close
                # out the run.
                self._state.finish(
                    last_commit=self._git.get_head_commit(),
                    branch=self._git.get_current_branch(),
                )
                write_index_version(self._index_dir)
                self._refresh_graph()
                return IndexResult(cached=len(self._state.file_hashes))
            paths_to_process = paths_remaining
            rel_paths = saved_paths
        else:
            self._state.set_incomplete()
            self._store.clear()

            py_files = self._git.discover_files()
            if not py_files:
                # NOTE(review): this returns after set_incomplete() without a
                # matching finish(); confirm that is the intended end state.
                logger.warning("No source files found under %s", self._repo_root)
                return IndexResult(cached=len(self._state.file_hashes))

            rel_paths = [str(p.relative_to(self._repo_root)) for p in py_files]
            paths_to_process = rel_paths
            processed_count = 0
            self._state.start(
                branch=self._git.get_current_branch(),
                last_commit=self._git.get_head_commit(),
            )

        # processed_count is the index into rel_paths where this run starts
        # (0 for a fresh build). Passing it keeps saved progress expressed in
        # terms of the full list, which is how a later resume reads it back.
        all_path_hashes = self._run_batch_loop(
            rel_paths=paths_to_process,
            full_rel_paths=rel_paths,
            processed_offset=processed_count,
        )

        if can_resume:
            # Keep hashes already recorded and add the newly processed ones.
            existing = self._state.file_hashes.copy()
            existing.update(all_path_hashes)
            self._state.file_hashes = existing
        else:
            self._state.file_hashes = all_path_hashes

        self._state.finish(
            last_commit=self._git.get_head_commit(),
            branch=self._git.get_current_branch(),
        )
        write_index_version(self._index_dir)
        self._refresh_graph()
        return IndexResult(updated=len(self._state.file_hashes))

    def update_incremental(self) -> IndexResult:
        """Incremental update: re-index only changed, added, or deleted files.

        On a branch switch the work is handed to
        ``_sync_after_branch_switch``. Otherwise git supplies the candidate
        files since the last recorded commit, and only files whose content
        hash changed are re-indexed.
        """
        self._state.set_incomplete()

        current = self._state.current_state
        state_branch = current.branch if current else None
        active_branch = self._git.get_current_branch()

        if self._git.is_branch_switched(last_branch=state_branch):
            logger.info(
                "Branch switched %s -> %s; syncing index",
                state_branch,
                active_branch,
            )
            return self._sync_after_branch_switch()

        to_add, to_remove = self._git.get_files_to_index(
            last_commit=current.last_commit if current else None
        )
        if to_remove:
            self._store.delete_by_paths(paths=to_remove)

        # Drop hashes of deleted files; a no-op when to_remove is empty.
        file_hashes = self._state.file_hashes.copy()
        for path in to_remove:
            file_hashes.pop(path, None)
        self._state.file_hashes = file_hashes

        # Narrow the candidates to files whose content actually changed.
        changed_files = files_with_changed_content(
            repo=self._repo_root, paths=to_add, file_hashes=file_hashes
        )

        if not changed_files and not to_remove:
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            self._refresh_graph()
            logger.info("Nothing to update")
            return IndexResult(cached=len(self._state.file_hashes))

        return self._update(
            paths_to_add=changed_files,
            file_hashes=file_hashes,
        )

    def _sync_after_branch_switch(self) -> IndexResult:
        """Sync index to current branch after a switch. Returns IndexResult."""
        file_hashes = self._state.file_hashes.copy()
        py_files = self._git.discover_files()

        # No files discovered at all: every tracked path must go.
        to_remove: list[str] = []
        if not py_files:
            to_remove = list(file_hashes)

        if to_remove:
            self._store.delete_by_paths(to_remove)
            # Captured before the hashes are cleared below.
            index_result = IndexResult(removed=len(file_hashes))
            file_hashes.clear()

            self._state.file_hashes = file_hashes
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            self._refresh_graph()
            return index_result

        rel_paths_current = {str(p.relative_to(self._repo_root)) for p in py_files}
        # Tracked files that no longer exist on the current branch.
        to_remove = [p for p in file_hashes if p not in rel_paths_current]
        if to_remove:
            self._store.delete_by_paths(to_remove)
            for p in to_remove:
                file_hashes.pop(p, None)

        changed_files = files_with_changed_content(
            repo=self._repo_root, paths=py_files, file_hashes=file_hashes
        )

        if not changed_files and not to_remove:
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            self._refresh_graph()
            logger.info("Branch switch: index already in sync (no changes)")
            return IndexResult(cached=len(self._state.file_hashes))

        return self._update(
            paths_to_add=changed_files,
            file_hashes=file_hashes,
        )

    def _run_batch_loop(
        self,
        rel_paths: list[str],
        full_rel_paths: list[str],
        processed_offset: int = 0,
    ) -> dict[str, str]:
        """Run the pipeline in batches and save progress for resume.

        Args:
            rel_paths: Paths to process in this call.
            full_rel_paths: Complete path list of the run, stored with the
                progress so a later resume can slice it.
            processed_offset: Number of entries of ``full_rel_paths`` that
                were already processed before this call. A resumed build
                passes its saved count here so the recorded position keeps
                counting from the start of the full list.

        Returns:
            Mapping of processed path to content hash.
        """
        all_path_hashes: dict[str, str] = {}

        for start in range(0, len(rel_paths), RESUME_BATCH_SIZE):
            batch = rel_paths[start : start + RESUME_BATCH_SIZE]
            all_path_hashes.update(self._run_pipeline(rel_paths=batch))
            self._state.save_progress(
                full_rel_paths=full_rel_paths,
                processed_count=processed_offset + start + len(batch),
            )

        return all_path_hashes

    def _update(
        self,
        paths_to_add: list[Path],
        file_hashes: dict[str, str],
    ) -> IndexResult:
        """Run pipeline over paths, update hashes and state, then finish.

        ``file_hashes`` is updated in place with the new hashes and then
        stored on the state machine.
        """
        rel_paths = [str(p.relative_to(self._repo_root)) for p in paths_to_add]
        self._state.start(
            branch=self._git.get_current_branch(),
            last_commit=self._git.get_head_commit(),
        )

        batch_hashes = self._run_batch_loop(
            rel_paths=rel_paths,
            full_rel_paths=rel_paths,
        )
        file_hashes.update(batch_hashes)
        self._state.file_hashes = file_hashes
        self._state.finish(
            last_commit=self._git.get_head_commit(),
            branch=self._git.get_current_branch(),
        )
        self._refresh_graph(changed_paths=rel_paths)
        return IndexResult(updated=len(batch_hashes))

    @timed("pipeline")
    def _run_pipeline(
        self,
        rel_paths: list[str],
    ) -> dict[str, str]:
        """Chunk, embed, and store the given files.

        Paths that are not files or cannot be read are skipped with a
        warning.

        Returns:
            Mapping of each successfully read path to its content hash.
        """
        files_content: list[tuple[str, str]] = []

        with timed_phase("read"):
            for p in rel_paths:
                path = self._repo_root / p if not Path(p).is_absolute() else Path(p)
                if not path.is_file():
                    logger.warning("Skip (not a file): %s", p)
                    continue
                try:
                    content = read_from_path(path)
                    files_content.append((p, content))
                except Exception as e:
                    # Best effort: one unreadable file must not abort the batch.
                    logger.warning("Skip (read failed) %s: %s", p, e)

        if not files_content:
            return {}

        path_hashes = {p: hash_content(content) for p, content in files_content}

        # Remove any existing chunks for these paths before inserting new ones.
        paths_to_replace = list({p for p, _ in files_content})
        self._store.delete_by_paths(paths_to_replace)

        with timed_phase("chunking"):
            chunks = []
            for p, content in files_content:
                chunks.extend(chunk_file(p, content))

        if not chunks:
            logger.info("Pipeline done: 0 chunks in %d files", len(files_content))
            return path_hashes

        texts = [c.content for c in chunks]
        with timed_phase("embedding"):
            embeddings = self._embedder.embed(texts)

        with timed_phase("storing"):
            self._store.insert_chunks(chunks, embeddings)

        logger.info(
            "Pipeline done: %d chunks in %d files", len(chunks), len(files_content)
        )
        return path_hashes

    def _refresh_graph(self, changed_paths: list[str] | None = None) -> None:
        """Rebuild and save the code graph, logging warnings on failure."""
        try:
            build_and_save_graph(
                self._repo_root,
                self._index_dir,
                changed_paths=changed_paths,
            )
        except Exception as e:
            # Deliberately non-fatal: a graph failure must not fail indexing.
            logger.warning("Graph refresh failed: %s", e)

    def update_paths(
        self,
        changed: list[str],
        removed: list[str] | None = None,
    ) -> IndexResult:
        """Update the index for explicit file paths (used by the file watcher).

        Args:
            changed: Repo-relative paths whose content may have changed.
            removed: Paths to drop from the store and from the tracked hashes.
        """
        self._state.set_incomplete()
        file_hashes = self._state.file_hashes.copy()

        removed = removed or []
        if removed:
            self._store.delete_by_paths(removed)
            for p in removed:
                file_hashes.pop(p, None)
            self._state.file_hashes = file_hashes

        if not changed:
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            if removed:
                self._refresh_graph()
            return IndexResult(removed=len(removed))

        paths_to_add = [self._repo_root / p for p in changed]
        existing = files_with_changed_content(
            repo=self._repo_root, paths=paths_to_add, file_hashes=file_hashes
        )

        if not existing and not removed:
            self._state.finish(
                last_commit=self._git.get_head_commit(),
                branch=self._git.get_current_branch(),
            )
            return IndexResult(cached=len(file_hashes))

        if existing:
            result = self._update(
                paths_to_add=existing,
                file_hashes=file_hashes,
            )
            result.removed = len(removed)
            return result

        # Only removals took effect: nothing to re-index, but the graph is stale.
        self._state.finish(
            last_commit=self._git.get_head_commit(),
            branch=self._git.get_current_branch(),
        )
        self._refresh_graph()
        return IndexResult(removed=len(removed))

    def maintain(self) -> dict[str, Any]:
        """Run store maintenance (compact and cleanup).

        Returns an empty dict when no index exists at ``index_dir``.
        """
        if not index_exists(self._index_dir):
            return {}
        return self._store.maintain()

    def index_exists(self) -> bool:
        """Return True if the index exists at index_dir."""
        # Inside a method body the bare name resolves to the module-level
        # index_exists function, not to this method.
        return index_exists(self._index_dir)

    def error(self, exc: str) -> None:
        """Mark the current run as errored with the given exception message."""
        self._state.set_errored(exc=exc)