coderay 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderay/__init__.py +1 -0
- coderay/chunking/__init__.py +0 -0
- coderay/chunking/chunker.py +127 -0
- coderay/chunking/registry.py +190 -0
- coderay/cli/__init__.py +3 -0
- coderay/cli/commands.py +475 -0
- coderay/core/__init__.py +0 -0
- coderay/core/config.py +73 -0
- coderay/core/lock.py +36 -0
- coderay/core/models.py +71 -0
- coderay/core/timing.py +45 -0
- coderay/core/utils.py +35 -0
- coderay/embedding/__init__.py +0 -0
- coderay/embedding/base.py +60 -0
- coderay/embedding/local.py +68 -0
- coderay/embedding/openai.py +87 -0
- coderay/graph/__init__.py +19 -0
- coderay/graph/builder.py +128 -0
- coderay/graph/code_graph.py +311 -0
- coderay/graph/extractor.py +315 -0
- coderay/mcp_server/__init__.py +0 -0
- coderay/mcp_server/server.py +178 -0
- coderay/pipeline/__init__.py +0 -0
- coderay/pipeline/indexer.py +417 -0
- coderay/pipeline/watcher.py +318 -0
- coderay/retrieval/__init__.py +3 -0
- coderay/retrieval/boosting.py +80 -0
- coderay/retrieval/search.py +121 -0
- coderay/skeleton/__init__.py +0 -0
- coderay/skeleton/extractor.py +140 -0
- coderay/state/__init__.py +8 -0
- coderay/state/machine.py +242 -0
- coderay/state/version.py +47 -0
- coderay/storage/__init__.py +0 -0
- coderay/storage/lancedb.py +268 -0
- coderay/vcs/__init__.py +0 -0
- coderay/vcs/git.py +193 -0
- coderay-1.0.0.dist-info/METADATA +145 -0
- coderay-1.0.0.dist-info/RECORD +42 -0
- coderay-1.0.0.dist-info/WHEEL +5 -0
- coderay-1.0.0.dist-info/entry_points.txt +3 -0
- coderay-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import pathspec
|
|
11
|
+
from watchdog.events import (
|
|
12
|
+
EVENT_TYPE_CREATED,
|
|
13
|
+
EVENT_TYPE_DELETED,
|
|
14
|
+
EVENT_TYPE_MODIFIED,
|
|
15
|
+
EVENT_TYPE_MOVED,
|
|
16
|
+
FileSystemEvent,
|
|
17
|
+
FileSystemMovedEvent,
|
|
18
|
+
)
|
|
19
|
+
from watchdog.observers import Observer
|
|
20
|
+
from watchdog.observers.polling import PollingObserver
|
|
21
|
+
|
|
22
|
+
from coderay.chunking.registry import get_supported_extensions
|
|
23
|
+
from coderay.vcs.git import load_gitignore
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _DebouncedHandler:
    """Accumulates filesystem events and flushes after a quiet window.

    Events are reduced to two sets of repo-relative paths, ``changed`` and
    ``removed``. Every recorded event restarts a timer of ``debounce_seconds``;
    when the timer expires both sets are drained and passed to ``on_batch``.
    """

    def __init__(
        self,
        repo_root: Path,
        index_dir: Path,
        gitignore_spec: pathspec.PathSpec,
        supported_extensions: set[str],
        debounce_seconds: float,
        branch_switch_threshold: int,
        extra_exclude: list[str],
        on_batch: Callable[[set[str], set[str]], None],
    ) -> None:
        """Initialize the debounced event handler.

        Args:
            repo_root: Directory that event paths are made relative to.
            index_dir: Directory whose contents are never reported.
            gitignore_spec: Paths matching this spec are never reported.
            supported_extensions: File suffixes (as returned by
                ``Path.suffix``) that are eligible for reporting.
            debounce_seconds: Length of the quiet window before a flush.
            branch_switch_threshold: Batch size at or above which a
                "branch switch" message is logged before the callback runs.
            extra_exclude: Additional gitignore-style patterns to skip.
            on_batch: Callback invoked as ``on_batch(changed, removed)``.
        """
        self._repo_root = repo_root
        self._index_dir = index_dir
        self._gitignore = gitignore_spec
        self._extensions = supported_extensions
        self._debounce = debounce_seconds
        self._threshold = branch_switch_threshold
        self._on_batch = on_batch

        self._extra_spec = pathspec.PathSpec.from_lines("gitignore", extra_exclude)

        # Guards the two path sets and the timer reference.
        self._lock = threading.Lock()
        self._changed: set[str] = set()
        self._removed: set[str] = set()
        self._timer: threading.Timer | None = None

    # -- public (called from watchdog observer thread) -----------------

    def on_event(self, event: FileSystemEvent) -> None:
        """Handle a single filesystem event from watchdog.

        Directory events are ignored. A move is treated as a removal of the
        source path plus a change of the destination path, and each side is
        filtered on its own: moving an indexable file to a non-indexable
        name still records the removal of the old path.
        """
        if event.is_directory:
            return

        removed_rel: str | None = None
        changed_rel: str | None = None
        if event.event_type == EVENT_TYPE_MOVED:
            removed_rel = self._indexable_rel(event.src_path)
            changed_rel = self._indexable_rel(event.dest_path)
        elif event.event_type == EVENT_TYPE_DELETED:
            removed_rel = self._indexable_rel(event.src_path)
        else:
            changed_rel = self._indexable_rel(event.src_path)

        if removed_rel is None and changed_rel is None:
            return

        with self._lock:
            # Removal first, so a path that is both source and destination
            # ends up in ``changed`` only.
            if removed_rel is not None:
                self._removed.add(removed_rel)
                self._changed.discard(removed_rel)
            if changed_rel is not None:
                self._changed.add(changed_rel)
                self._removed.discard(changed_rel)

        # Must run outside the lock: _reset_timer acquires it itself.
        self._reset_timer()

    def flush_now(self) -> None:
        """Force-flush any pending events (used during shutdown)."""
        with self._lock:
            if self._timer is not None:
                self._timer.cancel()
                self._timer = None
        # _flush takes the (non-reentrant) lock itself, so call it unlocked.
        self._flush()

    @property
    def pending_count(self) -> int:
        """Number of accumulated paths not yet flushed."""
        with self._lock:
            return len(self._changed) + len(self._removed)

    # -- internal ------------------------------------------------------

    def _event_paths(self, event: FileSystemEvent) -> list[str]:
        """Extract relevant absolute path(s) from an event.

        Returns the destination for move events and the source otherwise.
        """
        if isinstance(event, FileSystemMovedEvent):
            return [event.dest_path]
        return [event.src_path]

    def _indexable_rel(self, abs_path: str) -> str | None:
        """Return the repo-relative form of ``abs_path`` if it is indexable."""
        rel = self._relative(abs_path)
        if rel is None or not self._should_index(rel, abs_path):
            return None
        return rel

    def _relative(self, abs_path: str) -> str | None:
        """Convert an absolute path to a repo-relative string, or None.

        None is returned when ``abs_path`` is not located under the repo root.
        """
        try:
            return str(Path(abs_path).relative_to(self._repo_root))
        except ValueError:
            return None

    def _should_index(self, rel_path: str, abs_path: str) -> bool:
        """Return True if the path is indexable (right extension, not ignored)."""
        if ".git" in Path(rel_path).parts:
            return False

        # relative_to succeeding means the path lives inside the index dir.
        try:
            Path(abs_path).relative_to(self._index_dir)
        except ValueError:
            pass
        else:
            return False

        if Path(abs_path).suffix not in self._extensions:
            return False

        if self._gitignore.match_file(rel_path):
            return False

        if self._extra_spec.match_file(rel_path):
            return False

        return True

    def _reset_timer(self) -> None:
        """Cancel existing timer and start a new debounce window."""
        with self._lock:
            if self._timer is not None:
                self._timer.cancel()
            self._timer = threading.Timer(self._debounce, self._flush)
            self._timer.daemon = True
            self._timer.start()

    def _flush(self) -> None:
        """Drain accumulated paths and invoke the batch callback.

        Exceptions raised by the callback are logged and not propagated.
        """
        with self._lock:
            changed = self._changed.copy()
            removed = self._removed.copy()
            self._changed.clear()
            self._removed.clear()
            self._timer = None

        if not changed and not removed:
            return

        total = len(changed) + len(removed)
        if total >= self._threshold:
            logger.info(
                "Branch switch detected (%d files); delegating to full sync",
                total,
            )

        try:
            self._on_batch(changed, removed)
        except Exception:
            logger.exception("Batch update failed")
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class FileWatcher:
    """Watches a repository for changes and triggers index updates.

    Filesystem events under ``repo_root`` are debounced by a
    ``_DebouncedHandler`` and delivered in batches either to the ``on_batch``
    callback supplied by the caller or, by default, to ``_default_batch``.
    """

    def __init__(
        self,
        repo_root: Path,
        index_dir: Path,
        config: dict[str, Any] | None = None,
        on_batch: Callable[[set[str], set[str]], None] | None = None,
        *,
        use_polling: bool = False,
    ) -> None:
        """Initialize the file watcher.

        Args:
            repo_root: Repository directory to watch (resolved on entry).
            index_dir: Index directory (resolved on entry).
            config: Configuration mapping; loaded via ``load_config`` when
                omitted or empty.
            on_batch: Optional replacement for the default batch callback.
            use_polling: Use ``PollingObserver`` instead of the native one.
        """
        from coderay.core.config import load_config

        self._repo_root = repo_root.resolve()
        self._index_dir = index_dir.resolve()
        self._config = config or load_config(index_dir)
        self._on_batch = on_batch
        self._use_polling = use_polling

        # Settings come from the optional "watch" section of the config.
        watch_cfg = self._config.get("watch") or {}
        self._debounce = float(watch_cfg.get("debounce_seconds", 2))
        self._threshold = int(watch_cfg.get("branch_switch_threshold", 50))
        self._extra_exclude: list[str] = list(watch_cfg.get("exclude_patterns") or [])

        self._observer: Observer | PollingObserver | None = None
        self._handler: _DebouncedHandler | None = None
        self._update_count = 0

    @property
    def update_count(self) -> int:
        """Number of batch updates executed since start."""
        return self._update_count

    def start(self) -> None:
        """Start the filesystem observer (non-blocking).

        If the native observer raises ``SystemError`` on start, a
        ``PollingObserver`` is started in its place.
        """
        gitignore_spec = load_gitignore(self._repo_root)
        extensions = get_supported_extensions()

        batch_fn = self._on_batch or self._default_batch
        self._handler = _DebouncedHandler(
            repo_root=self._repo_root,
            index_dir=self._index_dir,
            gitignore_spec=gitignore_spec,
            supported_extensions=extensions,
            debounce_seconds=self._debounce,
            branch_switch_threshold=self._threshold,
            extra_exclude=self._extra_exclude,
            on_batch=batch_fn,
        )

        adapter = _WatchdogAdapter(self._handler)
        if self._use_polling:
            self._observer = PollingObserver(timeout=1)
        else:
            self._observer = Observer()

        self._observer.schedule(adapter, str(self._repo_root), recursive=True)
        self._observer.daemon = True

        try:
            self._observer.start()
        except SystemError:
            logger.warning("Native observer failed; falling back to polling")
            self._observer = PollingObserver(timeout=1)
            self._observer.schedule(adapter, str(self._repo_root), recursive=True)
            self._observer.daemon = True
            self._observer.start()

        logger.info(
            "Watching %s (debounce=%.1fs, extensions=%d, observer=%s)",
            self._repo_root,
            self._debounce,
            len(extensions),
            type(self._observer).__name__,
        )

    def stop(self) -> None:
        """Stop the observer and flush remaining events.

        The observer is stopped before the final flush so that no event can
        be delivered to the handler after its pending paths were drained.
        """
        if self._observer is not None:
            self._observer.stop()
            self._observer.join(timeout=5)
        if self._handler is not None:
            self._handler.flush_now()
        logger.info("Watcher stopped (%d updates total)", self._update_count)

    def wait(self, timeout: float | None = None) -> None:
        """Block until the observer exits or timeout elapses."""
        if self._observer is not None:
            self._observer.join(timeout=timeout)

    def _default_batch(self, changed: set[str], removed: set[str]) -> None:
        """Default callback: acquire the indexer lock and update the index.

        Batches of ``branch_switch_threshold`` paths or more run
        ``Indexer.update_incremental``; smaller ones run
        ``Indexer.update_paths`` with the sorted path lists. Any exception is
        logged and swallowed.
        """
        from coderay.core.lock import acquire_indexer_lock
        from coderay.pipeline.indexer import Indexer

        total = len(changed) + len(removed)
        # Monotonic clock: the measured duration cannot be skewed by
        # wall-clock adjustments.
        started = time.monotonic()

        try:
            with acquire_indexer_lock(self._index_dir, timeout=30):
                indexer = Indexer(
                    self._repo_root,
                    self._index_dir,
                    config=self._config,
                )
                if total >= self._threshold:
                    result = indexer.update_incremental()
                else:
                    result = indexer.update_paths(
                        changed=sorted(changed),
                        removed=sorted(removed),
                    )
                elapsed = time.monotonic() - started
                self._update_count += 1
                logger.info(
                    "Update #%d: %s (%.2fs) [%d changed, %d removed]",
                    self._update_count,
                    result,
                    elapsed,
                    len(changed),
                    len(removed),
                )
        except Exception:
            logger.exception("Failed to update index")
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
class _WatchdogAdapter:
    """Thin shim exposing ``dispatch`` and relaying to a ``_DebouncedHandler``."""

    def __init__(self, handler: _DebouncedHandler) -> None:
        self._handler = handler

    def dispatch(self, event: FileSystemEvent) -> None:
        """Relay created/modified/deleted/moved events; drop any other type."""
        relevant_types = (
            EVENT_TYPE_CREATED,
            EVENT_TYPE_MODIFIED,
            EVENT_TYPE_DELETED,
            EVENT_TYPE_MOVED,
        )
        if event.event_type not in relevant_types:
            return
        self._handler.on_event(event)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
# Built-in path rules with factors below 1.0 (tests, mocks/fixtures,
# generated or vendored code, docs, examples).
DEFAULT_PENALTIES: list[dict[str, Any]] = [
    {"pattern": pattern, "factor": factor}
    for pattern, factor in (
        (r"(^|/)tests?/", 0.5),
        (r"(^|/)test_[^/]+\.py$", 0.5),
        (r"(^|/)(mock|fixture|conftest)", 0.4),
        (r"(^|/)(generated|vendor|third_party)/", 0.3),
        (r"(^|/)docs?/", 0.6),
        (r"(^|/)examples?/", 0.7),
    )
]

# Built-in path rules with factors above 1.0 (src/, lib/, app/ directories).
DEFAULT_BONUSES: list[dict[str, Any]] = [
    {"pattern": pattern, "factor": factor}
    for pattern, factor in (
        (r"(^|/)src/", 1.1),
        (r"(^|/)lib/", 1.1),
        (r"(^|/)app/", 1.1),
    )
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class BoostRule:
    """One path-matching rule: a compiled regex paired with a score factor."""

    # Compiled pattern that is searched against a result's path.
    regex: re.Pattern[str]
    # Multiplier applied to the score when the pattern matches.
    factor: float
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class StructuralBooster:
    """Re-ranks search results by scaling each score according to its path."""

    penalties: list[BoostRule] = field(default_factory=list)
    bonuses: list[BoostRule] = field(default_factory=list)

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> StructuralBooster:
        """Create a booster from ``config["search"]["boost_rules"]``.

        A missing or empty ``penalties`` / ``bonuses`` entry is replaced by
        ``DEFAULT_PENALTIES`` / ``DEFAULT_BONUSES`` respectively.
        """
        boost_cfg = (config.get("search") or {}).get("boost_rules") or {}

        def compile_rules(raw_rules: list[dict[str, Any]]) -> list[BoostRule]:
            return [
                BoostRule(regex=re.compile(raw["pattern"]), factor=raw["factor"])
                for raw in raw_rules
            ]

        return cls(
            penalties=compile_rules(boost_cfg.get("penalties") or DEFAULT_PENALTIES),
            bonuses=compile_rules(boost_cfg.get("bonuses") or DEFAULT_BONUSES),
        )

    def _compute_multiplier(self, path: str) -> float:
        """Multiply together the factors of every rule whose regex hits ``path``."""
        product = 1.0
        # Penalties are applied before bonuses.
        for rule in [*self.penalties, *self.bonuses]:
            if rule.regex.search(path):
                product *= rule.factor
        return product

    def boost(self, results: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Return rescored copies of ``results``, highest score first.

        Each returned row keeps the incoming score under ``raw_score`` and
        stores the multiplied value under ``score``. The input rows are not
        modified.
        """
        rescored = []
        for result in results:
            factor = self._compute_multiplier(result.get("path", ""))
            original_score = result.get("score", 0.0)
            rescored.append(
                {
                    **result,
                    "raw_score": original_score,
                    "score": original_score * factor,
                }
            )
        return sorted(rescored, key=lambda row: row["score"], reverse=True)
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from coderay.core.config import get_embedding_dimensions, load_config
|
|
9
|
+
from coderay.embedding.base import Embedder, load_embedder_from_config
|
|
10
|
+
from coderay.graph.builder import load_graph
|
|
11
|
+
from coderay.retrieval.boosting import StructuralBooster
|
|
12
|
+
from coderay.state.machine import IndexMeta
|
|
13
|
+
from coderay.state.version import check_index_version
|
|
14
|
+
from coderay.storage.lancedb import Store, index_exists
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Retrieval:
    """Read-side facade over the semantic index stored in ``index_dir``."""

    def __init__(
        self,
        index_dir: str | Path,
        config: dict[str, Any] | None = None,
        embedder: Embedder | None = None,
    ) -> None:
        """Set up retrieval for ``index_dir``.

        The config is loaded from the index directory when none (or an empty
        one) is passed. ``check_index_version`` runs last, once all
        attributes are in place.
        """
        resolved_dir = Path(index_dir)
        self.index_dir = resolved_dir
        self._config = config or load_config(resolved_dir)
        self._explicit_embedder = embedder
        self._lazy_embedder: Embedder | None = None
        self._dimensions = get_embedding_dimensions(self._config)
        self._booster = StructuralBooster.from_config(self._config)
        self._store: Store | None = None
        check_index_version(resolved_dir)

    @property
    def _embedder(self) -> Embedder:
        # An embedder handed to the constructor always wins; otherwise one
        # is built from the config on first access and then reused.
        explicit = self._explicit_embedder
        if explicit is not None:
            return explicit
        if self._lazy_embedder is None:
            self._lazy_embedder = load_embedder_from_config(self._config)
        return self._lazy_embedder

    @property
    def config(self) -> dict[str, Any]:
        return self._config

    def _get_store(self) -> Store:
        # The store is created on first use and cached on the instance.
        if self._store is None:
            self._store = Store(self.index_dir, dimensions=self._dimensions)
        return self._store

    def search(
        self,
        query: str,
        current_state: IndexMeta,
        *,
        top_k: int = 10,
        path_prefix: str | None = None,
        language: str | None = None,
    ) -> list[dict[str, Any]]:
        """Run a semantic query and return boosted results.

        Returns an empty list when no index exists or the embedder yields no
        vectors. Raises ``RuntimeError`` when ``current_state`` reports an
        incomplete or in-progress index.
        """
        if not index_exists(self.index_dir):
            logger.warning("No index at %s", self.index_dir)
            return []

        index_busy = current_state.is_incomplete() or current_state.is_in_progress()
        if index_busy:
            raise RuntimeError("Meta in progress; index might be stale")

        store = self._get_store()

        embed_started = time.perf_counter()
        vectors = self._embedder.embed([query])
        logger.info("Query embed took %.3fs", time.perf_counter() - embed_started)
        if not vectors:
            return []

        search_started = time.perf_counter()
        hits = store.search(
            query_embedding=vectors[0],
            top_k=top_k,
            path_prefix=path_prefix,
            language=language,
            query_text=query,
        )
        logger.info("Vector search took %.3fs", time.perf_counter() - search_started)

        return self._booster.boost(hits)

    def load_graph(self) -> list[dict[str, Any]]:
        """Return the edges of the graph stored for this index, or ``[]``."""
        # Inside the method body the bare name resolves to the module-level
        # ``load_graph`` function, not to this method.
        graph = load_graph(self.index_dir)
        return [] if graph is None else graph.to_dict().get("edges", [])

    def chunk_count(self) -> int:
        """Return how many chunks the index holds; 0 when there is no index."""
        return self._get_store().chunk_count() if index_exists(self.index_dir) else 0

    def list_chunks(
        self,
        *,
        limit: int = 500,
        path_prefix: str | None = None,
    ) -> list[dict[str, Any]]:
        """Return up to ``limit`` chunks from the store; ``[]`` without an index."""
        if index_exists(self.index_dir):
            return self._get_store().list_chunks(limit=limit, path_prefix=path_prefix)
        return []

    def chunks_by_path(self) -> dict[str, int]:
        """Return the store's path -> chunk-count mapping; ``{}`` without an index."""
        if index_exists(self.index_dir):
            return self._get_store().chunks_by_path()
        return {}
|
|
File without changes
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from coderay.chunking.registry import get_language_for_file
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def extract_skeleton(
    path: str | Path,
    content: str,
    language: str | None = None,
) -> str:
    """Extract the skeleton of a source file (signatures, no bodies).

    Args:
        path: File path; used to detect the language when ``language`` is
            not given or is not a key of the language registry.
        content: Full source text of the file.
        language: Optional registry key that overrides path-based detection.

    Returns:
        The skeleton lines joined with newlines, or ``content`` unchanged
        when no language configuration or parser is available.
    """
    path_str = str(path)

    lang_cfg = None
    if language:
        from coderay.chunking.registry import LANGUAGE_REGISTRY

        lang_cfg = LANGUAGE_REGISTRY.get(language)
    if lang_cfg is None:
        # Single path-based lookup, also used as fallback for unknown names.
        lang_cfg = get_language_for_file(path_str)
    if lang_cfg is None:
        return content

    try:
        parser = lang_cfg.get_parser()
    except Exception:
        # Best effort: fall back to the raw content, but leave a trace.
        logger.debug(
            "No parser available for %s; returning content unchanged",
            path_str,
            exc_info=True,
        )
        return content

    source_bytes = content.encode("utf-8")
    tree = parser.parse(source_bytes)
    lines: list[str] = []
    _visit_skeleton(tree.root_node, source_bytes, lang_cfg.name, lines, depth=0)
    return "\n".join(lines)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _node_text(node, source_bytes: bytes) -> str:
    """Decode the byte range ``node`` covers; invalid UTF-8 is replaced."""
    start, end = node.start_byte, node.end_byte
    raw = source_bytes[start:end]
    return raw.decode("utf-8", errors="replace")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_docstring(node, source_bytes: bytes) -> str | None:
    """Extract the docstring that opens the body block of ``node``, if any.

    The body is the first child of type ``block`` or ``statement_block``.
    Only its leading statement is considered: ``comment`` children are
    skipped, and the first other child must be an ``expression_statement``
    containing a ``string`` child. Later statements are never inspected, so
    a string expression further down the body is not taken for a docstring.

    Returns:
        The source text of the string node, or None when there is no body
        or the body does not open with a string expression.
    """
    children = getattr(node, "children", None)
    if not children:
        return None

    body = next(
        (child for child in children if child.type in ("block", "statement_block")),
        None,
    )
    if body is None:
        return None

    for child in body.children:
        if child.type == "comment":
            continue
        if child.type == "expression_statement":
            for sub in child.children:
                if sub.type == "string":
                    return _node_text(sub, source_bytes)
        # Whatever this statement was, nothing after it can be a docstring.
        break
    return None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _get_signature_line(node, source_bytes: bytes) -> str:
    """Get everything up to and including the colon/opening brace.

    The signature ends at the first ``:`` or ``{`` that is directly followed
    by a line break and sits outside any parentheses or square brackets.
    Picking the first such position (instead of preferring ``:`` over ``{``)
    keeps a ``case 1:`` or similar inside a brace-delimited body from being
    mistaken for the end of the signature, while the bracket tracking keeps
    multi-line default values inside a parameter list from ending it early.

    Returns:
        The signature text, or the first line of the node's text when no
        terminator is found.
    """
    text = _node_text(node, source_bytes)

    depth = 0  # nesting level of "(" / "[" seen so far
    for idx, char in enumerate(text):
        if char in "([":
            depth += 1
        elif char in ")]":
            depth = max(depth - 1, 0)
        elif (
            char in ":{"
            and depth == 0
            and text.startswith(("\n", "\r\n"), idx + 1)
        ):
            return text[: idx + 1]

    return text.split("\n")[0]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _visit_skeleton(
    node,
    source_bytes: bytes,
    language: str,
    lines: list[str],
    depth: int,
) -> None:
    """Walk ``node`` recursively and append its skeleton to ``lines``.

    Imports are copied verbatim. Functions and methods contribute their
    signature, docstring and a ``...`` placeholder. Class-like declarations
    contribute signature and docstring, then their members one level deeper.
    At depth 0, string or assignment expression statements and the first line
    of export/lexical declarations are kept. Any other node is transparent:
    its children are visited at the same depth.
    """
    kind = node.type
    pad = " " * depth

    if kind in {"import_statement", "import_from_statement", "import_declaration"}:
        lines.append(pad + _node_text(node, source_bytes).strip())
        return

    is_callable = kind in {
        "function_definition",
        "function_declaration",
        "method_declaration",
        "method_definition",
    }
    is_container = kind in {
        "class_definition",
        "class_declaration",
        "interface_declaration",
        "type_alias_declaration",
        "type_declaration",
    }
    if is_callable or is_container:
        lines.append(pad + _get_signature_line(node, source_bytes).strip())
        doc = _get_docstring(node, source_bytes)
        if doc:
            lines.append(pad + " " + doc)
        if is_callable:
            # Bodies of callables are elided.
            lines.append(pad + " ...")
        else:
            for section in node.children:
                if section.type not in ("block", "class_body", "statement_block"):
                    continue
                for member in section.children:
                    _visit_skeleton(member, source_bytes, language, lines, depth + 1)
        lines.append("")
        return

    if depth == 0 and kind == "expression_statement":
        statement = _node_text(node, source_bytes).strip()
        if statement.startswith(('"""', "'''", '"', "'")) or "=" in statement:
            lines.append(pad + statement)
        return

    if depth == 0 and kind in ("export_statement", "lexical_declaration"):
        head = _node_text(node, source_bytes).split("\n")[0].strip()
        lines.append(pad + head)
        return

    for child in node.children:
        _visit_skeleton(child, source_bytes, language, lines, depth)
|