flurryx-code-memory 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_memory/__init__.py +1 -0
- code_memory/claims/__init__.py +32 -0
- code_memory/claims/extractor.py +325 -0
- code_memory/claims/indexer.py +258 -0
- code_memory/claims/resolver.py +186 -0
- code_memory/claims/store.py +424 -0
- code_memory/cli.py +1192 -0
- code_memory/config.py +268 -0
- code_memory/embed/__init__.py +224 -0
- code_memory/embed/cache.py +204 -0
- code_memory/embed/m3.py +174 -0
- code_memory/embed/ollama.py +92 -0
- code_memory/embed/tei.py +106 -0
- code_memory/episodic/__init__.py +3 -0
- code_memory/episodic/sqlite_store.py +278 -0
- code_memory/extractor/__init__.py +3 -0
- code_memory/extractor/csproj.py +166 -0
- code_memory/extractor/dll.py +385 -0
- code_memory/extractor/gitignore.py +162 -0
- code_memory/extractor/nuget.py +275 -0
- code_memory/extractor/sanity.py +124 -0
- code_memory/extractor/sln.py +108 -0
- code_memory/extractor/treesitter.py +1172 -0
- code_memory/graph/__init__.py +3 -0
- code_memory/graph/falkor_store.py +740 -0
- code_memory/mcp_server.py +1816 -0
- code_memory/metrics.py +260 -0
- code_memory/orchestrator/__init__.py +13 -0
- code_memory/orchestrator/git_delta.py +211 -0
- code_memory/orchestrator/ingest_state.py +71 -0
- code_memory/orchestrator/pipeline.py +1478 -0
- code_memory/orchestrator/reset.py +130 -0
- code_memory/orchestrator/resolver.py +825 -0
- code_memory/orchestrator/retrieve.py +505 -0
- code_memory/resilience.py +73 -0
- code_memory/sync/__init__.py +20 -0
- code_memory/sync/autostart/__init__.py +42 -0
- code_memory/sync/autostart/base.py +106 -0
- code_memory/sync/autostart/launchd.py +115 -0
- code_memory/sync/autostart/schtasks.py +155 -0
- code_memory/sync/autostart/systemd.py +113 -0
- code_memory/sync/hooks.py +164 -0
- code_memory/sync/safety.py +65 -0
- code_memory/sync/snapshot.py +461 -0
- code_memory/sync/store.py +399 -0
- code_memory/sync/sync.py +405 -0
- code_memory/sync/watcher.py +320 -0
- code_memory/vector/__init__.py +3 -0
- code_memory/vector/qdrant_store.py +302 -0
- flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
- flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
- flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
- flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,1478 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
import time
|
|
8
|
+
from collections.abc import Callable, Iterable
|
|
9
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Literal
|
|
13
|
+
|
|
14
|
+
from ..config import CONFIG, Config, detect_project_slug
|
|
15
|
+
from ..embed import M3Embedder, get_embedder
|
|
16
|
+
from ..episodic import Episode, EpisodicStore
|
|
17
|
+
from ..episodic.sqlite_store import episode_payload, episode_text
|
|
18
|
+
from ..extractor import ExtractedFile, Extractor, Symbol
|
|
19
|
+
from ..extractor.csproj import CsprojInfo, walk_csprojs
|
|
20
|
+
from ..extractor.dll import parse_assembly
|
|
21
|
+
from ..extractor.nuget import resolve_refs
|
|
22
|
+
from ..extractor.sanity import SUSPECT_THRESHOLD, SanitySummary
|
|
23
|
+
from ..extractor.sln import walk_solutions
|
|
24
|
+
from ..graph import FalkorStore, GraphEdge, GraphNode
|
|
25
|
+
from ..vector import QdrantStore, VectorRecord
|
|
26
|
+
from . import git_delta
|
|
27
|
+
from .ingest_state import IngestStateStore
|
|
28
|
+
from .resolver import resolve_graph
|
|
29
|
+
|
|
30
|
+
# Ingest strategies accepted by ``Pipeline.ingest_repo`` (see its docstring
# for what each mode does).
IngestMode = Literal["auto", "full", "incremental"]

# Out-of-band progress hook. ``_Heartbeat`` invokes it as
# ``callback(files_done, total, message)``; ``total`` is ``None`` when the
# number of files is not known up front.
ProgressCallback = Callable[[int, int | None, str], None]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _id(*parts: str) -> str:
|
|
36
|
+
h = hashlib.sha1("\x00".join(parts).encode()).hexdigest()
|
|
37
|
+
return h[:32]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# How often to emit a progress heartbeat during ingest. Heartbeats go to
|
|
41
|
+
# stderr so ``--json`` output on stdout stays clean.
|
|
42
|
+
_PROGRESS_EVERY = int(os.environ.get("CODEMEMORY_PROGRESS_EVERY", "50"))
|
|
43
|
+
_PROGRESS_ENABLED = os.environ.get("CODEMEMORY_PROGRESS", "1") != "0"
|
|
44
|
+
# auto = rich TUI when stderr is a TTY, plain text otherwise.
|
|
45
|
+
# rich = force rich (e.g. forced inside non-TTY harness that handles ANSI).
|
|
46
|
+
# text = legacy throttled heartbeat lines.
|
|
47
|
+
# none = silence everything.
|
|
48
|
+
_PROGRESS_STYLE = os.environ.get("CODEMEMORY_PROGRESS_STYLE", "auto").lower()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _default_progress_file() -> Path:
|
|
52
|
+
"""Where _Heartbeat writes the live progress snapshot.
|
|
53
|
+
|
|
54
|
+
Cross-process channel for the `code-memory watch` CLI: any process
|
|
55
|
+
running ingest writes here on every tick; the watch CLI tails the
|
|
56
|
+
same path and renders a rich live bar. Path is overridable via
|
|
57
|
+
``CODEMEMORY_PROGRESS_FILE`` for tests or split projects.
|
|
58
|
+
"""
|
|
59
|
+
override = os.environ.get("CODEMEMORY_PROGRESS_FILE")
|
|
60
|
+
if override:
|
|
61
|
+
return Path(override).expanduser()
|
|
62
|
+
base = os.environ.get("XDG_STATE_HOME") or str(Path.home() / ".cache")
|
|
63
|
+
return Path(base) / "code-memory" / "ingest-progress.json"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
_PROGRESS_FILE = _default_progress_file()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _write_progress_snapshot(snap: dict[str, Any]) -> None:
|
|
70
|
+
"""Atomically write a progress snapshot for the watch CLI.
|
|
71
|
+
|
|
72
|
+
Atomic via tmp + rename so a watcher never reads a half-written
|
|
73
|
+
document. Failures swallowed — UI must not break the ingest loop.
|
|
74
|
+
"""
|
|
75
|
+
try:
|
|
76
|
+
_PROGRESS_FILE.parent.mkdir(parents=True, exist_ok=True)
|
|
77
|
+
tmp = _PROGRESS_FILE.with_suffix(".json.tmp")
|
|
78
|
+
tmp.write_text(json.dumps(snap))
|
|
79
|
+
os.replace(tmp, _PROGRESS_FILE)
|
|
80
|
+
except Exception: # noqa: BLE001 — UI errors must not abort ingest.
|
|
81
|
+
pass
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _want_rich_progress() -> bool:
|
|
85
|
+
if _PROGRESS_STYLE == "none" or not _PROGRESS_ENABLED:
|
|
86
|
+
return False
|
|
87
|
+
if _PROGRESS_STYLE == "rich":
|
|
88
|
+
return True
|
|
89
|
+
if _PROGRESS_STYLE == "text":
|
|
90
|
+
return False
|
|
91
|
+
try:
|
|
92
|
+
return bool(sys.stderr.isatty())
|
|
93
|
+
except Exception:
|
|
94
|
+
return False
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class _Heartbeat:
|
|
98
|
+
"""Render ingest progress.
|
|
99
|
+
|
|
100
|
+
Two render paths share one API:
|
|
101
|
+
|
|
102
|
+
* **rich** — `rich.progress.Progress` live bar on stderr with files,
|
|
103
|
+
symbols, chunks, skipped counters + ETA. Used when stderr is a TTY
|
|
104
|
+
(or `CODEMEMORY_PROGRESS_STYLE=rich`).
|
|
105
|
+
* **text** — periodic ``files=… symbols=…`` lines on stderr. Used
|
|
106
|
+
when stderr is captured (MCP stdio server, CI logs, `bash` from an
|
|
107
|
+
agent harness) so ANSI escapes don't pollute the transcript.
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
def __init__(
|
|
111
|
+
self,
|
|
112
|
+
label: str,
|
|
113
|
+
*,
|
|
114
|
+
total: int | None = None,
|
|
115
|
+
on_progress: ProgressCallback | None = None,
|
|
116
|
+
) -> None:
|
|
117
|
+
self.label = label
|
|
118
|
+
self.total = total
|
|
119
|
+
self.start = time.monotonic()
|
|
120
|
+
self.last = self.start
|
|
121
|
+
self._rich: Any = None
|
|
122
|
+
self._task: Any = None
|
|
123
|
+
self._on_progress = on_progress
|
|
124
|
+
# Throttle out-of-band progress notifications so a 50k-file ingest
|
|
125
|
+
# doesn't flood the MCP transport. Rich's own refresh loop is
|
|
126
|
+
# already throttled internally.
|
|
127
|
+
self._cb_interval = float(
|
|
128
|
+
os.environ.get("CODEMEMORY_PROGRESS_NOTIFY_INTERVAL", "0.4")
|
|
129
|
+
)
|
|
130
|
+
self._cb_last = 0.0
|
|
131
|
+
if _want_rich_progress():
|
|
132
|
+
self._init_rich()
|
|
133
|
+
|
|
134
|
+
def _init_rich(self) -> None:
|
|
135
|
+
try:
|
|
136
|
+
from rich.console import Console
|
|
137
|
+
from rich.progress import (
|
|
138
|
+
BarColumn,
|
|
139
|
+
MofNCompleteColumn,
|
|
140
|
+
Progress,
|
|
141
|
+
SpinnerColumn,
|
|
142
|
+
TextColumn,
|
|
143
|
+
TimeElapsedColumn,
|
|
144
|
+
TimeRemainingColumn,
|
|
145
|
+
)
|
|
146
|
+
except Exception: # noqa: BLE001 — rich missing, fall back to text
|
|
147
|
+
return
|
|
148
|
+
progress = Progress(
|
|
149
|
+
SpinnerColumn(style="cyan"),
|
|
150
|
+
TextColumn("[bold cyan]code-memory[/] {task.description}"),
|
|
151
|
+
BarColumn(bar_width=None),
|
|
152
|
+
MofNCompleteColumn(),
|
|
153
|
+
TextColumn(
|
|
154
|
+
"[green]{task.fields[symbols]}[/]sym "
|
|
155
|
+
"[magenta]{task.fields[chunks]}[/]chk "
|
|
156
|
+
"[yellow]{task.fields[skipped]}[/]skip "
|
|
157
|
+
"[dim]{task.fields[rate]}/s[/]"
|
|
158
|
+
),
|
|
159
|
+
TimeElapsedColumn(),
|
|
160
|
+
TimeRemainingColumn(),
|
|
161
|
+
console=Console(stderr=True),
|
|
162
|
+
transient=False,
|
|
163
|
+
refresh_per_second=8,
|
|
164
|
+
)
|
|
165
|
+
try:
|
|
166
|
+
progress.start()
|
|
167
|
+
except Exception: # noqa: BLE001
|
|
168
|
+
return
|
|
169
|
+
self._rich = progress
|
|
170
|
+
self._task = progress.add_task(
|
|
171
|
+
self.label,
|
|
172
|
+
total=self.total,
|
|
173
|
+
symbols=0,
|
|
174
|
+
chunks=0,
|
|
175
|
+
skipped=0,
|
|
176
|
+
rate="0.0",
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
def _rate(self, files: int) -> float:
|
|
180
|
+
elapsed = max(time.monotonic() - self.start, 1e-6)
|
|
181
|
+
return files / elapsed
|
|
182
|
+
|
|
183
|
+
def _snapshot(self, stats: IngestStats, *, done: bool) -> dict[str, Any]:
|
|
184
|
+
return {
|
|
185
|
+
"label": self.label,
|
|
186
|
+
"files": stats.files,
|
|
187
|
+
"total": self.total,
|
|
188
|
+
"symbols": stats.symbols,
|
|
189
|
+
"chunks": stats.chunks,
|
|
190
|
+
"skipped": stats.skipped,
|
|
191
|
+
"rate": self._rate(stats.files),
|
|
192
|
+
"elapsed": time.monotonic() - self.start,
|
|
193
|
+
"ts": time.time(),
|
|
194
|
+
"done": done,
|
|
195
|
+
"pid": os.getpid(),
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
def _notify(self, stats: IngestStats, *, force: bool = False) -> None:
|
|
199
|
+
if self._on_progress is None:
|
|
200
|
+
return
|
|
201
|
+
now = time.monotonic()
|
|
202
|
+
if not force and now - self._cb_last < self._cb_interval:
|
|
203
|
+
return
|
|
204
|
+
self._cb_last = now
|
|
205
|
+
rate = self._rate(stats.files)
|
|
206
|
+
msg = (
|
|
207
|
+
f"{self.label}: files={stats.files} "
|
|
208
|
+
f"symbols={stats.symbols} chunks={stats.chunks} "
|
|
209
|
+
f"skipped={stats.skipped} rate={rate:.1f}/s"
|
|
210
|
+
)
|
|
211
|
+
try:
|
|
212
|
+
self._on_progress(stats.files, self.total, msg)
|
|
213
|
+
except Exception: # noqa: BLE001 — never let UI break the ingest
|
|
214
|
+
pass
|
|
215
|
+
|
|
216
|
+
def tick(self, stats: IngestStats) -> None:
|
|
217
|
+
self._notify(stats)
|
|
218
|
+
_write_progress_snapshot(self._snapshot(stats, done=False))
|
|
219
|
+
if self._rich is not None:
|
|
220
|
+
self._rich.update(
|
|
221
|
+
self._task,
|
|
222
|
+
completed=stats.files,
|
|
223
|
+
total=self.total,
|
|
224
|
+
symbols=stats.symbols,
|
|
225
|
+
chunks=stats.chunks,
|
|
226
|
+
skipped=stats.skipped,
|
|
227
|
+
rate=f"{self._rate(stats.files):.1f}",
|
|
228
|
+
)
|
|
229
|
+
return
|
|
230
|
+
if not _PROGRESS_ENABLED or _PROGRESS_STYLE == "none":
|
|
231
|
+
return
|
|
232
|
+
if _PROGRESS_EVERY <= 0:
|
|
233
|
+
return
|
|
234
|
+
if stats.files % _PROGRESS_EVERY != 0 or stats.files == 0:
|
|
235
|
+
return
|
|
236
|
+
now = time.monotonic()
|
|
237
|
+
rate = self._rate(stats.files)
|
|
238
|
+
eta = ""
|
|
239
|
+
if self.total and rate > 0:
|
|
240
|
+
remaining = max(self.total - stats.files, 0)
|
|
241
|
+
eta = f" eta={remaining / rate:.0f}s"
|
|
242
|
+
total_part = f"/{self.total}" if self.total else ""
|
|
243
|
+
sys.stderr.write(
|
|
244
|
+
f"[code-memory] {self.label}: files={stats.files}{total_part} "
|
|
245
|
+
f"symbols={stats.symbols} chunks={stats.chunks} "
|
|
246
|
+
f"skipped={stats.skipped} rate={rate:.1f}/s{eta}\n"
|
|
247
|
+
)
|
|
248
|
+
sys.stderr.flush()
|
|
249
|
+
self.last = now
|
|
250
|
+
|
|
251
|
+
def done(self, stats: IngestStats) -> None:
|
|
252
|
+
self._notify(stats, force=True)
|
|
253
|
+
_write_progress_snapshot(self._snapshot(stats, done=True))
|
|
254
|
+
if self._rich is not None:
|
|
255
|
+
try:
|
|
256
|
+
self._rich.update(
|
|
257
|
+
self._task,
|
|
258
|
+
completed=stats.files,
|
|
259
|
+
total=self.total or stats.files or 1,
|
|
260
|
+
symbols=stats.symbols,
|
|
261
|
+
chunks=stats.chunks,
|
|
262
|
+
skipped=stats.skipped,
|
|
263
|
+
rate=f"{self._rate(stats.files):.1f}",
|
|
264
|
+
)
|
|
265
|
+
self._rich.stop()
|
|
266
|
+
except Exception: # noqa: BLE001
|
|
267
|
+
pass
|
|
268
|
+
self._rich = None
|
|
269
|
+
self._task = None
|
|
270
|
+
return
|
|
271
|
+
if not _PROGRESS_ENABLED or _PROGRESS_STYLE == "none":
|
|
272
|
+
return
|
|
273
|
+
elapsed = time.monotonic() - self.start
|
|
274
|
+
sys.stderr.write(
|
|
275
|
+
f"[code-memory] {self.label} done: files={stats.files} "
|
|
276
|
+
f"symbols={stats.symbols} chunks={stats.chunks} "
|
|
277
|
+
f"skipped={stats.skipped} elapsed={elapsed:.1f}s\n"
|
|
278
|
+
)
|
|
279
|
+
sys.stderr.flush()
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@dataclass
class IngestStats:
    # Mutable counters and metadata describing one ingest run. A single
    # instance is created per run, updated in place by the pipeline, and
    # returned to the caller.

    # Per-file tallies accumulated during the walk.
    files: int = 0
    symbols: int = 0
    imports: int = 0
    calls: int = 0
    references: int = 0
    # The full walk adds ``len(symbols) or 1`` per file to ``chunks``.
    chunks: int = 0
    # NOTE(review): not written by the code visible here; presumably
    # maintained by the delta ingest path — confirm.
    deleted: int = 0
    skipped: int = 0
    # "full" by default; ``ingest_repo`` sets "incremental" after a delta run.
    mode: str = "full"
    # Git commits delimiting the run, when known.
    base_sha: str | None = None
    head_sha: str | None = None
    # Resolver counters, filled in by ``Pipeline._run_resolver``.
    resolver: dict[str, int] | None = None
    # NOTE(review): presumably filled by ``_attach_sanity`` — confirm.
    sanity: dict[str, object] | None = None
    # Counts of .NET projects / project refs / package refs found.
    projects: dict[str, int] | None = None
    # Assembly indexing counters: assemblies, types, skipped, unresolved.
    dlls: dict[str, int] | None = None
    # NOTE(review): presumably filled by ``_index_solutions`` — confirm.
    solutions: dict[str, int] | None = None
    # Free-form human-readable remarks appended during the run.
    notes: list[str] = field(default_factory=list)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
class Pipeline:
|
|
304
|
+
"""Coordinator: extractor -> graph + vectors + episodes."""
|
|
305
|
+
|
|
306
|
+
def __init__(
    self,
    project: str | None = None,
    embedder: M3Embedder | None = None,
    vector: QdrantStore | None = None,
    graph: FalkorStore | None = None,
    episodic: EpisodicStore | None = None,
    skip_vectors: bool = False,
) -> None:
    """Wire up the stores for one project and make sure they are ready.

    Any collaborator left unset (or falsy) is replaced by a default built
    from the project configuration.

    Args:
        project: Project slug; detected automatically when omitted.
        embedder: Embedding backend.
        vector: Vector store.
        graph: Graph store.
        episodic: Episodic store.
        skip_vectors: When true, the vector collections are not probed or
            created here.
    """
    self.slug = project if project else detect_project_slug()
    self.cfg: Config = CONFIG.for_project(self.slug)
    self.skip_vectors = skip_vectors
    self.embedder = embedder if embedder else get_embedder()
    self.vector = vector if vector else QdrantStore()
    self.graph = graph if graph else FalkorStore(graph_name=self.cfg.falkor_graph)
    self.episodic = (
        episodic if episodic else EpisodicStore(path=self.cfg.episodic_db)
    )
    # Skip the Qdrant probes too when ``skip_vectors``: large-repo
    # operators who deliberately turn off the vector layer shouldn't
    # have to keep Qdrant alive.
    if not self.skip_vectors:
        for collection_attr in ("qdrant_code", "qdrant_episodes"):
            self.vector.ensure_collection(getattr(self.cfg, collection_attr))
    self.graph.ensure_indexes()
    self.state = IngestStateStore(self.cfg.episodic_db)
|
|
330
|
+
|
|
331
|
+
def ingest_repo(
|
|
332
|
+
self,
|
|
333
|
+
root: str | Path,
|
|
334
|
+
*,
|
|
335
|
+
mode: IngestMode = "auto",
|
|
336
|
+
since: str | None = None,
|
|
337
|
+
dry_run: bool = False,
|
|
338
|
+
on_progress: ProgressCallback | None = None,
|
|
339
|
+
) -> IngestStats:
|
|
340
|
+
"""Ingest a repository.
|
|
341
|
+
|
|
342
|
+
mode:
|
|
343
|
+
- "auto": git-incremental if prior state exists and base is reachable,
|
|
344
|
+
else full walk
|
|
345
|
+
- "full": purge this project's vectors+graph+ingest_state, then
|
|
346
|
+
walk every file. Use to rebuild from scratch.
|
|
347
|
+
- "incremental": require git + base; raise if not available
|
|
348
|
+
since: explicit base ref (branch/tag/sha). Overrides stored state when set.
|
|
349
|
+
dry_run: compute plan and return stats with notes; don't touch storage.
|
|
350
|
+
"""
|
|
351
|
+
root_path = Path(root).resolve()
|
|
352
|
+
is_git = git_delta.is_git_repo(root_path)
|
|
353
|
+
|
|
354
|
+
if mode == "full" or (mode == "auto" and not is_git):
|
|
355
|
+
stats = self._ingest_full(
|
|
356
|
+
root_path, dry_run=dry_run, on_progress=on_progress
|
|
357
|
+
)
|
|
358
|
+
if not dry_run:
|
|
359
|
+
self._run_resolver(stats)
|
|
360
|
+
if is_git and not dry_run:
|
|
361
|
+
self._record_state(root_path, stats)
|
|
362
|
+
return stats
|
|
363
|
+
|
|
364
|
+
# git path
|
|
365
|
+
if not is_git:
|
|
366
|
+
raise RuntimeError(f"{root_path} is not a git repository (mode={mode!r})")
|
|
367
|
+
|
|
368
|
+
head = git_delta.head_sha(root_path)
|
|
369
|
+
branch = git_delta.current_branch(root_path)
|
|
370
|
+
base = self._resolve_base(root_path, since=since, mode=mode)
|
|
371
|
+
|
|
372
|
+
if base is None:
|
|
373
|
+
# auto + git + no prior + no --since => full walk, then record state
|
|
374
|
+
stats = self._ingest_full(
|
|
375
|
+
root_path, dry_run=dry_run, on_progress=on_progress
|
|
376
|
+
)
|
|
377
|
+
stats.head_sha = head
|
|
378
|
+
stats.notes.append("no prior ingest state; performed full walk")
|
|
379
|
+
if not dry_run:
|
|
380
|
+
self._run_resolver(stats)
|
|
381
|
+
self._record_state(root_path, stats, head=head, branch=branch)
|
|
382
|
+
return stats
|
|
383
|
+
|
|
384
|
+
# Incremental
|
|
385
|
+
delta = git_delta.changed_since(root_path, base, include_dirty=True)
|
|
386
|
+
stats = self._ingest_delta(
|
|
387
|
+
root_path,
|
|
388
|
+
delta,
|
|
389
|
+
base_sha=base,
|
|
390
|
+
head_sha=head,
|
|
391
|
+
dry_run=dry_run,
|
|
392
|
+
on_progress=on_progress,
|
|
393
|
+
)
|
|
394
|
+
stats.mode = "incremental"
|
|
395
|
+
if not dry_run:
|
|
396
|
+
if stats.files > 0:
|
|
397
|
+
# Only run resolver if something actually changed; the
|
|
398
|
+
# resolver scans the whole graph so it's a fixed cost
|
|
399
|
+
# we'd rather skip on no-op delta runs.
|
|
400
|
+
self._run_resolver(stats)
|
|
401
|
+
self._record_state(root_path, stats, head=head, branch=branch)
|
|
402
|
+
return stats
|
|
403
|
+
|
|
404
|
+
# -- internals -------------------------------------------------------
|
|
405
|
+
|
|
406
|
+
def _resolve_base(
|
|
407
|
+
self, root: Path, *, since: str | None, mode: IngestMode
|
|
408
|
+
) -> str | None:
|
|
409
|
+
if since is not None:
|
|
410
|
+
try:
|
|
411
|
+
return git_delta.resolve_ref(root, since)
|
|
412
|
+
except git_delta.GitError as e:
|
|
413
|
+
raise RuntimeError(f"could not resolve --since {since!r}: {e}") from e
|
|
414
|
+
|
|
415
|
+
prior = self.state.get(root)
|
|
416
|
+
if prior is None:
|
|
417
|
+
if mode == "incremental":
|
|
418
|
+
raise RuntimeError(
|
|
419
|
+
f"no prior ingest state for {root}; run a full ingest first"
|
|
420
|
+
)
|
|
421
|
+
return None
|
|
422
|
+
|
|
423
|
+
if not git_delta.is_reachable(root, prior.last_sha):
|
|
424
|
+
# history rewrite or branch deletion — fall back
|
|
425
|
+
self.state.clear(root)
|
|
426
|
+
return None
|
|
427
|
+
|
|
428
|
+
return prior.last_sha
|
|
429
|
+
|
|
430
|
+
def _ingest_full(
    self,
    root: Path,
    *,
    dry_run: bool,
    on_progress: ProgressCallback | None = None,
) -> IngestStats:
    """Walk every file under *root* and (re)build the project's index.

    Unless *dry_run* is set, the existing project index is purged first,
    each extracted file is written to the graph immediately, and its chunks
    are buffered for batched embed-and-upsert on a small thread pool. After
    the walk the .NET project pass runs.

    Args:
        root: Repository root to walk.
        dry_run: Only count; do not touch storage.
        on_progress: Optional progress callback handed to the heartbeat.

    Returns:
        Statistics for the run, with ``mode="full"``.

    Raises:
        Whatever the extractor, the graph store or a background upsert
        raises. The upsert pool is always shut down, also on failure.
    """
    extractor = Extractor()
    stats = IngestStats(mode="full")
    sanity = SanitySummary()
    head_sha, head_ord = _resolve_head(root)
    stats.head_sha = head_sha
    if not dry_run:
        self._purge_project_index(root)
    hb = _Heartbeat(
        "full ingest" + (" (dry-run)" if dry_run else ""),
        on_progress=on_progress,
    )

    # Buffer chunks across files so the embedder sees a large batch
    # per call, then fan the Qdrant upserts out to a small thread
    # pool so they overlap with the next batch's embedding work.
    # On a cold ingest, embed (Ollama HTTP, serial) dominates; the
    # qdrant upsert (network + index write) blocks for ~80-150 ms
    # per batch — pipelining lets that happen while the next embed
    # batch is in flight. On a warm ingest (cache hits), embed
    # returns instantly and qdrant + graph become the path, so the
    # same pool keeps Qdrant from blocking the graph layer.
    pending_chunks: list[tuple[ExtractedFile, _Chunk]] = []
    EMBED_BATCH = 64
    UPSERT_POOL_SIZE = 2
    UPSERT_QUEUE_MAX = 4
    upsert_executor = ThreadPoolExecutor(max_workers=UPSERT_POOL_SIZE)
    in_flight: list[Future] = []
    vectors_enabled = not getattr(self, "skip_vectors", False)

    def _await_one() -> None:
        """Wait for the oldest submitted batch, re-raising its error."""
        if not in_flight:
            return
        fut = in_flight.pop(0)
        fut.result()  # propagate exceptions

    def _flush_pending() -> None:
        """Submit the buffered chunks as one batch and apply back-pressure."""
        if not pending_chunks:
            return
        batch = list(pending_chunks)
        pending_chunks.clear()
        fut = upsert_executor.submit(self._embed_and_upsert, batch)
        in_flight.append(fut)
        # Bound queue so upserts don't fall arbitrarily behind embed.
        while len(in_flight) >= UPSERT_QUEUE_MAX:
            _await_one()

    walk_ok = False
    try:
        for ex in extractor.walk(root):
            stats.files += 1
            stats.symbols += len(ex.symbols)
            stats.imports += len(ex.imports)
            stats.calls += len(ex.calls)
            stats.references += len(ex.references)
            stats.chunks += len(ex.symbols) or 1
            sanity.record(ex)
            if not dry_run:
                # Graph upserts are cheap (UNWIND-batched per call) and
                # need to stay per-file so the temporal stamping order
                # matches the walk. Vector work defers to the buffer.
                self._upsert_graph(ex, head_sha=head_sha, head_ord=head_ord)
                if vectors_enabled:
                    for c in _chunks_for(ex):
                        pending_chunks.append((ex, c))
                    if len(pending_chunks) >= EMBED_BATCH:
                        _flush_pending()
            hb.tick(stats)
        if vectors_enabled:
            _flush_pending()
        # Drain the pool so the resolver + .NET-project pass sees a
        # quiescent Qdrant. Drop the pool here, not in __exit__,
        # because the .NET-project pass runs in this method.
        while in_flight:
            _await_one()
        walk_ok = True
    finally:
        # Release the worker threads even when the walk or an upsert
        # failed; in that case drop batches that have not started yet.
        upsert_executor.shutdown(wait=True, cancel_futures=not walk_ok)
    hb.done(stats)
    _attach_sanity(stats, sanity)
    self._ingest_dotnet_projects(
        root, stats, dry_run=dry_run, head_sha=head_sha, head_ord=head_ord
    )
    return stats
|
|
515
|
+
|
|
516
|
+
def _run_resolver(self, stats: IngestStats) -> None:
|
|
517
|
+
"""Resolve placeholder ``name::X`` Symbol nodes to real symbols.
|
|
518
|
+
|
|
519
|
+
Records resolver stats on the ingest stats object so callers can
|
|
520
|
+
see how much of the call graph is now grounded vs. ambiguous.
|
|
521
|
+
Failures are non-fatal — ingest data is already persisted.
|
|
522
|
+
"""
|
|
523
|
+
try:
|
|
524
|
+
r = resolve_graph(self.graph)
|
|
525
|
+
except Exception as e:
|
|
526
|
+
stats.notes.append(f"resolver skipped: {e}")
|
|
527
|
+
return
|
|
528
|
+
stats.resolver = {
|
|
529
|
+
"placeholders": r.placeholders,
|
|
530
|
+
"edges_total": r.edges_total,
|
|
531
|
+
"resolved_same_file": r.edges_resolved_same_file,
|
|
532
|
+
"resolved_imported": r.edges_resolved_imported,
|
|
533
|
+
"resolved_unique": r.edges_resolved_unique,
|
|
534
|
+
"resolved_assembly": r.edges_resolved_assembly,
|
|
535
|
+
"ambiguous": r.edges_left_ambiguous,
|
|
536
|
+
"external": r.edges_left_external,
|
|
537
|
+
"placeholders_deleted": r.placeholders_deleted,
|
|
538
|
+
"import_aliases_added": r.import_aliases_added,
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
def _ingest_dotnet_projects(
|
|
542
|
+
self,
|
|
543
|
+
root: Path,
|
|
544
|
+
stats: IngestStats,
|
|
545
|
+
*,
|
|
546
|
+
dry_run: bool,
|
|
547
|
+
head_sha: str | None = None,
|
|
548
|
+
head_ord: int | None = None,
|
|
549
|
+
) -> None:
|
|
550
|
+
"""Walk `.csproj`/`.fsproj`/`.vbproj` and emit Project topology.
|
|
551
|
+
|
|
552
|
+
Adds three node/edge kinds to the graph:
|
|
553
|
+
|
|
554
|
+
* ``Project`` nodes keyed by absolute path.
|
|
555
|
+
* ``PROJECT_REFERENCES`` edges (Project → Project) from every
|
|
556
|
+
``<ProjectReference>``. Targets outside the repo or unparseable
|
|
557
|
+
are silently dropped — see ``parse_csproj``.
|
|
558
|
+
* ``PACKAGE_REFERENCES`` edges (Project → Package) from every
|
|
559
|
+
``<PackageReference>``. ``Package`` is a new label so NuGet
|
|
560
|
+
packages don't pollute the ``Module`` namespace (which holds
|
|
561
|
+
`using` import targets).
|
|
562
|
+
|
|
563
|
+
Non-.NET repos see zero ``.csproj`` files and this is a no-op.
|
|
564
|
+
Failures are non-fatal: source ingest already happened.
|
|
565
|
+
"""
|
|
566
|
+
try:
|
|
567
|
+
projects = walk_csprojs(root)
|
|
568
|
+
except Exception as e: # noqa: BLE001
|
|
569
|
+
stats.notes.append(f"csproj indexing skipped: {e}")
|
|
570
|
+
return
|
|
571
|
+
if not projects:
|
|
572
|
+
return
|
|
573
|
+
counts = {
|
|
574
|
+
"projects": len(projects),
|
|
575
|
+
"project_refs": sum(len(p.project_references) for p in projects),
|
|
576
|
+
"package_refs": sum(len(p.package_references) for p in projects),
|
|
577
|
+
}
|
|
578
|
+
stats.projects = counts
|
|
579
|
+
if dry_run:
|
|
580
|
+
return
|
|
581
|
+
self._upsert_dotnet_projects(
|
|
582
|
+
projects, head_sha=head_sha, head_ord=head_ord
|
|
583
|
+
)
|
|
584
|
+
self._index_referenced_assemblies(
|
|
585
|
+
projects, stats, head_sha=head_sha, head_ord=head_ord
|
|
586
|
+
)
|
|
587
|
+
self._index_file_containment(
|
|
588
|
+
projects, stats, head_sha=head_sha, head_ord=head_ord
|
|
589
|
+
)
|
|
590
|
+
self._index_solutions(
|
|
591
|
+
root, stats, head_sha=head_sha, head_ord=head_ord
|
|
592
|
+
)
|
|
593
|
+
|
|
594
|
+
def _index_referenced_assemblies(
    self,
    projects: list[CsprojInfo],
    stats: IngestStats,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Parse referenced DLLs and index their public type surface.

    Layer on top of the csproj topology (PR1 shipped Project +
    Package + PackageReference edges). This step turns the logical
    ``<PackageReference>`` and ``<ProjectReference>`` into concrete
    ``.dll`` paths, parses each via :func:`code_memory.extractor.dll.parse_assembly`,
    and writes:

    * ``Assembly`` nodes keyed by ``"Name, Version=X.Y.Z.W"``. Two
      versions of the same lib stay distinct so the agent can see
      when projects pin different versions of the same dep.
    * ``Type`` nodes keyed by ``"{assembly_id}::{Namespace}.{Name}"``.
      Only public types (top-level or nested-public); private
      implementation detail stays unindexed.
    * ``USES_ASSEMBLY`` edges (Project → Assembly).
    * ``EXPOSES_TYPE`` edges (Assembly → Type).

    DLL resolution leans on the NuGet global cache plus project
    build outputs (see ``code_memory.extractor.nuget``). Failures
    are silenced: DLLs are read-only metadata, not load-bearing.
    ``stats.dlls`` carries the counters so users see how much of
    the binary surface we managed to index.

    Args:
        projects: Parsed project files whose references are resolved.
        stats: Run statistics; ``stats.dlls`` is overwritten here.
        head_sha: Commit SHA passed through to the graph upserts.
        head_ord: Commit ordinal passed through to the graph upserts.
    """
    # Dedupe DLL paths across the whole solution so a shared
    # dependency parses exactly once even when many projects pull
    # the same Newtonsoft.Json on disk. ``unresolved`` counts
    # PackageReferences we couldn't locate (offline machine,
    # unrestored NuGet cache).
    path_to_consumers: dict[str, set[str]] = {}
    unresolved = 0
    for proj in projects:
        refs = resolve_refs(proj)
        for dll in refs.all_paths():
            path_to_consumers.setdefault(str(dll), set()).add(proj.path)
        for pkg in proj.package_references:
            if pkg.name not in refs.package_dlls:
                unresolved += 1

    # Nothing resolved to a DLL: report zeros and leave the graph alone.
    if not path_to_consumers:
        stats.dlls = {
            "assemblies": 0,
            "types": 0,
            "skipped": 0,
            "unresolved": unresolved,
        }
        return

    nodes: list[GraphNode] = []
    edges: list[GraphEdge] = []
    # Keys already emitted, so each Assembly / Type node is created once
    # even when the same identity is reached through several DLL paths.
    seen_assembly_keys: set[str] = set()
    seen_type_keys: set[str] = set()
    skipped = 0

    for dll_path, consumers in path_to_consumers.items():
        info = parse_assembly(dll_path)
        if info is None:
            # The parser produced nothing for this file; count and move on.
            skipped += 1
            continue
        asm_key = info.identity
        if asm_key not in seen_assembly_keys:
            seen_assembly_keys.add(asm_key)
            asm_props: dict[str, object] = {
                "name": info.name,
                "version": info.version,
                "path": info.path,
            }
            # The token is only stored when the assembly has one.
            if info.public_key_token:
                asm_props["public_key_token"] = info.public_key_token
            nodes.append(
                GraphNode(label="Assembly", key=asm_key, props=asm_props)
            )
        for tref in info.types:
            # A trailing "." (left when the type name is empty) is stripped.
            type_key = f"{asm_key}::{tref.namespace}.{tref.name}".rstrip(".")
            if type_key in seen_type_keys:
                continue
            seen_type_keys.add(type_key)
            type_props: dict[str, object] = {
                "name": tref.name,
                "namespace": tref.namespace,
                "kind": tref.kind,
                "sealed": tref.sealed,
                "assembly": asm_key,
            }
            nodes.append(
                GraphNode(label="Type", key=type_key, props=type_props)
            )
            edges.append(
                GraphEdge(
                    type="EXPOSES_TYPE",
                    src_label="Assembly",
                    src_key=asm_key,
                    dst_label="Type",
                    dst_key=type_key,
                )
            )
        # One USES_ASSEMBLY edge per project that references this DLL path.
        for consumer in consumers:
            edges.append(
                GraphEdge(
                    type="USES_ASSEMBLY",
                    src_label="Project",
                    src_key=consumer,
                    dst_label="Assembly",
                    dst_key=asm_key,
                )
            )

    # Counters are recorded before the graph writes below.
    stats.dlls = {
        "assemblies": len(seen_assembly_keys),
        "types": len(seen_type_keys),
        "skipped": skipped,
        "unresolved": unresolved,
    }
    self.graph.upsert_nodes(nodes, head_sha=head_sha, head_ord=head_ord)
    self.graph.upsert_edges(edges, head_sha=head_sha, head_ord=head_ord)
|
|
716
|
+
|
|
717
|
+
def _index_solutions(
    self,
    root: Path,
    stats: IngestStats,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Index every solution found under ``root`` into the graph.

    For each solution returned by ``walk_solutions`` this emits:

    * a ``Solution`` node keyed by ``sln.path`` carrying ``name`` and
      ``project_count`` (the number of project entries it lists);
    * one ``MEMBER_OF`` edge per project entry, from the ``Project``
      keyed by the entry's ``csproj_path`` to the solution, carrying
      the entry's ``guid``.

    A project listed by several solutions gets one edge per solution.
    ``stats.solutions`` records the number of solutions and the number
    of membership edges requested.  If the walk raises, the error is
    appended to ``stats.notes`` and nothing is written.

    NOTE(review): whether edges whose ``Project`` endpoint was never
    indexed are dropped is decided by ``upsert_edges`` — confirm there.
    """
    try:
        found = walk_solutions(root)
    except Exception as e:  # noqa: BLE001
        # Solution indexing is optional; record the reason and carry on.
        stats.notes.append(f"sln indexing skipped: {e}")
        return
    if not found:
        return

    solution_nodes: list[GraphNode] = [
        GraphNode(
            label="Solution",
            key=sln.path,
            props={"name": sln.name, "project_count": len(sln.projects)},
        )
        for sln in found
    ]
    membership_edges: list[GraphEdge] = [
        GraphEdge(
            type="MEMBER_OF",
            src_label="Project",
            src_key=member.csproj_path,
            dst_label="Solution",
            dst_key=sln.path,
            props={"guid": member.guid},
        )
        for sln in found
        for member in sln.projects
    ]

    stats.solutions = {
        "solutions": len(found),
        "memberships": len(membership_edges),
    }
    self.graph.upsert_nodes(solution_nodes, head_sha=head_sha, head_ord=head_ord)
    self.graph.upsert_edges(membership_edges, head_sha=head_sha, head_ord=head_ord)
|
|
781
|
+
|
|
782
|
+
def _index_file_containment(
    self,
    projects: list[CsprojInfo],
    stats: IngestStats,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Link each .NET ``File`` node to the ``Project`` that owns it.

    Ownership goes to the project whose directory is the deepest
    ancestor of the file, so with nested projects a file under
    ``A/Sub/`` belongs to the csproj living in ``A/Sub/`` rather than
    the one in ``A/``.  Files that sit under no project directory get
    no edge and are counted as unowned.

    Only files whose ``lang`` is csharp, fsharp, vb or razor are
    considered.  ``stats.projects`` gains ``files_assigned`` and
    ``files_unowned``; both are left untouched when there are no
    projects or no matching files.
    """
    candidates = [(str(Path(p.path).parent.resolve()), p.path) for p in projects]
    if not candidates:
        return
    # Longest directory first, so the first boundary match found by
    # ``_owning_project`` is the deepest one.  The sort is stable, so
    # equal-length directories keep their input order.
    candidates.sort(key=lambda pair: len(pair[0]), reverse=True)

    result = self.graph.graph.query(
        "MATCH (f:File) "
        "WHERE f.lang IN ['csharp', 'fsharp', 'vb', 'razor'] "
        "RETURN f.key"
    )
    file_keys = [record[0] for record in result.result_set]
    if not file_keys:
        return

    ownership = [(fk, _owning_project(fk, candidates)) for fk in file_keys]
    containment: list[GraphEdge] = [
        GraphEdge(
            type="CONTAINED_IN",
            src_label="File",
            src_key=fk,
            dst_label="Project",
            dst_key=owner,
        )
        for fk, owner in ownership
        if owner is not None
    ]
    if containment:
        self.graph.upsert_edges(containment, head_sha=head_sha, head_ord=head_ord)

    if stats.projects is None:
        stats.projects = {}
    stats.projects["files_assigned"] = len(containment)
    stats.projects["files_unowned"] = len(ownership) - len(containment)
|
|
851
|
+
|
|
852
|
+
def _upsert_dotnet_projects(
    self,
    projects: list[CsprojInfo],
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Write project topology (projects, packages, references) to the graph.

    Per project this emits a ``Project`` node keyed by its path, a
    ``PROJECT_REFERENCES`` edge to every referenced project, and a
    ``PACKAGE_REFERENCES`` edge to every referenced package (carrying
    ``version`` when one is declared).  ``Package`` nodes are keyed by
    package name and emitted once per call.
    """
    graph_nodes: list[GraphNode] = []
    graph_edges: list[GraphEdge] = []
    known_packages: set[str] = set()

    for project in projects:
        project_props: dict[str, object] = {
            "name": project.name,
            "assembly_name": project.assembly_name or project.name,
            "sdk_style": project.sdk_style,
        }
        if project.target_framework:
            project_props["target_framework"] = project.target_framework
        graph_nodes.append(
            GraphNode(label="Project", key=project.path, props=project_props)
        )

        for target in project.project_references:
            # Emit a bare node for the referenced project up front so
            # the edge has a destination even if that project has not
            # been visited yet; this relies on the node upsert being
            # idempotent and avoids a second pass.
            graph_nodes.append(GraphNode(label="Project", key=target))
            graph_edges.append(
                GraphEdge(
                    type="PROJECT_REFERENCES",
                    src_label="Project",
                    src_key=project.path,
                    dst_label="Project",
                    dst_key=target,
                )
            )

        for package in project.package_references:
            package_key = package.name
            if package_key not in known_packages:
                known_packages.add(package_key)
                graph_nodes.append(
                    GraphNode(
                        label="Package",
                        key=package_key,
                        props={"name": package.name},
                    )
                )
            reference_props: dict[str, object] = (
                {"version": package.version} if package.version else {}
            )
            graph_edges.append(
                GraphEdge(
                    type="PACKAGE_REFERENCES",
                    src_label="Project",
                    src_key=project.path,
                    dst_label="Package",
                    dst_key=package_key,
                    props=reference_props,
                )
            )

    self.graph.upsert_nodes(graph_nodes, head_sha=head_sha, head_ord=head_ord)
    self.graph.upsert_edges(graph_edges, head_sha=head_sha, head_ord=head_ord)
|
|
911
|
+
|
|
912
|
+
def _purge_project_index(self, root: Path) -> None:
    """Drop the code vectors, the graph and the ingest state for ``root``.

    Used ahead of a full re-ingest so that entries for paths which are
    no longer indexed (for example ones now excluded by ``.gitignore``
    or ``ignore_dirs``) do not survive in retrieval.  Only the code
    collection is recreated; the episodes collection is not touched.
    """
    code_collection = self.cfg.qdrant_code
    self.vector.recreate_collection(code_collection)
    self.graph.clear_graph()
    self.state.clear(root)
|
|
922
|
+
|
|
923
|
+
def _ingest_delta(
    self,
    root: Path,
    delta: git_delta.Delta,
    *,
    base_sha: str,
    head_sha: str,
    dry_run: bool,
    on_progress: ProgressCallback | None = None,
) -> IngestStats:
    """Apply an incremental git delta to the index and return its stats.

    Deleted paths are removed from the graph (and from the vector
    store unless ``skip_vectors`` is set); paths reported by
    ``delta.reingest_paths()`` are re-extracted and re-ingested.  With
    ``dry_run`` nothing is written: files are only extracted so the
    returned counters show what a real run would touch.
    """
    stats = IngestStats(mode="incremental", base_sha=base_sha, head_sha=head_sha)
    sanity = SanitySummary()

    # Resolve the ordinal a single time; it costs a git roundtrip that
    # would otherwise be paid for every deleted or re-ingested file.
    head_ord = None
    if head_sha:
        head_ord = git_delta.commit_ordinal(root, head_sha)

    reingest = list(delta.reingest_paths())
    label = "incremental ingest"
    if dry_run:
        label += " (dry-run)"
    hb = _Heartbeat(label, total=len(reingest), on_progress=on_progress)

    for gone in delta.deleted:
        stats.deleted += 1
        if not dry_run:
            gone_str = str(gone)
            self.graph.delete_file(gone_str, head_sha=head_sha, head_ord=head_ord)
            if not getattr(self, "skip_vectors", False):
                self.vector.delete_by_path(self.cfg.qdrant_code, gone_str)

    for path in reingest:
        ex = None
        # A path that is no longer a regular file (removed since the
        # diff was taken, or otherwise not visible) counts as skipped.
        if path.is_file():
            if dry_run:
                ex = self._extract_one(path)
            else:
                ex = self.reingest_file(path, head_sha=head_sha, head_ord=head_ord)
        if ex is None:
            stats.skipped += 1
            continue

        stats.files += 1
        stats.symbols += len(ex.symbols)
        stats.imports += len(ex.imports)
        stats.calls += len(ex.calls)
        stats.references += len(ex.references)
        # A file without symbols still yields one whole-file chunk.
        stats.chunks += len(ex.symbols) or 1
        sanity.record(ex)
        if not dry_run:
            # The heartbeat only advances for files actually written.
            hb.tick(stats)

    hb.done(stats)
    _attach_sanity(stats, sanity)
    # Project files are re-indexed on every delta: they are small and
    # the project topology can change independently of source edits.
    self._ingest_dotnet_projects(
        root, stats, dry_run=dry_run, head_sha=head_sha, head_ord=head_ord
    )
    if delta.is_empty:
        stats.notes.append("no changes since last ingest")
    return stats
|
|
998
|
+
|
|
999
|
+
@staticmethod
def _extract_one(path: Path) -> ExtractedFile | None:
    """Run the tree-sitter extractor on ``path`` without writing anything.

    Returns whatever ``extract_file`` returns; callers in this file
    treat ``None`` as "nothing to ingest for this file".
    """
    # Imported at call time rather than at module import.
    from ..extractor.treesitter import extract_file

    extracted = extract_file(path)
    return extracted
|
|
1004
|
+
|
|
1005
|
+
def _record_state(
    self,
    root: Path,
    stats: IngestStats,
    *,
    head: str | None = None,
    branch: str | None = None,
) -> None:
    """Persist the commit this ingest corresponds to.

    The SHA is taken from ``head``, else from ``stats.head_sha``, else
    looked up from the repository at ``root`` (together with the
    current branch when ``branch`` was not supplied).  When no SHA can
    be determined — ``root`` is not a git repository or the lookup
    fails — nothing is recorded.
    """
    sha = head or stats.head_sha
    must_query_git = sha is None and git_delta.is_git_repo(root)
    if must_query_git:
        try:
            sha = git_delta.head_sha(root)
            if branch is None:
                branch = git_delta.current_branch(root)
        except git_delta.GitError:
            # Any git failure, including the branch lookup, means no
            # state is stored at all.
            sha = None

    if sha is not None:
        stats.head_sha = sha
        self.state.set(root, sha=sha, branch=branch)
|
|
1025
|
+
|
|
1026
|
+
def ingest_file(
    self,
    ex: ExtractedFile,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Write one extracted file to the graph and, unless disabled, the vector store.

    The graph upsert always runs.  The vector upsert is skipped when
    the instance has a truthy ``skip_vectors`` attribute.
    """
    self._upsert_graph(ex, head_sha=head_sha, head_ord=head_ord)
    if getattr(self, "skip_vectors", False):
        return
    self._upsert_vectors(ex)
|
|
1036
|
+
|
|
1037
|
+
def reingest_file(
    self,
    path: str | Path,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> ExtractedFile | None:
    """Re-extract ``path`` and replace its existing graph / vector entries.

    Returns the extraction result, or ``None`` (leaving the index
    untouched) when the extractor produces nothing for the file.
    """
    from ..extractor.treesitter import extract_file

    ex = extract_file(path)
    if ex is not None:
        if head_sha is None:
            # Callers such as per-file save hooks do not know the SHA;
            # resolve it best-effort from the file's directory so the
            # writes below are still stamped.  This also replaces any
            # ``head_ord`` that was passed in.
            head_sha, head_ord = _resolve_head(Path(ex.path).parent)
        # Remove the old entries first, then write the fresh ones.
        self.graph.delete_file(ex.path, head_sha=head_sha, head_ord=head_ord)
        if not getattr(self, "skip_vectors", False):
            self.vector.delete_by_path(self.cfg.qdrant_code, ex.path)
        self.ingest_file(ex, head_sha=head_sha, head_ord=head_ord)
    return ex
|
|
1059
|
+
|
|
1060
|
+
def delete_paths(
|
|
1061
|
+
self,
|
|
1062
|
+
paths: Iterable[Path | str],
|
|
1063
|
+
*,
|
|
1064
|
+
head_sha: str | None = None,
|
|
1065
|
+
head_ord: int | None = None,
|
|
1066
|
+
) -> int:
|
|
1067
|
+
"""Remove ``paths`` from graph + vector index.
|
|
1068
|
+
|
|
1069
|
+
Mirrors the deletion branch of ``ingest_delta`` so callers that
|
|
1070
|
+
already know which files vanished (file-save hooks, dirty-only
|
|
1071
|
+
sync) can prune without recomputing a full git delta. When
|
|
1072
|
+
``head_sha`` is omitted we resolve it once from the first path's
|
|
1073
|
+
repo so the temporal stamp still lands.
|
|
1074
|
+
"""
|
|
1075
|
+
path_list = [str(p) for p in paths]
|
|
1076
|
+
if not path_list:
|
|
1077
|
+
return 0
|
|
1078
|
+
if head_sha is None and path_list:
|
|
1079
|
+
head_sha, head_ord = _resolve_head(Path(path_list[0]).parent)
|
|
1080
|
+
for path_str in path_list:
|
|
1081
|
+
self.graph.delete_file(path_str, head_sha=head_sha, head_ord=head_ord)
|
|
1082
|
+
if not getattr(self, "skip_vectors", False):
|
|
1083
|
+
self.vector.delete_by_path(self.cfg.qdrant_code, path_str)
|
|
1084
|
+
return len(path_list)
|
|
1085
|
+
|
|
1086
|
+
def record_episode(self, ep: Episode) -> str:
    """Store an episode and index its text in the episodes collection.

    The episode is added to the episodic store first; the id returned
    by that store is reused as the vector record id and is also the
    return value.
    """
    ep_id = self.episodic.add(ep)
    embedding = self.embedder.embed_one(episode_text(ep))
    record = VectorRecord(id=ep_id, vector=embedding, payload=episode_payload(ep))
    self.vector.upsert(self.cfg.qdrant_episodes, [record])
    return ep_id
|
|
1094
|
+
|
|
1095
|
+
def dedupe_episodes(self) -> dict[str, int]:
    """Deduplicate stored episodes and delete the vectors of removed ones.

    ``self.episodic.dedupe()`` returns a mapping whose values are the
    ids that were removed.  Those ids are then deleted from the
    episodes vector collection (unless ``skip_vectors`` is set) so the
    two stores stay aligned.

    Returns ``{"removed": <number of ids>, "groups": <number of map entries>}``.
    """
    removed_map = self.episodic.dedupe()
    removed_ids: list[str] = [
        ep_id for group in removed_map.values() for ep_id in group
    ]
    vectors_enabled = not getattr(self, "skip_vectors", False)
    if removed_ids and vectors_enabled:
        self.vector.delete_by_ids(self.cfg.qdrant_episodes, removed_ids)
    return {"removed": len(removed_ids), "groups": len(removed_map)}
|
|
1109
|
+
|
|
1110
|
+
def _upsert_graph(
    self,
    ex: ExtractedFile,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Translate one extracted file into graph nodes and edges and upsert them.

    Emitted, all with the ``File`` node as edge source:

    * ``DEFINES``    -> one ``Symbol`` node per extracted symbol;
    * ``IMPORTS``    -> one ``Module`` node per distinct import;
    * ``CALLS``      -> placeholder ``Symbol`` (``name::<callee>``),
      one edge per distinct ``(name, arity, receiver_type)``;
    * ``REFERENCES`` -> placeholder ``Symbol`` per distinct referenced name;
    * ``INJECTS``    -> placeholder ``Symbol`` per distinct injected name.

    Placeholder nodes and the edges pointing at them are flagged
    ``unresolved``.
    """
    source = ex.path

    def link(edge_type: str, dst_label: str, dst_key: str, **extra: Any) -> GraphEdge:
        # Every edge built here starts at this file's node.
        return GraphEdge(
            type=edge_type,
            src_label="File",
            src_key=source,
            dst_label=dst_label,
            dst_key=dst_key,
            **extra,
        )

    def placeholder(name: str) -> GraphNode:
        # Name-only stand-in for a target that is not resolved here.
        return GraphNode(
            label="Symbol",
            key=f"name::{name}",
            props={"name": name, "unresolved": True},
        )

    nodes: list[GraphNode] = [
        GraphNode(
            label="File",
            key=source,
            props={"lang": ex.lang, "generated": ex.generated},
        )
    ]
    edges: list[GraphEdge] = []

    for sym in ex.symbols:
        sym_key = _symbol_key(source, sym)
        sym_props: dict[str, object] = {
            "name": sym.name,
            "kind": sym.kind,
            "start": sym.start_line,
            "end": sym.end_line,
            "file": source,
        }
        if sym.namespace:
            sym_props["namespace"] = sym.namespace
        if sym.partial:
            # A partial declaration is spread over several files, so
            # ``file`` / ``start`` / ``end`` describe just one part;
            # the flag tells consumers that sibling parts exist.
            sym_props["partial"] = True
        if sym.param_count is not None:
            sym_props["params"] = sym.param_count
        nodes.append(GraphNode(label="Symbol", key=sym_key, props=sym_props))
        edges.append(link("DEFINES", "Symbol", sym_key))

    # ``dict.fromkeys`` removes repeats while keeping first-seen order.
    for module in dict.fromkeys(ex.imports):
        nodes.append(GraphNode(label="Module", key=module))
        edges.append(link("IMPORTS", "Module", module))

    # Calls are de-duplicated on (name, arity, receiver type): repeated
    # call sites of ``Run()`` collapse into one edge, while ``Run()``
    # and ``Run(x)`` each keep their own so the arity stays available
    # for telling overloads apart later.
    handled_calls: set[tuple[str, int, str | None]] = set()
    for call in ex.calls:
        signature = (call.name, call.arity, call.receiver_type)
        if signature in handled_calls:
            continue
        handled_calls.add(signature)
        call_props: dict[str, Any] = {"unresolved": True, "args": call.arity}
        if call.receiver_type:
            call_props["receiver_type"] = call.receiver_type
        edges.append(link("CALLS", "Symbol", f"name::{call.name}", props=call_props))
        nodes.append(placeholder(call.name))

    # REFERENCES covers type-position uses and INJECTS covers
    # dependency-injection declarations.  Both target the same
    # placeholder pool as CALLS but keep their own edge type, so the
    # relationships stay distinguishable in the graph.
    for edge_type, names in (("REFERENCES", ex.references), ("INJECTS", ex.injects)):
        for name in dict.fromkeys(names):
            edges.append(
                link(edge_type, "Symbol", f"name::{name}", props={"unresolved": True})
            )
            nodes.append(placeholder(name))

    self.graph.upsert_nodes(nodes, head_sha=head_sha, head_ord=head_ord)
    self.graph.upsert_edges(edges, head_sha=head_sha, head_ord=head_ord)
|
|
1262
|
+
|
|
1263
|
+
def _embed_and_upsert(
    self, pending: list[tuple[ExtractedFile, _Chunk]]
) -> None:
    """Embed a batch of chunks from any number of files and store them at once.

    All chunk texts go to the embedder in a single call and all
    resulting records go to the code collection in a single upsert.
    Vectors are paired with chunks by position; a length mismatch
    raises ``ValueError`` (``zip(..., strict=True)``) before anything
    is written.  An empty ``pending`` list is a no-op.
    """
    if not pending:
        return

    vectors = self.embedder.embed([chunk.text for _, chunk in pending])
    records = []
    for (ex, chunk), vec in zip(pending, vectors, strict=True):
        records.append(
            VectorRecord(
                id=_id(ex.path, chunk.key),
                vector=vec,
                payload={
                    "path": ex.path,
                    "lang": ex.lang,
                    "kind": chunk.kind,
                    "name": chunk.name,
                    "start": chunk.start,
                    "end": chunk.end,
                    "generated": ex.generated,
                },
            )
        )
    self.vector.upsert(self.cfg.qdrant_code, records)
|
|
1294
|
+
|
|
1295
|
+
def _upsert_vectors(self, ex: ExtractedFile, batch_size: int = 32) -> None:
    """Embed the chunks of a single file and upsert them in batches.

    Chunks come from ``_chunks_for(ex)``.  Each group of at most
    ``batch_size`` chunks is embedded and written on its own, so one
    embedder call and one upsert happen per batch.
    """
    chunks = list(_chunks_for(ex))
    if not chunks:
        return

    batches = (
        chunks[offset : offset + batch_size]
        for offset in range(0, len(chunks), batch_size)
    )
    for batch in batches:
        vectors = self.embedder.embed([chunk.text for chunk in batch])
        records = []
        # ``strict=True`` turns a vector/chunk count mismatch into an
        # error instead of silently dropping the tail.
        for chunk, vec in zip(batch, vectors, strict=True):
            records.append(
                VectorRecord(
                    id=_id(ex.path, chunk.key),
                    vector=vec,
                    payload={
                        "path": ex.path,
                        "lang": ex.lang,
                        "kind": chunk.kind,
                        "name": chunk.name,
                        "start": chunk.start,
                        "end": chunk.end,
                        "generated": ex.generated,
                    },
                )
            )
        self.vector.upsert(self.cfg.qdrant_code, records)
|
|
1319
|
+
|
|
1320
|
+
|
|
1321
|
+
def _resolve_head(root: str | Path) -> tuple[str | None, int | None]:
    """Return ``(head_sha, head_ord)`` for ``root``, or ``(None, None)``.

    ``(None, None)`` is returned when ``root`` is not a git repository,
    when reading HEAD raises ``GitError``, or when the SHA comes back
    empty, so callers can fall back to unstamped writes.  The ordinal
    is whatever ``git_delta.commit_ordinal`` reports for the SHA; it
    serves as a cheap integer for ordering commits.
    """
    repo = Path(root)
    if git_delta.is_git_repo(repo):
        try:
            sha = git_delta.head_sha(repo)
        except git_delta.GitError:
            sha = None
        if sha:
            return sha, git_delta.commit_ordinal(repo, sha)
    return None, None
|
|
1341
|
+
|
|
1342
|
+
|
|
1343
|
+
def _owning_project(
|
|
1344
|
+
file_path: str, proj_dirs: list[tuple[str, str]]
|
|
1345
|
+
) -> str | None:
|
|
1346
|
+
"""Return the project key whose directory is the deepest prefix of ``file_path``.
|
|
1347
|
+
|
|
1348
|
+
``proj_dirs`` must already be sorted by descending directory-length
|
|
1349
|
+
so the first match wins. ``None`` means the file lives outside any
|
|
1350
|
+
indexed project.
|
|
1351
|
+
"""
|
|
1352
|
+
abs_path = str(Path(file_path).resolve())
|
|
1353
|
+
for dir_, proj_key in proj_dirs:
|
|
1354
|
+
# Match on the directory boundary (``dir/file.cs``) — substring
|
|
1355
|
+
# without the trailing separator would treat ``/A/B.csproj`` as
|
|
1356
|
+
# owning files under ``/A/Beta/`` which it doesn't.
|
|
1357
|
+
prefix = dir_.rstrip("/") + "/"
|
|
1358
|
+
if abs_path.startswith(prefix):
|
|
1359
|
+
return proj_key
|
|
1360
|
+
return None
|
|
1361
|
+
|
|
1362
|
+
|
|
1363
|
+
def _symbol_key(path: str, sym: Symbol) -> str:
    """Return the graph key under which ``sym`` is stored.

    * A partial symbol with a namespace gets ``partial::{ns}.{name}``.
      The key does not include the file, so every file declaring a
      part maps to the same Symbol node.
    * Anything else gets the file-scoped ``{path}::{name}#{start_line}``.
      That includes partial symbols without a namespace, which keeps
      unrelated declarations from sharing a key.
    """
    shared_across_files = bool(sym.partial and sym.namespace)
    if not shared_across_files:
        return f"{path}::{sym.name}#{sym.start_line}"
    return f"partial::{sym.namespace}.{sym.name}"
|
|
1380
|
+
|
|
1381
|
+
|
|
1382
|
+
def _attach_sanity(stats: IngestStats, sanity: SanitySummary) -> None:
    """Copy the sanity-check summary onto ``stats`` and flag suspicious runs.

    Does nothing when no symbols were checked.  Otherwise
    ``stats.sanity`` receives the checked / failed counts, the failure
    rate rounded to four decimals, and the sampled violations.  When
    the rate exceeds ``SUSPECT_THRESHOLD`` a note is appended to
    ``stats.notes`` pointing at a possible offset-accounting problem in
    the extractor.
    """
    checked = sanity.symbols_checked
    if checked == 0:
        return

    rate = sanity.failure_rate
    failed = sanity.symbols_failed
    samples = []
    for violation in sanity.sample_violations:
        samples.append(
            {
                "path": violation.path,
                "name": violation.name,
                "kind": violation.kind,
                "line": violation.start_line,
            }
        )
    stats.sanity = {
        "checked": checked,
        "failed": failed,
        "failure_rate": round(rate, 4),
        "samples": samples,
    }

    if rate > SUSPECT_THRESHOLD:
        stats.notes.append(
            f"sanity: {failed}/{checked} "
            f"plain-identifier symbols ({rate * 100:.1f}%) did not round-trip; "
            "extractor may be miscounting offsets — see stats.sanity.samples"
        )
|
|
1410
|
+
|
|
1411
|
+
|
|
1412
|
+
@dataclass
class _Chunk:
    """One unit of text sent to the embedder, plus the metadata stored with its vector."""

    # Identifier combined with the file path to derive the vector record id.
    key: str
    # The text that is actually embedded.
    text: str
    # Stored in the vector payload as ``kind`` ("file" for whole-file chunks).
    kind: str
    # Stored in the vector payload as ``name``.
    name: str
    # First line covered by the chunk; stored in the payload as ``start``.
    start: int
    # Last line covered by the chunk; stored in the payload as ``end``.
    end: int
|
|
1420
|
+
|
|
1421
|
+
|
|
1422
|
+
def _chunks_for(ex: ExtractedFile) -> Iterable[_Chunk]:
    """Yield the embedding chunks for one extracted file.

    A file with symbols yields one chunk per symbol.  A file without
    symbols yields a single whole-file chunk whose text is capped at
    the first 6000 characters of the source.
    """
    if not ex.symbols:
        source = ex.source
        yield _Chunk(
            key="file",
            text=f"FILE {ex.path}\n{source[:6000]}",
            kind="file",
            name=Path(ex.path).name,
            start=1,
            # An empty file still reports a one-line span.
            end=len(source.splitlines()) or 1,
        )
        return

    for sym in ex.symbols:
        yield _Chunk(
            key=f"{sym.name}#{sym.start_line}",
            text=_symbol_text(sym, ex.path),
            kind=sym.kind,
            name=sym.name,
            start=sym.start_line,
            end=sym.end_line,
        )
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
# Upper bound, in characters, on the snippet body included in a chunk.
MAX_SNIPPET_CHARS = 1500
# Number of leading non-blank snippet lines repeated as the signature.
SIGNATURE_LINES = 3


def _symbol_text(s: Symbol, path: str) -> str:
    """Assemble the text that gets embedded for one symbol.

    The result is the newline-joined sequence of:

    1. ``FILE <path>`` and ``KIND <kind> NAME <name>`` header lines, so
       the identifiers appear at the very start of the chunk;
    2. a ``SIGNATURE`` section holding the first ``SIGNATURE_LINES``
       non-blank snippet lines (omitted when the snippet has none) —
       these lines appear again inside the body, deliberately;
    3. the snippet itself, cut to its first ``MAX_SNIPPET_CHARS``
       characters.

    A symbol with no snippet still produces the two header lines,
    followed by an empty body.
    """
    raw = s.snippet or ""
    header = f"FILE {path}\nKIND {s.kind} NAME {s.name}"
    non_blank = [ln for ln in raw.splitlines() if ln.strip()]
    signature = "\n".join(non_blank[:SIGNATURE_LINES])

    sections = [header]
    if signature:
        sections.append("SIGNATURE\n" + signature)
    sections.append(raw[:MAX_SNIPPET_CHARS])
    return "\n".join(sections)
|