flurryx-code-memory 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. code_memory/__init__.py +1 -0
  2. code_memory/claims/__init__.py +32 -0
  3. code_memory/claims/extractor.py +325 -0
  4. code_memory/claims/indexer.py +258 -0
  5. code_memory/claims/resolver.py +186 -0
  6. code_memory/claims/store.py +424 -0
  7. code_memory/cli.py +1192 -0
  8. code_memory/config.py +268 -0
  9. code_memory/embed/__init__.py +224 -0
  10. code_memory/embed/cache.py +204 -0
  11. code_memory/embed/m3.py +174 -0
  12. code_memory/embed/ollama.py +92 -0
  13. code_memory/embed/tei.py +106 -0
  14. code_memory/episodic/__init__.py +3 -0
  15. code_memory/episodic/sqlite_store.py +278 -0
  16. code_memory/extractor/__init__.py +3 -0
  17. code_memory/extractor/csproj.py +166 -0
  18. code_memory/extractor/dll.py +385 -0
  19. code_memory/extractor/gitignore.py +162 -0
  20. code_memory/extractor/nuget.py +275 -0
  21. code_memory/extractor/sanity.py +124 -0
  22. code_memory/extractor/sln.py +108 -0
  23. code_memory/extractor/treesitter.py +1172 -0
  24. code_memory/graph/__init__.py +3 -0
  25. code_memory/graph/falkor_store.py +740 -0
  26. code_memory/mcp_server.py +1816 -0
  27. code_memory/metrics.py +260 -0
  28. code_memory/orchestrator/__init__.py +13 -0
  29. code_memory/orchestrator/git_delta.py +211 -0
  30. code_memory/orchestrator/ingest_state.py +71 -0
  31. code_memory/orchestrator/pipeline.py +1478 -0
  32. code_memory/orchestrator/reset.py +130 -0
  33. code_memory/orchestrator/resolver.py +825 -0
  34. code_memory/orchestrator/retrieve.py +505 -0
  35. code_memory/resilience.py +73 -0
  36. code_memory/sync/__init__.py +20 -0
  37. code_memory/sync/autostart/__init__.py +42 -0
  38. code_memory/sync/autostart/base.py +106 -0
  39. code_memory/sync/autostart/launchd.py +115 -0
  40. code_memory/sync/autostart/schtasks.py +155 -0
  41. code_memory/sync/autostart/systemd.py +113 -0
  42. code_memory/sync/hooks.py +164 -0
  43. code_memory/sync/safety.py +65 -0
  44. code_memory/sync/snapshot.py +461 -0
  45. code_memory/sync/store.py +399 -0
  46. code_memory/sync/sync.py +405 -0
  47. code_memory/sync/watcher.py +320 -0
  48. code_memory/vector/__init__.py +3 -0
  49. code_memory/vector/qdrant_store.py +302 -0
  50. flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
  51. flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
  52. flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
  53. flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,1478 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import os
6
+ import sys
7
+ import time
8
+ from collections.abc import Callable, Iterable
9
+ from concurrent.futures import Future, ThreadPoolExecutor
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Any, Literal
13
+
14
+ from ..config import CONFIG, Config, detect_project_slug
15
+ from ..embed import M3Embedder, get_embedder
16
+ from ..episodic import Episode, EpisodicStore
17
+ from ..episodic.sqlite_store import episode_payload, episode_text
18
+ from ..extractor import ExtractedFile, Extractor, Symbol
19
+ from ..extractor.csproj import CsprojInfo, walk_csprojs
20
+ from ..extractor.dll import parse_assembly
21
+ from ..extractor.nuget import resolve_refs
22
+ from ..extractor.sanity import SUSPECT_THRESHOLD, SanitySummary
23
+ from ..extractor.sln import walk_solutions
24
+ from ..graph import FalkorStore, GraphEdge, GraphNode
25
+ from ..vector import QdrantStore, VectorRecord
26
+ from . import git_delta
27
+ from .ingest_state import IngestStateStore
28
+ from .resolver import resolve_graph
29
+
30
+ IngestMode = Literal["auto", "full", "incremental"]
31
+
32
+ ProgressCallback = Callable[[int, int | None, str], None]
33
+
34
+
35
def _id(*parts: str) -> str:
    """Derive a stable 32-hex-character identifier from ``parts``.

    The parts are joined with a NUL separator (so ``("ab", "c")`` and
    ``("a", "bc")`` produce different ids), SHA-1 hashed, and the first
    32 hex digits of the digest are returned.
    """
    joined = "\x00".join(parts)
    digest = hashlib.sha1(joined.encode())
    return digest.hexdigest()[:32]
38
+
39
+
40
# Number of files between plain-text heartbeat lines during ingest.
# Heartbeats are written to stderr so ``--json`` output on stdout stays clean.
_PROGRESS_EVERY = int(os.getenv("CODEMEMORY_PROGRESS_EVERY", "50"))
# Progress output stays on unless CODEMEMORY_PROGRESS is exactly "0".
_PROGRESS_ENABLED = os.getenv("CODEMEMORY_PROGRESS", "1") != "0"
# Rendering style:
#   auto = rich TUI when stderr is a TTY, plain text otherwise.
#   rich = always use rich (e.g. inside a non-TTY harness that handles ANSI).
#   text = legacy throttled heartbeat lines.
#   none = silence everything.
_PROGRESS_STYLE = os.getenv("CODEMEMORY_PROGRESS_STYLE", "auto").lower()
49
+
50
+
51
def _default_progress_file() -> Path:
    """Return the path where ``_Heartbeat`` publishes its live snapshot.

    This file is the cross-process channel for the `code-memory watch`
    CLI: any ingesting process writes it on every tick and the watch CLI
    tails the same path to render a live bar.

    Resolution order:

    1. ``CODEMEMORY_PROGRESS_FILE`` (with ``~`` expanded), for tests or
       split projects.
    2. ``$XDG_STATE_HOME/code-memory/ingest-progress.json``.
    3. ``~/.cache/code-memory/ingest-progress.json``.
    """
    explicit = os.environ.get("CODEMEMORY_PROGRESS_FILE")
    if not explicit:
        state_root = os.environ.get("XDG_STATE_HOME")
        if not state_root:
            state_root = str(Path.home() / ".cache")
        return Path(state_root) / "code-memory" / "ingest-progress.json"
    return Path(explicit).expanduser()


# Resolved once at import time; later environment changes are not picked up.
_PROGRESS_FILE = _default_progress_file()
67
+
68
+
69
def _write_progress_snapshot(snap: dict[str, Any]) -> None:
    """Publish ``snap`` as JSON to ``_PROGRESS_FILE`` for the watch CLI.

    The document is written to a sibling ``.json.tmp`` file and then moved
    over the target with ``os.replace`` so a watcher never reads a
    half-written file. Every failure is swallowed on purpose: progress
    reporting must never break the ingest loop.
    """
    try:
        target = _PROGRESS_FILE
        target.parent.mkdir(parents=True, exist_ok=True)
        staging = target.with_suffix(".json.tmp")
        staging.write_text(json.dumps(snap))
        os.replace(staging, target)
    except Exception:  # noqa: BLE001 — UI errors must not abort ingest.
        return
82
+
83
+
84
def _want_rich_progress() -> bool:
    """Decide whether the rich live progress bar should be used.

    Returns ``False`` when progress is disabled or the style is ``none``
    or ``text``; ``True`` when the style is ``rich``. For any other style
    (including ``auto``) the answer is whether stderr is a TTY, with any
    error while probing treated as "not a TTY".
    """
    if not _PROGRESS_ENABLED or _PROGRESS_STYLE == "none":
        return False
    if _PROGRESS_STYLE in ("rich", "text"):
        return _PROGRESS_STYLE == "rich"
    try:
        return bool(sys.stderr.isatty())
    except Exception:
        return False
95
+
96
+
97
class _Heartbeat:
    """Render ingest progress.

    Two render paths share one API:

    * **rich** — `rich.progress.Progress` live bar on stderr with files,
      symbols, chunks, skipped counters + ETA. Used when stderr is a TTY
      (or `CODEMEMORY_PROGRESS_STYLE=rich`).
    * **text** — periodic ``files=… symbols=…`` lines on stderr. Used
      when stderr is captured (MCP stdio server, CI logs, `bash` from an
      agent harness) so ANSI escapes don't pollute the transcript.

    Independently of the render path, every ``tick``/``done`` also
    publishes a JSON snapshot via ``_write_progress_snapshot`` and, when
    given, calls the throttled ``on_progress`` callback. No rendering
    failure is allowed to propagate into the ingest loop.
    """

    def __init__(
        self,
        label: str,
        *,
        total: int | None = None,
        on_progress: ProgressCallback | None = None,
    ) -> None:
        """Start the clock and, when appropriate, the rich live display.

        Args:
            label: Short description shown in every progress line.
            total: Expected number of files, or ``None`` when unknown.
            on_progress: Optional ``(files, total, message)`` callback.
        """
        self.label = label
        self.total = total
        self.start = time.monotonic()
        self.last = self.start
        self._rich: Any = None
        self._task: Any = None
        self._on_progress = on_progress
        # Throttle out-of-band progress notifications so a 50k-file ingest
        # doesn't flood the MCP transport. Rich's own refresh loop is
        # already throttled internally.
        self._cb_interval = float(
            os.environ.get("CODEMEMORY_PROGRESS_NOTIFY_INTERVAL", "0.4")
        )
        self._cb_last = 0.0
        if _want_rich_progress():
            self._init_rich()

    def _init_rich(self) -> None:
        """Try to start a rich live bar; leave ``self._rich`` unset on failure."""
        try:
            from rich.console import Console
            from rich.progress import (
                BarColumn,
                MofNCompleteColumn,
                Progress,
                SpinnerColumn,
                TextColumn,
                TimeElapsedColumn,
                TimeRemainingColumn,
            )
        except Exception:  # noqa: BLE001 — rich missing, fall back to text
            return
        progress = Progress(
            SpinnerColumn(style="cyan"),
            TextColumn("[bold cyan]code-memory[/] {task.description}"),
            BarColumn(bar_width=None),
            MofNCompleteColumn(),
            TextColumn(
                "[green]{task.fields[symbols]}[/]sym "
                "[magenta]{task.fields[chunks]}[/]chk "
                "[yellow]{task.fields[skipped]}[/]skip "
                "[dim]{task.fields[rate]}/s[/]"
            ),
            TimeElapsedColumn(),
            TimeRemainingColumn(),
            console=Console(stderr=True),
            transient=False,
            refresh_per_second=8,
        )
        try:
            progress.start()
        except Exception:  # noqa: BLE001
            return
        self._rich = progress
        self._task = progress.add_task(
            self.label,
            total=self.total,
            symbols=0,
            chunks=0,
            skipped=0,
            rate="0.0",
        )

    @staticmethod
    def _emit(line: str) -> None:
        """Write one line to stderr, ignoring a closed or broken stream."""
        try:
            sys.stderr.write(line)
            sys.stderr.flush()
        except (OSError, ValueError):  # UI must not break the ingest
            pass

    def _rate(self, files: int) -> float:
        """Return files per second since construction (never divides by 0)."""
        elapsed = max(time.monotonic() - self.start, 1e-6)
        return files / elapsed

    def _snapshot(self, stats: IngestStats, *, done: bool) -> dict[str, Any]:
        """Build the JSON-serialisable document consumed by the watch CLI."""
        return {
            "label": self.label,
            "files": stats.files,
            "total": self.total,
            "symbols": stats.symbols,
            "chunks": stats.chunks,
            "skipped": stats.skipped,
            "rate": self._rate(stats.files),
            "elapsed": time.monotonic() - self.start,
            "ts": time.time(),
            "done": done,
            "pid": os.getpid(),
        }

    def _notify(self, stats: IngestStats, *, force: bool = False) -> None:
        """Invoke ``on_progress``, rate-limited unless ``force`` is set."""
        if self._on_progress is None:
            return
        now = time.monotonic()
        if not force and now - self._cb_last < self._cb_interval:
            return
        self._cb_last = now
        rate = self._rate(stats.files)
        msg = (
            f"{self.label}: files={stats.files} "
            f"symbols={stats.symbols} chunks={stats.chunks} "
            f"skipped={stats.skipped} rate={rate:.1f}/s"
        )
        try:
            self._on_progress(stats.files, self.total, msg)
        except Exception:  # noqa: BLE001 — never let UI break the ingest
            pass

    def tick(self, stats: IngestStats) -> None:
        """Report progress after one more file has been processed."""
        self._notify(stats)
        _write_progress_snapshot(self._snapshot(stats, done=False))
        if self._rich is not None:
            # Guarded like done(): a rendering error (closed stderr, rich
            # internals) must not abort a long-running ingest.
            try:
                self._rich.update(
                    self._task,
                    completed=stats.files,
                    total=self.total,
                    symbols=stats.symbols,
                    chunks=stats.chunks,
                    skipped=stats.skipped,
                    rate=f"{self._rate(stats.files):.1f}",
                )
            except Exception:  # noqa: BLE001 — never let UI break the ingest
                pass
            return
        if not _PROGRESS_ENABLED or _PROGRESS_STYLE == "none":
            return
        if _PROGRESS_EVERY <= 0:
            return
        # Text mode only prints on every _PROGRESS_EVERY-th file.
        if stats.files % _PROGRESS_EVERY != 0 or stats.files == 0:
            return
        now = time.monotonic()
        rate = self._rate(stats.files)
        eta = ""
        if self.total and rate > 0:
            remaining = max(self.total - stats.files, 0)
            eta = f" eta={remaining / rate:.0f}s"
        total_part = f"/{self.total}" if self.total else ""
        self._emit(
            f"[code-memory] {self.label}: files={stats.files}{total_part} "
            f"symbols={stats.symbols} chunks={stats.chunks} "
            f"skipped={stats.skipped} rate={rate:.1f}/s{eta}\n"
        )
        self.last = now

    def done(self, stats: IngestStats) -> None:
        """Emit the final report and tear down the rich display, if any."""
        self._notify(stats, force=True)
        _write_progress_snapshot(self._snapshot(stats, done=True))
        if self._rich is not None:
            try:
                self._rich.update(
                    self._task,
                    completed=stats.files,
                    # Give the bar a finite total so it renders as complete.
                    total=self.total or stats.files or 1,
                    symbols=stats.symbols,
                    chunks=stats.chunks,
                    skipped=stats.skipped,
                    rate=f"{self._rate(stats.files):.1f}",
                )
                self._rich.stop()
            except Exception:  # noqa: BLE001
                pass
            self._rich = None
            self._task = None
            return
        if not _PROGRESS_ENABLED or _PROGRESS_STYLE == "none":
            return
        elapsed = time.monotonic() - self.start
        self._emit(
            f"[code-memory] {self.label} done: files={stats.files} "
            f"symbols={stats.symbols} chunks={stats.chunks} "
            f"skipped={stats.skipped} elapsed={elapsed:.1f}s\n"
        )
280
+
281
+
282
@dataclass
class IngestStats:
    """Counters and metadata describing one ingest run.

    Field order is part of the interface (positional construction), so
    new fields must only ever be appended.
    """

    # Per-file extraction counters accumulated while walking the repo.
    files: int = 0
    symbols: int = 0
    imports: int = 0
    calls: int = 0
    references: int = 0
    chunks: int = 0
    deleted: int = 0
    skipped: int = 0

    # How the run was performed: "full" by default; ``ingest_repo`` sets
    # "incremental" for git-delta runs.
    mode: str = "full"
    base_sha: str | None = None
    head_sha: str | None = None

    # Optional summaries; each stays ``None`` until the matching pass runs.
    resolver: dict[str, int] | None = None
    sanity: dict[str, object] | None = None
    projects: dict[str, int] | None = None
    dlls: dict[str, int] | None = None
    solutions: dict[str, int] | None = None

    # Human-readable remarks (fallbacks taken, passes skipped, ...).
    notes: list[str] = field(default_factory=list)
301
+
302
+
303
+ class Pipeline:
304
+ """Coordinator: extractor -> graph + vectors + episodes."""
305
+
306
def __init__(
    self,
    project: str | None = None,
    embedder: M3Embedder | None = None,
    vector: QdrantStore | None = None,
    graph: FalkorStore | None = None,
    episodic: EpisodicStore | None = None,
    skip_vectors: bool = False,
) -> None:
    """Wire up the stores for one project.

    Args:
        project: Project slug; detected via ``detect_project_slug()``
            when omitted.
        embedder: Embedder to use; defaults to ``get_embedder()``.
        vector: Vector store; defaults to a fresh ``QdrantStore``.
        graph: Graph store; defaults to a ``FalkorStore`` on the
            project's configured graph name.
        episodic: Episodic store; defaults to one on the project's
            configured database path.
        skip_vectors: When true, the Qdrant collections are not probed
            or created here.
    """
    slug = project or detect_project_slug()
    cfg: Config = CONFIG.for_project(slug)
    self.slug = slug
    self.cfg = cfg
    self.skip_vectors = skip_vectors
    self.embedder = embedder or get_embedder()
    self.vector = vector or QdrantStore()
    self.graph = graph or FalkorStore(graph_name=cfg.falkor_graph)
    self.episodic = episodic or EpisodicStore(path=cfg.episodic_db)
    # Skip the Qdrant probes too when ``skip_vectors``: large-repo
    # operators who deliberately turn off the vector layer shouldn't
    # have to keep Qdrant alive.
    if not self.skip_vectors:
        for collection in (cfg.qdrant_code, cfg.qdrant_episodes):
            self.vector.ensure_collection(collection)
    self.graph.ensure_indexes()
    self.state = IngestStateStore(cfg.episodic_db)
330
+
331
def ingest_repo(
    self,
    root: str | Path,
    *,
    mode: IngestMode = "auto",
    since: str | None = None,
    dry_run: bool = False,
    on_progress: ProgressCallback | None = None,
) -> IngestStats:
    """Ingest a repository and return the run's statistics.

    Args:
        root: Repository directory; resolved to an absolute path.
        mode:
            - "auto": git-incremental if prior state exists and the base
              is reachable, else a full walk.
            - "full": purge this project's vectors+graph+ingest_state,
              then walk every file. Use to rebuild from scratch.
            - "incremental": require git + a base; raise if unavailable.
        since: Explicit base ref (branch/tag/sha). Overrides stored
            state when set.
        dry_run: Compute the plan and return stats with notes; the
            resolver is not run and no ingest state is recorded.
        on_progress: Optional progress callback forwarded to the walk.

    Raises:
        RuntimeError: if a git-based mode is requested outside a git
            repository, or the base cannot be determined.
    """
    repo = Path(root).resolve()
    in_git = git_delta.is_git_repo(repo)
    persist = not dry_run

    def full_walk() -> IngestStats:
        return self._ingest_full(
            repo, dry_run=dry_run, on_progress=on_progress
        )

    if mode == "full" or (mode == "auto" and not in_git):
        stats = full_walk()
        if persist:
            self._run_resolver(stats)
            if in_git:
                self._record_state(repo, stats)
        return stats

    # Everything below relies on git history.
    if not in_git:
        raise RuntimeError(f"{repo} is not a git repository (mode={mode!r})")

    head = git_delta.head_sha(repo)
    branch = git_delta.current_branch(repo)
    base = self._resolve_base(repo, since=since, mode=mode)

    if base is not None:
        # Incremental: only touch what changed since ``base``.
        delta = git_delta.changed_since(repo, base, include_dirty=True)
        stats = self._ingest_delta(
            repo,
            delta,
            base_sha=base,
            head_sha=head,
            dry_run=dry_run,
            on_progress=on_progress,
        )
        stats.mode = "incremental"
        if persist:
            # The resolver scans the whole graph, so it is a fixed cost
            # we'd rather skip on no-op delta runs.
            if stats.files > 0:
                self._run_resolver(stats)
            self._record_state(repo, stats, head=head, branch=branch)
        return stats

    # auto + git + no prior + no --since => full walk, then record state.
    stats = full_walk()
    stats.head_sha = head
    stats.notes.append("no prior ingest state; performed full walk")
    if persist:
        self._run_resolver(stats)
        self._record_state(repo, stats, head=head, branch=branch)
    return stats
403
+
404
+ # -- internals -------------------------------------------------------
405
+
406
def _resolve_base(
    self, root: Path, *, since: str | None, mode: IngestMode
) -> str | None:
    """Pick the commit an incremental ingest should diff against.

    Returns the resolved ``since`` ref when one is given, otherwise the
    last recorded sha for ``root`` if it is still reachable. Returns
    ``None`` when the caller should fall back to a full walk.

    Raises:
        RuntimeError: if ``since`` cannot be resolved, or if ``mode`` is
            "incremental" and no prior ingest state exists.
    """
    if since is not None:
        try:
            return git_delta.resolve_ref(root, since)
        except git_delta.GitError as e:
            raise RuntimeError(f"could not resolve --since {since!r}: {e}") from e

    prior = self.state.get(root)
    if prior is not None:
        last = prior.last_sha
        if git_delta.is_reachable(root, last):
            return last
        # history rewrite or branch deletion — drop the stale record
        # and fall back
        self.state.clear(root)
        return None

    if mode == "incremental":
        raise RuntimeError(
            f"no prior ingest state for {root}; run a full ingest first"
        )
    return None
429
+
430
def _ingest_full(
    self,
    root: Path,
    *,
    dry_run: bool,
    on_progress: ProgressCallback | None = None,
) -> IngestStats:
    """Walk every file under ``root`` and (re)build the project index.

    Unless ``dry_run`` is set, the existing project index is purged
    first, each extracted file is written to the graph, and (unless
    ``skip_vectors``) its chunks are embedded and upserted in batches
    on a small thread pool. The .NET project pass runs afterwards.

    Args:
        root: Repository root to walk.
        dry_run: Only count; do not purge or write anything.
        on_progress: Optional progress callback for the heartbeat.

    Returns:
        The populated ``IngestStats`` (``mode="full"``).

    Raises:
        Exception: whatever an embed/upsert batch raised; the worker
            pool is always shut down before the error propagates.
    """
    extractor = Extractor()
    stats = IngestStats(mode="full")
    sanity = SanitySummary()
    head_sha, head_ord = _resolve_head(root)
    stats.head_sha = head_sha
    if not dry_run:
        self._purge_project_index(root)
    hb = _Heartbeat(
        "full ingest" + (" (dry-run)" if dry_run else ""),
        on_progress=on_progress,
    )
    vectors_enabled = not getattr(self, "skip_vectors", False)

    # Buffer chunks across files so the embedder sees a large batch
    # per call, then fan the Qdrant upserts out to a small thread
    # pool so they overlap with the next batch's embedding work.
    # On a cold ingest, embed (Ollama HTTP, serial) dominates; the
    # qdrant upsert (network + index write) blocks for ~80-150 ms
    # per batch — pipelining lets that happen while the next embed
    # batch is in flight. On a warm ingest (cache hits), embed
    # returns instantly and qdrant + graph become the path, so the
    # same pool keeps Qdrant from blocking the graph layer.
    pending_chunks: list[tuple[ExtractedFile, _Chunk]] = []
    EMBED_BATCH = 64
    UPSERT_POOL_SIZE = 2
    UPSERT_QUEUE_MAX = 4
    in_flight: list[Future] = []

    # ``with`` guarantees the pool is shut down even when a batch or the
    # extractor raises; previously an error leaked the worker threads.
    with ThreadPoolExecutor(max_workers=UPSERT_POOL_SIZE) as upsert_executor:

        def _await_one() -> None:
            if not in_flight:
                return
            fut = in_flight.pop(0)
            fut.result()  # propagate exceptions

        def _flush_pending() -> None:
            if not pending_chunks:
                return
            batch = list(pending_chunks)
            pending_chunks.clear()
            fut = upsert_executor.submit(self._embed_and_upsert, batch)
            in_flight.append(fut)
            # Bound queue so upserts don't fall arbitrarily behind embed.
            while len(in_flight) >= UPSERT_QUEUE_MAX:
                _await_one()

        for ex in extractor.walk(root):
            stats.files += 1
            stats.symbols += len(ex.symbols)
            stats.imports += len(ex.imports)
            stats.calls += len(ex.calls)
            stats.references += len(ex.references)
            # A file without symbols still counts as one chunk.
            stats.chunks += len(ex.symbols) or 1
            sanity.record(ex)
            if not dry_run:
                # Graph upserts are cheap (UNWIND-batched per call) and
                # need to stay per-file so the temporal stamping order
                # matches the walk. Vector work defers to the buffer.
                self._upsert_graph(ex, head_sha=head_sha, head_ord=head_ord)
                if vectors_enabled:
                    for c in _chunks_for(ex):
                        pending_chunks.append((ex, c))
                    if len(pending_chunks) >= EMBED_BATCH:
                        _flush_pending()
            hb.tick(stats)
        if vectors_enabled:
            _flush_pending()
        # Drain the pool so the resolver + .NET-project pass sees a
        # quiescent Qdrant; the pool itself is dropped on leaving the
        # ``with`` block, before the .NET-project pass below.
        while in_flight:
            _await_one()
    hb.done(stats)
    _attach_sanity(stats, sanity)
    self._ingest_dotnet_projects(
        root, stats, dry_run=dry_run, head_sha=head_sha, head_ord=head_ord
    )
    return stats
515
+
516
def _run_resolver(self, stats: IngestStats) -> None:
    """Resolve placeholder ``name::X`` Symbol nodes to real symbols.

    On success the resolver's counters are stored in ``stats.resolver``
    so callers can see how much of the call graph is now grounded vs.
    ambiguous. On any failure a "resolver skipped" note is appended
    instead — failures are non-fatal because ingest data is already
    persisted.
    """
    try:
        outcome = resolve_graph(self.graph)
    except Exception as e:
        stats.notes.append(f"resolver skipped: {e}")
    else:
        stats.resolver = dict(
            placeholders=outcome.placeholders,
            edges_total=outcome.edges_total,
            resolved_same_file=outcome.edges_resolved_same_file,
            resolved_imported=outcome.edges_resolved_imported,
            resolved_unique=outcome.edges_resolved_unique,
            resolved_assembly=outcome.edges_resolved_assembly,
            ambiguous=outcome.edges_left_ambiguous,
            external=outcome.edges_left_external,
            placeholders_deleted=outcome.placeholders_deleted,
            import_aliases_added=outcome.import_aliases_added,
        )
540
+
541
def _ingest_dotnet_projects(
    self,
    root: Path,
    stats: IngestStats,
    *,
    dry_run: bool,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Walk `.csproj`/`.fsproj`/`.vbproj` and emit Project topology.

    Adds three node/edge kinds to the graph:

    * ``Project`` nodes keyed by absolute path.
    * ``PROJECT_REFERENCES`` edges (Project → Project) from every
      ``<ProjectReference>``. Targets outside the repo or unparseable
      are silently dropped — see ``parse_csproj``.
    * ``PACKAGE_REFERENCES`` edges (Project → Package) from every
      ``<PackageReference>``. ``Package`` is a new label so NuGet
      packages don't pollute the ``Module`` namespace (which holds
      `using` import targets).

    After the topology is written, the referenced-assembly, file
    containment and solution passes run in that order. With
    ``dry_run`` only ``stats.projects`` is filled in.

    Non-.NET repos see zero ``.csproj`` files and this is a no-op.
    A failure while discovering projects is non-fatal (recorded as a
    note): source ingest already happened.
    """
    try:
        projects = walk_csprojs(root)
    except Exception as e:  # noqa: BLE001
        stats.notes.append(f"csproj indexing skipped: {e}")
        return
    if not projects:
        return

    project_ref_total = 0
    package_ref_total = 0
    for p in projects:
        project_ref_total += len(p.project_references)
        package_ref_total += len(p.package_references)
    stats.projects = {
        "projects": len(projects),
        "project_refs": project_ref_total,
        "package_refs": package_ref_total,
    }
    if dry_run:
        return

    stamp = {"head_sha": head_sha, "head_ord": head_ord}
    self._upsert_dotnet_projects(projects, **stamp)
    self._index_referenced_assemblies(projects, stats, **stamp)
    self._index_file_containment(projects, stats, **stamp)
    self._index_solutions(root, stats, **stamp)
593
+
594
def _index_referenced_assemblies(
    self,
    projects: list[CsprojInfo],
    stats: IngestStats,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Parse referenced DLLs and index their public type surface.

    Layer on top of the csproj topology. This step turns the logical
    ``<PackageReference>`` and ``<ProjectReference>`` into concrete
    ``.dll`` paths, parses each via
    :func:`code_memory.extractor.dll.parse_assembly`, and writes:

    * ``Assembly`` nodes keyed by the assembly identity
      (``"Name, Version=X.Y.Z.W"``). Two versions of the same lib stay
      distinct so the agent can see when projects pin different
      versions of the same dep.
    * ``Type`` nodes keyed by ``"{assembly_id}::{Namespace}.{Name}"``.
    * ``USES_ASSEMBLY`` edges (Project → Assembly).
    * ``EXPOSES_TYPE`` edges (Assembly → Type).

    DLL resolution leans on ``code_memory.extractor.nuget``. DLLs that
    fail to parse are counted, not raised. ``stats.dlls`` carries the
    counters (``assemblies``, ``types``, ``skipped``, ``unresolved``)
    so users see how much of the binary surface was indexed.
    """
    # Dedupe DLL paths across the whole solution so a shared
    # dependency parses exactly once even when many projects pull
    # the same file. ``unresolved`` counts PackageReferences we
    # couldn't locate (offline machine, unrestored NuGet cache).
    consumers_by_dll: dict[str, set[str]] = {}
    unresolved = 0
    for proj in projects:
        refs = resolve_refs(proj)
        for dll in refs.all_paths():
            consumers_by_dll.setdefault(str(dll), set()).add(proj.path)
        unresolved += sum(
            1
            for pkg in proj.package_references
            if pkg.name not in refs.package_dlls
        )

    if not consumers_by_dll:
        stats.dlls = {
            "assemblies": 0,
            "types": 0,
            "skipped": 0,
            "unresolved": unresolved,
        }
        return

    nodes: list[GraphNode] = []
    edges: list[GraphEdge] = []
    seen_assembly_keys: set[str] = set()
    seen_type_keys: set[str] = set()
    skipped = 0

    for dll_path, consumers in consumers_by_dll.items():
        info = parse_assembly(dll_path)
        if info is None:
            skipped += 1
            continue
        asm_key = info.identity

        # One Assembly node per identity, even when it was found at
        # several paths.
        if asm_key not in seen_assembly_keys:
            seen_assembly_keys.add(asm_key)
            asm_props: dict[str, object] = {
                "name": info.name,
                "version": info.version,
                "path": info.path,
            }
            if info.public_key_token:
                asm_props["public_key_token"] = info.public_key_token
            nodes.append(
                GraphNode(label="Assembly", key=asm_key, props=asm_props)
            )

        for tref in info.types:
            # NOTE(review): rstrip(".") only trims a trailing dot; a type
            # with an empty namespace still gets a "::.Name" key — confirm
            # this is what the resolver expects.
            type_key = f"{asm_key}::{tref.namespace}.{tref.name}".rstrip(".")
            if type_key in seen_type_keys:
                continue
            seen_type_keys.add(type_key)
            nodes.append(
                GraphNode(
                    label="Type",
                    key=type_key,
                    props={
                        "name": tref.name,
                        "namespace": tref.namespace,
                        "kind": tref.kind,
                        "sealed": tref.sealed,
                        "assembly": asm_key,
                    },
                )
            )
            edges.append(
                GraphEdge(
                    type="EXPOSES_TYPE",
                    src_label="Assembly",
                    src_key=asm_key,
                    dst_label="Type",
                    dst_key=type_key,
                )
            )

        edges.extend(
            GraphEdge(
                type="USES_ASSEMBLY",
                src_label="Project",
                src_key=consumer,
                dst_label="Assembly",
                dst_key=asm_key,
            )
            for consumer in consumers
        )

    stats.dlls = {
        "assemblies": len(seen_assembly_keys),
        "types": len(seen_type_keys),
        "skipped": skipped,
        "unresolved": unresolved,
    }
    self.graph.upsert_nodes(nodes, head_sha=head_sha, head_ord=head_ord)
    self.graph.upsert_edges(edges, head_sha=head_sha, head_ord=head_ord)
716
+
717
def _index_solutions(
    self,
    root: Path,
    stats: IngestStats,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Walk `.sln` files and emit Solution nodes + Project membership.

    Schema added:

    * ``Solution`` node keyed by the solution's path with ``name`` and
      ``project_count``.
    * ``MEMBER_OF`` edge (carrying the project ``guid``) from each
      listed Project to the Solution(s) that include it. A single
      project can be a member of multiple solutions; all edges are
      emitted.

    ``stats.solutions`` records the number of solutions and the total
    number of memberships. A failure while discovering solutions is
    recorded as a note and otherwise ignored; with no solutions the
    method does nothing.
    """
    try:
        solutions = walk_solutions(root)
    except Exception as e:  # noqa: BLE001
        stats.notes.append(f"sln indexing skipped: {e}")
        return
    if not solutions:
        return

    solution_nodes: list[GraphNode] = [
        GraphNode(
            label="Solution",
            key=sln.path,
            props={
                "name": sln.name,
                "project_count": len(sln.projects),
            },
        )
        for sln in solutions
    ]
    membership_edges: list[GraphEdge] = [
        GraphEdge(
            type="MEMBER_OF",
            src_label="Project",
            src_key=member.csproj_path,
            dst_label="Solution",
            dst_key=sln.path,
            props={"guid": member.guid},
        )
        for sln in solutions
        for member in sln.projects
    ]

    stats.solutions = {
        "solutions": len(solutions),
        "memberships": len(membership_edges),
    }
    self.graph.upsert_nodes(
        solution_nodes, head_sha=head_sha, head_ord=head_ord
    )
    self.graph.upsert_edges(
        membership_edges, head_sha=head_sha, head_ord=head_ord
    )
781
+
782
def _index_file_containment(
    self,
    projects: list[CsprojInfo],
    stats: IngestStats,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Tie each .NET source file to its owning ``Project`` node.

    ``File`` nodes whose ``lang`` is csharp/fsharp/vb/razor are read
    from the graph and each is matched against the project directories
    through ``_owning_project``. Directories are offered longest path
    first so that, for nested projects, the deepest one is seen before
    its parents. A ``CONTAINED_IN`` edge (File → Project) is written
    for every file that has an owner; files without one get no edge.

    ``stats.projects`` gains ``files_assigned`` / ``files_unowned`` so
    coverage is visible at a glance. Nothing is recorded when there are
    no projects or no matching files.
    """
    # Longest directory first so the deepest prefix-match wins on a
    # single linear scan per file.
    proj_dirs = [
        (str(Path(p.path).parent.resolve()), p.path) for p in projects
    ]
    proj_dirs.sort(key=lambda pair: -len(pair[0]))
    if not proj_dirs:
        return

    rows = self.graph.graph.query(
        "MATCH (f:File) "
        "WHERE f.lang IN ['csharp', 'fsharp', 'vb', 'razor'] "
        "RETURN f.key"
    ).result_set
    files = [row[0] for row in rows]
    if not files:
        return

    ownership = [(path, _owning_project(path, proj_dirs)) for path in files]
    edges: list[GraphEdge] = [
        GraphEdge(
            type="CONTAINED_IN",
            src_label="File",
            src_key=path,
            dst_label="Project",
            dst_key=owner,
        )
        for path, owner in ownership
        if owner is not None
    ]
    assigned = len(edges)
    unowned = len(files) - assigned
    if edges:
        self.graph.upsert_edges(edges, head_sha=head_sha, head_ord=head_ord)

    if stats.projects is None:
        stats.projects = {}
    stats.projects["files_assigned"] = assigned
    stats.projects["files_unowned"] = unowned
851
+
852
def _upsert_dotnet_projects(
    self,
    projects: list[CsprojInfo],
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Write Project/Package nodes and their reference edges.

    For every project this emits a ``Project`` node (name, assembly
    name, sdk-style flag and, when known, target framework), one
    ``PROJECT_REFERENCES`` edge per project reference and one
    ``PACKAGE_REFERENCES`` edge per package reference (carrying the
    version when present). ``Package`` nodes are keyed by package name
    and emitted once each.
    """
    stamp = {"head_sha": head_sha, "head_ord": head_ord}
    nodes: list[GraphNode] = []
    edges: list[GraphEdge] = []
    known_packages: set[str] = set()

    for proj in projects:
        project_props: dict[str, object] = {
            "name": proj.name,
            "assembly_name": proj.assembly_name or proj.name,
            "sdk_style": proj.sdk_style,
        }
        if proj.target_framework:
            project_props["target_framework"] = proj.target_framework
        nodes.append(
            GraphNode(label="Project", key=proj.path, props=project_props)
        )

        for target in proj.project_references:
            # Forward-reference target Project node — `upsert_nodes`
            # is idempotent, and walking all projects first then
            # writing edges would require two passes for no win.
            nodes.append(GraphNode(label="Project", key=target))
            edges.append(
                GraphEdge(
                    type="PROJECT_REFERENCES",
                    src_label="Project",
                    src_key=proj.path,
                    dst_label="Project",
                    dst_key=target,
                )
            )

        for pkg in proj.package_references:
            pkg_key = pkg.name
            if pkg_key not in known_packages:
                known_packages.add(pkg_key)
                nodes.append(
                    GraphNode(
                        label="Package",
                        key=pkg_key,
                        props={"name": pkg.name},
                    )
                )
            # The version lives on the edge, so two projects can pin
            # different versions of the same Package node.
            edge_props: dict[str, object] = (
                {"version": pkg.version} if pkg.version else {}
            )
            edges.append(
                GraphEdge(
                    type="PACKAGE_REFERENCES",
                    src_label="Project",
                    src_key=proj.path,
                    dst_label="Package",
                    dst_key=pkg_key,
                    props=edge_props,
                )
            )

    self.graph.upsert_nodes(nodes, **stamp)
    self.graph.upsert_edges(edges, **stamp)
911
+
912
def _purge_project_index(self, root: Path) -> None:
    """Drop the code vector collection, the graph, and the ingest state.

    Three stores are reset, in this order: the code collection named by
    ``self.cfg.qdrant_code`` is recreated, the graph is cleared, and the
    ingest-state entry for ``root`` is cleared. The episodes collection
    is not touched here. Per the original note, this runs before a full
    re-ingest so stale entries (e.g. paths now excluded by .gitignore or
    ignore_dirs) don't linger in retrieval.
    """
    code_collection = self.cfg.qdrant_code
    self.vector.recreate_collection(code_collection)
    self.graph.clear_graph()
    self.state.clear(root)
922
+
923
def _ingest_delta(
    self,
    root: Path,
    delta: git_delta.Delta,
    *,
    base_sha: str,
    head_sha: str,
    dry_run: bool,
    on_progress: ProgressCallback | None = None,
) -> IngestStats:
    """Apply a git delta to the index and return the resulting stats.

    Deleted paths are counted and, unless ``dry_run`` is set, removed
    from the graph and (when vectors are enabled) from the code vector
    collection. Paths returned by ``delta.reingest_paths()`` are then
    either only extracted (``dry_run``) or fully re-ingested. Counters
    on the returned :class:`IngestStats` are accumulated identically in
    both modes; the heartbeat is ticked only for real (non-dry-run)
    ingests. Project indexing is re-run at the end regardless of the
    delta's contents.
    """
    stats = IngestStats(mode="incremental", base_sha=base_sha, head_sha=head_sha)
    sanity = SanitySummary()
    # Look the ordinal up once here and pass it down, instead of having
    # each per-file delete / upsert resolve it again.
    head_ord = git_delta.commit_ordinal(root, head_sha) if head_sha else None
    reingest = list(delta.reingest_paths())

    label = "incremental ingest"
    if dry_run:
        label += " (dry-run)"
    hb = _Heartbeat(label, total=len(reingest), on_progress=on_progress)

    for removed in delta.deleted:
        removed_str = str(removed)
        stats.deleted += 1
        if dry_run:
            continue
        self.graph.delete_file(removed_str, head_sha=head_sha, head_ord=head_ord)
        if getattr(self, "skip_vectors", False):
            continue
        self.vector.delete_by_path(self.cfg.qdrant_code, removed_str)

    for path in reingest:
        if not path.is_file():
            # File vanished between the diff and now, or is not a
            # regular file the extractor can read.
            stats.skipped += 1
            continue

        # Dry runs extract only; real runs replace the file's entries.
        if dry_run:
            extracted = self._extract_one(path)
        else:
            extracted = self.reingest_file(
                path, head_sha=head_sha, head_ord=head_ord
            )
        if extracted is None:
            stats.skipped += 1
            continue

        symbol_count = len(extracted.symbols)
        stats.files += 1
        stats.symbols += symbol_count
        stats.imports += len(extracted.imports)
        stats.calls += len(extracted.calls)
        stats.references += len(extracted.references)
        # A file without symbols still counts as one chunk.
        stats.chunks += symbol_count or 1
        sanity.record(extracted)
        if not dry_run:
            hb.tick(stats)

    hb.done(stats)
    _attach_sanity(stats, sanity)
    # Project files are re-indexed on every delta: per the original
    # note they are tiny and the topology shifts independently of
    # source edits.
    self._ingest_dotnet_projects(
        root, stats, dry_run=dry_run, head_sha=head_sha, head_ord=head_ord
    )
    if delta.is_empty:
        stats.notes.append("no changes since last ingest")
    return stats
998
+
999
@staticmethod
def _extract_one(path: Path) -> ExtractedFile | None:
    """Return whatever ``extract_file`` produces for ``path``.

    The extractor is imported at call time rather than at module import.
    """
    from ..extractor.treesitter import extract_file as run_extractor

    return run_extractor(path)
1004
+
1005
def _record_state(
    self,
    root: Path,
    stats: IngestStats,
    *,
    head: str | None = None,
    branch: str | None = None,
) -> None:
    """Persist the ingested commit for ``root`` in the ingest-state store.

    The SHA is taken from ``head``, else from ``stats.head_sha``. Only
    when neither supplies one and ``root`` is a git repository is git
    queried for HEAD (and for the current branch, if ``branch`` was not
    given). Any ``GitError`` during that lookup discards the SHA. With
    no SHA nothing is written; otherwise ``stats.head_sha`` is updated
    and the state store receives the SHA and branch.
    """
    resolved = head or stats.head_sha
    needs_lookup = resolved is None and git_delta.is_git_repo(root)
    if needs_lookup:
        try:
            resolved = git_delta.head_sha(root)
            if branch is None:
                branch = git_delta.current_branch(root)
        except git_delta.GitError:
            resolved = None

    if resolved is not None:
        stats.head_sha = resolved
        self.state.set(root, sha=resolved, branch=branch)
1025
+
1026
def ingest_file(
    self,
    ex: ExtractedFile,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Index an already-extracted file into the graph and vector stores.

    The graph upsert always runs. The vector upsert is skipped when the
    instance has a truthy ``skip_vectors`` attribute (a missing attribute
    counts as ``False``).
    """
    self._upsert_graph(ex, head_sha=head_sha, head_ord=head_ord)
    if getattr(self, "skip_vectors", False):
        return
    self._upsert_vectors(ex)
1036
+
1037
def reingest_file(
    self,
    path: str | Path,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> ExtractedFile | None:
    """Re-extract ``path`` and replace its graph / vector entries.

    Returns ``None`` without touching any store when the extractor yields
    nothing for ``path``; otherwise returns the extracted file after its
    previous entries were deleted and the fresh ones ingested.
    """
    from ..extractor.treesitter import extract_file

    extracted = extract_file(path)
    if extracted is None:
        return None

    if head_sha is None:
        # Caller did not supply a SHA (the original note cites per-file
        # save hooks): resolve one from the file's directory so the
        # temporal stamp still lands. This also replaces any caller-
        # supplied ``head_ord``.
        head_sha, head_ord = _resolve_head(Path(extracted.path).parent)

    file_key = extracted.path
    self.graph.delete_file(file_key, head_sha=head_sha, head_ord=head_ord)
    if not getattr(self, "skip_vectors", False):
        self.vector.delete_by_path(self.cfg.qdrant_code, file_key)
    self.ingest_file(extracted, head_sha=head_sha, head_ord=head_ord)
    return extracted
1059
+
1060
def delete_paths(
    self,
    paths: Iterable[Path | str],
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> int:
    """Remove ``paths`` from the graph and the code vector collection.

    Performs the same per-path deletion as the deleted-files loop of
    ``_ingest_delta``, for callers that already know which files are
    gone. When ``head_sha`` is omitted it is resolved once, from the
    parent directory of the first path, and reused for every deletion.

    Returns the number of paths processed (``0`` for an empty input, in
    which case no store is touched).
    """
    targets = [str(entry) for entry in paths]
    if not targets:
        return 0

    if head_sha is None:
        head_sha, head_ord = _resolve_head(Path(targets[0]).parent)

    for target in targets:
        self.graph.delete_file(target, head_sha=head_sha, head_ord=head_ord)
        if getattr(self, "skip_vectors", False):
            continue
        self.vector.delete_by_path(self.cfg.qdrant_code, target)
    return len(targets)
1085
+
1086
def record_episode(self, ep: Episode) -> str:
    """Store ``ep`` in the episodic store and index its embedding.

    The id returned by the episodic store is reused as the vector record
    id in the ``self.cfg.qdrant_episodes`` collection, and is returned to
    the caller.
    """
    episode_id = self.episodic.add(ep)
    embedding = self.embedder.embed_one(episode_text(ep))
    record = VectorRecord(
        id=episode_id,
        vector=embedding,
        payload=episode_payload(ep),
    )
    self.vector.upsert(self.cfg.qdrant_episodes, [record])
    return episode_id
1094
+
1095
def dedupe_episodes(self) -> dict[str, int]:
    """Compact duplicate episodes and delete the vectors of removed ones.

    Calls ``self.episodic.dedupe()``, flattens the id lists it returns,
    and, unless ``skip_vectors`` is truthy or nothing was removed,
    deletes those ids from the ``self.cfg.qdrant_episodes`` collection.

    Returns ``{"removed": n, "groups": g}`` where ``n`` is the number of
    removed ids and ``g`` the number of entries in the dedupe result.
    """
    removed_by_group = self.episodic.dedupe()
    removed_ids: list[str] = [
        episode_id
        for group_ids in removed_by_group.values()
        for episode_id in group_ids
    ]
    vectors_enabled = not getattr(self, "skip_vectors", False)
    if removed_ids and vectors_enabled:
        self.vector.delete_by_ids(self.cfg.qdrant_episodes, removed_ids)
    return {"removed": len(removed_ids), "groups": len(removed_by_group)}
1109
+
1110
def _upsert_graph(
    self,
    ex: ExtractedFile,
    *,
    head_sha: str | None = None,
    head_ord: int | None = None,
) -> None:
    """Translate one extracted file into graph nodes / edges and upsert them.

    Emits a ``File`` node, one ``Symbol`` node plus ``DEFINES`` edge per
    extracted symbol, one ``Module`` node plus ``IMPORTS`` edge per
    distinct import, and ``CALLS`` / ``REFERENCES`` / ``INJECTS`` edges
    that point at placeholder ``name::<identifier>`` Symbol nodes flagged
    ``unresolved``. Nodes are upserted before edges; both calls receive
    ``head_sha`` / ``head_ord`` unchanged.
    """
    source_path = ex.path
    nodes: list[GraphNode] = [
        GraphNode(
            label="File",
            key=source_path,
            props={"lang": ex.lang, "generated": ex.generated},
        )
    ]
    edges: list[GraphEdge] = []

    def link_placeholder(edge_type, name, edge_props):
        # Queue a File -> placeholder Symbol edge and the placeholder node.
        placeholder_key = f"name::{name}"
        edges.append(
            GraphEdge(
                type=edge_type,
                src_label="File",
                src_key=source_path,
                dst_label="Symbol",
                dst_key=placeholder_key,
                props=edge_props,
            )
        )
        nodes.append(
            GraphNode(
                label="Symbol",
                key=placeholder_key,
                props={"name": name, "unresolved": True},
            )
        )

    for symbol in ex.symbols:
        symbol_key = _symbol_key(source_path, symbol)
        symbol_props: dict[str, object] = {
            "name": symbol.name,
            "kind": symbol.kind,
            "start": symbol.start_line,
            "end": symbol.end_line,
            "file": source_path,
        }
        if symbol.namespace:
            symbol_props["namespace"] = symbol.namespace
        if symbol.partial:
            # A partial declaration spans several files, so ``file`` /
            # ``start`` / ``end`` describe only one part; the flag tells
            # consumers to expect siblings.
            symbol_props["partial"] = True
        if symbol.param_count is not None:
            symbol_props["params"] = symbol.param_count
        nodes.append(GraphNode(label="Symbol", key=symbol_key, props=symbol_props))
        edges.append(
            GraphEdge(
                type="DEFINES",
                src_label="File",
                src_key=source_path,
                dst_label="Symbol",
                dst_key=symbol_key,
            )
        )

    # ``dict.fromkeys`` keeps the first occurrence of each import, in order.
    for module in dict.fromkeys(ex.imports):
        nodes.append(GraphNode(label="Module", key=module))
        edges.append(
            GraphEdge(
                type="IMPORTS",
                src_label="File",
                src_key=source_path,
                dst_label="Module",
                dst_key=module,
            )
        )

    # Calls are deduplicated on (name, arity, receiver_type): repeated
    # call sites of ``Run()`` collapse into one edge, while ``Run()`` and
    # ``Run(x)`` each keep their own. The arity is stored on the edge as
    # ``args``; the original note says the resolver uses it downstream
    # to disambiguate overloads.
    emitted_calls: set[tuple[str, int, str | None]] = set()
    for call in ex.calls:
        call_signature = (call.name, call.arity, call.receiver_type)
        if call_signature in emitted_calls:
            continue
        emitted_calls.add(call_signature)
        call_props: dict[str, Any] = {
            "unresolved": True,
            "args": call.arity,
        }
        if call.receiver_type:
            call_props["receiver_type"] = call.receiver_type
        link_placeholder("CALLS", call.name, call_props)

    # References get their own REFERENCES edge type, kept distinct from
    # CALLS. The original note describes them as type-position references
    # (base lists, parameter / field / property types, generics,
    # constraints, cast / is / as / typeof targets).
    for referenced in dict.fromkeys(ex.references):
        link_placeholder("REFERENCES", referenced, {"unresolved": True})

    # ``ex.injects`` entries (Razor / Blazor DI per the original note) use
    # a distinct INJECTS edge type but target the same ``name::``
    # placeholder pool as calls and references.
    for injected in dict.fromkeys(ex.injects):
        link_placeholder("INJECTS", injected, {"unresolved": True})

    self.graph.upsert_nodes(nodes, head_sha=head_sha, head_ord=head_ord)
    self.graph.upsert_edges(edges, head_sha=head_sha, head_ord=head_ord)
1262
+
1263
def _embed_and_upsert(
    self, pending: list[tuple[ExtractedFile, _Chunk]]
) -> None:
    """Embed a cross-file batch of chunks and upsert them in one call.

    All chunk texts go to the embedder in a single ``embed`` call and all
    resulting records go to the ``self.cfg.qdrant_code`` collection in a
    single ``upsert``. Vectors are paired with ``pending`` positionally;
    ``zip(..., strict=True)`` raises ``ValueError`` if the embedder
    returns a different number of vectors. An empty ``pending`` is a
    no-op.
    """
    if not pending:
        return

    vectors = self.embedder.embed([chunk.text for _, chunk in pending])
    records = []
    for (source, chunk), vector in zip(pending, vectors, strict=True):
        payload = {
            "path": source.path,
            "lang": source.lang,
            "kind": chunk.kind,
            "name": chunk.name,
            "start": chunk.start,
            "end": chunk.end,
            "generated": source.generated,
        }
        records.append(
            VectorRecord(
                id=_id(source.path, chunk.key),
                vector=vector,
                payload=payload,
            )
        )
    self.vector.upsert(self.cfg.qdrant_code, records)
1294
+
1295
def _upsert_vectors(self, ex: ExtractedFile, batch_size: int = 32) -> None:
    """Embed the chunks of a single file and upsert them batch by batch.

    Chunks come from ``_chunks_for(ex)``. Each slice of at most
    ``batch_size`` chunks is embedded and upserted into the
    ``self.cfg.qdrant_code`` collection separately. A file that yields no
    chunks is a no-op.
    """
    chunks = list(_chunks_for(ex))
    if not chunks:
        return

    for offset in range(0, len(chunks), batch_size):
        window = chunks[offset : offset + batch_size]
        vectors = self.embedder.embed([chunk.text for chunk in window])
        records = []
        # strict=True: a vector-count mismatch raises instead of silently
        # dropping chunks.
        for chunk, vector in zip(window, vectors, strict=True):
            payload = {
                "path": ex.path,
                "lang": ex.lang,
                "kind": chunk.kind,
                "name": chunk.name,
                "start": chunk.start,
                "end": chunk.end,
                "generated": ex.generated,
            }
            records.append(
                VectorRecord(
                    id=_id(ex.path, chunk.key),
                    vector=vector,
                    payload=payload,
                )
            )
        self.vector.upsert(self.cfg.qdrant_code, records)
1319
+
1320
+
1321
def _resolve_head(root: str | Path) -> tuple[str | None, int | None]:
    """Best-effort ``(head_sha, head_ord)`` lookup for ``root``.

    ``(None, None)`` is returned when ``root`` is not a git repository,
    when reading HEAD raises ``GitError``, or when the SHA comes back
    empty, so callers can fall through to unstamped behaviour. Otherwise
    the SHA is paired with ``git_delta.commit_ordinal`` for it. The
    original author describes that ordinal as the first-parent commit
    count (``git rev-list --count --first-parent``), a monotonic integer
    along the trunk that serves as a cheap before/after comparator.
    """
    repo_dir = Path(root)
    if not git_delta.is_git_repo(repo_dir):
        return None, None

    try:
        sha = git_delta.head_sha(repo_dir)
    except git_delta.GitError:
        sha = None
    if not sha:
        return None, None

    ordinal = git_delta.commit_ordinal(repo_dir, sha)
    return sha, ordinal
1341
+
1342
+
1343
def _owning_project(
    file_path: str, proj_dirs: list[tuple[str, str]]
) -> str | None:
    """Return the project key whose directory is the deepest ancestor of ``file_path``.

    ``proj_dirs`` holds ``(directory, project_key)`` pairs and must
    already be sorted by descending directory length so the first match
    wins. ``None`` means the file lives outside every listed directory.

    Matching is done on whole path components via ``Path.parents``
    rather than on a ``"/"``-terminated string prefix. That keeps the
    directory-boundary guarantee (``/A/B`` does not own files under
    ``/A/Beta/``) and also works where the resolved paths use a
    separator other than ``"/"`` (e.g. backslashes on Windows), which a
    hard-coded ``"/"`` prefix never matches.
    """
    # Resolve once per file; the set makes each directory probe O(1).
    ancestors = set(Path(file_path).resolve().parents)
    for dir_, proj_key in proj_dirs:
        if Path(dir_) in ancestors:
            return proj_key
    return None
1361
+
1362
+
1363
def _symbol_key(path: str, sym: Symbol) -> str:
    """Compute the graph key under which ``sym`` is stored.

    * Partial declaration with a namespace: ``partial::{ns}.{name}``.
      The key is independent of ``path``, so every file declaring a part
      maps to the same Symbol node.
    * Anything else, including a partial declaration without a
      namespace: file-scoped ``{path}::{name}#{line}``. The original
      note explains the fallback as avoiding collisions between
      unrelated globals.
    """
    if not (sym.partial and sym.namespace):
        return f"{path}::{sym.name}#{sym.start_line}"
    return f"partial::{sym.namespace}.{sym.name}"
1380
+
1381
+
1382
def _attach_sanity(stats: IngestStats, sanity: SanitySummary) -> None:
    """Copy sanity-check results onto ``stats`` and flag high failure rates.

    Does nothing when no symbols were checked. Otherwise ``stats.sanity``
    is set to a dict with the checked / failed counts, the failure rate
    rounded to four decimals, and the sampled violations. When the rate
    exceeds ``SUSPECT_THRESHOLD`` a note is appended to ``stats.notes``.

    Per the original note, a symbol fails the round-trip when its snippet
    does not contain its own plain-identifier name verbatim, which
    points at broken byte/char offset accounting in the extractor.
    """
    checked = sanity.symbols_checked
    if checked == 0:
        return

    rate = sanity.failure_rate
    samples = []
    for violation in sanity.sample_violations:
        samples.append(
            {
                "path": violation.path,
                "name": violation.name,
                "kind": violation.kind,
                "line": violation.start_line,
            }
        )
    stats.sanity = {
        "checked": checked,
        "failed": sanity.symbols_failed,
        "failure_rate": round(rate, 4),
        "samples": samples,
    }

    if rate > SUSPECT_THRESHOLD:
        note = (
            f"sanity: {sanity.symbols_failed}/{sanity.symbols_checked} "
            f"plain-identifier symbols ({rate * 100:.1f}%) did not round-trip; "
            f"extractor may be miscounting offsets — see stats.sanity.samples"
        )
        stats.notes.append(note)
1410
+
1411
+
1412
@dataclass
class _Chunk:
    """One unit of text to embed, plus the metadata stored with its vector."""

    # Per-file chunk identifier; combined with the file path to build the
    # vector record id.
    key: str
    # Text handed to the embedder.
    text: str
    # Stored in the payload as "kind".
    kind: str
    # Stored in the payload as "name".
    name: str
    # Stored in the payload as "start".
    start: int
    # Stored in the payload as "end".
    end: int
1420
+
1421
+
1422
def _chunks_for(ex: ExtractedFile) -> Iterable[_Chunk]:
    """Yield the embeddable chunks of ``ex``.

    A file with symbols yields one chunk per symbol, keyed
    ``{name}#{start_line}``. A file without symbols yields a single
    ``"file"`` chunk holding the first 6000 characters of its source.
    """
    if not ex.symbols:
        # Whole-file fallback; the 6000-character cap bounds the text
        # sent to the embedder.
        head = ex.source[:6000]
        line_count = len(ex.source.splitlines())
        yield _Chunk(
            key="file",
            text=f"FILE {ex.path}\n{head}",
            kind="file",
            name=Path(ex.path).name,
            start=1,
            # An empty source still reports a one-line span.
            end=line_count or 1,
        )
        return

    for symbol in ex.symbols:
        yield _Chunk(
            key=f"{symbol.name}#{symbol.start_line}",
            text=_symbol_text(symbol, ex.path),
            kind=symbol.kind,
            name=symbol.name,
            start=symbol.start_line,
            end=symbol.end_line,
        )
1444
+
1445
+
1446
# Upper bound, in characters, on the snippet body embedded per symbol.
MAX_SNIPPET_CHARS = 1500
# Number of leading non-blank snippet lines repeated as the signature.
SIGNATURE_LINES = 3


def _symbol_text(s: Symbol, path: str) -> str:
    """Assemble the text that gets embedded for one symbol.

    The result is a newline-joined sequence of:

    1. ``FILE {path}`` and ``KIND {kind} NAME {name}`` header lines.
    2. A ``SIGNATURE`` section with the first ``SIGNATURE_LINES``
       non-blank snippet lines; omitted when the snippet has none.
    3. The snippet itself, cut to its first ``MAX_SNIPPET_CHARS``
       characters.

    A missing or empty snippet still yields the two header lines
    followed by an empty body line. Per the original note, the header
    and repeated signature are front-loaded for hybrid (dense + sparse)
    embedding so short identifier queries still match.
    """
    snippet = s.snippet or ""
    meaningful = [row for row in snippet.splitlines() if row.strip()]
    signature = "\n".join(meaningful[:SIGNATURE_LINES])
    body = snippet[:MAX_SNIPPET_CHARS]

    header = f"FILE {path}\nKIND {s.kind} NAME {s.name}"
    signature_section = f"\nSIGNATURE\n{signature}" if signature else ""
    return f"{header}{signature_section}\n{body}"