flurryx-code-memory 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. code_memory/__init__.py +1 -0
  2. code_memory/claims/__init__.py +32 -0
  3. code_memory/claims/extractor.py +325 -0
  4. code_memory/claims/indexer.py +258 -0
  5. code_memory/claims/resolver.py +186 -0
  6. code_memory/claims/store.py +424 -0
  7. code_memory/cli.py +1192 -0
  8. code_memory/config.py +268 -0
  9. code_memory/embed/__init__.py +224 -0
  10. code_memory/embed/cache.py +204 -0
  11. code_memory/embed/m3.py +174 -0
  12. code_memory/embed/ollama.py +92 -0
  13. code_memory/embed/tei.py +106 -0
  14. code_memory/episodic/__init__.py +3 -0
  15. code_memory/episodic/sqlite_store.py +278 -0
  16. code_memory/extractor/__init__.py +3 -0
  17. code_memory/extractor/csproj.py +166 -0
  18. code_memory/extractor/dll.py +385 -0
  19. code_memory/extractor/gitignore.py +162 -0
  20. code_memory/extractor/nuget.py +275 -0
  21. code_memory/extractor/sanity.py +124 -0
  22. code_memory/extractor/sln.py +108 -0
  23. code_memory/extractor/treesitter.py +1172 -0
  24. code_memory/graph/__init__.py +3 -0
  25. code_memory/graph/falkor_store.py +740 -0
  26. code_memory/mcp_server.py +1816 -0
  27. code_memory/metrics.py +260 -0
  28. code_memory/orchestrator/__init__.py +13 -0
  29. code_memory/orchestrator/git_delta.py +211 -0
  30. code_memory/orchestrator/ingest_state.py +71 -0
  31. code_memory/orchestrator/pipeline.py +1478 -0
  32. code_memory/orchestrator/reset.py +130 -0
  33. code_memory/orchestrator/resolver.py +825 -0
  34. code_memory/orchestrator/retrieve.py +505 -0
  35. code_memory/resilience.py +73 -0
  36. code_memory/sync/__init__.py +20 -0
  37. code_memory/sync/autostart/__init__.py +42 -0
  38. code_memory/sync/autostart/base.py +106 -0
  39. code_memory/sync/autostart/launchd.py +115 -0
  40. code_memory/sync/autostart/schtasks.py +155 -0
  41. code_memory/sync/autostart/systemd.py +113 -0
  42. code_memory/sync/hooks.py +164 -0
  43. code_memory/sync/safety.py +65 -0
  44. code_memory/sync/snapshot.py +461 -0
  45. code_memory/sync/store.py +399 -0
  46. code_memory/sync/sync.py +405 -0
  47. code_memory/sync/watcher.py +320 -0
  48. code_memory/vector/__init__.py +3 -0
  49. code_memory/vector/qdrant_store.py +302 -0
  50. flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
  51. flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
  52. flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
  53. flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,65 @@
1
+ """Guards against pointing the watcher at filesystem roots that would
2
+ walk an unbounded number of files (HOME, /, /tmp, …).
3
+
4
+ A rogue watch on ``$HOME`` re-walks every checkout, IDE cache, browser
5
+ profile, and node_modules on the machine. It saturates CPU, contends with
6
+ Ollama, and produces useless indexes.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from pathlib import Path
12
+
13
+
14
class UnsafeWatchRootError(ValueError):
    """Signal that the watcher was pointed at a forbidden root directory.

    Derives from :class:`ValueError`, so callers that already treat a bad
    path argument as a value error keep working unchanged.
    """
16
+
17
+
18
+ def _system_unsafe_roots() -> set[Path]:
19
+ """Coarse set of filesystem roots that must never be watched.
20
+
21
+ Resolved at call time so symlinks (``/var -> /private/var`` on macOS)
22
+ line up with whatever the user passes in.
23
+ """
24
+ candidates = [
25
+ Path("/"),
26
+ Path.home(),
27
+ Path("/tmp"),
28
+ Path("/var"),
29
+ Path("/private"),
30
+ Path("/etc"),
31
+ Path("/usr"),
32
+ Path("/System"),
33
+ Path("/Library"),
34
+ Path("/opt"),
35
+ Path("/Applications"),
36
+ Path("C:/"),
37
+ Path("C:/Users"),
38
+ Path("C:/Windows"),
39
+ Path("C:/Program Files"),
40
+ ]
41
+ out: set[Path] = set()
42
+ for p in candidates:
43
+ try:
44
+ out.add(p.resolve())
45
+ except (OSError, RuntimeError):
46
+ continue
47
+ return out
48
+
49
+
50
def assert_safe_watch_root(root: Path | str) -> Path:
    """Validate that ``root`` is an acceptable directory for the watcher.

    The argument is user-expanded (``~``) and resolved, then compared for
    equality against the set returned by ``_system_unsafe_roots()``.

    Returns:
        The resolved :class:`Path` when it is not on the forbidden list.

    Raises:
        UnsafeWatchRootError: if the resolved path equals HOME, a filesystem
            root, or one of the listed system directories.
    """
    candidate = Path(root).expanduser().resolve()
    if candidate not in _system_unsafe_roots():
        return candidate

    message = (
        f"refusing to watch {candidate!s}: this is a filesystem / HOME / "
        "system root. Point the watcher at a specific project directory "
        "instead (e.g. ~/Workspace/my-repo)."
    )
    raise UnsafeWatchRootError(message)
@@ -0,0 +1,461 @@
1
+ """Snapshot blob format: build, verify, apply.
2
+
3
+ Layout of a ``<sha>.cmsnap`` tar.gz archive::
4
+
5
+ manifest.json
6
+ vectors/code.jsonl # one point per line: {id, vector, payload}
7
+ graph/nodes.jsonl # {label, key, props}
8
+ graph/edges.jsonl # {type, src_label, src_key, dst_label, dst_key, props}
9
+ state.json # {last_sha, last_ts, branch}
10
+
11
+ Snapshots are content-addressed: filename = git SHA of the commit they
12
+ represent. ``manifest.content_sha256`` is the digest of the canonical
13
+ concatenation of the four jsonl/json payloads so two builds on the same
14
+ SHA produce identical bytes when extractor + embedder are deterministic.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import gzip
20
+ import hashlib
21
+ import io
22
+ import json
23
+ import platform
24
+ import tarfile
25
+ import time
26
+ from dataclasses import asdict, dataclass, field
27
+ from pathlib import Path
28
+ from typing import Any, Iterator
29
+
30
+ from ..config import CONFIG, Config
31
+ from ..graph.falkor_store import FalkorStore, GraphEdge, GraphNode
32
+ from ..embed.m3 import HybridVec, SparseVec
33
+ from ..vector.qdrant_store import QdrantStore, VectorRecord
34
+
35
+ FORMAT_VERSION = 1
36
+ DEFAULT_BATCH = 256
37
+
38
+
39
@dataclass(frozen=True)
class SnapshotManifest:
    """Immutable metadata record serialised as ``manifest.json`` in a snapshot."""

    format_version: int
    project: str
    head_sha: str
    branch: str | None
    embed_model: str
    embed_dim: int
    created_at: float
    created_by: str
    tool_version: str
    counts: dict[str, int]
    content_sha256: str

    def to_json(self) -> str:
        """Render the manifest as compact JSON with sorted keys."""
        as_mapping = asdict(self)
        return json.dumps(as_mapping, sort_keys=True, separators=(",", ":"))

    @classmethod
    def from_json(cls, data: str) -> SnapshotManifest:
        """Build a manifest from JSON text produced by :meth:`to_json`.

        The decoded object's keys are passed straight through as constructor
        keyword arguments, so unknown or missing keys raise ``TypeError``.
        """
        decoded = json.loads(data)
        return cls(**decoded)
60
+
61
+
62
@dataclass
class Snapshot:
    """A snapshot held in memory; call ``write()`` to produce the tar.gz file."""

    manifest: SnapshotManifest
    vectors: list[dict[str, Any]] = field(default_factory=list)
    nodes: list[dict[str, Any]] = field(default_factory=list)
    edges: list[dict[str, Any]] = field(default_factory=list)
    state: dict[str, Any] = field(default_factory=dict)

    def write(self, path: Path) -> Path:
        """Write the snapshot as a gzip-compressed tar archive at ``path``.

        Parent directories are created when missing. Returns ``path``.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        with tarfile.open(path, "w:gz") as archive:
            _add(archive, "manifest.json", self.manifest.to_json().encode())
            # The three JSONL members share one encoding; keep archive order.
            for member_name, rows in (
                ("vectors/code.jsonl", self.vectors),
                ("graph/nodes.jsonl", self.nodes),
                ("graph/edges.jsonl", self.edges),
            ):
                _add(archive, member_name, _jsonl(rows))
            state_bytes = json.dumps(self.state, sort_keys=True).encode()
            _add(archive, "state.json", state_bytes)
        return path

    @classmethod
    def read(cls, path: Path) -> Snapshot:
        """Load a snapshot archive from ``path`` into memory.

        An empty ``state.json`` member is treated as ``{}``.
        """
        with tarfile.open(path, "r:gz") as archive:
            manifest_text = _extract(archive, "manifest.json").decode()
            manifest = SnapshotManifest.from_json(manifest_text)
            vectors, nodes, edges = (
                list(_read_jsonl(_extract(archive, member_name)))
                for member_name in (
                    "vectors/code.jsonl",
                    "graph/nodes.jsonl",
                    "graph/edges.jsonl",
                )
            )
            state_text = _extract(archive, "state.json").decode()
            state = json.loads(state_text or "{}")
        return cls(
            manifest=manifest, vectors=vectors, nodes=nodes, edges=edges, state=state
        )
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Build
95
+ # ---------------------------------------------------------------------------
96
+
97
+
98
def build_snapshot(
    *,
    project: str,
    head_sha: str,
    branch: str | None,
    cfg: Config | None = None,
    vector: QdrantStore | None = None,
    graph: FalkorStore | None = None,
    state: dict[str, Any] | None = None,
    tool_version: str = "0.1.0",
    created_by: str | None = None,
) -> Snapshot:
    """Capture the live stores for ``project`` as an in-memory ``Snapshot``.

    Any of ``cfg``, ``vector`` and ``graph`` that is not supplied is created
    from the project configuration. The manifest records the row counts and
    the canonical content digest of what was dumped; ``created_by`` falls back
    to ``_default_creator()``.
    """
    if not cfg:
        cfg = CONFIG.for_project(project)
    if not vector:
        vector = QdrantStore()
    if not graph:
        graph = FalkorStore(graph_name=cfg.falkor_graph)

    vector_rows = list(_dump_vectors(vector, cfg.qdrant_code))
    node_rows, edge_rows = _dump_graph(graph)
    if not state:
        state = {}

    content_digest = _canonical_digest(vector_rows, node_rows, edge_rows, state)
    manifest = SnapshotManifest(
        format_version=FORMAT_VERSION,
        project=project,
        head_sha=head_sha,
        branch=branch,
        embed_model=cfg.embed_model,
        embed_dim=cfg.embed_dim,
        created_at=time.time(),
        created_by=created_by or _default_creator(),
        tool_version=tool_version,
        counts={
            "vectors": len(vector_rows),
            "nodes": len(node_rows),
            "edges": len(edge_rows),
        },
        content_sha256=content_digest,
    )
    return Snapshot(
        manifest=manifest,
        vectors=vector_rows,
        nodes=node_rows,
        edges=edge_rows,
        state=state,
    )
141
+
142
+
143
def _dump_vectors(store: QdrantStore, collection: str) -> Iterator[dict[str, Any]]:
    """Page through every point in the collection via Qdrant scroll API.

    The hybrid Qdrant layout returns ``p.vector`` as a dict keyed by
    named vector slot (``dense`` and optionally ``sparse``). The legacy
    layout returns a bare list of floats. We serialise both forms into
    a single normalised JSON shape ``{"dense": [...], "sparse":
    {"indices": [...], "values": [...]}}`` so the apply path doesn't
    have to branch on layout era. Previously this used ``list(p.vector)``,
    which silently turned the dict into a list of slot names —
    discarding every actual embedding and producing snapshots that
    couldn't round-trip.

    The dump is best-effort: a store failure ends the iteration instead of
    raising. Each failure is logged as a warning, because a scroll error in
    the middle of pagination otherwise yields a truncated dump that is
    indistinguishable from a complete one.

    Yields:
        One ``{"id", "vector", "payload"}`` dict per stored point.
    """
    import logging  # function-scope so the module import block stays untouched

    log = logging.getLogger(__name__)

    try:
        store.ensure_collection(collection)
    except Exception:
        log.warning(
            "snapshot: cannot open vector collection %r; dumping no vectors",
            collection,
            exc_info=True,
        )
        return

    offset: Any = None
    while True:
        try:
            points, next_offset = store.client.scroll(
                collection_name=collection,
                limit=DEFAULT_BATCH,
                offset=offset,
                with_vectors=True,
                with_payload=True,
            )
        except Exception:
            log.warning(
                "snapshot: scroll of %r failed at offset %r; "
                "vector dump is incomplete",
                collection,
                offset,
                exc_info=True,
            )
            return
        for p in points:
            yield {
                "id": str(p.id),
                "vector": _normalize_vector_for_dump(p.vector),
                "payload": dict(p.payload or {}),
            }
        # The scroll API signals the last page with a ``None`` offset.
        if next_offset is None:
            return
        offset = next_offset
181
+
182
+
183
def _hybridvec_from_dump(payload: Any) -> HybridVec:
    """Rebuild a :class:`HybridVec` from a dumped vector entry.

    Inverse of :func:`_normalize_vector_for_dump`. Besides the normalised
    ``{"dense": [...], "sparse": {...}}`` dict it also accepts the shapes
    found in older snapshots: a bare list of floats, a dict carrying only
    ``dense``, or an empty dict. Anything that is neither a list nor a dict
    produces an empty vector. The sparse part is empty whenever the snapshot
    did not carry one (matches the Ollama / TEI dense-only invariant).
    """

    def _empty_sparse() -> SparseVec:
        return SparseVec(indices=[], values=[])

    if isinstance(payload, list):
        return HybridVec(dense=list(map(float, payload)), sparse=_empty_sparse())
    if not isinstance(payload, dict):
        return HybridVec(dense=[], sparse=_empty_sparse())

    dense_values = list(map(float, payload.get("dense") or []))
    sparse_raw = payload.get("sparse") or {}
    sparse_indices = list(map(int, sparse_raw.get("indices") or []))
    sparse_values = list(map(float, sparse_raw.get("values") or []))
    return HybridVec(
        dense=dense_values,
        sparse=SparseVec(indices=sparse_indices, values=sparse_values),
    )
207
+
208
+
209
+ def _normalize_vector_for_dump(vec: Any) -> dict[str, Any]:
210
+ """Coerce any Qdrant vector return shape into the dump JSON shape.
211
+
212
+ * Hybrid layout: ``{"dense": [...], "sparse": SparseVector(...)}``
213
+ → ``{"dense": [...], "sparse": {"indices": [...], "values": [...]}}``.
214
+ * Legacy single-vector layout: ``[float, ...]`` → ``{"dense": [...]}``.
215
+ * Missing / None: ``{}`` (downstream filters empties).
216
+ """
217
+ if vec is None:
218
+ return {}
219
+ if isinstance(vec, dict):
220
+ out: dict[str, Any] = {}
221
+ dense = vec.get("dense")
222
+ if dense is not None:
223
+ out["dense"] = [float(x) for x in dense]
224
+ sparse = vec.get("sparse")
225
+ if sparse is not None:
226
+ # Qdrant's SparseVector exposes ``indices`` and ``values``;
227
+ # some client versions return a plain dict. Handle both.
228
+ indices = getattr(sparse, "indices", None)
229
+ values = getattr(sparse, "values", None)
230
+ if indices is None and isinstance(sparse, dict):
231
+ indices = sparse.get("indices", [])
232
+ values = sparse.get("values", [])
233
+ if indices:
234
+ out["sparse"] = {
235
+ "indices": [int(i) for i in indices],
236
+ "values": [float(v) for v in (values or [])],
237
+ }
238
+ return out
239
+ # Legacy: bare list of floats.
240
+ return {"dense": [float(x) for x in vec]}
241
+
242
+
243
+ def _dump_graph(store: FalkorStore) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
244
+ """Read every node + edge from the project graph."""
245
+ nodes: list[dict[str, Any]] = []
246
+ edges: list[dict[str, Any]] = []
247
+ try:
248
+ node_rows = store.graph.query(
249
+ "MATCH (n) RETURN labels(n) AS labels, n.key AS key, n AS node"
250
+ ).result_set
251
+ except Exception:
252
+ node_rows = []
253
+ for labels, key, node in node_rows:
254
+ label = labels[0] if labels else "Node"
255
+ props = dict(node.properties) if hasattr(node, "properties") else {}
256
+ nodes.append({"label": label, "key": key, "props": props})
257
+
258
+ try:
259
+ edge_rows = store.graph.query(
260
+ "MATCH (a)-[r]->(b) "
261
+ "RETURN type(r) AS t, labels(a) AS sl, a.key AS sk, "
262
+ "labels(b) AS dl, b.key AS dk, r AS edge"
263
+ ).result_set
264
+ except Exception:
265
+ edge_rows = []
266
+ for t, sl, sk, dl, dk, edge in edge_rows:
267
+ props = dict(edge.properties) if hasattr(edge, "properties") else {}
268
+ edges.append(
269
+ {
270
+ "type": t,
271
+ "src_label": sl[0] if sl else "Node",
272
+ "src_key": sk,
273
+ "dst_label": dl[0] if dl else "Node",
274
+ "dst_key": dk,
275
+ "props": props,
276
+ }
277
+ )
278
+ return nodes, edges
279
+
280
+
281
+ # ---------------------------------------------------------------------------
282
+ # Apply (restore into live stores)
283
+ # ---------------------------------------------------------------------------
284
+
285
+
286
def apply_snapshot(
    snap: Snapshot,
    *,
    cfg: Config | None = None,
    vector: QdrantStore | None = None,
    graph: FalkorStore | None = None,
) -> dict[str, int]:
    """Wipe and restore vectors + graph for the snapshot's project.

    Caller is responsible for verifying embedding compatibility
    (``manifest.embed_model`` / ``manifest.embed_dim``, e.g. with
    ``verify_snapshot``) *before* invoking this — embeddings from one model
    cannot be reused with another and a mismatched apply will corrupt
    retrieval results.

    Every record, node and edge is constructed before anything destructive
    happens, so a malformed snapshot row (for example a missing ``"id"`` or
    ``"label"``) raises while the live stores are still intact.

    Returns:
        The number of vector, node and edge rows contained in the snapshot.
        Vector rows with an empty ``"vector"`` are skipped during restore
        but are still included in the reported count.
    """
    cfg = cfg or CONFIG.for_project(snap.manifest.project)
    vector = vector or QdrantStore()
    graph = graph or FalkorStore(graph_name=cfg.falkor_graph)

    # Validate and materialise everything up front (see docstring).
    records = [
        VectorRecord(
            id=v["id"],
            vector=_hybridvec_from_dump(v.get("vector")),
            payload=v.get("payload") or {},
        )
        for v in snap.vectors
        if v.get("vector")
    ]
    nodes = [
        GraphNode(label=n["label"], key=n["key"], props=n.get("props") or {})
        for n in snap.nodes
    ]
    edges = [
        GraphEdge(
            type=e["type"],
            src_label=e["src_label"],
            src_key=e["src_key"],
            dst_label=e["dst_label"],
            dst_key=e["dst_key"],
            props=e.get("props") or {},
        )
        for e in snap.edges
    ]

    # vectors
    vector.recreate_collection(cfg.qdrant_code)
    # batched upsert to avoid huge single requests
    for start in range(0, len(records), DEFAULT_BATCH):
        vector.upsert(cfg.qdrant_code, records[start : start + DEFAULT_BATCH])

    # graph
    graph.clear_graph()
    graph.ensure_indexes()
    if nodes:
        graph.upsert_nodes(nodes)
    if edges:
        graph.upsert_edges(edges)

    return {
        "vectors": len(snap.vectors),
        "nodes": len(snap.nodes),
        "edges": len(snap.edges),
    }
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # Verify
349
+ # ---------------------------------------------------------------------------
350
+
351
+
352
@dataclass(frozen=True)
class VerifyResult:
    """Immutable outcome of ``verify_snapshot``."""

    # True when every check passed.
    ok: bool
    # Human-readable failure description; ``None`` on success.
    reason: str | None
    # Manifest of the snapshot that was checked.
    manifest: SnapshotManifest
357
+
358
+
359
def verify_snapshot(
    path: Path | None = None,
    snap: Snapshot | None = None,
    *,
    expected_model: str | None = None,
    expected_dim: int | None = None,
) -> VerifyResult:
    """Check a snapshot's format, embedding compatibility and content digest.

    Either an in-memory ``snap`` or a ``path`` to read one from is required.
    Checks run in order — format version, embedding model, embedding
    dimension, content digest — and the first failure is reported. The model
    and dimension checks are skipped when the expected value is falsy.

    Raises:
        ValueError: if neither ``path`` nor ``snap`` is given.
    """
    if snap is None:
        if path is None:
            raise ValueError("verify_snapshot requires path or snap")
        snap = Snapshot.read(path)

    manifest = snap.manifest

    def _rejected(reason: str) -> VerifyResult:
        return VerifyResult(False, reason, manifest)

    if manifest.format_version != FORMAT_VERSION:
        return _rejected(f"format_version mismatch (got {manifest.format_version})")

    if expected_model and manifest.embed_model != expected_model:
        return _rejected(
            f"embed_model mismatch (snapshot={manifest.embed_model} local={expected_model})"
        )

    if expected_dim and manifest.embed_dim != expected_dim:
        return _rejected(
            f"embed_dim mismatch (snapshot={manifest.embed_dim} local={expected_dim})"
        )

    recomputed = _canonical_digest(snap.vectors, snap.nodes, snap.edges, snap.state)
    if recomputed != manifest.content_sha256:
        return _rejected("content digest mismatch (corruption?)")

    return VerifyResult(True, None, manifest)
392
+
393
+
394
+ # ---------------------------------------------------------------------------
395
+ # Helpers
396
+ # ---------------------------------------------------------------------------
397
+
398
+
399
def _canonical_digest(
    vectors: list[dict[str, Any]],
    nodes: list[dict[str, Any]],
    edges: list[dict[str, Any]],
    state: dict[str, Any],
) -> str:
    """Return the SHA-256 hex digest of the snapshot content.

    Vectors (ordered by ``id``), nodes (ordered by ``label``, ``key``) and
    edges (ordered by ``type``, ``src_key``, ``dst_key``) are fed into the
    hash in canonical JSON form, followed by ``state``. Rows that compare
    equal on their sort key keep their input order.

    Sort keys are None-safe: a field that is present but ``None`` (for
    example the ``key`` of a graph node that has no ``key`` property) sorts
    before every real value instead of raising ``TypeError``. The relative
    order of rows without ``None`` fields is unchanged, so digests of
    snapshots that could be hashed before stay identical.
    """

    def _rank(value: Any) -> tuple[bool, Any]:
        # ``(False, None)`` < ``(True, <value>)``; two ``(True, x)`` entries
        # compare exactly like the bare values did.
        return (value is not None, value)

    h = hashlib.sha256()
    for v in sorted(vectors, key=lambda x: _rank(x.get("id", ""))):
        h.update(_canon(v).encode())
    for n in sorted(
        nodes,
        key=lambda x: (_rank(x.get("label", "")), _rank(x.get("key", ""))),
    ):
        h.update(_canon(n).encode())
    for e in sorted(
        edges,
        key=lambda x: (
            _rank(x.get("type", "")),
            _rank(x.get("src_key", "")),
            _rank(x.get("dst_key", "")),
        ),
    ):
        h.update(_canon(e).encode())
    h.update(_canon(state).encode())
    return h.hexdigest()
417
+
418
+
419
+ def _canon(obj: Any) -> str:
420
+ return json.dumps(obj, sort_keys=True, separators=(",", ":"), default=str)
421
+
422
+
423
+ def _jsonl(rows: list[dict[str, Any]]) -> bytes:
424
+ buf = io.BytesIO()
425
+ for r in rows:
426
+ buf.write(json.dumps(r, sort_keys=True, separators=(",", ":")).encode())
427
+ buf.write(b"\n")
428
+ return buf.getvalue()
429
+
430
+
431
+ def _read_jsonl(blob: bytes) -> Iterator[dict[str, Any]]:
432
+ for line in blob.splitlines():
433
+ if not line.strip():
434
+ continue
435
+ yield json.loads(line)
436
+
437
+
438
+ def _add(tar: tarfile.TarFile, name: str, data: bytes) -> None:
439
+ info = tarfile.TarInfo(name)
440
+ info.size = len(data)
441
+ info.mtime = 0 # deterministic
442
+ tar.addfile(info, io.BytesIO(data))
443
+
444
+
445
+ def _extract(tar: tarfile.TarFile, name: str) -> bytes:
446
+ member = tar.getmember(name)
447
+ f = tar.extractfile(member)
448
+ if f is None:
449
+ return b""
450
+ return f.read()
451
+
452
+
453
+ def _default_creator() -> str:
454
+ import os
455
+
456
+ user = os.environ.get("USER") or os.environ.get("USERNAME") or "unknown"
457
+ return f"{user}@{platform.node()}"
458
+
459
+
460
+ # silence unused import warning in some linters
461
+ _ = gzip