flurryx-code-memory 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_memory/__init__.py +1 -0
- code_memory/claims/__init__.py +32 -0
- code_memory/claims/extractor.py +325 -0
- code_memory/claims/indexer.py +258 -0
- code_memory/claims/resolver.py +186 -0
- code_memory/claims/store.py +424 -0
- code_memory/cli.py +1192 -0
- code_memory/config.py +268 -0
- code_memory/embed/__init__.py +224 -0
- code_memory/embed/cache.py +204 -0
- code_memory/embed/m3.py +174 -0
- code_memory/embed/ollama.py +92 -0
- code_memory/embed/tei.py +106 -0
- code_memory/episodic/__init__.py +3 -0
- code_memory/episodic/sqlite_store.py +278 -0
- code_memory/extractor/__init__.py +3 -0
- code_memory/extractor/csproj.py +166 -0
- code_memory/extractor/dll.py +385 -0
- code_memory/extractor/gitignore.py +162 -0
- code_memory/extractor/nuget.py +275 -0
- code_memory/extractor/sanity.py +124 -0
- code_memory/extractor/sln.py +108 -0
- code_memory/extractor/treesitter.py +1172 -0
- code_memory/graph/__init__.py +3 -0
- code_memory/graph/falkor_store.py +740 -0
- code_memory/mcp_server.py +1816 -0
- code_memory/metrics.py +260 -0
- code_memory/orchestrator/__init__.py +13 -0
- code_memory/orchestrator/git_delta.py +211 -0
- code_memory/orchestrator/ingest_state.py +71 -0
- code_memory/orchestrator/pipeline.py +1478 -0
- code_memory/orchestrator/reset.py +130 -0
- code_memory/orchestrator/resolver.py +825 -0
- code_memory/orchestrator/retrieve.py +505 -0
- code_memory/resilience.py +73 -0
- code_memory/sync/__init__.py +20 -0
- code_memory/sync/autostart/__init__.py +42 -0
- code_memory/sync/autostart/base.py +106 -0
- code_memory/sync/autostart/launchd.py +115 -0
- code_memory/sync/autostart/schtasks.py +155 -0
- code_memory/sync/autostart/systemd.py +113 -0
- code_memory/sync/hooks.py +164 -0
- code_memory/sync/safety.py +65 -0
- code_memory/sync/snapshot.py +461 -0
- code_memory/sync/store.py +399 -0
- code_memory/sync/sync.py +405 -0
- code_memory/sync/watcher.py +320 -0
- code_memory/vector/__init__.py +3 -0
- code_memory/vector/qdrant_store.py +302 -0
- flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
- flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
- flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
- flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Guards against pointing the watcher at filesystem roots that would
|
|
2
|
+
walk an unbounded number of files (HOME, /, /tmp, …).
|
|
3
|
+
|
|
4
|
+
A rogue watch on ``$HOME`` re-walks every checkout, IDE cache, browser
|
|
5
|
+
profile, and node_modules on the machine. It saturates CPU, contends with
|
|
6
|
+
Ollama, and produces useless indexes.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class UnsafeWatchRootError(ValueError):
    """Signal that the requested watch root is on the forbidden list.

    Derives from :class:`ValueError`, so callers that already handle bad
    path arguments also catch this.
    """
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _system_unsafe_roots() -> set[Path]:
|
|
19
|
+
"""Coarse set of filesystem roots that must never be watched.
|
|
20
|
+
|
|
21
|
+
Resolved at call time so symlinks (``/var -> /private/var`` on macOS)
|
|
22
|
+
line up with whatever the user passes in.
|
|
23
|
+
"""
|
|
24
|
+
candidates = [
|
|
25
|
+
Path("/"),
|
|
26
|
+
Path.home(),
|
|
27
|
+
Path("/tmp"),
|
|
28
|
+
Path("/var"),
|
|
29
|
+
Path("/private"),
|
|
30
|
+
Path("/etc"),
|
|
31
|
+
Path("/usr"),
|
|
32
|
+
Path("/System"),
|
|
33
|
+
Path("/Library"),
|
|
34
|
+
Path("/opt"),
|
|
35
|
+
Path("/Applications"),
|
|
36
|
+
Path("C:/"),
|
|
37
|
+
Path("C:/Users"),
|
|
38
|
+
Path("C:/Windows"),
|
|
39
|
+
Path("C:/Program Files"),
|
|
40
|
+
]
|
|
41
|
+
out: set[Path] = set()
|
|
42
|
+
for p in candidates:
|
|
43
|
+
try:
|
|
44
|
+
out.add(p.resolve())
|
|
45
|
+
except (OSError, RuntimeError):
|
|
46
|
+
continue
|
|
47
|
+
return out
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def assert_safe_watch_root(root: Path | str) -> Path:
    """Resolve ``root`` and refuse it when it is a forbidden watch root.

    ``~`` is expanded and symlinks are resolved before the comparison.
    Only an exact match with one of the roots returned by
    :func:`_system_unsafe_roots` (HOME, filesystem roots, system
    directories) is rejected.

    Returns:
        The resolved :class:`Path` when it is allowed.

    Raises:
        UnsafeWatchRootError: the resolved path is on the forbidden list.
    """
    target = Path(root).expanduser().resolve()
    if target not in _system_unsafe_roots():
        return target
    raise UnsafeWatchRootError(
        f"refusing to watch {target!s}: this is a filesystem / HOME / "
        "system root. Point the watcher at a specific project directory "
        "instead (e.g. ~/Workspace/my-repo)."
    )
|
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
"""Snapshot blob format: build, verify, apply.
|
|
2
|
+
|
|
3
|
+
Layout of a ``<sha>.cmsnap`` tar.gz archive::
|
|
4
|
+
|
|
5
|
+
manifest.json
|
|
6
|
+
vectors/code.jsonl # one point per line: {id, vector, payload}
|
|
7
|
+
graph/nodes.jsonl # {label, key, props}
|
|
8
|
+
graph/edges.jsonl # {type, src_label, src_key, dst_label, dst_key, props}
|
|
9
|
+
state.json # {last_sha, last_ts, branch}
|
|
10
|
+
|
|
11
|
+
Snapshots are content-addressed: filename = git SHA of the commit they
|
|
12
|
+
represent. ``manifest.content_sha256`` is the digest of the canonical
|
|
13
|
+
concatenation of the four jsonl/json payloads so two builds on the same
|
|
14
|
+
SHA produce an identical digest when extractor + embedder are deterministic.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import gzip
|
|
20
|
+
import hashlib
|
|
21
|
+
import io
|
|
22
|
+
import json
|
|
23
|
+
import platform
|
|
24
|
+
import tarfile
|
|
25
|
+
import time
|
|
26
|
+
from dataclasses import asdict, dataclass, field
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any, Iterator
|
|
29
|
+
|
|
30
|
+
from ..config import CONFIG, Config
|
|
31
|
+
from ..graph.falkor_store import FalkorStore, GraphEdge, GraphNode
|
|
32
|
+
from ..embed.m3 import HybridVec, SparseVec
|
|
33
|
+
from ..vector.qdrant_store import QdrantStore, VectorRecord
|
|
34
|
+
|
|
35
|
+
# Version of the snapshot layout; recorded in every manifest and compared
# against the manifest's ``format_version`` during verification.
FORMAT_VERSION: int = 1
# Page size for Qdrant scrolls and chunk size for batched upserts.
DEFAULT_BATCH: int = 256
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
class SnapshotManifest:
    """Immutable metadata stored as ``manifest.json`` inside a snapshot."""

    # Snapshot layout version (``FORMAT_VERSION`` at build time).
    format_version: int
    project: str
    # Commit the snapshot represents, and the branch if known.
    head_sha: str
    branch: str | None
    # Embedding model name and dimension the vectors were produced with.
    embed_model: str
    embed_dim: int
    # ``time.time()`` at build time and a ``user@host`` style creator tag.
    created_at: float
    created_by: str
    tool_version: str
    # Record counts keyed by ``vectors`` / ``nodes`` / ``edges``.
    counts: dict[str, int]
    # Hex SHA-256 over the canonicalised vectors, nodes, edges and state.
    content_sha256: str

    def to_json(self) -> str:
        """Serialise the manifest as compact JSON with sorted keys."""
        return json.dumps(asdict(self), sort_keys=True, separators=(",", ":"))

    @classmethod
    def from_json(cls, data: str) -> SnapshotManifest:
        """Parse a manifest written by :meth:`to_json`.

        Keys that are not manifest fields are ignored. A manifest written
        by a tool that added fields therefore still loads, and can be
        rejected through its ``format_version`` instead of failing here
        with an unrelated ``TypeError``.

        Raises:
            json.JSONDecodeError: ``data`` is not valid JSON.
            TypeError: the document is not a JSON object, or a required
                field is missing.
        """
        obj = json.loads(data)
        if not isinstance(obj, dict):
            raise TypeError("snapshot manifest must be a JSON object")
        known = cls.__dataclass_fields__
        return cls(**{k: v for k, v in obj.items() if k in known})
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class Snapshot:
    """In-memory representation. Use ``write()`` to materialise a tar.gz."""

    manifest: SnapshotManifest
    # One dict per vector point: {id, vector, payload}.
    vectors: list[dict[str, Any]] = field(default_factory=list)
    # Graph nodes: {label, key, props}.
    nodes: list[dict[str, Any]] = field(default_factory=list)
    # Graph edges: {type, src_label, src_key, dst_label, dst_key, props}.
    edges: list[dict[str, Any]] = field(default_factory=list)
    state: dict[str, Any] = field(default_factory=dict)

    def write(self, path: Path) -> Path:
        """Write the snapshot to ``path`` as a gzip-compressed tar archive.

        Parent directories are created as needed. Returns ``path``.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        # ``tarfile.open(path, "w:gz")`` stamps the gzip header with the
        # current time and the file name, so writing the same snapshot twice
        # gives different bytes even though every tar member has mtime 0.
        # Building the gzip layer by hand with ``mtime=0`` and no embedded
        # name keeps the archive reproducible.
        with open(path, "wb") as raw, gzip.GzipFile(
            filename="", mode="wb", fileobj=raw, mtime=0
        ) as gz, tarfile.open(fileobj=gz, mode="w") as tar:
            _add(tar, "manifest.json", self.manifest.to_json().encode())
            _add(tar, "vectors/code.jsonl", _jsonl(self.vectors))
            _add(tar, "graph/nodes.jsonl", _jsonl(self.nodes))
            _add(tar, "graph/edges.jsonl", _jsonl(self.edges))
            _add(tar, "state.json", json.dumps(self.state, sort_keys=True).encode())
        return path

    @classmethod
    def read(cls, path: Path) -> Snapshot:
        """Load a snapshot archive from ``path`` fully into memory.

        Members are read by name; nothing is extracted to disk. A missing
        member raises ``KeyError``. An empty ``state.json`` is treated as
        ``{}``.
        """
        with tarfile.open(path, "r:gz") as tar:
            manifest = SnapshotManifest.from_json(_extract(tar, "manifest.json").decode())
            vectors = list(_read_jsonl(_extract(tar, "vectors/code.jsonl")))
            nodes = list(_read_jsonl(_extract(tar, "graph/nodes.jsonl")))
            edges = list(_read_jsonl(_extract(tar, "graph/edges.jsonl")))
            state = json.loads(_extract(tar, "state.json").decode() or "{}")
        return cls(manifest=manifest, vectors=vectors, nodes=nodes, edges=edges, state=state)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
# Build
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def build_snapshot(
    *,
    project: str,
    head_sha: str,
    branch: str | None,
    cfg: Config | None = None,
    vector: QdrantStore | None = None,
    graph: FalkorStore | None = None,
    state: dict[str, Any] | None = None,
    tool_version: str = "0.1.0",
    created_by: str | None = None,
) -> Snapshot:
    """Capture the live stores of ``project`` as an in-memory ``Snapshot``.

    Any of ``cfg``, ``vector`` and ``graph`` that is not supplied is
    created from the project configuration. The manifest records the
    record counts and the canonical content digest of what was dumped.
    """
    if not cfg:
        cfg = CONFIG.for_project(project)
    if not vector:
        vector = QdrantStore()
    if not graph:
        graph = FalkorStore(graph_name=cfg.falkor_graph)

    vector_rows = list(_dump_vectors(vector, cfg.qdrant_code))
    node_rows, edge_rows = _dump_graph(graph)
    snapshot_state = state if state else {}

    record_counts = {
        "vectors": len(vector_rows),
        "nodes": len(node_rows),
        "edges": len(edge_rows),
    }
    content_digest = _canonical_digest(vector_rows, node_rows, edge_rows, snapshot_state)
    creator = created_by if created_by else _default_creator()

    return Snapshot(
        manifest=SnapshotManifest(
            format_version=FORMAT_VERSION,
            project=project,
            head_sha=head_sha,
            branch=branch,
            embed_model=cfg.embed_model,
            embed_dim=cfg.embed_dim,
            created_at=time.time(),
            created_by=creator,
            tool_version=tool_version,
            counts=record_counts,
            content_sha256=content_digest,
        ),
        vectors=vector_rows,
        nodes=node_rows,
        edges=edge_rows,
        state=snapshot_state,
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _dump_vectors(store: QdrantStore, collection: str) -> Iterator[dict[str, Any]]:
    """Page through every point in the collection via Qdrant scroll API.

    The hybrid Qdrant layout returns ``p.vector`` as a dict keyed by
    named vector slot (``dense`` and optionally ``sparse``). The legacy
    layout returns a bare list of floats. Both forms are serialised into
    a single normalised JSON shape ``{"dense": [...], "sparse":
    {"indices": [...], "values": [...]}}`` so the apply path doesn't
    have to branch on layout era. Previously this used ``list(p.vector)``,
    which silently turned the dict into a list of slot names —
    discarding every actual embedding and producing snapshots that
    couldn't round-trip.

    The dump is best-effort: if the collection cannot be prepared or a
    scroll page fails, iteration stops without raising. Such failures are
    logged as warnings, because a failure after the first page leaves the
    caller with a truncated set of vectors.

    Yields:
        ``{"id": str, "vector": dict, "payload": dict}`` per point.
    """
    import logging

    log = logging.getLogger(__name__)

    try:
        store.ensure_collection(collection)
    except Exception:
        log.warning(
            "snapshot: collection %r unavailable; dumping no vectors",
            collection,
            exc_info=True,
        )
        return

    offset: Any = None
    while True:
        try:
            points, next_offset = store.client.scroll(
                collection_name=collection,
                limit=DEFAULT_BATCH,
                offset=offset,
                with_vectors=True,
                with_payload=True,
            )
        except Exception:
            log.warning(
                "snapshot: scroll of %r failed at offset %r; vector dump is incomplete",
                collection,
                offset,
                exc_info=True,
            )
            return
        for p in points:
            yield {
                "id": str(p.id),
                "vector": _normalize_vector_for_dump(p.vector),
                "payload": dict(p.payload or {}),
            }
        # The scroll API signals the last page with a ``None`` offset.
        if next_offset is None:
            return
        offset = next_offset
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _hybridvec_from_dump(payload: Any) -> HybridVec:
    """Rebuild a :class:`HybridVec` from its dumped JSON form.

    This is the inverse of :func:`_normalize_vector_for_dump`. Besides the
    normalised ``{"dense": [...], "sparse": {...}}`` dict it accepts the
    older shapes: a bare list of floats, a dict carrying only ``dense``,
    and an empty dict. Anything else produces an empty vector. The sparse
    part is empty whenever the dump did not carry one (matches the
    Ollama / TEI dense-only invariant).
    """
    if not isinstance(payload, (list, dict)):
        return HybridVec(dense=[], sparse=SparseVec(indices=[], values=[]))

    if isinstance(payload, list):
        # Legacy layout: the dump is the dense vector itself.
        legacy_dense = [float(x) for x in payload]
        return HybridVec(dense=legacy_dense, sparse=SparseVec(indices=[], values=[]))

    dense_part = [float(x) for x in payload.get("dense") or []]
    sparse_raw = payload.get("sparse") or {}
    sparse_part = SparseVec(
        indices=[int(i) for i in sparse_raw.get("indices") or []],
        values=[float(v) for v in sparse_raw.get("values") or []],
    )
    return HybridVec(dense=dense_part, sparse=sparse_part)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _normalize_vector_for_dump(vec: Any) -> dict[str, Any]:
|
|
210
|
+
"""Coerce any Qdrant vector return shape into the dump JSON shape.
|
|
211
|
+
|
|
212
|
+
* Hybrid layout: ``{"dense": [...], "sparse": SparseVector(...)}``
|
|
213
|
+
→ ``{"dense": [...], "sparse": {"indices": [...], "values": [...]}}``.
|
|
214
|
+
* Legacy single-vector layout: ``[float, ...]`` → ``{"dense": [...]}``.
|
|
215
|
+
* Missing / None: ``{}`` (downstream filters empties).
|
|
216
|
+
"""
|
|
217
|
+
if vec is None:
|
|
218
|
+
return {}
|
|
219
|
+
if isinstance(vec, dict):
|
|
220
|
+
out: dict[str, Any] = {}
|
|
221
|
+
dense = vec.get("dense")
|
|
222
|
+
if dense is not None:
|
|
223
|
+
out["dense"] = [float(x) for x in dense]
|
|
224
|
+
sparse = vec.get("sparse")
|
|
225
|
+
if sparse is not None:
|
|
226
|
+
# Qdrant's SparseVector exposes ``indices`` and ``values``;
|
|
227
|
+
# some client versions return a plain dict. Handle both.
|
|
228
|
+
indices = getattr(sparse, "indices", None)
|
|
229
|
+
values = getattr(sparse, "values", None)
|
|
230
|
+
if indices is None and isinstance(sparse, dict):
|
|
231
|
+
indices = sparse.get("indices", [])
|
|
232
|
+
values = sparse.get("values", [])
|
|
233
|
+
if indices:
|
|
234
|
+
out["sparse"] = {
|
|
235
|
+
"indices": [int(i) for i in indices],
|
|
236
|
+
"values": [float(v) for v in (values or [])],
|
|
237
|
+
}
|
|
238
|
+
return out
|
|
239
|
+
# Legacy: bare list of floats.
|
|
240
|
+
return {"dense": [float(x) for x in vec]}
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _dump_graph(store: FalkorStore) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
|
|
244
|
+
"""Read every node + edge from the project graph."""
|
|
245
|
+
nodes: list[dict[str, Any]] = []
|
|
246
|
+
edges: list[dict[str, Any]] = []
|
|
247
|
+
try:
|
|
248
|
+
node_rows = store.graph.query(
|
|
249
|
+
"MATCH (n) RETURN labels(n) AS labels, n.key AS key, n AS node"
|
|
250
|
+
).result_set
|
|
251
|
+
except Exception:
|
|
252
|
+
node_rows = []
|
|
253
|
+
for labels, key, node in node_rows:
|
|
254
|
+
label = labels[0] if labels else "Node"
|
|
255
|
+
props = dict(node.properties) if hasattr(node, "properties") else {}
|
|
256
|
+
nodes.append({"label": label, "key": key, "props": props})
|
|
257
|
+
|
|
258
|
+
try:
|
|
259
|
+
edge_rows = store.graph.query(
|
|
260
|
+
"MATCH (a)-[r]->(b) "
|
|
261
|
+
"RETURN type(r) AS t, labels(a) AS sl, a.key AS sk, "
|
|
262
|
+
"labels(b) AS dl, b.key AS dk, r AS edge"
|
|
263
|
+
).result_set
|
|
264
|
+
except Exception:
|
|
265
|
+
edge_rows = []
|
|
266
|
+
for t, sl, sk, dl, dk, edge in edge_rows:
|
|
267
|
+
props = dict(edge.properties) if hasattr(edge, "properties") else {}
|
|
268
|
+
edges.append(
|
|
269
|
+
{
|
|
270
|
+
"type": t,
|
|
271
|
+
"src_label": sl[0] if sl else "Node",
|
|
272
|
+
"src_key": sk,
|
|
273
|
+
"dst_label": dl[0] if dl else "Node",
|
|
274
|
+
"dst_key": dk,
|
|
275
|
+
"props": props,
|
|
276
|
+
}
|
|
277
|
+
)
|
|
278
|
+
return nodes, edges
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# ---------------------------------------------------------------------------
|
|
282
|
+
# Apply (restore into live stores)
|
|
283
|
+
# ---------------------------------------------------------------------------
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def apply_snapshot(
    snap: Snapshot,
    *,
    cfg: Config | None = None,
    vector: QdrantStore | None = None,
    graph: FalkorStore | None = None,
) -> dict[str, int]:
    """Replace the project's live vectors and graph with the snapshot.

    The vector collection is recreated and the graph cleared before
    anything is restored, so existing data is lost even if the restore
    fails part-way. The caller must confirm beforehand that the
    snapshot's ``embed_model`` / ``embed_dim`` match the local embedder:
    embeddings from one model cannot be reused with another and a
    mismatched apply will corrupt retrieval results.

    Returns:
        Counts of the snapshot's ``vectors``, ``nodes`` and ``edges``.
        Vector entries with an empty ``vector`` are skipped on restore but
        still counted.
    """
    if not cfg:
        cfg = CONFIG.for_project(snap.manifest.project)
    if not vector:
        vector = QdrantStore()
    if not graph:
        graph = FalkorStore(graph_name=cfg.falkor_graph)

    # --- vectors -----------------------------------------------------------
    vector.recreate_collection(cfg.qdrant_code)
    if snap.vectors:
        records = []
        for row in snap.vectors:
            if not row.get("vector"):
                continue
            records.append(
                VectorRecord(
                    id=row["id"],
                    vector=_hybridvec_from_dump(row.get("vector")),
                    payload=row.get("payload") or {},
                )
            )
        # Upsert in slices so no single request carries the whole collection.
        for start in range(0, len(records), DEFAULT_BATCH):
            vector.upsert(cfg.qdrant_code, records[start : start + DEFAULT_BATCH])

    # --- graph -------------------------------------------------------------
    graph.clear_graph()
    graph.ensure_indexes()
    if snap.nodes:
        graph.upsert_nodes(
            GraphNode(label=node["label"], key=node["key"], props=node.get("props") or {})
            for node in snap.nodes
        )
    if snap.edges:
        graph.upsert_edges(
            GraphEdge(
                type=edge["type"],
                src_label=edge["src_label"],
                src_key=edge["src_key"],
                dst_label=edge["dst_label"],
                dst_key=edge["dst_key"],
                props=edge.get("props") or {},
            )
            for edge in snap.edges
        )

    return {
        "vectors": len(snap.vectors),
        "nodes": len(snap.nodes),
        "edges": len(snap.edges),
    }
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
# ---------------------------------------------------------------------------
|
|
348
|
+
# Verify
|
|
349
|
+
# ---------------------------------------------------------------------------
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
@dataclass(frozen=True)
class VerifyResult:
    """Outcome of :func:`verify_snapshot`."""

    # True when every check passed.
    ok: bool
    # Description of the first failed check; ``None`` on success.
    reason: str | None
    # Manifest of the snapshot that was checked.
    manifest: SnapshotManifest
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def verify_snapshot(
    path: Path | None = None,
    snap: Snapshot | None = None,
    *,
    expected_model: str | None = None,
    expected_dim: int | None = None,
) -> VerifyResult:
    """Check a snapshot's format, embedder compatibility and integrity.

    The checks run in order and stop at the first failure: format version,
    embedding model (only when ``expected_model`` is given), embedding
    dimension (only when ``expected_dim`` is given), then the recomputed
    content digest against the manifest.

    Args:
        path: Archive to load when ``snap`` is not supplied.
        snap: Already loaded snapshot; takes precedence over ``path``.

    Raises:
        ValueError: neither ``path`` nor ``snap`` was supplied.
    """
    if snap is None:
        if path is None:
            raise ValueError("verify_snapshot requires path or snap")
        snap = Snapshot.read(path)
    m = snap.manifest

    failure: str | None = None
    if m.format_version != FORMAT_VERSION:
        failure = f"format_version mismatch (got {m.format_version})"
    elif expected_model and m.embed_model != expected_model:
        failure = f"embed_model mismatch (snapshot={m.embed_model} local={expected_model})"
    elif expected_dim and m.embed_dim != expected_dim:
        failure = f"embed_dim mismatch (snapshot={m.embed_dim} local={expected_dim})"
    elif _canonical_digest(snap.vectors, snap.nodes, snap.edges, snap.state) != m.content_sha256:
        failure = "content digest mismatch (corruption?)"
    return VerifyResult(failure is None, failure, m)
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
# ---------------------------------------------------------------------------
|
|
395
|
+
# Helpers
|
|
396
|
+
# ---------------------------------------------------------------------------
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _canonical_digest(
    vectors: list[dict[str, Any]],
    nodes: list[dict[str, Any]],
    edges: list[dict[str, Any]],
    state: dict[str, Any],
) -> str:
    """Return the hex SHA-256 over the canonical form of a snapshot's content.

    Vectors are ordered by ``id``, nodes by ``(label, key)`` and edges by
    ``(type, src_key, dst_key)``. Each record is then hashed in its
    :func:`_canon` form, followed by ``state``. Records with equal sort
    keys keep their input order.
    """

    def rank(value: Any) -> tuple[int, Any]:
        # ``None`` cannot be ordered against ``str``, so an explicit null
        # key would make ``sorted`` raise TypeError. Ranking it ahead of
        # every real value avoids that. Ordering among non-null values is
        # unchanged, so existing digests stay valid.
        return (0, "") if value is None else (1, value)

    h = hashlib.sha256()
    for v in sorted(vectors, key=lambda x: rank(x.get("id", ""))):
        h.update(_canon(v).encode())
    for n in sorted(
        nodes,
        key=lambda x: (rank(x.get("label", "")), rank(x.get("key", ""))),
    ):
        h.update(_canon(n).encode())
    for e in sorted(
        edges,
        key=lambda x: (
            rank(x.get("type", "")),
            rank(x.get("src_key", "")),
            rank(x.get("dst_key", "")),
        ),
    ):
        h.update(_canon(e).encode())
    h.update(_canon(state).encode())
    return h.hexdigest()
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def _canon(obj: Any) -> str:
|
|
420
|
+
return json.dumps(obj, sort_keys=True, separators=(",", ":"), default=str)
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def _jsonl(rows: list[dict[str, Any]]) -> bytes:
|
|
424
|
+
buf = io.BytesIO()
|
|
425
|
+
for r in rows:
|
|
426
|
+
buf.write(json.dumps(r, sort_keys=True, separators=(",", ":")).encode())
|
|
427
|
+
buf.write(b"\n")
|
|
428
|
+
return buf.getvalue()
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def _read_jsonl(blob: bytes) -> Iterator[dict[str, Any]]:
|
|
432
|
+
for line in blob.splitlines():
|
|
433
|
+
if not line.strip():
|
|
434
|
+
continue
|
|
435
|
+
yield json.loads(line)
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _add(tar: tarfile.TarFile, name: str, data: bytes) -> None:
|
|
439
|
+
info = tarfile.TarInfo(name)
|
|
440
|
+
info.size = len(data)
|
|
441
|
+
info.mtime = 0 # deterministic
|
|
442
|
+
tar.addfile(info, io.BytesIO(data))
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def _extract(tar: tarfile.TarFile, name: str) -> bytes:
|
|
446
|
+
member = tar.getmember(name)
|
|
447
|
+
f = tar.extractfile(member)
|
|
448
|
+
if f is None:
|
|
449
|
+
return b""
|
|
450
|
+
return f.read()
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def _default_creator() -> str:
|
|
454
|
+
import os
|
|
455
|
+
|
|
456
|
+
user = os.environ.get("USER") or os.environ.get("USERNAME") or "unknown"
|
|
457
|
+
return f"{user}@{platform.node()}"
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
# silence unused import warning in some linters
|
|
461
|
+
_ = gzip
|