kgmodule-utils 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kg_utils/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """kg_utils — Shared types, snapshots, and embedding protocol for the KGModule SDK.
2
+
3
+ Sub-packages:
4
+ kg_utils.types — NodeSpec, EdgeSpec, KGExtractor, KGModule, etc.
5
+ kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
6
+ kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
7
+ kg_model_cache_dir(), resolve_model_path().
8
+ """
9
+
10
# Keep in sync with the published distribution version; the wheel this module
# ships in is tagged 0.2.0, so the previous "0.1.0" value was stale.
__version__ = "0.2.0"
kg_utils/embed.py ADDED
@@ -0,0 +1,131 @@
1
+ """
2
+ kg_utils.embed — Shared embedding protocol and model-cache convention.
3
+
4
+ Zero external dependencies (stdlib only). Concrete implementations
5
+ (SentenceTransformerEmbedder, LlamaCppEmbedder) live in each KG module and in
6
+ kgrag; this module provides only the shared contract they all implement.
7
+
8
+ Contents
9
+ --------
10
+ Embedder
11
+ Structural protocol: any object with ``embed_query(text) -> list[float]``
12
+ satisfies it. KG modules, kgrag adapters, and tests can type-hint against
13
+ this without coupling to any specific implementation.
14
+
15
+ DEFAULT_MODEL
16
+ Canonical default embedding model for the KGModule stack.
17
+ ``BAAI/bge-small-en-v1.5`` (384-dim, ~24 MB, no licence restrictions).
18
+
19
+ KNOWN_MODELS
20
+ Short alias → HuggingFace repo ID mapping shared by all modules.
21
+ Lets users write ``"bge-small"`` instead of ``"BAAI/bge-small-en-v1.5"``.
22
+
23
+ kg_model_cache_dir()
24
+ Return the system-wide model cache root (``~/.kgrag/models/`` by default).
25
+ Override with the ``KGRAG_MODEL_DIR`` environment variable. All KG modules
26
+ should resolve their model paths through this function so that a single
27
+ ``KGRAG_MODEL_DIR`` setting redirects every module at once.
28
+
29
+ Local fallback convention for standalone use::
30
+
31
+ path = kg_model_cache_dir() / model_name.replace("/", "--")
32
+
33
+ Author: Eric G. Suchanek, PhD
34
+ License: Elastic 2.0
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ import os
40
+ from pathlib import Path
41
+ from typing import Protocol, runtime_checkable
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Shared protocol
46
+ # ---------------------------------------------------------------------------
47
+
48
+
49
@runtime_checkable
class Embedder(Protocol):
    """Structural contract shared by every embedding backend in the KGModule stack.

    The protocol is satisfied by any object that exposes an ``embed_query``
    method, so concrete embedders never need to inherit from this class.
    Because the class is decorated with ``runtime_checkable``,
    ``isinstance(obj, Embedder)`` works, but that check only verifies that the
    method exists, not its signature or return type.
    """

    def embed_query(self, text: str) -> list[float]:
        """Turn one query string into a dense embedding vector.

        :param text: The query string to embed.
        :return: The embedding as a plain Python list of floats.
        """
        ...
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Shared constants
70
+ # ---------------------------------------------------------------------------
71
+
72
DEFAULT_MODEL: str = "BAAI/bge-small-en-v1.5"
"""Canonical default embedding model for the KGModule stack (384-dim)."""

# The three aliases of the default model reference DEFAULT_MODEL directly so
# they cannot drift apart from it.
KNOWN_MODELS: dict[str, str] = {
    "default": DEFAULT_MODEL,
    "bge-small": DEFAULT_MODEL,
    "bge-small-en-v1.5": DEFAULT_MODEL,
    "bge-large": "BAAI/bge-large-en-v1.5",
    "bge-large-en-v1.5": "BAAI/bge-large-en-v1.5",
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    "nomic": "nomic-ai/nomic-embed-text-v1.5",
    "nomic-v1.5": "nomic-ai/nomic-embed-text-v1.5",
}
"""Short alias → HuggingFace repo ID. Shared by all KG modules and kgrag."""
87
+
88
+
89
+ # ---------------------------------------------------------------------------
90
+ # Shared cache path convention
91
+ # ---------------------------------------------------------------------------
92
+
93
+
94
def kg_model_cache_dir() -> Path:
    """Return the system-wide embedding model cache root.

    Default: ``~/.kgrag/models/``
    Override: set the ``KGRAG_MODEL_DIR`` environment variable.

    All KG modules should resolve model paths through this function so that a
    single env-var change redirects every module's cache at once.

    The directory is not created here; callers are responsible for that.

    :return: Absolute :class:`~pathlib.Path` to the model cache directory.
    """
    override = os.environ.get("KGRAG_MODEL_DIR")
    if override:
        # Expand a leading "~" before resolving: values coming from .env files
        # or quoted exports are not tilde-expanded by the shell, and resolve()
        # alone would turn "~/models" into "<cwd>/~/models".
        return Path(override).expanduser().resolve()
    return Path.home() / ".kgrag" / "models"
109
+
110
+
111
def resolve_model_path(model_name: str, local_fallback: Path | None = None) -> Path:
    """Compute the local cache path for *model_name*.

    This only builds a path; it does not check that the directory exists and
    does not create or download anything.

    The name is first mapped through ``KNOWN_MODELS`` (unknown names are used
    as given). The parent directory is then chosen as follows:

    * ``KGRAG_MODEL_DIR`` is set, or no *local_fallback* was given: the
      system-wide root from ``kg_model_cache_dir()`` is used and the model is
      laid out as nested ``org/model`` directories, e.g.
      ``BAAI/bge-small-en-v1.5`` → ``~/.kgrag/models/BAAI/bge-small-en-v1.5/``.
    * otherwise: *local_fallback* is used and the slash in the repo ID is
      flattened to ``--`` (``BAAI--bge-small-en-v1.5``), so a standalone
      module keeps a single-level local cache.

    :param model_name: HuggingFace model identifier or known alias.
    :param local_fallback: Per-module fallback directory (used only when
        ``KGRAG_MODEL_DIR`` is not set).
    :return: :class:`~pathlib.Path` to the model directory. It is absolute
        when the system-wide root is used; with *local_fallback* it is only as
        absolute as the fallback path passed in.
    """
    repo_id = KNOWN_MODELS.get(model_name, model_name)

    global_override_set = bool(os.environ.get("KGRAG_MODEL_DIR"))
    if local_fallback is not None and not global_override_set:
        return local_fallback / repo_id.replace("/", "--")

    return kg_model_cache_dir() / repo_id.replace("/", os.sep)
kg_utils/py.typed ADDED
File without changes
@@ -0,0 +1,16 @@
1
+ """kg_utils.snapshots — Shared snapshot infrastructure for KG modules.
2
+
3
+ Provides the canonical data models and manager for capturing, storing, and
4
+ comparing temporal metric snapshots. Individual KG backends (pycode_kg, doc_kg,
5
+ ftree_kg, etc.) import from here instead of maintaining their own copies.
6
+ """
7
+
8
+ from kg_utils.snapshots.models import PruneResult, Snapshot, SnapshotManifest
9
+ from kg_utils.snapshots.manager import SnapshotManager
10
+
11
+ __all__ = [
12
+ "PruneResult",
13
+ "Snapshot",
14
+ "SnapshotManifest",
15
+ "SnapshotManager",
16
+ ]
@@ -0,0 +1,497 @@
1
+ """kg_utils/snapshots/manager.py — Snapshot capture, persistence, and comparison.
2
+
3
+ Usage
4
+ -----
5
+ >>> from kg_utils.snapshots import SnapshotManager
6
+ >>> mgr = SnapshotManager(".codekg/snapshots", package_name="code-kg")
7
+ >>> snapshot = mgr.capture(graph_stats_dict=kg.store.stats())
8
+ >>> mgr.save_snapshot(snapshot)
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import dataclasses
14
+ import importlib.metadata
15
+ import json
16
+ import subprocess
17
+ from datetime import UTC, datetime
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ from kg_utils.snapshots.models import PruneResult, Snapshot, SnapshotManifest
22
+
23
+
24
+ class SnapshotManager:
25
+ """Manages snapshot capture, persistence, retrieval, and comparison.
26
+
27
+ This is the single shared implementation. Domain-specific KG libraries
28
+ subclass this to override :meth:`_compute_delta` or
29
+ :meth:`_collect_extra_metrics` when they need domain-specific delta fields
30
+ or automatic metric collection from SQLite.
31
+
32
+ :param snapshots_dir: Directory for snapshot JSON files and manifest.
33
+ :param package_name: Package name for auto-detecting version
34
+ (e.g. ``"code-kg"``, ``"doc-kg"``). Defaults to ``"kg-utils"``.
35
+ :param db_path: Optional SQLite database path for collecting per-module or
36
+ per-directory node counts via :meth:`_collect_breakdown_counts`.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ snapshots_dir: Path | str,
42
+ *,
43
+ package_name: str = "kg-utils",
44
+ db_path: Path | str | None = None,
45
+ ) -> None:
46
+ self.snapshots_dir = Path(snapshots_dir)
47
+ self.snapshots_dir.mkdir(parents=True, exist_ok=True)
48
+ self.manifest_path = self.snapshots_dir / "manifest.json"
49
+ self.package_name = package_name
50
+ self.db_path = Path(db_path) if db_path else None
51
+
52
+ # ------------------------------------------------------------------
53
+ # Package version detection
54
+ # ------------------------------------------------------------------
55
+
56
+ def _package_version(self) -> str:
57
+ """Return the installed package version, or ``'unknown'``."""
58
+ try:
59
+ return importlib.metadata.version(self.package_name)
60
+ except importlib.metadata.PackageNotFoundError:
61
+ return "unknown"
62
+
63
+ # ------------------------------------------------------------------
64
+ # Capture & save
65
+ # ------------------------------------------------------------------
66
+
67
+ def capture(
68
+ self,
69
+ version: str | None = None,
70
+ branch: str | None = None,
71
+ graph_stats_dict: dict[str, Any] | None = None,
72
+ tree_hash: str = "",
73
+ hotspots: list[dict[str, Any]] | None = None,
74
+ issues: list[str] | None = None,
75
+ **extra_metrics: Any,
76
+ ) -> Snapshot:
77
+ """Capture a snapshot from current state.
78
+
79
+ ``graph_stats_dict`` is merged with ``extra_metrics`` to form the
80
+ snapshot's ``metrics`` dict. Pass domain-specific fields as keyword
81
+ arguments (e.g. ``coverage=0.85``, ``critical_issues=2``).
82
+
83
+ :param version: Version string; auto-detected from package if None.
84
+ :param branch: Git branch; auto-detected if None.
85
+ :param graph_stats_dict: Output from the KG's ``stats()`` method.
86
+ :param tree_hash: Git tree hash; auto-detected if not provided.
87
+ :param hotspots: Top hotspot entries.
88
+ :param issues: Issue description strings.
89
+ :param extra_metrics: Additional domain-specific metric fields.
90
+ :return: New :class:`Snapshot` instance (not yet persisted).
91
+ """
92
+ if not version:
93
+ version = self._package_version()
94
+ if branch is None:
95
+ branch = self._get_current_branch()
96
+ if not tree_hash:
97
+ tree_hash = self._get_current_tree_hash()
98
+
99
+ metrics: dict[str, Any] = dict(graph_stats_dict or {})
100
+ metrics.update(extra_metrics)
101
+
102
+ snapshot = Snapshot(
103
+ branch=branch,
104
+ timestamp=datetime.now(UTC).isoformat(),
105
+ version=version,
106
+ metrics=metrics,
107
+ hotspots=hotspots or [],
108
+ issues=issues or [],
109
+ tree_hash=tree_hash,
110
+ )
111
+
112
+ prev = self.get_previous(tree_hash)
113
+ if prev:
114
+ snapshot.vs_previous = self._compute_delta(snapshot, prev)
115
+
116
+ baseline = self.get_baseline()
117
+ if baseline:
118
+ snapshot.vs_baseline = self._compute_delta(snapshot, baseline)
119
+
120
+ return snapshot
121
+
122
+ def save_snapshot(self, snapshot: Snapshot, *, force: bool = False) -> Path | None:
123
+ """Persist a snapshot to disk and update the manifest.
124
+
125
+ Rejects snapshots with zero ``total_nodes`` to protect against
126
+ saving degenerate (unbuilt) state.
127
+
128
+ If ``version`` and ``metrics`` are unchanged from the latest snapshot,
129
+ the existing entry is refreshed in-place rather than creating a new
130
+ history entry. Pass ``force=True`` to always create a new entry.
131
+
132
+ :param snapshot: Snapshot to save.
133
+ :param force: If ``True``, always write a new history entry.
134
+ :return: Path to the saved JSON file, or ``None`` if no-op.
135
+ :raises ValueError: If ``total_nodes`` is 0.
136
+ """
137
+ m = snapshot.metrics
138
+ total_nodes = (
139
+ m.get("total_nodes", 0) if isinstance(m, dict) else getattr(m, "total_nodes", 0)
140
+ )
141
+ if total_nodes == 0:
142
+ raise ValueError(
143
+ "Refusing to save degenerate snapshot with 0 nodes. "
144
+ "Build the KG before capturing a snapshot."
145
+ )
146
+
147
+ manifest = self.load_manifest()
148
+
149
+ # Dedup: refresh latest entry if nothing meaningful changed.
150
+ if not force and manifest.snapshots:
151
+ latest_entry = max(manifest.snapshots, key=lambda x: x.get("timestamp", ""))
152
+ if snapshot.version == latest_entry.get("version", "") and not self._metrics_changed(
153
+ snapshot.metrics, latest_entry.get("metrics", {})
154
+ ):
155
+ old_key = latest_entry["key"]
156
+ old_file = self.snapshots_dir / latest_entry.get("file", f"{old_key}.json")
157
+
158
+ snapshot_file = self.snapshots_dir / f"{snapshot.key}.json"
159
+ snapshot_file.write_text(
160
+ json.dumps(snapshot.to_dict(), indent=2) + "\n", encoding="utf-8"
161
+ )
162
+
163
+ if old_key != snapshot.key and old_file.exists():
164
+ old_file.unlink()
165
+
166
+ latest_entry["key"] = snapshot.key
167
+ latest_entry["branch"] = snapshot.branch
168
+ latest_entry["timestamp"] = snapshot.timestamp
169
+ latest_entry["file"] = snapshot_file.name
170
+
171
+ manifest.last_update = datetime.now(UTC).isoformat()
172
+ self._save_manifest(manifest)
173
+ return snapshot_file
174
+
175
+ # Normal path: new or changed snapshot.
176
+ snapshot_file = self.snapshots_dir / f"{snapshot.key}.json"
177
+ snapshot_file.write_text(json.dumps(snapshot.to_dict(), indent=2) + "\n", encoding="utf-8")
178
+
179
+ existing_idx = next(
180
+ (i for i, s in enumerate(manifest.snapshots) if s.get("key") == snapshot.key),
181
+ None,
182
+ )
183
+
184
+ manifest_entry: dict[str, Any] = {
185
+ "key": snapshot.key,
186
+ "branch": snapshot.branch,
187
+ "timestamp": snapshot.timestamp,
188
+ "version": snapshot.version,
189
+ "file": snapshot_file.name,
190
+ "metrics": snapshot.metrics,
191
+ "deltas": {
192
+ "vs_previous": snapshot.vs_previous,
193
+ "vs_baseline": snapshot.vs_baseline,
194
+ },
195
+ }
196
+
197
+ if existing_idx is not None:
198
+ manifest.snapshots[existing_idx] = manifest_entry
199
+ else:
200
+ manifest.snapshots.append(manifest_entry)
201
+
202
+ manifest.last_update = datetime.now(UTC).isoformat()
203
+ self._save_manifest(manifest)
204
+ return snapshot_file
205
+
206
+ # ------------------------------------------------------------------
207
+ # Loading & listing
208
+ # ------------------------------------------------------------------
209
+
210
+ def load_manifest(self) -> SnapshotManifest:
211
+ """Load ``manifest.json``; return empty manifest if absent."""
212
+ if not self.manifest_path.exists():
213
+ return SnapshotManifest()
214
+ manifest = SnapshotManifest.from_dict(
215
+ json.loads(self.manifest_path.read_text(encoding="utf-8"))
216
+ )
217
+ # Normalise legacy 'tree_hash' -> 'key'
218
+ for entry in manifest.snapshots:
219
+ if "key" not in entry and "tree_hash" in entry:
220
+ entry["key"] = entry.pop("tree_hash")
221
+ return manifest
222
+
223
+ def _save_manifest(self, manifest: SnapshotManifest) -> None:
224
+ self.manifest_path.write_text(
225
+ json.dumps(manifest.to_dict(), indent=2) + "\n", encoding="utf-8"
226
+ )
227
+
228
+ def load_snapshot(self, key: str) -> Snapshot | None:
229
+ """Load a snapshot by key (tree hash) or ``'latest'``.
230
+
231
+ Missing ``vs_previous`` / ``vs_baseline`` deltas are backfilled
232
+ on-the-fly from manifest metadata.
233
+ """
234
+ if key == "latest":
235
+ manifest = self.load_manifest()
236
+ if not manifest.snapshots:
237
+ return None
238
+ entry = max(manifest.snapshots, key=lambda x: x.get("timestamp", ""))
239
+ key = entry["key"]
240
+
241
+ snapshot_file = self.snapshots_dir / f"{key}.json"
242
+ if not snapshot_file.exists():
243
+ return None
244
+ snap = Snapshot.from_dict(json.loads(snapshot_file.read_text(encoding="utf-8")))
245
+
246
+ # Backfill missing deltas from manifest
247
+ if snap.vs_previous is None or snap.vs_baseline is None:
248
+ manifest = self.load_manifest()
249
+ entries = sorted(manifest.snapshots, key=lambda x: x.get("timestamp", ""), reverse=True)
250
+ idx = next((i for i, s in enumerate(entries) if s.get("key") == key), None)
251
+
252
+ if idx is not None:
253
+ if snap.vs_previous is None and idx + 1 < len(entries):
254
+ prev_m = entries[idx + 1].get("metrics", {})
255
+ snap.vs_previous = {
256
+ "nodes": snap.metrics.get("total_nodes", 0) - prev_m.get("total_nodes", 0),
257
+ "edges": snap.metrics.get("total_edges", 0) - prev_m.get("total_edges", 0),
258
+ }
259
+ if snap.vs_baseline is None and entries:
260
+ base_m = entries[-1].get("metrics", {})
261
+ if entries[-1].get("key") != key:
262
+ snap.vs_baseline = {
263
+ "nodes": snap.metrics.get("total_nodes", 0)
264
+ - base_m.get("total_nodes", 0),
265
+ "edges": snap.metrics.get("total_edges", 0)
266
+ - base_m.get("total_edges", 0),
267
+ }
268
+ return snap
269
+
270
+ def get_previous(self, key: str) -> Snapshot | None:
271
+ """Get the snapshot immediately before *key* (by timestamp)."""
272
+ manifest = self.load_manifest()
273
+ current_ts = next(
274
+ (s["timestamp"] for s in manifest.snapshots if s.get("key") == key),
275
+ None,
276
+ )
277
+ if not current_ts:
278
+ return None
279
+ prev_entry = None
280
+ for s in sorted(manifest.snapshots, key=lambda x: x["timestamp"], reverse=True):
281
+ if s["timestamp"] < current_ts:
282
+ prev_entry = s
283
+ break
284
+ return self.load_snapshot(prev_entry["key"]) if prev_entry else None
285
+
286
+ def get_baseline(self) -> Snapshot | None:
287
+ """Get the oldest snapshot (baseline for comparison)."""
288
+ manifest = self.load_manifest()
289
+ if not manifest.snapshots:
290
+ return None
291
+ baseline_entry = min(manifest.snapshots, key=lambda x: x["timestamp"])
292
+ return self.load_snapshot(baseline_entry["key"])
293
+
294
+ def list_snapshots(
295
+ self,
296
+ limit: int | None = None,
297
+ branch: str | None = None,
298
+ ) -> list[dict[str, Any]]:
299
+ """List snapshots in reverse chronological order.
300
+
301
+ :param limit: Max number to return; ``None`` = all.
302
+ :param branch: If provided, filter by branch name.
303
+ :return: List of snapshot metadata dicts.
304
+ """
305
+ manifest = self.load_manifest()
306
+ all_snaps = sorted(manifest.snapshots, key=lambda x: x["timestamp"], reverse=True)
307
+
308
+ if branch is not None:
309
+ all_snaps = [s for s in all_snaps if s.get("branch") == branch]
310
+
311
+ for i, snap in enumerate(all_snaps):
312
+ if snap.get("deltas", {}).get("vs_previous") is None and i + 1 < len(all_snaps):
313
+ prev = all_snaps[i + 1]
314
+ snap.setdefault("deltas", {})["vs_previous"] = self._compute_delta_from_metrics(
315
+ snap["metrics"], prev["metrics"]
316
+ )
317
+
318
+ return all_snaps[:limit] if limit else all_snaps
319
+
320
+ def diff_snapshots(self, key_a: str, key_b: str) -> dict[str, Any]:
321
+ """Compare two snapshots side-by-side.
322
+
323
+ :param key_a: First snapshot key (tree hash).
324
+ :param key_b: Second snapshot key (tree hash).
325
+ :return: Dict with metrics from both and computed deltas.
326
+ """
327
+ snap_a = self.load_snapshot(key_a)
328
+ snap_b = self.load_snapshot(key_b)
329
+
330
+ if not snap_a or not snap_b:
331
+ return {"error": "One or both snapshots not found"}
332
+
333
+ all_node_kinds = set(snap_a.metrics.get("node_counts", {})) | set(
334
+ snap_b.metrics.get("node_counts", {})
335
+ )
336
+ all_edge_rels = set(snap_a.metrics.get("edge_counts", {})) | set(
337
+ snap_b.metrics.get("edge_counts", {})
338
+ )
339
+
340
+ node_counts_delta = {
341
+ k: snap_b.metrics.get("node_counts", {}).get(k, 0)
342
+ - snap_a.metrics.get("node_counts", {}).get(k, 0)
343
+ for k in all_node_kinds
344
+ }
345
+ edge_counts_delta = {
346
+ k: snap_b.metrics.get("edge_counts", {}).get(k, 0)
347
+ - snap_a.metrics.get("edge_counts", {}).get(k, 0)
348
+ for k in all_edge_rels
349
+ }
350
+
351
+ return {
352
+ "a": {"key": snap_a.key, "metrics": snap_a.metrics, "issues": snap_a.issues},
353
+ "b": {"key": snap_b.key, "metrics": snap_b.metrics, "issues": snap_b.issues},
354
+ "delta": self._compute_delta(snap_b, snap_a),
355
+ "node_counts_delta": node_counts_delta,
356
+ "edge_counts_delta": edge_counts_delta,
357
+ }
358
+
359
+ # ------------------------------------------------------------------
360
+ # Delta computation — override for domain-specific delta fields
361
+ # ------------------------------------------------------------------
362
+
363
+ def _metrics_changed(self, new_metrics: dict[str, Any], old_metrics: dict[str, Any]) -> bool:
364
+ """Return ``True`` if metrics represent a meaningful change.
365
+
366
+ Override in subclasses to customise.
367
+ """
368
+ return new_metrics != old_metrics
369
+
370
+ def _compute_delta(self, snap_new: Snapshot, snap_old: Snapshot) -> dict[str, Any]:
371
+ """Compute metrics delta (new - old).
372
+
373
+ Override in subclasses to add domain-specific delta fields.
374
+ """
375
+
376
+ def _to_dict(m: Any) -> dict[str, Any]:
377
+ if isinstance(m, dict):
378
+ return m
379
+ if dataclasses.is_dataclass(m) and not isinstance(m, type):
380
+ return dataclasses.asdict(m)
381
+ return {}
382
+
383
+ return self._compute_delta_from_metrics(
384
+ _to_dict(snap_new.metrics), _to_dict(snap_old.metrics)
385
+ )
386
+
387
+ def _compute_delta_from_metrics(
388
+ self, new_m: dict[str, Any], old_m: dict[str, Any]
389
+ ) -> dict[str, Any]:
390
+ """Compute delta from two raw metrics dicts.
391
+
392
+ Override in subclasses to add domain-specific delta fields.
393
+ """
394
+ return {
395
+ "nodes": new_m.get("total_nodes", 0) - old_m.get("total_nodes", 0),
396
+ "edges": new_m.get("total_edges", 0) - old_m.get("total_edges", 0),
397
+ }
398
+
399
+ # ------------------------------------------------------------------
400
+ # Prune
401
+ # ------------------------------------------------------------------
402
+
403
+ def prune_snapshots(self, *, dry_run: bool = False) -> PruneResult:
404
+ """Remove vestigial snapshots that carry no new metric information.
405
+
406
+ :param dry_run: If ``True``, compute what would be removed without deleting.
407
+ :return: :class:`PruneResult` summarising the cleanup.
408
+ """
409
+ manifest = self.load_manifest()
410
+ by_time = sorted(manifest.snapshots, key=lambda x: x.get("timestamp", ""))
411
+
412
+ removed_keys: list[str] = []
413
+ broken_keys: list[str] = []
414
+ orphaned_files: list[str] = []
415
+
416
+ # Pass 1: separate valid entries from broken ones.
417
+ valid: list[dict[str, Any]] = []
418
+ for entry in by_time:
419
+ key = entry.get("key", "")
420
+ fname = entry.get("file", f"{key}.json")
421
+ if not (self.snapshots_dir / fname).exists():
422
+ broken_keys.append(key)
423
+ else:
424
+ valid.append(entry)
425
+
426
+ # Pass 2: flag metric-duplicate interior entries.
427
+ if len(valid) > 2:
428
+ kept_metrics = valid[0].get("metrics", {})
429
+ for entry in valid[1:-1]:
430
+ m = entry.get("metrics", {})
431
+ if not self._metrics_changed(m, kept_metrics):
432
+ removed_keys.append(entry.get("key", ""))
433
+ else:
434
+ kept_metrics = m
435
+
436
+ # Pass 3: find orphaned JSON files.
437
+ referenced_files = {e.get("file", f"{e.get('key', '')}.json") for e in manifest.snapshots}
438
+ for path in self.snapshots_dir.glob("*.json"):
439
+ if path.name == "manifest.json":
440
+ continue
441
+ if path.name not in referenced_files:
442
+ orphaned_files.append(path.name)
443
+
444
+ if not dry_run:
445
+ entry_by_key = {e.get("key"): e for e in manifest.snapshots}
446
+
447
+ for key in removed_keys:
448
+ entry = entry_by_key.get(key, {})
449
+ fname = entry.get("file", f"{key}.json")
450
+ p = self.snapshots_dir / fname
451
+ if p.exists():
452
+ p.unlink()
453
+
454
+ for fname in orphaned_files:
455
+ p = self.snapshots_dir / fname
456
+ if p.exists():
457
+ p.unlink()
458
+
459
+ drop_keys = set(removed_keys) | set(broken_keys)
460
+ manifest.snapshots = [e for e in manifest.snapshots if e.get("key") not in drop_keys]
461
+ manifest.last_update = datetime.now(UTC).isoformat()
462
+ self._save_manifest(manifest)
463
+
464
+ return PruneResult(
465
+ removed=removed_keys,
466
+ orphaned_files=orphaned_files,
467
+ broken_entries=broken_keys,
468
+ dry_run=dry_run,
469
+ )
470
+
471
+ # ------------------------------------------------------------------
472
+ # Git helpers
473
+ # ------------------------------------------------------------------
474
+
475
+ @staticmethod
476
+ def _get_current_tree_hash() -> str:
477
+ """Get current git tree hash (HEAD^{tree})."""
478
+ try:
479
+ return subprocess.check_output(
480
+ ["git", "rev-parse", "HEAD^{tree}"],
481
+ text=True,
482
+ stderr=subprocess.DEVNULL,
483
+ ).strip()
484
+ except (subprocess.CalledProcessError, FileNotFoundError):
485
+ return ""
486
+
487
+ @staticmethod
488
+ def _get_current_branch() -> str:
489
+ """Get current git branch name."""
490
+ try:
491
+ return subprocess.check_output(
492
+ ["git", "rev-parse", "--abbrev-ref", "HEAD"],
493
+ text=True,
494
+ stderr=subprocess.DEVNULL,
495
+ ).strip()
496
+ except (subprocess.CalledProcessError, FileNotFoundError):
497
+ return "unknown"