kgmodule-utils 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kg_utils/__init__.py +10 -0
- kg_utils/embed.py +131 -0
- kg_utils/py.typed +0 -0
- kg_utils/snapshots/__init__.py +16 -0
- kg_utils/snapshots/manager.py +497 -0
- kg_utils/snapshots/models.py +137 -0
- kg_utils/types/__init__.py +14 -0
- kg_utils/types/extractor.py +68 -0
- kg_utils/types/module.py +87 -0
- kg_utils/types/specs.py +90 -0
- kgmodule_utils-0.2.0.dist-info/METADATA +210 -0
- kgmodule_utils-0.2.0.dist-info/RECORD +14 -0
- kgmodule_utils-0.2.0.dist-info/WHEEL +4 -0
- kgmodule_utils-0.2.0.dist-info/licenses/LICENSE +93 -0
kg_utils/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""kg_utils — Shared types, snapshots, and embedding protocol for the KGModule SDK.
|
|
2
|
+
|
|
3
|
+
Sub-packages:
|
|
4
|
+
kg_utils.types — NodeSpec, EdgeSpec, KGExtractor, KGModule, etc.
|
|
5
|
+
kg_utils.snapshots — Snapshot, SnapshotManager, SnapshotManifest, etc.
|
|
6
|
+
kg_utils.embed — Embedder protocol, DEFAULT_MODEL, KNOWN_MODELS,
|
|
7
|
+
kg_model_cache_dir(), resolve_model_path().
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Must match the released distribution version (this wheel is kgmodule-utils 0.2.0).
__version__ = "0.2.0"
|
kg_utils/embed.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""
|
|
2
|
+
kg_utils.embed — Shared embedding protocol and model-cache convention.
|
|
3
|
+
|
|
4
|
+
Zero external dependencies (stdlib only). Concrete implementations
|
|
5
|
+
(SentenceTransformerEmbedder, LlamaCppEmbedder) live in each KG module and in
|
|
6
|
+
kgrag; this module provides only the shared contract they all implement.
|
|
7
|
+
|
|
8
|
+
Contents
|
|
9
|
+
--------
|
|
10
|
+
Embedder
|
|
11
|
+
Structural protocol: any object with ``embed_query(text) -> list[float]``
|
|
12
|
+
satisfies it. KG modules, kgrag adapters, and tests can type-hint against
|
|
13
|
+
this without coupling to any specific implementation.
|
|
14
|
+
|
|
15
|
+
DEFAULT_MODEL
|
|
16
|
+
Canonical default embedding model for the KGModule stack.
|
|
17
|
+
``BAAI/bge-small-en-v1.5`` (384-dim, ~24 MB, no licence restrictions).
|
|
18
|
+
|
|
19
|
+
KNOWN_MODELS
|
|
20
|
+
Short alias → HuggingFace repo ID mapping shared by all modules.
|
|
21
|
+
Lets users write ``"bge-small"`` instead of ``"BAAI/bge-small-en-v1.5"``.
|
|
22
|
+
|
|
23
|
+
kg_model_cache_dir()
|
|
24
|
+
Return the system-wide model cache root (``~/.kgrag/models/`` by default).
|
|
25
|
+
Override with the ``KGRAG_MODEL_DIR`` environment variable. All KG modules
|
|
26
|
+
should resolve their model paths through this function so that a single
|
|
27
|
+
``KGRAG_MODEL_DIR`` setting redirects every module at once.
|
|
28
|
+
|
|
29
|
+
Local fallback convention for standalone use::
|
|
30
|
+
|
|
31
|
+
path = local_fallback / model_name.replace("/", "--")
|
|
32
|
+
|
|
33
|
+
Author: Eric G. Suchanek, PhD
|
|
34
|
+
License: Elastic 2.0
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
import os
|
|
40
|
+
from pathlib import Path
|
|
41
|
+
from typing import Protocol, runtime_checkable
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Shared protocol
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@runtime_checkable
class Embedder(Protocol):
    """Structural contract shared by every embedder in the KGModule stack.

    No inheritance is required: any object that exposes an ``embed_query``
    method satisfies the protocol, so KG backends, adapters and tests can
    type-hint against it without depending on a concrete implementation.

    The class is decorated with :func:`typing.runtime_checkable`, so
    ``isinstance(obj, Embedder)`` is supported; that check only confirms the
    method is present, not that its signature matches.
    """

    def embed_query(self, text: str) -> list[float]:
        """Turn a single query string into a dense embedding vector.

        :param text: The query string to embed.
        :return: The embedding as a plain Python list of floats.
        """
        ...
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
# Shared constants
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
DEFAULT_MODEL: str = "BAAI/bge-small-en-v1.5"
"""Canonical default embedding model for the KGModule stack (384-dim)."""

KNOWN_MODELS: dict[str, str] = {
    # BAAI BGE family ("default" tracks DEFAULT_MODEL).
    "default": DEFAULT_MODEL,
    "bge-small": "BAAI/bge-small-en-v1.5",
    "bge-small-en-v1.5": "BAAI/bge-small-en-v1.5",
    "bge-large": "BAAI/bge-large-en-v1.5",
    "bge-large-en-v1.5": "BAAI/bge-large-en-v1.5",
    # sentence-transformers models
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    # Nomic models
    "nomic": "nomic-ai/nomic-embed-text-v1.5",
    "nomic-v1.5": "nomic-ai/nomic-embed-text-v1.5",
}
"""Short alias → HuggingFace repo ID.  Shared by all KG modules and kgrag."""
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
# Shared cache path convention
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def kg_model_cache_dir() -> Path:
    """Return the system-wide embedding model cache root.

    Default: ``~/.kgrag/models/``
    Override: set the ``KGRAG_MODEL_DIR`` environment variable.

    All KG modules should resolve model paths through this function so that a
    single env-var change redirects every module's cache at once.

    The directory is not created here; callers create it when they need it.

    :return: Absolute :class:`~pathlib.Path` to the model cache directory.
    """
    override = os.environ.get("KGRAG_MODEL_DIR")
    if override:
        # The value may come from a .env file or service config where the
        # shell never expanded "~"; expand it so "~/models" means the user's
        # home directory rather than a literal "~" folder under the cwd.
        return Path(override).expanduser().resolve()
    return Path.home() / ".kgrag" / "models"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def resolve_model_path(model_name: str, local_fallback: Path | None = None) -> Path:
    """Compute the on-disk location for *model_name*.

    The name is first translated through ``KNOWN_MODELS`` (unknown names are
    used as-is).  Two layouts are then possible:

    * *local_fallback* given **and** ``KGRAG_MODEL_DIR`` unset/empty: the model
      lives directly under *local_fallback* with ``/`` flattened to ``--``
      (``BAAI--bge-small-en-v1.5``).
    * otherwise: the model lives under ``kg_model_cache_dir()`` using the
      ``<org>/<model>`` directory structure (``BAAI/bge-small-en-v1.5``).

    Only the path is computed; nothing is checked for existence or created.

    :param model_name: HuggingFace model identifier or known alias.
    :param local_fallback: Per-module fallback directory (used only when
        ``KGRAG_MODEL_DIR`` is not set).
    :return: :class:`~pathlib.Path` to the model directory.
    """
    repo_id = KNOWN_MODELS.get(model_name, model_name)

    # A global override always wins over a module's private cache directory.
    use_local = local_fallback is not None and not os.environ.get("KGRAG_MODEL_DIR")
    if use_local:
        return local_fallback / repo_id.replace("/", "--")
    return kg_model_cache_dir() / repo_id.replace("/", os.sep)
|
kg_utils/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""kg_utils.snapshots — Shared snapshot infrastructure for KG modules.
|
|
2
|
+
|
|
3
|
+
Provides the canonical data models and manager for capturing, storing, and
|
|
4
|
+
comparing temporal metric snapshots. Individual KG backends (pycode_kg, doc_kg,
|
|
5
|
+
ftree_kg, etc.) import from here instead of maintaining their own copies.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from kg_utils.snapshots.models import PruneResult, Snapshot, SnapshotManifest
|
|
9
|
+
from kg_utils.snapshots.manager import SnapshotManager
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"PruneResult",
|
|
13
|
+
"Snapshot",
|
|
14
|
+
"SnapshotManifest",
|
|
15
|
+
"SnapshotManager",
|
|
16
|
+
]
|
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
"""kg_utils/snapshots/manager.py — Snapshot capture, persistence, and comparison.
|
|
2
|
+
|
|
3
|
+
Usage
|
|
4
|
+
-----
|
|
5
|
+
>>> from kg_utils.snapshots import SnapshotManager
|
|
6
|
+
>>> mgr = SnapshotManager(".codekg/snapshots", package_name="code-kg")
|
|
7
|
+
>>> snapshot = mgr.capture(graph_stats_dict=kg.store.stats())
|
|
8
|
+
>>> mgr.save_snapshot(snapshot)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import dataclasses
|
|
14
|
+
import importlib.metadata
|
|
15
|
+
import json
|
|
16
|
+
import subprocess
|
|
17
|
+
from datetime import UTC, datetime
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from kg_utils.snapshots.models import PruneResult, Snapshot, SnapshotManifest
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SnapshotManager:
|
|
25
|
+
"""Manages snapshot capture, persistence, retrieval, and comparison.
|
|
26
|
+
|
|
27
|
+
This is the single shared implementation. Domain-specific KG libraries
|
|
28
|
+
subclass this to override :meth:`_compute_delta` or
|
|
29
|
+
:meth:`_collect_extra_metrics` when they need domain-specific delta fields
|
|
30
|
+
or automatic metric collection from SQLite.
|
|
31
|
+
|
|
32
|
+
:param snapshots_dir: Directory for snapshot JSON files and manifest.
|
|
33
|
+
:param package_name: Package name for auto-detecting version
|
|
34
|
+
(e.g. ``"code-kg"``, ``"doc-kg"``). Defaults to ``"kg-utils"``.
|
|
35
|
+
:param db_path: Optional SQLite database path for collecting per-module or
|
|
36
|
+
per-directory node counts via :meth:`_collect_breakdown_counts`.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def __init__(
    self,
    snapshots_dir: Path | str,
    *,
    package_name: str = "kg-utils",
    db_path: Path | str | None = None,
) -> None:
    """Set up the manager and make sure the snapshot directory exists.

    :param snapshots_dir: Directory holding snapshot JSON files and
        ``manifest.json``; created (with parents) if missing.
    :param package_name: Distribution name used to auto-detect the version
        when capturing a snapshot.
    :param db_path: Optional database path; stored as a
        :class:`~pathlib.Path`, or ``None`` when omitted or empty.
    """
    self.snapshots_dir = root = Path(snapshots_dir)
    root.mkdir(parents=True, exist_ok=True)

    self.manifest_path = root / "manifest.json"
    self.package_name = package_name

    if db_path:
        self.db_path = Path(db_path)
    else:
        self.db_path = None
|
|
51
|
+
|
|
52
|
+
# ------------------------------------------------------------------
|
|
53
|
+
# Package version detection
|
|
54
|
+
# ------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
def _package_version(self) -> str:
    """Look up the installed version of ``self.package_name``.

    :return: The version reported by :mod:`importlib.metadata`, or the
        literal string ``"unknown"`` when no such distribution is installed.
    """
    try:
        detected = importlib.metadata.version(self.package_name)
    except importlib.metadata.PackageNotFoundError:
        return "unknown"
    return detected
|
|
62
|
+
|
|
63
|
+
# ------------------------------------------------------------------
|
|
64
|
+
# Capture & save
|
|
65
|
+
# ------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
def capture(
    self,
    version: str | None = None,
    branch: str | None = None,
    graph_stats_dict: dict[str, Any] | None = None,
    tree_hash: str = "",
    hotspots: list[dict[str, Any]] | None = None,
    issues: list[str] | None = None,
    **extra_metrics: Any,
) -> Snapshot:
    """Capture a snapshot from current state.

    ``graph_stats_dict`` is copied and then updated with ``extra_metrics`` to
    form the snapshot's ``metrics`` dict (keyword values win on key clashes).
    Pass domain-specific fields as keyword arguments
    (e.g. ``coverage=0.85``, ``critical_issues=2``).

    The returned snapshot carries ``vs_previous`` / ``vs_baseline`` deltas
    when a predecessor / baseline snapshot can be loaded.

    :param version: Version string; auto-detected from the package if empty.
    :param branch: Git branch; auto-detected if ``None``.
    :param graph_stats_dict: Output from the KG's ``stats()`` method.
    :param tree_hash: Git tree hash; auto-detected if empty.
    :param hotspots: Top hotspot entries.
    :param issues: Issue description strings.
    :param extra_metrics: Additional domain-specific metric fields.
    :return: New :class:`Snapshot` instance (not yet persisted).
    """
    if not version:
        version = self._package_version()
    if branch is None:
        branch = self._get_current_branch()
    if not tree_hash:
        tree_hash = self._get_current_tree_hash()

    # Copy so the caller's stats dict is never mutated.
    metrics: dict[str, Any] = dict(graph_stats_dict or {})
    metrics.update(extra_metrics)

    snapshot = Snapshot(
        branch=branch,
        timestamp=datetime.now(UTC).isoformat(),
        version=version,
        metrics=metrics,
        hotspots=hotspots or [],
        issues=issues or [],
        tree_hash=tree_hash,
    )

    # get_previous() finds its reference point by looking the key up in the
    # manifest.  A snapshot that is being captured for the first time is not
    # in the manifest yet, so that lookup would always come back empty and
    # vs_previous would never be filled in.  For an unseen key the most
    # recent stored snapshot is the predecessor.
    known_keys = {entry.get("key") for entry in self.load_manifest().snapshots}
    if tree_hash in known_keys:
        prev = self.get_previous(tree_hash)
    else:
        prev = self.load_snapshot("latest")
    if prev:
        snapshot.vs_previous = self._compute_delta(snapshot, prev)

    baseline = self.get_baseline()
    if baseline:
        snapshot.vs_baseline = self._compute_delta(snapshot, baseline)

    return snapshot
|
|
121
|
+
|
|
122
|
+
def save_snapshot(self, snapshot: Snapshot, *, force: bool = False) -> Path | None:
|
|
123
|
+
"""Persist a snapshot to disk and update the manifest.
|
|
124
|
+
|
|
125
|
+
Rejects snapshots with zero ``total_nodes`` to protect against
|
|
126
|
+
saving degenerate (unbuilt) state.
|
|
127
|
+
|
|
128
|
+
If ``version`` and ``metrics`` are unchanged from the latest snapshot,
|
|
129
|
+
the existing entry is refreshed in-place rather than creating a new
|
|
130
|
+
history entry. Pass ``force=True`` to always create a new entry.
|
|
131
|
+
|
|
132
|
+
:param snapshot: Snapshot to save.
|
|
133
|
+
:param force: If ``True``, always write a new history entry.
|
|
134
|
+
:return: Path to the saved JSON file, or ``None`` if no-op.
|
|
135
|
+
:raises ValueError: If ``total_nodes`` is 0.
|
|
136
|
+
"""
|
|
137
|
+
m = snapshot.metrics
|
|
138
|
+
total_nodes = (
|
|
139
|
+
m.get("total_nodes", 0) if isinstance(m, dict) else getattr(m, "total_nodes", 0)
|
|
140
|
+
)
|
|
141
|
+
if total_nodes == 0:
|
|
142
|
+
raise ValueError(
|
|
143
|
+
"Refusing to save degenerate snapshot with 0 nodes. "
|
|
144
|
+
"Build the KG before capturing a snapshot."
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
manifest = self.load_manifest()
|
|
148
|
+
|
|
149
|
+
# Dedup: refresh latest entry if nothing meaningful changed.
|
|
150
|
+
if not force and manifest.snapshots:
|
|
151
|
+
latest_entry = max(manifest.snapshots, key=lambda x: x.get("timestamp", ""))
|
|
152
|
+
if snapshot.version == latest_entry.get("version", "") and not self._metrics_changed(
|
|
153
|
+
snapshot.metrics, latest_entry.get("metrics", {})
|
|
154
|
+
):
|
|
155
|
+
old_key = latest_entry["key"]
|
|
156
|
+
old_file = self.snapshots_dir / latest_entry.get("file", f"{old_key}.json")
|
|
157
|
+
|
|
158
|
+
snapshot_file = self.snapshots_dir / f"{snapshot.key}.json"
|
|
159
|
+
snapshot_file.write_text(
|
|
160
|
+
json.dumps(snapshot.to_dict(), indent=2) + "\n", encoding="utf-8"
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
if old_key != snapshot.key and old_file.exists():
|
|
164
|
+
old_file.unlink()
|
|
165
|
+
|
|
166
|
+
latest_entry["key"] = snapshot.key
|
|
167
|
+
latest_entry["branch"] = snapshot.branch
|
|
168
|
+
latest_entry["timestamp"] = snapshot.timestamp
|
|
169
|
+
latest_entry["file"] = snapshot_file.name
|
|
170
|
+
|
|
171
|
+
manifest.last_update = datetime.now(UTC).isoformat()
|
|
172
|
+
self._save_manifest(manifest)
|
|
173
|
+
return snapshot_file
|
|
174
|
+
|
|
175
|
+
# Normal path: new or changed snapshot.
|
|
176
|
+
snapshot_file = self.snapshots_dir / f"{snapshot.key}.json"
|
|
177
|
+
snapshot_file.write_text(json.dumps(snapshot.to_dict(), indent=2) + "\n", encoding="utf-8")
|
|
178
|
+
|
|
179
|
+
existing_idx = next(
|
|
180
|
+
(i for i, s in enumerate(manifest.snapshots) if s.get("key") == snapshot.key),
|
|
181
|
+
None,
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
manifest_entry: dict[str, Any] = {
|
|
185
|
+
"key": snapshot.key,
|
|
186
|
+
"branch": snapshot.branch,
|
|
187
|
+
"timestamp": snapshot.timestamp,
|
|
188
|
+
"version": snapshot.version,
|
|
189
|
+
"file": snapshot_file.name,
|
|
190
|
+
"metrics": snapshot.metrics,
|
|
191
|
+
"deltas": {
|
|
192
|
+
"vs_previous": snapshot.vs_previous,
|
|
193
|
+
"vs_baseline": snapshot.vs_baseline,
|
|
194
|
+
},
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
if existing_idx is not None:
|
|
198
|
+
manifest.snapshots[existing_idx] = manifest_entry
|
|
199
|
+
else:
|
|
200
|
+
manifest.snapshots.append(manifest_entry)
|
|
201
|
+
|
|
202
|
+
manifest.last_update = datetime.now(UTC).isoformat()
|
|
203
|
+
self._save_manifest(manifest)
|
|
204
|
+
return snapshot_file
|
|
205
|
+
|
|
206
|
+
# ------------------------------------------------------------------
|
|
207
|
+
# Loading & listing
|
|
208
|
+
# ------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
def load_manifest(self) -> SnapshotManifest:
    """Read ``manifest.json`` from the snapshot directory.

    Entries written by older versions that use ``tree_hash`` instead of
    ``key`` are normalised in memory (the file itself is not rewritten here).

    :return: The parsed manifest, or an empty :class:`SnapshotManifest` when
        the file does not exist.
    """
    if not self.manifest_path.exists():
        return SnapshotManifest()

    raw = json.loads(self.manifest_path.read_text(encoding="utf-8"))
    manifest = SnapshotManifest.from_dict(raw)

    # Legacy entries: rename 'tree_hash' -> 'key' when no 'key' is present.
    legacy_entries = (
        entry for entry in manifest.snapshots if "key" not in entry and "tree_hash" in entry
    )
    for entry in legacy_entries:
        entry["key"] = entry.pop("tree_hash")

    return manifest
|
|
222
|
+
|
|
223
|
+
def _save_manifest(self, manifest: SnapshotManifest) -> None:
    """Serialise *manifest* to ``self.manifest_path`` as indented JSON.

    :param manifest: Manifest to persist; its ``to_dict()`` output is written
        with a two-space indent and a trailing newline, UTF-8 encoded.
    """
    serialised = json.dumps(manifest.to_dict(), indent=2)
    self.manifest_path.write_text(f"{serialised}\n", encoding="utf-8")
|
|
227
|
+
|
|
228
|
+
def load_snapshot(self, key: str) -> Snapshot | None:
    """Load a snapshot by key (tree hash) or ``'latest'``.

    ``'latest'`` resolves to the manifest entry with the greatest timestamp.
    The snapshot body is read from ``<key>.json`` in the snapshot directory.

    Missing ``vs_previous`` / ``vs_baseline`` deltas are backfilled on the
    fly from the metrics stored in the manifest, using
    :meth:`_compute_delta_from_metrics` so that subclass overrides of the
    delta computation also apply to backfilled values.  The backfill only
    affects the returned object; nothing is written back to disk.

    :param key: Snapshot key, or the literal ``"latest"``.
    :return: The snapshot, or ``None`` if the manifest is empty (for
        ``"latest"``) or the snapshot file does not exist.
    """
    if key == "latest":
        manifest = self.load_manifest()
        if not manifest.snapshots:
            return None
        key = max(manifest.snapshots, key=lambda x: x.get("timestamp", ""))["key"]

    snapshot_file = self.snapshots_dir / f"{key}.json"
    if not snapshot_file.exists():
        return None
    snap = Snapshot.from_dict(json.loads(snapshot_file.read_text(encoding="utf-8")))

    # Nothing to backfill: both deltas were stored with the snapshot.
    if snap.vs_previous is not None and snap.vs_baseline is not None:
        return snap

    # Newest first, so the predecessor of entries[i] is entries[i + 1] and
    # the baseline (oldest snapshot) is entries[-1].
    entries = sorted(
        self.load_manifest().snapshots, key=lambda x: x.get("timestamp", ""), reverse=True
    )
    idx = next((i for i, s in enumerate(entries) if s.get("key") == key), None)
    if idx is None:
        # Snapshot file exists but the manifest does not know it.
        return snap

    if snap.vs_previous is None and idx + 1 < len(entries):
        snap.vs_previous = self._compute_delta_from_metrics(
            snap.metrics, entries[idx + 1].get("metrics", {})
        )

    oldest = entries[-1]
    # The baseline has no delta against itself.
    if snap.vs_baseline is None and oldest.get("key") != key:
        snap.vs_baseline = self._compute_delta_from_metrics(
            snap.metrics, oldest.get("metrics", {})
        )

    return snap
|
|
269
|
+
|
|
270
|
+
def get_previous(self, key: str) -> Snapshot | None:
    """Return the snapshot that chronologically precedes *key*.

    The reference timestamp is taken from the manifest entry whose key is
    *key*; the result is the entry with the greatest timestamp strictly
    before it, loaded via :meth:`load_snapshot`.

    :param key: Key of the snapshot whose predecessor is wanted.
    :return: The preceding snapshot, or ``None`` if *key* is not in the
        manifest or no earlier entry exists.
    """
    entries = self.load_manifest().snapshots

    anchor_ts = next((e["timestamp"] for e in entries if e.get("key") == key), None)
    if not anchor_ts:
        return None

    older = [e for e in entries if e["timestamp"] < anchor_ts]
    if not older:
        return None

    nearest = max(older, key=lambda e: e["timestamp"])
    return self.load_snapshot(nearest["key"])
|
|
285
|
+
|
|
286
|
+
def get_baseline(self) -> Snapshot | None:
    """Return the oldest recorded snapshot, used as the comparison baseline.

    :return: The snapshot with the smallest manifest timestamp (loaded via
        :meth:`load_snapshot`), or ``None`` when the manifest is empty.
    """
    entries = self.load_manifest().snapshots
    if not entries:
        return None

    oldest = sorted(entries, key=lambda e: e["timestamp"])[0]
    return self.load_snapshot(oldest["key"])
|
|
293
|
+
|
|
294
|
+
def list_snapshots(
    self,
    limit: int | None = None,
    branch: str | None = None,
) -> list[dict[str, Any]]:
    """List snapshot manifest entries in reverse chronological order.

    Entries lacking a stored ``vs_previous`` delta get one computed against
    the next-older entry *of the returned list* (so with a branch filter the
    comparison is against the previous snapshot on that branch).  The oldest
    entry is left untouched.  Only the in-memory entries are modified; the
    manifest file is not rewritten.

    :param limit: Max number to return; ``None`` = all, ``0`` = none.  Other
        values are applied as a slice bound (``entries[:limit]``).
    :param branch: If provided, keep only entries for this branch name.
    :return: List of snapshot metadata dicts.
    """
    manifest = self.load_manifest()
    ordered = sorted(manifest.snapshots, key=lambda x: x.get("timestamp", ""), reverse=True)

    if branch is not None:
        ordered = [s for s in ordered if s.get("branch") == branch]

    # Walk (newer, older) neighbours; the last entry has no predecessor.
    for newer, older in zip(ordered, ordered[1:]):
        deltas = newer.setdefault("deltas", {})
        if deltas.get("vs_previous") is None:
            deltas["vs_previous"] = self._compute_delta_from_metrics(
                newer.get("metrics", {}), older.get("metrics", {})
            )

    # Compare against None explicitly: a truthiness test would treat
    # limit=0 as "no limit" and return every entry.
    return ordered if limit is None else ordered[:limit]
|
|
319
|
+
|
|
320
|
+
def diff_snapshots(self, key_a: str, key_b: str) -> dict[str, Any]:
    """Compare two snapshots side by side (all deltas are ``b - a``).

    :param key_a: First snapshot key (tree hash).
    :param key_b: Second snapshot key (tree hash).
    :return: Dict with ``a`` / ``b`` summaries (key, metrics, issues), the
        overall ``delta`` from :meth:`_compute_delta`, and per-kind
        ``node_counts_delta`` / ``edge_counts_delta``.  If either snapshot
        cannot be loaded, ``{"error": ...}`` is returned instead.
    """
    snap_a = self.load_snapshot(key_a)
    snap_b = self.load_snapshot(key_b)
    if not (snap_a and snap_b):
        return {"error": "One or both snapshots not found"}

    def _per_name_delta(field: str) -> dict[str, Any]:
        # Difference over the union of names; a name absent on one side counts as 0.
        before = snap_a.metrics.get(field, {})
        after = snap_b.metrics.get(field, {})
        names = set(before) | set(after)
        return {name: after.get(name, 0) - before.get(name, 0) for name in names}

    node_counts_delta = _per_name_delta("node_counts")
    edge_counts_delta = _per_name_delta("edge_counts")

    def _summary(snap: Any) -> dict[str, Any]:
        return {"key": snap.key, "metrics": snap.metrics, "issues": snap.issues}

    return {
        "a": _summary(snap_a),
        "b": _summary(snap_b),
        "delta": self._compute_delta(snap_b, snap_a),
        "node_counts_delta": node_counts_delta,
        "edge_counts_delta": edge_counts_delta,
    }
|
|
358
|
+
|
|
359
|
+
# ------------------------------------------------------------------
|
|
360
|
+
# Delta computation — override for domain-specific delta fields
|
|
361
|
+
# ------------------------------------------------------------------
|
|
362
|
+
|
|
363
|
+
def _metrics_changed(self, new_metrics: dict[str, Any], old_metrics: dict[str, Any]) -> bool:
    """Decide whether two metrics dicts differ in a way worth recording.

    The default treats any inequality as a change.  Subclasses may override
    this to ignore fields they consider noise.

    :param new_metrics: Metrics of the candidate snapshot.
    :param old_metrics: Metrics of the snapshot it is compared against.
    :return: ``True`` if the metrics are not equal.
    """
    differs = new_metrics != old_metrics
    return differs
|
|
369
|
+
|
|
370
|
+
def _compute_delta(self, snap_new: Snapshot, snap_old: Snapshot) -> dict[str, Any]:
    """Compute the metrics delta between two snapshots (new - old).

    Each snapshot's ``metrics`` is normalised to a dict first: dicts are used
    as-is, dataclass instances are converted with :func:`dataclasses.asdict`,
    and anything else is treated as empty.  The actual arithmetic is
    delegated to :meth:`_compute_delta_from_metrics`.

    Override in subclasses to add domain-specific delta fields.

    :param snap_new: The newer snapshot.
    :param snap_old: The snapshot to compare against.
    :return: Delta dict as produced by :meth:`_compute_delta_from_metrics`.
    """

    def _as_mapping(metrics: Any) -> dict[str, Any]:
        if isinstance(metrics, dict):
            return metrics
        # is_dataclass() is also true for dataclass *types*; only instances
        # can be converted with asdict().
        is_instance = dataclasses.is_dataclass(metrics) and not isinstance(metrics, type)
        return dataclasses.asdict(metrics) if is_instance else {}

    new_m, old_m = (_as_mapping(snap.metrics) for snap in (snap_new, snap_old))
    return self._compute_delta_from_metrics(new_m, old_m)
|
|
386
|
+
|
|
387
|
+
def _compute_delta_from_metrics(
    self, new_m: dict[str, Any], old_m: dict[str, Any]
) -> dict[str, Any]:
    """Compute the delta (new - old) from two raw metrics dicts.

    Override in subclasses to add domain-specific delta fields.

    :param new_m: Metrics of the newer snapshot.
    :param old_m: Metrics of the older snapshot.
    :return: ``{"nodes": ..., "edges": ...}`` — the differences of
        ``total_nodes`` and ``total_edges``; a missing metric counts as 0.
    """
    # (output field, source metric) pairs
    tracked = (("nodes", "total_nodes"), ("edges", "total_edges"))
    return {label: new_m.get(source, 0) - old_m.get(source, 0) for label, source in tracked}
|
|
398
|
+
|
|
399
|
+
# ------------------------------------------------------------------
|
|
400
|
+
# Prune
|
|
401
|
+
# ------------------------------------------------------------------
|
|
402
|
+
|
|
403
|
+
def prune_snapshots(self, *, dry_run: bool = False) -> PruneResult:
    """Remove snapshots that carry no new metric information.

    Three things are detected:

    * **broken entries** — manifest entries whose JSON file is missing;
    * **duplicates** — entries strictly between the oldest and newest valid
      entry whose metrics are unchanged (per :meth:`_metrics_changed`)
      relative to the last entry that was kept;
    * **orphaned files** — ``*.json`` files in the snapshot directory (other
      than ``manifest.json``) that no manifest entry references.

    Unless *dry_run* is set, duplicate and orphaned files are deleted, and
    duplicate and broken entries are dropped from the manifest, which is then
    saved.

    :param dry_run: If ``True``, only report what would be removed.
    :return: :class:`PruneResult` summarising the cleanup.
    """
    manifest = self.load_manifest()
    chronological = sorted(manifest.snapshots, key=lambda e: e.get("timestamp", ""))

    # Pass 1: split entries by whether their snapshot file is on disk.
    present: list[dict[str, Any]] = []
    missing_keys: list[str] = []
    for entry in chronological:
        entry_key = entry.get("key", "")
        file_name = entry.get("file", f"{entry_key}.json")
        if (self.snapshots_dir / file_name).exists():
            present.append(entry)
        else:
            missing_keys.append(entry_key)

    # Pass 2: flag interior entries that repeat the last kept metrics.
    # The first and last valid entries are never candidates.
    duplicate_keys: list[str] = []
    if len(present) > 2:
        reference = present[0].get("metrics", {})
        for entry in present[1:-1]:
            candidate = entry.get("metrics", {})
            if self._metrics_changed(candidate, reference):
                reference = candidate
            else:
                duplicate_keys.append(entry.get("key", ""))

    # Pass 3: JSON files on disk that the manifest does not reference.
    referenced = {e.get("file", f"{e.get('key', '')}.json") for e in manifest.snapshots}
    stray_files: list[str] = [
        path.name
        for path in self.snapshots_dir.glob("*.json")
        if path.name != "manifest.json" and path.name not in referenced
    ]

    if not dry_run:
        by_key = {e.get("key"): e for e in manifest.snapshots}

        # Duplicate snapshots' files first, then the orphans.
        doomed = [by_key.get(k, {}).get("file", f"{k}.json") for k in duplicate_keys]
        doomed.extend(stray_files)
        for file_name in doomed:
            victim = self.snapshots_dir / file_name
            if victim.exists():
                victim.unlink()

        dropped = set(duplicate_keys) | set(missing_keys)
        manifest.snapshots = [e for e in manifest.snapshots if e.get("key") not in dropped]
        manifest.last_update = datetime.now(UTC).isoformat()
        self._save_manifest(manifest)

    return PruneResult(
        removed=duplicate_keys,
        orphaned_files=stray_files,
        broken_entries=missing_keys,
        dry_run=dry_run,
    )
|
|
470
|
+
|
|
471
|
+
# ------------------------------------------------------------------
|
|
472
|
+
# Git helpers
|
|
473
|
+
# ------------------------------------------------------------------
|
|
474
|
+
|
|
475
|
+
@staticmethod
|
|
476
|
+
def _get_current_tree_hash() -> str:
|
|
477
|
+
"""Get current git tree hash (HEAD^{tree})."""
|
|
478
|
+
try:
|
|
479
|
+
return subprocess.check_output(
|
|
480
|
+
["git", "rev-parse", "HEAD^{tree}"],
|
|
481
|
+
text=True,
|
|
482
|
+
stderr=subprocess.DEVNULL,
|
|
483
|
+
).strip()
|
|
484
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
485
|
+
return ""
|
|
486
|
+
|
|
487
|
+
@staticmethod
|
|
488
|
+
def _get_current_branch() -> str:
|
|
489
|
+
"""Get current git branch name."""
|
|
490
|
+
try:
|
|
491
|
+
return subprocess.check_output(
|
|
492
|
+
["git", "rev-parse", "--abbrev-ref", "HEAD"],
|
|
493
|
+
text=True,
|
|
494
|
+
stderr=subprocess.DEVNULL,
|
|
495
|
+
).strip()
|
|
496
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
497
|
+
return "unknown"
|