yigraf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yigraf/__init__.py ADDED
@@ -0,0 +1,3 @@
1
"""yigraf — one connected graph over code, intent, plan, and memory."""

# NOTE(review): the wheel this module ships in is labelled 0.1.0 while this constant reads
# "0.0.0" — confirm whether the value is a placeholder overwritten at build time or is stale.
__version__ = "0.0.0"
yigraf/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """``python -m yigraf`` entry point — used by the git hook, which bakes in an absolute interpreter
2
+ path so it works regardless of ``PATH`` (see :mod:`yigraf.hooks`)."""
3
+ from yigraf.cli import main
4
+
5
# Invoke the CLI only when this module is executed (``python -m yigraf``), never on plain import.
if __name__ == "__main__":
    main()
yigraf/artifacts.py ADDED
@@ -0,0 +1,312 @@
1
+ """Intent & plan artifacts: the authored ``.md`` truth for the intent/plan node families (M2).
2
+
3
+ Intents and plans live as one-file-per-node markdown under ``yigraf/intents/`` and
4
+ ``yigraf/plans/`` (``docs/graph-design.md`` §4, ``docs/m2-notes.md``). Bodies are human-authored;
5
+ the plan's ``edges`` frontmatter is machine-written by ``yigraf link``. This module reads them into
6
+ dataclasses, projects those into the graph (intent/plan/task nodes + ``contains``/``tracks``/
7
+ ``requires``/``implements`` edges), and writes new artifacts for the authoring verbs.
8
+
9
+ A target id that doesn't resolve to a node is **not** added as a phantom edge — it's stashed on the
10
+ task node (``dangling_implements`` / ``dangling_tracks``) for M3 to surface as hard drift.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import networkx as nx
20
+ import yaml
21
+
22
+ from yigraf.astnorm import ANCHOR_ALGO
23
+
24
# ``family`` attribute values stamped on projected graph nodes (and written into frontmatter).
INTENT_FAMILY = "intent"
PLAN_FAMILY = "plan"
CONF = "EXTRACTED"  # authored artifacts are asserted truth, not inferred

# Vocabulary for an intent's ``type`` / ``status`` frontmatter fields.
# NOTE(review): nothing in this module validates against these tuples — presumably the CLI does.
INTENT_TYPES = ("requirement", "goal", "capability")
INTENT_STATUSES = ("proposed", "active", "satisfied", "archived")

# A leading ``---`` fence, the YAML between the fences (group 1), then the rest of the file
# (group 2). DOTALL lets both groups span multiple lines.
_FRONTMATTER = re.compile(r"\A---\n(.*?)\n---\n?(.*)\Z", re.DOTALL)
# A task checkbox line: ``- [ ] {#N} description`` (group 1 = check mark, 2 = number, 3 = text).
_TASK_LINE = re.compile(r"^- \[([ xX])\]\s*\{#(\d+)\}\s*(.*)$")
# A level-two markdown heading (``## Title``); group 1 is the title without surrounding blanks.
_HEADING = re.compile(r"^##\s+(.*?)\s*$")
34
+
35
+
36
+ # --------------------------------------------------------------------------------------------------
37
+ # Frontmatter + section parsing
38
+ # --------------------------------------------------------------------------------------------------
39
+
40
+
41
def _split_frontmatter(text: str) -> tuple[dict, str]:
    """Split a ``---``-fenced markdown document into ``(metadata, body)``.

    A document without a leading fence is returned unchanged as the body with empty metadata.
    An empty (falsy) YAML document also yields empty metadata.

    Raises:
        ValueError: the frontmatter parses to something other than a mapping.
    """
    fenced = _FRONTMATTER.match(text)
    if fenced is None:
        return {}, text

    raw_meta, body = fenced.groups()
    parsed = yaml.safe_load(raw_meta)
    meta = parsed if parsed else {}
    if isinstance(meta, dict):
        return meta, body
    raise ValueError("artifact frontmatter must be a YAML mapping")
50
+
51
+
52
def _compose(meta: dict, body: str) -> str:
    """Serialize ``meta`` as sorted-key block-style YAML frontmatter and prepend it to ``body``.

    This is the writer counterpart of :func:`_split_frontmatter`. A non-empty body is guaranteed
    to end with a newline; an empty body stays empty.
    """
    front = yaml.safe_dump(meta, sort_keys=True, allow_unicode=True, default_flow_style=False)
    if body and not body.endswith("\n"):
        body += "\n"
    return "---\n" + front + "---\n" + body
57
+
58
+
59
def _sections(body: str) -> dict[str, str]:
    """Group a markdown body's lines under their ``## Heading`` and return ``{key: text}``.

    The key is the heading text up to the first ``(``, stripped and case-folded
    (``Design (how)`` → ``design``). Lines before the first heading are discarded; a repeated
    heading keeps appending to the same entry. Each section's text is stripped.
    """
    collected: dict[str, list[str]] = {}
    active: list[str] | None = None
    for raw in body.splitlines():
        found = _HEADING.match(raw)
        if found is None:
            if active is not None:
                active.append(raw)
            continue
        title = found.group(1).partition("(")[0].strip().casefold()
        active = collected.setdefault(title, [])
    return {title: "\n".join(chunk).strip() for title, chunk in collected.items()}
76
+
77
+
78
+ def _bullets(text: str) -> list[str]:
79
+ """The ``- `` bullet items in a section, in order (used for scenarios)."""
80
+ return [ln[2:].strip() for ln in text.splitlines() if ln.lstrip().startswith("- ")]
81
+
82
+
83
+ # --------------------------------------------------------------------------------------------------
84
+ # Intent
85
+ # --------------------------------------------------------------------------------------------------
86
+
87
+
88
@dataclass
class Intent:
    """One intent artifact as parsed from an ``intents/<slug>.md`` file."""

    id: str  # frontmatter ``id``; the reader falls back to ``int:<case-folded slug>``
    slug: str  # file stem of the artifact
    type: str  # frontmatter ``type``; the reader falls back to "requirement"
    status: str  # frontmatter ``status``; the reader falls back to "proposed"
    statement: str  # text of the ``## Requirement`` section ("" when absent)
    scenarios: list[str] = field(default_factory=list)  # bullets of the ``## Scenarios`` section
    design: str | None = None  # text of the ``## Design`` section, or None when absent/empty
97
+
98
+
99
def read_intent(path: Path) -> Intent:
    """Load an ``intents/<slug>.md`` artifact into an :class:`Intent`.

    The slug is the file stem. Missing frontmatter fields fall back to ``int:<slug>`` (case-folded),
    ``"requirement"`` and ``"proposed"``. The statement, scenarios and design come from the
    ``## Requirement``, ``## Scenarios`` and ``## Design`` sections respectively.
    """
    source = Path(path)
    meta, body = _split_frontmatter(source.read_text(encoding="utf-8"))
    stem = source.stem
    parts = _sections(body)

    fallback_id = f"int:{stem.casefold()}"
    statement = parts.get("requirement", "").strip()
    scenarios = _bullets(parts.get("scenarios", ""))
    design = parts.get("design") or None  # an empty section counts as "no design"

    return Intent(
        id=meta.get("id", fallback_id),
        slug=stem,
        type=meta.get("type", "requirement"),
        status=meta.get("status", "proposed"),
        statement=statement,
        scenarios=scenarios,
        design=design,
    )
115
+
116
+
117
def render_intent(slug: str, statement: str, scenarios: list[str], design: str | None,
                  type: str = "requirement", status: str = "proposed") -> str:
    """Build the markdown text of a brand-new intent artifact.

    The body has a ``## Requirement`` section, a ``## Scenarios`` bullet list (a single empty
    bullet placeholder when there are no scenarios) and, only when ``design`` is truthy, a
    ``## Design (how)`` section.
    """
    meta = {
        "id": f"int:{slug.casefold()}",
        "family": INTENT_FAMILY,
        "type": type,
        "status": status,
    }
    bullet_lines = [f"- {scenario}" for scenario in scenarios]
    if not bullet_lines:
        bullet_lines = ["- "]  # placeholder bullet for the author to fill in

    lines = ["## Requirement", statement, "", "## Scenarios", *bullet_lines]
    if design:
        lines.extend(["", "## Design (how)", design])
    return _compose(meta, "\n".join(lines) + "\n")
126
+
127
+
128
+ # --------------------------------------------------------------------------------------------------
129
+ # Plan + tasks
130
+ # --------------------------------------------------------------------------------------------------
131
+
132
+
133
@dataclass
class Implements:
    """One ``implements`` entry of a task: the target symbol plus its optional stamped anchor."""

    sym: str  # id of the implemented symbol node
    anchor: str | None = None  # anchor stamped when the edge was written, if any
    anchor_algo: str | None = None  # algorithm tag recorded alongside ``anchor``
138
+
139
+
140
@dataclass
class Task:
    """One checkbox line of a plan, combined with its edges from the plan frontmatter."""

    id: str  # ``task:<case-folded plan slug>/<num>``
    num: int  # the ``{#N}`` number on the checkbox line
    description: str  # text following the ``{#N}`` marker
    state: str  # todo | done
    tracks: str | None = None  # intent id this task tracks, if linked
    requires: list[str] = field(default_factory=list)  # ids this task requires
    implements: list[Implements] = field(default_factory=list)  # symbols this task implements
149
+
150
+
151
@dataclass
class Plan:
    """A plan artifact: its identity, title, and the tasks parsed from its checkbox lines."""

    id: str  # frontmatter ``id``; the reader falls back to ``plan:<case-folded slug>``
    slug: str  # file stem of the plan file
    title: str  # first ``# `` heading of the body, or the slug when there is none
    tasks: list[Task] = field(default_factory=list)  # the reader sorts these by task number
    phase: str = "active"  # active | completed (from the plans/<phase>/ subdir)
158
+
159
+
160
def read_plan(path: Path) -> Plan:
    """Load a plan file into a :class:`Plan`.

    The title is the first line starting with ``# `` (the file stem when there is none). Every
    line that, once stripped, matches the task-checkbox pattern becomes a :class:`Task`; its
    ``tracks`` / ``requires`` / ``implements`` come from the frontmatter ``edges`` mapping keyed
    by the task id. Tasks are returned sorted by number.
    """
    source = Path(path)
    meta, body = _split_frontmatter(source.read_text(encoding="utf-8"))
    slug = source.stem
    folded = slug.casefold()
    edges = meta.get("edges") or {}
    body_lines = body.splitlines()

    title = next((ln[2:].strip() for ln in body_lines if ln.startswith("# ")), slug)

    found: list[Task] = []
    for ln in body_lines:
        hit = _TASK_LINE.match(ln.strip())
        if hit is None:
            continue
        mark, number, text = hit.groups()
        num = int(number)
        task_id = f"task:{folded}/{num}"
        spec = edges.get(task_id) or {}
        found.append(
            Task(
                id=task_id,
                num=num,
                description=text.strip(),
                state="done" if mark.lower() == "x" else "todo",
                tracks=spec.get("tracks"),
                requires=list(spec.get("requires") or []),
                implements=[_read_impl(item) for item in (spec.get("implements") or [])],
            )
        )

    ordered = sorted(found, key=lambda task: task.num)
    return Plan(id=meta.get("id", f"plan:{folded}"), slug=slug, title=title, tasks=ordered)
194
+
195
+
196
def _read_impl(entry: Any) -> Implements:
    """Convert one frontmatter ``implements`` entry into an :class:`Implements`.

    A bare string is shorthand for a symbol id with no anchor; anything else is treated as a
    mapping with a required ``sym`` key and optional ``anchor`` / ``anchor_algo`` keys.
    """
    if not isinstance(entry, str):
        return Implements(
            sym=entry["sym"],
            anchor=entry.get("anchor"),
            anchor_algo=entry.get("anchor_algo"),
        )
    return Implements(sym=entry)
200
+
201
+
202
def render_plan(slug: str, title: str, tasks: list[str]) -> str:
    """Build the markdown text of a new plan: a title, then one unchecked task per description.

    Tasks are numbered from 1 with ``{#N}`` markers; the frontmatter ``edges`` mapping starts
    empty.
    """
    meta = {"id": f"plan:{slug.casefold()}", "family": PLAN_FAMILY, "edges": {}}
    checklist = "".join(
        f"- [ ] {{#{number}}} {desc}\n" for number, desc in enumerate(tasks, start=1)
    )
    return _compose(meta, f"# {title}\n\n## Tasks\n{checklist}")
208
+
209
+
210
def add_edge_to_plan(path: Path, task_id: str, relation: str, target: str,
                     anchor: str | None = None) -> None:
    """Write a ``tracks`` or ``implements`` edge for ``task_id`` into the plan's frontmatter.

    ``tracks`` is a single intent id and overwrites any previous value. ``implements`` appends a
    (deduplicated) symbol entry carrying its stamped ``anchor`` + ``anchor_algo``; re-linking the
    same symbol re-stamps its anchor in place.

    The frontmatter shapes the reader accepts are accepted here too: a null ``edges`` mapping, a
    null task spec, a null ``implements`` list, and shorthand string entries in ``implements``.
    A shorthand entry is only rewritten when it is the one being re-linked.

    Args:
        path: plan file to rewrite in place.
        task_id: key of the task inside the ``edges`` mapping.
        relation: ``"tracks"`` or ``"implements"``.
        target: intent id (``tracks``) or symbol id (``implements``).
        anchor: anchor to stamp on an ``implements`` entry; ``anchor_algo`` is recorded only
            when an anchor is given.

    Raises:
        ValueError: ``relation`` is neither ``"tracks"`` nor ``"implements"`` (nothing is written).
    """
    path = Path(path)
    meta, body = _split_frontmatter(path.read_text(encoding="utf-8"))

    # ``edges:`` or ``<task id>:`` may be present but null in hand-edited YAML; treat as empty.
    edges = meta.get("edges") or {}
    meta["edges"] = edges
    spec = edges.get(task_id) or {}
    edges[task_id] = spec

    if relation == "tracks":
        spec["tracks"] = target
    elif relation == "implements":
        impls = spec.get("implements") or []
        spec["implements"] = impls
        entry = {"sym": target, "anchor": anchor, "anchor_algo": ANCHOR_ALGO if anchor else None}
        for index, existing in enumerate(impls):
            if isinstance(existing, dict):
                if existing.get("sym") == target:
                    existing.update(entry)  # keep any extra keys the entry already carried
                    break
            elif existing == target:
                impls[index] = entry  # upgrade the shorthand string to a stamped mapping
                break
        else:
            impls.append(entry)
    else:
        raise ValueError(f"unsupported relation for a plan edge: {relation}")

    path.write_text(_compose(meta, body), encoding="utf-8")
238
+
239
+
240
+ # --------------------------------------------------------------------------------------------------
241
+ # Projection into the graph
242
+ # --------------------------------------------------------------------------------------------------
243
+
244
+
245
def iter_intents(root: Path) -> list[Intent]:
    """Read every ``<root>/yigraf/intents/*.md`` in sorted path order ([] if the dir is absent)."""
    folder = Path(root) / "yigraf" / "intents"
    if not folder.is_dir():
        return []
    return [read_intent(md) for md in sorted(folder.glob("*.md"))]
248
+
249
+
250
def iter_plans(root: Path) -> list[Plan]:
    """Read every plan under ``<root>/yigraf/plans/{active,completed}/*.md``.

    Plans from ``active`` come first, then ``completed``, each group in sorted path order. Every
    plan's ``phase`` is set to the name of the sub-directory it was found in.
    """
    base = Path(root) / "yigraf" / "plans"
    plans = []
    for phase in ("active", "completed"):
        folder = base / phase
        if not folder.is_dir():
            continue
        for md in sorted(folder.glob("*.md")):
            plan = read_plan(md)
            plan.phase = phase
            plans.append(plan)
    return plans
261
+
262
+
263
def project_into(graph: nx.DiGraph, root: Path) -> None:
    """Project the authored artifacts under ``root`` into ``graph``.

    All intents are added first, so that task ``tracks`` targets can resolve; then each plan, its
    tasks, a ``contains`` edge from plan to task, and the task's own cross-family edges.
    """
    for intent in iter_intents(root):
        intent_attrs = {
            "family": INTENT_FAMILY,
            "kind": intent.type,
            "label": intent.statement or intent.slug,
            "confidence": CONF,
            "status": intent.status,
            "statement": intent.statement,
            "scenarios": intent.scenarios,
            "design": intent.design,
            "source_file": f"intents/{intent.slug}.md",
        }
        graph.add_node(intent.id, **intent_attrs)

    for plan in iter_plans(root):
        graph.add_node(
            plan.id,
            family=PLAN_FAMILY,
            kind="plan",
            label=plan.title,
            confidence=CONF,
            phase=plan.phase,
        )
        for task in plan.tasks:
            task_attrs = {
                "family": PLAN_FAMILY,
                "kind": "task",
                "label": task.description,
                "confidence": CONF,
                "state": task.state,
                "order": task.num,
            }
            graph.add_node(task.id, **task_attrs)
            graph.add_edge(plan.id, task.id, relation="contains", confidence=CONF)
            _project_task_edges(graph, task)
282
+
283
+
284
def _project_task_edges(graph: nx.DiGraph, task: Task) -> None:
    """Add ``task``'s ``tracks`` / ``requires`` / ``implements`` edges to ``graph``.

    An edge is only added when its target is already a node. An unresolved target is recorded on
    the task node instead, under ``dangling_tracks`` / ``dangling_requires`` /
    ``dangling_implements``, so no phantom node is ever created.
    """
    source_id = task.id

    intent_id = task.tracks
    if intent_id is not None:
        if intent_id not in graph:
            _stash(graph, source_id, "dangling_tracks", intent_id)
        else:
            graph.add_edge(source_id, intent_id, relation="tracks", confidence=CONF)

    for dependency in task.requires:
        if dependency not in graph:
            _stash(graph, source_id, "dangling_requires", dependency)
        else:
            graph.add_edge(source_id, dependency, relation="requires", confidence=CONF)

    for impl in task.implements:
        if impl.sym not in graph:
            # Keep the anchor so M3 can re-anchor a rename by content match (docs/m3-notes.md §3).
            unresolved = {"sym": impl.sym, "anchor": impl.anchor, "anchor_algo": impl.anchor_algo}
            _stash(graph, source_id, "dangling_implements", unresolved)
            continue
        edge_attrs = {"relation": "implements", "confidence": CONF}
        if impl.anchor is not None:
            edge_attrs["anchor"] = impl.anchor
            edge_attrs["anchor_algo"] = impl.anchor_algo or ANCHOR_ALGO
        graph.add_edge(source_id, impl.sym, **edge_attrs)
309
+
310
+
311
+ def _stash(graph: nx.DiGraph, node_id: str, attr: str, value: str) -> None:
312
+ graph.nodes[node_id].setdefault(attr, []).append(value)
yigraf/astnorm.py ADDED
@@ -0,0 +1,151 @@
1
+ """AST-normalized ``content_hash`` — the drift anchor (``astnorm-v1``).
2
+
3
+ A symbol's ``content_hash`` is a SHA-256 over its *significant token stream*: the tokens that remain
4
+ after dropping comments and docstrings, normalizing string quote style, and replacing each nested
5
+ *extracted* symbol with a stable ``<def:NAME>`` marker. The rule is pinned in ``docs/m1-notes.md`` §4
6
+ and is **load-bearing**: once anchors are stamped (M2), changing the rule silently mismatches every
7
+ stored anchor — so the algorithm carries a version tag (:data:`ANCHOR_ALGO`). A future rule change
8
+ bumps the tag and re-anchors on next commit instead of false-drifting.
9
+
10
+ What is deliberately ignored (no drift): comments, docstrings, string quote style (``'x'`` ≡ ``"x"``),
11
+ and all whitespace/reformatting that doesn't change the parsed token stream — so a ``black``/``isort``
12
+ reflow is safe. What trips drift (intended): any change to identifiers, operators, literal *values*,
13
+ keywords, control flow, signatures, or decorators within a symbol's own body.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import hashlib
18
+ from typing import TYPE_CHECKING, Mapping
19
+
20
+ if TYPE_CHECKING:
21
+ from tree_sitter import Node
22
+
23
#: Version tag stored alongside every anchor. Bumping it (``astnorm-v2``) re-anchors instead of
#: silently false-drifting against hashes produced by an older rule (docs/m1-notes.md §4).
ANCHOR_ALGO = "astnorm-v1"

#: Field separators for the token stream. ``\x1f`` (unit) joins a token's type to its text; ``\x1e``
#: (record) joins tokens. Both are control characters that ordinary source text does not contain,
#: which keeps one token's contents from being read as a separator.
#: NOTE(review): a raw U+001E/U+001F inside a string literal is probably still accepted by the
#: tokenizer, so "cannot be forged" is likely a practical rather than absolute guarantee — confirm.
_FIELD = "\x1f"
_TOKEN = "\x1e"

#: Python defaults for the per-language astnorm knobs. They are the function defaults below, so a
#: caller that passes nothing reproduces the original Python rule byte-for-byte (no ``astnorm-v2``
#: bump, existing anchors stay valid). Other languages pass their own sets (e.g. Go: all empty — no
#: docstrings, no quote-style ambiguity), keeping the *algorithm* astnorm-v1 while varying the
#: language-specific token vocabulary it normalizes.

#: Token types whose text is a quote delimiter (prefix + quotes) we canonicalize.
_PY_QUOTE_TOKENS = frozenset({"string_start", "string_end"})

#: Block-like containers whose *leading* string statement is a docstring to drop.
_PY_BODY_CONTAINERS = frozenset({"block", "module"})

#: Node types that count as a lone string statement (a docstring) inside a body container.
_PY_DOCSTRING_TYPES = frozenset({"string", "concatenated_string"})

#: Node types dropped as comments. Default fits Python/Go/JS/C; languages whose grammar names them
#: differently (Rust/Java: ``line_comment``/``block_comment``) pass their own set.
_PY_COMMENT_TYPES = frozenset({"comment"})
51
+
52
+
53
def content_hash(node: Node, source: bytes, boundaries: Mapping[int, str],
                 exclude: frozenset[int] = frozenset(), *,
                 quote_tokens: frozenset[str] = _PY_QUOTE_TOKENS,
                 body_containers: frozenset[str] = _PY_BODY_CONTAINERS,
                 docstring_types: frozenset[str] = _PY_DOCSTRING_TYPES,
                 comment_types: frozenset[str] = _PY_COMMENT_TYPES) -> str:
    """Return the SHA-256 hex digest of ``node``'s significant token stream (``astnorm-v1``).

    ``boundaries`` maps the tree-sitter node id of each *directly nested extracted symbol* (a
    top-level def for a module; a method for a class) to its local name. Such a subtree is emitted
    as a single ``<def:NAME>`` marker and never descended into, so a container's hash reflects its
    member *names* but not their bodies.

    ``exclude`` holds node ids that are dropped outright — used for the symbol's **own declared
    name**, so that a pure rename leaves the hash unchanged (docs/m3-notes.md §2).

    ``quote_tokens`` / ``body_containers`` / ``docstring_types`` / ``comment_types`` are the
    per-language knobs (defaults: Python). Passing empty sets for the first three switches off
    quote canonicalization and docstring dropping; ``comment_types`` names the node types that
    are skipped as comments. ``ANCHOR_ALGO`` is the same regardless of the knobs.
    """
    stream: list[str] = []
    _emit(node, source, boundaries, exclude, stream, quote_tokens, body_containers,
          docstring_types, comment_types)
    digest = hashlib.sha256()
    digest.update(_TOKEN.join(stream).encode("utf-8", "surrogatepass"))
    return digest.hexdigest()
79
+
80
+
81
def _emit(node: Node, source: bytes, boundaries: Mapping[int, str], exclude: frozenset[int],
          out: list[str], quote_tokens: frozenset[str], body_containers: frozenset[str],
          docstring_types: frozenset[str], comment_types: frozenset[str]) -> None:
    """Append ``node``'s significant tokens to ``out`` in pre-order.

    The checks run in a fixed order: excluded ids are dropped, boundary ids become a
    ``<def:NAME>`` marker, comment-typed nodes are skipped, leaves are emitted as
    ``type<unit-sep>text``, and any other node recurses into its children (minus a leading
    docstring when the node is a body container). ``out`` is mutated in place.
    """
    if node.id in exclude:
        return  # the symbol's own name — dropped so a rename doesn't change the body-hash

    name = boundaries.get(node.id)
    if name is not None:
        out.append(f"<def:{name}>")  # nested extracted symbol — its body is its own concern
        return

    kind = node.type
    if kind in comment_types:
        return

    if node.child_count == 0:
        # A leaf: its text is the exact source byte slice it spans.
        text = source[node.start_byte : node.end_byte].decode("utf-8", "surrogatepass")
        if kind in quote_tokens:
            # Canonicalize the *kind* too: in some grammars (JS/TS) the delimiter's node type IS the
            # quote char (``"`` vs ``'``), so normalizing only the text would still differ by type.
            # On Python's ``string_start``/``string_end`` this is a no-op (no quote char in the name),
            # so existing anchors stay byte-identical.
            kind = _canon_quote(kind)
            text = _canon_quote(text)
        out.append(f"{kind}{_FIELD}{text}")
        return

    children = node.children
    if kind in body_containers:
        children = _without_leading_docstring(children, docstring_types)
    for child in children:
        _emit(child, source, boundaries, exclude, out, quote_tokens, body_containers, docstring_types,
              comment_types)
115
+
116
+
117
def _without_leading_docstring(children: list[Node], docstring_types: frozenset[str]) -> list[Node]:
    """Return ``children`` minus the body's docstring statement, when it has one.

    The docstring is the first child that is not a ``comment`` node, provided that child is an
    ``expression_statement`` holding nothing but a string literal. If the first non-comment child
    is anything else — or there is none — the list is returned untouched. Doc-only edits are
    maintenance; a real contract change also edits code, which trips drift anyway
    (docs/m1-notes.md §4).
    """
    first_statement = next(
        ((index, child) for index, child in enumerate(children) if child.type != "comment"),
        None,
    )
    if first_statement is None:
        return children  # nothing but comments (or empty)

    index, statement = first_statement
    if statement.type == "expression_statement" and _is_string_only(statement, docstring_types):
        return children[:index] + children[index + 1 :]
    return children  # first real statement isn't a docstring
131
+
132
+
133
+ def _is_string_only(stmt: Node, docstring_types: frozenset[str]) -> bool:
134
+ """True when an ``expression_statement`` is a lone string literal (a docstring)."""
135
+ kids = stmt.children
136
+ return len(kids) == 1 and kids[0].type in docstring_types
137
+
138
+
139
+ def _canon_quote(token: str) -> str:
140
+ """Canonicalize a string delimiter: lowercase the prefix, force double quotes, keep quote count.
141
+
142
+ ``r'''`` → ``r\"\"\"``, ``F"`` → ``f"``. Preserves the prefix letters (``r``/``b``/``f``) and the
143
+ quote *count* (never collapses ``'''`` ↔ ``'``, which would change semantics). Kills the dominant
144
+ ``black`` quote-flip false-drift source; ``string_content`` is emitted verbatim, so escape-level
145
+ rewrites (``'it\\'s'`` → ``"it's"``) still trip drift and are deferred to a possible v2.
146
+ """
147
+ i = 0
148
+ while i < len(token) and token[i] not in ("'", '"'):
149
+ i += 1
150
+ prefix, quotes = token[:i], token[i:]
151
+ return prefix.lower() + quotes.replace("'", '"')
yigraf/cache.py ADDED
@@ -0,0 +1,102 @@
1
+ """Per-file SHA content cache for structure extraction (``yigraf/cache/structure.json``).
2
+
3
+ Keyed by the raw file bytes' SHA-256: a hit means the file is byte-for-byte unchanged since it was
4
+ last extracted, so its cached node/edge projection is reused verbatim and tree-sitter is skipped.
5
+ This is the *file cache SHA* of ``docs/m1-notes.md`` §3 — distinct from a symbol's astnorm
6
+ ``content_hash``. The cache is gitignored and rebuildable; it never affects the output graph (which
7
+ is deterministic), only whether a file is re-parsed. It is invalidated wholesale when the astnorm
8
+ algorithm version changes, so a stale anchor can never survive a rule change.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import json
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+ from typing import TYPE_CHECKING
17
+
18
+ from yigraf.astnorm import ANCHOR_ALGO
19
+
20
+ if TYPE_CHECKING:
21
+ from yigraf.extract import FileProjection
22
+
23
#: Bumped when the on-disk cache layout changes incompatibly (separate from the astnorm algo).
#: A cache file whose ``format`` value differs from this constant is ignored on load.
#: History:
#: 2: structure nodes gained a ``signature`` field (M4).
#: 3: file nodes gained an ``inherits`` field (import-aware inheritance edges) — a stale cache would
#:    otherwise serve pre-inheritance projections for files that haven't changed since the upgrade.
#: 4: the tags-tier extractors began populating ``inherits`` too (inheritance across the breadth
#:    languages), so a format-3 cache of e.g. a Java file lacks its inheritance — invalidate it.
#: 5: Kotlin/Scala began recording ``imports`` on the file node (import edges) — a format-4 cache of
#:    those files has an empty imports list.
CACHE_FORMAT = 5
32
+
33
+
34
def file_sha(data: bytes) -> str:
    """Return the SHA-256 hex digest of raw file bytes; used as the per-file cache key."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
37
+
38
+
39
@dataclass
class StructureCache:
    """Reusable per-file extraction projections, keyed by relative path then content SHA.

    Also carries a small HEAD-keyed ``maturity`` slot (R2 survival counts): recomputing maturity
    walks git history, but an edit never moves ``HEAD``, so this lets the hot ``PostToolUse`` rebuild
    skip the walk until a commit actually lands. Like the rest of the cache it's gitignored,
    rebuildable, and never alters the (deterministic) output graph — only how survival is obtained.
    """

    algo: str  # astnorm algorithm tag the entries were produced with
    entries: dict[str, dict]  # relpath -> {"sha": <file sha>, ...cached projection fields}
    maturity: dict = field(default_factory=dict)  # {} or {"head": <HEAD>, "survival": {...}}

    @classmethod
    def load(cls, path: Path) -> "StructureCache":
        """Load the cache, or start empty if absent, unreadable, or built by a different algo.

        "Unreadable" covers I/O errors, bytes that are not UTF-8, invalid JSON, and JSON whose
        top level is not an object. A cache of the right format/algo whose ``files`` or
        ``maturity`` value is not an object has that part treated as empty. Never raises for a
        damaged cache file: the cache is rebuildable, so the safe response is to start over.
        """
        p = Path(path)
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: JSONDecodeError and UnicodeDecodeError.
            data = None
        if (
            isinstance(data, dict)
            and data.get("format") == CACHE_FORMAT
            and data.get("algo") == ANCHOR_ALGO
        ):
            files = data.get("files")
            maturity = data.get("maturity")
            return cls(
                algo=ANCHOR_ALGO,
                entries=dict(files) if isinstance(files, dict) else {},
                maturity=dict(maturity) if isinstance(maturity, dict) else {},
            )
        return cls(algo=ANCHOR_ALGO, entries={})

    def maturity_survival(self, head: str) -> dict | None:
        """Cached ``{path: survival}`` if it was computed at this ``HEAD``, else ``None`` (a miss).

        The returned mapping is a copy; mutating it does not touch the cache.
        """
        if self.maturity.get("head") == head:
            return dict(self.maturity.get("survival", {}))
        return None

    def set_maturity_survival(self, head: str, survival: dict) -> None:
        """Record the survival map computed at ``head`` (replaces any map from an earlier HEAD)."""
        self.maturity = {"head": head, "survival": dict(survival)}

    def get(self, relpath: str, sha: str) -> "FileProjection | None":
        """Return the cached projection for ``relpath`` iff its content SHA still matches."""
        # Imported lazily: at module level the name is only imported under TYPE_CHECKING.
        from yigraf.extract import FileProjection

        entry = self.entries.get(relpath)
        if entry is not None and entry.get("sha") == sha:
            return FileProjection.from_cache(entry)
        return None

    def put(self, relpath: str, sha: str, projection: "FileProjection") -> None:
        """Record ``projection`` for ``relpath`` under its content SHA."""
        self.entries[relpath] = {"sha": sha, **projection.to_cache()}

    def prune(self, keep: set[str]) -> None:
        """Drop cached entries for files no longer present in the repo."""
        # Iterate over a snapshot of the keys: the dict is mutated inside the loop.
        for relpath in list(self.entries):
            if relpath not in keep:
                del self.entries[relpath]

    def save(self, path: Path) -> None:
        """Write the cache as deterministic JSON (sorted keys), creating parent directories."""
        p = Path(path)
        p.parent.mkdir(parents=True, exist_ok=True)
        out = {"format": CACHE_FORMAT, "algo": self.algo, "files": self.entries,
               "maturity": self.maturity}
        p.write_text(json.dumps(out, indent=2, sort_keys=True) + "\n", encoding="utf-8")