yigraf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yigraf/__init__.py +3 -0
- yigraf/__main__.py +6 -0
- yigraf/artifacts.py +312 -0
- yigraf/astnorm.py +151 -0
- yigraf/cache.py +102 -0
- yigraf/cli.py +626 -0
- yigraf/config.py +143 -0
- yigraf/counters.py +288 -0
- yigraf/drift.py +113 -0
- yigraf/embeddings.py +297 -0
- yigraf/extract.py +166 -0
- yigraf/graph.py +57 -0
- yigraf/hooks.py +268 -0
- yigraf/languages/__init__.py +73 -0
- yigraf/languages/base.py +252 -0
- yigraf/languages/go.py +277 -0
- yigraf/languages/jsts.py +324 -0
- yigraf/languages/python.py +379 -0
- yigraf/languages/resolve.py +42 -0
- yigraf/languages/tags.py +813 -0
- yigraf/memory.py +325 -0
- yigraf/retrieval.py +604 -0
- yigraf/scaffold.py +102 -0
- yigraf-0.1.0.dist-info/METADATA +222 -0
- yigraf-0.1.0.dist-info/RECORD +28 -0
- yigraf-0.1.0.dist-info/WHEEL +4 -0
- yigraf-0.1.0.dist-info/entry_points.txt +2 -0
- yigraf-0.1.0.dist-info/licenses/LICENSE +21 -0
yigraf/__init__.py
ADDED
yigraf/__main__.py
ADDED
yigraf/artifacts.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""Intent & plan artifacts: the authored ``.md`` truth for the intent/plan node families (M2).
|
|
2
|
+
|
|
3
|
+
Intents and plans live as one-file-per-node markdown under ``yigraf/intents/`` and
|
|
4
|
+
``yigraf/plans/`` (``docs/graph-design.md`` §4, ``docs/m2-notes.md``). Bodies are human-authored;
|
|
5
|
+
the plan's ``edges`` frontmatter is machine-written by ``yigraf link``. This module reads them into
|
|
6
|
+
dataclasses, projects those into the graph (intent/plan/task nodes + ``contains``/``tracks``/
|
|
7
|
+
``requires``/``implements`` edges), and writes new artifacts for the authoring verbs.
|
|
8
|
+
|
|
9
|
+
A target id that doesn't resolve to a node is **not** added as a phantom edge — it's stashed on the
|
|
10
|
+
task node (``dangling_implements`` / ``dangling_tracks``) for M3 to surface as hard drift.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import networkx as nx
|
|
20
|
+
import yaml
|
|
21
|
+
|
|
22
|
+
from yigraf.astnorm import ANCHOR_ALGO
|
|
23
|
+
|
|
24
|
+
INTENT_FAMILY = "intent"
|
|
25
|
+
PLAN_FAMILY = "plan"
|
|
26
|
+
CONF = "EXTRACTED" # authored artifacts are asserted truth, not inferred
|
|
27
|
+
|
|
28
|
+
INTENT_TYPES = ("requirement", "goal", "capability")
|
|
29
|
+
INTENT_STATUSES = ("proposed", "active", "satisfied", "archived")
|
|
30
|
+
|
|
31
|
+
_FRONTMATTER = re.compile(r"\A---\n(.*?)\n---\n?(.*)\Z", re.DOTALL)
|
|
32
|
+
_TASK_LINE = re.compile(r"^- \[([ xX])\]\s*\{#(\d+)\}\s*(.*)$")
|
|
33
|
+
_HEADING = re.compile(r"^##\s+(.*?)\s*$")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# --------------------------------------------------------------------------------------------------
|
|
37
|
+
# Frontmatter + section parsing
|
|
38
|
+
# --------------------------------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _split_frontmatter(text: str) -> tuple[dict, str]:
    """Separate a markdown file's ``---``-fenced YAML frontmatter from its body.

    Returns ``(metadata, body)``. Text that does not match the frontmatter fence yields an empty
    mapping and the text unchanged; frontmatter that loads to an empty/falsy value also yields an
    empty mapping.

    Raises:
        ValueError: if the frontmatter loads to something other than a YAML mapping.
    """
    fenced = _FRONTMATTER.match(text)
    if fenced is None:
        return {}, text

    raw_meta, body = fenced.group(1), fenced.group(2)
    loaded = yaml.safe_load(raw_meta)
    meta = loaded if loaded else {}
    if isinstance(meta, dict):
        return meta, body
    raise ValueError("artifact frontmatter must be a YAML mapping")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _compose(meta: dict, body: str) -> str:
    """Serialize ``meta`` and ``body`` back into a ``---``-fenced markdown document.

    The frontmatter is dumped in block style with sorted keys, so the same input always produces
    the same text. A non-empty body that lacks a trailing newline gets one; an empty body is left
    empty.
    """
    if body and not body.endswith("\n"):
        body += "\n"
    frontmatter = yaml.safe_dump(
        meta, sort_keys=True, allow_unicode=True, default_flow_style=False
    )
    return "---\n" + frontmatter + "---\n" + body
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _sections(body: str) -> dict[str, str]:
    """Group a markdown body's lines under their ``## Heading``, as ``{key: text}``.

    The key is the heading text up to the first ``(``, stripped and case-folded
    (``Design (how)`` → ``design``), so authored variants map to a stable field. Lines that appear
    before the first ``##`` heading are discarded, and a heading that repeats keeps accumulating
    into the same key. Each section's text is stripped of surrounding whitespace.
    """
    collected: dict[str, list[str]] = {}
    active: str | None = None
    for raw in body.splitlines():
        found = _HEADING.match(raw)
        if found is None:
            # Ordinary line: belongs to the section currently open, if any.
            if active is not None:
                collected[active].append(raw)
            continue
        title, _, _ = found.group(1).partition("(")
        active = title.strip().casefold()
        collected.setdefault(active, [])
    return {name: "\n".join(chunk).strip() for name, chunk in collected.items()}
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _bullets(text: str) -> list[str]:
|
|
79
|
+
"""The ``- `` bullet items in a section, in order (used for scenarios)."""
|
|
80
|
+
return [ln[2:].strip() for ln in text.splitlines() if ln.lstrip().startswith("- ")]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# --------------------------------------------------------------------------------------------------
|
|
84
|
+
# Intent
|
|
85
|
+
# --------------------------------------------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class Intent:
    """Parsed form of one authored intent artifact (an ``intents/<slug>.md`` file)."""

    # Graph node id for the intent.
    id: str
    # File stem of the artifact.
    slug: str
    # Intent kind (projected as the node's ``kind``).
    type: str
    # Lifecycle status string.
    status: str
    # The requirement statement text.
    statement: str
    # Scenario bullet items, in authored order.
    scenarios: list[str] = field(default_factory=list)
    # Optional design ("how") text; ``None`` when absent.
    design: str | None = None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def read_intent(path: Path) -> Intent:
    """Load an ``intents/<slug>.md`` file and build its :class:`Intent`.

    The slug is the file stem. ``id``, ``type`` and ``status`` come from the frontmatter, falling
    back to ``int:<casefolded slug>``, ``"requirement"`` and ``"proposed"``. The statement,
    scenarios and design are read from the ``## Requirement``, ``## Scenarios`` and ``## Design``
    sections; an empty or missing design becomes ``None``.
    """
    source = Path(path)
    slug = source.stem
    meta, body = _split_frontmatter(source.read_text(encoding="utf-8"))
    by_heading = _sections(body)
    return Intent(
        id=meta.get("id", f"int:{slug.casefold()}"),
        slug=slug,
        type=meta.get("type", "requirement"),
        status=meta.get("status", "proposed"),
        statement=by_heading.get("requirement", "").strip(),
        scenarios=_bullets(by_heading.get("scenarios", "")),
        design=by_heading.get("design") or None,
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def render_intent(slug: str, statement: str, scenarios: list[str], design: str | None,
                  type: str = "requirement", status: str = "proposed") -> str:
    """Build the markdown text of a new intent artifact.

    The frontmatter carries the derived id (``int:<casefolded slug>``), the intent family, and the
    given ``type`` / ``status``. The body has a ``## Requirement`` section, a ``## Scenarios``
    bullet list (a single empty bullet when there are no scenarios), and a ``## Design (how)``
    section only when ``design`` is non-empty.
    """
    meta = {
        "id": f"int:{slug.casefold()}",
        "family": INTENT_FAMILY,
        "type": type,
        "status": status,
    }
    bullet_lines = [f"- {scenario}" for scenario in scenarios]
    if not bullet_lines:
        bullet_lines = ["- "]  # placeholder bullet for the author to fill in
    parts = ["## Requirement", statement, "", "## Scenarios", *bullet_lines]
    if design:
        parts.extend(["", "## Design (how)", design])
    return _compose(meta, "\n".join(parts) + "\n")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# --------------------------------------------------------------------------------------------------
|
|
129
|
+
# Plan + tasks
|
|
130
|
+
# --------------------------------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass
class Implements:
    """One ``implements`` link from a task to a symbol, with its optional stamped anchor."""

    # Id of the symbol the task implements.
    sym: str
    # Anchor value stamped when the link was made; ``None`` if never stamped.
    anchor: str | None = None
    # Version tag of the algorithm that produced ``anchor``; ``None`` if not recorded.
    anchor_algo: str | None = None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
@dataclass
class Task:
    """One checkbox task of a plan, plus the edges recorded for it in the plan's frontmatter."""

    # Graph node id of the task.
    id: str
    # The task's number from its ``{#N}`` marker.
    num: int
    # Text following the marker on the checkbox line.
    description: str
    # Either "todo" or "done".
    state: str
    # Id of the intent this task tracks, if any.
    tracks: str | None = None
    # Ids this task requires.
    requires: list[str] = field(default_factory=list)
    # Symbols this task implements.
    implements: list[Implements] = field(default_factory=list)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@dataclass
class Plan:
    """Parsed form of one plan artifact: its identity, title, and tasks."""

    # Graph node id of the plan.
    id: str
    # File stem of the artifact.
    slug: str
    # Human-readable plan title.
    title: str
    # The plan's tasks.
    tasks: list[Task] = field(default_factory=list)
    # active | completed (from the plans/<phase>/ subdir)
    phase: str = "active"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def read_plan(path: Path) -> Plan:
    """Load a plan file and build its :class:`Plan`.

    The title is the first ``# `` heading in the body (the slug when there is none). Every
    checkbox line carrying a ``{#N}`` marker becomes a :class:`Task` with id
    ``task:<casefolded slug>/<N>``; its ``tracks`` / ``requires`` / ``implements`` are looked up
    under that id in the frontmatter ``edges`` mapping. Tasks are returned ordered by number. The
    plan's ``phase`` is left at its default here.
    """
    source = Path(path)
    slug = source.stem
    meta, body = _split_frontmatter(source.read_text(encoding="utf-8"))
    edges = meta.get("edges") or {}
    body_lines = body.splitlines()

    title = next((ln[2:].strip() for ln in body_lines if ln.startswith("# ")), slug)

    id_prefix = f"task:{slug.casefold()}/"
    collected: list[Task] = []
    for ln in body_lines:
        hit = _TASK_LINE.match(ln.strip())
        if hit is None:
            continue
        box, number, text = hit.groups()
        num = int(number)
        task_id = f"{id_prefix}{num}"
        spec = edges.get(task_id) or {}  # a task with no (or a null) edges entry has no links
        collected.append(
            Task(
                id=task_id,
                num=num,
                description=text.strip(),
                state="done" if box in ("x", "X") else "todo",
                tracks=spec.get("tracks"),
                requires=list(spec.get("requires") or []),
                implements=[_read_impl(item) for item in (spec.get("implements") or [])],
            )
        )

    ordered = sorted(collected, key=lambda task: task.num)
    plan_id = meta.get("id", f"plan:{slug.casefold()}")
    return Plan(id=plan_id, slug=slug, title=title, tasks=ordered)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _read_impl(entry: Any) -> Implements:
    """Turn one frontmatter ``implements`` entry into an :class:`Implements`.

    A bare string is taken as the symbol id with no anchor. Anything else is treated as a mapping
    with a required ``sym`` key and optional ``anchor`` / ``anchor_algo`` keys.
    """
    if not isinstance(entry, str):
        return Implements(
            sym=entry["sym"],
            anchor=entry.get("anchor"),
            anchor_algo=entry.get("anchor_algo"),
        )
    return Implements(sym=entry)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def render_plan(slug: str, title: str, tasks: list[str]) -> str:
    """Build the markdown text of a new plan whose tasks are all unchecked.

    Tasks are numbered from 1 in the given order and written as ``- [ ] {#N} description`` under
    a ``## Tasks`` heading. The frontmatter starts with an empty ``edges`` mapping.
    """
    meta = {"id": f"plan:{slug.casefold()}", "family": PLAN_FAMILY, "edges": {}}
    checklist = "".join(
        f"- [ ] {{#{number}}} {text}\n" for number, text in enumerate(tasks, start=1)
    )
    return _compose(meta, f"# {title}\n\n## Tasks\n{checklist}")
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def add_edge_to_plan(path: Path, task_id: str, relation: str, target: str,
                     anchor: str | None = None) -> None:
    """Write a ``tracks`` or ``implements`` edge for ``task_id`` into the plan's frontmatter.

    ``tracks`` is a single intent id and overwrites any previous value. ``implements`` appends a
    (deduplicated) symbol entry carrying its stamped ``anchor`` + ``anchor_algo``; re-linking the
    same symbol re-stamps its anchor in place.

    Frontmatter shapes that :func:`read_plan` accepts are accepted here too: a null ``edges``,
    a null per-task entry, a null ``implements`` list, and bare-string ``implements`` entries
    (a bare string that matches ``target`` is upgraded to the full mapping form).

    Args:
        path: the plan file to rewrite in place.
        task_id: id of the task the edge starts from.
        relation: ``"tracks"`` or ``"implements"``.
        target: the intent id (``tracks``) or symbol id (``implements``).
        anchor: anchor to stamp on an ``implements`` entry; ``anchor_algo`` is recorded only
            when an anchor is given.

    Raises:
        ValueError: if ``relation`` is neither ``"tracks"`` nor ``"implements"`` (nothing is
            written in that case).
    """
    path = Path(path)
    meta, body = _split_frontmatter(path.read_text(encoding="utf-8"))

    # Treat a present-but-null value like a missing one, then re-attach the (possibly new)
    # containers so the edits below land in ``meta``.
    edges = meta.get("edges") or {}
    meta["edges"] = edges
    spec = edges.get(task_id) or {}
    edges[task_id] = spec

    if relation == "tracks":
        spec["tracks"] = target
    elif relation == "implements":
        impls = spec.get("implements") or []
        spec["implements"] = impls
        entry = {"sym": target, "anchor": anchor, "anchor_algo": ANCHOR_ALGO if anchor else None}
        for index, existing in enumerate(impls):
            if isinstance(existing, str):
                # Bare-string form: the string itself is the symbol id.
                if existing == target:
                    impls[index] = entry
                    break
            elif existing.get("sym") == target:
                existing.update(entry)
                break
        else:
            impls.append(entry)
    else:
        raise ValueError(f"unsupported relation for a plan edge: {relation}")

    path.write_text(_compose(meta, body), encoding="utf-8")
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# --------------------------------------------------------------------------------------------------
|
|
241
|
+
# Projection into the graph
|
|
242
|
+
# --------------------------------------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def iter_intents(root: Path) -> list[Intent]:
    """Read every ``*.md`` under ``<root>/yigraf/intents`` in sorted path order.

    Returns an empty list when that directory does not exist.
    """
    intents_dir = Path(root) / "yigraf" / "intents"
    if not intents_dir.is_dir():
        return []
    return [read_intent(md) for md in sorted(intents_dir.glob("*.md"))]
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def iter_plans(root: Path) -> list[Plan]:
    """Read every plan under ``<root>/yigraf/plans/{active,completed}``.

    Plans from ``active`` come first, then ``completed``; within each, files are read in sorted
    path order. Each plan's ``phase`` is set to the name of the subdirectory it was found in. A
    missing subdirectory is skipped.
    """
    plans_dir = Path(root) / "yigraf" / "plans"
    found = []
    for phase in ("active", "completed"):
        phase_dir = plans_dir / phase
        if not phase_dir.is_dir():
            continue
        for md in sorted(phase_dir.glob("*.md")):
            plan = read_plan(md)
            plan.phase = phase
            found.append(plan)
    return found
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def project_into(graph: nx.DiGraph, root: Path) -> None:
    """Add intent/plan/task nodes and their cross-family edges to ``graph`` from the artifacts.

    Projection runs in two passes. First every intent, plan and task node is added (with the
    plan → task ``contains`` edges). Only then are each task's ``tracks`` / ``requires`` /
    ``implements`` edges resolved. Resolving edges while nodes are still being added would
    misreport a link to a task that simply appears later — a higher-numbered task, or a task in
    a plan that is iterated afterwards — as a dangling target.

    Args:
        graph: the graph to mutate in place.
        root: repository root containing the ``yigraf/`` artifact directories.
    """
    for intent in iter_intents(root):
        graph.add_node(
            intent.id, family=INTENT_FAMILY, kind=intent.type, label=intent.statement or intent.slug,
            confidence=CONF, status=intent.status, statement=intent.statement,
            scenarios=intent.scenarios, design=intent.design, source_file=f"intents/{intent.slug}.md",
        )

    plans = iter_plans(root)

    # Pass 1: nodes and containment, so every task id exists before any edge is resolved.
    for plan in plans:
        graph.add_node(plan.id, family=PLAN_FAMILY, kind="plan", label=plan.title,
                       confidence=CONF, phase=plan.phase)
        for task in plan.tasks:
            graph.add_node(
                task.id, family=PLAN_FAMILY, kind="task", label=task.description,
                confidence=CONF, state=task.state, order=task.num,
            )
            graph.add_edge(plan.id, task.id, relation="contains", confidence=CONF)

    # Pass 2: cross-node edges; anything still unresolved now is genuinely dangling.
    for plan in plans:
        for task in plan.tasks:
            _project_task_edges(graph, task)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _project_task_edges(graph: nx.DiGraph, task: Task) -> None:
    """Project one task's ``tracks`` / ``requires`` / ``implements`` links into ``graph``.

    A link whose target is already a node becomes an edge from the task. A link whose target is
    not in the graph is never added as an edge; it is recorded on the task node instead, under
    ``dangling_tracks`` / ``dangling_requires`` / ``dangling_implements``.
    """
    source = task.id

    tracked = task.tracks
    if tracked is not None:
        if tracked not in graph:
            _stash(graph, source, "dangling_tracks", tracked)
        else:
            graph.add_edge(source, tracked, relation="tracks", confidence=CONF)

    for required in task.requires:
        if required not in graph:
            _stash(graph, source, "dangling_requires", required)
            continue
        graph.add_edge(source, required, relation="requires", confidence=CONF)

    for impl in task.implements:
        if impl.sym not in graph:
            # Keep the anchor so M3 can re-anchor a rename by content match (docs/m3-notes.md §3).
            unresolved = {"sym": impl.sym, "anchor": impl.anchor, "anchor_algo": impl.anchor_algo}
            _stash(graph, source, "dangling_implements", unresolved)
            continue
        attrs = {"relation": "implements", "confidence": CONF}
        if impl.anchor is not None:
            # An anchor stored without its algo tag is assumed to use the current algorithm.
            attrs.update(anchor=impl.anchor, anchor_algo=impl.anchor_algo or ANCHOR_ALGO)
        graph.add_edge(source, impl.sym, **attrs)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _stash(graph: nx.DiGraph, node_id: str, attr: str, value: str) -> None:
|
|
312
|
+
graph.nodes[node_id].setdefault(attr, []).append(value)
|
yigraf/astnorm.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""AST-normalized ``content_hash`` — the drift anchor (``astnorm-v1``).
|
|
2
|
+
|
|
3
|
+
A symbol's ``content_hash`` is a SHA-256 over its *significant token stream*: the tokens that remain
|
|
4
|
+
after dropping comments and docstrings, normalizing string quote style, and replacing each nested
|
|
5
|
+
*extracted* symbol with a stable ``<def:NAME>`` marker. The rule is pinned in ``docs/m1-notes.md`` §4
|
|
6
|
+
and is **load-bearing**: once anchors are stamped (M2), changing the rule silently mismatches every
|
|
7
|
+
stored anchor — so the algorithm carries a version tag (:data:`ANCHOR_ALGO`). A future rule change
|
|
8
|
+
bumps the tag and re-anchors on next commit instead of false-drifting.
|
|
9
|
+
|
|
10
|
+
What is deliberately ignored (no drift): comments, docstrings, string quote style (``'x'`` ≡ ``"x"``),
|
|
11
|
+
and all whitespace/reformatting that doesn't change the parsed token stream — so a ``black``/``isort``
|
|
12
|
+
reflow is safe. What trips drift (intended): any change to identifiers, operators, literal *values*,
|
|
13
|
+
keywords, control flow, signatures, or decorators within a symbol's own body.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import hashlib
|
|
18
|
+
from typing import TYPE_CHECKING, Mapping
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from tree_sitter import Node
|
|
22
|
+
|
|
23
|
+
#: Version tag stored alongside every anchor. Bumping it (``astnorm-v2``) re-anchors instead of
|
|
24
|
+
#: silently false-drifting against hashes produced by an older rule (docs/m1-notes.md §4).
|
|
25
|
+
ANCHOR_ALGO = "astnorm-v1"
|
|
26
|
+
|
|
27
|
+
#: Field separators for the token stream. ``\x1f`` (unit) joins a token's type to its text; ``\x1e``
|
|
28
|
+
#: (record) joins tokens. Both are control chars that practically never occur in source text, so they are unlikely to be
|
|
29
|
+
#: forged by a literal's contents.
|
|
30
|
+
_FIELD = "\x1f"
|
|
31
|
+
_TOKEN = "\x1e"
|
|
32
|
+
|
|
33
|
+
#: Python defaults for the per-language astnorm knobs. They are the function defaults below, so a
|
|
34
|
+
#: caller that passes nothing reproduces the original Python rule byte-for-byte (no ``astnorm-v2``
|
|
35
|
+
#: bump, existing anchors stay valid). Other languages pass their own sets (e.g. Go: all empty — no
|
|
36
|
+
#: docstrings, no quote-style ambiguity), keeping the *algorithm* astnorm-v1 while varying the
|
|
37
|
+
#: language-specific token vocabulary it normalizes.
|
|
38
|
+
|
|
39
|
+
#: Token types whose text is a quote delimiter (prefix + quotes) we canonicalize.
|
|
40
|
+
_PY_QUOTE_TOKENS = frozenset({"string_start", "string_end"})
|
|
41
|
+
|
|
42
|
+
#: Block-like containers whose *leading* string statement is a docstring to drop.
|
|
43
|
+
_PY_BODY_CONTAINERS = frozenset({"block", "module"})
|
|
44
|
+
|
|
45
|
+
#: Node types that count as a lone string statement (a docstring) inside a body container.
|
|
46
|
+
_PY_DOCSTRING_TYPES = frozenset({"string", "concatenated_string"})
|
|
47
|
+
|
|
48
|
+
#: Node types dropped as comments. Default fits Python/Go/JS/C; languages whose grammar names them
|
|
49
|
+
#: differently (Rust/Java: ``line_comment``/``block_comment``) pass their own set.
|
|
50
|
+
_PY_COMMENT_TYPES = frozenset({"comment"})
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def content_hash(node: Node, source: bytes, boundaries: Mapping[int, str],
                 exclude: frozenset[int] = frozenset(), *,
                 quote_tokens: frozenset[str] = _PY_QUOTE_TOKENS,
                 body_containers: frozenset[str] = _PY_BODY_CONTAINERS,
                 docstring_types: frozenset[str] = _PY_DOCSTRING_TYPES,
                 comment_types: frozenset[str] = _PY_COMMENT_TYPES) -> str:
    """Return the SHA-256 hex digest of ``node``'s significant token stream (``astnorm-v1``).

    ``boundaries`` maps the tree-sitter node id of each *directly nested extracted symbol* to its
    local name. Each such subtree contributes only a ``<def:NAME>`` marker and is not descended
    into, so a container's hash reflects its members' names but not their bodies.

    ``exclude`` holds node ids that contribute nothing at all — used for the symbol's **own
    declared name**, so a pure rename leaves the hash unchanged (docs/m3-notes.md §2).

    ``quote_tokens`` / ``body_containers`` / ``docstring_types`` / ``comment_types`` are the
    per-language knobs and default to the Python sets. Passing empty sets for the first three
    turns off quote canonicalization and docstring dropping; ``comment_types`` names the node
    types dropped as comments. The algorithm — and ``ANCHOR_ALGO`` — is the same in every case.
    """
    stream: list[str] = []
    _emit(node, source, boundaries, exclude, stream, quote_tokens, body_containers,
          docstring_types, comment_types)
    digest = hashlib.sha256()
    digest.update(_TOKEN.join(stream).encode("utf-8", "surrogatepass"))
    return digest.hexdigest()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _emit(node: Node, source: bytes, boundaries: Mapping[int, str], exclude: frozenset[int],
          out: list[str], quote_tokens: frozenset[str], body_containers: frozenset[str],
          docstring_types: frozenset[str], comment_types: frozenset[str]) -> None:
    """Walk ``node`` in pre-order, appending its significant tokens to ``out``.

    Checks are applied in this order: excluded ids emit nothing; boundary ids emit a
    ``<def:NAME>`` marker and are not descended into; comment nodes emit nothing; a leaf emits
    ``type`` + field separator + source text; any other node recurses into its children (minus a
    leading docstring when the node is a body container).
    """
    node_id = node.id
    if node_id in exclude:
        return  # the symbol's own name — dropped so a rename doesn't change the body-hash

    marker_name = boundaries.get(node_id)
    if marker_name is not None:
        out.append(f"<def:{marker_name}>")  # nested extracted symbol — its body is its own concern
        return

    kind = node.type
    if kind in comment_types:
        return

    if node.child_count:
        kids = node.children
        if kind in body_containers:
            kids = _without_leading_docstring(kids, docstring_types)
        for kid in kids:
            _emit(kid, source, boundaries, exclude, out, quote_tokens, body_containers,
                  docstring_types, comment_types)
        return

    # Leaf token.
    text = source[node.start_byte : node.end_byte].decode("utf-8", "surrogatepass")
    if kind in quote_tokens:
        # The node *type* is canonicalized as well as the text: in some grammars (JS/TS) the
        # delimiter's type IS the quote char, so fixing only the text would still differ by type.
        # For Python's ``string_start``/``string_end`` the type has no quote char and is already
        # lowercase, so it passes through unchanged and existing anchors stay byte-identical.
        kind, text = _canon_quote(kind), _canon_quote(text)
    out.append(kind + _FIELD + text)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _without_leading_docstring(children: list[Node], docstring_types: frozenset[str]) -> list[Node]:
    """Return ``children`` minus its leading docstring statement, when it has one.

    Only the first non-comment child is examined: if it is an ``expression_statement`` holding a
    lone string literal it is removed (any comments before it are kept); otherwise ``children``
    is returned as-is. Doc-only edits are maintenance; a real contract change also edits code,
    which trips drift anyway (docs/m1-notes.md §4).

    NOTE(review): leading comments are recognised by the literal node type ``"comment"`` here,
    not by the caller's ``comment_types`` knob.
    """
    for index, child in enumerate(children):
        if child.type == "comment":
            continue  # comments may precede the docstring but are not statements
        is_docstring = (
            child.type == "expression_statement" and _is_string_only(child, docstring_types)
        )
        if is_docstring:
            return children[:index] + children[index + 1 :]
        break  # the first real statement is not a docstring
    return children
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _is_string_only(stmt: Node, docstring_types: frozenset[str]) -> bool:
|
|
134
|
+
"""True when an ``expression_statement`` is a lone string literal (a docstring)."""
|
|
135
|
+
kids = stmt.children
|
|
136
|
+
return len(kids) == 1 and kids[0].type in docstring_types
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _canon_quote(token: str) -> str:
|
|
140
|
+
"""Canonicalize a string delimiter: lowercase the prefix, force double quotes, keep quote count.
|
|
141
|
+
|
|
142
|
+
``r'''`` → ``r\"\"\"``, ``F"`` → ``f"``. Preserves the prefix letters (``r``/``b``/``f``) and the
|
|
143
|
+
quote *count* (never collapses ``'''`` ↔ ``'``, which would change semantics). Kills the dominant
|
|
144
|
+
``black`` quote-flip false-drift source; ``string_content`` is emitted verbatim, so escape-level
|
|
145
|
+
rewrites (``'it\\'s'`` → ``"it's"``) still trip drift and are deferred to a possible v2.
|
|
146
|
+
"""
|
|
147
|
+
i = 0
|
|
148
|
+
while i < len(token) and token[i] not in ("'", '"'):
|
|
149
|
+
i += 1
|
|
150
|
+
prefix, quotes = token[:i], token[i:]
|
|
151
|
+
return prefix.lower() + quotes.replace("'", '"')
|
yigraf/cache.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Per-file SHA content cache for structure extraction (``yigraf/cache/structure.json``).
|
|
2
|
+
|
|
3
|
+
Keyed by the raw file bytes' SHA-256: a hit means the file is byte-for-byte unchanged since it was
|
|
4
|
+
last extracted, so its cached node/edge projection is reused verbatim and tree-sitter is skipped.
|
|
5
|
+
This is the *file cache SHA* of ``docs/m1-notes.md`` §3 — distinct from a symbol's astnorm
|
|
6
|
+
``content_hash``. The cache is gitignored and rebuildable; it never affects the output graph (which
|
|
7
|
+
is deterministic), only whether a file is re-parsed. It is invalidated wholesale when the astnorm
|
|
8
|
+
algorithm version changes, so a stale anchor can never survive a rule change.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import json
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import TYPE_CHECKING
|
|
17
|
+
|
|
18
|
+
from yigraf.astnorm import ANCHOR_ALGO
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from yigraf.extract import FileProjection
|
|
22
|
+
|
|
23
|
+
#: Bumped when the on-disk cache layout changes incompatibly (separate from the astnorm algo).
|
|
24
|
+
#: 2: structure nodes gained a ``signature`` field (M4).
|
|
25
|
+
#: 3: file nodes gained an ``inherits`` field (import-aware inheritance edges) — a stale cache would
|
|
26
|
+
#: otherwise serve pre-inheritance projections for files that haven't changed since the upgrade.
|
|
27
|
+
#: 4: the tags-tier extractors began populating ``inherits`` too (inheritance across the breadth
|
|
28
|
+
#: languages), so a format-3 cache of e.g. a Java file lacks its inheritance — invalidate it.
|
|
29
|
+
#: 5: Kotlin/Scala began recording ``imports`` on the file node (import edges) — a format-4 cache of
|
|
30
|
+
#: those files has an empty imports list.
|
|
31
|
+
CACHE_FORMAT = 5
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def file_sha(data: bytes) -> str:
    """Return the SHA-256 hex digest of ``data``; used as the per-file cache key."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class StructureCache:
    """Reusable per-file extraction projections, keyed by relative path then content SHA.

    Also carries a small HEAD-keyed ``maturity`` slot (R2 survival counts): recomputing maturity
    walks git history, but an edit never moves ``HEAD``, so this lets the hot ``PostToolUse`` rebuild
    skip the walk until a commit actually lands. Like the rest of the cache it's gitignored,
    rebuildable, and never alters the (deterministic) output graph — only how survival is obtained.
    """

    # Anchor-algorithm tag this cache was built with.
    algo: str
    # ``{relpath: {"sha": <file sha>, ...projection fields}}``.
    entries: dict[str, dict]
    # ``{"head": <HEAD id>, "survival": {...}}`` or empty.
    maturity: dict = field(default_factory=dict)

    @classmethod
    def load(cls, path: Path) -> "StructureCache":
        """Load the cache, or start empty if absent, unreadable, or built by a different algo.

        "Unreadable" covers every way the file can be unusable: an I/O error, bytes that are not
        UTF-8, text that is not JSON, JSON whose top level is not an object, or a format/algo
        tag that doesn't match this build. A ``files`` or ``maturity`` value that is not an
        object is treated as empty. The cache is rebuildable, so none of these is an error.
        """
        p = Path(path)
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: JSONDecodeError and
            # UnicodeDecodeError are both ValueError subclasses.
            return cls(algo=ANCHOR_ALGO, entries={})

        usable = (
            isinstance(data, dict)
            and data.get("format") == CACHE_FORMAT
            and data.get("algo") == ANCHOR_ALGO
        )
        if not usable:
            return cls(algo=ANCHOR_ALGO, entries={})

        files = data.get("files")
        maturity = data.get("maturity")
        return cls(
            algo=ANCHOR_ALGO,
            entries=dict(files) if isinstance(files, dict) else {},
            maturity=dict(maturity) if isinstance(maturity, dict) else {},
        )

    def maturity_survival(self, head: str) -> dict | None:
        """Cached ``{path: survival}`` if it was computed at this ``HEAD``, else ``None`` (a miss).

        The returned mapping is a copy; mutating it does not change the cache.
        """
        if self.maturity.get("head") == head:
            return dict(self.maturity.get("survival", {}))
        return None

    def set_maturity_survival(self, head: str, survival: dict) -> None:
        """Record the survival map computed at ``head`` (replaces any map from an earlier HEAD)."""
        self.maturity = {"head": head, "survival": dict(survival)}

    def get(self, relpath: str, sha: str) -> "FileProjection | None":
        """Return the cached projection for ``relpath`` iff its content SHA still matches."""
        # Imported here rather than at module level (module level imports it only for typing).
        from yigraf.extract import FileProjection

        entry = self.entries.get(relpath)
        if entry is not None and entry.get("sha") == sha:
            return FileProjection.from_cache(entry)
        return None

    def put(self, relpath: str, sha: str, projection: "FileProjection") -> None:
        """Record ``projection`` for ``relpath`` under its content SHA."""
        self.entries[relpath] = {"sha": sha, **projection.to_cache()}

    def prune(self, keep: set[str]) -> None:
        """Drop cached entries whose relative path is not in ``keep``."""
        # Iterate over a snapshot of the keys: entries are deleted during the loop.
        for relpath in list(self.entries):
            if relpath not in keep:
                del self.entries[relpath]

    def save(self, path: Path) -> None:
        """Write the cache as deterministic JSON (sorted keys), creating parent directories."""
        p = Path(path)
        p.parent.mkdir(parents=True, exist_ok=True)
        out = {"format": CACHE_FORMAT, "algo": self.algo, "files": self.entries,
               "maturity": self.maturity}
        p.write_text(json.dumps(out, indent=2, sort_keys=True) + "\n", encoding="utf-8")
|