@oriro/orirocli 0.1.9 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -18
- package/dist/cli.js +4776 -2964
- package/package.json +2 -2
- package/skills/craft/ai-engineering/SKILL.md +2 -2
- package/skills/graphify/SKILL.md +0 -619
- package/skills/graphify/__init__.py +0 -28
- package/skills/graphify/__main__.py +0 -4582
- package/skills/graphify/affected.py +0 -154
- package/skills/graphify/always_on/agents-md.md +0 -12
- package/skills/graphify/always_on/antigravity-rules.md +0 -14
- package/skills/graphify/always_on/claude-md.md +0 -9
- package/skills/graphify/always_on/gemini-md.md +0 -9
- package/skills/graphify/always_on/kiro-steering.md +0 -5
- package/skills/graphify/always_on/vscode-instructions.md +0 -17
- package/skills/graphify/analyze.py +0 -724
- package/skills/graphify/benchmark.py +0 -155
- package/skills/graphify/build.py +0 -487
- package/skills/graphify/cache.py +0 -417
- package/skills/graphify/callflow_html.py +0 -2020
- package/skills/graphify/cluster.py +0 -272
- package/skills/graphify/command-kilo.md +0 -15
- package/skills/graphify/dedup.py +0 -429
- package/skills/graphify/detect.py +0 -1379
- package/skills/graphify/diagnostics.py +0 -390
- package/skills/graphify/export.py +0 -1408
- package/skills/graphify/extract.py +0 -11570
- package/skills/graphify/global_graph.py +0 -159
- package/skills/graphify/google_workspace.py +0 -223
- package/skills/graphify/hooks.py +0 -457
- package/skills/graphify/ingest.py +0 -331
- package/skills/graphify/llm.py +0 -1896
- package/skills/graphify/manifest.py +0 -4
- package/skills/graphify/mcp_ingest.py +0 -392
- package/skills/graphify/multigraph_compat.py +0 -212
- package/skills/graphify/pg_introspect.py +0 -142
- package/skills/graphify/prs.py +0 -748
- package/skills/graphify/querylog.py +0 -70
- package/skills/graphify/report.py +0 -218
- package/skills/graphify/scip_ingest.py +0 -363
- package/skills/graphify/security.py +0 -336
- package/skills/graphify/semantic_cleanup.py +0 -319
- package/skills/graphify/serve.py +0 -1309
- package/skills/graphify/skill-aider.md +0 -1246
- package/skills/graphify/skill-amp.md +0 -613
- package/skills/graphify/skill-claw.md +0 -616
- package/skills/graphify/skill-codex.md +0 -613
- package/skills/graphify/skill-copilot.md +0 -616
- package/skills/graphify/skill-devin.md +0 -1372
- package/skills/graphify/skill-droid.md +0 -613
- package/skills/graphify/skill-kilo.md +0 -625
- package/skills/graphify/skill-kiro.md +0 -615
- package/skills/graphify/skill-opencode.md +0 -608
- package/skills/graphify/skill-pi.md +0 -615
- package/skills/graphify/skill-trae.md +0 -614
- package/skills/graphify/skill-vscode.md +0 -612
- package/skills/graphify/skill-windows.md +0 -651
- package/skills/graphify/skills/amp/references/add-watch.md +0 -56
- package/skills/graphify/skills/amp/references/exports.md +0 -71
- package/skills/graphify/skills/amp/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/amp/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/amp/references/hooks.md +0 -33
- package/skills/graphify/skills/amp/references/query.md +0 -249
- package/skills/graphify/skills/amp/references/transcribe.md +0 -48
- package/skills/graphify/skills/amp/references/update.md +0 -179
- package/skills/graphify/skills/claude/references/add-watch.md +0 -56
- package/skills/graphify/skills/claude/references/exports.md +0 -71
- package/skills/graphify/skills/claude/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/claude/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/claude/references/hooks.md +0 -33
- package/skills/graphify/skills/claude/references/query.md +0 -103
- package/skills/graphify/skills/claude/references/transcribe.md +0 -48
- package/skills/graphify/skills/claude/references/update.md +0 -179
- package/skills/graphify/skills/claw/references/add-watch.md +0 -56
- package/skills/graphify/skills/claw/references/exports.md +0 -71
- package/skills/graphify/skills/claw/references/extraction-spec.md +0 -29
- package/skills/graphify/skills/claw/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/claw/references/hooks.md +0 -33
- package/skills/graphify/skills/claw/references/query.md +0 -249
- package/skills/graphify/skills/claw/references/transcribe.md +0 -48
- package/skills/graphify/skills/claw/references/update.md +0 -179
- package/skills/graphify/skills/codex/references/add-watch.md +0 -56
- package/skills/graphify/skills/codex/references/exports.md +0 -71
- package/skills/graphify/skills/codex/references/extraction-spec.md +0 -29
- package/skills/graphify/skills/codex/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/codex/references/hooks.md +0 -33
- package/skills/graphify/skills/codex/references/query.md +0 -249
- package/skills/graphify/skills/codex/references/transcribe.md +0 -48
- package/skills/graphify/skills/codex/references/update.md +0 -179
- package/skills/graphify/skills/copilot/references/add-watch.md +0 -56
- package/skills/graphify/skills/copilot/references/exports.md +0 -71
- package/skills/graphify/skills/copilot/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/copilot/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/copilot/references/hooks.md +0 -33
- package/skills/graphify/skills/copilot/references/query.md +0 -249
- package/skills/graphify/skills/copilot/references/transcribe.md +0 -48
- package/skills/graphify/skills/copilot/references/update.md +0 -179
- package/skills/graphify/skills/droid/references/add-watch.md +0 -56
- package/skills/graphify/skills/droid/references/exports.md +0 -71
- package/skills/graphify/skills/droid/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/droid/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/droid/references/hooks.md +0 -33
- package/skills/graphify/skills/droid/references/query.md +0 -249
- package/skills/graphify/skills/droid/references/transcribe.md +0 -48
- package/skills/graphify/skills/droid/references/update.md +0 -179
- package/skills/graphify/skills/kilo/references/add-watch.md +0 -56
- package/skills/graphify/skills/kilo/references/exports.md +0 -71
- package/skills/graphify/skills/kilo/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/kilo/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/kilo/references/hooks.md +0 -33
- package/skills/graphify/skills/kilo/references/query.md +0 -249
- package/skills/graphify/skills/kilo/references/transcribe.md +0 -48
- package/skills/graphify/skills/kilo/references/update.md +0 -179
- package/skills/graphify/skills/kiro/references/add-watch.md +0 -56
- package/skills/graphify/skills/kiro/references/exports.md +0 -71
- package/skills/graphify/skills/kiro/references/extraction-spec.md +0 -29
- package/skills/graphify/skills/kiro/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/kiro/references/hooks.md +0 -33
- package/skills/graphify/skills/kiro/references/query.md +0 -249
- package/skills/graphify/skills/kiro/references/transcribe.md +0 -48
- package/skills/graphify/skills/kiro/references/update.md +0 -179
- package/skills/graphify/skills/opencode/references/add-watch.md +0 -56
- package/skills/graphify/skills/opencode/references/exports.md +0 -71
- package/skills/graphify/skills/opencode/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/opencode/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/opencode/references/hooks.md +0 -33
- package/skills/graphify/skills/opencode/references/query.md +0 -249
- package/skills/graphify/skills/opencode/references/transcribe.md +0 -48
- package/skills/graphify/skills/opencode/references/update.md +0 -179
- package/skills/graphify/skills/pi/references/add-watch.md +0 -56
- package/skills/graphify/skills/pi/references/exports.md +0 -71
- package/skills/graphify/skills/pi/references/extraction-spec.md +0 -29
- package/skills/graphify/skills/pi/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/pi/references/hooks.md +0 -33
- package/skills/graphify/skills/pi/references/query.md +0 -249
- package/skills/graphify/skills/pi/references/transcribe.md +0 -48
- package/skills/graphify/skills/pi/references/update.md +0 -179
- package/skills/graphify/skills/trae/references/add-watch.md +0 -56
- package/skills/graphify/skills/trae/references/exports.md +0 -71
- package/skills/graphify/skills/trae/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/trae/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/trae/references/hooks.md +0 -35
- package/skills/graphify/skills/trae/references/query.md +0 -249
- package/skills/graphify/skills/trae/references/transcribe.md +0 -48
- package/skills/graphify/skills/trae/references/update.md +0 -179
- package/skills/graphify/skills/vscode/references/add-watch.md +0 -56
- package/skills/graphify/skills/vscode/references/exports.md +0 -71
- package/skills/graphify/skills/vscode/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/vscode/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/vscode/references/hooks.md +0 -33
- package/skills/graphify/skills/vscode/references/query.md +0 -249
- package/skills/graphify/skills/vscode/references/transcribe.md +0 -48
- package/skills/graphify/skills/vscode/references/update.md +0 -179
- package/skills/graphify/skills/windows/references/add-watch.md +0 -56
- package/skills/graphify/skills/windows/references/exports.md +0 -71
- package/skills/graphify/skills/windows/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/windows/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/windows/references/hooks.md +0 -33
- package/skills/graphify/skills/windows/references/query.md +0 -249
- package/skills/graphify/skills/windows/references/transcribe.md +0 -48
- package/skills/graphify/skills/windows/references/update.md +0 -179
- package/skills/graphify/symbol_resolution.py +0 -538
- package/skills/graphify/transcribe.py +0 -184
- package/skills/graphify/tree_html.py +0 -582
- package/skills/graphify/validate.py +0 -72
- package/skills/graphify/watch.py +0 -898
- package/skills/graphify/wiki.py +0 -282
|
@@ -1,336 +0,0 @@
|
|
|
1
|
-
# Security helpers - URL validation, safe fetch, path guards, label sanitisation
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import contextlib
|
|
5
|
-
import html
|
|
6
|
-
import re
|
|
7
|
-
import urllib.error
|
|
8
|
-
import urllib.parse
|
|
9
|
-
import urllib.request
|
|
10
|
-
from collections.abc import Mapping
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
from typing import Any
|
|
13
|
-
|
|
14
|
-
import ipaddress
|
|
15
|
-
import socket
|
|
16
|
-
|
|
17
|
-
_ALLOWED_SCHEMES = {"http", "https"}
|
|
18
|
-
_MAX_FETCH_BYTES = 52_428_800 # 50 MB hard cap for binary downloads
|
|
19
|
-
_MAX_TEXT_BYTES = 10_485_760 # 10 MB hard cap for HTML / text
|
|
20
|
-
|
|
21
|
-
# Graph-load memory-bomb cap: reject .json files larger than this before
|
|
22
|
-
# JSON-parsing them into a dict. Without this, a multi-gigabyte (or
|
|
23
|
-
# specifically crafted) graph.json can exhaust process memory during
|
|
24
|
-
# json.loads + node_link_graph rehydration.
|
|
25
|
-
_MAX_GRAPH_FILE_BYTES = 512 * 1024 * 1024 # 512 MiB
|
|
26
|
-
|
|
27
|
-
# Cloud metadata hostnames blocked by name. IP-based metadata endpoints (e.g. 169.254.169.254) are caught by the link-local check in validate_url.
|
|
28
|
-
_BLOCKED_HOSTS = {"metadata.google.internal", "metadata.google.com"}
|
|
29
|
-
|
|
30
|
-
# RFC 6598 Shared Address Space (CGN) -- is_private misses this on Python <3.11
|
|
31
|
-
_CGN_NETWORK = ipaddress.ip_network("100.64.0.0/10")
|
|
32
|
-
|
|
33
|
-
# RFC 6052 NAT64 Well-Known Prefix -- is_reserved=True in Python but these embed
|
|
34
|
-
# public IPv4 addresses and are legitimate public internet traffic, not SSRF vectors.
|
|
35
|
-
_NAT64_WKP = ipaddress.ip_network("64:ff9b::/96")
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# ---------------------------------------------------------------------------
|
|
39
|
-
# URL validation
|
|
40
|
-
# ---------------------------------------------------------------------------
|
|
41
|
-
|
|
42
|
-
def validate_url(url: str) -> str:
    """Check that *url* is safe to fetch and return it unchanged.

    Only ``http`` and ``https`` URLs are accepted; any other scheme
    (``file://``, ``ftp://``, ``data:``, ...) is refused. When the URL
    carries a hostname, the name is compared against the blocked cloud
    metadata hostnames and then resolved. The URL is refused if any resolved
    address is private, reserved, loopback, link-local, or inside the
    RFC 6598 shared address range. Addresses under the NAT64 well-known
    prefix are judged by their embedded IPv4 address, not the IPv6 wrapper.

    Raises:
        ValueError - disallowed scheme, blocked hostname, blocked resolved
            address, or DNS resolution failure
    """
    parts = urllib.parse.urlparse(url)
    if parts.scheme.lower() not in _ALLOWED_SCHEMES:
        raise ValueError(
            f"Blocked URL scheme '{parts.scheme}' - only http and https are allowed. "
            f"Got: {url!r}"
        )

    host = parts.hostname
    if not host:
        # No hostname to inspect, so no host-based checks are performed.
        return url

    if host.lower() in _BLOCKED_HOSTS:
        raise ValueError(
            f"Blocked cloud metadata endpoint '{host}'. "
            f"Got: {url!r}"
        )

    try:
        resolved = socket.getaddrinfo(host, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
    except socket.gaierror as exc:
        raise ValueError(
            f"DNS resolution failed for '{host}': {exc}. Got: {url!r}"
        ) from exc

    for entry in resolved:
        address = entry[4][0]
        candidate = ipaddress.ip_address(address)
        # A NAT64 address wraps an IPv4 address in its low 32 bits; judge that one.
        if isinstance(candidate, ipaddress.IPv6Address) and candidate in _NAT64_WKP:
            candidate = ipaddress.ip_address(int(candidate) & 0xFFFFFFFF)
        blocked = (
            candidate.is_private
            or candidate.is_reserved
            or candidate.is_loopback
            or candidate.is_link_local
            or candidate in _CGN_NETWORK
        )
        if blocked:
            raise ValueError(
                f"Blocked private/internal IP {address} (resolved from '{host}'). "
                f"Got: {url!r}"
            )

    return url
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
@contextlib.contextmanager
def _ssrf_guarded_socket():
    """Patch socket.getaddrinfo for the duration of a fetch to catch DNS rebinding.

    While the context is active, every address returned by
    ``socket.getaddrinfo`` is inspected, so a DNS server cannot hand out a
    public IP for the up-front URL check and then a private IP for the real
    connection (TOCTOU fix). The original function is restored on exit, even
    if the body raises.

    Addresses under the NAT64 well-known prefix are judged by their embedded
    IPv4 address, using the same rule as validate_url. Without this, the
    IPv6 wrapper counts as reserved and every fetch on a NAT64/DNS64 network
    is refused even though validate_url accepted the URL.

    Not thread-safe: the patch replaces a process-wide attribute.

    Raises (from the patched getaddrinfo):
        OSError - a resolved address is private, reserved, loopback,
            link-local, or inside the RFC 6598 shared range
    """
    original = socket.getaddrinfo

    def _guarded(host, port, *args, **kwargs):
        results = original(host, port, *args, **kwargs)
        for info in results:
            addr = info[4][0]
            try:
                ip = ipaddress.ip_address(addr)
            except ValueError:
                # Not parseable as an IP address; leave it to the caller.
                continue
            # Keep in step with validate_url: unwrap NAT64 to the embedded IPv4.
            if isinstance(ip, ipaddress.IPv6Address) and ip in _NAT64_WKP:
                ip = ipaddress.ip_address(int(ip) & 0xFFFFFFFF)
            if ip.is_private or ip.is_reserved or ip.is_loopback or ip.is_link_local or ip in _CGN_NETWORK:
                raise OSError(
                    f"SSRF blocked: IP {addr} resolved from '{host}' is private/reserved"
                )
        return results

    socket.getaddrinfo = _guarded
    try:
        yield
    finally:
        socket.getaddrinfo = original
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
class _NoFileRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that vets every redirect destination before following it.

    Guards against open-redirect SSRF, where an http:// URL redirects to
    file:// or to an internal address.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # validate_url raises ValueError for a rejected target, aborting the redirect.
        validate_url(newurl)
        follow = super().redirect_request
        return follow(req, fp, code, msg, headers, newurl)
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
def _build_opener() -> urllib.request.OpenerDirector:
    """Return a urllib opener whose redirects are handled by _NoFileRedirectHandler."""
    handlers = (_NoFileRedirectHandler,)
    return urllib.request.build_opener(*handlers)
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
# ---------------------------------------------------------------------------
|
|
137
|
-
# Safe fetch
|
|
138
|
-
# ---------------------------------------------------------------------------
|
|
139
|
-
|
|
140
|
-
def safe_fetch(url: str, max_bytes: int = _MAX_FETCH_BYTES, timeout: int = 30) -> bytes:
    """Download *url* and return the response body as bytes.

    Safeguards:
    - the URL is vetted by validate_url before any connection is made
    - the request goes through the opener from _build_opener, so redirect
      targets are vetted too
    - the request runs inside _ssrf_guarded_socket
    - the body is read in 64 KiB chunks and abandoned once more than
      *max_bytes* have arrived
    - a status outside 200-299 is raised as urllib.error.HTTPError

    Raises:
        ValueError - URL or redirect target rejected by validate_url
        urllib.error.HTTPError - non-2xx HTTP status
        urllib.error.URLError - DNS / connection failure
        OSError - size cap exceeded
    """
    validate_url(url)
    director = _build_opener()
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 graphify/1.0"})

    body = bytearray()
    with _ssrf_guarded_socket(), director.open(request, timeout=timeout) as resp:
        # Explicit status check in addition to whatever the opener itself enforces.
        status = getattr(resp, "status", None) or getattr(resp, "code", None)
        if status is not None and (status < 200 or status >= 300):
            raise urllib.error.HTTPError(url, status, f"HTTP {status}", {}, None)

        received = 0
        while True:
            piece = resp.read(65_536)
            if not piece:
                break
            received += len(piece)
            if received > max_bytes:
                raise OSError(
                    f"Response from {url!r} exceeds size limit "
                    f"({max_bytes // 1_048_576} MB). Aborting download."
                )
            body += piece

    return bytes(body)
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def safe_fetch_text(url: str, max_bytes: int = _MAX_TEXT_BYTES, timeout: int = 15) -> str:
    """Fetch *url* via safe_fetch and return the body decoded as UTF-8.

    Undecodable bytes are replaced rather than raising. The defaults are
    tighter than safe_fetch's, to suit HTML / text content.
    """
    payload = safe_fetch(url, max_bytes=max_bytes, timeout=timeout)
    return payload.decode("utf-8", errors="replace")
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
# ---------------------------------------------------------------------------
|
|
194
|
-
# Path validation
|
|
195
|
-
# ---------------------------------------------------------------------------
|
|
196
|
-
|
|
197
|
-
def validate_graph_path(path: str | Path, base: Path | None = None) -> Path:
|
|
198
|
-
"""Resolve *path* and verify it stays inside *base*.
|
|
199
|
-
|
|
200
|
-
*base* defaults to the `graphify-out` directory relative to CWD.
|
|
201
|
-
Also requires the base directory to exist, so a caller cannot
|
|
202
|
-
trick graphify into reading files before any graph has been built.
|
|
203
|
-
|
|
204
|
-
Raises:
|
|
205
|
-
ValueError - path escapes base, or base does not exist
|
|
206
|
-
FileNotFoundError - resolved path does not exist
|
|
207
|
-
"""
|
|
208
|
-
if base is None:
|
|
209
|
-
resolved_hint = Path(path).resolve()
|
|
210
|
-
for candidate in [resolved_hint, *resolved_hint.parents]:
|
|
211
|
-
if candidate.name == "graphify-out":
|
|
212
|
-
base = candidate
|
|
213
|
-
break
|
|
214
|
-
if base is None:
|
|
215
|
-
base = Path("graphify-out").resolve()
|
|
216
|
-
|
|
217
|
-
base = base.resolve()
|
|
218
|
-
if not base.exists():
|
|
219
|
-
raise ValueError(
|
|
220
|
-
f"Graph base directory does not exist: {base}. "
|
|
221
|
-
"Run /graphify first to build the graph."
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
resolved = Path(path).resolve()
|
|
225
|
-
try:
|
|
226
|
-
resolved.relative_to(base)
|
|
227
|
-
except ValueError:
|
|
228
|
-
raise ValueError(
|
|
229
|
-
f"Path {path!r} escapes the allowed directory {base}. "
|
|
230
|
-
"Only paths inside graphify-out/ are permitted."
|
|
231
|
-
)
|
|
232
|
-
|
|
233
|
-
if not resolved.exists():
|
|
234
|
-
raise FileNotFoundError(f"Graph file not found: {resolved}")
|
|
235
|
-
|
|
236
|
-
return resolved
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
def check_graph_file_size_cap(path: Path) -> None:
    """Raise if *path* is larger than ``_MAX_GRAPH_FILE_BYTES``.

    Intended to fail fast before an oversized graph.json is read into memory
    and JSON-parsed. If ``path.stat()`` fails with OSError, the function
    returns silently; reporting a missing or unreadable file is left to the
    caller's own checks.

    Raises:
        ValueError - file size exceeds the cap; the message reports both the
            observed size and the cap
    """
    try:
        info = path.stat()
    except OSError:
        return

    observed = info.st_size
    if observed <= _MAX_GRAPH_FILE_BYTES:
        return
    raise ValueError(
        f"graph file {path} is {observed:_d} bytes, "
        f"exceeds {_MAX_GRAPH_FILE_BYTES:_d}-byte cap"
    )
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
# ---------------------------------------------------------------------------
|
|
263
|
-
# Label sanitisation (mirrors code-review-graph's _sanitize_name pattern)
|
|
264
|
-
# ---------------------------------------------------------------------------
|
|
265
|
-
|
|
266
|
-
_CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f]")
|
|
267
|
-
_MAX_LABEL_LEN = 256
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
def sanitize_label(text: str | None) -> str:
|
|
271
|
-
"""Strip control characters and cap length.
|
|
272
|
-
|
|
273
|
-
Safe for embedding in JSON data (inside <script> tags) and plain text.
|
|
274
|
-
For direct HTML injection, wrap the result with html.escape().
|
|
275
|
-
"""
|
|
276
|
-
if text is None:
|
|
277
|
-
return ""
|
|
278
|
-
text = _CONTROL_CHAR_RE.sub("", str(text))
|
|
279
|
-
if len(text) > _MAX_LABEL_LEN:
|
|
280
|
-
text = text[:_MAX_LABEL_LEN]
|
|
281
|
-
return text
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
# ---------------------------------------------------------------------------
|
|
285
|
-
# Metadata sanitisation (recursive, bounded, HTML-safe)
|
|
286
|
-
# ---------------------------------------------------------------------------
|
|
287
|
-
|
|
288
|
-
# Upper bound on the length of a sanitised metadata string (after escaping).
_METADATA_MAX_VALUE_LEN = 512
# Upper bound on the number of items kept from a metadata list/tuple.
_METADATA_MAX_LIST_ITEMS = 50


def _sanitize_metadata_string(value: object) -> str:
    """Return a control-character-free, HTML-escaped, bounded string.

    *value* is converted with ``str()``, stripped of control characters,
    HTML-escaped (quotes included) and then capped at
    ``_METADATA_MAX_VALUE_LEN`` characters. The cap applies to the escaped
    text, so the result never exceeds the limit. If the cut falls inside an
    escape sequence such as ``&amp;``, the partial sequence is dropped, so
    the result never ends in a broken entity.
    """
    text = _CONTROL_CHAR_RE.sub("", str(value))
    text = html.escape(text, quote=True)
    if len(text) > _METADATA_MAX_VALUE_LEN:
        text = text[:_METADATA_MAX_VALUE_LEN]
        # After escaping, every "&" opens an entity that ends in ";". If the
        # last "&" has no ";" after it, the cut split that entity: remove it.
        last_amp = text.rfind("&")
        if last_amp != -1 and text.find(";", last_amp) == -1:
            text = text[:last_amp]
    return text
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
def _sanitize_metadata_value(value: object) -> object:
|
|
302
|
-
"""Sanitize a metadata value while preserving simple JSON-compatible types."""
|
|
303
|
-
if isinstance(value, bool):
|
|
304
|
-
# bool is a subclass of int — must be checked first to avoid coercion.
|
|
305
|
-
return value
|
|
306
|
-
if isinstance(value, str):
|
|
307
|
-
return _sanitize_metadata_string(value)
|
|
308
|
-
if isinstance(value, dict):
|
|
309
|
-
return sanitize_metadata(value)
|
|
310
|
-
if isinstance(value, (list, tuple)):
|
|
311
|
-
return [_sanitize_metadata_value(item) for item in value[:_METADATA_MAX_LIST_ITEMS]]
|
|
312
|
-
if isinstance(value, (int, float)) or value is None:
|
|
313
|
-
return value
|
|
314
|
-
return _sanitize_metadata_string(value)
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
def sanitize_metadata(metadata: Mapping[str, Any] | None) -> dict[str, object]:
|
|
318
|
-
"""Sanitize metadata keys and values before graph export.
|
|
319
|
-
|
|
320
|
-
Metadata is less constrained than node labels: it can contain nested
|
|
321
|
-
dicts, lists, source snippets, external index symbols, and docstring
|
|
322
|
-
text. This helper keeps the data JSON-compatible, strips control
|
|
323
|
-
characters, escapes HTML-sensitive characters in strings, caps long
|
|
324
|
-
strings/lists, and drops entries whose key becomes empty after
|
|
325
|
-
sanitization.
|
|
326
|
-
"""
|
|
327
|
-
if metadata is None:
|
|
328
|
-
return {}
|
|
329
|
-
|
|
330
|
-
result: dict[str, object] = {}
|
|
331
|
-
for key, value in metadata.items():
|
|
332
|
-
clean_key = _sanitize_metadata_string(key)
|
|
333
|
-
if not clean_key:
|
|
334
|
-
continue
|
|
335
|
-
result[clean_key] = _sanitize_metadata_value(value)
|
|
336
|
-
return result
|
|
@@ -1,319 +0,0 @@
|
|
|
1
|
-
# Semantic fragment sanitizer — converts sentence-like rationale nodes into
|
|
2
|
-
# attributes on related nodes and removes invalid file_type values.
|
|
3
|
-
#
|
|
4
|
-
# Currently called from the skill merge scripts (skill-opencode.md,
|
|
5
|
-
# skill-codex.md) so that rationale text never leaks into the knowledge
|
|
6
|
-
# graph as standalone nodes. (Future: graphify.llm may wire this into
|
|
7
|
-
# _parse_llm_json / _merge_into for non-skill code paths; not done in
|
|
8
|
-
# this cycle.)
|
|
9
|
-
from __future__ import annotations
|
|
10
|
-
|
|
11
|
-
import json
|
|
12
|
-
import re
|
|
13
|
-
from pathlib import Path
|
|
14
|
-
|
|
15
|
-
# Labels longer than this many characters, or containing >= this many words,
|
|
16
|
-
# are candidates for being sentence-like rationale text rather than entity names.
|
|
17
|
-
_RATIONALE_MIN_CHARS = 80
|
|
18
|
-
_RATIONALE_MIN_WORDS = 8
|
|
19
|
-
|
|
20
|
-
# Validation limits for untrusted semantic-fragment payloads. See
|
|
21
|
-
# validate_semantic_fragment(). Issue #825: returned-JSON normalization for
|
|
22
|
-
# OpenCode and Codex agents requires a Python enforcement boundary so a
|
|
23
|
-
# malicious or runaway agent response cannot exhaust memory or escape the
|
|
24
|
-
# graphify-out chunk directory via crafted node/edge IDs.
|
|
25
|
-
MAX_SEMANTIC_FRAGMENT_BYTES = 25 * 1024 * 1024
|
|
26
|
-
MAX_SEMANTIC_FRAGMENT_NODES = 10_000
|
|
27
|
-
MAX_SEMANTIC_FRAGMENT_EDGES = 100_000
|
|
28
|
-
MAX_SEMANTIC_FRAGMENT_HYPEREDGES = 10_000
|
|
29
|
-
MAX_SEMANTIC_HYPEREDGE_NODES = 256
|
|
30
|
-
MAX_SEMANTIC_ID_LENGTH = 256
|
|
31
|
-
VALID_SEMANTIC_FILE_TYPES = frozenset({"code", "document", "paper", "image", "rationale", "concept"})
|
|
32
|
-
_SEMANTIC_ID_RE = re.compile(r"^[A-Za-z0-9._:-]+$")
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def validate_semantic_fragment(fragment: object) -> list[str]:
    """Return validation errors for an untrusted semantic extraction fragment.

    An empty list means the fragment is valid. The parameter is typed
    ``object`` because the input may be arbitrary deserialized JSON; anything
    that is not a dict is rejected by the first check.

    Checks performed:
    - the fragment is a dict and can be serialized to JSON
    - the serialized size and the node / edge / hyperedge counts are within
      the ``MAX_SEMANTIC_*`` limits
    - node ids, edge endpoints, hyperedge ids and hyperedge members pass
      _validate_semantic_id
    - a node's ``file_type``, when present, is a string listed in
      ``VALID_SEMANTIC_FILE_TYPES``

    The function only reports problems; it does not modify *fragment*.
    """
    if not isinstance(fragment, dict):
        return ["fragment must be a JSON object"]

    errors: list[str] = []
    try:
        payload = json.dumps(fragment, ensure_ascii=False).encode("utf-8")
    except (TypeError, ValueError) as exc:
        return [f"fragment is not JSON-serializable: {exc}"]

    if len(payload) > MAX_SEMANTIC_FRAGMENT_BYTES:
        errors.append(f"payload is {len(payload)} bytes; max is {MAX_SEMANTIC_FRAGMENT_BYTES}")

    nodes = fragment.get("nodes", [])
    edges = fragment.get("edges", [])
    if not isinstance(nodes, list):
        errors.append("nodes must be a list")
        nodes = []
    elif len(nodes) > MAX_SEMANTIC_FRAGMENT_NODES:
        errors.append(f"nodes has {len(nodes)} entries; max is {MAX_SEMANTIC_FRAGMENT_NODES}")

    if not isinstance(edges, list):
        errors.append("edges must be a list")
        edges = []
    elif len(edges) > MAX_SEMANTIC_FRAGMENT_EDGES:
        errors.append(f"edges has {len(edges)} entries; max is {MAX_SEMANTIC_FRAGMENT_EDGES}")

    for i, node in enumerate(nodes):
        if not isinstance(node, dict):
            errors.append(f"nodes[{i}] must be an object")
            continue
        _validate_semantic_id(errors, f"nodes[{i}].id", node.get("id"))
        file_type = node.get("file_type")
        # The isinstance test comes first: a list or dict file_type is
        # unhashable, and testing it against the frozenset would raise
        # TypeError instead of producing a validation error.
        if file_type is not None and (
            not isinstance(file_type, str)
            or file_type not in VALID_SEMANTIC_FILE_TYPES
        ):
            errors.append(
                f"nodes[{i}].file_type {file_type!r} is not one of "
                f"{sorted(VALID_SEMANTIC_FILE_TYPES)}"
            )

    for i, edge in enumerate(edges):
        if not isinstance(edge, dict):
            errors.append(f"edges[{i}] must be an object")
            continue
        _validate_semantic_id(errors, f"edges[{i}].source", edge.get("source"))
        _validate_semantic_id(errors, f"edges[{i}].target", edge.get("target"))

    hyperedges = fragment.get("hyperedges", [])
    if hyperedges is None:
        # An explicit null is treated the same as an absent key.
        hyperedges = []
    if not isinstance(hyperedges, list):
        errors.append("hyperedges must be a list")
    else:
        if len(hyperedges) > MAX_SEMANTIC_FRAGMENT_HYPEREDGES:
            errors.append(
                f"hyperedges has {len(hyperedges)} entries; "
                f"max is {MAX_SEMANTIC_FRAGMENT_HYPEREDGES}"
            )
        for i, he in enumerate(hyperedges):
            if not isinstance(he, dict):
                errors.append(f"hyperedges[{i}] must be an object")
                continue
            _validate_semantic_id(errors, f"hyperedges[{i}].id", he.get("id"))
            he_nodes = he.get("nodes")
            if not isinstance(he_nodes, list):
                errors.append(f"hyperedges[{i}].nodes must be a list")
                continue
            if len(he_nodes) > MAX_SEMANTIC_HYPEREDGE_NODES:
                errors.append(
                    f"hyperedges[{i}].nodes has {len(he_nodes)} entries; "
                    f"max is {MAX_SEMANTIC_HYPEREDGE_NODES}"
                )
            for j, ref in enumerate(he_nodes):
                _validate_semantic_id(errors, f"hyperedges[{i}].nodes[{j}]", ref)

    return errors
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def load_validated_semantic_fragment(path: Path) -> tuple[dict | None, list[str]]:
|
|
121
|
-
"""Load and validate a semantic chunk, rejecting oversize files before parsing.
|
|
122
|
-
|
|
123
|
-
The size guard runs against `path.stat().st_size` so an attacker-supplied
|
|
124
|
-
multi-gigabyte chunk file cannot blow up memory at `read_text()` time.
|
|
125
|
-
JSON decode errors are returned as validation errors rather than raised,
|
|
126
|
-
so callers can `continue` past bad chunks without a try/except.
|
|
127
|
-
"""
|
|
128
|
-
try:
|
|
129
|
-
size = path.stat().st_size
|
|
130
|
-
except OSError as exc:
|
|
131
|
-
return None, [f"could not stat {path}: {exc}"]
|
|
132
|
-
if size > MAX_SEMANTIC_FRAGMENT_BYTES:
|
|
133
|
-
return None, [f"payload is {size} bytes; max is {MAX_SEMANTIC_FRAGMENT_BYTES}"]
|
|
134
|
-
try:
|
|
135
|
-
fragment = json.loads(path.read_text(encoding="utf-8"))
|
|
136
|
-
except json.JSONDecodeError as exc:
|
|
137
|
-
return None, [f"invalid JSON: {exc}"]
|
|
138
|
-
except OSError as exc:
|
|
139
|
-
return None, [f"could not read {path}: {exc}"]
|
|
140
|
-
errors = validate_semantic_fragment(fragment)
|
|
141
|
-
return (None, errors) if errors else (fragment, [])
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def _validate_semantic_id(errors: list[str], field: str, value: object) -> None:
|
|
145
|
-
if not isinstance(value, str):
|
|
146
|
-
errors.append(f"{field} must be a string")
|
|
147
|
-
return
|
|
148
|
-
if not value:
|
|
149
|
-
errors.append(f"{field} must not be empty")
|
|
150
|
-
return
|
|
151
|
-
if len(value) > MAX_SEMANTIC_ID_LENGTH:
|
|
152
|
-
errors.append(f"{field} is {len(value)} chars; max is {MAX_SEMANTIC_ID_LENGTH}")
|
|
153
|
-
if "/" in value or "\\" in value or ".." in value:
|
|
154
|
-
errors.append(f"{field} must not contain path separators or '..'")
|
|
155
|
-
if not _SEMANTIC_ID_RE.fullmatch(value):
|
|
156
|
-
errors.append(f"{field} contains unsupported characters")
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def sanitize_semantic_fragment(fragment: dict) -> dict:
    """Clean up a semantic extraction fragment in-place.

    Operations:
    1. Removes nodes with ``file_type: "rationale"`` or ``file_type: "concept"``
       that were emitted by an LLM (these are not valid semantic entity types).
    2. Detects nodes whose label reads like a sentence / rationale paragraph
       AND that participate in a ``rationale_for`` edge, then converts the
       label into a ``rationale`` attribute on the target node and removes
       the source-node + its edges. The ``rationale_for`` edge signal applies
       regardless of the source node's ``file_type`` — sentence-like nodes
       with allowed types (``document``, ``code``) are still cleaned up when
       they're explicitly marked as rationale.
    3. Strips nodes that have no ``id`` (they cannot be referenced — likely
       LLM hallucination).
    4. Filters hyperedges so they cannot reference removed or unknown node
       IDs after the cleanup passes above. A hyperedge with fewer than two
       surviving members is dropped.

    A ``nodes``, ``edges`` or ``hyperedges`` key that is missing or holds
    ``None`` is treated as an empty list. On return all three keys are
    always present and hold lists.

    Returns the same dict for convenience.
    """
    _invalid_ft = frozenset({"rationale", "concept"})

    # `or []` (rather than a .get default) also covers an explicit JSON null.
    nodes: list[dict] = fragment.get("nodes") or []
    edges: list[dict] = fragment.get("edges") or []
    hyperedges: list[dict] = fragment.get("hyperedges") or []

    # ---- build lookup maps --------------------------------------------------
    node_by_id: dict[str, dict] = {}
    for n in nodes:
        nid = n.get("id", "")
        if nid:
            node_by_id[nid] = n

    # Index `rationale_for` edges by source in a single pass. The keys are the
    # candidates for sentence-like cleanup even when file_type is allowed; the
    # values (targets, in edge order) are reused in pass 2 so the edge list is
    # not rescanned once per candidate.
    rationale_targets: dict[str, list] = {}
    for e in edges:
        if e.get("relation") == "rationale_for":
            src = e.get("source", "")
            if src:
                rationale_targets.setdefault(src, []).append(e.get("target"))

    # ---- pass 1: identify nodes to remove + rationale candidates -----------
    rationale_candidates: list[dict] = []
    remove_ids: set[str] = set()
    keep_nodes: list[dict] = []
    for n in nodes:
        nid = n.get("id", "")
        if not nid:
            # Node without an id cannot be referenced — discard.
            continue
        ft = n.get("file_type", "")
        label = n.get("label", "")
        if ft in _invalid_ft:
            # Explicitly-invalid file_type ("rationale" or "concept"): if
            # the label looks like a sentence we may convert to attribute.
            if _is_sentence_like_rationale_label(label):
                rationale_candidates.append(n)
            remove_ids.add(nid)
            continue
        if nid in rationale_targets and _is_sentence_like_rationale_label(label):
            # Allowed file_type, but the node sources a `rationale_for` edge
            # AND its label is sentence-like prose. Treat it as rationale
            # cleanup material rather than a real graph entity.
            rationale_candidates.append(n)
            remove_ids.add(nid)
            continue
        keep_nodes.append(n)

    # ---- pass 2: convert sentence-nodes → rationale attributes --------------
    # Only `rationale_for` edges propagate the rationale text. Other outgoing
    # edges (e.g. references, conceptually_related_to) are NOT used as
    # attribute-propagation paths — that would corrupt unrelated nodes by
    # attaching rationale meant for a different target.
    rationale_attrs: dict[str, list[str]] = {}
    for rn in rationale_candidates:
        text = rn.get("label", "").strip()
        for target_id in rationale_targets.get(rn.get("id", ""), ()):
            # Skip targets that do not exist or are themselves being removed.
            if target_id not in node_by_id or target_id in remove_ids:
                continue
            rationale_attrs.setdefault(target_id, []).append(text)

    for target_id, texts in rationale_attrs.items():
        _append_rationale_attr(node_by_id[target_id], texts)

    # ---- pass 3: strip edges referencing removed nodes ----------------------
    keep_edges: list[dict] = [
        e
        for e in edges
        if e.get("source", "") not in remove_ids
        and e.get("target", "") not in remove_ids
    ]

    # ---- pass 4: filter hyperedges to surviving node IDs --------------------
    surviving_ids: set[str] = {n.get("id", "") for n in keep_nodes}
    surviving_ids.discard("")
    keep_hyperedges: list[dict] = []
    for he in hyperedges:
        if not isinstance(he, dict):
            continue
        he_nodes = he.get("nodes")
        if not isinstance(he_nodes, list):
            continue
        filtered = [ref for ref in he_nodes if isinstance(ref, str) and ref in surviving_ids]
        if len(filtered) < 2:
            # A hyperedge needs at least two surviving members to be meaningful.
            continue
        if len(filtered) != len(he_nodes):
            # Copy before trimming so the caller's original hyperedge dict
            # is not mutated.
            he = dict(he)
            he["nodes"] = filtered
        keep_hyperedges.append(he)

    fragment["nodes"] = keep_nodes
    fragment["edges"] = keep_edges
    fragment["hyperedges"] = keep_hyperedges
    return fragment
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
def _is_sentence_like_rationale_label(label: str) -> bool:
|
|
287
|
-
"""Return True if *label* looks like prose / rationale text rather than an
|
|
288
|
-
entity or concept name.
|
|
289
|
-
|
|
290
|
-
Heuristics (no false positives on short-concept-edge-cases):
|
|
291
|
-
- Longer than *_RATIONALE_MIN_CHARS* chars, OR
|
|
292
|
-
- At least *_RATIONALE_MIN_WORDS* whitespace-delimited tokens, AND
|
|
293
|
-
- Contains at least one sentence-ending punctuation mark (``. ! ?``) or a
|
|
294
|
-
colon (common in "Decision: ..." rationales).
|
|
295
|
-
"""
|
|
296
|
-
if not label:
|
|
297
|
-
return False
|
|
298
|
-
label = label.strip()
|
|
299
|
-
if len(label) < _RATIONALE_MIN_CHARS:
|
|
300
|
-
word_count = len(label.split())
|
|
301
|
-
if word_count < _RATIONALE_MIN_WORDS:
|
|
302
|
-
return False
|
|
303
|
-
# Must look like actual prose: has sentence-ending punctuation or a colon.
|
|
304
|
-
return bool(re.search(r"[.!?:]", label))
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
def _append_rationale_attr(node: dict, texts: list[str]) -> None:
|
|
308
|
-
"""Append one or more rationale strings to *node*'s ``rationale`` attribute.
|
|
309
|
-
|
|
310
|
-
If the attribute already exists the new texts are appended with a
|
|
311
|
-
double-newline separator so downstream consumers can distinguish distinct
|
|
312
|
-
rationale fragments.
|
|
313
|
-
"""
|
|
314
|
-
existing = node.get("rationale", "")
|
|
315
|
-
new_text = "\n\n".join(texts).strip()
|
|
316
|
-
if existing:
|
|
317
|
-
node["rationale"] = existing + "\n\n" + new_text
|
|
318
|
-
else:
|
|
319
|
-
node["rationale"] = new_text
|