@oriro/orirocli 0.1.9 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. package/README.md +16 -18
  2. package/dist/cli.js +4776 -2964
  3. package/package.json +2 -2
  4. package/skills/craft/ai-engineering/SKILL.md +2 -2
  5. package/skills/graphify/SKILL.md +0 -619
  6. package/skills/graphify/__init__.py +0 -28
  7. package/skills/graphify/__main__.py +0 -4582
  8. package/skills/graphify/affected.py +0 -154
  9. package/skills/graphify/always_on/agents-md.md +0 -12
  10. package/skills/graphify/always_on/antigravity-rules.md +0 -14
  11. package/skills/graphify/always_on/claude-md.md +0 -9
  12. package/skills/graphify/always_on/gemini-md.md +0 -9
  13. package/skills/graphify/always_on/kiro-steering.md +0 -5
  14. package/skills/graphify/always_on/vscode-instructions.md +0 -17
  15. package/skills/graphify/analyze.py +0 -724
  16. package/skills/graphify/benchmark.py +0 -155
  17. package/skills/graphify/build.py +0 -487
  18. package/skills/graphify/cache.py +0 -417
  19. package/skills/graphify/callflow_html.py +0 -2020
  20. package/skills/graphify/cluster.py +0 -272
  21. package/skills/graphify/command-kilo.md +0 -15
  22. package/skills/graphify/dedup.py +0 -429
  23. package/skills/graphify/detect.py +0 -1379
  24. package/skills/graphify/diagnostics.py +0 -390
  25. package/skills/graphify/export.py +0 -1408
  26. package/skills/graphify/extract.py +0 -11570
  27. package/skills/graphify/global_graph.py +0 -159
  28. package/skills/graphify/google_workspace.py +0 -223
  29. package/skills/graphify/hooks.py +0 -457
  30. package/skills/graphify/ingest.py +0 -331
  31. package/skills/graphify/llm.py +0 -1896
  32. package/skills/graphify/manifest.py +0 -4
  33. package/skills/graphify/mcp_ingest.py +0 -392
  34. package/skills/graphify/multigraph_compat.py +0 -212
  35. package/skills/graphify/pg_introspect.py +0 -142
  36. package/skills/graphify/prs.py +0 -748
  37. package/skills/graphify/querylog.py +0 -70
  38. package/skills/graphify/report.py +0 -218
  39. package/skills/graphify/scip_ingest.py +0 -363
  40. package/skills/graphify/security.py +0 -336
  41. package/skills/graphify/semantic_cleanup.py +0 -319
  42. package/skills/graphify/serve.py +0 -1309
  43. package/skills/graphify/skill-aider.md +0 -1246
  44. package/skills/graphify/skill-amp.md +0 -613
  45. package/skills/graphify/skill-claw.md +0 -616
  46. package/skills/graphify/skill-codex.md +0 -613
  47. package/skills/graphify/skill-copilot.md +0 -616
  48. package/skills/graphify/skill-devin.md +0 -1372
  49. package/skills/graphify/skill-droid.md +0 -613
  50. package/skills/graphify/skill-kilo.md +0 -625
  51. package/skills/graphify/skill-kiro.md +0 -615
  52. package/skills/graphify/skill-opencode.md +0 -608
  53. package/skills/graphify/skill-pi.md +0 -615
  54. package/skills/graphify/skill-trae.md +0 -614
  55. package/skills/graphify/skill-vscode.md +0 -612
  56. package/skills/graphify/skill-windows.md +0 -651
  57. package/skills/graphify/skills/amp/references/add-watch.md +0 -56
  58. package/skills/graphify/skills/amp/references/exports.md +0 -71
  59. package/skills/graphify/skills/amp/references/extraction-spec.md +0 -68
  60. package/skills/graphify/skills/amp/references/github-and-merge.md +0 -46
  61. package/skills/graphify/skills/amp/references/hooks.md +0 -33
  62. package/skills/graphify/skills/amp/references/query.md +0 -249
  63. package/skills/graphify/skills/amp/references/transcribe.md +0 -48
  64. package/skills/graphify/skills/amp/references/update.md +0 -179
  65. package/skills/graphify/skills/claude/references/add-watch.md +0 -56
  66. package/skills/graphify/skills/claude/references/exports.md +0 -71
  67. package/skills/graphify/skills/claude/references/extraction-spec.md +0 -68
  68. package/skills/graphify/skills/claude/references/github-and-merge.md +0 -46
  69. package/skills/graphify/skills/claude/references/hooks.md +0 -33
  70. package/skills/graphify/skills/claude/references/query.md +0 -103
  71. package/skills/graphify/skills/claude/references/transcribe.md +0 -48
  72. package/skills/graphify/skills/claude/references/update.md +0 -179
  73. package/skills/graphify/skills/claw/references/add-watch.md +0 -56
  74. package/skills/graphify/skills/claw/references/exports.md +0 -71
  75. package/skills/graphify/skills/claw/references/extraction-spec.md +0 -29
  76. package/skills/graphify/skills/claw/references/github-and-merge.md +0 -46
  77. package/skills/graphify/skills/claw/references/hooks.md +0 -33
  78. package/skills/graphify/skills/claw/references/query.md +0 -249
  79. package/skills/graphify/skills/claw/references/transcribe.md +0 -48
  80. package/skills/graphify/skills/claw/references/update.md +0 -179
  81. package/skills/graphify/skills/codex/references/add-watch.md +0 -56
  82. package/skills/graphify/skills/codex/references/exports.md +0 -71
  83. package/skills/graphify/skills/codex/references/extraction-spec.md +0 -29
  84. package/skills/graphify/skills/codex/references/github-and-merge.md +0 -46
  85. package/skills/graphify/skills/codex/references/hooks.md +0 -33
  86. package/skills/graphify/skills/codex/references/query.md +0 -249
  87. package/skills/graphify/skills/codex/references/transcribe.md +0 -48
  88. package/skills/graphify/skills/codex/references/update.md +0 -179
  89. package/skills/graphify/skills/copilot/references/add-watch.md +0 -56
  90. package/skills/graphify/skills/copilot/references/exports.md +0 -71
  91. package/skills/graphify/skills/copilot/references/extraction-spec.md +0 -68
  92. package/skills/graphify/skills/copilot/references/github-and-merge.md +0 -46
  93. package/skills/graphify/skills/copilot/references/hooks.md +0 -33
  94. package/skills/graphify/skills/copilot/references/query.md +0 -249
  95. package/skills/graphify/skills/copilot/references/transcribe.md +0 -48
  96. package/skills/graphify/skills/copilot/references/update.md +0 -179
  97. package/skills/graphify/skills/droid/references/add-watch.md +0 -56
  98. package/skills/graphify/skills/droid/references/exports.md +0 -71
  99. package/skills/graphify/skills/droid/references/extraction-spec.md +0 -68
  100. package/skills/graphify/skills/droid/references/github-and-merge.md +0 -46
  101. package/skills/graphify/skills/droid/references/hooks.md +0 -33
  102. package/skills/graphify/skills/droid/references/query.md +0 -249
  103. package/skills/graphify/skills/droid/references/transcribe.md +0 -48
  104. package/skills/graphify/skills/droid/references/update.md +0 -179
  105. package/skills/graphify/skills/kilo/references/add-watch.md +0 -56
  106. package/skills/graphify/skills/kilo/references/exports.md +0 -71
  107. package/skills/graphify/skills/kilo/references/extraction-spec.md +0 -68
  108. package/skills/graphify/skills/kilo/references/github-and-merge.md +0 -46
  109. package/skills/graphify/skills/kilo/references/hooks.md +0 -33
  110. package/skills/graphify/skills/kilo/references/query.md +0 -249
  111. package/skills/graphify/skills/kilo/references/transcribe.md +0 -48
  112. package/skills/graphify/skills/kilo/references/update.md +0 -179
  113. package/skills/graphify/skills/kiro/references/add-watch.md +0 -56
  114. package/skills/graphify/skills/kiro/references/exports.md +0 -71
  115. package/skills/graphify/skills/kiro/references/extraction-spec.md +0 -29
  116. package/skills/graphify/skills/kiro/references/github-and-merge.md +0 -46
  117. package/skills/graphify/skills/kiro/references/hooks.md +0 -33
  118. package/skills/graphify/skills/kiro/references/query.md +0 -249
  119. package/skills/graphify/skills/kiro/references/transcribe.md +0 -48
  120. package/skills/graphify/skills/kiro/references/update.md +0 -179
  121. package/skills/graphify/skills/opencode/references/add-watch.md +0 -56
  122. package/skills/graphify/skills/opencode/references/exports.md +0 -71
  123. package/skills/graphify/skills/opencode/references/extraction-spec.md +0 -68
  124. package/skills/graphify/skills/opencode/references/github-and-merge.md +0 -46
  125. package/skills/graphify/skills/opencode/references/hooks.md +0 -33
  126. package/skills/graphify/skills/opencode/references/query.md +0 -249
  127. package/skills/graphify/skills/opencode/references/transcribe.md +0 -48
  128. package/skills/graphify/skills/opencode/references/update.md +0 -179
  129. package/skills/graphify/skills/pi/references/add-watch.md +0 -56
  130. package/skills/graphify/skills/pi/references/exports.md +0 -71
  131. package/skills/graphify/skills/pi/references/extraction-spec.md +0 -29
  132. package/skills/graphify/skills/pi/references/github-and-merge.md +0 -46
  133. package/skills/graphify/skills/pi/references/hooks.md +0 -33
  134. package/skills/graphify/skills/pi/references/query.md +0 -249
  135. package/skills/graphify/skills/pi/references/transcribe.md +0 -48
  136. package/skills/graphify/skills/pi/references/update.md +0 -179
  137. package/skills/graphify/skills/trae/references/add-watch.md +0 -56
  138. package/skills/graphify/skills/trae/references/exports.md +0 -71
  139. package/skills/graphify/skills/trae/references/extraction-spec.md +0 -68
  140. package/skills/graphify/skills/trae/references/github-and-merge.md +0 -46
  141. package/skills/graphify/skills/trae/references/hooks.md +0 -35
  142. package/skills/graphify/skills/trae/references/query.md +0 -249
  143. package/skills/graphify/skills/trae/references/transcribe.md +0 -48
  144. package/skills/graphify/skills/trae/references/update.md +0 -179
  145. package/skills/graphify/skills/vscode/references/add-watch.md +0 -56
  146. package/skills/graphify/skills/vscode/references/exports.md +0 -71
  147. package/skills/graphify/skills/vscode/references/extraction-spec.md +0 -68
  148. package/skills/graphify/skills/vscode/references/github-and-merge.md +0 -46
  149. package/skills/graphify/skills/vscode/references/hooks.md +0 -33
  150. package/skills/graphify/skills/vscode/references/query.md +0 -249
  151. package/skills/graphify/skills/vscode/references/transcribe.md +0 -48
  152. package/skills/graphify/skills/vscode/references/update.md +0 -179
  153. package/skills/graphify/skills/windows/references/add-watch.md +0 -56
  154. package/skills/graphify/skills/windows/references/exports.md +0 -71
  155. package/skills/graphify/skills/windows/references/extraction-spec.md +0 -68
  156. package/skills/graphify/skills/windows/references/github-and-merge.md +0 -46
  157. package/skills/graphify/skills/windows/references/hooks.md +0 -33
  158. package/skills/graphify/skills/windows/references/query.md +0 -249
  159. package/skills/graphify/skills/windows/references/transcribe.md +0 -48
  160. package/skills/graphify/skills/windows/references/update.md +0 -179
  161. package/skills/graphify/symbol_resolution.py +0 -538
  162. package/skills/graphify/transcribe.py +0 -184
  163. package/skills/graphify/tree_html.py +0 -582
  164. package/skills/graphify/validate.py +0 -72
  165. package/skills/graphify/watch.py +0 -898
  166. package/skills/graphify/wiki.py +0 -282
@@ -1,336 +0,0 @@
1
- # Security helpers - URL validation, safe fetch, path guards, label sanitisation
2
- from __future__ import annotations
3
-
4
- import contextlib
5
- import html
6
- import re
7
- import urllib.error
8
- import urllib.parse
9
- import urllib.request
10
- from collections.abc import Mapping
11
- from pathlib import Path
12
- from typing import Any
13
-
14
- import ipaddress
15
- import socket
16
-
17
- _ALLOWED_SCHEMES = {"http", "https"}
18
- _MAX_FETCH_BYTES = 52_428_800 # 50 MB hard cap for binary downloads
19
- _MAX_TEXT_BYTES = 10_485_760 # 10 MB hard cap for HTML / text
20
-
21
- # Graph-load memory-bomb cap: reject .json files larger than this before
22
- # JSON-parsing them into a dict. Without this, a multi-gigabyte (or
23
- # specifically crafted) graph.json can exhaust process memory during
24
- # json.loads + node_link_graph rehydration.
25
- _MAX_GRAPH_FILE_BYTES = 512 * 1024 * 1024 # 512 MiB
26
-
27
- # AWS metadata, link-local, and common cloud metadata endpoints
28
- _BLOCKED_HOSTS = {"metadata.google.internal", "metadata.google.com"}
29
-
30
- # RFC 6598 Shared Address Space (CGN) -- is_private misses this on Python <3.11
31
- _CGN_NETWORK = ipaddress.ip_network("100.64.0.0/10")
32
-
33
- # RFC 6052 NAT64 Well-Known Prefix -- is_reserved=True in Python but these embed
34
- # public IPv4 addresses and are legitimate public internet traffic, not SSRF vectors.
35
- _NAT64_WKP = ipaddress.ip_network("64:ff9b::/96")
36
-
37
-
38
- # ---------------------------------------------------------------------------
39
- # URL validation
40
- # ---------------------------------------------------------------------------
41
-
42
def validate_url(url: str) -> str:
    """Return *url* unchanged once it has passed the SSRF checks.

    Checks, in order:
      1. The scheme must be http or https (rejects file://, ftp://, data:, ...).
      2. The hostname must not be a known cloud metadata endpoint.
      3. Every address the hostname resolves to must be outside the private,
         reserved, loopback, link-local and CGN ranges. NAT64 addresses are
         judged by the IPv4 address embedded in their low 32 bits.

    A URL without a hostname skips steps 2 and 3.

    Raises:
        ValueError - disallowed scheme, blocked hostname, blocked resolved
                     address, or DNS resolution failure
    """
    parts = urllib.parse.urlparse(url)
    scheme = parts.scheme
    if scheme.lower() not in _ALLOWED_SCHEMES:
        raise ValueError(
            f"Blocked URL scheme '{scheme}' - only http and https are allowed. "
            f"Got: {url!r}"
        )

    hostname = parts.hostname
    if not hostname:
        # Nothing to resolve, so there is nothing further to validate.
        return url

    if hostname.lower() in _BLOCKED_HOSTS:
        raise ValueError(
            f"Blocked cloud metadata endpoint '{hostname}'. "
            f"Got: {url!r}"
        )

    # Only the resolver call can raise gaierror, so only it sits in the try.
    try:
        resolved = socket.getaddrinfo(
            hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM
        )
    except socket.gaierror as exc:
        raise ValueError(
            f"DNS resolution failed for '{hostname}': {exc}. Got: {url!r}"
        ) from exc

    for entry in resolved:
        addr = entry[4][0]
        candidate = ipaddress.ip_address(addr)
        # For NAT64 addresses, classify the embedded IPv4 instead of the wrapper.
        if isinstance(candidate, ipaddress.IPv6Address) and candidate in _NAT64_WKP:
            candidate = ipaddress.ip_address(int(candidate) & 0xFFFFFFFF)
        blocked = (
            candidate.is_private
            or candidate.is_reserved
            or candidate.is_loopback
            or candidate.is_link_local
            or candidate in _CGN_NETWORK
        )
        if blocked:
            raise ValueError(
                f"Blocked private/internal IP {addr} (resolved from '{hostname}'). "
                f"Got: {url!r}"
            )

    return url
87
-
88
-
89
@contextlib.contextmanager
def _ssrf_guarded_socket():
    """Patch socket.getaddrinfo for the duration of a fetch to catch DNS rebinding.

    Every address returned by the resolver while the context is active is
    classified, so a DNS server cannot hand out a public IP during URL
    validation and then swap in a private IP for the actual connection
    (TOCTOU fix). The original resolver is always restored on exit.

    NAT64 addresses (64:ff9b::/96) are classified by their embedded IPv4
    address, matching validate_url. Without this, every NAT64-synthesised
    address would be rejected here as "reserved" even though validate_url
    had already accepted it.

    Not thread-safe: the patch is process-global. The original author notes
    graphify is a single-threaded CLI tool.

    Raises (from the patched resolver):
        OSError - a resolved address is private / reserved / loopback /
                  link-local / CGN
    """
    original = socket.getaddrinfo

    def _guarded(host, port, *args, **kwargs):
        results = original(host, port, *args, **kwargs)
        for info in results:
            addr = info[4][0]
            try:
                ip = ipaddress.ip_address(addr)
            except ValueError:
                # Not a parseable IP literal; nothing to classify.
                continue
            # Same NAT64 handling as validate_url: judge the embedded IPv4.
            if isinstance(ip, ipaddress.IPv6Address) and ip in _NAT64_WKP:
                ip = ipaddress.ip_address(int(ip) & 0xFFFFFFFF)
            if (
                ip.is_private
                or ip.is_reserved
                or ip.is_loopback
                or ip.is_link_local
                or ip in _CGN_NETWORK
            ):
                raise OSError(
                    f"SSRF blocked: IP {addr} resolved from '{host}' is private/reserved"
                )
        return results

    socket.getaddrinfo = _guarded
    try:
        yield
    finally:
        socket.getaddrinfo = original
118
-
119
-
120
class _NoFileRedirectHandler(urllib.request.HTTPRedirectHandler):
    """HTTP redirect handler that vets each redirect target before following it.

    Guards against open-redirect SSRF, where an http:// URL bounces the
    client to file:// or to an internal address.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # validate_url raises ValueError when the target is not acceptable,
        # which aborts the redirect before any request is built.
        validate_url(newurl)
        follow = super().redirect_request
        return follow(req, fp, code, msg, headers, newurl)
130
-
131
-
132
def _build_opener() -> urllib.request.OpenerDirector:
    """Create a urllib opener whose redirects go through _NoFileRedirectHandler."""
    opener = urllib.request.build_opener(_NoFileRedirectHandler)
    return opener
134
-
135
-
136
- # ---------------------------------------------------------------------------
137
- # Safe fetch
138
- # ---------------------------------------------------------------------------
139
-
140
def safe_fetch(url: str, max_bytes: int = _MAX_FETCH_BYTES, timeout: int = 30) -> bytes:
    """Download *url* and return the response body as bytes.

    Safeguards:
      - the URL is checked by validate_url before any request is made
      - redirects are re-validated by the opener's redirect handler
      - DNS answers during the request go through _ssrf_guarded_socket
      - the body is read in 64 KiB pieces and aborted once it passes *max_bytes*
      - a non-2xx status is turned into urllib.error.HTTPError

    Raises:
        ValueError             - disallowed scheme or redirect target
        urllib.error.HTTPError - non-2xx HTTP status
        urllib.error.URLError  - DNS / connection failure
        OSError                - size cap exceeded
    """
    validate_url(url)
    opener = _build_opener()
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0 graphify/1.0"}
    )

    with _ssrf_guarded_socket():
        with opener.open(request, timeout=timeout) as response:
            # Checked by hand in case the opener did not already raise
            # for a non-2xx status.
            status = getattr(response, "status", None)
            if not status:
                status = getattr(response, "code", None)
            if status is not None and (status < 200 or status >= 300):
                raise urllib.error.HTTPError(url, status, f"HTTP {status}", {}, None)

            body = bytearray()
            received = 0
            while True:
                piece = response.read(65_536)
                if not piece:
                    break
                received += len(piece)
                if received > max_bytes:
                    raise OSError(
                        f"Response from {url!r} exceeds size limit "
                        f"({max_bytes // 1_048_576} MB). Aborting download."
                    )
                body += piece

    return bytes(body)
182
-
183
-
184
def safe_fetch_text(url: str, max_bytes: int = _MAX_TEXT_BYTES, timeout: int = 15) -> str:
    """Download *url* via safe_fetch and decode the body as UTF-8 text.

    Uses smaller size and timeout defaults than safe_fetch, suited to
    HTML / text. Bytes that are not valid UTF-8 are replaced rather than
    raising.
    """
    payload = safe_fetch(url, max_bytes=max_bytes, timeout=timeout)
    text = payload.decode("utf-8", errors="replace")
    return text
191
-
192
-
193
- # ---------------------------------------------------------------------------
194
- # Path validation
195
- # ---------------------------------------------------------------------------
196
-
197
def validate_graph_path(path: str | Path, base: Path | None = None) -> Path:
    """Resolve *path* and verify it stays inside *base*.

    When *base* is omitted it is inferred: the nearest component of the
    resolved path (the path itself, then each parent) that is named
    ``graphify-out`` is used, falling back to ``graphify-out`` under the
    current working directory. The base directory must already exist, so a
    caller cannot trick graphify into reading files before any graph has
    been built.

    Returns:
        The fully resolved path.

    Raises:
        ValueError        - path escapes base, or base does not exist
        FileNotFoundError - resolved path does not exist
    """
    resolved: Path | None = None

    if base is None:
        resolved = Path(path).resolve()
        # Walk from the path itself outwards; the innermost match wins.
        base = next(
            (
                candidate
                for candidate in (resolved, *resolved.parents)
                if candidate.name == "graphify-out"
            ),
            None,
        )
        if base is None:
            base = Path("graphify-out").resolve()

    base = base.resolve()
    if not base.exists():
        raise ValueError(
            f"Graph base directory does not exist: {base}. "
            "Run /graphify first to build the graph."
        )

    # Resolve lazily so an explicit, missing *base* is still reported first.
    if resolved is None:
        resolved = Path(path).resolve()

    try:
        resolved.relative_to(base)
    except ValueError:
        # The relative_to() error is an implementation detail; suppress it so
        # callers only see the containment message.
        raise ValueError(
            f"Path {path!r} escapes the allowed directory {base}. "
            "Only paths inside graphify-out/ are permitted."
        ) from None

    if not resolved.exists():
        raise FileNotFoundError(f"Graph file not found: {resolved}")

    return resolved
237
-
238
-
239
def check_graph_file_size_cap(path: Path) -> None:
    """Fail fast when *path* is larger than ``_MAX_GRAPH_FILE_BYTES``.

    Intended to run before a graph file is read and JSON-parsed, so an
    oversized file is refused without being loaded. If the file cannot be
    stat'ed the function returns quietly; reporting a missing or unreadable
    path is left to the caller's own checks.

    Raises:
        ValueError - the file is larger than the cap; the message carries
                     both the observed size and the cap.
    """
    try:
        info = path.stat()
    except OSError:
        return

    observed = info.st_size
    if observed <= _MAX_GRAPH_FILE_BYTES:
        return

    raise ValueError(
        f"graph file {path} is {observed:_d} bytes, "
        f"exceeds {_MAX_GRAPH_FILE_BYTES:_d}-byte cap"
    )
260
-
261
-
262
- # ---------------------------------------------------------------------------
263
- # Label sanitisation (mirrors code-review-graph's _sanitize_name pattern)
264
- # ---------------------------------------------------------------------------
265
-
266
# ASCII control characters: 0x00-0x1F plus DEL (0x7F).
_CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f]")
# Longest label kept after sanitisation.
_MAX_LABEL_LEN = 256


def sanitize_label(text: str | None) -> str:
    """Return *text* with control characters removed, cut to ``_MAX_LABEL_LEN``.

    ``None`` becomes the empty string; any other value is passed through
    ``str()`` first. No HTML escaping is performed here, so wrap the result
    with html.escape() before injecting it directly into HTML.
    """
    if text is None:
        return ""
    cleaned = _CONTROL_CHAR_RE.sub("", str(text))
    # Slicing leaves shorter strings untouched, so no length test is needed.
    return cleaned[:_MAX_LABEL_LEN]
282
-
283
-
284
- # ---------------------------------------------------------------------------
285
- # Metadata sanitisation (recursive, bounded, HTML-safe)
286
- # ---------------------------------------------------------------------------
287
-
288
# Longest sanitised metadata string, measured after HTML escaping.
_METADATA_MAX_VALUE_LEN = 512
# Maximum number of items kept from a metadata list or tuple.
_METADATA_MAX_LIST_ITEMS = 50


def _sanitize_metadata_string(value: object) -> str:
    """Convert *value* to a control-character-free, HTML-escaped, capped string.

    The value is stringified, stripped of control characters, HTML-escaped
    (quotes included), and finally cut to ``_METADATA_MAX_VALUE_LEN``
    characters. The cut is applied to the escaped text, so it can fall in the
    middle of an entity.
    """
    stripped = _CONTROL_CHAR_RE.sub("", str(value))
    escaped = html.escape(stripped, quote=True)
    return escaped[:_METADATA_MAX_VALUE_LEN]
299
-
300
-
301
- def _sanitize_metadata_value(value: object) -> object:
302
- """Sanitize a metadata value while preserving simple JSON-compatible types."""
303
- if isinstance(value, bool):
304
- # bool is a subclass of int — must be checked first to avoid coercion.
305
- return value
306
- if isinstance(value, str):
307
- return _sanitize_metadata_string(value)
308
- if isinstance(value, dict):
309
- return sanitize_metadata(value)
310
- if isinstance(value, (list, tuple)):
311
- return [_sanitize_metadata_value(item) for item in value[:_METADATA_MAX_LIST_ITEMS]]
312
- if isinstance(value, (int, float)) or value is None:
313
- return value
314
- return _sanitize_metadata_string(value)
315
-
316
-
317
def sanitize_metadata(metadata: Mapping[str, Any] | None) -> dict[str, object]:
    """Return a sanitized copy of *metadata* for graph export.

    Metadata is looser than node labels: it may hold nested dicts, lists,
    source snippets, external index symbols and docstring text. Each key is
    cleaned with _sanitize_metadata_string and each value with
    _sanitize_metadata_value. Entries whose key is empty after cleaning are
    dropped; if two keys clean to the same string, the later value wins.
    ``None`` yields an empty dict.
    """
    if metadata is None:
        return {}

    # Keys are cleaned lazily, one entry at a time, just before their value.
    cleaned_pairs = (
        (_sanitize_metadata_string(raw_key), raw_value)
        for raw_key, raw_value in metadata.items()
    )
    return {
        key: _sanitize_metadata_value(raw_value)
        for key, raw_value in cleaned_pairs
        if key
    }
@@ -1,319 +0,0 @@
1
- # Semantic fragment sanitizer — converts sentence-like rationale nodes into
2
- # attributes on related nodes and removes invalid file_type values.
3
- #
4
- # Currently called from the skill merge scripts (skill-opencode.md,
5
- # skill-codex.md) so that rationale text never leaks into the knowledge
6
- # graph as standalone nodes. (Future: graphify.llm may wire this into
7
- # _parse_llm_json / _merge_into for non-skill code paths; not done in
8
- # this cycle.)
9
- from __future__ import annotations
10
-
11
- import json
12
- import re
13
- from pathlib import Path
14
-
15
# A label is a candidate for sentence-like rationale text (rather than an
# entity name) once it exceeds this many characters or reaches this many words.
_RATIONALE_MIN_CHARS = 80
_RATIONALE_MIN_WORDS = 8

# Limits applied to untrusted semantic-fragment payloads by
# validate_semantic_fragment(). Issue #825: JSON returned by OpenCode and
# Codex agents needs a Python enforcement boundary so a malicious or runaway
# agent response cannot exhaust memory or escape the graphify-out chunk
# directory via crafted node/edge IDs.
MAX_SEMANTIC_FRAGMENT_BYTES = 25 * 1024**2  # 25 MiB
MAX_SEMANTIC_FRAGMENT_NODES = 10_000
MAX_SEMANTIC_FRAGMENT_EDGES = 100_000
MAX_SEMANTIC_FRAGMENT_HYPEREDGES = 10_000
MAX_SEMANTIC_HYPEREDGE_NODES = 256
MAX_SEMANTIC_ID_LENGTH = 256
VALID_SEMANTIC_FILE_TYPES = frozenset(
    ("code", "document", "paper", "image", "rationale", "concept")
)
# IDs may contain only ASCII letters, digits, '.', '_', ':' and '-'.
_SEMANTIC_ID_RE = re.compile(r"^[A-Za-z0-9._:-]+$")
33
-
34
-
35
def validate_semantic_fragment(fragment: object) -> list[str]:
    """Return validation errors for an untrusted semantic extraction fragment.

    An empty list means the fragment is valid. Intended to run before
    sanitize_semantic_fragment() so malformed or malicious agent JSON is
    rejected before it touches the graph. The parameter is typed ``object``
    (not ``dict``) because arbitrary deserialized JSON may be passed in; the
    first check rejects anything that is not a dict.

    Checks performed:
      - the fragment is JSON-serializable and within MAX_SEMANTIC_FRAGMENT_BYTES
      - ``nodes`` / ``edges`` / ``hyperedges`` are lists within their count caps
        (``hyperedges`` may also be missing or null)
      - every node, edge endpoint and hyperedge member ID passes
        _validate_semantic_id
      - every node ``file_type``, when present, is one of
        VALID_SEMANTIC_FILE_TYPES

    This function reports problems through the returned list; it does not
    raise for malformed input.
    """
    if not isinstance(fragment, dict):
        return ["fragment must be a JSON object"]

    errors: list[str] = []
    try:
        payload = json.dumps(fragment, ensure_ascii=False).encode("utf-8")
    except (TypeError, ValueError, RecursionError) as exc:
        # RecursionError: pathologically deep nesting must be reported as a
        # validation error, not crash the validator.
        return [f"fragment is not JSON-serializable: {exc}"]

    if len(payload) > MAX_SEMANTIC_FRAGMENT_BYTES:
        errors.append(f"payload is {len(payload)} bytes; max is {MAX_SEMANTIC_FRAGMENT_BYTES}")

    nodes = fragment.get("nodes", [])
    edges = fragment.get("edges", [])
    if not isinstance(nodes, list):
        errors.append("nodes must be a list")
        nodes = []
    elif len(nodes) > MAX_SEMANTIC_FRAGMENT_NODES:
        errors.append(f"nodes has {len(nodes)} entries; max is {MAX_SEMANTIC_FRAGMENT_NODES}")

    if not isinstance(edges, list):
        errors.append("edges must be a list")
        edges = []
    elif len(edges) > MAX_SEMANTIC_FRAGMENT_EDGES:
        errors.append(f"edges has {len(edges)} entries; max is {MAX_SEMANTIC_FRAGMENT_EDGES}")

    for i, node in enumerate(nodes):
        if not isinstance(node, dict):
            errors.append(f"nodes[{i}] must be an object")
            continue
        _validate_semantic_id(errors, f"nodes[{i}].id", node.get("id"))
        file_type = node.get("file_type")
        if file_type is None:
            continue
        # The isinstance guard matters: an unhashable value (list/dict from
        # untrusted JSON) would make the frozenset membership test raise
        # TypeError instead of yielding a validation error.
        if not (isinstance(file_type, str) and file_type in VALID_SEMANTIC_FILE_TYPES):
            errors.append(
                f"nodes[{i}].file_type {file_type!r} is not one of "
                f"{sorted(VALID_SEMANTIC_FILE_TYPES)}"
            )

    for i, edge in enumerate(edges):
        if not isinstance(edge, dict):
            errors.append(f"edges[{i}] must be an object")
            continue
        _validate_semantic_id(errors, f"edges[{i}].source", edge.get("source"))
        _validate_semantic_id(errors, f"edges[{i}].target", edge.get("target"))

    hyperedges = fragment.get("hyperedges", [])
    if hyperedges is None:
        # An explicit null is treated the same as a missing key.
        hyperedges = []
    if not isinstance(hyperedges, list):
        errors.append("hyperedges must be a list")
    else:
        if len(hyperedges) > MAX_SEMANTIC_FRAGMENT_HYPEREDGES:
            errors.append(
                f"hyperedges has {len(hyperedges)} entries; "
                f"max is {MAX_SEMANTIC_FRAGMENT_HYPEREDGES}"
            )
        for i, he in enumerate(hyperedges):
            if not isinstance(he, dict):
                errors.append(f"hyperedges[{i}] must be an object")
                continue
            _validate_semantic_id(errors, f"hyperedges[{i}].id", he.get("id"))
            he_nodes = he.get("nodes")
            if not isinstance(he_nodes, list):
                errors.append(f"hyperedges[{i}].nodes must be a list")
                continue
            if len(he_nodes) > MAX_SEMANTIC_HYPEREDGE_NODES:
                errors.append(
                    f"hyperedges[{i}].nodes has {len(he_nodes)} entries; "
                    f"max is {MAX_SEMANTIC_HYPEREDGE_NODES}"
                )
            for j, ref in enumerate(he_nodes):
                _validate_semantic_id(errors, f"hyperedges[{i}].nodes[{j}]", ref)

    return errors
118
-
119
-
120
def load_validated_semantic_fragment(path: Path) -> tuple[dict | None, list[str]]:
    """Load and validate a semantic chunk, rejecting oversize files before parsing.

    The size guard runs against ``path.stat().st_size`` so an
    attacker-supplied multi-gigabyte chunk file is refused before
    ``read_text()`` is attempted. Every failure mode of reading and parsing
    (I/O error, invalid UTF-8, invalid JSON, excessively deep nesting) is
    returned as a validation error rather than raised, so callers can
    ``continue`` past bad chunks without a try/except.

    Returns:
        ``(fragment, [])`` when the file loads and validates cleanly,
        otherwise ``(None, errors)`` with at least one error message.
    """
    try:
        size = path.stat().st_size
    except OSError as exc:
        return None, [f"could not stat {path}: {exc}"]
    if size > MAX_SEMANTIC_FRAGMENT_BYTES:
        return None, [f"payload is {size} bytes; max is {MAX_SEMANTIC_FRAGMENT_BYTES}"]
    try:
        fragment = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        return None, [f"invalid JSON: {exc}"]
    except UnicodeDecodeError as exc:
        # read_text() raises this for non-UTF-8 bytes; it is not a
        # JSONDecodeError, so it needs its own handler.
        return None, [f"could not decode {path} as UTF-8: {exc}"]
    except RecursionError as exc:
        # json.loads raises this for pathologically deep nesting.
        return None, [f"invalid JSON: nesting too deep ({exc})"]
    except OSError as exc:
        return None, [f"could not read {path}: {exc}"]
    errors = validate_semantic_fragment(fragment)
    return (None, errors) if errors else (fragment, [])
142
-
143
-
144
- def _validate_semantic_id(errors: list[str], field: str, value: object) -> None:
145
- if not isinstance(value, str):
146
- errors.append(f"{field} must be a string")
147
- return
148
- if not value:
149
- errors.append(f"{field} must not be empty")
150
- return
151
- if len(value) > MAX_SEMANTIC_ID_LENGTH:
152
- errors.append(f"{field} is {len(value)} chars; max is {MAX_SEMANTIC_ID_LENGTH}")
153
- if "/" in value or "\\" in value or ".." in value:
154
- errors.append(f"{field} must not contain path separators or '..'")
155
- if not _SEMANTIC_ID_RE.fullmatch(value):
156
- errors.append(f"{field} contains unsupported characters")
157
-
158
-
159
def sanitize_semantic_fragment(fragment: dict) -> dict:
    """Clean up a semantic extraction fragment in-place.

    Operations:
      1. Removes nodes with ``file_type: "rationale"`` or ``file_type: "concept"``
         that were emitted by an LLM (these are not valid semantic entity types).
      2. Detects nodes whose label reads like a sentence / rationale paragraph
         AND that participate in a ``rationale_for`` edge, then converts the
         label into a ``rationale`` attribute on the target node and removes
         the source-node + its edges. The ``rationale_for`` edge signal applies
         regardless of the source node's ``file_type`` — sentence-like nodes
         with allowed types (``document``, ``code``) are still cleaned up when
         they're explicitly marked as rationale.
      3. Strips nodes that have no (or an empty) ``id`` — they cannot be
         referenced by any edge and are likely LLM hallucinations.
      4. Filters hyperedges so they cannot reference removed or unknown node
         IDs after the cleanup passes above. A hyperedge with fewer than two
         surviving members is dropped.

    A missing or ``null`` ``nodes`` / ``edges`` / ``hyperedges`` entry is
    treated as an empty list; all three keys are always written back as lists.

    Args:
        fragment: The fragment dict to clean; it is mutated.

    Returns:
        The same *fragment* object, for convenience.
    """
    _invalid_ft = frozenset({"rationale", "concept"})

    # ``dict.get`` only falls back to its default when the key is absent, so an
    # explicit ``null`` still needs the ``or []`` guard (as hyperedges had).
    nodes: list[dict] = fragment.get("nodes") or []
    edges: list[dict] = fragment.get("edges") or []
    hyperedges: list[dict] = fragment.get("hyperedges") or []

    # ---- build lookup maps --------------------------------------------------
    node_by_id: dict[str, dict] = {}
    for n in nodes:
        nid = n.get("id", "")
        if nid:
            node_by_id[nid] = n

    # Index the targets of every `rationale_for` edge by its source id. The
    # keys double as the set of nodes that are candidates for sentence-like
    # cleanup even when file_type is allowed, and the per-source lists let
    # pass 2 look targets up directly instead of rescanning every edge for
    # every candidate. Edge order is preserved within each list.
    rationale_targets_by_source: dict[str, list] = {}
    for e in edges:
        if e.get("relation") == "rationale_for":
            src = e.get("source", "")
            if src:
                rationale_targets_by_source.setdefault(src, []).append(e.get("target"))

    # ---- pass 1: identify nodes to remove + rationale candidates -----------
    rationale_candidates: list[dict] = []
    remove_ids: set[str] = set()
    keep_nodes: list[dict] = []
    for n in nodes:
        nid = n.get("id", "")
        if not nid:
            # Node without an id cannot be referenced — discard.
            continue
        ft = n.get("file_type", "")
        label = n.get("label", "")
        if ft in _invalid_ft:
            # Explicitly-invalid file_type ("rationale" or "concept"): the
            # node is always removed; if the label looks like a sentence it
            # may additionally be converted to an attribute.
            if _is_sentence_like_rationale_label(label):
                rationale_candidates.append(n)
            remove_ids.add(nid)
            continue
        if nid in rationale_targets_by_source and _is_sentence_like_rationale_label(label):
            # Allowed file_type, but the node sources a `rationale_for` edge
            # AND its label is sentence-like prose. Treat it as rationale
            # cleanup material rather than a real graph entity.
            rationale_candidates.append(n)
            remove_ids.add(nid)
            continue
        keep_nodes.append(n)

    # ---- pass 2: convert sentence-nodes → rationale attributes --------------
    # Only `rationale_for` edges propagate the rationale text. Other outgoing
    # edges (e.g. references, conceptually_related_to) are NOT used as
    # attribute-propagation paths — that would corrupt unrelated nodes by
    # attaching rationale meant for a different target.
    rationale_attrs: dict[str, list[str]] = {}
    for rn in rationale_candidates:
        text = rn.get("label", "").strip()
        for target_id in rationale_targets_by_source.get(rn.get("id", ""), ()):
            # Skip targets that do not exist or are themselves being removed.
            if target_id not in node_by_id or target_id in remove_ids:
                continue
            rationale_attrs.setdefault(target_id, []).append(text)

    # Every key was checked against node_by_id / remove_ids when it was added.
    for target_id, texts in rationale_attrs.items():
        _append_rationale_attr(node_by_id[target_id], texts)

    # ---- pass 3: strip edges referencing removed nodes ----------------------
    keep_edges: list[dict] = [
        e
        for e in edges
        if e.get("source", "") not in remove_ids and e.get("target", "") not in remove_ids
    ]

    # ---- pass 4: filter hyperedges to surviving node IDs --------------------
    # keep_nodes only ever holds nodes with a non-empty id (see pass 1).
    surviving_ids: set[str] = {n["id"] for n in keep_nodes}
    keep_hyperedges: list[dict] = []
    for he in hyperedges:
        if not isinstance(he, dict):
            continue
        he_nodes = he.get("nodes")
        if not isinstance(he_nodes, list):
            continue
        filtered = [ref for ref in he_nodes if isinstance(ref, str) and ref in surviving_ids]
        if len(filtered) < 2:
            # A hyperedge needs at least two surviving members to be meaningful.
            continue
        if len(filtered) != len(he_nodes):
            # Copy before trimming so the caller's original hyperedge dict
            # is not mutated.
            he = dict(he)
            he["nodes"] = filtered
        keep_hyperedges.append(he)

    fragment["nodes"] = keep_nodes
    fragment["edges"] = keep_edges
    fragment["hyperedges"] = keep_hyperedges
    return fragment
284
-
285
-
286
- def _is_sentence_like_rationale_label(label: str) -> bool:
287
- """Return True if *label* looks like prose / rationale text rather than an
288
- entity or concept name.
289
-
290
- Heuristics (no false positives on short-concept-edge-cases):
291
- - Longer than *_RATIONALE_MIN_CHARS* chars, OR
292
- - At least *_RATIONALE_MIN_WORDS* whitespace-delimited tokens, AND
293
- - Contains at least one sentence-ending punctuation mark (``. ! ?``) or a
294
- colon (common in "Decision: ..." rationales).
295
- """
296
- if not label:
297
- return False
298
- label = label.strip()
299
- if len(label) < _RATIONALE_MIN_CHARS:
300
- word_count = len(label.split())
301
- if word_count < _RATIONALE_MIN_WORDS:
302
- return False
303
- # Must look like actual prose: has sentence-ending punctuation or a colon.
304
- return bool(re.search(r"[.!?:]", label))
305
-
306
-
307
- def _append_rationale_attr(node: dict, texts: list[str]) -> None:
308
- """Append one or more rationale strings to *node*'s ``rationale`` attribute.
309
-
310
- If the attribute already exists the new texts are appended with a
311
- double-newline separator so downstream consumers can distinguish distinct
312
- rationale fragments.
313
- """
314
- existing = node.get("rationale", "")
315
- new_text = "\n\n".join(texts).strip()
316
- if existing:
317
- node["rationale"] = existing + "\n\n" + new_text
318
- else:
319
- node["rationale"] = new_text