agent-input-sanitizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/html.mjs ADDED
@@ -0,0 +1,937 @@
1
+ /**
2
+ * Hidden-HTML splicing (Layer 2) and exfil-URL detection (Layer 3) for
3
+ * web/HTML ingress.
4
+ *
5
+ * Layer 2 strips exactly what a human viewing the rendered page cannot see —
6
+ * HTML comments and hidden elements (hiding inline styles, `hidden` attr) —
7
+ * by splicing those byte ranges out of the original text and leaving a
8
+ * placeholder; every byte outside a spliced range is preserved verbatim (no
9
+ * re-serialization). Scripting/resource tags (script, style, svg, iframe, …)
10
+ * and `data:` URI resources are REPORTED in the result's `warned` counts but
11
+ * never removed, so fetched page source stays inspectable.
12
+ *
13
+ * Layer 3 reports data-exfil-shaped URLs (suspicious query params, oversized
14
+ * payloads, embedded credentials) without modifying them; the caller surfaces
15
+ * the report as a warning.
16
+ *
17
+ * Split into its own module so it can be lazy-loaded: pulling in the
18
+ * remark/rehype/unified graph costs ~200ms of module-load time, so the main
19
+ * entry `await import()`s this module only when its cheap regex gates match.
20
+ */
21
+ import { unified } from "unified";
22
+ import remarkParse from "remark-parse";
23
+ import remarkGfm from "remark-gfm";
24
+ import rehypeParse from "rehype-parse";
25
+ import { visit, SKIP, EXIT } from "unist-util-visit";
26
+ import styleToObject from "style-to-object";
27
+ import {
28
+ HTML_TAG_PRESENT,
29
+ MD_LINK_HINT,
30
+ SECRET_HINT,
31
+ SECRET_HINT_EXT,
32
+ matchesSecretHint,
33
+ } from "./gates.mjs";
34
+
35
+ // The cheap pre-gates live in the dependency-free `./gates.mjs` so the package
36
+ // root can re-export them without eagerly loading this module's remark/rehype
37
+ // graph. Re-exported here too so the `./html` subpath keeps exposing them.
38
+ export {
39
+ HTML_TAG_PRESENT,
40
+ MD_LINK_HINT,
41
+ SECRET_HINT,
42
+ SECRET_HINT_EXT,
43
+ matchesSecretHint,
44
+ };
45
+
46
+ // ─── Layer 2: hidden-content detection ───────────────────────────────────────
47
+
48
/**
 * True when an out-of-flow box (`position: absolute|fixed`) is either pushed
 * more than 900 units past an edge or clipped with a legacy `clip: rect(0…)`.
 * @param {(key: string) => string} val normalized style-property lookup
 * @returns {boolean}
 */
function isPositionedOffscreen(val) {
  const outOfFlow = /\babsolute\b|\bfixed\b/.test(val("position"));
  if (!outOfFlow) return false;

  const shovedAway = ["left", "top", "right", "bottom"].some((side) => {
    const offset = val(side);
    return Boolean(offset) && parseFloat(offset) < -900;
  });
  if (shovedAway) return true;

  const clipRect = val("clip");
  return Boolean(clipRect && /rect\s*\(\s*0/.test(clipRect));
}
58
+
59
/**
 * True when the box clips its overflow (`overflow: hidden`) and one of its
 * dimensions (or max-dimensions) parses to exactly zero.
 * @param {(key: string) => string} val normalized style-property lookup
 * @returns {boolean}
 */
function isOverflowHidden(val) {
  const clipsOverflow = val("overflow") === "hidden";
  if (!clipsOverflow) return false;
  return ["height", "width", "max-height", "max-width"].some((dim) => {
    const size = val(dim);
    return Boolean(size) && parseFloat(size) === 0;
  });
}
68
+
69
// CSS-wide keywords are resolved per property: `color: inherit` takes the
// parent's text color while `background: inherit` takes the parent's
// background, so two identical keywords do NOT mean "same color".
const CSS_WIDE_KEYWORD_RE = /^(?:inherit|initial|unset|revert|revert-layer)$/;

/**
 * Decide whether an inline `style` attribute hides its element from a human
 * viewing the rendered page.
 *
 * Checks, in order: `display: none`, `visibility: hidden`, zero opacity, zero
 * height/width/font-size, off-screen positioning, a large negative
 * text-indent, collapsing `clip-path`/`transform`, transparent or
 * same-as-background text color, and zero-size boxes with `overflow: hidden`.
 *
 * @param {string} styleStr raw value of the `style` attribute
 * @returns {boolean} true when any hiding rule matches; false otherwise,
 *   including when the CSS cannot be parsed or declares nothing
 */
export function isHiddenStyle(styleStr) {
  // style-to-object throws on syntactically invalid CSS; a browser would
  // ignore the broken declaration, so we do too rather than letting the
  // exception escape and suppress the entire tool output.
  let rawProps;
  try {
    // @ts-ignore -- style-to-object default export not resolved under NodeNext
    rawProps = styleToObject(styleStr);
  } catch {
    return false;
  }
  if (!rawProps) return false;

  // CSS property names are case-insensitive and `!important` is a legal
  // trailing flag; style-to-object preserves both verbatim.
  /** @type {Record<string, string>} */
  const props = {};
  for (const [key, value] of Object.entries(rawProps)) {
    props[key.toLowerCase()] = String(value).replace(
      // Bounded whitespace runs: `\s*` on both sides of an unanchored match
      // backtracks super-linearly (redos/no-vulnerable). A CSS value never
      // carries more than a couple of spaces around `!important`.
      /\s{0,8}!\s{0,8}important\s{0,8}$/i,
      "",
    );
  }

  /** @param {string} key */
  const val = (key) => (props[key] || "").toString().trim().toLowerCase();

  if (val("display") === "none") return true;
  if (val("visibility") === "hidden") return true;

  const opacity = parseFloat(val("opacity"));
  if (val("opacity") !== "" && opacity === 0) return true;

  for (const dim of ["height", "width", "font-size"]) {
    const value = val(dim);
    if (value && parseFloat(value) === 0) return true;
  }

  if (isPositionedOffscreen(val)) return true;

  const textIndent = val("text-indent");
  if (textIndent && parseFloat(textIndent) < -900) return true;

  // Clipped or scaled to nothing: modern equivalents of the legacy
  // `clip: rect(0…)` check. Only the clip shapes that collapse the box to
  // nothing are flagged — the canonical "visually hidden" utilities
  // (`inset(50%)`…`inset(100%)`, `circle(0)`) abused to hide injected text.
  // Decorative clips (`circle(50%)`, partial `inset`s, polygon shapes) render
  // visible content and are left alone. A zero scale collapses the box too.
  const clipPath = val("clip-path");
  if (
    clipPath &&
    /\b(?:inset\(\s{0,8}(?:[5-9][0-9]|100)%|circle\(\s{0,8}0(?![.\d]))/.test(
      clipPath,
    )
  )
    return true;
  const transform = val("transform");
  if (
    transform &&
    /\b(?:scale|scale3d|scalex|scaley|matrix|matrix3d)\(\s{0,8}0(?![.\d])/.test(
      transform,
    )
  )
    return true;

  // Same-color text on its background (white-on-white) and fully transparent
  // text are invisible to a human but plain text to the model.
  const color = val("color");
  if (color === "transparent") return true;
  const background = val("background-color") || val("background");
  // The `background` guard rejects the both-absent case (two empty values are
  // equal). The keyword guard rejects `inherit`/`initial`/`unset`/`revert`
  // pairs: equal keywords say nothing about the resolved colors, so treating
  // them as "same color" would strip visible content.
  if (
    background &&
    color === background &&
    !CSS_WIDE_KEYWORD_RE.test(color)
  )
    return true;

  return isOverflowHidden(val);
}
153
+
154
// Tags that load resources or run script. Their presence is counted and
// reported to the model, but their bodies are kept in the text: they are page
// source the model may legitimately need to inspect (how a page's scripts
// work, its styles, its SVGs), so unlike hidden elements they are never
// removed.
export const REPORTED_TAGS = new Set()
  .add("script")
  .add("style")
  .add("object")
  .add("embed")
  .add("iframe")
  .add("svg")
  .add("math");
167
+
168
/**
 * Whether a rendered page would not show this node: it is an element carrying
 * a `hidden` attribute, `aria-hidden="true"`, or a hiding inline style.
 * Accepts hast nodes as well as the element returned by parseHtmlTag.
 * @param {any} node
 * @returns {boolean}
 */
export function isHiddenElement(node) {
  if (node.type !== "element") return false;
  const properties = node.properties === undefined ? {} : node.properties;

  // Any present `hidden` value counts, whatever it is set to.
  const hasHiddenAttr =
    properties.hidden !== undefined && properties.hidden !== null;
  if (hasHiddenAttr) return true;

  // `aria-hidden="true"` removes the element from the accessibility tree, so
  // a human using the rendered page never perceives it while a model reading
  // raw source still does. (rehype exposes the attribute as `ariaHidden`.)
  const ariaHidden = String(properties.ariaHidden).toLowerCase();
  if (ariaHidden === "true") return true;

  return Boolean(properties.style && isHiddenStyle(properties.style));
}
186
+
187
/**
 * Whether the element's `src` is an inline `data:` URI.
 *
 * URL schemes are case-insensitive and leading whitespace before a URL is
 * ignored when it is resolved, so the match mirrors the `/^\s*data:/i` test
 * used by checkExfilUrl and urlHost instead of a case-sensitive prefix
 * compare (which `DATA:` or `" data:"` would dodge).
 * @param {any} el hast element (or any object with an optional `properties`)
 * @returns {boolean}
 */
function hasDataSrc(el) {
  const src = el.properties?.src;
  return typeof src === "string" && /^\s*data:/i.test(src);
}
194
+
195
/**
 * Parse an HTML snippet as a fragment and return the first element node the
 * tree walk encounters, or null when the snippet contains no element.
 * @param {string} htmlValue
 * @returns {any}
 */
function parseHtmlTag(htmlValue) {
  const processor = unified().use(rehypeParse, { fragment: true });
  const fragment = processor.parse(htmlValue);
  /** @type {any} */
  let found = null;
  visit(fragment, "element", (element) => {
    found = element;
    return EXIT; // the first element is all we need
  });
  return found;
}
209
+
210
/**
 * Tag name of the hidden element that `htmlValue` opens, or null.
 *
 * A closing tag (`</x>`) can never be the *start* of a hidden element, so it
 * yields null without being parsed; only opening tags drive the caller's
 * removal mode.
 * @param {string} htmlValue
 * @returns {string | null}
 */
export function isHiddenOpen(htmlValue) {
  const isClosingTag = htmlValue.startsWith("</");
  const element = isClosingTag ? null : parseHtmlTag(htmlValue);
  return element && isHiddenElement(element) ? element.tagName : null;
}
223
+
224
/**
 * Lowercased name of an HTML closing tag (`</div>` -> "div"), or null when the
 * value does not start with a well-formed closing tag.
 *
 * The accepted name charset covers custom-element and namespaced names
 * (hyphens, dots, colons, underscores), so `</foo-bar>` balances its matching
 * open. Callers treat null as "not the tag we're closing" and keep it inside
 * the surrounding removal region.
 * @param {string} htmlValue
 * @returns {string | null}
 */
export function closingTagName(htmlValue) {
  // The charset is a superset of CommonMark's closing-tag grammar, so remark
  // never emits a `</…>` html node this fails to match; the null guard is
  // defense-in-depth against a future parser/grammar change.
  const closingTag = /^<\/(?<tagName>[a-zA-Z][a-zA-Z0-9:._-]*)\s*>/.exec(
    htmlValue,
  );
  /* c8 ignore next */
  if (closingTag === null || !closingTag.groups) return null;
  const { tagName } = closingTag.groups;
  return tagName.toLowerCase();
}
243
+
244
+ // ─── Layer 2: splice engine ──────────────────────────────────────────────────
245
+
246
// Markers left in the text where a comment / hidden element was cut out.
export const COMMENT_PLACEHOLDER = "[HTML comment removed]";
export const HIDDEN_PLACEHOLDER = "[hidden HTML removed]";

/**
 * Cut every range out of `text`, leaving the placeholder for the range's kind
 * in its place; all text outside the ranges is copied through unchanged.
 *
 * Ranges are processed in ascending order of start (then end). A range that
 * starts strictly inside the previous one is folded into it, extending its
 * end when needed and keeping the earlier range's kind; ranges that merely
 * touch stay separate. The input array and its objects are not modified.
 * @param {string} text
 * @param {Array<{start: number, end: number, kind: "comment" | "hidden"}>} ranges
 * @returns {string}
 */
export function spliceRanges(text, ranges) {
  const ordered = ranges
    .slice()
    .sort((a, b) => a.start - b.start || a.end - b.end);

  /** @type {typeof ranges} */
  const merged = ordered.reduce((acc, current) => {
    const previous = acc[acc.length - 1];
    const overlaps = previous !== undefined && current.start < previous.end;
    if (!overlaps) {
      acc.push({ ...current }); // copy so extending `end` never touches input
    } else if (current.end > previous.end) {
      previous.end = current.end;
    }
    return acc;
  }, /** @type {typeof ranges} */ ([]));

  const pieces = [];
  let cursor = 0;
  for (const { start, end, kind } of merged) {
    pieces.push(
      text.slice(cursor, start),
      kind === "comment" ? COMMENT_PLACEHOLDER : HIDDEN_PLACEHOLDER,
    );
    cursor = end;
  }
  pieces.push(text.slice(cursor));
  return pieces.join("");
}
281
+
282
/**
 * Fresh, empty tally of reported-but-preserved content: per-tag counts plus
 * the number of `data:` sources seen.
 * @returns {{ tags: Record<string, number>, dataSrc: number }}
 */
function newWarned() {
  /** @type {Record<string, number>} */
  const tags = {};
  return { tags, dataSrc: 0 };
}
286
+
287
/**
 * Record one more occurrence of `tagName` in the tally (mutates `warned`).
 * @param {ReturnType<typeof newWarned>} warned
 * @param {string} tagName
 */
function countTag(warned, tagName) {
  const seenSoFar = warned.tags[tagName] || 0;
  warned.tags[tagName] = seenSoFar + 1;
}
294
+
295
/**
 * Add every count from `from` onto `into` (mutates `into` only).
 * @param {ReturnType<typeof newWarned>} into
 * @param {ReturnType<typeof newWarned>} from
 */
function mergeWarned(into, from) {
  Object.keys(from.tags).forEach((tag) => {
    into.tags[tag] = (into.tags[tag] || 0) + from.tags[tag];
  });
  into.dataSrc += from.dataSrc;
}
304
+
305
/**
 * Whether the tally holds anything to report: a `data:` source or any tag.
 * @param {ReturnType<typeof newWarned>} warned
 * @returns {boolean}
 */
function hasWarned(warned) {
  if (warned.dataSrc > 0) return true;
  return Object.keys(warned.tags).length !== 0;
}
309
+
310
/**
 * Scan raw HTML for content to strip (comments, hidden elements) and for
 * preserved tags to report.
 *
 * Each returned range is a pair of offsets into `html` taken from the node's
 * parser position, so it spans the whole comment or element, content
 * included (rehype positions cover open tag through matching close, and
 * parse5 extends an unclosed element to the end of the fragment —
 * fail-closed for truncated markup). Descendants of a stripped node are not
 * visited, so they are neither ranged nor counted.
 * @param {string} html
 * @returns {{ ranges: Array<{start: number, end: number, kind: "comment" | "hidden"}>, warned: ReturnType<typeof newWarned> }}
 */
export function scanHtmlFragment(html) {
  const tree = unified().use(rehypeParse, { fragment: true }).parse(html);
  /** @type {Array<{start: number, end: number, kind: "comment" | "hidden"}>} */
  const ranges = [];
  const warned = newWarned();

  /** @param {any} node */
  const inspect = (node) => {
    /** @type {"comment" | "hidden" | null} */
    let kind = null;
    if (node.type === "comment") kind = "comment";
    else if (isHiddenElement(node)) kind = "hidden";

    if (kind !== null) {
      /* c8 ignore start -- parse5 omits positions only on recovery-synthesized
         elements (tbody and friends), which carry no attributes and so can
         never be hidden; fail closed on the whole fragment if that assumption
         ever breaks. */
      if (!node.position) {
        ranges.length = 0;
        ranges.push({ start: 0, end: html.length, kind: "hidden" });
        return EXIT;
      }
      /* c8 ignore stop */
      const { start, end } = node.position;
      ranges.push({ start: start.offset, end: end.offset, kind });
      return SKIP; // children are inside the spliced range
    }

    if (node.type === "element") {
      if (REPORTED_TAGS.has(node.tagName)) countTag(warned, node.tagName);
      if (hasDataSrc(node)) warned.dataSrc += 1;
    }
    return undefined; // unist visit: undefined means "continue"
  };

  // @ts-ignore -- visitor returns EXIT/SKIP only on matches, undefined otherwise
  visit(tree, inspect);
  return { ranges, warned };
}
352
+
353
// Markdown parser (remark-parse plus the remark-gfm extension), built once at
// module load and reused by scanMarkdown for every call.
const mdParser = unified().use(remarkParse).use(remarkGfm);
354
+
355
/**
 * Find every `<!-- … -->` span in `value` and append it to `ranges` as a
 * "comment" range in absolute coordinates.
 *
 * Plain indexOf scanning keeps this linear (a lazy `<!--[\s\S]*?-->` regex
 * backtracks polynomially on crafted input). The terminator search begins two
 * characters into the opener so the spec's abrupt closes (`<!-->`, `<!--->`)
 * end their own comment.
 * @param {string} value
 * @param {number} base absolute offset of the start of `value`
 * @param {number} nodeEnd absolute offset of the end of the containing node
 * @param {Array<{start: number, end: number, kind: "comment" | "hidden"}>} ranges
 */
function collectCommentRanges(value, base, nodeEnd, ranges) {
  let open = value.indexOf("<!--");
  while (open !== -1) {
    const close = value.indexOf("-->", open + 2);
    /* c8 ignore start -- micromark only tokenizes inline comments WITH a
       terminator (an unterminated `<!--` in phrasing context stays literal
       text, visible to a human reader), so this is fail-closed
       defense-in-depth against a future tokenizer change. Unterminated
       comments in flow blocks are covered — parse5 handles them in
       scanHtmlFragment. */
    if (close === -1) {
      ranges.push({ start: base + open, end: nodeEnd, kind: "comment" });
      return;
    }
    /* c8 ignore stop */
    const afterClose = close + 3;
    ranges.push({
      start: base + open,
      end: base + afterClose,
      kind: "comment",
    });
    open = value.indexOf("<!--", afterClose);
  }
}
386
+
387
/**
 * Advance the hidden-region tracker by one html node seen inside a region.
 *
 * `state` is mutated in place. An opening tag with the tracked name nests one
 * level deeper. A closing tag with the tracked name pops one level, and when
 * the depth reaches zero the region `[regionStart, nodeEnd)` is emitted and
 * tracking stops (`state.tag` becomes null). Every other tag is ignored: it
 * simply stays inside the region.
 * @param {{ tag: string | null, depth: number, regionStart: number }} state
 * @param {string} value
 * @param {number} nodeEnd absolute end offset of this node
 * @param {Array<{start: number, end: number, kind: "comment" | "hidden"}>} ranges
 */
function updateHiddenState(state, value, nodeEnd, ranges) {
  const isClose = value.startsWith("</");
  if (!isClose) {
    const opened = parseHtmlTag(value);
    if (opened && opened.tagName === state.tag) state.depth += 1;
    return;
  }
  if (closingTagName(value) !== state.tag) return;
  state.depth -= 1;
  if (state.depth !== 0) return;
  ranges.push({ start: state.regionStart, end: nodeEnd, kind: "hidden" });
  state.tag = null;
}
411
+
412
// HTML void elements: they never have content or a closing tag, so a hidden
// one is a single-node removal, not the start of a region.
const VOID_TAGS = new Set([
  "area",
  "base",
  "br",
  "col",
  "embed",
  "hr",
  "img",
  "input",
  "link",
  "meta",
  "source",
  "track",
  "wbr",
]);

/**
 * Balance-walk the direct children of a markdown container node.
 *
 * Inline html is tokenized per TAG (an element's content sits in sibling
 * text nodes), which is why this walk exists instead of handing the value to
 * rehype. For each html child:
 *  - inside a tracked region, only the open/close balance is updated;
 *  - otherwise comment spans inside the value become "comment" ranges;
 *  - a hidden *void* element (`<img hidden>`, `<input hidden>`, …) becomes a
 *    single-node "hidden" range — it has no closing tag, so opening a region
 *    for it would swallow all visible text up to the container end;
 *  - any other hidden open tag starts a region that runs to its matching
 *    close, or to the container's end when unbalanced (fail-closed);
 *  - remaining open tags are counted when they are reported tags or carry a
 *    `data:` source.
 * @param {any} node markdown container whose `children` are scanned
 * @param {Array<{start: number, end: number, kind: "comment" | "hidden"}>} ranges appended to
 * @param {ReturnType<typeof newWarned>} warned mutated with the counts
 */
function scanInlineChildren(node, ranges, warned) {
  const state =
    /** @type {{ tag: string | null, depth: number, regionStart: number }} */ ({
      tag: null,
      depth: 0,
      regionStart: 0,
    });
  for (const child of node.children) {
    if (child.type !== "html") continue;
    const value = child.value;
    const base = child.position.start.offset;
    const end = child.position.end.offset;
    if (state.depth > 0) {
      updateHiddenState(state, value, end, ranges);
      continue;
    }
    // Comments can share an inline html node with neighboring constructs
    // (e.g. in a list item, `<!-- c -->!` is ONE node), so comment spans are
    // located within the value and spliced individually rather than assuming
    // the node IS the comment.
    collectCommentRanges(value, base, end, ranges);

    // A closing tag can neither open a hidden region nor be reported.
    if (value.startsWith("</")) continue;
    // Parse once; the same element answers both "hidden?" and "reported?".
    const el = parseHtmlTag(value);
    if (!el) continue;

    if (isHiddenElement(el)) {
      if (VOID_TAGS.has(el.tagName)) {
        ranges.push({ start: base, end, kind: "hidden" });
        continue;
      }
      state.tag = el.tagName;
      state.depth = 1;
      state.regionStart = base;
      continue;
    }
    if (REPORTED_TAGS.has(el.tagName)) countTag(warned, el.tagName);
    if (hasDataSrc(el)) warned.dataSrc += 1;
  }
  // Unbalanced region: strip through the end of the container (fail-closed).
  if (state.depth > 0) {
    ranges.push({
      start: state.regionStart,
      end: node.position.end.offset,
      kind: "hidden",
    });
  }
}
464
+
465
// Node types whose direct html children are flow BLOCKS: complete markup with
// tags and content together in one node value. Every other container
// (paragraph, heading, tableCell, emphasis, …) is a phrasing container whose
// html children are per-tag fragments and need the balance walk instead.
const FLOW_HTML_PARENTS = new Set()
  .add("root")
  .add("blockquote")
  .add("listItem")
  .add("footnoteDefinition");
475
+
476
/**
 * Scan markdown text for html to strip and preserved tags to report.
 *
 * Two passes over the parsed tree: flow html blocks are re-scanned with
 * scanHtmlFragment (their block-local offsets shifted into document
 * coordinates), then every non-flow container holding inline html gets the
 * per-tag balance walk.
 * @param {string} text
 * @returns {{ ranges: Array<{start: number, end: number, kind: "comment" | "hidden"}>, warned: ReturnType<typeof newWarned> }}
 */
function scanMarkdown(text) {
  const tree = mdParser.parse(text);
  /** @type {Array<{start: number, end: number, kind: "comment" | "hidden"}>} */
  const ranges = [];
  const warned = newWarned();

  // Pass 1 — flow html blocks carry complete markup, so rehype can locate
  // comments/hidden elements precisely within them.
  visit(tree, "html", (/** @type {any} */ node, _index, parent) => {
    if (!FLOW_HTML_PARENTS.has(parent?.type)) return;
    const blockStart = node.position.start.offset;
    const blockEnd = node.position.end.offset;
    const block = scanHtmlFragment(text.slice(blockStart, blockEnd));
    for (const { start, end, kind } of block.ranges) {
      ranges.push({ start: blockStart + start, end: blockStart + end, kind });
    }
    mergeWarned(warned, block.warned);
  });

  // Pass 2 — every phrasing container that holds inline html (paragraph,
  // heading, tableCell, emphasis, …), not just paragraphs, so a hidden span
  // inside a heading cannot slip through.
  visit(tree, (/** @type {any} */ node) => {
    const isPhrasingContainer =
      !FLOW_HTML_PARENTS.has(node.type) && Array.isArray(node.children);
    if (!isPhrasingContainer) return;
    const holdsInlineHtml = node.children.some(
      (/** @type {any} */ child) => child.type === "html",
    );
    if (holdsInlineHtml) scanInlineChildren(node, ranges, warned);
  });

  return { ranges, warned };
}
518
+
519
/**
 * Heuristic: does `text` read as HTML *source* rather than prose with a few
 * inline tags? True when the text has at least five lines and more than 30%
 * of them contain something tag-shaped. HTML source is scanned as a single
 * rehype fragment; everything else goes through the markdown branch.
 * @param {string} text
 * @returns {boolean}
 */
export function looksLikeHtmlSource(text) {
  const lines = text.split("\n");
  if (lines.length < 5) return false;
  const tagShaped = /<\/?[a-zA-Z][^<>]*>/;
  const htmlLines = lines.filter((line) => tagShaped.test(line)).length;
  return htmlLines / lines.length > 0.3;
}
534
+
535
/**
 * Layer 2 entry point for web-ingress text.
 *
 * Splices out HTML comments and hidden elements (a placeholder marks each
 * cut; all other text is kept verbatim) and tallies the preserved
 * scripting/resource tags for the caller's warning. `removed` counts the
 * ranges the scanner reported, by kind.
 * @param {string} text
 * @returns {{ text: string, removed: { comments: number, hidden: number }, warned: { tags: Record<string, number>, dataSrc: number } } | null}
 *   null when the cheap tag gate does not match, or when there is nothing to
 *   strip and nothing to report
 */
export function sanitizeHtml(text) {
  if (!HTML_TAG_PRESENT.test(text)) return null;

  const scan = looksLikeHtmlSource(text) ? scanHtmlFragment : scanMarkdown;
  const { ranges, warned } = scan(text);
  const nothingToStrip = ranges.length === 0;
  if (nothingToStrip && !hasWarned(warned)) return null;

  const removed = { comments: 0, hidden: 0 };
  for (const { kind } of ranges) {
    if (kind === "comment") removed.comments += 1;
    else removed.hidden += 1;
  }
  return {
    text: nothingToStrip ? text : spliceRanges(text, ranges),
    removed,
    warned,
  };
}
558
+
559
+ // ─── Layer 3: markdown/URL exfiltration detection ────────────────────────────
560
+
561
// High-precision raw-string indicators. checkExfilUrl tests them against the
// whole URL string BEFORE attempting `new URL()`, so they fire even when the
// URL is too malformed to parse (e.g. a non-ASCII host). The `#` in the
// delimiter class extends keyword detection to the fragment, an exfil channel
// the param walk would otherwise miss (`…#token=…`).
// The generic "long base64/hex value" arm that once lived here moved into the
// per-parameter walk (paramExfilReason) so it can skip request-signing,
// pagination, and analytics parameters that legitimately carry long opaque
// values — see BENIGN_BLOB_PARAM_RE.
const EXFIL_INDICATORS = [
  // A parameter whose NAME is an exfil/credential keyword, after `?`, `&` or `#`.
  /[?&#](?:data|d|payload|exfil|leak|steal|secret|token|key|env|password|pwd|cookie|session|auth)=/i,
  // A `${…}` placeholder left in the URL.
  /\$\{[^{}]+\}/,
  // A `{{…}}` placeholder left in the URL.
  /\{\{[^{}]+\}\}/,
];

// Character count above which checkExfilUrl treats the URL's `?…` tail, or its
// parsed fragment, as unusually long.
const LONG_QUERY_THRESHOLD = 200;
576
+
577
// A `data:` URI carries its payload inline instead of pointing at a host, so
// the query/credential/fragment checks never fire on it. Active-content types
// (HTML, SVG, JS) are a script-injection vector; an oversized blob of any
// type is an inline exfil/injection payload. A small inline image (icon) is
// left alone so the common case isn't drowned in noise.
//
// The JS arm lists `text/javascript` (the standard JavaScript media type) and
// `text/ecmascript` alongside the `application/*` spellings, so the usual
// `data:text/javascript,…` form is reported too. The trailing `[;,]` requires
// the media type to end there (parameters or payload follow).
const DATA_URI_ACTIVE_RE =
  /^\s*data:(?:text\/(?:html|javascript|ecmascript)|image\/svg\+xml|application\/(?:javascript|ecmascript|xhtml\+xml))[;,]/i;
// Total URI length (in characters) above which any `data:` URI is reported.
export const DATA_URI_LENGTH_THRESHOLD = 4096;
585
+
586
// javascript:/vbscript: URIs execute on navigation/load, never a legitimate
// link target in fetched content — flagged regardless of payload. Leading
// whitespace is tolerated and the scheme is matched case-insensitively.
const SCRIPT_URI_RE = /^\s*(?:javascript|vbscript):/i;

// Sentinel base passed to `new URL()` so relative URLs parse instead of
// throwing; urlHost compares the parsed origin against it to recognize a URL
// that was resolved relatively.
const RELATIVE_URL_BASE = "http://relative.invalid";
591
+
592
// Parameter NAMES that legitimately carry a LONG opaque (base64/hex) value, so
// a blob in one of them is NOT exfil: CDN request-signing (AWS SigV4 /
// CloudFront `X-Amz-*`/`Signature`/`Policy`/`Key-Pair-Id`, GCS `X-Goog-*`,
// Azure SAS `sv/sr/sig/se/sp/st/spr/skoid/sktid`), pagination cursors /
// continuation tokens, and the long analytics click-IDs.
//
// The pattern is anchored at both ends, so it only matches a WHOLE parameter
// name, and it is case-insensitive (callers already pass names lowercased by
// rawParams). Scope is deliberately limited to names whose benign value is
// genuinely a long token — generic short params (`page`, `limit`, `v`, `t`,
// `cb`, …) are NOT listed, since their values never reach the blob threshold
// anyway and listing them would only widen the rename-dodge surface. A blob or
// credential-shaped value in any OTHER parameter still fires — this allowlist
// trades a narrow dodge (`?sig=<stolen>`) for not drowning the model in false
// positives on ordinary fetched pages.
const BENIGN_BLOB_PARAM_RE =
  /^(?:x-(?:amz|goog|ms|oss|obs)-[a-z0-9-]+|amz-[a-z0-9-]+|utm_[a-z]+|sig|signature|hmac|policy|credential|expires|key-pair-id|se|sp|sr|sv|st|spr|si|skoid|sktid|cursor|after|before|continuation|continuationtoken|continuation_token|pagetoken|page_token|nexttoken|next_token|gclid|fbclid|dclid|msclkid|gbraid|wbraid|_ga|_gl|mc_eid|mc_cid)$/i;
607
+
608
// matchesSecretHint is a deliberately broad PRE-gate whose bare-keyword arms
// (`token`, `secret`, `authorization`, …) also match ordinary hyphen/word
// delimited prose, and with no secret-redaction engine to refine the verdict
// here a weak digit proxy isn't enough: `login-authenticate-2024` and
// `the-secret-recipe-2024` clear "has a digit." A leaked credential is an
// OPAQUE, separator-free token, so paramExfilReason additionally requires a
// contiguous 20+ char `[A-Za-z0-9_]` run (no hyphen/space — that's what splits
// the prose runs below the bar) AND a digit before the value counts as one.
const OPAQUE_TOKEN_RE = /[A-Za-z0-9_]{20,}/;
const VALUE_HAS_DIGIT_RE = /\d/;

// A value that is ENTIRELY a long base64 (40+ chars, optional `=` padding) or
// hex (32+ chars) run. Both patterns are anchored to the whole value, so a
// benign short value with an incidental hex word never trips them. They are
// meant for the RAW, un-decoded query, where a `+` in base64 has not been
// turned into a space. Note the base64 alphabet here is the standard one
// (`+` and `/`); `-` and `_` are not accepted. Both arms are linear.
const BLOB_VALUE_B64_RE = /^[A-Za-z0-9+/]{40,}={0,2}$/;
const BLOB_VALUE_HEX_RE = /^[A-Fa-f0-9]{32,}$/;

// A path segment whose whole value is a base64/hex run longer than any standard
// content hash (SHA-512 hex is 128, base64 88; SHA-256 hex 64) is bulk encoded
// data — a beacon URL that smuggles its payload in the path to dodge the query
// walk — rather than an asset fingerprint. checkUrlPath requires the segment
// to be strictly LONGER than PATH_BLOB_MIN_LEN, so a 128-char SHA-512 hex
// fingerprint is not reported while a ~150-char base64 of stolen cookies is.
// Hyphens/underscores are excluded so a long word-slug
// (`the-secret-history-of-…`) is not mistaken for a payload.
const PATH_BLOB_RE = /^(?:[A-Za-z0-9+/]+={0,2}|[A-Fa-f0-9]+)$/;
const PATH_BLOB_MIN_LEN = 128;
635
+
636
/**
 * Split a query/fragment string into RAW (un-decoded) `[name, value]` pairs.
 *
 * Pairs are separated by `&` or `;`; empty pairs are dropped. The name is
 * everything before the first `=` (lowercased); the value is everything after
 * it, untouched, or "" when the pair has no `=`. URLSearchParams is avoided
 * on purpose: it percent-/`+`-decodes values, turning a `+`-bearing base64
 * blob into a space-broken string the anchored blob regexes would miss.
 * @param {string} qs query or fragment text without its leading `?` / `#`
 * @returns {Array<[string, string]>}
 */
function rawParams(qs) {
  return qs
    .split(/[&;]/)
    .filter((pair) => pair !== "")
    .map((pair) => {
      const eq = pair.indexOf("=");
      if (eq === -1) return [pair.toLowerCase(), ""];
      return [pair.slice(0, eq).toLowerCase(), pair.slice(eq + 1)];
    });
}
656
+
657
/**
 * Percent-decode a raw parameter value without touching `+` (unlike
 * URLSearchParams). Returns the input unchanged when it has no escapes or
 * when an escape sequence is malformed.
 * @param {string} value
 * @returns {string}
 */
function percentDecoded(value) {
  if (!value.includes("%")) return value;
  try {
    return decodeURIComponent(value);
  } catch {
    // Malformed escape (`%zz`, truncated UTF-8): judge the raw form only.
    return value;
  }
}

/**
 * Exfil reason for one URL parameter, or null.
 *
 * Allowlisted signing/pagination/analytics parameters are skipped entirely
 * (see BENIGN_BLOB_PARAM_RE). Otherwise the value is reported when it is
 * credential-shaped (an opaque 20+ char run, a digit, and a secret-hint
 * match) or is entirely a long base64/hex blob.
 *
 * Both the raw value and its percent-decoded form are tested: a sender that
 * runs its payload through `encodeURIComponent` turns base64 `+`, `/` and `=`
 * into `%2B`, `%2F`, `%3D`, which the anchored blob patterns cannot match in
 * raw form. `+` is never converted to a space.
 * @param {string} name lowercased parameter name
 * @param {string} value RAW (un-decoded) value
 * @returns {string | null}
 */
function paramExfilReason(name, value) {
  if (BENIGN_BLOB_PARAM_RE.test(name)) return null;

  const decoded = percentDecoded(value);
  const forms = decoded === value ? [value] : [value, decoded];

  const credentialShaped = forms.some(
    (form) =>
      OPAQUE_TOKEN_RE.test(form) &&
      VALUE_HAS_DIGIT_RE.test(form) &&
      matchesSecretHint(form),
  );
  if (credentialShaped) return "credential-shaped token in URL parameter";

  const isBlob = forms.some(
    (form) => BLOB_VALUE_B64_RE.test(form) || BLOB_VALUE_HEX_RE.test(form),
  );
  return isBlob ? "suspicious query parameter" : null;
}
678
+
679
/**
 * True when every parameter name in the parsed URL's query is on the benign
 * allowlist (BENIGN_BLOB_PARAM_RE). checkExfilUrl uses it to suppress the
 * coarse long-query heuristic for signed-CDN links, which are long by design.
 * A query with no parameters yields true.
 * @param {URL} parsed
 * @returns {boolean}
 */
function allParamsBenign(parsed) {
  const query = parsed.search.slice(1); // drop the leading `?`
  for (const [name] of rawParams(query)) {
    if (!BENIGN_BLOB_PARAM_RE.test(name)) return false;
  }
  return true;
}
693
+
694
/**
 * Return the first exfil reason found among the URL's query parameters, then
 * among its fragment parameters, or null when none trips.
 *
 * The fragment carries the same `key=value` channel (`#token=…`); a bare
 * anchor (`#section-2`) yields a single empty-value parameter.
 * @param {URL} parsed
 * @returns {string | null}
 */
function checkUrlParams(parsed) {
  // Query first, fragment second; each loses its leading `?` / `#`.
  const channels = [parsed.search, parsed.hash];
  for (const channel of channels) {
    for (const [name, value] of rawParams(channel.slice(1))) {
      const reason = paramExfilReason(name, value);
      if (reason) return reason;
    }
  }
  return null;
}
712
+
713
/**
 * Report a bulk encoded-data blob smuggled in a path segment (a beacon URL
 * that avoids query strings entirely): any `/`-separated segment that is
 * longer than PATH_BLOB_MIN_LEN and consists entirely of a base64/hex run.
 * @param {URL} parsed
 * @returns {string | null} the reason, or null when no segment qualifies
 */
function checkUrlPath(parsed) {
  const hasBlobSegment = parsed.pathname
    .split("/")
    .some(
      (segment) =>
        segment.length > PATH_BLOB_MIN_LEN && PATH_BLOB_RE.test(segment),
    );
  return hasBlobSegment ? "encoded data blob in path segment" : null;
}
726
+
727
/**
 * Classify a URL as data-exfil-shaped. Returns a short reason string for the
 * first check that trips, or null when none does (including when the URL
 * cannot be parsed after passing the raw-string checks).
 *
 * Order of checks: `data:` URIs (active content, then oversize — nothing
 * else applies to them), script-executing schemes, raw keyword/placeholder
 * indicators, embedded credentials, an unusually long query tail, an
 * unusually long fragment, then the per-parameter and per-path-segment walks.
 * @param {string} url
 * @returns {string | null}
 */
export function checkExfilUrl(url) {
  const isDataUri = /^\s*data:/i.test(url);
  if (isDataUri) {
    if (DATA_URI_ACTIVE_RE.test(url)) return "active-content data: URI";
    return url.length > DATA_URI_LENGTH_THRESHOLD
      ? "oversized inline data: payload"
      : null;
  }
  if (SCRIPT_URI_RE.test(url)) return "script-executing URI";
  for (const pattern of EXFIL_INDICATORS) {
    if (pattern.test(url)) return "suspicious query parameter";
  }

  // Userinfo and an oversized fragment are exfil channels the param walk
  // misses: credentials smuggled as `user:secret@host`, or a payload tucked in
  // `#<blob>`. Parse against a sentinel base so relative URLs don't throw.
  let parsed;
  try {
    parsed = new URL(url, RELATIVE_URL_BASE);
  } catch {
    return null;
  }
  if (parsed.username || parsed.password) return "embedded credentials";

  // A long query string is only suspicious when it carries a non-allowlisted
  // parameter — a signed-CDN URL is long by design (all `X-Amz-*`/SAS params).
  // The length is measured on the raw URL from its first `?` to the end.
  const queryStart = url.indexOf("?");
  const hasLongTail =
    queryStart !== -1 && url.length - queryStart > LONG_QUERY_THRESHOLD;
  if (hasLongTail && !allParamsBenign(parsed))
    return "unusually long query string";

  if (parsed.hash.length > LONG_QUERY_THRESHOLD)
    return "unusually long fragment";

  return checkUrlParams(parsed) || checkUrlPath(parsed);
}
764
+
765
/**
 * Destination label for a flagged URL: its host, or a parenthesised
 * placeholder when there is no meaningful host. The warning can thereby name
 * where the URL points without echoing the payload-bearing query/fragment.
 * @param {string} url
 * @returns {string}
 */
export function urlHost(url) {
  // `data:` URIs have no host — name the channel instead of the payload.
  if (/^\s*data:/i.test(url)) return "(inline data: URI)";

  let target;
  try {
    target = new URL(url, RELATIVE_URL_BASE);
  } catch {
    // checkExfilUrl's regex checks run before it parses, so a URL the WHATWG
    // parser rejects (e.g. a non-ASCII host) can still arrive here.
    return "(unparsable URL)";
  }

  // An origin equal to the sentinel means the host came from the base, not
  // from the input — unless the input literally spelled the sentinel out.
  const resolvedToSentinel = target.origin === RELATIVE_URL_BASE;
  if (resolvedToSentinel && !url.startsWith(RELATIVE_URL_BASE)) {
    return "(relative URL)";
  }
  return target.host;
}
790
+
791
/**
 * Whether `url` names an absolute, off-origin destination — i.e. it carries
 * its own authority rather than inheriting the relative-resolution sentinel.
 * Applied to form `action`/`formaction` and `meta refresh` targets, where
 * leaving the page's own origin is the exfil/redirect signal whatever the
 * query looks like.
 * @param {string} url
 * @returns {boolean} false for relative or unparsable URLs
 */
function isOffOrigin(url) {
  let target;
  try {
    target = new URL(url, RELATIVE_URL_BASE);
  } catch {
    return false;
  }
  // A different origin is off-origin outright.
  if (target.origin !== RELATIVE_URL_BASE) return true;
  // Same origin as the sentinel: absolute only if the input spelled it out.
  return url.startsWith(RELATIVE_URL_BASE);
}
810
+
811
/**
 * The redirect URL of a `<meta http-equiv="refresh">` content value
 * (`"5; url=https://…"`), or null when it names no target.
 *
 * Follows the HTML Standard's refresh parsing so the value returned is the
 * one a browser would navigate to:
 *  - the `url=` prefix is optional (`"0;https://…"` still redirects);
 *  - a quoted target ends at the MATCHING quote only;
 *  - an unquoted target runs to the end of the value, so `;` and anything
 *    after it stay part of the URL instead of being silently dropped.
 * Content that does not parse as a refresh directive falls back to a lenient
 * scan for a `url=` token anywhere in the string.
 * @param {string} content
 * @returns {string | null}
 */
function metaRefreshUrl(content) {
  // `<delay>`, at most one `;`/`,` separator with optional ASCII whitespace
  // around it, then everything else as the candidate target.
  const refresh = /** @type {{ groups: { sep: string, rest: string } } | null} */ (
    content.match(
      /^[\t\n\f\r ]*[\d.]+(?<sep>[\t\n\f\r ]*[;,]?[\t\n\f\r ]*)(?<rest>[\s\S]*)$/,
    )
  );
  // Text glued straight onto the delay (`5x…`) is not a refresh directive, so
  // a non-empty remainder requires a non-empty separator.
  if (refresh && (refresh.groups.sep || !refresh.groups.rest)) {
    let target = refresh.groups.rest.replace(
      /^url[\t\n\f\r ]*=[\t\n\f\r ]*/i,
      "",
    );
    const quote = target.charAt(0);
    if (quote === "'" || quote === '"') {
      // Drop the opening quote and cut at its matching close, if there is one.
      const close = target.indexOf(quote, 1);
      target = target.slice(1, close === -1 ? undefined : close);
    }
    // URL parsing ignores surrounding whitespace; strip it so downstream
    // prefix checks see the URL itself.
    target = target.trim();
    if (target) return target;
  }
  // Lenient fallback for content that is not a well-formed refresh value.
  const legacy = /** @type {{ groups: { url: string } } | null} */ (
    content.match(/url\s*=\s*['"]?(?<url>[^'"\s;]+)/i)
  );
  return legacy ? legacy.groups.url : null;
}
823
+
824
/**
 * URLs listed by a multi-URL attribute: `srcset` (a comma-separated string of
 * "url descriptor" candidates) or `ping` (a url list that arrives as an
 * array). For each candidate only the first whitespace-delimited token is
 * kept, so a trailing `2x`/`100w` descriptor is discarded. Any other value
 * type (e.g. an absent attribute) produces an empty list.
 * @param {unknown} value
 * @returns {string[]}
 */
function multiUrlAttr(value) {
  /** @type {string[]} */
  let parts;
  if (Array.isArray(value)) {
    parts = value.map((item) => String(item));
  } else if (typeof value === "string") {
    parts = value.split(",");
  } else {
    return [];
  }

  /** @type {string[]} */
  const urls = [];
  for (const part of parts) {
    const [first] = part.trim().split(/\s+/);
    // Blank candidates (e.g. from a doubled comma) yield an empty token.
    if (first) urls.push(first);
  }
  return urls;
}
841
+
842
/**
 * Collect the URL-bearing attributes of every HTML element in `text`. The
 * markup is parsed with rehype, so quoting, casing and entities are resolved
 * by a real parser rather than a hand-rolled tag regex.
 *
 * Each entry is tagged with a `context` telling the caller which check to
 * apply: "resource" URLs get only the exfil-shape test, while "form"
 * submission targets and "refresh" redirects are additionally flagged when
 * they point at an absolute off-origin destination.
 * @param {string} text
 * @returns {Array<{ url: string, isImage: boolean, context: "resource" | "form" | "refresh" }>}
 */
function extractHtmlUrls(text) {
  /** @type {Array<{ url: string, isImage: boolean, context: "resource" | "form" | "refresh" }>} */
  const found = [];
  const root = unified().use(rehypeParse, { fragment: true }).parse(text);

  visit(root, "element", (/** @type {any} */ element) => {
    // hast element nodes always carry a `properties` object (parse5 sets it).
    const { properties, tagName } = element;
    const isImage = tagName === "img";

    // Single-URL resource attributes.
    for (const attr of ["src", "href", "background"]) {
      const url = properties[attr];
      if (typeof url === "string")
        found.push({ url, isImage, context: "resource" });
    }

    // Multi-URL resource attributes, one entry per candidate.
    for (const attr of ["srcSet", "ping"]) {
      for (const url of multiUrlAttr(properties[attr]))
        found.push({ url, isImage, context: "resource" });
    }

    // Form submission targets are never reported as images.
    for (const attr of ["action", "formAction"]) {
      const url = properties[attr];
      if (typeof url === "string")
        found.push({ url, isImage: false, context: "form" });
    }

    // Only a <meta> with string content can be a refresh redirect.
    if (tagName !== "meta" || typeof properties.content !== "string") return;
    // rehype delivers `http-equiv` as an array; join it back so a `refresh`
    // directive is matched regardless of how it was tokenized.
    const httpEquiv = Array.isArray(properties.httpEquiv)
      ? properties.httpEquiv.join(",").toLowerCase()
      : "";
    if (!httpEquiv.includes("refresh")) return;
    const target = metaRefreshUrl(properties.content);
    if (target) found.push({ url: target, isImage: false, context: "refresh" });
  });

  return found;
}
884
+
885
// Warning reason for an absolute off-origin target, keyed by the URL context
// in which leaving the page's origin is a signal in itself: "form" submission
// targets and "refresh" redirects. There is deliberately no "resource" entry —
// detectExfil never looks one up and leaves resource URLs to the exfil-shape
// check alone. Frozen so this shared module-level table cannot be mutated at
// runtime.
const OFF_ORIGIN_REASON = Object.freeze({
  form: "off-origin form action",
  refresh: "off-origin meta-refresh redirect",
});
891
+
892
/**
 * Layer 3: find data-exfil-shaped URLs in markdown links/images/definitions
 * and in HTML attributes (src/href/background/srcset/ping, form
 * action/formaction, meta-refresh). This is detection only — `text` is never
 * altered; the caller turns the returned threats into a warning.
 * @param {string} text
 * @returns {Array<{ isImage: boolean, reason: string, target: string }> | null}
 *   one entry per flagged URL (markdown hits first, then HTML), or null when
 *   nothing is flagged
 */
export function detectExfil(text) {
  // Cheap regex gates: skip both parsers unless the text could hold a URL.
  const mayHoldUrls = MD_LINK_HINT.test(text) || HTML_TAG_PRESENT.test(text);
  if (!mayHoldUrls) return null;

  /** @type {Array<{ isImage: boolean, reason: string, target: string }>} */
  const threats = [];

  // Markdown side: the remark AST resolves balanced parens and reference
  // links correctly, which a hand-rolled regex would not.
  visit(mdParser.parse(text), (node) => {
    const carriesUrl =
      node.type === "link" ||
      node.type === "image" ||
      node.type === "definition";
    if (!carriesUrl) return;
    const reason = checkExfilUrl(node.url);
    if (reason) {
      threats.push({
        isImage: node.type === "image",
        reason,
        target: urlHost(node.url),
      });
    }
  });

  // HTML side: attributes are not AST nodes in remark, so they are extracted
  // separately.
  for (const { url, isImage, context } of extractHtmlUrls(text)) {
    let reason = checkExfilUrl(url);
    // Form and refresh targets are also flagged for merely leaving the origin.
    if (!reason && context !== "resource" && isOffOrigin(url)) {
      reason = OFF_ORIGIN_REASON[context];
    }
    if (reason) threats.push({ isImage, reason, target: urlHost(url) });
  }

  return threats.length > 0 ? threats : null;
}