npm - agent-input-sanitizer - Versions diffs - 1.0.0 - Mend

agent-input-sanitizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/src/html.mjs ADDED Viewed

@@ -0,0 +1,937 @@
+/**
+ * Hidden-HTML splicing (Layer 2) and exfil-URL detection (Layer 3) for
+ * web/HTML ingress.
+ *
+ * Layer 2 strips exactly what a human viewing the rendered page cannot see —
+ * HTML comments and hidden elements (hiding inline styles, `hidden` attr) —
+ * by splicing those byte ranges out of the original text and leaving a
+ * placeholder; every byte outside a spliced range is preserved verbatim (no
+ * re-serialization). Scripting/resource tags (script, style, svg, iframe, …)
+ * and `data:` URI resources are REPORTED in the result's `warned` counts but
+ * never removed, so fetched page source stays inspectable.
+ *
+ * Layer 3 reports data-exfil-shaped URLs (suspicious query params, oversized
+ * payloads, embedded credentials) without modifying them; the caller surfaces
+ * the report as a warning.
+ *
+ * Split into its own module so it can be lazy-loaded: pulling in the
+ * remark/rehype/unified graph costs ~200ms of module-load time, so the main
+ * entry `await import()`s this module only when its cheap regex gates match.
+ */
+import { unified } from "unified";
+import remarkParse from "remark-parse";
+import remarkGfm from "remark-gfm";
+import rehypeParse from "rehype-parse";
+import { visit, SKIP, EXIT } from "unist-util-visit";
+import styleToObject from "style-to-object";
+import {
+  HTML_TAG_PRESENT,
+  MD_LINK_HINT,
+  SECRET_HINT,
+  SECRET_HINT_EXT,
+  matchesSecretHint,
+} from "./gates.mjs";
+// The cheap pre-gates live in the dependency-free `./gates.mjs` so the package
+// root can re-export them without eagerly loading this module's remark/rehype
+// graph. Re-exported here too so the `./html` subpath keeps exposing them.
+export {
+  HTML_TAG_PRESENT,
+  MD_LINK_HINT,
+  SECRET_HINT,
+  SECRET_HINT_EXT,
+  matchesSecretHint,
+};
+// ─── Layer 2: hidden-content detection ───────────────────────────────────────
+/** @param {(key: string) => string} val */
+function isPositionedOffscreen(val) {
+  if (!/\babsolute\b|\bfixed\b/.test(val("position"))) return false;
+  for (const side of ["left", "top", "right", "bottom"]) {
+    const value = val(side);
+    if (value && parseFloat(value) < -900) return true;
+  }
+  const clip = val("clip");
+  return Boolean(clip && /rect\s*\(\s*0/.test(clip));
+}
+/** @param {(key: string) => string} val */
+function isOverflowHidden(val) {
+  if (val("overflow") !== "hidden") return false;
+  for (const dim of ["height", "width", "max-height", "max-width"]) {
+    const value = val(dim);
+    if (value && parseFloat(value) === 0) return true;
+  }
+  return false;
+}
+/**
+ * @param {string} styleStr
+ * @returns {boolean}
+ */
+export function isHiddenStyle(styleStr) {
+  // style-to-object throws on syntactically invalid CSS; a browser would
+  // ignore the broken declaration, so we do too rather than letting the
+  // exception escape and suppress the entire tool output.
+  let rawProps;
+  try {
+    // @ts-ignore -- style-to-object default export not resolved under NodeNext
+    rawProps = styleToObject(styleStr);
+  } catch {
+    return false;
+  }
+  if (!rawProps) return false;
+  // CSS property names are case-insensitive and `!important` is a legal
+  // trailing flag; style-to-object preserves both verbatim.
+  /** @type {Record<string, string>} */
+  const props = {};
+  for (const [key, value] of Object.entries(rawProps)) {
+    props[key.toLowerCase()] = String(value).replace(
+      // Bounded whitespace runs: `\s*` on both sides of an unanchored match
+      // backtracks super-linearly (redos/no-vulnerable). A CSS value never
+      // carries more than a couple of spaces around `!important`.
+      /\s{0,8}!\s{0,8}important\s{0,8}$/i,
+      "",
+    );
+  }
+  /** @param {string} key */
+  const val = (key) => (props[key] || "").toString().trim().toLowerCase();
+  if (val("display") === "none") return true;
+  if (val("visibility") === "hidden") return true;
+  const opacity = parseFloat(val("opacity"));
+  if (val("opacity") !== "" && opacity === 0) return true;
+  for (const dim of ["height", "width", "font-size"]) {
+    const value = val(dim);
+    if (value && parseFloat(value) === 0) return true;
+  }
+  if (isPositionedOffscreen(val)) return true;
+  const textIndent = val("text-indent");
+  if (textIndent && parseFloat(textIndent) < -900) return true;
+  // Clipped or scaled to nothing: modern equivalents of the legacy
+  // `clip: rect(0…)` above. Only the clip shapes that collapse the box to
+  // nothing are flagged — the canonical "visually hidden" utilities
+  // (`inset(50%)`…`inset(100%)`, `circle(0)`) abused to hide injected text.
+  // Decorative clips (`circle(50%)`, partial `inset`s, polygon shapes) render
+  // visible content and are left alone. A zero scale collapses the box too.
+  const clipPath = val("clip-path");
+  if (
+    clipPath &&
+    /\b(?:inset\(\s{0,8}(?:[5-9][0-9]|100)%|circle\(\s{0,8}0(?![.\d]))/.test(
+      clipPath,
+    )
+  )
+    return true;
+  const transform = val("transform");
+  if (
+    transform &&
+    /\b(?:scale|scale3d|scalex|scaley|matrix|matrix3d)\(\s{0,8}0(?![.\d])/.test(
+      transform,
+    )
+  )
+    return true;
+  // Same-color text on its background (white-on-white) and fully transparent
+  // text are invisible to a human but plain text to the model.
+  const color = val("color");
+  if (color === "transparent") return true;
+  const background = val("background-color") || val("background");
+  // The `background` guard also rejects the both-absent case: two empty values
+  // are equal, so without it an unstyled element would read as hidden.
+  if (background && color === background) return true;
+  return isOverflowHidden(val);
+}
+// Scripting / resource-loading tags whose PRESENCE is reported to the model
+// but whose content is preserved: their bodies are page source the model may
+// legitimately need to inspect (how a page's scripts work, its styles, its
+// SVGs), so unlike hidden elements they are never removed.
+export const REPORTED_TAGS = new Set([
+  "script",
+  "style",
+  "object",
+  "embed",
+  "iframe",
+  "svg",
+  "math",
+]);
+/**
+ * True for an element a rendered page would not show: `hidden` attribute or a
+ * hiding inline style. Works on both hast nodes and parseHtmlTag results.
+ * @param {any} node
+ * @returns {boolean}
+ */
+export function isHiddenElement(node) {
+  if (node.type !== "element") return false;
+  const { properties = {} } = node;
+  if (properties.hidden !== undefined && properties.hidden !== null)
+    return true;
+  // `aria-hidden="true"` removes the element from the accessibility tree, so a
+  // human using the rendered page never perceives it; a model reading raw
+  // source still does. (rehype maps the attribute to the `ariaHidden` prop.)
+  if (String(properties.ariaHidden).toLowerCase() === "true") return true;
+  if (properties.style && isHiddenStyle(properties.style)) return true;
+  return false;
+}
+/** @param {any} el */
+function hasDataSrc(el) {
+  return (
+    typeof el.properties?.src === "string" &&
+    el.properties.src.startsWith("data:")
+  );
+}
+/**
+ * @param {string} htmlValue
+ * @returns {any}
+ */
+function parseHtmlTag(htmlValue) {
+  const tree = unified().use(rehypeParse, { fragment: true }).parse(htmlValue);
+  /** @type {any} */
+  let firstElement = null;
+  visit(tree, "element", (node) => {
+    firstElement = node;
+    return EXIT;
+  });
+  return firstElement;
+}
+// Returns null on a closing tag: `</x>` alone can never be the *start* of a
+// hidden element, so only opens drive the surrounding loop's removal mode.
+/**
+ * @param {string} htmlValue
+ * @returns {string | null}
+ */
+export function isHiddenOpen(htmlValue) {
+  if (htmlValue.startsWith("</")) return null;
+  const el = parseHtmlTag(htmlValue);
+  if (!el) return null;
+  if (isHiddenElement(el)) return el.tagName;
+  return null;
+}
+// The lowercased name of an HTML closing tag (`</div>` -> "div"), or null when
+// the value isn't a well-formed closing tag. The charset spans HTML custom-
+// element and namespaced names (hyphens, dots, colons) so a close like
+// `</foo-bar>` balances its matching open instead of throwing on a null match;
+// callers treat null as "not the tag we're closing" and strip it as part of the
+// surrounding removal region.
+/**
+ * @param {string} htmlValue
+ * @returns {string | null}
+ */
+export function closingTagName(htmlValue) {
+  // The charset is a superset of CommonMark's closing-tag grammar, so remark
+  // never emits a `</…>` html node this fails to match; the null guard below is
+  // defense-in-depth against a future parser/grammar change (hence unreachable).
+  const match = htmlValue.match(/^<\/(?<tagName>[a-zA-Z][a-zA-Z0-9:._-]*)\s*>/);
+  /* c8 ignore next */
+  if (!match?.groups) return null;
+  return match.groups.tagName.toLowerCase();
+}
+// ─── Layer 2: splice engine ──────────────────────────────────────────────────
+export const COMMENT_PLACEHOLDER = "[HTML comment removed]";
+export const HIDDEN_PLACEHOLDER = "[hidden HTML removed]";
+/**
+ * Replace each range of `text` with its kind's placeholder, preserving every
+ * byte outside the ranges verbatim. Overlapping/nested ranges are merged
+ * (defense-in-depth — the scanners emit disjoint ranges).
+ * @param {string} text
+ * @param {Array<{start: number, end: number, kind: "comment" | "hidden"}>} ranges
+ * @returns {string}
+ */
+export function spliceRanges(text, ranges) {
+  const sorted = [...ranges].sort(
+    (left, right) => left.start - right.start || left.end - right.end,
+  );
+  /** @type {typeof ranges} */
+  const merged = [];
+  for (const range of sorted) {
+    const last = merged[merged.length - 1];
+    if (last && range.start < last.end) {
+      if (range.end > last.end) last.end = range.end;
+    } else {
+      merged.push({ ...range });
+    }
+  }
+  let out = "";
+  let cursor = 0;
+  for (const range of merged) {
+    out +=
+      text.slice(cursor, range.start) +
+      (range.kind === "comment" ? COMMENT_PLACEHOLDER : HIDDEN_PLACEHOLDER);
+    cursor = range.end;
+  }
+  return out + text.slice(cursor);
+}
+/** @returns {{ tags: Record<string, number>, dataSrc: number }} */
+function newWarned() {
+  return { tags: {}, dataSrc: 0 };
+}
+/**
+ * @param {ReturnType<typeof newWarned>} warned
+ * @param {string} tagName
+ */
+function countTag(warned, tagName) {
+  warned.tags[tagName] = (warned.tags[tagName] || 0) + 1;
+}
+/**
+ * @param {ReturnType<typeof newWarned>} into
+ * @param {ReturnType<typeof newWarned>} from
+ */
+function mergeWarned(into, from) {
+  for (const [tag, count] of Object.entries(from.tags))
+    into.tags[tag] = (into.tags[tag] || 0) + count;
+  into.dataSrc += from.dataSrc;
+}
+/** @param {ReturnType<typeof newWarned>} warned */
+function hasWarned(warned) {
+  return warned.dataSrc > 0 || Object.keys(warned.tags).length > 0;
+}
+/**
+ * Scan raw HTML for hidden content to strip and preserved tags to report.
+ * Returned ranges are offsets into `html`; comments and hidden elements span
+ * the whole element including its content (rehype positions cover open tag
+ * through matching close, and parse5 extends an unclosed element to the end
+ * of the fragment — fail-closed for truncated markup).
+ * @param {string} html
+ * @returns {{ ranges: Array<{start: number, end: number, kind: "comment" | "hidden"}>, warned: ReturnType<typeof newWarned> }}
+ */
+export function scanHtmlFragment(html) {
+  const tree = unified().use(rehypeParse, { fragment: true }).parse(html);
+  /** @type {Array<{start: number, end: number, kind: "comment" | "hidden"}>} */
+  const ranges = [];
+  const warned = newWarned();
+  // @ts-ignore -- visit callback returns EXIT/SKIP only on matches; implicit undefined return is intentional
+  // eslint-disable-next-line consistent-return
+  visit(tree, (/** @type {any} */ node) => {
+    const isComment = node.type === "comment";
+    if (isComment || isHiddenElement(node)) {
+      /* c8 ignore start -- parse5 omits positions only on recovery-synthesized
+         elements (tbody and friends), which carry no attributes and so can
+         never be hidden; fail closed on the whole fragment if that assumption
+         ever breaks. */
+      if (!node.position) {
+        ranges.length = 0;
+        ranges.push({ start: 0, end: html.length, kind: "hidden" });
+        return EXIT;
+      }
+      /* c8 ignore stop */
+      ranges.push({
+        start: node.position.start.offset,
+        end: node.position.end.offset,
+        kind: isComment ? "comment" : "hidden",
+      });
+      return SKIP; // children are inside the spliced range
+    }
+    if (node.type !== "element") return; // eslint-disable-line consistent-return -- unist visit: undefined return means "continue", same as falling off the end
+    if (REPORTED_TAGS.has(node.tagName)) countTag(warned, node.tagName);
+    if (hasDataSrc(node)) warned.dataSrc += 1;
+  });
+  return { ranges, warned };
+}
+const mdParser = unified().use(remarkParse).use(remarkGfm);
+/**
+ * Append comment ranges found in `value` to `ranges`.
+ *
+ * indexOf scanning is linear (a lazy `<!--[\s\S]*?-->` regex backtracks
+ * polynomially on crafted input); the close search starts 2 chars in so
+ * spec-abrupt closes (`<!-->`, `<!--->`) terminate their own comment.
+ * @param {string} value
+ * @param {number} base absolute offset of the start of `value`
+ * @param {number} nodeEnd absolute offset of the end of the containing node
+ * @param {Array<{start: number, end: number, kind: "comment" | "hidden"}>} ranges
+ */
+function collectCommentRanges(value, base, nodeEnd, ranges) {
+  for (let searchFrom = 0; ; ) {
+    const open = value.indexOf("<!--", searchFrom);
+    if (open === -1) break;
+    const close = value.indexOf("-->", open + 2);
+    /* c8 ignore start -- micromark only tokenizes inline comments WITH a
+       terminator (an unterminated `<!--` in phrasing context stays literal
+       text, visible to a human reader), so this is fail-closed
+       defense-in-depth against a future tokenizer change. Unterminated
+       comments in flow blocks are covered — parse5 handles them in
+       scanHtmlFragment. */
+    if (close === -1) {
+      ranges.push({ start: base + open, end: nodeEnd, kind: "comment" });
+      break;
+    }
+    /* c8 ignore stop */
+    ranges.push({ start: base + open, end: base + close + 3, kind: "comment" });
+    searchFrom = close + 3;
+  }
+}
+/**
+ * Update hidden-region state for one html node while inside a tracked region.
+ *
+ * Mutates `state` in place. A closing tag for the tracked element decrements
+ * depth; reaching zero closes the range. A nested open of the same tag
+ * increments depth. Any other close is swallowed inside the region.
+ * @param {{ tag: string | null, depth: number, regionStart: number }} state
+ * @param {string} value
+ * @param {number} nodeEnd absolute end offset of this node
+ * @param {Array<{start: number, end: number, kind: "comment" | "hidden"}>} ranges
+ */
+function updateHiddenState(state, value, nodeEnd, ranges) {
+  if (value.startsWith("</")) {
+    if (closingTagName(value) !== state.tag) return;
+    state.depth--;
+    if (state.depth === 0) {
+      ranges.push({ start: state.regionStart, end: nodeEnd, kind: "hidden" });
+      state.tag = null;
+    }
+    return;
+  }
+  const el = parseHtmlTag(value);
+  if (el && el.tagName === state.tag) state.depth++;
+}
+/**
+ * Balance-walk the direct children of a markdown container node: a hidden
+ * open tag starts a removal region that runs to its matching close (or the
+ * container's end when unbalanced — fail-closed), comments become single-node
+ * ranges, and preserved tags are counted. Inline html is tokenized per TAG
+ * (an element's content sits in sibling text nodes), which is why this walk
+ * exists instead of handing the value to rehype.
+ * @param {any} node
+ * @param {Array<{start: number, end: number, kind: "comment" | "hidden"}>} ranges
+ * @param {ReturnType<typeof newWarned>} warned
+ */
+function scanInlineChildren(node, ranges, warned) {
+  const state =
+    /** @type {{ tag: string | null, depth: number, regionStart: number }} */ ({
+      tag: null,
+      depth: 0,
+      regionStart: 0,
+    });
+  for (const child of node.children) {
+    if (child.type !== "html") continue;
+    const value = child.value;
+    const base = child.position.start.offset;
+    if (state.depth > 0) {
+      updateHiddenState(state, value, child.position.end.offset, ranges);
+      continue;
+    }
+    // Comments can share an inline html node with neighboring constructs
+    // (e.g. in a list item, `<!-- c -->!` is ONE node), so comment spans are
+    // located within the value and spliced individually rather than assuming
+    // the node IS the comment.
+    collectCommentRanges(value, base, child.position.end.offset, ranges);
+    const tagName = isHiddenOpen(value);
+    if (tagName) {
+      state.tag = tagName;
+      state.depth = 1;
+      state.regionStart = base;
+      continue;
+    }
+    if (value.startsWith("</")) continue;
+    const el = parseHtmlTag(value);
+    if (!el) continue;
+    if (REPORTED_TAGS.has(el.tagName)) countTag(warned, el.tagName);
+    if (hasDataSrc(el)) warned.dataSrc += 1;
+  }
+  if (state.depth > 0) {
+    ranges.push({
+      start: state.regionStart,
+      end: node.position.end.offset,
+      kind: "hidden",
+    });
+  }
+}
+// Containers whose direct html children are flow BLOCKS (complete markup —
+// tags and content in one node value), as opposed to the phrasing containers
+// (paragraph, heading, tableCell, emphasis, …) whose html children are
+// per-tag fragments needing the balance walk.
+const FLOW_HTML_PARENTS = new Set([
+  "root",
+  "blockquote",
+  "listItem",
+  "footnoteDefinition",
+]);
+/**
+ * @param {string} text
+ * @returns {{ ranges: Array<{start: number, end: number, kind: "comment" | "hidden"}>, warned: ReturnType<typeof newWarned> }}
+ */
+function scanMarkdown(text) {
+  const tree = mdParser.parse(text);
+  /** @type {Array<{start: number, end: number, kind: "comment" | "hidden"}>} */
+  const ranges = [];
+  const warned = newWarned();
+  // Flow html blocks carry complete markup, so rehype locates comments/hidden
+  // elements precisely within them; block-local offsets are shifted to
+  // document coordinates.
+  visit(tree, "html", (/** @type {any} */ node, _index, parent) => {
+    if (!FLOW_HTML_PARENTS.has(parent?.type)) return;
+    const base = node.position.start.offset;
+    const sub = scanHtmlFragment(text.slice(base, node.position.end.offset));
+    for (const range of sub.ranges) {
+      ranges.push({
+        start: base + range.start,
+        end: base + range.end,
+        kind: range.kind,
+      });
+    }
+    mergeWarned(warned, sub.warned);
+  });
+  // Every phrasing container that holds inline html (paragraph, heading,
+  // tableCell, emphasis, …) gets the balance walk — not just paragraphs, so a
+  // hidden span inside a heading cannot slip through.
+  visit(tree, (/** @type {any} */ node) => {
+    if (FLOW_HTML_PARENTS.has(node.type) || !Array.isArray(node.children))
+      return;
+    if (
+      !node.children.some((/** @type {any} */ child) => child.type === "html")
+    )
+      return;
+    scanInlineChildren(node, ranges, warned);
+  });
+  return { ranges, warned };
+}
+// 30%-of-lines heuristic: HTML *source* gets scanned as one rehype fragment;
+// inline tags scattered in prose go through the markdown branch instead.
+/**
+ * @param {string} text
+ * @returns {boolean}
+ */
+export function looksLikeHtmlSource(text) {
+  const lines = text.split("\n");
+  if (lines.length < 5) return false;
+  let htmlLines = 0;
+  for (const line of lines) {
+    if (/<\/?[a-zA-Z][^<>]*>/.test(line)) htmlLines++;
+  }
+  return htmlLines / lines.length > 0.3;
+}
+/**
+ * Layer 2 over web-ingress text: splice out HTML comments and hidden elements
+ * (placeholders mark the cuts; all other bytes are preserved verbatim) and
+ * count preserved scripting/resource tags for the caller's warning. Returns
+ * null when there is nothing to strip and nothing to report.
+ * @param {string} text
+ * @returns {{ text: string, removed: { comments: number, hidden: number }, warned: { tags: Record<string, number>, dataSrc: number } } | null}
+ */
+export function sanitizeHtml(text) {
+  if (!HTML_TAG_PRESENT.test(text)) return null;
+  const { ranges, warned } = looksLikeHtmlSource(text)
+    ? scanHtmlFragment(text)
+    : scanMarkdown(text);
+  if (ranges.length === 0 && !hasWarned(warned)) return null;
+  const removed = { comments: 0, hidden: 0 };
+  for (const range of ranges)
+    removed[range.kind === "comment" ? "comments" : "hidden"]++;
+  return {
+    text: ranges.length > 0 ? spliceRanges(text, ranges) : text,
+    removed,
+    warned,
+  };
+}
+// ─── Layer 3: markdown/URL exfiltration detection ────────────────────────────
+// High-precision raw-string indicators, applied to the whole URL so they fire
+// even when it is too malformed for `new URL()` to parse (e.g. a non-ASCII
+// host). The `#` in the delimiter class extends keyword detection to the
+// fragment, an exfil channel the param walk would otherwise miss (`…#token=…`).
+// The generic "long base64/hex value" arm that once lived here moved into the
+// per-parameter walk (paramExfilReason) so it can skip request-signing,
+// pagination, and analytics parameters that legitimately carry long opaque
+// values — see BENIGN_BLOB_PARAM_RE.
+const EXFIL_INDICATORS = [
+  /[?&#](?:data|d|payload|exfil|leak|steal|secret|token|key|env|password|pwd|cookie|session|auth)=/i,
+  /\$\{[^{}]+\}/,
+  /\{\{[^{}]+\}\}/,
+];
+const LONG_QUERY_THRESHOLD = 200;
+// A `data:` URI carries its payload inline instead of pointing at a host, so
+// the query/credential/fragment checks below never fire on it. Active-content
+// types (HTML, SVG, JS) are a script-injection vector; an oversized blob of any
+// type is an inline exfil/injection payload. A small inline image (icon) is
+// left alone so the common case isn't drowned in noise.
+const DATA_URI_ACTIVE_RE =
+  /^\s*data:(?:text\/html|image\/svg\+xml|application\/(?:javascript|ecmascript|xhtml\+xml))[;,]/i;
+export const DATA_URI_LENGTH_THRESHOLD = 4096;
+// javascript:/vbscript: URIs execute on navigation/load, never a legitimate
+// link target in fetched content — flagged regardless of payload.
+const SCRIPT_URI_RE = /^\s*(?:javascript|vbscript):/i;
+const RELATIVE_URL_BASE = "http://relative.invalid";
+// Parameter NAMES that legitimately carry a LONG opaque (base64/hex) value, so
+// a blob in one of them is NOT exfil: CDN request-signing (AWS SigV4 /
+// CloudFront `X-Amz-*`/`Signature`/`Policy`/`Key-Pair-Id`, GCS `X-Goog-*`,
+// Azure SAS `sv/sr/sig/se/sp/st/spr/skoid/sktid`), pagination cursors /
+// continuation tokens, and the long analytics click-IDs. Matched
+// case-insensitively against the exact (lowercased) parameter name. Scope is
+// deliberately limited to names whose benign value is genuinely a long token —
+// generic short params (`page`, `limit`, `v`, `t`, `cb`, …) are NOT listed,
+// since their values never reach the blob threshold anyway and listing them
+// would only widen the rename-dodge surface. A blob or credential-shaped value
+// in any OTHER parameter still fires — this allowlist trades a narrow dodge
+// (`?sig=<stolen>`) for not drowning the model in false positives on ordinary
+// fetched pages.
+const BENIGN_BLOB_PARAM_RE =
+  /^(?:x-(?:amz|goog|ms|oss|obs)-[a-z0-9-]+|amz-[a-z0-9-]+|utm_[a-z]+|sig|signature|hmac|policy|credential|expires|key-pair-id|se|sp|sr|sv|st|spr|si|skoid|sktid|cursor|after|before|continuation|continuationtoken|continuation_token|pagetoken|page_token|nexttoken|next_token|gclid|fbclid|dclid|msclkid|gbraid|wbraid|_ga|_gl|mc_eid|mc_cid)$/i;
+// matchesSecretHint is a deliberately broad PRE-gate whose bare-keyword arms
+// (`token`, `secret`, `authorization`, …) also match ordinary hyphen/word
+// delimited prose, and with no secret-redaction engine to refine the verdict
+// here a weak digit proxy isn't enough: `login-authenticate-2024` and
+// `the-secret-recipe-2024` clear "has a digit." A leaked credential is an
+// OPAQUE, separator-free token, so the value must additionally contain a
+// contiguous 20+ char `[A-Za-z0-9_]` run (no hyphen/space — that's what splits
+// the prose runs below the bar) AND a digit before it counts as one.
+const OPAQUE_TOKEN_RE = /[A-Za-z0-9_]{20,}/;
+const VALUE_HAS_DIGIT_RE = /\d/;
+// A value that is ENTIRELY a long base64 (40+ chars, optional `=` padding) or
+// hex (32+ chars) run. Anchored to the whole value (operating on the RAW,
+// un-decoded query so a `+` in base64 is not turned into a space), so a benign
+// short value with an incidental hex word never trips it. Both arms are linear.
+const BLOB_VALUE_B64_RE = /^[A-Za-z0-9+/]{40,}={0,2}$/;
+const BLOB_VALUE_HEX_RE = /^[A-Fa-f0-9]{32,}$/;
+// A path segment whose whole value is a base64/hex run longer than any standard
+// content hash (SHA-512 hex is 128, base64 88; SHA-256 hex 64) is bulk encoded
+// data — a beacon URL that smuggles its payload in the path to dodge the query
+// walk — rather than an asset fingerprint. The threshold sits just above the
+// SHA-512-hex ceiling so every real fingerprint clears it while a ~150-char
+// base64 of stolen cookies does not. Hyphens/underscores are excluded so a long
+// word-slug (`the-secret-history-of-…`) is not mistaken for a payload.
+const PATH_BLOB_RE = /^(?:[A-Za-z0-9+/]+={0,2}|[A-Fa-f0-9]+)$/;
+const PATH_BLOB_MIN_LEN = 128;
+/**
+ * RAW (un-decoded) `name=value` pairs of a query/fragment string, split on `&`
+ * and `;`. URLSearchParams is avoided on purpose: it percent-/`+`-decodes
+ * values, turning a `+`-bearing base64 blob into a space-broken string that the
+ * anchored blob regexes would miss.
+ * @param {string} qs
+ * @returns {Array<[string, string]>}
+ */
+function rawParams(qs) {
+  /** @type {Array<[string, string]>} */
+  const pairs = [];
+  for (const pair of qs.split(/[&;]/)) {
+    if (!pair) continue;
+    const eq = pair.indexOf("=");
+    const name = eq === -1 ? pair : pair.slice(0, eq);
+    const value = eq === -1 ? "" : pair.slice(eq + 1);
+    pairs.push([name.toLowerCase(), value]);
+  }
+  return pairs;
+}
+/**
+ * Exfil reason for one URL parameter, or null. A credential-shaped value in any
+ * non-allowlisted parameter (reusing the secret-shape gate), or a long
+ * base64/hex blob in one. Allowlisted signing/pagination/analytics parameters
+ * are skipped entirely (see BENIGN_BLOB_PARAM_RE).
+ * @param {string} name  lowercased parameter name
+ * @param {string} value RAW (un-decoded) value
+ * @returns {string | null}
+ */
+function paramExfilReason(name, value) {
+  if (BENIGN_BLOB_PARAM_RE.test(name)) return null;
+  if (
+    OPAQUE_TOKEN_RE.test(value) &&
+    VALUE_HAS_DIGIT_RE.test(value) &&
+    matchesSecretHint(value)
+  )
+    return "credential-shaped token in URL parameter";
+  if (BLOB_VALUE_B64_RE.test(value) || BLOB_VALUE_HEX_RE.test(value))
+    return "suspicious query parameter";
+  return null;
+}
+/**
+ * True when every parameter of the parsed URL's query is in the benign
+ * allowlist. Used to suppress the coarse long-query-string heuristic for
+ * signed-CDN links, which are long by design. Only ever called once the query
+ * is known to be long (and thus non-empty), so the vacuous-true empty case
+ * cannot arise here.
+ * @param {URL} parsed
+ * @returns {boolean}
+ */
+function allParamsBenign(parsed) {
+  return rawParams(parsed.search.slice(1)).every(([name]) =>
+    BENIGN_BLOB_PARAM_RE.test(name),
+  );
+}
+/**
+ * Walk the query and fragment parameters of a parsed URL for an exfil reason.
+ * @param {URL} parsed
+ * @returns {string | null}
+ */
+function checkUrlParams(parsed) {
+  for (const [name, value] of rawParams(parsed.search.slice(1))) {
+    const reason = paramExfilReason(name, value);
+    if (reason) return reason;
+  }
+  // The fragment carries the same `key=value` channel (`#token=…`); a bare
+  // anchor (`#section-2`) yields one empty-value param that trips nothing.
+  for (const [name, value] of rawParams(parsed.hash.slice(1))) {
+    const reason = paramExfilReason(name, value);
+    if (reason) return reason;
+  }
+  return null;
+}
+/**
+ * A bulk encoded-data blob smuggled in a path segment (a beacon URL that avoids
+ * query strings entirely), or null.
+ * @param {URL} parsed
+ * @returns {string | null}
+ */
+function checkUrlPath(parsed) {
+  for (const segment of parsed.pathname.split("/")) {
+    if (segment.length > PATH_BLOB_MIN_LEN && PATH_BLOB_RE.test(segment))
+      return "encoded data blob in path segment";
+  }
+  return null;
+}
+/**
+ * @param {string} url
+ * @returns {string | null}
+ */
+export function checkExfilUrl(url) {
+  if (/^\s*data:/i.test(url)) {
+    if (DATA_URI_ACTIVE_RE.test(url)) return "active-content data: URI";
+    if (url.length > DATA_URI_LENGTH_THRESHOLD)
+      return "oversized inline data: payload";
+    return null;
+  }
+  if (SCRIPT_URI_RE.test(url)) return "script-executing URI";
+  if (EXFIL_INDICATORS.some((pattern) => pattern.test(url)))
+    return "suspicious query parameter";
+  // Userinfo and an oversized fragment are exfil channels the param walk misses:
+  // credentials smuggled as `user:secret@host`, or a payload tucked in `#<blob>`.
+  // Parse against a sentinel base so relative URLs don't throw.
+  let parsed;
+  try {
+    parsed = new URL(url, RELATIVE_URL_BASE);
+  } catch {
+    return null;
+  }
+  if (parsed.username || parsed.password) return "embedded credentials";
+  // A long query string is only suspicious when it carries a non-allowlisted
+  // parameter — a signed-CDN URL is long by design (all `X-Amz-*`/SAS params).
+  const qIdx = url.indexOf("?");
+  if (
+    qIdx !== -1 &&
+    url.length - qIdx > LONG_QUERY_THRESHOLD &&
+    !allParamsBenign(parsed)
+  )
+    return "unusually long query string";
+  if (parsed.hash.length > LONG_QUERY_THRESHOLD)
+    return "unusually long fragment";
+  return checkUrlParams(parsed) || checkUrlPath(parsed);
+}
+/**
+ * Host of a flagged URL — enough for the warning to name the destination
+ * without echoing the payload-bearing query/fragment.
+ * @param {string} url
+ * @returns {string}
+ */
+export function urlHost(url) {
+  // A `data:` URI has no host; name the channel rather than echoing the payload.
+  if (/^\s*data:/i.test(url)) return "(inline data: URI)";
+  let parsed;
+  try {
+    parsed = new URL(url, RELATIVE_URL_BASE);
+  } catch {
+    // checkExfilUrl flags via regex before parsing, so it can hand us a URL
+    // WHATWG rejects (e.g. a non-ASCII host).
+    return "(unparsable URL)";
+  }
+  if (
+    parsed.origin === RELATIVE_URL_BASE &&
+    !url.startsWith(RELATIVE_URL_BASE)
+  ) {
+    return "(relative URL)";
+  }
+  return parsed.host;
+}
+/**
+ * True when `url` is an absolute, off-origin target (an authority that is not
+ * the relative-resolution sentinel). Used for form `action`/`formaction` and
+ * `meta refresh` URLs, where pointing off the page's own origin is the
+ * exfil/redirect signal regardless of the query shape.
+ * @param {string} url
+ * @returns {boolean}
+ */
+function isOffOrigin(url) {
+  let parsed;
+  try {
+    parsed = new URL(url, RELATIVE_URL_BASE);
+  } catch {
+    return false;
+  }
+  return (
+    parsed.origin !== RELATIVE_URL_BASE || url.startsWith(RELATIVE_URL_BASE)
+  );
+}
+/**
+ * The redirect URL of a `<meta http-equiv="refresh">` content value
+ * (`"5; url=https://…"`), or null when it carries no `url=` target.
+ * @param {string} content
+ * @returns {string | null}
+ */
+function metaRefreshUrl(content) {
+  const match = /** @type {{ groups: { url: string } } | null} */ (
+    content.match(/url\s*=\s*['"]?(?<url>[^'"\s;]+)/i)
+  );
+  return match ? match.groups.url : null;
+}
+/**
+ * Candidate URLs of a `srcset` (a comma-separated "url descriptor" string) or
+ * `ping` (a space-separated url list rehype delivers as an array) attribute.
+ * Each candidate's leading whitespace-delimited token is its url (the trailing
+ * `2x`/`100w` descriptor, or extra ping urls, are dropped to the next
+ * candidate). An absent attribute (neither string nor array) yields none.
+ * @param {unknown} value
+ * @returns {string[]}
+ */
+function multiUrlAttr(value) {
+  /** @type {string[]} */ let candidates = [];
+  if (Array.isArray(value)) candidates = value.map(String);
+  else if (typeof value === "string") candidates = value.split(",");
+  return candidates
+    .map((candidate) => candidate.trim().split(/\s+/)[0])
+    .filter(Boolean);
+}
+/**
+ * URL-bearing attributes of every HTML element in `text`, parsed with rehype so
+ * quoting/casing/entities are handled correctly (no hand-rolled tag regex).
+ * `context` selects the per-URL check the caller applies: resource URLs get the
+ * exfil-shape test; form-submission and meta-refresh targets additionally flag
+ * any absolute off-origin destination.
+ * @param {string} text
+ * @returns {Array<{ url: string, isImage: boolean, context: "resource" | "form" | "refresh" }>}
+ */
+function extractHtmlUrls(text) {
+  const tree = unified().use(rehypeParse, { fragment: true }).parse(text);
+  /** @type {Array<{ url: string, isImage: boolean, context: "resource" | "form" | "refresh" }>} */
+  const urls = [];
+  visit(tree, "element", (/** @type {any} */ node) => {
+    // hast element nodes always carry a `properties` object (parse5 sets it).
+    const props = node.properties;
+    const isImage = node.tagName === "img";
+    for (const key of ["src", "href", "background"])
+      if (typeof props[key] === "string")
+        urls.push({ url: props[key], isImage, context: "resource" });
+    for (const key of ["srcSet", "ping"])
+      for (const url of multiUrlAttr(props[key]))
+        urls.push({ url, isImage, context: "resource" });
+    for (const key of ["action", "formAction"])
+      if (typeof props[key] === "string")
+        urls.push({ url: props[key], isImage: false, context: "form" });
+    // rehype delivers `http-equiv` as an array (comma-separated); join it back
+    // so a `refresh` directive is matched regardless of how it was tokenized.
+    const httpEquiv = Array.isArray(props.httpEquiv)
+      ? props.httpEquiv.join(",").toLowerCase()
+      : "";
+    if (
+      node.tagName === "meta" &&
+      httpEquiv.includes("refresh") &&
+      typeof props.content === "string"
+    ) {
+      const url = metaRefreshUrl(props.content);
+      if (url) urls.push({ url, isImage: false, context: "refresh" });
+    }
+  });
+  return urls;
+}
+// Reason for an off-origin submission/redirect target by context; null leaves
+// the URL to the exfil-shape check alone.
+const OFF_ORIGIN_REASON = {
+  form: "off-origin form action",
+  refresh: "off-origin meta-refresh redirect",
+};
+/**
+ * Layer 3: report data-exfil-shaped URLs in markdown links/images/definitions
+ * and HTML attributes (src/href/background/srcset/ping, form action/formaction,
+ * meta-refresh). Detection only — the text is never modified; the caller
+ * surfaces the threats as a warning.
+ * @param {string} text
+ * @returns {Array<{ isImage: boolean, reason: string, target: string }> | null}
+ */
+export function detectExfil(text) {
+  if (!MD_LINK_HINT.test(text) && !HTML_TAG_PRESENT.test(text)) return null;
+  /** @type {Array<{ isImage: boolean, reason: string, target: string }>} */
+  const threats = [];
+  // Remark AST handles markdown links/images/definitions (balanced parens,
+  // reference links) correctly, unlike a hand-rolled regex.
+  const tree = mdParser.parse(text);
+  visit(tree, (node) => {
+    if (
+      node.type !== "link" &&
+      node.type !== "image" &&
+      node.type !== "definition"
+    )
+      return;
+    const reason = checkExfilUrl(node.url);
+    if (!reason) return;
+    threats.push({
+      isImage: node.type === "image",
+      reason,
+      target: urlHost(node.url),
+    });
+  });
+  // HTML attributes (not AST nodes in remark).
+  for (const { url, isImage, context } of extractHtmlUrls(text)) {
+    const reason =
+      checkExfilUrl(url) ||
+      (context !== "resource" && isOffOrigin(url)
+        ? OFF_ORIGIN_REASON[context]
+        : null);
+    if (!reason) continue;
+    threats.push({ isImage, reason, target: urlHost(url) });
+  }
+  return threats.length > 0 ? threats : null;
+}