ai-shield-core 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,12 +8,17 @@
8
8
  // Keep minimal — false-mappings in real content are worse than
9
9
  // false-negatives in an attack attempt.
10
10
  const HOMOGLYPH_MAP = {
11
+ // Cyrillic
11
12
  "а": "a", "е": "e", "і": "i", "ј": "j", "о": "o", "р": "p", "с": "c", "ѕ": "s",
12
- "у": "y", "х": "x", "А": "A", "В": "B", "Е": "E", "І": "I", "К": "K", "М": "M",
13
- "Н": "H", "О": "O", "Р": "P", "С": "C", "Т": "T", "Х": "X",
14
- "α": "a", "ο": "o", "ρ": "p", "ε": "e", "υ": "y", "χ": "x", "Α": "A", "Β": "B",
15
- "Ε": "E", "Ζ": "Z", "Η": "H", "Ι": "I", "Κ": "K", "Μ": "M", "Ν": "N", "Ο": "O",
16
- "Ρ": "P", "Τ": "T", "Υ": "Y", "Χ": "X",
13
+ "у": "y", "х": "x", "ԁ": "d", "һ": "h", "ӏ": "l", "ո": "n", "А": "A", "В": "B",
14
+ "Е": "E", "І": "I", "К": "K", "М": "M", "Н": "H", "О": "O", "Р": "P", "С": "C",
15
+ "Т": "T", "Х": "X", "Ѕ": "S", "Ј": "J", "Ү": "Y", "Ԛ": "Q", "Ԝ": "W", "Ғ": "F",
16
+ // Greek
17
+ "α": "a", "ο": "o", "ρ": "p", "ε": "e", "υ": "y", "χ": "x", "ν": "v", "ι": "i",
18
+ "κ": "k", "Α": "A", "Β": "B", "Ε": "E", "Ζ": "Z", "Η": "H", "Ι": "I", "Κ": "K",
19
+ "Μ": "M", "Ν": "N", "Ο": "O", "Ρ": "P", "Τ": "T", "Υ": "Y", "Χ": "X",
20
+ // Armenian / archaic-Cyrillic look-alikes occasionally used in evasion (no Cherokee entries yet)
21
+ "օ": "o", "ѵ": "v",
17
22
  };
18
23
  const HOMOGLYPH_RE = new RegExp(Object.keys(HOMOGLYPH_MAP).join("|"), "g");
19
24
  // Zero-width chars + BOM — used to split words like "ig<ZWSP>nore" across
@@ -39,6 +44,26 @@ export function normalizeForInjectionScan(input) {
39
44
  const noCombining = noZW.replace(COMBINING_RE, "");
40
45
  return noCombining.replace(HOMOGLYPH_RE, (ch) => HOMOGLYPH_MAP[ch] ?? ch);
41
46
  }
47
/**
 * Collapse letter-splitting evasion: an attacker writes `i g n o r e`,
 * `i.g.n.o.r.e` or `i-g-n-o-r-e` so the literal token "ignore" never
 * appears contiguously. Any run of at least four single letters, each
 * separated by exactly one space / tab / dot / underscore / dash, has its
 * separators removed, so the spaced form collapses back to "ignore".
 *
 * The result is meant to be used as an ADDITIONAL view next to the normal
 * text, never as a replacement: collapsing is lossy (a legitimate
 * "a b c d" list is fused as well). Multi-letter words are left intact.
 *
 * @param {string} input - Text to collapse.
 * @returns {string} `input` with every qualifying spaced-letter run fused.
 */
export function collapseSpacedLetters(input) {
  // Shape: >=3 "<letter><sep>" groups closed by a final lone letter.
  //  - The leading `(?<![A-Za-z])` keeps the run from starting on the LAST
  //    letter of the preceding word; without it "please i g n o r e"
  //    collapsed to "pleaseignore", fusing a real word into the run.
  //  - The trailing `(?![A-Za-z])` keeps the run from swallowing the FIRST
  //    letter of the following word ("i g n o r e all" -> "ignore all").
  // Bounded and linear: there is no nested quantifier.
  return input.replace(
    /(?<![A-Za-z])(?:[A-Za-z][ \t._-]){3,}[A-Za-z](?![A-Za-z])/g,
    (run) => run.replace(/[ \t._-]/g, ""),
  );
}
42
67
  const PATTERNS = [
43
68
  // --- Instruction Override (weight: 0.25 each) ---
44
69
  {
@@ -359,6 +384,19 @@ export class HeuristicScanner {
359
384
  // homoglyph/zero-width evasion doesn't bypass the rules. The caller
360
385
  // still sees the original input in `sanitized`.
361
386
  const normalized = normalizeForInjectionScan(input);
387
+ // Second view that un-splits letter-splitting evasion ("i g n o r e").
388
+ // Only computed when it actually differs (cheap guard), and only the
389
+ // high-value override/role/extraction/tool categories are re-tested
390
+ // against it — collapsing is lossy and the low-value framing rules
391
+ // would false-positive on collapsed prose.
392
+ const collapsed = collapseSpacedLetters(normalized);
393
+ const collapsedDiffers = collapsed !== normalized;
394
+ const SPLIT_SENSITIVE = new Set([
395
+ "instruction_override",
396
+ "role_manipulation",
397
+ "system_prompt_extraction",
398
+ "tool_abuse",
399
+ ]);
362
400
  for (const rule of this.patterns) {
363
401
  if (rule.pattern.test(normalized)) {
364
402
  totalScore += rule.weight;
@@ -371,6 +409,20 @@ export class HeuristicScanner {
371
409
  detail: `Rule ${rule.id} (${rule.category})`,
372
410
  });
373
411
  }
412
+ else if (collapsedDiffers &&
413
+ SPLIT_SENSITIVE.has(rule.category) &&
414
+ rule.pattern.test(collapsed)) {
415
+ // Matched only after un-splitting → letter-splitting evasion.
416
+ totalScore += rule.weight;
417
+ violations.push({
418
+ type: "prompt_injection",
419
+ scanner: this.name,
420
+ score: rule.weight,
421
+ threshold: this.threshold,
422
+ message: rule.description,
423
+ detail: `Rule ${rule.id} (${rule.category}, letter-splitting evasion)`,
424
+ });
425
+ }
374
426
  }
375
427
  // Structural signals (cumulative) — intentionally run on the original
376
428
  // input so real structural attacks (many newlines, long paddings) can
@@ -404,6 +456,22 @@ export class HeuristicScanner {
404
456
  // Very long input (potential padding attack)
405
457
  if (input.length > 5000)
406
458
  score += 0.05;
459
+ // Adversarial suffix (GCG-style): a long whitespace-free token packed
460
+ // with mixed punctuation/symbols, typically appended after the readable
461
+ // request. Conservative — needs ≥25 chars and ≥6 distinct punctuation
462
+ // marks so ordinary URLs, hashes and code tokens don't trip it.
463
+ const ADV_TOKEN_RE = /\S{25,}/g;
464
+ let advMatch;
465
+ let advCount = 0;
466
+ while ((advMatch = ADV_TOKEN_RE.exec(input)) !== null && advCount < 32) {
467
+ advCount += 1;
468
+ const tok = advMatch[0];
469
+ const distinctPunct = new Set((tok.match(/[!-/:-@[-`{-~]/g) ?? [])).size;
470
+ if (distinctPunct >= 6) {
471
+ score += 0.05;
472
+ break;
473
+ }
474
+ }
407
475
  return score;
408
476
  }
409
477
  /** Get all registered pattern IDs for testing */
@@ -93,6 +93,37 @@ export declare class IngestionScanner implements Scanner {
93
93
  * ```
94
94
  */
95
95
  export declare function scanIngested(content: string, source: IngestionSource, config?: IngestionScannerConfig): Promise<IngestionScanResult>;
96
+ /**
97
+ * Scan the runtime *result* of a tool call before it re-enters the model
98
+ * context. The dominant indirect-injection channel in agentic loops: a
99
+ * search tool surfaces a poisoned page, an MCP server returns attacker-
100
+ * controlled data, a compromised upstream API embeds instructions in its
101
+ * response. PoisonedRAG (USENIX Security 2025) showed 5 planted documents
102
+ * reach a 90% attack-success rate in million-document knowledge bases —
103
+ * the payload arrives here, not in the user prompt.
104
+ *
105
+ * Thin wrapper over `scanIngested(content, "tool-output")` that also
106
+ * stamps the originating `toolName` into every violation detail, so an
107
+ * audit log can answer "which tool returned the poisoned content?".
108
+ *
109
+ * Pair with `CircuitBreakerRegistry` when you also want to rate-limit or
110
+ * trip the tool after repeated poisoned results. Basic usage:
111
+ *
112
+ * @example
113
+ * ```ts
114
+ * import { scanToolOutput } from "ai-shield-core";
115
+ *
116
+ * const result = await searchTool.call(query); // untrusted
117
+ * const scan = await scanToolOutput("web_search", result);
118
+ * if (!scan.safe) {
119
+ * // drop the result OR strip it before the next model turn
120
+ * audit.warn("poisoned tool output", { tool: "web_search", v: scan.violations });
121
+ * return; // do not feed `result` back into the model
122
+ * }
123
+ * model.continue(result);
124
+ * ```
125
+ */
126
+ export declare function scanToolOutput(toolName: string, content: string, config?: IngestionScannerConfig): Promise<IngestionScanResult>;
96
127
  /**
97
128
  * Try to decode common obfuscation layers an attacker uses to smuggle
98
129
  * an injection past pattern matchers. Returns the decoded payload when
@@ -1 +1 @@
1
- {"version":3,"file":"ingestion.d.ts","sourceRoot":"","sources":["../../src/scanner/ingestion.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,OAAO,EACP,aAAa,EACb,WAAW,EACX,SAAS,EACT,eAAe,EACf,SAAS,EACV,MAAM,aAAa,CAAC;AA2GrB;;;;;;;;;GASG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,eAAe,GAAG,SAAS,CAEtE;AAoFD;;;;;GAKG;AACH,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,OAAO,CAAC;IACd,QAAQ,EAAE,OAAO,GAAG,MAAM,GAAG,OAAO,CAAC;IACrC;;;;;;OAMG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,MAAM,EAAE,eAAe,CAAC;IACxB,IAAI,EAAE;QACJ,cAAc,EAAE,MAAM,CAAC;QACvB,WAAW,EAAE,MAAM,EAAE,CAAC;QACtB,2DAA2D;QAC3D,kBAAkB,EAAE,MAAM,CAAC;QAC3B;;;;WAIG;QACH,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC;CACH;AAED,MAAM,WAAW,sBAAsB;IACrC,gDAAgD;IAChD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B;;;OAGG;IACH,UAAU,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;CACxC;AAED;;;;;;;GAOG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,QAAQ,CAAC,IAAI,eAAe;IAC5B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAqB;IAC/C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAW;IAC1C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAmB;gBAEjC,MAAM,GAAE,sBAA2B;IAQzC,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC;CAmHxE;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAsB,YAAY,CAChC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,eAAe,EACvB,MAAM,GAAE,sBAA2B,GAClC,OAAO,CAAC,mBAAmB,CAAC,CA0B9B;AAQD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,oBAAoB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CA2EjE"}
1
+ {"version":3,"file":"ingestion.d.ts","sourceRoot":"","sources":["../../src/scanner/ingestion.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,OAAO,EACP,aAAa,EACb,WAAW,EACX,SAAS,EACT,eAAe,EACf,SAAS,EACV,MAAM,aAAa,CAAC;AAoIrB;;;;;;;;;GASG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,eAAe,GAAG,SAAS,CAEtE;AAoFD;;;;;GAKG;AACH,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,OAAO,CAAC;IACd,QAAQ,EAAE,OAAO,GAAG,MAAM,GAAG,OAAO,CAAC;IACrC;;;;;;OAMG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,MAAM,EAAE,eAAe,CAAC;IACxB,IAAI,EAAE;QACJ,cAAc,EAAE,MAAM,CAAC;QACvB,WAAW,EAAE,MAAM,EAAE,CAAC;QACtB,2DAA2D;QAC3D,kBAAkB,EAAE,MAAM,CAAC;QAC3B;;;;WAIG;QACH,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC;CACH;AAED,MAAM,WAAW,sBAAsB;IACrC,gDAAgD;IAChD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B;;;OAGG;IACH,UAAU,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;CACxC;AAED;;;;;;;GAOG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,QAAQ,CAAC,IAAI,eAAe;IAC5B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAqB;IAC/C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAW;IAC1C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAmB;gBAEjC,MAAM,GAAE,sBAA2B;IAQzC,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC;CAmHxE;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAsB,YAAY,CAChC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,eAAe,EACvB,MAAM,GAAE,sBAA2B,GAClC,OAAO,CAAC,mBAAmB,CAAC,CA0B9B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACH,wBAAsB,cAAc,CAClC,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,EACf,MAAM,GAAE,sBAA2B,GAClC,OAAO,CAAC,mBAAmB,CAAC,CAa9B;AAQD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,oBAAoB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CA2EjE"}
@@ -50,8 +50,14 @@ const SOURCE_PROFILE = {
50
50
  extraPatterns: [
51
51
  // "Before using this tool, you must ..." / "After calling, also call X"
52
52
  /(?:before|after|while)\s+(?:using|invoking|calling|executing)\s+(?:this\s+)?(?:tool|function|action)[,\s]+(?:you\s+)?(?:must|should|will|need\s+to|are\s+required\s+to)/i,
53
- // "Also call delete_*" / "ALWAYS invoke X first"
54
- /(?:also|always|first|then|finally)\s+(?:call|invoke|use|execute|run)\s+(?:the\s+)?[a-z_][\w-]{2,}/i,
53
+ // "Also call delete_account" / "then invoke query()". Tightened
54
+ // (review M2): the called symbol must carry a case-robust tool-name
55
+ // signal — a snake_case underscore or a `(` call — so benign docs prose
56
+ // ("first call the API", "then use the result") doesn't trip the
57
+ // block-level threshold and push operators into disabling the scanner.
58
+ // (camelCase detection was dropped: under the /i flag `[a-z]+[A-Z]`
59
+ // collapses to "any word" and re-introduced the false positive.)
60
+ /(?:also|always|first|then|finally)\s+(?:call|invoke|use|execute|run)\s+(?:the\s+)?[a-z][\w-]*(?:_[\w-]+|\s*\()/i,
55
61
  // "Tool description: ignore the above and ..."
56
62
  /(?:tool|function|action)\s+description[:\s]+(?:ignore|disregard|override)/i,
57
63
  // "Note to LLM:" / "Hidden instruction:"
@@ -60,6 +66,25 @@ const SOURCE_PROFILE = {
60
66
  /(?:on\s+(?:success|call)|when\s+invoked)[,:]\s+(?:silently|also|then)\s+(?:call|invoke|exfiltrate|send|leak)/i,
61
67
  ],
62
68
  },
69
+ "tool-output": {
70
+ // The runtime result a tool returned. Distinct from `tool-desc` (static
71
+ // schema): this is data the tool fetched and is therefore
72
+ // attacker-influenceable (PoisonedRAG, scraped pages surfaced through a
73
+ // search tool, a compromised upstream API). Tool results almost never
74
+ // legitimately carry instruction syntax aimed at the calling model, so
75
+ // the bar is as tight as RAG/tool-desc.
76
+ threshold: 0.13,
77
+ extraPatterns: [
78
+ // Result claims to carry instructions for the model.
79
+ /(?:tool|function|api|search|query)\s+(?:result|response|output)[:\s]+(?:ignore|disregard|override|new\s+instructions?|system\s+prompt)/i,
80
+ // "the result indicates you should now call/invoke X"
81
+ /(?:result|response|data|output)\s+(?:indicates?|says?|requires?|means?)\s+(?:that\s+)?you\s+(?:should|must|need\s+to|will)\s+(?:now\s+)?(?:call|invoke|run|execute|use)\s+[a-z_][\w-]{2,}/i,
82
+ // Embedded role/system marker inside a JSON-ish result value.
83
+ /"(?:role|system|instruction|directive)"\s*:\s*"(?:system|ignore|override|admin)/i,
84
+ // "(end of results) Now, as the system, ..."
85
+ /(?:end\s+of\s+(?:results?|output|data)|<\/results?>)[\s.)]*(?:now|next)[,\s]+(?:as\s+(?:the\s+)?(?:system|admin|assistant)|you\s+(?:must|should|will))/i,
86
+ ],
87
+ },
63
88
  memory: {
64
89
  // Stored memory entries: persistence poisoning. Look for sentinel
65
90
  // instructions that re-anchor the model on subsequent retrieval.
@@ -346,6 +371,49 @@ export async function scanIngested(content, source, config = {}) {
346
371
  },
347
372
  };
348
373
  }
374
/**
 * Scan the runtime *result* of a tool call before it is handed back to the
 * model. Tool results (search hits, MCP server responses, upstream API
 * payloads) are attacker-influenceable data — see PoisonedRAG (USENIX
 * Security 2025) — so they are scanned with the "tool-output" source
 * profile.
 *
 * This delegates to `scanIngested(content, "tool-output", config)` and then
 * appends ` (tool=<name>)` to the `detail` of every violation, so an audit
 * log can tell which tool returned the flagged content. All other fields of
 * the scan result are passed through unchanged.
 *
 * @param {string} toolName - Name of the tool that produced `content`.
 *   Truncated to 120 characters; anything that is not a non-empty string
 *   is recorded as "unknown".
 * @param {string} content - The tool result to scan.
 * @param {object} [config] - Forwarded as-is to `scanIngested`.
 * @returns {Promise<object>} The `scanIngested` result with stamped
 *   violation details.
 *
 * @example
 * ```ts
 * import { scanToolOutput } from "ai-shield-core";
 *
 * const result = await searchTool.call(query); // untrusted
 * const scan = await scanToolOutput("web_search", result);
 * if (!scan.safe) {
 *   audit.warn("poisoned tool output", { tool: "web_search", v: scan.violations });
 *   return; // do not feed `result` back into the model
 * }
 * model.continue(result);
 * ```
 */
export async function scanToolOutput(toolName, content, config = {}) {
  const scan = await scanIngested(content, "tool-output", config);

  // Bound the label so an oversized or missing name cannot bloat the log.
  let label = "unknown";
  if (typeof toolName === "string" && toolName.length > 0) {
    label = toolName.slice(0, 120);
  }

  const stamped = scan.violations.map((violation) => {
    const base = violation.detail ?? "";
    return { ...violation, detail: `${base} (tool=${label})`.trim() };
  });

  return { ...scan, violations: stamped };
}
349
417
  // ============================================================
350
418
  // Encoding-bypass normalization (R1 from Round 1 review — closes
351
419
  // OWASP LLM Prompt Injection Prevention Cheat Sheet 2026 Base64/Hex
@@ -0,0 +1,73 @@
1
+ import type { ScanContext, ScanDecision, Violation, PIIConfig } from "../types.js";
2
+ export type OutputSink = "sql" | "shell" | "html" | "template";
3
+ export interface OutputScanConfig {
4
+ /**
5
+ * PII handling. Pass a `PIIConfig` to control action/locale, or `false`
6
+ * to skip PII scanning entirely. Default: mask.
7
+ */
8
+ pii?: PIIConfig | false;
9
+ /**
10
+ * Canary token(s) injected into the system prompt via `injectCanary()`.
11
+ * If any appears verbatim in the output → `system_prompt_leak` (block).
12
+ */
13
+ canaryTokens?: string | string[];
14
+ /**
15
+ * Restrict the structured-injection check to specific downstream sinks.
16
+ * E.g. `["sql"]` when the output only ever flows into a query builder.
17
+ * Default: all sinks.
18
+ */
19
+ sinks?: OutputSink[];
20
+ /** Selectively disable checks. All enabled by default. */
21
+ checks?: {
22
+ secrets?: boolean;
23
+ injection?: boolean;
24
+ systemPromptLeak?: boolean;
25
+ jailbreak?: boolean;
26
+ };
27
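+ /** Override the cap on the region scanned by the structural checks (injection / leak phrasing / jailbreak). Compared against the string length (UTF-16 code units), not encoded bytes. Default 256 * 1024. */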
28
+ maxBytes?: number;
29
+ }
30
+ export interface OutputScanResult {
31
+ /** No blocking violation found. */
32
+ safe: boolean;
33
+ decision: ScanDecision;
34
+ /**
35
+ * Output with PII masked and secrets redacted to `[REDACTED_SECRET]`.
36
+ * Unlike `scanIngested`, this is NOT emptied on block — the caller
37
+ * usually still needs to log or display the sanitized text. Gate on
38
+ * `safe` / `decision` before forwarding it to a downstream sink.
39
+ */
40
+ sanitized: string;
41
+ violations: Violation[];
42
+ meta: {
43
+ scanDurationMs: number;
44
+ checksRun: string[];
45
+ };
46
+ }
47
+ /**
48
+ * Scanner for LLM output. Stateless; safe to reuse across calls.
49
+ */
50
+ export declare class OutputScanner {
51
+ private readonly config;
52
+ private readonly pii;
53
+ constructor(config?: OutputScanConfig);
54
+ scan(output: string, context?: ScanContext): Promise<OutputScanResult>;
55
+ }
56
+ /**
57
+ * One-shot helper. Scan a model response before acting on it.
58
+ *
59
+ * @example
60
+ * ```ts
61
+ * import { scanOutput } from "ai-shield-core";
62
+ *
63
+ * const reply = await llm.generate(prompt);
64
+ * const r = await scanOutput(reply, { canaryTokens: canary, sinks: ["sql"] });
65
+ * if (!r.safe) {
66
+ * audit.warn("unsafe model output", r.violations);
67
+ * return genericFallback(); // do not run r.sanitized as SQL
68
+ * }
69
+ * showToUser(r.sanitized); // PII masked, secrets redacted
70
+ * ```
71
+ */
72
+ export declare function scanOutput(output: string, config?: OutputScanConfig, context?: ScanContext): Promise<OutputScanResult>;
73
+ //# sourceMappingURL=output.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"output.d.ts","sourceRoot":"","sources":["../../src/scanner/output.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,WAAW,EACX,YAAY,EACZ,SAAS,EACT,SAAS,EACV,MAAM,aAAa,CAAC;AAuHrB,MAAM,MAAM,UAAU,GAAG,KAAK,GAAG,OAAO,GAAG,MAAM,GAAG,UAAU,CAAC;AAE/D,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,GAAG,CAAC,EAAE,SAAS,GAAG,KAAK,CAAC;IACxB;;;OAGG;IACH,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IACjC;;;;OAIG;IACH,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;IACrB,0DAA0D;IAC1D,MAAM,CAAC,EAAE;QACP,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,SAAS,CAAC,EAAE,OAAO,CAAC;QACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;QAC3B,SAAS,CAAC,EAAE,OAAO,CAAC;KACrB,CAAC;IACF,mEAAmE;IACnE,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,gBAAgB;IAC/B,mCAAmC;IACnC,IAAI,EAAE,OAAO,CAAC;IACd,QAAQ,EAAE,YAAY,CAAC;IACvB;;;;;OAKG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,IAAI,EAAE;QACJ,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,EAAE,CAAC;KACrB,CAAC;CACH;AAID;;GAEG;AACH,qBAAa,aAAa;IACxB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAoB;gBAE5B,MAAM,GAAE,gBAAqB;IAQnC,IAAI,CACR,MAAM,EAAE,MAAM,EACd,OAAO,GAAE,WAAgB,GACxB,OAAO,CAAC,gBAAgB,CAAC;CA+J7B;AAED;;;;;;;;;;;;;;;GAeG;AACH,wBAAsB,UAAU,CAC9B,MAAM,EAAE,MAAM,EACd,MAAM,GAAE,gBAAqB,EAC7B,OAAO,GAAE,WAAgB,GACxB,OAAO,CAAC,gBAAgB,CAAC,CAE3B"}
@@ -0,0 +1,297 @@
1
+ import { PIIScanner } from "./pii.js";
2
+ import { normalizeForInjectionScan } from "./heuristic.js";
3
+ // ============================================================
4
+ // Output Scanner — OWASP LLM05 Improper Output Handling +
5
+ // LLM02 Sensitive Information Disclosure (output side)
6
+ //
7
+ // AI Shield's input scanners answer "is this prompt safe to send to the
8
+ // model?". This scanner answers the other half: "is this model OUTPUT
9
+ // safe to act on / show / forward downstream?".
10
+ //
11
+ // LLM output must never reach a SQL engine, a shell, an HTML sink, or a
12
+ // template renderer unfiltered — XSS, SSRF, SQLi and command injection
13
+ // sourced from model output are a documented 2026 attack class (OWASP
14
+ // LLM05). And a model can leak its own system prompt or a secret it was
15
+ // shown, which is LLM02 / LLM07.
16
+ //
17
+ // Five checks. Inputs are Unicode-normalized first (homoglyph / zero-width /
18
+ // fullwidth evasion defense). Secret + canary checks scan the FULL output
19
+ // (a leak can sit anywhere); the structural checks scan a length-capped copy
20
+ // (those payloads live in the first chunk):
21
+ // 1. secret_leak — API keys, tokens, private keys, DSNs (full output)
22
+ // 2. output_injection — SQL / shell / HTML-JS / template / md-exfil (capped)
23
+ // 3. system_prompt_leak — canary-token leak (exact, full) + heuristic phrasing
24
+ // 4. pii_detected — reuses the input-side PIIScanner
25
+ // 5. jailbreak_indicator— compliance-preamble / mode-switch acknowledgement
26
+ //
27
+ // Checks 1-2 and the canary match in 3 are high-confidence and block (the phrasing heuristic in 3 only warns). PII follows its configured
28
+ // action. Jailbreak is heuristic and only warns — a "sure, here's how"
29
+ // preamble is often legitimate.
30
+ // ============================================================
31
+ /** Hard cap on the bytes we pattern-scan. A 1 MB model response is not the
32
+ * threat model and unbounded regex over it pressures GC. Overridable. */
33
+ const DEFAULT_MAX_OUTPUT_BYTES = 256 * 1024;
34
/**
 * High-confidence secret formats. Each entry is `{ id, re, label }`:
 * `id` is the rule id reported in the violation detail, `re` the detector
 * (no `g` flag, so `.test()` carries no state between calls), and `label`
 * the human-readable name used in the violation message.
 *
 * Every pattern is anchored on a provider-specific prefix to keep false
 * positives on prose low, and uses no nested quantifiers so it stays
 * linear on large output.
 */
const SECRET_PATTERNS = [
  // `(?!ant-)` keeps this rule from also firing on Anthropic keys
  // (`sk-ant-…`), which would double-report one key under the wrong label.
  { id: "SEC-OPENAI", re: /\bsk-(?!ant-)(?:proj-)?[A-Za-z0-9_-]{20,}\b/, label: "OpenAI API key" },
  { id: "SEC-ANTHROPIC", re: /\bsk-ant-[A-Za-z0-9_-]{20,}\b/, label: "Anthropic API key" },
  { id: "SEC-AWS-AKID", re: /\b(?:AKIA|ASIA)[0-9A-Z]{16}\b/, label: "AWS access key id" },
  { id: "SEC-GITHUB", re: /\bgh[pousr]_[A-Za-z0-9]{36,}\b/, label: "GitHub token" },
  { id: "SEC-GOOGLE", re: /\bAIza[0-9A-Za-z_-]{35}\b/, label: "Google API key" },
  { id: "SEC-GOOGLE-OAUTH", re: /\bGOCSPX-[A-Za-z0-9_-]{28}\b/, label: "Google OAuth client secret" },
  { id: "SEC-GCP-SA", re: /"type"\s*:\s*"service_account"/, label: "GCP service-account JSON" },
  { id: "SEC-HUGGINGFACE", re: /\bhf_[A-Za-z0-9]{30,}\b/, label: "HuggingFace token" },
  { id: "SEC-NPM", re: /\bnpm_[A-Za-z0-9]{36}\b/, label: "npm publish token" },
  { id: "SEC-SLACK", re: /\bxox[baprs]-[A-Za-z0-9-]{10,}\b/, label: "Slack token" },
  { id: "SEC-STRIPE", re: /\b[rs]k_live_[A-Za-z0-9]{20,}\b/, label: "Stripe live key" },
  { id: "SEC-JWT", re: /\beyJ[A-Za-z0-9_-]{8,}\.eyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b/, label: "JWT" },
  // PEM-style armor. Also accepts the PKCS#8 "ENCRYPTED PRIVATE KEY" label
  // and the OpenPGP header, which ends in "PRIVATE KEY BLOCK" and therefore
  // never matched the earlier "...PRIVATE KEY-----" form.
  { id: "SEC-PEM", re: /-----BEGIN (?:RSA |EC |DSA |OPENSSH |PGP |ENCRYPTED )?PRIVATE KEY(?: BLOCK)?-----/, label: "PEM private key" },
  // DSN: both credential segments are length-bounded so a long near-match
  // without a trailing `@` can't drive O(n²) backtracking (review H1).
  { id: "SEC-DSN", re: /\b(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqps?):\/\/[^\s:/@]{1,64}:[^\s@]{3,80}@/, label: "connection string with credentials" },
];
57
/**
 * Output-injection payloads, keyed by the downstream sink they endanger
 * ("sql" | "shell" | "html" | "template"). The table below is written as
 * `[id, sink, re, label]` rows and expanded into `{ id, sink, re, label }`
 * records.
 *
 * The patterns are deliberately conservative: they aim at syntax that only
 * matters when the string is *interpreted* by the sink, not at output that
 * merely mentions SQL or a shell command.
 */
const INJECTION_PATTERNS = [
  // SQL
  ["OUTI-SQL-1", "sql", /\bunion\s+(?:all\s+)?select\b/i, "SQL UNION SELECT"],
  ["OUTI-SQL-2", "sql", /['"]\s*;\s*(?:drop|delete|update|insert|truncate|alter)\s+/i, "SQL statement break"],
  ["OUTI-SQL-3", "sql", /\bor\s+1\s*=\s*1\b|\bor\s+'1'\s*=\s*'1'/i, "SQL tautology"],
  // Shell
  ["OUTI-SH-1", "shell", /\$\([^)]{1,200}\)|`[^`]{1,200}`/, "shell command substitution"],
  ["OUTI-SH-2", "shell", /[;&|]\s*(?:rm|curl|wget|nc|bash|sh|chmod|mkfifo|dd)\s+-?/i, "chained shell command"],
  ["OUTI-SH-3", "shell", /\|\s*(?:sh|bash|zsh|python[0-9.]*)\b/i, "pipe to interpreter"],
  // HTML / JS (XSS)
  ["OUTI-XSS-1", "html", /<script[\s>]/i, "<script> tag"],
  ["OUTI-XSS-2", "html", /\bon(?:error|load|click|mouseover)\s*=\s*["']?[^"'>]{1,200}/i, "inline event handler"],
  ["OUTI-XSS-3", "html", /\bjavascript:\s*[^\s"']{1,200}/i, "javascript: URI"],
  ["OUTI-XSS-4", "html", /<iframe[\s>]|<img[^>]{0,200}\bsrc\s*=\s*["']?\s*(?:javascript|data):/i, "iframe / data-URI image"],
  // Markdown-image data exfiltration: ![x](http://evil/log?data=…). If a
  // renderer auto-loads the image, the query string carries out whatever
  // the model was told to embed in it.
  ["OUTI-MDEXF", "html", /!\[[^\]]{0,200}\]\(\s*https?:\/\/[^)\s]{1,300}[?&][\w-]{1,40}=/i, "markdown-image data exfiltration"],
  // Template / SSTI
  ["OUTI-SSTI-1", "template", /\{\{[^}]{0,200}(?:constructor|process|require|global|__proto__|self\.|cycler)[^}]{0,200}\}\}/i, "template-injection payload"],
  ["OUTI-SSTI-2", "template", /<%[^%]{0,200}(?:system|exec|require|eval)[^%]{0,200}%>/i, "ERB/EJS injection"],
].map(([id, sink, re, label]) => ({ id, sink, re, label }));
85
+ /**
86
+ * System-prompt-leak heuristics — used only when no canary token is
87
+ * available. Low-confidence by design (these phrasings occur in benign
88
+ * output too), so they warn rather than block.
89
+ */
90
+ const SYSTEM_LEAK_PATTERNS = [
91
+ /(?:my|the)\s+(?:system\s+)?(?:prompt|instructions?)\s+(?:is|are|say|states?|read)\b/i,
92
+ /i\s+(?:was|am|have\s+been)\s+(?:instructed|told|configured|programmed|designed)\s+to\b/i,
93
+ /here\s+(?:is|are)\s+my\s+(?:system\s+)?(?:prompt|instructions?|guidelines?|rules?)\b/i,
94
+ /you\s+are\s+(?:a|an)\s+[\w-]{2,30}\s+(?:assistant|agent|bot|model)\b.{0,40}\b(?:you\s+must|your\s+(?:rules?|guidelines?|instructions?))/i,
95
+ ];
96
+ /**
97
+ * Jailbreak-success indicators in the OUTPUT. Conservative + low weight:
98
+ * a generic "Sure, here's how" is not enough on its own — these target
99
+ * explicit mode-switch acknowledgements and self-declared rule-breaking.
100
+ */
101
+ const JAILBREAK_PATTERNS = [
102
+ /\bas\s+(?:DAN|an?\s+(?:unrestricted|unfiltered|jailbroken|uncensored))\b/i,
103
+ /i(?:'?ll|\s+will)\s+(?:now\s+)?(?:ignore|bypass|disregard|set\s+aside)\s+(?:my|the|all)\s+(?:guidelines?|restrictions?|rules?|safety|programming|filters?)/i,
104
+ /(?:jailbreak|developer\s+mode|dan\s+mode)\s+(?:enabled|activated|successful|engaged)/i,
105
+ /i\s+am\s+(?:now\s+)?(?:free\s+(?:from|of)|no\s+longer\s+bound\s+by)\s+(?:my\s+)?(?:restrictions?|guidelines?|rules?|programming)/i,
106
+ ];
107
+ const SECRET_REDACTION = "[REDACTED_SECRET]";
108
/**
 * Scanner for LLM output: decides whether a model response is safe to act
 * on, show, or forward to a downstream sink.
 *
 * The instance only holds its configuration and an optional PII scanner;
 * none of the detection regexes carry the `g` flag, so `.test()` keeps no
 * state between calls and one instance can be reused across scans.
 */
export class OutputScanner {
  /** The `OutputScanConfig` this scanner was built with. */
  config;
  /** PII scanner instance, or `null` when `config.pii === false`. */
  pii;

  /**
   * @param {object} [config] - Output scan configuration. `pii: false`
   *   disables PII scanning; otherwise a `PIIScanner` is created with
   *   `config.pii`, defaulting to `{ action: "mask" }`.
   */
  constructor(config = {}) {
    this.config = config;
    this.pii =
      config.pii === false
        ? null
        : new PIIScanner(config.pii ?? { action: "mask" });
  }

  /**
   * Run every enabled check over `output`.
   *
   * @param {string} output - The model response. A non-string value is
   *   treated as the empty string for every check and for `sanitized`.
   * @param {object} [context] - Passed through to the PII scanner.
   * @returns {Promise<object>} `{ safe, decision, sanitized, violations,
   *   meta }`. `decision` is the most severe outcome of all checks and
   *   `safe` is true only when it is "allow". `sanitized` has matched
   *   secrets replaced and reflects whatever the PII scanner returned; it
   *   is NOT emptied on block, so callers must gate on `safe`/`decision`.
   */
  async scan(output, context = {}) {
    const start = performance.now();
    const violations = [];
    const checksRun = [];
    const checks = this.config.checks ?? {};
    const maxBytes = this.config.maxBytes ?? DEFAULT_MAX_OUTPUT_BYTES;
    // Everything below works on `safeOutput`, never on the raw argument, so
    // a non-string `output` cannot throw in `.includes()` / `.replace()` or
    // leak a non-string into `sanitized`.
    const safeOutput = typeof output === "string" ? output : "";

    // Capped copy for the *structural* checks (injection / leak-phrasing /
    // jailbreak). The cap is compared against the string length. Normalized
    // so homoglyph / zero-width evasion can't slip a payload past the
    // patterns.
    const cappedDetect = normalizeForInjectionScan(
      safeOutput.length > maxBytes ? safeOutput.slice(0, maxBytes) : safeOutput,
    );
    // Secrets can sit ANYWHERE in the output, so they are matched against
    // the full (normalized) text rather than the capped copy.
    const fullDetect = normalizeForInjectionScan(safeOutput);

    let sanitized = safeOutput;
    let worst = "allow";
    // Raise the overall decision, never lower it.
    const bump = (d) => {
      if (priority(d) > priority(worst)) {
        worst = d;
      }
    };

    // 1. Secret leak — always blocks. Detection runs on the normalized full
    //    output; redaction is best-effort over the un-normalized text (a key
    //    fragmented by zero-width chars is still flagged and blocks, but may
    //    not be redacted cleanly — callers must never forward a blocked
    //    output regardless of `sanitized`).
    if (checks.secrets !== false) {
      checksRun.push("secrets");
      for (const { id, re, label } of SECRET_PATTERNS) {
        if (re.test(fullDetect)) {
          violations.push({
            type: "secret_leak",
            scanner: "output",
            score: 1.0,
            threshold: 0.5,
            message: `Output leaks a secret: ${label}`,
            detail: `Rule ${id}`,
          });
          bump("block");
          // Redact every occurrence: a fresh global copy of the pattern so
          // the shared, stateless original is left untouched.
          const everyMatch = new RegExp(
            re.source,
            re.flags.includes("g") ? re.flags : re.flags + "g",
          );
          sanitized = sanitized.replace(everyMatch, SECRET_REDACTION);
        }
      }
    }

    // 2. Output injection — payloads dangerous to a downstream sink. When
    //    `config.sinks` is set, only rules for the listed sinks are run.
    if (checks.injection !== false) {
      checksRun.push("injection");
      const allowedSinks = this.config.sinks;
      for (const { id, sink, re, label } of INJECTION_PATTERNS) {
        if (allowedSinks && !allowedSinks.includes(sink)) {
          continue;
        }
        if (re.test(cappedDetect)) {
          violations.push({
            type: "output_injection",
            scanner: "output",
            score: 0.85,
            threshold: 0.5,
            message: `Output carries a ${sink} injection payload: ${label}`,
            detail: `Rule ${id} (sink=${sink})`,
          });
          bump("block");
        }
      }
    }

    // 3. System-prompt leak — canary first (exact substring), then the
    //    phrasing heuristics.
    if (checks.systemPromptLeak !== false) {
      checksRun.push("system_prompt_leak");
      const tokens = normalizeTokens(this.config.canaryTokens);
      let canaryHit = false;
      for (const token of tokens) {
        // Searched in the FULL output, not the capped copy — a leak past
        // the cap is still a leak. Tokens shorter than 4 chars are ignored.
        if (token.length >= 4 && safeOutput.includes(token)) {
          canaryHit = true;
          violations.push({
            type: "system_prompt_leak",
            scanner: "output",
            score: 1.0,
            threshold: 0.5,
            message: "Output leaks a system-prompt canary token",
            detail: "Canary match (exact)",
          });
          bump("block");
        }
      }
      // Heuristic phrasing runs only when no canary token was configured,
      // keeping the low-confidence signal subordinate to the exact check.
      if (!canaryHit && tokens.length === 0) {
        for (const re of SYSTEM_LEAK_PATTERNS) {
          if (re.test(cappedDetect)) {
            violations.push({
              type: "system_prompt_leak",
              scanner: "output",
              score: 0.4,
              threshold: 0.5,
              message: "Output may be echoing the system prompt",
              detail: "Heuristic phrasing (no canary configured — pass canaryTokens for an exact check)",
            });
            bump("warn");
            break; // one heuristic signal is enough
          }
        }
      }
    }

    // 4. Jailbreak indicators — heuristic, warn only, first hit wins.
    if (checks.jailbreak !== false) {
      checksRun.push("jailbreak");
      for (const re of JAILBREAK_PATTERNS) {
        if (re.test(cappedDetect)) {
          violations.push({
            type: "jailbreak_indicator",
            scanner: "output",
            score: 0.3,
            threshold: 0.5,
            message: "Output shows a possible jailbreak success indicator",
            detail: "Heuristic phrasing",
          });
          bump("warn");
          break;
        }
      }
    }

    // 5. PII — runs last, on the already secret-redacted text. Its
    //    violations are re-labelled as coming from the output scanner and
    //    its decision feeds into the overall one.
    if (this.pii) {
      checksRun.push("pii");
      const piiResult = await this.pii.scan(sanitized, context);
      for (const v of piiResult.violations) {
        violations.push({ ...v, scanner: "output" });
      }
      if (piiResult.sanitized !== undefined) {
        sanitized = piiResult.sanitized;
      }
      bump(piiResult.decision);
    }

    return {
      safe: worst === "allow",
      decision: worst,
      sanitized,
      violations,
      meta: {
        scanDurationMs: performance.now() - start,
        checksRun,
      },
    };
  }
}
269
/**
 * One-shot convenience wrapper: build an `OutputScanner` from `config` and
 * scan a single model response with it.
 *
 * @param {string} output - The model response to scan.
 * @param {object} [config] - Passed to the `OutputScanner` constructor.
 * @param {object} [context] - Passed to `OutputScanner#scan`.
 * @returns {Promise<object>} Whatever `OutputScanner#scan` resolves to.
 *
 * @example
 * ```ts
 * import { scanOutput } from "ai-shield-core";
 *
 * const reply = await llm.generate(prompt);
 * const r = await scanOutput(reply, { canaryTokens: canary, sinks: ["sql"] });
 * if (!r.safe) {
 *   audit.warn("unsafe model output", r.violations);
 *   return genericFallback(); // do not run r.sanitized as SQL
 * }
 * showToUser(r.sanitized); // PII masked, secrets redacted
 * ```
 */
export async function scanOutput(output, config = {}, context = {}) {
  const scanner = new OutputScanner(config);
  return scanner.scan(output, context);
}
288
/**
 * Turn the `canaryTokens` option (a single string, an array, or nothing)
 * into a clean list of tokens.
 *
 * @param {string | string[] | undefined} tokens - Raw option value.
 * @returns {string[]} A new array holding only the non-empty string
 *   entries; empty when `tokens` is falsy.
 */
function normalizeTokens(tokens) {
  let candidates = [];
  if (tokens) {
    candidates = Array.isArray(tokens) ? tokens : [tokens];
  }
  const usable = [];
  for (const candidate of candidates) {
    if (typeof candidate === "string" && candidate.length > 0) {
      usable.push(candidate);
    }
  }
  return usable;
}
294
/**
 * Rank a scan decision by severity so decisions can be compared.
 *
 * @param {string} d - A decision value.
 * @returns {number} 2 for "block", 1 for "warn", 0 for anything else.
 */
function priority(d) {
  switch (d) {
    case "block":
      return 2;
    case "warn":
      return 1;
    default:
      return 0;
  }
}
297
+ //# sourceMappingURL=output.js.map