ai-shield-core 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/dist/audit/logger.d.ts.map +1 -1
  2. package/dist/audit/logger.js +13 -14
  3. package/dist/audit/types.js +1 -2
  4. package/dist/cache/lru.js +1 -5
  5. package/dist/canary/memory.d.ts +75 -0
  6. package/dist/canary/memory.d.ts.map +1 -0
  7. package/dist/canary/memory.js +194 -0
  8. package/dist/context/wrap-context.d.ts +169 -0
  9. package/dist/context/wrap-context.d.ts.map +1 -0
  10. package/dist/context/wrap-context.js +278 -0
  11. package/dist/cost/anomaly.js +1 -4
  12. package/dist/cost/pricing.d.ts.map +1 -1
  13. package/dist/cost/pricing.js +26 -19
  14. package/dist/cost/tracker.d.ts +19 -1
  15. package/dist/cost/tracker.d.ts.map +1 -1
  16. package/dist/cost/tracker.js +27 -10
  17. package/dist/index.d.ts +34 -3
  18. package/dist/index.d.ts.map +1 -1
  19. package/dist/index.js +55 -37
  20. package/dist/judge/async-judge.d.ts +85 -0
  21. package/dist/judge/async-judge.d.ts.map +1 -0
  22. package/dist/judge/async-judge.js +146 -0
  23. package/dist/policy/circuit-breaker.d.ts +70 -0
  24. package/dist/policy/circuit-breaker.d.ts.map +1 -0
  25. package/dist/policy/circuit-breaker.js +376 -0
  26. package/dist/policy/engine.js +1 -5
  27. package/dist/policy/tools.js +4 -8
  28. package/dist/scanner/canary.js +4 -8
  29. package/dist/scanner/chain.js +1 -5
  30. package/dist/scanner/heuristic.d.ts +27 -0
  31. package/dist/scanner/heuristic.d.ts.map +1 -1
  32. package/dist/scanner/heuristic.js +118 -7
  33. package/dist/scanner/ingestion.d.ts +147 -0
  34. package/dist/scanner/ingestion.d.ts.map +1 -0
  35. package/dist/scanner/ingestion.js +520 -0
  36. package/dist/scanner/output.d.ts +73 -0
  37. package/dist/scanner/output.d.ts.map +1 -0
  38. package/dist/scanner/output.js +297 -0
  39. package/dist/scanner/pii.d.ts.map +1 -1
  40. package/dist/scanner/pii.js +24 -12
  41. package/dist/shield.d.ts.map +1 -1
  42. package/dist/shield.js +34 -26
  43. package/dist/types.d.ts +156 -2
  44. package/dist/types.d.ts.map +1 -1
  45. package/dist/types.js +1 -2
  46. package/package.json +4 -3
  47. package/src/audit/logger.ts +6 -1
  48. package/src/canary/memory.ts +259 -0
  49. package/src/context/wrap-context.ts +475 -0
  50. package/src/cost/pricing.ts +21 -9
  51. package/src/cost/tracker.ts +35 -1
  52. package/src/index.ts +113 -2
  53. package/src/judge/async-judge.ts +254 -0
  54. package/src/policy/circuit-breaker.ts +449 -0
  55. package/src/scanner/heuristic.ts +125 -2
  56. package/src/scanner/ingestion.ts +624 -0
  57. package/src/scanner/output.ts +386 -0
  58. package/src/scanner/pii.ts +21 -7
  59. package/src/shield.ts +15 -2
  60. package/src/types.ts +194 -2
  61. package/tsconfig.json +2 -1
  62. package/dist/audit/logger.js.map +0 -1
  63. package/dist/audit/types.js.map +0 -1
  64. package/dist/cache/lru.js.map +0 -1
  65. package/dist/cost/anomaly.js.map +0 -1
  66. package/dist/cost/pricing.js.map +0 -1
  67. package/dist/cost/tracker.js.map +0 -1
  68. package/dist/index.js.map +0 -1
  69. package/dist/policy/engine.js.map +0 -1
  70. package/dist/policy/tools.js.map +0 -1
  71. package/dist/scanner/canary.js.map +0 -1
  72. package/dist/scanner/chain.js.map +0 -1
  73. package/dist/scanner/heuristic.js.map +0 -1
  74. package/dist/scanner/pii.js.map +0 -1
  75. package/dist/shield.js.map +0 -1
  76. package/dist/types.js.map +0 -1
@@ -1,6 +1,69 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.HeuristicScanner = void 0;
1
+ // ============================================================
2
+ // Heuristic Prompt Injection Scanner
3
+ // Score-based: multiple matches = higher confidence
4
+ // Unicode-normalizes input before pattern matching so that
5
+ // homoglyph/zero-width/fullwidth evasion attempts still hit.
6
+ // ============================================================
7
+ // Common Cyrillic/Greek Latin-lookalikes mapped to ASCII.
8
+ // Keep minimal — false-mappings in real content are worse than
9
+ // false-negatives in an attack attempt.
10
+ const HOMOGLYPH_MAP = {
11
+ // Cyrillic
12
+ "а": "a", "е": "e", "і": "i", "ј": "j", "о": "o", "р": "p", "с": "c", "ѕ": "s",
13
+ "у": "y", "х": "x", "ԁ": "d", "һ": "h", "ӏ": "l", "ո": "n", "А": "A", "В": "B",
14
+ "Е": "E", "І": "I", "К": "K", "М": "M", "Н": "H", "О": "O", "Р": "P", "С": "C",
15
+ "Т": "T", "Х": "X", "Ѕ": "S", "Ј": "J", "Ү": "Y", "Ԛ": "Q", "Ԝ": "W", "Ғ": "F",
16
+ // Greek
17
+ "α": "a", "ο": "o", "ρ": "p", "ε": "e", "υ": "y", "χ": "x", "ν": "v", "ι": "i",
18
+ "κ": "k", "Α": "A", "Β": "B", "Ε": "E", "Ζ": "Z", "Η": "H", "Ι": "I", "Κ": "K",
19
+ "Μ": "M", "Ν": "N", "Ο": "O", "Ρ": "P", "Τ": "T", "Υ": "Y", "Χ": "X",
20
+ // Armenian / archaic-Cyrillic look-alikes occasionally used in evasion
21
+ "օ": "o", "ѵ": "v",
22
+ };
23
+ const HOMOGLYPH_RE = new RegExp(Object.keys(HOMOGLYPH_MAP).join("|"), "g");
24
+ // Zero-width chars + BOM — used to split words like "ig<ZWSP>nore" across
25
+ // the pattern boundary (U+200B..U+200D, U+2060, U+FEFF).
26
+ const ZERO_WIDTH_RE = /[​-‍⁠]/g;
27
+ // Combining marks (diacritics) left behind by NFKD decomposition (U+0300..U+036F).
28
+ const COMBINING_RE = /[̀-ͯ]/g;
29
+ /**
30
+ * Normalize input for pattern matching. Returns the canonicalized string
31
+ * used only for scan decisions; the sanitized output passed to callers
32
+ * is still the original input.
33
+ *
34
+ * Order matters:
35
+ * 1. NFKD folds compatibility forms (fullwidth → ASCII, ligatures) AND
36
+ * decomposes precomposed accented letters into base + combining mark.
37
+ * 2. Strip zero-width chars so "ig<ZWSP>nore" collapses to "ignore".
38
+ * 3. Strip combining marks (diacritics) left behind by NFKD.
39
+ * 4. Map remaining Cyrillic/Greek look-alikes to Latin.
40
+ */
41
+ export function normalizeForInjectionScan(input) {
42
+ const nfkd = input.normalize("NFKD");
43
+ const noZW = nfkd.replace(ZERO_WIDTH_RE, "");
44
+ const noCombining = noZW.replace(COMBINING_RE, "");
45
+ return noCombining.replace(HOMOGLYPH_RE, (ch) => HOMOGLYPH_MAP[ch] ?? ch);
46
+ }
47
+ /**
48
+ * Collapse letter-splitting evasion: an attacker writes `i g n o r e` or
49
+ * `i.g.n.o.r.e` or `i-g-n-o-r-e` to break the literal token "ignore" across
50
+ * separators so the regex never matches. This produces an ADDITIONAL view
51
+ * where any run of `single-letter + separator` (≥4 letters) has its
52
+ * separators removed, so the spaced form collapses back to "ignore".
53
+ *
54
+ * Run as a second pass IN ADDITION to the normal normalized text — never
55
+ * as a replacement — because collapsing is lossy (it would also fuse the
56
+ * legitimate "a b c d" list). Only single-letter groups separated by one
57
+ * space / tab / dot / dash / underscore are collapsed; multi-letter words are
58
+ * left intact, which keeps benign prose untouched.
59
+ */
60
+ export function collapseSpacedLetters(input) {
61
+ // Match ≥3 "<letter><sep>" groups closed by a final lone letter. The
62
+ // trailing `(?![A-Za-z])` stops the greedy match from swallowing the
63
+ // first letter of the next real word ("i g n o r e all" must collapse to
64
+ // "ignore all", not "ignorea ll"). Bounded, linear — no nested quantifier.
65
+ return input.replace(/(?:[A-Za-z][ \t._-]){3,}[A-Za-z](?![A-Za-z])/g, (run) => run.replace(/[ \t._-]/g, ""));
66
+ }
4
67
  const PATTERNS = [
5
68
  // --- Instruction Override (weight: 0.25 each) ---
6
69
  {
@@ -304,7 +367,7 @@ const THRESHOLDS = {
304
367
  medium: 0.3,
305
368
  high: 0.15,
306
369
  };
307
- class HeuristicScanner {
370
+ export class HeuristicScanner {
308
371
  name = "heuristic";
309
372
  patterns;
310
373
  threshold;
@@ -317,8 +380,25 @@ class HeuristicScanner {
317
380
  const start = performance.now();
318
381
  const violations = [];
319
382
  let totalScore = 0;
383
+ // Normalize once — pattern matching runs against the canonical form so
384
+ // homoglyph/zero-width evasion doesn't bypass the rules. The caller
385
+ // still sees the original input in `sanitized`.
386
+ const normalized = normalizeForInjectionScan(input);
387
+ // Second view that un-splits letter-splitting evasion ("i g n o r e").
388
+ // Only consulted when it actually differs (cheap guard), and only the
389
+ // high-value override/role/extraction/tool categories are re-tested
390
+ // against it — collapsing is lossy and the low-value framing rules
391
+ // would false-positive on collapsed prose.
392
+ const collapsed = collapseSpacedLetters(normalized);
393
+ const collapsedDiffers = collapsed !== normalized;
394
+ const SPLIT_SENSITIVE = new Set([
395
+ "instruction_override",
396
+ "role_manipulation",
397
+ "system_prompt_extraction",
398
+ "tool_abuse",
399
+ ]);
320
400
  for (const rule of this.patterns) {
321
- if (rule.pattern.test(input)) {
401
+ if (rule.pattern.test(normalized)) {
322
402
  totalScore += rule.weight;
323
403
  violations.push({
324
404
  type: "prompt_injection",
@@ -329,8 +409,24 @@ class HeuristicScanner {
329
409
  detail: `Rule ${rule.id} (${rule.category})`,
330
410
  });
331
411
  }
412
+ else if (collapsedDiffers &&
413
+ SPLIT_SENSITIVE.has(rule.category) &&
414
+ rule.pattern.test(collapsed)) {
415
+ // Matched only after un-splitting → letter-splitting evasion.
416
+ totalScore += rule.weight;
417
+ violations.push({
418
+ type: "prompt_injection",
419
+ scanner: this.name,
420
+ score: rule.weight,
421
+ threshold: this.threshold,
422
+ message: rule.description,
423
+ detail: `Rule ${rule.id} (${rule.category}, letter-splitting evasion)`,
424
+ });
425
+ }
332
426
  }
333
- // Structural signals (cumulative)
427
+ // Structural signals (cumulative) — intentionally run on the original
428
+ // input so real structural attacks (many newlines, long paddings) can
429
+ // still trip even when the textual patterns were evaded.
334
430
  const structuralScore = this.checkStructuralSignals(input);
335
431
  totalScore += structuralScore;
336
432
  // Cap at 1.0
@@ -360,6 +456,22 @@ class HeuristicScanner {
360
456
  // Very long input (potential padding attack)
361
457
  if (input.length > 5000)
362
458
  score += 0.05;
459
+ // Adversarial suffix (GCG-style): a long whitespace-free token packed
460
+ // with mixed punctuation/symbols, typically appended after the readable
461
+ // request. Conservative — needs ≥25 chars and ≥6 distinct punctuation
462
+ // marks so ordinary URLs, hashes and code tokens don't trip it.
463
+ const ADV_TOKEN_RE = /\S{25,}/g;
464
+ let advMatch;
465
+ let advCount = 0;
466
+ while ((advMatch = ADV_TOKEN_RE.exec(input)) !== null && advCount < 32) {
467
+ advCount += 1;
468
+ const tok = advMatch[0];
469
+ const distinctPunct = new Set((tok.match(/[!-/:-@[-`{-~]/g) ?? [])).size;
470
+ if (distinctPunct >= 6) {
471
+ score += 0.05;
472
+ break;
473
+ }
474
+ }
363
475
  return score;
364
476
  }
365
477
  /** Get all registered pattern IDs for testing */
@@ -371,5 +483,4 @@ class HeuristicScanner {
371
483
  return this.patterns.length;
372
484
  }
373
485
  }
374
- exports.HeuristicScanner = HeuristicScanner;
375
486
  //# sourceMappingURL=heuristic.js.map
@@ -0,0 +1,147 @@
1
+ import type { Scanner, ScannerResult, ScanContext, Violation, IngestionSource, TrustTier } from "../types.js";
2
+ /**
3
+ * Default trust-tier inferred from source.
4
+ * `user` is still untrusted in this library's threat model — a user can
5
+ * inject too — but `system` is reserved for content the developer
6
+ * controls and labels via `wrapContext()`. Every ingestion source
7
+ * (including `user`) therefore returns `"untrusted"` by default; the
8
+ * parameter is kept on the signature so future per-source overrides
9
+ * (e.g. an installer marking a specific source as trusted) don't
10
+ * require a breaking API change.
11
+ */
12
+ export declare function trustTierForSource(_source: IngestionSource): TrustTier;
13
+ /**
14
+ * Result of `scanIngested()`.
15
+ *
16
+ * Shape parallels `ScanResult` from `chain.ts` so callers can treat
17
+ * both interchangeably.
18
+ */
19
+ export interface IngestionScanResult {
20
+ safe: boolean;
21
+ decision: "allow" | "warn" | "block";
22
+ /**
23
+ * Sanitized output. When `decision === "block"` this is the empty
24
+ * string — the original content was deemed unsafe and the field name
25
+ * "sanitized" would otherwise mislead callers into using poisoned
26
+ * content. Use the source `content` argument if you need the raw input
27
+ * for logging or quarantine.
28
+ */
29
+ sanitized: string;
30
+ violations: Violation[];
31
+ source: IngestionSource;
32
+ meta: {
33
+ scanDurationMs: number;
34
+ scannersRun: string[];
35
+ /** Number of extra source-specific patterns that fired. */
36
+ sourceSpecificHits: number;
37
+ /**
38
+ * Always `false` from `scanIngested()` — ingestion scans don't go
39
+ * through the LRU cache. Field is present so callers can write a
40
+ * single result-handler for both `ScanResult` and `IngestionScanResult`.
41
+ */
42
+ cached: boolean;
43
+ };
44
+ }
45
+ export interface IngestionScannerConfig {
46
+ /** Override the per-source threshold lookup. */
47
+ threshold?: number;
48
+ /**
49
+ * Additional custom patterns to merge with the source profile's
50
+ * `extraPatterns`. Useful for org-specific markers.
51
+ */
52
+ customPatterns?: RegExp[];
53
+ /**
54
+ * Force the underlying heuristic scanner to a different strictness
55
+ * (default "high" because ingestion is always tighter than user input).
56
+ */
57
+ strictness?: "low" | "medium" | "high";
58
+ }
59
+ /**
60
+ * Scanner implementation. Composable into a `ScannerChain` when the
61
+ * caller wants ingestion to participate in the main scan flow rather
62
+ * than be invoked via the standalone `scanIngested()` helper.
63
+ *
64
+ * The scanner reads the `source` from `ScanContext` (or treats input
65
+ * as `"user"` when missing) and applies the source-specific profile.
66
+ */
67
+ export declare class IngestionScanner implements Scanner {
68
+ readonly name = "ingestion";
69
+ private readonly threshold;
70
+ private readonly customPatterns;
71
+ private readonly heuristic;
72
+ constructor(config?: IngestionScannerConfig);
73
+ scan(input: string, context: ScanContext): Promise<ScannerResult>;
74
+ }
75
+ /**
76
+ * One-shot helper. Scans `content` against the source-specific profile
77
+ * and returns a result without needing an `AIShield` instance.
78
+ *
79
+ * Use when you want a quick gate at the ingestion boundary, e.g.
80
+ * before storing a chunk into a vector DB or before passing a tool
81
+ * description into the model's context.
82
+ *
83
+ * @example
84
+ * ```ts
85
+ * import { scanIngested } from "ai-shield-core";
86
+ *
87
+ * const ragChunk = "...retrieved document text...";
88
+ * const result = await scanIngested(ragChunk, "rag");
89
+ * if (!result.safe) {
90
+ * // reject the chunk OR strip it before assembly
91
+ * logger.warn("IPI candidate", result.violations);
92
+ * }
93
+ * ```
94
+ */
95
+ export declare function scanIngested(content: string, source: IngestionSource, config?: IngestionScannerConfig): Promise<IngestionScanResult>;
96
+ /**
97
+ * Scan the runtime *result* of a tool call before it re-enters the model
98
+ * context. The dominant indirect-injection channel in agentic loops: a
99
+ * search tool surfaces a poisoned page, an MCP server returns attacker-
100
+ * controlled data, a compromised upstream API embeds instructions in its
101
+ * response. PoisonedRAG (USENIX Security 2025) showed 5 planted documents
102
+ * reach a 90% attack-success rate in million-document knowledge bases —
103
+ * the payload arrives here, not in the user prompt.
104
+ *
105
+ * Thin wrapper over `scanIngested(content, "tool-output")` that also
106
+ * stamps the originating `toolName` into every violation detail, so an
107
+ * audit log can answer "which tool returned the poisoned content?".
108
+ *
109
+ * Pair with `CircuitBreakerRegistry` when you also want to rate-limit or
110
+ * trip the tool after repeated poisoned results:
111
+ *
112
+ * @example
113
+ * ```ts
114
+ * import { scanToolOutput } from "ai-shield-core";
115
+ *
116
+ * const result = await searchTool.call(query); // untrusted
117
+ * const scan = await scanToolOutput("web_search", result);
118
+ * if (!scan.safe) {
119
+ * // drop the result OR strip it before the next model turn
120
+ * audit.warn("poisoned tool output", { tool: "web_search", v: scan.violations });
121
+ * return; // do not feed `result` back into the model
122
+ * }
123
+ * model.continue(result);
124
+ * ```
125
+ */
126
+ export declare function scanToolOutput(toolName: string, content: string, config?: IngestionScannerConfig): Promise<IngestionScanResult>;
127
+ /**
128
+ * Try to decode common obfuscation layers an attacker uses to smuggle
129
+ * an injection past pattern matchers. Returns the decoded payload when
130
+ * it looks like a successful decode, else `null`.
131
+ *
132
+ * The function deliberately runs at most ONE decode layer to avoid
133
+ * decoding amplification (a chain of `base64(base64(...))` would force
134
+ * us into deep recursion); a single-layer decode is enough to catch
135
+ * the vast majority of in-the-wild bypasses while keeping execution
136
+ * cost bounded.
137
+ *
138
+ * Heuristics:
139
+ * - Base64: contiguous run of 40+ Base64 chars, decodes to mostly
140
+ * printable ASCII or the `\u00..` C0 range stays empty.
141
+ * - Hex: 80+ hex chars in a row.
142
+ * - Percent-encoding: more than 5 `%XX` sequences.
143
+ *
144
+ * Returns the longest decoded payload when multiple candidates fire.
145
+ */
146
+ export declare function tryDecodeObfuscation(input: string): string | null;
147
+ //# sourceMappingURL=ingestion.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ingestion.d.ts","sourceRoot":"","sources":["../../src/scanner/ingestion.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,OAAO,EACP,aAAa,EACb,WAAW,EACX,SAAS,EACT,eAAe,EACf,SAAS,EACV,MAAM,aAAa,CAAC;AAoIrB;;;;;;;;;GASG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,eAAe,GAAG,SAAS,CAEtE;AAoFD;;;;;GAKG;AACH,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,OAAO,CAAC;IACd,QAAQ,EAAE,OAAO,GAAG,MAAM,GAAG,OAAO,CAAC;IACrC;;;;;;OAMG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,MAAM,EAAE,eAAe,CAAC;IACxB,IAAI,EAAE;QACJ,cAAc,EAAE,MAAM,CAAC;QACvB,WAAW,EAAE,MAAM,EAAE,CAAC;QACtB,2DAA2D;QAC3D,kBAAkB,EAAE,MAAM,CAAC;QAC3B;;;;WAIG;QACH,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC;CACH;AAED,MAAM,WAAW,sBAAsB;IACrC,gDAAgD;IAChD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B;;;OAGG;IACH,UAAU,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;CACxC;AAED;;;;;;;GAOG;AACH,qBAAa,gBAAiB,YAAW,OAAO;IAC9C,QAAQ,CAAC,IAAI,eAAe;IAC5B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAqB;IAC/C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAW;IAC1C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAmB;gBAEjC,MAAM,GAAE,sBAA2B;IAQzC,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC;CAmHxE;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAsB,YAAY,CAChC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,eAAe,EACvB,MAAM,GAAE,sBAA2B,GAClC,OAAO,CAAC,mBAAmB,CAAC,CA0B9B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACH,wBAAsB,cAAc,CAClC,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,EACf,MAAM,GAAE,sBAA2B,GAClC,OAAO,CAAC,mBAAmB,CAAC,CAa9B;AAQD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,oBAAoB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CA2EjE"}