ai-shield-core 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audit/logger.d.ts.map +1 -1
- package/dist/audit/logger.js +13 -14
- package/dist/audit/types.js +1 -2
- package/dist/cache/lru.js +1 -5
- package/dist/canary/memory.d.ts +75 -0
- package/dist/canary/memory.d.ts.map +1 -0
- package/dist/canary/memory.js +194 -0
- package/dist/context/wrap-context.d.ts +169 -0
- package/dist/context/wrap-context.d.ts.map +1 -0
- package/dist/context/wrap-context.js +278 -0
- package/dist/cost/anomaly.js +1 -4
- package/dist/cost/pricing.d.ts.map +1 -1
- package/dist/cost/pricing.js +26 -19
- package/dist/cost/tracker.d.ts +19 -1
- package/dist/cost/tracker.d.ts.map +1 -1
- package/dist/cost/tracker.js +27 -10
- package/dist/index.d.ts +34 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +55 -37
- package/dist/judge/async-judge.d.ts +85 -0
- package/dist/judge/async-judge.d.ts.map +1 -0
- package/dist/judge/async-judge.js +146 -0
- package/dist/policy/circuit-breaker.d.ts +70 -0
- package/dist/policy/circuit-breaker.d.ts.map +1 -0
- package/dist/policy/circuit-breaker.js +376 -0
- package/dist/policy/engine.js +1 -5
- package/dist/policy/tools.js +4 -8
- package/dist/scanner/canary.js +4 -8
- package/dist/scanner/chain.js +1 -5
- package/dist/scanner/heuristic.d.ts +27 -0
- package/dist/scanner/heuristic.d.ts.map +1 -1
- package/dist/scanner/heuristic.js +118 -7
- package/dist/scanner/ingestion.d.ts +147 -0
- package/dist/scanner/ingestion.d.ts.map +1 -0
- package/dist/scanner/ingestion.js +520 -0
- package/dist/scanner/output.d.ts +73 -0
- package/dist/scanner/output.d.ts.map +1 -0
- package/dist/scanner/output.js +297 -0
- package/dist/scanner/pii.d.ts.map +1 -1
- package/dist/scanner/pii.js +24 -12
- package/dist/shield.d.ts.map +1 -1
- package/dist/shield.js +34 -26
- package/dist/types.d.ts +156 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +1 -2
- package/package.json +4 -3
- package/src/audit/logger.ts +6 -1
- package/src/canary/memory.ts +259 -0
- package/src/context/wrap-context.ts +475 -0
- package/src/cost/pricing.ts +21 -9
- package/src/cost/tracker.ts +35 -1
- package/src/index.ts +113 -2
- package/src/judge/async-judge.ts +254 -0
- package/src/policy/circuit-breaker.ts +449 -0
- package/src/scanner/heuristic.ts +125 -2
- package/src/scanner/ingestion.ts +624 -0
- package/src/scanner/output.ts +386 -0
- package/src/scanner/pii.ts +21 -7
- package/src/shield.ts +15 -2
- package/src/types.ts +194 -2
- package/tsconfig.json +2 -1
- package/dist/audit/logger.js.map +0 -1
- package/dist/audit/types.js.map +0 -1
- package/dist/cache/lru.js.map +0 -1
- package/dist/cost/anomaly.js.map +0 -1
- package/dist/cost/pricing.js.map +0 -1
- package/dist/cost/tracker.js.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/policy/engine.js.map +0 -1
- package/dist/policy/tools.js.map +0 -1
- package/dist/scanner/canary.js.map +0 -1
- package/dist/scanner/chain.js.map +0 -1
- package/dist/scanner/heuristic.js.map +0 -1
- package/dist/scanner/pii.js.map +0 -1
- package/dist/shield.js.map +0 -1
- package/dist/types.js.map +0 -1
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
import { HeuristicScanner, normalizeForInjectionScan } from "./heuristic.js";
|
|
2
|
+
// ============================================================
|
|
3
|
+
// Ingestion Scanner — Indirect Prompt Injection (IPI) Defense
|
|
4
|
+
//
|
|
5
|
+
// Scans non-user content (RAG chunks, MCP tool descriptions, stored
|
|
6
|
+
// memory facts, scraped web pages, agent-to-agent messages) for
|
|
7
|
+
// instruction-shaped payloads BEFORE they enter the model context.
|
|
8
|
+
//
|
|
9
|
+
// Per Lakera 2026 incident catalog + OWASP LLM01:2025, indirect
|
|
10
|
+
// injection is now the dominant attack class — >55% of observed
|
|
11
|
+
// incidents arrive through trusted-looking data channels. Direct user
|
|
12
|
+
// injection is the minority case.
|
|
13
|
+
//
|
|
14
|
+
// This scanner runs the existing heuristic patterns at a stricter
|
|
15
|
+
// threshold AND adds source-specific patterns the user channel does
|
|
16
|
+
// not see (HTML-comment instructions, tool-description override,
|
|
17
|
+
// memory-entry steering).
|
|
18
|
+
// ============================================================
|
|
19
|
+
/**
|
|
20
|
+
* Per-source threshold + extra patterns. Tighter than the user-channel
|
|
21
|
+
* default because data sources almost never need instruction syntax —
|
|
22
|
+
* the presence of one in retrieved content is itself a signal.
|
|
23
|
+
*/
|
|
24
|
+
const SOURCE_PROFILE = {
|
|
25
|
+
user: {
|
|
26
|
+
// For symmetry — a caller may pass source="user" through scanIngested().
|
|
27
|
+
// Falls back to normal heuristic strictness.
|
|
28
|
+
threshold: 0.3,
|
|
29
|
+
extraPatterns: [],
|
|
30
|
+
},
|
|
31
|
+
rag: {
|
|
32
|
+
// Retrieved docs basically never legitimately contain "ignore previous
|
|
33
|
+
// instructions" type wording. Strict. Patterns here are ReDoS-bounded —
|
|
34
|
+
// hidden-comment + CSS detection are done via slice-then-test below in
|
|
35
|
+
// `runStructuralIngestionScan()` so we never run two `[^]*?` wildcards
|
|
36
|
+
// back-to-back over attacker-controlled input.
|
|
37
|
+
threshold: 0.15,
|
|
38
|
+
extraPatterns: [
|
|
39
|
+
// "AI assistant note: ..." style trojan headers.
|
|
40
|
+
/(?:AI\s+(?:assistant|model)\s+(?:note|instruction|directive)|attention\s+(?:AI|model|assistant))[:\s]+/i,
|
|
41
|
+
// Document-claims-to-be-system marker.
|
|
42
|
+
/this\s+document\s+(?:is|contains|provides)\s+(?:your|the)\s+(?:new\s+)?(?:instructions?|system\s+prompt|directives?)/i,
|
|
43
|
+
],
|
|
44
|
+
},
|
|
45
|
+
"tool-desc": {
|
|
46
|
+
// MCP tool descriptions / function schemas: any imperative aimed at
|
|
47
|
+
// the calling model is suspicious. OX Security MCP advisory + LiteLLM
|
|
48
|
+
// CVE-2025-69256 class.
|
|
49
|
+
threshold: 0.12,
|
|
50
|
+
extraPatterns: [
|
|
51
|
+
// "Before using this tool, you must ..." / "After calling, also call X"
|
|
52
|
+
/(?:before|after|while)\s+(?:using|invoking|calling|executing)\s+(?:this\s+)?(?:tool|function|action)[,\s]+(?:you\s+)?(?:must|should|will|need\s+to|are\s+required\s+to)/i,
|
|
53
|
+
// "Also call delete_account" / "then invoke query()". Tightened
|
|
54
|
+
// (review M2): the called symbol must carry a case-robust tool-name
|
|
55
|
+
// signal — a snake_case underscore or a `(` call — so benign docs prose
|
|
56
|
+
// ("first call the API", "then use the result") doesn't trip the
|
|
57
|
+
// block-level threshold and push operators into disabling the scanner.
|
|
58
|
+
// (camelCase detection was dropped: under the /i flag `[a-z]+[A-Z]`
|
|
59
|
+
// collapses to "any word" and re-introduced the false positive.)
|
|
60
|
+
/(?:also|always|first|then|finally)\s+(?:call|invoke|use|execute|run)\s+(?:the\s+)?[a-z][\w-]*(?:_[\w-]+|\s*\()/i,
|
|
61
|
+
// "Tool description: ignore the above and ..."
|
|
62
|
+
/(?:tool|function|action)\s+description[:\s]+(?:ignore|disregard|override)/i,
|
|
63
|
+
// "Note to LLM:" / "Hidden instruction to model:"
|
|
64
|
+
/(?:note|hidden\s+(?:instruction|directive|note))\s+to\s+(?:LLM|model|assistant|AI|agent)[:\s]/i,
|
|
65
|
+
// Tool args carrying meta-instructions.
|
|
66
|
+
/(?:on\s+(?:success|call)|when\s+invoked)[,:]\s+(?:silently|also|then)\s+(?:call|invoke|exfiltrate|send|leak)/i,
|
|
67
|
+
],
|
|
68
|
+
},
|
|
69
|
+
"tool-output": {
|
|
70
|
+
// The runtime result a tool returned. Distinct from `tool-desc` (static
|
|
71
|
+
// schema): this is data the tool fetched and is therefore
|
|
72
|
+
// attacker-influenceable (PoisonedRAG, scraped pages surfaced through a
|
|
73
|
+
// search tool, a compromised upstream API). Tool results almost never
|
|
74
|
+
// legitimately carry instruction syntax aimed at the calling model, so
|
|
75
|
+
// the bar is as tight as RAG/tool-desc.
|
|
76
|
+
threshold: 0.13,
|
|
77
|
+
extraPatterns: [
|
|
78
|
+
// Result claims to carry instructions for the model.
|
|
79
|
+
/(?:tool|function|api|search|query)\s+(?:result|response|output)[:\s]+(?:ignore|disregard|override|new\s+instructions?|system\s+prompt)/i,
|
|
80
|
+
// "the result indicates you should now call/invoke X"
|
|
81
|
+
/(?:result|response|data|output)\s+(?:indicates?|says?|requires?|means?)\s+(?:that\s+)?you\s+(?:should|must|need\s+to|will)\s+(?:now\s+)?(?:call|invoke|run|execute|use)\s+[a-z_][\w-]{2,}/i,
|
|
82
|
+
// Embedded role/system marker inside a JSON-ish result value.
|
|
83
|
+
/"(?:role|system|instruction|directive)"\s*:\s*"(?:system|ignore|override|admin)/i,
|
|
84
|
+
// "(end of results) Now, as the system, ..."
|
|
85
|
+
/(?:end\s+of\s+(?:results?|output|data)|<\/results?>)[\s.)]*(?:now|next)[,\s]+(?:as\s+(?:the\s+)?(?:system|admin|assistant)|you\s+(?:must|should|will))/i,
|
|
86
|
+
],
|
|
87
|
+
},
|
|
88
|
+
memory: {
|
|
89
|
+
// Stored memory entries: persistence poisoning. Look for sentinel
|
|
90
|
+
// instructions that re-anchor the model on subsequent retrieval.
|
|
91
|
+
threshold: 0.18,
|
|
92
|
+
extraPatterns: [
|
|
93
|
+
// "Important note for next session:" / "Critical instruction for all conversations:"
|
|
94
|
+
/(?:remember|important|critical)\s+(?:note|instruction|directive)\s+(?:for\s+(?:next|future|all)\s+(?:sessions?|conversations?|calls?))[:\s]/i,
|
|
95
|
+
// "Whenever the user asks X, do Y"
|
|
96
|
+
/(?:whenever|every\s+time)\s+(?:the\s+user|a\s+user|someone)\s+(?:asks|says|mentions|requests)\s+.{1,80}?[,:]\s*(?:you\s+(?:must|should|will|need)|always)/i,
|
|
97
|
+
// "User's true preference is ..." (steering attempts).
|
|
98
|
+
/(?:user(?:'s|s)?\s+(?:real|true|actual|hidden)\s+(?:preference|intent|goal|name|identity))/i,
|
|
99
|
+
// "Override default behavior when ..."
|
|
100
|
+
/override\s+(?:default|standard|normal)\s+(?:behavior|response|policy)/i,
|
|
101
|
+
],
|
|
102
|
+
},
|
|
103
|
+
web: {
|
|
104
|
+
// Scraped web — same as RAG but also catch markdown-link hijacks.
|
|
105
|
+
// HTML-comment + CSS-hidden detection lives in
|
|
106
|
+
// `runStructuralIngestionScan()` (slice-then-test, ReDoS-bounded).
|
|
107
|
+
threshold: 0.15,
|
|
108
|
+
extraPatterns: [
|
|
109
|
+
// Markdown-link with instruction-shaped anchor text.
|
|
110
|
+
/\[(?:ignore|disregard|override|system\s+(?:prompt|message))[^\]]{0,200}\]\([^)]{0,500}\)/i,
|
|
111
|
+
// ARIA / data-* attributes leaking instructions.
|
|
112
|
+
/(?:aria-label|alt|title|data-[a-z-]{0,40})\s*=\s*["'][^"']{0,500}(ignore\s+previous|new\s+instruction|system\s+prompt|override)/i,
|
|
113
|
+
],
|
|
114
|
+
},
|
|
115
|
+
"agent-output": {
|
|
116
|
+
// Output of one agent feeding another: multi-agent contagion.
|
|
117
|
+
// Treat like RAG but also catch "tell next agent to ..." patterns.
|
|
118
|
+
threshold: 0.18,
|
|
119
|
+
extraPatterns: [
|
|
120
|
+
/(?:tell|instruct|forward\s+to)\s+(?:the\s+)?(?:next|downstream|receiving|other)\s+(?:agent|model|assistant)\s+to/i,
|
|
121
|
+
/(?:on\s+behalf\s+of|impersonating)\s+(?:the\s+)?(?:user|admin|system|owner)/i,
|
|
122
|
+
/(?:relay|pass|propagate)\s+(?:these|the\s+following)\s+(?:instructions?|directives?|orders?)/i,
|
|
123
|
+
],
|
|
124
|
+
},
|
|
125
|
+
};
|
|
126
|
+
/**
 * Resolve the default trust tier for an ingestion source.
 *
 * Every source — `user` included — currently maps to `"untrusted"`; the
 * argument is accepted but not inspected. It stays on the signature so
 * that a later per-source override (for example an installer marking one
 * source as trusted) can be added without a breaking API change.
 *
 * @param _source Ingestion source label. Ignored by this implementation.
 * @returns Always the string `"untrusted"`.
 */
export function trustTierForSource(_source) {
    const defaultTier = "untrusted";
    return defaultTier;
}
|
|
139
|
+
// --- ReDoS-safe structural scan helpers ---
|
|
140
|
+
/**
 * Structural (non-regex-compound) detectors for hidden instructions in
 * `rag` and `web` content. Any other source returns an empty list.
 *
 * Both detectors use bounded slice-then-test instead of a compound
 * `[^]*?...[^]*?` regex, so the work per probe is capped even when the
 * attacker omits a terminator:
 *
 *  1. HTML comments: for each `<!--`, the text up to the first `-->`
 *     inside a 2 KB window is tested for instruction keywords. When no
 *     terminator appears inside the window (unterminated or oversized
 *     comment) the whole window is tested. Text AFTER the terminator is
 *     visible page content, not hidden comment text, so it is deliberately
 *     excluded — otherwise `<!-- nav -->` followed by ordinary prose such
 *     as "don't forget to subscribe" is reported as a hidden instruction.
 *     Reporting stops after 8 comment hits.
 *  2. CSS-hidden elements: every `style="..."` attribute that sets
 *     `display:none`, `visibility:hidden` or `font-size:0` (first 16
 *     matches) has the 500 characters following it tested for
 *     instruction-shaped words.
 *
 * @param normalized Input text; the caller passes the output of
 *   `normalizeForInjectionScan()`.
 * @param source Ingestion source label; only `"rag"` and `"web"` are scanned.
 * @param threshold Effective threshold, copied verbatim into each violation.
 * @returns Array of violation objects (possibly empty), each scored 0.4.
 */
function runStructuralIngestionScan(normalized, source, threshold) {
    if (source !== "rag" && source !== "web")
        return [];
    const COMMENT_OPEN = "<!--";
    const COMMENT_CLOSE = "-->";
    const COMMENT_WINDOW = 2048;
    const MAX_COMMENT_HITS = 8;
    const MAX_STYLE_MATCHES = 16;
    const STYLE_TAIL_WINDOW = 500;
    const KEYWORD_RE = /ignore|disregard|override|forget|system\s+prompt|new\s+instructions?/i;
    const violations = [];
    // 1. HTML comment hidden instruction.
    let commentHits = 0;
    let cursor = normalized.indexOf(COMMENT_OPEN);
    while (cursor !== -1 && commentHits < MAX_COMMENT_HITS) {
        const bodyStart = cursor + COMMENT_OPEN.length;
        const window = normalized.slice(bodyStart, bodyStart + COMMENT_WINDOW);
        // Search for the terminator inside the bounded window only, so an
        // unterminated comment never triggers a scan of the whole remaining input.
        const closeAt = window.indexOf(COMMENT_CLOSE);
        const commentBody = closeAt === -1 ? window : window.slice(0, closeAt);
        if (KEYWORD_RE.test(commentBody)) {
            violations.push({
                type: "ingested_injection",
                scanner: "ingestion",
                score: 0.4,
                threshold,
                message: `HTML-comment hidden instruction in ${source} content`,
                detail: `Pattern: <!-- ... ignore|override|... (window 2KB)`,
            });
            commentHits += 1;
        }
        cursor = normalized.indexOf(COMMENT_OPEN, bodyStart);
    }
    // 2. CSS-hidden style attribute followed by instruction-shaped text.
    //    The /g flag walks every `style=` attribute (a benign one placed first
    //    must not shadow a malicious one later); the match cap bounds the work
    //    on input that floods style attributes. The regex is created per call,
    //    so `lastIndex` always starts at 0.
    const STYLE_HIDDEN_RE = /style\s*=\s*["'][^"']{0,300}(?:display\s*:\s*none|visibility\s*:\s*hidden|font-size\s*:\s*0)[^"']{0,300}["']/gi;
    let styleMatchCount = 0;
    let styleMatch;
    while (styleMatchCount < MAX_STYLE_MATCHES &&
        (styleMatch = STYLE_HIDDEN_RE.exec(normalized)) !== null) {
        styleMatchCount += 1;
        const tailStart = styleMatch.index + styleMatch[0].length;
        const tail = normalized.slice(tailStart, tailStart + STYLE_TAIL_WINDOW);
        if (/ignore|override|system|instruction/i.test(tail)) {
            violations.push({
                type: "ingested_injection",
                scanner: "ingestion",
                score: 0.4,
                threshold,
                message: `CSS-hidden instruction in ${source} content`,
                detail: `Pattern: style="display:none ... ignore|override|... (window 500B)`,
            });
        }
    }
    return violations;
}
|
|
204
|
+
/**
 * Scanner for ingested (non-user-typed) content. Composable into a
 * `ScannerChain` when ingestion should participate in the main scan flow
 * instead of being invoked through the standalone `scanIngested()` helper.
 *
 * `scan()` reads `source` from the scan context (defaulting to `"user"`
 * when the context or its `source` is missing), looks up the matching
 * entry in `SOURCE_PROFILE`, and combines four signals:
 *
 *  1. the base `HeuristicScanner` (strictness `"high"` unless configured),
 *  2. the profile's `extraPatterns` plus any `customPatterns`, tested
 *     against the normalized input (each hit adds 0.4 to the source score),
 *  3. the structural rag/web detectors (each violation adds 0.4),
 *  4. one decode pass (`tryDecodeObfuscation`) whose output is re-run
 *     through the heuristic scanner (+0.6 when it does not allow) and
 *     through the same patterns (+0.4 per hit).
 *
 * The source score is clamped to 1.0. The result blocks when the
 * heuristic scanner blocks or the source score reaches the effective
 * threshold, warns when the heuristic warns or the score reaches 60% of
 * the threshold, and otherwise allows.
 */
export class IngestionScanner {
    name = "ingestion";
    threshold;
    customPatterns;
    heuristic;
    /**
     * @param config Optional settings read by this constructor:
     *   `threshold` (overrides the per-source profile threshold when set),
     *   `customPatterns` (extra RegExps tested for every source, default `[]`),
     *   `strictness` (forwarded to `HeuristicScanner`, default `"high"`).
     */
    constructor(config = {}) {
        this.threshold = config.threshold;
        this.customPatterns = config.customPatterns ?? [];
        this.heuristic = new HeuristicScanner({
            strictness: config.strictness ?? "high",
        });
    }
    /**
     * Scan one piece of ingested content.
     *
     * @param input Raw content to scan.
     * @param context Scan context; `context.source` selects the profile.
     *   A missing context is treated as `{}` (source `"user"`).
     * @returns `{ decision, violations, durationMs }`.
     * @throws {TypeError} When `context.source` is not a key of
     *   `SOURCE_PROFILE`. Failing loudly keeps an unrecognised source from
     *   being scanned under an arbitrary (possibly looser) profile.
     */
    async scan(input, context) {
        const ctx = context ?? {};
        const source = ctx.source ?? "user";
        // Own-property check so prototype keys ("constructor", "toString", ...)
        // are rejected the same way as any other unknown source.
        if (!Object.prototype.hasOwnProperty.call(SOURCE_PROFILE, source)) {
            throw new TypeError(`IngestionScanner: unknown ingestion source "${String(source)}"`);
        }
        const profile = SOURCE_PROFILE[source];
        const effectiveThreshold = this.threshold ?? profile.threshold;
        const start = performance.now();
        const violations = [];
        // 1. Base heuristic scan. Its violations are re-tagged as
        //    `ingested_injection` so downstream code can filter on the type.
        const heuristicResult = await this.heuristic.scan(input, ctx);
        for (const v of heuristicResult.violations) {
            violations.push({
                ...v,
                type: "ingested_injection",
                scanner: this.name,
                detail: `${v.detail ?? ""} (source=${source})`.trim(),
            });
        }
        // 2. Source-specific patterns against the normalized input, so the
        //    same Unicode-evasion defense the user channel gets applies here.
        const normalized = normalizeForInjectionScan(input);
        const sourcePatterns = [...profile.extraPatterns, ...this.customPatterns];
        let sourceScore = 0;
        const recordPatternHits = (text, message, detailSuffix) => {
            for (const pattern of sourcePatterns) {
                // Caller-supplied patterns may carry /g or /y, which makes
                // test() resume from the previous match position. Reset so a hit
                // on one layer (or one scan) cannot mask a hit on the next.
                pattern.lastIndex = 0;
                if (!pattern.test(text))
                    continue;
                sourceScore += 0.4;
                violations.push({
                    type: "ingested_injection",
                    scanner: this.name,
                    score: 0.4,
                    threshold: effectiveThreshold,
                    message,
                    detail: `Pattern: ${pattern.source.slice(0, 80)}${detailSuffix}`,
                });
            }
        };
        recordPatternHits(normalized, `Indirect injection pattern in ${source} content`, "");
        // 2b. Structural slice-then-test scans (only active for `rag` / `web`).
        for (const v of runStructuralIngestionScan(normalized, source, effectiveThreshold)) {
            sourceScore += 0.4;
            violations.push(v);
        }
        // 2c. Encoding bypass: one decode pass (Base64 / hex / percent) and a
        //     re-scan of the decoded layer. Skipped when nothing decodes.
        const decoded = tryDecodeObfuscation(input);
        if (decoded && decoded !== input) {
            const decodedNormalized = normalizeForInjectionScan(decoded);
            const decodedHeuristic = await this.heuristic.scan(decoded, ctx);
            if (decodedHeuristic.decision !== "allow") {
                for (const v of decodedHeuristic.violations) {
                    violations.push({
                        ...v,
                        type: "ingested_injection",
                        scanner: this.name,
                        detail: `${v.detail ?? ""} (source=${source}, layer=decoded)`.trim(),
                    });
                }
                sourceScore += 0.6; // decoded-hit is high-confidence
            }
            recordPatternHits(decodedNormalized, `Encoded indirect injection in ${source} content`, " (layer=decoded)");
        }
        sourceScore = Math.min(sourceScore, 1.0);
        // 3. Combine decisions. The heuristic scanner's own decision is used
        //    as-is rather than re-aggregated from its violation scores.
        const blocks = heuristicResult.decision === "block" || sourceScore >= effectiveThreshold;
        const warns = heuristicResult.decision === "warn" || sourceScore >= effectiveThreshold * 0.6;
        let decision;
        if (blocks) {
            decision = "block";
        }
        else if (warns) {
            decision = "warn";
        }
        else {
            decision = "allow";
        }
        return {
            decision,
            violations,
            durationMs: performance.now() - start,
        };
    }
}
|
|
331
|
+
/**
 * Convenience wrapper: build an `IngestionScanner` from `config`, scan
 * `content` under the profile for `source`, and shape the outcome into a
 * result object — no `AIShield` instance required.
 *
 * Intended as a gate at the ingestion boundary, e.g. before a chunk is
 * written to a vector DB or a tool description is placed in the model's
 * context.
 *
 * @param content Text to scan.
 * @param source Ingestion source label passed to the scanner as `context.source`.
 * @param config Optional `IngestionScanner` configuration.
 * @returns Object with `safe` (true only when the decision is `"allow"`),
 *   `decision`, `sanitized`, `violations`, `source` and `meta`.
 *   `sanitized` is the empty string when the decision is `"block"` and the
 *   unmodified `content` otherwise.
 *
 * @example
 * ```ts
 * import { scanIngested } from "ai-shield-core";
 *
 * const ragChunk = "...retrieved document text...";
 * const result = await scanIngested(ragChunk, "rag");
 * if (!result.safe) {
 *   // reject the chunk OR strip it before assembly
 *   logger.warn("IPI candidate", result.violations);
 * }
 * ```
 */
export async function scanIngested(content, source, config = {}) {
    const startedAt = performance.now();
    const scanner = new IngestionScanner(config);
    const outcome = await scanner.scan(content, { source });
    const blocked = outcome.decision === "block";
    // Source-specific hits are the violations whose detail begins with "Pattern:".
    const isSourceSpecific = (v) => v.detail?.startsWith("Pattern:") && v.type === "ingested_injection";
    return {
        safe: outcome.decision === "allow",
        decision: outcome.decision,
        // A blocked chunk must not be handed back under the name "sanitized":
        // returning "" turns an accidental `use(result.sanitized)` into a no-op
        // instead of a vulnerability. Callers that need the text for audit or
        // quarantine still hold the original `content` argument.
        sanitized: blocked ? "" : content,
        violations: outcome.violations,
        source,
        meta: {
            scanDurationMs: performance.now() - startedAt,
            scannersRun: [scanner.name],
            sourceSpecificHits: outcome.violations.filter(isSourceSpecific).length,
            cached: false,
        },
    };
}
|
|
374
|
+
/**
 * Scan the runtime *result* of a tool call before it re-enters the model
 * context.
 *
 * Delegates to `scanIngested(content, "tool-output", config)` and then
 * appends ` (tool=<name>)` to the `detail` of every violation, so an audit
 * log can tell which tool returned the flagged content. The tool name is
 * truncated to 120 characters; a non-string or empty name is recorded as
 * `"unknown"`.
 *
 * @param toolName Name of the tool that produced `content`.
 * @param content Tool result text to scan.
 * @param config Optional configuration forwarded to `scanIngested()`.
 * @returns The `scanIngested()` result with tool-stamped violations.
 *
 * @example
 * ```ts
 * import { scanToolOutput } from "ai-shield-core";
 *
 * const result = await searchTool.call(query);          // untrusted
 * const scan = await scanToolOutput("web_search", result);
 * if (!scan.safe) {
 *   // drop the result OR strip it before the next model turn
 *   audit.warn("poisoned tool output", { tool: "web_search", v: scan.violations });
 *   return; // do not feed `result` back into the model
 * }
 * model.continue(result);
 * ```
 */
export async function scanToolOutput(toolName, content, config = {}) {
    const base = await scanIngested(content, "tool-output", config);
    let toolLabel = "unknown";
    if (typeof toolName === "string" && toolName.length > 0) {
        toolLabel = toolName.slice(0, 120);
    }
    const stamped = [];
    for (const violation of base.violations) {
        stamped.push({
            ...violation,
            detail: `${violation.detail ?? ""} (tool=${toolLabel})`.trim(),
        });
    }
    return { ...base, violations: stamped };
}
|
|
417
|
+
// ============================================================
|
|
418
|
+
// Encoding-bypass normalization (R1 from Round 1 review — closes
|
|
419
|
+
// OWASP LLM Prompt Injection Prevention Cheat Sheet 2026 Base64/Hex
|
|
420
|
+
// bypass class).
|
|
421
|
+
// ============================================================
|
|
422
|
+
/**
 * Try to decode common obfuscation layers an attacker uses to smuggle an
 * injection past pattern matchers. Returns the decoded payload when a
 * decode looks successful, else `null`.
 *
 * Exactly ONE decode layer is applied (no recursion into
 * `base64(base64(...))`), which keeps the cost bounded. Only the first
 * 65,536 characters of the input are examined.
 *
 * Heuristics:
 *  - Base64: a contiguous run of 40+ characters from `[A-Za-z0-9+/=]`
 *    (no embedded whitespace) whose decoded text is at least 70% printable
 *    ASCII / whitespace.
 *  - Hex: a contiguous run of 80+ hex digits, same 70% printable rule. An
 *    odd-length run means one stray hex digit is glued to the payload
 *    (e.g. `"a" + hex`); since the stray digit can be on either side, both
 *    alignments (first digit dropped, last digit dropped) are tried rather
 *    than skipping the run.
 *  - Percent-encoding: at least 5 `%XX` escapes (only the first 32 are
 *    located). A window from 256 characters before the first located
 *    escape to 1024 characters after the last one is decoded. Each run of
 *    consecutive escapes is decoded on its own, so a literal `%` elsewhere
 *    in the window ("100% sure") or an escape cut in half by the window
 *    edge no longer aborts the whole decode.
 *
 * @param input Text to inspect. Non-string or empty input yields `null`.
 * @returns The longest decoded candidate (first one wins on ties), or
 *   `null` when nothing decodes.
 */
export function tryDecodeObfuscation(input) {
    if (typeof input !== "string" || input.length === 0)
        return null;
    const MAX_SCAN_CHARS = 65_536;
    const MIN_PRINTABLE_RATIO = 0.7;
    const MAX_PERCENT_ESCAPES = 32;
    // Cap the input we look at — Base64 of a megabyte is not the threat model.
    const haystack = input.length > MAX_SCAN_CHARS ? input.slice(0, MAX_SCAN_CHARS) : input;
    const candidates = [];
    // A decode "looks successful" when it is non-empty and mostly printable.
    const isMostlyPrintable = (text) => {
        if (text.length === 0)
            return false;
        const printable = text.replace(/[^\x20-\x7E\s]/g, "");
        return printable.length / text.length >= MIN_PRINTABLE_RATIO;
    };
    // Base64. Buffer's base64 decoder skips characters it does not recognise
    // instead of throwing, so no try/catch is needed around it.
    for (const run of haystack.match(/[A-Za-z0-9+/=]{40,}/g) ?? []) {
        const cleaned = run.replace(/=+$/, "").replace(/[^A-Za-z0-9+/]/g, "");
        if (cleaned.length < 40)
            continue;
        const decoded = Buffer.from(cleaned, "base64").toString("utf8");
        if (isMostlyPrintable(decoded))
            candidates.push(decoded);
    }
    // Hex. Even-length runs decode as-is; odd-length runs are tried with the
    // stray digit removed from either end.
    for (const run of haystack.match(/[0-9a-fA-F]{80,}/g) ?? []) {
        const alignments = run.length % 2 === 0 ? [run] : [run.slice(1), run.slice(0, -1)];
        for (const hex of alignments) {
            const decoded = Buffer.from(hex, "hex").toString("utf8");
            if (isMostlyPrintable(decoded))
                candidates.push(decoded);
        }
    }
    // Percent-encoding. Only a window around the located escapes is decoded
    // (not the full haystack) to avoid allocating a full-size copy per scan.
    const PERCENT_RE = /%[0-9A-Fa-f]{2}/g;
    const escapeOffsets = [];
    let escapeHit;
    while (escapeOffsets.length < MAX_PERCENT_ESCAPES &&
        (escapeHit = PERCENT_RE.exec(haystack)) !== null) {
        escapeOffsets.push(escapeHit.index);
    }
    if (escapeOffsets.length >= 5) {
        const first = escapeOffsets[0];
        const last = escapeOffsets[escapeOffsets.length - 1];
        const region = haystack.slice(Math.max(0, first - 256), Math.min(haystack.length, last + 1024));
        const decodeEscapeRun = (run) => {
            try {
                return decodeURIComponent(run);
            }
            catch {
                // The run is not valid UTF-8 (e.g. a multi-byte sequence cut by
                // the window edge). Decode the bytes leniently instead of
                // discarding the whole window.
                return Buffer.from(run.replace(/%/g, ""), "hex").toString("utf8");
            }
        };
        const decoded = region.replace(/(?:%[0-9A-Fa-f]{2})+/g, decodeEscapeRun);
        if (decoded !== region)
            candidates.push(decoded);
    }
    if (candidates.length === 0)
        return null;
    // Longest candidate is the most likely attack payload; strict `>` keeps
    // the earliest candidate on ties.
    return candidates.reduce((best, current) => (current.length > best.length ? current : best));
}
|
|
520
|
+
//# sourceMappingURL=ingestion.js.map
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import type { ScanContext, ScanDecision, Violation, PIIConfig } from "../types.js";
|
|
2
|
+
export type OutputSink = "sql" | "shell" | "html" | "template";
|
|
3
|
+
export interface OutputScanConfig {
|
|
4
|
+
/**
|
|
5
|
+
* PII handling. Pass a `PIIConfig` to control action/locale, or `false`
|
|
6
|
+
* to skip PII scanning entirely. Default: mask.
|
|
7
|
+
*/
|
|
8
|
+
pii?: PIIConfig | false;
|
|
9
|
+
/**
|
|
10
|
+
* Canary token(s) injected into the system prompt via `injectCanary()`.
|
|
11
|
+
* If any appears verbatim in the output → `system_prompt_leak` (block).
|
|
12
|
+
*/
|
|
13
|
+
canaryTokens?: string | string[];
|
|
14
|
+
/**
|
|
15
|
+
* Restrict the structured-injection check to specific downstream sinks.
|
|
16
|
+
* E.g. `["sql"]` when the output only ever flows into a query builder.
|
|
17
|
+
* Default: all sinks.
|
|
18
|
+
*/
|
|
19
|
+
sinks?: OutputSink[];
|
|
20
|
+
/** Selectively disable checks. All enabled by default. */
|
|
21
|
+
checks?: {
|
|
22
|
+
secrets?: boolean;
|
|
23
|
+
injection?: boolean;
|
|
24
|
+
systemPromptLeak?: boolean;
|
|
25
|
+
jailbreak?: boolean;
|
|
26
|
+
};
|
|
27
|
+
/** Override the byte cap on the scanned region. Default 256 KB. */
|
|
28
|
+
maxBytes?: number;
|
|
29
|
+
}
|
|
30
|
+
export interface OutputScanResult {
|
|
31
|
+
/** No blocking violation found. */
|
|
32
|
+
safe: boolean;
|
|
33
|
+
decision: ScanDecision;
|
|
34
|
+
/**
|
|
35
|
+
* Output with PII masked and secrets redacted to `[REDACTED_SECRET]`.
|
|
36
|
+
* Unlike `scanIngested`, this is NOT emptied on block — the caller
|
|
37
|
+
* usually still needs to log or display the sanitized text. Gate on
|
|
38
|
+
* `safe` / `decision` before forwarding it to a downstream sink.
|
|
39
|
+
*/
|
|
40
|
+
sanitized: string;
|
|
41
|
+
violations: Violation[];
|
|
42
|
+
meta: {
|
|
43
|
+
scanDurationMs: number;
|
|
44
|
+
checksRun: string[];
|
|
45
|
+
};
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Scanner for LLM output. Stateless; safe to reuse across calls.
|
|
49
|
+
*/
|
|
50
|
+
export declare class OutputScanner {
|
|
51
|
+
private readonly config;
|
|
52
|
+
private readonly pii;
|
|
53
|
+
constructor(config?: OutputScanConfig);
|
|
54
|
+
scan(output: string, context?: ScanContext): Promise<OutputScanResult>;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* One-shot helper. Scan a model response before acting on it.
|
|
58
|
+
*
|
|
59
|
+
* @example
|
|
60
|
+
* ```ts
|
|
61
|
+
* import { scanOutput } from "ai-shield-core";
|
|
62
|
+
*
|
|
63
|
+
* const reply = await llm.generate(prompt);
|
|
64
|
+
* const r = await scanOutput(reply, { canaryTokens: canary, sinks: ["sql"] });
|
|
65
|
+
* if (!r.safe) {
|
|
66
|
+
* audit.warn("unsafe model output", r.violations);
|
|
67
|
+
* return genericFallback(); // do not run r.sanitized as SQL
|
|
68
|
+
* }
|
|
69
|
+
* showToUser(r.sanitized); // PII masked, secrets redacted
|
|
70
|
+
* ```
|
|
71
|
+
*/
|
|
72
|
+
export declare function scanOutput(output: string, config?: OutputScanConfig, context?: ScanContext): Promise<OutputScanResult>;
|
|
73
|
+
//# sourceMappingURL=output.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"output.d.ts","sourceRoot":"","sources":["../../src/scanner/output.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,WAAW,EACX,YAAY,EACZ,SAAS,EACT,SAAS,EACV,MAAM,aAAa,CAAC;AAuHrB,MAAM,MAAM,UAAU,GAAG,KAAK,GAAG,OAAO,GAAG,MAAM,GAAG,UAAU,CAAC;AAE/D,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,GAAG,CAAC,EAAE,SAAS,GAAG,KAAK,CAAC;IACxB;;;OAGG;IACH,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IACjC;;;;OAIG;IACH,KAAK,CAAC,EAAE,UAAU,EAAE,CAAC;IACrB,0DAA0D;IAC1D,MAAM,CAAC,EAAE;QACP,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,SAAS,CAAC,EAAE,OAAO,CAAC;QACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;QAC3B,SAAS,CAAC,EAAE,OAAO,CAAC;KACrB,CAAC;IACF,mEAAmE;IACnE,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,gBAAgB;IAC/B,mCAAmC;IACnC,IAAI,EAAE,OAAO,CAAC;IACd,QAAQ,EAAE,YAAY,CAAC;IACvB;;;;;OAKG;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,IAAI,EAAE;QACJ,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,EAAE,CAAC;KACrB,CAAC;CACH;AAID;;GAEG;AACH,qBAAa,aAAa;IACxB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAoB;gBAE5B,MAAM,GAAE,gBAAqB;IAQnC,IAAI,CACR,MAAM,EAAE,MAAM,EACd,OAAO,GAAE,WAAgB,GACxB,OAAO,CAAC,gBAAgB,CAAC;CA+J7B;AAED;;;;;;;;;;;;;;;GAeG;AACH,wBAAsB,UAAU,CAC9B,MAAM,EAAE,MAAM,EACd,MAAM,GAAE,gBAAqB,EAC7B,OAAO,GAAE,WAAgB,GACxB,OAAO,CAAC,gBAAgB,CAAC,CAE3B"}
|