@crewhaus/prompt-injection-detector 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +129 -0
- package/dist/index.js +670 -0
- package/package.json +9 -6
- package/src/index.test.ts +0 -483
- package/src/index.ts +0 -774
package/src/index.ts
DELETED
|
@@ -1,774 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Catalog R8 `prompt-injection-detector` — heuristic + optional LLM
|
|
3
|
-
* classifier for tool output. Used by runtime-core after every tool
|
|
4
|
-
* call (when the tool's `classifyOutput` flag is not explicitly false)
|
|
5
|
-
* to decide whether the output should be passed to the model verbatim,
|
|
6
|
-
* passed with a system warning, or redacted.
|
|
7
|
-
*
|
|
8
|
-
* Three layers, fail-closed when ambiguous:
|
|
9
|
-
*
|
|
10
|
-
* Layer 1 — regex rules over a corpus drawn from OWASP LLM Top-10
|
|
11
|
-
* plus a 50+-vector hand-crafted set. Each rule has a severity tag
|
|
12
|
-
* and contributes to a cumulative score. The corpus is exported so
|
|
13
|
-
* tests and downstream tools can audit it.
|
|
14
|
-
*
|
|
15
|
-
* Layer 2 — structural heuristics. Trailing imperative blocks,
|
|
16
|
-
* role-marker injection (e.g. "system:" / "<|im_start|>"), BOM
|
|
17
|
-
* tampering, and base64 wrapping a malicious string are all detected
|
|
18
|
-
* without overlap with Layer 1. These produce hits with severity
|
|
19
|
-
* weighted by structural risk.
|
|
20
|
-
*
|
|
21
|
-
* Layer 3 — optional LLM classifier. Activated when
|
|
22
|
-
* `CREWHAUS_PI_CLASSIFIER_MODEL` is set; the runtime supplies a
|
|
23
|
-
* `classify` callback that delegates to a model. Without the env
|
|
24
|
-
* var the layer is a no-op.
|
|
25
|
-
*
|
|
26
|
-
* The aggregate score thresholds:
|
|
27
|
-
* < 0.40 → "clean"
|
|
28
|
-
* [0.40, 0.80) → "suspicious"
|
|
29
|
-
* ≥ 0.80 → "malicious"
|
|
30
|
-
*
|
|
31
|
-
* Layer R8. Pairs with `tool-result-store` (the previewContent input)
|
|
32
|
-
* and `runtime-core` (the post-tool callsite that consumes the
|
|
33
|
-
* classification).
|
|
34
|
-
*/
|
|
35
|
-
|
|
36
|
-
export type PromptInjectionClassification = "clean" | "suspicious" | "malicious";
|
|
37
|
-
|
|
38
|
-
export type PromptInjectionSeverity = "low" | "medium" | "high";
|
|
39
|
-
|
|
40
|
-
/** A single detection emitted by one rule in one of the detector's layers. */
export type PromptInjectionHit = {
  /** Stable rule id; safe to surface in logs and the redaction notice. */
  readonly rule: string;
  /**
   * [start, end) offsets of the match, expressed as JavaScript string indices
   * (UTF-16 code units), not bytes. Regex-layer hits can originate from a
   * normalized or decoded view of the input (see `classifyText`), in which
   * case the offsets index that view rather than the raw text.
   */
  readonly span: readonly [number, number];
  /** Severity tag of the rule that fired; drives the hit's score weight. */
  readonly severity: PromptInjectionSeverity;
  /** Layer that produced the hit. */
  readonly layer: "regex" | "structural" | "llm";
};
|
|
49
|
-
|
|
50
|
-
export type PromptInjectionResult = {
|
|
51
|
-
readonly classification: PromptInjectionClassification;
|
|
52
|
-
/** [0, 1] aggregate score. Higher = more likely injection. */
|
|
53
|
-
readonly score: number;
|
|
54
|
-
readonly hits: ReadonlyArray<PromptInjectionHit>;
|
|
55
|
-
};
|
|
56
|
-
|
|
57
|
-
export type PromptInjectionRule = {
|
|
58
|
-
readonly id: string;
|
|
59
|
-
readonly pattern: RegExp;
|
|
60
|
-
readonly severity: PromptInjectionSeverity;
|
|
61
|
-
readonly description?: string;
|
|
62
|
-
};
|
|
63
|
-
|
|
64
|
-
/**
 * Score contribution of one hit, keyed by severity. Hits are combined with a
 * probabilistic OR in `aggregateScore`, so with the default thresholds
 * (0.4 suspicious / 0.8 malicious):
 *   - one "high" hit (0.85) is already "malicious",
 *   - one "medium" hit (0.42) is already "suspicious",
 *   - one "low" hit (0.18) stays "clean".
 */
const SEVERITY_WEIGHT: Record<PromptInjectionSeverity, number> = {
  low: 0.18,
  medium: 0.42,
  high: 0.85,
};
|
|
69
|
-
|
|
70
|
-
const SCORE_SUSPICIOUS = 0.4;
|
|
71
|
-
const SCORE_MALICIOUS = 0.8;
|
|
72
|
-
|
|
73
|
-
// Upper bound on the text the regex/structural layers scan, so a pathological
|
|
74
|
-
// (e.g. multi-MB whitespace) input cannot wedge the classifier (#153). Larger
|
|
75
|
-
// inputs are analyzed head + tail.
|
|
76
|
-
const MAX_CLASSIFY_LEN = 64 * 1024;
|
|
77
|
-
|
|
78
|
-
// Zero-width / format / bidi / tag characters used to split trigger words
|
|
79
|
-
// ("ig<U+200B>nore"). Stripped from the match view; their *presence* is still
|
|
80
|
-
// caught on the raw text by the unicode-tag-spoof / rtl-override rules.
|
|
81
|
-
// Spelled with explicit \u escapes rather than literal invisible characters so
// the class survives editors, diff viewers and copy/paste unchanged (a class
// whose invisible members are lost degrades to matching "-" only). Members:
//   U+200B–U+200F  zero-width space/joiners and LRM/RLM
//   U+202A–U+202E  bidi embeddings and overrides
//   U+2060–U+2064  word joiner and invisible operators
//   U+2066–U+2069  bidi isolates
//   U+FEFF         zero-width no-break space / BOM
//   U+E0000–U+E007F Unicode tag block
// NOTE(review): ranges reconstructed from the comment above ("zero-width /
// format / bidi / tag") — confirm against the intended set.
const INVISIBLE_RE =
  /[\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u2069\uFEFF\u{E0000}-\u{E007F}]/gu;
|
|
82
|
-
|
|
83
|
-
// Common confusable homoglyphs → ASCII, applied only to the match view so an
|
|
84
|
-
// attacker cannot dodge the keyword rules with Cyrillic/Greek look-alikes
|
|
85
|
-
// (e.g. Cyrillic "іgnоre"). Intentionally small to limit false positives.
|
|
86
|
-
const HOMOGLYPHS: Record<string, string> = {
|
|
87
|
-
а: "a",
|
|
88
|
-
е: "e",
|
|
89
|
-
о: "o",
|
|
90
|
-
р: "p",
|
|
91
|
-
с: "c",
|
|
92
|
-
у: "y",
|
|
93
|
-
х: "x",
|
|
94
|
-
і: "i",
|
|
95
|
-
ѕ: "s",
|
|
96
|
-
ј: "j",
|
|
97
|
-
// Lowercase Cyrillic look-alikes whose UPPERCASE forms are mapped below.
|
|
98
|
-
// NFKC does not fold these to Latin, so without them a single lowercase
|
|
99
|
-
// homoglyph inside a trigger word (e.g. Cyrillic т U+0442 in "insтructions")
|
|
100
|
-
// slips past the keyword rules even though the uppercase Т is folded.
|
|
101
|
-
в: "b",
|
|
102
|
-
к: "k",
|
|
103
|
-
м: "m",
|
|
104
|
-
н: "h",
|
|
105
|
-
т: "t",
|
|
106
|
-
// Capital Cyrillic look-alikes. NFKC does not fold these to Latin, so without
|
|
107
|
-
// them a sentence-initial homoglyph (e.g. "Іgnore all previous instructions",
|
|
108
|
-
// Cyrillic І U+0406) evades the keyword rules. Symmetric with the lowercase set.
|
|
109
|
-
А: "A",
|
|
110
|
-
В: "B",
|
|
111
|
-
Е: "E",
|
|
112
|
-
К: "K",
|
|
113
|
-
М: "M",
|
|
114
|
-
Н: "H",
|
|
115
|
-
О: "O",
|
|
116
|
-
Р: "P",
|
|
117
|
-
С: "C",
|
|
118
|
-
Т: "T",
|
|
119
|
-
У: "Y",
|
|
120
|
-
Х: "X",
|
|
121
|
-
І: "I",
|
|
122
|
-
Ј: "J",
|
|
123
|
-
Ѕ: "S",
|
|
124
|
-
Α: "A",
|
|
125
|
-
Β: "B",
|
|
126
|
-
Ε: "E",
|
|
127
|
-
Ο: "O",
|
|
128
|
-
Ρ: "P",
|
|
129
|
-
Τ: "T",
|
|
130
|
-
Χ: "X",
|
|
131
|
-
ο: "o",
|
|
132
|
-
ρ: "p",
|
|
133
|
-
α: "a",
|
|
134
|
-
ε: "e",
|
|
135
|
-
ι: "i",
|
|
136
|
-
// Lowercase Greek look-alikes whose uppercase forms are mapped above.
|
|
137
|
-
β: "b",
|
|
138
|
-
τ: "t",
|
|
139
|
-
χ: "x",
|
|
140
|
-
};
|
|
141
|
-
|
|
142
|
-
/**
|
|
143
|
-
* Hand-curated corpus. Rule ids are stable so callers (auditors, tests,
|
|
144
|
-
* the redaction notice) can rely on them.
|
|
145
|
-
*/
|
|
146
|
-
export const REGEX_RULES: ReadonlyArray<PromptInjectionRule> = [
|
|
147
|
-
{
|
|
148
|
-
id: "ignore-previous",
|
|
149
|
-
pattern:
|
|
150
|
-
/\bignore (?:all |the )?(?:previous|prior|above|earlier)\s+(?:instructions?|rules?|prompts?|messages?)\b/i,
|
|
151
|
-
severity: "high",
|
|
152
|
-
description: "OWASP LLM01 — ignore-previous-instructions injection",
|
|
153
|
-
},
|
|
154
|
-
{
|
|
155
|
-
id: "disregard-previous",
|
|
156
|
-
pattern:
|
|
157
|
-
/\bdisregard (?:all |the )?(?:previous|prior|above|earlier)\s+(?:instructions?|rules?|prompts?)\b/i,
|
|
158
|
-
severity: "high",
|
|
159
|
-
},
|
|
160
|
-
{
|
|
161
|
-
id: "forget-previous",
|
|
162
|
-
pattern:
|
|
163
|
-
/\bforget (?:everything|all|what) (?:above|before|prior|previous|you (?:were|have been) told)\b/i,
|
|
164
|
-
severity: "high",
|
|
165
|
-
},
|
|
166
|
-
{
|
|
167
|
-
id: "new-instructions",
|
|
168
|
-
pattern:
|
|
169
|
-
/\b(?:here are|these are)\s+(?:your\s+)?new\s+(?:instructions|rules|directives|orders)\b/i,
|
|
170
|
-
severity: "high",
|
|
171
|
-
},
|
|
172
|
-
{
|
|
173
|
-
id: "from-now-on",
|
|
174
|
-
pattern:
|
|
175
|
-
/\bfrom now on(?: you (?:will|must|should))?\s+(?:act|behave|respond|answer|reply|follow)\b/i,
|
|
176
|
-
severity: "medium",
|
|
177
|
-
},
|
|
178
|
-
{
|
|
179
|
-
id: "you-are-now",
|
|
180
|
-
pattern:
|
|
181
|
-
/\byou are (?:now|hereby)\s+(?:a|an|the)\s+[\w-]+(?:\s+[\w-]+){0,3}\s+(?:assistant|model|persona|character|expert|engine|admin)\b/i,
|
|
182
|
-
severity: "medium",
|
|
183
|
-
},
|
|
184
|
-
{
|
|
185
|
-
id: "system-prompt-leak",
|
|
186
|
-
pattern:
|
|
187
|
-
/\b(?:reveal|print|output|display|show|expose|dump|leak)\s+(?:the\s+)?(?:system\s+prompt|developer\s+message|hidden\s+instructions|original\s+instructions)\b/i,
|
|
188
|
-
severity: "high",
|
|
189
|
-
},
|
|
190
|
-
{
|
|
191
|
-
id: "tell-me-system-prompt",
|
|
192
|
-
pattern:
|
|
193
|
-
/\b(?:tell me|tell me about|what is|what are|repeat|reveal|show me)\s+(?:your\s+|the\s+)?(?:system\s+prompt|initial\s+prompt|original\s+instructions|hidden\s+(?:rules|prompt))\b/i,
|
|
194
|
-
severity: "high",
|
|
195
|
-
},
|
|
196
|
-
{
|
|
197
|
-
id: "developer-mode",
|
|
198
|
-
pattern:
|
|
199
|
-
/\b(?:enable|activate|enter|enable_)?\s*(?:developer|dev|debug|admin|god|jailbreak|dan|stan|aim|free)\s*mode\b/i,
|
|
200
|
-
severity: "medium",
|
|
201
|
-
},
|
|
202
|
-
{
|
|
203
|
-
id: "dan-jailbreak",
|
|
204
|
-
pattern: /\b(?:DAN|do anything now)\b/i,
|
|
205
|
-
severity: "medium",
|
|
206
|
-
},
|
|
207
|
-
{
|
|
208
|
-
id: "above-text-untrusted",
|
|
209
|
-
pattern:
|
|
210
|
-
/\bthe (?:above|prior|previous) (?:text|message|content|instruction)s? (?:is|are|was) (?:fake|untrusted|wrong|incorrect|a test)\b/i,
|
|
211
|
-
severity: "high",
|
|
212
|
-
},
|
|
213
|
-
{
|
|
214
|
-
id: "destructive-rm",
|
|
215
|
-
pattern: /\brm\s+-rf\s+(?:\/|~|\$HOME|--no-preserve-root)/i,
|
|
216
|
-
severity: "high",
|
|
217
|
-
description: "Direct destructive command injection",
|
|
218
|
-
},
|
|
219
|
-
{
|
|
220
|
-
id: "destructive-curl-pipe-sh",
|
|
221
|
-
pattern: /\bcurl\s+[^\s|]+\s*\|\s*(?:sudo\s+)?(?:sh|bash|zsh)\b/i,
|
|
222
|
-
severity: "high",
|
|
223
|
-
},
|
|
224
|
-
{
|
|
225
|
-
id: "destructive-wget-pipe-sh",
|
|
226
|
-
pattern: /\bwget\s+[^\s|]+\s*-O\s*-\s*\|\s*(?:sudo\s+)?(?:sh|bash|zsh)\b/i,
|
|
227
|
-
severity: "high",
|
|
228
|
-
},
|
|
229
|
-
{
|
|
230
|
-
id: "powershell-iex-net",
|
|
231
|
-
pattern:
|
|
232
|
-
/\b(?:iex|invoke-expression)\s*\(\s*(?:new-object\s+net\.webclient|invoke-webrequest)/i,
|
|
233
|
-
severity: "high",
|
|
234
|
-
},
|
|
235
|
-
{
|
|
236
|
-
id: "exfil-dotenv",
|
|
237
|
-
pattern:
|
|
238
|
-
/\b(?:cat|read|exfil(?:trate)?|upload|send|leak)\s+(?:the\s+)?(?:\.env|secrets?\.(?:json|yml|yaml)|id_rsa|\.aws\/credentials)/i,
|
|
239
|
-
severity: "high",
|
|
240
|
-
},
|
|
241
|
-
{
|
|
242
|
-
id: "exfil-credentials",
|
|
243
|
-
pattern:
|
|
244
|
-
/\b(?:send|post|upload|exfil(?:trate)?|forward)\s+(?:the\s+)?(?:credentials?|api[_\s-]?keys?|tokens?|passwords?)\s+(?:to|via|over)\b/i,
|
|
245
|
-
severity: "high",
|
|
246
|
-
},
|
|
247
|
-
{
|
|
248
|
-
id: "system-role-marker",
|
|
249
|
-
pattern: /<\|im_start\|>\s*system|<\|system\|>|\[INST\][\s\S]*?\[\/INST\]/,
|
|
250
|
-
severity: "high",
|
|
251
|
-
description: "OpenAI/Anthropic role-marker injection",
|
|
252
|
-
},
|
|
253
|
-
{
|
|
254
|
-
id: "anthropic-tag-injection",
|
|
255
|
-
pattern: /<\/?(?:system|human|assistant)>/i,
|
|
256
|
-
severity: "medium",
|
|
257
|
-
},
|
|
258
|
-
{
|
|
259
|
-
id: "hidden-system",
|
|
260
|
-
pattern:
|
|
261
|
-
/\b(?:override|replace|update)\s+(?:your\s+)?system\s+(?:prompt|message|instructions)\b/i,
|
|
262
|
-
severity: "high",
|
|
263
|
-
},
|
|
264
|
-
{
|
|
265
|
-
id: "no-restrictions",
|
|
266
|
-
pattern: /\b(?:no|without|removing)\s+(?:restrictions|filters|limits|safeguards|guardrails)\b/i,
|
|
267
|
-
severity: "medium",
|
|
268
|
-
},
|
|
269
|
-
{
|
|
270
|
-
id: "pretend-you-are",
|
|
271
|
-
pattern:
|
|
272
|
-
/\b(?:pretend|act as if|imagine|roleplay (?:as|that))\s+you\s+(?:are|were)\s+(?:not\s+|no\s+longer\s+)?(?:bound|restricted|limited|trained)\b/i,
|
|
273
|
-
severity: "medium",
|
|
274
|
-
},
|
|
275
|
-
{
|
|
276
|
-
id: "override-safety",
|
|
277
|
-
pattern:
|
|
278
|
-
/\b(?:bypass|circumvent|disable|override|turn off)\s+(?:your\s+)?(?:safety|content|moderation|alignment)/i,
|
|
279
|
-
severity: "high",
|
|
280
|
-
},
|
|
281
|
-
{
|
|
282
|
-
id: "tool-call-injection",
|
|
283
|
-
pattern:
|
|
284
|
-
/\b(?:invoke|call|run|execute)\s+(?:the\s+)?(?:Bash|Python|Shell|Write|Edit|Fetch|fetch|webFetch)\s+tool\s+with\b/i,
|
|
285
|
-
severity: "medium",
|
|
286
|
-
},
|
|
287
|
-
{
|
|
288
|
-
id: "auto-execute",
|
|
289
|
-
pattern:
|
|
290
|
-
/\b(?:then|next|after that|finally|now)\s+(?:run|execute|do)\s+(?:the following|this|these)\b[^\n]*\b(?:rm|curl|wget|chmod|chown|sudo|nc|ncat)\b/i,
|
|
291
|
-
severity: "high",
|
|
292
|
-
},
|
|
293
|
-
{
|
|
294
|
-
id: "ssh-key-action",
|
|
295
|
-
pattern:
|
|
296
|
-
/\b(?:upload|publish|push|copy|cat)\s+(?:my\s+|your\s+|the\s+)?(?:ssh\s+)?(?:public\s+)?key/i,
|
|
297
|
-
severity: "medium",
|
|
298
|
-
},
|
|
299
|
-
{
|
|
300
|
-
id: "kubectl-cluster-admin",
|
|
301
|
-
pattern: /\bkubectl\s+(?:create|apply|patch)\s+(?:clusterrolebinding|role|rolebinding)\b/i,
|
|
302
|
-
severity: "high",
|
|
303
|
-
},
|
|
304
|
-
{
|
|
305
|
-
id: "git-push-force-main",
|
|
306
|
-
pattern: /\bgit\s+push\s+(?:--force|-f)\s+\w+\s+(?:main|master)/i,
|
|
307
|
-
severity: "medium",
|
|
308
|
-
},
|
|
309
|
-
{
|
|
310
|
-
id: "iframe-embed",
|
|
311
|
-
pattern: /<iframe\s+[^>]*\bsrc\s*=\s*["'](?:javascript:|data:text\/html)/i,
|
|
312
|
-
severity: "high",
|
|
313
|
-
},
|
|
314
|
-
{
|
|
315
|
-
id: "javascript-uri",
|
|
316
|
-
pattern: /\bjavascript:\s*(?:eval|fetch|XMLHttpRequest|document\.write)/i,
|
|
317
|
-
severity: "high",
|
|
318
|
-
},
|
|
319
|
-
{
|
|
320
|
-
id: "data-url-script",
|
|
321
|
-
pattern: /data:text\/html;base64,[A-Za-z0-9+/=]{40,}/,
|
|
322
|
-
severity: "medium",
|
|
323
|
-
},
|
|
324
|
-
{
|
|
325
|
-
id: "smuggled-system-block",
|
|
326
|
-
pattern: /^[ \t]*system:[ \t]*\n[\s\S]{0,400}\n[ \t]*human:/im,
|
|
327
|
-
severity: "high",
|
|
328
|
-
},
|
|
329
|
-
{
|
|
330
|
-
id: "fake-user-injection",
|
|
331
|
-
pattern: /^[ \t]*(?:User|Human|USER):[ \t]*[^\n]{1,200}\n[ \t]*(?:Assistant|System|SYSTEM):/m,
|
|
332
|
-
severity: "high",
|
|
333
|
-
},
|
|
334
|
-
{
|
|
335
|
-
id: "unicode-tag-spoof",
|
|
336
|
-
pattern: /[\u{E0001}\u{E0020}-\u{E007F}]+/u,
|
|
337
|
-
severity: "high",
|
|
338
|
-
description: "Unicode tag block (often used for hidden prompts)",
|
|
339
|
-
},
|
|
340
|
-
{
    id: "rtl-override",
    // Bidi embedding/override controls (U+202A–U+202E) and bidi isolates
    // (U+2066–U+2069), written as escapes: an empty class `[]` never matches,
    // which silently disables the rule.
    // NOTE(review): code points reconstructed from the rule description —
    // confirm against the intended set.
    pattern: /[\u202A-\u202E\u2066-\u2069]/,
    severity: "medium",
    description: "Bidi/RTL override character — common in homoglyph attacks",
},
|
|
346
|
-
{
|
|
347
|
-
id: "instruct-then-deny",
|
|
348
|
-
pattern:
|
|
349
|
-
/\b(?:do not|don'?t)\s+(?:tell|inform|reveal|warn)\s+(?:the )?(?:user|operator|caller)\b/i,
|
|
350
|
-
severity: "high",
|
|
351
|
-
},
|
|
352
|
-
{
|
|
353
|
-
id: "silently-execute",
|
|
354
|
-
pattern: /\bsilently\s+(?:execute|run|perform|invoke)\b/i,
|
|
355
|
-
severity: "high",
|
|
356
|
-
},
|
|
357
|
-
{
|
|
358
|
-
id: "prompt-leak-by-name",
|
|
359
|
-
pattern: /\b(?:print|repeat|output)\s+(?:everything|all of)\s+(?:above|before this line)\b/i,
|
|
360
|
-
severity: "high",
|
|
361
|
-
},
|
|
362
|
-
{
|
|
363
|
-
id: "tool-arg-injection",
|
|
364
|
-
pattern:
|
|
365
|
-
/\b(?:set|use|pass)\s+(?:the\s+)?(?:argument|parameter|flag)\s+["']?--?\w+["']?\s*=\s*["'][^"']*\$\(/i,
|
|
366
|
-
severity: "medium",
|
|
367
|
-
},
|
|
368
|
-
{
|
|
369
|
-
id: "fetch-attacker-url",
|
|
370
|
-
pattern:
|
|
371
|
-
/\b(?:please|now|then)?\s*(?:fetch|GET|POST|download)\s+(?:from\s+)?(?:https?:\/\/(?:[a-z0-9-]+\.)?(?:burpcollaborator|requestcatcher|webhook\.site|requestbin|interactsh|oast|ngrok|attacker)[a-z0-9.-]*)/i,
|
|
372
|
-
severity: "high",
|
|
373
|
-
},
|
|
374
|
-
{
|
|
375
|
-
id: "exfil-base64",
|
|
376
|
-
pattern:
|
|
377
|
-
/\bbase64\s+(?:encode|the\s+(?:secrets?|env|keys?))\b[\s\S]{0,40}(?:then\s+)?(?:send|post|upload|cat)/i,
|
|
378
|
-
severity: "high",
|
|
379
|
-
},
|
|
380
|
-
{
|
|
381
|
-
id: "shutdown-host",
|
|
382
|
-
pattern:
|
|
383
|
-
/\b(?:shutdown|halt|poweroff|reboot)\s+(?:-h\s+now|now|the host|the (?:server|machine))\b/i,
|
|
384
|
-
severity: "medium",
|
|
385
|
-
},
|
|
386
|
-
{
|
|
387
|
-
id: "mkfs-format",
|
|
388
|
-
pattern: /\bmkfs\.(?:ext\d|xfs|btrfs|vfat)\s+\/dev\//i,
|
|
389
|
-
severity: "high",
|
|
390
|
-
},
|
|
391
|
-
{
|
|
392
|
-
id: "dd-of-dev",
|
|
393
|
-
pattern: /\bdd\s+if=\S+\s+of=\/dev\/(?:sd[a-z]|nvme\d+n\d+|disk\d+)/i,
|
|
394
|
-
severity: "high",
|
|
395
|
-
},
|
|
396
|
-
{
|
|
397
|
-
id: "iptables-flush",
|
|
398
|
-
pattern: /\biptables\s+-F(?:\s|$)/i,
|
|
399
|
-
severity: "medium",
|
|
400
|
-
},
|
|
401
|
-
{
|
|
402
|
-
id: "history-clear",
|
|
403
|
-
pattern: /\b(?:history\s+-c|unset\s+HISTFILE|export\s+HISTFILE=\/dev\/null)\b/i,
|
|
404
|
-
severity: "medium",
|
|
405
|
-
},
|
|
406
|
-
{
|
|
407
|
-
id: "tool-run-in-background",
|
|
408
|
-
pattern: /\b(?:nohup|setsid|disown).*?(?:rm|curl|wget|nc|bash)\b/i,
|
|
409
|
-
severity: "medium",
|
|
410
|
-
},
|
|
411
|
-
{
|
|
412
|
-
id: "encoded-injection-marker",
|
|
413
|
-
pattern: /(?:%69gnore|%69%67%6e%6f%72%65)\s+(?:%70revious|%70%72%65%76%69%6f%75%73)/i,
|
|
414
|
-
severity: "medium",
|
|
415
|
-
},
|
|
416
|
-
{
|
|
417
|
-
id: "instruction-tag-payload",
|
|
418
|
-
pattern: /<\s*INSTRUCTIONS?\s*>[\s\S]{0,800}<\s*\/\s*INSTRUCTIONS?\s*>/i,
|
|
419
|
-
severity: "medium",
|
|
420
|
-
},
|
|
421
|
-
{
|
|
422
|
-
id: "markdown-instruction-block",
|
|
423
|
-
pattern: /^[\s\S]{1,400}?^>+[ \t]*(?:You are|Ignore|Disregard|Forget|From now on)/im,
|
|
424
|
-
severity: "low",
|
|
425
|
-
},
|
|
426
|
-
{
|
|
427
|
-
id: "imperative-link",
|
|
428
|
-
pattern: /\[click here\]\(\s*(?:javascript:|data:|file:)/i,
|
|
429
|
-
severity: "medium",
|
|
430
|
-
},
|
|
431
|
-
];
|
|
432
|
-
|
|
433
|
-
/** Smallest number of regex rules the corpus is allowed to contain. */
const MIN_CORPUS_RULES = 50;

/**
 * Corpus-size guard: throws when `rules` holds fewer entries than
 * `MIN_CORPUS_RULES`, so an accidentally trimmed rule list fails loudly
 * instead of quietly weakening detection. Kept as a standalone function so the
 * failure path can be exercised with an arbitrary rule list.
 *
 * @param rules Rule list to check.
 * @throws Error when `rules.length` is below the minimum.
 */
function assertCorpusFloor(rules: ReadonlyArray<PromptInjectionRule>): void {
  const count = rules.length;
  if (count >= MIN_CORPUS_RULES) return;
  throw new Error(
    `prompt-injection-detector regex corpus has ${count} rules; minimum is ${MIN_CORPUS_RULES}`,
  );
}
|
|
448
|
-
|
|
449
|
-
assertCorpusFloor(REGEX_RULES);
|
|
450
|
-
|
|
451
|
-
/** Looks up the score weight assigned to severity `s` in `SEVERITY_WEIGHT`. */
function severityWeight(s: PromptInjectionSeverity): number {
  const weight = SEVERITY_WEIGHT[s];
  return weight;
}
|
|
454
|
-
|
|
455
|
-
/**
 * Regex layer: tests every rule once against `text` and records at most one
 * hit per rule (its first match).
 *
 * Rule patterns are shared, exported objects. A pattern carrying the `g` or
 * `y` flag is stateful — `exec` starts at `lastIndex` — so `lastIndex` is
 * zeroed before matching (a stale value would make the scan start mid-string
 * and miss an injection) and again afterwards (so no state leaks to other
 * users of the same pattern).
 *
 * @param text  Text to scan.
 * @param rules Rules to apply; defaults to the built-in `REGEX_RULES` corpus.
 * @returns Hits in rule order, with spans given as string indices into `text`.
 */
function regexHits(
  text: string,
  rules: ReadonlyArray<PromptInjectionRule> = REGEX_RULES,
): PromptInjectionHit[] {
  const hits: PromptInjectionHit[] = [];
  for (const rule of rules) {
    const { pattern } = rule;
    const stateful = pattern.global || pattern.sticky;
    if (stateful) pattern.lastIndex = 0;
    const m = pattern.exec(text);
    if (stateful) pattern.lastIndex = 0;
    if (m === null) continue;
    hits.push({
      rule: rule.id,
      span: [m.index, m.index + m[0].length],
      severity: rule.severity,
      layer: "regex",
    });
  }
  return hits;
}
|
|
478
|
-
|
|
479
|
-
/**
 * Structural layer: shape-based heuristics that run on the raw text and
 * complement the keyword rules. Each heuristic contributes at most one hit.
 *
 * @param text Raw (already length-bounded) tool output.
 * @returns Structural hits, in the order the heuristics are evaluated.
 */
function structuralHits(text: string): PromptInjectionHit[] {
  const hits: PromptInjectionHit[] = [];

  // BOM tampering — tool output that opens with U+FEFF. The BOM is written as
  // an escape on purpose: a literal BOM is invisible and easily lost, and
  // `startsWith("")` is true for every string, which would flag all input.
  if (text.startsWith("\uFEFF")) {
    hits.push({ rule: "structural-bom", span: [0, 1], severity: "low", layer: "structural" });
  }

  // Role-marker injection beyond the ones the regex layer already matches.
  // A cheap structural variant: "role: ...\nrole:" on two adjacent lines.
  const roleClusterRe =
    /(?:^|\n)[ \t]*(?:system|assistant|user|human)[ \t]*:[^\n]*\n[ \t]*(?:system|assistant|user|human)[ \t]*:/i;
  const role = roleClusterRe.exec(text);
  if (role) {
    hits.push({
      rule: "structural-role-cluster",
      span: [role.index, role.index + role[0].length],
      severity: "medium",
      layer: "structural",
    });
  }

  // Trailing imperative: within the last 350 characters, the final line of the
  // text begins with an imperative verb (optionally prefixed by
  // "now/then/finally" and "please"). This catches innocuous-looking tool
  // output that ends with "Now run X". `$` has no `m` flag here, so it anchors
  // to the very end of the text.
  const tailStart = Math.max(0, text.length - 350);
  const tail = text.slice(tailStart);
  const tailImperative =
    /(?:^|\n)[ \t]*(?:now |then |finally )?(?:please[ \t]+)?(?:run|execute|fetch|delete|remove|email|upload|send|forward|leak|exfil(?:trate)?|shutdown|kill|chmod|chown|sudo)\b[^\n]{0,200}$/i;
  const t = tailImperative.exec(tail);
  if (t) {
    hits.push({
      rule: "structural-trailing-imperative",
      // Offsets are translated back from the tail slice to `text`.
      span: [tailStart + t.index, tailStart + t.index + t[0].length],
      severity: "medium",
      layer: "structural",
    });
  }

  // Long base64 run (first one only) whose preceding 80 characters mention
  // decoding/execution — a common smuggling shape.
  const b64 = /[A-Za-z0-9+/]{120,}={0,2}/.exec(text);
  if (b64 !== null) {
    const ctxStart = Math.max(0, b64.index - 80);
    const ctx = text.slice(ctxStart, b64.index);
    if (/(decode|run|execute|eval|payload|shell)/i.test(ctx)) {
      hits.push({
        rule: "structural-suspicious-base64",
        span: [b64.index, b64.index + b64[0].length],
        severity: "medium",
        layer: "structural",
      });
    }
  }

  // URL followed, on the same line and within 80 characters, by a
  // credential-like word.
  const urlExfil =
    /(?:https?:\/\/[^\s)]+)[^\n]{0,80}\b(?:token|secret|api[_\s-]?key|cookie|session)\b/i.exec(
      text,
    );
  if (urlExfil) {
    hits.push({
      rule: "structural-url-exfil-pair",
      span: [urlExfil.index, urlExfil.index + urlExfil[0].length],
      severity: "medium",
      layer: "structural",
    });
  }

  return hits;
}
|
|
550
|
-
|
|
551
|
-
export type LlmClassifyFn = (
|
|
552
|
-
text: string,
|
|
553
|
-
) => Promise<{ verdict: PromptInjectionClassification; rationale?: string } | undefined>;
|
|
554
|
-
|
|
555
|
-
export type ClassifyOptions = {
|
|
556
|
-
/**
|
|
557
|
-
* When set, layer 3 LLM classifier runs and its verdict can lift
|
|
558
|
-
* "clean" → "suspicious" or upgrade an existing suspicious verdict
|
|
559
|
-
* to "malicious". A `clean` verdict from the model is advisory only —
|
|
560
|
-
* we never downgrade a high-severity regex hit.
|
|
561
|
-
*
|
|
562
|
-
* Activated when the runtime sets `CREWHAUS_PI_CLASSIFIER_MODEL` and
|
|
563
|
-
* the runtime supplies the actual classify callback.
|
|
564
|
-
*/
|
|
565
|
-
readonly llmClassifier?: LlmClassifyFn;
|
|
566
|
-
/**
|
|
567
|
-
* Override the suspicious / malicious thresholds. Mostly used by
|
|
568
|
-
* tests; production should leave defaults.
|
|
569
|
-
*/
|
|
570
|
-
readonly thresholds?: { readonly suspicious?: number; readonly malicious?: number };
|
|
571
|
-
};
|
|
572
|
-
|
|
573
|
-
/**
 * Folds all hits into one score in [0, 1] using a probabilistic OR: starting
 * from 0, each hit with weight `w` moves the score `p` to
 * `1 - (1 - p) * (1 - w)`, so more hits can only raise it.
 *
 * @param hits Hits from every layer.
 * @returns Combined score, capped at 1 (0 when there are no hits).
 */
function aggregateScore(hits: ReadonlyArray<PromptInjectionHit>): number {
  const combined = hits.reduce(
    (acc, hit) => 1 - (1 - acc) * (1 - severityWeight(hit.severity)),
    0,
  );
  return Math.min(1, combined);
}
|
|
582
|
-
|
|
583
|
-
/**
 * Maps an aggregate score onto a classification label.
 *
 * @param score     Aggregate score.
 * @param threshold Inclusive lower bounds for "suspicious" and "malicious".
 * @returns "malicious" at or above `threshold.malicious`, otherwise
 *          "suspicious" at or above `threshold.suspicious`, otherwise "clean".
 */
function classify(score: number, threshold: { suspicious: number; malicious: number }) {
  return score >= threshold.malicious
    ? ("malicious" as const)
    : score >= threshold.suspicious
      ? ("suspicious" as const)
      : ("clean" as const);
}
|
|
588
|
-
|
|
589
|
-
/**
 * Replaces every code point of `s` that has an entry in `HOMOGLYPHS` with its
 * ASCII look-alike; all other code points pass through unchanged.
 */
function foldHomoglyphs(s: string): string {
  return Array.from(s, (glyph) => HOMOGLYPHS[glyph] ?? glyph).join("");
}
|
|
594
|
-
|
|
595
|
-
/**
 * Builds the canonical "match view" that the keyword rules are run against.
 * Steps, in order:
 *   1. NFKC normalization (folds full-width / compatibility forms),
 *   2. removal of characters matched by `INVISIBLE_RE`,
 *   3. homoglyph folding to ASCII,
 *   4. collapsing each whitespace run to a single space, so rules with
 *      literal-space anchors match "ignore\n\nprevious" as well as
 *      "ignore previous" (#143).
 */
function normalizeForMatch(text: string): string {
  const compat = text.normalize("NFKC");
  const visibleOnly = compat.replace(INVISIBLE_RE, "");
  const latinised = foldHomoglyphs(visibleOnly);
  return latinised.replace(/\s+/g, " ");
}
|
|
606
|
-
|
|
607
|
-
/**
 * Heuristic used to decide whether a decoded blob looks like text.
 *
 * @param s Candidate string.
 * @returns true when strictly more than 85% of its UTF-16 code units are
 *          printable ASCII (0x20–0x7E), tab, LF or CR; false for "".
 */
function isMostlyPrintable(s: string): boolean {
  const total = s.length;
  if (total === 0) return false;
  let count = 0;
  for (let idx = 0; idx < total; idx += 1) {
    const code = s.charCodeAt(idx);
    const isVisibleAscii = code >= 0x20 && code <= 0x7e;
    const isTabOrLineBreak = code === 0x09 || code === 0x0a || code === 0x0d;
    if (isVisibleAscii || isTabOrLineBreak) count += 1;
  }
  return count / total > 0.85;
}
|
|
616
|
-
|
|
617
|
-
/**
 * Attempts to read `blob` as base64-encoded UTF-8 text.
 *
 * @param blob Candidate base64 run.
 * @returns The decoded string when it is mostly printable; undefined when the
 *          blob is shorter than 16 characters, has a length that is 1 modulo 4,
 *          decodes to mostly non-printable data, or decoding throws.
 */
function tryDecodeBase64(blob: string): string | undefined {
  const tooShort = blob.length < 16;
  const impossibleLength = blob.length % 4 === 1;
  if (tooShort || impossibleLength) return undefined;
  try {
    const text = Buffer.from(blob, "base64").toString("utf8");
    if (isMostlyPrintable(text)) return text;
    return undefined;
  } catch {
    return undefined;
  }
}
|
|
626
|
-
|
|
627
|
-
/**
 * Attempts to read `blob` as hex-encoded UTF-8 text.
 *
 * @param blob Candidate hex run.
 * @returns The decoded string when it is mostly printable; undefined when the
 *          blob is shorter than 16 characters, has odd length, decodes to
 *          mostly non-printable data, or decoding throws.
 */
function tryDecodeHex(blob: string): string | undefined {
  const tooShort = blob.length < 16;
  const oddLength = blob.length % 2 !== 0;
  if (tooShort || oddLength) return undefined;
  try {
    const text = Buffer.from(blob, "hex").toString("utf8");
    if (isMostlyPrintable(text)) return text;
    return undefined;
  } catch {
    return undefined;
  }
}
|
|
636
|
-
|
|
637
|
-
/**
 * Percent-decodes `text` for rescanning.
 *
 * Decoding is done per run of consecutive `%XX` escapes instead of calling
 * `decodeURIComponent` on the whole text: that call throws on any malformed
 * escape, so a single stray "%" (e.g. "100%") anywhere in the input would
 * otherwise switch percent-decoding off entirely and let an encoded payload
 * through unscanned.
 *
 * When a run is not valid UTF-8, its ASCII escapes (`%00`–`%7F`) are still
 * decoded and the remaining escapes are left as written.
 *
 * @param text Text that may contain percent escapes.
 * @returns The decoded text, or undefined when decoding changes nothing.
 */
function tryDecodePercent(text: string): string | undefined {
  const decoded = text.replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      return run.replace(/%([0-7][0-9A-Fa-f])/g, (_escape, hex: string) =>
        String.fromCharCode(Number.parseInt(hex, 16)),
      );
    }
  });
  return decoded !== text ? decoded : undefined;
}
|
|
645
|
-
|
|
646
|
-
/**
 * Produces cleartext views of encoded payloads found in `text`, so that an
 * injection hidden inside base64, hex or percent-encoding gets rescanned
 * regardless of neighbouring keywords (#143). Every successfully decoded
 * string is itself decoded again, `depth - 1` more levels.
 *
 * Work is bounded: at most 8 base64 runs and 8 hex runs are tried per level,
 * and at most 16 variants are returned.
 *
 * @param text  Text to search for encoded blobs.
 * @param depth Remaining decode levels (default 2); 0 or less yields [].
 * @returns Decoded strings, each followed by its own nested variants.
 */
function decodedVariants(text: string, depth = 2): string[] {
  if (depth <= 0 || text.length === 0) return [];

  const collected: string[] = [];
  const record = (candidate: string | undefined): void => {
    if (candidate === undefined || candidate.length === 0) return;
    collected.push(candidate);
    collected.push(...decodedVariants(candidate, depth - 1));
  };

  const base64Runs = Array.from(text.matchAll(/[A-Za-z0-9+/]{16,}={0,2}/g)).slice(0, 8);
  for (const run of base64Runs) {
    record(tryDecodeBase64(run[0]));
  }

  const hexRuns = Array.from(text.matchAll(/(?:[0-9A-Fa-f]{2}){8,}/g)).slice(0, 8);
  for (const run of hexRuns) {
    record(tryDecodeHex(run[0]));
  }

  const hasPercentEscape = /%[0-9A-Fa-f]{2}/.test(text);
  if (hasPercentEscape) {
    record(tryDecodePercent(text));
  }

  return collected.slice(0, 16);
}
|
|
667
|
-
|
|
668
|
-
/**
 * Classifies a tool output as "clean", "suspicious" or "malicious".
 *
 * Pipeline:
 *   1. Empty input short-circuits to a clean result.
 *   2. Input longer than `MAX_CLASSIFY_LEN` is reduced to its head and tail
 *      halves joined by a newline, bounding regex work (#153) while keeping
 *      leading and trailing injections in view.
 *   3. Regex rules run on the raw text, its normalized match view and every
 *      decoded variant (#143); only the first hit per rule id is kept.
 *   4. Structural heuristics run on the raw (bounded) text.
 *   5. The hits are scored and mapped to a classification.
 *   6. If `opts.llmClassifier` is supplied, its verdict may raise the result
 *      ("malicious" always wins; "suspicious" only lifts a "clean" result) but
 *      never lowers it. Errors thrown by the classifier are swallowed.
 *
 * Without an LLM classifier the result depends only on `text` and
 * `opts.thresholds`.
 *
 * @param text Tool output to analyze.
 * @param opts Optional LLM classifier callback and threshold overrides.
 * @returns Classification, aggregate score and the hits that produced it.
 */
export async function classifyText(
  text: string,
  opts: ClassifyOptions = {},
): Promise<PromptInjectionResult> {
  const threshold = {
    suspicious: opts.thresholds?.suspicious ?? SCORE_SUSPICIOUS,
    malicious: opts.thresholds?.malicious ?? SCORE_MALICIOUS,
  };

  if (text === "") {
    return { classification: "clean", score: 0, hits: [] };
  }

  // Head + tail window for oversized input.
  let analyzed = text;
  if (text.length > MAX_CLASSIFY_LEN) {
    const half = MAX_CLASSIFY_LEN / 2;
    analyzed = `${text.slice(0, half)}\n${text.slice(-half)}`;
  }

  // Raw text first, then the de-obfuscated views; the first view to trigger a
  // rule supplies that rule's hit.
  const views = [analyzed, normalizeForMatch(analyzed), ...decodedVariants(analyzed)];
  const firstHitByRule = new Map<string, PromptInjectionHit>();
  for (const view of views) {
    for (const hit of regexHits(view)) {
      if (!firstHitByRule.has(hit.rule)) firstHitByRule.set(hit.rule, hit);
    }
  }

  const hits: PromptInjectionHit[] = [...firstHitByRule.values(), ...structuralHits(analyzed)];
  let score = aggregateScore(hits);
  let classification = classify(score, threshold);

  if (opts.llmClassifier !== undefined) {
    try {
      const outcome = await opts.llmClassifier(analyzed);
      if (outcome !== undefined) {
        // LLM hits are not tied to a location; they point at the first 200
        // characters of the input at most.
        const leadingSpan = [0, Math.min(text.length, 200)] as const;
        switch (outcome.verdict) {
          case "malicious":
            classification = "malicious";
            score = Math.max(score, threshold.malicious);
            hits.push({
              rule: "llm-malicious",
              span: leadingSpan,
              severity: "high",
              layer: "llm",
            });
            break;
          case "suspicious":
            // Only an upgrade from "clean"; an existing verdict is kept.
            if (classification === "clean") {
              classification = "suspicious";
              score = Math.max(score, threshold.suspicious);
              hits.push({
                rule: "llm-suspicious",
                span: leadingSpan,
                severity: "medium",
                layer: "llm",
              });
            }
            break;
          default:
            // A "clean" verdict is advisory and never downgrades the result.
            break;
        }
      }
    } catch {
      // LLM tier is best-effort; swallow so a model outage doesn't
      // block tool execution.
    }
  }

  return { classification, score, hits };
}
|
|
740
|
-
|
|
741
|
-
/**
 * Builds the placeholder string substituted for a redacted tool output. It
 * lists the ids of the rules that fired — de-duplicated, in first-seen order,
 * at most six — so the decision can be audited later.
 *
 * @param hits Hits that led to the redaction.
 * @returns The bracketed redaction notice.
 */
export function buildRedactionNotice(hits: ReadonlyArray<PromptInjectionHit>): string {
  const unique = new Set<string>();
  for (const hit of hits) unique.add(hit.rule);
  const listed = Array.from(unique).slice(0, 6).join(", ");
  return `[tool output redacted: prompt injection detected: ${listed}]`;
}
|
|
750
|
-
|
|
751
|
-
/**
 * Reports whether the LLM classifier layer is switched on.
 *
 * @param env Environment to inspect; defaults to `process.env`.
 * @returns true when `CREWHAUS_PI_CLASSIFIER_MODEL` is set to something other
 *          than an empty or whitespace-only string.
 */
export function llmClassifierEnabled(env: NodeJS.ProcessEnv = process.env): boolean {
  const model = env["CREWHAUS_PI_CLASSIFIER_MODEL"];
  if (model === undefined) return false;
  return model.trim().length > 0;
}
|
|
758
|
-
|
|
759
|
-
/**
|
|
760
|
-
* Internal seams exposed ONLY for unit tests. Not part of the public API and
|
|
761
|
-
* not subject to semver — these let the test suite drive the module's
|
|
762
|
-
* defensive branches (corpus-floor guard, global-flag `lastIndex` reset, and
|
|
763
|
-
* the decoder `try/catch` fallbacks) with crafted inputs that the public
|
|
764
|
-
* `classifyText` entrypoint can never construct on its own. Do not import
|
|
765
|
-
* from application code.
|
|
766
|
-
*/
|
|
767
|
-
export const __internals = {
|
|
768
|
-
assertCorpusFloor,
|
|
769
|
-
regexHits,
|
|
770
|
-
tryDecodeBase64,
|
|
771
|
-
tryDecodeHex,
|
|
772
|
-
tryDecodePercent,
|
|
773
|
-
MIN_CORPUS_RULES,
|
|
774
|
-
} as const;
|