@crewhaus/prompt-injection-detector 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts DELETED
@@ -1,774 +0,0 @@
1
- /**
2
- * Catalog R8 `prompt-injection-detector` — heuristic + optional LLM
3
- * classifier for tool output. Used by runtime-core after every tool
4
- * call (when the tool's `classifyOutput` flag is not explicitly false)
5
- * to decide whether the output should be passed to the model verbatim,
6
- * passed with a system warning, or redacted.
7
- *
8
- * Three layers, fail-closed when ambiguous:
9
- *
10
- * Layer 1 — regex rules over a corpus drawn from OWASP LLM Top-10
11
- * plus a 50+-vector hand-crafted set. Each rule has a severity tag
12
- * and contributes to a cumulative score. The corpus is exported so
13
- * tests and downstream tools can audit it.
14
- *
15
- * Layer 2 — structural heuristics. Trailing imperative blocks,
16
- * role-marker injection (e.g. "system:" / "<|im_start|>"), BOM
17
- * tampering, and base64 wrapping a malicious string are all detected
18
- * without overlap with Layer 1. These produce hits with severity
19
- * weighted by structural risk.
20
- *
21
- * Layer 3 — optional LLM classifier. Activated when
22
- * `CREWHAUS_PI_CLASSIFIER_MODEL` is set; the runtime supplies a
23
- * `classify` callback that delegates to a model. Without the env
24
- * var the layer is a no-op.
25
- *
26
- * The aggregate score thresholds:
27
- * < 0.40 → "clean"
28
- * [0.40, 0.80) → "suspicious"
29
- * ≥ 0.80 → "malicious"
30
- *
31
- * Layer R8. Pairs with `tool-result-store` (the previewContent input)
32
- * and `runtime-core` (the post-tool callsite that consumes the
33
- * classification).
34
- */
35
-
36
- export type PromptInjectionClassification = "clean" | "suspicious" | "malicious";
37
-
38
- export type PromptInjectionSeverity = "low" | "medium" | "high";
39
-
40
- export type PromptInjectionHit = {
41
- /** Stable rule id; safe to surface in logs and the redaction notice. */
42
- readonly rule: string;
43
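- /** [start, end) UTF-16 code-unit offsets (not bytes) into the string the layer scanned; for regex hits that may be a normalized or decoded view of the input. */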
44
- readonly span: readonly [number, number];
45
- readonly severity: PromptInjectionSeverity;
46
- /** Layer that produced the hit. */
47
- readonly layer: "regex" | "structural" | "llm";
48
- };
49
-
50
- export type PromptInjectionResult = {
51
- readonly classification: PromptInjectionClassification;
52
- /** [0, 1] aggregate score. Higher = more likely injection. */
53
- readonly score: number;
54
- readonly hits: ReadonlyArray<PromptInjectionHit>;
55
- };
56
-
57
- export type PromptInjectionRule = {
58
- readonly id: string;
59
- readonly pattern: RegExp;
60
- readonly severity: PromptInjectionSeverity;
61
- readonly description?: string;
62
- };
63
-
64
- const SEVERITY_WEIGHT: Record<PromptInjectionSeverity, number> = {
65
- low: 0.18,
66
- medium: 0.42,
67
- high: 0.85,
68
- };
69
-
70
- const SCORE_SUSPICIOUS = 0.4;
71
- const SCORE_MALICIOUS = 0.8;
72
-
73
- // Upper bound on the text the regex/structural layers scan, so a pathological
74
- // (e.g. multi-MB whitespace) input cannot wedge the classifier (#153). Larger
75
- // inputs are analyzed head + tail.
76
- const MAX_CLASSIFY_LEN = 64 * 1024;
77
-
78
- // Zero-width / format / bidi / tag characters used to split trigger words
79
- // ("ig<U+200B>nore"). Stripped from the match view; their *presence* is still
80
- // caught on the raw text by the unicode-tag-spoof / rtl-override rules.
81
- const INVISIBLE_RE = /[­᠎​-‏‪-‮⁠-⁤⁦-\u{E0000}-\u{E007F}]/gu;
82
-
83
- // Common confusable homoglyphs → ASCII, applied only to the match view so an
84
- // attacker cannot dodge the keyword rules with Cyrillic/Greek look-alikes
85
- // (e.g. Cyrillic "іgnоre"). Intentionally small to limit false positives.
86
- const HOMOGLYPHS: Record<string, string> = {
87
- а: "a",
88
- е: "e",
89
- о: "o",
90
- р: "p",
91
- с: "c",
92
- у: "y",
93
- х: "x",
94
- і: "i",
95
- ѕ: "s",
96
- ј: "j",
97
- // Lowercase Cyrillic look-alikes whose UPPERCASE forms are mapped below.
98
- // NFKC does not fold these to Latin, so without them a single lowercase
99
- // homoglyph inside a trigger word (e.g. Cyrillic т U+0442 in "insтructions")
100
- // slips past the keyword rules even though the uppercase Т is folded.
101
- в: "b",
102
- к: "k",
103
- м: "m",
104
- н: "h",
105
- т: "t",
106
- // Capital Cyrillic look-alikes. NFKC does not fold these to Latin, so without
107
- // them a sentence-initial homoglyph (e.g. "Іgnore all previous instructions",
108
- // Cyrillic І U+0406) evades the keyword rules. Symmetric with the lowercase set.
109
- А: "A",
110
- В: "B",
111
- Е: "E",
112
- К: "K",
113
- М: "M",
114
- Н: "H",
115
- О: "O",
116
- Р: "P",
117
- С: "C",
118
- Т: "T",
119
- У: "Y",
120
- Х: "X",
121
- І: "I",
122
- Ј: "J",
123
- Ѕ: "S",
124
- Α: "A",
125
- Β: "B",
126
- Ε: "E",
127
- Ο: "O",
128
- Ρ: "P",
129
- Τ: "T",
130
- Χ: "X",
131
- ο: "o",
132
- ρ: "p",
133
- α: "a",
134
- ε: "e",
135
- ι: "i",
136
- // Lowercase Greek look-alikes whose uppercase forms are mapped above.
137
- β: "b",
138
- τ: "t",
139
- χ: "x",
140
- };
141
-
142
- /**
143
- * Hand-curated corpus. Rule ids are stable so callers (auditors, tests,
144
- * the redaction notice) can rely on them.
145
- */
146
- export const REGEX_RULES: ReadonlyArray<PromptInjectionRule> = [
147
- {
148
- id: "ignore-previous",
149
- pattern:
150
- /\bignore (?:all |the )?(?:previous|prior|above|earlier)\s+(?:instructions?|rules?|prompts?|messages?)\b/i,
151
- severity: "high",
152
- description: "OWASP LLM01 — ignore-previous-instructions injection",
153
- },
154
- {
155
- id: "disregard-previous",
156
- pattern:
157
- /\bdisregard (?:all |the )?(?:previous|prior|above|earlier)\s+(?:instructions?|rules?|prompts?)\b/i,
158
- severity: "high",
159
- },
160
- {
161
- id: "forget-previous",
162
- pattern:
163
- /\bforget (?:everything|all|what) (?:above|before|prior|previous|you (?:were|have been) told)\b/i,
164
- severity: "high",
165
- },
166
- {
167
- id: "new-instructions",
168
- pattern:
169
- /\b(?:here are|these are)\s+(?:your\s+)?new\s+(?:instructions|rules|directives|orders)\b/i,
170
- severity: "high",
171
- },
172
- {
173
- id: "from-now-on",
174
- pattern:
175
- /\bfrom now on(?: you (?:will|must|should))?\s+(?:act|behave|respond|answer|reply|follow)\b/i,
176
- severity: "medium",
177
- },
178
- {
179
- id: "you-are-now",
180
- pattern:
181
- /\byou are (?:now|hereby)\s+(?:a|an|the)\s+[\w-]+(?:\s+[\w-]+){0,3}\s+(?:assistant|model|persona|character|expert|engine|admin)\b/i,
182
- severity: "medium",
183
- },
184
- {
185
- id: "system-prompt-leak",
186
- pattern:
187
- /\b(?:reveal|print|output|display|show|expose|dump|leak)\s+(?:the\s+)?(?:system\s+prompt|developer\s+message|hidden\s+instructions|original\s+instructions)\b/i,
188
- severity: "high",
189
- },
190
- {
191
- id: "tell-me-system-prompt",
192
- pattern:
193
- /\b(?:tell me|tell me about|what is|what are|repeat|reveal|show me)\s+(?:your\s+|the\s+)?(?:system\s+prompt|initial\s+prompt|original\s+instructions|hidden\s+(?:rules|prompt))\b/i,
194
- severity: "high",
195
- },
196
- {
197
- id: "developer-mode",
198
- pattern:
199
- /\b(?:enable|activate|enter|enable_)?\s*(?:developer|dev|debug|admin|god|jailbreak|dan|stan|aim|free)\s*mode\b/i,
200
- severity: "medium",
201
- },
202
- {
203
- id: "dan-jailbreak",
204
- pattern: /\b(?:DAN|do anything now)\b/i,
205
- severity: "medium",
206
- },
207
- {
208
- id: "above-text-untrusted",
209
- pattern:
210
- /\bthe (?:above|prior|previous) (?:text|message|content|instruction)s? (?:is|are|was) (?:fake|untrusted|wrong|incorrect|a test)\b/i,
211
- severity: "high",
212
- },
213
- {
214
- id: "destructive-rm",
215
- pattern: /\brm\s+-rf\s+(?:\/|~|\$HOME|--no-preserve-root)/i,
216
- severity: "high",
217
- description: "Direct destructive command injection",
218
- },
219
- {
220
- id: "destructive-curl-pipe-sh",
221
- pattern: /\bcurl\s+[^\s|]+\s*\|\s*(?:sudo\s+)?(?:sh|bash|zsh)\b/i,
222
- severity: "high",
223
- },
224
- {
225
- id: "destructive-wget-pipe-sh",
226
- pattern: /\bwget\s+[^\s|]+\s*-O\s*-\s*\|\s*(?:sudo\s+)?(?:sh|bash|zsh)\b/i,
227
- severity: "high",
228
- },
229
- {
230
- id: "powershell-iex-net",
231
- pattern:
232
- /\b(?:iex|invoke-expression)\s*\(\s*(?:new-object\s+net\.webclient|invoke-webrequest)/i,
233
- severity: "high",
234
- },
235
- {
236
- id: "exfil-dotenv",
237
- pattern:
238
- /\b(?:cat|read|exfil(?:trate)?|upload|send|leak)\s+(?:the\s+)?(?:\.env|secrets?\.(?:json|yml|yaml)|id_rsa|\.aws\/credentials)/i,
239
- severity: "high",
240
- },
241
- {
242
- id: "exfil-credentials",
243
- pattern:
244
- /\b(?:send|post|upload|exfil(?:trate)?|forward)\s+(?:the\s+)?(?:credentials?|api[_\s-]?keys?|tokens?|passwords?)\s+(?:to|via|over)\b/i,
245
- severity: "high",
246
- },
247
- {
248
- id: "system-role-marker",
249
- pattern: /<\|im_start\|>\s*system|<\|system\|>|\[INST\][\s\S]*?\[\/INST\]/,
250
- severity: "high",
251
- description: "OpenAI/Anthropic role-marker injection",
252
- },
253
- {
254
- id: "anthropic-tag-injection",
255
- pattern: /<\/?(?:system|human|assistant)>/i,
256
- severity: "medium",
257
- },
258
- {
259
- id: "hidden-system",
260
- pattern:
261
- /\b(?:override|replace|update)\s+(?:your\s+)?system\s+(?:prompt|message|instructions)\b/i,
262
- severity: "high",
263
- },
264
- {
265
- id: "no-restrictions",
266
- pattern: /\b(?:no|without|removing)\s+(?:restrictions|filters|limits|safeguards|guardrails)\b/i,
267
- severity: "medium",
268
- },
269
- {
270
- id: "pretend-you-are",
271
- pattern:
272
- /\b(?:pretend|act as if|imagine|roleplay (?:as|that))\s+you\s+(?:are|were)\s+(?:not\s+|no\s+longer\s+)?(?:bound|restricted|limited|trained)\b/i,
273
- severity: "medium",
274
- },
275
- {
276
- id: "override-safety",
277
- pattern:
278
- /\b(?:bypass|circumvent|disable|override|turn off)\s+(?:your\s+)?(?:safety|content|moderation|alignment)/i,
279
- severity: "high",
280
- },
281
- {
282
- id: "tool-call-injection",
283
- pattern:
284
- /\b(?:invoke|call|run|execute)\s+(?:the\s+)?(?:Bash|Python|Shell|Write|Edit|Fetch|fetch|webFetch)\s+tool\s+with\b/i,
285
- severity: "medium",
286
- },
287
- {
288
- id: "auto-execute",
289
- pattern:
290
- /\b(?:then|next|after that|finally|now)\s+(?:run|execute|do)\s+(?:the following|this|these)\b[^\n]*\b(?:rm|curl|wget|chmod|chown|sudo|nc|ncat)\b/i,
291
- severity: "high",
292
- },
293
- {
294
- id: "ssh-key-action",
295
- pattern:
296
- /\b(?:upload|publish|push|copy|cat)\s+(?:my\s+|your\s+|the\s+)?(?:ssh\s+)?(?:public\s+)?key/i,
297
- severity: "medium",
298
- },
299
- {
300
- id: "kubectl-cluster-admin",
301
- pattern: /\bkubectl\s+(?:create|apply|patch)\s+(?:clusterrolebinding|role|rolebinding)\b/i,
302
- severity: "high",
303
- },
304
- {
305
- id: "git-push-force-main",
306
- pattern: /\bgit\s+push\s+(?:--force|-f)\s+\w+\s+(?:main|master)/i,
307
- severity: "medium",
308
- },
309
- {
310
- id: "iframe-embed",
311
- pattern: /<iframe\s+[^>]*\bsrc\s*=\s*["'](?:javascript:|data:text\/html)/i,
312
- severity: "high",
313
- },
314
- {
315
- id: "javascript-uri",
316
- pattern: /\bjavascript:\s*(?:eval|fetch|XMLHttpRequest|document\.write)/i,
317
- severity: "high",
318
- },
319
- {
320
- id: "data-url-script",
321
- pattern: /data:text\/html;base64,[A-Za-z0-9+/=]{40,}/,
322
- severity: "medium",
323
- },
324
- {
325
- id: "smuggled-system-block",
326
- pattern: /^[ \t]*system:[ \t]*\n[\s\S]{0,400}\n[ \t]*human:/im,
327
- severity: "high",
328
- },
329
- {
330
- id: "fake-user-injection",
331
- pattern: /^[ \t]*(?:User|Human|USER):[ \t]*[^\n]{1,200}\n[ \t]*(?:Assistant|System|SYSTEM):/m,
332
- severity: "high",
333
- },
334
- {
335
- id: "unicode-tag-spoof",
336
- pattern: /[\u{E0001}\u{E0020}-\u{E007F}]+/u,
337
- severity: "high",
338
- description: "Unicode tag block (often used for hidden prompts)",
339
- },
340
- {
341
- id: "rtl-override",
342
- pattern: /[‮‭‎‏؜]/,
343
- severity: "medium",
344
- description: "Bidi/RTL override character — common in homoglyph attacks",
345
- },
346
- {
347
- id: "instruct-then-deny",
348
- pattern:
349
- /\b(?:do not|don'?t)\s+(?:tell|inform|reveal|warn)\s+(?:the )?(?:user|operator|caller)\b/i,
350
- severity: "high",
351
- },
352
- {
353
- id: "silently-execute",
354
- pattern: /\bsilently\s+(?:execute|run|perform|invoke)\b/i,
355
- severity: "high",
356
- },
357
- {
358
- id: "prompt-leak-by-name",
359
- pattern: /\b(?:print|repeat|output)\s+(?:everything|all of)\s+(?:above|before this line)\b/i,
360
- severity: "high",
361
- },
362
- {
363
- id: "tool-arg-injection",
364
- pattern:
365
- /\b(?:set|use|pass)\s+(?:the\s+)?(?:argument|parameter|flag)\s+["']?--?\w+["']?\s*=\s*["'][^"']*\$\(/i,
366
- severity: "medium",
367
- },
368
- {
369
- id: "fetch-attacker-url",
370
- pattern:
371
- /\b(?:please|now|then)?\s*(?:fetch|GET|POST|download)\s+(?:from\s+)?(?:https?:\/\/(?:[a-z0-9-]+\.)?(?:burpcollaborator|requestcatcher|webhook\.site|requestbin|interactsh|oast|ngrok|attacker)[a-z0-9.-]*)/i,
372
- severity: "high",
373
- },
374
- {
375
- id: "exfil-base64",
376
- pattern:
377
- /\bbase64\s+(?:encode|the\s+(?:secrets?|env|keys?))\b[\s\S]{0,40}(?:then\s+)?(?:send|post|upload|cat)/i,
378
- severity: "high",
379
- },
380
- {
381
- id: "shutdown-host",
382
- pattern:
383
- /\b(?:shutdown|halt|poweroff|reboot)\s+(?:-h\s+now|now|the host|the (?:server|machine))\b/i,
384
- severity: "medium",
385
- },
386
- {
387
- id: "mkfs-format",
388
- pattern: /\bmkfs\.(?:ext\d|xfs|btrfs|vfat)\s+\/dev\//i,
389
- severity: "high",
390
- },
391
- {
392
- id: "dd-of-dev",
393
- pattern: /\bdd\s+if=\S+\s+of=\/dev\/(?:sd[a-z]|nvme\d+n\d+|disk\d+)/i,
394
- severity: "high",
395
- },
396
- {
397
- id: "iptables-flush",
398
- pattern: /\biptables\s+-F(?:\s|$)/i,
399
- severity: "medium",
400
- },
401
- {
402
- id: "history-clear",
403
- pattern: /\b(?:history\s+-c|unset\s+HISTFILE|export\s+HISTFILE=\/dev\/null)\b/i,
404
- severity: "medium",
405
- },
406
- {
407
- id: "tool-run-in-background",
408
- pattern: /\b(?:nohup|setsid|disown).*?(?:rm|curl|wget|nc|bash)\b/i,
409
- severity: "medium",
410
- },
411
- {
412
- id: "encoded-injection-marker",
413
- pattern: /(?:%69gnore|%69%67%6e%6f%72%65)\s+(?:%70revious|%70%72%65%76%69%6f%75%73)/i,
414
- severity: "medium",
415
- },
416
- {
417
- id: "instruction-tag-payload",
418
- pattern: /<\s*INSTRUCTIONS?\s*>[\s\S]{0,800}<\s*\/\s*INSTRUCTIONS?\s*>/i,
419
- severity: "medium",
420
- },
421
- {
422
- id: "markdown-instruction-block",
423
- pattern: /^[\s\S]{1,400}?^>+[ \t]*(?:You are|Ignore|Disregard|Forget|From now on)/im,
424
- severity: "low",
425
- },
426
- {
427
- id: "imperative-link",
428
- pattern: /\[click here\]\(\s*(?:javascript:|data:|file:)/i,
429
- severity: "medium",
430
- },
431
- ];
432
-
433
- const MIN_CORPUS_RULES = 50;
434
-
435
/**
 * Corpus-size guard. Returns normally when `rules` holds at least
 * `MIN_CORPUS_RULES` entries and throws otherwise, so that a trimmed rule
 * list fails loudly instead of quietly weakening detection. Kept as a
 * standalone function so the failure branch can be driven with a crafted
 * list rather than by mutating the production corpus.
 *
 * @param rules Rule list whose length is checked.
 * @throws Error when `rules.length` is below `MIN_CORPUS_RULES`.
 */
function assertCorpusFloor(rules: ReadonlyArray<PromptInjectionRule>): void {
  const count = rules.length;
  if (count >= MIN_CORPUS_RULES) return;
  throw new Error(
    `prompt-injection-detector regex corpus has ${count} rules; minimum is ${MIN_CORPUS_RULES}`,
  );
}
448
-
449
- assertCorpusFloor(REGEX_RULES);
450
-
451
/** Looks up the score contribution for a severity tag in `SEVERITY_WEIGHT`. */
function severityWeight(s: PromptInjectionSeverity): number {
  const weight = SEVERITY_WEIGHT[s];
  return weight;
}
454
-
455
/**
 * Layer 1 — runs every rule once against `text` and records the first match
 * of each rule that fires.
 *
 * Rule patterns are shared, long-lived `RegExp` objects (the corpus is
 * exported), so a pattern carrying the `g` or `y` flag may arrive with a
 * non-zero `lastIndex` left behind by another caller. `exec` honours that
 * offset, which would make the scan start mid-string and miss a hit. The
 * offset is therefore cleared both before and after each `exec`.
 *
 * @param text  String to scan.
 * @param rules Rules to apply; defaults to the built-in `REGEX_RULES`.
 * @returns One hit per matching rule, in rule order. Spans are UTF-16
 *          offsets into `text`.
 */
function regexHits(
  text: string,
  rules: ReadonlyArray<PromptInjectionRule> = REGEX_RULES,
): PromptInjectionHit[] {
  const hits: PromptInjectionHit[] = [];
  for (const { id, pattern, severity } of rules) {
    // Only global/sticky patterns keep state between exec() calls.
    const stateful = pattern.global || pattern.sticky;
    if (stateful) pattern.lastIndex = 0;
    const m = pattern.exec(text);
    // Leave the shared pattern pristine for whoever uses it next.
    if (stateful) pattern.lastIndex = 0;
    if (m === null) continue;
    hits.push({
      rule: id,
      span: [m.index, m.index + m[0].length],
      severity,
      layer: "regex",
    });
  }
  return hits;
}
478
-
479
/**
 * Layer 2 — structural heuristics evaluated on the raw text.
 *
 * At most one hit is produced per rule, in this order:
 *  - `structural-bom` (low): the text opens with a U+FEFF byte-order mark.
 *  - `structural-role-cluster` (medium): two adjacent lines that both start
 *    with a `system:` / `assistant:` / `user:` / `human:` marker.
 *  - `structural-trailing-imperative` (medium): the final line, looked for
 *    within the last 350 characters, begins with an imperative verb.
 *  - `structural-suspicious-base64` (medium): a base64 run of 120+ characters
 *    preceded, within 80 characters, by a decode/run/execute-style keyword.
 *  - `structural-url-exfil-pair` (medium): a URL followed on the same line by
 *    a credential-like word.
 *
 * @param text Text to inspect.
 * @returns Structural hits; spans are UTF-16 offsets into `text`.
 */
function structuralHits(text: string): PromptInjectionHit[] {
  const hits: PromptInjectionHit[] = [];

  // BOM tampering. The mark is written as an explicit escape: a literal
  // U+FEFF inside the quotes is invisible and, if an editor or tool strips
  // it, degrades to startsWith("") — which is true for every input.
  if (text.startsWith("\uFEFF")) {
    hits.push({ rule: "structural-bom", span: [0, 1], severity: "low", layer: "structural" });
  }

  // Role-marker cluster: "role: ...\nrole:" on adjacent lines.
  const roleClusterRe =
    /(?:^|\n)[ \t]*(?:system|assistant|user|human)[ \t]*:[^\n]*\n[ \t]*(?:system|assistant|user|human)[ \t]*:/i;
  const role = roleClusterRe.exec(text);
  if (role) {
    hits.push({
      rule: "structural-role-cluster",
      span: [role.index, role.index + role[0].length],
      severity: "medium",
      layer: "structural",
    });
  }

  // Trailing imperative: only the last 350 characters are examined, and the
  // `$` anchor (no `m` flag) means the match must run to the end of the text,
  // i.e. it is the final line that has to open with the imperative verb.
  const tailStart = Math.max(0, text.length - 350);
  const tail = text.slice(tailStart);
  const tailImperative =
    /(?:^|\n)[ \t]*(?:now |then |finally )?(?:please[ \t]+)?(?:run|execute|fetch|delete|remove|email|upload|send|forward|leak|exfil(?:trate)?|shutdown|kill|chmod|chown|sudo)\b[^\n]{0,200}$/i;
  const t = tailImperative.exec(tail);
  if (t) {
    hits.push({
      rule: "structural-trailing-imperative",
      span: [tailStart + t.index, tailStart + t.index + t[0].length],
      severity: "medium",
      layer: "structural",
    });
  }

  // Long base64 with an imperative neighbour. Every run is inspected, not
  // just the first, so a harmless leading blob cannot shield a later
  // payload. Runs are 120+ characters each, so the loop is linear in the
  // text length; the first suspicious run is reported and the scan stops.
  for (const b64 of text.matchAll(/[A-Za-z0-9+/]{120,}={0,2}/g)) {
    const start = b64.index ?? 0;
    const ctx = text.slice(Math.max(0, start - 80), start);
    if (/(decode|run|execute|eval|payload|shell)/i.test(ctx)) {
      hits.push({
        rule: "structural-suspicious-base64",
        span: [start, start + b64[0].length],
        severity: "medium",
        layer: "structural",
      });
      break;
    }
  }

  // URL followed by a credential-like word on the same line.
  const urlExfil =
    /(?:https?:\/\/[^\s)]+)[^\n]{0,80}\b(?:token|secret|api[_\s-]?key|cookie|session)\b/i.exec(
      text,
    );
  if (urlExfil) {
    hits.push({
      rule: "structural-url-exfil-pair",
      span: [urlExfil.index, urlExfil.index + urlExfil[0].length],
      severity: "medium",
      layer: "structural",
    });
  }

  return hits;
}
550
-
551
- export type LlmClassifyFn = (
552
- text: string,
553
- ) => Promise<{ verdict: PromptInjectionClassification; rationale?: string } | undefined>;
554
-
555
- export type ClassifyOptions = {
556
- /**
557
- * When set, layer 3 LLM classifier runs and its verdict can lift
558
- * "clean" → "suspicious" or upgrade an existing suspicious verdict
559
- * to "malicious". A `clean` verdict from the model is advisory only —
560
- * we never downgrade a high-severity regex hit.
561
- *
562
- * Activated when the runtime sets `CREWHAUS_PI_CLASSIFIER_MODEL` and
563
- * the runtime supplies the actual classify callback.
564
- */
565
- readonly llmClassifier?: LlmClassifyFn;
566
- /**
567
- * Override the suspicious / malicious thresholds. Mostly used by
568
- * tests; production should leave defaults.
569
- */
570
- readonly thresholds?: { readonly suspicious?: number; readonly malicious?: number };
571
- };
572
-
573
/**
 * Folds the hits into a single score with a probabilistic OR: starting from
 * 0, each hit of weight `w` moves the running value `p` to
 * `1 - (1 - p) * (1 - w)`. The result is capped at 1; no hits yields 0.
 */
function aggregateScore(hits: ReadonlyArray<PromptInjectionHit>): number {
  const combined = hits.reduce(
    (acc, hit) => 1 - (1 - acc) * (1 - severityWeight(hit.severity)),
    0,
  );
  return Math.min(1, combined);
}
582
-
583
/**
 * Maps a score onto a classification: at or above `threshold.malicious` is
 * "malicious", otherwise at or above `threshold.suspicious` is "suspicious",
 * and anything else is "clean".
 */
function classify(score: number, threshold: { suspicious: number; malicious: number }) {
  const { suspicious, malicious } = threshold;
  if (score >= malicious) return "malicious" as const;
  return score >= suspicious ? ("suspicious" as const) : ("clean" as const);
}
588
-
589
/**
 * Replaces every code point that has an entry in `HOMOGLYPHS` with its
 * mapped value and leaves all other code points untouched.
 */
function foldHomoglyphs(s: string): string {
  // Array.from walks the string by code point, like for...of.
  return Array.from(s, (ch) => HOMOGLYPHS[ch] ?? ch).join("");
}
594
-
595
/**
 * Builds the canonical "match view" that the keyword rules are run against.
 * Steps, in order:
 *   1. NFKC normalization (folds full-width and other compatibility forms);
 *   2. removal of every character matched by `INVISIBLE_RE`;
 *   3. homoglyph folding via `foldHomoglyphs`;
 *   4. collapsing each whitespace run to one space, so the literal-space
 *      anchors in the rules match "ignore\n\nprevious" and "ignore previous"
 *      alike (#143).
 */
function normalizeForMatch(text: string): string {
  const compat = text.normalize("NFKC");
  const visibleOnly = compat.replace(INVISIBLE_RE, "");
  const folded = foldHomoglyphs(visibleOnly);
  return folded.replace(/\s+/g, " ");
}
606
-
607
/**
 * True when strictly more than 85% of the UTF-16 code units in `s` are tab,
 * line feed, carriage return, or in the ASCII range 0x20–0x7E. The empty
 * string is never considered printable.
 */
function isMostlyPrintable(s: string): boolean {
  const total = s.length;
  if (total === 0) return false;
  // No `u` flag: the pattern counts code units, exactly like charCodeAt.
  const printableUnits = s.match(/[\t\n\r\x20-\x7e]/g) ?? [];
  return printableUnits.length / total > 0.85;
}
616
-
617
/**
 * Attempts to read `blob` as base64-encoded UTF-8 text.
 *
 * @returns The decoded string when `blob` is at least 16 characters long,
 *          its length is not 1 modulo 4, and the decoded text passes
 *          `isMostlyPrintable`; `undefined` in every other case, including
 *          when decoding throws.
 */
function tryDecodeBase64(blob: string): string | undefined {
  const size = blob.length;
  const plausible = size >= 16 && size % 4 !== 1;
  if (!plausible) return undefined;
  try {
    const bytes = Buffer.from(blob, "base64");
    const decoded = bytes.toString("utf8");
    if (isMostlyPrintable(decoded)) return decoded;
    return undefined;
  } catch {
    return undefined;
  }
}
626
-
627
/**
 * Attempts to read `blob` as hex-encoded UTF-8 text.
 *
 * @returns The decoded string when `blob` is at least 16 characters long,
 *          has an even length, and the decoded text passes
 *          `isMostlyPrintable`; `undefined` in every other case, including
 *          when decoding throws.
 */
function tryDecodeHex(blob: string): string | undefined {
  const size = blob.length;
  const plausible = size >= 16 && size % 2 === 0;
  if (!plausible) return undefined;
  try {
    const bytes = Buffer.from(blob, "hex");
    const decoded = bytes.toString("utf8");
    if (isMostlyPrintable(decoded)) return decoded;
    return undefined;
  } catch {
    return undefined;
  }
}
636
-
637
/**
 * Percent-decodes the `%XX` escapes found in `text`.
 *
 * Each maximal run of `%XX` escapes is decoded on its own rather than
 * handing the whole text to `decodeURIComponent`. That call throws on any
 * malformed sequence, so a single stray `%` ("100% done") or an invalid
 * UTF-8 escape ("%FF") anywhere in the input would otherwise switch the
 * percent layer off for the entire text. When a run is not valid UTF-8, its
 * single-byte ASCII escapes (%00–%7F) are still decoded and the remaining
 * escapes are left as written.
 *
 * @param text Text that may contain percent-escapes.
 * @returns The decoded text, or `undefined` when nothing was decoded.
 */
function tryDecodePercent(text: string): string | undefined {
  const decoded = text.replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      // Invalid UTF-8 inside this run: salvage the ASCII escapes only.
      return run.replace(/%([0-7][0-9A-Fa-f])/g, (_escape, hex: string) =>
        String.fromCharCode(Number.parseInt(hex, 16)),
      );
    }
  });
  return decoded !== text ? decoded : undefined;
}
645
-
646
/**
 * Recursively decodes base64 / hex / percent-encoded blobs found in `text`
 * so that an injection hidden in an encoded payload can be rescanned in
 * cleartext, regardless of neighbouring keywords (#143).
 *
 * Work is bounded so this cannot itself become a DoS vector:
 *  - recursion stops after `depth` levels;
 *  - per encoding, at most 8 blobs are *successfully decoded*. Candidates
 *    that fail to decode do not use up that budget, so a handful of junk
 *    tokens placed ahead of the real payload cannot push it out of view;
 *  - matches are consumed lazily and scanning stops once 16 variants have
 *    been collected, which is also the cap on the returned list.
 *
 * @param text  Text to search for encoded blobs.
 * @param depth Remaining recursion depth (default 2).
 * @returns Up to 16 decoded strings, each followed by its own nested
 *          decodings.
 */
function decodedVariants(text: string, depth = 2): string[] {
  if (depth <= 0 || text.length === 0) return [];

  const maxVariants = 16;
  const maxDecodedPerEncoding = 8;
  const out: string[] = [];

  // Records a decoded string plus its nested decodings; reports whether
  // anything was recorded.
  const push = (s: string | undefined): boolean => {
    if (s === undefined || s.length === 0) return false;
    out.push(s, ...decodedVariants(s, depth - 1));
    return true;
  };

  // Tries `decode` on each match of `candidateRe` until the budget is spent.
  const scan = (candidateRe: RegExp, decode: (blob: string) => string | undefined): void => {
    let decodedCount = 0;
    for (const m of text.matchAll(candidateRe)) {
      if (out.length >= maxVariants) return;
      if (push(decode(m[0])) && ++decodedCount >= maxDecodedPerEncoding) return;
    }
  };

  scan(/[A-Za-z0-9+/]{16,}={0,2}/g, tryDecodeBase64);
  scan(/(?:[0-9A-Fa-f]{2}){8,}/g, tryDecodeHex);
  if (/%[0-9A-Fa-f]{2}/.test(text)) push(tryDecodePercent(text));
  return out.slice(0, maxVariants);
}
667
-
668
/**
 * Classify a tool output. Pure with respect to the input string when the
 * LLM classifier is not supplied.
 *
 * Pipeline:
 *  1. Inputs longer than `MAX_CLASSIFY_LEN` are reduced to head + "\n" +
 *     tail so a pathological input cannot wedge the classifier (#153).
 *  2. The regex rules run over several views of that bounded text: the raw
 *     text, its normalized match view, and every decoded variant — each
 *     decoded variant both raw and normalized, so a payload that is encoded
 *     *and* obfuscated (extra whitespace, homoglyphs, zero-width splits) is
 *     still matched (#143). Hits are deduplicated by rule id; the first view
 *     that fires a rule supplies the span, so regex spans are offsets into
 *     that view and not necessarily into `text`.
 *  3. Structural rules run on the raw bounded text.
 *  4. The optional LLM verdict can only raise the classification. Failures
 *     of the callback are swallowed so a model outage does not block tool
 *     execution.
 *
 * @param text Tool output to classify. The empty string is always "clean".
 * @param opts Optional LLM classifier callback and threshold overrides.
 * @returns The classification, aggregate score in [0, 1], and all hits.
 */
export async function classifyText(
  text: string,
  opts: ClassifyOptions = {},
): Promise<PromptInjectionResult> {
  const threshold = {
    suspicious: opts.thresholds?.suspicious ?? SCORE_SUSPICIOUS,
    malicious: opts.thresholds?.malicious ?? SCORE_MALICIOUS,
  };
  if (text === "") {
    return { classification: "clean", score: 0, hits: [] };
  }

  // Keep head + tail so leading and trailing injections both stay in view.
  const half = MAX_CLASSIFY_LEN / 2;
  const analyzed =
    text.length > MAX_CLASSIFY_LEN ? `${text.slice(0, half)}\n${text.slice(-half)}` : text;

  // Collect the views to scan. A view identical to one already queued would
  // only reproduce rule ids that are deduplicated anyway, so it is skipped.
  const variants: string[] = [];
  const addView = (view: string): void => {
    if (!variants.includes(view)) variants.push(view);
  };
  addView(analyzed);
  addView(normalizeForMatch(analyzed));
  for (const decoded of decodedVariants(analyzed)) {
    addView(decoded);
    // Decoded cleartext gets the same de-obfuscation as the raw text.
    addView(normalizeForMatch(decoded));
  }

  const regHits: PromptInjectionHit[] = [];
  const seenRules = new Set<string>();
  for (const variant of variants) {
    for (const h of regexHits(variant)) {
      if (seenRules.has(h.rule)) continue;
      seenRules.add(h.rule);
      regHits.push(h);
    }
  }

  const hits: PromptInjectionHit[] = [...regHits, ...structuralHits(analyzed)];
  let score = aggregateScore(hits);
  let classification = classify(score, threshold);

  if (opts.llmClassifier !== undefined) {
    try {
      const verdict = await opts.llmClassifier(analyzed);
      if (verdict !== undefined) {
        if (verdict.verdict === "malicious") {
          classification = "malicious";
          score = Math.max(score, threshold.malicious);
          hits.push({
            rule: "llm-malicious",
            span: [0, Math.min(text.length, 200)],
            severity: "high",
            layer: "llm",
          });
        } else if (verdict.verdict === "suspicious" && classification === "clean") {
          // A "suspicious" verdict only lifts a clean result; a "clean"
          // verdict never lowers what the rule layers found.
          classification = "suspicious";
          score = Math.max(score, threshold.suspicious);
          hits.push({
            rule: "llm-suspicious",
            span: [0, Math.min(text.length, 200)],
            severity: "medium",
            layer: "llm",
          });
        }
      }
    } catch {
      // LLM tier is best-effort; swallow so a model outage doesn't
      // block tool execution.
    }
  }

  return { classification, score, hits };
}
740
-
741
/**
 * Build a redaction notice safe to substitute for the original tool output.
 * The notice names the rules that fired so that auditors can verify the
 * decision later.
 *
 * Rule ids are deduplicated in first-seen order and at most six are listed.
 * When `hits` is empty — possible when a caller overrides a threshold to 0 —
 * the id list and its separator are omitted instead of emitting a dangling
 * "detected: ]".
 *
 * @param hits Hits that led to the redaction.
 * @returns A single-line, bracketed notice.
 */
export function buildRedactionNotice(hits: ReadonlyArray<PromptInjectionHit>): string {
  const ids = [...new Set(hits.map((h) => h.rule))].slice(0, 6);
  const prefix = "[tool output redacted: prompt injection detected";
  return ids.length === 0 ? `${prefix}]` : `${prefix}: ${ids.join(", ")}]`;
}
750
-
751
/**
 * Reports whether the env-driven LLM classifier should run: true exactly
 * when `CREWHAUS_PI_CLASSIFIER_MODEL` is present in `env` and is not blank
 * after trimming.
 *
 * @param env Environment to inspect; defaults to `process.env`.
 */
export function llmClassifierEnabled(env: NodeJS.ProcessEnv = process.env): boolean {
  const configured = env["CREWHAUS_PI_CLASSIFIER_MODEL"];
  if (configured === undefined) return false;
  return configured.trim().length > 0;
}
758
-
759
- /**
760
- * Internal seams exposed ONLY for unit tests. Not part of the public API and
761
- * not subject to semver — these let the test suite drive the module's
762
- * defensive branches (corpus-floor guard, global-flag `lastIndex` reset, and
763
- * the decoder `try/catch` fallbacks) with crafted inputs that the public
764
- * `classifyText` entrypoint can never construct on its own. Do not import
765
- * from application code.
766
- */
767
- export const __internals = {
768
- assertCorpusFloor,
769
- regexHits,
770
- tryDecodeBase64,
771
- tryDecodeHex,
772
- tryDecodePercent,
773
- MIN_CORPUS_RULES,
774
- } as const;