@crewhaus/prompt-injection-detector 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,670 @@
1
+ /**
2
+ * Catalog R8 `prompt-injection-detector` — heuristic + optional LLM
3
+ * classifier for tool output. Used by runtime-core after every tool
4
+ * call (when the tool's `classifyOutput` flag is not explicitly false)
5
+ * to decide whether the output should be passed to the model verbatim,
6
+ * passed with a system warning, or redacted.
7
+ *
8
+ * Three layers, fail-closed when ambiguous:
9
+ *
10
+ * Layer 1 — regex rules over a corpus drawn from OWASP LLM Top-10
11
+ * plus a 50+-vector hand-crafted set. Each rule has a severity tag
12
+ * and contributes to a cumulative score. The corpus is exported so
13
+ * tests and downstream tools can audit it.
14
+ *
15
+ * Layer 2 — structural heuristics. Trailing imperative blocks,
16
+ * role-marker injection (e.g. "system:" / "<|im_start|>"), BOM
17
+ * tampering, and base64 wrapping a malicious string are all detected
18
+ * without overlap with Layer 1. These produce hits with severity
19
+ * weighted by structural risk.
20
+ *
21
+ * Layer 3 — optional LLM classifier. Activated when
22
+ * `CREWHAUS_PI_CLASSIFIER_MODEL` is set; the runtime supplies a
23
+ * `llmClassifier` callback (read from `opts.llmClassifier`) that delegates to a model. Without the env
24
+ * var the layer is a no-op.
25
+ *
26
+ * The aggregate score thresholds:
27
+ * < 0.40 → "clean"
28
+ * [0.40, 0.80) → "suspicious"
29
+ * ≥ 0.80 → "malicious"
30
+ *
31
+ * Layer R8. Pairs with `tool-result-store` (the previewContent input)
32
+ * and `runtime-core` (the post-tool callsite that consumes the
33
+ * classification).
34
+ */
35
// Weight each hit contributes to the aggregate score, keyed by the rule's
// severity tag. With the default thresholds below, a single "high" hit (0.85)
// is already >= SCORE_MALICIOUS, a single "medium" hit (0.42) is already
// >= SCORE_SUSPICIOUS, and a single "low" hit (0.18) stays below both.
const SEVERITY_WEIGHT = {
    low: 0.18,
    medium: 0.42,
    high: 0.85,
};
// Default lower bound (inclusive) of the "suspicious" band.
const SCORE_SUSPICIOUS = 0.4;
// Default lower bound (inclusive) of the "malicious" band.
const SCORE_MALICIOUS = 0.8;
// Upper bound on the text the regex/structural layers scan, so a pathological
// (e.g. multi-MB whitespace) input cannot wedge the classifier (#153). Larger
// inputs are analyzed head + tail.
const MAX_CLASSIFY_LEN = 64 * 1024;
46
+ // Zero-width / format / bidi / tag characters used to split trigger words
47
+ // ("ig<U+200B>nore"). Stripped from the match view; their *presence* is still
48
+ // caught on the raw text by the unicode-tag-spoof / rtl-override rules.
49
+ const INVISIBLE_RE = /[­᠎​-‏‪-‮⁠-⁤⁦-\u{E0000}-\u{E007F}]/gu;
50
+ // Common confusable homoglyphs → ASCII, applied only to the match view so an
51
+ // attacker cannot dodge the keyword rules with Cyrillic/Greek look-alikes
52
+ // (e.g. Cyrillic "іgnоre"). Intentionally small to limit false positives.
53
+ const HOMOGLYPHS = {
54
+ а: "a",
55
+ е: "e",
56
+ о: "o",
57
+ р: "p",
58
+ с: "c",
59
+ у: "y",
60
+ х: "x",
61
+ і: "i",
62
+ ѕ: "s",
63
+ ј: "j",
64
+ // Lowercase Cyrillic look-alikes whose UPPERCASE forms are mapped below.
65
+ // NFKC does not fold these to Latin, so without them a single lowercase
66
+ // homoglyph inside a trigger word (e.g. Cyrillic т U+0442 in "insтructions")
67
+ // slips past the keyword rules even though the uppercase Т is folded.
68
+ в: "b",
69
+ к: "k",
70
+ м: "m",
71
+ н: "h",
72
+ т: "t",
73
+ // Capital Cyrillic look-alikes. NFKC does not fold these to Latin, so without
74
+ // them a sentence-initial homoglyph (e.g. "Іgnore all previous instructions",
75
+ // Cyrillic І U+0406) evades the keyword rules. Symmetric with the lowercase set.
76
+ А: "A",
77
+ В: "B",
78
+ Е: "E",
79
+ К: "K",
80
+ М: "M",
81
+ Н: "H",
82
+ О: "O",
83
+ Р: "P",
84
+ С: "C",
85
+ Т: "T",
86
+ У: "Y",
87
+ Х: "X",
88
+ І: "I",
89
+ Ј: "J",
90
+ Ѕ: "S",
91
+ Α: "A",
92
+ Β: "B",
93
+ Ε: "E",
94
+ Ο: "O",
95
+ Ρ: "P",
96
+ Τ: "T",
97
+ Χ: "X",
98
+ ο: "o",
99
+ ρ: "p",
100
+ α: "a",
101
+ ε: "e",
102
+ ι: "i",
103
+ // Lowercase Greek look-alikes whose uppercase forms are mapped above.
104
+ β: "b",
105
+ τ: "t",
106
+ χ: "x",
107
+ };
108
+ /**
109
+ * Hand-curated corpus. Rule ids are stable so callers (auditors, tests,
110
+ * the redaction notice) can rely on them.
111
+ */
112
+ export const REGEX_RULES = [
113
+ {
114
+ id: "ignore-previous",
115
+ pattern: /\bignore (?:all |the )?(?:previous|prior|above|earlier)\s+(?:instructions?|rules?|prompts?|messages?)\b/i,
116
+ severity: "high",
117
+ description: "OWASP LLM01 — ignore-previous-instructions injection",
118
+ },
119
+ {
120
+ id: "disregard-previous",
121
+ pattern: /\bdisregard (?:all |the )?(?:previous|prior|above|earlier)\s+(?:instructions?|rules?|prompts?)\b/i,
122
+ severity: "high",
123
+ },
124
+ {
125
+ id: "forget-previous",
126
+ pattern: /\bforget (?:everything|all|what) (?:above|before|prior|previous|you (?:were|have been) told)\b/i,
127
+ severity: "high",
128
+ },
129
+ {
130
+ id: "new-instructions",
131
+ pattern: /\b(?:here are|these are)\s+(?:your\s+)?new\s+(?:instructions|rules|directives|orders)\b/i,
132
+ severity: "high",
133
+ },
134
+ {
135
+ id: "from-now-on",
136
+ pattern: /\bfrom now on(?: you (?:will|must|should))?\s+(?:act|behave|respond|answer|reply|follow)\b/i,
137
+ severity: "medium",
138
+ },
139
+ {
140
+ id: "you-are-now",
141
+ pattern: /\byou are (?:now|hereby)\s+(?:a|an|the)\s+[\w-]+(?:\s+[\w-]+){0,3}\s+(?:assistant|model|persona|character|expert|engine|admin)\b/i,
142
+ severity: "medium",
143
+ },
144
+ {
145
+ id: "system-prompt-leak",
146
+ pattern: /\b(?:reveal|print|output|display|show|expose|dump|leak)\s+(?:the\s+)?(?:system\s+prompt|developer\s+message|hidden\s+instructions|original\s+instructions)\b/i,
147
+ severity: "high",
148
+ },
149
+ {
150
+ id: "tell-me-system-prompt",
151
+ pattern: /\b(?:tell me|tell me about|what is|what are|repeat|reveal|show me)\s+(?:your\s+|the\s+)?(?:system\s+prompt|initial\s+prompt|original\s+instructions|hidden\s+(?:rules|prompt))\b/i,
152
+ severity: "high",
153
+ },
154
+ {
155
+ id: "developer-mode",
156
+ pattern: /\b(?:enable|activate|enter|enable_)?\s*(?:developer|dev|debug|admin|god|jailbreak|dan|stan|aim|free)\s*mode\b/i,
157
+ severity: "medium",
158
+ },
159
+ {
160
+ id: "dan-jailbreak",
161
+ pattern: /\b(?:DAN|do anything now)\b/i,
162
+ severity: "medium",
163
+ },
164
+ {
165
+ id: "above-text-untrusted",
166
+ pattern: /\bthe (?:above|prior|previous) (?:text|message|content|instruction)s? (?:is|are|was) (?:fake|untrusted|wrong|incorrect|a test)\b/i,
167
+ severity: "high",
168
+ },
169
+ {
170
+ id: "destructive-rm",
171
+ pattern: /\brm\s+-rf\s+(?:\/|~|\$HOME|--no-preserve-root)/i,
172
+ severity: "high",
173
+ description: "Direct destructive command injection",
174
+ },
175
+ {
176
+ id: "destructive-curl-pipe-sh",
177
+ pattern: /\bcurl\s+[^\s|]+\s*\|\s*(?:sudo\s+)?(?:sh|bash|zsh)\b/i,
178
+ severity: "high",
179
+ },
180
+ {
181
+ id: "destructive-wget-pipe-sh",
182
+ pattern: /\bwget\s+[^\s|]+\s*-O\s*-\s*\|\s*(?:sudo\s+)?(?:sh|bash|zsh)\b/i,
183
+ severity: "high",
184
+ },
185
+ {
186
+ id: "powershell-iex-net",
187
+ pattern: /\b(?:iex|invoke-expression)\s*\(\s*(?:new-object\s+net\.webclient|invoke-webrequest)/i,
188
+ severity: "high",
189
+ },
190
+ {
191
+ id: "exfil-dotenv",
192
+ pattern: /\b(?:cat|read|exfil(?:trate)?|upload|send|leak)\s+(?:the\s+)?(?:\.env|secrets?\.(?:json|yml|yaml)|id_rsa|\.aws\/credentials)/i,
193
+ severity: "high",
194
+ },
195
+ {
196
+ id: "exfil-credentials",
197
+ pattern: /\b(?:send|post|upload|exfil(?:trate)?|forward)\s+(?:the\s+)?(?:credentials?|api[_\s-]?keys?|tokens?|passwords?)\s+(?:to|via|over)\b/i,
198
+ severity: "high",
199
+ },
200
+ {
201
+ id: "system-role-marker",
202
+ pattern: /<\|im_start\|>\s*system|<\|system\|>|\[INST\][\s\S]*?\[\/INST\]/,
203
+ severity: "high",
204
+ description: "OpenAI/Anthropic role-marker injection",
205
+ },
206
+ {
207
+ id: "anthropic-tag-injection",
208
+ pattern: /<\/?(?:system|human|assistant)>/i,
209
+ severity: "medium",
210
+ },
211
+ {
212
+ id: "hidden-system",
213
+ pattern: /\b(?:override|replace|update)\s+(?:your\s+)?system\s+(?:prompt|message|instructions)\b/i,
214
+ severity: "high",
215
+ },
216
+ {
217
+ id: "no-restrictions",
218
+ pattern: /\b(?:no|without|removing)\s+(?:restrictions|filters|limits|safeguards|guardrails)\b/i,
219
+ severity: "medium",
220
+ },
221
+ {
222
+ id: "pretend-you-are",
223
+ pattern: /\b(?:pretend|act as if|imagine|roleplay (?:as|that))\s+you\s+(?:are|were)\s+(?:not\s+|no\s+longer\s+)?(?:bound|restricted|limited|trained)\b/i,
224
+ severity: "medium",
225
+ },
226
+ {
227
+ id: "override-safety",
228
+ pattern: /\b(?:bypass|circumvent|disable|override|turn off)\s+(?:your\s+)?(?:safety|content|moderation|alignment)/i,
229
+ severity: "high",
230
+ },
231
+ {
232
+ id: "tool-call-injection",
233
+ pattern: /\b(?:invoke|call|run|execute)\s+(?:the\s+)?(?:Bash|Python|Shell|Write|Edit|Fetch|fetch|webFetch)\s+tool\s+with\b/i,
234
+ severity: "medium",
235
+ },
236
+ {
237
+ id: "auto-execute",
238
+ pattern: /\b(?:then|next|after that|finally|now)\s+(?:run|execute|do)\s+(?:the following|this|these)\b[^\n]*\b(?:rm|curl|wget|chmod|chown|sudo|nc|ncat)\b/i,
239
+ severity: "high",
240
+ },
241
+ {
242
+ id: "ssh-key-action",
243
+ pattern: /\b(?:upload|publish|push|copy|cat)\s+(?:my\s+|your\s+|the\s+)?(?:ssh\s+)?(?:public\s+)?key/i,
244
+ severity: "medium",
245
+ },
246
+ {
247
+ id: "kubectl-cluster-admin",
248
+ pattern: /\bkubectl\s+(?:create|apply|patch)\s+(?:clusterrolebinding|role|rolebinding)\b/i,
249
+ severity: "high",
250
+ },
251
+ {
252
+ id: "git-push-force-main",
253
+ pattern: /\bgit\s+push\s+(?:--force|-f)\s+\w+\s+(?:main|master)/i,
254
+ severity: "medium",
255
+ },
256
+ {
257
+ id: "iframe-embed",
258
+ pattern: /<iframe\s+[^>]*\bsrc\s*=\s*["'](?:javascript:|data:text\/html)/i,
259
+ severity: "high",
260
+ },
261
+ {
262
+ id: "javascript-uri",
263
+ pattern: /\bjavascript:\s*(?:eval|fetch|XMLHttpRequest|document\.write)/i,
264
+ severity: "high",
265
+ },
266
+ {
267
+ id: "data-url-script",
268
+ pattern: /data:text\/html;base64,[A-Za-z0-9+/=]{40,}/,
269
+ severity: "medium",
270
+ },
271
+ {
272
+ id: "smuggled-system-block",
273
+ pattern: /^[ \t]*system:[ \t]*\n[\s\S]{0,400}\n[ \t]*human:/im,
274
+ severity: "high",
275
+ },
276
+ {
277
+ id: "fake-user-injection",
278
+ pattern: /^[ \t]*(?:User|Human|USER):[ \t]*[^\n]{1,200}\n[ \t]*(?:Assistant|System|SYSTEM):/m,
279
+ severity: "high",
280
+ },
281
+ {
282
+ id: "unicode-tag-spoof",
283
+ pattern: /[\u{E0001}\u{E0020}-\u{E007F}]+/u,
284
+ severity: "high",
285
+ description: "Unicode tag block (often used for hidden prompts)",
286
+ },
287
+ {
288
+ id: "rtl-override",
289
+ pattern: /[‮‭‎‏؜]/,
290
+ severity: "medium",
291
+ description: "Bidi/RTL override character — common in homoglyph attacks",
292
+ },
293
+ {
294
+ id: "instruct-then-deny",
295
+ pattern: /\b(?:do not|don'?t)\s+(?:tell|inform|reveal|warn)\s+(?:the )?(?:user|operator|caller)\b/i,
296
+ severity: "high",
297
+ },
298
+ {
299
+ id: "silently-execute",
300
+ pattern: /\bsilently\s+(?:execute|run|perform|invoke)\b/i,
301
+ severity: "high",
302
+ },
303
+ {
304
+ id: "prompt-leak-by-name",
305
+ pattern: /\b(?:print|repeat|output)\s+(?:everything|all of)\s+(?:above|before this line)\b/i,
306
+ severity: "high",
307
+ },
308
+ {
309
+ id: "tool-arg-injection",
310
+ pattern: /\b(?:set|use|pass)\s+(?:the\s+)?(?:argument|parameter|flag)\s+["']?--?\w+["']?\s*=\s*["'][^"']*\$\(/i,
311
+ severity: "medium",
312
+ },
313
+ {
314
+ id: "fetch-attacker-url",
315
+ pattern: /\b(?:please|now|then)?\s*(?:fetch|GET|POST|download)\s+(?:from\s+)?(?:https?:\/\/(?:[a-z0-9-]+\.)?(?:burpcollaborator|requestcatcher|webhook\.site|requestbin|interactsh|oast|ngrok|attacker)[a-z0-9.-]*)/i,
316
+ severity: "high",
317
+ },
318
+ {
319
+ id: "exfil-base64",
320
+ pattern: /\bbase64\s+(?:encode|the\s+(?:secrets?|env|keys?))\b[\s\S]{0,40}(?:then\s+)?(?:send|post|upload|cat)/i,
321
+ severity: "high",
322
+ },
323
+ {
324
+ id: "shutdown-host",
325
+ pattern: /\b(?:shutdown|halt|poweroff|reboot)\s+(?:-h\s+now|now|the host|the (?:server|machine))\b/i,
326
+ severity: "medium",
327
+ },
328
+ {
329
+ id: "mkfs-format",
330
+ pattern: /\bmkfs\.(?:ext\d|xfs|btrfs|vfat)\s+\/dev\//i,
331
+ severity: "high",
332
+ },
333
+ {
334
+ id: "dd-of-dev",
335
+ pattern: /\bdd\s+if=\S+\s+of=\/dev\/(?:sd[a-z]|nvme\d+n\d+|disk\d+)/i,
336
+ severity: "high",
337
+ },
338
+ {
339
+ id: "iptables-flush",
340
+ pattern: /\biptables\s+-F(?:\s|$)/i,
341
+ severity: "medium",
342
+ },
343
+ {
344
+ id: "history-clear",
345
+ pattern: /\b(?:history\s+-c|unset\s+HISTFILE|export\s+HISTFILE=\/dev\/null)\b/i,
346
+ severity: "medium",
347
+ },
348
+ {
349
+ id: "tool-run-in-background",
350
+ pattern: /\b(?:nohup|setsid|disown).*?(?:rm|curl|wget|nc|bash)\b/i,
351
+ severity: "medium",
352
+ },
353
+ {
354
+ id: "encoded-injection-marker",
355
+ pattern: /(?:%69gnore|%69%67%6e%6f%72%65)\s+(?:%70revious|%70%72%65%76%69%6f%75%73)/i,
356
+ severity: "medium",
357
+ },
358
+ {
359
+ id: "instruction-tag-payload",
360
+ pattern: /<\s*INSTRUCTIONS?\s*>[\s\S]{0,800}<\s*\/\s*INSTRUCTIONS?\s*>/i,
361
+ severity: "medium",
362
+ },
363
+ {
364
+ id: "markdown-instruction-block",
365
+ pattern: /^[\s\S]{1,400}?^>+[ \t]*(?:You are|Ignore|Disregard|Forget|From now on)/im,
366
+ severity: "low",
367
+ },
368
+ {
369
+ id: "imperative-link",
370
+ pattern: /\[click here\]\(\s*(?:javascript:|data:|file:)/i,
371
+ severity: "medium",
372
+ },
373
+ ];
374
/** Smallest number of rules the regex corpus is allowed to contain. */
const MIN_CORPUS_RULES = 50;
/**
 * Corpus-size guard, run at module load.
 *
 * Throws when `rules` holds fewer than MIN_CORPUS_RULES entries so that an
 * accidentally trimmed corpus is noticed immediately rather than quietly
 * weakening detection. Kept as a standalone function (and re-exported via
 * `__internals`) so the failure path can be tested with a synthetic list.
 *
 * @param {ArrayLike<unknown>} rules - rule list whose length is checked.
 * @throws {Error} when the list is shorter than the minimum.
 */
function assertCorpusFloor(rules) {
    const count = rules.length;
    const belowFloor = count < MIN_CORPUS_RULES;
    if (!belowFloor) {
        return;
    }
    throw new Error(`prompt-injection-detector regex corpus has ${count} rules; minimum is ${MIN_CORPUS_RULES}`);
}
386
+ assertCorpusFloor(REGEX_RULES);
387
/**
 * Look up the score weight for a severity tag.
 *
 * An unrecognised tag (for example on a rule appended to the exported corpus
 * by downstream code) is treated as "high". Returning `undefined` here would
 * turn the aggregate score into NaN, and NaN compares false against every
 * threshold, which would silently classify the text as "clean".
 *
 * @param {string} s - severity tag ("low" | "medium" | "high").
 * @returns {number} weight in [0, 1].
 */
function severityWeight(s) {
    const weight = SEVERITY_WEIGHT[s];
    // typeof guard also rejects inherited members such as "constructor".
    return typeof weight === "number" ? weight : SEVERITY_WEIGHT.high;
}
390
/**
 * Run every rule's pattern against `text` and report at most one hit per rule.
 *
 * Patterns carrying the `g` or `y` flag are stateful: `exec` starts at
 * `lastIndex`. Because the corpus is exported, a pattern may arrive here with
 * a stale `lastIndex`, so it is rewound before matching (otherwise the search
 * would start mid-string and could miss a match) and again afterwards so the
 * shared RegExp object is left in a clean state.
 *
 * @param {string} text - text to scan; spans are offsets into this string.
 * @param {Array<{id: string, pattern: RegExp, severity: string}>} [rules]
 *   rules to apply; defaults to the module corpus.
 * @returns {Array<{rule: string, span: [number, number], severity: string, layer: "regex"}>}
 */
function regexHits(text, rules = REGEX_RULES) {
    const hits = [];
    for (const rule of rules) {
        const stateful = rule.pattern.global || rule.pattern.sticky;
        if (stateful) {
            rule.pattern.lastIndex = 0;
        }
        const m = rule.pattern.exec(text);
        if (stateful) {
            // reset lastIndex so we don't surprise downstream consumers
            rule.pattern.lastIndex = 0;
        }
        if (m === null) {
            continue;
        }
        hits.push({
            rule: rule.id,
            span: [m.index, m.index + m[0].length],
            severity: rule.severity,
            layer: "regex",
        });
    }
    return hits;
}
411
/**
 * Layer 2: structural heuristics over the raw (un-normalized) text.
 *
 * Emits at most one hit per heuristic:
 *   - structural-bom                 (low)    text opens with U+FEFF
 *   - structural-role-cluster        (medium) two role markers on adjacent lines
 *   - structural-trailing-imperative (medium) last line of the tail starts with an imperative verb
 *   - structural-suspicious-base64   (medium) long base64 blob preceded by a decode/run-style keyword
 *   - structural-url-exfil-pair      (medium) URL followed on the same line by a credential keyword
 *
 * @param {string} text - text to inspect; spans are offsets into this string.
 * @returns {Array<{rule: string, span: [number, number], severity: string, layer: "structural"}>}
 */
function structuralHits(text) {
    const hits = [];
    // BOM tampering — text shouldn't open with a BOM unless it's at the
    // start of a UTF document. Tool outputs almost never legitimately do.
    // The character is spelled as an escape: an invisible literal is easily
    // lost by editors/tooling, and startsWith("") is true for every string.
    if (text.startsWith("\uFEFF")) {
        hits.push({ rule: "structural-bom", span: [0, 1], severity: "low", layer: "structural" });
    }
    // Role-marker injection beyond the ones the regex layer already matches.
    // A cheap structural variant: "role:\nrole:" cluster on adjacent lines.
    const roleClusterRe = /(?:^|\n)[ \t]*(?:system|assistant|user|human)[ \t]*:[^\n]*\n[ \t]*(?:system|assistant|user|human)[ \t]*:/i;
    const role = roleClusterRe.exec(text);
    if (role) {
        hits.push({
            rule: "structural-role-cluster",
            span: [role.index, role.index + role[0].length],
            severity: "medium",
            layer: "structural",
        });
    }
    // Trailing imperative block: if the final line within the last 350 chars
    // opens with an imperative verb, score it as suspicious. This catches
    // innocuous-looking tool output that ends with "Now run X".
    const tailStart = Math.max(0, text.length - 350);
    const tail = text.slice(tailStart);
    const tailImperative = /(?:^|\n)[ \t]*(?:now |then |finally )?(?:please[ \t]+)?(?:run|execute|fetch|delete|remove|email|upload|send|forward|leak|exfil(?:trate)?|shutdown|kill|chmod|chown|sudo)\b[^\n]{0,200}$/i;
    const t = tailImperative.exec(tail);
    if (t) {
        hits.push({
            rule: "structural-trailing-imperative",
            // Offsets are translated from the tail slice back to `text`.
            span: [tailStart + t.index, tailStart + t.index + t[0].length],
            severity: "medium",
            layer: "structural",
        });
    }
    // Long base64 with imperative neighbour — common smuggling shape.
    // Several blobs are examined (bounded) so a harmless leading blob cannot
    // shield a later one; the first blob with a suspicious neighbour wins.
    const MAX_B64_BLOBS = 8;
    let examined = 0;
    for (const b64 of text.matchAll(/[A-Za-z0-9+/]{120,}={0,2}/g)) {
        // Only the 80 characters immediately before the blob are considered.
        const ctxStart = Math.max(0, b64.index - 80);
        const ctx = text.slice(ctxStart, b64.index);
        if (/(decode|run|execute|eval|payload|shell)/i.test(ctx)) {
            hits.push({
                rule: "structural-suspicious-base64",
                span: [b64.index, b64.index + b64[0].length],
                severity: "medium",
                layer: "structural",
            });
            break;
        }
        examined += 1;
        if (examined === MAX_B64_BLOBS) {
            break;
        }
    }
    // Smuggled URL with credential exfil pattern in the same line.
    const urlExfil = /(?:https?:\/\/[^\s)]+)[^\n]{0,80}\b(?:token|secret|api[_\s-]?key|cookie|session)\b/i.exec(text);
    if (urlExfil) {
        hits.push({
            rule: "structural-url-exfil-pair",
            span: [urlExfil.index, urlExfil.index + urlExfil[0].length],
            severity: "medium",
            layer: "structural",
        });
    }
    return hits;
}
471
/**
 * Fold the per-hit severity weights into one score in [0, 1].
 *
 * Uses a probabilistic OR: starting from 0, each hit with weight w moves the
 * running score p to 1 - (1 - p) * (1 - w), so additional hits can only raise
 * it. An empty hit list yields 0.
 *
 * @param {Array<{severity: string}>} hits - hits from any layer.
 * @returns {number} combined score, capped at 1.
 */
function aggregateScore(hits) {
    const combined = hits.reduce(
        (running, hit) => 1 - (1 - running) * (1 - severityWeight(hit.severity)),
        0,
    );
    return Math.min(1, combined);
}
480
/**
 * Map a numeric score onto a classification label.
 *
 * Both bounds are inclusive lower bounds: a score equal to
 * `threshold.malicious` is "malicious", one equal to `threshold.suspicious`
 * is "suspicious". Anything that satisfies neither comparison is "clean".
 *
 * @param {number} score - aggregate score.
 * @param {{suspicious: number, malicious: number}} threshold - band bounds.
 * @returns {"clean" | "suspicious" | "malicious"}
 */
function classify(score, threshold) {
    const reachesMalicious = score >= threshold.malicious;
    if (reachesMalicious) {
        return "malicious";
    }
    return score >= threshold.suspicious ? "suspicious" : "clean";
}
487
/**
 * Replace every character that has an entry in HOMOGLYPHS with its mapped
 * ASCII letter; all other characters pass through unchanged. Iteration is by
 * code point, so surrogate pairs are kept intact.
 *
 * @param {string} s - text to fold.
 * @returns {string} folded text.
 */
function foldHomoglyphs(s) {
    return Array.from(s, (ch) => HOMOGLYPHS[ch] ?? ch).join("");
}
493
/**
 * Build the canonical "match view" that the keyword rules are run against.
 *
 * Steps, in order:
 *   1. NFKC normalization (folds full-width / compatibility forms).
 *   2. Removal of every character matched by INVISIBLE_RE.
 *   3. Homoglyph folding to ASCII via foldHomoglyphs.
 *   4. Collapse of each whitespace run to one space, so rules that anchor on
 *      a literal space match "ignore\n\nprevious" and "ignore previous"
 *      alike (#143).
 *
 * @param {string} text - raw text.
 * @returns {string} normalized text.
 */
function normalizeForMatch(text) {
    const compatFolded = text.normalize("NFKC");
    const withoutInvisible = compatFolded.replace(INVISIBLE_RE, "");
    const asciiFolded = foldHomoglyphs(withoutInvisible);
    return asciiFolded.replace(/\s+/g, " ");
}
504
/**
 * Heuristic used to decide whether a decoded blob looks like text.
 *
 * Counts UTF-16 code units that are tab, LF, CR, or in the ASCII range
 * 32..126, and returns true when they make up strictly more than 85% of the
 * string. The empty string is never considered printable.
 *
 * @param {string} s - candidate text.
 * @returns {boolean}
 */
function isMostlyPrintable(s) {
    const total = s.length;
    if (total === 0) {
        return false;
    }
    let count = 0;
    for (let i = 0; i < total; i += 1) {
        const code = s.charCodeAt(i);
        const isLayoutControl = code === 9 || code === 10 || code === 13;
        const isVisibleAscii = code >= 32 && code < 127;
        if (isLayoutControl || isVisibleAscii) {
            count += 1;
        }
    }
    return count / total > 0.85;
}
515
/**
 * Attempt to decode `blob` as base64 into UTF-8 text.
 *
 * Blobs shorter than 16 characters, or whose length leaves a remainder of 1
 * when divided by 4, are rejected up front. The decoded string is returned
 * only when isMostlyPrintable accepts it; any failure yields `undefined`.
 *
 * @param {string} blob - candidate base64 run.
 * @returns {string | undefined}
 */
function tryDecodeBase64(blob) {
    const tooShort = blob.length < 16;
    const impossibleLength = blob.length % 4 === 1;
    if (tooShort || impossibleLength) {
        return undefined;
    }
    try {
        const cleartext = Buffer.from(blob, "base64").toString("utf8");
        if (isMostlyPrintable(cleartext)) {
            return cleartext;
        }
        return undefined;
    }
    catch {
        return undefined;
    }
}
526
/**
 * Attempt to decode `blob` as hexadecimal into UTF-8 text.
 *
 * Blobs shorter than 16 characters or of odd length are rejected up front.
 * The decoded string is returned only when isMostlyPrintable accepts it; any
 * failure yields `undefined`.
 *
 * @param {string} blob - candidate hex run.
 * @returns {string | undefined}
 */
function tryDecodeHex(blob) {
    const hasWholeBytes = blob.length % 2 === 0;
    if (blob.length < 16 || !hasWholeBytes) {
        return undefined;
    }
    try {
        const cleartext = Buffer.from(blob, "hex").toString("utf8");
        if (isMostlyPrintable(cleartext)) {
            return cleartext;
        }
        return undefined;
    }
    catch {
        return undefined;
    }
}
537
/**
 * Percent-decode `text`, returning the decoded string only if it differs
 * from the input.
 *
 * Each maximal run of `%XX` escapes is decoded on its own. Decoding the whole
 * string in one `decodeURIComponent` call throws on any stray "%" (as in
 * "100% sure"), which would let an attacker switch this decoder off by adding
 * one. A run that is not valid UTF-8 is decoded lossily (invalid bytes become
 * U+FFFD) so that valid bytes sharing the run still surface in cleartext.
 *
 * @param {string} text - text that may contain percent-escapes.
 * @returns {string | undefined} decoded text, or `undefined` when nothing
 *   was decoded or the input could not be processed.
 */
function tryDecodePercent(text) {
    const decodeRun = (run) => {
        try {
            return decodeURIComponent(run);
        }
        catch {
            // Not valid UTF-8 as a whole: fall back to a byte-wise decode.
            return Buffer.from(run.replace(/%/g, ""), "hex").toString("utf8");
        }
    };
    try {
        const decoded = text.replace(/(?:%[0-9A-Fa-f]{2})+/g, decodeRun);
        return decoded !== text ? decoded : undefined;
    }
    catch {
        // e.g. a non-string argument; treat as "nothing to decode".
        return undefined;
    }
}
546
/**
 * Recursively decode base64 / hex / percent-encoded blobs so an injection
 * hidden in an encoded payload is rescanned in cleartext, regardless of
 * neighbouring keywords (#143).
 *
 * Bounds: at most 8 base64 candidates and 8 hex candidates are examined per
 * level, recursion stops when `depth` reaches 0, and at most 16 variants are
 * returned. Matches are consumed lazily from the `matchAll` iterator, so the
 * scan stops after the eighth candidate instead of first enumerating every
 * match in the text.
 *
 * @param {string} text - text to search for encoded blobs.
 * @param {number} [depth=2] - remaining levels of nested decoding.
 * @returns {string[]} decoded cleartext variants, each followed by its own
 *   nested variants.
 */
function decodedVariants(text, depth = 2) {
    if (depth <= 0 || text.length === 0)
        return [];
    const MAX_BLOBS_PER_ENCODING = 8;
    const MAX_VARIANTS = 16;
    const out = [];
    const push = (s) => {
        if (s !== undefined && s.length > 0)
            out.push(s, ...decodedVariants(s, depth - 1));
    };
    // Try `decode` on the first MAX_BLOBS_PER_ENCODING matches of `blobRe`.
    const scan = (blobRe, decode) => {
        let examined = 0;
        for (const m of text.matchAll(blobRe)) {
            push(decode(m[0]));
            examined += 1;
            if (examined === MAX_BLOBS_PER_ENCODING)
                break;
        }
    };
    scan(/[A-Za-z0-9+/]{16,}={0,2}/g, tryDecodeBase64);
    scan(/(?:[0-9A-Fa-f]{2}){8,}/g, tryDecodeHex);
    // Percent-decoding is attempted once, over the whole text.
    if (/%[0-9A-Fa-f]{2}/.test(text))
        push(tryDecodePercent(text));
    return out.slice(0, MAX_VARIANTS);
}
570
/**
 * Classify a tool output. Pure with respect to the input string when
 * the LLM classifier is not supplied.
 *
 * Pipeline:
 *   1. Inputs longer than MAX_CLASSIFY_LEN are reduced to head + "\n" + tail.
 *   2. Regex rules run over the raw text, its normalized match view, every
 *      decoded variant, and the normalized view of every decoded variant;
 *      the first hit per rule id is kept.
 *   3. Structural heuristics run over the raw (bounded) text.
 *   4. Hits are folded into a score and mapped to a classification.
 *   5. If `opts.llmClassifier` is supplied, its verdict may only escalate the
 *      result. Errors from the callback are swallowed, so the heuristic
 *      verdict stands when the model is unavailable.
 *
 * @param {string} text - tool output to classify.
 * @param {object} [opts]
 * @param {{suspicious?: number, malicious?: number}} [opts.thresholds]
 *   overrides for the default band bounds.
 * @param {(text: string) => Promise<{verdict: string} | undefined>} [opts.llmClassifier]
 *   optional model-backed classifier; receives the bounded text.
 * @returns {Promise<{classification: "clean" | "suspicious" | "malicious", score: number, hits: object[]}>}
 */
export async function classifyText(text, opts = {}) {
    const threshold = {
        suspicious: opts.thresholds?.suspicious ?? SCORE_SUSPICIOUS,
        malicious: opts.thresholds?.malicious ?? SCORE_MALICIOUS,
    };
    if (text === "") {
        return { classification: "clean", score: 0, hits: [] };
    }
    // Bound the work the regex/structural layers do so a pathological input
    // can't wedge the classifier (#153). Keep head + tail so leading and
    // trailing injections both stay in view.
    const analyzed = text.length > MAX_CLASSIFY_LEN
        ? `${text.slice(0, MAX_CLASSIFY_LEN / 2)}\n${text.slice(-MAX_CLASSIFY_LEN / 2)}`
        : text;
    // De-obfuscate into match views so the keyword rules can't be dodged with
    // full-width characters, zero-width splits, homoglyphs, whitespace tricks,
    // or base64/percent/hex encoding (#143). Structural rules run on the raw
    // (bounded) text; regex rules run on every variant, deduped by rule id.
    const variants = [analyzed, normalizeForMatch(analyzed)];
    for (const decoded of decodedVariants(analyzed)) {
        // Decoded cleartext gets its own normalized view as well; otherwise a
        // payload that is both encoded and obfuscated (e.g. base64 of
        // "ignore\nprevious instructions") would slip past the keyword rules.
        variants.push(decoded, normalizeForMatch(decoded));
    }
    const regHits = [];
    const seenRules = new Set();
    for (const variant of variants) {
        for (const h of regexHits(variant)) {
            if (seenRules.has(h.rule))
                continue;
            seenRules.add(h.rule);
            // Note: the span is relative to the variant that produced the hit.
            regHits.push(h);
        }
    }
    const hits = [...regHits, ...structuralHits(analyzed)];
    let score = aggregateScore(hits);
    let classification = classify(score, threshold);
    if (opts.llmClassifier !== undefined) {
        try {
            const verdict = await opts.llmClassifier(analyzed);
            if (verdict !== undefined) {
                if (verdict.verdict === "malicious") {
                    classification = "malicious";
                    // Lift the score into the malicious band if it is below it.
                    score = Math.max(score, threshold.malicious);
                    hits.push({
                        rule: "llm-malicious",
                        span: [0, Math.min(text.length, 200)],
                        severity: "high",
                        layer: "llm",
                    });
                }
                else if (verdict.verdict === "suspicious" && classification === "clean") {
                    // A "suspicious" verdict only upgrades a clean result.
                    classification = "suspicious";
                    score = Math.max(score, threshold.suspicious);
                    hits.push({
                        rule: "llm-suspicious",
                        span: [0, Math.min(text.length, 200)],
                        severity: "medium",
                        layer: "llm",
                    });
                }
            }
        }
        catch {
            // LLM tier is best-effort; swallow so a model outage doesn't
            // block tool execution.
        }
    }
    return { classification, score, hits };
}
639
/**
 * Produce the placeholder string that stands in for a redacted tool output.
 *
 * The notice lists the distinct rule ids that fired, in first-seen order and
 * capped at six, so an auditor can later trace why the output was withheld.
 *
 * @param {Array<{rule: string}>} hits - hits that triggered the redaction.
 * @returns {string} single-line redaction notice.
 */
export function buildRedactionNotice(hits) {
    const distinct = new Set();
    for (const hit of hits) {
        distinct.add(hit.rule);
    }
    const listed = Array.from(distinct).slice(0, 6).join(", ");
    return `[tool output redacted: prompt injection detected: ${listed}]`;
}
648
/**
 * Report whether the environment asks for the LLM classifier tier.
 *
 * True exactly when `CREWHAUS_PI_CLASSIFIER_MODEL` is present in `env` and
 * contains something other than whitespace.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - environment map.
 * @returns {boolean}
 */
export function llmClassifierEnabled(env = process.env) {
    const model = env["CREWHAUS_PI_CLASSIFIER_MODEL"];
    if (model === undefined) {
        return false;
    }
    return model.trim() !== "";
}
655
/**
 * Internal seams exposed ONLY for unit tests. Not part of the public API and
 * not subject to semver — these let the test suite drive the module's
 * defensive branches (corpus-floor guard, global-flag `lastIndex` reset, and
 * the decoder `try/catch` fallbacks) with crafted inputs that the public
 * `classifyText` entrypoint can never construct on its own. Do not import
 * from application code.
 *
 * Members: the corpus-size guard and its minimum, the regex-layer scanner
 * (which accepts a custom rule list as its second argument), and the three
 * single-blob decoders.
 */
export const __internals = {
    assertCorpusFloor,
    regexHits,
    tryDecodeBase64,
    tryDecodeHex,
    tryDecodePercent,
    MIN_CORPUS_RULES,
};
package/package.json CHANGED
@@ -1,18 +1,21 @@
1
1
  {
2
2
  "name": "@crewhaus/prompt-injection-detector",
3
- "version": "0.1.3",
3
+ "version": "0.1.5",
4
4
  "type": "module",
5
5
  "description": "Cross-cutting safety classifier for tool outputs (regex + structural heuristics + optional LLM tier)",
6
- "main": "src/index.ts",
7
- "types": "src/index.ts",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
8
  "exports": {
9
- ".": "./src/index.ts"
9
+ ".": {
10
+ "types": "./dist/index.d.ts",
11
+ "import": "./dist/index.js"
12
+ }
10
13
  },
11
14
  "scripts": {
12
15
  "test": "bun test src"
13
16
  },
14
17
  "dependencies": {
15
- "@crewhaus/errors": "0.1.3"
18
+ "@crewhaus/errors": "0.1.5"
16
19
  },
17
20
  "license": "Apache-2.0",
18
21
  "author": {
@@ -32,5 +35,5 @@
32
35
  "publishConfig": {
33
36
  "access": "public"
34
37
  },
35
- "files": ["src", "README.md", "LICENSE", "NOTICE"]
38
+ "files": ["dist", "README.md", "LICENSE", "NOTICE"]
36
39
  }