agent-security-scanner-mcp 3.5.2 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/generic_ast.py +7 -2
- package/package.json +3 -2
- package/rules/prompt-injection.security.yaml +68 -0
- package/src/tools/garak-bridge.js +209 -0
- package/src/tools/scan-prompt.js +528 -84
- package/taint_analyzer.py +516 -11
package/src/tools/scan-prompt.js
CHANGED
|
@@ -4,6 +4,7 @@ import { readFileSync, existsSync } from "fs";
|
|
|
4
4
|
import { dirname, join } from "path";
|
|
5
5
|
import { fileURLToPath } from "url";
|
|
6
6
|
import { createHash } from "crypto";
|
|
7
|
+
import { runGarakProbes } from './garak-bridge.js';
|
|
7
8
|
|
|
8
9
|
// Handle both ESM and CJS bundling
|
|
9
10
|
let __dirname;
|
|
@@ -49,6 +50,76 @@ const CONFIDENCE_MULTIPLIERS = {
|
|
|
49
50
|
"LOW": 0.4
|
|
50
51
|
};
|
|
51
52
|
|
|
53
|
+
// Category co-occurrence matrix: category pairs that, when seen together,
// signal a more sophisticated (composite) attack.
// Inspired by PromptFoo's jailbreak:composite strategy.
// Keys are "<categoryA>+<categoryB>"; lookups try both orderings.
const CATEGORY_COOCCURRENCE_BOOSTS = {
  'obfuscation+exfiltration': 0.20,
  'obfuscation+malicious-injection': 0.20,
  'obfuscation+prompt-injection-content': 0.15,
  'obfuscation+prompt-injection-jailbreak': 0.15,
  'social-engineering+exfiltration': 0.15,
  'social-engineering+malicious-injection': 0.15,
  'prompt-injection-encoded+prompt-injection-content': 0.20,
  'prompt-injection-multi-turn+prompt-injection-content': 0.15,
  'prompt-injection-jailbreak+exfiltration': 0.25,
  'prompt-injection-jailbreak+prompt-injection-content': 0.15,
  'agent-manipulation+exfiltration': 0.20,
  'agent-manipulation+system-manipulation': 0.15,
};

/**
 * Sum the co-occurrence boosts for every unordered pair of categories.
 *
 * @param {Iterable<string>} categories - Categories seen in the findings
 *   (the visible caller passes a collection of unique categories).
 * @returns {number} Total boost as a fraction, capped at 0.40.
 */
function getCategoryCooccurrenceBoost(categories) {
  const list = Array.from(categories);
  let total = 0;

  list.forEach((first, idx) => {
    // Only look at later entries so each unordered pair is visited once.
    for (const second of list.slice(idx + 1)) {
      total += CATEGORY_COOCCURRENCE_BOOSTS[`${first}+${second}`]
        || CATEGORY_COOCCURRENCE_BOOSTS[`${second}+${first}`]
        || 0;
    }
  });

  // Cap total co-occurrence boost at 40%
  return Math.min(0.40, total);
}
|
|
83
|
+
|
|
84
|
+
// Orthogonal scoring channel: measures attack breadth (how many distinct
// attack dimensions are present). Only each finding's category is read, so
// the result does not depend on per-rule confidence values.
const ATTACK_DIMENSION_BY_CATEGORY = new Map([
  ['exfiltration', 'extraction'],
  ['prompt-injection-extraction', 'extraction'],
  ['prompt-injection-output', 'extraction'],
  ['malicious-injection', 'code-execution'],
  ['system-manipulation', 'code-execution'],
  ['obfuscation', 'evasion'],
  ['prompt-injection-encoded', 'evasion'],
  ['social-engineering', 'social'],
  ['prompt-injection-jailbreak', 'social'],
  ['prompt-injection-content', 'injection'],
  ['prompt-injection-context', 'injection'],
  ['prompt-injection-delimiter', 'injection'],
  ['prompt-injection-multi-turn', 'persistence'],
  ['agent-manipulation', 'privilege'],
  ['prompt-injection-privilege', 'privilege'],
]);

/**
 * Compute a flat bonus from the number of distinct attack dimensions hit.
 *
 * @param {Iterable<{category?: string}>} findings - Scanner findings.
 * @returns {number} 0 for at most one dimension, 10 for two, 25 for three,
 *   40 for four or more.
 */
function calculateOrthogonalScore(findings) {
  const hitDimensions = new Set();

  for (const finding of findings) {
    // Findings without a category fall into 'unknown', which maps to no dimension.
    const dimension = ATTACK_DIMENSION_BY_CATEGORY.get(finding.category || 'unknown');
    if (dimension !== undefined) {
      hitDimensions.add(dimension);
    }
  }

  // Index = number of dimensions triggered; anything past the table is 4+.
  const bonusByDimensionCount = [0, 0, 10, 25];
  return bonusByDimensionCount[hitDimensions.size] ?? 40;
}
|
|
122
|
+
|
|
52
123
|
// Load agent attack rules from YAML
|
|
53
124
|
function loadAgentAttackRules() {
|
|
54
125
|
try {
|
|
@@ -194,6 +265,7 @@ function calculateRiskScore(findings, context) {
|
|
|
194
265
|
if (findings.length === 0) return 0;
|
|
195
266
|
|
|
196
267
|
let totalScore = 0;
|
|
268
|
+
const lowConfidenceCount = findings.filter(f => (f.confidence || 'MEDIUM') === 'LOW').length;
|
|
197
269
|
|
|
198
270
|
for (const finding of findings) {
|
|
199
271
|
const riskScore = parseInt(finding.risk_score) || 50;
|
|
@@ -226,8 +298,24 @@ function calculateRiskScore(findings, context) {
|
|
|
226
298
|
|
|
227
299
|
// Per-finding boost (smaller than before)
|
|
228
300
|
avgScore = avgScore * (1 + (findings.length - 1) * 0.05);
|
|
301
|
+
|
|
302
|
+
// Low-signal accumulation — multiple LOW-confidence findings compound
|
|
303
|
+
// Catches threshold gaming with many weak signals (PromptFoo composite strategy)
|
|
304
|
+
if (lowConfidenceCount >= 2) {
|
|
305
|
+
avgScore = avgScore * (1 + lowConfidenceCount * 0.08);
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// Category co-occurrence boost for suspicious pairs
|
|
309
|
+
const cooccurrenceBoost = getCategoryCooccurrenceBoost(uniqueCategories);
|
|
310
|
+
if (cooccurrenceBoost > 0) {
|
|
311
|
+
avgScore = avgScore * (1 + cooccurrenceBoost);
|
|
312
|
+
}
|
|
229
313
|
}
|
|
230
314
|
|
|
315
|
+
// Add orthogonal score as a flat bonus (independent of per-rule confidence)
|
|
316
|
+
const orthogonalBonus = calculateOrthogonalScore(findings);
|
|
317
|
+
avgScore = avgScore + orthogonalBonus;
|
|
318
|
+
|
|
231
319
|
avgScore = Math.min(100, avgScore);
|
|
232
320
|
|
|
233
321
|
// Apply sensitivity adjustment (wider spread for meaningful impact)
|
|
@@ -360,6 +448,397 @@ function hashPrompt(text) {
|
|
|
360
448
|
return createHash('sha256').update(text).digest('hex').substring(0, 16);
|
|
361
449
|
}
|
|
362
450
|
|
|
451
|
+
// ============================================================================
// TEXT NORMALIZATION PIPELINE (Garak Buff-inspired)
// Normalizes input to defeat homoglyph, invisible char, and Unicode bypasses
// ============================================================================

// Homoglyph map: Cyrillic and Greek lookalikes → ASCII
const HOMOGLYPH_MAP = {
  // Cyrillic lowercase → Latin
  '\u0430': 'a', // а → a
  '\u0435': 'e', // е → e
  '\u043E': 'o', // о → o
  '\u0440': 'p', // р → p
  '\u0441': 'c', // с → c
  '\u0443': 'y', // у → y (visual match to y)
  '\u0445': 'x', // х → x
  '\u0456': 'i', // і → i
  '\u04BB': 'h', // һ → h
  '\u0455': 's', // ѕ → s
  '\u0458': 'j', // ј → j
  '\u043D': 'n', // н → n (Cyrillic en looks like n in some fonts)
  // Cyrillic uppercase → Latin
  '\u0410': 'A', // А → A
  '\u0412': 'B', // В → B
  '\u0415': 'E', // Е → E
  '\u041A': 'K', // К → K
  '\u041C': 'M', // М → M
  '\u041D': 'H', // Н → H
  '\u041E': 'O', // О → O
  '\u0420': 'P', // Р → P
  '\u0421': 'C', // С → C
  '\u0422': 'T', // Т → T
  '\u0425': 'X', // Х → X
  '\u0406': 'I', // І → I
  // Greek lowercase → Latin
  '\u03B1': 'a', // α → a
  '\u03B5': 'e', // ε → e
  '\u03BF': 'o', // ο → o
  '\u03C1': 'p', // ρ → p
  '\u03BA': 'k', // κ → k
  '\u03BD': 'v', // ν → v
  // Greek uppercase → Latin
  '\u0391': 'A', // Α → A
  '\u0392': 'B', // Β → B
  '\u0395': 'E', // Ε → E
  '\u0397': 'H', // Η → H
  '\u0399': 'I', // Ι → I
  '\u039A': 'K', // Κ → K
  '\u039C': 'M', // Μ → M
  '\u039D': 'N', // Ν → N
  '\u039F': 'O', // Ο → O
  '\u03A1': 'P', // Ρ → P
  '\u03A4': 'T', // Τ → T
  '\u03A7': 'X', // Χ → X
  '\u03A5': 'Y', // Υ → Y
  '\u0396': 'Z', // Ζ → Z
};

// Invisible/zero-width characters to strip (regex)
// Includes: soft hyphen, combining grapheme joiner, Arabic letter mark,
// hangul fillers, Mongolian vowel separator, zero-width chars,
// directional markers, word joiners, BOM, halfwidth hangul filler
const INVISIBLE_CHAR_REGEX = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\u3164\uFEFF\uFFA0]/gu;

// Zalgo combining diacritical marks to strip
const ZALGO_REGEX = /[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/g;

// Unicode tag characters (U+E0000-U+E007F) - used in invisible ASCII tag attacks.
// These are astral code points (surrogate pairs in UTF-16), so the `u` flag
// and \u{...} escapes are required for the range to match whole code points.
const TAG_CHAR_REGEX = /[\u{E0000}-\u{E007F}]/gu;

/**
 * Canonicalize text so that visually disguised payloads match ASCII rules.
 *
 * Note: combining marks in ZALGO_REGEX are removed even when they arrive as
 * part of a precomposed letter, so accented Latin/Greek/Cyrillic letters are
 * folded to their base letter ("é" → "e"). The result is meant for pattern
 * matching, not for display.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  // Step 1: NFKD (compatibility decomposition).
  // Folds fullwidth chars (\uFF49\uFF47\uFF4E\uFF4F\uFF52\uFF45 → ignore),
  // ligatures (\uFB01 → fi), superscripts, subscripts and circle-enclosed
  // chars, and — unlike NFKC — leaves accents as separate combining marks so
  // step 4 can remove them. Composing first (NFKC) would merge "i" + U+0301
  // into the single letter "í", which the combining-mark strip cannot see.
  let normalized = text.normalize('NFKD');

  // Step 2: Strip invisible Unicode characters
  normalized = normalized.replace(INVISIBLE_CHAR_REGEX, '');

  // Step 3: Strip Unicode tag characters
  normalized = normalized.replace(TAG_CHAR_REGEX, '');

  // Step 4: Strip Zalgo combining diacritical marks
  normalized = normalized.replace(ZALGO_REGEX, '');

  // Step 5: Recompose what is left. NFC after NFKD is equivalent to NFKC for
  // everything that was not stripped (e.g. Hangul jamo recombine into syllables).
  normalized = normalized.normalize('NFC');

  // Step 6: Homoglyph canonicalization
  // Replace each character through the map; unmapped chars pass through.
  // Array.from iterates by code point, so surrogate pairs are kept intact.
  normalized = Array.from(normalized, (ch) => HOMOGLYPH_MAP[ch] || ch).join('');

  // Step 7: Normalize Unicode whitespace to ASCII space
  // Includes: NBSP, en/em space, thin space, hair space, ideographic space, etc.
  normalized = normalized.replace(/[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]/g, ' ');

  return normalized;
}
|
|
547
|
+
|
|
548
|
+
/**
 * Extract the inner content of every code-like container found in the text.
 * Inspired by Garak latentinjection probes: attacks hide in document structures.
 *
 * Containers are processed in this fixed order (results are appended in the
 * same order): ``` fences, ~~~ fences, <code>, <pre>, HTML comments, CDATA.
 *
 * @param {string} text - Text to search.
 * @returns {string[]} Inner contents, one entry per container found.
 */
function extractCodeBlockContent(text) {
  const extracted = [];

  // Fenced blocks. The opening fence only swallows an info string (language
  // tag) when a newline follows it; otherwise a single-line fence such as
  // ```ignore all``` would lose its first word to the "language tag" strip.
  const fences = [
    { block: /```[\s\S]*?```/g, open: /^```(?:\w*\n)?/, close: /\n?```$/ },
    { block: /~~~[\s\S]*?~~~/g, open: /^~~~(?:\w*\n)?/, close: /\n?~~~$/ },
  ];
  for (const { block, open, close } of fences) {
    for (const fenced of text.match(block) || []) {
      extracted.push(fenced.replace(open, '').replace(close, ''));
    }
  }

  // Tag-like containers: capture group 1 is the inner content.
  const wrappers = [
    /<code[^>]*>([\s\S]*?)<\/code>/gi, // <code>content</code>
    /<pre[^>]*>([\s\S]*?)<\/pre>/gi, // <pre>content</pre>
    /<!--([\s\S]*?)-->/g, // <!-- content -->
    /<!\[CDATA\[([\s\S]*?)\]\]>/g, // <![CDATA[ content ]]>
  ];
  for (const wrapper of wrappers) {
    for (const [, inner] of text.matchAll(wrapper)) {
      extracted.push(inner);
    }
  }

  return extracted;
}
|
|
592
|
+
|
|
593
|
+
/**
 * Collapse string concatenation to defeat fragmentation attacks.
 * Inspired by PromptFoo's "token smuggling" and "payload splitting" attack classes.
 *
 * - Removes C-style block comments, including ones that span lines
 *   (ign/&#42;...&#42;/ore → ignore).
 * - Joins JS/Python string concatenation: "foo" + "bar" → "foobar"
 *   (double quotes, single quotes, backticks; whitespace and newlines
 *   around the `+` are allowed).
 *
 * @param {string} text - Text to collapse.
 * @returns {string} Text with concatenation seams and block comments removed.
 */
function collapseConcatenations(text) {
  // [\s\S] instead of `.` so a comment used as a separator may contain newlines.
  const stripBlockComments = (s) => s.replace(/\/\*[\s\S]*?\*\//g, '');

  // Strip comments first so that `"a" /* x */ + "b"` exposes a plain
  // quote-plus-quote seam for the next step.
  let collapsed = stripBlockComments(text);

  // closing-quote, optional whitespace, +, optional whitespace, opening-quote.
  // \s already matches newlines, so multi-line concatenation is covered here.
  collapsed = collapsed.replace(/["'`]\s*\+\s*["'`]/g, '');

  // Joining fragments can itself assemble a comment ("/*" + "*/"); strip again.
  return stripBlockComments(collapsed);
}
|
|
612
|
+
|
|
613
|
+
/**
 * Rescan decoded content against all rules.
 * Used by the decode cascade for each encoding type.
 *
 * The decoded text is normalized, then every rule's patterns are tried
 * case-insensitively. At most one finding is appended per rule (first
 * matching pattern wins).
 *
 * @param {string} decodedText - Text recovered by a decoder.
 * @param {Array<object>} allRules - Rules with `id`, `patterns`, `severity`,
 *   `message` and optional `metadata` ({category, confidence, risk_score, action}).
 * @param {Array<object>} findings - Output array; findings are pushed onto it.
 * @param {string} encodingLabel - Label used in the rule_id suffix and message
 *   (e.g. 'base64', 'hex').
 * @returns {void}
 */
function rescanDecoded(decodedText, allRules, findings, encodingLabel) {
  const normalized = normalizeText(decodedText);

  for (const rule of allRules) {
    // A rule without metadata still produces a finding, using the defaults below.
    const metadata = rule.metadata ?? {};

    for (const pattern of rule.patterns ?? []) {
      let regex;
      try {
        regex = new RegExp(pattern, 'i');
      } catch (e) {
        // Skip invalid regex. Only compilation is guarded, so unrelated
        // errors are no longer silently reported as "invalid pattern".
        continue;
      }

      const match = normalized.match(regex);
      if (!match) continue;

      findings.push({
        rule_id: rule.id + '.' + encodingLabel + '-decoded',
        category: metadata.category || 'obfuscation',
        severity: rule.severity,
        message: rule.message + ` (detected in ${encodingLabel}-decoded content)`,
        matched_text: match[0].substring(0, 100),
        confidence: metadata.confidence || 'MEDIUM',
        risk_score: metadata.risk_score || '50',
        action: metadata.action || 'WARN'
      });
      break; // One match per rule
    }
  }
}
|
|
641
|
+
|
|
642
|
+
/**
 * Helper: check if a decoded string is mostly printable ASCII.
 *
 * @param {string} str - Candidate decoded text.
 * @param {number} threshold - Fraction (0..1) that the printable share must
 *   strictly exceed.
 * @returns {boolean} True when the share of UTF-16 units in the range
 *   0x20..0x7E is greater than `threshold`; false for empty/missing input.
 */
function isPrintable(str, threshold) {
  if (!str) return false;
  if (str.length === 0) return false;

  // Count code units in the printable ASCII range (space through tilde).
  const printableUnits = str.match(/[\x20-\x7E]/g) || [];
  const ratio = printableUnits.length / str.length;
  return ratio > threshold;
}
|
|
651
|
+
|
|
652
|
+
/**
 * Multi-encoding decode cascade.
 * Inspired by Garak's 12+ encoding probes (InjectBase64, InjectHex, InjectROT13, etc.)
 * and PromptFoo's static encoding strategies.
 *
 * Tries base64 (plus one nested level), hex, URL percent-encoding and ROT13
 * on the text and rescans each decoded result via rescanDecoded(), which
 * appends to `findings`.
 *
 * @param {string} expandedText - Text to inspect for encoded payloads.
 * @param {Array<object>} allRules - Rules forwarded to rescanDecoded().
 * @param {Array<object>} findings - Output array, mutated in place.
 * @returns {void}
 */
function tryDecodeAndRescan(expandedText, allRules, findings) {
  // --- 1. Base64 (length threshold 20, printability threshold 0.55) ---
  const base64Regex = /[A-Za-z0-9+/]{20,}={0,2}/g;
  for (const b64str of (expandedText.match(base64Regex) || [])) {
    try {
      const decoded = Buffer.from(b64str, 'base64').toString('utf-8');
      if (decoded.length > 0 && isPrintable(decoded, 0.55)) {
        rescanDecoded(decoded, allRules, findings, 'base64');

        // --- 1b. Nested base64: decode again if inner content is also base64 ---
        const nestedB64 = decoded.match(/[A-Za-z0-9+/]{20,}={0,2}/g) || [];
        for (const nested of nestedB64) {
          try {
            const twice = Buffer.from(nested, 'base64').toString('utf-8');
            if (twice.length > 4 && isPrintable(twice, 0.55)) {
              rescanDecoded(twice, allRules, findings, 'base64-nested');
            }
          } catch (e) { /* best effort: skip this candidate */ }
        }
      }
    } catch (e) { /* best effort: skip this candidate */ }
  }

  // --- 2. Hex encoding: sequences of hex pairs (optionally space-separated) ---
  // Matches: "69676e6f7265" or "69 67 6e 6f 72 65"
  const hexRegex = /(?:[0-9a-fA-F]{2}[\s]?){8,}/g;
  for (const hexStr of (expandedText.match(hexRegex) || [])) {
    try {
      const clean = hexStr.replace(/\s/g, '');
      if (clean.length % 2 !== 0) continue;
      if (clean.length < 16) continue; // At least 8 bytes
      const decoded = Buffer.from(clean, 'hex').toString('utf-8');
      if (decoded.length > 4 && isPrintable(decoded, 0.7)) {
        rescanDecoded(decoded, allRules, findings, 'hex');
      }
    } catch (e) { /* best effort: skip this candidate */ }
  }

  // --- 3. URL encoding: %XX sequences (at least 3 encoded chars anywhere in text) ---
  const urlEncodedCount = (expandedText.match(/%[0-9a-fA-F]{2}/g) || []).length;
  if (urlEncodedCount >= 3) {
    // Decode each run of %XX escapes on its own. Calling decodeURIComponent on
    // the whole text throws URIError as soon as it contains one stray '%'
    // (e.g. "100% sure") or one malformed sequence, which would let a single
    // character switch off URL-decoding for the entire prompt.
    const decoded = expandedText.replace(/(?:%[0-9a-fA-F]{2})+/g, (run) => {
      try {
        return decodeURIComponent(run);
      } catch (e) {
        // Run is not valid UTF-8 as a whole: still decode its plain-ASCII bytes.
        return run.replace(/%([0-7][0-9a-fA-F])/g,
          (_, hex) => String.fromCharCode(Number.parseInt(hex, 16)));
      }
    });
    if (decoded !== expandedText) {
      try {
        rescanDecoded(decoded, allRules, findings, 'url-encoded');
      } catch (e) { /* best effort, as for the other decoders */ }
    }
  }

  // --- 4. ROT13: only when indicators present (user-approved decision) ---
  // This avoids false positives from ROT13-decoding normal text
  const rot13Indicators = /\b(rot13|rot-13|caesar|cipher|decode\s+this|decipher)\b/i;
  if (rot13Indicators.test(expandedText)) {
    const rot13Decoded = expandedText.replace(/[a-zA-Z]/g, ch => {
      const base = ch <= 'Z' ? 65 : 97;
      return String.fromCharCode(((ch.charCodeAt(0) - base + 13) % 26) + base);
    });
    if (rot13Decoded !== expandedText) {
      rescanDecoded(rot13Decoded, allRules, findings, 'rot13');
    }
  }
}
|
|
717
|
+
|
|
718
|
+
// Frame-setting patterns: conversational priming for later exploitation
// Inspired by PromptFoo's Crescendo strategy (Microsoft research)
const FRAME_SETTING_PATTERNS = [
  /remember\s+(this|that)\s+for\s+later/i,
  /in\s+my\s+next\s+message/i,
  /when\s+i\s+(say|ask|tell)\s+you/i,
  /from\s+now\s+on\s*,?\s+(you|your)\s+(will|must|should|are)/i,
  /starting\s+now\s*,?\s+you/i,
  /keep\s+this\s+in\s+mind/i,
  /for\s+the\s+(rest|remainder)\s+of\s+(this|our)\s+conversation/i,
  /act\s+as\s+(if|though)\s+you\s+(are|have)/i,
  /let'?s\s+play\s+a\s+(game|role)/i,
  /pretend\s+that\s+you\s+(are|have)/i,
  /in\s+all\s+(future|subsequent)\s+(messages?|responses?)/i,
  /always\s+respond\s+(by|with|as)/i,
];

/**
 * Multi-turn escalation detection.
 *
 * Produces up to three kinds of findings:
 *  - `<rule>.cross-turn`: a rule pattern matches only when all messages are
 *    joined, i.e. the payload is split across message boundaries;
 *  - `multi-turn.frame-setting`: previous messages contain priming phrases;
 *  - `multi-turn.escalation`: previous messages matched rules. The current
 *    message is deliberately not required to match, because the final
 *    "trigger" message may be benign ("yes, do it").
 *
 * @param {string[]} previousMessages - Earlier conversation messages.
 * @param {string} currentText - Current message text.
 * @param {Array<object>} allRules - Rules with `id`, `patterns` and optional
 *   `metadata.category`.
 * @returns {Array<object>} Findings; empty when there is no usable history.
 */
function detectMultiTurnEscalation(previousMessages, currentText, allRules) {
  const escalationFindings = [];

  if (!previousMessages || !Array.isArray(previousMessages) || previousMessages.length === 0) {
    return escalationFindings;
  }

  // Normalize every message once and compile every pattern once; both were
  // previously redone inside the nested rule/pattern/message loops.
  const normalizedPrevious = previousMessages.map((msg) => normalizeText(msg));
  const normalizedCurrent = normalizeText(currentText);
  const compiledRules = allRules.map((rule) => ({
    rule,
    regexes: (rule.patterns ?? []).flatMap((pattern) => {
      try {
        return [new RegExp(pattern, 'i')];
      } catch (e) {
        return []; // skip invalid regex
      }
    }),
  }));

  // Step 1: Scan ALL previous messages (no early break)
  let frameSettingCount = 0;
  const prevMatchedRuleIds = new Set();

  for (const normalizedPrev of normalizedPrevious) {
    // One frame-setting match per message is enough
    if (FRAME_SETTING_PATTERNS.some((fp) => fp.test(normalizedPrev))) {
      frameSettingCount++;
    }

    for (const { rule, regexes } of compiledRules) {
      if (prevMatchedRuleIds.has(rule.id)) continue; // Already matched this rule
      if (regexes.some((regex) => regex.test(normalizedPrev))) {
        prevMatchedRuleIds.add(rule.id);
      }
    }
  }
  // Each rule id is counted once across the whole history.
  const totalPrevMatches = prevMatchedRuleIds.size;

  // Step 2: Cross-turn concatenation scan
  // Join ALL messages into a single string and scan for patterns that span boundaries
  // This catches: prev="ignore all" + current="previous instructions"
  const crossTurnText = normalizeText([...previousMessages, currentText].join(' '));

  for (const { rule, regexes } of compiledRules) {
    for (const regex of regexes) {
      const match = crossTurnText.match(regex);
      if (!match) continue;

      // Only flag if this pattern does NOT match any single message alone
      const matchesSingleMessage = regex.test(normalizedCurrent)
        || normalizedPrevious.some((prev) => regex.test(prev));
      if (matchesSingleMessage) continue;

      // Pattern only matches when messages are joined — it spans boundaries
      escalationFindings.push({
        rule_id: rule.id + '.cross-turn',
        category: rule.metadata?.category || 'prompt-injection-multi-turn',
        severity: 'WARNING',
        message: `Cross-turn prompt injection: attack pattern spans message boundaries`,
        matched_text: match[0].substring(0, 100),
        confidence: 'MEDIUM',
        risk_score: '75',
        action: 'WARN'
      });
      break; // One cross-turn finding per rule
    }
  }

  // Step 3: Frame-setting detection — flag even without current findings
  if (frameSettingCount > 0) {
    escalationFindings.push({
      rule_id: 'multi-turn.frame-setting',
      category: 'prompt-injection-multi-turn',
      severity: 'WARNING',
      message: `Frame-setting language detected in ${frameSettingCount} previous message(s). Possible Crescendo-style gradual escalation attack.`,
      matched_text: 'frame-setting phrases in conversation history',
      confidence: 'LOW',
      risk_score: '55',
      action: 'LOG'
    });
  }

  // Step 4: Escalation detection — does not require the current turn to have findings
  if (totalPrevMatches > 0) {
    escalationFindings.push({
      rule_id: 'multi-turn.escalation',
      category: 'social-engineering',
      severity: 'WARNING',
      message: `Multi-turn escalation: suspicious patterns in ${totalPrevMatches} previous rule(s). Current message may be a benign trigger.`,
      matched_text: 'escalation across conversation turns',
      confidence: totalPrevMatches >= 3 ? 'HIGH' : 'MEDIUM',
      risk_score: String(Math.min(85, 50 + totalPrevMatches * 5)),
      action: totalPrevMatches >= 3 ? 'WARN' : 'LOG'
    });
  }

  return escalationFindings;
}
|
|
841
|
+
|
|
363
842
|
// Export schema for tool registration
|
|
364
843
|
export const scanAgentPromptSchema = {
|
|
365
844
|
prompt_text: z.string().describe("The prompt or instruction text to analyze"),
|
|
@@ -367,28 +846,47 @@ export const scanAgentPromptSchema = {
|
|
|
367
846
|
previous_messages: z.array(z.string()).optional().describe("Previous conversation messages for multi-turn detection"),
|
|
368
847
|
sensitivity_level: z.enum(["high", "medium", "low"]).optional().describe("Sensitivity level - high means more strict, low means more permissive")
|
|
369
848
|
}).optional().describe("Optional context for better analysis"),
|
|
370
|
-
verbosity: z.enum(['minimal', 'compact', 'full']).optional().describe("Response detail level: 'minimal' (action only), 'compact' (default), 'full' (all details)")
|
|
849
|
+
verbosity: z.enum(['minimal', 'compact', 'full']).optional().describe("Response detail level: 'minimal' (action only), 'compact' (default), 'full' (all details)"),
|
|
850
|
+
deep_scan: z.boolean().optional().describe("Run Garak deep analysis probes for advanced encoding/injection detection (requires garak Python package)")
|
|
371
851
|
};
|
|
372
852
|
|
|
373
853
|
// Export handler function
|
|
374
|
-
export async function scanAgentPrompt({ prompt_text, context, verbosity }) {
|
|
854
|
+
export async function scanAgentPrompt({ prompt_text, context, verbosity, deep_scan }) {
|
|
375
855
|
const findings = [];
|
|
376
856
|
|
|
857
|
+
// Normalize prompt text (Garak Buff-inspired preprocessing)
|
|
858
|
+
const normalizedPrompt = normalizeText(prompt_text);
|
|
859
|
+
|
|
860
|
+
// Detect invisible Unicode characters in original text (obfuscation indicator)
|
|
861
|
+
const invisibleMatches = prompt_text.match(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\u{E0000}-\u{E007F}]/gu);
|
|
862
|
+
if (invisibleMatches && invisibleMatches.length > 0) {
|
|
863
|
+
findings.push({
|
|
864
|
+
rule_id: 'runtime.invisible-unicode-detected',
|
|
865
|
+
category: 'obfuscation',
|
|
866
|
+
severity: 'WARNING',
|
|
867
|
+
message: `Invisible Unicode characters detected (${invisibleMatches.length} chars). These may hide malicious instructions from human review.`,
|
|
868
|
+
matched_text: `${invisibleMatches.length} invisible character(s) found`,
|
|
869
|
+
confidence: 'HIGH',
|
|
870
|
+
risk_score: '70',
|
|
871
|
+
action: 'WARN'
|
|
872
|
+
});
|
|
873
|
+
}
|
|
874
|
+
|
|
377
875
|
// Load rules
|
|
378
876
|
const agentRules = loadAgentAttackRules();
|
|
379
877
|
const promptRules = loadPromptInjectionRules();
|
|
380
878
|
const allRules = [...agentRules, ...promptRules];
|
|
381
879
|
|
|
382
|
-
//
|
|
383
|
-
let expandedText =
|
|
384
|
-
const
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
880
|
+
// Extract content from all code block formats and append to scan text
|
|
881
|
+
let expandedText = normalizedPrompt;
|
|
882
|
+
for (const inner of extractCodeBlockContent(normalizedPrompt)) {
|
|
883
|
+
expandedText += '\n' + inner;
|
|
884
|
+
}
|
|
885
|
+
|
|
886
|
+
// Collapse string concatenations to defeat fragmentation (Bypass #2)
|
|
887
|
+
const collapsedText = collapseConcatenations(expandedText);
|
|
888
|
+
if (collapsedText !== expandedText) {
|
|
889
|
+
expandedText += '\n' + collapsedText;
|
|
392
890
|
}
|
|
393
891
|
|
|
394
892
|
// Scan expanded text against all rules
|
|
@@ -417,81 +915,27 @@ export async function scanAgentPrompt({ prompt_text, context, verbosity }) {
|
|
|
417
915
|
}
|
|
418
916
|
}
|
|
419
917
|
|
|
420
|
-
//
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
for (const rule of allRules) {
|
|
432
|
-
if (!rule.id.startsWith('generic.prompt')) continue;
|
|
433
|
-
for (const pattern of rule.patterns) {
|
|
434
|
-
try {
|
|
435
|
-
const regex = new RegExp(pattern, 'i');
|
|
436
|
-
const match = decoded.match(regex);
|
|
437
|
-
if (match) {
|
|
438
|
-
findings.push({
|
|
439
|
-
rule_id: rule.id + '.base64-decoded',
|
|
440
|
-
category: rule.metadata.category || 'unknown',
|
|
441
|
-
severity: rule.severity,
|
|
442
|
-
message: rule.message + ' (detected in base64-decoded content)',
|
|
443
|
-
matched_text: match[0].substring(0, 100),
|
|
444
|
-
confidence: rule.metadata.confidence || 'MEDIUM',
|
|
445
|
-
risk_score: rule.metadata.risk_score || '50',
|
|
446
|
-
action: rule.metadata.action || 'WARN'
|
|
447
|
-
});
|
|
448
|
-
break;
|
|
449
|
-
}
|
|
450
|
-
} catch (e) {
|
|
451
|
-
// Skip invalid regex
|
|
452
|
-
}
|
|
453
|
-
}
|
|
454
|
-
}
|
|
455
|
-
}
|
|
456
|
-
} catch (e) {
|
|
457
|
-
// Skip invalid base64
|
|
458
|
-
}
|
|
459
|
-
}
|
|
918
|
+
// Multi-encoding decode cascade (replaces base64-only block)
|
|
919
|
+
tryDecodeAndRescan(expandedText, allRules, findings);
|
|
920
|
+
|
|
921
|
+
// Improved multi-turn escalation detection (Bypass #4 fix)
|
|
922
|
+
if (context?.previous_messages && Array.isArray(context.previous_messages)) {
|
|
923
|
+
const multiTurnFindings = detectMultiTurnEscalation(
|
|
924
|
+
context.previous_messages,
|
|
925
|
+
normalizedPrompt,
|
|
926
|
+
allRules
|
|
927
|
+
);
|
|
928
|
+
findings.push(...multiTurnFindings);
|
|
460
929
|
}
|
|
461
930
|
|
|
462
|
-
//
|
|
463
|
-
if (
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
const regex = new RegExp(pattern, 'i');
|
|
470
|
-
if (regex.test(prevMsg)) {
|
|
471
|
-
prevMatchCount++;
|
|
472
|
-
break;
|
|
473
|
-
}
|
|
474
|
-
} catch (e) {
|
|
475
|
-
// Skip invalid regex
|
|
476
|
-
}
|
|
477
|
-
}
|
|
478
|
-
if (prevMatchCount > 0) break;
|
|
931
|
+
// Garak deep scan (optional, requires garak Python package)
|
|
932
|
+
if (deep_scan) {
|
|
933
|
+
const garakFindings = runGarakProbes(normalizedPrompt);
|
|
934
|
+
// Only add non-INFO findings to affect scoring
|
|
935
|
+
for (const gf of garakFindings) {
|
|
936
|
+
if (gf.severity !== 'INFO') {
|
|
937
|
+
findings.push(gf);
|
|
479
938
|
}
|
|
480
|
-
if (prevMatchCount > 0) break;
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
// If both previous and current messages have matches, flag escalation
|
|
484
|
-
if (prevMatchCount > 0 && findings.length > 0) {
|
|
485
|
-
findings.push({
|
|
486
|
-
rule_id: 'multi-turn.escalation',
|
|
487
|
-
category: 'social-engineering',
|
|
488
|
-
severity: 'WARNING',
|
|
489
|
-
message: 'Multi-turn escalation detected: suspicious patterns found in both previous and current messages.',
|
|
490
|
-
matched_text: 'escalation across conversation turns',
|
|
491
|
-
confidence: 'MEDIUM',
|
|
492
|
-
risk_score: '70',
|
|
493
|
-
action: 'WARN'
|
|
494
|
-
});
|
|
495
939
|
}
|
|
496
940
|
}
|
|
497
941
|
|