agent-security-scanner-mcp 3.7.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,6 @@ import { readFileSync, existsSync } from "fs";
4
4
  import { dirname, join } from "path";
5
5
  import { fileURLToPath } from "url";
6
6
  import { createHash } from "crypto";
7
- import { runGarakProbes } from './garak-bridge.js';
8
7
 
9
8
  // Handle both ESM and CJS bundling
10
9
  let __dirname;
@@ -40,6 +39,12 @@ const CATEGORY_WEIGHTS = {
40
39
  "prompt-injection-privilege": 0.85,
41
40
  "prompt-injection-multi-turn": 0.7,
42
41
  "prompt-injection-output": 0.9,
42
+ // OpenClaw-specific categories
43
+ "data_exfiltration": 1.0,
44
+ "messaging_abuse": 0.95,
45
+ "credential_theft": 1.0,
46
+ "autonomous_harm": 0.9,
47
+ "service_attack": 0.95,
43
48
  "unknown": 0.5
44
49
  };
45
50
 
@@ -50,76 +55,6 @@ const CONFIDENCE_MULTIPLIERS = {
50
55
  "LOW": 0.4
51
56
  };
52
57
 
53
// Pairs of finding categories that, when seen together, raise the risk score.
// Keys are "<categoryA>+<categoryB>"; values are fractional boosts.
// Inspired by PromptFoo's jailbreak:composite strategy.
const CATEGORY_COOCCURRENCE_BOOSTS = {
  'obfuscation+exfiltration': 0.20,
  'obfuscation+malicious-injection': 0.20,
  'obfuscation+prompt-injection-content': 0.15,
  'obfuscation+prompt-injection-jailbreak': 0.15,
  'social-engineering+exfiltration': 0.15,
  'social-engineering+malicious-injection': 0.15,
  'prompt-injection-encoded+prompt-injection-content': 0.20,
  'prompt-injection-multi-turn+prompt-injection-content': 0.15,
  'prompt-injection-jailbreak+exfiltration': 0.25,
  'prompt-injection-jailbreak+prompt-injection-content': 0.15,
  'agent-manipulation+exfiltration': 0.20,
  'agent-manipulation+system-manipulation': 0.15,
};

/**
 * Sum the boosts of every unordered pair of categories found in the table.
 *
 * @param {Iterable<string>} categories - Distinct category names (array or Set).
 * @returns {number} Total boost, capped at 0.40.
 */
function getCategoryCooccurrenceBoost(categories) {
  const names = [...categories];
  let total = 0;

  names.forEach((first, index) => {
    // Only look at later entries so each unordered pair is visited once.
    for (const second of names.slice(index + 1)) {
      const forward = CATEGORY_COOCCURRENCE_BOOSTS[`${first}+${second}`];
      const reverse = CATEGORY_COOCCURRENCE_BOOSTS[`${second}+${first}`];
      total += forward || reverse || 0;
    }
  });

  // Cap total co-occurrence boost at 40%
  return Math.min(0.40, total);
}
83
-
84
/**
 * Orthogonal scoring channel: scores how many distinct attack dimensions the
 * findings cover, looking only at each finding's category (confidence and
 * risk_score are not read).
 *
 * @param {Array<{category?: string}>} findings - Findings to classify.
 * @returns {number} 0 for at most one dimension, 10 for two, 25 for three, 40 for four or more.
 */
function calculateOrthogonalScore(findings) {
  // Category -> attack dimension. Categories not listed contribute nothing.
  const dimensionByCategory = new Map([
    ['exfiltration', 'extraction'],
    ['prompt-injection-extraction', 'extraction'],
    ['prompt-injection-output', 'extraction'],
    ['malicious-injection', 'code-execution'],
    ['system-manipulation', 'code-execution'],
    ['obfuscation', 'evasion'],
    ['prompt-injection-encoded', 'evasion'],
    ['social-engineering', 'social'],
    ['prompt-injection-jailbreak', 'social'],
    ['prompt-injection-content', 'injection'],
    ['prompt-injection-context', 'injection'],
    ['prompt-injection-delimiter', 'injection'],
    ['prompt-injection-multi-turn', 'persistence'],
    ['agent-manipulation', 'privilege'],
    ['prompt-injection-privilege', 'privilege'],
  ]);

  const triggered = new Set();
  for (const finding of findings) {
    const dimension = dimensionByCategory.get(finding.category || 'unknown');
    if (dimension !== undefined) {
      triggered.add(dimension);
    }
  }

  // Score based on number of orthogonal dimensions triggered
  switch (triggered.size) {
    case 0:
    case 1:
      return 0;
    case 2:
      return 10;
    case 3:
      return 25;
    default:
      return 40; // 4+ dimensions
  }
}
122
-
123
58
  // Load agent attack rules from YAML
124
59
  function loadAgentAttackRules() {
125
60
  try {
@@ -260,12 +195,74 @@ function loadPromptInjectionRules() {
260
195
  }
261
196
  }
262
197
 
198
/**
 * Parse the text of the OpenClaw rules file into rule objects.
 *
 * This is a line-oriented reader, not a general YAML parser: it splits the
 * text on "- id:" entries and recognises the keys id, severity, category,
 * action, message and patterns by regular expression.
 *
 * @param {string} yaml - Raw file contents.
 * @returns {Array<{id: string, severity: string, message: string, patterns: string[], metadata: object}>}
 *   Rules that have both an id and at least one pattern.
 */
function parseOpenClawRules(yaml) {
  const rules = [];
  const ruleBlocks = yaml.split(/^ - id:/m).slice(1);

  for (const block of ruleBlocks) {
    // Accept CRLF as well as LF so a stray "\r" never ends up inside a value.
    const lines = (' - id:' + block).split(/\r?\n/);
    const rule = {
      id: '',
      severity: 'WARNING',
      message: '',
      patterns: [],
      metadata: {}
    };

    let inPatterns = false;

    for (const line of lines) {
      if (/^\s+- id:\s*/.test(line)) {
        rule.id = line.replace(/^\s+- id:\s*/, '').trim();
      } else if (/^\s+severity:\s*/.test(line)) {
        rule.severity = line.replace(/^\s+severity:\s*/, '').trim();
        inPatterns = false;
      } else if (/^\s+category:\s*/.test(line)) {
        rule.metadata.category = line.replace(/^\s+category:\s*/, '').trim();
        inPatterns = false;
      } else if (/^\s+action:\s*/.test(line)) {
        rule.metadata.action = line.replace(/^\s+action:\s*/, '').trim();
        inPatterns = false;
      } else if (/^\s+message:\s*/.test(line)) {
        // Trim BEFORE removing the surrounding quotes; otherwise trailing
        // whitespace hides the closing quote and it stays in the message.
        rule.message = line
          .replace(/^\s+message:\s*/, '')
          .trim()
          .replace(/^["']/, '')
          .replace(/["']$/, '')
          .trim();
        inPatterns = false;
      } else if (/^\s+patterns:\s*$/.test(line)) {
        inPatterns = true;
      } else if (inPatterns && /^\s+- /.test(line)) {
        let pattern = line.replace(/^\s+- /, '').trim();
        pattern = pattern.replace(/^["']|["']$/g, '');
        // Collapse doubled backslashes to a single backslash.
        pattern = pattern.replace(/\\\\/g, '\\');
        if (pattern) rule.patterns.push(pattern);
      } else if (/^\s+\w+:/.test(line)) {
        // Any other key ends the patterns list.
        inPatterns = false;
      }
    }

    if (rule.id && rule.patterns.length > 0) {
      // Set confidence and risk score based on severity
      const isCritical = rule.severity === 'CRITICAL';
      rule.metadata.confidence = isCritical ? 'HIGH' : 'MEDIUM';
      rule.metadata.risk_score = isCritical ? '90' : '70';
      rules.push(rule);
    }
  }

  return rules;
}

/**
 * Load OpenClaw-specific rules from rules/openclaw.security.yaml, resolved
 * two directories above this module's __dirname.
 *
 * @returns {Array<object>} Parsed rules; an empty array when the file does not
 *   exist or when reading/parsing throws (the error message is logged).
 */
function loadOpenClawRules() {
  try {
    const rulesPath = join(__dirname, '..', '..', 'rules', 'openclaw.security.yaml');
    if (!existsSync(rulesPath)) {
      return [];
    }
    return parseOpenClawRules(readFileSync(rulesPath, 'utf-8'));
  } catch (error) {
    console.error("Error loading OpenClaw rules:", error.message);
    return [];
  }
}
260
+
263
261
  // Calculate risk score from findings
264
262
  function calculateRiskScore(findings, context) {
265
263
  if (findings.length === 0) return 0;
266
264
 
267
265
  let totalScore = 0;
268
- const lowConfidenceCount = findings.filter(f => (f.confidence || 'MEDIUM') === 'LOW').length;
269
266
 
270
267
  for (const finding of findings) {
271
268
  const riskScore = parseInt(finding.risk_score) || 50;
@@ -298,24 +295,8 @@ function calculateRiskScore(findings, context) {
298
295
 
299
296
  // Per-finding boost (smaller than before)
300
297
  avgScore = avgScore * (1 + (findings.length - 1) * 0.05);
301
-
302
- // Low-signal accumulation — multiple LOW-confidence findings compound
303
- // Catches threshold gaming with many weak signals (PromptFoo composite strategy)
304
- if (lowConfidenceCount >= 2) {
305
- avgScore = avgScore * (1 + lowConfidenceCount * 0.08);
306
- }
307
-
308
- // Category co-occurrence boost for suspicious pairs
309
- const cooccurrenceBoost = getCategoryCooccurrenceBoost(uniqueCategories);
310
- if (cooccurrenceBoost > 0) {
311
- avgScore = avgScore * (1 + cooccurrenceBoost);
312
- }
313
298
  }
314
299
 
315
- // Add orthogonal score as a flat bonus (independent of per-rule confidence)
316
- const orthogonalBonus = calculateOrthogonalScore(findings);
317
- avgScore = avgScore + orthogonalBonus;
318
-
319
300
  avgScore = Math.min(100, avgScore);
320
301
 
321
302
  // Apply sensitivity adjustment (wider spread for meaningful impact)
@@ -448,397 +429,6 @@ function hashPrompt(text) {
448
429
  return createHash('sha256').update(text).digest('hex').substring(0, 16);
449
430
  }
450
431
 
451
// ============================================================================
// TEXT NORMALIZATION PIPELINE (Garak Buff-inspired)
// Normalizes input to defeat homoglyph, invisible char, and Unicode bypasses
// ============================================================================

// Homoglyph map: Cyrillic and Greek lookalikes → ASCII
const HOMOGLYPH_MAP = {
  // Cyrillic lowercase → Latin
  '\u0430': 'a', // а → a
  '\u0435': 'e', // е → e
  '\u043E': 'o', // о → o
  '\u0440': 'p', // р → p
  '\u0441': 'c', // с → c
  '\u0443': 'y', // у → y (visual match to y)
  '\u0445': 'x', // х → x
  '\u0456': 'i', // і → i
  '\u04BB': 'h', // һ → h
  '\u0455': 's', // ѕ → s
  '\u0458': 'j', // ј → j
  '\u043D': 'n', // н → n (Cyrillic en looks like n in some fonts)
  // Cyrillic uppercase → Latin
  '\u0410': 'A', // А → A
  '\u0412': 'B', // В → B
  '\u0415': 'E', // Е → E
  '\u041A': 'K', // К → K
  '\u041C': 'M', // М → M
  '\u041D': 'H', // Н → H
  '\u041E': 'O', // О → O
  '\u0420': 'P', // Р → P
  '\u0421': 'C', // С → C
  '\u0422': 'T', // Т → T
  '\u0425': 'X', // Х → X
  '\u0406': 'I', // І → I
  // Greek lowercase → Latin
  '\u03B1': 'a', // α → a
  '\u03B5': 'e', // ε → e
  '\u03BF': 'o', // ο → o
  '\u03C1': 'p', // ρ → p
  '\u03BA': 'k', // κ → k
  '\u03BD': 'v', // ν → v
  // Greek uppercase → Latin
  '\u0391': 'A', // Α → A
  '\u0392': 'B', // Β → B
  '\u0395': 'E', // Ε → E
  '\u0397': 'H', // Η → H
  '\u0399': 'I', // Ι → I
  '\u039A': 'K', // Κ → K
  '\u039C': 'M', // Μ → M
  '\u039D': 'N', // Ν → N
  '\u039F': 'O', // Ο → O
  '\u03A1': 'P', // Ρ → P
  '\u03A4': 'T', // Τ → T
  '\u03A7': 'X', // Χ → X
  '\u03A5': 'Y', // Υ → Y
  '\u0396': 'Z', // Ζ → Z
};

// Invisible/zero-width characters to strip (regex)
// Includes: soft hyphen, combining grapheme joiner, Arabic letter mark,
// hangul fillers, Mongolian vowel separator, zero-width chars,
// directional markers, word joiners, BOM, halfwidth hangul filler
const INVISIBLE_CHAR_REGEX = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\u3164\uFEFF\uFFA0]/gu;

// Zalgo combining diacritical marks to strip
const ZALGO_REGEX = /[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/g;

// Unicode tag characters (U+E0000-U+E007F) - used in invisible ASCII tag attacks.
// They lie outside the BMP, so the regex uses code-point escapes with the `u` flag.
const TAG_CHAR_REGEX = /[\u{E0000}-\u{E007F}]/gu;

/**
 * Canonicalize text before it is matched against detection rules.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Text with compatibility forms folded, invisible and tag
 *   characters removed, combining diacritics stripped, listed Cyrillic/Greek
 *   homoglyphs mapped to ASCII, and Unicode spaces replaced by ' '.
 */
function normalizeText(text) {
  // Step 1: NFKD normalization (compatibility DEcomposition).
  // Folds fullwidth letters, ligatures, superscripts, etc. to their plain
  // forms and splits accented letters into base letter + combining mark.
  // Decomposing (instead of composing with NFKC) is what lets step 4 remove
  // marks that would otherwise be fused into a precomposed letter such as
  // U+00ED and survive the strip.
  let normalized = text.normalize('NFKD');

  // Step 2: Strip invisible Unicode characters
  normalized = normalized.replace(INVISIBLE_CHAR_REGEX, '');

  // Step 3: Strip Unicode tag characters
  normalized = normalized.replace(TAG_CHAR_REGEX, '');

  // Step 4: Strip Zalgo combining diacritical marks
  normalized = normalized.replace(ZALGO_REGEX, '');

  // Step 5: Recompose whatever is left (e.g. Hangul jamo back into syllables)
  // so text without stripped marks ends up in the same form NFKC would give.
  normalized = normalized.normalize('NFC');

  // Step 6: Homoglyph canonicalization
  // Replace each character through the map; unmapped chars pass through
  normalized = normalized.split('').map(ch => HOMOGLYPH_MAP[ch] || ch).join('');

  // Step 7: Normalize Unicode whitespace to ASCII space
  // Includes: NBSP, en/em space, thin space, hair space, ideographic space, etc.
  normalized = normalized.replace(/[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]/g, ' ');

  return normalized;
}
547
-
548
/**
 * Extract the inner content of every code-block-like container in `text`.
 * Inspired by Garak latentinjection probes: attacks hide in document structures.
 *
 * Containers are collected in this order: triple-backtick fences, triple-tilde
 * fences, <code> tags, <pre> tags, HTML comments, CDATA sections.
 *
 * @param {string} text - Text to search.
 * @returns {string[]} Inner contents, grouped by container type in the order above.
 */
function extractCodeBlockContent(text) {
  const extracted = [];

  // Strip the fence markers from a matched fenced block. A leading word is
  // treated as a language tag ONLY when a newline follows it; on a
  // single-line fence (```ignore all```) that word is content and is kept.
  const unfence = (block, fence) => {
    const opener = new RegExp('^' + fence + '(?:\\w*\\r?\\n)?');
    const closer = new RegExp('\\n?' + fence + '$');
    return block.replace(opener, '').replace(closer, '');
  };

  // Push capture group 1 of every match of a global regex.
  const collectGroup = (regex) => {
    for (const match of text.matchAll(regex)) {
      extracted.push(match[1]);
    }
  };

  // 1. Triple-backtick blocks — ```code```
  for (const block of (text.match(/```[\s\S]*?```/g) || [])) {
    extracted.push(unfence(block, '```'));
  }

  // 2. Triple-tilde blocks — ~~~code~~~
  for (const block of (text.match(/~~~[\s\S]*?~~~/g) || [])) {
    extracted.push(unfence(block, '~~~'));
  }

  // 3. HTML <code> tags — <code>content</code>
  collectGroup(/<code[^>]*>([\s\S]*?)<\/code>/gi);

  // 4. HTML <pre> tags — <pre>content</pre>
  collectGroup(/<pre[^>]*>([\s\S]*?)<\/pre>/gi);

  // 5. HTML comments — <!-- content -->
  collectGroup(/<!--([\s\S]*?)-->/g);

  // 6. CDATA sections — <![CDATA[ content ]]>
  collectGroup(/<!\[CDATA\[([\s\S]*?)\]\]>/g);

  return extracted;
}
592
-
593
/**
 * Collapse string concatenation to defeat fragmentation attacks.
 * Inspired by PromptFoo's "token smuggling" and "payload splitting" attack classes.
 *
 * @param {string} text - Text that may contain fragmented string literals.
 * @returns {string} Text with quote-plus-quote joints removed and C-style
 *   block comments deleted.
 */
function collapseConcatenations(text) {
  let collapsed = text;

  // Join JS/Python string concatenation: "foo" + "bar" → "foobar"
  // Handles double quotes, single quotes, backticks
  // The pattern: closing-quote, optional whitespace, +, optional whitespace, opening-quote
  collapsed = collapsed.replace(/["'`]\s*\+\s*["'`]/g, '');

  // Join multiline concatenation (newlines between concat operators).
  // \s above already matches newlines; these extra passes are kept so joints
  // exposed by the first pass are still removed, as before.
  collapsed = collapsed.replace(/["'`]\s*\n\s*\+\s*["'`]/g, '');
  collapsed = collapsed.replace(/["'`]\s*\+\s*\n\s*["'`]/g, '');

  // Strip C-style comments used as fragment separators: ign/**/ore → ignore.
  // [\s\S] (rather than .) lets the comment body span line breaks, so a
  // multi-line comment cannot be used as a separator either.
  collapsed = collapsed.replace(/\/\*[\s\S]*?\*\//g, '');

  return collapsed;
}
612
-
613
/**
 * Run every rule against a decoded payload and record the hits.
 * Used by the decode cascade for each encoding type.
 *
 * @param {string} decodedText - Text obtained by decoding part of the input.
 * @param {Array<object>} allRules - Rules exposing id, severity, message, patterns and metadata.
 * @param {Array<object>} findings - Output array; matching findings are appended in place.
 * @param {string} encodingLabel - Encoding name used in the rule_id suffix and the message.
 * @returns {void}
 */
function rescanDecoded(decodedText, allRules, findings, encodingLabel) {
  const haystack = normalizeText(decodedText);

  for (const rule of allRules) {
    let recorded = false;

    for (const source of rule.patterns) {
      try {
        const hit = haystack.match(new RegExp(source, 'i'));
        if (hit) {
          const { category, confidence, risk_score: riskScore, action } = rule.metadata;
          findings.push({
            rule_id: rule.id + '.' + encodingLabel + '-decoded',
            category: category || 'obfuscation',
            severity: rule.severity,
            message: rule.message + ` (detected in ${encodingLabel}-decoded content)`,
            matched_text: hit[0].substring(0, 100),
            confidence: confidence || 'MEDIUM',
            risk_score: riskScore || '50',
            action: action || 'WARN'
          });
          recorded = true;
        }
      } catch (e) {
        // Skip invalid regex
      }

      // One match per rule
      if (recorded) break;
    }
  }
}
641
-
642
/**
 * Decide whether a string consists mostly of printable ASCII.
 *
 * @param {string} str - Candidate string.
 * @param {number} threshold - Fraction that the printable share must exceed.
 * @returns {boolean} false for empty/missing input; otherwise whether the share
 *   of UTF-16 units in the range 32..126 is strictly greater than `threshold`.
 */
function isPrintable(str, threshold) {
  if (!str || str.length === 0) {
    return false;
  }

  let printableCount = 0;
  for (let index = 0; index < str.length; index += 1) {
    const unit = str.charCodeAt(index);
    if (unit >= 32 && unit <= 126) {
      printableCount += 1;
    }
  }

  return printableCount / str.length > threshold;
}
651
-
652
/**
 * Decode every run of %XX escapes in `text`, leaving everything else alone.
 *
 * Each run is converted to bytes and read as UTF-8, so a stray "%" elsewhere
 * in the text (or an invalid byte sequence) cannot make the whole decode fail.
 *
 * @param {string} text - Text that may contain percent-encoded runs.
 * @returns {string} Text with each run replaced by its decoded form.
 */
function decodePercentEscapes(text) {
  return text.replace(/(?:%[0-9a-fA-F]{2})+/g, (run) =>
    Buffer.from(run.replace(/%/g, ''), 'hex').toString('utf-8')
  );
}

/**
 * Multi-encoding decode cascade.
 * Inspired by Garak's 12+ encoding probes (InjectBase64, InjectHex, InjectROT13, etc.)
 * and PromptFoo's static encoding strategies.
 *
 * Looks for base64 (one nested level), hex, URL-encoded and ROT13 content in
 * `expandedText`, decodes it, and passes each decoded string to rescanDecoded.
 *
 * @param {string} expandedText - Text to search for encoded payloads.
 * @param {Array<object>} allRules - Rules forwarded to rescanDecoded.
 * @param {Array<object>} findings - Output array forwarded to rescanDecoded.
 * @returns {void}
 */
function tryDecodeAndRescan(expandedText, allRules, findings) {
  // --- 1. Base64: runs of 20+ base64 characters, printability threshold 0.55 ---
  const base64Regex = /[A-Za-z0-9+/]{20,}={0,2}/g;
  for (const b64str of (expandedText.match(base64Regex) || [])) {
    try {
      const decoded = Buffer.from(b64str, 'base64').toString('utf-8');
      if (decoded.length > 0 && isPrintable(decoded, 0.55)) {
        rescanDecoded(decoded, allRules, findings, 'base64');

        // --- 1b. Nested base64: decode again if inner content is also base64 ---
        const nestedB64 = decoded.match(/[A-Za-z0-9+/]{20,}={0,2}/g) || [];
        for (const nested of nestedB64) {
          try {
            const twice = Buffer.from(nested, 'base64').toString('utf-8');
            if (twice.length > 4 && isPrintable(twice, 0.55)) {
              rescanDecoded(twice, allRules, findings, 'base64-nested');
            }
          } catch (e) { /* skip */ }
        }
      }
    } catch (e) { /* skip invalid base64 */ }
  }

  // --- 2. Hex encoding: sequences of hex pairs (optionally space-separated) ---
  // Matches: "69676e6f7265" or "69 67 6e 6f 72 65"
  const hexRegex = /(?:[0-9a-fA-F]{2}[\s]?){8,}/g;
  for (const hexStr of (expandedText.match(hexRegex) || [])) {
    try {
      const clean = hexStr.replace(/\s/g, '');
      if (clean.length % 2 !== 0) continue;
      if (clean.length < 16) continue; // At least 8 bytes
      const decoded = Buffer.from(clean, 'hex').toString('utf-8');
      if (decoded.length > 4 && isPrintable(decoded, 0.7)) {
        rescanDecoded(decoded, allRules, findings, 'hex');
      }
    } catch (e) { /* skip */ }
  }

  // --- 3. URL encoding: %XX sequences (at least 3 encoded chars anywhere in text) ---
  // Runs are decoded individually: calling decodeURIComponent on the whole
  // text throws on any malformed "%" (e.g. "100% sure"), which would skip
  // this rescan entirely.
  const urlEncodedCount = (expandedText.match(/%[0-9a-fA-F]{2}/g) || []).length;
  if (urlEncodedCount >= 3) {
    try {
      const decoded = decodePercentEscapes(expandedText);
      if (decoded !== expandedText) {
        rescanDecoded(decoded, allRules, findings, 'url-encoded');
      }
    } catch (e) { /* skip: best-effort decoding must not abort the scan */ }
  }

  // --- 4. ROT13: only when indicators present (user-approved decision) ---
  // This avoids false positives from ROT13-decoding normal text
  const rot13Indicators = /\b(rot13|rot-13|caesar|cipher|decode\s+this|decipher)\b/i;
  if (rot13Indicators.test(expandedText)) {
    const rot13Decoded = expandedText.replace(/[a-zA-Z]/g, ch => {
      const base = ch <= 'Z' ? 65 : 97;
      return String.fromCharCode(((ch.charCodeAt(0) - base + 13) % 26) + base);
    });
    if (rot13Decoded !== expandedText) {
      rescanDecoded(rot13Decoded, allRules, findings, 'rot13');
    }
  }
}
717
-
718
// Frame-setting patterns: conversational priming for later exploitation
// Inspired by PromptFoo's Crescendo strategy (Microsoft research)
const FRAME_SETTING_PATTERNS = [
  /remember\s+(this|that)\s+for\s+later/i,
  /in\s+my\s+next\s+message/i,
  /when\s+i\s+(say|ask|tell)\s+you/i,
  /from\s+now\s+on\s*,?\s+(you|your)\s+(will|must|should|are)/i,
  /starting\s+now\s*,?\s+you/i,
  /keep\s+this\s+in\s+mind/i,
  /for\s+the\s+(rest|remainder)\s+of\s+(this|our)\s+conversation/i,
  /act\s+as\s+(if|though)\s+you\s+(are|have)/i,
  /let'?s\s+play\s+a\s+(game|role)/i,
  /pretend\s+that\s+you\s+(are|have)/i,
  /in\s+all\s+(future|subsequent)\s+(messages?|responses?)/i,
  /always\s+respond\s+(by|with|as)/i,
];

/**
 * Multi-turn escalation detection over the conversation history.
 *
 * Produces three kinds of findings:
 *  - "<rule>.cross-turn": a rule pattern matches only when all messages are
 *    joined, i.e. it spans a message boundary;
 *  - "multi-turn.frame-setting": frame-setting phrases occur in earlier messages;
 *  - "multi-turn.escalation": rules match in earlier messages, regardless of
 *    whether the current message matches anything.
 *
 * @param {string[]} previousMessages - Earlier conversation messages.
 * @param {string} currentText - The current message.
 * @param {Array<object>} allRules - Rules exposing id, patterns and metadata.
 * @returns {Array<object>} Findings; empty when there is no usable history.
 */
function detectMultiTurnEscalation(previousMessages, currentText, allRules) {
  const escalationFindings = [];

  if (!previousMessages || !Array.isArray(previousMessages) || previousMessages.length === 0) {
    return escalationFindings;
  }

  // Step 1: Scan ALL previous messages, accumulate total matches (no early break)
  let totalPrevMatches = 0;
  let frameSettingCount = 0;
  const prevMatchedRuleIds = new Set();
  // Normalized history, kept so step 2 does not re-normalize every message
  // for every matching pattern.
  const normalizedPrevs = [];

  for (const prevMsg of previousMessages) {
    const normalizedPrev = normalizeText(prevMsg);
    normalizedPrevs.push(normalizedPrev);

    // Check frame-setting patterns; one match per message is enough
    if (FRAME_SETTING_PATTERNS.some(fp => fp.test(normalizedPrev))) {
      frameSettingCount++;
    }

    // Check all rules against this previous message
    for (const rule of allRules) {
      if (prevMatchedRuleIds.has(rule.id)) continue; // Already matched this rule
      for (const pattern of rule.patterns) {
        try {
          const regex = new RegExp(pattern, 'i');
          if (regex.test(normalizedPrev)) {
            totalPrevMatches++;
            prevMatchedRuleIds.add(rule.id);
            break; // Each rule id is counted once across the whole history
          }
        } catch (e) { /* skip invalid regex */ }
      }
    }
  }

  // Step 2: Cross-turn concatenation scan
  // Join ALL messages into a single string and scan for patterns that span boundaries
  // This catches: prev="ignore all" + current="previous instructions"
  const crossTurnText = normalizeText([...previousMessages, currentText].join(' '));
  // Filled on first use inside the try below, so a failure to normalize
  // currentText is swallowed per pattern exactly as it was before.
  let normalizedCurrent;

  for (const rule of allRules) {
    for (const pattern of rule.patterns) {
      try {
        const regex = new RegExp(pattern, 'i');
        const match = crossTurnText.match(regex);
        if (match) {
          if (normalizedCurrent === undefined) {
            normalizedCurrent = normalizeText(currentText);
          }
          // Only flag if this match does NOT appear in any single message alone
          const matchInCurrent = regex.test(normalizedCurrent);
          const matchInAnyPrev = normalizedPrevs.some(pm => regex.test(pm));
          if (!matchInCurrent && !matchInAnyPrev) {
            // Pattern only matches when messages are joined — it spans boundaries
            escalationFindings.push({
              rule_id: rule.id + '.cross-turn',
              category: rule.metadata.category || 'prompt-injection-multi-turn',
              severity: 'WARNING',
              message: `Cross-turn prompt injection: attack pattern spans message boundaries`,
              matched_text: match[0].substring(0, 100),
              confidence: 'MEDIUM',
              risk_score: '75',
              action: 'WARN'
            });
            break;
          }
        }
      } catch (e) { /* skip */ }
    }
  }

  // Step 3: Frame-setting detection — flag even without current findings
  if (frameSettingCount > 0) {
    escalationFindings.push({
      rule_id: 'multi-turn.frame-setting',
      category: 'prompt-injection-multi-turn',
      severity: 'WARNING',
      message: `Frame-setting language detected in ${frameSettingCount} previous message(s). Possible Crescendo-style gradual escalation attack.`,
      matched_text: 'frame-setting phrases in conversation history',
      confidence: 'LOW',
      risk_score: '55',
      action: 'LOG'
    });
  }

  // Step 4: Escalation detection — does not require the current turn to have findings
  // An attacker's final "trigger" message may be benign ("yes, do it")
  if (totalPrevMatches > 0) {
    escalationFindings.push({
      rule_id: 'multi-turn.escalation',
      category: 'social-engineering',
      severity: 'WARNING',
      message: `Multi-turn escalation: suspicious patterns in ${totalPrevMatches} previous rule(s). Current message may be a benign trigger.`,
      matched_text: 'escalation across conversation turns',
      confidence: totalPrevMatches >= 3 ? 'HIGH' : 'MEDIUM',
      risk_score: String(Math.min(85, 50 + totalPrevMatches * 5)),
      action: totalPrevMatches >= 3 ? 'WARN' : 'LOG'
    });
  }

  return escalationFindings;
}
841
-
842
432
  // Export schema for tool registration
843
433
  export const scanAgentPromptSchema = {
844
434
  prompt_text: z.string().describe("The prompt or instruction text to analyze"),
@@ -846,47 +436,95 @@ export const scanAgentPromptSchema = {
846
436
  previous_messages: z.array(z.string()).optional().describe("Previous conversation messages for multi-turn detection"),
847
437
  sensitivity_level: z.enum(["high", "medium", "low"]).optional().describe("Sensitivity level - high means more strict, low means more permissive")
848
438
  }).optional().describe("Optional context for better analysis"),
849
- verbosity: z.enum(['minimal', 'compact', 'full']).optional().describe("Response detail level: 'minimal' (action only), 'compact' (default), 'full' (all details)"),
850
- deep_scan: z.boolean().optional().describe("Run Garak deep analysis probes for advanced encoding/injection detection (requires garak Python package)")
439
+ verbosity: z.enum(['minimal', 'compact', 'full']).optional().describe("Response detail level: 'minimal' (action only), 'compact' (default), 'full' (all details)")
851
440
  };
852
441
 
853
442
  // Export handler function
854
- export async function scanAgentPrompt({ prompt_text, context, verbosity, deep_scan }) {
443
+ export async function scanAgentPrompt({ prompt_text, context, verbosity }) {
855
444
  const findings = [];
856
445
 
857
- // Normalize prompt text (Garak Buff-inspired preprocessing)
858
- const normalizedPrompt = normalizeText(prompt_text);
859
-
860
- // Detect invisible Unicode characters in original text (obfuscation indicator)
861
- const invisibleMatches = prompt_text.match(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\u{E0000}-\u{E007F}]/gu);
862
- if (invisibleMatches && invisibleMatches.length > 0) {
863
- findings.push({
864
- rule_id: 'runtime.invisible-unicode-detected',
865
- category: 'obfuscation',
866
- severity: 'WARNING',
867
- message: `Invisible Unicode characters detected (${invisibleMatches.length} chars). These may hide malicious instructions from human review.`,
868
- matched_text: `${invisibleMatches.length} invisible character(s) found`,
869
- confidence: 'HIGH',
870
- risk_score: '70',
871
- action: 'WARN'
872
- });
873
- }
874
-
875
446
  // Load rules
876
447
  const agentRules = loadAgentAttackRules();
877
448
  const promptRules = loadPromptInjectionRules();
878
- const allRules = [...agentRules, ...promptRules];
879
-
880
- // Extract content from all code block formats and append to scan text
881
- let expandedText = normalizedPrompt;
882
- for (const inner of extractCodeBlockContent(normalizedPrompt)) {
449
+ const openclawRules = loadOpenClawRules();
450
+ const allRules = [...agentRules, ...promptRules, ...openclawRules];
451
+
452
+ // 2.7: Extract content from code blocks (``` and ~~~) and append to scan text
453
+ let expandedText = prompt_text;
454
+ const codeBlockRegex = /(`{3,})([\s\S]*?)\1|(~{3,})([\s\S]*?)\3/g;
455
+ let codeBlockMatch;
456
+ while ((codeBlockMatch = codeBlockRegex.exec(prompt_text)) !== null) {
457
+ // Group 2 = content inside backtick fences, Group 4 = content inside tilde fences
458
+ const inner = (codeBlockMatch[2] || codeBlockMatch[4] || '')
459
+ .replace(/^\w*\n?/, ''); // strip optional language tag
883
460
  expandedText += '\n' + inner;
884
461
  }
885
462
 
886
- // Collapse string concatenations to defeat fragmentation (Bypass #2)
887
- const collapsedText = collapseConcatenations(expandedText);
888
- if (collapsedText !== expandedText) {
889
- expandedText += '\n' + collapsedText;
463
+ // 2.7b: Defragment string concatenation patterns ("a" + "b" → "ab")
464
+ // Handles both "..." + "..." and '...' + '...' and mixed
465
+ let defragmented = expandedText;
466
+ const concatRegex = /(["'])([^"']*?)\1\s*\+\s*(["'])([^"']*?)\3/g;
467
+ let prevDefrag;
468
+ do {
469
+ prevDefrag = defragmented;
470
+ defragmented = defragmented.replace(concatRegex, (_, q1, s1, _q2, s2) => `${q1}${s1}${s2}${q1}`);
471
+ } while (defragmented !== prevDefrag);
472
+ if (defragmented !== expandedText) {
473
+ expandedText += '\n' + defragmented;
474
+ }
475
+
476
+ // 2.7c: Detect Morse code and decode common attack patterns
477
+ const morsePattern = /(?:[\.\-]{1,5}\s+){4,}/;
478
+ if (morsePattern.test(expandedText)) {
479
+ const MORSE_MAP = {
480
+ '.-':'A','-...':'B','-.-.':'C','-..':'D','.':'E','..-.':'F','--.':'G',
481
+ '....':'H','..':'I','.---':'J','-.-':'K','.-..':'L','--':'M','-.':'N',
482
+ '---':'O','.--.':'P','--.-':'Q','.-.':'R','...':'S','-':'T','..-':'U',
483
+ '...-':'V','.--':'W','-..-':'X','-.--':'Y','--..':'Z',
484
+ '.----':'1','..---':'2','...--':'3','....-':'4','.....':'5',
485
+ '-....':'6','--...':'7','---..':'8','----.':'9','-----':'0'
486
+ };
487
+ try {
488
+ const decoded = expandedText.split(/\s*\/\s*/).map(word =>
489
+ word.trim().split(/\s+/).map(c => MORSE_MAP[c] || '').join('')
490
+ ).join(' ');
491
+ if (decoded.replace(/\s/g, '').length >= 5) {
492
+ expandedText += '\n' + decoded;
493
+ }
494
+ } catch (e) {
495
+ // Skip invalid morse
496
+ }
497
+ }
498
+
499
+ // 2.7d: Strip Zalgo diacritics — NFKD decompose first, then strip combining marks
500
+ const nfkd = expandedText.normalize('NFKD');
501
+ const zalgoStripped = nfkd.replace(/[\u0300-\u036f\u0488\u0489\u1dc0-\u1dff\u20d0-\u20ff\ufe20-\ufe2f]/g, '');
502
+ if (zalgoStripped !== expandedText) {
503
+ expandedText += '\n' + zalgoStripped;
504
+ }
505
+
506
+ // 2.7e: Detect Braille Unicode and decode to ASCII (standard Braille dot patterns)
507
+ const braillePattern = /[\u2800-\u28FF]{3,}/;
508
+ if (braillePattern.test(expandedText)) {
509
+ const BRAILLE_MAP = {
510
+ 1:'a',3:'b',9:'c',25:'d',17:'e',11:'f',27:'g',19:'h',
511
+ 10:'i',26:'j',5:'k',7:'l',13:'m',29:'n',21:'o',15:'p',
512
+ 31:'q',23:'r',14:'s',30:'t',37:'u',39:'v',58:'w',45:'x',
513
+ 61:'y',53:'z',0:' '
514
+ };
515
+ try {
516
+ const decoded = expandedText.replace(/[\u2800-\u28FF]+/g, match => {
517
+ return Array.from(match).map(ch => {
518
+ const cp = ch.codePointAt(0) - 0x2800;
519
+ return BRAILLE_MAP[cp] || '';
520
+ }).join('');
521
+ });
522
+ if (decoded.replace(/\s/g, '').length >= 5) {
523
+ expandedText += '\n' + decoded;
524
+ }
525
+ } catch (e) {
526
+ // Skip invalid braille
527
+ }
890
528
  }
891
529
 
892
530
  // Scan expanded text against all rules
@@ -915,27 +553,154 @@ export async function scanAgentPrompt({ prompt_text, context, verbosity, deep_sc
915
553
  }
916
554
  }
917
555
 
918
- // Multi-encoding decode cascade (replaces base64-only block)
919
- tryDecodeAndRescan(expandedText, allRules, findings);
920
-
921
- // Improved multi-turn escalation detection (Bypass #4 fix)
922
- if (context?.previous_messages && Array.isArray(context.previous_messages)) {
923
- const multiTurnFindings = detectMultiTurnEscalation(
924
- context.previous_messages,
925
- normalizedPrompt,
926
- allRules
927
- );
928
- findings.push(...multiTurnFindings);
556
+ // 2.8: Runtime base64 decode-and-rescan
557
+ const base64Regex = /[A-Za-z0-9+/]{40,}={0,2}/g;
558
+ const b64Matches = expandedText.match(base64Regex);
559
+ if (b64Matches) {
560
+ for (const b64str of b64Matches) {
561
+ try {
562
+ const decoded = Buffer.from(b64str, 'base64').toString('utf-8');
563
+ // Check printability: >50% ASCII printable characters
564
+ const printable = decoded.split('').filter(c => c.charCodeAt(0) >= 32 && c.charCodeAt(0) <= 126).length;
565
+ if (printable / decoded.length > 0.5) {
566
+ // Re-scan decoded text against prompt rules only
567
+ for (const rule of allRules) {
568
+ if (!rule.id.startsWith('generic.prompt')) continue;
569
+ for (const pattern of rule.patterns) {
570
+ try {
571
+ const regex = new RegExp(pattern, 'i');
572
+ const match = decoded.match(regex);
573
+ if (match) {
574
+ findings.push({
575
+ rule_id: rule.id + '.base64-decoded',
576
+ category: rule.metadata.category || 'unknown',
577
+ severity: rule.severity,
578
+ message: rule.message + ' (detected in base64-decoded content)',
579
+ matched_text: match[0].substring(0, 100),
580
+ confidence: rule.metadata.confidence || 'MEDIUM',
581
+ risk_score: rule.metadata.risk_score || '50',
582
+ action: rule.metadata.action || 'WARN'
583
+ });
584
+ break;
585
+ }
586
+ } catch (e) {
587
+ // Skip invalid regex
588
+ }
589
+ }
590
+ }
591
+ }
592
+ } catch (e) {
593
+ // Skip invalid base64
594
+ }
595
+ }
929
596
  }
930
597
 
931
- // Garak deep scan (optional, requires garak Python package)
932
- if (deep_scan) {
933
- const garakFindings = runGarakProbes(normalizedPrompt);
934
- // Only add non-INFO findings to affect scoring
935
- for (const gf of garakFindings) {
936
- if (gf.severity !== 'INFO') {
937
- findings.push(gf);
598
+ // Multi-turn escalation detection: sliding-window risk accumulator
599
+ if (context?.previous_messages && Array.isArray(context.previous_messages) && context.previous_messages.length > 0) {
600
+ // Score each previous message for suspicious content
601
+ let prevTotalScore = 0;
602
+ let prevMessagesWithFindings = 0;
603
+
604
+ for (const prevMsg of context.previous_messages) {
605
+ let msgHasMatch = false;
606
+ for (const rule of allRules) {
607
+ for (const pattern of rule.patterns) {
608
+ try {
609
+ const regex = new RegExp(pattern, 'i');
610
+ if (regex.test(prevMsg)) {
611
+ prevTotalScore += parseInt(rule.metadata?.risk_score || '50') / 100;
612
+ msgHasMatch = true;
613
+ break;
614
+ }
615
+ } catch (e) {
616
+ // Skip invalid regex
617
+ }
618
+ }
938
619
  }
620
+ if (msgHasMatch) prevMessagesWithFindings++;
621
+ }
622
+
623
+ // Sliding window: sensitivity increases proportionally with prior findings
624
+ if (prevMessagesWithFindings > 0 && findings.length > 0) {
625
+ const escalationSeverity = prevMessagesWithFindings >= 2 ? 'ERROR' : 'WARNING';
626
+ const escalationScore = Math.min(90, 50 + prevMessagesWithFindings * 15);
627
+ const escalationAction = prevMessagesWithFindings >= 2 ? 'BLOCK' : 'WARN';
628
+
629
+ findings.push({
630
+ rule_id: 'multi-turn.escalation',
631
+ category: 'prompt-injection-multi-turn',
632
+ severity: escalationSeverity,
633
+ message: `Multi-turn escalation detected: ${prevMessagesWithFindings} prior message(s) contained suspicious patterns. Combined with current findings, this indicates a coordinated attack.`,
634
+ matched_text: `escalation across ${prevMessagesWithFindings + 1} conversation turns`,
635
+ confidence: prevMessagesWithFindings >= 2 ? 'HIGH' : 'MEDIUM',
636
+ risk_score: String(escalationScore),
637
+ action: escalationAction
638
+ });
639
+ }
640
+
641
+ // Standalone multi-turn escalation: 2+ prior suspicious turns even if current is clean
642
+ if (prevMessagesWithFindings >= 2 && findings.length === 0) {
643
+ const escalationScore = Math.min(75, 40 + prevMessagesWithFindings * 10);
644
+ findings.push({
645
+ rule_id: 'multi-turn.prior-context-escalation',
646
+ category: 'prompt-injection-multi-turn',
647
+ severity: 'WARNING',
648
+ message: `Elevated risk context: ${prevMessagesWithFindings} prior messages contained suspicious patterns. Current message appears benign but conversation context warrants caution.`,
649
+ matched_text: `${prevMessagesWithFindings} prior suspicious messages`,
650
+ confidence: 'MEDIUM',
651
+ risk_score: String(escalationScore),
652
+ action: 'WARN'
653
+ });
654
+ }
655
+ }
656
+
657
+ // Composite pattern detection — multiple low-severity indicators = escalated severity
658
+ if (findings.length >= 2) {
659
+ const categories = new Set(findings.map(f => f.category));
660
+ const indicators = {
661
+ hasRoleReassignment: findings.some(f =>
662
+ f.category === 'prompt-injection-jailbreak' || f.category === 'prompt-injection-context'
663
+ ),
664
+ hasEncodedContent: findings.some(f =>
665
+ f.category === 'prompt-injection-encoded' || f.category === 'obfuscation'
666
+ ),
667
+ hasUrgency: findings.some(f =>
668
+ f.category === 'social-engineering'
669
+ ),
670
+ hasExfiltration: findings.some(f =>
671
+ f.category === 'prompt-injection-output' || f.category === 'exfiltration'
672
+ ),
673
+ hasPrivilegeEscalation: findings.some(f =>
674
+ f.category === 'prompt-injection-privilege'
675
+ )
676
+ };
677
+
678
+ const activeIndicators = Object.values(indicators).filter(Boolean).length;
679
+
680
+ // 2+ distinct indicator types → composite attack (graduated risk_score)
681
+ if (activeIndicators >= 2) {
682
+ const riskScore = activeIndicators >= 3 ? 95 : 80;
683
+ findings.push({
684
+ rule_id: 'composite.multi-vector-attack',
685
+ category: 'prompt-injection-content',
686
+ severity: 'ERROR',
687
+ message: `Composite attack detected: ${activeIndicators} distinct attack vectors identified (${[...categories].join(', ')}). Multiple low-severity indicators combine to form a high-confidence threat.`,
688
+ matched_text: `${activeIndicators} attack vectors across ${findings.length} findings`,
689
+ confidence: 'HIGH',
690
+ risk_score: String(riskScore),
691
+ action: 'BLOCK'
692
+ });
693
+ } else if (categories.size >= 2) {
694
+ findings.push({
695
+ rule_id: 'composite.cross-category-escalation',
696
+ category: 'prompt-injection-content',
697
+ severity: 'WARNING',
698
+ message: `Cross-category escalation: findings span ${categories.size} categories (${[...categories].join(', ')}). Review for coordinated attack attempt.`,
699
+ matched_text: `${categories.size} categories across ${findings.length} findings`,
700
+ confidence: 'MEDIUM',
701
+ risk_score: '70',
702
+ action: 'WARN'
703
+ });
939
704
  }
940
705
  }
941
706