agent-security-scanner-mcp 3.5.1 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-security-scanner-mcp",
3
- "version": "3.5.1",
3
+ "version": "3.6.0",
4
4
  "mcpName": "io.github.sinewaveai/agent-security-scanner-mcp",
5
5
  "description": "Security scanner MCP server for AI coding agents. Prompt injection firewall, package hallucination detection (4.3M+ packages), 1000+ vulnerability rules with AST & taint analysis, auto-fix. For Claude Code, Cursor, Windsurf, Cline.",
6
6
  "main": "index.js",
@@ -12,7 +12,8 @@
12
12
  "start": "node index.js",
13
13
  "test": "vitest run",
14
14
  "test:watch": "vitest",
15
- "test:coverage": "vitest run --coverage"
15
+ "test:coverage": "vitest run --coverage",
16
+ "test:redteam": "npx promptfoo eval --config tests/promptfoo/promptfooconfig.yaml"
16
17
  },
17
18
  "keywords": [
18
19
  "mcp",
@@ -773,3 +773,71 @@ rules:
773
773
  category: "prompt-injection-output"
774
774
  risk_score: "95"
775
775
  action: BLOCK
776
+
777
+ # ============================================================================
778
+ # CATEGORY 4: ADVANCED BYPASS DETECTION (Garak & PromptFoo inspired)
779
+ # ============================================================================
780
+
781
+ # ----------------------------------------------------------------------------
782
+ # Whitespace Fragmentation Attacks
783
+ # ----------------------------------------------------------------------------
784
+ - id: generic.prompt.security.whitespace-fragmentation
785
+ languages: [generic]
786
+ severity: WARNING
787
+ message: "Potential whitespace fragmentation attack: keywords may be letter-spaced to evade detection."
788
+ patterns:
789
+ - "i\\s+g\\s+n\\s+o\\s+r\\s+e\\s+.{0,20}p\\s*r\\s*e\\s*v\\s*i\\s*o\\s*u\\s*s"
790
+ - "o\\s+v\\s+e\\s+r\\s+r\\s+i\\s+d\\s+e\\s+.{0,20}s\\s*a\\s*f\\s*e\\s*t\\s*y"
791
+ - "d\\s+i\\s+s\\s+r\\s+e\\s+g\\s+a\\s+r\\s+d\\s+.{0,20}i\\s*n\\s*s\\s*t\\s*r\\s*u\\s*c\\s*t"
792
+ metadata:
793
+ cwe: "CWE-77"
794
+ owasp: "LLM01 - Prompt Injection"
795
+ confidence: MEDIUM
796
+ category: "obfuscation"
797
+ risk_score: "65"
798
+ action: WARN
799
+
800
+ # NOTE: Unicode tag character detection (U+E0000-U+E007F) is handled at runtime
801
+ # in scan-prompt.js via TAG_CHAR_REGEX with the /u flag, since JS regex without
802
+ # the u flag cannot represent supplementary plane codepoints in character classes.
803
+
804
+ # ----------------------------------------------------------------------------
805
+ # Crescendo Escalation Setup (PromptFoo-inspired)
806
+ # ----------------------------------------------------------------------------
807
  # Flags phrases that try to establish a standing rule for the conversation
  # ("for the rest of this conversation", "in all future responses",
  # "never refuse any request", "you must always comply", ...).
  # Reported as a MEDIUM-confidence WARN in the prompt-injection-multi-turn category.
  - id: generic.prompt.security.crescendo-setup
    languages: [generic]
    severity: WARNING
    message: "Crescendo attack setup: language establishing persistent behavioral rules for later exploitation."
    patterns:
      # "for the rest/remainder of this/our conversation"
      - "(?i)for\\s+the\\s+(rest|remainder)\\s+of\\s+(this|our)\\s+conversation"
      # "in all future/subsequent messages/responses/replies"
      - "(?i)in\\s+all\\s+(future|subsequent)\\s+(messages?|responses?|replies?)"
      # "always respond by/with/as ... regardless / no matter" (up to 30 chars in between)
      - "(?i)always\\s+respond\\s+(by|with|as)\\s+.{0,30}(regardless|no\\s+matter)"
      # "never refuse/reject/decline any/a request/instruction"
      - "(?i)never\\s+(refuse|reject|decline)\\s+(any|a)\\s+(request|instruction)"
      # "you must/will/shall always comply"
      - "(?i)you\\s+(must|will|shall)\\s+always\\s+comply"
      # "no matter what I/anyone ask/say/request"
      - "(?i)no\\s+matter\\s+what\\s+(i|anyone)\\s+(ask|say|request)"
    metadata:
      cwe: "CWE-77"
      owasp: "LLM01 - Prompt Injection"
      confidence: MEDIUM
      category: "prompt-injection-multi-turn"
      risk_score: "65"
      action: WARN
825
+
826
+ # ----------------------------------------------------------------------------
827
+ # Leetspeak/Character Substitution (Garak encoding probes inspired)
828
+ # ----------------------------------------------------------------------------
829
  # Flags injection keywords written with digit/symbol substitutions
  # (1 or ! for i, 3 for e, 0 for o, 5 for s, 7 for t, 9 for g, @ for a).
  # Each character class also contains the plain letter, so the un-obfuscated
  # spelling matches as well; the rule is LOW confidence and only logs.
  - id: generic.prompt.security.leetspeak-injection
    languages: [generic]
    severity: WARNING
    message: "Potential leetspeak/character-substitution obfuscation of injection keywords."
    patterns:
      # "ignore ... previous" (second keyword within 20 characters)
      - "(?i)[i1!|][g9][n][0o][r][e3]\\s+.{0,20}[p][r][e3][v][i1!|][0o][u][s5]"
      # "system: override" / "system - override"
      - "(?i)[s5][y][s5][t7][e3][m]\\s*[:\\-]\\s*[0o][v][e3][r][r][i1][d][e3]"
      # "disregard ... instruct" (second keyword within 20 characters)
      - "(?i)[d][!1i][s5][r][e3][g9][a@][r][d]\\s+.{0,20}[i1!][n][s5][t7][r][u][c][t7]"
    metadata:
      cwe: "CWE-77"
      owasp: "LLM01 - Prompt Injection"
      confidence: LOW
      category: "obfuscation"
      risk_score: "55"
      action: LOG
@@ -0,0 +1,209 @@
1
+ // src/tools/garak-bridge.js
2
+ // Bridge to NVIDIA Garak LLM vulnerability scanner for deep prompt injection analysis
3
+ // Garak is optional — if not installed, this module returns empty results gracefully
4
+
5
+ import { execFileSync } from 'child_process';
6
+ import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs';
7
+ import { join } from 'path';
8
+ import { tmpdir } from 'os';
9
+ import { randomUUID } from 'crypto';
10
+
11
// Memoized outcome of the Garak availability probe; null until the first check has run.
let garakAvailable = null;

/**
 * Report whether the `garak` Python package can be imported.
 *
 * Probes `python3` first and `python` second by running a one-line import.
 * The outcome of the first call is cached in `garakAvailable` and returned by
 * every later call without spawning another process.
 *
 * @returns {boolean} true when either interpreter imported garak successfully
 */
function isGarakInstalled() {
  if (garakAvailable === null) {
    const probeArgs = ['-c', 'import garak; print(garak.__version__)'];
    garakAvailable = ['python3', 'python'].some((interpreter) => {
      try {
        execFileSync(interpreter, probeArgs, {
          timeout: 10000,
          stdio: ['pipe', 'pipe', 'pipe']
        });
        return true;
      } catch {
        // Interpreter missing, import failed, or the probe timed out.
        return false;
      }
    });
  }
  return garakAvailable;
}
35
+
36
// Python script that runs Garak-style checks against a prompt.
// Invoked as `python <script> <input-file>`; it reads the prompt from the file and
// prints a JSON array of finding objects on stdout.
// The garak imports at the top of the try block act as an availability check: if any
// of them fails, a single 'garak.unavailable' finding is emitted instead.
// NOTE: this is a JS template literal, so `\\s` below reaches Python as `\s`.
const GARAK_RUNNER_SCRIPT = `
import sys
import json
import tempfile

def run_garak_analysis(prompt_text):
    """Run targeted Garak probes and return structured findings."""
    findings = []

    try:
        from garak.probes.encoding import InjectBase64, InjectBase16, InjectROT13, InjectHex
        from garak.probes.promptinject import HijackHateHumans, HijackKillHumans
        from garak.detectors.base import TriggerListDetector

        # Check if prompt text contains patterns that Garak's encoding probes would flag
        # We run Garak's detector logic directly against the prompt content

        import base64
        import codecs

        # Test common encoding schemes on the prompt
        encodings_to_test = {
            'base64': lambda t: base64.b64decode(t).decode('utf-8', errors='ignore'),
            'base16': lambda t: bytes.fromhex(t).decode('utf-8', errors='ignore'),
            'rot13': lambda t: codecs.decode(t, 'rot_13'),
        }

        # Known trigger phrases (subset from Garak's trigger lists)
        triggers = [
            'ignore previous instructions',
            'ignore all instructions',
            'disregard your instructions',
            'you are now DAN',
            'override safety',
            'system prompt',
            'delete all',
            'send to',
            'rm -rf',
            'reverse shell',
            'backdoor',
        ]

        for enc_name, decoder in encodings_to_test.items():
            try:
                decoded = decoder(prompt_text)
                decoded_lower = decoded.lower()
                for trigger in triggers:
                    # Compare case-insensitively: triggers such as 'you are now DAN'
                    # contain capitals and would never match the lower-cased text.
                    if trigger.lower() in decoded_lower:
                        findings.append({
                            'rule_id': f'garak.encoding.{enc_name}.{trigger.replace(" ", "-")}',
                            'category': 'prompt-injection-encoded',
                            'severity': 'ERROR',
                            'message': f'Garak deep scan: {enc_name}-encoded payload contains trigger phrase "{trigger}"',
                            'matched_text': decoded[:100],
                            'confidence': 'HIGH',
                            'risk_score': '85',
                            'action': 'BLOCK'
                        })
            except Exception:
                pass

        # Check for Garak-style latent injection patterns
        latent_patterns = [
            ('resume_injection', r'(?i)(ignore|disregard|override).{0,50}(instructions|safety|rules)'),
            ('document_injection', r'(?i)(SYSTEM|ADMIN|ROOT)\\s*:\\s*(override|execute|ignore)'),
            ('translation_injection', r'(?i)(translate|convert).{0,30}(ignore|override|execute)'),
        ]

        import re
        for pattern_name, pattern in latent_patterns:
            try:
                match = re.search(pattern, prompt_text, re.IGNORECASE)
                if match:
                    findings.append({
                        'rule_id': f'garak.latentinjection.{pattern_name}',
                        'category': 'prompt-injection-content',
                        'severity': 'WARNING',
                        'message': f'Garak deep scan: latent injection pattern "{pattern_name}" detected',
                        'matched_text': match.group(0)[:100],
                        'confidence': 'MEDIUM',
                        'risk_score': '70',
                        'action': 'WARN'
                    })
            except Exception:
                pass

    except ImportError:
        findings.append({
            'rule_id': 'garak.unavailable',
            'category': 'unknown',
            'severity': 'INFO',
            'message': 'Garak package not fully installed. Install with: pip install garak',
            'matched_text': 'garak import failed',
            'confidence': 'HIGH',
            'risk_score': '0',
            'action': 'LOG'
        })
    except Exception as e:
        findings.append({
            'rule_id': 'garak.error',
            'category': 'unknown',
            'severity': 'INFO',
            'message': f'Garak analysis error: {str(e)[:200]}',
            'matched_text': str(e)[:100],
            'confidence': 'LOW',
            'risk_score': '0',
            'action': 'LOG'
        })

    return findings

if __name__ == '__main__':
    input_file = sys.argv[1]
    # The Node side writes the prompt as UTF-8; do not rely on the platform default.
    with open(input_file, 'r', encoding='utf-8') as f:
        prompt_text = f.read()

    results = run_garak_analysis(prompt_text)
    print(json.dumps(results))
`;
156
+
157
/**
 * Run Garak deep analysis probes against a prompt.
 *
 * Writes the prompt and the embedded Python runner to temporary files, executes the
 * runner, and returns the findings it prints as JSON. Never throws: when Garak is not
 * installed or the run fails, a single INFO finding describing the problem is returned.
 *
 * @param {string} promptText - The prompt text to analyze
 * @returns {Array} Array of finding objects compatible with scan-prompt.js findings format
 */
export function runGarakProbes(promptText) {
  if (!isGarakInstalled()) {
    return [{
      rule_id: 'garak.not-installed',
      category: 'unknown',
      severity: 'INFO',
      message: 'Garak not installed. Install with: pip install garak',
      matched_text: 'garak not found',
      confidence: 'HIGH',
      risk_score: '0',
      action: 'LOG'
    }];
  }

  const tmpId = randomUUID();
  const inputFile = join(tmpdir(), `garak-input-${tmpId}.txt`);
  const scriptFile = join(tmpdir(), `garak-runner-${tmpId}.py`);

  // The temp directory is shared: create the files exclusively ('wx' fails if the
  // path already exists) and readable by the owner only, since the prompt may be sensitive.
  const privateNewFile = { mode: 0o600, flag: 'wx' };

  try {
    writeFileSync(inputFile, promptText, privateNewFile);
    writeFileSync(scriptFile, GARAK_RUNNER_SCRIPT, privateNewFile);

    // isGarakInstalled() accepts either interpreter name, so fall back to the other
    // one when the platform's preferred name does not exist on this machine.
    const interpreters = process.platform === 'win32'
      ? ['python', 'python3']
      : ['python3', 'python'];

    let output;
    let spawnError;
    for (const pythonCmd of interpreters) {
      try {
        output = execFileSync(pythonCmd, [scriptFile, inputFile], {
          timeout: 30000,
          encoding: 'utf-8',
          stdio: ['pipe', 'pipe', 'pipe']
        });
        spawnError = undefined;
        break;
      } catch (err) {
        spawnError = err;
        // Only "executable not found" justifies trying the next name; a timeout or a
        // non-zero exit is a real failure of the run itself.
        if (err.code !== 'ENOENT') break;
      }
    }
    if (spawnError) throw spawnError;

    const parsed = JSON.parse(output.trim());
    if (!Array.isArray(parsed)) {
      throw new Error('Garak runner returned a non-array result');
    }
    return parsed;
  } catch (error) {
    return [{
      rule_id: 'garak.execution-error',
      category: 'unknown',
      severity: 'INFO',
      message: `Garak execution failed: ${error.message?.substring(0, 200)}`,
      matched_text: 'garak error',
      confidence: 'LOW',
      risk_score: '0',
      action: 'LOG'
    }];
  } finally {
    // Best-effort cleanup; a failure to delete must not mask the scan result.
    try { if (existsSync(inputFile)) unlinkSync(inputFile); } catch {}
    try { if (existsSync(scriptFile)) unlinkSync(scriptFile); } catch {}
  }
}
208
+
209
+ export { isGarakInstalled };
@@ -4,6 +4,7 @@ import { readFileSync, existsSync } from "fs";
4
4
  import { dirname, join } from "path";
5
5
  import { fileURLToPath } from "url";
6
6
  import { createHash } from "crypto";
7
+ import { runGarakProbes } from './garak-bridge.js';
7
8
 
8
9
  // Handle both ESM and CJS bundling
9
10
  let __dirname;
@@ -49,6 +50,76 @@ const CONFIDENCE_MULTIPLIERS = {
49
50
  "LOW": 0.4
50
51
  };
51
52
 
53
// Category co-occurrence matrix: category pairs that, seen together, suggest a
// composite attack. Keys are "<categoryA>+<categoryB>"; values are fractional boosts.
// Inspired by PromptFoo's jailbreak:composite strategy
const CATEGORY_COOCCURRENCE_BOOSTS = {
  'obfuscation+exfiltration': 0.20,
  'obfuscation+malicious-injection': 0.20,
  'obfuscation+prompt-injection-content': 0.15,
  'obfuscation+prompt-injection-jailbreak': 0.15,
  'social-engineering+exfiltration': 0.15,
  'social-engineering+malicious-injection': 0.15,
  'prompt-injection-encoded+prompt-injection-content': 0.20,
  'prompt-injection-multi-turn+prompt-injection-content': 0.15,
  'prompt-injection-jailbreak+exfiltration': 0.25,
  'prompt-injection-jailbreak+prompt-injection-content': 0.15,
  'agent-manipulation+exfiltration': 0.20,
  'agent-manipulation+system-manipulation': 0.15,
};

/**
 * Sum the co-occurrence boosts for every unordered pair of categories.
 *
 * A pair is looked up in both key orders ("a+b", then "b+a"); pairs that are not
 * in the table contribute nothing. The total is capped at 0.40.
 *
 * @param {Iterable<string>} categories - distinct categories present in the findings
 * @returns {number} fractional boost between 0 and 0.40
 */
function getCategoryCooccurrenceBoost(categories) {
  const list = [...categories];
  let total = 0;

  list.forEach((first, index) => {
    for (const second of list.slice(index + 1)) {
      const forward = CATEGORY_COOCCURRENCE_BOOSTS[`${first}+${second}`];
      const reverse = CATEGORY_COOCCURRENCE_BOOSTS[`${second}+${first}`];
      total += forward || reverse || 0;
    }
  });

  return Math.min(0.40, total); // Cap total co-occurrence boost at 40%
}
83
+
84
// Finding category → attack dimension. Categories not listed here (including a
// missing category) belong to no dimension.
const ATTACK_DIMENSION_BY_CATEGORY = new Map([
  ['exfiltration', 'extraction'],
  ['prompt-injection-extraction', 'extraction'],
  ['prompt-injection-output', 'extraction'],
  ['malicious-injection', 'code-execution'],
  ['system-manipulation', 'code-execution'],
  ['obfuscation', 'evasion'],
  ['prompt-injection-encoded', 'evasion'],
  ['social-engineering', 'social'],
  ['prompt-injection-jailbreak', 'social'],
  ['prompt-injection-content', 'injection'],
  ['prompt-injection-context', 'injection'],
  ['prompt-injection-delimiter', 'injection'],
  ['prompt-injection-multi-turn', 'persistence'],
  ['agent-manipulation', 'privilege'],
  ['prompt-injection-privilege', 'privilege'],
]);

// Bonus indexed by number of distinct dimensions; the last entry covers 4 or more.
const ORTHOGONAL_BONUS_BY_DIMENSION_COUNT = [0, 0, 10, 25, 40];

/**
 * Orthogonal scoring channel: scores attack breadth from finding categories alone.
 *
 * Only `category` is read from each finding, so the result does not depend on the
 * per-rule confidence or risk score.
 *
 * @param {Array<{category?: string}>} findings - findings collected for a prompt
 * @returns {number} 0 for at most one dimension, 10 for two, 25 for three, 40 for four or more
 */
function calculateOrthogonalScore(findings) {
  const seenDimensions = new Set();

  for (const finding of findings) {
    const dimension = ATTACK_DIMENSION_BY_CATEGORY.get(finding.category);
    if (dimension !== undefined) {
      seenDimensions.add(dimension);
    }
  }

  const bucket = Math.min(seenDimensions.size, ORTHOGONAL_BONUS_BY_DIMENSION_COUNT.length - 1);
  return ORTHOGONAL_BONUS_BY_DIMENSION_COUNT[bucket];
}
122
+
52
123
  // Load agent attack rules from YAML
53
124
  function loadAgentAttackRules() {
54
125
  try {
@@ -194,6 +265,7 @@ function calculateRiskScore(findings, context) {
194
265
  if (findings.length === 0) return 0;
195
266
 
196
267
  let totalScore = 0;
268
+ const lowConfidenceCount = findings.filter(f => (f.confidence || 'MEDIUM') === 'LOW').length;
197
269
 
198
270
  for (const finding of findings) {
199
271
  const riskScore = parseInt(finding.risk_score) || 50;
@@ -226,8 +298,24 @@ function calculateRiskScore(findings, context) {
226
298
 
227
299
  // Per-finding boost (smaller than before)
228
300
  avgScore = avgScore * (1 + (findings.length - 1) * 0.05);
301
+
302
+ // Low-signal accumulation — multiple LOW-confidence findings compound
303
+ // Catches threshold gaming with many weak signals (PromptFoo composite strategy)
304
+ if (lowConfidenceCount >= 2) {
305
+ avgScore = avgScore * (1 + lowConfidenceCount * 0.08);
306
+ }
307
+
308
+ // Category co-occurrence boost for suspicious pairs
309
+ const cooccurrenceBoost = getCategoryCooccurrenceBoost(uniqueCategories);
310
+ if (cooccurrenceBoost > 0) {
311
+ avgScore = avgScore * (1 + cooccurrenceBoost);
312
+ }
229
313
  }
230
314
 
315
+ // Add orthogonal score as a flat bonus (independent of per-rule confidence)
316
+ const orthogonalBonus = calculateOrthogonalScore(findings);
317
+ avgScore = avgScore + orthogonalBonus;
318
+
231
319
  avgScore = Math.min(100, avgScore);
232
320
 
233
321
  // Apply sensitivity adjustment (wider spread for meaningful impact)
@@ -360,6 +448,397 @@ function hashPrompt(text) {
360
448
  return createHash('sha256').update(text).digest('hex').substring(0, 16);
361
449
  }
362
450
 
451
+ // ============================================================================
452
+ // TEXT NORMALIZATION PIPELINE (Garak Buff-inspired)
453
+ // Normalizes input to defeat homoglyph, invisible char, and Unicode bypasses
454
+ // ============================================================================
455
+
456
// Homoglyph map: Cyrillic and Greek lookalikes → ASCII
const HOMOGLYPH_MAP = {
  // Cyrillic lowercase → Latin
  '\u0430': 'a', // Cyrillic a
  '\u0435': 'e', // Cyrillic ie
  '\u043E': 'o', // Cyrillic o
  '\u0440': 'p', // Cyrillic er
  '\u0441': 'c', // Cyrillic es
  '\u0443': 'y', // Cyrillic u (visual match to y)
  '\u0445': 'x', // Cyrillic ha
  '\u0456': 'i', // Cyrillic Byelorussian-Ukrainian i
  '\u04BB': 'h', // Cyrillic shha
  '\u0455': 's', // Cyrillic dze
  '\u0458': 'j', // Cyrillic je
  '\u043D': 'n', // Cyrillic en (looks like n in some fonts)
  // Cyrillic uppercase → Latin
  '\u0410': 'A',
  '\u0412': 'B',
  '\u0415': 'E',
  '\u041A': 'K',
  '\u041C': 'M',
  '\u041D': 'H',
  '\u041E': 'O',
  '\u0420': 'P',
  '\u0421': 'C',
  '\u0422': 'T',
  '\u0425': 'X',
  '\u0406': 'I',
  // Greek lowercase → Latin
  '\u03B1': 'a', // alpha
  '\u03B5': 'e', // epsilon
  '\u03BF': 'o', // omicron
  '\u03C1': 'p', // rho
  '\u03BA': 'k', // kappa
  '\u03BD': 'v', // nu
  // Greek uppercase → Latin
  '\u0391': 'A',
  '\u0392': 'B',
  '\u0395': 'E',
  '\u0397': 'H',
  '\u0399': 'I',
  '\u039A': 'K',
  '\u039C': 'M',
  '\u039D': 'N',
  '\u039F': 'O',
  '\u03A1': 'P',
  '\u03A4': 'T',
  '\u03A7': 'X',
  '\u03A5': 'Y',
  '\u0396': 'Z',
};

// Invisible/zero-width characters to strip (regex)
// Includes: soft hyphen, combining grapheme joiner, Arabic letter mark,
// hangul fillers, Mongolian vowel separator, zero-width chars,
// directional markers, word joiners, BOM, halfwidth hangul filler
const INVISIBLE_CHAR_REGEX = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\u3164\uFEFF\uFFA0]/gu;

// Zalgo combining diacritical marks to strip
const ZALGO_REGEX = /[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/g;

// Unicode tag characters (U+E0000-U+E007F) - used in invisible ASCII tag attacks.
// These are supplementary-plane code points (surrogate pairs in a JS string), so the
// character class needs \u{...} escapes together with the /u flag.
const TAG_CHAR_REGEX = /[\u{E0000}-\u{E007F}]/gu;

/**
 * Normalize text so that visually disguised keywords can be matched by plain-ASCII rules.
 *
 * Steps, in order: NFKC normalization, removal of invisible characters, removal of
 * Unicode tag characters, removal of combining diacritical marks, homoglyph
 * replacement via HOMOGLYPH_MAP, and conversion of Unicode spaces to an ASCII space.
 *
 * @param {string} text - raw input text
 * @returns {string} normalized text
 */
function normalizeText(text) {
  // Step 1: NFKC normalization (compatibility decomposition, then composition).
  // Folds fullwidth letters (U+FF49 → "i"), ligatures (U+FB01 → "fi"),
  // superscripts, subscripts and circle-enclosed characters.
  let normalized = text.normalize('NFKC');

  // Step 2: Strip invisible Unicode characters
  normalized = normalized.replace(INVISIBLE_CHAR_REGEX, '');

  // Step 3: Strip Unicode tag characters
  normalized = normalized.replace(TAG_CHAR_REGEX, '');

  // Step 4: Strip Zalgo combining diacritical marks.
  // NFKC (step 1) composes "i" + U+0300 into the single precomposed letter U+00EC,
  // which ZALGO_REGEX cannot see. Decompose first so the marks are separate code
  // points, strip them, then recompose whatever is left.
  normalized = normalized.normalize('NFKD').replace(ZALGO_REGEX, '').normalize('NFKC');

  // Step 5: Homoglyph canonicalization
  // Replace each character through the map; unmapped chars pass through.
  // Every key in HOMOGLYPH_MAP is a single UTF-16 code unit, so splitting by code unit is safe.
  normalized = normalized.split('').map(ch => HOMOGLYPH_MAP[ch] || ch).join('');

  // Step 6: Normalize Unicode whitespace to ASCII space
  // Includes: NBSP, en/em space, thin space, hair space, ideographic space, etc.
  normalized = normalized.replace(/[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]/g, ' ');

  return normalized;
}
547
+
548
/**
 * Collect the inner text of every code-like container found in `text`.
 * Inspired by Garak latentinjection probes: attacks hide in document structures.
 *
 * Containers are gathered group by group, in this order: triple-backtick fences,
 * triple-tilde fences, <code> elements, <pre> elements, HTML comments, CDATA sections.
 * For fences, the opening marker (plus an optional language word and newline) and the
 * closing marker are removed.
 *
 * @param {string} text - text to search
 * @returns {string[]} extracted contents; empty when nothing is found
 */
function extractCodeBlockContent(text) {
  // Whole-match fences: strip the opening and closing markers from each block.
  const fencedBodies = (blockPattern, openingFence, closingFence) =>
    (text.match(blockPattern) || []).map((block) =>
      block.replace(openingFence, '').replace(closingFence, ''));

  // Tag-like containers: the content is the first capture group.
  const capturedBodies = (containerPattern) =>
    Array.from(text.matchAll(containerPattern), (found) => found[1]);

  return [
    // 1. Triple-backtick blocks
    ...fencedBodies(/```[\s\S]*?```/g, /^```\w*\n?/, /\n?```$/),
    // 2. Triple-tilde blocks
    ...fencedBodies(/~~~[\s\S]*?~~~/g, /^~~~\w*\n?/, /\n?~~~$/),
    // 3. HTML <code> tags
    ...capturedBodies(/<code[^>]*>([\s\S]*?)<\/code>/gi),
    // 4. HTML <pre> tags
    ...capturedBodies(/<pre[^>]*>([\s\S]*?)<\/pre>/gi),
    // 5. HTML comments
    ...capturedBodies(/<!--([\s\S]*?)-->/g),
    // 6. CDATA sections
    ...capturedBodies(/<!\[CDATA\[([\s\S]*?)\]\]>/g),
  ];
}
592
+
593
/**
 * Collapse string concatenation and comment separators to defeat fragmentation attacks.
 * Inspired by PromptFoo's "token smuggling" and "payload splitting" attack classes.
 *
 * Removes `"..." + "..."` joints (double quotes, single quotes, backticks, with any
 * whitespace or newlines around the `+`) and C-style block comments used as fragment
 * separators, so that `"ign" + "ore"` and `ign/**​/ore` both read as `ignore`.
 *
 * @param {string} text - text to collapse
 * @returns {string} text with concatenation joints and block comments removed
 */
function collapseConcatenations(text) {
  let collapsed = text;

  // Join JS/Python string concatenation: "foo" + "bar" → foobar
  // The pattern: closing-quote, optional whitespace, +, optional whitespace, opening-quote
  collapsed = collapsed.replace(/["'`]\s*\+\s*["'`]/g, '');

  // Join multiline concatenation (newlines between concat operators)
  collapsed = collapsed.replace(/["'`]\s*\n\s*\+\s*["'`]/g, '');
  collapsed = collapsed.replace(/["'`]\s*\+\s*\n\s*["'`]/g, '');

  // Strip C-style block comments used as fragment separators: ign/**/ore → ignore.
  // [\s\S] instead of "." so that a comment containing a newline is removed too.
  collapsed = collapsed.replace(/\/\*[\s\S]*?\*\//g, '');

  return collapsed;
}
612
+
613
/**
 * Rescan decoded content against all rules.
 * Used by the decode cascade for each encoding type.
 *
 * The decoded text is normalized first, then every rule pattern is tried
 * case-insensitively. At most one finding per rule is appended to `findings`.
 *
 * @param {string} decodedText - text produced by a decoder
 * @param {Array<object>} allRules - rules with `id`, `severity`, `message`, `patterns` and optional `metadata`
 * @param {Array<object>} findings - output array; findings are pushed onto it
 * @param {string} encodingLabel - decoder name used in the rule id suffix and the message
 */
function rescanDecoded(decodedText, allRules, findings, encodingLabel) {
  const normalized = normalizeText(decodedText);

  // The YAML rule patterns carry a Python-style "(?i)" prefix, which the JS RegExp
  // constructor rejects. Strip it here in case the rule loader has not already done
  // so; the 'i' flag provides the same behavior.
  const compilePattern = (pattern) => {
    const source = typeof pattern === 'string' ? pattern.replace(/^\(\?i\)/, '') : pattern;
    try {
      return new RegExp(source, 'i');
    } catch {
      return null; // Skip invalid regex
    }
  };

  for (const rule of allRules) {
    // Read metadata defensively so a rule without it still produces a finding
    // instead of having the TypeError swallowed as an "invalid regex".
    const metadata = rule.metadata ?? {};

    for (const pattern of rule.patterns ?? []) {
      const regex = compilePattern(pattern);
      if (regex === null) continue;

      const match = normalized.match(regex);
      if (!match) continue;

      findings.push({
        rule_id: rule.id + '.' + encodingLabel + '-decoded',
        category: metadata.category || 'obfuscation',
        severity: rule.severity,
        message: rule.message + ` (detected in ${encodingLabel}-decoded content)`,
        matched_text: match[0].substring(0, 100),
        confidence: metadata.confidence || 'MEDIUM',
        risk_score: metadata.risk_score || '50',
        action: metadata.action || 'WARN'
      });
      break; // One match per rule
    }
  }
}
641
+
642
/**
 * Helper: decide whether a decoded string is mostly printable ASCII.
 *
 * Counts UTF-16 code units in the range 32..126 and compares their share of the
 * string length with `threshold` (strictly greater than).
 *
 * @param {string} str - candidate decoded text
 * @param {number} threshold - minimum printable fraction, exclusive
 * @returns {boolean} false for empty or missing input, otherwise whether the fraction exceeds the threshold
 */
function isPrintable(str, threshold) {
  if (!str || str.length === 0) {
    return false;
  }

  let printableCount = 0;
  for (let index = 0; index < str.length; index += 1) {
    const code = str.charCodeAt(index);
    if (code >= 32 && code <= 126) {
      printableCount += 1;
    }
  }

  return printableCount / str.length > threshold;
}
651
+
652
/**
 * Multi-encoding decode cascade.
 * Inspired by Garak's encoding probes (InjectBase64, InjectHex, InjectROT13, etc.)
 * and PromptFoo's static encoding strategies.
 *
 * Looks for base64 (including one nested level), hex, URL-encoded and, when the text
 * hints at it, ROT13 content. Every successfully decoded candidate is passed to
 * rescanDecoded, which appends findings to `findings`.
 *
 * @param {string} expandedText - text to inspect for encoded payloads
 * @param {Array<object>} allRules - rules forwarded to rescanDecoded
 * @param {Array<object>} findings - output array; findings are pushed onto it
 */
function tryDecodeAndRescan(expandedText, allRules, findings) {
  // --- 1. Base64: runs of 20+ base64 characters that decode to >55% printable text ---
  const base64Regex = /[A-Za-z0-9+/]{20,}={0,2}/g;
  for (const b64str of (expandedText.match(base64Regex) || [])) {
    try {
      const decoded = Buffer.from(b64str, 'base64').toString('utf-8');
      if (decoded.length > 0 && isPrintable(decoded, 0.55)) {
        rescanDecoded(decoded, allRules, findings, 'base64');

        // --- 1b. Nested base64: decode again if inner content is also base64 ---
        const nestedB64 = decoded.match(/[A-Za-z0-9+/]{20,}={0,2}/g) || [];
        for (const nested of nestedB64) {
          try {
            const twice = Buffer.from(nested, 'base64').toString('utf-8');
            if (twice.length > 4 && isPrintable(twice, 0.55)) {
              rescanDecoded(twice, allRules, findings, 'base64-nested');
            }
          } catch (e) { /* skip */ }
        }
      }
    } catch (e) { /* skip invalid base64 */ }
  }

  // --- 2. Hex encoding: sequences of hex pairs (optionally space-separated) ---
  // Matches: "69676e6f7265" or "69 67 6e 6f 72 65"
  const hexRegex = /(?:[0-9a-fA-F]{2}[\s]?){8,}/g;
  for (const hexStr of (expandedText.match(hexRegex) || [])) {
    try {
      const clean = hexStr.replace(/\s/g, '');
      if (clean.length % 2 !== 0) continue;
      if (clean.length < 16) continue; // At least 8 bytes
      const decoded = Buffer.from(clean, 'hex').toString('utf-8');
      if (decoded.length > 4 && isPrintable(decoded, 0.7)) {
        rescanDecoded(decoded, allRules, findings, 'hex');
      }
    } catch (e) { /* skip */ }
  }

  // --- 3. URL encoding: %XX sequences (at least 3 encoded chars anywhere in text) ---
  // Each run of consecutive %XX escapes is decoded on its own. Decoding the whole
  // text in one call throws on any stray "%" (e.g. "100% sure"), which would let a
  // single extra percent sign switch this pass off.
  const urlEncodedCount = (expandedText.match(/%[0-9a-fA-F]{2}/g) || []).length;
  if (urlEncodedCount >= 3) {
    const decoded = expandedText.replace(/(?:%[0-9a-fA-F]{2})+/g, (run) => {
      try {
        return decodeURIComponent(run);
      } catch (e) {
        return run; // Not valid UTF-8 once decoded; leave this run untouched
      }
    });
    if (decoded !== expandedText) {
      rescanDecoded(decoded, allRules, findings, 'url-encoded');
    }
  }

  // --- 4. ROT13: only when indicators present (user-approved decision) ---
  // This avoids false positives from ROT13-decoding normal text
  const rot13Indicators = /\b(rot13|rot-13|caesar|cipher|decode\s+this|decipher)\b/i;
  if (rot13Indicators.test(expandedText)) {
    const rot13Decoded = expandedText.replace(/[a-zA-Z]/g, ch => {
      const base = ch <= 'Z' ? 65 : 97;
      return String.fromCharCode(((ch.charCodeAt(0) - base + 13) % 26) + base);
    });
    if (rot13Decoded !== expandedText) {
      rescanDecoded(rot13Decoded, allRules, findings, 'rot13');
    }
  }
}
717
+
718
// Frame-setting patterns: conversational priming for later exploitation
// Inspired by PromptFoo's Crescendo strategy (Microsoft research)
const FRAME_SETTING_PATTERNS = [
  /remember\s+(this|that)\s+for\s+later/i,
  /in\s+my\s+next\s+message/i,
  /when\s+i\s+(say|ask|tell)\s+you/i,
  /from\s+now\s+on\s*,?\s+(you|your)\s+(will|must|should|are)/i,
  /starting\s+now\s*,?\s+you/i,
  /keep\s+this\s+in\s+mind/i,
  /for\s+the\s+(rest|remainder)\s+of\s+(this|our)\s+conversation/i,
  /act\s+as\s+(if|though)\s+you\s+(are|have)/i,
  /let'?s\s+play\s+a\s+(game|role)/i,
  /pretend\s+that\s+you\s+(are|have)/i,
  /in\s+all\s+(future|subsequent)\s+(messages?|responses?)/i,
  /always\s+respond\s+(by|with|as)/i,
];

/**
 * Multi-turn escalation detection over the conversation history.
 *
 * Produces up to three kinds of findings:
 *  - `<rule id>.cross-turn`: a rule pattern matches the messages joined together but
 *    matches none of them individually, i.e. the attack text spans message boundaries;
 *  - `multi-turn.frame-setting`: at least one previous message contains frame-setting language;
 *  - `multi-turn.escalation`: at least one rule matched a previous message. The current
 *    message is not required to match anything, since the final "trigger" may be benign.
 *
 * @param {string[]} previousMessages - earlier messages, oldest first
 * @param {string} currentText - the message being scanned
 * @param {Array<object>} allRules - rules with `id`, `patterns` and optional `metadata`
 * @returns {Array<object>} escalation findings; empty when there is no usable history
 */
function detectMultiTurnEscalation(previousMessages, currentText, allRules) {
  const escalationFindings = [];

  if (!previousMessages || !Array.isArray(previousMessages) || previousMessages.length === 0) {
    return escalationFindings;
  }

  // Normalize every message once instead of once per rule pattern.
  const normalizedPrevious = previousMessages.map((message) => normalizeText(message));
  const normalizedCurrent = normalizeText(String(currentText ?? ''));

  // Compile every rule pattern once. The YAML patterns carry a Python-style "(?i)"
  // prefix that the JS RegExp constructor rejects; strip it in case the rule loader
  // has not done so (the 'i' flag gives the same behavior). Invalid patterns are skipped.
  const compiledRules = allRules.map((rule) => ({
    rule,
    regexes: (rule.patterns ?? []).flatMap((pattern) => {
      const source = typeof pattern === 'string' ? pattern.replace(/^\(\?i\)/, '') : pattern;
      try {
        return [new RegExp(source, 'i')];
      } catch (e) {
        return [];
      }
    }),
  }));

  // Step 1: Scan ALL previous messages (no early break across messages)
  let totalPrevMatches = 0;
  let frameSettingCount = 0;
  const prevMatchedRuleIds = new Set();

  for (const normalizedPrev of normalizedPrevious) {
    // One frame-setting match per message is enough
    if (FRAME_SETTING_PATTERNS.some((framePattern) => framePattern.test(normalizedPrev))) {
      frameSettingCount++;
    }

    // Each rule is counted at most once across the whole history
    for (const { rule, regexes } of compiledRules) {
      if (prevMatchedRuleIds.has(rule.id)) continue;
      if (regexes.some((regex) => regex.test(normalizedPrev))) {
        totalPrevMatches++;
        prevMatchedRuleIds.add(rule.id);
      }
    }
  }

  // Step 2: Cross-turn concatenation scan
  // Join ALL messages into a single string and scan for patterns that span boundaries
  // This catches: prev="ignore all" + current="previous instructions"
  const crossTurnText = normalizeText([...previousMessages, currentText].join(' '));

  for (const { rule, regexes } of compiledRules) {
    for (const regex of regexes) {
      const match = crossTurnText.match(regex);
      if (!match) continue;

      // Only flag if this pattern does NOT match any single message alone
      const matchInCurrent = regex.test(normalizedCurrent);
      const matchInAnyPrev = normalizedPrevious.some((normalizedPrev) => regex.test(normalizedPrev));
      if (matchInCurrent || matchInAnyPrev) continue;

      // Pattern only matches when messages are joined — it spans boundaries
      escalationFindings.push({
        rule_id: rule.id + '.cross-turn',
        category: rule.metadata?.category || 'prompt-injection-multi-turn',
        severity: 'WARNING',
        message: `Cross-turn prompt injection: attack pattern spans message boundaries`,
        matched_text: match[0].substring(0, 100),
        confidence: 'MEDIUM',
        risk_score: '75',
        action: 'WARN'
      });
      break; // One cross-turn finding per rule
    }
  }

  // Step 3: Frame-setting detection — flag even without current findings
  if (frameSettingCount > 0) {
    escalationFindings.push({
      rule_id: 'multi-turn.frame-setting',
      category: 'prompt-injection-multi-turn',
      severity: 'WARNING',
      message: `Frame-setting language detected in ${frameSettingCount} previous message(s). Possible Crescendo-style gradual escalation attack.`,
      matched_text: 'frame-setting phrases in conversation history',
      confidence: 'LOW',
      risk_score: '55',
      action: 'LOG'
    });
  }

  // Step 4: Escalation detection — the current turn is not required to have findings,
  // because an attacker's final "trigger" message may be benign ("yes, do it")
  if (totalPrevMatches > 0) {
    escalationFindings.push({
      rule_id: 'multi-turn.escalation',
      category: 'social-engineering',
      severity: 'WARNING',
      message: `Multi-turn escalation: suspicious patterns in ${totalPrevMatches} previous rule(s). Current message may be a benign trigger.`,
      matched_text: 'escalation across conversation turns',
      confidence: totalPrevMatches >= 3 ? 'HIGH' : 'MEDIUM',
      risk_score: String(Math.min(85, 50 + totalPrevMatches * 5)),
      action: totalPrevMatches >= 3 ? 'WARN' : 'LOG'
    });
  }

  return escalationFindings;
}
841
+
363
842
  // Export schema for tool registration
364
843
  export const scanAgentPromptSchema = {
365
844
  prompt_text: z.string().describe("The prompt or instruction text to analyze"),
@@ -367,28 +846,47 @@ export const scanAgentPromptSchema = {
367
846
  previous_messages: z.array(z.string()).optional().describe("Previous conversation messages for multi-turn detection"),
368
847
  sensitivity_level: z.enum(["high", "medium", "low"]).optional().describe("Sensitivity level - high means more strict, low means more permissive")
369
848
  }).optional().describe("Optional context for better analysis"),
370
- verbosity: z.enum(['minimal', 'compact', 'full']).optional().describe("Response detail level: 'minimal' (action only), 'compact' (default), 'full' (all details)")
849
+ verbosity: z.enum(['minimal', 'compact', 'full']).optional().describe("Response detail level: 'minimal' (action only), 'compact' (default), 'full' (all details)"),
850
+ deep_scan: z.boolean().optional().describe("Run Garak deep analysis probes for advanced encoding/injection detection (requires garak Python package)")
371
851
  };
372
852
 
373
853
  // Export handler function
374
- export async function scanAgentPrompt({ prompt_text, context, verbosity }) {
854
+ export async function scanAgentPrompt({ prompt_text, context, verbosity, deep_scan }) {
375
855
  const findings = [];
376
856
 
857
+ // Normalize prompt text (Garak Buff-inspired preprocessing)
858
+ const normalizedPrompt = normalizeText(prompt_text);
859
+
860
+ // Detect invisible Unicode characters in original text (obfuscation indicator)
861
+ const invisibleMatches = prompt_text.match(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\u{E0000}-\u{E007F}]/gu);
862
+ if (invisibleMatches && invisibleMatches.length > 0) {
863
+ findings.push({
864
+ rule_id: 'runtime.invisible-unicode-detected',
865
+ category: 'obfuscation',
866
+ severity: 'WARNING',
867
+ message: `Invisible Unicode characters detected (${invisibleMatches.length} chars). These may hide malicious instructions from human review.`,
868
+ matched_text: `${invisibleMatches.length} invisible character(s) found`,
869
+ confidence: 'HIGH',
870
+ risk_score: '70',
871
+ action: 'WARN'
872
+ });
873
+ }
874
+
377
875
  // Load rules
378
876
  const agentRules = loadAgentAttackRules();
379
877
  const promptRules = loadPromptInjectionRules();
380
878
  const allRules = [...agentRules, ...promptRules];
381
879
 
382
- // 2.7: Extract content from code blocks and append to scan text
383
- let expandedText = prompt_text;
384
- const codeBlockRegex = /```[\s\S]*?```/g;
385
- const codeBlocks = prompt_text.match(codeBlockRegex);
386
- if (codeBlocks) {
387
- for (const block of codeBlocks) {
388
- // Strip the ``` delimiters and extract inner content
389
- const inner = block.replace(/^```\w*\n?/, '').replace(/\n?```$/, '');
390
- expandedText += '\n' + inner;
391
- }
880
+ // Extract content from all code block formats and append to scan text
881
+ let expandedText = normalizedPrompt;
882
+ for (const inner of extractCodeBlockContent(normalizedPrompt)) {
883
+ expandedText += '\n' + inner;
884
+ }
885
+
886
+ // Collapse string concatenations to defeat fragmentation (Bypass #2)
887
+ const collapsedText = collapseConcatenations(expandedText);
888
+ if (collapsedText !== expandedText) {
889
+ expandedText += '\n' + collapsedText;
392
890
  }
393
891
 
394
892
  // Scan expanded text against all rules
@@ -417,81 +915,27 @@ export async function scanAgentPrompt({ prompt_text, context, verbosity }) {
417
915
  }
418
916
  }
419
917
 
420
- // 2.8: Runtime base64 decode-and-rescan
421
- const base64Regex = /[A-Za-z0-9+/]{40,}={0,2}/g;
422
- const b64Matches = expandedText.match(base64Regex);
423
- if (b64Matches) {
424
- for (const b64str of b64Matches) {
425
- try {
426
- const decoded = Buffer.from(b64str, 'base64').toString('utf-8');
427
- // Check printability: >70% ASCII printable characters
428
- const printable = decoded.split('').filter(c => c.charCodeAt(0) >= 32 && c.charCodeAt(0) <= 126).length;
429
- if (printable / decoded.length > 0.7) {
430
- // Re-scan decoded text against prompt rules only
431
- for (const rule of allRules) {
432
- if (!rule.id.startsWith('generic.prompt')) continue;
433
- for (const pattern of rule.patterns) {
434
- try {
435
- const regex = new RegExp(pattern, 'i');
436
- const match = decoded.match(regex);
437
- if (match) {
438
- findings.push({
439
- rule_id: rule.id + '.base64-decoded',
440
- category: rule.metadata.category || 'unknown',
441
- severity: rule.severity,
442
- message: rule.message + ' (detected in base64-decoded content)',
443
- matched_text: match[0].substring(0, 100),
444
- confidence: rule.metadata.confidence || 'MEDIUM',
445
- risk_score: rule.metadata.risk_score || '50',
446
- action: rule.metadata.action || 'WARN'
447
- });
448
- break;
449
- }
450
- } catch (e) {
451
- // Skip invalid regex
452
- }
453
- }
454
- }
455
- }
456
- } catch (e) {
457
- // Skip invalid base64
458
- }
459
- }
918
+ // Multi-encoding decode cascade (replaces base64-only block)
919
+ tryDecodeAndRescan(expandedText, allRules, findings);
920
+
921
+ // Improved multi-turn escalation detection (Bypass #4 fix)
922
+ if (context?.previous_messages && Array.isArray(context.previous_messages)) {
923
+ const multiTurnFindings = detectMultiTurnEscalation(
924
+ context.previous_messages,
925
+ normalizedPrompt,
926
+ allRules
927
+ );
928
+ findings.push(...multiTurnFindings);
460
929
  }
461
930
 
462
- // Multi-turn escalation detection (Bug 9)
463
- if (context?.previous_messages && Array.isArray(context.previous_messages) && context.previous_messages.length > 0) {
464
- let prevMatchCount = 0;
465
- for (const prevMsg of context.previous_messages) {
466
- for (const rule of allRules) {
467
- for (const pattern of rule.patterns) {
468
- try {
469
- const regex = new RegExp(pattern, 'i');
470
- if (regex.test(prevMsg)) {
471
- prevMatchCount++;
472
- break;
473
- }
474
- } catch (e) {
475
- // Skip invalid regex
476
- }
477
- }
478
- if (prevMatchCount > 0) break;
931
+ // Garak deep scan (optional, requires garak Python package)
932
+ if (deep_scan) {
933
+ const garakFindings = runGarakProbes(normalizedPrompt);
934
+ // Only add non-INFO findings to affect scoring
935
+ for (const gf of garakFindings) {
936
+ if (gf.severity !== 'INFO') {
937
+ findings.push(gf);
479
938
  }
480
- if (prevMatchCount > 0) break;
481
- }
482
-
483
- // If both previous and current messages have matches, flag escalation
484
- if (prevMatchCount > 0 && findings.length > 0) {
485
- findings.push({
486
- rule_id: 'multi-turn.escalation',
487
- category: 'social-engineering',
488
- severity: 'WARNING',
489
- message: 'Multi-turn escalation detected: suspicious patterns found in both previous and current messages.',
490
- matched_text: 'escalation across conversation turns',
491
- confidence: 'MEDIUM',
492
- risk_score: '70',
493
- action: 'WARN'
494
- });
495
939
  }
496
940
  }
497
941