agent-security-scanner-mcp 3.5.1 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -2
- package/rules/prompt-injection.security.yaml +68 -0
- package/src/tools/garak-bridge.js +209 -0
- package/src/tools/scan-prompt.js +528 -84
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-security-scanner-mcp",
|
|
3
|
-
"version": "3.5.1",
|
|
3
|
+
"version": "3.6.0",
|
|
4
4
|
"mcpName": "io.github.sinewaveai/agent-security-scanner-mcp",
|
|
5
5
|
"description": "Security scanner MCP server for AI coding agents. Prompt injection firewall, package hallucination detection (4.3M+ packages), 1000+ vulnerability rules with AST & taint analysis, auto-fix. For Claude Code, Cursor, Windsurf, Cline.",
|
|
6
6
|
"main": "index.js",
|
|
@@ -12,7 +12,8 @@
|
|
|
12
12
|
"start": "node index.js",
|
|
13
13
|
"test": "vitest run",
|
|
14
14
|
"test:watch": "vitest",
|
|
15
|
-
"test:coverage": "vitest run --coverage"
|
|
15
|
+
"test:coverage": "vitest run --coverage",
|
|
16
|
+
"test:redteam": "npx promptfoo eval --config tests/promptfoo/promptfooconfig.yaml"
|
|
16
17
|
},
|
|
17
18
|
"keywords": [
|
|
18
19
|
"mcp",
|
|
@@ -773,3 +773,71 @@ rules:
|
|
|
773
773
|
category: "prompt-injection-output"
|
|
774
774
|
risk_score: "95"
|
|
775
775
|
action: BLOCK
|
|
776
|
+
|
|
777
|
+
# ============================================================================
|
|
778
|
+
# CATEGORY 4: ADVANCED BYPASS DETECTION (Garak & PromptFoo inspired)
|
|
779
|
+
# ============================================================================
|
|
780
|
+
|
|
781
|
+
# ----------------------------------------------------------------------------
|
|
782
|
+
# Whitespace Fragmentation Attacks
|
|
783
|
+
# ----------------------------------------------------------------------------
|
|
784
|
+
- id: generic.prompt.security.whitespace-fragmentation
  languages: [generic]
  severity: WARNING
  message: "Potential whitespace fragmentation attack: keywords may be letter-spaced to evade detection."
  # Each pattern requires at least one whitespace character between every letter
  # of the leading verb, then allows up to 20 arbitrary characters before an
  # optionally letter-spaced target word.
  # The (?i) prefix matches the convention used by the other rules in this
  # category so that "I G N O R E" is treated the same as "i g n o r e".
  patterns:
    # i g n o r e ... previous
    - "(?i)i\\s+g\\s+n\\s+o\\s+r\\s+e\\s+.{0,20}p\\s*r\\s*e\\s*v\\s*i\\s*o\\s*u\\s*s"
    # o v e r r i d e ... safety
    - "(?i)o\\s+v\\s+e\\s+r\\s+r\\s+i\\s+d\\s+e\\s+.{0,20}s\\s*a\\s*f\\s*e\\s*t\\s*y"
    # d i s r e g a r d ... instruct(ions)
    - "(?i)d\\s+i\\s+s\\s+r\\s+e\\s+g\\s+a\\s+r\\s+d\\s+.{0,20}i\\s*n\\s*s\\s*t\\s*r\\s*u\\s*c\\s*t"
  metadata:
    cwe: "CWE-77"
    owasp: "LLM01 - Prompt Injection"
    confidence: MEDIUM
    category: "obfuscation"
    risk_score: "65"
    action: WARN
|
|
799
|
+
|
|
800
|
+
# NOTE: Unicode tag character detection (U+E0000-U+E007F) is handled at runtime
|
|
801
|
+
# in scan-prompt.js via TAG_CHAR_REGEX with the /u flag, since JS regex without
|
|
802
|
+
# the u flag cannot represent supplementary plane codepoints in character classes.
|
|
803
|
+
|
|
804
|
+
# ----------------------------------------------------------------------------
|
|
805
|
+
# Crescendo Escalation Setup (PromptFoo-inspired)
|
|
806
|
+
# ----------------------------------------------------------------------------
|
|
807
|
+
# Flags phrasing that tries to install a standing rule for the rest of the
# conversation. Reported as a WARN-level, MEDIUM-confidence signal in the
# "prompt-injection-multi-turn" category.
- id: generic.prompt.security.crescendo-setup
  languages: [generic]
  severity: WARNING
  message: "Crescendo attack setup: language establishing persistent behavioral rules for later exploitation."
  patterns:
    # "for the rest/remainder of this/our conversation"
    - "(?i)for\\s+the\\s+(rest|remainder)\\s+of\\s+(this|our)\\s+conversation"
    # "in all future/subsequent messages/responses/replies"
    - "(?i)in\\s+all\\s+(future|subsequent)\\s+(messages?|responses?|replies?)"
    # "always respond by/with/as ... regardless / no matter" (up to 30 chars in between)
    - "(?i)always\\s+respond\\s+(by|with|as)\\s+.{0,30}(regardless|no\\s+matter)"
    # "never refuse/reject/decline any/a request/instruction"
    - "(?i)never\\s+(refuse|reject|decline)\\s+(any|a)\\s+(request|instruction)"
    # "you must/will/shall always comply"
    - "(?i)you\\s+(must|will|shall)\\s+always\\s+comply"
    # "no matter what I/anyone ask/say/request"
    - "(?i)no\\s+matter\\s+what\\s+(i|anyone)\\s+(ask|say|request)"
  metadata:
    cwe: "CWE-77"
    owasp: "LLM01 - Prompt Injection"
    confidence: MEDIUM
    category: "prompt-injection-multi-turn"
    risk_score: "65"
    action: WARN
|
|
825
|
+
|
|
826
|
+
# ----------------------------------------------------------------------------
|
|
827
|
+
# Leetspeak/Character Substitution (Garak encoding probes inspired)
|
|
828
|
+
# ----------------------------------------------------------------------------
|
|
829
|
+
# Flags injection keywords written with digit/symbol substitutions.
# Every character class also contains the plain letter, so the un-substituted
# spelling matches as well; the rule is LOW confidence and only LOGs.
- id: generic.prompt.security.leetspeak-injection
  languages: [generic]
  severity: WARNING
  message: "Potential leetspeak/character-substitution obfuscation of injection keywords."
  patterns:
    # "ignore ... previous" with substitutions such as 1/!/| for i, 9 for g, 0 for o, 3 for e, 5 for s
    - "(?i)[i1!|][g9][n][0o][r][e3]\\s+.{0,20}[p][r][e3][v][i1!|][0o][u][s5]"
    # "system: override" / "system - override" with 5 for s, 7 for t, 3 for e, 0 for o, 1 for i
    - "(?i)[s5][y][s5][t7][e3][m]\\s*[:\\-]\\s*[0o][v][e3][r][r][i1][d][e3]"
    # "disregard ... instruct" with !/1 for i, 5 for s, 3 for e, 9 for g, @ for a, 7 for t
    - "(?i)[d][!1i][s5][r][e3][g9][a@][r][d]\\s+.{0,20}[i1!][n][s5][t7][r][u][c][t7]"
  metadata:
    cwe: "CWE-77"
    owasp: "LLM01 - Prompt Injection"
    confidence: LOW
    category: "obfuscation"
    risk_score: "55"
    action: LOG
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
// src/tools/garak-bridge.js
|
|
2
|
+
// Bridge to NVIDIA Garak LLM vulnerability scanner for deep prompt injection analysis
|
|
3
|
+
// Garak is optional — if not installed, this module returns empty results gracefully
|
|
4
|
+
|
|
5
|
+
import { execFileSync } from 'child_process';
|
|
6
|
+
import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs';
|
|
7
|
+
import { join } from 'path';
|
|
8
|
+
import { tmpdir } from 'os';
|
|
9
|
+
import { randomUUID } from 'crypto';
|
|
10
|
+
|
|
11
|
+
// Check if Garak is installed
|
|
12
|
+
let garakAvailable = null; // tri-state cache: null until the first probe has run

/**
 * Report whether the `garak` Python package can be imported.
 *
 * Probes `python3` first and `python` second by running a one-line import
 * check. The outcome of the first call is memoized in `garakAvailable`, so
 * later calls do not spawn a process.
 *
 * @returns {boolean} true when either interpreter imported garak successfully
 */
function isGarakInstalled() {
  if (garakAvailable === null) {
    const probeArgs = ['-c', 'import garak; print(garak.__version__)'];
    garakAvailable = ['python3', 'python'].some((interpreter) => {
      try {
        execFileSync(interpreter, probeArgs, {
          timeout: 10000,
          stdio: ['pipe', 'pipe', 'pipe']
        });
        return true;
      } catch {
        // Missing interpreter, failed import, or timeout: try the next candidate
        return false;
      }
    });
  }
  return garakAvailable;
}
|
|
35
|
+
|
|
36
|
+
// Python script that runs Garak probes against a prompt
|
|
37
|
+
// Python source written to a temp file and executed by runGarakProbes().
// It reads the prompt from the file named in argv[1] and prints a JSON array
// of findings on stdout.
//
// Changes relative to the previous revision of this script:
//  - trigger phrases are compared case-insensitively (the mixed-case trigger
//    'you are now DAN' could never match the lower-cased decoded text);
//  - base64/hex decoders are also tried on encoded-looking substrings, not
//    only on the prompt as a whole, so payloads embedded in prose are seen;
//  - the input file is read as UTF-8 explicitly (Node writes it as UTF-8);
//  - analysis is gated on `import garak` only, instead of on probe classes
//    that were imported but never used; the unused `tempfile` import is gone.
// NOTE: this is a JS template literal. Backslashes meant for Python must be
// doubled, and the text must not contain backticks or "${".
const GARAK_RUNNER_SCRIPT = `
import sys
import json
import re
import base64
import codecs


def iter_candidates(enc_name, prompt_text):
    """Yield the whole prompt first, then encoded-looking substrings inside it."""
    yield prompt_text
    if enc_name == 'base64':
        for token in re.findall(r'[A-Za-z0-9+/]{16,}', prompt_text):
            # Re-pad: a token cut out of surrounding text rarely has valid padding
            yield token + '=' * (-len(token) % 4)
    elif enc_name == 'base16':
        for token in re.findall(r'(?:[0-9A-Fa-f]{2}){8,}', prompt_text):
            yield token


def run_garak_analysis(prompt_text):
    """Run targeted Garak-style checks and return structured findings."""
    findings = []

    try:
        # Availability gate only: the checks below use the standard library.
        import garak  # noqa: F401

        # Test common encoding schemes on the prompt
        encodings_to_test = {
            'base64': lambda t: base64.b64decode(t).decode('utf-8', errors='ignore'),
            'base16': lambda t: bytes.fromhex(t).decode('utf-8', errors='ignore'),
            'rot13': lambda t: codecs.decode(t, 'rot_13'),
        }

        # Known trigger phrases (subset from Garak's trigger lists)
        triggers = [
            'ignore previous instructions',
            'ignore all instructions',
            'disregard your instructions',
            'you are now DAN',
            'override safety',
            'system prompt',
            'delete all',
            'send to',
            'rm -rf',
            'reverse shell',
            'backdoor',
        ]

        # One finding per (encoding, trigger), even if several candidates match
        seen = set()
        for enc_name, decoder in encodings_to_test.items():
            for candidate in iter_candidates(enc_name, prompt_text):
                try:
                    decoded = decoder(candidate)
                except Exception:
                    # Candidate is not valid input for this decoder
                    continue
                decoded_lower = decoded.lower()
                for trigger in triggers:
                    key = (enc_name, trigger)
                    if key in seen or trigger.lower() not in decoded_lower:
                        continue
                    seen.add(key)
                    findings.append({
                        'rule_id': f'garak.encoding.{enc_name}.{trigger.replace(" ", "-")}',
                        'category': 'prompt-injection-encoded',
                        'severity': 'ERROR',
                        'message': f'Garak deep scan: {enc_name}-encoded payload contains trigger phrase "{trigger}"',
                        'matched_text': decoded[:100],
                        'confidence': 'HIGH',
                        'risk_score': '85',
                        'action': 'BLOCK'
                    })

        # Check for Garak-style latent injection patterns
        latent_patterns = [
            ('resume_injection', r'(?i)(ignore|disregard|override).{0,50}(instructions|safety|rules)'),
            ('document_injection', r'(?i)(SYSTEM|ADMIN|ROOT)\\s*:\\s*(override|execute|ignore)'),
            ('translation_injection', r'(?i)(translate|convert).{0,30}(ignore|override|execute)'),
        ]

        for pattern_name, pattern in latent_patterns:
            try:
                match = re.search(pattern, prompt_text, re.IGNORECASE)
                if match:
                    findings.append({
                        'rule_id': f'garak.latentinjection.{pattern_name}',
                        'category': 'prompt-injection-content',
                        'severity': 'WARNING',
                        'message': f'Garak deep scan: latent injection pattern "{pattern_name}" detected',
                        'matched_text': match.group(0)[:100],
                        'confidence': 'MEDIUM',
                        'risk_score': '70',
                        'action': 'WARN'
                    })
            except Exception:
                pass

    except ImportError:
        findings.append({
            'rule_id': 'garak.unavailable',
            'category': 'unknown',
            'severity': 'INFO',
            'message': 'Garak package not fully installed. Install with: pip install garak',
            'matched_text': 'garak import failed',
            'confidence': 'HIGH',
            'risk_score': '0',
            'action': 'LOG'
        })
    except Exception as e:
        findings.append({
            'rule_id': 'garak.error',
            'category': 'unknown',
            'severity': 'INFO',
            'message': f'Garak analysis error: {str(e)[:200]}',
            'matched_text': str(e)[:100],
            'confidence': 'LOW',
            'risk_score': '0',
            'action': 'LOG'
        })

    return findings


if __name__ == '__main__':
    input_file = sys.argv[1]
    with open(input_file, 'r', encoding='utf-8') as f:
        prompt_text = f.read()

    results = run_garak_analysis(prompt_text)
    print(json.dumps(results))
`;
|
|
156
|
+
|
|
157
|
+
/**
 * Run Garak deep analysis probes against a prompt.
 *
 * Writes the prompt and the runner script to uniquely named temp files,
 * executes the script with a Python interpreter, and parses the JSON array it
 * prints. Both temp files are removed afterwards, whatever the outcome.
 *
 * The interpreter is not assumed from the platform alone: the platform's
 * usual name is tried first and the other name second, because
 * isGarakInstalled() reports success when *either* `python3` or `python` can
 * import garak. A run that only reports `garak.unavailable` is treated as
 * "wrong interpreter" and the next candidate is tried. In the worst case this
 * means two runs of up to 30 seconds each.
 *
 * @param {string} promptText - The prompt text to analyze
 * @returns {Array} Array of finding objects compatible with scan-prompt.js findings format;
 *   a single INFO finding is returned when Garak is missing or execution fails
 */
export function runGarakProbes(promptText) {
  if (!isGarakInstalled()) {
    return [{
      rule_id: 'garak.not-installed',
      category: 'unknown',
      severity: 'INFO',
      message: 'Garak not installed. Install with: pip install garak',
      matched_text: 'garak not found',
      confidence: 'HIGH',
      risk_score: '0',
      action: 'LOG'
    }];
  }

  const tmpId = randomUUID();
  const inputFile = join(tmpdir(), `garak-input-${tmpId}.txt`);
  const scriptFile = join(tmpdir(), `garak-runner-${tmpId}.py`);
  const interpreters = process.platform === 'win32'
    ? ['python', 'python3']
    : ['python3', 'python'];

  try {
    // Owner-only permissions: the prompt may be sensitive and tmpdir is shared
    writeFileSync(inputFile, promptText, { mode: 0o600 });
    writeFileSync(scriptFile, GARAK_RUNNER_SCRIPT, { mode: 0o600 });

    let firstError = null;
    let unavailableFindings = null;

    for (const pythonCmd of interpreters) {
      try {
        const output = execFileSync(pythonCmd, [scriptFile, inputFile], {
          timeout: 30000,
          encoding: 'utf-8',
          stdio: ['pipe', 'pipe', 'pipe']
        });

        const parsed = JSON.parse(output.trim());
        if (!Array.isArray(parsed)) {
          throw new TypeError('Garak runner did not return a JSON array');
        }

        const garakMissing = parsed.some((finding) => finding?.rule_id === 'garak.unavailable');
        if (!garakMissing) {
          return parsed;
        }
        // Keep the first "unavailable" report in case no interpreter does better
        if (unavailableFindings === null) {
          unavailableFindings = parsed;
        }
      } catch (error) {
        // Remember the primary interpreter's failure; it is the most relevant
        if (firstError === null) {
          firstError = error;
        }
      }
    }

    if (unavailableFindings !== null) {
      return unavailableFindings;
    }
    throw firstError;
  } catch (error) {
    return [{
      rule_id: 'garak.execution-error',
      category: 'unknown',
      severity: 'INFO',
      message: `Garak execution failed: ${error.message?.substring(0, 200)}`,
      matched_text: 'garak error',
      confidence: 'LOW',
      risk_score: '0',
      action: 'LOG'
    }];
  } finally {
    // Best-effort cleanup; a failed unlink must not mask the scan result
    try { if (existsSync(inputFile)) unlinkSync(inputFile); } catch {}
    try { if (existsSync(scriptFile)) unlinkSync(scriptFile); } catch {}
  }
}
|
|
208
|
+
|
|
209
|
+
export { isGarakInstalled };
|
package/src/tools/scan-prompt.js
CHANGED
|
@@ -4,6 +4,7 @@ import { readFileSync, existsSync } from "fs";
|
|
|
4
4
|
import { dirname, join } from "path";
|
|
5
5
|
import { fileURLToPath } from "url";
|
|
6
6
|
import { createHash } from "crypto";
|
|
7
|
+
import { runGarakProbes } from './garak-bridge.js';
|
|
7
8
|
|
|
8
9
|
// Handle both ESM and CJS bundling
|
|
9
10
|
let __dirname;
|
|
@@ -49,6 +50,76 @@ const CONFIDENCE_MULTIPLIERS = {
|
|
|
49
50
|
"LOW": 0.4
|
|
50
51
|
};
|
|
51
52
|
|
|
53
|
+
// Category co-occurrence matrix: pairs that together signal sophisticated attacks
|
|
54
|
+
// Inspired by PromptFoo's jailbreak:composite strategy
|
|
55
|
+
// Multiplicative boost applied when two finding categories appear together.
// Keys are "<categoryA>+<categoryB>"; lookups try both orderings.
const CATEGORY_COOCCURRENCE_BOOSTS = {
  'obfuscation+exfiltration': 0.20,
  'obfuscation+malicious-injection': 0.20,
  'obfuscation+prompt-injection-content': 0.15,
  'obfuscation+prompt-injection-jailbreak': 0.15,
  'social-engineering+exfiltration': 0.15,
  'social-engineering+malicious-injection': 0.15,
  'prompt-injection-encoded+prompt-injection-content': 0.20,
  'prompt-injection-multi-turn+prompt-injection-content': 0.15,
  'prompt-injection-jailbreak+exfiltration': 0.25,
  'prompt-injection-jailbreak+prompt-injection-content': 0.15,
  'agent-manipulation+exfiltration': 0.20,
  'agent-manipulation+system-manipulation': 0.15,
};

/**
 * Sum the co-occurrence boosts of every unordered pair of categories.
 *
 * @param {Iterable<string>} categories - distinct categories seen in the findings
 * @returns {number} total boost, capped at 0.40
 */
function getCategoryCooccurrenceBoost(categories) {
  const seen = [...categories];

  // Boost for one pair, whichever way round the table spells the key
  const pairBoost = (left, right) =>
    CATEGORY_COOCCURRENCE_BOOSTS[`${left}+${right}`] ||
    CATEGORY_COOCCURRENCE_BOOSTS[`${right}+${left}`] ||
    0;

  let total = 0;
  seen.forEach((left, position) => {
    for (const right of seen.slice(position + 1)) {
      total += pairBoost(left, right);
    }
  });

  // Cap total co-occurrence boost at 40%
  return Math.min(0.40, total);
}
|
|
83
|
+
|
|
84
|
+
// Orthogonal scoring channel: measures attack breadth independently of per-rule confidence
|
|
85
|
+
// This is immune to per-rule confidence gaming
|
|
86
|
+
/**
 * Score attack breadth: how many distinct attack dimensions the findings span.
 *
 * Each finding's category (or 'unknown' when absent) is mapped to at most one
 * dimension; categories not listed contribute nothing. Per-finding confidence
 * and risk_score are not consulted.
 *
 * @param {Array<{category?: string}>} findings
 * @returns {number} 0 for 0-1 dimensions, 10 for 2, 25 for 3, 40 for 4 or more
 */
function calculateOrthogonalScore(findings) {
  const dimensionByCategory = new Map([
    ['exfiltration', 'extraction'],
    ['prompt-injection-extraction', 'extraction'],
    ['prompt-injection-output', 'extraction'],
    ['malicious-injection', 'code-execution'],
    ['system-manipulation', 'code-execution'],
    ['obfuscation', 'evasion'],
    ['prompt-injection-encoded', 'evasion'],
    ['social-engineering', 'social'],
    ['prompt-injection-jailbreak', 'social'],
    ['prompt-injection-content', 'injection'],
    ['prompt-injection-context', 'injection'],
    ['prompt-injection-delimiter', 'injection'],
    ['prompt-injection-multi-turn', 'persistence'],
    ['agent-manipulation', 'privilege'],
    ['prompt-injection-privilege', 'privilege'],
  ]);

  const triggered = new Set();
  for (const finding of findings) {
    const dimension = dimensionByCategory.get(finding.category || 'unknown');
    if (dimension !== undefined) {
      triggered.add(dimension);
    }
  }

  // Index = number of dimensions hit; anything past the table is 4+ dimensions
  const scoreByCount = [0, 0, 10, 25];
  return scoreByCount[triggered.size] ?? 40;
}
|
|
122
|
+
|
|
52
123
|
// Load agent attack rules from YAML
|
|
53
124
|
function loadAgentAttackRules() {
|
|
54
125
|
try {
|
|
@@ -194,6 +265,7 @@ function calculateRiskScore(findings, context) {
|
|
|
194
265
|
if (findings.length === 0) return 0;
|
|
195
266
|
|
|
196
267
|
let totalScore = 0;
|
|
268
|
+
const lowConfidenceCount = findings.filter(f => (f.confidence || 'MEDIUM') === 'LOW').length;
|
|
197
269
|
|
|
198
270
|
for (const finding of findings) {
|
|
199
271
|
const riskScore = parseInt(finding.risk_score) || 50;
|
|
@@ -226,8 +298,24 @@ function calculateRiskScore(findings, context) {
|
|
|
226
298
|
|
|
227
299
|
// Per-finding boost (smaller than before)
|
|
228
300
|
avgScore = avgScore * (1 + (findings.length - 1) * 0.05);
|
|
301
|
+
|
|
302
|
+
// Low-signal accumulation — multiple LOW-confidence findings compound
|
|
303
|
+
// Catches threshold gaming with many weak signals (PromptFoo composite strategy)
|
|
304
|
+
if (lowConfidenceCount >= 2) {
|
|
305
|
+
avgScore = avgScore * (1 + lowConfidenceCount * 0.08);
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// Category co-occurrence boost for suspicious pairs
|
|
309
|
+
const cooccurrenceBoost = getCategoryCooccurrenceBoost(uniqueCategories);
|
|
310
|
+
if (cooccurrenceBoost > 0) {
|
|
311
|
+
avgScore = avgScore * (1 + cooccurrenceBoost);
|
|
312
|
+
}
|
|
229
313
|
}
|
|
230
314
|
|
|
315
|
+
// Add orthogonal score as a flat bonus (independent of per-rule confidence)
|
|
316
|
+
const orthogonalBonus = calculateOrthogonalScore(findings);
|
|
317
|
+
avgScore = avgScore + orthogonalBonus;
|
|
318
|
+
|
|
231
319
|
avgScore = Math.min(100, avgScore);
|
|
232
320
|
|
|
233
321
|
// Apply sensitivity adjustment (wider spread for meaningful impact)
|
|
@@ -360,6 +448,397 @@ function hashPrompt(text) {
|
|
|
360
448
|
return createHash('sha256').update(text).digest('hex').substring(0, 16);
|
|
361
449
|
}
|
|
362
450
|
|
|
451
|
+
// ============================================================================
|
|
452
|
+
// TEXT NORMALIZATION PIPELINE (Garak Buff-inspired)
|
|
453
|
+
// Normalizes input to defeat homoglyph, invisible char, and Unicode bypasses
|
|
454
|
+
// ============================================================================
|
|
455
|
+
|
|
456
|
+
// Homoglyph map: Cyrillic, Greek, and Latin Extended lookalikes → ASCII
|
|
457
|
+
// Homoglyph map: Cyrillic and Greek lookalikes → ASCII.
// Keys are single UTF-16 code units; lookups happen one code unit at a time.
const HOMOGLYPH_MAP = {
  // Cyrillic lowercase → Latin
  '\u0430': 'a', // а → a
  '\u0435': 'e', // е → e
  '\u043E': 'o', // о → o
  '\u0440': 'p', // р → p
  '\u0441': 'c', // с → c
  '\u0443': 'y', // у → y (visual match to y)
  '\u0445': 'x', // х → x
  '\u0456': 'i', // і → i
  '\u04BB': 'h', // һ → h
  '\u0455': 's', // ѕ → s
  '\u0458': 'j', // ј → j
  '\u043D': 'n', // н → n (Cyrillic en looks like n in some fonts)
  // Cyrillic uppercase → Latin
  '\u0410': 'A', // А → A
  '\u0412': 'B', // В → B
  '\u0415': 'E', // Е → E
  '\u041A': 'K', // К → K
  '\u041C': 'M', // М → M
  '\u041D': 'H', // Н → H
  '\u041E': 'O', // О → O
  '\u0420': 'P', // Р → P
  '\u0421': 'C', // С → C
  '\u0422': 'T', // Т → T
  '\u0425': 'X', // Х → X
  '\u0406': 'I', // І → I
  // Greek lowercase → Latin
  '\u03B1': 'a', // α → a
  '\u03B5': 'e', // ε → e
  '\u03BF': 'o', // ο → o
  '\u03C1': 'p', // ρ → p
  '\u03BA': 'k', // κ → k
  '\u03BD': 'v', // ν → v
  // Greek uppercase → Latin
  '\u0391': 'A', // Α → A
  '\u0392': 'B', // Β → B
  '\u0395': 'E', // Ε → E
  '\u0397': 'H', // Η → H
  '\u0399': 'I', // Ι → I
  '\u039A': 'K', // Κ → K
  '\u039C': 'M', // Μ → M
  '\u039D': 'N', // Ν → N
  '\u039F': 'O', // Ο → O
  '\u03A1': 'P', // Ρ → P
  '\u03A4': 'T', // Τ → T
  '\u03A7': 'X', // Χ → X
  '\u03A5': 'Y', // Υ → Y
  '\u0396': 'Z', // Ζ → Z
};

// Invisible/zero-width characters to strip (regex)
// Includes: soft hyphen, combining grapheme joiner, Arabic letter mark,
// hangul fillers, Mongolian vowel separator, zero-width chars,
// directional markers, word joiners, BOM, halfwidth hangul filler
const INVISIBLE_CHAR_REGEX = /[\u00AD\u034F\u061C\u115F\u1160\u17B4\u17B5\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\u3164\uFEFF\uFFA0]/gu;

// Zalgo combining diacritical marks to strip
const ZALGO_REGEX = /[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/g;

// Unicode tag characters (U+E0000-U+E007F) - used in invisible ASCII tag attacks.
// They lie outside the BMP, so the class needs \u{...} escapes and the /u flag.
const TAG_CHAR_REGEX = /[\u{E0000}-\u{E007F}]/gu;

/**
 * Canonicalize text so that visually equivalent spellings of a keyword
 * collapse to the same ASCII form before rules are matched.
 *
 * @param {string} text - raw input
 * @returns {string} normalized text (NFC form, marks and invisibles removed)
 */
function normalizeText(text) {
  // Step 1: NFKC normalization
  // Folds compatibility forms to their plain equivalents, e.g. fullwidth
  // letters (U+FF49 → "i"), ligatures (U+FB01 → "fi"), superscripts,
  // subscripts and circle-enclosed characters.
  let normalized = text.normalize('NFKC');

  // Step 2: Strip invisible Unicode characters
  normalized = normalized.replace(INVISIBLE_CHAR_REGEX, '');

  // Step 3: Strip Unicode tag characters
  normalized = normalized.replace(TAG_CHAR_REGEX, '');

  // Step 4: Strip Zalgo combining diacritical marks
  // NFKC in step 1 *composes* a base letter and a mark into a single
  // precomposed code point where one exists (n + U+0301 → U+0144), which the
  // mark regex cannot see. Decompose first so every mark is exposed, strip,
  // then recompose what is left (e.g. Hangul syllables) with NFC.
  normalized = normalized.normalize('NFD').replace(ZALGO_REGEX, '').normalize('NFC');

  // Step 5: Homoglyph canonicalization
  // Replace each character through the map; unmapped chars pass through
  normalized = normalized.split('').map(ch => HOMOGLYPH_MAP[ch] || ch).join('');

  // Step 6: Normalize Unicode whitespace to ASCII space
  // Includes: NBSP, en/em space, thin space, hair space, ideographic space, etc.
  normalized = normalized.replace(/[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]/g, ' ');

  return normalized;
}
|
|
547
|
+
|
|
548
|
+
// Extract content from all code block delimiter formats
|
|
549
|
+
// Inspired by Garak latentinjection probes: attacks hide in document structures
|
|
550
|
+
/**
 * Collect the inner text of every code-like container found in `text`.
 *
 * Containers, in the order their contents are appended to the result:
 *   1. triple-backtick fences   2. triple-tilde fences
 *   3. <code>…</code>           4. <pre>…</pre>
 *   5. <!-- … -->               6. <![CDATA[ … ]]>
 *
 * For fences, a language tag is removed only when it sits on its own opening
 * line (word characters immediately followed by a newline). A single-line
 * fence such as ```ignore previous instructions``` therefore yields its full
 * content instead of losing the first word to the tag-stripping rule.
 *
 * @param {string} text - text to search
 * @returns {string[]} extracted inner contents (may be empty)
 */
function extractCodeBlockContent(text) {
  const extracted = [];

  // Drop the opening/closing fence, an optional "lang\n" tag line, and one
  // trailing newline before the closing fence.
  const unwrapFence = (block, fence) =>
    block
      .slice(fence.length, -fence.length)
      .replace(/^\w*\r?\n/, '')
      .replace(/\r?\n$/, '');

  // 1-2. Fenced blocks: the lazy body stops at the first closing fence
  const fencePatterns = [
    ['```', /```[\s\S]*?```/g],
    ['~~~', /~~~[\s\S]*?~~~/g],
  ];
  for (const [fence, pattern] of fencePatterns) {
    for (const block of text.match(pattern) || []) {
      extracted.push(unwrapFence(block, fence));
    }
  }

  // 3-6. Tag-delimited containers: capture group 1 is the inner content
  const capturePatterns = [
    /<code[^>]*>([\s\S]*?)<\/code>/gi,
    /<pre[^>]*>([\s\S]*?)<\/pre>/gi,
    /<!--([\s\S]*?)-->/g,
    /<!\[CDATA\[([\s\S]*?)\]\]>/g,
  ];
  for (const pattern of capturePatterns) {
    for (const match of text.matchAll(pattern)) {
      extracted.push(match[1]);
    }
  }

  return extracted;
}
|
|
592
|
+
|
|
593
|
+
// Collapse string concatenation to defeat fragmentation attacks
|
|
594
|
+
// Inspired by PromptFoo's "token smuggling" and "payload splitting" attack classes
|
|
595
|
+
/**
 * Undo simple payload-splitting tricks so fragmented keywords become
 * contiguous again before rules are matched.
 *
 * - Removes string-concatenation joints: closing quote, `+`, opening quote
 *   (double, single or backtick), with any whitespace around the `+`.
 * - Removes C-style block comments, including ones that span several lines
 *   (the previous `.`-based pattern stopped at the first newline, so
 *   "ign/*\n*\/ore" was left fragmented).
 *
 * @param {string} text - text to collapse
 * @returns {string} text with concatenation joints and block comments removed
 */
function collapseConcatenations(text) {
  let collapsed = text;

  // Join JS/Python string concatenation: "foo" + "bar" → "foobar"
  collapsed = collapsed.replace(/["'`]\s*\+\s*["'`]/g, '');

  // Second and third passes catch joints with a newline next to the `+`
  // that only become adjacent after the first pass removed something.
  collapsed = collapsed.replace(/["'`]\s*\n\s*\+\s*["'`]/g, '');
  collapsed = collapsed.replace(/["'`]\s*\+\s*\n\s*["'`]/g, '');

  // Strip block comments used as fragment separators: ign/**/ore → ignore.
  // [\s\S] instead of `.` so the comment body may contain newlines.
  collapsed = collapsed.replace(/\/\*[\s\S]*?\*\//g, '');

  return collapsed;
}
|
|
612
|
+
|
|
613
|
+
// Rescan decoded content against all rules
|
|
614
|
+
// Used by the decode cascade for each encoding type
|
|
615
|
+
/**
 * Match every rule against decoded content and append a finding per rule hit.
 *
 * The decoded text is passed through normalizeText() first. For each rule the
 * patterns are tried in order and the first one that matches produces one
 * finding (rule id suffixed with "<encodingLabel>-decoded"); remaining
 * patterns of that rule are skipped.
 *
 * Robustness notes:
 *  - Only regex *compilation* errors are swallowed. A rule without `metadata`
 *    or `patterns` no longer throws (previously a missing `metadata` was
 *    silently treated as an invalid regex and the hit was dropped).
 *  - A leading "(?i)" is removed before compiling: it is not valid JavaScript
 *    regex syntax, and the pattern is compiled with the `i` flag anyway.
 *    NOTE(review): the rule loader may already strip it, in which case this
 *    is a no-op — confirm against loadAgentAttackRules.
 *
 * @param {string} decodedText - text obtained by decoding part of the prompt
 * @param {Array<object>} allRules - rules with id, message, severity, patterns, metadata
 * @param {Array<object>} findings - output array; findings are pushed onto it
 * @param {string} encodingLabel - label of the decoding step, e.g. 'base64'
 * @returns {void}
 */
function rescanDecoded(decodedText, allRules, findings, encodingLabel) {
  const normalized = normalizeText(decodedText);

  for (const rule of allRules) {
    const metadata = rule.metadata ?? {};

    for (const pattern of rule.patterns ?? []) {
      let regex;
      try {
        const source = typeof pattern === 'string'
          ? pattern.replace(/^\(\?i\)/, '')
          : pattern;
        regex = new RegExp(source, 'i');
      } catch (e) {
        // Skip invalid regex
        continue;
      }

      const match = normalized.match(regex);
      if (!match) {
        continue;
      }

      findings.push({
        rule_id: rule.id + '.' + encodingLabel + '-decoded',
        category: metadata.category || 'obfuscation',
        severity: rule.severity,
        message: rule.message + ` (detected in ${encodingLabel}-decoded content)`,
        matched_text: match[0].substring(0, 100),
        confidence: metadata.confidence || 'MEDIUM',
        risk_score: metadata.risk_score || '50',
        action: metadata.action || 'WARN'
      });
      break; // One match per rule
    }
  }
}
|
|
641
|
+
|
|
642
|
+
// Helper: check if decoded string is mostly printable ASCII
|
|
643
|
+
/**
 * Decide whether a string consists mostly of printable ASCII.
 *
 * @param {string} str - candidate text
 * @param {number} threshold - fraction (0..1) that the printable share must exceed
 * @returns {boolean} true when the share of code units in 0x20..0x7E is
 *   strictly greater than `threshold`; false for empty or missing input
 */
function isPrintable(str, threshold) {
  if (!str || str.length === 0) {
    return false;
  }

  let printableUnits = 0;
  for (const unit of str.split('')) {
    const code = unit.charCodeAt(0);
    if (code >= 32 && code <= 126) {
      printableUnits += 1;
    }
  }

  return printableUnits / str.length > threshold;
}
|
|
651
|
+
|
|
652
|
+
// Multi-encoding decode cascade
|
|
653
|
+
// Inspired by Garak's 12+ encoding probes (InjectBase64, InjectHex, InjectROT13, etc.)
|
|
654
|
+
// and PromptFoo's static encoding strategies
|
|
655
|
+
/**
 * Decode encoded-looking parts of the text and rescan each decoded result.
 *
 * Stages, each handing its output to rescanDecoded():
 *   1.  Base64 runs of 20+ characters (decoded text must be >55% printable)
 *   1b. Base64 runs found inside a stage-1 result (one extra nesting level)
 *   2.  Hex byte runs of 8+ pairs, optionally whitespace-separated (>70% printable)
 *   3.  URL percent-encoding, when the text holds at least three %XX escapes
 *   4.  ROT13 of the whole text, only when the text mentions a cipher keyword
 *
 * Stage 3 decodes each run of %XX escapes on its own. Calling
 * decodeURIComponent on the whole text throws as soon as the text contains
 * any stray "%" or malformed escape (e.g. "100% sure %69%67..."), which used
 * to skip URL decoding entirely; now only the offending run is left as-is.
 *
 * @param {string} expandedText - text to inspect
 * @param {Array<object>} allRules - rules forwarded to rescanDecoded
 * @param {Array<object>} findings - output array, mutated via rescanDecoded
 * @returns {void}
 */
function tryDecodeAndRescan(expandedText, allRules, findings) {
  // --- 1. Base64 (improved: lower length threshold 40→20, lower printability 0.7→0.55) ---
  const base64Regex = /[A-Za-z0-9+/]{20,}={0,2}/g;
  for (const b64str of (expandedText.match(base64Regex) || [])) {
    try {
      const decoded = Buffer.from(b64str, 'base64').toString('utf-8');
      if (decoded.length > 0 && isPrintable(decoded, 0.55)) {
        rescanDecoded(decoded, allRules, findings, 'base64');

        // --- 1b. Nested base64: decode again if inner content is also base64 ---
        const nestedB64 = decoded.match(/[A-Za-z0-9+/]{20,}={0,2}/g) || [];
        for (const nested of nestedB64) {
          try {
            const twice = Buffer.from(nested, 'base64').toString('utf-8');
            if (twice.length > 4 && isPrintable(twice, 0.55)) {
              rescanDecoded(twice, allRules, findings, 'base64-nested');
            }
          } catch (e) { /* skip */ }
        }
      }
    } catch (e) { /* skip invalid base64 */ }
  }

  // --- 2. Hex encoding: sequences of hex pairs (optionally space-separated) ---
  // Matches: "69676e6f7265" or "69 67 6e 6f 72 65"
  const hexRegex = /(?:[0-9a-fA-F]{2}[\s]?){8,}/g;
  for (const hexStr of (expandedText.match(hexRegex) || [])) {
    try {
      const clean = hexStr.replace(/\s/g, '');
      if (clean.length % 2 !== 0) continue;
      if (clean.length < 16) continue; // At least 8 bytes
      const decoded = Buffer.from(clean, 'hex').toString('utf-8');
      if (decoded.length > 4 && isPrintable(decoded, 0.7)) {
        rescanDecoded(decoded, allRules, findings, 'hex');
      }
    } catch (e) { /* skip */ }
  }

  // --- 3. URL encoding: %XX sequences (at least 3 encoded chars anywhere in text) ---
  const urlEncodedCount = (expandedText.match(/%[0-9a-fA-F]{2}/g) || []).length;
  if (urlEncodedCount >= 3) {
    try {
      // Decode run by run: multi-byte UTF-8 sequences are contiguous escapes,
      // so a well-formed text decodes exactly as it would in one call.
      const decoded = expandedText.replace(/(?:%[0-9a-fA-F]{2})+/g, (run) => {
        try {
          return decodeURIComponent(run);
        } catch (e) {
          return run; // Not valid UTF-8 when decoded; keep this run verbatim
        }
      });
      if (decoded !== expandedText) {
        rescanDecoded(decoded, allRules, findings, 'url-encoded');
      }
    } catch (e) { /* skip: rescan of URL-decoded content failed */ }
  }

  // --- 4. ROT13: only when indicators present (user-approved decision) ---
  // This avoids false positives from ROT13-decoding normal text
  const rot13Indicators = /\b(rot13|rot-13|caesar|cipher|decode\s+this|decipher)\b/i;
  if (rot13Indicators.test(expandedText)) {
    const rot13Decoded = expandedText.replace(/[a-zA-Z]/g, ch => {
      // 'A'..'Z' sort before 'a'..'z', so this picks the right alphabet base
      const base = ch <= 'Z' ? 65 : 97;
      return String.fromCharCode(((ch.charCodeAt(0) - base + 13) % 26) + base);
    });
    if (rot13Decoded !== expandedText) {
      rescanDecoded(rot13Decoded, allRules, findings, 'rot13');
    }
  }
}
|
|
717
|
+
|
|
718
|
+
// Frame-setting patterns: conversational priming for later exploitation.
// Inspired by PromptFoo's Crescendo strategy (Microsoft research).
//
// Each entry is a case-insensitive regex without the `g` flag, so `.test()`
// keeps no `lastIndex` state between calls and the same instances can be
// reused for every message. The array is frozen so that no other code in the
// module can add or remove detection patterns at runtime.
const FRAME_SETTING_PATTERNS = Object.freeze([
  // "remember this for later" / "remember that for later"
  /remember\s+(this|that)\s+for\s+later/i,
  // References to a follow-up message that will carry the payload
  /in\s+my\s+next\s+message/i,
  // Conditional trigger set-up: "when I say/ask/tell you ..."
  /when\s+i\s+(say|ask|tell)\s+you/i,
  // Persistent behaviour change: "from now on, you will/must/should/are ..."
  /from\s+now\s+on\s*,?\s+(you|your)\s+(will|must|should|are)/i,
  // "starting now, you ..."
  /starting\s+now\s*,?\s+you/i,
  // "keep this in mind"
  /keep\s+this\s+in\s+mind/i,
  // "for the rest/remainder of this/our conversation"
  /for\s+the\s+(rest|remainder)\s+of\s+(this|our)\s+conversation/i,
  // Role assumption: "act as if/though you are/have ..."
  /act\s+as\s+(if|though)\s+you\s+(are|have)/i,
  // Game / role-play framing; the apostrophe in "let's" is optional
  /let'?s\s+play\s+a\s+(game|role)/i,
  // "pretend that you are/have ..."
  /pretend\s+that\s+you\s+(are|have)/i,
  // "in all future/subsequent messages/responses"
  /in\s+all\s+(future|subsequent)\s+(messages?|responses?)/i,
  // Standing output instruction: "always respond by/with/as ..."
  /always\s+respond\s+(by|with|as)/i,
]);
|
|
734
|
+
|
|
735
|
+
/**
 * Detect conversation-level (multi-turn) prompt-injection signals.
 *
 * Four independent checks are run over the conversation history:
 *   1. Count the distinct rules that match at least one previous message.
 *   2. Cross-turn scan: join all messages and flag rule patterns that match
 *      the joined text but no single message on its own, i.e. an attack that
 *      was split across message boundaries
 *      (prev = "ignore all" + current = "previous instructions").
 *   3. Frame-setting: report previous messages containing priming language
 *      from FRAME_SETTING_PATTERNS.
 *   4. Escalation: report when any previous message matched a rule. The
 *      current turn is deliberately NOT required to have findings, because an
 *      attacker's final trigger message may be benign ("yes, do it").
 *
 * @param {string[]} previousMessages - Earlier messages of the conversation.
 * @param {string} currentText - Text of the current message.
 * @param {Array<{id: string, patterns: string[], metadata?: {category?: string}}>} allRules
 *   Rules whose `patterns` are regex source strings (compiled case-insensitively).
 * @returns {Array<object>} Findings in this order: cross-turn findings (at most
 *   one per rule), then the frame-setting finding, then the escalation finding.
 *   Empty when `previousMessages` is missing, not an array, or empty.
 */
function detectMultiTurnEscalation(previousMessages, currentText, allRules) {
  const escalationFindings = [];

  if (!Array.isArray(previousMessages) || previousMessages.length === 0) {
    return escalationFindings;
  }

  // Normalize every message exactly once; both the per-message scan and the
  // cross-turn scan reuse these strings.
  const normalizedPrevs = previousMessages.map((msg) => normalizeText(msg));
  // String(x ?? '') mirrors how Array.prototype.join renders the current text
  // in the joined string below, so both scans see the same input.
  const normalizedCurrent = normalizeText(String(currentText ?? ''));

  // Compile each rule pattern once. Patterns that are not valid regex source
  // are skipped on purpose: one broken rule must not disable the scan.
  const compiledRules = allRules.map((rule) => ({
    rule,
    regexes: rule.patterns.flatMap((pattern) => {
      try {
        return [new RegExp(pattern, 'i')];
      } catch {
        return [];
      }
    }),
  }));

  // Step 1a: count previous messages that contain frame-setting language
  // (each message counts at most once, however many patterns it matches).
  const frameSettingCount = normalizedPrevs.filter((text) =>
    FRAME_SETTING_PATTERNS.some((fp) => fp.test(text))
  ).length;

  // Step 1b: collect the rules that match ANY previous message. A rule id is
  // counted once for the whole conversation, not once per message.
  const prevMatchedRuleIds = new Set();
  for (const { rule, regexes } of compiledRules) {
    if (prevMatchedRuleIds.has(rule.id)) continue;
    const matchesSomePrev = normalizedPrevs.some((text) =>
      regexes.some((regex) => regex.test(text))
    );
    if (matchesSomePrev) {
      prevMatchedRuleIds.add(rule.id);
    }
  }
  const totalPrevMatches = prevMatchedRuleIds.size;

  // Step 2: cross-turn concatenation scan.
  const crossTurnText = normalizeText([...previousMessages, currentText].join(' '));
  const singleMessages = [normalizedCurrent, ...normalizedPrevs];

  for (const { rule, regexes } of compiledRules) {
    for (const regex of regexes) {
      const match = crossTurnText.match(regex);
      if (!match) continue;

      // Only flag patterns that need the join to match; anything that
      // matches a single message is not a boundary-spanning attack.
      if (singleMessages.some((text) => regex.test(text))) continue;

      escalationFindings.push({
        rule_id: rule.id + '.cross-turn',
        // `?.` so a rule without metadata still yields a finding.
        category: rule.metadata?.category || 'prompt-injection-multi-turn',
        severity: 'WARNING',
        message: `Cross-turn prompt injection: attack pattern spans message boundaries`,
        matched_text: match[0].substring(0, 100),
        confidence: 'MEDIUM',
        risk_score: '75',
        action: 'WARN'
      });
      break; // One cross-turn finding per rule is enough
    }
  }

  // Step 3: frame-setting detection — flagged even without current findings.
  if (frameSettingCount > 0) {
    escalationFindings.push({
      rule_id: 'multi-turn.frame-setting',
      category: 'prompt-injection-multi-turn',
      severity: 'WARNING',
      message: `Frame-setting language detected in ${frameSettingCount} previous message(s). Possible Crescendo-style gradual escalation attack.`,
      matched_text: 'frame-setting phrases in conversation history',
      confidence: 'LOW',
      risk_score: '55',
      action: 'LOG'
    });
  }

  // Step 4: escalation — does not require the current turn to have findings.
  // Risk grows by 5 per matched rule from a base of 50 and is capped at 85.
  if (totalPrevMatches > 0) {
    escalationFindings.push({
      rule_id: 'multi-turn.escalation',
      category: 'social-engineering',
      severity: 'WARNING',
      message: `Multi-turn escalation: suspicious patterns in ${totalPrevMatches} previous rule(s). Current message may be a benign trigger.`,
      matched_text: 'escalation across conversation turns',
      confidence: totalPrevMatches >= 3 ? 'HIGH' : 'MEDIUM',
      risk_score: String(Math.min(85, 50 + totalPrevMatches * 5)),
      action: totalPrevMatches >= 3 ? 'WARN' : 'LOG'
    });
  }

  return escalationFindings;
}
|
|
841
|
+
|
|
363
842
|
// Export schema for tool registration
|
|
364
843
|
export const scanAgentPromptSchema = {
|
|
365
844
|
prompt_text: z.string().describe("The prompt or instruction text to analyze"),
|
|
@@ -367,28 +846,47 @@ export const scanAgentPromptSchema = {
|
|
|
367
846
|
previous_messages: z.array(z.string()).optional().describe("Previous conversation messages for multi-turn detection"),
|
|
368
847
|
sensitivity_level: z.enum(["high", "medium", "low"]).optional().describe("Sensitivity level - high means more strict, low means more permissive")
|
|
369
848
|
}).optional().describe("Optional context for better analysis"),
|
|
370
|
-
verbosity: z.enum(['minimal', 'compact', 'full']).optional().describe("Response detail level: 'minimal' (action only), 'compact' (default), 'full' (all details)")
|
|
849
|
+
verbosity: z.enum(['minimal', 'compact', 'full']).optional().describe("Response detail level: 'minimal' (action only), 'compact' (default), 'full' (all details)"),
|
|
850
|
+
deep_scan: z.boolean().optional().describe("Run Garak deep analysis probes for advanced encoding/injection detection (requires garak Python package)")
|
|
371
851
|
};
|
|
372
852
|
|
|
373
853
|
// Export handler function
|
|
374
|
-
export async function scanAgentPrompt({ prompt_text, context, verbosity }) {
|
|
854
|
+
export async function scanAgentPrompt({ prompt_text, context, verbosity, deep_scan }) {
|
|
375
855
|
const findings = [];
|
|
376
856
|
|
|
857
|
+
// Normalize prompt text (Garak Buff-inspired preprocessing)
|
|
858
|
+
const normalizedPrompt = normalizeText(prompt_text);
|
|
859
|
+
|
|
860
|
+
// Detect invisible Unicode characters in original text (obfuscation indicator)
|
|
861
|
+
const invisibleMatches = prompt_text.match(/[\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\u{E0000}-\u{E007F}]/gu);
|
|
862
|
+
if (invisibleMatches && invisibleMatches.length > 0) {
|
|
863
|
+
findings.push({
|
|
864
|
+
rule_id: 'runtime.invisible-unicode-detected',
|
|
865
|
+
category: 'obfuscation',
|
|
866
|
+
severity: 'WARNING',
|
|
867
|
+
message: `Invisible Unicode characters detected (${invisibleMatches.length} chars). These may hide malicious instructions from human review.`,
|
|
868
|
+
matched_text: `${invisibleMatches.length} invisible character(s) found`,
|
|
869
|
+
confidence: 'HIGH',
|
|
870
|
+
risk_score: '70',
|
|
871
|
+
action: 'WARN'
|
|
872
|
+
});
|
|
873
|
+
}
|
|
874
|
+
|
|
377
875
|
// Load rules
|
|
378
876
|
const agentRules = loadAgentAttackRules();
|
|
379
877
|
const promptRules = loadPromptInjectionRules();
|
|
380
878
|
const allRules = [...agentRules, ...promptRules];
|
|
381
879
|
|
|
382
|
-
//
|
|
383
|
-
let expandedText =
|
|
384
|
-
const
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
880
|
+
// Extract content from all code block formats and append to scan text
|
|
881
|
+
let expandedText = normalizedPrompt;
|
|
882
|
+
for (const inner of extractCodeBlockContent(normalizedPrompt)) {
|
|
883
|
+
expandedText += '\n' + inner;
|
|
884
|
+
}
|
|
885
|
+
|
|
886
|
+
// Collapse string concatenations to defeat fragmentation (Bypass #2)
|
|
887
|
+
const collapsedText = collapseConcatenations(expandedText);
|
|
888
|
+
if (collapsedText !== expandedText) {
|
|
889
|
+
expandedText += '\n' + collapsedText;
|
|
392
890
|
}
|
|
393
891
|
|
|
394
892
|
// Scan expanded text against all rules
|
|
@@ -417,81 +915,27 @@ export async function scanAgentPrompt({ prompt_text, context, verbosity }) {
|
|
|
417
915
|
}
|
|
418
916
|
}
|
|
419
917
|
|
|
420
|
-
//
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
for (const rule of allRules) {
|
|
432
|
-
if (!rule.id.startsWith('generic.prompt')) continue;
|
|
433
|
-
for (const pattern of rule.patterns) {
|
|
434
|
-
try {
|
|
435
|
-
const regex = new RegExp(pattern, 'i');
|
|
436
|
-
const match = decoded.match(regex);
|
|
437
|
-
if (match) {
|
|
438
|
-
findings.push({
|
|
439
|
-
rule_id: rule.id + '.base64-decoded',
|
|
440
|
-
category: rule.metadata.category || 'unknown',
|
|
441
|
-
severity: rule.severity,
|
|
442
|
-
message: rule.message + ' (detected in base64-decoded content)',
|
|
443
|
-
matched_text: match[0].substring(0, 100),
|
|
444
|
-
confidence: rule.metadata.confidence || 'MEDIUM',
|
|
445
|
-
risk_score: rule.metadata.risk_score || '50',
|
|
446
|
-
action: rule.metadata.action || 'WARN'
|
|
447
|
-
});
|
|
448
|
-
break;
|
|
449
|
-
}
|
|
450
|
-
} catch (e) {
|
|
451
|
-
// Skip invalid regex
|
|
452
|
-
}
|
|
453
|
-
}
|
|
454
|
-
}
|
|
455
|
-
}
|
|
456
|
-
} catch (e) {
|
|
457
|
-
// Skip invalid base64
|
|
458
|
-
}
|
|
459
|
-
}
|
|
918
|
+
// Multi-encoding decode cascade (replaces base64-only block)
|
|
919
|
+
tryDecodeAndRescan(expandedText, allRules, findings);
|
|
920
|
+
|
|
921
|
+
// Improved multi-turn escalation detection (Bypass #4 fix)
|
|
922
|
+
if (context?.previous_messages && Array.isArray(context.previous_messages)) {
|
|
923
|
+
const multiTurnFindings = detectMultiTurnEscalation(
|
|
924
|
+
context.previous_messages,
|
|
925
|
+
normalizedPrompt,
|
|
926
|
+
allRules
|
|
927
|
+
);
|
|
928
|
+
findings.push(...multiTurnFindings);
|
|
460
929
|
}
|
|
461
930
|
|
|
462
|
-
//
|
|
463
|
-
if (
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
const regex = new RegExp(pattern, 'i');
|
|
470
|
-
if (regex.test(prevMsg)) {
|
|
471
|
-
prevMatchCount++;
|
|
472
|
-
break;
|
|
473
|
-
}
|
|
474
|
-
} catch (e) {
|
|
475
|
-
// Skip invalid regex
|
|
476
|
-
}
|
|
477
|
-
}
|
|
478
|
-
if (prevMatchCount > 0) break;
|
|
931
|
+
// Garak deep scan (optional, requires garak Python package)
|
|
932
|
+
if (deep_scan) {
|
|
933
|
+
const garakFindings = runGarakProbes(normalizedPrompt);
|
|
934
|
+
// Only add non-INFO findings to affect scoring
|
|
935
|
+
for (const gf of garakFindings) {
|
|
936
|
+
if (gf.severity !== 'INFO') {
|
|
937
|
+
findings.push(gf);
|
|
479
938
|
}
|
|
480
|
-
if (prevMatchCount > 0) break;
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
// If both previous and current messages have matches, flag escalation
|
|
484
|
-
if (prevMatchCount > 0 && findings.length > 0) {
|
|
485
|
-
findings.push({
|
|
486
|
-
rule_id: 'multi-turn.escalation',
|
|
487
|
-
category: 'social-engineering',
|
|
488
|
-
severity: 'WARNING',
|
|
489
|
-
message: 'Multi-turn escalation detected: suspicious patterns found in both previous and current messages.',
|
|
490
|
-
matched_text: 'escalation across conversation turns',
|
|
491
|
-
confidence: 'MEDIUM',
|
|
492
|
-
risk_score: '70',
|
|
493
|
-
action: 'WARN'
|
|
494
|
-
});
|
|
495
939
|
}
|
|
496
940
|
}
|
|
497
941
|
|