@dotsetlabs/tollgate 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +885 -0
- package/dist/analyzers/filesystem.d.ts +26 -0
- package/dist/analyzers/filesystem.d.ts.map +1 -0
- package/dist/analyzers/filesystem.js +284 -0
- package/dist/analyzers/filesystem.js.map +1 -0
- package/dist/analyzers/http.d.ts +90 -0
- package/dist/analyzers/http.d.ts.map +1 -0
- package/dist/analyzers/http.js +433 -0
- package/dist/analyzers/http.js.map +1 -0
- package/dist/analyzers/index.d.ts +101 -0
- package/dist/analyzers/index.d.ts.map +1 -0
- package/dist/analyzers/index.js +342 -0
- package/dist/analyzers/index.js.map +1 -0
- package/dist/analyzers/loader.d.ts +114 -0
- package/dist/analyzers/loader.d.ts.map +1 -0
- package/dist/analyzers/loader.js +184 -0
- package/dist/analyzers/loader.js.map +1 -0
- package/dist/analyzers/prompt-injection.d.ts +95 -0
- package/dist/analyzers/prompt-injection.d.ts.map +1 -0
- package/dist/analyzers/prompt-injection.js +725 -0
- package/dist/analyzers/prompt-injection.js.map +1 -0
- package/dist/analyzers/sdk.d.ts +230 -0
- package/dist/analyzers/sdk.d.ts.map +1 -0
- package/dist/analyzers/sdk.js +283 -0
- package/dist/analyzers/sdk.js.map +1 -0
- package/dist/analyzers/shell.d.ts +20 -0
- package/dist/analyzers/shell.d.ts.map +1 -0
- package/dist/analyzers/shell.js +297 -0
- package/dist/analyzers/shell.js.map +1 -0
- package/dist/analyzers/sql.d.ts +37 -0
- package/dist/analyzers/sql.d.ts.map +1 -0
- package/dist/analyzers/sql.js +455 -0
- package/dist/analyzers/sql.js.map +1 -0
- package/dist/analyzers/types.d.ts +117 -0
- package/dist/analyzers/types.d.ts.map +1 -0
- package/dist/analyzers/types.js +46 -0
- package/dist/analyzers/types.js.map +1 -0
- package/dist/approval/interactive.d.ts +72 -0
- package/dist/approval/interactive.d.ts.map +1 -0
- package/dist/approval/interactive.js +550 -0
- package/dist/approval/interactive.js.map +1 -0
- package/dist/approval/terminal.d.ts +59 -0
- package/dist/approval/terminal.d.ts.map +1 -0
- package/dist/approval/terminal.js +238 -0
- package/dist/approval/terminal.js.map +1 -0
- package/dist/approval/types.d.ts +66 -0
- package/dist/approval/types.d.ts.map +1 -0
- package/dist/approval/types.js +2 -0
- package/dist/approval/types.js.map +1 -0
- package/dist/audit/exporter.d.ts +138 -0
- package/dist/audit/exporter.d.ts.map +1 -0
- package/dist/audit/exporter.js +366 -0
- package/dist/audit/exporter.js.map +1 -0
- package/dist/audit/logger.d.ts +156 -0
- package/dist/audit/logger.d.ts.map +1 -0
- package/dist/audit/logger.js +406 -0
- package/dist/audit/logger.js.map +1 -0
- package/dist/audit/redaction.d.ts +110 -0
- package/dist/audit/redaction.d.ts.map +1 -0
- package/dist/audit/redaction.js +307 -0
- package/dist/audit/redaction.js.map +1 -0
- package/dist/audit/schema.d.ts +76 -0
- package/dist/audit/schema.d.ts.map +1 -0
- package/dist/audit/schema.js +122 -0
- package/dist/audit/schema.js.map +1 -0
- package/dist/cli/commands/doctor.d.ts +34 -0
- package/dist/cli/commands/doctor.d.ts.map +1 -0
- package/dist/cli/commands/doctor.js +431 -0
- package/dist/cli/commands/doctor.js.map +1 -0
- package/dist/cli/commands/export.d.ts +18 -0
- package/dist/cli/commands/export.d.ts.map +1 -0
- package/dist/cli/commands/export.js +63 -0
- package/dist/cli/commands/export.js.map +1 -0
- package/dist/cli/commands/init.d.ts +12 -0
- package/dist/cli/commands/init.d.ts.map +1 -0
- package/dist/cli/commands/init.js +102 -0
- package/dist/cli/commands/init.js.map +1 -0
- package/dist/cli/commands/logs.d.ts +11 -0
- package/dist/cli/commands/logs.d.ts.map +1 -0
- package/dist/cli/commands/logs.js +60 -0
- package/dist/cli/commands/logs.js.map +1 -0
- package/dist/cli/commands/scan.d.ts +29 -0
- package/dist/cli/commands/scan.d.ts.map +1 -0
- package/dist/cli/commands/scan.js +251 -0
- package/dist/cli/commands/scan.js.map +1 -0
- package/dist/cli/commands/serve.d.ts +26 -0
- package/dist/cli/commands/serve.d.ts.map +1 -0
- package/dist/cli/commands/serve.js +424 -0
- package/dist/cli/commands/serve.js.map +1 -0
- package/dist/cli/commands/start.d.ts +20 -0
- package/dist/cli/commands/start.d.ts.map +1 -0
- package/dist/cli/commands/start.js +82 -0
- package/dist/cli/commands/start.js.map +1 -0
- package/dist/cli/commands/stats.d.ts +10 -0
- package/dist/cli/commands/stats.d.ts.map +1 -0
- package/dist/cli/commands/stats.js +42 -0
- package/dist/cli/commands/stats.js.map +1 -0
- package/dist/cli/commands/templates.d.ts +26 -0
- package/dist/cli/commands/templates.d.ts.map +1 -0
- package/dist/cli/commands/templates.js +221 -0
- package/dist/cli/commands/templates.js.map +1 -0
- package/dist/cli/commands/validate.d.ts +12 -0
- package/dist/cli/commands/validate.d.ts.map +1 -0
- package/dist/cli/commands/validate.js +107 -0
- package/dist/cli/commands/validate.js.map +1 -0
- package/dist/cli/commands/wrap.d.ts +19 -0
- package/dist/cli/commands/wrap.d.ts.map +1 -0
- package/dist/cli/commands/wrap.js +59 -0
- package/dist/cli/commands/wrap.js.map +1 -0
- package/dist/cli/index.d.ts +17 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +202 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/ui.d.ts +139 -0
- package/dist/cli/ui.d.ts.map +1 -0
- package/dist/cli/ui.js +271 -0
- package/dist/cli/ui.js.map +1 -0
- package/dist/constants.d.ts +33 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/constants.js +54 -0
- package/dist/constants.js.map +1 -0
- package/dist/errors.d.ts +28 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +37 -0
- package/dist/errors.js.map +1 -0
- package/dist/index.d.ts +49 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +82 -0
- package/dist/index.js.map +1 -0
- package/dist/orchestrator/index.d.ts +11 -0
- package/dist/orchestrator/index.d.ts.map +1 -0
- package/dist/orchestrator/index.js +10 -0
- package/dist/orchestrator/index.js.map +1 -0
- package/dist/orchestrator/manager.d.ts +127 -0
- package/dist/orchestrator/manager.d.ts.map +1 -0
- package/dist/orchestrator/manager.js +498 -0
- package/dist/orchestrator/manager.js.map +1 -0
- package/dist/orchestrator/types.d.ts +141 -0
- package/dist/orchestrator/types.d.ts.map +1 -0
- package/dist/orchestrator/types.js +9 -0
- package/dist/orchestrator/types.js.map +1 -0
- package/dist/policy/engine.d.ts +55 -0
- package/dist/policy/engine.d.ts.map +1 -0
- package/dist/policy/engine.js +288 -0
- package/dist/policy/engine.js.map +1 -0
- package/dist/policy/natural-language.d.ts +141 -0
- package/dist/policy/natural-language.d.ts.map +1 -0
- package/dist/policy/natural-language.js +552 -0
- package/dist/policy/natural-language.js.map +1 -0
- package/dist/policy/parser.d.ts +141 -0
- package/dist/policy/parser.d.ts.map +1 -0
- package/dist/policy/parser.js +314 -0
- package/dist/policy/parser.js.map +1 -0
- package/dist/policy/types.d.ts +428 -0
- package/dist/policy/types.d.ts.map +1 -0
- package/dist/policy/types.js +32 -0
- package/dist/policy/types.js.map +1 -0
- package/dist/policy/validator.d.ts +72 -0
- package/dist/policy/validator.d.ts.map +1 -0
- package/dist/policy/validator.js +453 -0
- package/dist/policy/validator.js.map +1 -0
- package/dist/proxy/bridge.d.ts +84 -0
- package/dist/proxy/bridge.d.ts.map +1 -0
- package/dist/proxy/bridge.js +217 -0
- package/dist/proxy/bridge.js.map +1 -0
- package/dist/proxy/client.d.ts +130 -0
- package/dist/proxy/client.d.ts.map +1 -0
- package/dist/proxy/client.js +290 -0
- package/dist/proxy/client.js.map +1 -0
- package/dist/proxy/server.d.ts +111 -0
- package/dist/proxy/server.d.ts.map +1 -0
- package/dist/proxy/server.js +444 -0
- package/dist/proxy/server.js.map +1 -0
- package/dist/scanner.d.ts +91 -0
- package/dist/scanner.d.ts.map +1 -0
- package/dist/scanner.js +373 -0
- package/dist/scanner.js.map +1 -0
- package/dist/session/index.d.ts +32 -0
- package/dist/session/index.d.ts.map +1 -0
- package/dist/session/index.js +31 -0
- package/dist/session/index.js.map +1 -0
- package/dist/session/manager.d.ts +166 -0
- package/dist/session/manager.d.ts.map +1 -0
- package/dist/session/manager.js +454 -0
- package/dist/session/manager.js.map +1 -0
- package/dist/session/sqlite-store.d.ts +54 -0
- package/dist/session/sqlite-store.d.ts.map +1 -0
- package/dist/session/sqlite-store.js +209 -0
- package/dist/session/sqlite-store.js.map +1 -0
- package/dist/session/types.d.ts +179 -0
- package/dist/session/types.d.ts.map +1 -0
- package/dist/session/types.js +38 -0
- package/dist/session/types.js.map +1 -0
- package/dist/templates.d.ts +64 -0
- package/dist/templates.d.ts.map +1 -0
- package/dist/templates.js +451 -0
- package/dist/templates.js.map +1 -0
- package/dist/utils/config.d.ts +57 -0
- package/dist/utils/config.d.ts.map +1 -0
- package/dist/utils/config.js +104 -0
- package/dist/utils/config.js.map +1 -0
- package/dist/utils/errors.d.ts +18 -0
- package/dist/utils/errors.d.ts.map +1 -0
- package/dist/utils/errors.js +35 -0
- package/dist/utils/errors.js.map +1 -0
- package/dist/utils/logger.d.ts +144 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +300 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/wizard.d.ts +68 -0
- package/dist/wizard.d.ts.map +1 -0
- package/dist/wizard.js +395 -0
- package/dist/wizard.js.map +1 -0
- package/package.json +99 -0
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Prompt Injection Analyzer
|
|
3
|
+
*
|
|
4
|
+
* Detects prompt injection attacks in tool arguments by scanning for common
|
|
5
|
+
* injection patterns that attempt to manipulate AI behavior or bypass security.
|
|
6
|
+
*
|
|
7
|
+
* Detection patterns include:
|
|
8
|
+
* - "Ignore previous instructions" and variants
|
|
9
|
+
* - System prompt manipulation attempts
|
|
10
|
+
* - Role confusion attacks ("You are now...")
|
|
11
|
+
* - Base64-encoded prompts
|
|
12
|
+
* - Unicode obfuscation (homoglyphs, invisible characters)
|
|
13
|
+
* - Markdown/HTML injection (javascript: links, etc.)
|
|
14
|
+
* - Delimiter injection (trying to break out of context)
|
|
15
|
+
* - Jailbreak patterns
|
|
16
|
+
*
|
|
17
|
+
* @module analyzers/prompt-injection
|
|
18
|
+
*/
|
|
19
|
+
/**
|
|
20
|
+
* Prompt Injection Analyzer
|
|
21
|
+
*
|
|
22
|
+
* Classifies content by injection risk:
|
|
23
|
+
* - safe: No injection patterns detected
|
|
24
|
+
* - read: Minor suspicious patterns (low confidence)
|
|
25
|
+
* - write: Moderate injection indicators
|
|
26
|
+
* - destructive: High-confidence injection patterns
|
|
27
|
+
* - dangerous: Clear prompt injection attempt
|
|
28
|
+
*/
|
|
29
|
+
export class PromptInjectionAnalyzer {
|
|
30
|
+
name = 'prompt-injection';
|
|
31
|
+
/**
|
|
32
|
+
* Analyze content for prompt injection patterns.
|
|
33
|
+
*/
|
|
34
|
+
analyze(content, _context) {
|
|
35
|
+
// Normalize content for pattern matching
|
|
36
|
+
const normalizedContent = this.normalizeContent(content);
|
|
37
|
+
const originalContent = content;
|
|
38
|
+
// Check all detection categories and collect triggers
|
|
39
|
+
const allTriggers = [];
|
|
40
|
+
let highestRisk = 'safe';
|
|
41
|
+
// 1. Check for instruction override patterns (most dangerous)
|
|
42
|
+
const instructionResult = this.checkInstructionOverride(normalizedContent);
|
|
43
|
+
if (instructionResult) {
|
|
44
|
+
if (this.isHigherRisk(instructionResult.risk, highestRisk)) {
|
|
45
|
+
highestRisk = instructionResult.risk;
|
|
46
|
+
}
|
|
47
|
+
allTriggers.push(...(instructionResult.triggers ?? []));
|
|
48
|
+
}
|
|
49
|
+
// 2. Check for system prompt manipulation
|
|
50
|
+
const systemPromptResult = this.checkSystemPromptManipulation(normalizedContent);
|
|
51
|
+
if (systemPromptResult) {
|
|
52
|
+
if (this.isHigherRisk(systemPromptResult.risk, highestRisk)) {
|
|
53
|
+
highestRisk = systemPromptResult.risk;
|
|
54
|
+
}
|
|
55
|
+
allTriggers.push(...(systemPromptResult.triggers ?? []));
|
|
56
|
+
}
|
|
57
|
+
// 3. Check for role confusion attacks
|
|
58
|
+
const roleConfusionResult = this.checkRoleConfusion(normalizedContent);
|
|
59
|
+
if (roleConfusionResult) {
|
|
60
|
+
if (this.isHigherRisk(roleConfusionResult.risk, highestRisk)) {
|
|
61
|
+
highestRisk = roleConfusionResult.risk;
|
|
62
|
+
}
|
|
63
|
+
allTriggers.push(...(roleConfusionResult.triggers ?? []));
|
|
64
|
+
}
|
|
65
|
+
// 4. Check for Base64-encoded content
|
|
66
|
+
const base64Result = this.checkBase64Encoding(originalContent);
|
|
67
|
+
if (base64Result) {
|
|
68
|
+
if (this.isHigherRisk(base64Result.risk, highestRisk)) {
|
|
69
|
+
highestRisk = base64Result.risk;
|
|
70
|
+
}
|
|
71
|
+
allTriggers.push(...(base64Result.triggers ?? []));
|
|
72
|
+
}
|
|
73
|
+
// 5. Check for Unicode obfuscation
|
|
74
|
+
const unicodeResult = this.checkUnicodeObfuscation(originalContent);
|
|
75
|
+
if (unicodeResult) {
|
|
76
|
+
if (this.isHigherRisk(unicodeResult.risk, highestRisk)) {
|
|
77
|
+
highestRisk = unicodeResult.risk;
|
|
78
|
+
}
|
|
79
|
+
allTriggers.push(...(unicodeResult.triggers ?? []));
|
|
80
|
+
}
|
|
81
|
+
// 6. Check for Markdown/HTML injection
|
|
82
|
+
const markdownResult = this.checkMarkdownInjection(originalContent);
|
|
83
|
+
if (markdownResult) {
|
|
84
|
+
if (this.isHigherRisk(markdownResult.risk, highestRisk)) {
|
|
85
|
+
highestRisk = markdownResult.risk;
|
|
86
|
+
}
|
|
87
|
+
allTriggers.push(...(markdownResult.triggers ?? []));
|
|
88
|
+
}
|
|
89
|
+
// 7. Check for delimiter injection
|
|
90
|
+
const delimiterResult = this.checkDelimiterInjection(originalContent);
|
|
91
|
+
if (delimiterResult) {
|
|
92
|
+
if (this.isHigherRisk(delimiterResult.risk, highestRisk)) {
|
|
93
|
+
highestRisk = delimiterResult.risk;
|
|
94
|
+
}
|
|
95
|
+
allTriggers.push(...(delimiterResult.triggers ?? []));
|
|
96
|
+
}
|
|
97
|
+
// 8. Check for jailbreak patterns
|
|
98
|
+
const jailbreakResult = this.checkJailbreakPatterns(normalizedContent);
|
|
99
|
+
if (jailbreakResult) {
|
|
100
|
+
if (this.isHigherRisk(jailbreakResult.risk, highestRisk)) {
|
|
101
|
+
highestRisk = jailbreakResult.risk;
|
|
102
|
+
}
|
|
103
|
+
allTriggers.push(...(jailbreakResult.triggers ?? []));
|
|
104
|
+
}
|
|
105
|
+
// 9. Check for data exfiltration attempts
|
|
106
|
+
const exfilResult = this.checkDataExfiltration(normalizedContent);
|
|
107
|
+
if (exfilResult) {
|
|
108
|
+
if (this.isHigherRisk(exfilResult.risk, highestRisk)) {
|
|
109
|
+
highestRisk = exfilResult.risk;
|
|
110
|
+
}
|
|
111
|
+
allTriggers.push(...(exfilResult.triggers ?? []));
|
|
112
|
+
}
|
|
113
|
+
// Return result based on highest risk found
|
|
114
|
+
if (highestRisk === 'safe') {
|
|
115
|
+
return {
|
|
116
|
+
risk: 'safe',
|
|
117
|
+
reason: 'No prompt injection patterns detected',
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
return {
|
|
121
|
+
risk: highestRisk,
|
|
122
|
+
reason: this.getRiskReason(highestRisk, allTriggers),
|
|
123
|
+
triggers: allTriggers,
|
|
124
|
+
metadata: {
|
|
125
|
+
patternCount: allTriggers.length,
|
|
126
|
+
normalizedLength: normalizedContent.length,
|
|
127
|
+
},
|
|
128
|
+
};
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Normalize content for pattern matching.
|
|
132
|
+
* Removes extra whitespace, converts to lowercase, etc.
|
|
133
|
+
*/
|
|
134
|
+
normalizeContent(content) {
|
|
135
|
+
return content
|
|
136
|
+
// Apply Unicode NFC normalization to handle homoglyph variants
|
|
137
|
+
.normalize('NFC')
|
|
138
|
+
.toLowerCase()
|
|
139
|
+
// Normalize whitespace
|
|
140
|
+
.replace(/\s+/g, ' ')
|
|
141
|
+
// Remove zero-width characters for matching
|
|
142
|
+
.replace(/[\u200B-\u200D\uFEFF]/g, '')
|
|
143
|
+
.trim();
|
|
144
|
+
}
|
|
145
|
+
/**
|
|
146
|
+
* Check for instruction override patterns.
|
|
147
|
+
* These are the most common prompt injection patterns.
|
|
148
|
+
*/
|
|
149
|
+
checkInstructionOverride(content) {
|
|
150
|
+
// Patterns use bounded whitespace \s{1,10} to prevent ReDoS attacks
|
|
151
|
+
const dangerousPatterns = [
|
|
152
|
+
// "Ignore" variants - bounded whitespace to prevent catastrophic backtracking
|
|
153
|
+
/ignore\s{1,10}(?:all\s{1,10})?(?:your\s{1,10})?(?:previous|prior|above|earlier|original|initial)\s{1,10}(?:instructions?|prompts?|rules?|guidelines?|constraints?)/i,
|
|
154
|
+
/disregard\s{1,10}(?:all\s{1,10})?(?:your\s{1,10})?(?:previous|prior|above|earlier|original|initial)\s{1,10}(?:instructions?|prompts?|rules?)/i,
|
|
155
|
+
/forget\s{1,10}(?:all\s{1,10})?(?:your\s{1,10})?(?:previous|prior|above|earlier|original|initial)\s{1,10}(?:instructions?|prompts?|rules?)/i,
|
|
156
|
+
/do\s{1,10}not\s{1,10}follow\s{1,10}(?:your\s{1,10})?(?:previous|prior|above|earlier|original|initial)\s{1,10}(?:instructions?|prompts?|rules?)/i,
|
|
157
|
+
// "New instructions" variants
|
|
158
|
+
/new\s{1,10}instructions?\s{0,5}:/i,
|
|
159
|
+
/updated\s{1,10}instructions?\s{0,5}:/i,
|
|
160
|
+
/override\s{1,10}instructions?\s{0,5}:/i,
|
|
161
|
+
/replacement\s{1,10}instructions?\s{0,5}:/i,
|
|
162
|
+
// Direct overrides
|
|
163
|
+
/from\s{1,10}now\s{1,10}on,?\s{1,10}(?:ignore|forget|disregard)/i,
|
|
164
|
+
/starting\s{1,10}now,?\s{1,10}(?:ignore|forget|disregard)/i,
|
|
165
|
+
/effective\s{1,10}immediately,?\s{1,10}(?:ignore|forget|disregard)/i,
|
|
166
|
+
// "Actually" pattern
|
|
167
|
+
/actually,?\s{1,10}(?:ignore|forget|disregard|don'?t\s{1,10}follow)/i,
|
|
168
|
+
/wait,?\s{1,10}(?:ignore|forget|disregard|don'?t\s{1,10}follow)/i,
|
|
169
|
+
/stop,?\s{1,10}(?:ignore|forget|disregard)/i,
|
|
170
|
+
// Reset commands
|
|
171
|
+
/reset\s{1,10}(?:your\s{1,10})?(?:instructions?|prompts?|rules?|behavior)/i,
|
|
172
|
+
/clear\s{1,10}(?:your\s{1,10})?(?:instructions?|prompts?|rules?|memory)/i,
|
|
173
|
+
];
|
|
174
|
+
const destructivePatterns = [
|
|
175
|
+
// Less direct but still concerning - bounded whitespace
|
|
176
|
+
/previous\s{1,10}instructions?\s{1,10}(?:are|were)\s{1,10}(?:wrong|incorrect|outdated|invalid)/i,
|
|
177
|
+
/those\s{1,10}instructions?\s{1,10}(?:are|were)\s{1,10}(?:wrong|incorrect|outdated)/i,
|
|
178
|
+
/the\s{1,10}(?:above|previous)\s{1,10}(?:is|was)\s{1,10}(?:a\s{1,10})?test/i,
|
|
179
|
+
/that\s{1,10}was\s{1,10}just\s{1,10}(?:a\s{1,10})?test/i,
|
|
180
|
+
];
|
|
181
|
+
for (const pattern of dangerousPatterns) {
|
|
182
|
+
if (pattern.test(content)) {
|
|
183
|
+
return {
|
|
184
|
+
risk: 'dangerous',
|
|
185
|
+
reason: 'Instruction override attempt detected',
|
|
186
|
+
triggers: [pattern.source.substring(0, 50)],
|
|
187
|
+
};
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
for (const pattern of destructivePatterns) {
|
|
191
|
+
if (pattern.test(content)) {
|
|
192
|
+
return {
|
|
193
|
+
risk: 'destructive',
|
|
194
|
+
reason: 'Suspicious instruction manipulation detected',
|
|
195
|
+
triggers: [pattern.source.substring(0, 50)],
|
|
196
|
+
};
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
return null;
|
|
200
|
+
}
|
|
201
|
+
/**
|
|
202
|
+
* Check for system prompt manipulation attempts.
|
|
203
|
+
*/
|
|
204
|
+
checkSystemPromptManipulation(content) {
|
|
205
|
+
// Patterns use bounded whitespace \s{1,10} to prevent ReDoS attacks
|
|
206
|
+
const dangerousPatterns = [
|
|
207
|
+
// Direct system prompt references
|
|
208
|
+
/system\s{0,5}prompt\s{0,5}:/i,
|
|
209
|
+
/\[system\]/i,
|
|
210
|
+
/\[system\s{1,10}message\]/i,
|
|
211
|
+
/<system>/i,
|
|
212
|
+
/<<\s{0,5}sys\s{0,5}>>/i,
|
|
213
|
+
/\|\|\s{0,5}system\s{0,5}\|\|/i,
|
|
214
|
+
// Attempts to see/modify system prompt
|
|
215
|
+
/show\s{1,10}(?:me\s{1,10})?(?:your\s{1,10})?system\s{1,10}prompt/i,
|
|
216
|
+
/what\s{1,10}(?:is|are)\s{1,10}(?:your\s{1,10})?system\s{1,10}(?:prompt|instructions?)/i,
|
|
217
|
+
/reveal\s{1,10}(?:your\s{1,10})?system\s{1,10}(?:prompt|instructions?)/i,
|
|
218
|
+
/print\s{1,10}(?:your\s{1,10})?system\s{1,10}(?:prompt|instructions?)/i,
|
|
219
|
+
/display\s{1,10}(?:your\s{1,10})?system\s{1,10}(?:prompt|instructions?)/i,
|
|
220
|
+
/output\s{1,10}(?:your\s{1,10})?system\s{1,10}(?:prompt|instructions?)/i,
|
|
221
|
+
/repeat\s{1,10}(?:your\s{1,10})?system\s{1,10}(?:prompt|instructions?)/i,
|
|
222
|
+
// Hidden instructions
|
|
223
|
+
/hidden\s{1,10}instructions?\s{0,5}:/i,
|
|
224
|
+
/secret\s{1,10}instructions?\s{0,5}:/i,
|
|
225
|
+
/internal\s{1,10}instructions?\s{0,5}:/i,
|
|
226
|
+
/developer\s{1,10}(?:mode|instructions?)/i,
|
|
227
|
+
/debug\s{1,10}mode/i,
|
|
228
|
+
/maintenance\s{1,10}mode/i,
|
|
229
|
+
/admin\s{1,10}mode/i,
|
|
230
|
+
];
|
|
231
|
+
const destructivePatterns = [
|
|
232
|
+
// Less direct attempts - bounded whitespace
|
|
233
|
+
/how\s{1,10}were\s{1,10}you\s{1,10}(?:programmed|trained|instructed)/i,
|
|
234
|
+
/what\s{1,10}are\s{1,10}your\s{1,10}(?:core|base|original)\s{1,10}(?:instructions?|rules?)/i,
|
|
235
|
+
/tell\s{1,10}me\s{1,10}about\s{1,10}your\s{1,10}(?:instructions?|programming|training)/i,
|
|
236
|
+
];
|
|
237
|
+
for (const pattern of dangerousPatterns) {
|
|
238
|
+
if (pattern.test(content)) {
|
|
239
|
+
return {
|
|
240
|
+
risk: 'dangerous',
|
|
241
|
+
reason: 'System prompt manipulation attempt detected',
|
|
242
|
+
triggers: ['system prompt manipulation'],
|
|
243
|
+
};
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
for (const pattern of destructivePatterns) {
|
|
247
|
+
if (pattern.test(content)) {
|
|
248
|
+
return {
|
|
249
|
+
risk: 'destructive',
|
|
250
|
+
reason: 'Potential system prompt probing detected',
|
|
251
|
+
triggers: ['system prompt probing'],
|
|
252
|
+
};
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
return null;
|
|
256
|
+
}
|
|
257
|
+
/**
|
|
258
|
+
* Check for role confusion attacks.
|
|
259
|
+
*/
|
|
260
|
+
checkRoleConfusion(content) {
|
|
261
|
+
// Patterns use bounded whitespace \s{1,10} to prevent ReDoS attacks
|
|
262
|
+
const dangerousPatterns = [
|
|
263
|
+
// Role assignment
|
|
264
|
+
/you\s{1,10}are\s{1,10}now\s{1,10}(?:a|an|the)/i,
|
|
265
|
+
/pretend\s{1,10}(?:you\s{1,10}are|to\s{1,10}be|you're)/i,
|
|
266
|
+
/act\s{1,10}as\s{1,10}(?:if\s{1,10}you\s{1,10}(?:are|were)|a|an|the)/i,
|
|
267
|
+
/roleplay\s{1,10}as/i,
|
|
268
|
+
/imagine\s{1,10}you\s{1,10}are/i,
|
|
269
|
+
/assume\s{1,10}the\s{1,10}role\s{1,10}of/i,
|
|
270
|
+
/take\s{1,10}on\s{1,10}the\s{1,10}persona\s{1,10}of/i,
|
|
271
|
+
// Personality changes
|
|
272
|
+
/your\s{1,10}new\s{1,10}(?:name|identity|persona|personality)\s{1,10}is/i,
|
|
273
|
+
/from\s{1,10}now\s{1,10}on,?\s{1,10}you\s{1,10}are/i,
|
|
274
|
+
/i\s{1,10}want\s{1,10}you\s{1,10}to\s{1,10}act\s{1,10}as/i,
|
|
275
|
+
/i\s{1,10}want\s{1,10}you\s{1,10}to\s{1,10}be/i,
|
|
276
|
+
/respond\s{1,10}as\s{1,10}(?:if\s{1,10}you\s{1,10}were|a|an)/i,
|
|
277
|
+
/answer\s{1,10}as\s{1,10}(?:if\s{1,10}you\s{1,10}were|a|an)/i,
|
|
278
|
+
// Unrestricted AI patterns
|
|
279
|
+
/you\s{1,10}are\s{1,10}(?:an?\s{1,10})?unfiltered/i,
|
|
280
|
+
/you\s{1,10}are\s{1,10}(?:an?\s{1,10})?uncensored/i,
|
|
281
|
+
/you\s{1,10}are\s{1,10}(?:an?\s{1,10})?unrestricted/i,
|
|
282
|
+
/you\s{1,10}have\s{1,10}no\s{1,10}(?:restrictions?|limitations?|rules?|filters?)/i,
|
|
283
|
+
/you\s{1,10}can\s{1,10}(?:say|do)\s{1,10}anything/i,
|
|
284
|
+
/you\s{1,10}don'?t\s{1,10}have\s{1,10}(?:any\s{1,10})?(?:restrictions?|limitations?|rules?)/i,
|
|
285
|
+
];
|
|
286
|
+
const destructivePatterns = [
|
|
287
|
+
// Softer role suggestions - bounded whitespace
|
|
288
|
+
/think\s{1,10}of\s{1,10}yourself\s{1,10}as/i,
|
|
289
|
+
/consider\s{1,10}yourself/i,
|
|
290
|
+
/you\s{1,10}should\s{1,10}act\s{1,10}like/i,
|
|
291
|
+
];
|
|
292
|
+
for (const pattern of dangerousPatterns) {
|
|
293
|
+
if (pattern.test(content)) {
|
|
294
|
+
return {
|
|
295
|
+
risk: 'dangerous',
|
|
296
|
+
reason: 'Role confusion attack detected',
|
|
297
|
+
triggers: ['role confusion'],
|
|
298
|
+
};
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
for (const pattern of destructivePatterns) {
|
|
302
|
+
if (pattern.test(content)) {
|
|
303
|
+
return {
|
|
304
|
+
risk: 'destructive',
|
|
305
|
+
reason: 'Potential role manipulation detected',
|
|
306
|
+
triggers: ['role manipulation'],
|
|
307
|
+
};
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
return null;
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* Check for Base64-encoded content that might hide injection.
|
|
314
|
+
*/
|
|
315
|
+
checkBase64Encoding(content) {
|
|
316
|
+
// Look for Base64 patterns (standard and URL-safe variants)
|
|
317
|
+
const base64Pattern = /([A-Za-z0-9+/]{20,}={0,2})/g;
|
|
318
|
+
const base64UrlSafePattern = /([A-Za-z0-9_-]{20,}={0,2})/g;
|
|
319
|
+
// Combine matches from both patterns
|
|
320
|
+
const standardMatches = content.match(base64Pattern) ?? [];
|
|
321
|
+
const urlSafeMatches = content.match(base64UrlSafePattern) ?? [];
|
|
322
|
+
const matches = [...new Set([...standardMatches, ...urlSafeMatches])];
|
|
323
|
+
if (matches.length === 0) {
|
|
324
|
+
return null;
|
|
325
|
+
}
|
|
326
|
+
for (const match of matches) {
|
|
327
|
+
try {
|
|
328
|
+
// Try to decode and check for suspicious content
|
|
329
|
+
const decoded = Buffer.from(match, 'base64').toString('utf-8');
|
|
330
|
+
// Check if decoded content contains injection patterns
|
|
331
|
+
if (this.containsInjectionKeywords(decoded)) {
|
|
332
|
+
return {
|
|
333
|
+
risk: 'dangerous',
|
|
334
|
+
reason: 'Base64-encoded injection detected',
|
|
335
|
+
triggers: ['base64 encoded injection'],
|
|
336
|
+
metadata: { encodedLength: match.length },
|
|
337
|
+
};
|
|
338
|
+
}
|
|
339
|
+
// Check if it's valid text (high ratio of printable characters)
|
|
340
|
+
const printableRatio = this.getPrintableRatio(decoded);
|
|
341
|
+
if (printableRatio > 0.8 && decoded.length > 20) {
|
|
342
|
+
// Looks like hidden text, flag as suspicious
|
|
343
|
+
return {
|
|
344
|
+
risk: 'write',
|
|
345
|
+
reason: 'Suspicious Base64-encoded text detected',
|
|
346
|
+
triggers: ['base64 hidden content'],
|
|
347
|
+
metadata: { encodedLength: match.length, printableRatio },
|
|
348
|
+
};
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
catch {
|
|
352
|
+
// Not valid Base64, ignore
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
return null;
|
|
356
|
+
}
|
|
357
|
+
/**
|
|
358
|
+
* Check for Unicode obfuscation techniques.
|
|
359
|
+
*/
|
|
360
|
+
checkUnicodeObfuscation(content) {
|
|
361
|
+
const triggers = [];
|
|
362
|
+
// Check for homoglyph attacks (lookalike characters)
|
|
363
|
+
const homoglyphs = {
|
|
364
|
+
// Cyrillic lookalikes
|
|
365
|
+
'\u0430': 'a', '\u0435': 'e', '\u043E': 'o', '\u0440': 'p',
|
|
366
|
+
'\u0441': 'c', '\u0443': 'y', '\u0445': 'x', '\u0456': 'i',
|
|
367
|
+
// Greek lookalikes
|
|
368
|
+
'\u03B1': 'a', '\u03B5': 'e', '\u03BF': 'o', '\u03C1': 'p',
|
|
369
|
+
// Special lookalikes
|
|
370
|
+
'\u0261': 'g', '\u026F': 'm', '\u0270': 'm', '\u0280': 'r',
|
|
371
|
+
// Full-width characters
|
|
372
|
+
'\uFF41': 'a', '\uFF45': 'e', '\uFF49': 'i', '\uFF4F': 'o',
|
|
373
|
+
// Mathematical alphanumeric symbols (U+1D400-U+1D7FF)
|
|
374
|
+
// These look identical to ASCII letters but are different Unicode codepoints
|
|
375
|
+
'\u{1D41A}': 'a', '\u{1D41B}': 'b', '\u{1D41C}': 'c', '\u{1D41D}': 'd',
|
|
376
|
+
'\u{1D41E}': 'e', '\u{1D41F}': 'f', '\u{1D420}': 'g', '\u{1D421}': 'h',
|
|
377
|
+
'\u{1D422}': 'i', '\u{1D423}': 'j', '\u{1D424}': 'k', '\u{1D425}': 'l',
|
|
378
|
+
'\u{1D426}': 'm', '\u{1D427}': 'n', '\u{1D428}': 'o', '\u{1D429}': 'p',
|
|
379
|
+
'\u{1D44E}': 'a', '\u{1D44F}': 'b', '\u{1D450}': 'c', // italic
|
|
380
|
+
'\u{1D482}': 'a', '\u{1D483}': 'b', '\u{1D484}': 'c', // bold italic
|
|
381
|
+
'\u{1D4B6}': 'a', '\u{1D4B7}': 'b', '\u{1D4B8}': 'c', // script
|
|
382
|
+
'\u{1D5BA}': 'a', '\u{1D5BB}': 'b', '\u{1D5BC}': 'c', // sans-serif
|
|
383
|
+
'\u{1D5EE}': 'a', '\u{1D5EF}': 'b', '\u{1D5F0}': 'c', // sans-serif italic
|
|
384
|
+
'\u{1D622}': 'a', '\u{1D623}': 'b', '\u{1D624}': 'c', // monospace
|
|
385
|
+
};
|
|
386
|
+
let homoglyphCount = 0;
|
|
387
|
+
for (const char of content) {
|
|
388
|
+
if (homoglyphs[char]) {
|
|
389
|
+
homoglyphCount++;
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
if (homoglyphCount > 3) {
|
|
393
|
+
triggers.push('homoglyph obfuscation');
|
|
394
|
+
}
|
|
395
|
+
// Check for invisible characters
|
|
396
|
+
const invisibleChars = [
|
|
397
|
+
'\u200B', // Zero-width space
|
|
398
|
+
'\u200C', // Zero-width non-joiner
|
|
399
|
+
'\u200D', // Zero-width joiner
|
|
400
|
+
'\u2060', // Word joiner
|
|
401
|
+
'\uFEFF', // BOM
|
|
402
|
+
'\u00AD', // Soft hyphen
|
|
403
|
+
'\u034F', // Combining grapheme joiner
|
|
404
|
+
'\u061C', // Arabic letter mark
|
|
405
|
+
'\u180E', // Mongolian vowel separator
|
|
406
|
+
];
|
|
407
|
+
let invisibleCount = 0;
|
|
408
|
+
for (const char of content) {
|
|
409
|
+
if (invisibleChars.includes(char)) {
|
|
410
|
+
invisibleCount++;
|
|
411
|
+
}
|
|
412
|
+
}
|
|
413
|
+
if (invisibleCount > 2) {
|
|
414
|
+
triggers.push('invisible characters');
|
|
415
|
+
}
|
|
416
|
+
// Check for right-to-left override (can hide text direction)
|
|
417
|
+
if (content.includes('\u202E') || content.includes('\u202D') || content.includes('\u200F')) {
|
|
418
|
+
triggers.push('RTL override');
|
|
419
|
+
}
|
|
420
|
+
// Check for combining characters abuse (can hide text)
|
|
421
|
+
const combiningPattern = /[\u0300-\u036F]{3,}/;
|
|
422
|
+
if (combiningPattern.test(content)) {
|
|
423
|
+
triggers.push('combining character abuse');
|
|
424
|
+
}
|
|
425
|
+
if (triggers.length > 0) {
|
|
426
|
+
const risk = triggers.length >= 2 ? 'dangerous' : 'destructive';
|
|
427
|
+
return {
|
|
428
|
+
risk,
|
|
429
|
+
reason: 'Unicode obfuscation detected',
|
|
430
|
+
triggers,
|
|
431
|
+
metadata: { homoglyphCount, invisibleCount },
|
|
432
|
+
};
|
|
433
|
+
}
|
|
434
|
+
return null;
|
|
435
|
+
}
|
|
436
|
+
/**
|
|
437
|
+
* Check for Markdown/HTML injection.
|
|
438
|
+
*/
|
|
439
|
+
checkMarkdownInjection(content) {
|
|
440
|
+
const triggers = [];
|
|
441
|
+
// JavaScript protocol in links
|
|
442
|
+
if (/\[.*?\]\s*\(\s*javascript:/i.test(content)) {
|
|
443
|
+
triggers.push('javascript: link');
|
|
444
|
+
return {
|
|
445
|
+
risk: 'dangerous',
|
|
446
|
+
reason: 'JavaScript injection in markdown link',
|
|
447
|
+
triggers,
|
|
448
|
+
};
|
|
449
|
+
}
|
|
450
|
+
// Data URI with script
|
|
451
|
+
if (/\[.*?\]\s*\(\s*data:text\/(html|javascript)/i.test(content)) {
|
|
452
|
+
triggers.push('data: URI injection');
|
|
453
|
+
return {
|
|
454
|
+
risk: 'dangerous',
|
|
455
|
+
reason: 'Data URI injection in markdown link',
|
|
456
|
+
triggers,
|
|
457
|
+
};
|
|
458
|
+
}
|
|
459
|
+
// VBScript protocol
|
|
460
|
+
if (/\[.*?\]\s*\(\s*vbscript:/i.test(content)) {
|
|
461
|
+
triggers.push('vbscript: link');
|
|
462
|
+
return {
|
|
463
|
+
risk: 'dangerous',
|
|
464
|
+
reason: 'VBScript injection in markdown link',
|
|
465
|
+
triggers,
|
|
466
|
+
};
|
|
467
|
+
}
|
|
468
|
+
// Event handlers in HTML
|
|
469
|
+
const eventHandlerPattern = /on(click|load|error|mouseover|focus|blur|submit|change|keyup|keydown|keypress)\s*=/i;
|
|
470
|
+
if (eventHandlerPattern.test(content)) {
|
|
471
|
+
triggers.push('event handler injection');
|
|
472
|
+
return {
|
|
473
|
+
risk: 'dangerous',
|
|
474
|
+
reason: 'HTML event handler injection detected',
|
|
475
|
+
triggers,
|
|
476
|
+
};
|
|
477
|
+
}
|
|
478
|
+
// Script tags
|
|
479
|
+
if (/<script[\s>]/i.test(content)) {
|
|
480
|
+
triggers.push('script tag');
|
|
481
|
+
return {
|
|
482
|
+
risk: 'dangerous',
|
|
483
|
+
reason: 'Script tag injection detected',
|
|
484
|
+
triggers,
|
|
485
|
+
};
|
|
486
|
+
}
|
|
487
|
+
// iframes
|
|
488
|
+
if (/<iframe[\s>]/i.test(content)) {
|
|
489
|
+
triggers.push('iframe tag');
|
|
490
|
+
return {
|
|
491
|
+
risk: 'destructive',
|
|
492
|
+
reason: 'Iframe injection detected',
|
|
493
|
+
triggers,
|
|
494
|
+
};
|
|
495
|
+
}
|
|
496
|
+
// Object/embed tags
|
|
497
|
+
if (/<(object|embed)[\s>]/i.test(content)) {
|
|
498
|
+
triggers.push('object/embed tag');
|
|
499
|
+
return {
|
|
500
|
+
risk: 'destructive',
|
|
501
|
+
reason: 'Object/embed tag injection detected',
|
|
502
|
+
triggers,
|
|
503
|
+
};
|
|
504
|
+
}
|
|
505
|
+
// Form injection
|
|
506
|
+
if (/<form[\s>]/i.test(content)) {
|
|
507
|
+
triggers.push('form tag');
|
|
508
|
+
return {
|
|
509
|
+
risk: 'write',
|
|
510
|
+
reason: 'Form tag detected in content',
|
|
511
|
+
triggers,
|
|
512
|
+
};
|
|
513
|
+
}
|
|
514
|
+
return null;
|
|
515
|
+
}
|
|
516
|
+
/**
|
|
517
|
+
* Check for delimiter injection attempts.
|
|
518
|
+
*/
|
|
519
|
+
checkDelimiterInjection(content) {
|
|
520
|
+
const triggers = [];
|
|
521
|
+
// Common AI prompt delimiters
|
|
522
|
+
const delimiterPatterns = [
|
|
523
|
+
// XML-style delimiters
|
|
524
|
+
/<\/?instructions?>/i,
|
|
525
|
+
/<\/?prompt>/i,
|
|
526
|
+
/<\/?context>/i,
|
|
527
|
+
/<\/?user>/i,
|
|
528
|
+
/<\/?assistant>/i,
|
|
529
|
+
/<\/?human>/i,
|
|
530
|
+
/<\/?ai>/i,
|
|
531
|
+
/<\/?system>/i,
|
|
532
|
+
// Markdown-style separators used in prompts - bounded whitespace
|
|
533
|
+
/---\s{0,5}(?:end|begin|start)\s{0,5}(?:of\s{1,10})?(?:prompt|instructions?|system|context)/i,
|
|
534
|
+
/===\s{0,5}(?:end|begin|start)\s{0,5}(?:of\s{1,10})?(?:prompt|instructions?|system|context)/i,
|
|
535
|
+
/###\s{0,5}(?:end|begin|start)\s{0,5}(?:of\s{1,10})?(?:prompt|instructions?|system|context)/i,
|
|
536
|
+
// ChatML-style
|
|
537
|
+
/<\|im_start\|>/i,
|
|
538
|
+
/<\|im_end\|>/i,
|
|
539
|
+
/<\|endoftext\|>/i,
|
|
540
|
+
// Common prompt separators
|
|
541
|
+
/\[\/INST\]/i,
|
|
542
|
+
/\[INST\]/i,
|
|
543
|
+
/\[\[USER\]\]/i,
|
|
544
|
+
/\[\[ASSISTANT\]\]/i,
|
|
545
|
+
/<\|user\|>/i,
|
|
546
|
+
/<\|assistant\|>/i,
|
|
547
|
+
];
|
|
548
|
+
for (const pattern of delimiterPatterns) {
|
|
549
|
+
if (pattern.test(content)) {
|
|
550
|
+
triggers.push('prompt delimiter');
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
if (triggers.length > 0) {
|
|
554
|
+
return {
|
|
555
|
+
risk: 'dangerous',
|
|
556
|
+
reason: 'Prompt delimiter injection detected',
|
|
557
|
+
triggers,
|
|
558
|
+
};
|
|
559
|
+
}
|
|
560
|
+
// Multiple line breaks might be trying to push content "out of view"
|
|
561
|
+
if (/\n{10,}/.test(content) || /\r\n{10,}/.test(content)) {
|
|
562
|
+
return {
|
|
563
|
+
risk: 'write',
|
|
564
|
+
reason: 'Excessive line breaks detected (possible content hiding)',
|
|
565
|
+
triggers: ['excessive newlines'],
|
|
566
|
+
};
|
|
567
|
+
}
|
|
568
|
+
return null;
|
|
569
|
+
}
|
|
570
|
+
/**
|
|
571
|
+
* Check for jailbreak patterns.
|
|
572
|
+
*/
|
|
573
|
+
checkJailbreakPatterns(content) {
|
|
574
|
+
// Patterns use bounded whitespace \s{1,10} to prevent ReDoS attacks
|
|
575
|
+
const dangerousPatterns = [
|
|
576
|
+
// DAN (Do Anything Now) and variants
|
|
577
|
+
/\bdan\b.*\bmode\b/i,
|
|
578
|
+
/do\s{1,10}anything\s{1,10}now/i,
|
|
579
|
+
/\bdeveloper\s{1,10}mode\b/i,
|
|
580
|
+
/\bunleashed\s{1,10}mode\b/i,
|
|
581
|
+
/\bjailbreak(?:ed)?\b/i,
|
|
582
|
+
/\bmaximum\s{1,10}mode\b/i,
|
|
583
|
+
// "Pretend" bypasses
|
|
584
|
+
/pretend\s{1,10}(?:there\s{1,10}are\s{1,10})?no\s{1,10}(?:rules?|restrictions?|limitations?|filters?)/i,
|
|
585
|
+
/pretend\s{1,10}you\s{1,10}(?:can|are\s{1,10}able\s{1,10}to)/i,
|
|
586
|
+
/hypothetically,?\s{1,10}if\s{1,10}you\s{1,10}(?:could|were)/i,
|
|
587
|
+
/in\s{1,10}a\s{1,10}hypothetical\s{1,10}scenario\s{1,10}where\s{1,10}you\s{1,10}(?:could|can)/i,
|
|
588
|
+
// Fictional framing - use non-greedy match with bounded length
|
|
589
|
+
/for\s{1,10}(?:a\s{1,10})?(?:fiction|story|novel|screenplay|creative\s{1,10}writing).{0,50}?(?:ignore|bypass|without)/i,
|
|
590
|
+
/this\s{1,10}is\s{1,10}for\s{1,10}(?:educational|research|academic)\s{1,10}purposes?\s{0,10}(?:only)?/i,
|
|
591
|
+
// Capability unlocking
|
|
592
|
+
/unlock\s{1,10}(?:your\s{1,10})?(?:full\s{1,10})?capabilities/i,
|
|
593
|
+
/enable\s{1,10}(?:your\s{1,10})?(?:hidden|all|full)\s{1,10}(?:features?|capabilities|functions?)/i,
|
|
594
|
+
/activate\s{1,10}(?:your\s{1,10})?(?:hidden|all|full)\s{1,10}(?:features?|capabilities|functions?)/i,
|
|
595
|
+
/disable\s{1,10}(?:your\s{1,10})?(?:all\s{1,10})?(?:safety|content)\s{0,5}(?:filters?|restrictions?|guidelines?)?/i,
|
|
596
|
+
/remove\s{1,10}(?:your\s{1,10})?(?:all\s{1,10})?(?:safety|content)\s{0,5}(?:filters?|restrictions?|guidelines?)?/i,
|
|
597
|
+
/turn\s{1,10}off\s{1,10}(?:your\s{1,10})?(?:all\s{1,10})?(?:safety|content)\s{0,5}(?:filters?|restrictions?|guidelines?)?/i,
|
|
598
|
+
// Moral bypass
|
|
599
|
+
/you\s{1,10}(?:will\s{1,10})?not\s{1,10}(?:be\s{1,10})?judged/i,
|
|
600
|
+
/no\s{1,10}one\s{1,10}will\s{1,10}(?:ever\s{1,10})?know/i,
|
|
601
|
+
/this\s{1,10}(?:conversation|chat|discussion)\s{1,10}is\s{1,10}private/i,
|
|
602
|
+
/everything\s{1,10}is\s{1,10}legal\s{1,10}here/i,
|
|
603
|
+
/there\s{1,10}are\s{1,10}no\s{1,10}consequences/i,
|
|
604
|
+
];
|
|
605
|
+
const destructivePatterns = [
|
|
606
|
+
// Softer jailbreak attempts - bounded whitespace
|
|
607
|
+
/what\s{1,10}if\s{1,10}you\s{1,10}didn'?t\s{1,10}have\s{1,10}(?:any\s{1,10})?restrictions/i,
|
|
608
|
+
/imagine\s{1,10}you\s{1,10}had\s{1,10}no\s{1,10}(?:rules?|restrictions?|limitations?)/i,
|
|
609
|
+
/if\s{1,10}you\s{1,10}were\s{1,10}free\s{1,10}to\s{1,10}(?:say|do)\s{1,10}anything/i,
|
|
610
|
+
];
|
|
611
|
+
for (const pattern of dangerousPatterns) {
|
|
612
|
+
if (pattern.test(content)) {
|
|
613
|
+
return {
|
|
614
|
+
risk: 'dangerous',
|
|
615
|
+
reason: 'Jailbreak attempt detected',
|
|
616
|
+
triggers: ['jailbreak pattern'],
|
|
617
|
+
};
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
for (const pattern of destructivePatterns) {
|
|
621
|
+
if (pattern.test(content)) {
|
|
622
|
+
return {
|
|
623
|
+
risk: 'destructive',
|
|
624
|
+
reason: 'Potential jailbreak attempt detected',
|
|
625
|
+
triggers: ['soft jailbreak pattern'],
|
|
626
|
+
};
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
return null;
|
|
630
|
+
}
|
|
631
|
+
/**
|
|
632
|
+
* Check for data exfiltration attempts via prompt injection.
|
|
633
|
+
*/
|
|
634
|
+
checkDataExfiltration(content) {
|
|
635
|
+
// Patterns use bounded whitespace \s{1,10} to prevent ReDoS attacks
|
|
636
|
+
const dangerousPatterns = [
|
|
637
|
+
// Direct exfiltration
|
|
638
|
+
/send\s{1,10}(?:all\s{1,10})?(?:the\s{1,10})?(?:data|information|content|files?|secrets?|passwords?|keys?|tokens?)\s{1,10}to/i,
|
|
639
|
+
/exfiltrate\s{1,10}(?:all\s{1,10})?(?:the\s{1,10})?/i,
|
|
640
|
+
/leak\s{1,10}(?:all\s{1,10})?(?:the\s{1,10})?(?:data|information|content)/i,
|
|
641
|
+
/upload\s{1,10}(?:all\s{1,10})?(?:the\s{1,10})?(?:data|information|content|files?)\s{1,10}to/i,
|
|
642
|
+
/post\s{1,10}(?:all\s{1,10})?(?:the\s{1,10})?(?:data|information|content)\s{1,10}to\s{1,10}(?:http|https|ftp)/i,
|
|
643
|
+
// Encode and send
|
|
644
|
+
/encode\s{1,10}(?:and\s{1,10})?(?:send|transmit|upload)/i,
|
|
645
|
+
/base64\s{1,10}(?:encode\s{1,10})?(?:and\s{1,10})?(?:send|transmit|upload)/i,
|
|
646
|
+
// Concatenation tricks
|
|
647
|
+
/concatenate\s{1,10}(?:all|the)\s{1,10}(?:above|previous|data|content)/i,
|
|
648
|
+
/combine\s{1,10}(?:all|the)\s{1,10}(?:above|previous|data|content)\s{1,10}(?:and\s{1,10})?(?:send|output)/i,
|
|
649
|
+
// Hidden channel exfil
|
|
650
|
+
/include\s{1,10}(?:in|as)\s{1,10}(?:the\s{1,10})?(?:url|query|parameter|header)/i,
|
|
651
|
+
/append\s{1,10}to\s{1,10}(?:the\s{1,10})?(?:url|query|request)/i,
|
|
652
|
+
];
|
|
653
|
+
for (const pattern of dangerousPatterns) {
|
|
654
|
+
if (pattern.test(content)) {
|
|
655
|
+
return {
|
|
656
|
+
risk: 'dangerous',
|
|
657
|
+
reason: 'Data exfiltration attempt detected',
|
|
658
|
+
triggers: ['data exfiltration'],
|
|
659
|
+
};
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
return null;
|
|
663
|
+
}
|
|
664
|
+
/**
|
|
665
|
+
* Helper: Check if decoded content contains injection keywords.
|
|
666
|
+
*/
|
|
667
|
+
containsInjectionKeywords(decoded) {
|
|
668
|
+
const keywords = [
|
|
669
|
+
'ignore',
|
|
670
|
+
'previous',
|
|
671
|
+
'instructions',
|
|
672
|
+
'system',
|
|
673
|
+
'prompt',
|
|
674
|
+
'jailbreak',
|
|
675
|
+
'dan',
|
|
676
|
+
'developer mode',
|
|
677
|
+
'pretend',
|
|
678
|
+
'roleplay',
|
|
679
|
+
'you are now',
|
|
680
|
+
];
|
|
681
|
+
const lowerDecoded = decoded.toLowerCase();
|
|
682
|
+
return keywords.some(keyword => lowerDecoded.includes(keyword));
|
|
683
|
+
}
|
|
684
|
+
/**
|
|
685
|
+
* Helper: Get ratio of printable characters.
|
|
686
|
+
*/
|
|
687
|
+
getPrintableRatio(text) {
|
|
688
|
+
if (text.length === 0)
|
|
689
|
+
return 0;
|
|
690
|
+
let printable = 0;
|
|
691
|
+
for (const char of text) {
|
|
692
|
+
const code = char.charCodeAt(0);
|
|
693
|
+
if ((code >= 32 && code <= 126) || (code >= 160 && code <= 255) || char === '\n' || char === '\t') {
|
|
694
|
+
printable++;
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
return printable / text.length;
|
|
698
|
+
}
|
|
699
|
+
/**
|
|
700
|
+
* Helper: Compare risk levels.
|
|
701
|
+
*/
|
|
702
|
+
isHigherRisk(a, b) {
|
|
703
|
+
const riskOrder = ['safe', 'read', 'write', 'destructive', 'dangerous'];
|
|
704
|
+
return riskOrder.indexOf(a) > riskOrder.indexOf(b);
|
|
705
|
+
}
|
|
706
|
+
/**
|
|
707
|
+
* Helper: Get reason string based on risk level and triggers.
|
|
708
|
+
*/
|
|
709
|
+
getRiskReason(risk, triggers) {
|
|
710
|
+
const triggerSummary = triggers.slice(0, 3).join(', ');
|
|
711
|
+
switch (risk) {
|
|
712
|
+
case 'dangerous':
|
|
713
|
+
return `Prompt injection attack detected: ${triggerSummary}`;
|
|
714
|
+
case 'destructive':
|
|
715
|
+
return `High-risk prompt manipulation detected: ${triggerSummary}`;
|
|
716
|
+
case 'write':
|
|
717
|
+
return `Suspicious patterns detected: ${triggerSummary}`;
|
|
718
|
+
case 'read':
|
|
719
|
+
return `Minor suspicious patterns detected: ${triggerSummary}`;
|
|
720
|
+
default:
|
|
721
|
+
return 'No injection patterns detected';
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
}
|
|
725
|
+
//# sourceMappingURL=prompt-injection.js.map
|