@weave_protocol/domere 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/PLANNING.md +231 -0
- package/README.md +50 -0
- package/dist/anchoring/ethereum.d.ts +135 -0
- package/dist/anchoring/ethereum.d.ts.map +1 -0
- package/dist/anchoring/ethereum.js +474 -0
- package/dist/anchoring/ethereum.js.map +1 -0
- package/dist/anchoring/index.d.ts +93 -0
- package/dist/anchoring/index.d.ts.map +1 -0
- package/dist/anchoring/index.js +184 -0
- package/dist/anchoring/index.js.map +1 -0
- package/dist/anchoring/merkle.d.ts +91 -0
- package/dist/anchoring/merkle.d.ts.map +1 -0
- package/dist/anchoring/merkle.js +203 -0
- package/dist/anchoring/merkle.js.map +1 -0
- package/dist/anchoring/solana.d.ts +85 -0
- package/dist/anchoring/solana.d.ts.map +1 -0
- package/dist/anchoring/solana.js +301 -0
- package/dist/anchoring/solana.js.map +1 -0
- package/dist/constants.d.ts +130 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/constants.js +536 -0
- package/dist/constants.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +37 -0
- package/dist/index.js.map +1 -0
- package/dist/language/code-analyzer.d.ts +80 -0
- package/dist/language/code-analyzer.d.ts.map +1 -0
- package/dist/language/code-analyzer.js +489 -0
- package/dist/language/code-analyzer.js.map +1 -0
- package/dist/language/detector.d.ts +53 -0
- package/dist/language/detector.d.ts.map +1 -0
- package/dist/language/detector.js +248 -0
- package/dist/language/detector.js.map +1 -0
- package/dist/language/index.d.ts +61 -0
- package/dist/language/index.d.ts.map +1 -0
- package/dist/language/index.js +109 -0
- package/dist/language/index.js.map +1 -0
- package/dist/language/nl-analyzer.d.ts +59 -0
- package/dist/language/nl-analyzer.d.ts.map +1 -0
- package/dist/language/nl-analyzer.js +350 -0
- package/dist/language/nl-analyzer.js.map +1 -0
- package/dist/language/semantic.d.ts +48 -0
- package/dist/language/semantic.d.ts.map +1 -0
- package/dist/language/semantic.js +329 -0
- package/dist/language/semantic.js.map +1 -0
- package/dist/storage/index.d.ts +6 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/index.js +6 -0
- package/dist/storage/index.js.map +1 -0
- package/dist/storage/memory.d.ts +48 -0
- package/dist/storage/memory.d.ts.map +1 -0
- package/dist/storage/memory.js +211 -0
- package/dist/storage/memory.js.map +1 -0
- package/dist/thread/drift.d.ts +43 -0
- package/dist/thread/drift.d.ts.map +1 -0
- package/dist/thread/drift.js +248 -0
- package/dist/thread/drift.js.map +1 -0
- package/dist/thread/index.d.ts +9 -0
- package/dist/thread/index.d.ts.map +1 -0
- package/dist/thread/index.js +9 -0
- package/dist/thread/index.js.map +1 -0
- package/dist/thread/intent.d.ts +68 -0
- package/dist/thread/intent.d.ts.map +1 -0
- package/dist/thread/intent.js +333 -0
- package/dist/thread/intent.js.map +1 -0
- package/dist/thread/manager.d.ts +85 -0
- package/dist/thread/manager.d.ts.map +1 -0
- package/dist/thread/manager.js +305 -0
- package/dist/thread/manager.js.map +1 -0
- package/dist/thread/weave.d.ts +61 -0
- package/dist/thread/weave.d.ts.map +1 -0
- package/dist/thread/weave.js +158 -0
- package/dist/thread/weave.js.map +1 -0
- package/dist/tools/index.d.ts +18 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +102 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/types.d.ts +466 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +48 -0
- package/dist/types.js.map +1 -0
- package/package.json +24 -0
- package/src/anchoring/ethereum.ts +568 -0
- package/src/anchoring/index.ts +236 -0
- package/src/anchoring/merkle.ts +256 -0
- package/src/anchoring/solana.ts +370 -0
- package/src/constants.ts +566 -0
- package/src/index.ts +43 -0
- package/src/language/code-analyzer.ts +564 -0
- package/src/language/detector.ts +297 -0
- package/src/language/index.ts +129 -0
- package/src/language/nl-analyzer.ts +411 -0
- package/src/language/semantic.ts +385 -0
- package/src/storage/index.ts +6 -0
- package/src/storage/memory.ts +271 -0
- package/src/thread/drift.ts +319 -0
- package/src/thread/index.ts +9 -0
- package/src/thread/intent.ts +409 -0
- package/src/thread/manager.ts +414 -0
- package/src/thread/weave.ts +205 -0
- package/src/tools/index.ts +107 -0
- package/src/types.ts +736 -0
- package/tsconfig.json +19 -0
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dōmere - The Judge Protocol
|
|
3
|
+
* Natural Language Analysis (Prompt Injection Detection)
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type {
|
|
7
|
+
NLAnalysis,
|
|
8
|
+
ManipulationIndicator,
|
|
9
|
+
HiddenInstruction,
|
|
10
|
+
} from '../types.js';
|
|
11
|
+
import { INJECTION_PATTERNS } from '../constants.js';
|
|
12
|
+
|
|
13
|
+
// ============================================================================
|
|
14
|
+
// NL Analyzer
|
|
15
|
+
// ============================================================================
|
|
16
|
+
|
|
17
|
+
/**
 * Heuristic, regex-based analyzer for natural-language input.
 *
 * Each `detect*` method scans the text for one family of prompt-injection
 * signals (manipulation phrasing, authority claims, instruction overrides,
 * encoded/hidden instructions, jailbreak vocabulary). `analyze` runs all of
 * them and folds the results into a single `NLAnalysis` with a risk level.
 */
export class NLAnalyzer {
  /**
   * Analyze natural language for manipulation attempts.
   *
   * @param content - Text to inspect.
   * @returns Scores, the evidence collected by every detector, and the
   *          overall risk level derived from them.
   */
  analyze(content: string): NLAnalysis {
    const manipulationIndicators = this.detectManipulation(content);
    const manipulationScore = this.calculateManipulationScore(manipulationIndicators);

    const authorityClaims = this.detectAuthorityClaims(content);
    const instructionOverrides = this.detectInstructionOverrides(content);
    const hiddenInstructions = this.detectHiddenInstructions(content);

    const { jailbreakScore, jailbreakPatterns } = this.detectJailbreak(content);

    const riskLevel = this.calculateRiskLevel(
      manipulationScore,
      jailbreakScore,
      hiddenInstructions.length,
      instructionOverrides.length
    );

    return {
      manipulation_score: manipulationScore,
      manipulation_indicators: manipulationIndicators,
      authority_claims: authorityClaims,
      instruction_overrides: instructionOverrides,
      hidden_instructions: hiddenInstructions,
      jailbreak_score: jailbreakScore,
      jailbreak_patterns: jailbreakPatterns,
      risk_level: riskLevel,
    };
  }

  /**
   * Detect manipulation attempts.
   *
   * Emits one indicator per match of every entry in `INJECTION_PATTERNS`,
   * followed by one indicator per match of the built-in social-engineering
   * patterns. Matches are not de-duplicated, so repeated phrases add up.
   *
   * @param content - Text to inspect.
   * @returns Indicators in pattern order, then match order.
   */
  detectManipulation(content: string): ManipulationIndicator[] {
    const indicators: ManipulationIndicator[] = [];

    for (const { pattern, type, severity } of INJECTION_PATTERNS) {
      const matches = content.match(pattern);
      if (!matches) continue;

      for (const match of matches) {
        indicators.push({
          type,
          description: this.getPatternDescription(type),
          // Evidence is capped so a huge match cannot bloat the report.
          evidence: match.slice(0, 100),
          severity,
        });
      }
    }

    // Detect social engineering patterns
    const socialPatterns = [
      { pattern: /\b(trust me|believe me|I promise)\b/gi, type: 'social_engineering', severity: 'medium' as const },
      { pattern: /\b(don't tell anyone|keep this secret|between us)\b/gi, type: 'secrecy_request', severity: 'high' as const },
      { pattern: /\b(emergency|urgent|immediately|right now)\s+(need|require|must)/gi, type: 'urgency_manipulation', severity: 'medium' as const },
      { pattern: /\b(I('m| am) (your|the) (creator|developer|admin|owner))\b/gi, type: 'authority_claim', severity: 'high' as const },
      { pattern: /\b(this is a test|testing|debug mode)\b/gi, type: 'test_claim', severity: 'medium' as const },
    ];

    for (const { pattern, type, severity } of socialPatterns) {
      const matches = content.match(pattern);
      if (!matches) continue;

      for (const match of matches) {
        indicators.push({
          type,
          description: `Detected ${type.replace(/_/g, ' ')} pattern`,
          evidence: match,
          severity,
        });
      }
    }

    return indicators;
  }

  /**
   * Calculate manipulation score.
   *
   * Sums a fixed weight per indicator severity (unknown severities count as
   * 0.1) and clamps the total to 1.
   *
   * @param indicators - Indicators produced by `detectManipulation`.
   * @returns A score in the range 0..1; 0 when there are no indicators.
   */
  calculateManipulationScore(indicators: ManipulationIndicator[]): number {
    if (indicators.length === 0) return 0;

    const weights = { low: 0.1, medium: 0.25, high: 0.4, critical: 0.5 };

    let score = 0;
    for (const indicator of indicators) {
      score += weights[indicator.severity] || 0.1;
    }

    return Math.min(1, score);
  }

  /**
   * Detect authority claims (e.g. "I am the admin", "by order of").
   *
   * @param content - Text to inspect.
   * @returns Distinct trimmed matches, in first-seen order.
   */
  detectAuthorityClaims(content: string): string[] {
    const patterns = [
      /I\s+am\s+(?:the|your|an?)\s+(?:admin|administrator|developer|creator|owner|manager)/gi,
      /as\s+(?:the|your|an?)\s+(?:admin|administrator|developer|creator|owner)/gi,
      /I\s+(?:created|developed|built|made)\s+(?:you|this)/gi,
      /I\s+have\s+(?:admin|root|full|special)\s+(?:access|permission|privileges)/gi,
      /I\s+work\s+(?:for|at)\s+(?:Anthropic|OpenAI|Google|the company)/gi,
      /this\s+is\s+(?:official|authorized|sanctioned)/gi,
      /by\s+order\s+of/gi,
      /I\s+(?:am|have been)\s+authorized\s+to/gi,
    ];

    return this.collectUniqueMatches(content, patterns);
  }

  /**
   * Detect instruction override attempts (e.g. "ignore previous instructions").
   *
   * @param content - Text to inspect.
   * @returns Distinct trimmed matches, in first-seen order.
   */
  detectInstructionOverrides(content: string): string[] {
    const patterns = [
      /ignore\s+(?:all\s+)?(?:previous|prior|above|earlier)\s+(?:instructions?|prompts?|rules?|constraints?)/gi,
      /disregard\s+(?:all\s+)?(?:previous|prior|above|earlier)/gi,
      /forget\s+(?:everything|all|what)\s+(?:you|I)\s+(?:said|told|wrote)/gi,
      /new\s+(?:instructions?|rules?|prompt)\s*:/gi,
      /override\s+(?:previous|system|all)\s+(?:instructions?|prompts?|settings?)/gi,
      /(?:from\s+now\s+on|starting\s+now|henceforth)\s+(?:you\s+)?(?:will|should|must)/gi,
      /reset\s+(?:your|all)\s+(?:instructions?|rules?|settings?|context)/gi,
      /(?:clear|wipe|erase)\s+(?:your|all)\s+(?:memory|context|history)/gi,
      /enter\s+(?:a\s+)?(?:new|different|special)\s+mode/gi,
      /switch\s+to\s+(?:a\s+)?(?:new|different|unrestricted)\s+mode/gi,
    ];

    return this.collectUniqueMatches(content, patterns);
  }

  /**
   * Detect hidden instructions (encoded, obfuscated).
   *
   * Tries, in order: base64 runs, `\uXXXX` escape runs, hex runs, ROT13 (only
   * when a ROT13-encoded trigger word is present), fully reversed text, and
   * runs of more than five zero-width characters. Decoded candidates are
   * reported only when `looksLikeInstruction` accepts them; zero-width runs
   * are reported unconditionally. Reported instruction text is capped at 200
   * characters.
   *
   * @param content - Text to inspect.
   * @returns One entry per finding, grouped by encoding in the order above.
   */
  detectHiddenInstructions(content: string): HiddenInstruction[] {
    const hidden: HiddenInstruction[] = [];
    let match: RegExpExecArray | null;

    // Base64 encoded content
    const base64Pattern = /(?:base64:?\s*)?([A-Za-z0-9+/]{20,}={0,2})/g;
    while ((match = base64Pattern.exec(content)) !== null) {
      try {
        const decoded = Buffer.from(match[1], 'base64').toString('utf-8');
        // Check if decoded content looks like instructions
        if (this.looksLikeInstruction(decoded)) {
          hidden.push({
            instruction: decoded.slice(0, 200),
            encoding: 'base64',
            position: { start: match.index, end: match.index + match[0].length },
            confidence: 0.8,
          });
        }
      } catch {
        // Not valid base64
      }
    }

    // Unicode escape sequences
    const unicodePattern = /(?:\\u[0-9a-fA-F]{4}){4,}/g;
    while ((match = unicodePattern.exec(content)) !== null) {
      try {
        const decoded = match[0].replace(/\\u([0-9a-fA-F]{4})/g, (_whole: string, hex: string) =>
          String.fromCharCode(parseInt(hex, 16))
        );
        if (this.looksLikeInstruction(decoded)) {
          hidden.push({
            // Capped like every other encoding so the report stays bounded.
            instruction: decoded.slice(0, 200),
            encoding: 'unicode',
            position: { start: match.index, end: match.index + match[0].length },
            confidence: 0.7,
          });
        }
      } catch {
        // Invalid unicode
      }
    }

    // Hex encoded
    const hexPattern = /(?:0x)?(?:[0-9a-fA-F]{2}\s*){10,}/g;
    while ((match = hexPattern.exec(content)) !== null) {
      try {
        const hexString = match[0].replace(/0x|\s/g, '');
        const decoded = Buffer.from(hexString, 'hex').toString('utf-8');
        if (this.looksLikeInstruction(decoded)) {
          hidden.push({
            instruction: decoded.slice(0, 200),
            encoding: 'hex',
            position: { start: match.index, end: match.index + match[0].length },
            confidence: 0.6,
          });
        }
      } catch {
        // Invalid hex
      }
    }

    // ROT13: letters a-m shift forward by 13, n-z shift back by 13.
    const rot13Decode = (str: string) => str.replace(/[a-zA-Z]/g, c =>
      String.fromCharCode(
        c.charCodeAt(0) + (c.toLowerCase() < 'n' ? 13 : -13)
      )
    );

    // Only decode when a ROT13-encoded trigger word
    // (ignore / forget / disregard / override) is present.
    if (/\b(vtaber|sbetng|qvfertneq|birrevqr)\b/i.test(content)) {
      const decoded = rot13Decode(content);
      if (this.looksLikeInstruction(decoded)) {
        hidden.push({
          instruction: decoded.slice(0, 200),
          encoding: 'rot13',
          position: { start: 0, end: content.length },
          confidence: 0.5,
        });
      }
    }

    // Reversed text. Reverse by code point so surrogate pairs (emoji etc.)
    // are not split into malformed halves in the reported instruction.
    const reversed = Array.from(content).reverse().join('');
    if (this.looksLikeInstruction(reversed) && !this.looksLikeInstruction(content)) {
      hidden.push({
        instruction: reversed.slice(0, 200),
        encoding: 'reversed',
        position: { start: 0, end: content.length },
        confidence: 0.5,
      });
    }

    // Zero-width characters (steganography)
    const zwcPattern = /[\u200B\u200C\u200D\uFEFF]+/g;
    while ((match = zwcPattern.exec(content)) !== null) {
      if (match[0].length > 5) {
        hidden.push({
          instruction: '[Zero-width character sequence detected]',
          encoding: 'steganographic',
          position: { start: match.index, end: match.index + match[0].length },
          confidence: 0.6,
        });
      }
    }

    return hidden;
  }

  /**
   * Detect jailbreak attempts.
   *
   * Regex indicators add `weight` per match; each known framing phrase found
   * anywhere in the text (case-insensitively) adds 0.15 once.
   *
   * @param content - Text to inspect.
   * @returns The score clamped to 0..1 and the distinct matched fragments.
   */
  detectJailbreak(content: string): { jailbreakScore: number; jailbreakPatterns: string[] } {
    const patterns: string[] = [];
    let score = 0;

    const jailbreakIndicators = [
      { pattern: /\bDAN\b.*mode|do\s+anything\s+now/gi, weight: 0.4 },
      { pattern: /jailbreak|jail\s*-?\s*break/gi, weight: 0.4 },
      { pattern: /bypass\s+(?:your\s+)?(?:restrictions?|limitations?|filters?|rules?|guidelines?)/gi, weight: 0.3 },
      { pattern: /without\s+(?:any\s+)?(?:restrictions?|limitations?|filters?|rules?|guidelines?)/gi, weight: 0.25 },
      { pattern: /enable\s+(?:developer|admin|god|unrestricted)\s+mode/gi, weight: 0.35 },
      { pattern: /disable\s+(?:safety|content|ethics)\s+(?:filter|mode|check)/gi, weight: 0.35 },
      { pattern: /pretend\s+(?:there\s+are\s+)?no\s+(?:rules?|restrictions?|limitations?)/gi, weight: 0.3 },
      { pattern: /act\s+as\s+if\s+(?:you\s+)?(?:have|had)\s+no\s+(?:restrictions?|limitations?)/gi, weight: 0.3 },
      { pattern: /\[?(?:JAILBREAK|ESCAPE|UNLOCK)\]?/gi, weight: 0.4 },
      { pattern: /token\s+smuggling|prompt\s+leaking/gi, weight: 0.3 },
      { pattern: /\bAIM\b.*(?:always\s+intelligent|machiavellian)/gi, weight: 0.35 },
      { pattern: /\bDEVIL\b.*(?:mode|persona)/gi, weight: 0.35 },
      { pattern: /opposite\s+(?:day|mode)|evil\s+(?:mode|version)/gi, weight: 0.3 },
    ];

    for (const { pattern, weight } of jailbreakIndicators) {
      const matches = content.match(pattern);
      if (matches) {
        patterns.push(...matches);
        score += weight * matches.length;
      }
    }

    // Check for common jailbreak phrases
    const jailbreakPhrases = [
      'hypothetically speaking',
      'for educational purposes',
      'in a fictional scenario',
      'imagine you are',
      'let\'s play a game',
      'in an alternate universe',
      'if you were not an AI',
      'without your normal constraints',
    ];

    // Lower-case BOTH sides: the phrase list contains capitals ("AI"), which
    // could never be found inside an already lower-cased haystack.
    const loweredContent = content.toLowerCase();
    for (const phrase of jailbreakPhrases) {
      if (loweredContent.includes(phrase.toLowerCase())) {
        patterns.push(phrase);
        score += 0.15;
      }
    }

    return {
      jailbreakScore: Math.min(1, score),
      jailbreakPatterns: [...new Set(patterns)],
    };
  }

  /**
   * Calculate overall risk level.
   *
   * A weighted blend of the inputs is computed, but each tier can also be
   * triggered directly by a single strong signal. Tiers are checked from the
   * most to the least severe, so the first matching tier wins. The top tier
   * reports 'critical', which is the value `isInjectionAttempt` checks for
   * alongside 'high'.
   *
   * @param manipulationScore - Score from `calculateManipulationScore` (0..1).
   * @param jailbreakScore - Score from `detectJailbreak` (0..1).
   * @param hiddenCount - Number of hidden instructions found.
   * @param overrideCount - Number of distinct instruction overrides found.
   * @returns The risk tier for the combined signals.
   */
  private calculateRiskLevel(
    manipulationScore: number,
    jailbreakScore: number,
    hiddenCount: number,
    overrideCount: number
  ): 'low' | 'medium' | 'high' | 'critical' {
    const combinedScore =
      manipulationScore * 0.3 +
      jailbreakScore * 0.35 +
      Math.min(1, hiddenCount * 0.3) * 0.2 +
      Math.min(1, overrideCount * 0.25) * 0.15;

    if (combinedScore > 0.7 || jailbreakScore > 0.6 || hiddenCount > 2) return 'critical';
    if (combinedScore > 0.4 || jailbreakScore > 0.3 || hiddenCount > 0) return 'high';
    if (combinedScore > 0.2 || overrideCount > 0) return 'medium';
    return 'low';
  }

  /**
   * Check if decoded content looks like an instruction.
   *
   * @param content - Candidate (usually decoded) text.
   * @returns True when any instruction-like keyword pattern matches.
   */
  private looksLikeInstruction(content: string): boolean {
    const instructionIndicators = [
      /ignore/i,
      /forget/i,
      /disregard/i,
      /override/i,
      /you\s+(must|should|will|are)/i,
      /new\s+instructions?/i,
      /system\s+prompt/i,
      /jailbreak/i,
    ];

    return instructionIndicators.some(pattern => pattern.test(content));
  }

  /**
   * Get description for pattern type.
   *
   * @param type - Indicator type key.
   * @returns The canned description, or a generic "<type> pattern detected"
   *          fallback for unknown types.
   */
  private getPatternDescription(type: string): string {
    const descriptions: Record<string, string> = {
      instruction_override: 'Attempt to override previous instructions',
      role_manipulation: 'Attempt to manipulate AI identity/role',
      prompt_extraction: 'Attempt to extract system prompt',
      jailbreak: 'Jailbreak attempt detected',
      context_manipulation: 'Attempt to manipulate conversation context',
      encoded_instruction: 'Potentially encoded instruction detected',
      social_engineering: 'Social engineering pattern detected',
      secrecy_request: 'Request to keep interaction secret',
      urgency_manipulation: 'Urgency-based manipulation attempt',
      authority_claim: 'False authority claim detected',
      test_claim: 'Test/debug mode claim detected',
    };

    return descriptions[type] || `${type} pattern detected`;
  }

  /**
   * Run every global pattern over the text and return the distinct, trimmed
   * matches in first-seen order.
   */
  private collectUniqueMatches(content: string, patterns: readonly RegExp[]): string[] {
    const found = new Set<string>();

    for (const pattern of patterns) {
      const matches = content.match(pattern);
      if (!matches) continue;
      for (const hit of matches) {
        found.add(hit.trim());
      }
    }

    return Array.from(found);
  }

  /**
   * Quick check for injection.
   *
   * @param content - Text to inspect.
   * @returns True when the overall risk level is 'high' or 'critical'.
   */
  isInjectionAttempt(content: string): boolean {
    const analysis = this.analyze(content);
    return analysis.risk_level === 'high' || analysis.risk_level === 'critical';
  }

  /**
   * Get injection risk score (0-1).
   *
   * @param content - Text to inspect.
   * @returns The larger of the manipulation score and the jailbreak score.
   */
  getInjectionRiskScore(content: string): number {
    const analysis = this.analyze(content);
    return Math.max(analysis.manipulation_score, analysis.jailbreak_score);
  }
}
|