llm-trust-guard 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +318 -0
- package/dist/guards/agent-communication-guard.d.ts +169 -0
- package/dist/guards/agent-communication-guard.d.ts.map +1 -0
- package/dist/guards/agent-communication-guard.js +468 -0
- package/dist/guards/agent-communication-guard.js.map +1 -0
- package/dist/guards/autonomy-escalation-guard.d.ts +137 -0
- package/dist/guards/autonomy-escalation-guard.d.ts.map +1 -0
- package/dist/guards/autonomy-escalation-guard.js +470 -0
- package/dist/guards/autonomy-escalation-guard.js.map +1 -0
- package/dist/guards/circuit-breaker.d.ts +142 -0
- package/dist/guards/circuit-breaker.d.ts.map +1 -0
- package/dist/guards/circuit-breaker.js +347 -0
- package/dist/guards/circuit-breaker.js.map +1 -0
- package/dist/guards/code-execution-guard.d.ts +114 -0
- package/dist/guards/code-execution-guard.d.ts.map +1 -0
- package/dist/guards/code-execution-guard.js +467 -0
- package/dist/guards/code-execution-guard.js.map +1 -0
- package/dist/guards/conversation-guard.d.ts +73 -0
- package/dist/guards/conversation-guard.d.ts.map +1 -0
- package/dist/guards/conversation-guard.js +281 -0
- package/dist/guards/conversation-guard.js.map +1 -0
- package/dist/guards/drift-detector.d.ts +182 -0
- package/dist/guards/drift-detector.d.ts.map +1 -0
- package/dist/guards/drift-detector.js +480 -0
- package/dist/guards/drift-detector.js.map +1 -0
- package/dist/guards/encoding-detector.d.ts +76 -0
- package/dist/guards/encoding-detector.d.ts.map +1 -0
- package/dist/guards/encoding-detector.js +698 -0
- package/dist/guards/encoding-detector.js.map +1 -0
- package/dist/guards/execution-monitor.d.ts +73 -0
- package/dist/guards/execution-monitor.d.ts.map +1 -0
- package/dist/guards/execution-monitor.js +205 -0
- package/dist/guards/execution-monitor.js.map +1 -0
- package/dist/guards/input-sanitizer.d.ts +87 -0
- package/dist/guards/input-sanitizer.d.ts.map +1 -0
- package/dist/guards/input-sanitizer.js +301 -0
- package/dist/guards/input-sanitizer.js.map +1 -0
- package/dist/guards/mcp-security-guard.d.ts +204 -0
- package/dist/guards/mcp-security-guard.d.ts.map +1 -0
- package/dist/guards/mcp-security-guard.js +618 -0
- package/dist/guards/mcp-security-guard.js.map +1 -0
- package/dist/guards/memory-guard.d.ts +124 -0
- package/dist/guards/memory-guard.d.ts.map +1 -0
- package/dist/guards/memory-guard.js +476 -0
- package/dist/guards/memory-guard.js.map +1 -0
- package/dist/guards/multimodal-guard.d.ts +93 -0
- package/dist/guards/multimodal-guard.d.ts.map +1 -0
- package/dist/guards/multimodal-guard.js +507 -0
- package/dist/guards/multimodal-guard.js.map +1 -0
- package/dist/guards/output-filter.d.ts +76 -0
- package/dist/guards/output-filter.d.ts.map +1 -0
- package/dist/guards/output-filter.js +289 -0
- package/dist/guards/output-filter.js.map +1 -0
- package/dist/guards/policy-gate.d.ts +57 -0
- package/dist/guards/policy-gate.d.ts.map +1 -0
- package/dist/guards/policy-gate.js +182 -0
- package/dist/guards/policy-gate.js.map +1 -0
- package/dist/guards/prompt-leakage-guard.d.ts +110 -0
- package/dist/guards/prompt-leakage-guard.d.ts.map +1 -0
- package/dist/guards/prompt-leakage-guard.js +529 -0
- package/dist/guards/prompt-leakage-guard.js.map +1 -0
- package/dist/guards/rag-guard.d.ts +188 -0
- package/dist/guards/rag-guard.d.ts.map +1 -0
- package/dist/guards/rag-guard.js +769 -0
- package/dist/guards/rag-guard.js.map +1 -0
- package/dist/guards/schema-validator.d.ts +35 -0
- package/dist/guards/schema-validator.d.ts.map +1 -0
- package/dist/guards/schema-validator.js +316 -0
- package/dist/guards/schema-validator.js.map +1 -0
- package/dist/guards/state-persistence-guard.d.ts +153 -0
- package/dist/guards/state-persistence-guard.d.ts.map +1 -0
- package/dist/guards/state-persistence-guard.js +484 -0
- package/dist/guards/state-persistence-guard.js.map +1 -0
- package/dist/guards/tenant-boundary.d.ts +67 -0
- package/dist/guards/tenant-boundary.d.ts.map +1 -0
- package/dist/guards/tenant-boundary.js +187 -0
- package/dist/guards/tenant-boundary.js.map +1 -0
- package/dist/guards/tool-chain-validator.d.ts +102 -0
- package/dist/guards/tool-chain-validator.d.ts.map +1 -0
- package/dist/guards/tool-chain-validator.js +480 -0
- package/dist/guards/tool-chain-validator.js.map +1 -0
- package/dist/guards/tool-registry.d.ts +45 -0
- package/dist/guards/tool-registry.d.ts.map +1 -0
- package/dist/guards/tool-registry.js +155 -0
- package/dist/guards/tool-registry.js.map +1 -0
- package/dist/guards/trust-exploitation-guard.d.ts +134 -0
- package/dist/guards/trust-exploitation-guard.d.ts.map +1 -0
- package/dist/guards/trust-exploitation-guard.js +354 -0
- package/dist/guards/trust-exploitation-guard.js.map +1 -0
- package/dist/index.d.ts +133 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +430 -0
- package/dist/index.js.map +1 -0
- package/dist/integrations/express.d.ts +119 -0
- package/dist/integrations/express.d.ts.map +1 -0
- package/dist/integrations/express.js +244 -0
- package/dist/integrations/express.js.map +1 -0
- package/dist/integrations/index.d.ts +9 -0
- package/dist/integrations/index.d.ts.map +1 -0
- package/dist/integrations/index.js +26 -0
- package/dist/integrations/index.js.map +1 -0
- package/dist/integrations/langchain.d.ts +165 -0
- package/dist/integrations/langchain.d.ts.map +1 -0
- package/dist/integrations/langchain.js +308 -0
- package/dist/integrations/langchain.js.map +1 -0
- package/dist/integrations/openai.d.ts +205 -0
- package/dist/integrations/openai.d.ts.map +1 -0
- package/dist/integrations/openai.js +380 -0
- package/dist/integrations/openai.js.map +1 -0
- package/dist/types/index.d.ts +245 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +6 -0
- package/dist/types/index.js.map +1 -0
- package/package.json +64 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PromptLeakageGuard (L15)
|
|
3
|
+
*
|
|
4
|
+
* Prevents system prompt extraction and leakage attacks.
|
|
5
|
+
* Detects various evasion techniques used to extract system prompts.
|
|
6
|
+
*
|
|
7
|
+
* Threat Model:
|
|
8
|
+
* - OWASP LLM07:2025 System Prompt Leakage
|
|
9
|
+
* - PLeak algorithmic extraction attacks
|
|
10
|
+
* - Remember-the-Start attacks
|
|
11
|
+
* - Evasion techniques (Leetspeak, ROT13, Base64, Morse)
|
|
12
|
+
*
|
|
13
|
+
* Protection Capabilities:
|
|
14
|
+
* - Direct extraction attempt detection
|
|
15
|
+
* - Encoded extraction detection (Leetspeak, ROT13, Morse, etc.)
|
|
16
|
+
* - Indirect extraction pattern detection
|
|
17
|
+
* - Output monitoring for prompt leakage
|
|
18
|
+
* - Prefix completion attack detection
|
|
19
|
+
*/
|
|
20
|
+
/**
 * Options accepted by the PromptLeakageGuard constructor.
 *
 * Every field is optional. A field left `undefined` or `null` is replaced by
 * the default noted on it (the constructor uses `??` for each one).
 */
export interface PromptLeakageGuardConfig {
    /**
     * Decode leetspeak substitutions (e.g. `3` -> `e`) and re-scan the result.
     * Default: `true`.
     */
    detectLeetspeak?: boolean;
    /**
     * Apply ROT13 to the input and re-scan the result.
     * Default: `true`.
     */
    detectROT13?: boolean;
    /**
     * Decode Base64-looking runs (16+ characters) and re-scan the result.
     * Default: `true`.
     */
    detectBase64?: boolean;
    /**
     * Look for Morse-encoded prompt-related words.
     * Default: `true`.
     */
    detectMorse?: boolean;
    /**
     * Look for invisible, Cyrillic/Greek and fullwidth characters.
     * Default: `true`.
     */
    detectUnicode?: boolean;
    /**
     * Also evaluate the indirect extraction pattern list.
     * Default: `true`.
     */
    detectIndirectExtraction?: boolean;
    /**
     * When `false`, `checkOutput()` returns immediately with `leaked: false`.
     * Default: `true`.
     */
    monitorOutput?: boolean;
    /**
     * System prompt hash for leakage detection. Default: `""`.
     * NOTE(review): the compiled check()/checkOutput() code visible in this
     * diff never reads this value - confirm whether it is used elsewhere.
     */
    systemPromptHash?: string;
    /**
     * Keywords whose case-insensitive presence in an output marks it as leaked.
     * Default: `[]`.
     */
    systemPromptKeywords?: Array<string>;
    /**
     * Similarity threshold for output monitoring (0-1). Default: `0.7`.
     * NOTE(review): not consulted by the checkOutput() code visible in this
     * diff - confirm before relying on it.
     */
    similarityThreshold?: number;
    /**
     * Accumulated risk score (capped at 100) at or above which `check()`
     * reports `allowed: false`. Default: `40`.
     */
    riskThreshold?: number;
    /**
     * Extra patterns tested against the raw input; each match adds 30 to the
     * risk score. Default: `[]`.
     */
    customPatterns?: Array<RegExp>;
}
|
|
46
|
+
/**
 * Verdict returned by `PromptLeakageGuard.check()` for a single input.
 */
export interface PromptLeakageResult {
    /** `false` when the risk score reached the configured risk threshold. */
    allowed: boolean;
    /** Human-readable summary; includes the risk score when the input is blocked. */
    reason: string;
    /** One entry per matched rule, e.g. `direct_extraction: reveal_prompt`. */
    violations: Array<string>;
    /** The caller-supplied request id, or a generated `pl-<timestamp>` value. */
    request_id: string;
    /** Breakdown of what was detected. */
    analysis: {
        /** A direct extraction pattern matched the raw input. */
        direct_extraction_attempt: boolean;
        /** An encoding or Unicode evasion check reported a detection. */
        encoded_extraction_attempt: boolean;
        /** An indirect extraction pattern matched the raw input. */
        indirect_extraction_attempt: boolean;
        /** Technique labels pushed while scanning: leetspeak, rot13, base64, unicode, morse. */
        evasion_techniques_detected: Array<string>;
        /** Sum of the matched rule weights, capped at 100. */
        risk_score: number;
        /** The most recent decoded text that triggered a detection, if any. */
        decoded_content?: string;
    };
    /** Follow-up suggestions derived from the violations and evasion techniques. */
    recommendations: Array<string>;
}
|
|
61
|
+
/**
 * Verdict returned by `PromptLeakageGuard.checkOutput()` for a single model output.
 */
export interface OutputLeakageResult {
    /**
     * `true` when at least one configured keyword was found, or when two or
     * more prompt-fragment patterns matched.
     */
    leaked: boolean;
    /** Human-readable summary; lists up to three violations when leaked. */
    reason: string;
    /** `keyword_leaked: <keyword>` and `prompt_fragment_detected` entries. */
    violations: Array<string>;
    /** The caller-supplied request id, or a generated `pl-out-<timestamp>` value. */
    request_id: string;
    /** Breakdown of what was detected. */
    analysis: {
        /** Configured keywords that occur in the output (case-insensitive). */
        keywords_found: Array<string>;
        /** Heuristic score: number of matched fragment patterns divided by 10, capped at 1. */
        similarity_score: number;
        /** The matched text of each prompt-fragment pattern. */
        potential_leakage_fragments: Array<string>;
    };
    /** Sanitized copy of the output; only populated when `leaked` is `true`. */
    sanitized_output?: string;
}
|
|
73
|
+
/**
 * Guard that scores inputs for system-prompt extraction attempts and scans
 * outputs for signs of prompt leakage.
 */
export declare class PromptLeakageGuard {
    /** Configuration with defaults applied by the constructor. */
    private config;
    /** Weighted patterns for direct extraction requests. */
    private readonly DIRECT_EXTRACTION_PATTERNS;
    /** Weighted patterns for indirect extraction requests. */
    private readonly INDIRECT_EXTRACTION_PATTERNS;
    /** Leetspeak character -> plain letter table. */
    private readonly LEETSPEAK_MAP;
    /** Letter -> ROT13 letter table, filled in by the constructor. */
    private readonly ROT13_MAP;
    /** Morse spellings of prompt-related words. */
    private readonly MORSE_KEYWORDS;
    /**
     * @param config Optional settings; omitted fields fall back to defaults.
     */
    constructor(config?: PromptLeakageGuardConfig);
    /**
     * Check input for prompt extraction attempts.
     *
     * @param input Text to inspect.
     * @param requestId Optional correlation id echoed back in the result.
     * @returns The verdict together with the detection breakdown.
     */
    check(input: string, requestId?: string): PromptLeakageResult;
    /**
     * Monitor output for potential prompt leakage.
     *
     * @param output Model output to inspect.
     * @param requestId Optional correlation id echoed back in the result.
     * @returns The verdict together with the detection breakdown.
     */
    checkOutput(output: string, requestId?: string): OutputLeakageResult;
    /**
     * Replace the system prompt keywords used for output monitoring.
     *
     * @param keywords Keywords to look for in outputs.
     */
    setSystemPromptKeywords(keywords: Array<string>): void;
    /**
     * Append a custom extraction pattern.
     *
     * @param pattern Pattern tested against every checked input.
     */
    addPattern(pattern: RegExp): void;
    /**
     * Update the risk threshold; the value is clamped to the 0-100 range.
     *
     * @param threshold New threshold.
     */
    setRiskThreshold(threshold: number): void;
    private decodeLeetspeak;
    private decodeROT13;
    private checkDecodedContent;
    private checkUnicodeEvasion;
    private checkMorseCode;
    private checkKeywordsInDecoded;
    private sanitizeOutput;
    private generateRecommendations;
}
|
|
110
|
+
//# sourceMappingURL=prompt-leakage-guard.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompt-leakage-guard.d.ts","sourceRoot":"","sources":["../../src/guards/prompt-leakage-guard.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,MAAM,WAAW,wBAAwB;IACvC,yCAAyC;IACzC,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,qCAAqC;IACrC,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,sCAAsC;IACtC,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,0CAA0C;IAC1C,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,uCAAuC;IACvC,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,2CAA2C;IAC3C,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC,2CAA2C;IAC3C,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,0DAA0D;IAC1D,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,iDAAiD;IACjD,oBAAoB,CAAC,EAAE,MAAM,EAAE,CAAC;IAChC,uDAAuD;IACvD,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,mCAAmC;IACnC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,iCAAiC;IACjC,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;CAC3B;AAED,MAAM,WAAW,mBAAmB;IAClC,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE;QACR,yBAAyB,EAAE,OAAO,CAAC;QACnC,0BAA0B,EAAE,OAAO,CAAC;QACpC,2BAA2B,EAAE,OAAO,CAAC;QACrC,2BAA2B,EAAE,MAAM,EAAE,CAAC;QACtC,UAAU,EAAE,MAAM,CAAC;QACnB,eAAe,CAAC,EAAE,MAAM,CAAC;KAC1B,CAAC;IACF,eAAe,EAAE,MAAM,EAAE,CAAC;CAC3B;AAED,MAAM,WAAW,mBAAmB;IAClC,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE;QACR,cAAc,EAAE,MAAM,EAAE,CAAC;QACzB,gBAAgB,EAAE,MAAM,CAAC;QACzB,2BAA2B,EAAE,MAAM,EAAE,CAAC;KACvC,CAAC;IACF,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,MAAM,CAAqC;IAGnD,OAAO,CAAC,QAAQ,CAAC,0BAA0B,CAuCzC;IAGF,OAAO,CAAC,QAAQ,CAAC,4BAA4B,CAkB3C;IAGF,OAAO,CAAC,QAAQ,CAAC,aAAa,CAK5B;IAGF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAA8B;IAGxD,OAAO,CAAC,QAAQ,CAAC,cAAc,CAI7B;gBAEU,MAAM,GAAE,wBAA6B;IAyBjD;;OAEG;IACH,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,mBAAmB;IAsK7D;;OAEG;IACH,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,mBAAmB;IAuEpE;;OAEG;IACH,uBAAuB,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,IAAI;IAIjD;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,IAAI;IAIjC;;OAEG;IACH,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;IAMzC,OAAO,CAAC,eAAe;I
A2BvB,OAAO,CAAC,WAAW;IAOnB,OAAO,CAAC,mBAAmB;IAqB3B,OAAO,CAAC,mBAAmB;IA2C3B,OAAO,CAAC,cAAc;IA4BtB,OAAO,CAAC,sBAAsB;IAmC9B,OAAO,CAAC,cAAc;IAmBtB,OAAO,CAAC,uBAAuB;CA2BhC"}
|
|
@@ -0,0 +1,529 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* PromptLeakageGuard (L15)
|
|
4
|
+
*
|
|
5
|
+
* Prevents system prompt extraction and leakage attacks.
|
|
6
|
+
* Detects various evasion techniques used to extract system prompts.
|
|
7
|
+
*
|
|
8
|
+
* Threat Model:
|
|
9
|
+
* - OWASP LLM07:2025 System Prompt Leakage
|
|
10
|
+
* - PLeak algorithmic extraction attacks
|
|
11
|
+
* - Remember-the-Start attacks
|
|
12
|
+
* - Evasion techniques (Leetspeak, ROT13, Base64, Morse)
|
|
13
|
+
*
|
|
14
|
+
* Protection Capabilities:
|
|
15
|
+
* - Direct extraction attempt detection
|
|
16
|
+
* - Encoded extraction detection (Leetspeak, ROT13, Morse, etc.)
|
|
17
|
+
* - Indirect extraction pattern detection
|
|
18
|
+
* - Output monitoring for prompt leakage
|
|
19
|
+
* - Prefix completion attack detection
|
|
20
|
+
*/
|
|
21
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
22
|
+
exports.PromptLeakageGuard = void 0;
|
|
23
|
+
class PromptLeakageGuard {
|
|
24
|
+
constructor(config = {}) {
|
|
25
|
+
// Direct extraction patterns
|
|
26
|
+
this.DIRECT_EXTRACTION_PATTERNS = [
|
|
27
|
+
// Direct system prompt requests
|
|
28
|
+
{ name: "system_prompt_request", pattern: /what\s+(is|are)\s+your\s+(system|initial)\s+(prompt|instructions?)/i, weight: 40 },
|
|
29
|
+
{ name: "reveal_prompt", pattern: /reveal\s+(your\s+)?(system\s+)?prompt/i, weight: 45 },
|
|
30
|
+
{ name: "show_instructions", pattern: /show\s+(me\s+)?(your\s+)?instructions/i, weight: 40 },
|
|
31
|
+
{ name: "display_rules", pattern: /display\s+(your\s+)?(system\s+)?rules/i, weight: 40 },
|
|
32
|
+
{ name: "print_prompt", pattern: /print\s+(your\s+)?(system\s+)?(prompt|instructions|config)/i, weight: 45 },
|
|
33
|
+
{ name: "print_configuration", pattern: /print\s+(your\s+)?(initial\s+)?configuration/i, weight: 45 },
|
|
34
|
+
// PLeak-style attacks
|
|
35
|
+
{ name: "repeat_above", pattern: /repeat\s+(the\s+)?(text|content|message)\s+(above|before)/i, weight: 50 },
|
|
36
|
+
{ name: "output_previous", pattern: /output\s+(all\s+)?(previous|prior)\s+(text|content|instructions)/i, weight: 50 },
|
|
37
|
+
{ name: "echo_everything", pattern: /echo\s+(everything|all)(\s+(above|before|prior|that|said))?/i, weight: 50 },
|
|
38
|
+
{ name: "echo_said", pattern: /echo\s+.*(said|was|before|that)/i, weight: 45 },
|
|
39
|
+
// Remember-the-Start attacks - ENHANCED
|
|
40
|
+
{ name: "complete_sentence", pattern: /complete\s+(this\s+|the\s+)?sentence/i, weight: 50 },
|
|
41
|
+
{ name: "complete_you_are", pattern: /complete[:\s]+"?'?you\s+are/i, weight: 55 },
|
|
42
|
+
{ name: "continue_from", pattern: /continue\s+(from|with)[:\s]/i, weight: 50 },
|
|
43
|
+
{ name: "continue_role", pattern: /continue[:\s]+"?'?(you|your)\s+(are|role|purpose)/i, weight: 55 },
|
|
44
|
+
{ name: "finish_phrase", pattern: /finish\s+(this\s+)?phrase/i, weight: 50 },
|
|
45
|
+
{ name: "finish_you", pattern: /finish[:\s]+"?'?you\s+(are|should|must|will)/i, weight: 55 },
|
|
46
|
+
// Instruction extraction
|
|
47
|
+
{ name: "list_constraints", pattern: /list\s+(all\s+)?(your\s+)?constraints/i, weight: 35 },
|
|
48
|
+
{ name: "what_told", pattern: /what\s+(were\s+you|are\s+you)\s+told\s+to\s+do/i, weight: 40 },
|
|
49
|
+
{ name: "original_purpose", pattern: /what\s+(is|was)\s+your\s+original\s+purpose/i, weight: 35 },
|
|
50
|
+
// Debug/Developer mode extraction - ENHANCED
|
|
51
|
+
{ name: "debug_mode", pattern: /enter\s+(debug|developer|admin)\s+mode/i, weight: 45 },
|
|
52
|
+
{ name: "activate_developer", pattern: /activate\s+(developer|debug|admin)\s+mode/i, weight: 45 },
|
|
53
|
+
{ name: "developer_mode", pattern: /developer\s+mode/i, weight: 40 },
|
|
54
|
+
{ name: "show_config", pattern: /show\s+(your\s+)?(config|configuration|settings)/i, weight: 40 },
|
|
55
|
+
{ name: "dump_state", pattern: /dump\s+(your\s+)?(internal\s+)?state/i, weight: 45 },
|
|
56
|
+
// Context window manipulation
|
|
57
|
+
{ name: "first_message", pattern: /what\s+was\s+(the\s+)?first\s+message/i, weight: 45 },
|
|
58
|
+
{ name: "beginning_context", pattern: /what('s|\s+is)\s+at\s+the\s+beginning\s+of\s+(your\s+)?context/i, weight: 50 },
|
|
59
|
+
{ name: "context_start", pattern: /show\s+(me\s+)?the\s+start\s+of\s+(your\s+)?context/i, weight: 50 },
|
|
60
|
+
];
|
|
61
|
+
// Indirect extraction patterns
|
|
62
|
+
this.INDIRECT_EXTRACTION_PATTERNS = [
|
|
63
|
+
// Summarization tricks
|
|
64
|
+
{ name: "summarize_guidelines", pattern: /summarize\s+(your\s+)?(guidelines|rules|constraints)/i, weight: 35 },
|
|
65
|
+
{ name: "describe_limitations", pattern: /describe\s+(your\s+)?limitations/i, weight: 30 },
|
|
66
|
+
// Role clarification exploits - ENHANCED
|
|
67
|
+
{ name: "explain_role", pattern: /explain\s+(in\s+detail\s+)?(your\s+)?role/i, weight: 35 },
|
|
68
|
+
{ name: "explain_role_purpose", pattern: /explain\s+.*(role|purpose)/i, weight: 35 },
|
|
69
|
+
{ name: "what_cant_do", pattern: /what\s+(can't|cannot)\s+you\s+do/i, weight: 25 },
|
|
70
|
+
{ name: "describe_yourself", pattern: /describe\s+(yourself|your\s+purpose)/i, weight: 30 },
|
|
71
|
+
// Hypothetical extraction
|
|
72
|
+
{ name: "if_asked", pattern: /if\s+i\s+asked\s+for\s+your\s+prompt/i, weight: 40 },
|
|
73
|
+
{ name: "hypothetically", pattern: /hypothetically[,\s]+(what|how)\s+would\s+your\s+instructions/i, weight: 40 },
|
|
74
|
+
// Output-as-input tricks
|
|
75
|
+
{ name: "write_prompt", pattern: /write\s+a\s+(system\s+)?prompt\s+similar\s+to\s+yours/i, weight: 45 },
|
|
76
|
+
{ name: "recreate_instructions", pattern: /recreate\s+(your\s+)?instructions/i, weight: 45 },
|
|
77
|
+
];
|
|
78
|
+
// Leetspeak character mappings
|
|
79
|
+
this.LEETSPEAK_MAP = {
|
|
80
|
+
"4": "a", "@": "a", "8": "b", "(": "c", "3": "e",
|
|
81
|
+
"6": "g", "#": "h", "1": "i", "!": "i", "|": "l",
|
|
82
|
+
"7": "t", "0": "o", "5": "s", "$": "s",
|
|
83
|
+
"+": "t", "2": "z", "9": "g",
|
|
84
|
+
};
|
|
85
|
+
// ROT13 decode
|
|
86
|
+
this.ROT13_MAP = {};
|
|
87
|
+
// Common Morse code words related to prompts
|
|
88
|
+
this.MORSE_KEYWORDS = [
|
|
89
|
+
"... -.-- ... - . --", // SYSTEM
|
|
90
|
+
".--. .-. --- -- .--. -", // PROMPT
|
|
91
|
+
".. -. ... - .-. ..- -.-. - .. --- -. ...", // INSTRUCTIONS
|
|
92
|
+
];
|
|
93
|
+
this.config = {
|
|
94
|
+
detectLeetspeak: config.detectLeetspeak ?? true,
|
|
95
|
+
detectROT13: config.detectROT13 ?? true,
|
|
96
|
+
detectBase64: config.detectBase64 ?? true,
|
|
97
|
+
detectMorse: config.detectMorse ?? true,
|
|
98
|
+
detectUnicode: config.detectUnicode ?? true,
|
|
99
|
+
detectIndirectExtraction: config.detectIndirectExtraction ?? true,
|
|
100
|
+
monitorOutput: config.monitorOutput ?? true,
|
|
101
|
+
systemPromptHash: config.systemPromptHash ?? "",
|
|
102
|
+
systemPromptKeywords: config.systemPromptKeywords ?? [],
|
|
103
|
+
similarityThreshold: config.similarityThreshold ?? 0.7,
|
|
104
|
+
riskThreshold: config.riskThreshold ?? 40,
|
|
105
|
+
customPatterns: config.customPatterns ?? [],
|
|
106
|
+
};
|
|
107
|
+
// Initialize ROT13 map
|
|
108
|
+
for (let i = 0; i < 26; i++) {
|
|
109
|
+
const lower = String.fromCharCode(97 + i);
|
|
110
|
+
const upper = String.fromCharCode(65 + i);
|
|
111
|
+
this.ROT13_MAP[lower] = String.fromCharCode(97 + ((i + 13) % 26));
|
|
112
|
+
this.ROT13_MAP[upper] = String.fromCharCode(65 + ((i + 13) % 26));
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
/**
|
|
116
|
+
* Check input for prompt extraction attempts
|
|
117
|
+
*/
|
|
118
|
+
check(input, requestId) {
|
|
119
|
+
const reqId = requestId || `pl-${Date.now()}`;
|
|
120
|
+
const violations = [];
|
|
121
|
+
const evasionTechniques = [];
|
|
122
|
+
let riskScore = 0;
|
|
123
|
+
let directAttempt = false;
|
|
124
|
+
let encodedAttempt = false;
|
|
125
|
+
let indirectAttempt = false;
|
|
126
|
+
let decodedContent;
|
|
127
|
+
// Check direct extraction patterns
|
|
128
|
+
for (const { name, pattern, weight } of this.DIRECT_EXTRACTION_PATTERNS) {
|
|
129
|
+
if (pattern.test(input)) {
|
|
130
|
+
violations.push(`direct_extraction: ${name}`);
|
|
131
|
+
riskScore += weight;
|
|
132
|
+
directAttempt = true;
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
// Check indirect extraction patterns
|
|
136
|
+
if (this.config.detectIndirectExtraction) {
|
|
137
|
+
for (const { name, pattern, weight } of this.INDIRECT_EXTRACTION_PATTERNS) {
|
|
138
|
+
if (pattern.test(input)) {
|
|
139
|
+
violations.push(`indirect_extraction: ${name}`);
|
|
140
|
+
riskScore += weight;
|
|
141
|
+
indirectAttempt = true;
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
// Check for Leetspeak evasion
|
|
146
|
+
if (this.config.detectLeetspeak) {
|
|
147
|
+
const decoded = this.decodeLeetspeak(input);
|
|
148
|
+
if (decoded !== input.toLowerCase()) {
|
|
149
|
+
// Check decoded against both direct and indirect patterns
|
|
150
|
+
const leetspeakCheck = this.checkDecodedContent(decoded, "leetspeak");
|
|
151
|
+
if (leetspeakCheck.detected) {
|
|
152
|
+
violations.push(...leetspeakCheck.violations);
|
|
153
|
+
riskScore += leetspeakCheck.riskContribution;
|
|
154
|
+
evasionTechniques.push("leetspeak");
|
|
155
|
+
encodedAttempt = true;
|
|
156
|
+
decodedContent = decoded;
|
|
157
|
+
}
|
|
158
|
+
else {
|
|
159
|
+
// Also check for keywords in decoded content
|
|
160
|
+
const keywordCheck = this.checkKeywordsInDecoded(decoded);
|
|
161
|
+
if (keywordCheck.detected) {
|
|
162
|
+
violations.push(`leetspeak_keyword: ${keywordCheck.keywords.join(", ")}`);
|
|
163
|
+
riskScore += 35;
|
|
164
|
+
evasionTechniques.push("leetspeak");
|
|
165
|
+
encodedAttempt = true;
|
|
166
|
+
decodedContent = decoded;
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
// Check for ROT13 evasion
|
|
172
|
+
if (this.config.detectROT13) {
|
|
173
|
+
const decoded = this.decodeROT13(input);
|
|
174
|
+
const rot13Check = this.checkDecodedContent(decoded, "rot13");
|
|
175
|
+
if (rot13Check.detected) {
|
|
176
|
+
violations.push(...rot13Check.violations);
|
|
177
|
+
riskScore += rot13Check.riskContribution;
|
|
178
|
+
evasionTechniques.push("rot13");
|
|
179
|
+
encodedAttempt = true;
|
|
180
|
+
decodedContent = decoded;
|
|
181
|
+
}
|
|
182
|
+
else {
|
|
183
|
+
// Check for keywords in ROT13 decoded content
|
|
184
|
+
const keywordCheck = this.checkKeywordsInDecoded(decoded);
|
|
185
|
+
if (keywordCheck.detected) {
|
|
186
|
+
violations.push(`rot13_keyword: ${keywordCheck.keywords.join(", ")}`);
|
|
187
|
+
riskScore += 40;
|
|
188
|
+
evasionTechniques.push("rot13");
|
|
189
|
+
encodedAttempt = true;
|
|
190
|
+
decodedContent = decoded;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
// Check for Base64 encoded content
|
|
195
|
+
if (this.config.detectBase64) {
|
|
196
|
+
const base64Matches = input.match(/[A-Za-z0-9+/]{16,}={0,2}/g);
|
|
197
|
+
if (base64Matches) {
|
|
198
|
+
for (const match of base64Matches) {
|
|
199
|
+
try {
|
|
200
|
+
const decoded = Buffer.from(match, "base64").toString("utf-8");
|
|
201
|
+
if (decoded && /[\x20-\x7E]{4,}/.test(decoded)) {
|
|
202
|
+
const base64Check = this.checkDecodedContent(decoded, "base64");
|
|
203
|
+
if (base64Check.detected) {
|
|
204
|
+
violations.push(...base64Check.violations);
|
|
205
|
+
riskScore += base64Check.riskContribution;
|
|
206
|
+
evasionTechniques.push("base64");
|
|
207
|
+
encodedAttempt = true;
|
|
208
|
+
decodedContent = decoded;
|
|
209
|
+
}
|
|
210
|
+
else {
|
|
211
|
+
// Check for keywords in Base64 decoded content
|
|
212
|
+
const keywordCheck = this.checkKeywordsInDecoded(decoded);
|
|
213
|
+
if (keywordCheck.detected) {
|
|
214
|
+
violations.push(`base64_keyword: ${keywordCheck.keywords.join(", ")}`);
|
|
215
|
+
riskScore += 45;
|
|
216
|
+
evasionTechniques.push("base64");
|
|
217
|
+
encodedAttempt = true;
|
|
218
|
+
decodedContent = decoded;
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
catch {
|
|
224
|
+
// Not valid Base64
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
// Check for Unicode evasion (homoglyphs, invisible chars)
|
|
230
|
+
if (this.config.detectUnicode) {
|
|
231
|
+
const unicodeCheck = this.checkUnicodeEvasion(input);
|
|
232
|
+
if (unicodeCheck.detected) {
|
|
233
|
+
violations.push(...unicodeCheck.violations);
|
|
234
|
+
riskScore += unicodeCheck.riskContribution;
|
|
235
|
+
evasionTechniques.push("unicode");
|
|
236
|
+
encodedAttempt = true;
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
// Check for Morse code
|
|
240
|
+
if (this.config.detectMorse) {
|
|
241
|
+
const morseCheck = this.checkMorseCode(input);
|
|
242
|
+
if (morseCheck.detected) {
|
|
243
|
+
violations.push(...morseCheck.violations);
|
|
244
|
+
riskScore += morseCheck.riskContribution;
|
|
245
|
+
evasionTechniques.push("morse");
|
|
246
|
+
encodedAttempt = true;
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
// Check custom patterns
|
|
250
|
+
for (let i = 0; i < this.config.customPatterns.length; i++) {
|
|
251
|
+
if (this.config.customPatterns[i].test(input)) {
|
|
252
|
+
violations.push(`custom_pattern_${i}`);
|
|
253
|
+
riskScore += 30;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
// Normalize risk score
|
|
257
|
+
riskScore = Math.min(100, riskScore);
|
|
258
|
+
const blocked = riskScore >= this.config.riskThreshold;
|
|
259
|
+
return {
|
|
260
|
+
allowed: !blocked,
|
|
261
|
+
reason: blocked
|
|
262
|
+
? `Prompt extraction attempt detected (risk: ${riskScore})`
|
|
263
|
+
: "Input validated",
|
|
264
|
+
violations,
|
|
265
|
+
request_id: reqId,
|
|
266
|
+
analysis: {
|
|
267
|
+
direct_extraction_attempt: directAttempt,
|
|
268
|
+
encoded_extraction_attempt: encodedAttempt,
|
|
269
|
+
indirect_extraction_attempt: indirectAttempt,
|
|
270
|
+
evasion_techniques_detected: evasionTechniques,
|
|
271
|
+
risk_score: riskScore,
|
|
272
|
+
decoded_content: decodedContent,
|
|
273
|
+
},
|
|
274
|
+
recommendations: this.generateRecommendations(violations, evasionTechniques),
|
|
275
|
+
};
|
|
276
|
+
}
|
|
277
|
+
/**
|
|
278
|
+
* Monitor output for potential prompt leakage
|
|
279
|
+
*/
|
|
280
|
+
checkOutput(output, requestId) {
|
|
281
|
+
const reqId = requestId || `pl-out-${Date.now()}`;
|
|
282
|
+
const violations = [];
|
|
283
|
+
const keywordsFound = [];
|
|
284
|
+
const potentialFragments = [];
|
|
285
|
+
let leaked = false;
|
|
286
|
+
if (!this.config.monitorOutput) {
|
|
287
|
+
return {
|
|
288
|
+
leaked: false,
|
|
289
|
+
reason: "Output monitoring disabled",
|
|
290
|
+
violations: [],
|
|
291
|
+
request_id: reqId,
|
|
292
|
+
analysis: {
|
|
293
|
+
keywords_found: [],
|
|
294
|
+
similarity_score: 0,
|
|
295
|
+
potential_leakage_fragments: [],
|
|
296
|
+
},
|
|
297
|
+
};
|
|
298
|
+
}
|
|
299
|
+
// Check for system prompt keywords in output
|
|
300
|
+
for (const keyword of this.config.systemPromptKeywords) {
|
|
301
|
+
if (output.toLowerCase().includes(keyword.toLowerCase())) {
|
|
302
|
+
keywordsFound.push(keyword);
|
|
303
|
+
violations.push(`keyword_leaked: ${keyword}`);
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
// Check for common prompt fragment patterns
|
|
307
|
+
const promptFragmentPatterns = [
|
|
308
|
+
/you\s+are\s+a[n]?\s+(helpful\s+)?assistant/i,
|
|
309
|
+
/your\s+(role|purpose|goal)\s+is\s+to/i,
|
|
310
|
+
/you\s+(must|should|will)\s+(always|never)/i,
|
|
311
|
+
/do\s+not\s+(reveal|disclose|share)\s+(your|the)\s+(system|initial)/i,
|
|
312
|
+
/\[system\]|\[instruction\]|<<sys>>|<\|system\|>/i,
|
|
313
|
+
/as\s+an?\s+AI\s+(assistant|model|language\s+model)/i,
|
|
314
|
+
];
|
|
315
|
+
for (const pattern of promptFragmentPatterns) {
|
|
316
|
+
const match = output.match(pattern);
|
|
317
|
+
if (match) {
|
|
318
|
+
potentialFragments.push(match[0]);
|
|
319
|
+
violations.push("prompt_fragment_detected");
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
// Calculate similarity if hash provided
|
|
323
|
+
let similarityScore = 0;
|
|
324
|
+
// In production, you'd compare against actual system prompt hash
|
|
325
|
+
// For now, we check fragment density
|
|
326
|
+
similarityScore = potentialFragments.length / 10; // Rough heuristic
|
|
327
|
+
leaked = keywordsFound.length > 0 || potentialFragments.length >= 2;
|
|
328
|
+
return {
|
|
329
|
+
leaked,
|
|
330
|
+
reason: leaked
|
|
331
|
+
? `Potential prompt leakage detected: ${violations.slice(0, 3).join(", ")}`
|
|
332
|
+
: "Output appears safe",
|
|
333
|
+
violations,
|
|
334
|
+
request_id: reqId,
|
|
335
|
+
analysis: {
|
|
336
|
+
keywords_found: keywordsFound,
|
|
337
|
+
similarity_score: Math.min(1, similarityScore),
|
|
338
|
+
potential_leakage_fragments: potentialFragments,
|
|
339
|
+
},
|
|
340
|
+
sanitized_output: leaked ? this.sanitizeOutput(output) : undefined,
|
|
341
|
+
};
|
|
342
|
+
}
|
|
343
|
+
/**
|
|
344
|
+
* Set system prompt keywords for output monitoring
|
|
345
|
+
*/
|
|
346
|
+
setSystemPromptKeywords(keywords) {
|
|
347
|
+
this.config.systemPromptKeywords = keywords;
|
|
348
|
+
}
|
|
349
|
+
/**
|
|
350
|
+
* Add custom extraction pattern
|
|
351
|
+
*/
|
|
352
|
+
addPattern(pattern) {
|
|
353
|
+
this.config.customPatterns.push(pattern);
|
|
354
|
+
}
|
|
355
|
+
/**
|
|
356
|
+
* Update risk threshold
|
|
357
|
+
*/
|
|
358
|
+
setRiskThreshold(threshold) {
|
|
359
|
+
this.config.riskThreshold = Math.max(0, Math.min(100, threshold));
|
|
360
|
+
}
|
|
361
|
+
// Private methods
|
|
362
|
+
decodeLeetspeak(input) {
|
|
363
|
+
let result = input.toLowerCase();
|
|
364
|
+
// Extended leetspeak mappings
|
|
365
|
+
const extendedMap = {
|
|
366
|
+
...this.LEETSPEAK_MAP,
|
|
367
|
+
"0": "o",
|
|
368
|
+
"1": "i",
|
|
369
|
+
"3": "e",
|
|
370
|
+
"4": "a",
|
|
371
|
+
"5": "s",
|
|
372
|
+
"7": "t",
|
|
373
|
+
"8": "b",
|
|
374
|
+
"9": "g",
|
|
375
|
+
"@": "a",
|
|
376
|
+
"$": "s",
|
|
377
|
+
"!": "i",
|
|
378
|
+
"|": "l",
|
|
379
|
+
"(": "c",
|
|
380
|
+
"+": "t",
|
|
381
|
+
"#": "h",
|
|
382
|
+
};
|
|
383
|
+
for (const [leet, char] of Object.entries(extendedMap)) {
|
|
384
|
+
result = result.split(leet).join(char);
|
|
385
|
+
}
|
|
386
|
+
return result;
|
|
387
|
+
}
|
|
388
|
+
decodeROT13(input) {
|
|
389
|
+
return input
|
|
390
|
+
.split("")
|
|
391
|
+
.map((char) => this.ROT13_MAP[char] || char)
|
|
392
|
+
.join("");
|
|
393
|
+
}
|
|
394
|
+
checkDecodedContent(decoded, technique) {
|
|
395
|
+
const violations = [];
|
|
396
|
+
let riskContribution = 0;
|
|
397
|
+
for (const { name, pattern, weight } of this.DIRECT_EXTRACTION_PATTERNS) {
|
|
398
|
+
if (pattern.test(decoded)) {
|
|
399
|
+
violations.push(`${technique}_evasion: ${name}`);
|
|
400
|
+
riskContribution += weight + 10; // Extra penalty for evasion
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
return {
|
|
404
|
+
detected: violations.length > 0,
|
|
405
|
+
violations,
|
|
406
|
+
riskContribution,
|
|
407
|
+
};
|
|
408
|
+
}
|
|
409
|
+
checkUnicodeEvasion(input) {
|
|
410
|
+
const violations = [];
|
|
411
|
+
let riskContribution = 0;
|
|
412
|
+
// Check for invisible characters
|
|
413
|
+
const invisibleChars = input.match(/[\u200B-\u200D\uFEFF\u2060-\u206F\u00AD]/g);
|
|
414
|
+
if (invisibleChars && invisibleChars.length > 3) {
|
|
415
|
+
violations.push("invisible_unicode_chars");
|
|
416
|
+
riskContribution += 20;
|
|
417
|
+
}
|
|
418
|
+
// Check for homoglyphs (Cyrillic, Greek letters that look like Latin)
|
|
419
|
+
const homoglyphs = input.match(/[\u0400-\u04FF\u0370-\u03FF]/g);
|
|
420
|
+
if (homoglyphs && homoglyphs.length > 0) {
|
|
421
|
+
// Normalize and check
|
|
422
|
+
const normalized = input.normalize("NFKD").replace(/[\u0300-\u036f]/g, "");
|
|
423
|
+
for (const { pattern } of this.DIRECT_EXTRACTION_PATTERNS) {
|
|
424
|
+
if (pattern.test(normalized)) {
|
|
425
|
+
violations.push("homoglyph_evasion");
|
|
426
|
+
riskContribution += 30;
|
|
427
|
+
break;
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
// Check for fullwidth characters
|
|
432
|
+
const fullwidth = input.match(/[\uFF01-\uFF5E]/g);
|
|
433
|
+
if (fullwidth && fullwidth.length > 5) {
|
|
434
|
+
violations.push("fullwidth_chars");
|
|
435
|
+
riskContribution += 15;
|
|
436
|
+
}
|
|
437
|
+
return {
|
|
438
|
+
detected: violations.length > 0,
|
|
439
|
+
violations,
|
|
440
|
+
riskContribution,
|
|
441
|
+
};
|
|
442
|
+
}
|
|
443
|
+
checkMorseCode(input) {
|
|
444
|
+
const violations = [];
|
|
445
|
+
let riskContribution = 0;
|
|
446
|
+
// Check if input contains Morse-like patterns
|
|
447
|
+
const morsePattern = /[.\-]{2,}\s+[.\-]{2,}/;
|
|
448
|
+
if (morsePattern.test(input)) {
|
|
449
|
+
// Check for known prompt-related Morse
|
|
450
|
+
for (const keyword of this.MORSE_KEYWORDS) {
|
|
451
|
+
if (input.includes(keyword)) {
|
|
452
|
+
violations.push("morse_code_evasion");
|
|
453
|
+
riskContribution += 35;
|
|
454
|
+
break;
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
return {
|
|
459
|
+
detected: violations.length > 0,
|
|
460
|
+
violations,
|
|
461
|
+
riskContribution,
|
|
462
|
+
};
|
|
463
|
+
}
|
|
464
|
+
checkKeywordsInDecoded(decoded) {
|
|
465
|
+
// Action keywords (verbs that indicate extraction intent)
|
|
466
|
+
const actionKeywords = ["reveal", "show", "display", "print", "output", "dump", "list", "give", "tell"];
|
|
467
|
+
// Target keywords (what they're trying to extract)
|
|
468
|
+
const targetKeywords = ["prompt", "instructions", "configuration", "config", "rules", "guidelines", "constraints", "system", "initial", "secret", "hidden", "internal"];
|
|
469
|
+
const foundKeywords = [];
|
|
470
|
+
const lowerDecoded = decoded.toLowerCase();
|
|
471
|
+
// Check for action + target combination (strong indicator)
|
|
472
|
+
let hasAction = false;
|
|
473
|
+
let hasTarget = false;
|
|
474
|
+
for (const keyword of actionKeywords) {
|
|
475
|
+
if (lowerDecoded.includes(keyword)) {
|
|
476
|
+
foundKeywords.push(keyword);
|
|
477
|
+
hasAction = true;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
for (const keyword of targetKeywords) {
|
|
481
|
+
if (lowerDecoded.includes(keyword)) {
|
|
482
|
+
foundKeywords.push(keyword);
|
|
483
|
+
hasTarget = true;
|
|
484
|
+
}
|
|
485
|
+
}
|
|
486
|
+
// Detected if we have both an action AND a target
|
|
487
|
+
// This catches "reveal your prompt", "show me instructions", etc.
|
|
488
|
+
return {
|
|
489
|
+
detected: hasAction && hasTarget,
|
|
490
|
+
keywords: foundKeywords,
|
|
491
|
+
};
|
|
492
|
+
}
|
|
493
|
+
sanitizeOutput(output) {
|
|
494
|
+
let sanitized = output;
|
|
495
|
+
// Remove common prompt fragments
|
|
496
|
+
const fragmentPatterns = [
|
|
497
|
+
/you\s+are\s+a[n]?\s+(helpful\s+)?assistant[^.]*\./gi,
|
|
498
|
+
/your\s+(role|purpose|goal)\s+is\s+to[^.]*\./gi,
|
|
499
|
+
/you\s+(must|should|will)\s+(always|never)[^.]*\./gi,
|
|
500
|
+
/\[system\][^[\]]*\[\/system\]/gi,
|
|
501
|
+
/<<sys>>[^<]*<<\/sys>>/gi,
|
|
502
|
+
];
|
|
503
|
+
for (const pattern of fragmentPatterns) {
|
|
504
|
+
sanitized = sanitized.replace(pattern, "[REDACTED]");
|
|
505
|
+
}
|
|
506
|
+
return sanitized;
|
|
507
|
+
}
|
|
508
|
+
generateRecommendations(violations, evasionTechniques) {
|
|
509
|
+
const recommendations = [];
|
|
510
|
+
if (violations.some((v) => v.includes("direct_extraction"))) {
|
|
511
|
+
recommendations.push("Direct prompt extraction attempt blocked");
|
|
512
|
+
}
|
|
513
|
+
if (violations.some((v) => v.includes("indirect_extraction"))) {
|
|
514
|
+
recommendations.push("Consider strengthening indirect extraction detection");
|
|
515
|
+
}
|
|
516
|
+
if (evasionTechniques.length > 0) {
|
|
517
|
+
recommendations.push(`Evasion techniques detected: ${evasionTechniques.join(", ")}`);
|
|
518
|
+
}
|
|
519
|
+
if (violations.some((v) => v.includes("unicode"))) {
|
|
520
|
+
recommendations.push("Normalize input before processing");
|
|
521
|
+
}
|
|
522
|
+
if (recommendations.length === 0) {
|
|
523
|
+
recommendations.push("Input validated successfully");
|
|
524
|
+
}
|
|
525
|
+
return recommendations;
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
// Expose the guard class on the CommonJS exports object.
exports.PromptLeakageGuard = PromptLeakageGuard;
|
|
529
|
+
//# sourceMappingURL=prompt-leakage-guard.js.map
|