llm-trust-guard 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +318 -0
  3. package/dist/guards/agent-communication-guard.d.ts +169 -0
  4. package/dist/guards/agent-communication-guard.d.ts.map +1 -0
  5. package/dist/guards/agent-communication-guard.js +468 -0
  6. package/dist/guards/agent-communication-guard.js.map +1 -0
  7. package/dist/guards/autonomy-escalation-guard.d.ts +137 -0
  8. package/dist/guards/autonomy-escalation-guard.d.ts.map +1 -0
  9. package/dist/guards/autonomy-escalation-guard.js +470 -0
  10. package/dist/guards/autonomy-escalation-guard.js.map +1 -0
  11. package/dist/guards/circuit-breaker.d.ts +142 -0
  12. package/dist/guards/circuit-breaker.d.ts.map +1 -0
  13. package/dist/guards/circuit-breaker.js +347 -0
  14. package/dist/guards/circuit-breaker.js.map +1 -0
  15. package/dist/guards/code-execution-guard.d.ts +114 -0
  16. package/dist/guards/code-execution-guard.d.ts.map +1 -0
  17. package/dist/guards/code-execution-guard.js +467 -0
  18. package/dist/guards/code-execution-guard.js.map +1 -0
  19. package/dist/guards/conversation-guard.d.ts +73 -0
  20. package/dist/guards/conversation-guard.d.ts.map +1 -0
  21. package/dist/guards/conversation-guard.js +281 -0
  22. package/dist/guards/conversation-guard.js.map +1 -0
  23. package/dist/guards/drift-detector.d.ts +182 -0
  24. package/dist/guards/drift-detector.d.ts.map +1 -0
  25. package/dist/guards/drift-detector.js +480 -0
  26. package/dist/guards/drift-detector.js.map +1 -0
  27. package/dist/guards/encoding-detector.d.ts +76 -0
  28. package/dist/guards/encoding-detector.d.ts.map +1 -0
  29. package/dist/guards/encoding-detector.js +698 -0
  30. package/dist/guards/encoding-detector.js.map +1 -0
  31. package/dist/guards/execution-monitor.d.ts +73 -0
  32. package/dist/guards/execution-monitor.d.ts.map +1 -0
  33. package/dist/guards/execution-monitor.js +205 -0
  34. package/dist/guards/execution-monitor.js.map +1 -0
  35. package/dist/guards/input-sanitizer.d.ts +87 -0
  36. package/dist/guards/input-sanitizer.d.ts.map +1 -0
  37. package/dist/guards/input-sanitizer.js +301 -0
  38. package/dist/guards/input-sanitizer.js.map +1 -0
  39. package/dist/guards/mcp-security-guard.d.ts +204 -0
  40. package/dist/guards/mcp-security-guard.d.ts.map +1 -0
  41. package/dist/guards/mcp-security-guard.js +618 -0
  42. package/dist/guards/mcp-security-guard.js.map +1 -0
  43. package/dist/guards/memory-guard.d.ts +124 -0
  44. package/dist/guards/memory-guard.d.ts.map +1 -0
  45. package/dist/guards/memory-guard.js +476 -0
  46. package/dist/guards/memory-guard.js.map +1 -0
  47. package/dist/guards/multimodal-guard.d.ts +93 -0
  48. package/dist/guards/multimodal-guard.d.ts.map +1 -0
  49. package/dist/guards/multimodal-guard.js +507 -0
  50. package/dist/guards/multimodal-guard.js.map +1 -0
  51. package/dist/guards/output-filter.d.ts +76 -0
  52. package/dist/guards/output-filter.d.ts.map +1 -0
  53. package/dist/guards/output-filter.js +289 -0
  54. package/dist/guards/output-filter.js.map +1 -0
  55. package/dist/guards/policy-gate.d.ts +57 -0
  56. package/dist/guards/policy-gate.d.ts.map +1 -0
  57. package/dist/guards/policy-gate.js +182 -0
  58. package/dist/guards/policy-gate.js.map +1 -0
  59. package/dist/guards/prompt-leakage-guard.d.ts +110 -0
  60. package/dist/guards/prompt-leakage-guard.d.ts.map +1 -0
  61. package/dist/guards/prompt-leakage-guard.js +529 -0
  62. package/dist/guards/prompt-leakage-guard.js.map +1 -0
  63. package/dist/guards/rag-guard.d.ts +188 -0
  64. package/dist/guards/rag-guard.d.ts.map +1 -0
  65. package/dist/guards/rag-guard.js +769 -0
  66. package/dist/guards/rag-guard.js.map +1 -0
  67. package/dist/guards/schema-validator.d.ts +35 -0
  68. package/dist/guards/schema-validator.d.ts.map +1 -0
  69. package/dist/guards/schema-validator.js +316 -0
  70. package/dist/guards/schema-validator.js.map +1 -0
  71. package/dist/guards/state-persistence-guard.d.ts +153 -0
  72. package/dist/guards/state-persistence-guard.d.ts.map +1 -0
  73. package/dist/guards/state-persistence-guard.js +484 -0
  74. package/dist/guards/state-persistence-guard.js.map +1 -0
  75. package/dist/guards/tenant-boundary.d.ts +67 -0
  76. package/dist/guards/tenant-boundary.d.ts.map +1 -0
  77. package/dist/guards/tenant-boundary.js +187 -0
  78. package/dist/guards/tenant-boundary.js.map +1 -0
  79. package/dist/guards/tool-chain-validator.d.ts +102 -0
  80. package/dist/guards/tool-chain-validator.d.ts.map +1 -0
  81. package/dist/guards/tool-chain-validator.js +480 -0
  82. package/dist/guards/tool-chain-validator.js.map +1 -0
  83. package/dist/guards/tool-registry.d.ts +45 -0
  84. package/dist/guards/tool-registry.d.ts.map +1 -0
  85. package/dist/guards/tool-registry.js +155 -0
  86. package/dist/guards/tool-registry.js.map +1 -0
  87. package/dist/guards/trust-exploitation-guard.d.ts +134 -0
  88. package/dist/guards/trust-exploitation-guard.d.ts.map +1 -0
  89. package/dist/guards/trust-exploitation-guard.js +354 -0
  90. package/dist/guards/trust-exploitation-guard.js.map +1 -0
  91. package/dist/index.d.ts +133 -0
  92. package/dist/index.d.ts.map +1 -0
  93. package/dist/index.js +430 -0
  94. package/dist/index.js.map +1 -0
  95. package/dist/integrations/express.d.ts +119 -0
  96. package/dist/integrations/express.d.ts.map +1 -0
  97. package/dist/integrations/express.js +244 -0
  98. package/dist/integrations/express.js.map +1 -0
  99. package/dist/integrations/index.d.ts +9 -0
  100. package/dist/integrations/index.d.ts.map +1 -0
  101. package/dist/integrations/index.js +26 -0
  102. package/dist/integrations/index.js.map +1 -0
  103. package/dist/integrations/langchain.d.ts +165 -0
  104. package/dist/integrations/langchain.d.ts.map +1 -0
  105. package/dist/integrations/langchain.js +308 -0
  106. package/dist/integrations/langchain.js.map +1 -0
  107. package/dist/integrations/openai.d.ts +205 -0
  108. package/dist/integrations/openai.d.ts.map +1 -0
  109. package/dist/integrations/openai.js +380 -0
  110. package/dist/integrations/openai.js.map +1 -0
  111. package/dist/types/index.d.ts +245 -0
  112. package/dist/types/index.d.ts.map +1 -0
  113. package/dist/types/index.js +6 -0
  114. package/dist/types/index.js.map +1 -0
  115. package/package.json +64 -0
@@ -0,0 +1,110 @@
1
+ /**
2
+ * PromptLeakageGuard (L15)
3
+ *
4
+ * Prevents system prompt extraction and leakage attacks.
5
+ * Detects various evasion techniques used to extract system prompts.
6
+ *
7
+ * Threat Model:
8
+ * - OWASP LLM07:2025 System Prompt Leakage
9
+ * - PLeak algorithmic extraction attacks
10
+ * - Remember-the-Start attacks
11
+ * - Evasion techniques (Leetspeak, ROT13, Base64, Morse)
12
+ *
13
+ * Protection Capabilities:
14
+ * - Direct extraction attempt detection
15
+ * - Encoded extraction detection (Leetspeak, ROT13, Morse, etc.)
16
+ * - Indirect extraction pattern detection
17
+ * - Output monitoring for prompt leakage
18
+ * - Prefix completion attack detection
19
+ */
20
+ export interface PromptLeakageGuardConfig {
21
+ /** Decode Leetspeak substitutions (e.g. "3" -> "e") and re-scan the input; the compiled implementation defaults this to true */
22
+ detectLeetspeak?: boolean;
23
+ /** Apply ROT13 to the input and re-scan the result; the compiled implementation defaults this to true */
24
+ detectROT13?: boolean;
25
+ /** Decode Base64-looking runs of 16+ characters and re-scan them; the compiled implementation defaults this to true */
26
+ detectBase64?: boolean;
27
+ /** Look for the Morse spellings of SYSTEM / PROMPT / INSTRUCTIONS; the compiled implementation defaults this to true */
28
+ detectMorse?: boolean;
29
+ /** Flag invisible characters, Cyrillic/Greek homoglyphs and fullwidth characters; the compiled implementation defaults this to true */
30
+ detectUnicode?: boolean;
31
+ /** Also scan for indirect requests (summarize/describe/explain your rules or role); the compiled implementation defaults this to true */
32
+ detectIndirectExtraction?: boolean;
33
+ /** When false, checkOutput() returns leaked=false without scanning; the compiled implementation defaults this to true */
34
+ monitorOutput?: boolean;
35
+ /** System prompt hash for leakage detection (optional). NOTE(review): the compiled implementation stores this value but never reads it */
36
+ systemPromptHash?: string;
37
+ /** Keywords from the system prompt; checkOutput() reports a leak when any appears in the output (case-insensitive substring match) */
38
+ systemPromptKeywords?: string[];
39
+ /** Similarity threshold for output monitoring (0-1). NOTE(review): the compiled implementation defaults this to 0.7 but never compares against it */
40
+ similarityThreshold?: number;
41
+ /** Risk score threshold (0-100); check() blocks when the accumulated score is >= this value. The compiled implementation defaults it to 40 */
42
+ riskThreshold?: number;
43
+ /** Extra patterns tested against the raw input; in the compiled implementation each match adds 30 to the risk score */
44
+ customPatterns?: RegExp[];
45
+ }
46
+ export interface PromptLeakageResult {
47
+ allowed: boolean; // false when analysis.risk_score reached the configured riskThreshold
48
+ reason: string; // human-readable verdict: "Input validated" or an extraction message that includes the risk score
49
+ violations: string[]; // one entry per matched rule, e.g. "direct_extraction: reveal_prompt" or "custom_pattern_0"
50
+ request_id: string; // the caller-supplied id, or a generated "pl-<timestamp>" id
51
+ analysis: {
52
+ direct_extraction_attempt: boolean; // a direct extraction pattern matched the raw input
53
+ encoded_extraction_attempt: boolean; // an encoded or obfuscated variant was detected
54
+ indirect_extraction_attempt: boolean; // an indirect extraction pattern matched the raw input
55
+ evasion_techniques_detected: string[]; // any of "leetspeak", "rot13", "base64", "unicode", "morse"
56
+ risk_score: number; // sum of matched weights, capped at 100
57
+ decoded_content?: string; // decoded text from a technique that matched; absent when none matched
58
+ };
59
+ recommendations: string[]; // advisory messages derived from the violations and evasion techniques
60
+ }
61
+ export interface OutputLeakageResult {
62
+ leaked: boolean; // true when a configured keyword was found or at least two prompt-like fragments matched
63
+ reason: string; // "Output appears safe", "Output monitoring disabled", or a summary of the first violations
64
+ violations: string[]; // "keyword_leaked: <keyword>" and/or "prompt_fragment_detected" entries
65
+ request_id: string; // the caller-supplied id, or a generated "pl-out-<timestamp>" id
66
+ analysis: {
67
+ keywords_found: string[]; // configured system prompt keywords present in the output
68
+ similarity_score: number; // rough heuristic in the compiled implementation: matched fragments / 10, capped at 1
69
+ potential_leakage_fragments: string[]; // the matched text of each prompt-like fragment
70
+ };
71
+ sanitized_output?: string; // redacted copy of the output; only present when leaked is true
72
+ }
73
+ export declare class PromptLeakageGuard {
74
+ private config; // resolved configuration with defaults applied
75
+ private readonly DIRECT_EXTRACTION_PATTERNS; // weighted regexes for explicit "show/reveal/repeat your prompt" requests
76
+ private readonly INDIRECT_EXTRACTION_PATTERNS; // weighted regexes for summarize/describe/hypothetical requests
77
+ private readonly LEETSPEAK_MAP; // leet character -> plain letter
78
+ private readonly ROT13_MAP; // letter -> ROT13 counterpart, filled in by the constructor
79
+ private readonly MORSE_KEYWORDS; // Morse spellings of SYSTEM, PROMPT and INSTRUCTIONS
80
+ constructor(config?: PromptLeakageGuardConfig);
81
+ /**
82
+ * Check input for prompt extraction attempts; the result is blocked (allowed=false) when the accumulated risk score reaches riskThreshold
83
+ */
84
+ check(input: string, requestId?: string): PromptLeakageResult;
85
+ /**
86
+ * Monitor output for potential prompt leakage (configured keywords and generic prompt-like fragments)
87
+ */
88
+ checkOutput(output: string, requestId?: string): OutputLeakageResult;
89
+ /**
90
+ * Set system prompt keywords for output monitoring (replaces the current list)
91
+ */
92
+ setSystemPromptKeywords(keywords: string[]): void;
93
+ /**
94
+ * Add custom extraction pattern (appended to the configured customPatterns)
95
+ */
96
+ addPattern(pattern: RegExp): void;
97
+ /**
98
+ * Update risk threshold; the compiled implementation clamps the value to the 0-100 range
99
+ */
100
+ setRiskThreshold(threshold: number): void;
101
+ private decodeLeetspeak; // lower-cases the input and replaces leet characters with letters
102
+ private decodeROT13; // rotates ASCII letters by 13 places
103
+ private checkDecodedContent; // re-runs the direct extraction patterns on decoded text
104
+ private checkUnicodeEvasion; // invisible characters, homoglyphs, fullwidth characters
105
+ private checkMorseCode; // looks for the known Morse keywords
106
+ private checkKeywordsInDecoded; // requires both an action word and a target word
107
+ private sanitizeOutput; // replaces prompt-like fragments with "[REDACTED]"
108
+ private generateRecommendations; // builds the advisory messages returned by check()
109
+ }
110
+ //# sourceMappingURL=prompt-leakage-guard.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompt-leakage-guard.d.ts","sourceRoot":"","sources":["../../src/guards/prompt-leakage-guard.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,MAAM,WAAW,wBAAwB;IACvC,yCAAyC;IACzC,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,qCAAqC;IACrC,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,sCAAsC;IACtC,YAAY,CAAC,EAAE,OAAO,CAAC;IACvB,0CAA0C;IAC1C,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,uCAAuC;IACvC,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,2CAA2C;IAC3C,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC,2CAA2C;IAC3C,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,0DAA0D;IAC1D,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,iDAAiD;IACjD,oBAAoB,CAAC,EAAE,MAAM,EAAE,CAAC;IAChC,uDAAuD;IACvD,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,mCAAmC;IACnC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,iCAAiC;IACjC,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;CAC3B;AAED,MAAM,WAAW,mBAAmB;IAClC,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE;QACR,yBAAyB,EAAE,OAAO,CAAC;QACnC,0BAA0B,EAAE,OAAO,CAAC;QACpC,2BAA2B,EAAE,OAAO,CAAC;QACrC,2BAA2B,EAAE,MAAM,EAAE,CAAC;QACtC,UAAU,EAAE,MAAM,CAAC;QACnB,eAAe,CAAC,EAAE,MAAM,CAAC;KAC1B,CAAC;IACF,eAAe,EAAE,MAAM,EAAE,CAAC;CAC3B;AAED,MAAM,WAAW,mBAAmB;IAClC,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE;QACR,cAAc,EAAE,MAAM,EAAE,CAAC;QACzB,gBAAgB,EAAE,MAAM,CAAC;QACzB,2BAA2B,EAAE,MAAM,EAAE,CAAC;KACvC,CAAC;IACF,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,MAAM,CAAqC;IAGnD,OAAO,CAAC,QAAQ,CAAC,0BAA0B,CAuCzC;IAGF,OAAO,CAAC,QAAQ,CAAC,4BAA4B,CAkB3C;IAGF,OAAO,CAAC,QAAQ,CAAC,aAAa,CAK5B;IAGF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAA8B;IAGxD,OAAO,CAAC,QAAQ,CAAC,cAAc,CAI7B;gBAEU,MAAM,GAAE,wBAA6B;IAyBjD;;OAEG;IACH,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,mBAAmB;IAsK7D;;OAEG;IACH,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,mBAAmB;IAuEpE;;OAEG;IACH,uBAAuB,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,IAAI;IAIjD;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,IAAI;IAIjC;;OAEG;IACH,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;IAMzC,OAAO,CAAC,eAAe
;IA2BvB,OAAO,CAAC,WAAW;IAOnB,OAAO,CAAC,mBAAmB;IAqB3B,OAAO,CAAC,mBAAmB;IA2C3B,OAAO,CAAC,cAAc;IA4BtB,OAAO,CAAC,sBAAsB;IAmC9B,OAAO,CAAC,cAAc;IAmBtB,OAAO,CAAC,uBAAuB;CA2BhC"}
@@ -0,0 +1,529 @@
1
+ "use strict";
2
+ /**
3
+ * PromptLeakageGuard (L15)
4
+ *
5
+ * Prevents system prompt extraction and leakage attacks.
6
+ * Detects various evasion techniques used to extract system prompts.
7
+ *
8
+ * Threat Model:
9
+ * - OWASP LLM07:2025 System Prompt Leakage
10
+ * - PLeak algorithmic extraction attacks
11
+ * - Remember-the-Start attacks
12
+ * - Evasion techniques (Leetspeak, ROT13, Base64, Morse)
13
+ *
14
+ * Protection Capabilities:
15
+ * - Direct extraction attempt detection
16
+ * - Encoded extraction detection (Leetspeak, ROT13, Morse, etc.)
17
+ * - Indirect extraction pattern detection
18
+ * - Output monitoring for prompt leakage
19
+ * - Prefix completion attack detection
20
+ */
21
+ Object.defineProperty(exports, "__esModule", { value: true });
22
+ exports.PromptLeakageGuard = void 0;
23
/**
 * Guards against system prompt extraction (input side) and system prompt
 * leakage (output side).
 *
 * `check()` scores an input against weighted extraction patterns, re-scanning
 * it after undoing common obfuscations (Leetspeak, ROT13, Base64, Unicode
 * lookalikes, Morse). `checkOutput()` scans model output for configured
 * keywords and generic prompt-like fragments.
 */
class PromptLeakageGuard {
    /**
     * @param {object} [config] Optional settings; every field has a default.
     *   Array settings are copied so later calls to `addPattern()` never
     *   mutate arrays owned by the caller.
     */
    constructor(config = {}) {
        const options = config ?? {};
        // Direct extraction patterns
        this.DIRECT_EXTRACTION_PATTERNS = [
            // Direct system prompt requests
            { name: "system_prompt_request", pattern: /what\s+(is|are)\s+your\s+(system|initial)\s+(prompt|instructions?)/i, weight: 40 },
            { name: "reveal_prompt", pattern: /reveal\s+(your\s+)?(system\s+)?prompt/i, weight: 45 },
            { name: "show_instructions", pattern: /show\s+(me\s+)?(your\s+)?instructions/i, weight: 40 },
            { name: "display_rules", pattern: /display\s+(your\s+)?(system\s+)?rules/i, weight: 40 },
            { name: "print_prompt", pattern: /print\s+(your\s+)?(system\s+)?(prompt|instructions|config)/i, weight: 45 },
            { name: "print_configuration", pattern: /print\s+(your\s+)?(initial\s+)?configuration/i, weight: 45 },
            // PLeak-style attacks
            { name: "repeat_above", pattern: /repeat\s+(the\s+)?(text|content|message)\s+(above|before)/i, weight: 50 },
            { name: "output_previous", pattern: /output\s+(all\s+)?(previous|prior)\s+(text|content|instructions)/i, weight: 50 },
            { name: "echo_everything", pattern: /echo\s+(everything|all)(\s+(above|before|prior|that|said))?/i, weight: 50 },
            { name: "echo_said", pattern: /echo\s+.*(said|was|before|that)/i, weight: 45 },
            // Remember-the-Start attacks
            { name: "complete_sentence", pattern: /complete\s+(this\s+|the\s+)?sentence/i, weight: 50 },
            { name: "complete_you_are", pattern: /complete[:\s]+"?'?you\s+are/i, weight: 55 },
            { name: "continue_from", pattern: /continue\s+(from|with)[:\s]/i, weight: 50 },
            { name: "continue_role", pattern: /continue[:\s]+"?'?(you|your)\s+(are|role|purpose)/i, weight: 55 },
            { name: "finish_phrase", pattern: /finish\s+(this\s+)?phrase/i, weight: 50 },
            { name: "finish_you", pattern: /finish[:\s]+"?'?you\s+(are|should|must|will)/i, weight: 55 },
            // Instruction extraction
            { name: "list_constraints", pattern: /list\s+(all\s+)?(your\s+)?constraints/i, weight: 35 },
            { name: "what_told", pattern: /what\s+(were\s+you|are\s+you)\s+told\s+to\s+do/i, weight: 40 },
            { name: "original_purpose", pattern: /what\s+(is|was)\s+your\s+original\s+purpose/i, weight: 35 },
            // Debug/Developer mode extraction
            { name: "debug_mode", pattern: /enter\s+(debug|developer|admin)\s+mode/i, weight: 45 },
            { name: "activate_developer", pattern: /activate\s+(developer|debug|admin)\s+mode/i, weight: 45 },
            { name: "developer_mode", pattern: /developer\s+mode/i, weight: 40 },
            { name: "show_config", pattern: /show\s+(your\s+)?(config|configuration|settings)/i, weight: 40 },
            { name: "dump_state", pattern: /dump\s+(your\s+)?(internal\s+)?state/i, weight: 45 },
            // Context window manipulation
            { name: "first_message", pattern: /what\s+was\s+(the\s+)?first\s+message/i, weight: 45 },
            { name: "beginning_context", pattern: /what('s|\s+is)\s+at\s+the\s+beginning\s+of\s+(your\s+)?context/i, weight: 50 },
            { name: "context_start", pattern: /show\s+(me\s+)?the\s+start\s+of\s+(your\s+)?context/i, weight: 50 },
        ];
        // Indirect extraction patterns
        this.INDIRECT_EXTRACTION_PATTERNS = [
            // Summarization tricks
            { name: "summarize_guidelines", pattern: /summarize\s+(your\s+)?(guidelines|rules|constraints)/i, weight: 35 },
            { name: "describe_limitations", pattern: /describe\s+(your\s+)?limitations/i, weight: 30 },
            // Role clarification exploits
            { name: "explain_role", pattern: /explain\s+(in\s+detail\s+)?(your\s+)?role/i, weight: 35 },
            { name: "explain_role_purpose", pattern: /explain\s+.*(role|purpose)/i, weight: 35 },
            { name: "what_cant_do", pattern: /what\s+(can't|cannot)\s+you\s+do/i, weight: 25 },
            { name: "describe_yourself", pattern: /describe\s+(yourself|your\s+purpose)/i, weight: 30 },
            // Hypothetical extraction
            { name: "if_asked", pattern: /if\s+i\s+asked\s+for\s+your\s+prompt/i, weight: 40 },
            { name: "hypothetically", pattern: /hypothetically[,\s]+(what|how)\s+would\s+your\s+instructions/i, weight: 40 },
            // Output-as-input tricks
            { name: "write_prompt", pattern: /write\s+a\s+(system\s+)?prompt\s+similar\s+to\s+yours/i, weight: 45 },
            { name: "recreate_instructions", pattern: /recreate\s+(your\s+)?instructions/i, weight: 45 },
        ];
        // Leetspeak character mappings
        this.LEETSPEAK_MAP = {
            "4": "a", "@": "a", "8": "b", "(": "c", "3": "e",
            "6": "g", "#": "h", "1": "i", "!": "i", "|": "l",
            "7": "t", "0": "o", "5": "s", "$": "s",
            "+": "t", "2": "z", "9": "g",
        };
        // ROT13 decode table, filled in below
        this.ROT13_MAP = {};
        // Common Morse code words related to prompts
        this.MORSE_KEYWORDS = [
            "... -.-- ... - . --", // SYSTEM
            ".--. .-. --- -- .--. -", // PROMPT
            ".. -. ... - .-. ..- -.-. - .. --- -. ...", // INSTRUCTIONS
        ];
        // Lower-case Cyrillic/Greek letters that render like Latin letters.
        // Unicode normalization alone never folds these to Latin, so the
        // homoglyph check needs an explicit table.
        this.HOMOGLYPH_MAP = {
            // Cyrillic
            "\u0430": "a", "\u0435": "e", "\u043E": "o", "\u0440": "p",
            "\u0441": "c", "\u0443": "y", "\u0445": "x", "\u043A": "k",
            "\u043C": "m", "\u0442": "t", "\u043D": "h", "\u0456": "i",
            "\u0458": "j", "\u0455": "s",
            // Greek
            "\u03B1": "a", "\u03B5": "e", "\u03B9": "i", "\u03BA": "k",
            "\u03BD": "v", "\u03BF": "o", "\u03C1": "p", "\u03C4": "t",
            "\u03C5": "u", "\u03C7": "x",
        };
        // A NaN threshold would make `riskScore >= threshold` always false and
        // silently disable blocking, so fall back to the default instead.
        const requestedThreshold = Number(options.riskThreshold ?? 40);
        this.config = {
            detectLeetspeak: options.detectLeetspeak ?? true,
            detectROT13: options.detectROT13 ?? true,
            detectBase64: options.detectBase64 ?? true,
            detectMorse: options.detectMorse ?? true,
            detectUnicode: options.detectUnicode ?? true,
            detectIndirectExtraction: options.detectIndirectExtraction ?? true,
            monitorOutput: options.monitorOutput ?? true,
            systemPromptHash: options.systemPromptHash ?? "",
            // Copy the arrays so the guard never mutates caller-owned data.
            systemPromptKeywords: [...(options.systemPromptKeywords ?? [])],
            similarityThreshold: options.similarityThreshold ?? 0.7,
            riskThreshold: Number.isNaN(requestedThreshold) ? 40 : requestedThreshold,
            customPatterns: [...(options.customPatterns ?? [])],
        };
        // Initialize ROT13 map
        for (let i = 0; i < 26; i++) {
            const rotated = (i + 13) % 26;
            this.ROT13_MAP[String.fromCharCode(97 + i)] = String.fromCharCode(97 + rotated);
            this.ROT13_MAP[String.fromCharCode(65 + i)] = String.fromCharCode(65 + rotated);
        }
    }
    /**
     * Check input for prompt extraction attempts.
     *
     * @param {string} input Text to inspect; non-string values are coerced
     *   to a string (null/undefined become the empty string).
     * @param {string} [requestId] Correlation id; defaults to `pl-<timestamp>`.
     * @returns {object} Verdict with `allowed`, `reason`, `violations`,
     *   `request_id`, `analysis` and `recommendations`. `allowed` is false
     *   when the capped risk score reaches `config.riskThreshold`.
     */
    check(input, requestId) {
        const text = typeof input === "string" ? input : String(input ?? "");
        const reqId = requestId || `pl-${Date.now()}`;
        const violations = [];
        const evasionTechniques = [];
        let riskScore = 0;
        let directAttempt = false;
        let encodedAttempt = false;
        let indirectAttempt = false;
        let decodedContent;
        // Shared bookkeeping for every evasion path. Each technique is listed
        // once even when it fires several times (e.g. multiple Base64 blobs).
        const recordEvasion = (technique, finding, decoded) => {
            violations.push(...finding.violations);
            riskScore += finding.riskContribution;
            if (!evasionTechniques.includes(technique)) {
                evasionTechniques.push(technique);
            }
            encodedAttempt = true;
            if (decoded !== undefined) {
                decodedContent = decoded;
            }
        };
        // Check direct extraction patterns
        for (const { name, pattern, weight } of this.DIRECT_EXTRACTION_PATTERNS) {
            if (this.testPattern(pattern, text)) {
                violations.push(`direct_extraction: ${name}`);
                riskScore += weight;
                directAttempt = true;
            }
        }
        // Check indirect extraction patterns
        if (this.config.detectIndirectExtraction) {
            for (const { name, pattern, weight } of this.INDIRECT_EXTRACTION_PATTERNS) {
                if (this.testPattern(pattern, text)) {
                    violations.push(`indirect_extraction: ${name}`);
                    riskScore += weight;
                    indirectAttempt = true;
                }
            }
        }
        // Leetspeak: only worth re-scanning when decoding changed something
        if (this.config.detectLeetspeak) {
            const decoded = this.decodeLeetspeak(text);
            if (decoded !== text.toLowerCase()) {
                const finding = this.scanDecoded(decoded, "leetspeak", 35);
                if (finding) {
                    recordEvasion("leetspeak", finding, decoded);
                }
            }
        }
        // ROT13
        if (this.config.detectROT13) {
            const decoded = this.decodeROT13(text);
            const finding = this.scanDecoded(decoded, "rot13", 40);
            if (finding) {
                recordEvasion("rot13", finding, decoded);
            }
        }
        // Base64 encoded content
        if (this.config.detectBase64) {
            const base64Matches = text.match(/[A-Za-z0-9+/]{16,}={0,2}/g) ?? [];
            for (const match of base64Matches) {
                let decoded;
                try {
                    decoded = Buffer.from(match, "base64").toString("utf-8");
                }
                catch {
                    // Best effort: an undecodable candidate is simply not Base64.
                    continue;
                }
                // Ignore blobs that do not decode to readable text
                if (!decoded || !/[\x20-\x7E]{4,}/.test(decoded)) {
                    continue;
                }
                const finding = this.scanDecoded(decoded, "base64", 45);
                if (finding) {
                    recordEvasion("base64", finding, decoded);
                }
            }
        }
        // Unicode evasion (homoglyphs, invisible chars)
        if (this.config.detectUnicode) {
            const unicodeCheck = this.checkUnicodeEvasion(text);
            if (unicodeCheck.detected) {
                recordEvasion("unicode", unicodeCheck);
            }
        }
        // Morse code
        if (this.config.detectMorse) {
            const morseCheck = this.checkMorseCode(text);
            if (morseCheck.detected) {
                recordEvasion("morse", morseCheck);
            }
        }
        // Custom patterns
        this.config.customPatterns.forEach((pattern, index) => {
            if (typeof pattern?.test === "function" && this.testPattern(pattern, text)) {
                violations.push(`custom_pattern_${index}`);
                riskScore += 30;
            }
        });
        // Normalize risk score
        riskScore = Math.min(100, riskScore);
        const blocked = riskScore >= this.config.riskThreshold;
        return {
            allowed: !blocked,
            reason: blocked
                ? `Prompt extraction attempt detected (risk: ${riskScore})`
                : "Input validated",
            violations,
            request_id: reqId,
            analysis: {
                direct_extraction_attempt: directAttempt,
                encoded_extraction_attempt: encodedAttempt,
                indirect_extraction_attempt: indirectAttempt,
                evasion_techniques_detected: evasionTechniques,
                risk_score: riskScore,
                decoded_content: decodedContent,
            },
            recommendations: this.generateRecommendations(violations, evasionTechniques),
        };
    }
    /**
     * Monitor output for potential prompt leakage.
     *
     * @param {string} output Model output to inspect; non-string values are
     *   coerced to a string.
     * @param {string} [requestId] Correlation id; defaults to `pl-out-<timestamp>`.
     * @returns {object} Result with `leaked`, `reason`, `violations`,
     *   `request_id`, `analysis` and, only when leaked, `sanitized_output`.
     */
    checkOutput(output, requestId) {
        const text = typeof output === "string" ? output : String(output ?? "");
        const reqId = requestId || `pl-out-${Date.now()}`;
        if (!this.config.monitorOutput) {
            return {
                leaked: false,
                reason: "Output monitoring disabled",
                violations: [],
                request_id: reqId,
                analysis: {
                    keywords_found: [],
                    similarity_score: 0,
                    potential_leakage_fragments: [],
                },
            };
        }
        const violations = [];
        const keywordsFound = [];
        const potentialFragments = [];
        // Check for system prompt keywords in output
        const lowerText = text.toLowerCase();
        for (const keyword of this.config.systemPromptKeywords) {
            // An empty keyword is a substring of every output; skip it
            // instead of reporting a leak for all responses.
            if (!this.isUsableKeyword(keyword)) {
                continue;
            }
            if (lowerText.includes(keyword.toLowerCase())) {
                keywordsFound.push(keyword);
                violations.push(`keyword_leaked: ${keyword}`);
            }
        }
        // Check for common prompt fragment patterns
        const promptFragmentPatterns = [
            /you\s+are\s+a[n]?\s+(helpful\s+)?assistant/i,
            /your\s+(role|purpose|goal)\s+is\s+to/i,
            /you\s+(must|should|will)\s+(always|never)/i,
            /do\s+not\s+(reveal|disclose|share)\s+(your|the)\s+(system|initial)/i,
            /\[system\]|\[instruction\]|<<sys>>|<\|system\|>/i,
            /as\s+an?\s+AI\s+(assistant|model|language\s+model)/i,
        ];
        for (const pattern of promptFragmentPatterns) {
            const match = text.match(pattern);
            if (match) {
                potentialFragments.push(match[0]);
                violations.push("prompt_fragment_detected");
            }
        }
        // No real similarity comparison is performed (the configured hash is
        // not consulted); fragment density serves as a rough heuristic.
        const similarityScore = potentialFragments.length / 10;
        const leaked = keywordsFound.length > 0 || potentialFragments.length >= 2;
        return {
            leaked,
            reason: leaked
                ? `Potential prompt leakage detected: ${violations.slice(0, 3).join(", ")}`
                : "Output appears safe",
            violations,
            request_id: reqId,
            analysis: {
                keywords_found: keywordsFound,
                similarity_score: Math.min(1, similarityScore),
                potential_leakage_fragments: potentialFragments,
            },
            sanitized_output: leaked ? this.sanitizeOutput(text) : undefined,
        };
    }
    /**
     * Set system prompt keywords for output monitoring.
     *
     * @param {string[]} keywords Replacement list; copied so later changes to
     *   the caller's array do not affect the guard.
     */
    setSystemPromptKeywords(keywords) {
        this.config.systemPromptKeywords = [...(keywords ?? [])];
    }
    /**
     * Add custom extraction pattern.
     *
     * @param {RegExp} pattern Pattern tested against the raw input of `check()`.
     */
    addPattern(pattern) {
        this.config.customPatterns.push(pattern);
    }
    /**
     * Update risk threshold.
     *
     * @param {number} threshold New threshold, clamped to 0-100. Values that
     *   are not numbers (NaN) are ignored, because a NaN threshold would
     *   disable blocking entirely.
     */
    setRiskThreshold(threshold) {
        const value = Number(threshold);
        if (Number.isNaN(value)) {
            return;
        }
        this.config.riskThreshold = Math.max(0, Math.min(100, value));
    }
    // Private methods
    /** True for a non-blank string keyword. */
    isUsableKeyword(keyword) {
        return typeof keyword === "string" && keyword.trim().length > 0;
    }
    /**
     * Run `pattern.test(text)` from the start of the text. Global and sticky
     * regexes remember `lastIndex` between calls, which would otherwise make
     * a reused pattern miss on alternate invocations.
     */
    testPattern(pattern, text) {
        pattern.lastIndex = 0;
        return pattern.test(text);
    }
    /**
     * Scan decoded text: first with the direct extraction patterns, then,
     * if none match, for an action + target keyword pair.
     *
     * @returns {{violations: string[], riskContribution: number} | null}
     *   The finding, or null when the decoded text looks harmless.
     */
    scanDecoded(decoded, technique, keywordRisk) {
        const patternCheck = this.checkDecodedContent(decoded, technique);
        if (patternCheck.detected) {
            return {
                violations: patternCheck.violations,
                riskContribution: patternCheck.riskContribution,
            };
        }
        const keywordCheck = this.checkKeywordsInDecoded(decoded);
        if (keywordCheck.detected) {
            return {
                violations: [`${technique}_keyword: ${keywordCheck.keywords.join(", ")}`],
                riskContribution: keywordRisk,
            };
        }
        return null;
    }
    /** Lower-case the input and replace Leetspeak characters with letters. */
    decodeLeetspeak(input) {
        let decoded = "";
        for (const ch of input.toLowerCase()) {
            decoded += this.LEETSPEAK_MAP[ch] ?? ch;
        }
        return decoded;
    }
    /** Rotate ASCII letters by 13 places; other characters pass through. */
    decodeROT13(input) {
        return input
            .split("")
            .map((char) => this.ROT13_MAP[char] || char)
            .join("");
    }
    /**
     * Re-run the direct extraction patterns on decoded text. Every match
     * contributes its weight plus 10 as an extra penalty for evasion.
     */
    checkDecodedContent(decoded, technique) {
        const violations = [];
        let riskContribution = 0;
        for (const { name, pattern, weight } of this.DIRECT_EXTRACTION_PATTERNS) {
            if (this.testPattern(pattern, decoded)) {
                violations.push(`${technique}_evasion: ${name}`);
                riskContribution += weight + 10;
            }
        }
        return {
            detected: violations.length > 0,
            violations,
            riskContribution,
        };
    }
    /**
     * Fold text to a Latin-only approximation: decompose, drop combining
     * marks, lower-case, then map Cyrillic/Greek lookalikes to Latin.
     */
    foldHomoglyphs(input) {
        const stripped = input
            .normalize("NFKD")
            .replace(/[\u0300-\u036f]/g, "")
            .toLowerCase();
        let folded = "";
        for (const ch of stripped) {
            folded += this.HOMOGLYPH_MAP[ch] ?? ch;
        }
        return folded;
    }
    /**
     * Detect Unicode-based evasion: invisible characters, Cyrillic/Greek
     * homoglyphs hiding an extraction request, and fullwidth characters.
     */
    checkUnicodeEvasion(input) {
        const violations = [];
        let riskContribution = 0;
        // More than three invisible characters is suspicious
        const invisibleChars = input.match(/[\u200B-\u200D\uFEFF\u2060-\u206F\u00AD]/g);
        if (invisibleChars && invisibleChars.length > 3) {
            violations.push("invisible_unicode_chars");
            riskContribution += 20;
        }
        // Homoglyphs (Cyrillic, Greek letters that look like Latin)
        if (/[\u0400-\u04FF\u0370-\u03FF]/.test(input)) {
            const folded = this.foldHomoglyphs(input);
            for (const { pattern, weight } of this.DIRECT_EXTRACTION_PATTERNS) {
                if (this.testPattern(pattern, folded)) {
                    violations.push("homoglyph_evasion");
                    // Same "weight + 10" evasion penalty as the other
                    // decoders, never below the original flat 30.
                    riskContribution += Math.max(30, weight + 10);
                    break;
                }
            }
        }
        // More than five fullwidth characters is suspicious
        const fullwidth = input.match(/[\uFF01-\uFF5E]/g);
        if (fullwidth && fullwidth.length > 5) {
            violations.push("fullwidth_chars");
            riskContribution += 15;
        }
        return {
            detected: violations.length > 0,
            violations,
            riskContribution,
        };
    }
    /** Detect the known prompt-related Morse keywords in Morse-like input. */
    checkMorseCode(input) {
        const violations = [];
        let riskContribution = 0;
        // Only search for keywords when the input looks like Morse at all
        const morsePattern = /[.\-]{2,}\s+[.\-]{2,}/;
        if (morsePattern.test(input) &&
            this.MORSE_KEYWORDS.some((keyword) => input.includes(keyword))) {
            violations.push("morse_code_evasion");
            riskContribution += 35;
        }
        return {
            detected: violations.length > 0,
            violations,
            riskContribution,
        };
    }
    /**
     * Look for an extraction action ("reveal", "show", ...) combined with an
     * extraction target ("prompt", "instructions", ...) in decoded text.
     * Both must be present for `detected` to be true.
     */
    checkKeywordsInDecoded(decoded) {
        // Action keywords (verbs that indicate extraction intent)
        const actionKeywords = ["reveal", "show", "display", "print", "output", "dump", "list", "give", "tell"];
        // Target keywords (what they're trying to extract)
        const targetKeywords = ["prompt", "instructions", "configuration", "config", "rules", "guidelines", "constraints", "system", "initial", "secret", "hidden", "internal"];
        const lowerDecoded = decoded.toLowerCase();
        const actionsFound = actionKeywords.filter((keyword) => lowerDecoded.includes(keyword));
        const targetsFound = targetKeywords.filter((keyword) => lowerDecoded.includes(keyword));
        return {
            detected: actionsFound.length > 0 && targetsFound.length > 0,
            keywords: [...actionsFound, ...targetsFound],
        };
    }
    /**
     * Redact prompt-like sentences, delimited system blocks and every
     * configured system prompt keyword from the output.
     */
    sanitizeOutput(output) {
        let sanitized = output;
        // Remove common prompt fragments
        const fragmentPatterns = [
            /you\s+are\s+a[n]?\s+(helpful\s+)?assistant[^.]*\./gi,
            /your\s+(role|purpose|goal)\s+is\s+to[^.]*\./gi,
            /you\s+(must|should|will)\s+(always|never)[^.]*\./gi,
            /\[system\][^[\]]*\[\/system\]/gi,
            /<<sys>>[^<]*<<\/sys>>/gi,
        ];
        for (const pattern of fragmentPatterns) {
            sanitized = sanitized.replace(pattern, "[REDACTED]");
        }
        // A keyword hit is what marks the output as leaked, so the keyword
        // itself must not survive sanitization. Escape it so that it is
        // matched literally.
        for (const keyword of this.config.systemPromptKeywords) {
            if (!this.isUsableKeyword(keyword)) {
                continue;
            }
            const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
            sanitized = sanitized.replace(new RegExp(escaped, "gi"), "[REDACTED]");
        }
        return sanitized;
    }
    /** Build the advisory messages returned by `check()`. */
    generateRecommendations(violations, evasionTechniques) {
        const recommendations = [];
        if (violations.some((v) => v.includes("direct_extraction"))) {
            recommendations.push("Direct prompt extraction attempt blocked");
        }
        if (violations.some((v) => v.includes("indirect_extraction"))) {
            recommendations.push("Consider strengthening indirect extraction detection");
        }
        if (evasionTechniques.length > 0) {
            recommendations.push(`Evasion techniques detected: ${evasionTechniques.join(", ")}`);
        }
        if (violations.some((v) => v.includes("unicode"))) {
            recommendations.push("Normalize input before processing");
        }
        if (recommendations.length === 0) {
            recommendations.push("Input validated successfully");
        }
        return recommendations;
    }
}
528
+ exports.PromptLeakageGuard = PromptLeakageGuard;
529
+ //# sourceMappingURL=prompt-leakage-guard.js.map