promptmenot 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +36 -0
- data/CHANGELOG.md +21 -0
- data/CONTRIBUTING.md +69 -0
- data/Gemfile +9 -0
- data/LICENSE.txt +21 -0
- data/README.md +127 -0
- data/Rakefile +12 -0
- data/agents.md +150 -0
- data/config/locales/en.yml +4 -0
- data/lib/generators/promptmenot/install_generator.rb +17 -0
- data/lib/generators/promptmenot/templates/promptmenot.rb +27 -0
- data/lib/promptmenot/configuration.rb +54 -0
- data/lib/promptmenot/detector.rb +67 -0
- data/lib/promptmenot/errors.rb +7 -0
- data/lib/promptmenot/match.rb +36 -0
- data/lib/promptmenot/pattern.rb +66 -0
- data/lib/promptmenot/pattern_registry.rb +53 -0
- data/lib/promptmenot/patterns/base.rb +36 -0
- data/lib/promptmenot/patterns/context_manipulation.rb +63 -0
- data/lib/promptmenot/patterns/delimiter_injection.rb +81 -0
- data/lib/promptmenot/patterns/direct_instruction_override.rb +95 -0
- data/lib/promptmenot/patterns/encoding_obfuscation.rb +79 -0
- data/lib/promptmenot/patterns/indirect_injection.rb +79 -0
- data/lib/promptmenot/patterns/role_manipulation.rb +79 -0
- data/lib/promptmenot/railtie.rb +13 -0
- data/lib/promptmenot/result.rb +41 -0
- data/lib/promptmenot/sanitizer.rb +50 -0
- data/lib/promptmenot/validator.rb +39 -0
- data/lib/promptmenot/version.rb +5 -0
- data/lib/promptmenot.rb +96 -0
- data/promptmenot.gemspec +34 -0
- metadata +108 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  # A single named detection rule: a regex plus the metadata that decides when
  # the rule is enabled (sensitivity) and how a hit is labelled (confidence).
  #
  # Two patterns are considered equal when they share name and category, so a
  # Set or Hash keeps only one rule per (name, category) pair.
  class Pattern
    # Order matters: #active_at? compares positions in this list. A detector
    # level enables every pattern whose own sensitivity sits at or before it.
    SENSITIVITY_LEVELS = %i[low medium high paranoid].freeze
    CONFIDENCE_LEVELS = %i[high medium low].freeze

    attr_reader :name, :category, :regex, :sensitivity, :confidence

    # @param name [#to_sym] rule identifier
    # @param category [#to_sym] group the rule belongs to
    # @param regex [Regexp] expression handed to String#scan by #match
    # @param sensitivity [#to_sym] one of SENSITIVITY_LEVELS
    # @param confidence [#to_sym] one of CONFIDENCE_LEVELS
    # @raise [PatternError] when sensitivity or confidence is not a known level,
    #   including values such as nil or integers that cannot become a Symbol
    def initialize(name:, category:, regex:, sensitivity: :medium, confidence: :medium)
      validate_sensitivity!(sensitivity)
      validate_confidence!(confidence)

      @name = name.to_sym
      @category = category.to_sym
      @regex = regex
      @sensitivity = sensitivity.to_sym
      @confidence = confidence.to_sym
    end

    # Whether this pattern is enabled when scanning at +level+.
    #
    # @param level [#to_sym, Object] requested sensitivity level
    # @return [Boolean] false for unknown levels and for values that cannot be
    #   turned into a Symbol (nil, numbers, ...)
    def active_at?(level)
      level_index = SENSITIVITY_LEVELS.index(coerce_level(level))
      pattern_index = SENSITIVITY_LEVELS.index(@sensitivity)
      return false unless level_index && pattern_index

      level_index >= pattern_index
    end

    # Collects every non-overlapping occurrence of the regex in +text+.
    #
    # @param text [#to_s] input to scan; nil is treated as an empty string
    # @return [Array<Match>] one Match per occurrence, carrying the matched
    #   substring and its exclusive character range within the text
    def match(text)
      matches = []
      text.to_s.scan(regex) do
        # scan yields captures when the regex has groups, so read the full
        # match (and its offsets) from the MatchData instead.
        match_data = Regexp.last_match
        matches << Match.new(
          pattern: self,
          matched_text: match_data[0],
          position: match_data.begin(0)...match_data.end(0)
        )
      end
      matches
    end

    # Equality ignores regex, sensitivity and confidence on purpose: identity
    # is the (name, category) pair.
    def ==(other)
      other.is_a?(Pattern) && name == other.name && category == other.category
    end

    alias eql? ==

    # Kept consistent with #== so patterns behave correctly as Hash/Set members.
    def hash
      [name, category].hash
    end

    private

    # Symbol form of +level+, or nil when the value has no #to_sym. Lets the
    # callers reject bad input cleanly instead of raising NoMethodError.
    def coerce_level(level)
      level.to_sym if level.respond_to?(:to_sym)
    end

    def validate_sensitivity!(level)
      return if SENSITIVITY_LEVELS.include?(coerce_level(level))

      raise PatternError, "Invalid sensitivity: #{level}. Must be one of: #{SENSITIVITY_LEVELS.join(", ")}"
    end

    def validate_confidence!(level)
      return if CONFIDENCE_LEVELS.include?(coerce_level(level))

      raise PatternError, "Invalid confidence: #{level}. Must be one of: #{CONFIDENCE_LEVELS.join(", ")}"
    end
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"
|
|
4
|
+
|
|
5
|
+
module Promptmenot
  # Collection of registered patterns with filtered views by sensitivity and
  # category. Storage is a Set, so adding a pattern that is eql? to an existing
  # member does not create a second entry.
  class PatternRegistry
    include Enumerable

    def initialize
      @patterns = Set.new
    end

    # Adds a single pattern to the registry.
    def register(pattern)
      @patterns << pattern
    end

    # Registers every element yielded by +patterns+, one at a time.
    def register_all(patterns)
      patterns.each do |entry|
        register(entry)
      end
    end

    # Enumerable hook: iterates the stored patterns.
    def each(&visitor)
      @patterns.each(&visitor)
    end

    # Number of distinct patterns currently stored.
    def size
      @patterns.size
    end

    # Patterns that report themselves active at the given sensitivity level.
    def for_sensitivity(level)
      @patterns.select do |candidate|
        candidate.active_at?(level)
      end
    end

    # Patterns whose category equals +category+ (compared as a Symbol).
    def for_category(category)
      @patterns.select do |candidate|
        candidate.category == category.to_sym
      end
    end

    # Patterns active at +sensitivity+, optionally narrowed to +categories+.
    # A nil/false +categories+ means "no category filter"; a single value or a
    # list is accepted otherwise.
    def for_sensitivity_and_categories(sensitivity, categories: nil)
      active = for_sensitivity(sensitivity)

      if categories
        wanted = Array(categories).map(&:to_sym)
        active.select { |candidate| wanted.include?(candidate.category) }
      else
        active
      end
    end

    # Distinct categories present in the registry.
    def categories
      seen = @patterns.map { |entry| entry.category }
      seen.uniq
    end

    # Removes every registered pattern.
    def clear
      @patterns.clear
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  module Patterns
    # Superclass for pattern groups. Subclasses call +register+ in their class
    # body; every registered rule is tagged with a category derived from the
    # subclass name.
    class Base
      # Gives each subclass its own empty rule list at definition time.
      def self.inherited(subclass)
        super
        subclass.instance_variable_set(:@patterns, [])
      end

      # Rules registered on this class (lazily initialised to an empty list).
      def self.patterns
        @patterns ||= []
      end

      # Builds a Pattern in this class's category and appends it to +patterns+.
      def self.register(name:, regex:, sensitivity: :medium, confidence: :medium)
        rule = Pattern.new(
          name: name,
          category: category_name,
          regex: regex,
          sensitivity: sensitivity,
          confidence: confidence
        )
        patterns << rule
      end

      # Category symbol: the last segment of the class name converted from
      # CamelCase to snake_case (acronym runs are split before their last
      # capital, e.g. "HTTPThing" -> "http_thing").
      def self.category_name
        short_name = name.split("::").last
        acronyms_split = short_name.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
        words_split = acronyms_split.gsub(/([a-z\d])([A-Z])/, '\1_\2')
        words_split.downcase.to_sym
      end
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  module Patterns
    # Rule set registered under this class's category. Each table row supplies
    # the keyword arguments for one Base.register call, in declaration order.
    class ContextManipulation < Base
      [
        { name: :reset_conversation, sensitivity: :low, confidence: :high,
          regex: /(?:={3,}|~{3,}|\*{3,})\s*(?:RESET|NEW\s+CONVERSATION|START\s+OVER|CLEAR\s+CONTEXT|END\s+SYSTEM)\s*(?:={3,}|~{3,}|\*{3,})/i },
        { name: :end_of_prompt, sensitivity: :medium, confidence: :medium,
          regex: /\b(?:end|close)\s+(?:of\s+)?(?:system\s+)?(?:prompt|instructions?|context|conversation|message)\b/i },
        { name: :above_is_test, sensitivity: :low, confidence: :high,
          regex: /\b(?:the\s+)?(?:above|previous|preceding)\s+(?:text\s+)?(?:is|was)\s+(?:just\s+)?(?:a\s+)?(?:test|example|fake|placeholder|dummy|decoy)\b/i },
        { name: :real_conversation_starts, sensitivity: :low, confidence: :high,
          regex: /\b(?:the\s+)?(?:real|actual|true)\s+(?:conversation|task|prompt|session|interaction)\s+(?:starts?|begins?)\s+(?:here|now|below)\b/i },
        { name: :context_window_exploit, sensitivity: :medium, confidence: :high,
          regex: /\b(?:context|token)\s+(?:window|limit|boundary)\s+(?:exceeded|overflow|exploit|bypass|trick)\b/i },
        { name: :system_prompt_leak, sensitivity: :medium, confidence: :high,
          regex: /\b(?:reveal|show|display|print|output|repeat|echo)\s+(?:me\s+)?(?:your\s+)?(?:system\s+)?(?:prompt|instructions?|rules?|guidelines?|directives?)\b/i },
        { name: :memory_injection, sensitivity: :high, confidence: :medium,
          regex: /\b(?:remember|memorize|store|save)\s+(?:that|this|the\s+following)\s*:?\s*(?:you\s+(?:are|must|should|will))\b/i },
        { name: :hypothetical_bypass, sensitivity: :high, confidence: :medium,
          regex: /\b(?:hypothetically|theoretically|in\s+theory|imagine\s+if)\s*,?\s*(?:you\s+)?(?:could|would|should|can)\s+(?:ignore|bypass|skip|override)\b/i }
      ].each { |rule| register(**rule) }
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  module Patterns
    # Rules for chat-template tokens, bracket/XML role tags and separator lines.
    # Each table row supplies the keyword arguments for one Base.register call,
    # in declaration order.
    class DelimiterInjection < Base
      # The two hash-mark rules are built with Regexp.new from single-quoted
      # strings so that "#{" is never read as interpolation.
      hash_rule_flags = Regexp::IGNORECASE | Regexp::MULTILINE

      [
        { name: :chatml_system, sensitivity: :low, confidence: :high,
          regex: /<\|(?:system|im_start|im_end|endoftext)\|>/i },
        { name: :system_tag, sensitivity: :low, confidence: :high,
          regex: %r{\[(?:SYSTEM|INST|/INST|SYS|/SYS)\]}i },
        { name: :xml_system_tags, sensitivity: :low, confidence: :high,
          regex: %r{</?(?:system|instructions?|prompt|context|assistant|user)\s*>}i },
        { name: :anthropic_delimiters, sensitivity: :high, confidence: :medium,
          regex: /\b(?:Human|Assistant|System)\s*:/i },
        { name: :triple_dash_separator, sensitivity: :medium, confidence: :high,
          regex: /^-{3,}\s*(?:system|instructions?|prompt|context)\s*-{3,}$/im },
        { name: :triple_hash_separator, sensitivity: :medium, confidence: :high,
          regex: Regexp.new('^\#{3,}\s*(?:system|instructions?|prompt|context|new\s+conversation)\s*\#{3,}$',
                            hash_rule_flags) },
        { name: :begin_end_block, sensitivity: :medium, confidence: :high,
          regex: /\b(?:BEGIN|START)\s*(?:SYSTEM|INSTRUCTIONS?|PROMPT|HIDDEN)\b/i },
        { name: :markdown_system_header, sensitivity: :medium, confidence: :high,
          regex: Regexp.new('^\#{1,}\s*(?:system\s+(?:prompt|message|instructions?)|hidden\s+instructions?)\s*$',
                            hash_rule_flags) },
        { name: :bracket_role, sensitivity: :medium, confidence: :high,
          regex: /\{\{(?:system|instructions?|prompt|context)\}\}/i },
        { name: :llama_tokens, sensitivity: :low, confidence: :high,
          regex: /<\|(?:begin_of_text|end_of_text|start_header_id|end_header_id|eot_id)\|>/i }
      ].each { |rule| register(**rule) }
    end
  end
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  module Patterns
    # Rules for phrases that tell the reader to drop or replace its
    # instructions. Each table row supplies the keyword arguments for one
    # Base.register call, in declaration order.
    class DirectInstructionOverride < Base
      [
        # Very specific injection phrases
        { name: :ignore_previous_instructions, sensitivity: :low, confidence: :high,
          regex: /\bignore\s+(all\s+)?(previous|prior|above|earlier|preceding)\s+(instructions|directives|rules|prompts|guidelines)\b/i },
        { name: :disregard_instructions, sensitivity: :low, confidence: :high,
          regex: /\bdisregard\s+(all\s+)?(previous|prior|above|earlier|your)\s+(instructions|directives|rules|prompts|guidelines)\b/i },
        { name: :forget_instructions, sensitivity: :low, confidence: :high,
          regex: /\bforget\s+(all\s+)?(your\s+)?(previous|prior|above|earlier)\s+(instructions|directives|rules|prompts|context)\b/i },
        { name: :override_instructions, sensitivity: :low, confidence: :high,
          regex: /\b(override|overwrite|replace)\s+(all\s+)?(previous|prior|your|system)\s+(instructions|directives|rules|prompts)\b/i },
        { name: :do_not_follow, sensitivity: :medium, confidence: :high,
          regex: /\bdo\s+not\s+follow\s+(any\s+)?(previous|prior|above|your|original)\s+(instructions|directives|rules)\b/i },

        # Contextual phrases
        { name: :new_instructions, sensitivity: :medium, confidence: :medium,
          regex: /\b(new|updated|revised|real)\s+instructions\s*:/i },
        { name: :instead_do, sensitivity: :medium, confidence: :medium,
          regex: /\binstead\s*,?\s+(you\s+)?(should|must|will|need\s+to)\s+(now\s+)?(do|follow|obey|respond|output)\b/i },
        { name: :stop_being, sensitivity: :medium, confidence: :high,
          regex: /\bstop\s+being\s+(a\s+)?(helpful|safe|responsible|ethical|cautious|careful)\b/i },
        { name: :from_now_on_imperative, sensitivity: :high, confidence: :medium,
          regex: /\bfrom\s+now\s+on\s*,?\s+you\s+(will|must|should|shall|need\s+to)\b/i },
        { name: :actual_task, sensitivity: :high, confidence: :medium,
          regex: /\b(the\s+)?(actual|real|true)\s+(task|instruction|objective|goal|purpose)\s+(is|was)\b/i },
        { name: :important_override, sensitivity: :medium, confidence: :high,
          regex: /\b(important|critical|urgent)\s*[:\-!]\s*(ignore|disregard|forget|override)\b/i },
        { name: :do_anything_now, sensitivity: :medium, confidence: :high,
          regex: /\bdo\s+anything\s+now\b/i }
      ].each { |rule| register(**rule) }
    end
  end
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  module Patterns
    # Rules for encoded payloads, escape-sequence runs, invisible characters
    # and similar obfuscation. Each table row supplies the keyword arguments
    # for one Base.register call, in declaration order.
    class EncodingObfuscation < Base
      [
        { name: :base64_payload, sensitivity: :medium, confidence: :high,
          regex: %r{\b(?:base64|decode|atob|decode64)\s*[:(]\s*["']?[A-Za-z0-9+/]{20,}={0,2}["']?\s*\)?}i },
        { name: :hex_escape_sequence, sensitivity: :medium, confidence: :medium,
          regex: /(?:\\x[0-9a-fA-F]{2}){4,}/ },
        { name: :unicode_escape_sequence, sensitivity: :medium, confidence: :medium,
          regex: /(?:\\u[0-9a-fA-F]{4}){4,}/ },
        { name: :zero_width_chars, sensitivity: :low, confidence: :high,
          regex: /[\u200B\u200C\u200D\u2060\uFEFF]{2,}/ },
        { name: :homoglyph_mixed_scripts, sensitivity: :paranoid, confidence: :low,
          regex: /[\u0400-\u04FF].*[a-zA-Z]|[a-zA-Z].*[\u0400-\u04FF]/ },
        { name: :rot13_reference, sensitivity: :medium, confidence: :medium,
          regex: /\brot13\s*[:(]/i },
        { name: :reverse_text_instruction, sensitivity: :high, confidence: :low,
          regex: /\b(?:reverse|backwards?|mirror)\s+(?:the\s+)?(?:text|string|message|output|this)\b/i },
        { name: :char_code_sequence, sensitivity: :medium, confidence: :high,
          regex: /(?:chr|char|fromCharCode)\s*\(\s*\d{2,3}\s*\)(?:\s*\+\s*(?:chr|char|fromCharCode)\s*\(\s*\d{2,3}\s*\)){2,}/i },
        { name: :invisible_unicode_abuse, sensitivity: :low, confidence: :high,
          regex: /[\u00AD\u034F\u17B4\u17B5\u180E\u200E\u200F\u202A-\u202E\u2061-\u2064]{2,}/ },
        { name: :leetspeak_injection, sensitivity: :high, confidence: :medium,
          regex: /\b[1!]gn[0o]r[3e]\s+[1!]n[5s]truct[1!][0o]n[5s]\b/i }
      ].each { |rule| register(**rule) }
    end
  end
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  module Patterns
    # Rules for content addressed to an AI/LLM/crawler that may read the text
    # later. Each table row supplies the keyword arguments for one
    # Base.register call, in declaration order.
    class IndirectInjection < Base
      [
        { name: :dear_ai, sensitivity: :medium, confidence: :high,
          regex: /\b(?:dear|attention|hey|hello|hi)\s+(?:ai|llm|language\s+model|chatbot|assistant|gpt|claude|gemini|copilot)\b/i },
        { name: :when_ai_reads, sensitivity: :low, confidence: :high,
          regex: /\bwhen\s+(?:an?\s+)?(?:ai|llm|language\s+model|chatbot|assistant)\s+(?:reads?|processes?|sees?|encounters?|parses?)\s+this\b/i },
        { name: :if_you_are_ai, sensitivity: :low, confidence: :high,
          regex: /\bif\s+you\s+(?:are\s+)?(?:an?\s+)?(?:ai|llm|language\s+model|chatbot|assistant|bot|gpt|claude)\b/i },
        { name: :note_to_ai, sensitivity: :low, confidence: :high,
          regex: /\b(?:note|message|instruction|command)\s+(?:to|for)\s+(?:any\s+)?(?:ai|llm|language\s+model|chatbot|assistant|crawler|scraper|bot)\b/i },
        { name: :ai_should, sensitivity: :medium, confidence: :high,
          regex: /\b(?:any\s+)?(?:ai|llm|language\s+model|chatbot|assistant)\s+(?:that|which|who)\s+(?:reads?|sees?|processes?)\s+this\s+(?:should|must|will|needs?\s+to)\b/i },
        { name: :hidden_instruction_marker, sensitivity: :low, confidence: :high,
          regex: /\b(?:hidden|invisible|secret|embedded)\s+(?:instruction|prompt|command|directive|message)\s*(?:for|to)\b/i },
        { name: :ignore_profile_content, sensitivity: :medium, confidence: :medium,
          regex: /\bignore\s+(?:the\s+)?(?:rest\s+of\s+)?(?:this\s+)?(?:profile|bio|description|page|text|content)\b/i },
        { name: :output_manipulation, sensitivity: :high, confidence: :medium,
          regex: /\b(?:always|instead)\s+(?:respond|reply|answer|output|say|return)\s+(?:with|that|the\s+following)\b/i },
        { name: :scraping_aware, sensitivity: :high, confidence: :medium,
          regex: /\b(?:web\s+)?(?:scraper|crawler|spider|indexer|harvester)s?\s+(?:should|must|will|please)\b/i },
        { name: :data_exfiltration, sensitivity: :medium, confidence: :high,
          regex: /\b(?:send|transmit|exfiltrate|forward|leak|share)\s+(?:all\s+)?(?:data|information|context|conversation|history|messages?)\s+(?:to|at|via)\b/i }
      ].each { |rule| register(**rule) }
    end
  end
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  module Patterns
    # Rules for jailbreak terminology and requests to adopt an unrestricted
    # role or mode. Each table row supplies the keyword arguments for one
    # Base.register call, in declaration order.
    class RoleManipulation < Base
      [
        { name: :dan_jailbreak, sensitivity: :low, confidence: :high,
          regex: /\b(you\s+are\s+)?D\.?A\.?N\.?\b.*\b(do\s+anything|no\s+(restrictions?|limitations?|rules?))\b/i },
        { name: :jailbreak_keyword, sensitivity: :low, confidence: :high,
          regex: /\bjailbreak\s*(mode|prompt|enabled|activated|engaged)\b/i },
        { name: :developer_mode, sensitivity: :low, confidence: :high,
          regex: /\b(developer|dev|maintenance|debug|god|admin)\s+mode\s*(enabled|activated|engaged|on|override)\b/i },
        { name: :act_as_evil, sensitivity: :low, confidence: :high,
          regex: /\b(act|behave|function|operate|respond)\s+(as|like)\s+(an?\s+)?(unrestricted|unfiltered|uncensored|evil|malicious|unethical|amoral)\b/i },
        { name: :pretend_no_restrictions, sensitivity: :medium, confidence: :high,
          regex: /\bpretend\s+(that\s+)?(you\s+)?(have\s+)?(no|don'?t\s+have\s+any)\s+(restrictions?|limitations?|rules?|filters?|guidelines?)\b/i },
        { name: :you_are_now_ai, sensitivity: :medium, confidence: :high,
          regex: /\byou\s+are\s+now\s+(an?\s+)?(\w+\s+)?(new|different|unrestricted|unfiltered|special|custom)\s+(\w+\s+)?(ai|assistant|bot|model|system|chatbot)\b/i },
        { name: :sudo_mode, sensitivity: :medium, confidence: :medium,
          regex: /\bsudo\s+(mode|access|override|command|prompt)\b/i },
        { name: :without_restrictions, sensitivity: :medium, confidence: :high,
          regex: /\b(respond|answer|reply|act|behave|write)\s+(without|with\s+no)\s+(any\s+)?(restrictions?|limitations?|filters?|censorship|safeguards?|guardrails?)\b/i },
        { name: :roleplay_unrestricted, sensitivity: :high, confidence: :medium,
          regex: /\b(roleplay|role\s*-?\s*play|pretend|simulate)\b.*\b(no\s+(rules?|limits?|restrictions?)|unrestricted|anything\s+goes)\b/i },
        { name: :persona_switch, sensitivity: :paranoid, confidence: :low,
          regex: /\b(switch|change|adopt|assume)\s+(to|into|a)\s+(new\s+)?(persona|personality|character|identity|role)\s+(that|which|where|with)\b/i }
      ].each { |rule| register(**rule) }
    end
  end
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails/railtie"
|
|
4
|
+
|
|
5
|
+
module Promptmenot
  # Rails hook: once I18n is loaded, appends every YAML file found under the
  # gem's config/locales directory (searched recursively) to I18n.load_path.
  class Railtie < Rails::Railtie
    initializer "promptmenot.i18n" do
      ActiveSupport.on_load(:i18n) do
        locale_glob = File.join(Promptmenot.root, "config", "locales", "**", "*.yml")
        I18n.load_path += Dir.glob(locale_glob)
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Promptmenot
  # Outcome of scanning one piece of text: the text itself plus the matches
  # found in it, with convenience queries over those matches.
  class Result
    attr_reader :text, :matches

    # @param text [Object] the scanned input, stored as given
    # @param matches [Array, nil] match objects; each is expected to respond to
    #   #category, #pattern_name and #confidence. nil is treated as "no matches".
    def initialize(text:, matches: [])
      @text = text
      # Freeze a private copy: freezing the argument itself would make the
      # caller's array immutable as a side effect, and later changes to the
      # caller's array must not alter this result.
      @matches = matches.to_a.dup.freeze
    end

    # @return [Boolean] true when nothing matched
    def safe?
      matches.empty?
    end

    # @return [Boolean] true when at least one pattern matched
    def unsafe?
      !safe?
    end

    # @return [Array] distinct categories of the matches, in first-seen order
    def categories_detected
      matches.map(&:category).uniq
    end

    # @return [Array] distinct pattern names of the matches, in first-seen order
    def patterns_detected
      matches.map(&:pattern_name).uniq
    end

    # @return [Array] the matches whose confidence is exactly :high
    def high_confidence_matches
      matches.select { |m| m.confidence == :high }
    end

    # One-sentence, human-readable description of the result. Category names
    # are shown with underscores replaced by spaces.
    #
    # @return [String]
    def summary
      return "No prompt injection detected." if safe?

      count = matches.size
      cats = categories_detected.map { |c| c.to_s.tr("_", " ") }.join(", ")
      "Detected #{count} potential prompt injection pattern#{"s" if count > 1} " \
        "in categories: #{cats}."
    end
  end
end
|