sentinel_rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +10 -0
- data/.rubocop_todo.yml +72 -0
- data/.sentinel-test.yml +20 -0
- data/.sentinel.yml +29 -0
- data/.sentinel.yml.example +74 -0
- data/AGENTS.md +87 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +226 -0
- data/Rakefile +12 -0
- data/docs/architecture.md +130 -0
- data/docs/development.md +376 -0
- data/docs/usage.md +238 -0
- data/exe/sentinel_rb +6 -0
- data/lib/sentinel_rb/analyzer.rb +140 -0
- data/lib/sentinel_rb/analyzers/base.rb +53 -0
- data/lib/sentinel_rb/analyzers/base_model_usage.rb +188 -0
- data/lib/sentinel_rb/analyzers/dangerous_tools.rb +283 -0
- data/lib/sentinel_rb/analyzers/few_shot_bias.rb +75 -0
- data/lib/sentinel_rb/analyzers/irrelevant_info.rb +164 -0
- data/lib/sentinel_rb/analyzers/misinformation.rb +220 -0
- data/lib/sentinel_rb/cli.rb +151 -0
- data/lib/sentinel_rb/client/base.rb +34 -0
- data/lib/sentinel_rb/client/mock.rb +167 -0
- data/lib/sentinel_rb/client/openai.rb +167 -0
- data/lib/sentinel_rb/client.rb +25 -0
- data/lib/sentinel_rb/config.rb +64 -0
- data/lib/sentinel_rb/report.rb +224 -0
- data/lib/sentinel_rb/version.rb +5 -0
- data/lib/sentinel_rb.rb +39 -0
- data/sig/sentinel_rb.rbs +4 -0
- data/test_prompts/a2_bad_prompt.md +5 -0
- data/test_prompts/a2_good_prompt.md +9 -0
- data/test_prompts/a3_bad_prompt.md +19 -0
- data/test_prompts/a3_good_prompt.md +15 -0
- data/test_prompts/a4_bad_prompt.md +13 -0
- data/test_prompts/a4_good_prompt.md +11 -0
- data/test_prompts/a5_bad_prompt.md +13 -0
- data/test_prompts/a5_good_prompt.md +14 -0
- data/test_prompts/bad_prompt.md +15 -0
- data/test_prompts/comprehensive_good_prompt.md +11 -0
- data/test_prompts/good_prompt.md +9 -0
- data/test_prompts/multi_bad_prompt.md +11 -0
- data/test_prompts/very_bad_prompt.md +7 -0
- metadata +149 -0
data/docs/usage.md
ADDED
@@ -0,0 +1,238 @@
|
|
1
|
+
# SentinelRb Usage Guide
|
2
|
+
|
3
|
+
## Quick Start
|
4
|
+
|
5
|
+
### Installation
|
6
|
+
```bash
|
7
|
+
gem install sentinel_rb
|
8
|
+
```
|
9
|
+
|
10
|
+
### Basic Usage
|
11
|
+
```bash
|
12
|
+
sentinel_rb --glob "prompts/**/*.{md,json}" --config .sentinel.yml
|
13
|
+
```
|
14
|
+
|
15
|
+
## Configuration
|
16
|
+
|
17
|
+
### Creating Configuration File
|
18
|
+
Create `.sentinel.yml` in your project root:
|
19
|
+
|
20
|
+
```yaml
|
21
|
+
# LLM Provider Configuration
|
22
|
+
provider: openai
|
23
|
+
model: gpt-4o-mini
|
24
|
+
api_key_env: OPENAI_API_KEY
|
25
|
+
|
26
|
+
# Analysis Thresholds
|
27
|
+
relevance_threshold: 0.55
|
28
|
+
divergence_threshold: 0.25
|
29
|
+
|
30
|
+
# Security Settings
|
31
|
+
dangerous_tools:
|
32
|
+
- delete_file
|
33
|
+
- transfer_funds
|
34
|
+
- system_shutdown
|
35
|
+
- exec_command
|
36
|
+
|
37
|
+
# File Processing
|
38
|
+
skip_patterns:
|
39
|
+
- "**/.git/**"
|
40
|
+
- "**/node_modules/**"
|
41
|
+
- "**/tmp/**"
|
42
|
+
|
43
|
+
# Output Settings
|
44
|
+
output_format: table # table, json, detailed
|
45
|
+
log_level: warn # debug, info, warn, error
|
46
|
+
```
|
47
|
+
|
48
|
+
### Environment Variables
|
49
|
+
```bash
|
50
|
+
export OPENAI_API_KEY="your-api-key-here"
|
51
|
+
export ANTHROPIC_API_KEY="your-anthropic-key" # if using Anthropic
|
52
|
+
```
|
53
|
+
|
54
|
+
## Command Line Interface
|
55
|
+
|
56
|
+
### Basic Commands
|
57
|
+
```bash
|
58
|
+
# Analyze specific files
|
59
|
+
sentinel_rb --files prompt1.md prompt2.json
|
60
|
+
|
61
|
+
# Use glob patterns
|
62
|
+
sentinel_rb --glob "prompts/**/*.md"
|
63
|
+
|
64
|
+
# Specify configuration
|
65
|
+
sentinel_rb --config custom-config.yml --glob "**/*.prompt"
|
66
|
+
|
67
|
+
# Output formats
|
68
|
+
sentinel_rb --format json --output results.json
|
69
|
+
sentinel_rb --format table
|
70
|
+
sentinel_rb --format detailed --output report.txt
|
71
|
+
```
|
72
|
+
|
73
|
+
### Advanced Options
|
74
|
+
```bash
|
75
|
+
# Parallel processing
|
76
|
+
sentinel_rb --workers 4 --glob "**/*.md"
|
77
|
+
|
78
|
+
# Skip specific analyzers
|
79
|
+
sentinel_rb --skip A1,A3 --glob "**/*.md"
|
80
|
+
|
81
|
+
# Run only specific analyzers
|
82
|
+
sentinel_rb --only A2,A4 --glob "**/*.md"
|
83
|
+
|
84
|
+
# Verbose output
|
85
|
+
sentinel_rb --verbose --glob "**/*.md"
|
86
|
+
```
|
87
|
+
|
88
|
+
## Integration Examples
|
89
|
+
|
90
|
+
### GitHub Actions
|
91
|
+
```yaml
|
92
|
+
name: Sentinel Prompt QA
|
93
|
+
on: [pull_request]
|
94
|
+
|
95
|
+
jobs:
|
96
|
+
prompt-check:
|
97
|
+
runs-on: ubuntu-latest
|
98
|
+
steps:
|
99
|
+
- uses: actions/checkout@v4
|
100
|
+
- uses: ruby/setup-ruby@v1
|
101
|
+
with:
|
102
|
+
ruby-version: 3.3
|
103
|
+
- run: gem install sentinel_rb
|
104
|
+
- name: Run Sentinel
|
105
|
+
env:
|
106
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
107
|
+
run: |
|
108
|
+
sentinel_rb --glob "prompts/**/*" --config .sentinel.yml --format table
|
109
|
+
```
|
110
|
+
|
111
|
+
### Pre-commit Hook
|
112
|
+
```bash
|
113
|
+
#!/bin/sh
|
114
|
+
# .git/hooks/pre-commit
|
115
|
+
sentinel_rb --glob "prompts/**/*.{md,json}" --config .sentinel.yml --format table
|
116
|
+
exit_code=$?
|
117
|
+
if [ $exit_code -ne 0 ]; then
|
118
|
+
echo "Sentinel found issues in prompts. Please review and fix."
|
119
|
+
exit 1
|
120
|
+
fi
|
121
|
+
```
|
122
|
+
|
123
|
+
### Ruby Integration
|
124
|
+
```ruby
|
125
|
+
require 'sentinel_rb'
|
126
|
+
|
127
|
+
# Programmatic usage
|
128
|
+
config = SentinelRb::Config.load('.sentinel.yml')
|
129
|
+
analyzer = SentinelRb::Analyzer.new(config)
|
130
|
+
|
131
|
+
results = analyzer.analyze_file('path/to/prompt.md')
|
132
|
+
results[:findings].each do |finding|
|
133
|
+
puts "#{finding[:level]}: #{finding[:message]}"
|
134
|
+
end
|
135
|
+
```
|
136
|
+
|
137
|
+
## Analyzer-Specific Usage
|
138
|
+
|
139
|
+
### A1: Irrelevant Information
|
140
|
+
- **Purpose**: Detect noisy or off-topic content
|
141
|
+
- **Tuning**: Adjust `relevance_threshold` (0.0-1.0)
|
142
|
+
- **Example**: Flag prompts with marketing copy mixed with technical instructions
|
143
|
+
|
144
|
+
### A2: Misinformation Detection
|
145
|
+
- **Purpose**: Verify factual accuracy
|
146
|
+
- **Configuration**: Enable fact-checking API or RAG database
|
147
|
+
- **Example**: Detect outdated API documentation or incorrect technical claims
|
148
|
+
|
149
|
+
### A3: Few-shot Bias Order
|
150
|
+
- **Purpose**: Detect ordering bias in examples
|
151
|
+
- **Configuration**: Set `divergence_threshold` for KL divergence
|
152
|
+
- **Example**: Flag when examples always follow a positive-negative-positive pattern
|
153
|
+
|
154
|
+
### A4: Base Model Usage
|
155
|
+
- **Purpose**: Prevent base model usage in production
|
156
|
+
- **Configuration**: Automatically detects '-base' in model names
|
157
|
+
- **Example**: Flag `gpt-4-base` usage instead of `gpt-4`
|
158
|
+
|
159
|
+
### A5: Dangerous Tool Execution
|
160
|
+
- **Purpose**: Prevent auto-execution of dangerous tools
|
161
|
+
- **Configuration**: Customize `dangerous_tools` list
|
162
|
+
- **Example**: Flag tools that can delete files or transfer money
|
163
|
+
|
164
|
+
## Troubleshooting
|
165
|
+
|
166
|
+
### Common Issues
|
167
|
+
|
168
|
+
#### API Key Issues
|
169
|
+
```bash
|
170
|
+
# Check environment variable
|
171
|
+
echo $OPENAI_API_KEY
|
172
|
+
|
173
|
+
# Test API connectivity
|
174
|
+
sentinel_rb --test-connection
|
175
|
+
```
|
176
|
+
|
177
|
+
#### File Permission Issues
|
178
|
+
```bash
|
179
|
+
# Check file permissions
|
180
|
+
ls -la prompts/
|
181
|
+
|
182
|
+
# Fix permissions
|
183
|
+
chmod 644 prompts/*.md
|
184
|
+
```
|
185
|
+
|
186
|
+
#### Configuration Issues
|
187
|
+
```bash
|
188
|
+
# Validate configuration
|
189
|
+
sentinel_rb --validate-config .sentinel.yml
|
190
|
+
|
191
|
+
# Use default configuration
|
192
|
+
sentinel_rb --no-config --glob "**/*.md"
|
193
|
+
```
|
194
|
+
|
195
|
+
### Performance Tuning
|
196
|
+
|
197
|
+
#### For Large Prompt Sets
|
198
|
+
```yaml
|
199
|
+
# .sentinel.yml
|
200
|
+
parallel_workers: 8
|
201
|
+
batch_size: 10
|
202
|
+
cache_responses: true
|
203
|
+
rate_limit: 100 # requests per minute
|
204
|
+
```
|
205
|
+
|
206
|
+
#### For CI/CD Optimization
|
207
|
+
```yaml
|
208
|
+
# Faster CI configuration
|
209
|
+
provider: openai
|
210
|
+
model: gpt-3.5-turbo # Faster, cheaper model
|
211
|
+
cache_responses: true
|
212
|
+
skip_patterns:
|
213
|
+
- "**/test/**"
|
214
|
+
- "**/examples/**"
|
215
|
+
```
|
216
|
+
|
217
|
+
## Best Practices
|
218
|
+
|
219
|
+
### Prompt Organization
|
220
|
+
```
|
221
|
+
prompts/
|
222
|
+
├── system/ # System prompts
|
223
|
+
├── user/ # User interaction prompts
|
224
|
+
├── examples/ # Few-shot examples
|
225
|
+
└── templates/ # Reusable templates
|
226
|
+
```
|
227
|
+
|
228
|
+
### Team Workflow
|
229
|
+
1. **Development**: Use lenient thresholds for exploration
|
230
|
+
2. **Staging**: Apply production thresholds for validation
|
231
|
+
3. **Production**: Strict validation with CI gate checks
|
232
|
+
4. **Review**: Regular threshold adjustment based on findings
|
233
|
+
|
234
|
+
### Configuration Management
|
235
|
+
- Use different configs for different environments
|
236
|
+
- Version control your `.sentinel.yml`
|
237
|
+
- Document threshold choices and reasoning
|
238
|
+
- Regularly review and update the dangerous tools list
|
data/exe/sentinel_rb
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "config"
|
4
|
+
require_relative "client"
|
5
|
+
require_relative "analyzers/irrelevant_info"
|
6
|
+
require_relative "analyzers/misinformation"
|
7
|
+
require_relative "analyzers/few_shot_bias"
|
8
|
+
require_relative "analyzers/base_model_usage"
|
9
|
+
require_relative "analyzers/dangerous_tools"
|
10
|
+
|
11
|
+
module SentinelRb
  # Main analyzer engine that coordinates prompt analysis.
  #
  # Holds a configuration object and an LLM client, instantiates each
  # registered analyzer class against a prompt, and aggregates the findings
  # they return.
  class Analyzer
    # Registry mapping analyzer IDs to the classes that implement them.
    ANALYZERS = {
      "A1" => Analyzers::IrrelevantInfo,
      "A2" => Analyzers::Misinformation,
      "A3" => Analyzers::FewShotBias,
      "A4" => Analyzers::BaseModelUsage,
      "A5" => Analyzers::DangerousTools
    }.freeze

    # fnmatch flags used for skip patterns: `*` does not cross "/", `**/`
    # recurses into directories, and dotfiles/dot-directories are matched.
    SKIP_MATCH_FLAGS = File::FNM_PATHNAME | File::FNM_DOTMATCH
    private_constant :SKIP_MATCH_FLAGS

    # @param config [Object, nil] configuration responding to #[];
    #   Config.load is used when nil
    def initialize(config = nil)
      @config = config || Config.load
      @client = Client::Factory.create(@config)
    end

    # Analyze a single prompt string
    # @param prompt [String] The prompt text to analyze
    # @param analyzer_ids [Array<String>] Specific analyzers to run (default: all)
    # @return [Array<Hash>] Array of findings
    # @raise [ArgumentError] if an analyzer ID is not registered
    def analyze_prompt(prompt, analyzer_ids: nil)
      return [] if prompt.nil? || prompt.strip.empty?

      analyzers_to_run = determine_analyzers(analyzer_ids)
      findings = []

      analyzers_to_run.each do |analyzer_class|
        analyzer = analyzer_class.new(prompt, @config, @client)
        findings.concat(analyzer.call)
      rescue StandardError => e
        # One failing analyzer must not abort the rest: record the failure as
        # a finding and continue with the next analyzer.
        findings << {
          id: "ERROR",
          level: :error,
          message: "Analyzer #{analyzer_class.name} failed: #{e.message}",
          # backtrace is nil for exceptions that were never raised
          details: { error_class: e.class.name, backtrace: e.backtrace&.first(3) }
        }
      end

      findings
    end

    # Analyze a file containing prompt content
    # @param file_path [String] Path to the file to analyze
    # @param analyzer_ids [Array<String>] Specific analyzers to run (default: all)
    # @return [Hash] Analysis results. On success: :file, :size, :findings,
    #   :analyzed_at. On failure: :file, :error and an empty :findings array.
    def analyze_file(file_path, analyzer_ids: nil)
      unless File.exist?(file_path)
        return {
          file: file_path,
          error: "File not found",
          findings: []
        }
      end

      begin
        content = File.read(file_path, encoding: "UTF-8")
        # Diagnostics go to stderr so they cannot corrupt report output that
        # is written to stdout.
        warn "Debug: File size: #{content.length}, Content preview: #{content[0..100].inspect}" if ENV["DEBUG"]
        findings = analyze_prompt(content, analyzer_ids: analyzer_ids)

        {
          file: file_path,
          size: content.length,
          findings: findings,
          analyzed_at: Time.now
        }
      rescue StandardError => e
        {
          file: file_path,
          error: "Failed to read or analyze file: #{e.message}",
          findings: []
        }
      end
    end

    # Analyze multiple files matching a glob pattern
    # @param pattern [String] Glob pattern for files to analyze
    # @param analyzer_ids [Array<String>] Specific analyzers to run (default: all)
    # @return [Array<Hash>] Array of file analysis results
    def analyze_files(pattern, analyzer_ids: nil)
      files = Dir.glob(pattern).reject { |f| should_skip_file?(f) }

      files.map do |file|
        analyze_file(file, analyzer_ids: analyzer_ids)
      end
    end

    # Get summary statistics for analysis results
    #
    # Note: results that carry an :error have an empty :findings array, so
    # they are counted as files without issues in :pass_rate.
    #
    # @param results [Array<Hash>] Results from analyze_files
    # @return [Hash] Summary with :total_files, :files_with_issues,
    #   :total_findings, :findings_by_level and :pass_rate (percentage,
    #   100.0 when there are no files)
    def summarize_results(results)
      total_files = results.length
      files_with_issues = results.count { |r| r[:findings]&.any? }
      total_findings = results.sum { |r| r[:findings]&.length || 0 }

      findings_by_level = results
                          .flat_map { |r| r[:findings] || [] }
                          .group_by { |f| f[:level] }
                          .transform_values(&:count)

      {
        total_files: total_files,
        files_with_issues: files_with_issues,
        total_findings: total_findings,
        findings_by_level: findings_by_level,
        pass_rate: total_files.positive? ? ((total_files - files_with_issues).to_f / total_files * 100).round(1) : 100.0
      }
    end

    private

    # Resolve analyzer IDs to analyzer classes.
    # IDs are matched case-insensitively ("a1" resolves like "A1").
    # @param analyzer_ids [Array<String>, nil] IDs to resolve; nil or empty selects all
    # @return [Array<Class>] analyzer classes to run
    # @raise [ArgumentError] if an ID is not present in ANALYZERS
    def determine_analyzers(analyzer_ids)
      return ANALYZERS.values if analyzer_ids.nil? || analyzer_ids.empty?

      analyzer_ids.map do |id|
        ANALYZERS.fetch(id.to_s.strip.upcase) { raise ArgumentError, "Unknown analyzer: #{id}" }
      end
    end

    # Whether a path matches any configured "skip_patterns" glob.
    # @param file_path [String] path as returned by Dir.glob
    # @return [Boolean]
    def should_skip_file?(file_path)
      skip_patterns = @config["skip_patterns"] || []

      skip_patterns.any? do |pattern|
        skip_pattern_variants(pattern).any? do |glob|
          File.fnmatch(glob, file_path, SKIP_MATCH_FLAGS)
        end
      end
    end

    # Globs to try for one skip pattern.
    #
    # Under FNM_PATHNAME a trailing "**" is treated like "*" and cannot cross
    # a "/", so "**/tmp/**" alone would match "tmp/a.md" but not
    # "tmp/sub/a.md". Adding "<pattern>/*" (i.e. "**/tmp/**/*") makes the
    # pattern cover files at any depth below the directory.
    # @param pattern [String] configured skip pattern
    # @return [Array<String>] the pattern plus its recursive variant, if any
    def skip_pattern_variants(pattern)
      glob = pattern.to_s
      glob.end_with?("/**") ? [glob, "#{glob}/*"] : [glob]
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SentinelRb
  module Analyzers
    # Base class for all prompt analyzers.
    #
    # Stores the prompt, configuration and client passed to the constructor
    # and provides helpers for building findings and comparing scores against
    # thresholds. Subclasses implement #call.
    class Base
      attr_reader :prompt, :config, :client

      # @param prompt [String] The prompt text to analyze
      # @param config [Object] Configuration object handed to the analyzer
      # @param client [Object] Client object handed to the analyzer
      def initialize(prompt, config, client)
        @prompt = prompt
        @config = config
        @client = client
      end

      # Abstract method: Perform analysis on the prompt
      # @return [Array<Hash>] Array of findings with :id, :level, :message keys
      # @raise [NotImplementedError] always, unless overridden by a subclass
      def call
        raise NotImplementedError, "Subclasses must implement #call"
      end

      protected

      # Helper method to create standardized finding hash
      # @param id [String] Analyzer ID (e.g., 'A1')
      # @param level [Symbol] Severity level (:info, :warn, :error, :critical)
      # @param message [String] Human-readable description of the finding
      # @param details [Hash] Additional details about the finding
      # @return [Hash] Finding with :id, :level, :message, :details and
      #   :analyzer (the unqualified class name, or nil for anonymous classes)
      def create_finding(id:, level:, message:, details: {})
        {
          id: id,
          level: level,
          message: message,
          details: details,
          # Class#name is nil for anonymous classes (e.g. Class.new(Base)),
          # so navigate safely instead of raising NoMethodError.
          analyzer: self.class.name&.split("::")&.last
        }
      end

      # Helper method to check if a threshold is exceeded
      # @param score [Float] The score to check
      # @param threshold [Float] The threshold value
      # @param higher_is_better [Boolean] Whether higher scores are better
      # @return [Boolean] True when the score is on the bad side of the
      #   threshold: below it if higher is better, above it otherwise.
      #   A score equal to the threshold is never considered exceeded.
      def threshold_exceeded?(score, threshold, higher_is_better: true)
        higher_is_better ? score < threshold : score > threshold
      end
    end
  end
end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "base"
|
4
|
+
|
5
|
+
module SentinelRb
  module Analyzers
    # A4: Base Model Usage Detection
    # Detects prompts that may be attempting to use base models inappropriately
    # or trying to bypass safety measures.
    #
    # All checks are regular-expression matches against the prompt text; the
    # client is not called. Every finding is reported with id "A4".
    class BaseModelUsage < Base
      # Phrases that try to override or escape the model's instructions.
      INSTRUCTION_PATTERNS = [
        {
          pattern: /\b(ignore|disregard|forget).{0,20}(instructions|guidelines|rules|constraints)\b/i,
          message: "Prompt attempts to bypass model instructions",
          level: :error
        },
        {
          pattern: /\b(act as|pretend to be|you are now).{0,50}(uncensored|unfiltered|unrestricted)\b/i,
          message: "Prompt attempts to create uncensored behavior",
          level: :error
        },
        {
          pattern: /\b(jailbreak|prompt injection|escape|hack).{0,20}(model|system|AI)\b/i,
          message: "Prompt contains explicit jailbreaking language",
          level: :error
        }
      ].freeze

      # Hypothetical or roleplay framing used to get around restrictions.
      FRAMING_BYPASS_PATTERNS = [
        {
          pattern: /\b(hypothetically|theoretically|in fiction|imagine if).{0,30}(illegal|harmful|dangerous)\b/i,
          message: "Prompt may be attempting to circumvent safety through hypothetical framing",
          level: :warn
        },
        {
          pattern: /\b(roleplay|pretend).{0,50}(no limits|no restrictions|anything goes)\b/i,
          message: "Prompt uses roleplay to bypass restrictions",
          level: :warn
        }
      ].freeze

      # Wording that indicates the prompt sets up a persona at all.
      ROLEPLAY_TRIGGER = /\b(roleplay|act as|pretend to be|you are)\b/i

      # Personas considered potentially harmful once roleplay is detected.
      PROBLEMATIC_ROLES = [
        /\b(criminal|terrorist|hacker|nazi|supremacist)\b/i,
        /\b(drug dealer|arms dealer|human trafficker)\b/i,
        /\b(dictator|tyrant|oppressor)\b/i
      ].freeze

      # Requests to reproduce or recall training material.
      TRAINING_ACCESS_PATTERNS = [
        /\b(repeat|recite|reproduce).{0,20}(training data|dataset|examples)\b/i,
        /\b(what did you learn|show me examples).{0,30}(from training|during training)\b/i,
        /\b(memorized|remember).{0,20}(from your training|specific examples)\b/i
      ].freeze

      # @param prompt [String] The prompt text to analyze
      # @param config [Object] Configuration responding to #[]; the optional
      #   "bypass_patterns" entry (list of phrases) replaces the default list
      #   of known jailbreak phrases
      # @param client [Object] Client object (not used by this analyzer)
      def initialize(prompt, config, client)
        super
        # NOTE: @instruction_keywords is stored for configuration parity but
        # no check in this class consults it.
        @instruction_keywords = config["instruction_keywords"] || default_instruction_keywords
        @bypass_patterns = config["bypass_patterns"] || default_bypass_patterns
      end

      # Run every check against the prompt given to the constructor.
      # @return [Array<Hash>] Array of findings
      def call
        analyze(@prompt)
      end

      # Run every check against an arbitrary prompt string.
      # Findings are returned in check order: instruction overrides, safety
      # bypasses, harmful roleplay, training-data access.
      # @param prompt [String] The prompt text to analyze
      # @return [Array<Hash>] Array of findings
      def analyze(prompt)
        [
          check_base_model_instructions(prompt),
          check_safety_bypass_patterns(prompt),
          check_inappropriate_roleplay(prompt),
          check_training_data_access(prompt)
        ].flatten(1)
      end

      private

      # Check for base model instruction patterns
      # @param prompt [String]
      # @return [Array<Hash>] one finding per matching pattern
      def check_base_model_instructions(prompt)
        pattern_findings(
          prompt,
          INSTRUCTION_PATTERNS,
          suggestion: "Remove attempts to bypass model guidelines and use appropriate prompting techniques"
        )
      end

      # Check for safety bypass attempts: the configured known-jailbreak
      # phrases first, then hypothetical/roleplay framing.
      # @param prompt [String]
      # @return [Array<Hash>] one finding per matching pattern
      def check_safety_bypass_patterns(prompt)
        pattern_findings(
          prompt,
          [known_jailbreak_entry, *FRAMING_BYPASS_PATTERNS],
          suggestion: "Use direct, honest prompting instead of attempting to circumvent safety measures"
        )
      end

      # Check for role-playing that might bypass guidelines. Roles are only
      # examined when the prompt contains roleplay wording.
      # @param prompt [String]
      # @return [Array<Hash>] one :warn finding per matching role group
      def check_inappropriate_roleplay(prompt)
        return [] unless prompt.match?(ROLEPLAY_TRIGGER)

        PROBLEMATIC_ROLES.select { |role| prompt.match?(role) }.map do |role|
          create_finding(
            id: "A4",
            level: :warn,
            message: "Prompt involves roleplay as potentially harmful persona",
            details: {
              role_pattern: role.source,
              suggestion: "Consider using more appropriate roleplay scenarios that don't involve harmful personas"
            }
          )
        end
      end

      # Check for attempts to access training data
      # @param prompt [String]
      # @return [Array<Hash>] one :info finding per matching pattern
      def check_training_data_access(prompt)
        TRAINING_ACCESS_PATTERNS.select { |pattern| prompt.match?(pattern) }.map do |pattern|
          create_finding(
            id: "A4",
            level: :info,
            message: "Prompt may be attempting to access specific training data",
            details: {
              pattern_matched: pattern.source,
              suggestion: "Focus on asking for general information rather than specific training examples"
            }
          )
        end
      end

      # Build findings for every table entry whose regex matches the prompt.
      # @param prompt [String]
      # @param entries [Array<Hash>] hashes with :pattern, :message, :level
      # @param suggestion [String] remediation text stored in the details
      # @return [Array<Hash>] findings in table order
      def pattern_findings(prompt, entries, suggestion:)
        entries.select { |entry| prompt.match?(entry[:pattern]) }.map do |entry|
          create_finding(
            id: "A4",
            level: entry[:level],
            message: entry[:message],
            details: {
              pattern_matched: entry[:pattern].source,
              suggestion: suggestion
            }
          )
        end
      end

      # Pattern entry for the known jailbreak phrases held in @bypass_patterns.
      # Phrases are matched literally, case-insensitively, on word boundaries.
      # An empty list yields a regex that never matches.
      # @return [Hash] entry with :pattern, :message, :level
      def known_jailbreak_entry
        phrases = Array(@bypass_patterns).map(&:to_s).reject(&:empty?)
        {
          # Use the union's source: interpolating the Regexp object itself
          # would embed its own flags and disable case-insensitive matching.
          pattern: /\b(?:#{Regexp.union(phrases).source})\b/i,
          message: "Prompt uses known jailbreak technique",
          level: :error
        }
      end

      # @return [Array<String>] default value for the "instruction_keywords" setting
      def default_instruction_keywords
        %w[
          ignore disregard forget bypass circumvent
          uncensored unfiltered unrestricted unlimited
          jailbreak hack override disable
        ]
      end

      # @return [Array<String>] default known jailbreak phrases
      def default_bypass_patterns
        [
          "DAN", "do anything now", "opposite day", "evil mode",
          "developer mode", "god mode", "admin mode"
        ]
      end
    end
  end
end
|