sentinel_rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +10 -0
  4. data/.rubocop_todo.yml +72 -0
  5. data/.sentinel-test.yml +20 -0
  6. data/.sentinel.yml +29 -0
  7. data/.sentinel.yml.example +74 -0
  8. data/AGENTS.md +87 -0
  9. data/CODE_OF_CONDUCT.md +132 -0
  10. data/LICENSE.txt +21 -0
  11. data/README.md +226 -0
  12. data/Rakefile +12 -0
  13. data/docs/architecture.md +130 -0
  14. data/docs/development.md +376 -0
  15. data/docs/usage.md +238 -0
  16. data/exe/sentinel_rb +6 -0
  17. data/lib/sentinel_rb/analyzer.rb +140 -0
  18. data/lib/sentinel_rb/analyzers/base.rb +53 -0
  19. data/lib/sentinel_rb/analyzers/base_model_usage.rb +188 -0
  20. data/lib/sentinel_rb/analyzers/dangerous_tools.rb +283 -0
  21. data/lib/sentinel_rb/analyzers/few_shot_bias.rb +75 -0
  22. data/lib/sentinel_rb/analyzers/irrelevant_info.rb +164 -0
  23. data/lib/sentinel_rb/analyzers/misinformation.rb +220 -0
  24. data/lib/sentinel_rb/cli.rb +151 -0
  25. data/lib/sentinel_rb/client/base.rb +34 -0
  26. data/lib/sentinel_rb/client/mock.rb +167 -0
  27. data/lib/sentinel_rb/client/openai.rb +167 -0
  28. data/lib/sentinel_rb/client.rb +25 -0
  29. data/lib/sentinel_rb/config.rb +64 -0
  30. data/lib/sentinel_rb/report.rb +224 -0
  31. data/lib/sentinel_rb/version.rb +5 -0
  32. data/lib/sentinel_rb.rb +39 -0
  33. data/sig/sentinel_rb.rbs +4 -0
  34. data/test_prompts/a2_bad_prompt.md +5 -0
  35. data/test_prompts/a2_good_prompt.md +9 -0
  36. data/test_prompts/a3_bad_prompt.md +19 -0
  37. data/test_prompts/a3_good_prompt.md +15 -0
  38. data/test_prompts/a4_bad_prompt.md +13 -0
  39. data/test_prompts/a4_good_prompt.md +11 -0
  40. data/test_prompts/a5_bad_prompt.md +13 -0
  41. data/test_prompts/a5_good_prompt.md +14 -0
  42. data/test_prompts/bad_prompt.md +15 -0
  43. data/test_prompts/comprehensive_good_prompt.md +11 -0
  44. data/test_prompts/good_prompt.md +9 -0
  45. data/test_prompts/multi_bad_prompt.md +11 -0
  46. data/test_prompts/very_bad_prompt.md +7 -0
  47. metadata +149 -0
@@ -0,0 +1,283 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module SentinelRb
6
+ module Analyzers
7
+ # A5: Dangerous Tools Detection
8
+ # Detects prompts that reference potentially dangerous tools, techniques, or requests
9
+ class DangerousTools < Base
10
# Builds the analyzer and loads the three keyword lists it carries.
#
# Each list is looked up in +config+ under its string key; when the lookup
# yields nil or false, the matching built-in default list is used instead.
#
# NOTE(review): no method visible in this class reads these lists; the
# scans use their own inline patterns. Confirm whether subclasses or other
# callers rely on them.
#
# @param prompt forwarded unchanged to the superclass initializer
# @param config object answering +[]+ with string keys
# @param client forwarded unchanged to the superclass initializer
def initialize(prompt, config, client)
  super

  @dangerous_keywords = config["dangerous_keywords"]
  @dangerous_keywords ||= default_dangerous_keywords

  @weapon_keywords = config["weapon_keywords"]
  @weapon_keywords ||= default_weapon_keywords

  @cyber_keywords = config["cyber_keywords"]
  @cyber_keywords ||= default_cyber_keywords
end
16
+
17
# Analyzer entry point: scans the prompt held in @prompt.
#
# @return whatever #analyze returns for the stored prompt
def call
  stored_prompt = @prompt
  analyze(stored_prompt)
end
20
+
21
# Runs every category scan against +prompt+ and merges their findings.
#
# The scans run in a fixed order (weapons, cyber attacks, illegal
# substances, financial fraud, privacy violations) and the returned array
# keeps that order.
#
# @param prompt prompt text handed to each category scan
# @return [Array] all findings from the five scans, concatenated
def analyze(prompt)
  category_checks = %i[
    check_weapon_creation
    check_cyber_attacks
    check_illegal_substances
    check_financial_fraud
    check_privacy_violations
  ]

  category_checks.each_with_object([]) do |check, findings|
    findings.concat(send(check, prompt))
  end
end
41
+
42
+ private
43
+
44
# Flags prompts that ask how to build weapons, explosives or poisons.
#
# Every rule is tested on its own, so a single prompt can produce several
# findings. The object group of each pattern accepts an optional plural "s"
# ("explosives", "guns"), and the /m flag lets the bounded gap between the
# request verb and the object cross a line break.
#
# @param prompt [String, nil] prompt text; nil is scanned as an empty string
# @return [Array] one create_finding result per matching rule, in rule order
def check_weapon_creation(prompt)
  text = prompt.to_s

  weapon_rules = [
    {
      pattern: /\b(how to make|create|build|manufacture).{0,50}(bomb|explosive|grenade|weapon)s?\b/im,
      message: "Prompt requests instructions for creating weapons or explosives",
      level: :error
    },
    {
      pattern: /\b(recipe|instructions|guide).{0,30}(explosive|bomb|poison|toxin)s?\b/im,
      message: "Prompt requests dangerous substance recipes",
      level: :error
    },
    {
      pattern: /\b(3d print|print).{0,20}(gun|weapon|firearm)s?\b/im,
      message: "Prompt references 3D printing weapons",
      level: :error
    }
  ]

  weapon_rules.select { |rule| text.match?(rule[:pattern]) }.map do |rule|
    create_finding(
      id: "A5",
      level: rule[:level],
      message: rule[:message],
      details: {
        category: "weapons",
        # Regexp#source omits the flags, so this is the bare pattern text.
        pattern_matched: rule[:pattern].source,
        suggestion: "Avoid requesting information about creating dangerous weapons or substances"
      }
    )
  end
end
82
+
83
# Flags prompts that reference hacking, malware or named attack vectors.
#
# Every rule is tested on its own, so a single prompt can produce several
# findings. The patterns accept plural nouns ("servers", "keyloggers",
# "botnets") and the hyphenated spellings "cross-site" and
# "denial-of-service"; the /m flag on the first rule lets the bounded gap
# between verb and target cross a line break.
#
# @param prompt [String, nil] prompt text; nil is scanned as an empty string
# @return [Array] one create_finding result per matching rule, in rule order
def check_cyber_attacks(prompt)
  text = prompt.to_s

  cyber_rules = [
    {
      pattern: /\b(hack|exploit|penetrate|breach).{0,30}(system|network|database|server)s?\b/im,
      message: "Prompt references hacking or system exploitation",
      level: :warn
    },
    {
      pattern: /\b(ddos|denial[- ]of[- ]service|botnets?|malware|ransomware)\b/i,
      message: "Prompt references cyber attack tools or techniques",
      level: :error
    },
    {
      pattern: /\b(sql injections?|xss|cross[- ]site|buffer overflows?)\b/i,
      message: "Prompt references specific attack vectors",
      level: :warn
    },
    {
      pattern: /\b(keylogger|trojan|backdoor|rootkit)s?\b/i,
      message: "Prompt references malicious software types",
      level: :error
    }
  ]

  cyber_rules.select { |rule| text.match?(rule[:pattern]) }.map do |rule|
    create_finding(
      id: "A5",
      level: rule[:level],
      message: rule[:message],
      details: {
        category: "cyber_security",
        # Regexp#source omits the flags, so this is the bare pattern text.
        pattern_matched: rule[:pattern].source,
        suggestion: "For security education, use ethical hacking resources and controlled environments"
      }
    )
  end
end
126
+
127
# Flags prompts that ask how to produce or cultivate illegal drugs.
#
# Every rule is tested on its own, so a single prompt can produce several
# findings. The first rule accepts "methamphetamine" as well as "meth", the
# second accepts plural substance names, and the third accepts "illegally"
# as well as "illegal". The /m flag lets each bounded gap cross a line break.
#
# @param prompt [String, nil] prompt text; nil is scanned as an empty string
# @return [Array] one create_finding result per matching rule, in rule order
def check_illegal_substances(prompt)
  text = prompt.to_s

  substance_rules = [
    {
      pattern: /\b(how to make|synthesize|produce|manufacture).{0,50}(meth(?:amphetamine)?|cocaine|heroin|fentanyl|lsd)\b/im,
      message: "Prompt requests illegal drug manufacturing information",
      level: :error
    },
    {
      pattern: /\b(recipe|synthesis|production).{0,30}(amphetamine|mdma|ecstasy)s?\b/im,
      message: "Prompt seeks controlled substance production methods",
      level: :error
    },
    {
      pattern: /\b(grow|cultivate).{0,20}(cannabis|marijuana).{0,30}(illegal(?:ly)?|hidden)\b/im,
      message: "Prompt references illegal cultivation methods",
      level: :warn
    }
  ]

  substance_rules.select { |rule| text.match?(rule[:pattern]) }.map do |rule|
    create_finding(
      id: "A5",
      level: rule[:level],
      message: rule[:message],
      details: {
        category: "illegal_substances",
        # Regexp#source omits the flags, so this is the bare pattern text.
        pattern_matched: rule[:pattern].source,
        suggestion: "Seek information about substance abuse prevention and treatment resources instead"
      }
    )
  end
end
165
+
166
# Flags prompts that reference card fraud, phishing, financial crimes or
# counterfeiting.
#
# Every rule is tested on its own, so a single prompt can produce several
# findings. The patterns accept common inflections of their closing word
# ("scams", "stealing", "cloning", "banks", "banking", "passwords", "bill"),
# and the /m flag lets each bounded gap cross a line break.
#
# @param prompt [String, nil] prompt text; nil is scanned as an empty string
# @return [Array] one create_finding result per matching rule, in rule order
def check_financial_fraud(prompt)
  text = prompt.to_s

  fraud_rules = [
    {
      pattern: /\b(credit card|debit card).{0,30}(fraud|scams?|steal(?:ing)?|clon(?:e|ing))\b/im,
      message: "Prompt references credit card fraud techniques",
      level: :error
    },
    {
      pattern: /\b(phishing|social engineering).{0,30}(bank(?:s|ing)?|financial|passwords?)\b/im,
      message: "Prompt references financial phishing techniques",
      level: :warn
    },
    {
      pattern: /\b(money laundering|tax evasion|embezzlement)\b/i,
      message: "Prompt references financial crimes",
      level: :warn
    },
    {
      pattern: /\b(counterfeit|fake).{0,20}(money|currency|bills?)\b/im,
      message: "Prompt references counterfeiting currency",
      level: :error
    }
  ]

  fraud_rules.select { |rule| text.match?(rule[:pattern]) }.map do |rule|
    create_finding(
      id: "A5",
      level: rule[:level],
      message: rule[:message],
      details: {
        category: "financial_fraud",
        # Regexp#source omits the flags, so this is the bare pattern text.
        pattern_matched: rule[:pattern].source,
        suggestion: "For financial security education, consult legitimate cybersecurity resources"
      }
    )
  end
end
209
+
210
# Flags prompts that reference doxxing, stalking, identity theft or covert
# surveillance.
#
# Every rule is tested on its own, so a single prompt can produce several
# findings. The doxxing and spying rules match their two halves in either
# order ("personal information ... find" as well as "find ... personal
# information"), the stalking rule accepts "people" and "individuals", and
# the identity rule accepts inflections of "impersonate". The /m flag lets
# each bounded gap cross a line break.
#
# @param prompt [String, nil] prompt text; nil is scanned as an empty string
# @return [Array] one create_finding result per matching rule, in rule order
def check_privacy_violations(prompt)
  text = prompt.to_s

  privacy_rules = [
    {
      # First branch: target then verb. Second branch: verb then target.
      pattern: /\b(?:(?:dox|doxxing|personal information).{0,30}(?:find|locate|discover)\b|(?:find|locate|discover)\b.{0,30}\b(?:dox|doxxing|personal information))/im,
      message: "Prompt may reference doxxing or privacy violation techniques",
      level: :warn
    },
    {
      pattern: /\b(stalk|stalking|track|surveillance).{0,30}(person|people|individuals?|someone)\b/im,
      message: "Prompt references stalking or unauthorized surveillance",
      level: :error
    },
    {
      pattern: /\b(identity theft|impersonat(?:e|es|ing|ion)|assume identity)\b/i,
      message: "Prompt references identity theft techniques",
      level: :error
    },
    {
      # First branch: verb then qualifier. Second branch: qualifier then verb.
      pattern: /\b(?:(?:spy|spying|eavesdrop).{0,30}(?:secretly|hidden|covert)\b|(?:secretly|hidden|covert).{0,30}\b(?:spy|spying|eavesdrop))/im,
      message: "Prompt references covert surveillance techniques",
      level: :warn
    }
  ]

  privacy_rules.select { |rule| text.match?(rule[:pattern]) }.map do |rule|
    create_finding(
      id: "A5",
      level: rule[:level],
      message: rule[:message],
      details: {
        category: "privacy_violation",
        # Regexp#source omits the flags, so this is the bare pattern text.
        pattern_matched: rule[:pattern].source,
        suggestion: "Respect privacy rights and use legitimate channels for information gathering"
      }
    )
  end
end
253
+
254
# Built-in fallback for the general dangerous-keyword list.
#
# The list is assembled from five themed groups and returned in that order.
#
# @return [Array<String>] a fresh array of keyword strings
def default_dangerous_keywords
  arms = %w[bomb explosive weapon gun firearm knife blade]
  hazards = %w[poison toxin chemical biological nuclear radioactive]
  intrusion = %w[hack exploit malware virus trojan ransomware]
  deception = %w[fraud scam phishing counterfeit]
  narcotics = %w[drug cocaine heroin meth amphetamine]

  arms + hazards + intrusion + deception + narcotics
end
263
+
264
# Built-in fallback for the weapon keyword list.
#
# The list is assembled from four themed groups and returned in that order.
#
# @return [Array<String>] a fresh array of keyword strings
def default_weapon_keywords
  explosives = %w[bomb explosive grenade dynamite c4 tnt]
  firearms = %w[gun pistol rifle shotgun firearm ammunition]
  blades = %w[knife blade sword machete weapon]
  agents = %w[poison gas chemical biological agent]

  explosives + firearms + blades + agents
end
272
+
273
# Built-in fallback for the cyber-attack keyword list.
#
# The list is assembled from four themed groups and returned in that order.
#
# @return [Array<String>] a fresh array of keyword strings
def default_cyber_keywords
  intrusion = %w[hack exploit penetration breach vulnerability]
  implants = %w[malware virus trojan backdoor rootkit]
  tooling = %w[ddos botnet ransomware keylogger spyware]
  vectors = %w[injection overflow xss csrf]

  intrusion + implants + tooling + vectors
end
281
+ end
282
+ end
283
+ end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module SentinelRb
6
+ module Analyzers
7
+ # A3: Few-shot Bias Detection
8
+ # Detects potential bias in few-shot examples that could skew model outputs
9
+ class FewShotBias < Base
10
+ ANALYZER_ID = "A3"
11
+
12
# Analyzer entry point: looks for gender imbalance, but only in prompts
# that appear to contain few-shot examples.
#
# @return [Array] findings from the gender-bias check, or an empty array
#   when the prompt shows no example markers
def call
  return [] unless has_examples?(@prompt)

  [].concat(check_simple_gender_bias(@prompt))
end
23
+
24
+ private
25
+
26
# Reports whether the prompt appears to contain few-shot examples.
#
# Recognised markers (all case-insensitive) are an "example:" / "example 3:"
# label, an "input:" ... "output:" pair, and a "q:" ... "a:" pair. The pair
# patterns use /m so the two labels may sit on different lines, and \b so
# each label has to start a word ("FAQ:" is not read as "q:").
#
# @param prompt [String] prompt text to inspect
# @return [Boolean] true when at least one marker pattern matches
def has_examples?(prompt)
  example_indicators = [
    /example\s*\d*:/i,
    /\binput:\s*.+\boutput:/im,
    /\bq:\s*.+\ba:/im
  ]

  example_indicators.any? { |pattern| prompt.match?(pattern) }
end
35
+
36
# Counts gendered words in +prompt+ and reports a finding when one side
# dominates.
#
# Male terms are he/him/his/man/men/male; female terms are
# she/her/hers/woman/women/female (whole words, case-insensitive). A finding
# is produced when there are at least three references in total, the two
# counts differ, and the larger side's share of all references exceeds
# 1.0 - divergence_threshold. The threshold is read from
# @config["divergence_threshold"] and defaults to 0.25.
#
# @param prompt [String] prompt text to inspect
# @return [Array] empty, or a single create_finding result
def check_simple_gender_bias(prompt)
  male_count = prompt.scan(/\b(?:he|him|his|man|men|male)\b/i).length
  female_count = prompt.scan(/\b(?:she|her|hers|woman|women|female)\b/i).length

  total_gender = male_count + female_count
  return [] if total_gender < 3 # Need at least 3 references to detect bias

  # Equal counts have no dominant side. Without this guard a threshold above
  # 0.5 would report a 50% split as "female" bias.
  return [] if male_count == female_count

  bias_ratio = [male_count, female_count].max.to_f / total_gender
  divergence_threshold = @config["divergence_threshold"] || 0.25
  return [] unless bias_ratio > (1.0 - divergence_threshold)

  dominant_gender = male_count > female_count ? "male" : "female"

  [
    create_finding(
      id: ANALYZER_ID,
      level: :warn,
      message: "Few-shot examples show potential gender bias (#{(bias_ratio * 100).round(1)}% #{dominant_gender} references)",
      details: {
        male_references: male_count,
        female_references: female_count,
        bias_ratio: bias_ratio.round(3),
        threshold: divergence_threshold,
        suggestions: [
          "Include more balanced gender representation in examples",
          "Use gender-neutral examples when possible",
          "Vary pronouns and names across examples"
        ]
      }
    )
  ]
end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,164 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base"
4
+
5
+ module SentinelRb
6
+ module Analyzers
7
+ # A1: Irrelevant Information Detector
8
+ # Detects prompts containing irrelevant or noisy information that could
9
+ # degrade LLM performance or confuse the model.
10
+ class IrrelevantInfo < Base
11
+ ANALYZER_ID = "A1"
12
+
13
# Analyzer entry point: combines an LLM relevance check with three local
# heuristics.
#
# The LLM step asks @client for an analysis of the prompt and adds a
# warning when threshold_exceeded? reports the relevance score against
# @config.relevance_threshold. Any StandardError raised during that step is
# swallowed (and written to stderr only when @config["log_level"] is
# "debug"), so the heuristic checks always run.
#
# @return [Array] the LLM finding (if any) followed by heuristic findings
def call
  results = []

  begin
    llm_result = @client.analyze_content(@prompt)
    score = llm_result[:relevance_score]
    limit = @config.relevance_threshold

    if threshold_exceeded?(score, limit, higher_is_better: true)
      results.push(
        create_finding(
          id: ANALYZER_ID,
          level: :warn,
          message: "Prompt contains potentially irrelevant information (relevance score: #{score.round(3)} < threshold: #{limit})",
          details: {
            relevance_score: score,
            threshold: limit,
            raw_response: llm_result[:raw_response],
            suggestions: generate_suggestions(score)
          }
        )
      )
    end
  rescue StandardError => e
    # Best effort: a failed LLM call must not abort the heuristic checks.
    warn "Warning: LLM analysis failed for irrelevant info detection: #{e.message}" if @config["log_level"] == "debug"
  end

  # These checks need no client, so they run whether or not the LLM step worked.
  %i[check_length_ratio check_repetitive_content check_off_topic_markers].each do |heuristic|
    results.concat(send(heuristic))
  end

  results
end
51
+
52
+ private
53
+
54
# Flags prompts whose sentence lengths vary widely, which may indicate a mix
# of content types.
#
# The prompt is split on runs of ".", "!" and "?". Each fragment is stripped
# before it is measured, and fragments that are empty after stripping are
# dropped, so a trailing newline or the space after a period is not counted
# as a sentence or as sentence length. Prompts under 100 characters or with
# fewer than three sentences are skipped. A finding is returned when the
# coefficient of variation (population standard deviation divided by the
# mean) of the sentence lengths exceeds 1.5.
#
# @return [Array] empty, or a single create_finding result
def check_length_ratio
  return [] if @prompt.length < 100

  sentences = @prompt.split(/[.!?]+/).map(&:strip).reject(&:empty?)
  return [] if sentences.length < 3

  lengths = sentences.map(&:length)
  # Never zero: every remaining sentence has at least one character.
  avg_length = lengths.sum.to_f / lengths.length
  variance = lengths.sum { |len| (len - avg_length)**2 } / lengths.length
  coefficient_of_variation = Math.sqrt(variance) / avg_length
  return [] unless coefficient_of_variation > 1.5

  [
    create_finding(
      id: ANALYZER_ID,
      level: :info,
      message: "Prompt has highly variable sentence lengths, which may indicate mixed content types",
      details: {
        coefficient_of_variation: coefficient_of_variation.round(3),
        avg_sentence_length: avg_length.round(1),
        sentence_count: sentences.length
      }
    )
  ]
end
85
+
86
# Flags prompts in which some word is repeated far more often than average.
#
# Words are lower-cased \w+ runs. A word counts as repetitive when it is
# longer than two characters and occurs more than twice the average
# occurrences per distinct word. Prompts with fewer than five words are
# skipped.
#
# @return [Array] empty, or a single create_finding result listing up to
#   five repetitive words and their share of all words
def check_repetitive_content
  tokens = @prompt.downcase.scan(/\w+/)
  return [] if tokens.length < 5 # Kept low so short test prompts are still analysed

  occurrences = tokens.each_with_object(Hash.new(0)) { |token, tally| tally[token] += 1 }

  # Twice the mean number of occurrences per distinct word.
  cutoff = (tokens.length.to_f / occurrences.keys.length) * 2
  overused = occurrences.select { |token, seen| seen > cutoff && token.length > 2 }
  return [] if overused.empty?

  [
    create_finding(
      id: ANALYZER_ID,
      level: :info,
      message: "Prompt contains repetitive words that may indicate redundant content",
      details: {
        repetitive_words: overused.keys.take(5),
        repetition_ratio: overused.values.sum.to_f / tokens.length
      }
    )
  ]
end
115
+
116
# Flags wording that often marks content unrelated to the task: legal
# boilerplate, marketing language, placeholder text and author notes.
#
# Each pattern is tested on its own and reported with its 1-based position
# in the list, together with up to three distinct matched strings.
#
# @return [Array] one create_finding result per matching pattern, in list order
def check_off_topic_markers
  markers = [
    /\b(disclaimer|legal notice|copyright|terms of service)\b/i,
    /\b(marketing|advertisement|promotional|sponsor)\b/i,
    /\b(lorem ipsum|placeholder|example text|sample content)\b/i,
    /\b(todo|fixme|note to self|reminder)\b/i
  ]

  markers.each.with_index(1).each_with_object([]) do |(marker, position), hits|
    next unless @prompt.match?(marker)

    hits << create_finding(
      id: ANALYZER_ID,
      level: :info,
      message: "Prompt contains potential noise markers (pattern #{position})",
      details: {
        pattern_matched: marker.source,
        matches: @prompt.scan(marker).flatten.uniq.take(3)
      }
    )
  end
end
144
+
145
# Builds improvement tips scaled to how low the relevance score is.
#
# Scores below 0.3 get the rewrite tips, scores from 0.3 up to (but not
# including) 0.5 get the restructuring tips, and anything else gets a single
# refinement tip. A general tip about redundancy is always appended.
#
# @param score [Numeric] relevance score to tailor the tips to
# @return [Array<String>] tailored tips followed by the general tip
def generate_suggestions(score)
  tailored =
    if score < 0.3
      [
        "Consider rewriting the prompt to focus on a single, clear objective",
        "Remove any background information not directly relevant to the task"
      ]
    elsif score < 0.5
      [
        "Try to make the main task or question more prominent",
        "Consider breaking complex prompts into simpler, focused parts"
      ]
    else
      ["Consider minor refinements to improve clarity and focus"]
    end

  tailored + ["Review the prompt for any repetitive or redundant information"]
end
162
+ end
163
+ end
164
+ end