hyrum 0.0.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +92 -3
- data/README.md +133 -14
- data/bin/hyrum +1 -2
- data/lib/hyrum/data/fake_messages.json +282 -0
- data/lib/hyrum/formats/formatter.rb +8 -2
- data/lib/hyrum/formats/templates/java.erb +9 -0
- data/lib/hyrum/formats/templates/javascript.erb +9 -0
- data/lib/hyrum/formats/templates/json.erb +13 -0
- data/lib/hyrum/formats/templates/python.erb +9 -0
- data/lib/hyrum/formats/templates/ruby.erb +9 -0
- data/lib/hyrum/formats/templates/text.erb +9 -0
- data/lib/hyrum/generators/ai_generator.rb +102 -0
- data/lib/hyrum/generators/fake_generator.rb +22 -34
- data/lib/hyrum/generators/message_generator.rb +17 -5
- data/lib/hyrum/script_options.rb +47 -20
- data/lib/hyrum/validators/lexical_diversity.rb +46 -0
- data/lib/hyrum/validators/quality_validator.rb +107 -0
- data/lib/hyrum/validators/semantic_similarity.rb +100 -0
- data/lib/hyrum/validators/validation_result.rb +22 -0
- data/lib/hyrum/version.rb +1 -1
- data/lib/hyrum.rb +158 -7
- metadata +13 -11
- data/lib/hyrum/generators/openai_generator.rb +0 -77
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
<% if validation_result && show_scores -%>
|
|
2
|
+
# Quality Score: <%= validation_result.score %>/100
|
|
3
|
+
# - Semantic similarity: <%= validation_result.semantic_similarity %>% (variations preserve meaning)
|
|
4
|
+
# - Lexical diversity: <%= validation_result.lexical_diversity %>% (variation in wording)
|
|
5
|
+
<% validation_result.warnings.each do |warning| -%>
|
|
6
|
+
# Warning: <%= warning %>
|
|
7
|
+
<% end -%>
|
|
8
|
+
#
|
|
9
|
+
<% end -%>
|
|
1
10
|
import random
|
|
2
11
|
import sys
|
|
3
12
|
|
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
<% if validation_result && show_scores -%>
|
|
2
|
+
# Quality Score: <%= validation_result.score %>/100
|
|
3
|
+
# - Semantic similarity: <%= validation_result.semantic_similarity %>% (variations preserve meaning)
|
|
4
|
+
# - Lexical diversity: <%= validation_result.lexical_diversity %>% (variation in wording)
|
|
5
|
+
<% validation_result.warnings.each do |warning| -%>
|
|
6
|
+
# Warning: <%= warning %>
|
|
7
|
+
<% end -%>
|
|
8
|
+
#
|
|
9
|
+
<% end -%>
|
|
1
10
|
# frozen_string_literal: true
|
|
2
11
|
|
|
3
12
|
module Messages
|
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
<% if validation_result && show_scores -%>
|
|
2
|
+
Quality Score: <%= validation_result.score %>/100
|
|
3
|
+
- Semantic similarity: <%= validation_result.semantic_similarity %>% (variations preserve meaning)
|
|
4
|
+
- Lexical diversity: <%= validation_result.lexical_diversity %>% (variation in wording)
|
|
5
|
+
<% validation_result.warnings.each do |warning| -%>
|
|
6
|
+
Warning: <%= warning %>
|
|
7
|
+
<% end -%>
|
|
8
|
+
|
|
9
|
+
<% end -%>
|
|
1
10
|
<% messages.each do |key, values| -%>
|
|
2
11
|
Messages for <%= key %>:
|
|
3
12
|
<% values.each do |msg| -%>
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ruby_llm'
|
|
4
|
+
|
|
5
|
+
module Hyrum
|
|
6
|
+
module Generators
|
|
7
|
+
# Generates alternative wordings of a status message by asking an LLM,
# through RubyLLM, for a schema-constrained reply.
#
# Entries read from the options hash: :message, :key, :number, :ai_service,
# :ai_model and :verbose.
class AiGenerator
  # Environment variable named in the hint printed for a configuration
  # error, keyed by provider.
  API_KEY_ENV_VARS = {
    openai: 'OPENAI_API_KEY',
    anthropic: 'ANTHROPIC_API_KEY',
    gemini: 'GEMINI_API_KEY',
    ollama: 'OLLAMA_API_BASE',
    vertexai: 'GOOGLE_CLOUD_PROJECT',
    bedrock: 'AWS_ACCESS_KEY_ID',
    deepseek: 'DEEPSEEK_API_KEY',
    mistral: 'MISTRAL_API_KEY',
    perplexity: 'PERPLEXITY_API_KEY',
    openrouter: 'OPENROUTER_API_KEY',
    gpustack: 'GPUSTACK_API_KEY'
  }.freeze

  attr_reader :options

  # @param options [Hash] option hash (see the class comment for the keys read)
  def initialize(options)
    @options = options
  end

  # Asks the model for variations of options[:message].
  #
  # Error diagnostics are written to stderr (via +warn+) rather than stdout,
  # and the process exits with status 1 when RubyLLM raises or when the
  # reply is not a hash-like structured object.
  #
  # @return [Hash] the reply with its keys symbolized; when the entry under
  #   options[:key] is an Array, the original message is prepended to it
  def generate
    response = chat.ask(prompt)
    puts "AI response: #{response.inspect}" if options[:verbose]

    content = response.content
    # A reply that was not parsed into a hash cannot be re-keyed below;
    # report it instead of crashing with NoMethodError.
    return handle_unstructured_response(content) unless content.respond_to?(:transform_keys)

    # The reply uses string keys, while the rest of the options use symbols.
    result = content.dup
    key_str = options[:key].to_s
    result[key_str] = [options[:message], *result[key_str]] if result[key_str].is_a?(Array)

    result.transform_keys(&:to_sym)
  rescue RubyLLM::ConfigurationError => e
    handle_configuration_error(e)
  rescue RubyLLM::Error => e
    handle_general_error(e)
  end

  private

  # Memoized chat session bound to the configured model/provider and to the
  # response schema.
  def chat
    @chat ||= RubyLLM.chat(
      model: options[:ai_model].to_s,
      provider: options[:ai_service]
    ).with_schema(response_schema)
  end

  # Natural-language request sent to the model.
  def prompt
    <<~PROMPT
      Please provide #{options[:number]} alternative status messages for the following message:
      "#{options[:message]}"

      The messages should be unique and informative.
    PROMPT
  end

  # JSON schema describing the expected reply: one required property, named
  # after options[:key], holding exactly options[:number] strings.
  # rubocop:disable Metrics/MethodLength
  def response_schema
    {
      type: 'object',
      properties: {
        options[:key] => {
          type: 'array',
          items: { type: 'string' },
          minItems: options[:number],
          maxItems: options[:number]
        }
      },
      required: [options[:key].to_s],
      additionalProperties: false
    }
  end
  # rubocop:enable Metrics/MethodLength

  # Reports a RubyLLM configuration problem with a hint about the
  # environment variable to set, then exits with status 1.
  def handle_configuration_error(error)
    warn "Configuration Error: #{error.message}"
    warn "Please set the required API key for #{options[:ai_service]}."
    warn "Example: export #{api_key_env_var_name}=your-key-here"
    exit 1
  end

  # Reports any other RubyLLM error, then exits with status 1.
  def handle_general_error(error)
    warn "Error: #{error.message}"
    warn 'Please check your configuration and try again.'
    exit 1
  end

  # Reports a reply that is not a hash-like object, then exits with status 1.
  def handle_unstructured_response(content)
    warn "Error: expected a structured response from #{options[:ai_service]}, got #{content.class}."
    warn 'Please check that the selected model supports structured output and try again.'
    exit 1
  end

  # Environment variable for the configured provider; providers missing from
  # API_KEY_ENV_VARS fall back to "<SERVICE>_API_KEY".
  def api_key_env_var_name
    API_KEY_ENV_VARS.fetch(options[:ai_service], "#{options[:ai_service].to_s.upcase}_API_KEY")
  end
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
@@ -3,48 +3,36 @@
|
|
|
3
3
|
module Hyrum
|
|
4
4
|
module Generators
|
|
5
5
|
class FakeGenerator
|
|
6
|
-
|
|
7
|
-
{
|
|
8
|
-
"e404": [
|
|
9
|
-
"We couldn't locate the resource you were looking for.",
|
|
10
|
-
"The resource you requested is not available at this time.",
|
|
11
|
-
"Unfortunately, we were unable to find the specified resource.",
|
|
12
|
-
"It seems the resource you're searching for does not exist.",
|
|
13
|
-
"The item you are trying to access is currently missing."
|
|
14
|
-
],
|
|
15
|
-
"e418": [
|
|
16
|
-
"I'm a teapot",
|
|
17
|
-
"The server refuses the attempt to brew coffee with a teapot",
|
|
18
|
-
"Coffee brewing denied: a teapot is not suitable for this operation.",
|
|
19
|
-
"Request failed: the server cannot process coffee with a teapot.",
|
|
20
|
-
"Brewing error: teapots are incompatible with coffee preparation.",
|
|
21
|
-
"Action halted: using a teapot to brew coffee is not permitted.",
|
|
22
|
-
"Invalid request: please use a coffee maker instead of a teapot."
|
|
23
|
-
],
|
|
24
|
-
"e500": [
|
|
25
|
-
"Internal Server Error",
|
|
26
|
-
"An unexpected condition was encountered"
|
|
27
|
-
],
|
|
28
|
-
"e503": [
|
|
29
|
-
"Service Unavailable",
|
|
30
|
-
"The server is currently unavailable"
|
|
31
|
-
],
|
|
32
|
-
"e504": [
|
|
33
|
-
"Gateway Timeout",
|
|
34
|
-
"The server is currently unavailable"
|
|
35
|
-
]
|
|
36
|
-
}
|
|
37
|
-
)
|
|
6
|
+
DATA_FILE = File.expand_path('../data/fake_messages.json', __dir__)
|
|
38
7
|
|
|
39
8
|
attr_reader :options
|
|
40
9
|
|
|
41
10
|
def initialize(options)
|
|
42
11
|
@options = options
|
|
43
|
-
@ai_service = options[:ai_service]
|
|
44
12
|
end
|
|
45
13
|
|
|
46
14
|
def generate
|
|
47
|
-
|
|
15
|
+
messages = load_messages
|
|
16
|
+
key = options[:key]&.to_s&.downcase
|
|
17
|
+
number = (options[:number] || 1).to_i
|
|
18
|
+
|
|
19
|
+
return messages unless key
|
|
20
|
+
|
|
21
|
+
key_with_prefix = key.start_with?('e') ? key : "e#{key}"
|
|
22
|
+
available_messages = messages[key_with_prefix] || []
|
|
23
|
+
selected_messages = available_messages.sample([number, available_messages.length].min)
|
|
24
|
+
|
|
25
|
+
# Prepend the original message if provided
|
|
26
|
+
selected_messages = [options[:message]] + selected_messages if options[:message]
|
|
27
|
+
|
|
28
|
+
# Return as a hash to match expected format
|
|
29
|
+
{ options[:key] => selected_messages }
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
private
|
|
33
|
+
|
|
34
|
+
def load_messages
|
|
35
|
+
JSON.parse(File.read(DATA_FILE))
|
|
48
36
|
end
|
|
49
37
|
end
|
|
50
38
|
end
|
|
@@ -2,25 +2,37 @@
|
|
|
2
2
|
|
|
3
3
|
module Hyrum
|
|
4
4
|
module Generators
|
|
5
|
-
AI_SERVICES = %i[
|
|
5
|
+
AI_SERVICES = %i[
|
|
6
|
+
openai anthropic gemini ollama mistral deepseek
|
|
7
|
+
perplexity openrouter vertexai bedrock gpustack fake
|
|
8
|
+
].freeze
|
|
6
9
|
|
|
7
10
|
AI_MODEL_DEFAULTS = {
|
|
8
11
|
openai: :'gpt-4o-mini',
|
|
12
|
+
anthropic: :'claude-haiku-20250514',
|
|
13
|
+
gemini: :'gemini-2.0-flash-exp',
|
|
9
14
|
ollama: :llama3,
|
|
15
|
+
mistral: :'mistral-small-latest',
|
|
16
|
+
deepseek: :'deepseek-chat',
|
|
17
|
+
perplexity: :'llama-3.1-sonar-small-128k-online',
|
|
18
|
+
openrouter: :'openai/gpt-4o-mini',
|
|
19
|
+
vertexai: :'gemini-2.0-flash-exp',
|
|
20
|
+
bedrock: :'anthropic.claude-3-haiku-20240307-v1:0',
|
|
21
|
+
gpustack: :llama3,
|
|
10
22
|
fake: :fake
|
|
11
23
|
}.freeze
|
|
12
24
|
|
|
13
25
|
GENERATOR_CLASSES = {
|
|
14
|
-
openai: OpenaiGenerator,
|
|
15
|
-
ollama: OpenaiGenerator,
|
|
16
26
|
fake: FakeGenerator
|
|
27
|
+
# All other providers default to AiGenerator
|
|
17
28
|
}.freeze
|
|
18
29
|
|
|
19
30
|
class MessageGenerator
|
|
20
31
|
def self.create(options)
|
|
21
|
-
|
|
32
|
+
service = options[:ai_service].to_sym
|
|
22
33
|
|
|
23
|
-
#
|
|
34
|
+
# Get generator class, defaulting to AiGenerator for unlisted services
|
|
35
|
+
generator_class = GENERATOR_CLASSES.fetch(service, AiGenerator)
|
|
24
36
|
generator_class.new(options)
|
|
25
37
|
end
|
|
26
38
|
end
|
data/lib/hyrum/script_options.rb
CHANGED
|
@@ -3,16 +3,25 @@
|
|
|
3
3
|
require 'optparse'
|
|
4
4
|
|
|
5
5
|
module Hyrum
|
|
6
|
+
class ScriptOptionsError < StandardError; end
|
|
7
|
+
|
|
6
8
|
class ScriptOptions
|
|
7
9
|
MANDATORY_OPTIONS = %i[message].freeze
|
|
8
10
|
|
|
9
11
|
attr_reader :options
|
|
10
12
|
|
|
11
13
|
def initialize(args)
|
|
12
|
-
@options = {
|
|
14
|
+
@options = {
|
|
15
|
+
message: nil,
|
|
16
|
+
validate: false,
|
|
17
|
+
min_quality: 70,
|
|
18
|
+
strict: false,
|
|
19
|
+
show_scores: false
|
|
20
|
+
}
|
|
13
21
|
@args = args
|
|
14
22
|
end
|
|
15
23
|
|
|
24
|
+
# rubocop:disable Metrics/MethodLength
|
|
16
25
|
def parse
|
|
17
26
|
OptionParser.new do |parser|
|
|
18
27
|
define_options(parser)
|
|
@@ -22,17 +31,13 @@ module Hyrum
|
|
|
22
31
|
set_dynamic_defaults
|
|
23
32
|
options
|
|
24
33
|
rescue OptionParser::InvalidOption => e
|
|
25
|
-
|
|
34
|
+
raise ScriptOptionsError, "Invalid option: #{e.message}"
|
|
26
35
|
rescue OptionParser::MissingArgument => e
|
|
27
|
-
|
|
36
|
+
raise ScriptOptionsError, "Missing argument for option: #{e.message}"
|
|
28
37
|
rescue OptionParser::InvalidArgument => e
|
|
29
|
-
|
|
30
|
-
ensure
|
|
31
|
-
if err
|
|
32
|
-
puts err
|
|
33
|
-
exit
|
|
34
|
-
end
|
|
38
|
+
raise ScriptOptionsError, "Invalid argument for option: #{e.message}"
|
|
35
39
|
end
|
|
40
|
+
# rubocop:enable Metrics/MethodLength
|
|
36
41
|
|
|
37
42
|
private
|
|
38
43
|
|
|
@@ -42,10 +47,10 @@ module Hyrum
|
|
|
42
47
|
end
|
|
43
48
|
|
|
44
49
|
def enforce_mandatory_options
|
|
45
|
-
|
|
46
|
-
return if missing.empty?
|
|
50
|
+
return unless options[:ai_service] != :fake
|
|
47
51
|
|
|
48
|
-
|
|
52
|
+
missing = MANDATORY_OPTIONS.select { |param| options[param].nil? }
|
|
53
|
+
raise OptionParser::MissingArgument, missing.join(', ') unless missing.empty?
|
|
49
54
|
end
|
|
50
55
|
|
|
51
56
|
def define_options(parser)
|
|
@@ -55,7 +60,9 @@ module Hyrum
|
|
|
55
60
|
format_options(parser)
|
|
56
61
|
message_options(parser)
|
|
57
62
|
message_key_options(parser)
|
|
63
|
+
number_options(parser)
|
|
58
64
|
ai_service_options(parser)
|
|
65
|
+
validation_options(parser)
|
|
59
66
|
on_tail_options(parser)
|
|
60
67
|
end
|
|
61
68
|
|
|
@@ -74,7 +81,7 @@ module Hyrum
|
|
|
74
81
|
def ai_service_options(parser)
|
|
75
82
|
options[:ai_service] = :fake
|
|
76
83
|
|
|
77
|
-
description = "AI service: one of #{Generators::AI_SERVICES.join(', ')}"
|
|
84
|
+
description = "AI service: one of #{Generators::AI_SERVICES.join(', ')} (default: fake)"
|
|
78
85
|
parser.on('-s SERVICE', '--service SERVICE', Generators::AI_SERVICES, description) do |service|
|
|
79
86
|
options[:ai_service] = service.to_sym
|
|
80
87
|
end
|
|
@@ -86,19 +93,23 @@ module Hyrum
|
|
|
86
93
|
end
|
|
87
94
|
|
|
88
95
|
def message_key_options(parser)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
parser.on('-k KEY', '--key KEY', 'Message key') do |key|
|
|
96
|
+
parser.on('-k KEY', '--key KEY', 'Message key (default: status)') do |key|
|
|
92
97
|
options[:key] = key.to_sym
|
|
93
98
|
end
|
|
94
99
|
end
|
|
95
100
|
|
|
96
101
|
def message_options(parser)
|
|
97
|
-
parser.on('-m MESSAGE', '--message MESSAGE', 'Status message') do |message|
|
|
102
|
+
parser.on('-m MESSAGE', '--message MESSAGE', 'Status message (required unless fake)') do |message|
|
|
98
103
|
options[:message] = message
|
|
99
104
|
end
|
|
100
105
|
end
|
|
101
106
|
|
|
107
|
+
def number_options(parser)
|
|
108
|
+
parser.on('-n NUMBER', '--number NUMBER', Integer, 'Number of messages to generate (default: 5)') do |number|
|
|
109
|
+
options[:number] = number.to_i
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
102
113
|
def verbosity_options(parser)
|
|
103
114
|
parser.on('-v', '--[no-]verbose', 'Run verbosely') do |v|
|
|
104
115
|
options[:verbose] = v
|
|
@@ -106,14 +117,30 @@ module Hyrum
|
|
|
106
117
|
end
|
|
107
118
|
|
|
108
119
|
def format_options(parser)
|
|
109
|
-
options[:format] = :text
|
|
110
|
-
|
|
111
120
|
formats = Formats::FORMATS
|
|
112
121
|
description = 'Output format. Supported formats are:'
|
|
113
122
|
supported = formats.join(', ')
|
|
114
|
-
parser.on('-f FORMAT', '--format FORMAT', formats, description, supported) do |format|
|
|
123
|
+
parser.on('-f FORMAT', '--format FORMAT', formats, description, supported, '(default: text)') do |format|
|
|
115
124
|
options[:format] = format
|
|
116
125
|
end
|
|
117
126
|
end
|
|
127
|
+
|
|
128
|
+
def validation_options(parser)
|
|
129
|
+
parser.on('--validate', 'Enable quality validation (default: off)') do
|
|
130
|
+
options[:validate] = true
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
parser.on('--min-quality SCORE', Integer, 'Minimum quality score 0-100 (default: 70)') do |score|
|
|
134
|
+
options[:min_quality] = score
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
parser.on('--strict', 'Fail on quality issues instead of warning (default: false)') do
|
|
138
|
+
options[:strict] = true
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
parser.on('--show-scores', 'Include quality metrics in output (default: false)') do
|
|
142
|
+
options[:show_scores] = true
|
|
143
|
+
end
|
|
144
|
+
end
|
|
118
145
|
end
|
|
119
146
|
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
5
|
+
module Hyrum
|
|
6
|
+
module Validators
|
|
7
|
+
# Scores how differently a set of message variations is worded, using the
# mean pairwise Jaccard distance between their word sets.
class LexicalDiversity
  attr_reader :variations

  # @param variations [Array<String>] the texts to compare with each other
  def initialize(variations)
    @variations = variations
  end

  # @return [Float] mean pairwise Jaccard distance as a percentage
  #   (0.0..100.0, rounded to two decimals); 0.0 when there are fewer than
  #   two variations, because there is nothing to compare
  def calculate
    return 0.0 if variations.size < 2

    # Tokenize each text once instead of once per pair.
    token_sets = variations.map { |text| tokenize(text) }
    distances = token_sets.combination(2).map { |left, right| jaccard_distance(left, right) }

    (distances.sum / distances.size * 100).round(2)
  end

  private

  # Lower-cased set of the \w+ runs in +text+ (punctuation and whitespace
  # are dropped, duplicates collapse).
  def tokenize(text)
    text.downcase.scan(/\w+/).to_set
  end

  # Jaccard distance = 1 - |intersection| / |union|.
  #
  # Two token-less sets are identical, so their distance is 0.0; this also
  # avoids dividing by an empty union.
  def jaccard_distance(set1, set2)
    union_size = (set1 | set2).size
    return 0.0 if union_size.zero?

    1.0 - ((set1 & set2).size.to_f / union_size)
  end
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Hyrum
|
|
4
|
+
module Validators
|
|
5
|
+
# Combines semantic similarity and lexical diversity into one quality
# verdict for a set of generated message variations.
class QualityValidator
  # Each metric contributes half of the overall score.
  DIVERSITY_WEIGHT = 0.5
  SIMILARITY_WEIGHT = 0.5
  # Per-metric floors (percent); falling below either fails the validation
  # and adds a warning.
  MIN_DIVERSITY_THRESHOLD = 30.0
  MIN_SIMILARITY_THRESHOLD = 85.0
  # Overall-score floor used when the options carry no :min_quality.
  DEFAULT_MIN_QUALITY = 70

  attr_reader :original_message, :messages, :options

  # @param original_message [String] the message the variations derive from
  # @param messages [Hash] message key => list of variations
  # @param options [Hash] reads :min_quality, :ai_service and :ai_model
  def initialize(original_message, messages, options)
    @original_message = original_message
    @messages = messages
    @options = options
  end

  # Scores all variations (flattened across keys).
  #
  # With zero or one variation there is nothing to compare: the result then
  # has all scores at 0.0, passed set to true, and a single explanatory
  # warning.
  #
  # @return [ValidationResult]
  def validate
    return empty_result if messages.empty?

    all_variations = messages.values.flatten
    return single_variation_result(all_variations.size) if all_variations.size <= 1

    semantic_score = calculate_semantic_similarity(all_variations)
    lexical_score = calculate_lexical_diversity(all_variations)
    overall_score = (semantic_score * SIMILARITY_WEIGHT) + (lexical_score * DIVERSITY_WEIGHT)

    ValidationResult.new(
      score: overall_score.round(2),
      semantic_similarity: semantic_score.round(2),
      lexical_diversity: lexical_score.round(2),
      passed: passing?(overall_score, semantic_score, lexical_score),
      details: {
        min_quality_threshold: min_quality,
        variation_count: all_variations.size
      },
      warnings: build_warnings(semantic_score, lexical_score)
    )
  end

  private

  # Overall-score floor; a missing or nil :min_quality falls back to the
  # default instead of failing the Float/nil comparison.
  def min_quality
    options[:min_quality] || DEFAULT_MIN_QUALITY
  end

  # True only when the overall score and both metrics reach their floors.
  def passing?(overall_score, semantic_score, lexical_score)
    overall_score >= min_quality &&
      lexical_score >= MIN_DIVERSITY_THRESHOLD &&
      semantic_score >= MIN_SIMILARITY_THRESHOLD
  end

  # Semantic similarity in percent. Any error is reported on stderr and
  # scored as 100.0, i.e. the variations are given the benefit of the doubt.
  def calculate_semantic_similarity(variations)
    SemanticSimilarity.new(
      original_message,
      variations,
      options[:ai_service],
      options[:ai_model]
    ).calculate
  rescue StandardError => e
    warn "Semantic similarity calculation failed: #{e.message}"
    100.0
  end

  # Lexical diversity in percent.
  def calculate_lexical_diversity(variations)
    LexicalDiversity.new(variations).calculate
  end

  # One warning per metric that fell below its floor.
  def build_warnings(semantic_score, lexical_score)
    warnings = []

    if lexical_score < MIN_DIVERSITY_THRESHOLD
      warnings << "Low lexical diversity (#{lexical_score.round(2)}%). Variations may be too similar."
    end

    if semantic_score < MIN_SIMILARITY_THRESHOLD
      warnings << "Low semantic similarity (#{semantic_score.round(2)}%). Variations may have different meanings."
    end

    warnings
  end

  # Result for an empty messages hash.
  def empty_result
    unscored_result(0, 'No variations to validate')
  end

  # Result when at most one variation exists across all keys.
  def single_variation_result(count = messages.values.flatten.size)
    unscored_result(count, 'Only one variation - nothing to compare')
  end

  # Passing result with zeroed scores and a single explanatory warning.
  def unscored_result(count, warning)
    ValidationResult.new(
      score: 0.0,
      semantic_similarity: 0.0,
      lexical_diversity: 0.0,
      passed: true,
      details: { variation_count: count },
      warnings: [warning]
    )
  end
end
|
|
106
|
+
end
|
|
107
|
+
end
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'matrix'
|
|
4
|
+
require 'set'
|
|
5
|
+
|
|
6
|
+
module Hyrum
|
|
7
|
+
module Validators
|
|
8
|
+
class SemanticSimilarity
|
|
9
|
+
attr_reader :original_message, :variations, :ai_service, :ai_model
|
|
10
|
+
|
|
11
|
+
def initialize(original_message, variations, ai_service, ai_model)
|
|
12
|
+
@original_message = original_message
|
|
13
|
+
@variations = variations
|
|
14
|
+
@ai_service = ai_service
|
|
15
|
+
@ai_model = ai_model
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def calculate
|
|
19
|
+
return 100.0 if variations.empty?
|
|
20
|
+
|
|
21
|
+
if supports_embeddings?
|
|
22
|
+
calculate_with_embeddings
|
|
23
|
+
else
|
|
24
|
+
calculate_with_fallback
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def supports_embeddings?
|
|
29
|
+
# Check if RubyLLM has any embedding models available in the current registry
|
|
30
|
+
# User is responsible for calling RubyLLM.models.refresh! if needed
|
|
31
|
+
RubyLLM.models.embedding_models.any?
|
|
32
|
+
rescue StandardError
|
|
33
|
+
# If we can't check the registry, assume embeddings aren't available
|
|
34
|
+
false
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
private
|
|
38
|
+
|
|
39
|
+
def calculate_with_embeddings
|
|
40
|
+
# Batch all texts together for efficient API call
|
|
41
|
+
all_texts = [original_message] + variations
|
|
42
|
+
all_embeddings = get_embeddings(all_texts)
|
|
43
|
+
|
|
44
|
+
# First embedding is the original, rest are variations
|
|
45
|
+
original_embedding = all_embeddings.first
|
|
46
|
+
variation_embeddings = all_embeddings[1..]
|
|
47
|
+
|
|
48
|
+
# Compare each variation to the original message
|
|
49
|
+
similarities = variation_embeddings.map do |var_embedding|
|
|
50
|
+
cosine_similarity(original_embedding, var_embedding)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Convert to percentage (0-100)
|
|
54
|
+
(similarities.sum / similarities.size * 100).round(2)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def calculate_with_fallback
|
|
58
|
+
# Simple word overlap heuristic when embeddings not available
|
|
59
|
+
original_words = original_message.downcase.scan(/\w+/).to_set
|
|
60
|
+
|
|
61
|
+
# Compare each variation to the original message
|
|
62
|
+
similarities = variations.map do |variation|
|
|
63
|
+
var_words = variation.downcase.scan(/\w+/).to_set
|
|
64
|
+
intersection = original_words.intersection(var_words).size.to_f
|
|
65
|
+
union = original_words.union(var_words).size.to_f
|
|
66
|
+
union.zero? ? 1.0 : intersection / union
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
(similarities.sum / similarities.size * 100).round(2)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def get_embeddings(texts)
|
|
73
|
+
# Use RubyLLM.embed with user's configured default embedding model
|
|
74
|
+
# Works with any provider (OpenAI, Google, Anthropic, etc.)
|
|
75
|
+
result = RubyLLM.embed(texts)
|
|
76
|
+
|
|
77
|
+
# RubyLLM.embed returns a single result with vectors array
|
|
78
|
+
result.vectors
|
|
79
|
+
rescue RubyLLM::Error => e
|
|
80
|
+
# Fall back to heuristic if embedding fails
|
|
81
|
+
warn "Embedding API failed: #{e.message}. Using fallback heuristic."
|
|
82
|
+
raise # Re-raise to trigger fallback in calculate method
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def cosine_similarity(vec1, vec2)
|
|
86
|
+
# Calculate cosine similarity between two vectors
|
|
87
|
+
v1 = Vector.elements(vec1)
|
|
88
|
+
v2 = Vector.elements(vec2)
|
|
89
|
+
|
|
90
|
+
dot_product = v1.inner_product(v2)
|
|
91
|
+
magnitude1 = Math.sqrt(v1.inner_product(v1))
|
|
92
|
+
magnitude2 = Math.sqrt(v2.inner_product(v2))
|
|
93
|
+
|
|
94
|
+
return 0.0 if magnitude1.zero? || magnitude2.zero?
|
|
95
|
+
|
|
96
|
+
dot_product / (magnitude1 * magnitude2)
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Hyrum
|
|
4
|
+
module Validators
|
|
5
|
+
# Value object carrying the outcome of a quality validation: the overall
# score, the two component metrics, the pass/fail verdict, free-form details
# and any warnings.
class ValidationResult < Dry::Struct
  # Numeric results; inputs are coerced to Float.
  attribute :score, Types::Coercible::Float
  attribute :semantic_similarity, Types::Coercible::Float
  attribute :lexical_diversity, Types::Coercible::Float

  # Verdict, plus optional context that defaults to empty containers.
  attribute :passed, Types::Bool
  attribute :details, Types::Hash.default({}.freeze)
  attribute :warnings, Types::Array.of(Types::String).default([].freeze)

  # @return [Boolean] the verdict stored in +passed+
  def passed?
    passed
  end

  # @return [Boolean] the negation of +passed+
  def failed?
    !passed?
  end
end
|
|
21
|
+
end
|
|
22
|
+
end
|
data/lib/hyrum/version.rb
CHANGED