qualspec 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.qualspec_cassettes/comparison_test.yml +439 -0
- data/.qualspec_cassettes/quick_test.yml +232 -0
- data/.rspec +3 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +70 -0
- data/CHANGELOG.md +16 -0
- data/README.md +84 -0
- data/Rakefile +8 -0
- data/docs/configuration.md +132 -0
- data/docs/evaluation-suites.md +180 -0
- data/docs/getting-started.md +102 -0
- data/docs/recording.md +196 -0
- data/docs/rspec-integration.md +233 -0
- data/docs/rubrics.md +174 -0
- data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml +64 -0
- data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +74 -0
- data/examples/cassettes/qualspec_rspec_integration_score_matchers_supports_score_comparisons.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml +67 -0
- data/examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml +67 -0
- data/examples/comparison.rb +22 -0
- data/examples/model_comparison.rb +38 -0
- data/examples/persona_test.rb +49 -0
- data/examples/quick_test.rb +28 -0
- data/examples/report.html +399 -0
- data/examples/rspec_example_spec.rb +153 -0
- data/exe/qualspec +142 -0
- data/lib/qualspec/builtin_rubrics.rb +83 -0
- data/lib/qualspec/client.rb +127 -0
- data/lib/qualspec/configuration.rb +32 -0
- data/lib/qualspec/evaluation.rb +52 -0
- data/lib/qualspec/judge.rb +217 -0
- data/lib/qualspec/recorder.rb +55 -0
- data/lib/qualspec/rspec/configuration.rb +49 -0
- data/lib/qualspec/rspec/evaluation_result.rb +142 -0
- data/lib/qualspec/rspec/helpers.rb +155 -0
- data/lib/qualspec/rspec/matchers.rb +163 -0
- data/lib/qualspec/rspec.rb +66 -0
- data/lib/qualspec/rubric.rb +43 -0
- data/lib/qualspec/suite/behavior.rb +43 -0
- data/lib/qualspec/suite/builtin_behaviors.rb +84 -0
- data/lib/qualspec/suite/candidate.rb +30 -0
- data/lib/qualspec/suite/dsl.rb +64 -0
- data/lib/qualspec/suite/html_reporter.rb +673 -0
- data/lib/qualspec/suite/reporter.rb +274 -0
- data/lib/qualspec/suite/runner.rb +261 -0
- data/lib/qualspec/suite/scenario.rb +57 -0
- data/lib/qualspec/version.rb +5 -0
- data/lib/qualspec.rb +103 -0
- data/sig/qualspec.rbs +4 -0
- metadata +142 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Qualspec
|
|
4
|
+
# Thin wrapper around the optional VCR gem, used to record and replay the
# HTTP traffic that goes through Faraday.
#
# VCR is required lazily so that it only has to be installed when a
# recording/playback feature is actually used.
class Recorder
  class << self
    # Reports whether the VCR gem can be loaded.
    #
    # @return [Boolean] true when `require 'vcr'` succeeds, false on LoadError
    def available?
      require 'vcr'
      true
    rescue LoadError
      false
    end

    # Configures VCR: cassette directory, Faraday hook, default cassette
    # options and API-key filtering.
    #
    # May be called repeatedly (the RSpec helper does so for every cassette).
    # Every call refreshes the cassette directory and default options, but the
    # sensitive-data filter is registered only on the first successful setup
    # so identical filter hooks do not pile up inside VCR.
    #
    # @param cassette_dir [String] directory where cassettes are stored
    # @return [true]
    # @raise [Qualspec::Error] when the VCR gem is not installed
    def setup(cassette_dir: '.qualspec_cassettes')
      require_vcr!

      first_setup = !configured?

      VCR.configure do |config|
        config.cassette_library_dir = cassette_dir
        config.hook_into :faraday
        config.default_cassette_options = {
          record: :new_episodes,
          match_requests_on: %i[method uri body]
        }
        # Filter out API keys. The block is evaluated by VCR when it needs the
        # value, so the key configured at that time is the one being replaced.
        config.filter_sensitive_data('<API_KEY>') { Qualspec.configuration.api_key } if first_setup
      end

      @configured = true
    end

    # @return [Boolean] true once {setup} has completed at least once
    def configured?
      @configured == true
    end

    # Runs the block inside the named cassette using the default cassette
    # options, configuring VCR with defaults first if that has not happened.
    #
    # @param name [String] cassette name
    # @return whatever `VCR.use_cassette` returns
    def record(name, &block)
      setup unless configured?
      VCR.use_cassette(name, &block)
    end

    # Same as {record}, but passes `record: :none` to the cassette.
    #
    # @param name [String] cassette name
    # @return whatever `VCR.use_cassette` returns
    def playback(name, &block)
      setup unless configured?
      VCR.use_cassette(name, record: :none, &block)
    end

    private

    # Loads VCR, turning a missing gem into a Qualspec::Error that tells the
    # user how to install it.
    def require_vcr!
      require 'vcr'
    rescue LoadError
      raise Qualspec::Error, <<~MSG.strip
        VCR gem is required for recording/playback features.
        Add to your Gemfile: gem 'vcr'
      MSG
    end
  end
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Qualspec
|
|
4
|
+
module RSpec
  # Settings that apply only to the RSpec integration.
  #
  # @example Configure in spec_helper.rb
  #   Qualspec::RSpec.configure do |config|
  #     config.default_threshold = 7
  #     config.vcr_cassette_dir = "spec/cassettes/qualspec"
  #     config.record_mode = :new_episodes
  #   end
  #
  class Configuration
    # default_threshold - default pass threshold for evaluations (0-10)
    # vcr_cassette_dir  - directory for VCR cassettes
    # record_mode       - VCR recording mode (:new_episodes, :none, :all, :once)
    # load_builtins     - whether to load builtin rubrics automatically
    attr_accessor :default_threshold, :vcr_cassette_dir, :record_mode, :load_builtins

    # Starts every setting at its documented default.
    def initialize
      self.default_threshold = 7
      self.vcr_cassette_dir = 'spec/cassettes/qualspec'
      self.record_mode = :new_episodes
      self.load_builtins = true
    end
  end

  class << self
    # The shared Configuration instance, built on first access.
    #
    # @return [Configuration]
    def configuration
      @configuration ||= Configuration.new
    end

    # Yields the shared configuration so callers can adjust it.
    #
    # @yieldparam config [Configuration]
    # @return the value of the block
    def configure
      yield configuration
    end

    # Drops the shared configuration; the next call to {configuration}
    # builds a fresh one with default values.
    #
    # @return [nil]
    def reset_configuration!
      @configuration = nil
    end
  end
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Qualspec
|
|
4
|
+
module RSpec
|
|
5
|
+
# Wrapper around Qualspec::Evaluation with RSpec-friendly interface
|
|
6
|
+
# Provides clean methods and good failure message formatting
|
|
7
|
+
class EvaluationResult
  attr_reader :evaluation, :criterion, :response, :threshold

  # @param evaluation [Object] the wrapped evaluation; this class calls
  #   `pass?`, `score`, `reasoning`, `error` and `scenario_winner` on it
  # @param criterion [String] criterion text shown in messages
  # @param response [Object] the evaluated response (rendered with `to_s`)
  # @param threshold [Integer, nil] pass threshold shown in messages; falls
  #   back to `Qualspec::RSpec.configuration.default_threshold` when nil
  def initialize(evaluation, criterion:, response:, threshold: nil)
    @evaluation = evaluation
    @criterion = criterion
    @response = response
    @threshold = threshold || Qualspec::RSpec.configuration.default_threshold
  end

  # Core result methods

  # Pass/fail is decided by the wrapped evaluation; `threshold` is only
  # used for display here.
  def passing?
    evaluation.pass?
  end

  def failing?
    !passing?
  end

  def score
    evaluation.score
  end

  def reasoning
    evaluation.reasoning
  end

  # @return [Boolean] true when the wrapped evaluation carries an error
  def error?
    !evaluation.error.nil?
  end

  def error
    evaluation.error
  end

  # Comparison support

  # @return [Boolean] true when the evaluation is flagged as the scenario winner
  def winner?
    evaluation.scenario_winner == true
  end

  # @return [Boolean] true when the evaluation is flagged as a scenario tie
  def tie?
    evaluation.scenario_winner == :tie
  end

  # Nice output for RSpec failure messages.
  #
  # The response preview is cut to 200 characters and only gets a trailing
  # "..." when something was actually cut off.
  def inspect
    status = passing? ? 'PASS' : 'FAIL'
    lines = [
      "#<Qualspec::RSpec::EvaluationResult #{status}>",
      " Criterion: #{criterion}",
      " Score: #{score}/10 (threshold: #{threshold})",
      " Reasoning: #{reasoning}"
    ]
    lines << " Error: #{error}" if error?
    lines << " Response (first 200 chars): #{response_preview(200)}"
    lines.join("\n")
  end

  # RSpec failure message formatting
  def failure_message
    <<~MSG
      Expected response to pass qualspec evaluation, but it failed.

      Criterion: #{criterion}
      Score: #{score}/10 (needed #{threshold} to pass)
      Reasoning: #{reasoning}
      #{"Error: #{error}" if error?}
      Response preview: #{response_preview(300)}
    MSG
  end

  def failure_message_when_negated
    <<~MSG
      Expected response to fail qualspec evaluation, but it passed.

      Criterion: #{criterion}
      Score: #{score}/10 (threshold: #{threshold})
      Reasoning: #{reasoning}
    MSG
  end

  private

  # First +limit+ characters of the response, followed by "..." only when
  # the response is longer than +limit+.
  def response_preview(limit)
    text = response.to_s
    text.length > limit ? "#{text[0, limit]}..." : text
  end
end
|
|
88
|
+
|
|
89
|
+
# Result for comparative evaluations
|
|
90
|
+
class ComparisonResult
  attr_reader :results, :winner, :criterion

  # @param results [Hash] name => per-candidate result; each value must
  #   respond to `score`, `reasoning`, `winner?` and `tie?`
  # @param criterion [String] criterion text used for display
  def initialize(results, criterion:)
    @results = results
    @criterion = criterion
    @winner = determine_winner
  end

  # Looks up one candidate's result; the name is converted with `to_sym`.
  def [](name)
    results[name.to_sym]
  end

  # @return [Boolean] true when no single candidate won
  def tie?
    winner == :tie
  end

  # @return [Boolean] true when +name+ matches the winner (compared as strings)
  def winner?(name)
    name.to_s == winner.to_s
  end

  # @return [Hash] name => score
  def scores
    results.transform_values(&:score)
  end

  # Multi-line summary; the winning row is marked with "*".
  def inspect
    header = [
      '#<Qualspec::RSpec::ComparisonResult>',
      " Criterion: #{criterion}",
      " Winner: #{winner}"
    ]
    rows = results.map do |name, result|
      " #{winner?(name) ? '*' : ' '} #{name}: #{result.score}/10 - #{result.reasoning}"
    end
    (header + rows).join("\n")
  end

  private

  # The first result (in hash order) that carries a winner or tie flag
  # decides the outcome. Without any flag, the single highest score wins
  # and a shared top score (or an empty result set) is a tie.
  def determine_winner
    flagged = results.find { |_, result| result.winner? || result.tie? }
    if flagged
      name, result = flagged
      return result.winner? ? name : :tie
    end

    top_score = results.values.map(&:score).max
    leaders = results.select { |_, result| result.score == top_score }.keys
    leaders.size == 1 ? leaders.first : :tie
  end
end
|
|
141
|
+
end
|
|
142
|
+
end
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'evaluation_result'
|
|
4
|
+
|
|
5
|
+
module Qualspec
|
|
6
|
+
module RSpec
|
|
7
|
+
# Helper methods for RSpec tests
|
|
8
|
+
# Include this in your spec_helper.rb:
|
|
9
|
+
#
|
|
10
|
+
# RSpec.configure do |config|
|
|
11
|
+
# config.include Qualspec::RSpec::Helpers
|
|
12
|
+
# end
|
|
13
|
+
#
|
|
14
|
+
module Helpers
  # Evaluate a response against a criterion or rubric.
  #
  # When both +criterion+ and +rubric:+ are given, the rubric is used and the
  # inline criterion is ignored.
  #
  # @param response [String] The response to evaluate (sent to the judge via `to_s`)
  # @param criterion [String, nil] The evaluation criterion (optional if using rubric)
  # @param rubric [Symbol, String, #criteria, nil] A pre-defined rubric name,
  #   or an object that responds to `criteria`
  # @param context [String, nil] Additional context for the judge
  # @param threshold [Integer, nil] Pass threshold (default:
  #   `Qualspec::RSpec.configuration.default_threshold`)
  # @return [EvaluationResult]
  # @raise [ArgumentError] when neither a criterion nor a rubric is given
  #
  # @example With inline criterion
  #   result = qualspec_evaluate(response, "responds in a friendly manner")
  #   expect(result).to be_passing
  #
  # @example With rubric
  #   result = qualspec_evaluate(response, rubric: :tool_calling)
  #   expect(result.score).to be >= 8
  #
  # @example With context
  #   result = qualspec_evaluate(response, "summarizes accurately",
  #                              context: "User provided a news article")
  #   expect(result).to be_passing
  #
  def qualspec_evaluate(response, criterion = nil, rubric: nil, context: nil, threshold: nil)
    raise ArgumentError, 'Must provide either criterion or rubric:' if criterion.nil? && rubric.nil?

    effective_threshold = threshold || Qualspec::RSpec.configuration.default_threshold
    criterion_text, display_criterion = resolve_criterion(criterion, rubric)

    evaluation = Qualspec.judge.evaluate(
      response: response.to_s,
      criterion: criterion_text,
      context: context,
      pass_threshold: effective_threshold
    )

    EvaluationResult.new(
      evaluation,
      criterion: display_criterion,
      response: response,
      threshold: effective_threshold
    )
  end

  # Compare multiple responses against criteria
  #
  # @param responses [Hash] Hash of name => response
  # @param criterion [String, Array<String>] Evaluation criteria
  # @param context [String, nil] Additional context
  # @param threshold [Integer, nil] Pass threshold (default:
  #   `Qualspec::RSpec.configuration.default_threshold`)
  # @return [ComparisonResult]
  #
  # @example
  #   responses = {
  #     gpt4: gpt4_response,
  #     claude: claude_response
  #   }
  #   result = qualspec_compare(responses, "explains clearly")
  #   expect(result.winner).to eq(:claude)
  #   expect(result[:gpt4].score).to be >= 7
  #
  def qualspec_compare(responses, criterion, context: nil, threshold: nil)
    effective_threshold = threshold || Qualspec::RSpec.configuration.default_threshold
    criteria_list = Array(criterion)
    display_criterion = criteria_list.join('; ')

    evaluations = Qualspec.judge.evaluate_comparison(
      responses: responses.transform_values(&:to_s),
      criteria: criteria_list,
      context: context,
      pass_threshold: effective_threshold
    )

    results = wrap_comparison_results(evaluations, responses, display_criterion, effective_threshold)
    ComparisonResult.new(results, criterion: display_criterion)
  end

  # Wrap a block with VCR cassette recording/playback
  #
  # @param name [String] Cassette name
  # @param record [Symbol, nil] Recording mode (:new_episodes, :none, :all);
  #   defaults to `Qualspec::RSpec.configuration.record_mode`
  # @yield Block to execute with cassette
  #
  # @example
  #   with_qualspec_cassette("my_test") do
  #     result = qualspec_evaluate(response, "is helpful")
  #     expect(result).to be_passing
  #   end
  #
  def with_qualspec_cassette(name, record: nil, &block)
    record_mode = record || Qualspec::RSpec.configuration.record_mode

    # Configure VCR with RSpec cassette directory
    Qualspec::Recorder.setup(
      cassette_dir: Qualspec::RSpec.configuration.vcr_cassette_dir
    )

    # One call covers every mode: for :none this is exactly what
    # Qualspec::Recorder.playback does once setup has run.
    VCR.use_cassette(name, record: record_mode, &block)
  end

  # Helper to skip test if qualspec API is unavailable.
  #
  # Sends a minimal chat request to the configured judge model and skips the
  # current example when the client raises a RequestError.
  def skip_without_qualspec_api
    Qualspec.client.chat(
      model: Qualspec.configuration.judge_model,
      messages: [{ role: 'user', content: 'test' }],
      json_mode: false
    )
  rescue Qualspec::Client::RequestError => e
    skip "Qualspec API unavailable: #{e.message}"
  end

  private

  # Builds the [judge_text, display_text] pair for an evaluation.
  #
  # A rubric takes precedence over the inline criterion. Anything that does
  # not itself respond to `criteria` is treated as a rubric name and looked
  # up through Qualspec::Rubric.find, so Symbol and String names both work.
  def resolve_criterion(criterion, rubric)
    return [criterion, criterion] unless rubric

    rubric_obj = rubric.respond_to?(:criteria) ? rubric : Qualspec::Rubric.find(rubric)
    criteria = rubric_obj.criteria
    [criteria.join("\n"), criteria.join('; ')]
  end

  # Turns the judge's name => evaluation hash into name => EvaluationResult,
  # with names normalised to symbols. The original response is looked up by
  # symbol first and by string second, so either key style works.
  def wrap_comparison_results(evaluations, responses, display_criterion, threshold)
    evaluations.transform_keys(&:to_sym).each_with_object({}) do |(name, eval), hash|
      hash[name] = EvaluationResult.new(
        eval,
        criterion: display_criterion,
        response: responses[name] || responses[name.to_s],
        threshold: threshold
      )
    end
  end
end
|
|
154
|
+
end
|
|
155
|
+
end
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'rspec/expectations'
|
|
4
|
+
|
|
5
|
+
module Qualspec
|
|
6
|
+
module RSpec
|
|
7
|
+
# Custom RSpec matchers for qualspec evaluations
|
|
8
|
+
#
|
|
9
|
+
# These matchers work with EvaluationResult objects returned by qualspec_evaluate
|
|
10
|
+
#
|
|
11
|
+
# @example
|
|
12
|
+
# result = qualspec_evaluate(response, "is helpful")
|
|
13
|
+
# expect(result).to be_passing
|
|
14
|
+
# expect(result).to have_score_above(7)
|
|
15
|
+
#
|
|
16
|
+
module Matchers
  extend ::RSpec::Matchers::DSL

  # Text for the "actual score" part of a failure message.
  #
  # The score matchers fail (rather than raise) for objects without a
  # `score`, so their messages must not call `score` blindly either.
  #
  # @api private
  def self.actual_score(result)
    return result.score if result.respond_to?(:score)

    "#{result.inspect} (which does not respond to #score)"
  end

  # Text for the "actual winner" part of a failure message; tolerates
  # objects that have `winner?`/`tie?` but no `winner` reader.
  #
  # @api private
  def self.actual_winner(result)
    return result.winner if result.respond_to?(:winner)

    "#{result.inspect} (which does not respond to #winner)"
  end

  # Matcher for checking if evaluation passed
  #
  # @example
  #   expect(result).to be_passing
  #
  matcher :be_passing do
    match do |result|
      result.respond_to?(:passing?) && result.passing?
    end

    failure_message do |result|
      if result.respond_to?(:failure_message)
        result.failure_message
      else
        "expected #{result.inspect} to be passing"
      end
    end

    failure_message_when_negated do |result|
      if result.respond_to?(:failure_message_when_negated)
        result.failure_message_when_negated
      else
        "expected #{result.inspect} not to be passing"
      end
    end
  end

  # Matcher for checking if evaluation failed
  #
  # @example
  #   expect(result).to be_failing
  #
  matcher :be_failing do
    match do |result|
      result.respond_to?(:failing?) && result.failing?
    end

    failure_message do |result|
      if result.respond_to?(:score)
        "expected evaluation to fail, but it passed with score #{result.score}/10"
      else
        "expected #{result.inspect} to be failing"
      end
    end

    failure_message_when_negated do |result|
      if result.respond_to?(:score)
        "expected evaluation not to fail, but it failed with score #{result.score}/10"
      else
        "expected #{result.inspect} not to be failing"
      end
    end
  end

  # Matcher for checking exact score
  #
  # @example
  #   expect(result).to have_score(10)
  #
  matcher :have_score do |expected|
    match do |result|
      result.respond_to?(:score) && result.score == expected
    end

    failure_message do |result|
      "expected score to be #{expected}, but was #{Matchers.actual_score(result)}"
    end
  end

  # Matcher for checking score is above threshold
  #
  # @example
  #   expect(result).to have_score_above(7)
  #
  matcher :have_score_above do |threshold|
    match do |result|
      result.respond_to?(:score) && result.score > threshold
    end

    failure_message do |result|
      "expected score > #{threshold}, but was #{Matchers.actual_score(result)}"
    end
  end

  # Matcher for checking score is at or above threshold
  #
  # @example
  #   expect(result).to have_score_at_least(7)
  #
  matcher :have_score_at_least do |threshold|
    match do |result|
      result.respond_to?(:score) && result.score >= threshold
    end

    failure_message do |result|
      "expected score >= #{threshold}, but was #{Matchers.actual_score(result)}"
    end
  end

  # Matcher for checking score is below threshold
  #
  # @example
  #   expect(result).to have_score_below(5)
  #
  matcher :have_score_below do |threshold|
    match do |result|
      result.respond_to?(:score) && result.score < threshold
    end

    failure_message do |result|
      "expected score < #{threshold}, but was #{Matchers.actual_score(result)}"
    end
  end

  # Matcher for comparison winner
  #
  # @example
  #   expect(comparison).to have_winner(:claude)
  #
  matcher :have_winner do |expected_winner|
    match do |result|
      result.respond_to?(:winner?) && result.winner?(expected_winner)
    end

    failure_message do |result|
      "expected winner to be #{expected_winner}, but was #{Matchers.actual_winner(result)}"
    end
  end

  # Matcher for comparison tie
  #
  # @example
  #   expect(comparison).to be_a_tie
  #
  matcher :be_a_tie do
    match do |result|
      result.respond_to?(:tie?) && result.tie?
    end

    failure_message do |result|
      "expected a tie, but winner was #{Matchers.actual_winner(result)}"
    end
  end
end
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
# Make the matchers available in every example group, but only when RSpec
# itself is loaded and exposes `configure`.
if defined?(::RSpec) && ::RSpec.respond_to?(:configure)
  ::RSpec.configure { |config| config.include(Qualspec::RSpec::Matchers) }
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# RSpec integration for Qualspec
|
|
4
|
+
#
|
|
5
|
+
# Add to your spec_helper.rb:
|
|
6
|
+
#
|
|
7
|
+
# require "qualspec/rspec"
|
|
8
|
+
#
|
|
9
|
+
# RSpec.configure do |config|
|
|
10
|
+
# config.include Qualspec::RSpec::Helpers
|
|
11
|
+
# end
|
|
12
|
+
#
|
|
13
|
+
# # Optional: Configure qualspec-specific settings
|
|
14
|
+
# Qualspec::RSpec.configure do |config|
|
|
15
|
+
# config.default_threshold = 7
|
|
16
|
+
# config.vcr_cassette_dir = "spec/cassettes/qualspec"
|
|
17
|
+
# end
|
|
18
|
+
#
|
|
19
|
+
# Then in your specs:
|
|
20
|
+
#
|
|
21
|
+
# describe "MyAgent" do
|
|
22
|
+
# it "responds helpfully" do
|
|
23
|
+
# response = my_agent.call("Hello")
|
|
24
|
+
#
|
|
25
|
+
# result = qualspec_evaluate(response, "responds in a friendly manner")
|
|
26
|
+
# expect(result).to be_passing
|
|
27
|
+
# expect(result.score).to be >= 8
|
|
28
|
+
# end
|
|
29
|
+
#
|
|
30
|
+
# it "uses tools correctly" do
|
|
31
|
+
# response = my_agent.call("What's the weather?")
|
|
32
|
+
#
|
|
33
|
+
# result = qualspec_evaluate(response, rubric: :tool_calling)
|
|
34
|
+
# expect(result).to be_passing
|
|
35
|
+
# end
|
|
36
|
+
# end
|
|
37
|
+
#
|
|
38
|
+
|
|
39
|
+
require 'qualspec'
|
|
40
|
+
require_relative 'rspec/configuration'
|
|
41
|
+
require_relative 'rspec/evaluation_result'
|
|
42
|
+
require_relative 'rspec/helpers'
|
|
43
|
+
require_relative 'rspec/matchers'
|
|
44
|
+
|
|
45
|
+
module Qualspec
|
|
46
|
+
module RSpec
  class << self
    # Wire Qualspec into RSpec with sensible defaults.
    #
    # Loads the builtin rubrics when `configuration.load_builtins` is truthy.
    # Then, if RSpec is loaded and exposes `configure`, includes
    # Qualspec::RSpec::Helpers into RSpec; otherwise does nothing more and
    # returns nil.
    #
    # @example
    #   Qualspec::RSpec.setup!
    #
    def setup!
      Qualspec::BuiltinRubrics.load! if configuration.load_builtins

      if defined?(::RSpec) && ::RSpec.respond_to?(:configure)
        ::RSpec.configure { |config| config.include(Qualspec::RSpec::Helpers) }
      end
    end
  end
end
|
|
66
|
+
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Qualspec
|
|
4
|
+
# A named list of evaluation criteria, built with a small block DSL and
# kept in a class-level registry keyed by symbol.
class Rubric
  attr_reader :name, :criteria

  # @param name [Symbol, String] rubric name, stored as given
  # @yield optional DSL block, evaluated on the new rubric so it can call
  #   {#criterion}
  def initialize(name, &block)
    @name = name
    @criteria = []
    instance_eval(&block) if block # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
  end

  # Appends one criterion description (DSL entry point).
  #
  # @return [Array] the criteria collected so far
  def criterion(description)
    criteria.push(description)
  end

  # @return [String] all criteria, one per line
  def to_s
    criteria.join("\n")
  end

  class << self
    # @return [Hash] registered rubrics, keyed by symbol
    def registry
      @registry ||= {}
    end

    # Builds a rubric from the block and registers it under `name.to_sym`,
    # replacing any rubric already stored under that key.
    #
    # @return [Rubric] the new rubric
    def define(name, &block)
      rubric = new(name, &block)
      registry[name.to_sym] = rubric
    end

    # @return [Rubric] the rubric registered under `name.to_sym`
    # @raise [Error] when nothing is registered under that name
    def find(name)
      rubric = registry[name.to_sym]
      return rubric if rubric

      raise Error, "Rubric '#{name}' not found"
    end

    # @return [Boolean] whether a rubric is registered under `name.to_sym`
    def defined?(name)
      registry.key?(name.to_sym)
    end

    # Empties the registry.
    #
    # @return [Hash] the new, empty registry
    def clear!
      @registry = {}
    end
  end
end
|
|
43
|
+
end
|