qualspec 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.qualspec_cassettes/comparison_test.yml +439 -0
- data/.qualspec_cassettes/quick_test.yml +232 -0
- data/.rspec +3 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +70 -0
- data/CHANGELOG.md +16 -0
- data/README.md +84 -0
- data/Rakefile +8 -0
- data/docs/configuration.md +132 -0
- data/docs/evaluation-suites.md +180 -0
- data/docs/getting-started.md +102 -0
- data/docs/recording.md +196 -0
- data/docs/rspec-integration.md +233 -0
- data/docs/rubrics.md +174 -0
- data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml +64 -0
- data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +74 -0
- data/examples/cassettes/qualspec_rspec_integration_score_matchers_supports_score_comparisons.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml +67 -0
- data/examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml +67 -0
- data/examples/comparison.rb +22 -0
- data/examples/model_comparison.rb +38 -0
- data/examples/persona_test.rb +49 -0
- data/examples/quick_test.rb +28 -0
- data/examples/report.html +399 -0
- data/examples/rspec_example_spec.rb +153 -0
- data/exe/qualspec +142 -0
- data/lib/qualspec/builtin_rubrics.rb +83 -0
- data/lib/qualspec/client.rb +127 -0
- data/lib/qualspec/configuration.rb +32 -0
- data/lib/qualspec/evaluation.rb +52 -0
- data/lib/qualspec/judge.rb +217 -0
- data/lib/qualspec/recorder.rb +55 -0
- data/lib/qualspec/rspec/configuration.rb +49 -0
- data/lib/qualspec/rspec/evaluation_result.rb +142 -0
- data/lib/qualspec/rspec/helpers.rb +155 -0
- data/lib/qualspec/rspec/matchers.rb +163 -0
- data/lib/qualspec/rspec.rb +66 -0
- data/lib/qualspec/rubric.rb +43 -0
- data/lib/qualspec/suite/behavior.rb +43 -0
- data/lib/qualspec/suite/builtin_behaviors.rb +84 -0
- data/lib/qualspec/suite/candidate.rb +30 -0
- data/lib/qualspec/suite/dsl.rb +64 -0
- data/lib/qualspec/suite/html_reporter.rb +673 -0
- data/lib/qualspec/suite/reporter.rb +274 -0
- data/lib/qualspec/suite/runner.rb +261 -0
- data/lib/qualspec/suite/scenario.rb +57 -0
- data/lib/qualspec/version.rb +5 -0
- data/lib/qualspec.rb +103 -0
- data/sig/qualspec.rbs +4 -0
- metadata +142 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'json'

module Qualspec
  module Suite
    # Renders a completed Suite::Results as a human-readable console report
    # or as JSON.
    #
    # The console report is assembled from independent sections (banner,
    # summary table, timing, per-scenario breakdown, optional raw responses,
    # winner announcement). Sections with no data return nil and are dropped
    # by #to_stdout.
    class Reporter
      # @param results [Suite::Results] results of a finished suite run
      # @param show_responses [Boolean] include raw candidate responses
      def initialize(results, show_responses: false)
        @results = results
        @show_responses = show_responses
      end

      # Build the full plain-text report.
      # @return [String]
      def to_stdout
        output = []
        output << header
        output << ''
        output << summary_table
        output << ''
        output << timing_section if timing?
        output << ''
        output << scenario_breakdown
        output << ''
        output << responses_section if @show_responses
        output << winner_announcement

        # Sections may be nil (e.g. no timing recorded); drop them.
        output.compact.join("\n")
      end

      # Serialize the results hash as JSON.
      # @param pretty [Boolean] pretty-print with indentation when true
      # @return [String]
      def to_json(pretty: true)
        if pretty
          JSON.pretty_generate(@results.to_h)
        else
          JSON.generate(@results.to_h)
        end
      end

      # Write the (pretty) JSON report to +path+.
      def write_json(path)
        File.write(path, to_json)
      end

      private

      # Banner with the suite name centered between 60-char rules.
      def header
        lines = []
        lines << '=' * 60
        lines << @results.suite_name.center(60)
        lines << '=' * 60
        lines.join("\n")
      end

      # Markdown table of per-candidate aggregates, best average score first.
      def summary_table
        scores = @results.scores_by_candidate
        return 'No results' if scores.empty?

        candidates = scores.keys
        max_name = [candidates.map(&:length).max, 10].max

        # Count scenario wins
        wins = count_wins

        lines = []
        lines << '## Summary'
        lines << ''

        # FIX: header cells are padded to the same widths as the data rows
        # ('Wins' right-justified to 5; pass-rate data right-justified to 9
        # below) so that header, separator and data rows all line up.
        header = "| #{'Candidate'.ljust(max_name)} | Score | #{'Wins'.rjust(5)} | Pass Rate |"
        lines << header
        lines << "|#{'-' * (max_name + 2)}|-------|-------|-----------|"

        sorted = scores.sort_by { |_, v| -v[:avg_score] }

        sorted.each do |candidate, stats|
          score = stats[:avg_score].to_s.rjust(5)
          win_count = (wins[candidate] || 0).to_s.rjust(5)
          pass_rate = "#{stats[:pass_rate]}%".rjust(9)

          lines << "| #{candidate.ljust(max_name)} | #{score} | #{win_count} | #{pass_rate} |"
        end

        lines.join("\n")
      end

      # Tally how many scenarios each candidate won outright
      # (evaluations whose :winner flag is exactly true).
      def count_wins
        wins = Hash.new(0)
        @results.evaluations.each do |eval|
          wins[eval[:candidate]] += 1 if eval[:winner] == true
        end
        wins
      end

      # Was any per-scenario timing recorded at all?
      def timing?
        !@results.timing.empty?
      end

      # Average/total latency per candidate, fastest first, with the
      # accumulated cost appended when one was recorded.
      def timing_section
        timing = @results.timing_by_candidate
        return nil if timing.empty?

        costs = @results.costs

        lines = []
        lines << '## Performance'
        lines << ''

        sorted = timing.sort_by { |_, v| v[:avg_ms] }

        sorted.each do |candidate, stats|
          line = " #{candidate}: #{format_duration(stats[:avg_ms])} avg"
          line += " (#{format_duration(stats[:total_ms])} total)"

          line += " - $#{format_cost(costs[candidate])}" if costs[candidate]&.positive?

          lines << line
        end

        lines.join("\n")
      end

      # Render a millisecond count as "NNNms", or "N.NNs" at 1s and above.
      def format_duration(milliseconds)
        if milliseconds >= 1000
          "#{(milliseconds / 1000.0).round(2)}s"
        else
          "#{milliseconds}ms"
        end
      end

      # Render a dollar amount; four decimals for sub-cent values.
      def format_cost(cost)
        if cost < 0.01
          format('%.4f', cost)
        else
          format('%.2f', cost)
        end
      end

      # Per-scenario scores for every candidate, with a bar visualization,
      # optional timing annotation and a winner/tie marker.
      def scenario_breakdown
        by_scenario = @results.scores_by_scenario
        return '' if by_scenario.empty?

        candidates = @results.scores_by_candidate.keys

        lines = []
        lines << '## By Scenario'
        lines << ''

        by_scenario.each do |scenario, candidate_scores|
          # Find winner for this scenario
          winner = find_scenario_winner(scenario)
          winner_label = if winner == :tie
                           ' [TIE]'
                         else
                           winner ? " [Winner: #{winner}]" : ''
                         end

          lines << "### #{scenario}#{winner_label}"

          candidates.each do |candidate|
            stats = candidate_scores[candidate]
            next unless stats

            score_bar = score_visualization(stats[:score])
            timing_info = format_scenario_timing(candidate, scenario)
            win_marker = winner == candidate ? ' *' : ''

            line = " #{candidate}: #{score_bar} #{stats[:score]}/10#{win_marker}"
            line += " #{timing_info}" if timing_info

            lines << line
          end

          lines << ''
        end

        lines.join("\n")
      end

      # @return [String, Symbol, nil] the winning candidate's name for
      #   +scenario+, :tie when a tie was recorded, otherwise nil
      def find_scenario_winner(scenario)
        scenario_evals = @results.evaluations.select { |e| e[:scenario] == scenario }
        winner_eval = scenario_evals.find { |e| e[:winner] == true }
        return winner_eval[:candidate] if winner_eval

        tie_eval = scenario_evals.find { |e| e[:winner] == :tie }
        return :tie if tie_eval

        nil
      end

      # "[123ms]"-style annotation, or nil when no timing was recorded for
      # this candidate/scenario pair.
      def format_scenario_timing(candidate, scenario)
        duration = @results.timing.dig(candidate, scenario)
        return nil unless duration

        "[#{format_duration(duration)}]"
      end

      # Ten-segment bar filled proportionally to a 0..10 score.
      def score_visualization(score)
        filled = score.to_f.round
        empty = 10 - filled
        "[#{'█' * filled}#{'░' * empty}]"
      end

      # Raw responses grouped by scenario, each truncated to 500 characters.
      def responses_section
        responses = @results.responses
        return nil if responses.empty?

        lines = []
        lines << '## Responses'
        lines << ''

        # Group by scenario
        scenarios = responses.values.first&.keys || []

        scenarios.each do |scenario|
          lines << "### #{scenario}"
          lines << ''

          responses.each do |candidate, candidate_responses|
            response = candidate_responses[scenario]
            next unless response

            lines << "**#{candidate}:**"
            lines << '```'
            lines << response.to_s.strip[0..500]
            lines << '...' if response.to_s.length > 500
            lines << '```'
            lines << ''
          end
        end

        lines.join("\n")
      end

      # Closing summary: single result, tie, or winner with margin; plus a
      # fastest-vs-slowest comparison when more than one candidate has timing.
      def winner_announcement
        scores = @results.scores_by_candidate
        return '' if scores.empty?

        wins = count_wins
        sorted = scores.sort_by { |_, v| -v[:avg_score] }
        winner = sorted.first
        runner_up = sorted[1]

        lines = []
        lines << '-' * 60

        if sorted.size == 1
          lines << "Result: #{winner[0]} scored #{winner[1][:avg_score]}/10"
        elsif winner[1][:avg_score] == runner_up&.dig(1, :avg_score)
          tied = sorted.take_while { |_, v| v[:avg_score] == winner[1][:avg_score] }
          lines << "Result: TIE between #{tied.map(&:first).join(', ')}"
          lines << " All scored #{winner[1][:avg_score]}/10 average"
        else
          margin = (winner[1][:avg_score] - runner_up[1][:avg_score]).round(2)
          win_count = wins[winner[0]] || 0
          lines << "Winner: #{winner[0]}"
          avg_score = winner[1][:avg_score]
          pass_rate = winner[1][:pass_rate]
          lines << " #{avg_score}/10 avg | #{win_count} scenario wins | #{pass_rate}% pass rate"
          lines << " Beat #{runner_up[0]} by #{margin} points"
        end

        timing = @results.timing_by_candidate
        if timing.size > 1
          fastest = timing.min_by { |_, v| v[:avg_ms] }
          slowest = timing.max_by { |_, v| v[:avg_ms] }
          if fastest[0] != slowest[0]
            speedup = (slowest[1][:avg_ms].to_f / fastest[1][:avg_ms]).round(1)
            lines << ''
            lines << "Fastest: #{fastest[0]} (#{format_duration(fastest[1][:avg_ms])} avg)"
            lines << " #{speedup}x faster than #{slowest[0]}"
          end
        end

        lines.join("\n")
      end
    end
  end
end
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'time'

module Qualspec
  module Suite
    # Executes an evaluation suite: for each scenario it first collects a
    # response from every candidate, then has the judge score them
    # (comparatively when more than one candidate responded) and records
    # everything in a Results object.
    class Runner
      attr_reader :definition, :results

      # @param definition [Object, String] a suite definition object, or the
      #   name of a registered suite (resolved via Suite.find)
      def initialize(definition)
        @definition = definition.is_a?(String) ? Suite.find(definition) : definition
        @results = Results.new(@definition.name)
        @judge = Qualspec.judge
      end

      # Run every scenario in the suite. When a block is given, yields the
      # (mutable) Results after each scenario for incremental reporting.
      # @param progress [Boolean] print progress messages to stderr
      # @return [Results]
      def run(progress: true)
        total_scenarios = @definition.scenarios_list.size
        current = 0

        # Process by scenario - collect all candidate responses, then judge together
        @definition.scenarios_list.each do |scenario|
          current += 1
          log_scenario_progress(current, total_scenarios, scenario) if progress

          run_scenario_comparison(scenario, progress: progress)

          yield(@results) if block_given?
        end

        @results
      end

      private

      # Gather every candidate's response to +scenario+, judge them, and
      # record zero-score evaluations for candidates whose generation failed.
      def run_scenario_comparison(scenario, progress: false)
        responses = {}
        errors = {}

        # Phase 1: Collect all candidate responses
        @definition.candidates_list.each do |candidate|
          log_candidate_progress(candidate, scenario, 'generating') if progress

          response_data = generate_response(candidate, scenario)

          if response_data[:error]
            log_error(candidate, scenario, response_data[:error])
            errors[candidate.name] = response_data[:error]
          else
            response = response_data[:response]
            # Candidates may return either a rich Client::Response or a bare
            # string; normalize to the text content here.
            response_content = response.is_a?(Client::Response) ? response.content : response

            responses[candidate.name] = response_content

            @results.record_response(
              candidate: candidate.name,
              scenario: scenario.name,
              response: response_content,
              duration_ms: response.is_a?(Client::Response) ? response.duration_ms : response_data[:duration_ms],
              cost: response.is_a?(Client::Response) ? response.cost : nil
            )
          end
        end

        # Phase 2: Judge all responses together (if we have any)
        if responses.any?
          log_candidate_progress(nil, scenario, 'judging') if progress

          context = build_context(scenario)
          criteria = scenario.all_criteria

          # Use comparison mode for multiple candidates, single eval for one
          if responses.size == 1
            candidate, response = responses.first
            evaluation = @judge.evaluate(
              response: response,
              criterion: criteria.join("\n"),
              context: context
            )
            @results.record_evaluation(
              candidate: candidate,
              scenario: scenario.name,
              criteria: criteria,
              evaluation: evaluation,
              winner: true # Only candidate wins by default
            )
          else
            evaluations = @judge.evaluate_comparison(
              responses: responses,
              criteria: criteria,
              context: context
            )

            evaluations.each do |candidate, evaluation|
              @results.record_evaluation(
                candidate: candidate,
                scenario: scenario.name,
                criteria: criteria,
                evaluation: evaluation,
                winner: evaluation.scenario_winner
              )
            end
          end
        end

        # Record errors for failed candidates
        errors.each do |candidate, error_message|
          @results.record_evaluation(
            candidate: candidate,
            scenario: scenario.name,
            criteria: scenario.all_criteria,
            evaluation: Evaluation.new(
              criterion: scenario.all_criteria.join("\n"),
              score: 0,
              pass: false,
              error: error_message
            )
          )
        end
      end

      # Ask one candidate for a response, timing the call with a monotonic
      # clock. Returns { response:, duration_ms: } on success, { error: } on
      # any StandardError.
      def generate_response(candidate, scenario)
        start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)

        response = candidate.generate_response(
          prompt: scenario.prompt_text,
          system_prompt: scenario.system_prompt
        )

        duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round

        { response: response, duration_ms: duration_ms }
      rescue StandardError => e
        { error: e.message }
      end

      # Assemble the textual context the judge sees: system prompt, user
      # prompt, and any scenario-supplied extra context.
      def build_context(scenario)
        parts = []
        parts << "System prompt: #{scenario.system_prompt}" if scenario.system_prompt
        parts << "User prompt: #{scenario.prompt_text}"
        parts << scenario.context if scenario.context
        parts.join("\n\n")
      end

      # One-line progress indicator, overwritten in place via "\r".
      def log_scenario_progress(current, total, scenario)
        pct = ((current.to_f / total) * 100).round
        $stderr.print "\r[#{pct}%] Scenario: #{scenario.name}".ljust(60)
      end

      # Per-candidate phase indicator; +candidate+ may be nil (judging phase).
      def log_candidate_progress(candidate, _scenario, phase)
        name = candidate&.name || 'all'
        $stderr.print "\r #{name}: #{phase}...".ljust(60)
      end

      # Emit a truncated error to stderr without aborting the run.
      def log_error(candidate, scenario, error)
        warn "\n ERROR (#{candidate.name}/#{scenario.name}): #{error[0..100]}"
      end
    end

    # Results container
    #
    # Accumulates raw responses, timings, costs, and judge evaluations for a
    # suite run, and exposes aggregate views used by the reporters.
    class Results
      attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs

      def initialize(suite_name)
        @suite_name = suite_name
        @evaluations = []
        @responses = {}   # candidate => { scenario => response text }
        @timing = {}      # candidate => { scenario => duration_ms }
        @costs = {}       # candidate => accumulated cost (only positive costs)
        @started_at = Time.now
        @finished_at = nil
      end

      # Store one candidate's response for a scenario, plus optional timing
      # and cost information.
      def record_response(candidate:, scenario:, response:, duration_ms: nil, cost: nil)
        @responses[candidate] ||= {}
        @responses[candidate][scenario] = response

        if duration_ms
          @timing[candidate] ||= {}
          @timing[candidate][scenario] = duration_ms
        end

        return unless cost&.positive?

        @costs[candidate] ||= 0.0
        @costs[candidate] += cost
      end

      # Flatten one judge evaluation into the evaluations list.
      # @param winner [true, false, Symbol, nil] per-scenario win flag
      def record_evaluation(candidate:, scenario:, criteria:, evaluation:, winner: nil)
        @evaluations << {
          candidate: candidate,
          scenario: scenario,
          criteria: criteria,
          criteria_count: Array(criteria).size,
          score: evaluation.score,
          pass: evaluation.pass?,
          reasoning: evaluation.reasoning,
          error: evaluation.error,
          winner: winner
        }
      end

      # Stamp the run's end time.
      def finish!
        @finished_at = Time.now
      end

      # Aggregate pass counts, pass rate (percent) and average score per
      # candidate.
      def scores_by_candidate
        @evaluations.group_by { |e| e[:candidate] }.transform_values do |evals|
          passed = evals.count { |e| e[:pass] }
          total = evals.size
          avg_score = total.positive? ? evals.sum { |e| e[:score] }.to_f / total : 0

          {
            passed: passed,
            total: total,
            pass_rate: total.positive? ? (passed.to_f / total * 100).round(1) : 0,
            avg_score: avg_score.round(2)
          }
        end
      end

      # Aggregate total/average latency per candidate.
      # FIX: the average uses float division before rounding; the previous
      # integer division truncated, e.g. (100 + 101) / 2 reported 100ms
      # instead of 101ms.
      def timing_by_candidate
        @timing.transform_values do |scenarios|
          total_ms = scenarios.values.sum
          avg_ms = scenarios.empty? ? 0 : total_ms.to_f / scenarios.size
          {
            total_ms: total_ms,
            avg_ms: avg_ms.round,
            count: scenarios.size
          }
        end
      end

      # Scenario => candidate => { score:, pass:, reasoning: } view.
      # Uses the first recorded evaluation per candidate/scenario pair.
      def scores_by_scenario
        @evaluations.group_by { |e| e[:scenario] }.transform_values do |evals|
          evals.group_by { |e| e[:candidate] }.transform_values do |candidate_evals|
            eval_data = candidate_evals.first
            {
              score: eval_data[:score],
              pass: eval_data[:pass],
              reasoning: eval_data[:reasoning]
            }
          end
        end
      end

      # Full serializable snapshot of the run (used by the JSON reporter).
      def to_h
        {
          suite_name: @suite_name,
          started_at: @started_at.iso8601,
          finished_at: @finished_at&.iso8601,
          summary: scores_by_candidate,
          timing: timing_by_candidate,
          costs: @costs,
          by_scenario: scores_by_scenario,
          evaluations: @evaluations,
          responses: @responses
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Qualspec
  module Suite
    # One test case inside an evaluation suite, built via a small DSL:
    # a prompt (plus optional system prompt and extra context) together with
    # the criteria and/or named rubric it should be judged against.
    class Scenario
      attr_reader :name, :prompt_text, :system_prompt, :evaluations, :rubric_name, :context

      def initialize(name, &block)
        @name = name
        @prompt_text = nil
        @system_prompt = nil
        @evaluations = []
        @rubric_name = nil
        @context = nil

        # The definition block is evaluated in this instance so the DSL
        # methods below can be called bare.
        instance_eval(&block) if block # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
      end

      # -- DSL ---------------------------------------------------------------

      # Set the user prompt sent to each candidate.
      def prompt(text)
        @prompt_text = text
      end

      # Set the optional system prompt.
      def system(text)
        @system_prompt = text
      end

      # Append one evaluation criterion.
      def criterion(text)
        @evaluations.push(text)
      end

      # Alias for backwards compatibility
      alias evaluate criterion

      # Select a named rubric whose criteria are merged in by #all_criteria.
      def rubric(name)
        @rubric_name = name
      end

      # Attach extra context passed to the judge.
      def with_context(text)
        @context = text
      end

      # ----------------------------------------------------------------------

      # Every criterion to judge against: the explicit ones, followed by the
      # selected rubric's criteria (if any). Always returns a fresh array.
      def all_criteria
        return @evaluations.dup unless @rubric_name

        @evaluations + Rubric.find(@rubric_name).criteria
      end
    end
  end
end
|
data/lib/qualspec.rb
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true

require_relative 'qualspec/version'

# Top-level namespace for the qualspec gem.
module Qualspec
  # Base class for all qualspec-specific errors.
  class Error < StandardError; end
end

# NOTE(review): requires appear to be ordered so lower-level pieces
# (configuration, client, evaluation) load before the judge and the suite
# machinery that build on them — confirm before reordering.
require_relative 'qualspec/configuration'
require_relative 'qualspec/client'
require_relative 'qualspec/evaluation'
require_relative 'qualspec/rubric'
require_relative 'qualspec/judge'
require_relative 'qualspec/builtin_rubrics'
require_relative 'qualspec/suite/candidate'
require_relative 'qualspec/suite/scenario'
require_relative 'qualspec/suite/behavior'
require_relative 'qualspec/suite/dsl'
require_relative 'qualspec/suite/runner'
require_relative 'qualspec/suite/reporter'
require_relative 'qualspec/suite/html_reporter'
require_relative 'qualspec/suite/builtin_behaviors'
require_relative 'qualspec/recorder'

module Qualspec
  class << self
    # Lazily-built global Configuration instance.
    def configuration
      @configuration ||= Configuration.new
    end

    # Block-style setup: yields the global configuration.
    def configure
      yield(configuration)
    end

    # Drop all memoized global state (configuration, client, judge) and
    # clear every registered rubric, suite, and behavior.
    def reset!
      @configuration = nil
      @client = nil
      @judge = nil
      Rubric.clear!
      Suite.clear!
      Suite::Behavior.clear!
    end

    # Memoized API client built from the global configuration.
    def client
      @client ||= Client.new(configuration)
    end

    # Memoized judge instance used by suite runners.
    def judge
      @judge ||= Judge.new
    end

    # Convenience method for defining rubrics
    def define_rubric(name, &block)
      Rubric.define(name, &block)
    end

    # Convenience method for defining behaviors
    def define_behavior(name, &block)
      Suite::Behavior.define(name, &block)
    end

    # Convenience method for defining evaluation suites
    def evaluation(name, &block)
      Suite.define(name, &block)
    end

    # Run an evaluation suite
    #
    # @param suite_name [String] name of a suite registered via .evaluation
    # @param progress [Boolean] print progress to stderr while running
    # @param output [Symbol] :stdout (text report), :json (JSON to stdout),
    #   or :silent; any other value prints nothing
    # @param json_path [String, nil] also write the JSON report to this path
    # @param html_path [String, nil] also write an HTML report to this path
    # @param show_responses [Boolean] include raw responses in the text report
    # @param load_builtins [Boolean] load built-in rubrics/behaviors first
    # @return [Suite::Results] the finished results
    def run(suite_name, progress: true, output: :stdout, json_path: nil, html_path: nil, show_responses: false,
            load_builtins: true)
      # Load builtins (idempotent, can be called multiple times)
      if load_builtins
        BuiltinRubrics.load!
        Suite::BuiltinBehaviors.load!
      end

      suite = Suite.find(suite_name)
      runner = Suite::Runner.new(suite)

      results = runner.run(progress: progress)
      results.finish!

      reporter = Suite::Reporter.new(results, show_responses: show_responses)

      case output
      when :stdout
        puts reporter.to_stdout
      when :json
        puts reporter.to_json
      when :silent
        # nothing
      end

      reporter.write_json(json_path) if json_path

      if html_path
        html_reporter = Suite::HtmlReporter.new(results)
        html_reporter.write(html_path)
      end

      results
    end
  end
end
|
data/sig/qualspec.rbs
ADDED