qualspec 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/.qualspec_cassettes/comparison_test.yml +439 -0
  3. data/.qualspec_cassettes/quick_test.yml +232 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +70 -0
  7. data/CHANGELOG.md +16 -0
  8. data/README.md +84 -0
  9. data/Rakefile +8 -0
  10. data/docs/configuration.md +132 -0
  11. data/docs/evaluation-suites.md +180 -0
  12. data/docs/getting-started.md +102 -0
  13. data/docs/recording.md +196 -0
  14. data/docs/rspec-integration.md +233 -0
  15. data/docs/rubrics.md +174 -0
  16. data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml +65 -0
  17. data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml +64 -0
  18. data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +74 -0
  19. data/examples/cassettes/qualspec_rspec_integration_score_matchers_supports_score_comparisons.yml +65 -0
  20. data/examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml +65 -0
  21. data/examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml +67 -0
  22. data/examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml +67 -0
  23. data/examples/comparison.rb +22 -0
  24. data/examples/model_comparison.rb +38 -0
  25. data/examples/persona_test.rb +49 -0
  26. data/examples/quick_test.rb +28 -0
  27. data/examples/report.html +399 -0
  28. data/examples/rspec_example_spec.rb +153 -0
  29. data/exe/qualspec +142 -0
  30. data/lib/qualspec/builtin_rubrics.rb +83 -0
  31. data/lib/qualspec/client.rb +127 -0
  32. data/lib/qualspec/configuration.rb +32 -0
  33. data/lib/qualspec/evaluation.rb +52 -0
  34. data/lib/qualspec/judge.rb +217 -0
  35. data/lib/qualspec/recorder.rb +55 -0
  36. data/lib/qualspec/rspec/configuration.rb +49 -0
  37. data/lib/qualspec/rspec/evaluation_result.rb +142 -0
  38. data/lib/qualspec/rspec/helpers.rb +155 -0
  39. data/lib/qualspec/rspec/matchers.rb +163 -0
  40. data/lib/qualspec/rspec.rb +66 -0
  41. data/lib/qualspec/rubric.rb +43 -0
  42. data/lib/qualspec/suite/behavior.rb +43 -0
  43. data/lib/qualspec/suite/builtin_behaviors.rb +84 -0
  44. data/lib/qualspec/suite/candidate.rb +30 -0
  45. data/lib/qualspec/suite/dsl.rb +64 -0
  46. data/lib/qualspec/suite/html_reporter.rb +673 -0
  47. data/lib/qualspec/suite/reporter.rb +274 -0
  48. data/lib/qualspec/suite/runner.rb +261 -0
  49. data/lib/qualspec/suite/scenario.rb +57 -0
  50. data/lib/qualspec/version.rb +5 -0
  51. data/lib/qualspec.rb +103 -0
  52. data/sig/qualspec.rbs +4 -0
  53. metadata +142 -0
data/lib/qualspec/suite/reporter.rb ADDED
@@ -0,0 +1,274 @@
+ # frozen_string_literal: true
+
+ require 'json'
+
+ module Qualspec
+   module Suite
+     class Reporter
+       def initialize(results, show_responses: false)
+         @results = results
+         @show_responses = show_responses
+       end
+
+       def to_stdout
+         output = []
+         output << header
+         output << ''
+         output << summary_table
+         output << ''
+         output << timing_section if timing?
+         output << ''
+         output << scenario_breakdown
+         output << ''
+         output << responses_section if @show_responses
+         output << winner_announcement
+
+         output.compact.join("\n")
+       end
+
+       def to_json(pretty: true)
+         if pretty
+           JSON.pretty_generate(@results.to_h)
+         else
+           JSON.generate(@results.to_h)
+         end
+       end
+
+       def write_json(path)
+         File.write(path, to_json)
+       end
+
+       private
+
+       def header
+         lines = []
+         lines << '=' * 60
+         lines << @results.suite_name.center(60)
+         lines << '=' * 60
+         lines.join("\n")
+       end
+
+       def summary_table
+         scores = @results.scores_by_candidate
+         return 'No results' if scores.empty?
+
+         candidates = scores.keys
+         max_name = [candidates.map(&:length).max, 10].max
+
+         # Count scenario wins
+         wins = count_wins
+
+         lines = []
+         lines << '## Summary'
+         lines << ''
+
+         header = "| #{'Candidate'.ljust(max_name)} | Score | Wins | Pass Rate |"
+         lines << header
+         lines << "|#{'-' * (max_name + 2)}|-------|-------|-----------|"
+
+         sorted = scores.sort_by { |_, v| -v[:avg_score] }
+
+         sorted.each do |candidate, stats|
+           score = stats[:avg_score].to_s.rjust(5)
+           win_count = (wins[candidate] || 0).to_s.rjust(5)
+           pass_rate = "#{stats[:pass_rate]}%".rjust(8)
+
+           lines << "| #{candidate.ljust(max_name)} | #{score} | #{win_count} | #{pass_rate} |"
+         end
+
+         lines.join("\n")
+       end
+
+       def count_wins
+         wins = Hash.new(0)
+         @results.evaluations.each do |eval|
+           wins[eval[:candidate]] += 1 if eval[:winner] == true
+         end
+         wins
+       end
+
+       def timing?
+         !@results.timing.empty?
+       end
+
+       def timing_section
+         timing = @results.timing_by_candidate
+         return nil if timing.empty?
+
+         costs = @results.costs
+
+         lines = []
+         lines << '## Performance'
+         lines << ''
+
+         sorted = timing.sort_by { |_, v| v[:avg_ms] }
+
+         sorted.each do |candidate, stats|
+           line = " #{candidate}: #{format_duration(stats[:avg_ms])} avg"
+           line += " (#{format_duration(stats[:total_ms])} total)"
+
+           line += " - $#{format_cost(costs[candidate])}" if costs[candidate]&.positive?
+
+           lines << line
+         end
+
+         lines.join("\n")
+       end
+
+       def format_duration(milliseconds)
+         if milliseconds >= 1000
+           "#{(milliseconds / 1000.0).round(2)}s"
+         else
+           "#{milliseconds}ms"
+         end
+       end
+
+       def format_cost(cost)
+         if cost < 0.01
+           format('%.4f', cost)
+         else
+           format('%.2f', cost)
+         end
+       end
+
+       def scenario_breakdown
+         by_scenario = @results.scores_by_scenario
+         return '' if by_scenario.empty?
+
+         candidates = @results.scores_by_candidate.keys
+
+         lines = []
+         lines << '## By Scenario'
+         lines << ''
+
+         by_scenario.each do |scenario, candidate_scores|
+           # Find winner for this scenario
+           winner = find_scenario_winner(scenario)
+           winner_label = if winner == :tie
+                            ' [TIE]'
+                          else
+                            winner ? " [Winner: #{winner}]" : ''
+                          end
+
+           lines << "### #{scenario}#{winner_label}"
+
+           candidates.each do |candidate|
+             stats = candidate_scores[candidate]
+             next unless stats
+
+             score_bar = score_visualization(stats[:score])
+             timing_info = format_scenario_timing(candidate, scenario)
+             win_marker = winner == candidate ? ' *' : ''
+
+             line = " #{candidate}: #{score_bar} #{stats[:score]}/10#{win_marker}"
+             line += " #{timing_info}" if timing_info
+
+             lines << line
+           end
+
+           lines << ''
+         end
+
+         lines.join("\n")
+       end
+
+       def find_scenario_winner(scenario)
+         scenario_evals = @results.evaluations.select { |e| e[:scenario] == scenario }
+         winner_eval = scenario_evals.find { |e| e[:winner] == true }
+         return winner_eval[:candidate] if winner_eval
+
+         tie_eval = scenario_evals.find { |e| e[:winner] == :tie }
+         return :tie if tie_eval
+
+         nil
+       end
+
+       def format_scenario_timing(candidate, scenario)
+         duration = @results.timing.dig(candidate, scenario)
+         return nil unless duration
+
+         "[#{format_duration(duration)}]"
+       end
+
+       def score_visualization(score)
+         filled = score.to_f.round
+         empty = 10 - filled
+         "[#{'█' * filled}#{'░' * empty}]"
+       end
+
+       def responses_section
+         responses = @results.responses
+         return nil if responses.empty?
+
+         lines = []
+         lines << '## Responses'
+         lines << ''
+
+         # Group by scenario
+         scenarios = responses.values.first&.keys || []
+
+         scenarios.each do |scenario|
+           lines << "### #{scenario}"
+           lines << ''
+
+           responses.each do |candidate, candidate_responses|
+             response = candidate_responses[scenario]
+             next unless response
+
+             lines << "**#{candidate}:**"
+             lines << '```'
+             lines << response.to_s.strip[0..500]
+             lines << '...' if response.to_s.length > 500
+             lines << '```'
+             lines << ''
+           end
+         end
+
+         lines.join("\n")
+       end
+
+       def winner_announcement
+         scores = @results.scores_by_candidate
+         return '' if scores.empty?
+
+         wins = count_wins
+         sorted = scores.sort_by { |_, v| -v[:avg_score] }
+         winner = sorted.first
+         runner_up = sorted[1]
+
+         lines = []
+         lines << '-' * 60
+
+         if sorted.size == 1
+           lines << "Result: #{winner[0]} scored #{winner[1][:avg_score]}/10"
+         elsif winner[1][:avg_score] == runner_up&.dig(1, :avg_score)
+           tied = sorted.take_while { |_, v| v[:avg_score] == winner[1][:avg_score] }
+           lines << "Result: TIE between #{tied.map(&:first).join(', ')}"
+           lines << " All scored #{winner[1][:avg_score]}/10 average"
+         else
+           margin = (winner[1][:avg_score] - runner_up[1][:avg_score]).round(2)
+           win_count = wins[winner[0]] || 0
+           lines << "Winner: #{winner[0]}"
+           avg_score = winner[1][:avg_score]
+           pass_rate = winner[1][:pass_rate]
+           lines << " #{avg_score}/10 avg | #{win_count} scenario wins | #{pass_rate}% pass rate"
+           lines << " Beat #{runner_up[0]} by #{margin} points"
+         end
+
+         timing = @results.timing_by_candidate
+         if timing.size > 1
+           fastest = timing.min_by { |_, v| v[:avg_ms] }
+           slowest = timing.max_by { |_, v| v[:avg_ms] }
+           if fastest[0] != slowest[0]
+             speedup = (slowest[1][:avg_ms].to_f / fastest[1][:avg_ms]).round(1)
+             lines << ''
+             lines << "Fastest: #{fastest[0]} (#{format_duration(fastest[1][:avg_ms])} avg)"
+             lines << " #{speedup}x faster than #{slowest[0]}"
+           end
+         end
+
+         lines.join("\n")
+       end
+     end
+   end
+ end
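
The reporter only consumes a results object. For orientation, a usage sketch (not taken from the gem's docs): `results` stands in for a `Qualspec::Suite::Results` instance and the output path is a placeholder.

```ruby
# Sketch only: `results` is assumed to be a Qualspec::Suite::Results
# produced by a suite run (see runner.rb below).
reporter = Qualspec::Suite::Reporter.new(results, show_responses: true)

puts reporter.to_stdout                  # text summary, timing, per-scenario score bars
puts reporter.to_json(pretty: false)     # compact JSON built from results.to_h
reporter.write_json('tmp/results.json')  # pretty JSON; the path is hypothetical
```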
data/lib/qualspec/suite/runner.rb ADDED
@@ -0,0 +1,261 @@
+ # frozen_string_literal: true
+
+ require 'time'
+
+ module Qualspec
+   module Suite
+     class Runner
+       attr_reader :definition, :results
+
+       def initialize(definition)
+         @definition = definition.is_a?(String) ? Suite.find(definition) : definition
+         @results = Results.new(@definition.name)
+         @judge = Qualspec.judge
+       end
+
+       def run(progress: true)
+         total_scenarios = @definition.scenarios_list.size
+         current = 0
+
+         # Process by scenario - collect all candidate responses, then judge together
+         @definition.scenarios_list.each do |scenario|
+           current += 1
+           log_scenario_progress(current, total_scenarios, scenario) if progress
+
+           run_scenario_comparison(scenario, progress: progress)
+
+           yield(@results) if block_given?
+         end
+
+         @results
+       end
+
+       private
+
+       def run_scenario_comparison(scenario, progress: false)
+         responses = {}
+         errors = {}
+
+         # Phase 1: Collect all candidate responses
+         @definition.candidates_list.each do |candidate|
+           log_candidate_progress(candidate, scenario, 'generating') if progress
+
+           response_data = generate_response(candidate, scenario)
+
+           if response_data[:error]
+             log_error(candidate, scenario, response_data[:error])
+             errors[candidate.name] = response_data[:error]
+           else
+             response = response_data[:response]
+             response_content = response.is_a?(Client::Response) ? response.content : response
+
+             responses[candidate.name] = response_content
+
+             @results.record_response(
+               candidate: candidate.name,
+               scenario: scenario.name,
+               response: response_content,
+               duration_ms: response.is_a?(Client::Response) ? response.duration_ms : response_data[:duration_ms],
+               cost: response.is_a?(Client::Response) ? response.cost : nil
+             )
+           end
+         end
+
+         # Phase 2: Judge all responses together (if we have any)
+         if responses.any?
+           log_candidate_progress(nil, scenario, 'judging') if progress
+
+           context = build_context(scenario)
+           criteria = scenario.all_criteria
+
+           # Use comparison mode for multiple candidates, single eval for one
+           if responses.size == 1
+             candidate, response = responses.first
+             evaluation = @judge.evaluate(
+               response: response,
+               criterion: criteria.join("\n"),
+               context: context
+             )
+             @results.record_evaluation(
+               candidate: candidate,
+               scenario: scenario.name,
+               criteria: criteria,
+               evaluation: evaluation,
+               winner: true # Only candidate wins by default
+             )
+           else
+             evaluations = @judge.evaluate_comparison(
+               responses: responses,
+               criteria: criteria,
+               context: context
+             )
+
+             evaluations.each do |candidate, evaluation|
+               @results.record_evaluation(
+                 candidate: candidate,
+                 scenario: scenario.name,
+                 criteria: criteria,
+                 evaluation: evaluation,
+                 winner: evaluation.scenario_winner
+               )
+             end
+           end
+         end
+
+         # Record errors for failed candidates
+         errors.each do |candidate, error_message|
+           @results.record_evaluation(
+             candidate: candidate,
+             scenario: scenario.name,
+             criteria: scenario.all_criteria,
+             evaluation: Evaluation.new(
+               criterion: scenario.all_criteria.join("\n"),
+               score: 0,
+               pass: false,
+               error: error_message
+             )
+           )
+         end
+       end
+
+       def generate_response(candidate, scenario)
+         start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+
+         response = candidate.generate_response(
+           prompt: scenario.prompt_text,
+           system_prompt: scenario.system_prompt
+         )
+
+         duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
+
+         { response: response, duration_ms: duration_ms }
+       rescue StandardError => e
+         { error: e.message }
+       end
+
+       def build_context(scenario)
+         parts = []
+         parts << "System prompt: #{scenario.system_prompt}" if scenario.system_prompt
+         parts << "User prompt: #{scenario.prompt_text}"
+         parts << scenario.context if scenario.context
+         parts.join("\n\n")
+       end
+
+       def log_scenario_progress(current, total, scenario)
+         pct = ((current.to_f / total) * 100).round
+         $stderr.print "\r[#{pct}%] Scenario: #{scenario.name}".ljust(60)
+       end
+
+       def log_candidate_progress(candidate, _scenario, phase)
+         name = candidate&.name || 'all'
+         $stderr.print "\r #{name}: #{phase}...".ljust(60)
+       end
+
+       def log_error(candidate, scenario, error)
+         warn "\n ERROR (#{candidate.name}/#{scenario.name}): #{error[0..100]}"
+       end
+     end
+
+     # Results container
+     class Results
+       attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs
+
+       def initialize(suite_name)
+         @suite_name = suite_name
+         @evaluations = []
+         @responses = {}
+         @timing = {}
+         @costs = {}
+         @started_at = Time.now
+         @finished_at = nil
+       end
+
+       def record_response(candidate:, scenario:, response:, duration_ms: nil, cost: nil)
+         @responses[candidate] ||= {}
+         @responses[candidate][scenario] = response
+
+         if duration_ms
+           @timing[candidate] ||= {}
+           @timing[candidate][scenario] = duration_ms
+         end
+
+         return unless cost&.positive?
+
+         @costs[candidate] ||= 0.0
+         @costs[candidate] += cost
+       end
+
+       def record_evaluation(candidate:, scenario:, criteria:, evaluation:, winner: nil)
+         @evaluations << {
+           candidate: candidate,
+           scenario: scenario,
+           criteria: criteria,
+           criteria_count: Array(criteria).size,
+           score: evaluation.score,
+           pass: evaluation.pass?,
+           reasoning: evaluation.reasoning,
+           error: evaluation.error,
+           winner: winner
+         }
+       end
+
+       def finish!
+         @finished_at = Time.now
+       end
+
+       def scores_by_candidate
+         @evaluations.group_by { |e| e[:candidate] }.transform_values do |evals|
+           passed = evals.count { |e| e[:pass] }
+           total = evals.size
+           avg_score = total.positive? ? evals.sum { |e| e[:score] }.to_f / total : 0
+
+           {
+             passed: passed,
+             total: total,
+             pass_rate: total.positive? ? (passed.to_f / total * 100).round(1) : 0,
+             avg_score: avg_score.round(2)
+           }
+         end
+       end
+
+       def timing_by_candidate
+         @timing.transform_values do |scenarios|
+           total_ms = scenarios.values.sum
+           avg_ms = !scenarios.empty? ? total_ms / scenarios.size : 0
+           {
+             total_ms: total_ms,
+             avg_ms: avg_ms.round,
+             count: scenarios.size
+           }
+         end
+       end
+
+       def scores_by_scenario
+         @evaluations.group_by { |e| e[:scenario] }.transform_values do |evals|
+           evals.group_by { |e| e[:candidate] }.transform_values do |candidate_evals|
+             eval_data = candidate_evals.first
+             {
+               score: eval_data[:score],
+               pass: eval_data[:pass],
+               reasoning: eval_data[:reasoning]
+             }
+           end
+         end
+       end
+
+       def to_h
+         {
+           suite_name: @suite_name,
+           started_at: @started_at.iso8601,
+           finished_at: @finished_at&.iso8601,
+           summary: scores_by_candidate,
+           timing: timing_by_candidate,
+           costs: @costs,
+           by_scenario: scores_by_scenario,
+           evaluations: @evaluations,
+           responses: @responses
+         }
+       end
+     end
+   end
+ end
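
For context, a hedged sketch of driving the runner and its results container directly; the suite name 'support_bot' is a placeholder for a suite defined via the DSL (dsl.rb, not shown in this diff).

```ruby
# Sketch under the assumption that a suite named 'support_bot' is registered.
runner = Qualspec::Suite::Runner.new('support_bot') # a String is resolved via Suite.find
results = runner.run(progress: false)
results.finish!

results.scores_by_candidate  # => { "candidate" => { passed:, total:, pass_rate:, avg_score: } }
results.timing_by_candidate  # => { "candidate" => { total_ms:, avg_ms:, count: } }
results.to_h                 # full serializable report, as used by Reporter#to_json
```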
data/lib/qualspec/suite/scenario.rb ADDED
@@ -0,0 +1,57 @@
+ # frozen_string_literal: true
+
+ module Qualspec
+   module Suite
+     class Scenario
+       attr_reader :name, :prompt_text, :system_prompt, :evaluations, :rubric_name, :context
+
+       def initialize(name, &block)
+         @name = name
+         @prompt_text = nil
+         @system_prompt = nil
+         @evaluations = []
+         @rubric_name = nil
+         @context = nil
+
+         instance_eval(&block) if block_given? # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
+       end
+
+       # DSL methods
+       def prompt(text)
+         @prompt_text = text
+       end
+
+       def system(text)
+         @system_prompt = text
+       end
+
+       # DSL method to add evaluation criteria
+       def criterion(text)
+         @evaluations << text
+       end
+
+       # Alias for backwards compatibility
+       alias evaluate criterion
+
+       def rubric(name)
+         @rubric_name = name
+       end
+
+       def with_context(text)
+         @context = text
+       end
+
+       # Get all criteria to evaluate (from explicit evals + rubric)
+       def all_criteria
+         criteria = @evaluations.dup
+
+         if @rubric_name
+           rubric_obj = Rubric.find(@rubric_name)
+           criteria.concat(rubric_obj.criteria)
+         end
+
+         criteria
+       end
+     end
+   end
+ end
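
Since the block is instance_eval'd, the DSL methods above are available inside it. A hypothetical scenario definition follows; the rubric name assumes a rubric registered under that key (for example via BuiltinRubrics).

```ruby
# Hypothetical example of the Scenario DSL shown above.
scenario = Qualspec::Suite::Scenario.new('refund request') do
  system 'You are a polite support agent.'
  prompt 'A customer asks for a refund two days after purchase.'
  criterion 'Acknowledges the request and states the next step'
  rubric :helpfulness # assumed to be a registered rubric name
  with_context 'Refunds are allowed within 14 days of purchase.'
end

scenario.all_criteria # explicit criteria plus the rubric's criteria
```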
data/lib/qualspec/version.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Qualspec
+   VERSION = '0.0.1'
+ end
data/lib/qualspec.rb ADDED
@@ -0,0 +1,103 @@
+ # frozen_string_literal: true
+
+ require_relative 'qualspec/version'
+
+ module Qualspec
+   class Error < StandardError; end
+ end
+
+ require_relative 'qualspec/configuration'
+ require_relative 'qualspec/client'
+ require_relative 'qualspec/evaluation'
+ require_relative 'qualspec/rubric'
+ require_relative 'qualspec/judge'
+ require_relative 'qualspec/builtin_rubrics'
+ require_relative 'qualspec/suite/candidate'
+ require_relative 'qualspec/suite/scenario'
+ require_relative 'qualspec/suite/behavior'
+ require_relative 'qualspec/suite/dsl'
+ require_relative 'qualspec/suite/runner'
+ require_relative 'qualspec/suite/reporter'
+ require_relative 'qualspec/suite/html_reporter'
+ require_relative 'qualspec/suite/builtin_behaviors'
+ require_relative 'qualspec/recorder'
+
+ module Qualspec
+   class << self
+     def configuration
+       @configuration ||= Configuration.new
+     end
+
+     def configure
+       yield(configuration)
+     end
+
+     def reset!
+       @configuration = nil
+       @client = nil
+       @judge = nil
+       Rubric.clear!
+       Suite.clear!
+       Suite::Behavior.clear!
+     end
+
+     def client
+       @client ||= Client.new(configuration)
+     end
+
+     def judge
+       @judge ||= Judge.new
+     end
+
+     # Convenience method for defining rubrics
+     def define_rubric(name, &block)
+       Rubric.define(name, &block)
+     end
+
+     # Convenience method for defining behaviors
+     def define_behavior(name, &block)
+       Suite::Behavior.define(name, &block)
+     end
+
+     # Convenience method for defining evaluation suites
+     def evaluation(name, &block)
+       Suite.define(name, &block)
+     end
+
+     # Run an evaluation suite
+     def run(suite_name, progress: true, output: :stdout, json_path: nil, html_path: nil, show_responses: false,
+             load_builtins: true)
+       # Load builtins (idempotent, can be called multiple times)
+       if load_builtins
+         BuiltinRubrics.load!
+         Suite::BuiltinBehaviors.load!
+       end
+
+       suite = Suite.find(suite_name)
+       runner = Suite::Runner.new(suite)
+
+       results = runner.run(progress: progress)
+       results.finish!
+
+       reporter = Suite::Reporter.new(results, show_responses: show_responses)
+
+       case output
+       when :stdout
+         puts reporter.to_stdout
+       when :json
+         puts reporter.to_json
+       when :silent
+         # nothing
+       end
+
+       reporter.write_json(json_path) if json_path
+
+       if html_path
+         html_reporter = Suite::HtmlReporter.new(results)
+         html_reporter.write(html_path)
+       end
+
+       results
+     end
+   end
+ end
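
Tying the entry points together, a minimal end-to-end sketch based on the public methods above; the suite name and file path are placeholders, and configuration options live in configuration.rb (not shown in this section).

```ruby
require 'qualspec'

Qualspec.configure do |config|
  # judge/client settings go here (see Configuration; options not shown in this diff)
end

# Assumes a suite named 'support_bot' was defined with Qualspec.evaluation.
results = Qualspec.run(
  'support_bot',
  output: :stdout,
  json_path: 'tmp/results.json', # also writes a JSON report
  show_responses: true
)
```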
data/sig/qualspec.rbs ADDED
@@ -0,0 +1,4 @@
+ module Qualspec
+   VERSION: String
+   # See the writing guide of rbs: https://github.com/ruby/rbs#guides
+ end