qualspec 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +14 -0
- data/.rubocop_todo.yml +1 -1
- data/CHANGELOG.md +31 -0
- data/README.md +27 -5
- data/config/models.yml +23 -0
- data/docs/alpha_readiness.md +94 -0
- data/docs/configuration.md +53 -4
- data/docs/evaluation-suites.md +45 -2
- data/docs/getting-started.md +5 -2
- data/docs/recording.md +22 -0
- data/examples/EXAMPLES.md +73 -0
- data/examples/README.md +5 -0
- data/examples/best_value.rb +67 -0
- data/examples/cassettes/best_value.yml +649 -0
- data/examples/cassettes/character_consistency.yml +680 -0
- data/examples/cassettes/customer_service_comparison.yml +593 -0
- data/examples/cassettes/date_awareness_gate.yml +420 -0
- data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +4 -4
- data/examples/character_consistency.rb +83 -0
- data/examples/comparison.rb +0 -0
- data/examples/customer_service_comparison.rb +59 -0
- data/examples/date_awareness_gate.rb +57 -0
- data/examples/model_comparison.rb +0 -0
- data/examples/persona_test.rb +0 -0
- data/examples/prompt_variants_factory.rb +0 -0
- data/examples/quick_test.rb +0 -0
- data/examples/rspec_example_spec.rb +0 -0
- data/examples/simple_variant_comparison.rb +0 -0
- data/examples/variant_comparison.rb +0 -0
- data/exe/qualspec +4 -4
- data/lib/qualspec/client.rb +14 -7
- data/lib/qualspec/configuration.rb +18 -5
- data/lib/qualspec/judge.rb +1 -1
- data/lib/qualspec/model_registry.rb +62 -0
- data/lib/qualspec/recorder.rb +41 -3
- data/lib/qualspec/suite/candidate.rb +7 -4
- data/lib/qualspec/suite/dsl.rb +16 -1
- data/lib/qualspec/suite/html_reporter.rb +8 -8
- data/lib/qualspec/suite/runner.rb +67 -8
- data/lib/qualspec/version.rb +1 -1
- data/lib/qualspec.rb +17 -0
- data/qualspec_structure.md +9 -3
- metadata +16 -8
- data/.DS_Store +0 -0
data/examples/quick_test.rb
CHANGED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
data/exe/qualspec
CHANGED
|
@@ -66,10 +66,10 @@ parser = OptionParser.new do |opts|
|
|
|
66
66
|
puts opts
|
|
67
67
|
puts
|
|
68
68
|
puts 'Environment variables:'
|
|
69
|
-
puts ' QUALSPEC_API_URL API endpoint (default:
|
|
70
|
-
puts ' QUALSPEC_API_KEY API key for authentication'
|
|
71
|
-
puts ' QUALSPEC_MODEL Default model for candidates'
|
|
72
|
-
puts ' QUALSPEC_JUDGE_MODEL Model to use as judge'
|
|
69
|
+
puts ' QUALSPEC_API_URL API endpoint (default: https://openrouter.ai/api/v1)'
|
|
70
|
+
puts ' QUALSPEC_API_KEY API key for authentication (falls back to OPEN_ROUTER_API_KEY)'
|
|
71
|
+
puts ' QUALSPEC_MODEL Default model for candidates (default: openrouter/auto)'
|
|
72
|
+
puts ' QUALSPEC_JUDGE_MODEL Model to use as judge (default: same as QUALSPEC_MODEL)'
|
|
73
73
|
puts
|
|
74
74
|
puts 'Example:'
|
|
75
75
|
puts ' qualspec eval/model_comparison.rb'
|
data/lib/qualspec/client.rb
CHANGED
|
@@ -53,8 +53,8 @@ module Qualspec
|
|
|
53
53
|
return if @config.api_key_configured?
|
|
54
54
|
|
|
55
55
|
raise Qualspec::Error, <<~MSG.strip
|
|
56
|
-
QUALSPEC_API_KEY
|
|
57
|
-
|
|
56
|
+
No API key set. Set QUALSPEC_API_KEY (or OPEN_ROUTER_API_KEY) as an
|
|
57
|
+
environment variable, or use Qualspec.configure { |c| c.api_key = '...' }
|
|
58
58
|
MSG
|
|
59
59
|
end
|
|
60
60
|
|
|
@@ -70,6 +70,10 @@ module Qualspec
|
|
|
70
70
|
# Set temperature if provided
|
|
71
71
|
payload[:temperature] = temperature if temperature
|
|
72
72
|
|
|
73
|
+
# Ask OpenRouter to include usage accounting (cost + token details).
|
|
74
|
+
# Only when metadata is requested, so cost-less calls stay lean.
|
|
75
|
+
payload[:usage] = { include: true } if with_metadata
|
|
76
|
+
|
|
73
77
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
74
78
|
|
|
75
79
|
response = @conn.post('chat/completions', payload)
|
|
@@ -108,12 +112,15 @@ module Qualspec
|
|
|
108
112
|
end
|
|
109
113
|
|
|
110
114
|
def extract_cost(response, data)
|
|
111
|
-
# OpenRouter
|
|
112
|
-
|
|
113
|
-
|
|
115
|
+
# OpenRouter returns cost under usage.cost when usage accounting is
|
|
116
|
+
# requested (usage: { include: true }). Fall back to other shapes for
|
|
117
|
+
# other OpenAI-compatible providers.
|
|
118
|
+
usage = data['usage'] || {}
|
|
119
|
+
cost = usage['cost'] || usage['total_cost'] || data['cost']
|
|
120
|
+
return cost.to_f if cost
|
|
114
121
|
|
|
115
|
-
|
|
116
|
-
|
|
122
|
+
header_cost = response.headers['x-openrouter-cost']
|
|
123
|
+
header_cost&.to_f
|
|
117
124
|
end
|
|
118
125
|
|
|
119
126
|
def extract_tokens(data)
|
|
@@ -2,15 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
module Qualspec
|
|
4
4
|
class Configuration
|
|
5
|
-
attr_accessor :api_url, :
|
|
5
|
+
attr_accessor :api_url, :default_model, :judge_model, :cache_enabled, :cache_dir, :judge_system_prompt,
|
|
6
6
|
:request_timeout
|
|
7
|
+
attr_writer :api_key
|
|
7
8
|
|
|
8
9
|
DEFAULT_API_URL = 'https://openrouter.ai/api/v1'
|
|
9
|
-
|
|
10
|
+
# Universal fallback. `openrouter/auto` routes to a sensible model for any
|
|
11
|
+
# request, so qualspec works even with no model configured anywhere.
|
|
12
|
+
DEFAULT_MODEL = 'openrouter/auto'
|
|
10
13
|
|
|
11
14
|
def initialize
|
|
12
15
|
@api_url = ENV.fetch('QUALSPEC_API_URL', DEFAULT_API_URL)
|
|
13
|
-
|
|
16
|
+
# Default nil: set explicitly via Qualspec.configure { |c| c.api_key = ... }.
|
|
17
|
+
# When unset, #api_key falls back to env vars (see reader below).
|
|
18
|
+
@api_key = nil
|
|
14
19
|
@default_model = ENV.fetch('QUALSPEC_MODEL', DEFAULT_MODEL)
|
|
15
20
|
@judge_model = ENV.fetch('QUALSPEC_JUDGE_MODEL') { @default_model }
|
|
16
21
|
@cache_enabled = false
|
|
@@ -19,14 +24,22 @@ module Qualspec
|
|
|
19
24
|
@request_timeout = 120
|
|
20
25
|
end
|
|
21
26
|
|
|
27
|
+
# Explicitly configured key wins; otherwise fall back to env vars.
|
|
28
|
+
# Prefer QUALSPEC_API_KEY, then OPEN_ROUTER_API_KEY (default backend is
|
|
29
|
+
# OpenRouter). The env vars are a convenience fallback, not a requirement —
|
|
30
|
+
# pass api_key in Qualspec.configure to avoid relying on them.
|
|
31
|
+
def api_key
|
|
32
|
+
@api_key || ENV['QUALSPEC_API_KEY'] || ENV['OPEN_ROUTER_API_KEY']
|
|
33
|
+
end
|
|
34
|
+
|
|
22
35
|
def api_headers
|
|
23
36
|
headers = { 'Content-Type' => 'application/json' }
|
|
24
|
-
headers['Authorization'] = "Bearer #{
|
|
37
|
+
headers['Authorization'] = "Bearer #{api_key}" unless api_key.to_s.empty?
|
|
25
38
|
headers
|
|
26
39
|
end
|
|
27
40
|
|
|
28
41
|
def api_key_configured?
|
|
29
|
-
|
|
42
|
+
!api_key.to_s.empty?
|
|
30
43
|
end
|
|
31
44
|
end
|
|
32
45
|
end
|
data/lib/qualspec/judge.rb
CHANGED
|
@@ -141,7 +141,7 @@ module Qualspec
|
|
|
141
141
|
parts << '## Responses:'
|
|
142
142
|
|
|
143
143
|
responses.each do |candidate, response|
|
|
144
|
-
parts << "\n### #{candidate}:\n#{response}"
|
|
144
|
+
parts << "\n### #{candidate}:\n```\n#{response}\n```"
|
|
145
145
|
end
|
|
146
146
|
|
|
147
147
|
parts << "\nScore each candidate (#{candidate_names}) from 0-10."
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'yaml'
|
|
4
|
+
|
|
5
|
+
module Qualspec
|
|
6
|
+
# Loads a curated list of named models from a YAML config file and resolves
|
|
7
|
+
# names to their full provider slugs. Unknown/blank names fall back to the
|
|
8
|
+
# configured default (ultimately Configuration::DEFAULT_MODEL, openrouter/auto),
|
|
9
|
+
# so model lookups always return something usable.
|
|
10
|
+
#
|
|
11
|
+
# @example config/models.yml
|
|
12
|
+
# default: openrouter/auto
|
|
13
|
+
# models:
|
|
14
|
+
# glm: z-ai/glm-5.2
|
|
15
|
+
#
|
|
16
|
+
# @example
|
|
17
|
+
# Qualspec.model(:glm) # => "z-ai/glm-5.2"
|
|
18
|
+
# Qualspec.model(:nope) # => "openrouter/auto"
|
|
19
|
+
# Qualspec.model # => "openrouter/auto"
|
|
20
|
+
class ModelRegistry
|
|
21
|
+
DEFAULT_CONFIG_PATH = 'config/models.yml'
|
|
22
|
+
|
|
23
|
+
def initialize(path: nil, default: nil)
|
|
24
|
+
@models = {}
|
|
25
|
+
@default = default
|
|
26
|
+
load_file(path || ENV['QUALSPEC_MODELS_FILE'] || DEFAULT_CONFIG_PATH)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Resolve a model name to its slug, falling back to the default.
|
|
30
|
+
#
|
|
31
|
+
# @param name [Symbol, String, nil] the configured name (or nil for default)
|
|
32
|
+
# @return [String] a model slug
|
|
33
|
+
def resolve(name = nil)
|
|
34
|
+
return default if name.nil? || name.to_s.empty?
|
|
35
|
+
|
|
36
|
+
@models.fetch(name.to_s, default)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# @return [Hash{String=>String}] all configured name => slug pairs
|
|
40
|
+
def all
|
|
41
|
+
@models.dup
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# @return [String] the universal fallback model
|
|
45
|
+
def default
|
|
46
|
+
@default || Configuration::DEFAULT_MODEL
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
private
|
|
50
|
+
|
|
51
|
+
def load_file(path)
|
|
52
|
+
return unless path && File.exist?(path)
|
|
53
|
+
|
|
54
|
+
data = YAML.safe_load_file(path) || {}
|
|
55
|
+
@default ||= data['default']
|
|
56
|
+
(data['models'] || {}).each { |name, slug| @models[name.to_s] = slug }
|
|
57
|
+
rescue StandardError
|
|
58
|
+
# A malformed config file should never break a run; defaults still apply.
|
|
59
|
+
nil
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
data/lib/qualspec/recorder.rb
CHANGED
|
@@ -13,16 +13,32 @@ module Qualspec
|
|
|
13
13
|
def setup(cassette_dir: '.qualspec_cassettes')
|
|
14
14
|
require_vcr!
|
|
15
15
|
|
|
16
|
+
recorder = self
|
|
16
17
|
VCR.configure do |config|
|
|
17
18
|
config.cassette_library_dir = cassette_dir
|
|
18
19
|
config.hook_into :faraday
|
|
19
20
|
config.default_cassette_options = {
|
|
20
21
|
record: :new_episodes,
|
|
21
|
-
match_requests_on: %i[method uri
|
|
22
|
+
match_requests_on: %i[method uri body_without_model]
|
|
22
23
|
}
|
|
23
|
-
# Filter out API keys
|
|
24
|
-
|
|
24
|
+
# Filter out API keys — guard against adding duplicate filters
|
|
25
|
+
unless @api_key_filter_registered
|
|
26
|
+
config.filter_sensitive_data('<API_KEY>') { Qualspec.configuration.api_key }
|
|
27
|
+
@api_key_filter_registered = true
|
|
28
|
+
end
|
|
25
29
|
end
|
|
30
|
+
|
|
31
|
+
# Register custom matcher once — ignores the `model` field so cassettes
|
|
32
|
+
# recorded with one model work in CI where a different model is configured.
|
|
33
|
+
unless @matcher_registered
|
|
34
|
+
VCR.configure do |config|
|
|
35
|
+
config.register_request_matcher(:body_without_model) do |r1, r2|
|
|
36
|
+
recorder.send(:normalize_body_for_match, r1.body) == recorder.send(:normalize_body_for_match, r2.body)
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
@matcher_registered = true
|
|
40
|
+
end
|
|
41
|
+
|
|
26
42
|
@configured = true
|
|
27
43
|
end
|
|
28
44
|
|
|
@@ -40,6 +56,20 @@ module Qualspec
|
|
|
40
56
|
VCR.use_cassette(name, record: :none, &block)
|
|
41
57
|
end
|
|
42
58
|
|
|
59
|
+
# Replay a cassette if it already exists (no API key required), otherwise
|
|
60
|
+
# record a fresh one. Ideal for examples that ship a committed cassette so
|
|
61
|
+
# they run for free, but still record on first run.
|
|
62
|
+
def use_cassette(name, &block)
|
|
63
|
+
setup unless configured?
|
|
64
|
+
mode = cassette_exists?(name) ? :none : :new_episodes
|
|
65
|
+
VCR.use_cassette(name, record: mode, &block)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def cassette_exists?(name)
|
|
69
|
+
require_vcr!
|
|
70
|
+
File.exist?(File.join(VCR.configuration.cassette_library_dir, "#{name}.yml"))
|
|
71
|
+
end
|
|
72
|
+
|
|
43
73
|
private
|
|
44
74
|
|
|
45
75
|
def require_vcr!
|
|
@@ -50,6 +80,14 @@ module Qualspec
|
|
|
50
80
|
Add to your Gemfile: gem 'vcr'
|
|
51
81
|
MSG
|
|
52
82
|
end
|
|
83
|
+
|
|
84
|
+
def normalize_body_for_match(body)
|
|
85
|
+
parsed = JSON.parse(body)
|
|
86
|
+
parsed.delete('model')
|
|
87
|
+
JSON.generate(parsed)
|
|
88
|
+
rescue JSON::ParserError
|
|
89
|
+
body
|
|
90
|
+
end
|
|
53
91
|
end
|
|
54
92
|
end
|
|
55
93
|
end
|
|
@@ -5,14 +5,16 @@ module Qualspec
|
|
|
5
5
|
class Candidate
|
|
6
6
|
attr_reader :name, :model, :system_prompt, :options
|
|
7
7
|
|
|
8
|
-
def initialize(name, model
|
|
8
|
+
def initialize(name, model: nil, system_prompt: nil, **options)
|
|
9
9
|
@name = name.to_s
|
|
10
|
-
|
|
10
|
+
# Fall back to the configured default model (ultimately openrouter/auto)
|
|
11
|
+
# so a candidate works even when no model is specified.
|
|
12
|
+
@model = model || Qualspec.configuration.default_model
|
|
11
13
|
@system_prompt = system_prompt
|
|
12
14
|
@options = options
|
|
13
15
|
end
|
|
14
16
|
|
|
15
|
-
def generate_response(prompt:, system_prompt: nil, temperature: nil)
|
|
17
|
+
def generate_response(prompt:, system_prompt: nil, temperature: nil, with_metadata: false)
|
|
16
18
|
messages = []
|
|
17
19
|
|
|
18
20
|
sys = system_prompt || @system_prompt
|
|
@@ -23,7 +25,8 @@ module Qualspec
|
|
|
23
25
|
model: @model,
|
|
24
26
|
messages: messages,
|
|
25
27
|
json_mode: false, # We want natural responses, not JSON
|
|
26
|
-
temperature: normalize_temperature(temperature)
|
|
28
|
+
temperature: normalize_temperature(temperature),
|
|
29
|
+
with_metadata: with_metadata
|
|
27
30
|
)
|
|
28
31
|
end
|
|
29
32
|
|
data/lib/qualspec/suite/dsl.rb
CHANGED
|
@@ -11,16 +11,31 @@ module Qualspec
|
|
|
11
11
|
@scenarios_list = []
|
|
12
12
|
@variants_config = nil
|
|
13
13
|
@temperature_list = [nil] # nil means use model default
|
|
14
|
+
@track_cost = false
|
|
14
15
|
|
|
15
16
|
instance_eval(&block) if block_given? # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
|
|
16
17
|
end
|
|
17
18
|
|
|
19
|
+
# DSL: capture per-call cost + token metadata so cost/value analysis works.
|
|
20
|
+
# Off by default — evaluations that don't look at cost skip the overhead.
|
|
21
|
+
#
|
|
22
|
+
# @example
|
|
23
|
+
# track_cost
|
|
24
|
+
def track_cost(value = true) # rubocop:disable Style/OptionalBooleanParameter -- reads as a DSL toggle
|
|
25
|
+
@track_cost = value
|
|
26
|
+
end
|
|
27
|
+
alias capture_metadata track_cost
|
|
28
|
+
|
|
29
|
+
def track_cost?
|
|
30
|
+
@track_cost
|
|
31
|
+
end
|
|
32
|
+
|
|
18
33
|
# DSL: define candidates
|
|
19
34
|
def candidates(&block)
|
|
20
35
|
instance_eval(&block) # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
|
|
21
36
|
end
|
|
22
37
|
|
|
23
|
-
def candidate(name, model
|
|
38
|
+
def candidate(name, model: nil, system_prompt: nil, **options)
|
|
24
39
|
@candidates_list << Candidate.new(name, model: model, system_prompt: system_prompt, **options)
|
|
25
40
|
end
|
|
26
41
|
|
|
@@ -481,10 +481,12 @@ module Qualspec
|
|
|
481
481
|
|
|
482
482
|
scenario_blocks = scenarios.map do |scenario|
|
|
483
483
|
response_cards = responses.map do |candidate, candidate_responses|
|
|
484
|
-
|
|
485
|
-
next unless
|
|
484
|
+
variant_map = candidate_responses[scenario]
|
|
485
|
+
next unless variant_map
|
|
486
486
|
|
|
487
|
-
|
|
487
|
+
contents = variant_map.flat_map { |_v, tm| tm.values.map { |d| d[:content] } }.compact
|
|
488
|
+
response_text = contents.join("\n\n---\n\n").strip
|
|
489
|
+
next if response_text.empty?
|
|
488
490
|
|
|
489
491
|
<<~CARD
|
|
490
492
|
<div class="response-card">
|
|
@@ -660,13 +662,11 @@ module Qualspec
|
|
|
660
662
|
end
|
|
661
663
|
|
|
662
664
|
def get_candidate_model(candidate)
|
|
663
|
-
|
|
664
|
-
@results.evaluations.find { |e| e[:candidate] == candidate }&.dig(:model) || 'unknown'
|
|
665
|
+
@results.candidate_models[candidate] || 'unknown'
|
|
665
666
|
end
|
|
666
667
|
|
|
667
|
-
def get_scenario_prompt(
|
|
668
|
-
|
|
669
|
-
nil
|
|
668
|
+
def get_scenario_prompt(scenario)
|
|
669
|
+
@results.prompts[scenario]
|
|
670
670
|
end
|
|
671
671
|
end
|
|
672
672
|
end
|
|
@@ -11,6 +11,12 @@ module Qualspec
|
|
|
11
11
|
@definition = definition.is_a?(String) ? Suite.find(definition) : definition
|
|
12
12
|
@results = Results.new(@definition.name)
|
|
13
13
|
@judge = Qualspec.judge
|
|
14
|
+
|
|
15
|
+
@definition.candidates_list.each do |c|
|
|
16
|
+
@results.candidate_models[c.name] = c.model
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
@results.metadata_captured = @definition.track_cost?
|
|
14
20
|
end
|
|
15
21
|
|
|
16
22
|
def run(progress: true)
|
|
@@ -52,6 +58,8 @@ module Qualspec
|
|
|
52
58
|
responses = {}
|
|
53
59
|
errors = {}
|
|
54
60
|
|
|
61
|
+
@results.prompts[scenario.name] ||= scenario.compose_prompt(variant)
|
|
62
|
+
|
|
55
63
|
# Phase 1: Collect all candidate responses
|
|
56
64
|
@definition.candidates_list.each do |candidate|
|
|
57
65
|
log_candidate_progress(candidate, scenario, 'generating') if progress
|
|
@@ -100,7 +108,8 @@ module Qualspec
|
|
|
100
108
|
response = candidate.generate_response(
|
|
101
109
|
prompt: final_prompt,
|
|
102
110
|
system_prompt: final_system_prompt,
|
|
103
|
-
temperature: effective_temperature
|
|
111
|
+
temperature: effective_temperature,
|
|
112
|
+
with_metadata: @definition.track_cost?
|
|
104
113
|
)
|
|
105
114
|
|
|
106
115
|
duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
|
|
@@ -217,7 +226,9 @@ module Qualspec
|
|
|
217
226
|
|
|
218
227
|
# Results container with multi-dimensional support
|
|
219
228
|
class Results
|
|
220
|
-
attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs
|
|
229
|
+
attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs,
|
|
230
|
+
:candidate_models, :prompts
|
|
231
|
+
attr_accessor :metadata_captured
|
|
221
232
|
|
|
222
233
|
def initialize(suite_name)
|
|
223
234
|
@suite_name = suite_name
|
|
@@ -225,8 +236,36 @@ module Qualspec
|
|
|
225
236
|
@responses = {} # Nested: {candidate => {scenario => {variant => {temp => response}}}}
|
|
226
237
|
@timing = {}
|
|
227
238
|
@costs = {}
|
|
239
|
+
@candidate_models = {} # {candidate_name => model_string}
|
|
240
|
+
@prompts = {} # {scenario_name => prompt_string}
|
|
228
241
|
@started_at = Time.now
|
|
229
242
|
@finished_at = nil
|
|
243
|
+
@metadata_captured = false # set true when the suite enables track_cost
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
# Whether per-call cost/token metadata was captured this run.
|
|
247
|
+
def costs_tracked?
|
|
248
|
+
@metadata_captured
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
# Total cost per candidate. Raises if cost tracking wasn't enabled.
|
|
252
|
+
def cost_by_candidate
|
|
253
|
+
ensure_cost_tracking!
|
|
254
|
+
@costs.dup
|
|
255
|
+
end
|
|
256
|
+
|
|
257
|
+
# Rank candidates by quality-per-dollar (avg score / total cost), best
|
|
258
|
+
# first. Candidates with zero recorded cost sort last. Raises a helpful
|
|
259
|
+
# error if cost tracking wasn't enabled for the run.
|
|
260
|
+
def value_ranking
|
|
261
|
+
ensure_cost_tracking!
|
|
262
|
+
|
|
263
|
+
ranked = scores_by_candidate.map do |candidate, stats|
|
|
264
|
+
cost = @costs[candidate].to_f
|
|
265
|
+
score_per_dollar = cost.positive? ? (stats[:avg_score] / cost).round : nil
|
|
266
|
+
[candidate, { avg_score: stats[:avg_score], cost: cost, score_per_dollar: score_per_dollar }]
|
|
267
|
+
end
|
|
268
|
+
ranked.sort_by { |_, v| -(v[:score_per_dollar] || 0) }.to_h
|
|
230
269
|
end
|
|
231
270
|
|
|
232
271
|
def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
|
|
@@ -329,13 +368,15 @@ module Qualspec
|
|
|
329
368
|
def scores_by_scenario
|
|
330
369
|
@evaluations.group_by { |e| e[:scenario] }.transform_values do |evals|
|
|
331
370
|
evals.group_by { |e| e[:candidate] }.transform_values do |candidate_evals|
|
|
332
|
-
|
|
371
|
+
total = candidate_evals.size
|
|
372
|
+
avg_score = (candidate_evals.sum { |e| e[:score] }.to_f / total).round(2)
|
|
373
|
+
first = candidate_evals.first
|
|
333
374
|
{
|
|
334
|
-
score:
|
|
335
|
-
pass:
|
|
336
|
-
reasoning:
|
|
337
|
-
variant:
|
|
338
|
-
temperature:
|
|
375
|
+
score: avg_score,
|
|
376
|
+
pass: candidate_evals.all? { |e| e[:pass] },
|
|
377
|
+
reasoning: first[:reasoning],
|
|
378
|
+
variant: first[:variant],
|
|
379
|
+
temperature: first[:temperature]
|
|
339
380
|
}
|
|
340
381
|
end
|
|
341
382
|
end
|
|
@@ -374,6 +415,24 @@ module Qualspec
|
|
|
374
415
|
responses: @responses
|
|
375
416
|
}
|
|
376
417
|
end
|
|
418
|
+
|
|
419
|
+
private
|
|
420
|
+
|
|
421
|
+
def ensure_cost_tracking!
|
|
422
|
+
return if @metadata_captured
|
|
423
|
+
|
|
424
|
+
raise Qualspec::Error, <<~MSG.strip
|
|
425
|
+
Cost data was not captured for this run, so cost/value analysis is unavailable.
|
|
426
|
+
Enable it with `track_cost` in the suite definition:
|
|
427
|
+
|
|
428
|
+
Qualspec.evaluation 'My Suite' do
|
|
429
|
+
track_cost
|
|
430
|
+
...
|
|
431
|
+
end
|
|
432
|
+
|
|
433
|
+
(track_cost adds usage accounting to each request via with_metadata.)
|
|
434
|
+
MSG
|
|
435
|
+
end
|
|
377
436
|
end
|
|
378
437
|
end
|
|
379
438
|
end
|
data/lib/qualspec/version.rb
CHANGED
data/lib/qualspec.rb
CHANGED
|
@@ -7,6 +7,7 @@ module Qualspec
|
|
|
7
7
|
end
|
|
8
8
|
|
|
9
9
|
require_relative 'qualspec/configuration'
|
|
10
|
+
require_relative 'qualspec/model_registry'
|
|
10
11
|
require_relative 'qualspec/client'
|
|
11
12
|
require_relative 'qualspec/evaluation'
|
|
12
13
|
require_relative 'qualspec/prompt_variant'
|
|
@@ -37,6 +38,7 @@ module Qualspec
|
|
|
37
38
|
@configuration = nil
|
|
38
39
|
@client = nil
|
|
39
40
|
@judge = nil
|
|
41
|
+
@models = nil
|
|
40
42
|
Rubric.clear!
|
|
41
43
|
Suite.clear!
|
|
42
44
|
Suite::Behavior.clear!
|
|
@@ -50,6 +52,21 @@ module Qualspec
|
|
|
50
52
|
@judge ||= Judge.new
|
|
51
53
|
end
|
|
52
54
|
|
|
55
|
+
# Registry of named models loaded from config/models.yml (or
|
|
56
|
+
# QUALSPEC_MODELS_FILE). See ModelRegistry.
|
|
57
|
+
def models
|
|
58
|
+
@models ||= ModelRegistry.new
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Resolve a named model to its slug, falling back to the default
|
|
62
|
+
# (openrouter/auto). Returns the default when name is nil/unknown.
|
|
63
|
+
#
|
|
64
|
+
# Qualspec.model(:glm) # => "z-ai/glm-5.2"
|
|
65
|
+
# Qualspec.model # => "openrouter/auto"
|
|
66
|
+
def model(name = nil)
|
|
67
|
+
models.resolve(name)
|
|
68
|
+
end
|
|
69
|
+
|
|
53
70
|
# Convenience method for defining rubrics
|
|
54
71
|
def define_rubric(name, &block)
|
|
55
72
|
Rubric.define(name, &block)
|
data/qualspec_structure.md
CHANGED
|
@@ -7,10 +7,12 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
|
|
|
7
7
|
|
|
8
8
|
### Core Library Files (lib/qualspec/)
|
|
9
9
|
- **builtin_rubrics.rb** - Built-in evaluation criteria
|
|
10
|
-
- **client.rb** - API client for LLM interactions
|
|
10
|
+
- **client.rb** - API client for LLM interactions (cost/token metadata optional)
|
|
11
11
|
- **configuration.rb** - Configuration management
|
|
12
12
|
- **evaluation.rb** - Core evaluation logic
|
|
13
13
|
- **judge.rb** - LLM judge implementation
|
|
14
|
+
- **model_registry.rb** - Named models from `config/models.yml` (`Qualspec.model`)
|
|
15
|
+
- **prompt_variant.rb** - Variant value object (FactoryBot target)
|
|
14
16
|
- **recorder.rb** - VCR integration for recording
|
|
15
17
|
- **rspec.rb** - RSpec integration entry point
|
|
16
18
|
- **rubric.rb** - Custom rubric definitions
|
|
@@ -23,10 +25,11 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
|
|
|
23
25
|
### Configuration Environment Variables
|
|
24
26
|
| Variable | Description | Default |
|
|
25
27
|
|----------|-------------|---------|
|
|
26
|
-
| QUALSPEC_API_KEY | API key (
|
|
28
|
+
| QUALSPEC_API_KEY | API key (falls back to OPEN_ROUTER_API_KEY) | - |
|
|
27
29
|
| QUALSPEC_API_URL | API endpoint | https://openrouter.ai/api/v1 |
|
|
28
|
-
| QUALSPEC_MODEL | Default model for candidates |
|
|
30
|
+
| QUALSPEC_MODEL | Default model for candidates | openrouter/auto |
|
|
29
31
|
| QUALSPEC_JUDGE_MODEL | Model used as judge | Same as QUALSPEC_MODEL |
|
|
32
|
+
| QUALSPEC_MODELS_FILE | Named-models YAML | config/models.yml |
|
|
30
33
|
|
|
31
34
|
### Key Features
|
|
32
35
|
1. **Model Comparison CLI** - Compare multiple models on the same prompts
|
|
@@ -36,6 +39,9 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
|
|
|
36
39
|
5. **Custom Rubrics** - Define your own evaluation criteria
|
|
37
40
|
6. **VCR Recording** - Record and replay API calls for testing
|
|
38
41
|
7. **HTML Reports** - Generate visual comparison reports
|
|
42
|
+
8. **Named Model Registry** - Reference curated models by name (`Qualspec.model`)
|
|
43
|
+
9. **Cost Tracking** - Opt-in per-call cost + quality-per-dollar `value_ranking`
|
|
44
|
+
10. **Variant & Temperature Matrix** - Combinatorial prompt testing via FactoryBot
|
|
39
45
|
|
|
40
46
|
### Example: Model Comparison
|
|
41
47
|
```ruby
|
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: qualspec
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Stiens
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: exe
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: faraday
|
|
@@ -62,7 +61,6 @@ executables:
|
|
|
62
61
|
extensions: []
|
|
63
62
|
extra_rdoc_files: []
|
|
64
63
|
files:
|
|
65
|
-
- ".DS_Store"
|
|
66
64
|
- ".qualspec_cassettes/comparison_test.yml"
|
|
67
65
|
- ".qualspec_cassettes/quick_test.yml"
|
|
68
66
|
- ".rspec"
|
|
@@ -71,7 +69,9 @@ files:
|
|
|
71
69
|
- CHANGELOG.md
|
|
72
70
|
- README.md
|
|
73
71
|
- Rakefile
|
|
72
|
+
- config/models.yml
|
|
74
73
|
- docs/.DS_Store
|
|
74
|
+
- docs/alpha_readiness.md
|
|
75
75
|
- docs/configuration.md
|
|
76
76
|
- docs/evaluation-suites.md
|
|
77
77
|
- docs/getting-started.md
|
|
@@ -80,7 +80,13 @@ files:
|
|
|
80
80
|
- docs/rubrics.md
|
|
81
81
|
- docs/to_implement/factory_bot_integration_design.md
|
|
82
82
|
- docs/to_implement/variants_first_pass.md
|
|
83
|
+
- examples/EXAMPLES.md
|
|
83
84
|
- examples/README.md
|
|
85
|
+
- examples/best_value.rb
|
|
86
|
+
- examples/cassettes/best_value.yml
|
|
87
|
+
- examples/cassettes/character_consistency.yml
|
|
88
|
+
- examples/cassettes/customer_service_comparison.yml
|
|
89
|
+
- examples/cassettes/date_awareness_gate.yml
|
|
84
90
|
- examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml
|
|
85
91
|
- examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml
|
|
86
92
|
- examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml
|
|
@@ -88,7 +94,10 @@ files:
|
|
|
88
94
|
- examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml
|
|
89
95
|
- examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml
|
|
90
96
|
- examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml
|
|
97
|
+
- examples/character_consistency.rb
|
|
91
98
|
- examples/comparison.rb
|
|
99
|
+
- examples/customer_service_comparison.rb
|
|
100
|
+
- examples/date_awareness_gate.rb
|
|
92
101
|
- examples/model_comparison.rb
|
|
93
102
|
- examples/persona_test.rb
|
|
94
103
|
- examples/prompt_variants_factory.rb
|
|
@@ -105,6 +114,7 @@ files:
|
|
|
105
114
|
- lib/qualspec/configuration.rb
|
|
106
115
|
- lib/qualspec/evaluation.rb
|
|
107
116
|
- lib/qualspec/judge.rb
|
|
117
|
+
- lib/qualspec/model_registry.rb
|
|
108
118
|
- lib/qualspec/prompt_variant.rb
|
|
109
119
|
- lib/qualspec/recorder.rb
|
|
110
120
|
- lib/qualspec/rspec.rb
|
|
@@ -131,7 +141,6 @@ metadata:
|
|
|
131
141
|
homepage_uri: https://github.com/estiens/qualspec
|
|
132
142
|
source_code_uri: https://github.com/estiens/qualspec
|
|
133
143
|
changelog_uri: https://github.com/estiens/qualspec/blob/main/CHANGELOG.md
|
|
134
|
-
post_install_message:
|
|
135
144
|
rdoc_options: []
|
|
136
145
|
require_paths:
|
|
137
146
|
- lib
|
|
@@ -139,15 +148,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
139
148
|
requirements:
|
|
140
149
|
- - ">="
|
|
141
150
|
- !ruby/object:Gem::Version
|
|
142
|
-
version: 3.
|
|
151
|
+
version: 3.3.0
|
|
143
152
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
144
153
|
requirements:
|
|
145
154
|
- - ">="
|
|
146
155
|
- !ruby/object:Gem::Version
|
|
147
156
|
version: '0'
|
|
148
157
|
requirements: []
|
|
149
|
-
rubygems_version: 3.
|
|
150
|
-
signing_key:
|
|
158
|
+
rubygems_version: 3.6.9
|
|
151
159
|
specification_version: 4
|
|
152
160
|
summary: RSpec DSL for qualitative LLM-judged testing
|
|
153
161
|
test_files: []
|
data/.DS_Store
DELETED
|
Binary file
|