qualspec 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +14 -0
- data/.rubocop_todo.yml +1 -1
- data/CHANGELOG.md +31 -0
- data/README.md +27 -5
- data/config/models.yml +23 -0
- data/docs/alpha_readiness.md +94 -0
- data/docs/configuration.md +53 -4
- data/docs/evaluation-suites.md +45 -2
- data/docs/getting-started.md +5 -2
- data/docs/recording.md +22 -0
- data/examples/EXAMPLES.md +73 -0
- data/examples/README.md +5 -0
- data/examples/best_value.rb +67 -0
- data/examples/cassettes/best_value.yml +649 -0
- data/examples/cassettes/character_consistency.yml +680 -0
- data/examples/cassettes/customer_service_comparison.yml +593 -0
- data/examples/cassettes/date_awareness_gate.yml +420 -0
- data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +4 -4
- data/examples/character_consistency.rb +83 -0
- data/examples/comparison.rb +0 -0
- data/examples/customer_service_comparison.rb +59 -0
- data/examples/date_awareness_gate.rb +57 -0
- data/examples/model_comparison.rb +0 -0
- data/examples/persona_test.rb +0 -0
- data/examples/prompt_variants_factory.rb +0 -0
- data/examples/quick_test.rb +0 -0
- data/examples/rspec_example_spec.rb +0 -0
- data/examples/simple_variant_comparison.rb +0 -0
- data/examples/variant_comparison.rb +0 -0
- data/exe/qualspec +4 -4
- data/lib/qualspec/client.rb +14 -7
- data/lib/qualspec/configuration.rb +18 -5
- data/lib/qualspec/model_registry.rb +62 -0
- data/lib/qualspec/recorder.rb +41 -3
- data/lib/qualspec/suite/candidate.rb +7 -4
- data/lib/qualspec/suite/dsl.rb +16 -1
- data/lib/qualspec/suite/runner.rb +49 -1
- data/lib/qualspec/version.rb +1 -1
- data/lib/qualspec.rb +17 -0
- data/qualspec_structure.md +9 -3
- metadata +16 -7
data/examples/quick_test.rb
CHANGED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
data/exe/qualspec
CHANGED
|
@@ -66,10 +66,10 @@ parser = OptionParser.new do |opts|
|
|
|
66
66
|
puts opts
|
|
67
67
|
puts
|
|
68
68
|
puts 'Environment variables:'
|
|
69
|
-
puts ' QUALSPEC_API_URL API endpoint (default:
|
|
70
|
-
puts ' QUALSPEC_API_KEY API key for authentication'
|
|
71
|
-
puts ' QUALSPEC_MODEL Default model for candidates'
|
|
72
|
-
puts ' QUALSPEC_JUDGE_MODEL Model to use as judge'
|
|
69
|
+
puts ' QUALSPEC_API_URL API endpoint (default: https://openrouter.ai/api/v1)'
|
|
70
|
+
puts ' QUALSPEC_API_KEY API key for authentication (falls back to OPEN_ROUTER_API_KEY)'
|
|
71
|
+
puts ' QUALSPEC_MODEL Default model for candidates (default: openrouter/auto)'
|
|
72
|
+
puts ' QUALSPEC_JUDGE_MODEL Model to use as judge (default: same as QUALSPEC_MODEL)'
|
|
73
73
|
puts
|
|
74
74
|
puts 'Example:'
|
|
75
75
|
puts ' qualspec eval/model_comparison.rb'
|
data/lib/qualspec/client.rb
CHANGED
|
@@ -53,8 +53,8 @@ module Qualspec
|
|
|
53
53
|
return if @config.api_key_configured?
|
|
54
54
|
|
|
55
55
|
raise Qualspec::Error, <<~MSG.strip
|
|
56
|
-
QUALSPEC_API_KEY
|
|
57
|
-
|
|
56
|
+
No API key set. Set QUALSPEC_API_KEY (or OPEN_ROUTER_API_KEY) as an
|
|
57
|
+
environment variable, or use Qualspec.configure { |c| c.api_key = '...' }
|
|
58
58
|
MSG
|
|
59
59
|
end
|
|
60
60
|
|
|
@@ -70,6 +70,10 @@ module Qualspec
|
|
|
70
70
|
# Set temperature if provided
|
|
71
71
|
payload[:temperature] = temperature if temperature
|
|
72
72
|
|
|
73
|
+
# Ask OpenRouter to include usage accounting (cost + token details).
|
|
74
|
+
# Only when metadata is requested, so cost-less calls stay lean.
|
|
75
|
+
payload[:usage] = { include: true } if with_metadata
|
|
76
|
+
|
|
73
77
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
74
78
|
|
|
75
79
|
response = @conn.post('chat/completions', payload)
|
|
@@ -108,12 +112,15 @@ module Qualspec
|
|
|
108
112
|
end
|
|
109
113
|
|
|
110
114
|
def extract_cost(response, data)
|
|
111
|
-
# OpenRouter
|
|
112
|
-
|
|
113
|
-
|
|
115
|
+
# OpenRouter returns cost under usage.cost when usage accounting is
|
|
116
|
+
# requested (usage: { include: true }). Fall back to other shapes for
|
|
117
|
+
# other OpenAI-compatible providers.
|
|
118
|
+
usage = data['usage'] || {}
|
|
119
|
+
cost = usage['cost'] || usage['total_cost'] || data['cost']
|
|
120
|
+
return cost.to_f if cost
|
|
114
121
|
|
|
115
|
-
|
|
116
|
-
|
|
122
|
+
header_cost = response.headers['x-openrouter-cost']
|
|
123
|
+
header_cost&.to_f
|
|
117
124
|
end
|
|
118
125
|
|
|
119
126
|
def extract_tokens(data)
|
|
@@ -2,15 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
module Qualspec
|
|
4
4
|
class Configuration
|
|
5
|
-
attr_accessor :api_url, :
|
|
5
|
+
attr_accessor :api_url, :default_model, :judge_model, :cache_enabled, :cache_dir, :judge_system_prompt,
|
|
6
6
|
:request_timeout
|
|
7
|
+
attr_writer :api_key
|
|
7
8
|
|
|
8
9
|
DEFAULT_API_URL = 'https://openrouter.ai/api/v1'
|
|
9
|
-
|
|
10
|
+
# Universal fallback. `openrouter/auto` routes to a sensible model for any
|
|
11
|
+
# request, so qualspec works even with no model configured anywhere.
|
|
12
|
+
DEFAULT_MODEL = 'openrouter/auto'
|
|
10
13
|
|
|
11
14
|
def initialize
|
|
12
15
|
@api_url = ENV.fetch('QUALSPEC_API_URL', DEFAULT_API_URL)
|
|
13
|
-
|
|
16
|
+
# Default nil: set explicitly via Qualspec.configure { |c| c.api_key = ... }.
|
|
17
|
+
# When unset, #api_key falls back to env vars (see reader below).
|
|
18
|
+
@api_key = nil
|
|
14
19
|
@default_model = ENV.fetch('QUALSPEC_MODEL', DEFAULT_MODEL)
|
|
15
20
|
@judge_model = ENV.fetch('QUALSPEC_JUDGE_MODEL') { @default_model }
|
|
16
21
|
@cache_enabled = false
|
|
@@ -19,14 +24,22 @@ module Qualspec
|
|
|
19
24
|
@request_timeout = 120
|
|
20
25
|
end
|
|
21
26
|
|
|
27
|
+
# Explicitly configured key wins; otherwise fall back to env vars.
|
|
28
|
+
# Prefer QUALSPEC_API_KEY, then OPEN_ROUTER_API_KEY (default backend is
|
|
29
|
+
# OpenRouter). The env vars are a convenience fallback, not a requirement —
|
|
30
|
+
# pass api_key in Qualspec.configure to avoid relying on them.
|
|
31
|
+
def api_key
|
|
32
|
+
@api_key || ENV['QUALSPEC_API_KEY'] || ENV['OPEN_ROUTER_API_KEY']
|
|
33
|
+
end
|
|
34
|
+
|
|
22
35
|
def api_headers
|
|
23
36
|
headers = { 'Content-Type' => 'application/json' }
|
|
24
|
-
headers['Authorization'] = "Bearer #{
|
|
37
|
+
headers['Authorization'] = "Bearer #{api_key}" unless api_key.to_s.empty?
|
|
25
38
|
headers
|
|
26
39
|
end
|
|
27
40
|
|
|
28
41
|
def api_key_configured?
|
|
29
|
-
|
|
42
|
+
!api_key.to_s.empty?
|
|
30
43
|
end
|
|
31
44
|
end
|
|
32
45
|
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'yaml'
|
|
4
|
+
|
|
5
|
+
module Qualspec
|
|
6
|
+
# Loads a curated list of named models from a YAML config file and resolves
|
|
7
|
+
# names to their full provider slugs. Unknown/blank names fall back to the
|
|
8
|
+
# configured default (ultimately Configuration::DEFAULT_MODEL, openrouter/auto),
|
|
9
|
+
# so model lookups always return something usable.
|
|
10
|
+
#
|
|
11
|
+
# @example config/models.yml
|
|
12
|
+
# default: openrouter/auto
|
|
13
|
+
# models:
|
|
14
|
+
# glm: z-ai/glm-5.2
|
|
15
|
+
#
|
|
16
|
+
# @example
|
|
17
|
+
# Qualspec.model(:glm) # => "z-ai/glm-5.2"
|
|
18
|
+
# Qualspec.model(:nope) # => "openrouter/auto"
|
|
19
|
+
# Qualspec.model # => "openrouter/auto"
|
|
20
|
+
class ModelRegistry
|
|
21
|
+
DEFAULT_CONFIG_PATH = 'config/models.yml'
|
|
22
|
+
|
|
23
|
+
def initialize(path: nil, default: nil)
|
|
24
|
+
@models = {}
|
|
25
|
+
@default = default
|
|
26
|
+
load_file(path || ENV['QUALSPEC_MODELS_FILE'] || DEFAULT_CONFIG_PATH)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Resolve a model name to its slug, falling back to the default.
|
|
30
|
+
#
|
|
31
|
+
# @param name [Symbol, String, nil] the configured name (or nil for default)
|
|
32
|
+
# @return [String] a model slug
|
|
33
|
+
def resolve(name = nil)
|
|
34
|
+
return default if name.nil? || name.to_s.empty?
|
|
35
|
+
|
|
36
|
+
@models.fetch(name.to_s, default)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# @return [Hash{String=>String}] all configured name => slug pairs
|
|
40
|
+
def all
|
|
41
|
+
@models.dup
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# @return [String] the universal fallback model
|
|
45
|
+
def default
|
|
46
|
+
@default || Configuration::DEFAULT_MODEL
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
private
|
|
50
|
+
|
|
51
|
+
def load_file(path)
|
|
52
|
+
return unless path && File.exist?(path)
|
|
53
|
+
|
|
54
|
+
data = YAML.safe_load_file(path) || {}
|
|
55
|
+
@default ||= data['default']
|
|
56
|
+
(data['models'] || {}).each { |name, slug| @models[name.to_s] = slug }
|
|
57
|
+
rescue StandardError
|
|
58
|
+
# A malformed config file should never break a run; defaults still apply.
|
|
59
|
+
nil
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
data/lib/qualspec/recorder.rb
CHANGED
|
@@ -13,16 +13,32 @@ module Qualspec
|
|
|
13
13
|
def setup(cassette_dir: '.qualspec_cassettes')
|
|
14
14
|
require_vcr!
|
|
15
15
|
|
|
16
|
+
recorder = self
|
|
16
17
|
VCR.configure do |config|
|
|
17
18
|
config.cassette_library_dir = cassette_dir
|
|
18
19
|
config.hook_into :faraday
|
|
19
20
|
config.default_cassette_options = {
|
|
20
21
|
record: :new_episodes,
|
|
21
|
-
match_requests_on: %i[method uri
|
|
22
|
+
match_requests_on: %i[method uri body_without_model]
|
|
22
23
|
}
|
|
23
|
-
# Filter out API keys
|
|
24
|
-
|
|
24
|
+
# Filter out API keys — guard against adding duplicate filters
|
|
25
|
+
unless @api_key_filter_registered
|
|
26
|
+
config.filter_sensitive_data('<API_KEY>') { Qualspec.configuration.api_key }
|
|
27
|
+
@api_key_filter_registered = true
|
|
28
|
+
end
|
|
25
29
|
end
|
|
30
|
+
|
|
31
|
+
# Register custom matcher once — ignores the `model` field so cassettes
|
|
32
|
+
# recorded with one model work in CI where a different model is configured.
|
|
33
|
+
unless @matcher_registered
|
|
34
|
+
VCR.configure do |config|
|
|
35
|
+
config.register_request_matcher(:body_without_model) do |r1, r2|
|
|
36
|
+
recorder.send(:normalize_body_for_match, r1.body) == recorder.send(:normalize_body_for_match, r2.body)
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
@matcher_registered = true
|
|
40
|
+
end
|
|
41
|
+
|
|
26
42
|
@configured = true
|
|
27
43
|
end
|
|
28
44
|
|
|
@@ -40,6 +56,20 @@ module Qualspec
|
|
|
40
56
|
VCR.use_cassette(name, record: :none, &block)
|
|
41
57
|
end
|
|
42
58
|
|
|
59
|
+
# Replay a cassette if it already exists (no API key required), otherwise
|
|
60
|
+
# record a fresh one. Ideal for examples that ship a committed cassette so
|
|
61
|
+
# they run for free, but still record on first run.
|
|
62
|
+
def use_cassette(name, &block)
|
|
63
|
+
setup unless configured?
|
|
64
|
+
mode = cassette_exists?(name) ? :none : :new_episodes
|
|
65
|
+
VCR.use_cassette(name, record: mode, &block)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def cassette_exists?(name)
|
|
69
|
+
require_vcr!
|
|
70
|
+
File.exist?(File.join(VCR.configuration.cassette_library_dir, "#{name}.yml"))
|
|
71
|
+
end
|
|
72
|
+
|
|
43
73
|
private
|
|
44
74
|
|
|
45
75
|
def require_vcr!
|
|
@@ -50,6 +80,14 @@ module Qualspec
|
|
|
50
80
|
Add to your Gemfile: gem 'vcr'
|
|
51
81
|
MSG
|
|
52
82
|
end
|
|
83
|
+
|
|
84
|
+
def normalize_body_for_match(body)
|
|
85
|
+
parsed = JSON.parse(body)
|
|
86
|
+
parsed.delete('model')
|
|
87
|
+
JSON.generate(parsed)
|
|
88
|
+
rescue JSON::ParserError
|
|
89
|
+
body
|
|
90
|
+
end
|
|
53
91
|
end
|
|
54
92
|
end
|
|
55
93
|
end
|
|
@@ -5,14 +5,16 @@ module Qualspec
|
|
|
5
5
|
class Candidate
|
|
6
6
|
attr_reader :name, :model, :system_prompt, :options
|
|
7
7
|
|
|
8
|
-
def initialize(name, model
|
|
8
|
+
def initialize(name, model: nil, system_prompt: nil, **options)
|
|
9
9
|
@name = name.to_s
|
|
10
|
-
|
|
10
|
+
# Fall back to the configured default model (ultimately openrouter/auto)
|
|
11
|
+
# so a candidate works even when no model is specified.
|
|
12
|
+
@model = model || Qualspec.configuration.default_model
|
|
11
13
|
@system_prompt = system_prompt
|
|
12
14
|
@options = options
|
|
13
15
|
end
|
|
14
16
|
|
|
15
|
-
def generate_response(prompt:, system_prompt: nil, temperature: nil)
|
|
17
|
+
def generate_response(prompt:, system_prompt: nil, temperature: nil, with_metadata: false)
|
|
16
18
|
messages = []
|
|
17
19
|
|
|
18
20
|
sys = system_prompt || @system_prompt
|
|
@@ -23,7 +25,8 @@ module Qualspec
|
|
|
23
25
|
model: @model,
|
|
24
26
|
messages: messages,
|
|
25
27
|
json_mode: false, # We want natural responses, not JSON
|
|
26
|
-
temperature: normalize_temperature(temperature)
|
|
28
|
+
temperature: normalize_temperature(temperature),
|
|
29
|
+
with_metadata: with_metadata
|
|
27
30
|
)
|
|
28
31
|
end
|
|
29
32
|
|
data/lib/qualspec/suite/dsl.rb
CHANGED
|
@@ -11,16 +11,31 @@ module Qualspec
|
|
|
11
11
|
@scenarios_list = []
|
|
12
12
|
@variants_config = nil
|
|
13
13
|
@temperature_list = [nil] # nil means use model default
|
|
14
|
+
@track_cost = false
|
|
14
15
|
|
|
15
16
|
instance_eval(&block) if block_given? # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
|
|
16
17
|
end
|
|
17
18
|
|
|
19
|
+
# DSL: capture per-call cost + token metadata so cost/value analysis works.
|
|
20
|
+
# Off by default — evaluations that don't look at cost skip the overhead.
|
|
21
|
+
#
|
|
22
|
+
# @example
|
|
23
|
+
# track_cost
|
|
24
|
+
def track_cost(value = true) # rubocop:disable Style/OptionalBooleanParameter -- reads as a DSL toggle
|
|
25
|
+
@track_cost = value
|
|
26
|
+
end
|
|
27
|
+
alias capture_metadata track_cost
|
|
28
|
+
|
|
29
|
+
def track_cost?
|
|
30
|
+
@track_cost
|
|
31
|
+
end
|
|
32
|
+
|
|
18
33
|
# DSL: define candidates
|
|
19
34
|
def candidates(&block)
|
|
20
35
|
instance_eval(&block) # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
|
|
21
36
|
end
|
|
22
37
|
|
|
23
|
-
def candidate(name, model
|
|
38
|
+
def candidate(name, model: nil, system_prompt: nil, **options)
|
|
24
39
|
@candidates_list << Candidate.new(name, model: model, system_prompt: system_prompt, **options)
|
|
25
40
|
end
|
|
26
41
|
|
|
@@ -15,6 +15,8 @@ module Qualspec
|
|
|
15
15
|
@definition.candidates_list.each do |c|
|
|
16
16
|
@results.candidate_models[c.name] = c.model
|
|
17
17
|
end
|
|
18
|
+
|
|
19
|
+
@results.metadata_captured = @definition.track_cost?
|
|
18
20
|
end
|
|
19
21
|
|
|
20
22
|
def run(progress: true)
|
|
@@ -106,7 +108,8 @@ module Qualspec
|
|
|
106
108
|
response = candidate.generate_response(
|
|
107
109
|
prompt: final_prompt,
|
|
108
110
|
system_prompt: final_system_prompt,
|
|
109
|
-
temperature: effective_temperature
|
|
111
|
+
temperature: effective_temperature,
|
|
112
|
+
with_metadata: @definition.track_cost?
|
|
110
113
|
)
|
|
111
114
|
|
|
112
115
|
duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
|
|
@@ -225,6 +228,7 @@ module Qualspec
|
|
|
225
228
|
class Results
|
|
226
229
|
attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs,
|
|
227
230
|
:candidate_models, :prompts
|
|
231
|
+
attr_accessor :metadata_captured
|
|
228
232
|
|
|
229
233
|
def initialize(suite_name)
|
|
230
234
|
@suite_name = suite_name
|
|
@@ -236,6 +240,32 @@ module Qualspec
|
|
|
236
240
|
@prompts = {} # {scenario_name => prompt_string}
|
|
237
241
|
@started_at = Time.now
|
|
238
242
|
@finished_at = nil
|
|
243
|
+
@metadata_captured = false # set true when the suite enables track_cost
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
# Whether per-call cost/token metadata was captured this run.
|
|
247
|
+
def costs_tracked?
|
|
248
|
+
@metadata_captured
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
# Total cost per candidate. Raises if cost tracking wasn't enabled.
|
|
252
|
+
def cost_by_candidate
|
|
253
|
+
ensure_cost_tracking!
|
|
254
|
+
@costs.dup
|
|
255
|
+
end
|
|
256
|
+
|
|
257
|
+
# Rank candidates by quality-per-dollar (avg score / total cost), best
|
|
258
|
+
# first. Candidates with zero recorded cost sort last. Raises a helpful
|
|
259
|
+
# error if cost tracking wasn't enabled for the run.
|
|
260
|
+
def value_ranking
|
|
261
|
+
ensure_cost_tracking!
|
|
262
|
+
|
|
263
|
+
ranked = scores_by_candidate.map do |candidate, stats|
|
|
264
|
+
cost = @costs[candidate].to_f
|
|
265
|
+
score_per_dollar = cost.positive? ? (stats[:avg_score] / cost).round : nil
|
|
266
|
+
[candidate, { avg_score: stats[:avg_score], cost: cost, score_per_dollar: score_per_dollar }]
|
|
267
|
+
end
|
|
268
|
+
ranked.sort_by { |_, v| -(v[:score_per_dollar] || 0) }.to_h
|
|
239
269
|
end
|
|
240
270
|
|
|
241
271
|
def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
|
|
@@ -385,6 +415,24 @@ module Qualspec
|
|
|
385
415
|
responses: @responses
|
|
386
416
|
}
|
|
387
417
|
end
|
|
418
|
+
|
|
419
|
+
private
|
|
420
|
+
|
|
421
|
+
def ensure_cost_tracking!
|
|
422
|
+
return if @metadata_captured
|
|
423
|
+
|
|
424
|
+
raise Qualspec::Error, <<~MSG.strip
|
|
425
|
+
Cost data was not captured for this run, so cost/value analysis is unavailable.
|
|
426
|
+
Enable it with `track_cost` in the suite definition:
|
|
427
|
+
|
|
428
|
+
Qualspec.evaluation 'My Suite' do
|
|
429
|
+
track_cost
|
|
430
|
+
...
|
|
431
|
+
end
|
|
432
|
+
|
|
433
|
+
(track_cost adds usage accounting to each request via with_metadata.)
|
|
434
|
+
MSG
|
|
435
|
+
end
|
|
388
436
|
end
|
|
389
437
|
end
|
|
390
438
|
end
|
data/lib/qualspec/version.rb
CHANGED
data/lib/qualspec.rb
CHANGED
|
@@ -7,6 +7,7 @@ module Qualspec
|
|
|
7
7
|
end
|
|
8
8
|
|
|
9
9
|
require_relative 'qualspec/configuration'
|
|
10
|
+
require_relative 'qualspec/model_registry'
|
|
10
11
|
require_relative 'qualspec/client'
|
|
11
12
|
require_relative 'qualspec/evaluation'
|
|
12
13
|
require_relative 'qualspec/prompt_variant'
|
|
@@ -37,6 +38,7 @@ module Qualspec
|
|
|
37
38
|
@configuration = nil
|
|
38
39
|
@client = nil
|
|
39
40
|
@judge = nil
|
|
41
|
+
@models = nil
|
|
40
42
|
Rubric.clear!
|
|
41
43
|
Suite.clear!
|
|
42
44
|
Suite::Behavior.clear!
|
|
@@ -50,6 +52,21 @@ module Qualspec
|
|
|
50
52
|
@judge ||= Judge.new
|
|
51
53
|
end
|
|
52
54
|
|
|
55
|
+
# Registry of named models loaded from config/models.yml (or
|
|
56
|
+
# QUALSPEC_MODELS_FILE). See ModelRegistry.
|
|
57
|
+
def models
|
|
58
|
+
@models ||= ModelRegistry.new
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Resolve a named model to its slug, falling back to the default
|
|
62
|
+
# (openrouter/auto). Returns the default when name is nil/unknown.
|
|
63
|
+
#
|
|
64
|
+
# Qualspec.model(:glm) # => "z-ai/glm-5.2"
|
|
65
|
+
# Qualspec.model # => "openrouter/auto"
|
|
66
|
+
def model(name = nil)
|
|
67
|
+
models.resolve(name)
|
|
68
|
+
end
|
|
69
|
+
|
|
53
70
|
# Convenience method for defining rubrics
|
|
54
71
|
def define_rubric(name, &block)
|
|
55
72
|
Rubric.define(name, &block)
|
data/qualspec_structure.md
CHANGED
|
@@ -7,10 +7,12 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
|
|
|
7
7
|
|
|
8
8
|
### Core Library Files (lib/qualspec/)
|
|
9
9
|
- **builtin_rubrics.rb** - Built-in evaluation criteria
|
|
10
|
-
- **client.rb** - API client for LLM interactions
|
|
10
|
+
- **client.rb** - API client for LLM interactions (cost/token metadata optional)
|
|
11
11
|
- **configuration.rb** - Configuration management
|
|
12
12
|
- **evaluation.rb** - Core evaluation logic
|
|
13
13
|
- **judge.rb** - LLM judge implementation
|
|
14
|
+
- **model_registry.rb** - Named models from `config/models.yml` (`Qualspec.model`)
|
|
15
|
+
- **prompt_variant.rb** - Variant value object (FactoryBot target)
|
|
14
16
|
- **recorder.rb** - VCR integration for recording
|
|
15
17
|
- **rspec.rb** - RSpec integration entry point
|
|
16
18
|
- **rubric.rb** - Custom rubric definitions
|
|
@@ -23,10 +25,11 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
|
|
|
23
25
|
### Configuration Environment Variables
|
|
24
26
|
| Variable | Description | Default |
|
|
25
27
|
|----------|-------------|---------|
|
|
26
|
-
| QUALSPEC_API_KEY | API key (
|
|
28
|
+
| QUALSPEC_API_KEY | API key (falls back to OPEN_ROUTER_API_KEY) | - |
|
|
27
29
|
| QUALSPEC_API_URL | API endpoint | https://openrouter.ai/api/v1 |
|
|
28
|
-
| QUALSPEC_MODEL | Default model for candidates |
|
|
30
|
+
| QUALSPEC_MODEL | Default model for candidates | openrouter/auto |
|
|
29
31
|
| QUALSPEC_JUDGE_MODEL | Model used as judge | Same as QUALSPEC_MODEL |
|
|
32
|
+
| QUALSPEC_MODELS_FILE | Named-models YAML | config/models.yml |
|
|
30
33
|
|
|
31
34
|
### Key Features
|
|
32
35
|
1. **Model Comparison CLI** - Compare multiple models on the same prompts
|
|
@@ -36,6 +39,9 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
|
|
|
36
39
|
5. **Custom Rubrics** - Define your own evaluation criteria
|
|
37
40
|
6. **VCR Recording** - Record and replay API calls for testing
|
|
38
41
|
7. **HTML Reports** - Generate visual comparison reports
|
|
42
|
+
8. **Named Model Registry** - Reference curated models by name (`Qualspec.model`)
|
|
43
|
+
9. **Cost Tracking** - Opt-in per-call cost + quality-per-dollar `value_ranking`
|
|
44
|
+
10. **Variant & Temperature Matrix** - Combinatorial prompt testing via FactoryBot
|
|
39
45
|
|
|
40
46
|
### Example: Model Comparison
|
|
41
47
|
```ruby
|
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: qualspec
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Stiens
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: exe
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: faraday
|
|
@@ -70,7 +69,9 @@ files:
|
|
|
70
69
|
- CHANGELOG.md
|
|
71
70
|
- README.md
|
|
72
71
|
- Rakefile
|
|
72
|
+
- config/models.yml
|
|
73
73
|
- docs/.DS_Store
|
|
74
|
+
- docs/alpha_readiness.md
|
|
74
75
|
- docs/configuration.md
|
|
75
76
|
- docs/evaluation-suites.md
|
|
76
77
|
- docs/getting-started.md
|
|
@@ -79,7 +80,13 @@ files:
|
|
|
79
80
|
- docs/rubrics.md
|
|
80
81
|
- docs/to_implement/factory_bot_integration_design.md
|
|
81
82
|
- docs/to_implement/variants_first_pass.md
|
|
83
|
+
- examples/EXAMPLES.md
|
|
82
84
|
- examples/README.md
|
|
85
|
+
- examples/best_value.rb
|
|
86
|
+
- examples/cassettes/best_value.yml
|
|
87
|
+
- examples/cassettes/character_consistency.yml
|
|
88
|
+
- examples/cassettes/customer_service_comparison.yml
|
|
89
|
+
- examples/cassettes/date_awareness_gate.yml
|
|
83
90
|
- examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml
|
|
84
91
|
- examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml
|
|
85
92
|
- examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml
|
|
@@ -87,7 +94,10 @@ files:
|
|
|
87
94
|
- examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml
|
|
88
95
|
- examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml
|
|
89
96
|
- examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml
|
|
97
|
+
- examples/character_consistency.rb
|
|
90
98
|
- examples/comparison.rb
|
|
99
|
+
- examples/customer_service_comparison.rb
|
|
100
|
+
- examples/date_awareness_gate.rb
|
|
91
101
|
- examples/model_comparison.rb
|
|
92
102
|
- examples/persona_test.rb
|
|
93
103
|
- examples/prompt_variants_factory.rb
|
|
@@ -104,6 +114,7 @@ files:
|
|
|
104
114
|
- lib/qualspec/configuration.rb
|
|
105
115
|
- lib/qualspec/evaluation.rb
|
|
106
116
|
- lib/qualspec/judge.rb
|
|
117
|
+
- lib/qualspec/model_registry.rb
|
|
107
118
|
- lib/qualspec/prompt_variant.rb
|
|
108
119
|
- lib/qualspec/recorder.rb
|
|
109
120
|
- lib/qualspec/rspec.rb
|
|
@@ -130,7 +141,6 @@ metadata:
|
|
|
130
141
|
homepage_uri: https://github.com/estiens/qualspec
|
|
131
142
|
source_code_uri: https://github.com/estiens/qualspec
|
|
132
143
|
changelog_uri: https://github.com/estiens/qualspec/blob/main/CHANGELOG.md
|
|
133
|
-
post_install_message:
|
|
134
144
|
rdoc_options: []
|
|
135
145
|
require_paths:
|
|
136
146
|
- lib
|
|
@@ -138,15 +148,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
138
148
|
requirements:
|
|
139
149
|
- - ">="
|
|
140
150
|
- !ruby/object:Gem::Version
|
|
141
|
-
version: 3.
|
|
151
|
+
version: 3.3.0
|
|
142
152
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
143
153
|
requirements:
|
|
144
154
|
- - ">="
|
|
145
155
|
- !ruby/object:Gem::Version
|
|
146
156
|
version: '0'
|
|
147
157
|
requirements: []
|
|
148
|
-
rubygems_version: 3.
|
|
149
|
-
signing_key:
|
|
158
|
+
rubygems_version: 3.6.9
|
|
150
159
|
specification_version: 4
|
|
151
160
|
summary: RSpec DSL for qualitative LLM-judged testing
|
|
152
161
|
test_files: []
|