qualspec 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +14 -0
  3. data/.rubocop_todo.yml +1 -1
  4. data/CHANGELOG.md +31 -0
  5. data/README.md +27 -5
  6. data/config/models.yml +23 -0
  7. data/docs/alpha_readiness.md +94 -0
  8. data/docs/configuration.md +53 -4
  9. data/docs/evaluation-suites.md +45 -2
  10. data/docs/getting-started.md +5 -2
  11. data/docs/recording.md +22 -0
  12. data/examples/EXAMPLES.md +73 -0
  13. data/examples/README.md +5 -0
  14. data/examples/best_value.rb +67 -0
  15. data/examples/cassettes/best_value.yml +649 -0
  16. data/examples/cassettes/character_consistency.yml +680 -0
  17. data/examples/cassettes/customer_service_comparison.yml +593 -0
  18. data/examples/cassettes/date_awareness_gate.yml +420 -0
  19. data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +4 -4
  20. data/examples/character_consistency.rb +83 -0
  21. data/examples/comparison.rb +0 -0
  22. data/examples/customer_service_comparison.rb +59 -0
  23. data/examples/date_awareness_gate.rb +57 -0
  24. data/examples/model_comparison.rb +0 -0
  25. data/examples/persona_test.rb +0 -0
  26. data/examples/prompt_variants_factory.rb +0 -0
  27. data/examples/quick_test.rb +0 -0
  28. data/examples/rspec_example_spec.rb +0 -0
  29. data/examples/simple_variant_comparison.rb +0 -0
  30. data/examples/variant_comparison.rb +0 -0
  31. data/exe/qualspec +4 -4
  32. data/lib/qualspec/client.rb +14 -7
  33. data/lib/qualspec/configuration.rb +18 -5
  34. data/lib/qualspec/judge.rb +1 -1
  35. data/lib/qualspec/model_registry.rb +62 -0
  36. data/lib/qualspec/recorder.rb +41 -3
  37. data/lib/qualspec/suite/candidate.rb +7 -4
  38. data/lib/qualspec/suite/dsl.rb +16 -1
  39. data/lib/qualspec/suite/html_reporter.rb +8 -8
  40. data/lib/qualspec/suite/runner.rb +67 -8
  41. data/lib/qualspec/version.rb +1 -1
  42. data/lib/qualspec.rb +17 -0
  43. data/qualspec_structure.md +9 -3
  44. metadata +16 -8
  45. data/.DS_Store +0 -0
File without changes
File without changes
File without changes
File without changes
data/exe/qualspec CHANGED
@@ -66,10 +66,10 @@ parser = OptionParser.new do |opts|
66
66
  puts opts
67
67
  puts
68
68
  puts 'Environment variables:'
69
- puts ' QUALSPEC_API_URL API endpoint (default: http://localhost:11434/v1)'
70
- puts ' QUALSPEC_API_KEY API key for authentication'
71
- puts ' QUALSPEC_MODEL Default model for candidates'
72
- puts ' QUALSPEC_JUDGE_MODEL Model to use as judge'
69
+ puts ' QUALSPEC_API_URL API endpoint (default: https://openrouter.ai/api/v1)'
70
+ puts ' QUALSPEC_API_KEY API key for authentication (falls back to OPEN_ROUTER_API_KEY)'
71
+ puts ' QUALSPEC_MODEL Default model for candidates (default: openrouter/auto)'
72
+ puts ' QUALSPEC_JUDGE_MODEL Model to use as judge (default: same as QUALSPEC_MODEL)'
73
73
  puts
74
74
  puts 'Example:'
75
75
  puts ' qualspec eval/model_comparison.rb'
@@ -53,8 +53,8 @@ module Qualspec
53
53
  return if @config.api_key_configured?
54
54
 
55
55
  raise Qualspec::Error, <<~MSG.strip
56
- QUALSPEC_API_KEY is required but not set.
57
- Set it via environment variable or Qualspec.configure { |c| c.api_key = '...' }
56
+ No API key set. Set QUALSPEC_API_KEY (or OPEN_ROUTER_API_KEY) as an
57
+ environment variable, or use Qualspec.configure { |c| c.api_key = '...' }
58
58
  MSG
59
59
  end
60
60
 
@@ -70,6 +70,10 @@ module Qualspec
70
70
  # Set temperature if provided
71
71
  payload[:temperature] = temperature if temperature
72
72
 
73
+ # Ask OpenRouter to include usage accounting (cost + token details).
74
+ # Only when metadata is requested, so cost-less calls stay lean.
75
+ payload[:usage] = { include: true } if with_metadata
76
+
73
77
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
74
78
 
75
79
  response = @conn.post('chat/completions', payload)
@@ -108,12 +112,15 @@ module Qualspec
108
112
  end
109
113
 
110
114
  def extract_cost(response, data)
111
- # OpenRouter includes cost in response or headers
112
- header_cost = response.headers['x-openrouter-cost']
113
- return header_cost.to_f if header_cost
115
+ # OpenRouter returns cost under usage.cost when usage accounting is
116
+ # requested (usage: { include: true }). Fall back to other shapes for
117
+ # other OpenAI-compatible providers.
118
+ usage = data['usage'] || {}
119
+ cost = usage['cost'] || usage['total_cost'] || data['cost']
120
+ return cost.to_f if cost
114
121
 
115
- # Check response body (some providers include it)
116
- data.dig('usage', 'total_cost') || data['cost']
122
+ header_cost = response.headers['x-openrouter-cost']
123
+ header_cost&.to_f
117
124
  end
118
125
 
119
126
  def extract_tokens(data)
@@ -2,15 +2,20 @@
2
2
 
3
3
  module Qualspec
4
4
  class Configuration
5
- attr_accessor :api_url, :api_key, :default_model, :judge_model, :cache_enabled, :cache_dir, :judge_system_prompt,
5
+ attr_accessor :api_url, :default_model, :judge_model, :cache_enabled, :cache_dir, :judge_system_prompt,
6
6
  :request_timeout
7
+ attr_writer :api_key
7
8
 
8
9
  DEFAULT_API_URL = 'https://openrouter.ai/api/v1'
9
- DEFAULT_MODEL = 'google/gemini-3-flash-preview'
10
+ # Universal fallback. `openrouter/auto` routes to a sensible model for any
11
+ # request, so qualspec works even with no model configured anywhere.
12
+ DEFAULT_MODEL = 'openrouter/auto'
10
13
 
11
14
  def initialize
12
15
  @api_url = ENV.fetch('QUALSPEC_API_URL', DEFAULT_API_URL)
13
- @api_key = ENV['QUALSPEC_API_KEY']
16
+ # Default nil: set explicitly via Qualspec.configure { |c| c.api_key = ... }.
17
+ # When unset, #api_key falls back to env vars (see reader below).
18
+ @api_key = nil
14
19
  @default_model = ENV.fetch('QUALSPEC_MODEL', DEFAULT_MODEL)
15
20
  @judge_model = ENV.fetch('QUALSPEC_JUDGE_MODEL') { @default_model }
16
21
  @cache_enabled = false
@@ -19,14 +24,22 @@ module Qualspec
19
24
  @request_timeout = 120
20
25
  end
21
26
 
27
+ # Explicitly configured key wins; otherwise fall back to env vars.
28
+ # Prefer QUALSPEC_API_KEY, then OPEN_ROUTER_API_KEY (default backend is
29
+ # OpenRouter). The env vars are a convenience fallback, not a requirement —
30
+ # pass api_key in Qualspec.configure to avoid relying on them.
31
+ def api_key
32
+ @api_key || ENV['QUALSPEC_API_KEY'] || ENV['OPEN_ROUTER_API_KEY']
33
+ end
34
+
22
35
  def api_headers
23
36
  headers = { 'Content-Type' => 'application/json' }
24
- headers['Authorization'] = "Bearer #{@api_key}" unless @api_key.to_s.empty?
37
+ headers['Authorization'] = "Bearer #{api_key}" unless api_key.to_s.empty?
25
38
  headers
26
39
  end
27
40
 
28
41
  def api_key_configured?
29
- !@api_key.to_s.empty?
42
+ !api_key.to_s.empty?
30
43
  end
31
44
  end
32
45
  end
@@ -141,7 +141,7 @@ module Qualspec
141
141
  parts << '## Responses:'
142
142
 
143
143
  responses.each do |candidate, response|
144
- parts << "\n### #{candidate}:\n#{response}"
144
+ parts << "\n### #{candidate}:\n```\n#{response}\n```"
145
145
  end
146
146
 
147
147
  parts << "\nScore each candidate (#{candidate_names}) from 0-10."
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+
5
+ module Qualspec
6
+ # Loads a curated list of named models from a YAML config file and resolves
7
+ # names to their full provider slugs. Unknown/blank names fall back to the
8
+ # configured default (ultimately Configuration::DEFAULT_MODEL, openrouter/auto),
9
+ # so model lookups always return something usable.
10
+ #
11
+ # @example config/models.yml
12
+ # default: openrouter/auto
13
+ # models:
14
+ # glm: z-ai/glm-5.2
15
+ #
16
+ # @example
17
+ # Qualspec.model(:glm) # => "z-ai/glm-5.2"
18
+ # Qualspec.model(:nope) # => "openrouter/auto"
19
+ # Qualspec.model # => "openrouter/auto"
20
+ class ModelRegistry
21
+ DEFAULT_CONFIG_PATH = 'config/models.yml'
22
+
23
+ def initialize(path: nil, default: nil)
24
+ @models = {}
25
+ @default = default
26
+ load_file(path || ENV['QUALSPEC_MODELS_FILE'] || DEFAULT_CONFIG_PATH)
27
+ end
28
+
29
+ # Resolve a model name to its slug, falling back to the default.
30
+ #
31
+ # @param name [Symbol, String, nil] the configured name (or nil for default)
32
+ # @return [String] a model slug
33
+ def resolve(name = nil)
34
+ return default if name.nil? || name.to_s.empty?
35
+
36
+ @models.fetch(name.to_s, default)
37
+ end
38
+
39
+ # @return [Hash{String=>String}] all configured name => slug pairs
40
+ def all
41
+ @models.dup
42
+ end
43
+
44
+ # @return [String] the universal fallback model
45
+ def default
46
+ @default || Configuration::DEFAULT_MODEL
47
+ end
48
+
49
+ private
50
+
51
+ def load_file(path)
52
+ return unless path && File.exist?(path)
53
+
54
+ data = YAML.safe_load_file(path) || {}
55
+ @default ||= data['default']
56
+ (data['models'] || {}).each { |name, slug| @models[name.to_s] = slug }
57
+ rescue StandardError
58
+ # A malformed config file should never break a run; defaults still apply.
59
+ nil
60
+ end
61
+ end
62
+ end
@@ -13,16 +13,32 @@ module Qualspec
13
13
  def setup(cassette_dir: '.qualspec_cassettes')
14
14
  require_vcr!
15
15
 
16
+ recorder = self
16
17
  VCR.configure do |config|
17
18
  config.cassette_library_dir = cassette_dir
18
19
  config.hook_into :faraday
19
20
  config.default_cassette_options = {
20
21
  record: :new_episodes,
21
- match_requests_on: %i[method uri body]
22
+ match_requests_on: %i[method uri body_without_model]
22
23
  }
23
- # Filter out API keys
24
- config.filter_sensitive_data('<API_KEY>') { Qualspec.configuration.api_key }
24
+ # Filter out API keys — guard against adding duplicate filters
25
+ unless @api_key_filter_registered
26
+ config.filter_sensitive_data('<API_KEY>') { Qualspec.configuration.api_key }
27
+ @api_key_filter_registered = true
28
+ end
25
29
  end
30
+
31
+ # Register custom matcher once — ignores the `model` field so cassettes
32
+ # recorded with one model work in CI where a different model is configured.
33
+ unless @matcher_registered
34
+ VCR.configure do |config|
35
+ config.register_request_matcher(:body_without_model) do |r1, r2|
36
+ recorder.send(:normalize_body_for_match, r1.body) == recorder.send(:normalize_body_for_match, r2.body)
37
+ end
38
+ end
39
+ @matcher_registered = true
40
+ end
41
+
26
42
  @configured = true
27
43
  end
28
44
 
@@ -40,6 +56,20 @@ module Qualspec
40
56
  VCR.use_cassette(name, record: :none, &block)
41
57
  end
42
58
 
59
+ # Replay a cassette if it already exists (no API key required), otherwise
60
+ # record a fresh one. Ideal for examples that ship a committed cassette so
61
+ # they run for free, but still record on first run.
62
+ def use_cassette(name, &block)
63
+ setup unless configured?
64
+ mode = cassette_exists?(name) ? :none : :new_episodes
65
+ VCR.use_cassette(name, record: mode, &block)
66
+ end
67
+
68
+ def cassette_exists?(name)
69
+ require_vcr!
70
+ File.exist?(File.join(VCR.configuration.cassette_library_dir, "#{name}.yml"))
71
+ end
72
+
43
73
  private
44
74
 
45
75
  def require_vcr!
@@ -50,6 +80,14 @@ module Qualspec
50
80
  Add to your Gemfile: gem 'vcr'
51
81
  MSG
52
82
  end
83
+
84
+ def normalize_body_for_match(body)
85
+ parsed = JSON.parse(body)
86
+ parsed.delete('model')
87
+ JSON.generate(parsed)
88
+ rescue JSON::ParserError
89
+ body
90
+ end
53
91
  end
54
92
  end
55
93
  end
@@ -5,14 +5,16 @@ module Qualspec
5
5
  class Candidate
6
6
  attr_reader :name, :model, :system_prompt, :options
7
7
 
8
- def initialize(name, model:, system_prompt: nil, **options)
8
+ def initialize(name, model: nil, system_prompt: nil, **options)
9
9
  @name = name.to_s
10
- @model = model
10
+ # Fall back to the configured default model (ultimately openrouter/auto)
11
+ # so a candidate works even when no model is specified.
12
+ @model = model || Qualspec.configuration.default_model
11
13
  @system_prompt = system_prompt
12
14
  @options = options
13
15
  end
14
16
 
15
- def generate_response(prompt:, system_prompt: nil, temperature: nil)
17
+ def generate_response(prompt:, system_prompt: nil, temperature: nil, with_metadata: false)
16
18
  messages = []
17
19
 
18
20
  sys = system_prompt || @system_prompt
@@ -23,7 +25,8 @@ module Qualspec
23
25
  model: @model,
24
26
  messages: messages,
25
27
  json_mode: false, # We want natural responses, not JSON
26
- temperature: normalize_temperature(temperature)
28
+ temperature: normalize_temperature(temperature),
29
+ with_metadata: with_metadata
27
30
  )
28
31
  end
29
32
 
@@ -11,16 +11,31 @@ module Qualspec
11
11
  @scenarios_list = []
12
12
  @variants_config = nil
13
13
  @temperature_list = [nil] # nil means use model default
14
+ @track_cost = false
14
15
 
15
16
  instance_eval(&block) if block_given? # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
16
17
  end
17
18
 
19
+ # DSL: capture per-call cost + token metadata so cost/value analysis works.
20
+ # Off by default — evaluations that don't look at cost skip the overhead.
21
+ #
22
+ # @example
23
+ # track_cost
24
+ def track_cost(value = true) # rubocop:disable Style/OptionalBooleanParameter -- reads as a DSL toggle
25
+ @track_cost = value
26
+ end
27
+ alias capture_metadata track_cost
28
+
29
+ def track_cost?
30
+ @track_cost
31
+ end
32
+
18
33
  # DSL: define candidates
19
34
  def candidates(&block)
20
35
  instance_eval(&block) # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
21
36
  end
22
37
 
23
- def candidate(name, model:, system_prompt: nil, **options)
38
+ def candidate(name, model: nil, system_prompt: nil, **options)
24
39
  @candidates_list << Candidate.new(name, model: model, system_prompt: system_prompt, **options)
25
40
  end
26
41
 
@@ -481,10 +481,12 @@ module Qualspec
481
481
 
482
482
  scenario_blocks = scenarios.map do |scenario|
483
483
  response_cards = responses.map do |candidate, candidate_responses|
484
- response = candidate_responses[scenario]
485
- next unless response
484
+ variant_map = candidate_responses[scenario]
485
+ next unless variant_map
486
486
 
487
- response_text = response.to_s.strip
487
+ contents = variant_map.flat_map { |_v, tm| tm.values.map { |d| d[:content] } }.compact
488
+ response_text = contents.join("\n\n---\n\n").strip
489
+ next if response_text.empty?
488
490
 
489
491
  <<~CARD
490
492
  <div class="response-card">
@@ -660,13 +662,11 @@ module Qualspec
660
662
  end
661
663
 
662
664
  def get_candidate_model(candidate)
663
- # Try to find the model from the suite
664
- @results.evaluations.find { |e| e[:candidate] == candidate }&.dig(:model) || 'unknown'
665
+ @results.candidate_models[candidate] || 'unknown'
665
666
  end
666
667
 
667
- def get_scenario_prompt(_scenario)
668
- # This would need to be stored in results - for now return nil
669
- nil
668
+ def get_scenario_prompt(scenario)
669
+ @results.prompts[scenario]
670
670
  end
671
671
  end
672
672
  end
@@ -11,6 +11,12 @@ module Qualspec
11
11
  @definition = definition.is_a?(String) ? Suite.find(definition) : definition
12
12
  @results = Results.new(@definition.name)
13
13
  @judge = Qualspec.judge
14
+
15
+ @definition.candidates_list.each do |c|
16
+ @results.candidate_models[c.name] = c.model
17
+ end
18
+
19
+ @results.metadata_captured = @definition.track_cost?
14
20
  end
15
21
 
16
22
  def run(progress: true)
@@ -52,6 +58,8 @@ module Qualspec
52
58
  responses = {}
53
59
  errors = {}
54
60
 
61
+ @results.prompts[scenario.name] ||= scenario.compose_prompt(variant)
62
+
55
63
  # Phase 1: Collect all candidate responses
56
64
  @definition.candidates_list.each do |candidate|
57
65
  log_candidate_progress(candidate, scenario, 'generating') if progress
@@ -100,7 +108,8 @@ module Qualspec
100
108
  response = candidate.generate_response(
101
109
  prompt: final_prompt,
102
110
  system_prompt: final_system_prompt,
103
- temperature: effective_temperature
111
+ temperature: effective_temperature,
112
+ with_metadata: @definition.track_cost?
104
113
  )
105
114
 
106
115
  duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
@@ -217,7 +226,9 @@ module Qualspec
217
226
 
218
227
  # Results container with multi-dimensional support
219
228
  class Results
220
- attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs
229
+ attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs,
230
+ :candidate_models, :prompts
231
+ attr_accessor :metadata_captured
221
232
 
222
233
  def initialize(suite_name)
223
234
  @suite_name = suite_name
@@ -225,8 +236,36 @@ module Qualspec
225
236
  @responses = {} # Nested: {candidate => {scenario => {variant => {temp => response}}}}
226
237
  @timing = {}
227
238
  @costs = {}
239
+ @candidate_models = {} # {candidate_name => model_string}
240
+ @prompts = {} # {scenario_name => prompt_string}
228
241
  @started_at = Time.now
229
242
  @finished_at = nil
243
+ @metadata_captured = false # set true when the suite enables track_cost
244
+ end
245
+
246
+ # Whether per-call cost/token metadata was captured this run.
247
+ def costs_tracked?
248
+ @metadata_captured
249
+ end
250
+
251
+ # Total cost per candidate. Raises if cost tracking wasn't enabled.
252
+ def cost_by_candidate
253
+ ensure_cost_tracking!
254
+ @costs.dup
255
+ end
256
+
257
+ # Rank candidates by quality-per-dollar (avg score / total cost), best
258
+ # first. Candidates with zero recorded cost sort last. Raises a helpful
259
+ # error if cost tracking wasn't enabled for the run.
260
+ def value_ranking
261
+ ensure_cost_tracking!
262
+
263
+ ranked = scores_by_candidate.map do |candidate, stats|
264
+ cost = @costs[candidate].to_f
265
+ score_per_dollar = cost.positive? ? (stats[:avg_score] / cost).round : nil
266
+ [candidate, { avg_score: stats[:avg_score], cost: cost, score_per_dollar: score_per_dollar }]
267
+ end
268
+ ranked.sort_by { |_, v| -(v[:score_per_dollar] || 0) }.to_h
230
269
  end
231
270
 
232
271
  def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
@@ -329,13 +368,15 @@ module Qualspec
329
368
  def scores_by_scenario
330
369
  @evaluations.group_by { |e| e[:scenario] }.transform_values do |evals|
331
370
  evals.group_by { |e| e[:candidate] }.transform_values do |candidate_evals|
332
- eval_data = candidate_evals.first
371
+ total = candidate_evals.size
372
+ avg_score = (candidate_evals.sum { |e| e[:score] }.to_f / total).round(2)
373
+ first = candidate_evals.first
333
374
  {
334
- score: eval_data[:score],
335
- pass: eval_data[:pass],
336
- reasoning: eval_data[:reasoning],
337
- variant: eval_data[:variant],
338
- temperature: eval_data[:temperature]
375
+ score: avg_score,
376
+ pass: candidate_evals.all? { |e| e[:pass] },
377
+ reasoning: first[:reasoning],
378
+ variant: first[:variant],
379
+ temperature: first[:temperature]
339
380
  }
340
381
  end
341
382
  end
@@ -374,6 +415,24 @@ module Qualspec
374
415
  responses: @responses
375
416
  }
376
417
  end
418
+
419
+ private
420
+
421
+ def ensure_cost_tracking!
422
+ return if @metadata_captured
423
+
424
+ raise Qualspec::Error, <<~MSG.strip
425
+ Cost data was not captured for this run, so cost/value analysis is unavailable.
426
+ Enable it with `track_cost` in the suite definition:
427
+
428
+ Qualspec.evaluation 'My Suite' do
429
+ track_cost
430
+ ...
431
+ end
432
+
433
+ (track_cost adds usage accounting to each request via with_metadata.)
434
+ MSG
435
+ end
377
436
  end
378
437
  end
379
438
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Qualspec
4
- VERSION = '0.1.1'
4
+ VERSION = '0.2.0'
5
5
  end
data/lib/qualspec.rb CHANGED
@@ -7,6 +7,7 @@ module Qualspec
7
7
  end
8
8
 
9
9
  require_relative 'qualspec/configuration'
10
+ require_relative 'qualspec/model_registry'
10
11
  require_relative 'qualspec/client'
11
12
  require_relative 'qualspec/evaluation'
12
13
  require_relative 'qualspec/prompt_variant'
@@ -37,6 +38,7 @@ module Qualspec
37
38
  @configuration = nil
38
39
  @client = nil
39
40
  @judge = nil
41
+ @models = nil
40
42
  Rubric.clear!
41
43
  Suite.clear!
42
44
  Suite::Behavior.clear!
@@ -50,6 +52,21 @@ module Qualspec
50
52
  @judge ||= Judge.new
51
53
  end
52
54
 
55
+ # Registry of named models loaded from config/models.yml (or
56
+ # QUALSPEC_MODELS_FILE). See ModelRegistry.
57
+ def models
58
+ @models ||= ModelRegistry.new
59
+ end
60
+
61
+ # Resolve a named model to its slug, falling back to the default
62
+ # (openrouter/auto). Returns the default when name is nil/unknown.
63
+ #
64
+ # Qualspec.model(:glm) # => "z-ai/glm-5.2"
65
+ # Qualspec.model # => "openrouter/auto"
66
+ def model(name = nil)
67
+ models.resolve(name)
68
+ end
69
+
53
70
  # Convenience method for defining rubrics
54
71
  def define_rubric(name, &block)
55
72
  Rubric.define(name, &block)
@@ -7,10 +7,12 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
7
7
 
8
8
  ### Core Library Files (lib/qualspec/)
9
9
  - **builtin_rubrics.rb** - Built-in evaluation criteria
10
- - **client.rb** - API client for LLM interactions
10
+ - **client.rb** - API client for LLM interactions (cost/token metadata optional)
11
11
  - **configuration.rb** - Configuration management
12
12
  - **evaluation.rb** - Core evaluation logic
13
13
  - **judge.rb** - LLM judge implementation
14
+ - **model_registry.rb** - Named models from `config/models.yml` (`Qualspec.model`)
15
+ - **prompt_variant.rb** - Variant value object (FactoryBot target)
14
16
  - **recorder.rb** - VCR integration for recording
15
17
  - **rspec.rb** - RSpec integration entry point
16
18
  - **rubric.rb** - Custom rubric definitions
@@ -23,10 +25,11 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
23
25
  ### Configuration Environment Variables
24
26
  | Variable | Description | Default |
25
27
  |----------|-------------|---------|
26
- | QUALSPEC_API_KEY | API key (required) | - |
28
+ | QUALSPEC_API_KEY | API key (falls back to OPEN_ROUTER_API_KEY) | - |
27
29
  | QUALSPEC_API_URL | API endpoint | https://openrouter.ai/api/v1 |
28
- | QUALSPEC_MODEL | Default model for candidates | google/gemini-3-flash-preview |
30
+ | QUALSPEC_MODEL | Default model for candidates | openrouter/auto |
29
31
  | QUALSPEC_JUDGE_MODEL | Model used as judge | Same as QUALSPEC_MODEL |
32
+ | QUALSPEC_MODELS_FILE | Named-models YAML | config/models.yml |
30
33
 
31
34
  ### Key Features
32
35
  1. **Model Comparison CLI** - Compare multiple models on the same prompts
@@ -36,6 +39,9 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
36
39
  5. **Custom Rubrics** - Define your own evaluation criteria
37
40
  6. **VCR Recording** - Record and replay API calls for testing
38
41
  7. **HTML Reports** - Generate visual comparison reports
42
+ 8. **Named Model Registry** - Reference curated models by name (`Qualspec.model`)
43
+ 9. **Cost Tracking** - Opt-in per-call cost + quality-per-dollar `value_ranking`
44
+ 10. **Variant & Temperature Matrix** - Combinatorial prompt testing via FactoryBot
39
45
 
40
46
  ### Example: Model Comparison
41
47
  ```ruby
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: qualspec
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Stiens
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2026-01-05 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: faraday
@@ -62,7 +61,6 @@ executables:
62
61
  extensions: []
63
62
  extra_rdoc_files: []
64
63
  files:
65
- - ".DS_Store"
66
64
  - ".qualspec_cassettes/comparison_test.yml"
67
65
  - ".qualspec_cassettes/quick_test.yml"
68
66
  - ".rspec"
@@ -71,7 +69,9 @@ files:
71
69
  - CHANGELOG.md
72
70
  - README.md
73
71
  - Rakefile
72
+ - config/models.yml
74
73
  - docs/.DS_Store
74
+ - docs/alpha_readiness.md
75
75
  - docs/configuration.md
76
76
  - docs/evaluation-suites.md
77
77
  - docs/getting-started.md
@@ -80,7 +80,13 @@ files:
80
80
  - docs/rubrics.md
81
81
  - docs/to_implement/factory_bot_integration_design.md
82
82
  - docs/to_implement/variants_first_pass.md
83
+ - examples/EXAMPLES.md
83
84
  - examples/README.md
85
+ - examples/best_value.rb
86
+ - examples/cassettes/best_value.yml
87
+ - examples/cassettes/character_consistency.yml
88
+ - examples/cassettes/customer_service_comparison.yml
89
+ - examples/cassettes/date_awareness_gate.yml
84
90
  - examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml
85
91
  - examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml
86
92
  - examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml
@@ -88,7 +94,10 @@ files:
88
94
  - examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml
89
95
  - examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml
90
96
  - examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml
97
+ - examples/character_consistency.rb
91
98
  - examples/comparison.rb
99
+ - examples/customer_service_comparison.rb
100
+ - examples/date_awareness_gate.rb
92
101
  - examples/model_comparison.rb
93
102
  - examples/persona_test.rb
94
103
  - examples/prompt_variants_factory.rb
@@ -105,6 +114,7 @@ files:
105
114
  - lib/qualspec/configuration.rb
106
115
  - lib/qualspec/evaluation.rb
107
116
  - lib/qualspec/judge.rb
117
+ - lib/qualspec/model_registry.rb
108
118
  - lib/qualspec/prompt_variant.rb
109
119
  - lib/qualspec/recorder.rb
110
120
  - lib/qualspec/rspec.rb
@@ -131,7 +141,6 @@ metadata:
131
141
  homepage_uri: https://github.com/estiens/qualspec
132
142
  source_code_uri: https://github.com/estiens/qualspec
133
143
  changelog_uri: https://github.com/estiens/qualspec/blob/main/CHANGELOG.md
134
- post_install_message:
135
144
  rdoc_options: []
136
145
  require_paths:
137
146
  - lib
@@ -139,15 +148,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
139
148
  requirements:
140
149
  - - ">="
141
150
  - !ruby/object:Gem::Version
142
- version: 3.1.0
151
+ version: 3.3.0
143
152
  required_rubygems_version: !ruby/object:Gem::Requirement
144
153
  requirements:
145
154
  - - ">="
146
155
  - !ruby/object:Gem::Version
147
156
  version: '0'
148
157
  requirements: []
149
- rubygems_version: 3.5.22
150
- signing_key:
158
+ rubygems_version: 3.6.9
151
159
  specification_version: 4
152
160
  summary: RSpec DSL for qualitative LLM-judged testing
153
161
  test_files: []
data/.DS_Store DELETED
Binary file