RubyGems - qualspec - Versions diffs - 0.1.2 → 0.2.0 - Mend

qualspec 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

checksums.yaml +4 -4
data/.rubocop.yml +14 -0
data/.rubocop_todo.yml +1 -1
data/CHANGELOG.md +31 -0
data/README.md +27 -5
data/config/models.yml +23 -0
data/docs/alpha_readiness.md +94 -0
data/docs/configuration.md +53 -4
data/docs/evaluation-suites.md +45 -2
data/docs/getting-started.md +5 -2
data/docs/recording.md +22 -0
data/examples/EXAMPLES.md +73 -0
data/examples/README.md +5 -0
data/examples/best_value.rb +67 -0
data/examples/cassettes/best_value.yml +649 -0
data/examples/cassettes/character_consistency.yml +680 -0
data/examples/cassettes/customer_service_comparison.yml +593 -0
data/examples/cassettes/date_awareness_gate.yml +420 -0
data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +4 -4
data/examples/character_consistency.rb +83 -0
data/examples/comparison.rb +0 -0
data/examples/customer_service_comparison.rb +59 -0
data/examples/date_awareness_gate.rb +57 -0
data/examples/model_comparison.rb +0 -0
data/examples/persona_test.rb +0 -0
data/examples/prompt_variants_factory.rb +0 -0
data/examples/quick_test.rb +0 -0
data/examples/rspec_example_spec.rb +0 -0
data/examples/simple_variant_comparison.rb +0 -0
data/examples/variant_comparison.rb +0 -0
data/exe/qualspec +4 -4
data/lib/qualspec/client.rb +14 -7
data/lib/qualspec/configuration.rb +18 -5
data/lib/qualspec/model_registry.rb +62 -0
data/lib/qualspec/recorder.rb +41 -3
data/lib/qualspec/suite/candidate.rb +7 -4
data/lib/qualspec/suite/dsl.rb +16 -1
data/lib/qualspec/suite/runner.rb +49 -1
data/lib/qualspec/version.rb +1 -1
data/lib/qualspec.rb +17 -0
data/qualspec_structure.md +9 -3
metadata +16 -7

data/examples/quick_test.rb CHANGED Viewed

File without changes

data/examples/rspec_example_spec.rb CHANGED Viewed

File without changes

data/examples/simple_variant_comparison.rb CHANGED Viewed

File without changes

data/examples/variant_comparison.rb CHANGED Viewed

File without changes

data/exe/qualspec CHANGED Viewed

@@ -66,10 +66,10 @@ parser = OptionParser.new do |opts|
     puts opts
     puts
     puts 'Environment variables:'
-    puts '  QUALSPEC_API_URL   API endpoint (default: http://localhost:11434/v1)'
-    puts '  QUALSPEC_API_KEY   API key for authentication'
-    puts '  QUALSPEC_MODEL     Default model for candidates'
-    puts '  QUALSPEC_JUDGE_MODEL  Model to use as judge'
+    puts '  QUALSPEC_API_URL   API endpoint (default: https://openrouter.ai/api/v1)'
+    puts '  QUALSPEC_API_KEY   API key for authentication (falls back to OPEN_ROUTER_API_KEY)'
+    puts '  QUALSPEC_MODEL     Default model for candidates (default: openrouter/auto)'
+    puts '  QUALSPEC_JUDGE_MODEL  Model to use as judge (default: same as QUALSPEC_MODEL)'
     puts
     puts 'Example:'
     puts '  qualspec eval/model_comparison.rb'

data/lib/qualspec/client.rb CHANGED Viewed

@@ -53,8 +53,8 @@ module Qualspec
       return if @config.api_key_configured?
       raise Qualspec::Error, <<~MSG.strip
-        QUALSPEC_API_KEY is required but not set.
-        Set it via environment variable or Qualspec.configure { |c| c.api_key = '...' }
+        No API key set. Set QUALSPEC_API_KEY (or OPEN_ROUTER_API_KEY) as an
+        environment variable, or use Qualspec.configure { |c| c.api_key = '...' }
       MSG
     end
@@ -70,6 +70,10 @@ module Qualspec
       # Set temperature if provided
       payload[:temperature] = temperature if temperature
+      # Ask OpenRouter to include usage accounting (cost + token details).
+      # Only when metadata is requested, so cost-less calls stay lean.
+      payload[:usage] = { include: true } if with_metadata
       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       response = @conn.post('chat/completions', payload)
@@ -108,12 +112,15 @@ module Qualspec
     end
     def extract_cost(response, data)
-      # OpenRouter includes cost in response or headers
-      header_cost = response.headers['x-openrouter-cost']
-      return header_cost.to_f if header_cost
+      # OpenRouter returns cost under usage.cost when usage accounting is
+      # requested (usage: { include: true }). Fall back to other shapes for
+      # other OpenAI-compatible providers.
+      usage = data['usage'] || {}
+      cost = usage['cost'] || usage['total_cost'] || data['cost']
+      return cost.to_f if cost
-      # Check response body (some providers include it)
-      data.dig('usage', 'total_cost') || data['cost']
+      header_cost = response.headers['x-openrouter-cost']
+      header_cost&.to_f
     end
     def extract_tokens(data)

data/lib/qualspec/configuration.rb CHANGED Viewed

@@ -2,15 +2,20 @@
 module Qualspec
   class Configuration
-    attr_accessor :api_url, :api_key, :default_model, :judge_model, :cache_enabled, :cache_dir, :judge_system_prompt,
+    attr_accessor :api_url, :default_model, :judge_model, :cache_enabled, :cache_dir, :judge_system_prompt,
                   :request_timeout
+    attr_writer :api_key
     DEFAULT_API_URL = 'https://openrouter.ai/api/v1'
-    DEFAULT_MODEL = 'google/gemini-3-flash-preview'
+    # Universal fallback. `openrouter/auto` routes to a sensible model for any
+    # request, so qualspec works even with no model configured anywhere.
+    DEFAULT_MODEL = 'openrouter/auto'
     def initialize
       @api_url = ENV.fetch('QUALSPEC_API_URL', DEFAULT_API_URL)
-      @api_key = ENV['QUALSPEC_API_KEY']
+      # Default nil: set explicitly via Qualspec.configure { |c| c.api_key = ... }.
+      # When unset, #api_key falls back to env vars (see reader below).
+      @api_key = nil
       @default_model = ENV.fetch('QUALSPEC_MODEL', DEFAULT_MODEL)
       @judge_model = ENV.fetch('QUALSPEC_JUDGE_MODEL') { @default_model }
       @cache_enabled = false
@@ -19,14 +24,22 @@ module Qualspec
       @request_timeout = 120
     end
+    # Explicitly configured key wins; otherwise fall back to env vars.
+    # Prefer QUALSPEC_API_KEY, then OPEN_ROUTER_API_KEY (default backend is
+    # OpenRouter). The env vars are a convenience fallback, not a requirement —
+    # pass api_key in Qualspec.configure to avoid relying on them.
+    def api_key
+      @api_key || ENV['QUALSPEC_API_KEY'] || ENV['OPEN_ROUTER_API_KEY']
+    end
     def api_headers
       headers = { 'Content-Type' => 'application/json' }
-      headers['Authorization'] = "Bearer #{@api_key}" unless @api_key.to_s.empty?
+      headers['Authorization'] = "Bearer #{api_key}" unless api_key.to_s.empty?
       headers
     end
     def api_key_configured?
-      !@api_key.to_s.empty?
+      !api_key.to_s.empty?
     end
   end
 end

data/lib/qualspec/model_registry.rb ADDED Viewed

@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+require 'yaml'
+module Qualspec
+  # Loads a curated list of named models from a YAML config file and resolves
+  # names to their full provider slugs. Unknown/blank names fall back to the
+  # configured default (ultimately Configuration::DEFAULT_MODEL, openrouter/auto),
+  # so model lookups always return something usable.
+  #
+  # @example config/models.yml
+  #   default: openrouter/auto
+  #   models:
+  #     glm: z-ai/glm-5.2
+  #
+  # @example
+  #   Qualspec.model(:glm)      # => "z-ai/glm-5.2"
+  #   Qualspec.model(:nope)     # => "openrouter/auto"
+  #   Qualspec.model            # => "openrouter/auto"
+  class ModelRegistry
+    DEFAULT_CONFIG_PATH = 'config/models.yml'
+    def initialize(path: nil, default: nil)
+      @models = {}
+      @default = default
+      load_file(path || ENV['QUALSPEC_MODELS_FILE'] || DEFAULT_CONFIG_PATH)
+    end
+    # Resolve a model name to its slug, falling back to the default.
+    #
+    # @param name [Symbol, String, nil] the configured name (or nil for default)
+    # @return [String] a model slug
+    def resolve(name = nil)
+      return default if name.nil? || name.to_s.empty?
+      @models.fetch(name.to_s, default)
+    end
+    # @return [Hash{String=>String}] all configured name => slug pairs
+    def all
+      @models.dup
+    end
+    # @return [String] the universal fallback model
+    def default
+      @default || Configuration::DEFAULT_MODEL
+    end
+    private
+    def load_file(path)
+      return unless path && File.exist?(path)
+      data = YAML.safe_load_file(path) || {}
+      @default ||= data['default']
+      (data['models'] || {}).each { |name, slug| @models[name.to_s] = slug }
+    rescue StandardError
+      # A malformed config file should never break a run; defaults still apply.
+      nil
+    end
+  end
+end

data/lib/qualspec/recorder.rb CHANGED Viewed

@@ -13,16 +13,32 @@ module Qualspec
       def setup(cassette_dir: '.qualspec_cassettes')
         require_vcr!
+        recorder = self
         VCR.configure do |config|
           config.cassette_library_dir = cassette_dir
           config.hook_into :faraday
           config.default_cassette_options = {
             record: :new_episodes,
-            match_requests_on: %i[method uri body]
+            match_requests_on: %i[method uri body_without_model]
           }
-          # Filter out API keys
-          config.filter_sensitive_data('<API_KEY>') { Qualspec.configuration.api_key }
+          # Filter out API keys — guard against adding duplicate filters
+          unless @api_key_filter_registered
+            config.filter_sensitive_data('<API_KEY>') { Qualspec.configuration.api_key }
+            @api_key_filter_registered = true
+          end
         end
+        # Register custom matcher once — ignores the `model` field so cassettes
+        # recorded with one model work in CI where a different model is configured.
+        unless @matcher_registered
+          VCR.configure do |config|
+            config.register_request_matcher(:body_without_model) do |r1, r2|
+              recorder.send(:normalize_body_for_match, r1.body) == recorder.send(:normalize_body_for_match, r2.body)
+            end
+          end
+          @matcher_registered = true
+        end
         @configured = true
       end
@@ -40,6 +56,20 @@ module Qualspec
         VCR.use_cassette(name, record: :none, &block)
       end
+      # Replay a cassette if it already exists (no API key required), otherwise
+      # record a fresh one. Ideal for examples that ship a committed cassette so
+      # they run for free, but still record on first run.
+      def use_cassette(name, &block)
+        setup unless configured?
+        mode = cassette_exists?(name) ? :none : :new_episodes
+        VCR.use_cassette(name, record: mode, &block)
+      end
+      def cassette_exists?(name)
+        require_vcr!
+        File.exist?(File.join(VCR.configuration.cassette_library_dir, "#{name}.yml"))
+      end
       private
       def require_vcr!
@@ -50,6 +80,14 @@ module Qualspec
           Add to your Gemfile: gem 'vcr'
         MSG
       end
+      def normalize_body_for_match(body)
+        parsed = JSON.parse(body)
+        parsed.delete('model')
+        JSON.generate(parsed)
+      rescue JSON::ParserError
+        body
+      end
     end
   end
 end

data/lib/qualspec/suite/candidate.rb CHANGED Viewed

@@ -5,14 +5,16 @@ module Qualspec
     class Candidate
       attr_reader :name, :model, :system_prompt, :options
-      def initialize(name, model:, system_prompt: nil, **options)
+      def initialize(name, model: nil, system_prompt: nil, **options)
         @name = name.to_s
-        @model = model
+        # Fall back to the configured default model (ultimately openrouter/auto)
+        # so a candidate works even when no model is specified.
+        @model = model || Qualspec.configuration.default_model
         @system_prompt = system_prompt
         @options = options
       end
-      def generate_response(prompt:, system_prompt: nil, temperature: nil)
+      def generate_response(prompt:, system_prompt: nil, temperature: nil, with_metadata: false)
         messages = []
         sys = system_prompt || @system_prompt
@@ -23,7 +25,8 @@ module Qualspec
           model: @model,
           messages: messages,
           json_mode: false, # We want natural responses, not JSON
-          temperature: normalize_temperature(temperature)
+          temperature: normalize_temperature(temperature),
+          with_metadata: with_metadata
         )
       end

data/lib/qualspec/suite/dsl.rb CHANGED Viewed

@@ -11,16 +11,31 @@ module Qualspec
         @scenarios_list = []
         @variants_config = nil
         @temperature_list = [nil] # nil means use model default
+        @track_cost = false
         instance_eval(&block) if block_given? # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
       end
+      # DSL: capture per-call cost + token metadata so cost/value analysis works.
+      # Off by default — evaluations that don't look at cost skip the overhead.
+      #
+      # @example
+      #   track_cost
+      def track_cost(value = true) # rubocop:disable Style/OptionalBooleanParameter -- reads as a DSL toggle
+        @track_cost = value
+      end
+      alias capture_metadata track_cost
+      def track_cost?
+        @track_cost
+      end
       # DSL: define candidates
       def candidates(&block)
         instance_eval(&block) # rubocop:disable Style/EvalWithLocation -- DSL pattern requires eval
       end
-      def candidate(name, model:, system_prompt: nil, **options)
+      def candidate(name, model: nil, system_prompt: nil, **options)
         @candidates_list << Candidate.new(name, model: model, system_prompt: system_prompt, **options)
       end

data/lib/qualspec/suite/runner.rb CHANGED Viewed

@@ -15,6 +15,8 @@ module Qualspec
         @definition.candidates_list.each do |c|
           @results.candidate_models[c.name] = c.model
         end
+        @results.metadata_captured = @definition.track_cost?
       end
       def run(progress: true)
@@ -106,7 +108,8 @@ module Qualspec
         response = candidate.generate_response(
           prompt: final_prompt,
           system_prompt: final_system_prompt,
-          temperature: effective_temperature
+          temperature: effective_temperature,
+          with_metadata: @definition.track_cost?
         )
         duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round
@@ -225,6 +228,7 @@ module Qualspec
     class Results
       attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs,
                   :candidate_models, :prompts
+      attr_accessor :metadata_captured
       def initialize(suite_name)
         @suite_name = suite_name
@@ -236,6 +240,32 @@ module Qualspec
         @prompts = {}          # {scenario_name => prompt_string}
         @started_at = Time.now
         @finished_at = nil
+        @metadata_captured = false # set true when the suite enables track_cost
+      end
+      # Whether per-call cost/token metadata was captured this run.
+      def costs_tracked?
+        @metadata_captured
+      end
+      # Total cost per candidate. Raises if cost tracking wasn't enabled.
+      def cost_by_candidate
+        ensure_cost_tracking!
+        @costs.dup
+      end
+      # Rank candidates by quality-per-dollar (avg score / total cost), best
+      # first. Candidates with zero recorded cost sort last. Raises a helpful
+      # error if cost tracking wasn't enabled for the run.
+      def value_ranking
+        ensure_cost_tracking!
+        ranked = scores_by_candidate.map do |candidate, stats|
+          cost = @costs[candidate].to_f
+          score_per_dollar = cost.positive? ? (stats[:avg_score] / cost).round : nil
+          [candidate, { avg_score: stats[:avg_score], cost: cost, score_per_dollar: score_per_dollar }]
+        end
+        ranked.sort_by { |_, v| -(v[:score_per_dollar] || 0) }.to_h
       end
       def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
@@ -385,6 +415,24 @@ module Qualspec
           responses: @responses
         }
       end
+      private
+      def ensure_cost_tracking!
+        return if @metadata_captured
+        raise Qualspec::Error, <<~MSG.strip
+          Cost data was not captured for this run, so cost/value analysis is unavailable.
+          Enable it with `track_cost` in the suite definition:
+            Qualspec.evaluation 'My Suite' do
+              track_cost
+              ...
+            end
+          (track_cost adds usage accounting to each request via with_metadata.)
+        MSG
+      end
     end
   end
 end

data/lib/qualspec/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Qualspec
-  VERSION = '0.1.2'
+  VERSION = '0.2.0'
 end

data/lib/qualspec.rb CHANGED Viewed

@@ -7,6 +7,7 @@ module Qualspec
 end
 require_relative 'qualspec/configuration'
+require_relative 'qualspec/model_registry'
 require_relative 'qualspec/client'
 require_relative 'qualspec/evaluation'
 require_relative 'qualspec/prompt_variant'
@@ -37,6 +38,7 @@ module Qualspec
       @configuration = nil
       @client = nil
       @judge = nil
+      @models = nil
       Rubric.clear!
       Suite.clear!
       Suite::Behavior.clear!
@@ -50,6 +52,21 @@ module Qualspec
       @judge ||= Judge.new
     end
+    # Registry of named models loaded from config/models.yml (or
+    # QUALSPEC_MODELS_FILE). See ModelRegistry.
+    def models
+      @models ||= ModelRegistry.new
+    end
+    # Resolve a named model to its slug, falling back to the default
+    # (openrouter/auto). Returns the default when name is nil/unknown.
+    #
+    #   Qualspec.model(:glm)  # => "z-ai/glm-5.2"
+    #   Qualspec.model        # => "openrouter/auto"
+    def model(name = nil)
+      models.resolve(name)
+    end
     # Convenience method for defining rubrics
     def define_rubric(name, &block)
       Rubric.define(name, &block)

data/qualspec_structure.md CHANGED Viewed

@@ -7,10 +7,12 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
 ### Core Library Files (lib/qualspec/)
 - **builtin_rubrics.rb** - Built-in evaluation criteria
-- **client.rb** - API client for LLM interactions
+- **client.rb** - API client for LLM interactions (cost/token metadata optional)
 - **configuration.rb** - Configuration management
 - **evaluation.rb** - Core evaluation logic
 - **judge.rb** - LLM judge implementation
+- **model_registry.rb** - Named models from `config/models.yml` (`Qualspec.model`)
+- **prompt_variant.rb** - Variant value object (FactoryBot target)
 - **recorder.rb** - VCR integration for recording
 - **rspec.rb** - RSpec integration entry point
 - **rubric.rb** - Custom rubric definitions
@@ -23,10 +25,11 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
 ### Configuration Environment Variables
 | Variable | Description | Default |
 |----------|-------------|---------|
-| QUALSPEC_API_KEY | API key (required) | - |
+| QUALSPEC_API_KEY | API key (falls back to OPEN_ROUTER_API_KEY) | - |
 | QUALSPEC_API_URL | API endpoint | https://openrouter.ai/api/v1 |
-| QUALSPEC_MODEL | Default model for candidates | google/gemini-3-flash-preview |
+| QUALSPEC_MODEL | Default model for candidates | openrouter/auto |
 | QUALSPEC_JUDGE_MODEL | Model used as judge | Same as QUALSPEC_MODEL |
+| QUALSPEC_MODELS_FILE | Named-models YAML | config/models.yml |
 ### Key Features
 1. **Model Comparison CLI** - Compare multiple models on the same prompts
@@ -36,6 +39,9 @@ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and
 5. **Custom Rubrics** - Define your own evaluation criteria
 6. **VCR Recording** - Record and replay API calls for testing
 7. **HTML Reports** - Generate visual comparison reports
+8. **Named Model Registry** - Reference curated models by name (`Qualspec.model`)
+9. **Cost Tracking** - Opt-in per-call cost + quality-per-dollar `value_ranking`
+10. **Variant & Temperature Matrix** - Combinatorial prompt testing via FactoryBot
 ### Example: Model Comparison
 ```ruby

metadata CHANGED Viewed

@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: qualspec
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Eric Stiens
-autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-04-16 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: faraday
@@ -70,7 +69,9 @@ files:
 - CHANGELOG.md
 - README.md
 - Rakefile
+- config/models.yml
 - docs/.DS_Store
+- docs/alpha_readiness.md
 - docs/configuration.md
 - docs/evaluation-suites.md
 - docs/getting-started.md
@@ -79,7 +80,13 @@ files:
 - docs/rubrics.md
 - docs/to_implement/factory_bot_integration_design.md
 - docs/to_implement/variants_first_pass.md
+- examples/EXAMPLES.md
 - examples/README.md
+- examples/best_value.rb
+- examples/cassettes/best_value.yml
+- examples/cassettes/character_consistency.yml
+- examples/cassettes/customer_service_comparison.yml
+- examples/cassettes/date_awareness_gate.yml
 - examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml
 - examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml
 - examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml
@@ -87,7 +94,10 @@ files:
 - examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml
 - examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml
 - examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml
+- examples/character_consistency.rb
 - examples/comparison.rb
+- examples/customer_service_comparison.rb
+- examples/date_awareness_gate.rb
 - examples/model_comparison.rb
 - examples/persona_test.rb
 - examples/prompt_variants_factory.rb
@@ -104,6 +114,7 @@ files:
 - lib/qualspec/configuration.rb
 - lib/qualspec/evaluation.rb
 - lib/qualspec/judge.rb
+- lib/qualspec/model_registry.rb
 - lib/qualspec/prompt_variant.rb
 - lib/qualspec/recorder.rb
 - lib/qualspec/rspec.rb
@@ -130,7 +141,6 @@ metadata:
   homepage_uri: https://github.com/estiens/qualspec
   source_code_uri: https://github.com/estiens/qualspec
   changelog_uri: https://github.com/estiens/qualspec/blob/main/CHANGELOG.md
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -138,15 +148,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 3.1.0
+      version: 3.3.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.22
-signing_key:
+rubygems_version: 3.6.9
 specification_version: 4
 summary: RSpec DSL for qualitative LLM-judged testing
 test_files: []