RubyGems - qualspec - Versions diffs - 0.1.0 → 0.1.2 - Mend

qualspec 0.1.0 → 0.1.2

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (19)

checksums.yaml +4 -4
data/.rubocop_todo.yml +29 -19
data/docs/.DS_Store +0 -0
data/docs/to_implement/factory_bot_integration_design.md +819 -0
data/docs/to_implement/variants_first_pass.md +480 -0
data/examples/README.md +63 -0
data/examples/prompt_variants_factory.rb +98 -0
data/examples/results/simple_variant_comparison.json +340 -0
data/examples/simple_variant_comparison.rb +68 -0
data/examples/variant_comparison.rb +71 -0
data/lib/qualspec/judge.rb +1 -1
data/lib/qualspec/prompt_variant.rb +94 -0
data/lib/qualspec/suite/dsl.rb +3 -5
data/lib/qualspec/suite/html_reporter.rb +8 -8
data/lib/qualspec/suite/reporter.rb +2 -6
data/lib/qualspec/suite/runner.rb +21 -14
data/lib/qualspec/version.rb +1 -1
data/qualspec_structure.md +80 -0
metadata +12 -2

data/lib/qualspec/suite/html_reporter.rb CHANGED Viewed

@@ -481,10 +481,12 @@ module Qualspec
         scenario_blocks = scenarios.map do |scenario|
           response_cards = responses.map do |candidate, candidate_responses|
-            response = candidate_responses[scenario]
-            next unless response
+            variant_map = candidate_responses[scenario]
+            next unless variant_map
-            response_text = response.to_s.strip
+            contents = variant_map.flat_map { |_v, tm| tm.values.map { |d| d[:content] } }.compact
+            response_text = contents.join("\n\n---\n\n").strip
+            next if response_text.empty?
             <<~CARD
               <div class="response-card">
@@ -660,13 +662,11 @@ module Qualspec
       end
       def get_candidate_model(candidate)
-        # Try to find the model from the suite
-        @results.evaluations.find { |e| e[:candidate] == candidate }&.dig(:model) || 'unknown'
+        @results.candidate_models[candidate] || 'unknown'
       end
-      def get_scenario_prompt(_scenario)
-        # This would need to be stored in results - for now return nil
-        nil
+      def get_scenario_prompt(scenario)
+        @results.prompts[scenario]
       end
     end
   end

data/lib/qualspec/suite/reporter.rb CHANGED Viewed

@@ -256,12 +256,8 @@ module Qualspec
                 # Show variant info if present
                 if data[:variant_data]
                   vd = data[:variant_data]
-                  if vd[:credential] && !vd[:credential].to_s.empty?
-                    lines << "**Credential:** #{vd[:credential]}"
-                  end
-                  if vd[:stance] && vd[:stance] != :neutral
-                    lines << "**Stance:** #{vd[:stance]}"
-                  end
+                  lines << "**Credential:** #{vd[:credential]}" if vd[:credential] && !vd[:credential].to_s.empty?
+                  lines << "**Stance:** #{vd[:stance]}" if vd[:stance] && vd[:stance] != :neutral
                   if vd[:full_prompt] && !vd[:full_prompt].to_s.empty?
                     lines << ''
                     lines << '**Prompt:**'

data/lib/qualspec/suite/runner.rb CHANGED Viewed

@@ -11,6 +11,10 @@ module Qualspec
         @definition = definition.is_a?(String) ? Suite.find(definition) : definition
         @results = Results.new(@definition.name)
         @judge = Qualspec.judge
+        @definition.candidates_list.each do |c|
+          @results.candidate_models[c.name] = c.model
+        end
       end
       def run(progress: true)
@@ -52,6 +56,8 @@ module Qualspec
         responses = {}
         errors = {}
+        @results.prompts[scenario.name] ||= scenario.compose_prompt(variant)
         # Phase 1: Collect all candidate responses
         @definition.candidates_list.each do |candidate|
           log_candidate_progress(candidate, scenario, 'generating') if progress
@@ -81,9 +87,7 @@ module Qualspec
         end
         # Phase 2: Judge all responses together
-        if responses.any?
-          judge_responses(responses, scenario, variant, temperature, progress: progress)
-        end
+        judge_responses(responses, scenario, variant, temperature, progress: progress) if responses.any?
         # Record errors
         record_errors(errors, scenario, variant, temperature)
@@ -219,7 +223,8 @@ module Qualspec
     # Results container with multi-dimensional support
     class Results
-      attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs
+      attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs,
+                  :candidate_models, :prompts
       def initialize(suite_name)
         @suite_name = suite_name
@@ -227,12 +232,13 @@ module Qualspec
         @responses = {} # Nested: {candidate => {scenario => {variant => {temp => response}}}}
         @timing = {}
         @costs = {}
+        @candidate_models = {} # {candidate_name => model_string}
+        @prompts = {}          # {scenario_name => prompt_string}
         @started_at = Time.now
         @finished_at = nil
       end
-      def record_response(candidate:, scenario:, variant: 'default', temperature: nil,
-                          response:, duration_ms: nil, cost: nil, variant_data: nil)
+      def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
         # Store in nested structure
         @responses[candidate] ||= {}
         @responses[candidate][scenario] ||= {}
@@ -253,8 +259,7 @@ module Qualspec
         @costs[candidate] += cost
       end
-      def record_evaluation(candidate:, scenario:, variant: 'default', temperature: nil,
-                            criteria:, evaluation:, winner: nil)
+      def record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil)
         @evaluations << {
           candidate: candidate,
           scenario: scenario,
@@ -333,13 +338,15 @@ module Qualspec
       def scores_by_scenario
         @evaluations.group_by { |e| e[:scenario] }.transform_values do |evals|
           evals.group_by { |e| e[:candidate] }.transform_values do |candidate_evals|
-            eval_data = candidate_evals.first
+            total = candidate_evals.size
+            avg_score = (candidate_evals.sum { |e| e[:score] }.to_f / total).round(2)
+            first = candidate_evals.first
             {
-              score: eval_data[:score],
-              pass: eval_data[:pass],
-              reasoning: eval_data[:reasoning],
-              variant: eval_data[:variant],
-              temperature: eval_data[:temperature]
+              score: avg_score,
+              pass: candidate_evals.all? { |e| e[:pass] },
+              reasoning: first[:reasoning],
+              variant: first[:variant],
+              temperature: first[:temperature]
             }
           end
         end

data/lib/qualspec/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Qualspec
-  VERSION = '0.1.0'
+  VERSION = '0.1.2'
 end

data/qualspec_structure.md ADDED Viewed

@@ -0,0 +1,80 @@
+# Qualspec - Key Structure
+## Repository: estiens/qualspec
+### Description
+LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and test subjective qualities that traditional assertions can't capture.
+### Core Library Files (lib/qualspec/)
+- **builtin_rubrics.rb** - Built-in evaluation criteria
+- **client.rb** - API client for LLM interactions
+- **configuration.rb** - Configuration management
+- **evaluation.rb** - Core evaluation logic
+- **judge.rb** - LLM judge implementation
+- **recorder.rb** - VCR integration for recording
+- **rspec.rb** - RSpec integration entry point
+- **rubric.rb** - Custom rubric definitions
+- **version.rb** - Version info
+### Subdirectories
+- **rspec/** - RSpec helpers and matchers
+- **suite/** - Evaluation suite components
+### Configuration Environment Variables
+| Variable | Description | Default |
+|----------|-------------|---------|
+| QUALSPEC_API_KEY | API key (required) | - |
+| QUALSPEC_API_URL | API endpoint | https://openrouter.ai/api/v1 |
+| QUALSPEC_MODEL | Default model for candidates | google/gemini-3-flash-preview |
+| QUALSPEC_JUDGE_MODEL | Model used as judge | Same as QUALSPEC_MODEL |
+### Key Features
+1. **Model Comparison CLI** - Compare multiple models on the same prompts
+2. **LLM Judge** - Use an LLM to evaluate responses qualitatively
+3. **RSpec Integration** - Test your agents with qualitative assertions
+4. **Built-in Rubrics** - Pre-defined evaluation criteria
+5. **Custom Rubrics** - Define your own evaluation criteria
+6. **VCR Recording** - Record and replay API calls for testing
+7. **HTML Reports** - Generate visual comparison reports
+### Example: Model Comparison
+```ruby
+# eval/comparison.rb
+Qualspec.evaluation "Model Comparison" do
+  candidates do
+    candidate "gpt4", model: "openai/gpt-4"
+    candidate "claude", model: "anthropic/claude-3-sonnet"
+  end
+  scenario "helpfulness" do
+    prompt "How do I center a div in CSS?"
+    eval "provides a working solution"
+    eval "explains the approach"
+  end
+end
+```
+### Example: RSpec Integration
+```ruby
+require "qualspec/rspec"
+RSpec.describe MyAgent do
+  include Qualspec::RSpec::Helpers
+  it "responds helpfully" do
+    response = MyAgent.call("Hello")
+    result = qualspec_evaluate(response, "responds in a friendly manner")
+    expect(result).to be_passing
+  end
+end
+```
+### CLI Usage
+```shell
+# Run comparison
+qualspec eval/comparison.rb
+# Generate HTML report
+qualspec --html report.html eval/comparison.rb
+```

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: qualspec
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - Eric Stiens
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-12-26 00:00:00.000000000 Z
+date: 2026-04-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: faraday
@@ -70,12 +70,16 @@ files:
 - CHANGELOG.md
 - README.md
 - Rakefile
+- docs/.DS_Store
 - docs/configuration.md
 - docs/evaluation-suites.md
 - docs/getting-started.md
 - docs/recording.md
 - docs/rspec-integration.md
 - docs/rubrics.md
+- docs/to_implement/factory_bot_integration_design.md
+- docs/to_implement/variants_first_pass.md
+- examples/README.md
 - examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml
 - examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml
 - examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml
@@ -86,9 +90,13 @@ files:
 - examples/comparison.rb
 - examples/model_comparison.rb
 - examples/persona_test.rb
+- examples/prompt_variants_factory.rb
 - examples/quick_test.rb
 - examples/report.html
+- examples/results/simple_variant_comparison.json
 - examples/rspec_example_spec.rb
+- examples/simple_variant_comparison.rb
+- examples/variant_comparison.rb
 - exe/qualspec
 - lib/qualspec.rb
 - lib/qualspec/builtin_rubrics.rb
@@ -96,6 +104,7 @@ files:
 - lib/qualspec/configuration.rb
 - lib/qualspec/evaluation.rb
 - lib/qualspec/judge.rb
+- lib/qualspec/prompt_variant.rb
 - lib/qualspec/recorder.rb
 - lib/qualspec/rspec.rb
 - lib/qualspec/rspec/configuration.rb
@@ -112,6 +121,7 @@ files:
 - lib/qualspec/suite/runner.rb
 - lib/qualspec/suite/scenario.rb
 - lib/qualspec/version.rb
+- qualspec_structure.md
 - sig/qualspec.rbs
 homepage: https://github.com/estiens/qualspec
 licenses: