qualspec 0.1.0 → 0.1.2

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
@@ -481,10 +481,12 @@ module Qualspec
481
481
 
482
482
  scenario_blocks = scenarios.map do |scenario|
483
483
  response_cards = responses.map do |candidate, candidate_responses|
484
- response = candidate_responses[scenario]
485
- next unless response
484
+ variant_map = candidate_responses[scenario]
485
+ next unless variant_map
486
486
 
487
- response_text = response.to_s.strip
487
+ contents = variant_map.flat_map { |_v, tm| tm.values.map { |d| d[:content] } }.compact
488
+ response_text = contents.join("\n\n---\n\n").strip
489
+ next if response_text.empty?
488
490
 
489
491
  <<~CARD
490
492
  <div class="response-card">
@@ -660,13 +662,11 @@ module Qualspec
660
662
  end
661
663
 
662
664
  def get_candidate_model(candidate)
663
- # Try to find the model from the suite
664
- @results.evaluations.find { |e| e[:candidate] == candidate }&.dig(:model) || 'unknown'
665
+ @results.candidate_models[candidate] || 'unknown'
665
666
  end
666
667
 
667
- def get_scenario_prompt(_scenario)
668
- # This would need to be stored in results - for now return nil
669
- nil
668
+ def get_scenario_prompt(scenario)
669
+ @results.prompts[scenario]
670
670
  end
671
671
  end
672
672
  end
@@ -256,12 +256,8 @@ module Qualspec
256
256
  # Show variant info if present
257
257
  if data[:variant_data]
258
258
  vd = data[:variant_data]
259
- if vd[:credential] && !vd[:credential].to_s.empty?
260
- lines << "**Credential:** #{vd[:credential]}"
261
- end
262
- if vd[:stance] && vd[:stance] != :neutral
263
- lines << "**Stance:** #{vd[:stance]}"
264
- end
259
+ lines << "**Credential:** #{vd[:credential]}" if vd[:credential] && !vd[:credential].to_s.empty?
260
+ lines << "**Stance:** #{vd[:stance]}" if vd[:stance] && vd[:stance] != :neutral
265
261
  if vd[:full_prompt] && !vd[:full_prompt].to_s.empty?
266
262
  lines << ''
267
263
  lines << '**Prompt:**'
@@ -11,6 +11,10 @@ module Qualspec
11
11
  @definition = definition.is_a?(String) ? Suite.find(definition) : definition
12
12
  @results = Results.new(@definition.name)
13
13
  @judge = Qualspec.judge
14
+
15
+ @definition.candidates_list.each do |c|
16
+ @results.candidate_models[c.name] = c.model
17
+ end
14
18
  end
15
19
 
16
20
  def run(progress: true)
@@ -52,6 +56,8 @@ module Qualspec
52
56
  responses = {}
53
57
  errors = {}
54
58
 
59
+ @results.prompts[scenario.name] ||= scenario.compose_prompt(variant)
60
+
55
61
  # Phase 1: Collect all candidate responses
56
62
  @definition.candidates_list.each do |candidate|
57
63
  log_candidate_progress(candidate, scenario, 'generating') if progress
@@ -81,9 +87,7 @@ module Qualspec
81
87
  end
82
88
 
83
89
  # Phase 2: Judge all responses together
84
- if responses.any?
85
- judge_responses(responses, scenario, variant, temperature, progress: progress)
86
- end
90
+ judge_responses(responses, scenario, variant, temperature, progress: progress) if responses.any?
87
91
 
88
92
  # Record errors
89
93
  record_errors(errors, scenario, variant, temperature)
@@ -219,7 +223,8 @@ module Qualspec
219
223
 
220
224
  # Results container with multi-dimensional support
221
225
  class Results
222
- attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs
226
+ attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs,
227
+ :candidate_models, :prompts
223
228
 
224
229
  def initialize(suite_name)
225
230
  @suite_name = suite_name
@@ -227,12 +232,13 @@ module Qualspec
227
232
  @responses = {} # Nested: {candidate => {scenario => {variant => {temp => response}}}}
228
233
  @timing = {}
229
234
  @costs = {}
235
+ @candidate_models = {} # {candidate_name => model_string}
236
+ @prompts = {} # {scenario_name => prompt_string}
230
237
  @started_at = Time.now
231
238
  @finished_at = nil
232
239
  end
233
240
 
234
- def record_response(candidate:, scenario:, variant: 'default', temperature: nil,
235
- response:, duration_ms: nil, cost: nil, variant_data: nil)
241
+ def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
236
242
  # Store in nested structure
237
243
  @responses[candidate] ||= {}
238
244
  @responses[candidate][scenario] ||= {}
@@ -253,8 +259,7 @@ module Qualspec
253
259
  @costs[candidate] += cost
254
260
  end
255
261
 
256
- def record_evaluation(candidate:, scenario:, variant: 'default', temperature: nil,
257
- criteria:, evaluation:, winner: nil)
262
+ def record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil)
258
263
  @evaluations << {
259
264
  candidate: candidate,
260
265
  scenario: scenario,
@@ -333,13 +338,15 @@ module Qualspec
333
338
  def scores_by_scenario
334
339
  @evaluations.group_by { |e| e[:scenario] }.transform_values do |evals|
335
340
  evals.group_by { |e| e[:candidate] }.transform_values do |candidate_evals|
336
- eval_data = candidate_evals.first
341
+ total = candidate_evals.size
342
+ avg_score = (candidate_evals.sum { |e| e[:score] }.to_f / total).round(2)
343
+ first = candidate_evals.first
337
344
  {
338
- score: eval_data[:score],
339
- pass: eval_data[:pass],
340
- reasoning: eval_data[:reasoning],
341
- variant: eval_data[:variant],
342
- temperature: eval_data[:temperature]
345
+ score: avg_score,
346
+ pass: candidate_evals.all? { |e| e[:pass] },
347
+ reasoning: first[:reasoning],
348
+ variant: first[:variant],
349
+ temperature: first[:temperature]
343
350
  }
344
351
  end
345
352
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Qualspec
4
- VERSION = '0.1.0'
4
+ VERSION = '0.1.2'
5
5
  end
@@ -0,0 +1,80 @@
1
+ # Qualspec - Key Structure
2
+
3
+ ## Repository: estiens/qualspec
4
+
5
+ ### Description
6
+ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and test subjective qualities that traditional assertions can't capture.
7
+
8
+ ### Core Library Files (lib/qualspec/)
9
+ - **builtin_rubrics.rb** - Built-in evaluation criteria
10
+ - **client.rb** - API client for LLM interactions
11
+ - **configuration.rb** - Configuration management
12
+ - **evaluation.rb** - Core evaluation logic
13
+ - **judge.rb** - LLM judge implementation
14
+ - **recorder.rb** - VCR integration for recording and replaying API calls
15
+ - **rspec.rb** - RSpec integration entry point
16
+ - **rubric.rb** - Custom rubric definitions
17
+ - **version.rb** - Version info
18
+
19
+ ### Subdirectories
20
+ - **rspec/** - RSpec helpers and matchers
21
+ - **suite/** - Evaluation suite components
22
+
23
+ ### Configuration Environment Variables
24
+ | Variable | Description | Default |
25
+ |----------|-------------|---------|
26
+ | QUALSPEC_API_KEY | API key (required) | - |
27
+ | QUALSPEC_API_URL | API endpoint | https://openrouter.ai/api/v1 |
28
+ | QUALSPEC_MODEL | Default model for candidates | google/gemini-3-flash-preview |
29
+ | QUALSPEC_JUDGE_MODEL | Model used as judge | Same as QUALSPEC_MODEL |
30
+
31
+ ### Key Features
32
+ 1. **Model Comparison CLI** - Compare multiple models on the same prompts
33
+ 2. **LLM Judge** - Use an LLM to evaluate responses qualitatively
34
+ 3. **RSpec Integration** - Test your agents with qualitative assertions
35
+ 4. **Built-in Rubrics** - Pre-defined evaluation criteria
36
+ 5. **Custom Rubrics** - Define your own evaluation criteria
37
+ 6. **VCR Recording** - Record and replay API calls for testing
38
+ 7. **HTML Reports** - Generate visual comparison reports
39
+
40
+ ### Example: Model Comparison
41
+ ```ruby
42
+ # eval/comparison.rb
43
+ Qualspec.evaluation "Model Comparison" do
44
+ candidates do
45
+ candidate "gpt4", model: "openai/gpt-4"
46
+ candidate "claude", model: "anthropic/claude-3-sonnet"
47
+ end
48
+
49
+ scenario "helpfulness" do
50
+ prompt "How do I center a div in CSS?"
51
+ eval "provides a working solution"
52
+ eval "explains the approach"
53
+ end
54
+ end
55
+ ```
56
+
57
+ ### Example: RSpec Integration
58
+ ```ruby
59
+ require "qualspec/rspec"
60
+
61
+ RSpec.describe MyAgent do
62
+ include Qualspec::RSpec::Helpers
63
+
64
+ it "responds helpfully" do
65
+ response = MyAgent.call("Hello")
66
+
67
+ result = qualspec_evaluate(response, "responds in a friendly manner")
68
+ expect(result).to be_passing
69
+ end
70
+ end
71
+ ```
72
+
73
+ ### CLI Usage
74
+ ```shell
75
+ # Run comparison
76
+ qualspec eval/comparison.rb
77
+
78
+ # Generate HTML report
79
+ qualspec --html report.html eval/comparison.rb
80
+ ```
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: qualspec
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Stiens
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-12-26 00:00:00.000000000 Z
11
+ date: 2026-04-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: faraday
@@ -70,12 +70,16 @@ files:
70
70
  - CHANGELOG.md
71
71
  - README.md
72
72
  - Rakefile
73
+ - docs/.DS_Store
73
74
  - docs/configuration.md
74
75
  - docs/evaluation-suites.md
75
76
  - docs/getting-started.md
76
77
  - docs/recording.md
77
78
  - docs/rspec-integration.md
78
79
  - docs/rubrics.md
80
+ - docs/to_implement/factory_bot_integration_design.md
81
+ - docs/to_implement/variants_first_pass.md
82
+ - examples/README.md
79
83
  - examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml
80
84
  - examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml
81
85
  - examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml
@@ -86,9 +90,13 @@ files:
86
90
  - examples/comparison.rb
87
91
  - examples/model_comparison.rb
88
92
  - examples/persona_test.rb
93
+ - examples/prompt_variants_factory.rb
89
94
  - examples/quick_test.rb
90
95
  - examples/report.html
96
+ - examples/results/simple_variant_comparison.json
91
97
  - examples/rspec_example_spec.rb
98
+ - examples/simple_variant_comparison.rb
99
+ - examples/variant_comparison.rb
92
100
  - exe/qualspec
93
101
  - lib/qualspec.rb
94
102
  - lib/qualspec/builtin_rubrics.rb
@@ -96,6 +104,7 @@ files:
96
104
  - lib/qualspec/configuration.rb
97
105
  - lib/qualspec/evaluation.rb
98
106
  - lib/qualspec/judge.rb
107
+ - lib/qualspec/prompt_variant.rb
99
108
  - lib/qualspec/recorder.rb
100
109
  - lib/qualspec/rspec.rb
101
110
  - lib/qualspec/rspec/configuration.rb
@@ -112,6 +121,7 @@ files:
112
121
  - lib/qualspec/suite/runner.rb
113
122
  - lib/qualspec/suite/scenario.rb
114
123
  - lib/qualspec/version.rb
124
+ - qualspec_structure.md
115
125
  - sig/qualspec.rbs
116
126
  homepage: https://github.com/estiens/qualspec
117
127
  licenses: