qualspec 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +29 -19
- data/docs/.DS_Store +0 -0
- data/docs/to_implement/factory_bot_integration_design.md +819 -0
- data/docs/to_implement/variants_first_pass.md +480 -0
- data/examples/README.md +63 -0
- data/examples/prompt_variants_factory.rb +98 -0
- data/examples/results/simple_variant_comparison.json +340 -0
- data/examples/simple_variant_comparison.rb +68 -0
- data/examples/variant_comparison.rb +71 -0
- data/lib/qualspec/judge.rb +1 -1
- data/lib/qualspec/prompt_variant.rb +94 -0
- data/lib/qualspec/suite/dsl.rb +3 -5
- data/lib/qualspec/suite/html_reporter.rb +8 -8
- data/lib/qualspec/suite/reporter.rb +2 -6
- data/lib/qualspec/suite/runner.rb +21 -14
- data/lib/qualspec/version.rb +1 -1
- data/qualspec_structure.md +80 -0
- metadata +12 -2
|
@@ -481,10 +481,12 @@ module Qualspec
|
|
|
481
481
|
|
|
482
482
|
scenario_blocks = scenarios.map do |scenario|
|
|
483
483
|
response_cards = responses.map do |candidate, candidate_responses|
|
|
484
|
-
|
|
485
|
-
next unless
|
|
484
|
+
variant_map = candidate_responses[scenario]
|
|
485
|
+
next unless variant_map
|
|
486
486
|
|
|
487
|
-
|
|
487
|
+
contents = variant_map.flat_map { |_v, tm| tm.values.map { |d| d[:content] } }.compact
|
|
488
|
+
response_text = contents.join("\n\n---\n\n").strip
|
|
489
|
+
next if response_text.empty?
|
|
488
490
|
|
|
489
491
|
<<~CARD
|
|
490
492
|
<div class="response-card">
|
|
@@ -660,13 +662,11 @@ module Qualspec
|
|
|
660
662
|
end
|
|
661
663
|
|
|
662
664
|
def get_candidate_model(candidate)
|
|
663
|
-
|
|
664
|
-
@results.evaluations.find { |e| e[:candidate] == candidate }&.dig(:model) || 'unknown'
|
|
665
|
+
@results.candidate_models[candidate] || 'unknown'
|
|
665
666
|
end
|
|
666
667
|
|
|
667
|
-
def get_scenario_prompt(
|
|
668
|
-
|
|
669
|
-
nil
|
|
668
|
+
def get_scenario_prompt(scenario)
|
|
669
|
+
@results.prompts[scenario]
|
|
670
670
|
end
|
|
671
671
|
end
|
|
672
672
|
end
|
|
@@ -256,12 +256,8 @@ module Qualspec
|
|
|
256
256
|
# Show variant info if present
|
|
257
257
|
if data[:variant_data]
|
|
258
258
|
vd = data[:variant_data]
|
|
259
|
-
if vd[:credential] && !vd[:credential].to_s.empty?
|
|
260
|
-
|
|
261
|
-
end
|
|
262
|
-
if vd[:stance] && vd[:stance] != :neutral
|
|
263
|
-
lines << "**Stance:** #{vd[:stance]}"
|
|
264
|
-
end
|
|
259
|
+
lines << "**Credential:** #{vd[:credential]}" if vd[:credential] && !vd[:credential].to_s.empty?
|
|
260
|
+
lines << "**Stance:** #{vd[:stance]}" if vd[:stance] && vd[:stance] != :neutral
|
|
265
261
|
if vd[:full_prompt] && !vd[:full_prompt].to_s.empty?
|
|
266
262
|
lines << ''
|
|
267
263
|
lines << '**Prompt:**'
|
|
@@ -11,6 +11,10 @@ module Qualspec
|
|
|
11
11
|
@definition = definition.is_a?(String) ? Suite.find(definition) : definition
|
|
12
12
|
@results = Results.new(@definition.name)
|
|
13
13
|
@judge = Qualspec.judge
|
|
14
|
+
|
|
15
|
+
@definition.candidates_list.each do |c|
|
|
16
|
+
@results.candidate_models[c.name] = c.model
|
|
17
|
+
end
|
|
14
18
|
end
|
|
15
19
|
|
|
16
20
|
def run(progress: true)
|
|
@@ -52,6 +56,8 @@ module Qualspec
|
|
|
52
56
|
responses = {}
|
|
53
57
|
errors = {}
|
|
54
58
|
|
|
59
|
+
@results.prompts[scenario.name] ||= scenario.compose_prompt(variant)
|
|
60
|
+
|
|
55
61
|
# Phase 1: Collect all candidate responses
|
|
56
62
|
@definition.candidates_list.each do |candidate|
|
|
57
63
|
log_candidate_progress(candidate, scenario, 'generating') if progress
|
|
@@ -81,9 +87,7 @@ module Qualspec
|
|
|
81
87
|
end
|
|
82
88
|
|
|
83
89
|
# Phase 2: Judge all responses together
|
|
84
|
-
if responses.any?
|
|
85
|
-
judge_responses(responses, scenario, variant, temperature, progress: progress)
|
|
86
|
-
end
|
|
90
|
+
judge_responses(responses, scenario, variant, temperature, progress: progress) if responses.any?
|
|
87
91
|
|
|
88
92
|
# Record errors
|
|
89
93
|
record_errors(errors, scenario, variant, temperature)
|
|
@@ -219,7 +223,8 @@ module Qualspec
|
|
|
219
223
|
|
|
220
224
|
# Results container with multi-dimensional support
|
|
221
225
|
class Results
|
|
222
|
-
attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs
|
|
226
|
+
attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs,
|
|
227
|
+
:candidate_models, :prompts
|
|
223
228
|
|
|
224
229
|
def initialize(suite_name)
|
|
225
230
|
@suite_name = suite_name
|
|
@@ -227,12 +232,13 @@ module Qualspec
|
|
|
227
232
|
@responses = {} # Nested: {candidate => {scenario => {variant => {temp => response}}}}
|
|
228
233
|
@timing = {}
|
|
229
234
|
@costs = {}
|
|
235
|
+
@candidate_models = {} # {candidate_name => model_string}
|
|
236
|
+
@prompts = {} # {scenario_name => prompt_string}
|
|
230
237
|
@started_at = Time.now
|
|
231
238
|
@finished_at = nil
|
|
232
239
|
end
|
|
233
240
|
|
|
234
|
-
def record_response(candidate:, scenario:, variant: 'default', temperature: nil,
|
|
235
|
-
response:, duration_ms: nil, cost: nil, variant_data: nil)
|
|
241
|
+
def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
|
|
236
242
|
# Store in nested structure
|
|
237
243
|
@responses[candidate] ||= {}
|
|
238
244
|
@responses[candidate][scenario] ||= {}
|
|
@@ -253,8 +259,7 @@ module Qualspec
|
|
|
253
259
|
@costs[candidate] += cost
|
|
254
260
|
end
|
|
255
261
|
|
|
256
|
-
def record_evaluation(candidate:, scenario:, variant: 'default', temperature: nil,
|
|
257
|
-
criteria:, evaluation:, winner: nil)
|
|
262
|
+
def record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil)
|
|
258
263
|
@evaluations << {
|
|
259
264
|
candidate: candidate,
|
|
260
265
|
scenario: scenario,
|
|
@@ -333,13 +338,15 @@ module Qualspec
|
|
|
333
338
|
def scores_by_scenario
|
|
334
339
|
@evaluations.group_by { |e| e[:scenario] }.transform_values do |evals|
|
|
335
340
|
evals.group_by { |e| e[:candidate] }.transform_values do |candidate_evals|
|
|
336
|
-
|
|
341
|
+
total = candidate_evals.size
|
|
342
|
+
avg_score = (candidate_evals.sum { |e| e[:score] }.to_f / total).round(2)
|
|
343
|
+
first = candidate_evals.first
|
|
337
344
|
{
|
|
338
|
-
score:
|
|
339
|
-
pass:
|
|
340
|
-
reasoning:
|
|
341
|
-
variant:
|
|
342
|
-
temperature:
|
|
345
|
+
score: avg_score,
|
|
346
|
+
pass: candidate_evals.all? { |e| e[:pass] },
|
|
347
|
+
reasoning: first[:reasoning],
|
|
348
|
+
variant: first[:variant],
|
|
349
|
+
temperature: first[:temperature]
|
|
343
350
|
}
|
|
344
351
|
end
|
|
345
352
|
end
|
data/lib/qualspec/version.rb
CHANGED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Qualspec - Key Structure
|
|
2
|
+
|
|
3
|
+
## Repository: estiens/qualspec
|
|
4
|
+
|
|
5
|
+
### Description
|
|
6
|
+
LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and test subjective qualities that traditional assertions can't capture.
|
|
7
|
+
|
|
8
|
+
### Core Library Files (lib/qualspec/)
|
|
9
|
+
- **builtin_rubrics.rb** - Built-in evaluation criteria
|
|
10
|
+
- **client.rb** - API client for LLM interactions
|
|
11
|
+
- **configuration.rb** - Configuration management
|
|
12
|
+
- **evaluation.rb** - Core evaluation logic
|
|
13
|
+
- **judge.rb** - LLM judge implementation
|
|
14
|
+
- **recorder.rb** - VCR integration for recording
|
|
15
|
+
- **rspec.rb** - RSpec integration entry point
|
|
16
|
+
- **rubric.rb** - Custom rubric definitions
|
|
17
|
+
- **version.rb** - Version info
|
|
18
|
+
|
|
19
|
+
### Subdirectories
|
|
20
|
+
- **rspec/** - RSpec helpers and matchers
|
|
21
|
+
- **suite/** - Evaluation suite components
|
|
22
|
+
|
|
23
|
+
### Configuration Environment Variables
|
|
24
|
+
| Variable | Description | Default |
|
|
25
|
+
|----------|-------------|---------|
|
|
26
|
+
| QUALSPEC_API_KEY | API key (required) | - |
|
|
27
|
+
| QUALSPEC_API_URL | API endpoint | https://openrouter.ai/api/v1 |
|
|
28
|
+
| QUALSPEC_MODEL | Default model for candidates | google/gemini-3-flash-preview |
|
|
29
|
+
| QUALSPEC_JUDGE_MODEL | Model used as judge | Same as QUALSPEC_MODEL |
|
|
30
|
+
|
|
31
|
+
### Key Features
|
|
32
|
+
1. **Model Comparison CLI** - Compare multiple models on the same prompts
|
|
33
|
+
2. **LLM Judge** - Use an LLM to evaluate responses qualitatively
|
|
34
|
+
3. **RSpec Integration** - Test your agents with qualitative assertions
|
|
35
|
+
4. **Built-in Rubrics** - Pre-defined evaluation criteria
|
|
36
|
+
5. **Custom Rubrics** - Define your own evaluation criteria
|
|
37
|
+
6. **VCR Recording** - Record and replay API calls for testing
|
|
38
|
+
7. **HTML Reports** - Generate visual comparison reports
|
|
39
|
+
|
|
40
|
+
### Example: Model Comparison
|
|
41
|
+
```ruby
|
|
42
|
+
# eval/comparison.rb
|
|
43
|
+
Qualspec.evaluation "Model Comparison" do
|
|
44
|
+
candidates do
|
|
45
|
+
candidate "gpt4", model: "openai/gpt-4"
|
|
46
|
+
candidate "claude", model: "anthropic/claude-3-sonnet"
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
scenario "helpfulness" do
|
|
50
|
+
prompt "How do I center a div in CSS?"
|
|
51
|
+
eval "provides a working solution"
|
|
52
|
+
eval "explains the approach"
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Example: RSpec Integration
|
|
58
|
+
```ruby
|
|
59
|
+
require "qualspec/rspec"
|
|
60
|
+
|
|
61
|
+
RSpec.describe MyAgent do
|
|
62
|
+
include Qualspec::RSpec::Helpers
|
|
63
|
+
|
|
64
|
+
it "responds helpfully" do
|
|
65
|
+
response = MyAgent.call("Hello")
|
|
66
|
+
|
|
67
|
+
result = qualspec_evaluate(response, "responds in a friendly manner")
|
|
68
|
+
expect(result).to be_passing
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### CLI Usage
|
|
74
|
+
```shell
|
|
75
|
+
# Run comparison
|
|
76
|
+
qualspec eval/comparison.rb
|
|
77
|
+
|
|
78
|
+
# Generate HTML report
|
|
79
|
+
qualspec --html report.html eval/comparison.rb
|
|
80
|
+
```
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: qualspec
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Stiens
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-04-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: faraday
|
|
@@ -70,12 +70,16 @@ files:
|
|
|
70
70
|
- CHANGELOG.md
|
|
71
71
|
- README.md
|
|
72
72
|
- Rakefile
|
|
73
|
+
- docs/.DS_Store
|
|
73
74
|
- docs/configuration.md
|
|
74
75
|
- docs/evaluation-suites.md
|
|
75
76
|
- docs/getting-started.md
|
|
76
77
|
- docs/recording.md
|
|
77
78
|
- docs/rspec-integration.md
|
|
78
79
|
- docs/rubrics.md
|
|
80
|
+
- docs/to_implement/factory_bot_integration_design.md
|
|
81
|
+
- docs/to_implement/variants_first_pass.md
|
|
82
|
+
- examples/README.md
|
|
79
83
|
- examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml
|
|
80
84
|
- examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml
|
|
81
85
|
- examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml
|
|
@@ -86,9 +90,13 @@ files:
|
|
|
86
90
|
- examples/comparison.rb
|
|
87
91
|
- examples/model_comparison.rb
|
|
88
92
|
- examples/persona_test.rb
|
|
93
|
+
- examples/prompt_variants_factory.rb
|
|
89
94
|
- examples/quick_test.rb
|
|
90
95
|
- examples/report.html
|
|
96
|
+
- examples/results/simple_variant_comparison.json
|
|
91
97
|
- examples/rspec_example_spec.rb
|
|
98
|
+
- examples/simple_variant_comparison.rb
|
|
99
|
+
- examples/variant_comparison.rb
|
|
92
100
|
- exe/qualspec
|
|
93
101
|
- lib/qualspec.rb
|
|
94
102
|
- lib/qualspec/builtin_rubrics.rb
|
|
@@ -96,6 +104,7 @@ files:
|
|
|
96
104
|
- lib/qualspec/configuration.rb
|
|
97
105
|
- lib/qualspec/evaluation.rb
|
|
98
106
|
- lib/qualspec/judge.rb
|
|
107
|
+
- lib/qualspec/prompt_variant.rb
|
|
99
108
|
- lib/qualspec/recorder.rb
|
|
100
109
|
- lib/qualspec/rspec.rb
|
|
101
110
|
- lib/qualspec/rspec/configuration.rb
|
|
@@ -112,6 +121,7 @@ files:
|
|
|
112
121
|
- lib/qualspec/suite/runner.rb
|
|
113
122
|
- lib/qualspec/suite/scenario.rb
|
|
114
123
|
- lib/qualspec/version.rb
|
|
124
|
+
- qualspec_structure.md
|
|
115
125
|
- sig/qualspec.rbs
|
|
116
126
|
homepage: https://github.com/estiens/qualspec
|
|
117
127
|
licenses:
|