qualspec 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.DS_Store +0 -0
- data/.rubocop_todo.yml +29 -19
- data/docs/.DS_Store +0 -0
- data/docs/to_implement/factory_bot_integration_design.md +819 -0
- data/docs/to_implement/variants_first_pass.md +480 -0
- data/examples/README.md +63 -0
- data/examples/prompt_variants_factory.rb +98 -0
- data/examples/results/simple_variant_comparison.json +340 -0
- data/examples/simple_variant_comparison.rb +68 -0
- data/examples/variant_comparison.rb +71 -0
- data/lib/qualspec/prompt_variant.rb +94 -0
- data/lib/qualspec/suite/dsl.rb +3 -5
- data/lib/qualspec/suite/reporter.rb +2 -6
- data/lib/qualspec/suite/runner.rb +3 -7
- data/lib/qualspec/version.rb +1 -1
- data/qualspec_structure.md +80 -0
- metadata +13 -2
|
@@ -81,9 +81,7 @@ module Qualspec
|
|
|
81
81
|
end
|
|
82
82
|
|
|
83
83
|
# Phase 2: Judge all responses together
|
|
84
|
-
if responses.any?
|
|
85
|
-
judge_responses(responses, scenario, variant, temperature, progress: progress)
|
|
86
|
-
end
|
|
84
|
+
judge_responses(responses, scenario, variant, temperature, progress: progress) if responses.any?
|
|
87
85
|
|
|
88
86
|
# Record errors
|
|
89
87
|
record_errors(errors, scenario, variant, temperature)
|
|
@@ -231,8 +229,7 @@ module Qualspec
|
|
|
231
229
|
@finished_at = nil
|
|
232
230
|
end
|
|
233
231
|
|
|
234
|
-
def record_response(candidate:, scenario:, variant: 'default', temperature: nil,
|
|
235
|
-
response:, duration_ms: nil, cost: nil, variant_data: nil)
|
|
232
|
+
def record_response(candidate:, scenario:, response:, variant: 'default', temperature: nil, duration_ms: nil, cost: nil, variant_data: nil)
|
|
236
233
|
# Store in nested structure
|
|
237
234
|
@responses[candidate] ||= {}
|
|
238
235
|
@responses[candidate][scenario] ||= {}
|
|
@@ -253,8 +250,7 @@ module Qualspec
|
|
|
253
250
|
@costs[candidate] += cost
|
|
254
251
|
end
|
|
255
252
|
|
|
256
|
-
def record_evaluation(candidate:, scenario:, variant: 'default', temperature: nil,
|
|
257
|
-
criteria:, evaluation:, winner: nil)
|
|
253
|
+
def record_evaluation(candidate:, scenario:, criteria:, evaluation:, variant: 'default', temperature: nil, winner: nil)
|
|
258
254
|
@evaluations << {
|
|
259
255
|
candidate: candidate,
|
|
260
256
|
scenario: scenario,
|
data/lib/qualspec/version.rb
CHANGED
|
data/qualspec_structure.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Qualspec - Key Structure
|
|
2
|
+
|
|
3
|
+
## Repository: estiens/qualspec
|
|
4
|
+
|
|
5
|
+
### Description
|
|
6
|
+
LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and test subjective qualities that traditional assertions can't capture.
|
|
7
|
+
|
|
8
|
+
### Core Library Files (lib/qualspec/)
|
|
9
|
+
- **builtin_rubrics.rb** - Built-in evaluation criteria
|
|
10
|
+
- **client.rb** - API client for LLM interactions
|
|
11
|
+
- **configuration.rb** - Configuration management
|
|
12
|
+
- **evaluation.rb** - Core evaluation logic
|
|
13
|
+
- **judge.rb** - LLM judge implementation
|
|
14
|
+
- **recorder.rb** - VCR integration for recording
|
|
15
|
+
- **rspec.rb** - RSpec integration entry point
|
|
16
|
+
- **rubric.rb** - Custom rubric definitions
|
|
17
|
+
- **version.rb** - Version info
|
|
18
|
+
|
|
19
|
+
### Subdirectories
|
|
20
|
+
- **rspec/** - RSpec helpers and matchers
|
|
21
|
+
- **suite/** - Evaluation suite components
|
|
22
|
+
|
|
23
|
+
### Configuration Environment Variables
|
|
24
|
+
| Variable | Description | Default |
|
|
25
|
+
|----------|-------------|---------|
|
|
26
|
+
| QUALSPEC_API_KEY | API key (required) | - |
|
|
27
|
+
| QUALSPEC_API_URL | API endpoint | https://openrouter.ai/api/v1 |
|
|
28
|
+
| QUALSPEC_MODEL | Default model for candidates | google/gemini-3-flash-preview |
|
|
29
|
+
| QUALSPEC_JUDGE_MODEL | Model used as judge | Same as QUALSPEC_MODEL |
|
|
30
|
+
|
|
31
|
+
### Key Features
|
|
32
|
+
1. **Model Comparison CLI** - Compare multiple models on the same prompts
|
|
33
|
+
2. **LLM Judge** - Use an LLM to evaluate responses qualitatively
|
|
34
|
+
3. **RSpec Integration** - Test your agents with qualitative assertions
|
|
35
|
+
4. **Built-in Rubrics** - Pre-defined evaluation criteria
|
|
36
|
+
5. **Custom Rubrics** - Define your own evaluation criteria
|
|
37
|
+
6. **VCR Recording** - Record and replay API calls for testing
|
|
38
|
+
7. **HTML Reports** - Generate visual comparison reports
|
|
39
|
+
|
|
40
|
+
### Example: Model Comparison
|
|
41
|
+
```ruby
|
|
42
|
+
# eval/comparison.rb
|
|
43
|
+
Qualspec.evaluation "Model Comparison" do
|
|
44
|
+
candidates do
|
|
45
|
+
candidate "gpt4", model: "openai/gpt-4"
|
|
46
|
+
candidate "claude", model: "anthropic/claude-3-sonnet"
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
scenario "helpfulness" do
|
|
50
|
+
prompt "How do I center a div in CSS?"
|
|
51
|
+
eval "provides a working solution"
|
|
52
|
+
eval "explains the approach"
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Example: RSpec Integration
|
|
58
|
+
```ruby
|
|
59
|
+
require "qualspec/rspec"
|
|
60
|
+
|
|
61
|
+
RSpec.describe MyAgent do
|
|
62
|
+
include Qualspec::RSpec::Helpers
|
|
63
|
+
|
|
64
|
+
it "responds helpfully" do
|
|
65
|
+
response = MyAgent.call("Hello")
|
|
66
|
+
|
|
67
|
+
result = qualspec_evaluate(response, "responds in a friendly manner")
|
|
68
|
+
expect(result).to be_passing
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### CLI Usage
|
|
74
|
+
```shell
|
|
75
|
+
# Run comparison
|
|
76
|
+
qualspec eval/comparison.rb
|
|
77
|
+
|
|
78
|
+
# Generate HTML report
|
|
79
|
+
qualspec --html report.html eval/comparison.rb
|
|
80
|
+
```
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: qualspec
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Stiens
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-01-05 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: faraday
|
|
@@ -62,6 +62,7 @@ executables:
|
|
|
62
62
|
extensions: []
|
|
63
63
|
extra_rdoc_files: []
|
|
64
64
|
files:
|
|
65
|
+
- ".DS_Store"
|
|
65
66
|
- ".qualspec_cassettes/comparison_test.yml"
|
|
66
67
|
- ".qualspec_cassettes/quick_test.yml"
|
|
67
68
|
- ".rspec"
|
|
@@ -70,12 +71,16 @@ files:
|
|
|
70
71
|
- CHANGELOG.md
|
|
71
72
|
- README.md
|
|
72
73
|
- Rakefile
|
|
74
|
+
- docs/.DS_Store
|
|
73
75
|
- docs/configuration.md
|
|
74
76
|
- docs/evaluation-suites.md
|
|
75
77
|
- docs/getting-started.md
|
|
76
78
|
- docs/recording.md
|
|
77
79
|
- docs/rspec-integration.md
|
|
78
80
|
- docs/rubrics.md
|
|
81
|
+
- docs/to_implement/factory_bot_integration_design.md
|
|
82
|
+
- docs/to_implement/variants_first_pass.md
|
|
83
|
+
- examples/README.md
|
|
79
84
|
- examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml
|
|
80
85
|
- examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml
|
|
81
86
|
- examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml
|
|
@@ -86,9 +91,13 @@ files:
|
|
|
86
91
|
- examples/comparison.rb
|
|
87
92
|
- examples/model_comparison.rb
|
|
88
93
|
- examples/persona_test.rb
|
|
94
|
+
- examples/prompt_variants_factory.rb
|
|
89
95
|
- examples/quick_test.rb
|
|
90
96
|
- examples/report.html
|
|
97
|
+
- examples/results/simple_variant_comparison.json
|
|
91
98
|
- examples/rspec_example_spec.rb
|
|
99
|
+
- examples/simple_variant_comparison.rb
|
|
100
|
+
- examples/variant_comparison.rb
|
|
92
101
|
- exe/qualspec
|
|
93
102
|
- lib/qualspec.rb
|
|
94
103
|
- lib/qualspec/builtin_rubrics.rb
|
|
@@ -96,6 +105,7 @@ files:
|
|
|
96
105
|
- lib/qualspec/configuration.rb
|
|
97
106
|
- lib/qualspec/evaluation.rb
|
|
98
107
|
- lib/qualspec/judge.rb
|
|
108
|
+
- lib/qualspec/prompt_variant.rb
|
|
99
109
|
- lib/qualspec/recorder.rb
|
|
100
110
|
- lib/qualspec/rspec.rb
|
|
101
111
|
- lib/qualspec/rspec/configuration.rb
|
|
@@ -112,6 +122,7 @@ files:
|
|
|
112
122
|
- lib/qualspec/suite/runner.rb
|
|
113
123
|
- lib/qualspec/suite/scenario.rb
|
|
114
124
|
- lib/qualspec/version.rb
|
|
125
|
+
- qualspec_structure.md
|
|
115
126
|
- sig/qualspec.rbs
|
|
116
127
|
homepage: https://github.com/estiens/qualspec
|
|
117
128
|
licenses:
|