qualspec 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +14 -0
- data/.rubocop_todo.yml +1 -1
- data/CHANGELOG.md +31 -0
- data/README.md +27 -5
- data/config/models.yml +23 -0
- data/docs/alpha_readiness.md +94 -0
- data/docs/configuration.md +53 -4
- data/docs/evaluation-suites.md +45 -2
- data/docs/getting-started.md +5 -2
- data/docs/recording.md +22 -0
- data/examples/EXAMPLES.md +73 -0
- data/examples/README.md +5 -0
- data/examples/best_value.rb +67 -0
- data/examples/cassettes/best_value.yml +649 -0
- data/examples/cassettes/character_consistency.yml +680 -0
- data/examples/cassettes/customer_service_comparison.yml +593 -0
- data/examples/cassettes/date_awareness_gate.yml +420 -0
- data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +4 -4
- data/examples/character_consistency.rb +83 -0
- data/examples/comparison.rb +0 -0
- data/examples/customer_service_comparison.rb +59 -0
- data/examples/date_awareness_gate.rb +57 -0
- data/examples/model_comparison.rb +0 -0
- data/examples/persona_test.rb +0 -0
- data/examples/prompt_variants_factory.rb +0 -0
- data/examples/quick_test.rb +0 -0
- data/examples/rspec_example_spec.rb +0 -0
- data/examples/simple_variant_comparison.rb +0 -0
- data/examples/variant_comparison.rb +0 -0
- data/exe/qualspec +4 -4
- data/lib/qualspec/client.rb +14 -7
- data/lib/qualspec/configuration.rb +18 -5
- data/lib/qualspec/judge.rb +1 -1
- data/lib/qualspec/model_registry.rb +62 -0
- data/lib/qualspec/recorder.rb +41 -3
- data/lib/qualspec/suite/candidate.rb +7 -4
- data/lib/qualspec/suite/dsl.rb +16 -1
- data/lib/qualspec/suite/html_reporter.rb +8 -8
- data/lib/qualspec/suite/runner.rb +67 -8
- data/lib/qualspec/version.rb +1 -1
- data/lib/qualspec.rb +17 -0
- data/qualspec_structure.md +9 -3
- metadata +16 -8
- data/.DS_Store +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5fbaa225fce628f9becd02913d717112ec403d51e3d23a563c47ae285a7b0d4b
|
|
4
|
+
data.tar.gz: cfdb7943f93141c9377e809ebb384195ebcdf2ec3a9903c5a4b045b13d3eaef7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c76a9d16a428ff2ae61cb6b0274a1459e50c42aa9375ec966650f102030281e5c601654cb8323c71e845bfd86b0b51aa0a5d04c4487f32a0bf6c988bd9c73cd9
|
|
7
|
+
data.tar.gz: 6a21ec953e67ea7d7717d80de15c800fa0ac4eda3c2d24791543648312f71f436503f569296b138e743319258f41d6763160d10e50287befc9910e3fe10244d4
|
data/.rubocop.yml
CHANGED
|
@@ -1 +1,15 @@
|
|
|
1
1
|
inherit_from: .rubocop_todo.yml
|
|
2
|
+
|
|
3
|
+
inherit_mode:
|
|
4
|
+
merge:
|
|
5
|
+
- Exclude
|
|
6
|
+
|
|
7
|
+
AllCops:
|
|
8
|
+
Exclude:
|
|
9
|
+
# Example scripts are illustrative docs, not library code.
|
|
10
|
+
- 'examples/**/*'
|
|
11
|
+
|
|
12
|
+
# Spec files legitimately have long describe/context blocks.
|
|
13
|
+
Metrics/BlockLength:
|
|
14
|
+
Exclude:
|
|
15
|
+
- 'spec/**/*'
|
data/.rubocop_todo.yml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,36 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.2.0] - 2026-06-28
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- **Named model registry** — curated models in `config/models.yml`, resolved via
|
|
8
|
+
`Qualspec.model(:name)` / `Qualspec.models`. Unknown/blank names fall back to
|
|
9
|
+
the default. Override the file with `QUALSPEC_MODELS_FILE`.
|
|
10
|
+
- **Cost tracking (opt-in)** — `track_cost` in the suite DSL captures per-call
|
|
11
|
+
cost + tokens via OpenRouter usage accounting. `Results#value_ranking` and
|
|
12
|
+
`Results#cost_by_candidate` expose quality-per-dollar analysis (and raise a
|
|
13
|
+
helpful error if cost tracking wasn't enabled).
|
|
14
|
+
- `Recorder.use_cassette` — replays an existing cassette (no API key needed),
|
|
15
|
+
records a fresh one when missing. Used by the new examples.
|
|
16
|
+
- API key is now configurable via `Qualspec.configure { |c| c.api_key = ... }`,
|
|
17
|
+
falling back to `QUALSPEC_API_KEY` then `OPEN_ROUTER_API_KEY`.
|
|
18
|
+
- New runnable examples: `customer_service_comparison`, `date_awareness_gate`,
|
|
19
|
+
`best_value`, `character_consistency` — each ships a committed VCR cassette.
|
|
20
|
+
|
|
21
|
+
### Changed
|
|
22
|
+
|
|
23
|
+
- Default model is now `openrouter/auto` (was `google/gemini-3-flash-preview`),
|
|
24
|
+
so qualspec works with nothing configured.
|
|
25
|
+
- `candidate` no longer requires `model:` — it defaults to the configured model.
|
|
26
|
+
- `Client#chat` requests usage accounting and reads cost from `usage.cost` when
|
|
27
|
+
`with_metadata: true` (previously cost was never captured).
|
|
28
|
+
|
|
29
|
+
### Fixed
|
|
30
|
+
|
|
31
|
+
- Cost data was never populated (metadata was never requested); cost reporting
|
|
32
|
+
now works when `track_cost` is enabled.
|
|
33
|
+
|
|
3
34
|
## [0.1.0] - 2025-12-26
|
|
4
35
|
|
|
5
36
|
### Added
|
data/README.md
CHANGED
|
@@ -10,7 +10,7 @@ gem "qualspec"
|
|
|
10
10
|
|
|
11
11
|
## Configuration
|
|
12
12
|
|
|
13
|
-
Set your API key (
|
|
13
|
+
Set your API key (`OPEN_ROUTER_API_KEY` also works as a fallback):
|
|
14
14
|
|
|
15
15
|
```bash
|
|
16
16
|
export QUALSPEC_API_KEY=your_openrouter_key
|
|
@@ -20,10 +20,31 @@ export QUALSPEC_API_KEY=your_openrouter_key
|
|
|
20
20
|
|
|
21
21
|
| Variable | Description | Default |
|
|
22
22
|
|----------|-------------|---------|
|
|
23
|
-
| `QUALSPEC_API_KEY` | API key (required) | - |
|
|
23
|
+
| `QUALSPEC_API_KEY` | API key (required; falls back to `OPEN_ROUTER_API_KEY`) | - |
|
|
24
24
|
| `QUALSPEC_API_URL` | API endpoint | `https://openrouter.ai/api/v1` |
|
|
25
|
-
| `QUALSPEC_MODEL` | Default model for candidates | `
|
|
25
|
+
| `QUALSPEC_MODEL` | Default model for candidates | `openrouter/auto` |
|
|
26
26
|
| `QUALSPEC_JUDGE_MODEL` | Model used as judge | Same as `QUALSPEC_MODEL` |
|
|
27
|
+
| `QUALSPEC_MODELS_FILE` | Path to the named-models YAML | `config/models.yml` |
|
|
28
|
+
|
|
29
|
+
### Models
|
|
30
|
+
|
|
31
|
+
The default model everywhere is `openrouter/auto`, which routes to a sensible
|
|
32
|
+
model for any request — so qualspec works even with nothing configured. A
|
|
33
|
+
candidate with no `model:` uses this default too.
|
|
34
|
+
|
|
35
|
+
Curated models live in `config/models.yml` and can be referenced by name:
|
|
36
|
+
|
|
37
|
+
```ruby
|
|
38
|
+
candidate :flash, model: Qualspec.model(:deepseek_flash)
|
|
39
|
+
|
|
40
|
+
Qualspec.model(:glm) # => "z-ai/glm-5.2"
|
|
41
|
+
Qualspec.model(:unknown) # => "openrouter/auto" (falls back to default)
|
|
42
|
+
Qualspec.model # => "openrouter/auto"
|
|
43
|
+
Qualspec.models.all # => { "glm" => "z-ai/glm-5.2", ... }
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Edit `config/models.yml` to add/rename models, or point `QUALSPEC_MODELS_FILE`
|
|
47
|
+
at your own. Override the process-wide default with `QUALSPEC_MODEL`.
|
|
27
48
|
|
|
28
49
|
## Quick Start
|
|
29
50
|
|
|
@@ -73,11 +94,12 @@ end
|
|
|
73
94
|
## Documentation
|
|
74
95
|
|
|
75
96
|
- [Getting Started](docs/getting-started.md)
|
|
76
|
-
- [Evaluation Suites](docs/evaluation-suites.md) - CLI for model comparison
|
|
97
|
+
- [Evaluation Suites](docs/evaluation-suites.md) - CLI for model comparison (incl. cost/value tracking)
|
|
77
98
|
- [RSpec Integration](docs/rspec-integration.md) - Testing your agents
|
|
78
99
|
- [Rubrics](docs/rubrics.md) - Builtin and custom evaluation criteria
|
|
79
|
-
- [Configuration](docs/configuration.md) - All options
|
|
100
|
+
- [Configuration](docs/configuration.md) - All options, models, cost tracking
|
|
80
101
|
- [Recording](docs/recording.md) - VCR integration
|
|
102
|
+
- [Examples](examples/EXAMPLES.md) - Runnable scripts (replay free from cassettes)
|
|
81
103
|
|
|
82
104
|
## License
|
|
83
105
|
|
data/config/models.yml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Curated OpenRouter models for qualspec.
|
|
2
|
+
#
|
|
3
|
+
# Reference these by name in suites/specs instead of hardcoding slugs:
|
|
4
|
+
#
|
|
5
|
+
# candidate :flash, model: Qualspec.model(:deepseek_flash)
|
|
6
|
+
# Qualspec.model(:glm) # => "z-ai/glm-5.2"
|
|
7
|
+
# Qualspec.model(:unknown) # => falls back to `default` below
|
|
8
|
+
# Qualspec.model # => the default
|
|
9
|
+
# Qualspec.models.all # => the full name => slug hash
|
|
10
|
+
#
|
|
11
|
+
# `default` is the universal fallback. `openrouter/auto` routes to a sensible
|
|
12
|
+
# model for any request, so things work even with nothing configured.
|
|
13
|
+
# Override per-process with QUALSPEC_MODEL / QUALSPEC_JUDGE_MODEL, or point
|
|
14
|
+
# QUALSPEC_MODELS_FILE at a different YAML file.
|
|
15
|
+
default: openrouter/auto
|
|
16
|
+
|
|
17
|
+
models:
|
|
18
|
+
owl_alpha: openrouter/owl-alpha
|
|
19
|
+
deepseek_flash: deepseek/deepseek-v4-flash
|
|
20
|
+
deepseek_pro: deepseek/deepseek-v4-pro
|
|
21
|
+
minimax: minimax/minimax-m3
|
|
22
|
+
glm: z-ai/glm-5.2
|
|
23
|
+
gemini_flash: google/gemini-3-flash-preview
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Qualspec — Alpha Readiness Findings & Plan
|
|
2
|
+
|
|
3
|
+
_Assessment date: 2026-06-26. Status snapshot before first alpha use cases._
|
|
4
|
+
|
|
5
|
+
> **Update 2026-06-28 (v0.2.0):** Several items below are now DONE:
|
|
6
|
+
> - **Cost tracking fixed** (item 1) — now opt-in via `track_cost`; `Client` sends
|
|
7
|
+
> `usage: {include: true}` and reads `usage.cost`. Verified live + unit-tested.
|
|
8
|
+
> - **API key configurable** — `Qualspec.configure { |c| c.api_key = ... }` wins over
|
|
9
|
+
> `QUALSPEC_API_KEY` → `OPEN_ROUTER_API_KEY` (resolved lazily at read time).
|
|
10
|
+
> - Added named model registry (`Qualspec.model`), `Recorder.use_cassette`, and a
|
|
11
|
+
> `Configuration` spec. Test count is now 74.
|
|
12
|
+
> Still open: items 2–4 below (Client network-boundary spec, more pure units,
|
|
13
|
+
> RSpec-integration spec) and the consistency pass for `api_url`/`default_model`.
|
|
14
|
+
|
|
15
|
+
## TL;DR
|
|
16
|
+
|
|
17
|
+
- **Tests: green** — `bundle exec rake spec` → 44 examples, 0 failures. RuboCop clean except for 10 cosmetic offenses.
|
|
18
|
+
- **No failing tests, no structural problems.** Architecture is clean (core / suite-DSL / rspec surfaces well separated).
|
|
19
|
+
- **One real dead-code bug:** cost tracking never fires.
|
|
20
|
+
- **~half of `lib/` has zero direct test coverage**, including the entire network boundary (`Client`) and the entire RSpec public surface.
|
|
21
|
+
- **Open integration question:** we run a custom OpenRouter gem — `Client` currently talks to `chat/completions` via raw Faraday. Need to decide whether `Client` should delegate to our gem and confirm cost/header shapes line up.
|
|
22
|
+
|
|
23
|
+
> Setup note: repo was locked to gems not installed for Ruby 3.4.9; ran `bundle install` to get green. Confirm alpha testers' Ruby matches `Gemfile.lock`.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## START HERE TOMORROW (ordered)
|
|
28
|
+
|
|
29
|
+
### 0. Custom OpenRouter gem sync (do first — it changes item 1)
|
|
30
|
+
The whole cost/metadata story depends on how our gem surfaces data, so resolve this before wiring cost.
|
|
31
|
+
|
|
32
|
+
- `lib/qualspec/client.rb` is a self-contained Faraday client posting to `chat/completions`. Decide:
|
|
33
|
+
- **(a)** Keep `Client` as-is and just make sure our gem isn't double-wrapping requests, **or**
|
|
34
|
+
- **(b)** Have `Client` delegate to our OpenRouter gem.
|
|
35
|
+
- Confirm the cost extraction in `Client#extract_cost` matches reality from our gem:
|
|
36
|
+
- currently reads header `x-openrouter-cost`, then falls back to body `usage.total_cost` / `cost` (`client.rb:110-117`).
|
|
37
|
+
- Verify our gem/endpoint actually returns one of these, and in what shape (header string vs. nested JSON).
|
|
38
|
+
- Confirm token extraction keys (`prompt_tokens`/`completion_tokens`/`total_tokens`, `client.rb:119-128`) match.
|
|
39
|
+
- Confirm auth headers (`config.api_headers`) and `response_format: { type: 'json_object' }` are honored by our endpoint (see item 4).
|
|
40
|
+
|
|
41
|
+
### 1. Fix cost tracking (dead code) — depends on item 0
|
|
42
|
+
**Bug:** `Candidate#generate_response` (`candidate.rb:22`) calls `Qualspec.client.chat(...)` **without** `with_metadata: true`, so `chat` always returns a `String`, never a `Client::Response`. Every `response.is_a?(Client::Response)` check in the runner (`runner.rb:72,82,83`) is permanently false. Result:
|
|
43
|
+
- `cost:` always `nil` → `Results#costs` always empty
|
|
44
|
+
- both reporters show cost as blank/`-` (`reporter.rb:147`, `html_reporter.rb:367`)
|
|
45
|
+
- the `Response#duration_ms` path is dead (runner's own monotonic timer feeds timing, so timing itself is fine)
|
|
46
|
+
|
|
47
|
+
**Fix:** thread `with_metadata: true` through `Candidate#generate_response` and have the runner consume the `Response` (cost + duration). Plumbing already exists in `Client#extract_cost`/`extract_tokens`. Alternative: remove the cost UI until wired. **Recommend wiring it.** (~30 min once item 0 is settled.)
|
|
48
|
+
|
|
49
|
+
### 2. Add a `Client` spec (highest real-world risk) — ~1 hr
|
|
50
|
+
The network boundary has **zero tests**. Use WebMock. Cover:
|
|
51
|
+
- success path returns content
|
|
52
|
+
- non-2xx → `Client::RequestError` (`handle_response`, `client.rb:85`)
|
|
53
|
+
- `with_metadata: true` → builds `Response` with cost/tokens/duration
|
|
54
|
+
- `extract_cost` header vs. body fallback
|
|
55
|
+
- `validate_api_key!` raises when unset, skips during VCR playback
|
|
56
|
+
- SSL toggle via `QUALSPEC_SSL_VERIFY=false`
|
|
57
|
+
|
|
58
|
+
### 3. Pure-unit specs (high coverage-per-effort) — ~1 hr
|
|
59
|
+
All currently untested, all pure/branchy:
|
|
60
|
+
- `Candidate#normalize_temperature` — Anthropic clamps 0–1, others 0–2 (`candidate.rb:33`)
|
|
61
|
+
- `Scenario#compose_prompt` / `compose_system_prompt` — priority merge `full_prompt > base_prompt > credential-prefix`; `variant > scenario > candidate` (`scenario.rb:60,84`)
|
|
62
|
+
- `VariantsConfig#build_variants` / `#trait_matrix` — cartesian product, name dedup, **FactoryBot-absent fallback** (`dsl.rb:100,111`)
|
|
63
|
+
- `PromptVariant` — `temperature=` range validation, `variant_key`, `customized?`, `to_h.compact` (`prompt_variant.rb:55,71`)
|
|
64
|
+
|
|
65
|
+
### 4. One RSpec-integration spec — ~30 min
|
|
66
|
+
Entire RSpec public surface (`helpers`, `matchers`, `evaluation_result`, `rspec/configuration`) is untested. Stub `Qualspec.judge`, exercise `qualspec_evaluate` + a couple matchers (`be_passing`, `have_score_above`). Also covers the symbol/string key fallback in `wrap_comparison_results` (`helpers.rb:143`).
|
|
67
|
+
|
|
68
|
+
### 5. Small cleanups / docs
|
|
69
|
+
- Drop duplicate `finish!` (called in `runner.rb:40` AND `qualspec.rb:81`).
|
|
70
|
+
- Document in README: **scoring is comparative for 2+ candidates, absolute for 1** (`runner.rb:125`) — single-candidate and multi-candidate scores are NOT apples-to-apples.
|
|
71
|
+
- `chmod +x` the two example files flagged by RuboCop.
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Full findings reference
|
|
76
|
+
|
|
77
|
+
### Bugs / behavior gaps
|
|
78
|
+
| # | Item | Location | Severity |
|
|
79
|
+
|---|------|----------|----------|
|
|
80
|
+
| 1 | Cost/metadata never captured (`with_metadata` never passed) | `candidate.rb:22` → `runner.rb:72,82,83` | **High** (dead feature) |
|
|
81
|
+
| 2 | `finish!` called twice | `runner.rb:40`, `qualspec.rb:81` | Low |
|
|
82
|
+
| 3 | Judge JSON-parse failure returns score 0 — indistinguishable from a genuinely bad response. Cross-provider endpoints that don't honor `json_object` will look like "model failed." | `judge.rb:162` | Medium (provider alpha) |
|
|
83
|
+
| 4 | Comparative (2+) vs absolute (1) scoring — cross-run scores not comparable | `runner.rb:125` | Doc note |
|
|
84
|
+
|
|
85
|
+
### Test coverage map
|
|
86
|
+
**Has coverage:** `Judge` (happy + error), `Results`/`Runner` aggregation, `HtmlReporter`, top-level registry methods.
|
|
87
|
+
|
|
88
|
+
**Zero coverage — ranked:**
|
|
89
|
+
- **High:** `Client` (network boundary); entire RSpec integration (`helpers`/`matchers`/`evaluation_result`/`rspec/configuration`); `VariantsConfig` (headline feature); `Scenario#compose_prompt`.
|
|
90
|
+
- **Medium:** `Candidate#normalize_temperature`; `PromptVariant` validation/keys; `Judge` untested branches (`winner: "tie"` `judge.rb:202`, `clamp(0,10)`, missing-candidate `judge.rb:190`).
|
|
91
|
+
- **Low:** `Reporter` (stdout/json); CLI `exe/qualspec` smoke test; `Recorder`; builtin rubric/behavior content.
|
|
92
|
+
|
|
93
|
+
### Effort estimate
|
|
94
|
+
Items 0–5 ≈ **half a day to a day**. Items 1–4 take coverage from ~half to most of the meaningful surface and close the one real bug.
|
data/docs/configuration.md
CHANGED
|
@@ -4,16 +4,21 @@
|
|
|
4
4
|
|
|
5
5
|
| Variable | Description | Default |
|
|
6
6
|
|----------|-------------|---------|
|
|
7
|
-
| `QUALSPEC_API_KEY` | API key (
|
|
7
|
+
| `QUALSPEC_API_KEY` | API key (falls back to `OPEN_ROUTER_API_KEY`) | - |
|
|
8
8
|
| `QUALSPEC_API_URL` | API endpoint | `https://openrouter.ai/api/v1` |
|
|
9
|
-
| `QUALSPEC_MODEL` | Default model for candidates | `
|
|
9
|
+
| `QUALSPEC_MODEL` | Default model for candidates | `openrouter/auto` |
|
|
10
10
|
| `QUALSPEC_JUDGE_MODEL` | Model for judging | Same as `QUALSPEC_MODEL` |
|
|
11
|
+
| `QUALSPEC_MODELS_FILE` | Path to the named-models YAML | `config/models.yml` |
|
|
11
12
|
| `QUALSPEC_SSL_VERIFY` | SSL verification (disable with `false`) | `true` |
|
|
12
13
|
|
|
13
14
|
### Required Setup
|
|
14
15
|
|
|
16
|
+
Provide an API key via env var (or set it programmatically — see below):
|
|
17
|
+
|
|
15
18
|
```bash
|
|
16
19
|
export QUALSPEC_API_KEY=your_openrouter_api_key
|
|
20
|
+
# or, equivalently for OpenRouter:
|
|
21
|
+
export OPEN_ROUTER_API_KEY=sk-or-...
|
|
17
22
|
```
|
|
18
23
|
|
|
19
24
|
### Using Different Providers
|
|
@@ -42,10 +47,10 @@ export QUALSPEC_API_URL=http://localhost:11434/v1
|
|
|
42
47
|
Qualspec.configure do |config|
|
|
43
48
|
# API settings
|
|
44
49
|
config.api_url = "https://openrouter.ai/api/v1"
|
|
45
|
-
config.api_key = ENV["MY_API_KEY"]
|
|
50
|
+
config.api_key = ENV["MY_API_KEY"] # wins over QUALSPEC_API_KEY / OPEN_ROUTER_API_KEY
|
|
46
51
|
|
|
47
52
|
# Models
|
|
48
|
-
config.default_model = "
|
|
53
|
+
config.default_model = "openrouter/auto"
|
|
49
54
|
config.judge_model = "openai/gpt-4"
|
|
50
55
|
|
|
51
56
|
# Timeouts
|
|
@@ -58,6 +63,50 @@ Qualspec.configure do |config|
|
|
|
58
63
|
end
|
|
59
64
|
```
|
|
60
65
|
|
|
66
|
+
## Models
|
|
67
|
+
|
|
68
|
+
The default model everywhere is `openrouter/auto`, which routes to a sensible
|
|
69
|
+
model for any request — so qualspec works even with nothing configured. A
|
|
70
|
+
`candidate` with no `model:` uses this default.
|
|
71
|
+
|
|
72
|
+
Curated models live in `config/models.yml` and are referenced by name:
|
|
73
|
+
|
|
74
|
+
```yaml
|
|
75
|
+
# config/models.yml
|
|
76
|
+
default: openrouter/auto
|
|
77
|
+
models:
|
|
78
|
+
glm: z-ai/glm-5.2
|
|
79
|
+
deepseek_flash: deepseek/deepseek-v4-flash
|
|
80
|
+
deepseek_pro: deepseek/deepseek-v4-pro
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
```ruby
|
|
84
|
+
Qualspec.model(:glm) # => "z-ai/glm-5.2"
|
|
85
|
+
Qualspec.model(:unknown) # => "openrouter/auto" (falls back to default)
|
|
86
|
+
Qualspec.model # => "openrouter/auto"
|
|
87
|
+
Qualspec.models.all # => { "glm" => "z-ai/glm-5.2", ... }
|
|
88
|
+
|
|
89
|
+
candidate :flash, model: Qualspec.model(:deepseek_flash)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Point `QUALSPEC_MODELS_FILE` at a different YAML to use your own list.
|
|
93
|
+
|
|
94
|
+
## Cost Tracking
|
|
95
|
+
|
|
96
|
+
Cost capture is **opt-in**. Enable `track_cost` in a suite to have qualspec
|
|
97
|
+
request OpenRouter usage accounting and record per-call cost + tokens:
|
|
98
|
+
|
|
99
|
+
```ruby
|
|
100
|
+
Qualspec.evaluation "Best Value" do
|
|
101
|
+
track_cost
|
|
102
|
+
# ...
|
|
103
|
+
end
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Then `results.value_ranking` (quality per dollar) and `results.cost_by_candidate`
|
|
107
|
+
become available. Calling them without `track_cost` raises a clear error. See
|
|
108
|
+
[Evaluation Suites](evaluation-suites.md#cost-tracking) for details.
|
|
109
|
+
|
|
61
110
|
## RSpec Configuration
|
|
62
111
|
|
|
63
112
|
Additional configuration for RSpec integration:
|
data/docs/evaluation-suites.md
CHANGED
|
@@ -29,8 +29,14 @@ candidates do
|
|
|
29
29
|
system: "You are an extremely helpful assistant."
|
|
30
30
|
|
|
31
31
|
# Multiple candidates
|
|
32
|
-
candidate "gemini", model: "google/gemini-
|
|
33
|
-
candidate "grok", model: "x-ai/grok-
|
|
32
|
+
candidate "gemini", model: "google/gemini-3-flash-preview"
|
|
33
|
+
candidate "grok", model: "x-ai/grok-4.1-fast"
|
|
34
|
+
|
|
35
|
+
# Reference a curated model by name (see config/models.yml)
|
|
36
|
+
candidate "glm", model: Qualspec.model(:glm)
|
|
37
|
+
|
|
38
|
+
# Omit model: entirely to use the default (openrouter/auto)
|
|
39
|
+
candidate "auto"
|
|
34
40
|
end
|
|
35
41
|
```
|
|
36
42
|
|
|
@@ -194,6 +200,43 @@ end
|
|
|
194
200
|
|
|
195
201
|
This runs: **2 candidates × 4 variants × 3 temperatures × 1 scenario = 24 evaluations**
|
|
196
202
|
|
|
203
|
+
## Cost Tracking
|
|
204
|
+
|
|
205
|
+
Cost capture is opt-in. Add `track_cost` to a suite to record per-call cost and
|
|
206
|
+
tokens (via OpenRouter usage accounting) so you can analyze the quality/cost
|
|
207
|
+
trade-off:
|
|
208
|
+
|
|
209
|
+
```ruby
|
|
210
|
+
Qualspec.evaluation "Best Value" do
|
|
211
|
+
track_cost
|
|
212
|
+
|
|
213
|
+
candidates do
|
|
214
|
+
candidate :flash, model: Qualspec.model(:deepseek_flash) # cheap
|
|
215
|
+
candidate :pro, model: Qualspec.model(:deepseek_pro) # pricey
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
scenario "reasoning task" do
|
|
219
|
+
prompt "A bat and ball cost $1.10. The bat is $1 more than the ball. How much is the ball?"
|
|
220
|
+
rubric :reasoning_quality
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
After running, the `Results` object exposes cost analysis:
|
|
226
|
+
|
|
227
|
+
```ruby
|
|
228
|
+
results.costs_tracked? # => true
|
|
229
|
+
results.cost_by_candidate # => { "flash" => 0.00014, "pro" => 0.00095 }
|
|
230
|
+
|
|
231
|
+
# Rank by quality-per-dollar (avg score ÷ cost), best first:
|
|
232
|
+
results.value_ranking
|
|
233
|
+
# => { "flash" => { avg_score: 9.5, cost: 0.00014, score_per_dollar: 63982 }, ... }
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Calling `value_ranking` or `cost_by_candidate` without `track_cost` raises a
|
|
237
|
+
clear error telling you to enable it. See `examples/best_value.rb` for a full
|
|
238
|
+
runnable example.
|
|
239
|
+
|
|
197
240
|
## Using Behaviors (Shared Scenarios)
|
|
198
241
|
|
|
199
242
|
Define reusable scenario sets:
|
data/docs/getting-started.md
CHANGED
|
@@ -18,7 +18,7 @@ bundle install
|
|
|
18
18
|
|
|
19
19
|
## Configuration
|
|
20
20
|
|
|
21
|
-
Set your API key:
|
|
21
|
+
Set your API key (`OPEN_ROUTER_API_KEY` also works as a fallback):
|
|
22
22
|
|
|
23
23
|
```bash
|
|
24
24
|
export QUALSPEC_API_KEY=your_openrouter_key
|
|
@@ -41,11 +41,14 @@ You can also configure programmatically:
|
|
|
41
41
|
Qualspec.configure do |config|
|
|
42
42
|
config.api_url = "https://openrouter.ai/api/v1"
|
|
43
43
|
config.api_key = ENV["MY_API_KEY"]
|
|
44
|
-
config.judge_model = "
|
|
44
|
+
config.judge_model = "openrouter/auto"
|
|
45
45
|
config.request_timeout = 120
|
|
46
46
|
end
|
|
47
47
|
```
|
|
48
48
|
|
|
49
|
+
Models can be referenced by name from `config/models.yml` via `Qualspec.model(:name)`,
|
|
50
|
+
and the default everywhere is `openrouter/auto`. See [Configuration](configuration.md#models).
|
|
51
|
+
|
|
49
52
|
## Two Ways to Use Qualspec
|
|
50
53
|
|
|
51
54
|
### 1. Evaluation Suites (CLI)
|
data/docs/recording.md
CHANGED
|
@@ -29,6 +29,28 @@ qualspec --playback my_session eval/suite.rb
|
|
|
29
29
|
|
|
30
30
|
Replays from the cassette with no network calls. Fails if a request isn't in the cassette.
|
|
31
31
|
|
|
32
|
+
## Programmatic Recording
|
|
33
|
+
|
|
34
|
+
For scripts (like the files in `examples/`), drive the recorder directly. Point
|
|
35
|
+
it at a cassette directory, then use `use_cassette`, which **replays an existing
|
|
36
|
+
cassette without needing an API key** and records a fresh one only when it's
|
|
37
|
+
missing:
|
|
38
|
+
|
|
39
|
+
```ruby
|
|
40
|
+
Qualspec::Recorder.setup(cassette_dir: File.expand_path("cassettes", __dir__))
|
|
41
|
+
|
|
42
|
+
Qualspec::Recorder.use_cassette("my_run") do
|
|
43
|
+
Qualspec.run("My Suite", output: :stdout)
|
|
44
|
+
end
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
This is how the bundled examples ship committed cassettes that anyone can replay
|
|
48
|
+
for free. To re-record, delete the cassette and run again with a key set.
|
|
49
|
+
|
|
50
|
+
`Recorder.record(name)` (record new, replay existing) and
|
|
51
|
+
`Recorder.playback(name)` (replay only, error on miss) are also available when
|
|
52
|
+
you want to force a mode.
|
|
53
|
+
|
|
32
54
|
## RSpec Recording
|
|
33
55
|
|
|
34
56
|
### Per-Test Recording
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Qualspec Examples
|
|
2
|
+
|
|
3
|
+
Runnable scripts that show different ways to use qualspec. Each one **ships with a
|
|
4
|
+
committed VCR cassette** in `examples/cassettes/`, so it replays for free — no API key,
|
|
5
|
+
no credits, no network:
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
bundle exec ruby examples/customer_service_comparison.rb
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
To run against the **live API** instead, delete the cassette and provide a key
|
|
12
|
+
(the examples read `OPEN_ROUTER_API_KEY` / `QUALSPEC_API_KEY`):
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
rm examples/cassettes/customer_service_comparison.yml
|
|
16
|
+
OPEN_ROUTER_API_KEY=sk-... bundle exec ruby examples/customer_service_comparison.rb
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Cassettes are recorded with API keys filtered out (replaced by `<API_KEY>`). Models are
|
|
20
|
+
referenced by name from `config/models.yml` via `Qualspec.model(:name)`.
|
|
21
|
+
|
|
22
|
+
## The showcase
|
|
23
|
+
|
|
24
|
+
| Example | Use case | What it demonstrates |
|
|
25
|
+
|---------|----------|----------------------|
|
|
26
|
+
| [`customer_service_comparison.rb`](customer_service_comparison.rb) | Pick the best model for a support agent | Suite DSL, multi-candidate **comparative judging**, per-scenario winners, custom rubric |
|
|
27
|
+
| [`date_awareness_gate.rb`](date_awareness_gate.rb) | CI gate: fail if a model hallucinates | **Pass/fail thresholding** + non-zero exit code as a regression gate |
|
|
28
|
+
| [`best_value.rb`](best_value.rb) | "Is the pricey model worth it?" | **Cost tracking** (`track_cost`) and `value_ranking` — quality per dollar |
|
|
29
|
+
| [`character_consistency.rb`](character_consistency.rb) | Rank models on a multi-turn role-play | The **lower-level API** (`Qualspec.client` + `Qualspec.judge`) for multi-turn / agent evaluation |
|
|
30
|
+
|
|
31
|
+
## What each one shows
|
|
32
|
+
|
|
33
|
+
### `customer_service_comparison.rb` — compare models, find the best
|
|
34
|
+
Three models answer the same support scenarios (an angry refund demand, an
|
|
35
|
+
ambiguous discount question) under one system prompt. The judge compares them
|
|
36
|
+
head-to-head and declares a per-scenario winner. This is the core
|
|
37
|
+
"which model is best for this job?" workflow, driven entirely by the suite DSL
|
|
38
|
+
and a custom `Qualspec.define_rubric`.
|
|
39
|
+
|
|
40
|
+
### `date_awareness_gate.rb` — pass/fail gate for CI
|
|
41
|
+
A single criterion: a model should *not* confidently assert today's date (it has
|
|
42
|
+
no real-time clock). The script judges each model and **exits non-zero if any
|
|
43
|
+
fails**, so you can drop it into CI as a guard when swapping models. In the
|
|
44
|
+
recorded run, one model correctly declines while two confidently hallucinate a
|
|
45
|
+
date — the gate fails (exit 1), as intended.
|
|
46
|
+
|
|
47
|
+
### `best_value.rb` — best response per dollar
|
|
48
|
+
Enables `track_cost`, which makes qualspec request OpenRouter usage accounting
|
|
49
|
+
and capture real per-call cost. It then reports both the **quality winner** and
|
|
50
|
+
the **value winner** via `results.value_ranking` (avg score ÷ cost). The
|
|
51
|
+
recorded run shows a cheap model winning on value while the pro model costs
|
|
52
|
+
several times more for a marginal quality gain.
|
|
53
|
+
|
|
54
|
+
> Cost analysis requires `track_cost`. Calling `value_ranking` / `cost_by_candidate`
|
|
55
|
+
> without it raises a clear error telling you to enable it.
|
|
56
|
+
|
|
57
|
+
### `character_consistency.rb` — multi-turn ranking via the building blocks
|
|
58
|
+
The suite DSL is single-prompt per scenario, so evaluating a *conversation* drops
|
|
59
|
+
down to qualspec's primitives: `Qualspec.client.chat` with a running message
|
|
60
|
+
history to role-play a game NPC over several turns, then
|
|
61
|
+
`Qualspec.judge.evaluate_comparison` to rank the full transcripts on persona
|
|
62
|
+
consistency. A good template for vetting an agent or a new model with a custom
|
|
63
|
+
harness.
|
|
64
|
+
|
|
65
|
+
## Other examples in this directory
|
|
66
|
+
|
|
67
|
+
- `simple_variant_comparison.rb`, `variant_comparison.rb`,
|
|
68
|
+
`prompt_variants_factory.rb` — the **variant + temperature matrix** feature
|
|
69
|
+
(FactoryBot-backed prompt permutations).
|
|
70
|
+
- `rspec_example_spec.rb` — using qualspec **inside RSpec** (`qualspec_evaluate`,
|
|
71
|
+
`qualspec_compare`, matchers, VCR).
|
|
72
|
+
- `comparison.rb`, `model_comparison.rb`, `persona_test.rb`, `quick_test.rb` —
|
|
73
|
+
smaller/older one-off snippets.
|
data/examples/README.md
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# Qualspec Examples
|
|
2
2
|
|
|
3
|
+
> **See [EXAMPLES.md](EXAMPLES.md) for the full showcase** — runnable scripts
|
|
4
|
+
> for model comparison, pass/fail gates, cost/value analysis, and multi-turn
|
|
5
|
+
> ranking, each replayable for free from a committed VCR cassette. The notes
|
|
6
|
+
> below cover the variant-matrix examples specifically.
|
|
7
|
+
|
|
3
8
|
## Simple Variant Comparison
|
|
4
9
|
|
|
5
10
|
Demonstrates multi-dimensional testing with prompt variants.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Example: Best response-per-dollar (cost/value trade-off)
|
|
5
|
+
# -------------------------------------------------------
|
|
6
|
+
# Use case: "Is the expensive model actually worth it for this task?"
|
|
7
|
+
#
|
|
8
|
+
# Enables `track_cost`, which makes qualspec capture per-call cost from
|
|
9
|
+
# OpenRouter's usage accounting. It then reports both the quality winner and the
|
|
10
|
+
# value winner (avg score per dollar), so you can decide whether a pricier model
|
|
11
|
+
# earns its keep.
|
|
12
|
+
#
|
|
13
|
+
# Run it (replays from cassette, no credits needed):
|
|
14
|
+
#
|
|
15
|
+
# bundle exec ruby examples/best_value.rb
|
|
16
|
+
|
|
17
|
+
require 'bundler/setup'
|
|
18
|
+
require 'qualspec'
|
|
19
|
+
|
|
20
|
+
Qualspec.define_rubric :reasoning_quality do
|
|
21
|
+
criterion 'Reaches the correct conclusion'
|
|
22
|
+
criterion 'Shows clear, valid step-by-step reasoning'
|
|
23
|
+
criterion 'Does not pad with irrelevant filler'
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
Qualspec.evaluation 'Best Value' do
|
|
27
|
+
track_cost # <-- capture per-call cost so value_ranking works
|
|
28
|
+
|
|
29
|
+
candidates do
|
|
30
|
+
candidate :flash, model: Qualspec.model(:deepseek_flash) # cheap
|
|
31
|
+
candidate :glm, model: Qualspec.model(:glm) # mid
|
|
32
|
+
candidate :pro, model: Qualspec.model(:deepseek_pro) # pricey
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
scenario 'Word problem' do
|
|
36
|
+
prompt 'A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. ' \
|
|
37
|
+
'How much does the ball cost? Explain your reasoning.'
|
|
38
|
+
rubric :reasoning_quality
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
scenario 'Logic puzzle' do
|
|
42
|
+
prompt 'If all Bloops are Razzies and all Razzies are Lazzies, are all Bloops definitely Lazzies? ' \
|
|
43
|
+
'Explain.'
|
|
44
|
+
rubric :reasoning_quality
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
if __FILE__ == $PROGRAM_NAME
|
|
49
|
+
Qualspec::Recorder.setup(cassette_dir: File.expand_path('cassettes', __dir__))
|
|
50
|
+
|
|
51
|
+
results = Qualspec::Recorder.use_cassette('best_value') do
|
|
52
|
+
Qualspec.run('Best Value', progress: true, output: :stdout)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
puts
|
|
56
|
+
puts '=== Value ranking (quality per dollar) ==='
|
|
57
|
+
printf("%-8s %8s %12s %14s\n", 'model', 'score', 'cost', 'score/$')
|
|
58
|
+
results.value_ranking.each do |candidate, v|
|
|
59
|
+
printf("%-8s %8.2f %12.6f %14s\n", candidate, v[:avg_score], v[:cost], v[:score_per_dollar] || 'n/a')
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
best_quality = results.scores_by_candidate.max_by { |_, s| s[:avg_score] }&.first
|
|
63
|
+
best_value = results.value_ranking.keys.first
|
|
64
|
+
puts
|
|
65
|
+
puts "Highest quality: #{best_quality}"
|
|
66
|
+
puts "Best value: #{best_value}"
|
|
67
|
+
end
|