qualspec 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/qualspec/judge.rb +1 -1
- data/lib/qualspec/suite/html_reporter.rb +8 -8
- data/lib/qualspec/suite/runner.rb +18 -7
- data/lib/qualspec/version.rb +1 -1
- metadata +2 -3
- data/.DS_Store +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 004a0ca49d3bcb6890cadde4f994ddf6ce9585f06f4c81798644d4e02c74de5a
|
|
4
|
+
data.tar.gz: c3994806a042fc9693cdb7e6e9d0579a644edf811b4f529080b216506056aee0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dfad9e00f04bc7552f8641c1d12ce956d2e58f66d120a495274e4ca5e6af85a701866f0f8e3951c0a031e72e56f486ebcc041b52be61faadcd8f4822cfebee0d
|
|
7
|
+
data.tar.gz: d4192bc2492c27472169c30b024061ce969aa65d3855dc4c1b0b26d25db2f2fac46da1070c99418d61586cf475d8c759ee8db1a37673f17270ebcbc222f407ea
|
data/lib/qualspec/judge.rb
CHANGED
|
@@ -141,7 +141,7 @@ module Qualspec
|
|
|
141
141
|
parts << '## Responses:'
|
|
142
142
|
|
|
143
143
|
responses.each do |candidate, response|
|
|
144
|
-
parts << "\n### #{candidate}:\n#{response}"
|
|
144
|
+
parts << "\n### #{candidate}:\n```\n#{response}\n```"
|
|
145
145
|
end
|
|
146
146
|
|
|
147
147
|
parts << "\nScore each candidate (#{candidate_names}) from 0-10."
|
data/lib/qualspec/suite/html_reporter.rb
CHANGED
|
@@ -481,10 +481,12 @@ module Qualspec
|
|
|
481
481
|
|
|
482
482
|
scenario_blocks = scenarios.map do |scenario|
|
|
483
483
|
response_cards = responses.map do |candidate, candidate_responses|
|
|
484
|
-
|
|
485
|
-
next unless
|
|
484
|
+
variant_map = candidate_responses[scenario]
|
|
485
|
+
next unless variant_map
|
|
486
486
|
|
|
487
|
-
|
|
487
|
+
contents = variant_map.flat_map { |_v, tm| tm.values.map { |d| d[:content] } }.compact
|
|
488
|
+
response_text = contents.join("\n\n---\n\n").strip
|
|
489
|
+
next if response_text.empty?
|
|
488
490
|
|
|
489
491
|
<<~CARD
|
|
490
492
|
<div class="response-card">
|
|
@@ -660,13 +662,11 @@ module Qualspec
|
|
|
660
662
|
end
|
|
661
663
|
|
|
662
664
|
def get_candidate_model(candidate)
|
|
663
|
-
|
|
664
|
-
@results.evaluations.find { |e| e[:candidate] == candidate }&.dig(:model) || 'unknown'
|
|
665
|
+
@results.candidate_models[candidate] || 'unknown'
|
|
665
666
|
end
|
|
666
667
|
|
|
667
|
-
def get_scenario_prompt(
|
|
668
|
-
|
|
669
|
-
nil
|
|
668
|
+
def get_scenario_prompt(scenario)
|
|
669
|
+
@results.prompts[scenario]
|
|
670
670
|
end
|
|
671
671
|
end
|
|
672
672
|
end
|
data/lib/qualspec/suite/runner.rb
CHANGED
|
@@ -11,6 +11,10 @@ module Qualspec
|
|
|
11
11
|
@definition = definition.is_a?(String) ? Suite.find(definition) : definition
|
|
12
12
|
@results = Results.new(@definition.name)
|
|
13
13
|
@judge = Qualspec.judge
|
|
14
|
+
|
|
15
|
+
@definition.candidates_list.each do |c|
|
|
16
|
+
@results.candidate_models[c.name] = c.model
|
|
17
|
+
end
|
|
14
18
|
end
|
|
15
19
|
|
|
16
20
|
def run(progress: true)
|
|
@@ -52,6 +56,8 @@ module Qualspec
|
|
|
52
56
|
responses = {}
|
|
53
57
|
errors = {}
|
|
54
58
|
|
|
59
|
+
@results.prompts[scenario.name] ||= scenario.compose_prompt(variant)
|
|
60
|
+
|
|
55
61
|
# Phase 1: Collect all candidate responses
|
|
56
62
|
@definition.candidates_list.each do |candidate|
|
|
57
63
|
log_candidate_progress(candidate, scenario, 'generating') if progress
|
|
@@ -217,7 +223,8 @@ module Qualspec
|
|
|
217
223
|
|
|
218
224
|
# Results container with multi-dimensional support
|
|
219
225
|
class Results
|
|
220
|
-
attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs
|
|
226
|
+
attr_reader :suite_name, :evaluations, :responses, :started_at, :finished_at, :timing, :costs,
|
|
227
|
+
:candidate_models, :prompts
|
|
221
228
|
|
|
222
229
|
def initialize(suite_name)
|
|
223
230
|
@suite_name = suite_name
|
|
@@ -225,6 +232,8 @@ module Qualspec
|
|
|
225
232
|
@responses = {} # Nested: {candidate => {scenario => {variant => {temp => response}}}}
|
|
226
233
|
@timing = {}
|
|
227
234
|
@costs = {}
|
|
235
|
+
@candidate_models = {} # {candidate_name => model_string}
|
|
236
|
+
@prompts = {} # {scenario_name => prompt_string}
|
|
228
237
|
@started_at = Time.now
|
|
229
238
|
@finished_at = nil
|
|
230
239
|
end
|
|
@@ -329,13 +338,15 @@ module Qualspec
|
|
|
329
338
|
def scores_by_scenario
|
|
330
339
|
@evaluations.group_by { |e| e[:scenario] }.transform_values do |evals|
|
|
331
340
|
evals.group_by { |e| e[:candidate] }.transform_values do |candidate_evals|
|
|
332
|
-
|
|
341
|
+
total = candidate_evals.size
|
|
342
|
+
avg_score = (candidate_evals.sum { |e| e[:score] }.to_f / total).round(2)
|
|
343
|
+
first = candidate_evals.first
|
|
333
344
|
{
|
|
334
|
-
score:
|
|
335
|
-
pass:
|
|
336
|
-
reasoning:
|
|
337
|
-
variant:
|
|
338
|
-
temperature:
|
|
345
|
+
score: avg_score,
|
|
346
|
+
pass: candidate_evals.all? { |e| e[:pass] },
|
|
347
|
+
reasoning: first[:reasoning],
|
|
348
|
+
variant: first[:variant],
|
|
349
|
+
temperature: first[:temperature]
|
|
339
350
|
}
|
|
340
351
|
end
|
|
341
352
|
end
|
data/lib/qualspec/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: qualspec
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Stiens
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: faraday
|
|
@@ -62,7 +62,6 @@ executables:
|
|
|
62
62
|
extensions: []
|
|
63
63
|
extra_rdoc_files: []
|
|
64
64
|
files:
|
|
65
|
-
- ".DS_Store"
|
|
66
65
|
- ".qualspec_cassettes/comparison_test.yml"
|
|
67
66
|
- ".qualspec_cassettes/quick_test.yml"
|
|
68
67
|
- ".rspec"
|
data/.DS_Store
DELETED
|
Binary file
|