qualspec 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/.qualspec_cassettes/comparison_test.yml +439 -0
  3. data/.qualspec_cassettes/quick_test.yml +232 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +70 -0
  7. data/CHANGELOG.md +16 -0
  8. data/README.md +84 -0
  9. data/Rakefile +8 -0
  10. data/docs/configuration.md +132 -0
  11. data/docs/evaluation-suites.md +180 -0
  12. data/docs/getting-started.md +102 -0
  13. data/docs/recording.md +196 -0
  14. data/docs/rspec-integration.md +233 -0
  15. data/docs/rubrics.md +174 -0
  16. data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml +65 -0
  17. data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml +64 -0
  18. data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +74 -0
  19. data/examples/cassettes/qualspec_rspec_integration_score_matchers_supports_score_comparisons.yml +65 -0
  20. data/examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml +65 -0
  21. data/examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml +67 -0
  22. data/examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml +67 -0
  23. data/examples/comparison.rb +22 -0
  24. data/examples/model_comparison.rb +38 -0
  25. data/examples/persona_test.rb +49 -0
  26. data/examples/quick_test.rb +28 -0
  27. data/examples/report.html +399 -0
  28. data/examples/rspec_example_spec.rb +153 -0
  29. data/exe/qualspec +142 -0
  30. data/lib/qualspec/builtin_rubrics.rb +83 -0
  31. data/lib/qualspec/client.rb +127 -0
  32. data/lib/qualspec/configuration.rb +32 -0
  33. data/lib/qualspec/evaluation.rb +52 -0
  34. data/lib/qualspec/judge.rb +217 -0
  35. data/lib/qualspec/recorder.rb +55 -0
  36. data/lib/qualspec/rspec/configuration.rb +49 -0
  37. data/lib/qualspec/rspec/evaluation_result.rb +142 -0
  38. data/lib/qualspec/rspec/helpers.rb +155 -0
  39. data/lib/qualspec/rspec/matchers.rb +163 -0
  40. data/lib/qualspec/rspec.rb +66 -0
  41. data/lib/qualspec/rubric.rb +43 -0
  42. data/lib/qualspec/suite/behavior.rb +43 -0
  43. data/lib/qualspec/suite/builtin_behaviors.rb +84 -0
  44. data/lib/qualspec/suite/candidate.rb +30 -0
  45. data/lib/qualspec/suite/dsl.rb +64 -0
  46. data/lib/qualspec/suite/html_reporter.rb +673 -0
  47. data/lib/qualspec/suite/reporter.rb +274 -0
  48. data/lib/qualspec/suite/runner.rb +261 -0
  49. data/lib/qualspec/suite/scenario.rb +57 -0
  50. data/lib/qualspec/version.rb +5 -0
  51. data/lib/qualspec.rb +103 -0
  52. data/sig/qualspec.rbs +4 -0
  53. metadata +142 -0
@@ -0,0 +1,673 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
module Qualspec
  module Suite
    # Renders the outcome of a suite run as a single self-contained HTML page
    # (inline CSS, no external assets).
    #
    # The +results+ object is only read, through these messages:
    # +suite_name+, +scores_by_candidate+, +scores_by_scenario+,
    # +evaluations+, +responses+, +timing+, +timing_by_candidate+ and +costs+.
    # Names, criteria, reasoning, prompts and response text are HTML-escaped
    # before being emitted; numeric statistics are interpolated as-is.
    class HtmlReporter
      # Average scores are compared after rounding to this many decimals so
      # that float noise cannot separate two otherwise equal candidates.
      SCORE_PRECISION = 2

      # @param results [#suite_name, #scores_by_candidate, #evaluations, ...]
      #   the finished suite results to report on (see class comment)
      def initialize(results)
        @results = results
      end

      # Builds the complete report.
      #
      # @return [String] a full HTML document
      def to_html
        <<~HTML
          <!DOCTYPE html>
          <html lang="en">
          <head>
          <meta charset="UTF-8">
          <meta name="viewport" content="width=device-width, initial-scale=1.0">
          <title>#{h(@results.suite_name)} - Qualspec Report</title>
          #{styles}
          </head>
          <body>
          <div class="container">
          #{header_section}
          #{config_section}
          #{summary_section}
          #{performance_section}
          #{detailed_results_section}
          #{responses_section}
          #{winner_section}
          #{footer_section}
          </div>
          </body>
          </html>
        HTML
      end

      # Renders the report and writes it to +path+.
      #
      # @param path [String] destination file
      # @return [Integer] whatever File.write returns (bytes written)
      def write(path)
        File.write(path, to_html)
      end

      private

      # HTML-escapes any value after converting it to a String.
      def h(text)
        CGI.escapeHTML(text.to_s)
      end

      # The inline stylesheet, wrapped in a <style> element.
      def styles
        <<~CSS
          <style>
          :root {
          --bg: #0d1117;
          --card-bg: #161b22;
          --border: #30363d;
          --text: #c9d1d9;
          --text-muted: #8b949e;
          --accent: #58a6ff;
          --success: #3fb950;
          --warning: #d29922;
          --danger: #f85149;
          --purple: #a371f7;
          }

          * { box-sizing: border-box; margin: 0; padding: 0; }

          body {
          font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
          background: var(--bg);
          color: var(--text);
          line-height: 1.6;
          padding: 2rem;
          }

          .container { max-width: 1400px; margin: 0 auto; }

          header {
          text-align: center;
          margin-bottom: 2rem;
          padding-bottom: 1.5rem;
          border-bottom: 1px solid var(--border);
          }

          header h1 { font-size: 2.5rem; font-weight: 600; margin-bottom: 0.5rem; }
          header .subtitle { color: var(--text-muted); font-size: 0.9rem; }

          .card {
          background: var(--card-bg);
          border: 1px solid var(--border);
          border-radius: 6px;
          padding: 1.5rem;
          margin-bottom: 1.5rem;
          }

          .card h2 {
          font-size: 1.25rem;
          font-weight: 600;
          margin-bottom: 1rem;
          display: flex;
          align-items: center;
          gap: 0.5rem;
          }

          .card h2 .icon { font-size: 1.1rem; }

          .config-grid {
          display: grid;
          grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
          gap: 1rem;
          }

          .config-item { padding: 0.75rem; background: var(--bg); border-radius: 4px; }
          .config-item .label {
          color: var(--text-muted);
          font-size: 0.75rem;
          text-transform: uppercase;
          letter-spacing: 0.05em;
          }
          .config-item .value { font-weight: 500; margin-top: 0.25rem; word-break: break-all; }

          table { width: 100%; border-collapse: collapse; }
          th, td { text-align: left; padding: 0.75rem 1rem; border-bottom: 1px solid var(--border); }
          th {
          color: var(--text-muted);
          font-weight: 500;
          font-size: 0.85rem;
          text-transform: uppercase;
          letter-spacing: 0.05em;
          }
          tr:last-child td { border-bottom: none; }

          .score-bar { display: flex; align-items: center; gap: 0.75rem; }
          .score-bar .bar {
          flex: 1;
          height: 8px;
          background: var(--border);
          border-radius: 4px;
          overflow: hidden;
          max-width: 150px;
          }
          .score-bar .bar .fill { height: 100%; border-radius: 4px; }
          .score-bar .value { font-weight: 600; min-width: 3.5rem; }

          .badge {
          display: inline-block;
          padding: 0.25rem 0.5rem;
          border-radius: 4px;
          font-size: 0.75rem;
          font-weight: 600;
          }
          .badge-success { background: rgba(63, 185, 80, 0.2); color: var(--success); }
          .badge-warning { background: rgba(210, 153, 34, 0.2); color: var(--warning); }
          .badge-danger { background: rgba(248, 81, 73, 0.2); color: var(--danger); }
          .badge-winner { background: rgba(163, 113, 247, 0.2); color: var(--purple); }
          .badge-info { background: rgba(88, 166, 255, 0.2); color: var(--accent); }

          .scenario-card {
          margin-bottom: 1.5rem;
          padding: 1.25rem;
          background: var(--bg);
          border-radius: 6px;
          border: 1px solid var(--border);
          }
          .scenario-card:last-child { margin-bottom: 0; }
          .scenario-header {
          display: flex;
          justify-content: space-between;
          align-items: center;
          margin-bottom: 1rem;
          padding-bottom: 0.75rem;
          border-bottom: 1px solid var(--border);
          }
          .scenario-header h3 { font-size: 1.1rem; font-weight: 600; }
          .scenario-prompt {
          background: var(--card-bg);
          padding: 1rem;
          border-radius: 4px;
          margin-bottom: 1rem;
          border-left: 3px solid var(--accent);
          }
          .scenario-prompt .label {
          color: var(--text-muted);
          font-size: 0.75rem;
          text-transform: uppercase;
          margin-bottom: 0.5rem;
          }

          .eval-grid { display: grid; gap: 1rem; }
          .eval-card {
          background: var(--card-bg);
          border-radius: 6px;
          padding: 1rem;
          border: 1px solid var(--border);
          }
          .eval-card.winner { border-color: var(--purple); box-shadow: 0 0 0 1px var(--purple); }
          .eval-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.75rem; }
          .eval-header .candidate { font-weight: 600; font-size: 1rem; }
          .eval-criteria { margin-top: 0.75rem; padding-top: 0.75rem; border-top: 1px solid var(--border); }
          .eval-criteria .criterion { padding: 0.5rem 0; border-bottom: 1px solid var(--border); }
          .eval-criteria .criterion:last-child { border-bottom: none; }
          .eval-criteria .criterion-text { color: var(--text-muted); font-size: 0.85rem; margin-bottom: 0.25rem; }
          .eval-criteria .reasoning { font-size: 0.9rem; color: var(--text); font-style: italic; }
          .eval-timing { color: var(--text-muted); font-size: 0.8rem; margin-top: 0.5rem; }

          .response-card { margin-bottom: 1rem; }
          .response-card h4 {
          font-size: 0.9rem;
          color: var(--accent);
          margin-bottom: 0.5rem;
          display: flex;
          align-items: center;
          gap: 0.5rem;
          }
          .response-card pre {
          background: var(--bg);
          border: 1px solid var(--border);
          border-radius: 4px;
          padding: 1rem;
          overflow-x: auto;
          font-size: 0.85rem;
          white-space: pre-wrap;
          word-wrap: break-word;
          max-height: 400px;
          overflow-y: auto;
          }

          .winner-box { text-align: center; padding: 2.5rem; }
          .winner-box .crown { font-size: 4rem; margin-bottom: 1rem; }
          .winner-box h2 { font-size: 2rem; margin-bottom: 0.75rem; justify-content: center; }
          .winner-box .stats { color: var(--text-muted); font-size: 1rem; }
          .winner-box .comparison {
          margin-top: 1.5rem;
          padding-top: 1.5rem;
          border-top: 1px solid var(--border);
          font-size: 0.9rem;
          color: var(--text-muted);
          }

          .perf-row {
          display: flex;
          justify-content: space-between;
          align-items: center;
          padding: 0.75rem 0;
          border-bottom: 1px solid var(--border);
          }
          .perf-row:last-child { border-bottom: none; }
          .perf-row .name { font-weight: 500; }
          .perf-row .metrics { display: flex; gap: 2rem; color: var(--text-muted); font-size: 0.9rem; }
          .perf-row .metrics span { display: flex; align-items: center; gap: 0.5rem; }

          .two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; }
          @media (max-width: 900px) { .two-col { grid-template-columns: 1fr; } }

          footer {
          text-align: center;
          padding-top: 1.5rem;
          margin-top: 1rem;
          border-top: 1px solid var(--border);
          color: var(--text-muted);
          font-size: 0.8rem;
          }
          footer a { color: var(--accent); text-decoration: none; }
          footer a:hover { text-decoration: underline; }

          .collapsible { cursor: pointer; user-select: none; }
          .collapsible:hover { opacity: 0.8; }
          .collapsible::after { content: " ▼"; font-size: 0.7rem; }
          .collapsed::after { content: " ▶"; }
          </style>
        CSS
      end

      # Page title plus the generation timestamp (local time of the render).
      def header_section
        <<~HTML
          <header>
          <h1>#{h(@results.suite_name)}</h1>
          <p class="subtitle">Generated #{Time.now.strftime('%B %d, %Y at %H:%M:%S')}</p>
          </header>
        HTML
      end

      # Card showing the judge model and API endpoint from
      # Qualspec.configuration, plus candidate and scenario counts.
      def config_section
        config = Qualspec.configuration
        candidate_count = @results.scores_by_candidate.size

        <<~HTML
          <div class="card">
          <h2><span class="icon">⚙️</span> Configuration</h2>
          <div class="config-grid">
          <div class="config-item">
          <div class="label">Judge Model</div>
          <div class="value">#{h(config.judge_model)}</div>
          </div>
          <div class="config-item">
          <div class="label">API Endpoint</div>
          <div class="value">#{h(config.api_url)}</div>
          </div>
          <div class="config-item">
          <div class="label">Candidates</div>
          <div class="value">#{candidate_count} models</div>
          </div>
          <div class="config-item">
          <div class="label">Scenarios</div>
          <div class="value">#{@results.scores_by_scenario.size} total</div>
          </div>
          </div>
          </div>
        HTML
      end

      # Table of candidates ordered by average score, best first.
      #
      # @return [String] the card HTML, or '' when there are no scores
      def summary_section
        scores = @results.scores_by_candidate
        return '' if scores.empty?

        wins = count_wins
        ranked = rank_by_score(scores)
        # The WINNER badge is only awarded when exactly one candidate holds the
        # top (rounded) score. Decided once here instead of once per row.
        sole_leader = top_scorers(ranked).size == 1

        rows = ranked.each_with_index.map do |(candidate, stats), index|
          # ranked is best-first, so a sole leader is always the first row.
          winner_badge = sole_leader && index.zero? ? '<span class="badge badge-winner">WINNER</span>' : ''
          model = get_candidate_model(candidate)

          <<~ROW
            <tr>
            <td>
            <strong>#{h(candidate)}</strong> #{winner_badge}<br>
            <span style="color: var(--text-muted); font-size: 0.8rem;">#{h(model)}</span>
            </td>
            <td>#{score_bar(stats[:avg_score])}</td>
            <td style="text-align: center;">#{wins[candidate] || 0}</td>
            <td>#{pass_rate_badge(stats[:pass_rate])}</td>
            </tr>
          ROW
        end.join

        <<~HTML
          <div class="card">
          <h2><span class="icon">📊</span> Summary</h2>
          <table>
          <thead>
          <tr>
          <th>Candidate</th>
          <th>Average Score</th>
          <th style="text-align: center;">Scenario Wins</th>
          <th>Pass Rate</th>
          </tr>
          </thead>
          <tbody>
          #{rows}
          </tbody>
          </table>
          </div>
        HTML
      end

      # One row per candidate with average/total response time and cost,
      # ordered fastest first.
      #
      # @return [String] the card HTML, or '' when no timing was recorded
      def performance_section
        timing = @results.timing_by_candidate
        return '' if timing.empty?

        costs = @results.costs || {}
        sorted = timing.sort_by { |_, stats| stats[:avg_ms] }
        fastest_ms = sorted.first[1][:avg_ms]

        rows = sorted.map do |candidate, stats|
          cost = costs[candidate]
          cost_str = cost&.positive? ? "$#{format_cost(cost)}" : '-'

          <<~ROW
            <div class="perf-row">
            <span class="name">#{h(candidate)} #{speed_badge(stats[:avg_ms], fastest_ms)}</span>
            <div class="metrics">
            <span>⏱️ #{format_duration(stats[:avg_ms])} avg</span>
            <span>📊 #{format_duration(stats[:total_ms])} total</span>
            <span>💰 #{cost_str}</span>
            </div>
            </div>
          ROW
        end.join

        <<~HTML
          <div class="card">
          <h2><span class="icon">⚡</span> Performance</h2>
          #{rows}
          </div>
        HTML
      end

      # Per-scenario breakdown: one card per scenario containing one
      # evaluation card per candidate that has a score for it.
      #
      # @return [String] the card HTML, or '' when there are no scenarios
      def detailed_results_section
        by_scenario = @results.scores_by_scenario
        return '' if by_scenario.empty?

        candidates = @results.scores_by_candidate.keys
        # Group once up front instead of re-scanning every evaluation for each
        # scenario and candidate.
        evals_by_scenario = @results.evaluations.group_by { |evaluation| evaluation[:scenario] }

        scenarios = by_scenario.map do |scenario, candidate_scores|
          scenario_results_html(scenario, candidate_scores, candidates, evals_by_scenario.fetch(scenario, []))
        end.join

        <<~HTML
          <div class="card">
          <h2><span class="icon">🎯</span> Detailed Results by Scenario</h2>
          #{scenarios}
          </div>
        HTML
      end

      # HTML for one scenario in the detailed results.
      def scenario_results_html(scenario, candidate_scores, candidates, scenario_evals)
        winner = winner_among(scenario_evals)
        prompt = get_scenario_prompt(scenario)
        prompt_html = prompt ? %(<div class="scenario-prompt"><div class="label">Prompt</div>#{h(prompt)}</div>) : ''
        evals_by_candidate = scenario_evals.group_by { |evaluation| evaluation[:candidate] }

        eval_cards = candidates.map do |candidate|
          stats = candidate_scores[candidate]
          next unless stats

          eval_card_html(candidate, stats, scenario, winner, evals_by_candidate.fetch(candidate, []))
        end.compact.join

        <<~SCENARIO
          <div class="scenario-card">
          <div class="scenario-header">
          <h3>#{h(scenario)} #{scenario_winner_label(winner)}</h3>
          </div>
          #{prompt_html}
          <div class="eval-grid">
          #{eval_cards}
          </div>
          </div>
        SCENARIO
      end

      # Badge shown next to a scenario title for a winner, a tie or neither.
      def scenario_winner_label(winner)
        case winner
        when :tie then '<span class="badge badge-warning">TIE</span>'
        when nil then ''
        else '<span class="badge badge-winner">WINNER</span>'
        end
      end

      # HTML for one candidate's result within a scenario: overall score,
      # each judged criterion and, when recorded, the response time.
      def eval_card_html(candidate, stats, scenario, winner, candidate_evals)
        is_winner = winner == candidate
        timing_info = format_scenario_timing(candidate, scenario)
        timing_html = timing_info ? %(<div class="eval-timing">Response time: #{timing_info}</div>) : ''
        criteria_html = candidate_evals.map { |evaluation| criterion_html(evaluation) }.join

        <<~CARD
          <div class="eval-card#{is_winner ? ' winner' : ''}">
          <div class="eval-header">
          <span class="candidate">#{h(candidate)} #{is_winner ? '⭐' : ''}</span>
          #{score_bar(stats[:score])}
          </div>
          <div class="eval-criteria">
          #{criteria_html}
          </div>
          #{timing_html}
          </div>
        CARD
      end

      # HTML for a single judged criterion: text, score, PASS/FAIL and the
      # judge's reasoning when present.
      def criterion_html(evaluation)
        reasoning = evaluation[:reasoning]
        reasoning_html = reasoning ? %(<div class="reasoning">"#{h(reasoning)}"</div>) : ''

        <<~CRITERION
          <div class="criterion">
          <div class="criterion-text">#{h(evaluation[:criterion])}</div>
          <div style="display: flex; align-items: center; gap: 0.5rem; margin-top: 0.25rem;">
          #{score_bar(evaluation[:score])}
          #{pass_fail_badge(evaluation[:pass])}
          </div>
          #{reasoning_html}
          </div>
        CRITERION
      end

      # PASS badge for a truthy value, FAIL badge otherwise.
      def pass_fail_badge(passed)
        if passed
          '<span class="badge badge-success">PASS</span>'
        else
          '<span class="badge badge-danger">FAIL</span>'
        end
      end

      # Full response text of every candidate, grouped by scenario.
      #
      # @return [String] the card HTML, or '' when no responses were recorded
      def responses_section
        responses = @results.responses
        return '' if responses.empty?

        # Union of scenarios over all candidates: taking only the first
        # candidate's keys would hide scenarios that candidate has no
        # response for.
        scenarios = responses.values.compact.flat_map(&:keys).uniq
        scenario_blocks = scenarios.map { |scenario| scenario_responses_html(scenario, responses) }.join

        <<~HTML
          <div class="card">
          <h2><span class="icon">💬</span> Full Responses</h2>
          #{scenario_blocks}
          </div>
        HTML
      end

      # HTML for every candidate's response to one scenario.
      def scenario_responses_html(scenario, responses)
        response_cards = responses.map do |candidate, candidate_responses|
          response = candidate_responses && candidate_responses[scenario]
          next unless response

          <<~CARD
            <div class="response-card">
            <h4>#{h(candidate)}</h4>
            <pre>#{h(response.to_s.strip)}</pre>
            </div>
          CARD
        end.compact

        # Side-by-side layout only when exactly two responses are compared.
        grid_class = response_cards.size == 2 ? 'two-col' : ''

        <<~SCENARIO
          <div class="scenario-card">
          <div class="scenario-header">
          <h3>#{h(scenario)}</h3>
          </div>
          <div class="#{grid_class}">
          #{response_cards.join}
          </div>
          </div>
        SCENARIO
      end

      # Closing banner: the overall winner, a tie announcement, or the only
      # candidate, followed by a speed comparison when one is available.
      #
      # @return [String] the card HTML, or '' when there are no scores
      def winner_section
        scores = @results.scores_by_candidate
        return '' if scores.empty?

        ranked = rank_by_score(scores)
        # Same rounded comparison as the summary table, so both sections always
        # agree on whether there is a single winner.
        leaders = top_scorers(ranked)

        content = if ranked.size == 1
                    sole_candidate_html(ranked.first)
                  elsif leaders.size > 1
                    tie_html(leaders)
                  else
                    clear_winner_html(ranked[0], ranked[1])
                  end

        <<~HTML
          <div class="card winner-box">
          #{content}
          #{speed_note}
          </div>
        HTML
      end

      # Banner content when only one candidate was evaluated.
      def sole_candidate_html(entry)
        name, stats = entry

        <<~HTML
          <div class="crown">🏆</div>
          <h2>#{h(name)}</h2>
          <p class="stats">#{stats[:avg_score]}/10 average score</p>
        HTML
      end

      # Banner content when several candidates share the top score.
      def tie_html(leaders)
        # Candidate names are escaped here like everywhere else in the report.
        names = leaders.map { |name, _| h(name) }.join(' vs ')
        shared_score = leaders.first[1][:avg_score].round(SCORE_PRECISION)

        <<~HTML
          <div class="crown">🤝</div>
          <h2>It's a Tie!</h2>
          <p class="stats">#{names} tied at #{shared_score}/10</p>
        HTML
      end

      # Banner content when one candidate leads outright.
      def clear_winner_html(winner, runner_up)
        name, stats = winner
        margin = (stats[:avg_score] - runner_up[1][:avg_score]).round(2)
        win_count = count_wins[name] || 0

        <<~HTML
          <div class="crown">👑</div>
          <h2>#{h(name)} Wins!</h2>
          <p class="stats">
          #{stats[:avg_score]}/10 avg &nbsp;•&nbsp;
          #{win_count} scenario wins &nbsp;•&nbsp;
          #{stats[:pass_rate]}% pass rate
          </p>
          <p class="comparison">Beat #{h(runner_up[0])} by #{margin} points</p>
        HTML
      end

      # "X was N.Nx faster than Y" line for the winner banner.
      #
      # @return [String] '' when fewer than two candidates were timed, when the
      #   fastest and slowest are the same candidate, or when no ratio can be
      #   computed
      def speed_note
        timing = @results.timing_by_candidate
        return '' unless timing.size > 1

        fastest = timing.min_by { |_, stats| stats[:avg_ms] }
        slowest = timing.max_by { |_, stats| stats[:avg_ms] }
        return '' if fastest[0] == slowest[0]

        ratio = slowdown_ratio(slowest[1][:avg_ms], fastest[1][:avg_ms])
        return '' unless ratio

        <<~HTML
          <p class="comparison">
          ⚡ #{h(fastest[0])} was #{ratio}x faster than #{h(slowest[0])}
          </p>
        HTML
      end

      # Attribution line with the gem version.
      def footer_section
        <<~HTML
          <footer>
          Generated by <a href="https://github.com/estiens/qualspec">Qualspec</a> v#{Qualspec::VERSION}
          </footer>
        HTML
      end

      # A horizontal bar plus "score/10" label.
      #
      # A nil score (an evaluation with no score) renders an empty bar labelled
      # "N/A" instead of raising, and the bar width is kept within 0-100%.
      #
      # @param score [Numeric, nil] score on a 0-10 scale
      # @return [String] HTML fragment
      def score_bar(score)
        value = score.to_f
        percentage = [[(value / 10 * 100).round, 0].max, 100].min
        color = if value >= 8
                  'var(--success)'
                elsif value >= 6
                  'var(--warning)'
                else
                  'var(--danger)'
                end
        label = score.nil? ? 'N/A' : "#{h(score)}/10"

        <<~HTML
          <div class="score-bar">
          <div class="bar">
          <div class="fill" style="width: #{percentage}%; background: #{color};"></div>
          </div>
          <span class="value">#{label}</span>
          </div>
        HTML
      end

      # Colour-coded pass-rate badge: green from 80, amber from 50, red below.
      #
      # @param rate [Numeric] pass rate as a percentage
      def pass_rate_badge(rate)
        badge_class = if rate >= 80
                        'badge-success'
                      elsif rate >= 50
                        'badge-warning'
                      else
                        'badge-danger'
                      end

        %(<span class="badge #{badge_class}">#{rate}%</span>)
      end

      # Number of evaluations each candidate is flagged as the winner of.
      #
      # @return [Hash] candidate => count; unknown candidates read as 0
      def count_wins
        @results.evaluations.each_with_object(Hash.new(0)) do |evaluation, wins|
          # Compare against true explicitly: :tie is also truthy.
          wins[evaluation[:candidate]] += 1 if evaluation[:winner] == true
        end
      end

      # Winner of one scenario.
      #
      # @return [Object, :tie, nil] the winning candidate, :tie, or nil when
      #   no evaluation of the scenario is marked either way
      def find_scenario_winner(scenario)
        winner_among(@results.evaluations.select { |evaluation| evaluation[:scenario] == scenario })
      end

      # Winner among the given evaluations; an outright winner takes
      # precedence over a tie marker.
      def winner_among(evaluations)
        decided = evaluations.find { |evaluation| evaluation[:winner] == true }
        return decided[:candidate] if decided

        evaluations.any? { |evaluation| evaluation[:winner] == :tie } ? :tie : nil
      end

      # Formatted response time of one candidate on one scenario, or nil when
      # none was recorded.
      def format_scenario_timing(candidate, scenario)
        duration = @results.timing.dig(candidate, scenario)
        return nil unless duration

        format_duration(duration)
      end

      # Human-readable duration: whole milliseconds below one second, seconds
      # with two decimals otherwise.
      #
      # @param milliseconds [Numeric]
      # @return [String] e.g. "250ms" or "1.5s"
      def format_duration(milliseconds)
        # Decide on the rounded value so 999.6 becomes "1.0s", not "1000ms".
        return "#{milliseconds.round}ms" if milliseconds.round < 1000

        "#{(milliseconds / 1000.0).round(2)}s"
      end

      # Cost with four decimals below one cent, two decimals otherwise.
      def format_cost(cost)
        format(cost < 0.01 ? '%.4f' : '%.2f', cost)
      end

      # Model name taken from the candidate's first evaluation, or 'unknown'.
      def get_candidate_model(candidate)
        @results.evaluations.find { |evaluation| evaluation[:candidate] == candidate }&.dig(:model) || 'unknown'
      end

      # Prompts are not stored in the results yet, so there is nothing to show.
      def get_scenario_prompt(_scenario)
        nil
      end

      # Candidates as [name, stats] pairs, highest average score first.
      def rank_by_score(scores)
        scores.sort_by { |_, stats| -stats[:avg_score] }
      end

      # Leading entries of a ranked list whose average score equals the best
      # one after rounding to SCORE_PRECISION decimals.
      def top_scorers(ranked)
        best = ranked.first[1][:avg_score].round(SCORE_PRECISION)
        ranked.take_while { |_, stats| stats[:avg_score].round(SCORE_PRECISION) == best }
      end

      # How many times slower +milliseconds+ is than +baseline+, to one
      # decimal.
      #
      # @return [Float, nil] nil when the baseline is not positive, because the
      #   ratio would be NaN or Infinity
      def slowdown_ratio(milliseconds, baseline)
        return nil unless baseline.to_f.positive?

        (milliseconds.to_f / baseline).round(1)
      end

      # "Fastest" badge for candidates matching the fastest average time, an
      # "N.Nx slower" badge for the rest, or '' when no ratio can be computed.
      def speed_badge(avg_ms, fastest_ms)
        ratio = slowdown_ratio(avg_ms, fastest_ms)
        fastest = ratio ? (ratio - 1.0).abs < 0.01 : avg_ms == fastest_ms

        if fastest
          '<span class="badge badge-success">Fastest</span>'
        elsif ratio
          "<span class=\"badge badge-info\">#{ratio}x slower</span>"
        else
          ''
        end
      end
    end
  end
end