@artemiskit/reports 0.1.6 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,783 @@
1
+ /**
2
+ * HTML Comparison Report Generator
3
+ * Generates a visual comparison between two runs
4
+ */
5
+
6
+ import type { CaseResult, RunManifest } from '@artemiskit/core';
7
+ import Handlebars from 'handlebars';
8
+
9
/** Outcome of a single case within one run. */
type CaseStatus = 'passed' | 'failed';

/** Classification of how a case differs between the baseline and current runs. */
type CaseChangeType = 'regressed' | 'improved' | 'unchanged' | 'new' | 'removed';

/**
 * Side-by-side view of one test case across the baseline and current runs.
 *
 * Fields prefixed with `baseline`/`current` are `null` (or absent, for the raw
 * case objects) when the case does not exist in that run.
 */
export interface CaseComparison {
  /** Identifier used to match the case across the two runs. */
  caseId: string;
  /** Display name; the current run's name is preferred over the baseline's. */
  name?: string;
  /** Baseline outcome, or `null` when the case is missing from the baseline. */
  baselineStatus: CaseStatus | null;
  /** Current outcome, or `null` when the case is missing from the current run. */
  currentStatus: CaseStatus | null;
  /** Baseline score, or `null` when unavailable. */
  baselineScore: number | null;
  /** Current score, or `null` when unavailable. */
  currentScore: number | null;
  /** `currentScore - baselineScore`; `0` when either score is `null`. */
  scoreDelta: number;
  /** Baseline latency, or `null` when unavailable. */
  baselineLatency: number | null;
  /** Current latency, or `null` when unavailable. */
  currentLatency: number | null;
  /** `currentLatency - baselineLatency`; `0` when either latency is `null`. */
  latencyDelta: number;
  /** How the case moved between the two runs. */
  changeType: CaseChangeType;
  /** Raw baseline result, when the case exists in the baseline run. */
  baselineCase?: CaseResult;
  /** Raw current result, when the case exists in the current run. */
  currentCase?: CaseResult;
}
27
+
28
/** Run-level metric differences, each computed as current minus baseline. */
interface MetricDeltas {
  successRateDelta: number;
  medianLatencyDelta: number;
  totalTokensDelta: number;
}

/** Number of cases falling into each change category. */
interface ChangeSummary {
  totalRegressions: number;
  totalImprovements: number;
  totalUnchanged: number;
  casesRemoved: number;
  casesAdded: number;
}

/**
 * Everything the comparison report needs: both run manifests, the run-level
 * metric deltas, the per-case comparisons and the per-category counts.
 */
export interface ComparisonData {
  /** Manifest of the run used as the reference point. */
  baseline: RunManifest;
  /** Manifest of the run being compared against the baseline. */
  current: RunManifest;
  /** Run-level metric deltas. */
  metrics: MetricDeltas;
  /** One entry per case id found in either run. */
  caseComparisons: CaseComparison[];
  /** Counts of cases per change category. */
  summary: ChangeSummary;
}
48
+
49
/**
 * Handlebars source for the standalone comparison report page.
 *
 * The template is rendered with a single root value, `data` (a `ComparisonData`),
 * and relies on the custom helpers registered by `generateCompareHTMLReport`.
 *
 * Notes on non-obvious parts:
 * - Score cells use `includeZero=true` so a legitimate score of 0 is rendered
 *   as "0.0%" instead of being treated as "no score" (Handlebars `#if` is
 *   falsy for 0 by default).
 * - Row click handlers read the case id from the row's `data-id` attribute
 *   rather than interpolating it into inline JavaScript. HTML-escaped quotes
 *   are decoded by the browser before the handler is parsed, so interpolating
 *   an id containing an apostrophe would break (or inject into) the handler.
 */
const COMPARE_HTML_TEMPLATE = `
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Artemis Comparison - {{data.baseline.config.scenario}}</title>
  <style>
    * { margin: 0; padding: 0; box-sizing: border-box; }
    body {
      font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
      line-height: 1.6;
      color: #333;
      background: #f5f5f5;
      padding: 2rem;
    }
    .container { max-width: 1400px; margin: 0 auto; }
    h1 { margin-bottom: 0.5rem; color: #1a1a1a; }
    h2 { margin: 2rem 0 1rem; color: #333; border-bottom: 2px solid #e0e0e0; padding-bottom: 0.5rem; }
    h3 { margin: 1rem 0 0.5rem; color: #555; font-size: 1rem; }
    .meta { color: #666; margin-bottom: 2rem; }

    /* Summary Cards */
    .summary-grid {
      display: grid;
      grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
      gap: 1rem;
      margin-bottom: 2rem;
    }
    .card {
      background: white;
      padding: 1.5rem;
      border-radius: 8px;
      box-shadow: 0 1px 3px rgba(0,0,0,0.1);
    }
    .card h3 { font-size: 0.875rem; color: #666; margin-bottom: 0.75rem; }
    .card .compare-row {
      display: flex;
      justify-content: space-between;
      align-items: center;
      margin-bottom: 0.5rem;
    }
    .card .label { color: #888; font-size: 0.875rem; }
    .card .value { font-size: 1.25rem; font-weight: 600; }
    .card .delta {
      padding: 0.25rem 0.5rem;
      border-radius: 4px;
      font-size: 0.875rem;
      font-weight: 600;
    }
    .delta.positive { background: #dcfce7; color: #166534; }
    .delta.negative { background: #fee2e2; color: #991b1b; }
    .delta.neutral { background: #f3f4f6; color: #6b7280; }

    /* Comparison Stats */
    .stats-row {
      display: flex;
      gap: 1rem;
      flex-wrap: wrap;
      margin-bottom: 1.5rem;
    }
    .stat-badge {
      padding: 0.5rem 1rem;
      border-radius: 6px;
      font-weight: 600;
      font-size: 0.875rem;
    }
    .stat-badge.regressions { background: #fee2e2; color: #991b1b; }
    .stat-badge.improvements { background: #dcfce7; color: #166534; }
    .stat-badge.unchanged { background: #f3f4f6; color: #6b7280; }
    .stat-badge.added { background: #dbeafe; color: #1e40af; }
    .stat-badge.removed { background: #fef3c7; color: #92400e; }

    /* Table Styles */
    table { width: 100%; border-collapse: collapse; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
    th, td { padding: 0.75rem 1rem; text-align: left; border-bottom: 1px solid #e0e0e0; }
    th { background: #f9fafb; font-weight: 600; font-size: 0.875rem; }
    tr:last-child td { border-bottom: none; }

    /* Status indicators */
    .status { display: inline-block; padding: 0.125rem 0.5rem; border-radius: 9999px; font-size: 0.75rem; font-weight: 500; }
    .status.passed { background: #dcfce7; color: #166534; }
    .status.failed { background: #fee2e2; color: #991b1b; }
    .status.na { background: #f3f4f6; color: #9ca3af; }

    /* Change type badges */
    .change-type {
      display: inline-flex;
      align-items: center;
      gap: 0.25rem;
      padding: 0.25rem 0.5rem;
      border-radius: 4px;
      font-size: 0.75rem;
      font-weight: 600;
    }
    .change-type.regressed { background: #fee2e2; color: #991b1b; }
    .change-type.improved { background: #dcfce7; color: #166534; }
    .change-type.unchanged { background: #f3f4f6; color: #6b7280; }
    .change-type.new { background: #dbeafe; color: #1e40af; }
    .change-type.removed { background: #fef3c7; color: #92400e; }

    /* Arrow indicators */
    .arrow { font-size: 1rem; }
    .arrow.up { color: #22c55e; }
    .arrow.down { color: #ef4444; }
    .arrow.same { color: #9ca3af; }

    /* Score comparison */
    .score-compare {
      font-family: monospace;
      font-size: 0.875rem;
    }
    .score-compare .old { color: #9ca3af; text-decoration: line-through; margin-right: 0.25rem; }
    .score-compare .new { font-weight: 600; }

    /* Details section */
    .details { margin-top: 0.5rem; padding: 1rem; background: #f9fafb; border-radius: 4px; font-size: 0.875rem; }
    .details pre { white-space: pre-wrap; word-break: break-word; margin: 0.5rem 0; }
    .details-grid {
      display: grid;
      grid-template-columns: 1fr 1fr;
      gap: 1rem;
    }
    .details-col { }
    .details-col h4 { font-size: 0.75rem; color: #666; margin-bottom: 0.5rem; text-transform: uppercase; }

    .expandable { cursor: pointer; }
    .expandable:hover { background: #f0f0f0; }
    .hidden { display: none; }

    /* Filter controls */
    .controls {
      display: flex;
      gap: 1rem;
      margin-bottom: 1rem;
      flex-wrap: wrap;
      align-items: center;
    }
    .filter-group { display: flex; gap: 0.5rem; flex-wrap: wrap; }
    .filter-btn {
      padding: 0.5rem 1rem;
      border: 1px solid #e0e0e0;
      background: white;
      border-radius: 6px;
      cursor: pointer;
      font-size: 0.875rem;
      font-weight: 500;
      transition: all 0.15s ease;
    }
    .filter-btn:hover { background: #f5f5f5; }
    .filter-btn.active { background: #1a1a1a; color: white; border-color: #1a1a1a; }
    .filter-btn.regressed.active { background: #991b1b; border-color: #991b1b; }
    .filter-btn.improved.active { background: #166534; border-color: #166534; }

    .search-box { flex: 1; min-width: 200px; max-width: 400px; }
    .search-input {
      width: 100%;
      padding: 0.5rem 1rem;
      border: 1px solid #e0e0e0;
      border-radius: 6px;
      font-size: 0.875rem;
      outline: none;
    }
    .search-input:focus { border-color: #3b82f6; box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.1); }

    .results-count { font-size: 0.875rem; color: #666; padding: 0.5rem 0; }

    /* No results message */
    .no-results {
      text-align: center;
      padding: 2rem;
      color: #666;
      background: white;
      border-radius: 8px;
      box-shadow: 0 1px 3px rgba(0,0,0,0.1);
    }
    .no-results .icon { font-size: 2rem; margin-bottom: 0.5rem; }

    /* Run info */
    .run-info {
      display: grid;
      grid-template-columns: 1fr 1fr;
      gap: 2rem;
      margin-bottom: 2rem;
    }
    .run-box {
      background: white;
      padding: 1rem 1.5rem;
      border-radius: 8px;
      box-shadow: 0 1px 3px rgba(0,0,0,0.1);
    }
    .run-box h3 {
      font-size: 0.75rem;
      text-transform: uppercase;
      color: #666;
      margin-bottom: 0.5rem;
    }
    .run-box .run-id { font-family: monospace; font-size: 0.875rem; color: #333; }
    .run-box .run-meta { font-size: 0.875rem; color: #666; margin-top: 0.25rem; }

    footer { margin-top: 3rem; text-align: center; color: #666; font-size: 0.875rem; }

    @media (max-width: 768px) {
      .run-info, .details-grid { grid-template-columns: 1fr; }
    }
  </style>
</head>
<body>
  <div class="container">
    <h1>Run Comparison</h1>
    <p class="meta">{{data.baseline.config.scenario}} | Comparing two evaluation runs</p>

    <!-- Run Info -->
    <div class="run-info">
      <div class="run-box">
        <h3>&#x1F4C4; Baseline Run</h3>
        <div class="run-id">{{data.baseline.run_id}}</div>
        <div class="run-meta">{{formatDate data.baseline.start_time}}</div>
      </div>
      <div class="run-box">
        <h3>&#x1F4CB; Current Run</h3>
        <div class="run-id">{{data.current.run_id}}</div>
        <div class="run-meta">{{formatDate data.current.start_time}}</div>
      </div>
    </div>

    <!-- Metrics Comparison -->
    <h2>Metrics Overview</h2>
    <div class="summary-grid">
      <div class="card">
        <h3>Success Rate</h3>
        <div class="compare-row">
          <span class="label">Baseline</span>
          <span class="value">{{formatPercent data.baseline.metrics.success_rate}}</span>
        </div>
        <div class="compare-row">
          <span class="label">Current</span>
          <span class="value">{{formatPercent data.current.metrics.success_rate}}</span>
        </div>
        <div class="compare-row">
          <span class="label">Delta</span>
          <span class="delta {{deltaClass data.metrics.successRateDelta}}">
            {{formatDelta data.metrics.successRateDelta}}
          </span>
        </div>
      </div>

      <div class="card">
        <h3>Median Latency</h3>
        <div class="compare-row">
          <span class="label">Baseline</span>
          <span class="value">{{data.baseline.metrics.median_latency_ms}}ms</span>
        </div>
        <div class="compare-row">
          <span class="label">Current</span>
          <span class="value">{{data.current.metrics.median_latency_ms}}ms</span>
        </div>
        <div class="compare-row">
          <span class="label">Delta</span>
          <span class="delta {{latencyDeltaClass data.metrics.medianLatencyDelta}}">
            {{formatLatencyDelta data.metrics.medianLatencyDelta}}
          </span>
        </div>
      </div>

      <div class="card">
        <h3>Total Tokens</h3>
        <div class="compare-row">
          <span class="label">Baseline</span>
          <span class="value">{{formatNumber data.baseline.metrics.total_tokens}}</span>
        </div>
        <div class="compare-row">
          <span class="label">Current</span>
          <span class="value">{{formatNumber data.current.metrics.total_tokens}}</span>
        </div>
        <div class="compare-row">
          <span class="label">Delta</span>
          <span class="delta {{tokenDeltaClass data.metrics.totalTokensDelta}}">
            {{formatTokenDelta data.metrics.totalTokensDelta}}
          </span>
        </div>
      </div>

      <div class="card">
        <h3>Test Cases</h3>
        <div class="compare-row">
          <span class="label">Baseline</span>
          <span class="value">{{data.baseline.metrics.passed_cases}}/{{data.baseline.metrics.total_cases}}</span>
        </div>
        <div class="compare-row">
          <span class="label">Current</span>
          <span class="value">{{data.current.metrics.passed_cases}}/{{data.current.metrics.total_cases}}</span>
        </div>
        <div class="compare-row">
          <span class="label">Pass Delta</span>
          <span class="delta {{passedDeltaClass data.baseline.metrics.passed_cases data.current.metrics.passed_cases}}">
            {{formatPassedDelta data.baseline.metrics.passed_cases data.current.metrics.passed_cases}}
          </span>
        </div>
      </div>
    </div>

    <!-- Change Summary -->
    <h2>Change Summary</h2>
    <div class="stats-row">
      {{#if data.summary.totalRegressions}}
      <span class="stat-badge regressions">&#x26A0; {{data.summary.totalRegressions}} Regression{{#if (gt data.summary.totalRegressions 1)}}s{{/if}}</span>
      {{/if}}
      {{#if data.summary.totalImprovements}}
      <span class="stat-badge improvements">&#x2705; {{data.summary.totalImprovements}} Improvement{{#if (gt data.summary.totalImprovements 1)}}s{{/if}}</span>
      {{/if}}
      {{#if data.summary.totalUnchanged}}
      <span class="stat-badge unchanged">&#x2796; {{data.summary.totalUnchanged}} Unchanged</span>
      {{/if}}
      {{#if data.summary.casesAdded}}
      <span class="stat-badge added">&#x2795; {{data.summary.casesAdded}} New</span>
      {{/if}}
      {{#if data.summary.casesRemoved}}
      <span class="stat-badge removed">&#x2796; {{data.summary.casesRemoved}} Removed</span>
      {{/if}}
    </div>

    <!-- Case Comparison Table -->
    <h2>Case Comparison</h2>

    <div class="controls">
      <div class="filter-group">
        <button class="filter-btn active" data-filter="all" onclick="filterCases('all')">All ({{data.caseComparisons.length}})</button>
        <button class="filter-btn regressed" data-filter="regressed" onclick="filterCases('regressed')">Regressed ({{data.summary.totalRegressions}})</button>
        <button class="filter-btn improved" data-filter="improved" onclick="filterCases('improved')">Improved ({{data.summary.totalImprovements}})</button>
        <button class="filter-btn" data-filter="unchanged" onclick="filterCases('unchanged')">Unchanged ({{data.summary.totalUnchanged}})</button>
      </div>
      <div class="search-box">
        <input type="text" class="search-input" id="search-input" placeholder="Search by ID or name..." oninput="searchCases(this.value)">
      </div>
    </div>
    <div class="results-count" id="results-count">Showing all {{data.caseComparisons.length}} cases</div>

    <table id="cases-table">
      <thead>
        <tr>
          <th>Case</th>
          <th>Baseline</th>
          <th>Current</th>
          <th>Change</th>
          <th>Score</th>
          <th>Latency</th>
        </tr>
      </thead>
      <tbody>
        {{#each data.caseComparisons}}
        <tr class="expandable case-row" data-change="{{changeType}}" data-id="{{caseId}}" data-name="{{name}}" onclick="toggleDetails(this.getAttribute('data-id'))">
          <td>
            <strong>{{caseId}}</strong>
            {{#if name}}<br><small>{{name}}</small>{{/if}}
          </td>
          <td>
            {{#if baselineStatus}}
            <span class="status {{baselineStatus}}">{{uppercase baselineStatus}}</span>
            {{else}}
            <span class="status na">N/A</span>
            {{/if}}
          </td>
          <td>
            {{#if currentStatus}}
            <span class="status {{currentStatus}}">{{uppercase currentStatus}}</span>
            {{else}}
            <span class="status na">N/A</span>
            {{/if}}
          </td>
          <td>
            <span class="change-type {{changeType}}">
              {{changeTypeIcon changeType}} {{changeTypeLabel changeType}}
            </span>
          </td>
          <td>
            <span class="score-compare">
              {{#if baselineScore includeZero=true}}<span class="old">{{formatPercent baselineScore}}</span>{{/if}}
              {{#if currentScore includeZero=true}}<span class="new">{{formatPercent currentScore}}</span>{{else}}{{#if baselineScore includeZero=true}}{{else}}<span class="new">-</span>{{/if}}{{/if}}
            </span>
          </td>
          <td>
            {{#if latencyDelta}}
            <span class="arrow {{latencyArrow latencyDelta}}">{{latencyArrowIcon latencyDelta}}</span>
            {{formatLatencyDelta latencyDelta}}
            {{else}}
            -
            {{/if}}
          </td>
        </tr>
        <tr id="details-{{caseId}}" class="hidden details-row" data-parent="{{caseId}}">
          <td colspan="6">
            <div class="details">
              <div class="details-grid">
                <div class="details-col">
                  <h4>Baseline Response</h4>
                  {{#if baselineCase}}
                  <pre>{{baselineCase.response}}</pre>
                  <p><strong>Reason:</strong> {{baselineCase.reason}}</p>
                  {{else}}
                  <p><em>No baseline data</em></p>
                  {{/if}}
                </div>
                <div class="details-col">
                  <h4>Current Response</h4>
                  {{#if currentCase}}
                  <pre>{{currentCase.response}}</pre>
                  <p><strong>Reason:</strong> {{currentCase.reason}}</p>
                  {{else}}
                  <p><em>No current data</em></p>
                  {{/if}}
                </div>
              </div>
            </div>
          </td>
        </tr>
        {{/each}}
      </tbody>
    </table>

    <div class="no-results hidden" id="no-results">
      <div class="icon">&#128269;</div>
      <p>No cases match your filter or search criteria.</p>
    </div>

    <footer>
      Generated by Artemis Agent Reliability Toolkit
    </footer>
  </div>

  <script>
    let currentFilter = 'all';
    let currentSearch = '';

    function toggleDetails(id) {
      const details = document.getElementById('details-' + id);
      if (details) {
        details.classList.toggle('hidden');
      }
    }

    function filterCases(filter) {
      currentFilter = filter;
      document.querySelectorAll('.filter-btn').forEach(btn => {
        btn.classList.remove('active');
        if (btn.getAttribute('data-filter') === filter) {
          btn.classList.add('active');
        }
      });
      applyFilters();
    }

    function searchCases(query) {
      currentSearch = query.toLowerCase().trim();
      applyFilters();
    }

    function applyFilters() {
      const rows = document.querySelectorAll('.case-row');
      const table = document.getElementById('cases-table');
      const noResults = document.getElementById('no-results');
      let visibleCount = 0;

      rows.forEach(row => {
        const change = row.getAttribute('data-change');
        const id = (row.getAttribute('data-id') || '').toLowerCase();
        const name = (row.getAttribute('data-name') || '').toLowerCase();
        const detailsRow = document.getElementById('details-' + row.getAttribute('data-id'));

        const passesFilter = currentFilter === 'all' || change === currentFilter;
        const passesSearch = !currentSearch || id.includes(currentSearch) || name.includes(currentSearch);

        const shouldShow = passesFilter && passesSearch;
        row.classList.toggle('hidden', !shouldShow);

        if (!shouldShow && detailsRow) {
          detailsRow.classList.add('hidden');
        }

        if (shouldShow) visibleCount++;
      });

      const totalCases = rows.length;
      const resultsText = document.getElementById('results-count');
      if (currentFilter === 'all' && !currentSearch) {
        resultsText.textContent = 'Showing all ' + totalCases + ' cases';
      } else {
        resultsText.textContent = 'Showing ' + visibleCount + ' of ' + totalCases + ' cases';
      }

      if (visibleCount === 0) {
        table.classList.add('hidden');
        noResults.classList.remove('hidden');
      } else {
        table.classList.remove('hidden');
        noResults.classList.add('hidden');
      }
    }
  </script>
</body>
</html>
`;
550
+
551
/**
 * Pairs up the cases of two runs by id and summarises how they differ.
 *
 * Every case id found in either manifest yields one `CaseComparison`:
 * - only in `current`  -> `'new'`
 * - only in `baseline` -> `'removed'`
 * - passed -> failed   -> `'regressed'`
 * - failed -> passed   -> `'improved'`
 * - anything else      -> `'unchanged'`
 *
 * Score and latency deltas are `current - baseline`, or `0` when either side
 * has no value. The resulting list is ordered regressed, improved, new,
 * removed, unchanged.
 *
 * @param baseline - Manifest of the reference run.
 * @param current - Manifest of the run being evaluated against the baseline.
 * @returns Both manifests together with run-level metric deltas, per-case
 *   comparisons and per-category counts.
 */
export function buildComparisonData(baseline: RunManifest, current: RunManifest): ComparisonData {
  // Index a run's cases by id; a repeated id keeps the last occurrence.
  const indexById = (cases: Iterable<CaseResult>): Map<string, CaseResult> => {
    const byId = new Map<string, CaseResult>();
    for (const entry of cases) {
      byId.set(entry.id, entry);
    }
    return byId;
  };

  // Translate a (possibly missing) case result into its display status.
  const statusOf = (result: CaseResult | undefined): 'passed' | 'failed' | null => {
    if (!result) return null;
    return result.ok ? 'passed' : 'failed';
  };

  const baselineById = indexById(baseline.cases);
  const currentById = indexById(current.cases);

  const summary = {
    totalRegressions: 0,
    totalImprovements: 0,
    totalUnchanged: 0,
    casesRemoved: 0,
    casesAdded: 0,
  };
  const caseComparisons: CaseComparison[] = [];

  // Baseline ids come first, followed by ids that only exist in the current run.
  const everyCaseId = new Set([...baselineById.keys(), ...currentById.keys()]);

  everyCaseId.forEach((caseId) => {
    const baselineCase = baselineById.get(caseId);
    const currentCase = currentById.get(caseId);

    const baselineStatus = statusOf(baselineCase);
    const currentStatus = statusOf(currentCase);
    const baselineScore = baselineCase?.score ?? null;
    const currentScore = currentCase?.score ?? null;
    const baselineLatency = baselineCase?.latencyMs ?? null;
    const currentLatency = currentCase?.latencyMs ?? null;

    // A delta is only meaningful when both runs report a value.
    const scoreDelta =
      baselineScore === null || currentScore === null ? 0 : currentScore - baselineScore;
    const latencyDelta =
      baselineLatency === null || currentLatency === null ? 0 : currentLatency - baselineLatency;

    let changeType: CaseComparison['changeType'] = 'unchanged';
    if (!baselineCase) {
      changeType = 'new';
    } else if (!currentCase) {
      changeType = 'removed';
    } else if (baselineStatus === 'passed' && currentStatus === 'failed') {
      changeType = 'regressed';
    } else if (baselineStatus === 'failed' && currentStatus === 'passed') {
      changeType = 'improved';
    }

    switch (changeType) {
      case 'new':
        summary.casesAdded += 1;
        break;
      case 'removed':
        summary.casesRemoved += 1;
        break;
      case 'regressed':
        summary.totalRegressions += 1;
        break;
      case 'improved':
        summary.totalImprovements += 1;
        break;
      default:
        summary.totalUnchanged += 1;
        break;
    }

    caseComparisons.push({
      caseId,
      name: currentCase?.name || baselineCase?.name,
      baselineStatus,
      currentStatus,
      baselineScore,
      currentScore,
      scoreDelta,
      baselineLatency,
      currentLatency,
      latencyDelta,
      changeType,
      baselineCase,
      currentCase,
    });
  });

  // Display order: regressions, improvements, new, removed, and finally unchanged.
  const displayRank = { regressed: 0, improved: 1, new: 2, removed: 3, unchanged: 4 };
  caseComparisons.sort(
    (left, right) => displayRank[left.changeType] - displayRank[right.changeType],
  );

  return {
    baseline,
    current,
    metrics: {
      successRateDelta: current.metrics.success_rate - baseline.metrics.success_rate,
      medianLatencyDelta: current.metrics.median_latency_ms - baseline.metrics.median_latency_ms,
      totalTokensDelta: current.metrics.total_tokens - baseline.metrics.total_tokens,
    },
    caseComparisons,
    summary,
  };
}
657
+
658
/**
 * Compiled comparison template, created on first use and reused afterwards so
 * the large template source is not recompiled for every report.
 */
let compiledCompareTemplate: ReturnType<typeof Handlebars.compile> | undefined;

/**
 * Registers the formatting and classification helpers used by the comparison
 * template on the given Handlebars environment.
 *
 * @param env - Handlebars environment that will compile the comparison template.
 */
function registerCompareHelpers(env: typeof Handlebars): void {
  // Fractions (0..1) rendered as percentages with one decimal; '-' when absent.
  env.registerHelper('formatPercent', (value: number | null) => {
    if (value === null || value === undefined) return '-';
    return `${(value * 100).toFixed(1)}%`;
  });

  env.registerHelper('formatNumber', (value: number) => {
    return value?.toLocaleString() ?? '-';
  });

  // Missing or unparseable timestamps use the same '-' placeholder as the
  // other formatters instead of printing "Invalid Date".
  env.registerHelper('formatDate', (value: string) => {
    const parsed = new Date(value);
    return Number.isNaN(parsed.getTime()) ? '-' : parsed.toLocaleString();
  });

  // Signed percentage-point change of a fractional value.
  env.registerHelper('formatDelta', (value: number) => {
    const sign = value >= 0 ? '+' : '';
    return `${sign}${(value * 100).toFixed(1)}%`;
  });

  env.registerHelper('formatLatencyDelta', (value: number) => {
    if (value === 0) return '0ms';
    const sign = value > 0 ? '+' : '';
    return `${sign}${Math.round(value)}ms`;
  });

  env.registerHelper('formatTokenDelta', (value: number) => {
    if (value === 0) return '0';
    const sign = value > 0 ? '+' : '';
    return `${sign}${value.toLocaleString()}`;
  });

  env.registerHelper('formatPassedDelta', (baseline: number, current: number) => {
    const delta = current - baseline;
    if (delta === 0) return '0';
    const sign = delta > 0 ? '+' : '';
    return `${sign}${delta}`;
  });

  // Changes within +/-0.001 are treated as noise.
  env.registerHelper('deltaClass', (value: number) => {
    if (value > 0.001) return 'positive';
    if (value < -0.001) return 'negative';
    return 'neutral';
  });

  // For latency, negative is good (faster); changes within +/-5 are neutral.
  env.registerHelper('latencyDeltaClass', (value: number) => {
    if (value < -5) return 'positive';
    if (value > 5) return 'negative';
    return 'neutral';
  });

  // For tokens, negative is good (fewer tokens); changes within +/-10 are neutral.
  env.registerHelper('tokenDeltaClass', (value: number) => {
    if (value < -10) return 'positive';
    if (value > 10) return 'negative';
    return 'neutral';
  });

  env.registerHelper('passedDeltaClass', (baseline: number, current: number) => {
    const delta = current - baseline;
    if (delta > 0) return 'positive';
    if (delta < 0) return 'negative';
    return 'neutral';
  });

  env.registerHelper('uppercase', (value: string) => {
    return value?.toUpperCase() ?? '';
  });

  env.registerHelper('changeTypeIcon', (changeType: string) => {
    switch (changeType) {
      case 'regressed':
        return '↓';
      case 'improved':
        return '↑';
      case 'unchanged':
        return '→';
      case 'new':
        return '+';
      case 'removed':
        return '−';
      default:
        return '';
    }
  });

  env.registerHelper('changeTypeLabel', (changeType: string) => {
    switch (changeType) {
      case 'regressed':
        return 'Regressed';
      case 'improved':
        return 'Improved';
      case 'unchanged':
        return 'Unchanged';
      case 'new':
        return 'New';
      case 'removed':
        return 'Removed';
      default:
        return changeType;
    }
  });

  // CSS class for the latency arrow: a drop of more than 5 maps to 'up',
  // a rise of more than 5 to 'down', anything in between to 'same'.
  env.registerHelper('latencyArrow', (value: number) => {
    if (value < -5) return 'up';
    if (value > 5) return 'down';
    return 'same';
  });

  // Glyph matching the class chosen by `latencyArrow`.
  env.registerHelper('latencyArrowIcon', (value: number) => {
    if (value < -5) return '↑';
    if (value > 5) return '↓';
    return '→';
  });

  env.registerHelper('gt', (a: number, b: number) => a > b);
}

/**
 * Generate an HTML comparison report.
 *
 * Helpers are registered on an isolated Handlebars environment rather than on
 * the shared global instance, so generating a comparison report never
 * overwrites same-named helpers used by other templates in the process.
 *
 * @param baseline - Manifest of the reference run.
 * @param current - Manifest of the run being compared against the baseline.
 * @returns A complete, self-contained HTML document.
 */
export function generateCompareHTMLReport(baseline: RunManifest, current: RunManifest): string {
  const data = buildComparisonData(baseline, current);

  if (!compiledCompareTemplate) {
    const env = Handlebars.create();
    registerCompareHelpers(env);
    compiledCompareTemplate = env.compile(COMPARE_HTML_TEMPLATE);
  }

  return compiledCompareTemplate({ data });
}