@runchr/gstack-antigravity 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/rules/ETHOS.md +129 -0
- package/.agents/rules/global-gstack.md +117 -0
- package/.agents/rules/persona-gstack-autoplan.md +14 -0
- package/.agents/rules/persona-gstack-benchmark.md +14 -0
- package/.agents/rules/persona-gstack-browse.md +14 -0
- package/.agents/rules/persona-gstack-canary.md +14 -0
- package/.agents/rules/persona-gstack-careful.md +14 -0
- package/.agents/rules/persona-gstack-codex.md +14 -0
- package/.agents/rules/persona-gstack-cso.md +14 -0
- package/.agents/rules/persona-gstack-design-consultation.md +14 -0
- package/.agents/rules/persona-gstack-design-review.md +14 -0
- package/.agents/rules/persona-gstack-document-release.md +14 -0
- package/.agents/rules/persona-gstack-freeze.md +14 -0
- package/.agents/rules/persona-gstack-gstack-upgrade.md +14 -0
- package/.agents/rules/persona-gstack-guard.md +14 -0
- package/.agents/rules/persona-gstack-investigate.md +14 -0
- package/.agents/rules/persona-gstack-land-and-deploy.md +14 -0
- package/.agents/rules/persona-gstack-office-hours.md +14 -0
- package/.agents/rules/persona-gstack-plan-ceo-review.md +14 -0
- package/.agents/rules/persona-gstack-plan-design-review.md +14 -0
- package/.agents/rules/persona-gstack-plan-eng-review.md +14 -0
- package/.agents/rules/persona-gstack-qa-only.md +14 -0
- package/.agents/rules/persona-gstack-qa.md +14 -0
- package/.agents/rules/persona-gstack-retro.md +14 -0
- package/.agents/rules/persona-gstack-review.md +14 -0
- package/.agents/rules/persona-gstack-setup-browser-cookies.md +14 -0
- package/.agents/rules/persona-gstack-setup-deploy.md +14 -0
- package/.agents/rules/persona-gstack-ship.md +14 -0
- package/.agents/rules/persona-gstack-unfreeze.md +14 -0
- package/.agents/rules/persona-gstack.md +40 -0
- package/.agents/rules/recursive-identities.md +22 -0
- package/.agents/workflows/autoplan.md +30 -0
- package/.agents/workflows/benchmark.md +31 -0
- package/.agents/workflows/browse.md +26 -0
- package/.agents/workflows/canary.md +33 -0
- package/.agents/workflows/careful.md +22 -0
- package/.agents/workflows/codex.md +36 -0
- package/.agents/workflows/cso.md +29 -0
- package/.agents/workflows/design-consultation.md +28 -0
- package/.agents/workflows/design-review.md +28 -0
- package/.agents/workflows/document-release.md +32 -0
- package/.agents/workflows/freeze.md +17 -0
- package/.agents/workflows/gstack-upgrade.md +54 -0
- package/.agents/workflows/gstack.md +56 -0
- package/.agents/workflows/guard.md +18 -0
- package/.agents/workflows/investigate.md +37 -0
- package/.agents/workflows/land-and-deploy.md +35 -0
- package/.agents/workflows/office-hours.md +27 -0
- package/.agents/workflows/plan-ceo-review.md +34 -0
- package/.agents/workflows/plan-design-review.md +31 -0
- package/.agents/workflows/plan-eng-review.md +28 -0
- package/.agents/workflows/qa-only.md +28 -0
- package/.agents/workflows/qa.md +73 -0
- package/.agents/workflows/retro.md +34 -0
- package/.agents/workflows/review.md +30 -0
- package/.agents/workflows/setup-browser-cookies.md +15 -0
- package/.agents/workflows/setup-cookies.md +8 -0
- package/.agents/workflows/setup-deploy.md +21 -0
- package/.agents/workflows/ship.md +93 -0
- package/.agents/workflows/unfreeze.md +12 -0
- package/LICENSE +22 -0
- package/README.md +189 -0
- package/README_KO.md +191 -0
- package/bin/install.js +105 -0
- package/gstack-origin/.agents/skills/gstack/SKILL.md +651 -0
- package/gstack-origin/.agents/skills/gstack-autoplan/SKILL.md +678 -0
- package/gstack-origin/.agents/skills/gstack-benchmark/SKILL.md +482 -0
- package/gstack-origin/.agents/skills/gstack-browse/SKILL.md +511 -0
- package/gstack-origin/.agents/skills/gstack-canary/SKILL.md +486 -0
- package/gstack-origin/.agents/skills/gstack-careful/SKILL.md +50 -0
- package/gstack-origin/.agents/skills/gstack-cso/SKILL.md +607 -0
- package/gstack-origin/.agents/skills/gstack-design-consultation/SKILL.md +615 -0
- package/gstack-origin/.agents/skills/gstack-design-review/SKILL.md +988 -0
- package/gstack-origin/.agents/skills/gstack-document-release/SKILL.md +604 -0
- package/gstack-origin/.agents/skills/gstack-freeze/SKILL.md +67 -0
- package/gstack-origin/.agents/skills/gstack-guard/SKILL.md +62 -0
- package/gstack-origin/.agents/skills/gstack-investigate/SKILL.md +415 -0
- package/gstack-origin/.agents/skills/gstack-land-and-deploy/SKILL.md +873 -0
- package/gstack-origin/.agents/skills/gstack-office-hours/SKILL.md +986 -0
- package/gstack-origin/.agents/skills/gstack-plan-ceo-review/SKILL.md +1268 -0
- package/gstack-origin/.agents/skills/gstack-plan-design-review/SKILL.md +668 -0
- package/gstack-origin/.agents/skills/gstack-plan-eng-review/SKILL.md +826 -0
- package/gstack-origin/.agents/skills/gstack-qa/SKILL.md +1006 -0
- package/gstack-origin/.agents/skills/gstack-qa-only/SKILL.md +626 -0
- package/gstack-origin/.agents/skills/gstack-retro/SKILL.md +1065 -0
- package/gstack-origin/.agents/skills/gstack-review/SKILL.md +704 -0
- package/gstack-origin/.agents/skills/gstack-setup-browser-cookies/SKILL.md +325 -0
- package/gstack-origin/.agents/skills/gstack-setup-deploy/SKILL.md +450 -0
- package/gstack-origin/.agents/skills/gstack-ship/SKILL.md +1312 -0
- package/gstack-origin/.agents/skills/gstack-unfreeze/SKILL.md +36 -0
- package/gstack-origin/.agents/skills/gstack-upgrade/SKILL.md +220 -0
- package/gstack-origin/.env.example +5 -0
- package/gstack-origin/.github/workflows/skill-docs.yml +17 -0
- package/gstack-origin/AGENTS.md +49 -0
- package/gstack-origin/ARCHITECTURE.md +359 -0
- package/gstack-origin/BROWSER.md +271 -0
- package/gstack-origin/CHANGELOG.md +800 -0
- package/gstack-origin/CLAUDE.md +284 -0
- package/gstack-origin/CONTRIBUTING.md +370 -0
- package/gstack-origin/ETHOS.md +129 -0
- package/gstack-origin/LICENSE +21 -0
- package/gstack-origin/README.md +228 -0
- package/gstack-origin/SKILL.md +657 -0
- package/gstack-origin/SKILL.md.tmpl +281 -0
- package/gstack-origin/TODOS.md +564 -0
- package/gstack-origin/VERSION +1 -0
- package/gstack-origin/autoplan/SKILL.md +689 -0
- package/gstack-origin/autoplan/SKILL.md.tmpl +416 -0
- package/gstack-origin/benchmark/SKILL.md +489 -0
- package/gstack-origin/benchmark/SKILL.md.tmpl +233 -0
- package/gstack-origin/bin/dev-setup +68 -0
- package/gstack-origin/bin/dev-teardown +56 -0
- package/gstack-origin/bin/gstack-analytics +191 -0
- package/gstack-origin/bin/gstack-community-dashboard +113 -0
- package/gstack-origin/bin/gstack-config +38 -0
- package/gstack-origin/bin/gstack-diff-scope +71 -0
- package/gstack-origin/bin/gstack-global-discover.ts +591 -0
- package/gstack-origin/bin/gstack-repo-mode +93 -0
- package/gstack-origin/bin/gstack-review-log +9 -0
- package/gstack-origin/bin/gstack-review-read +12 -0
- package/gstack-origin/bin/gstack-slug +15 -0
- package/gstack-origin/bin/gstack-telemetry-log +158 -0
- package/gstack-origin/bin/gstack-telemetry-sync +127 -0
- package/gstack-origin/bin/gstack-update-check +196 -0
- package/gstack-origin/browse/SKILL.md +517 -0
- package/gstack-origin/browse/SKILL.md.tmpl +141 -0
- package/gstack-origin/browse/bin/find-browse +21 -0
- package/gstack-origin/browse/bin/remote-slug +14 -0
- package/gstack-origin/browse/scripts/build-node-server.sh +48 -0
- package/gstack-origin/browse/src/browser-manager.ts +634 -0
- package/gstack-origin/browse/src/buffers.ts +137 -0
- package/gstack-origin/browse/src/bun-polyfill.cjs +109 -0
- package/gstack-origin/browse/src/cli.ts +420 -0
- package/gstack-origin/browse/src/commands.ts +111 -0
- package/gstack-origin/browse/src/config.ts +150 -0
- package/gstack-origin/browse/src/cookie-import-browser.ts +417 -0
- package/gstack-origin/browse/src/cookie-picker-routes.ts +207 -0
- package/gstack-origin/browse/src/cookie-picker-ui.ts +541 -0
- package/gstack-origin/browse/src/find-browse.ts +61 -0
- package/gstack-origin/browse/src/meta-commands.ts +269 -0
- package/gstack-origin/browse/src/platform.ts +17 -0
- package/gstack-origin/browse/src/read-commands.ts +335 -0
- package/gstack-origin/browse/src/server.ts +369 -0
- package/gstack-origin/browse/src/snapshot.ts +398 -0
- package/gstack-origin/browse/src/url-validation.ts +91 -0
- package/gstack-origin/browse/src/write-commands.ts +352 -0
- package/gstack-origin/browse/test/bun-polyfill.test.ts +72 -0
- package/gstack-origin/browse/test/commands.test.ts +1836 -0
- package/gstack-origin/browse/test/config.test.ts +250 -0
- package/gstack-origin/browse/test/cookie-import-browser.test.ts +397 -0
- package/gstack-origin/browse/test/cookie-picker-routes.test.ts +205 -0
- package/gstack-origin/browse/test/find-browse.test.ts +50 -0
- package/gstack-origin/browse/test/fixtures/basic.html +33 -0
- package/gstack-origin/browse/test/fixtures/cursor-interactive.html +22 -0
- package/gstack-origin/browse/test/fixtures/dialog.html +15 -0
- package/gstack-origin/browse/test/fixtures/empty.html +2 -0
- package/gstack-origin/browse/test/fixtures/forms.html +55 -0
- package/gstack-origin/browse/test/fixtures/qa-eval-checkout.html +108 -0
- package/gstack-origin/browse/test/fixtures/qa-eval-spa.html +98 -0
- package/gstack-origin/browse/test/fixtures/qa-eval.html +51 -0
- package/gstack-origin/browse/test/fixtures/responsive.html +49 -0
- package/gstack-origin/browse/test/fixtures/snapshot.html +55 -0
- package/gstack-origin/browse/test/fixtures/spa.html +24 -0
- package/gstack-origin/browse/test/fixtures/states.html +17 -0
- package/gstack-origin/browse/test/fixtures/upload.html +25 -0
- package/gstack-origin/browse/test/gstack-config.test.ts +125 -0
- package/gstack-origin/browse/test/gstack-update-check.test.ts +467 -0
- package/gstack-origin/browse/test/handoff.test.ts +235 -0
- package/gstack-origin/browse/test/path-validation.test.ts +63 -0
- package/gstack-origin/browse/test/platform.test.ts +37 -0
- package/gstack-origin/browse/test/snapshot.test.ts +467 -0
- package/gstack-origin/browse/test/test-server.ts +57 -0
- package/gstack-origin/browse/test/url-validation.test.ts +72 -0
- package/gstack-origin/canary/SKILL.md +493 -0
- package/gstack-origin/canary/SKILL.md.tmpl +220 -0
- package/gstack-origin/careful/SKILL.md +59 -0
- package/gstack-origin/careful/SKILL.md.tmpl +57 -0
- package/gstack-origin/careful/bin/check-careful.sh +112 -0
- package/gstack-origin/codex/SKILL.md +677 -0
- package/gstack-origin/codex/SKILL.md.tmpl +356 -0
- package/gstack-origin/conductor.json +6 -0
- package/gstack-origin/cso/SKILL.md +615 -0
- package/gstack-origin/cso/SKILL.md.tmpl +376 -0
- package/gstack-origin/design-consultation/SKILL.md +625 -0
- package/gstack-origin/design-consultation/SKILL.md.tmpl +369 -0
- package/gstack-origin/design-review/SKILL.md +998 -0
- package/gstack-origin/design-review/SKILL.md.tmpl +262 -0
- package/gstack-origin/docs/images/github-2013.png +0 -0
- package/gstack-origin/docs/images/github-2026.png +0 -0
- package/gstack-origin/docs/skills.md +877 -0
- package/gstack-origin/document-release/SKILL.md +613 -0
- package/gstack-origin/document-release/SKILL.md.tmpl +357 -0
- package/gstack-origin/freeze/SKILL.md +82 -0
- package/gstack-origin/freeze/SKILL.md.tmpl +80 -0
- package/gstack-origin/freeze/bin/check-freeze.sh +68 -0
- package/gstack-origin/gstack-upgrade/SKILL.md +226 -0
- package/gstack-origin/gstack-upgrade/SKILL.md.tmpl +224 -0
- package/gstack-origin/guard/SKILL.md +82 -0
- package/gstack-origin/guard/SKILL.md.tmpl +80 -0
- package/gstack-origin/investigate/SKILL.md +435 -0
- package/gstack-origin/investigate/SKILL.md.tmpl +196 -0
- package/gstack-origin/land-and-deploy/SKILL.md +880 -0
- package/gstack-origin/land-and-deploy/SKILL.md.tmpl +575 -0
- package/gstack-origin/office-hours/SKILL.md +996 -0
- package/gstack-origin/office-hours/SKILL.md.tmpl +624 -0
- package/gstack-origin/package.json +55 -0
- package/gstack-origin/plan-ceo-review/SKILL.md +1277 -0
- package/gstack-origin/plan-ceo-review/SKILL.md.tmpl +838 -0
- package/gstack-origin/plan-design-review/SKILL.md +676 -0
- package/gstack-origin/plan-design-review/SKILL.md.tmpl +314 -0
- package/gstack-origin/plan-eng-review/SKILL.md +836 -0
- package/gstack-origin/plan-eng-review/SKILL.md.tmpl +279 -0
- package/gstack-origin/qa/SKILL.md +1016 -0
- package/gstack-origin/qa/SKILL.md.tmpl +316 -0
- package/gstack-origin/qa/references/issue-taxonomy.md +85 -0
- package/gstack-origin/qa/templates/qa-report-template.md +126 -0
- package/gstack-origin/qa-only/SKILL.md +633 -0
- package/gstack-origin/qa-only/SKILL.md.tmpl +101 -0
- package/gstack-origin/retro/SKILL.md +1072 -0
- package/gstack-origin/retro/SKILL.md.tmpl +833 -0
- package/gstack-origin/review/SKILL.md +849 -0
- package/gstack-origin/review/SKILL.md.tmpl +259 -0
- package/gstack-origin/review/TODOS-format.md +62 -0
- package/gstack-origin/review/checklist.md +190 -0
- package/gstack-origin/review/design-checklist.md +132 -0
- package/gstack-origin/review/greptile-triage.md +220 -0
- package/gstack-origin/scripts/analytics.ts +190 -0
- package/gstack-origin/scripts/dev-skill.ts +82 -0
- package/gstack-origin/scripts/eval-compare.ts +96 -0
- package/gstack-origin/scripts/eval-list.ts +116 -0
- package/gstack-origin/scripts/eval-select.ts +86 -0
- package/gstack-origin/scripts/eval-summary.ts +187 -0
- package/gstack-origin/scripts/eval-watch.ts +172 -0
- package/gstack-origin/scripts/gen-skill-docs.ts +2414 -0
- package/gstack-origin/scripts/skill-check.ts +167 -0
- package/gstack-origin/setup +269 -0
- package/gstack-origin/setup-browser-cookies/SKILL.md +330 -0
- package/gstack-origin/setup-browser-cookies/SKILL.md.tmpl +74 -0
- package/gstack-origin/setup-deploy/SKILL.md +459 -0
- package/gstack-origin/setup-deploy/SKILL.md.tmpl +220 -0
- package/gstack-origin/ship/SKILL.md +1457 -0
- package/gstack-origin/ship/SKILL.md.tmpl +528 -0
- package/gstack-origin/supabase/config.sh +10 -0
- package/gstack-origin/supabase/functions/community-pulse/index.ts +59 -0
- package/gstack-origin/supabase/functions/telemetry-ingest/index.ts +135 -0
- package/gstack-origin/supabase/functions/update-check/index.ts +37 -0
- package/gstack-origin/supabase/migrations/001_telemetry.sql +89 -0
- package/gstack-origin/test/analytics.test.ts +277 -0
- package/gstack-origin/test/codex-e2e.test.ts +197 -0
- package/gstack-origin/test/fixtures/coverage-audit-fixture.ts +76 -0
- package/gstack-origin/test/fixtures/eval-baselines.json +7 -0
- package/gstack-origin/test/fixtures/qa-eval-checkout-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/qa-eval-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/qa-eval-spa-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/review-eval-design-slop.css +86 -0
- package/gstack-origin/test/fixtures/review-eval-design-slop.html +41 -0
- package/gstack-origin/test/fixtures/review-eval-enum-diff.rb +30 -0
- package/gstack-origin/test/fixtures/review-eval-enum.rb +27 -0
- package/gstack-origin/test/fixtures/review-eval-vuln.rb +14 -0
- package/gstack-origin/test/gemini-e2e.test.ts +173 -0
- package/gstack-origin/test/gen-skill-docs.test.ts +1049 -0
- package/gstack-origin/test/global-discover.test.ts +187 -0
- package/gstack-origin/test/helpers/codex-session-runner.ts +282 -0
- package/gstack-origin/test/helpers/e2e-helpers.ts +239 -0
- package/gstack-origin/test/helpers/eval-store.test.ts +548 -0
- package/gstack-origin/test/helpers/eval-store.ts +689 -0
- package/gstack-origin/test/helpers/gemini-session-runner.test.ts +104 -0
- package/gstack-origin/test/helpers/gemini-session-runner.ts +201 -0
- package/gstack-origin/test/helpers/llm-judge.ts +130 -0
- package/gstack-origin/test/helpers/observability.test.ts +283 -0
- package/gstack-origin/test/helpers/session-runner.test.ts +96 -0
- package/gstack-origin/test/helpers/session-runner.ts +357 -0
- package/gstack-origin/test/helpers/skill-parser.ts +206 -0
- package/gstack-origin/test/helpers/touchfiles.ts +260 -0
- package/gstack-origin/test/hook-scripts.test.ts +373 -0
- package/gstack-origin/test/skill-e2e-browse.test.ts +293 -0
- package/gstack-origin/test/skill-e2e-deploy.test.ts +279 -0
- package/gstack-origin/test/skill-e2e-design.test.ts +614 -0
- package/gstack-origin/test/skill-e2e-plan.test.ts +538 -0
- package/gstack-origin/test/skill-e2e-qa-bugs.test.ts +194 -0
- package/gstack-origin/test/skill-e2e-qa-workflow.test.ts +412 -0
- package/gstack-origin/test/skill-e2e-review.test.ts +535 -0
- package/gstack-origin/test/skill-e2e-workflow.test.ts +586 -0
- package/gstack-origin/test/skill-e2e.test.ts +3325 -0
- package/gstack-origin/test/skill-llm-eval.test.ts +787 -0
- package/gstack-origin/test/skill-parser.test.ts +179 -0
- package/gstack-origin/test/skill-routing-e2e.test.ts +605 -0
- package/gstack-origin/test/skill-validation.test.ts +1520 -0
- package/gstack-origin/test/telemetry.test.ts +278 -0
- package/gstack-origin/test/touchfiles.test.ts +262 -0
- package/gstack-origin/unfreeze/SKILL.md +40 -0
- package/gstack-origin/unfreeze/SKILL.md.tmpl +38 -0
- package/package.json +38 -0
- package/scripts/install-antigravity-skill.ps1 +33 -0
- package/scripts/install-antigravity-skill.sh +41 -0
- package/scripts/sync-gstack-origin.ps1 +37 -0
- package/scripts/sync-gstack-origin.sh +35 -0
|
@@ -0,0 +1,689 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Eval result persistence and comparison.
|
|
3
|
+
*
|
|
4
|
+
* EvalCollector accumulates test results, writes them to
|
|
5
|
+
* ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
|
|
6
|
+
* prints a summary table, and auto-compares with the previous run.
|
|
7
|
+
*
|
|
8
|
+
* Comparison functions are exported for reuse by the eval:compare CLI.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as fs from 'fs';
|
|
12
|
+
import * as path from 'path';
|
|
13
|
+
import * as os from 'os';
|
|
14
|
+
import { spawnSync } from 'child_process';
|
|
15
|
+
|
|
16
|
+
/** Schema version number for eval result files (its consumers are outside this excerpt). */
const SCHEMA_VERSION = 1;

/** Default directory for persisted eval runs: `<home>/.gstack-dev/evals`. */
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
|
|
18
|
+
|
|
19
|
+
// --- Interfaces ---
|
|
20
|
+
|
|
21
|
+
/**
 * Record of a single test inside an eval run.
 *
 * Identity, tier, pass/fail, duration and cost are always present; every other
 * field is optional and grouped under the headings below.
 */
export interface EvalTestEntry {
  name: string;
  suite: string;
  tier: 'e2e' | 'llm-judge';
  passed: boolean;
  duration_ms: number;
  cost_usd: number;

  // ── E2E ──
  transcript?: any[];
  prompt?: string;
  output?: string;
  turns_used?: number;
  browse_errors?: string[];

  // ── LLM judge ──
  judge_scores?: Record<string, number>;
  judge_reasoning?: string;

  // ── Machine-readable diagnostics ──
  /** One of 'success' | 'timeout' | 'error_max_turns' | 'exit_code_N'. */
  exit_reason?: string;
  /** Which turn was active when the timeout hit. */
  timeout_at_turn?: number;
  /** Last tool call, e.g. "Write(review-output.md)". */
  last_tool_call?: string;

  // ── Model + timing diagnostics (added for Sonnet/Opus split) ──
  /** Model identifier, e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6'. */
  model?: string;
  /** Time from spawn to the first NDJSON line. */
  first_response_ms?: number;
  /** Peak latency between consecutive tool calls. */
  max_inter_turn_ms?: number;

  // ── Outcome eval ──
  detection_rate?: number;
  false_positives?: number;
  evidence_quality?: number;
  detected_bugs?: string[];
  missed_bugs?: string[];

  error?: string;
}
|
|
59
|
+
|
|
60
|
+
/**
 * One complete eval run: run-level metadata, aggregate counters, and the
 * per-test entries.
 */
export interface EvalResult {
  schema_version: number;
  version: string;
  branch: string;
  git_sha: string;
  timestamp: string;
  hostname: string;
  tier: 'e2e' | 'llm-judge';

  // Aggregates
  total_tests: number;
  passed: number;
  failed: number;
  total_cost_usd: number;
  total_duration_ms: number;
  /** Wall-clock from collector creation to finalization (shows parallelism). */
  wall_clock_ms?: number;

  tests: EvalTestEntry[];

  /** True for incremental saves, absent in the final result. */
  _partial?: boolean;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Before/after view of a single test across two eval runs, plus whether its
 * pass/fail status improved, regressed, or stayed the same.
 */
export interface TestDelta {
  name: string;
  /** The test as recorded in the earlier run. */
  before: {
    passed: boolean;
    cost_usd: number;
    turns_used?: number;
    duration_ms?: number;
    detection_rate?: number;
    /** Tool-call counts keyed by tool name. */
    tool_summary?: Record<string, number>;
  };
  /** The test as recorded in the later run; same shape as `before`. */
  after: TestDelta['before'];
  status_change: 'improved' | 'regressed' | 'unchanged';
}
|
|
86
|
+
|
|
87
|
+
/**
 * Outcome of comparing two eval runs: where each run came from, the per-test
 * deltas, and run-level totals.
 */
export interface ComparisonResult {
  // Provenance of the two runs
  before_file: string;
  after_file: string;
  before_branch: string;
  after_branch: string;
  before_timestamp: string;
  after_timestamp: string;

  /** One entry per compared test. */
  deltas: TestDelta[];

  /** Later run's total minus earlier run's total. */
  total_cost_delta: number;
  /** Later run's total minus earlier run's total, in milliseconds. */
  total_duration_delta: number;

  // Number of tests per status change
  improved: number;
  regressed: number;
  unchanged: number;

  // Total tool calls counted in each run
  tool_count_before: number;
  tool_count_after: number;
}
|
|
103
|
+
|
|
104
|
+
// --- Shared helpers ---
|
|
105
|
+
|
|
106
|
+
/**
 * Decide whether a planted-bug eval passed by checking the judge's numbers
 * against the ground-truth thresholds. Keeping the rule in one place means
 * every planted-bug test applies the same criteria.
 *
 * @param judgeResult - Scores reported by the judge.
 * @param groundTruth - Thresholds the scores must meet.
 * @returns `true` only when the detection rate reaches the minimum, false
 *   positives stay within the maximum, and evidence quality is at least 2.
 */
export function judgePassed(
  judgeResult: { detection_rate: number; false_positives: number; evidence_quality: number },
  groundTruth: { minimum_detection: number; max_false_positives: number },
): boolean {
  const { detection_rate, false_positives, evidence_quality } = judgeResult;
  const { minimum_detection, max_false_positives } = groundTruth;

  // Comparisons are kept in this direction so a NaN score fails the check.
  const detectedEnough = detection_rate >= minimum_detection;
  const falsePositivesOk = false_positives <= max_false_positives;
  const evidenceOk = evidence_quality >= 2;

  return detectedEnough && falsePositivesOk && evidenceOk;
}
|
|
118
|
+
|
|
119
|
+
// --- Comparison functions (exported for eval:compare CLI) ---
|
|
120
|
+
|
|
121
|
+
/**
 * Count tool calls per tool name in a transcript.
 *
 * Only events with `type === 'assistant'` are inspected; within them, every
 * `message.content` item with `type === 'tool_use'` counts once under its
 * `name` (or `'unknown'` when the name is missing or empty).
 *
 * Malformed input is skipped rather than thrown on: null/undefined events,
 * events whose content is not an array, and null content items contribute
 * nothing to the result.
 *
 * @param transcript - Transcript events to scan.
 * @returns Counts keyed by tool name, e.g. `{ Bash: 8, Read: 3, Write: 1 }`.
 */
export function extractToolSummary(transcript: any[]): Record<string, number> {
  // Tally in a Map so tool names that collide with Object.prototype members
  // ('constructor', '__proto__', ...) are counted like any other name.
  const tally = new Map<string, number>();

  for (const event of transcript ?? []) {
    if (event?.type !== 'assistant') continue;

    const content = event.message?.content;
    if (!Array.isArray(content)) continue; // plain-string or missing content has no tool calls

    for (const item of content) {
      if (item?.type !== 'tool_use') continue;
      const name = String(item.name || 'unknown');
      tally.set(name, (tally.get(name) ?? 0) + 1);
    }
  }

  // fromEntries defines own data properties, so even '__proto__' survives as a key.
  return Object.fromEntries(tally);
}
|
|
140
|
+
|
|
141
|
+
/**
 * Find the most recent prior eval file to compare against.
 *
 * Scans `evalDir` for `.json` files, keeps those whose `tier` matches, and
 * returns the newest one on the same branch; if no run exists for that branch,
 * the newest run from any branch is returned instead.
 *
 * Files that cannot be read or parsed, or whose top level is not an object,
 * are ignored. A missing or non-string `branch`/`timestamp` is treated as an
 * empty string, so such runs sort as the oldest instead of breaking the sort.
 *
 * @param evalDir - Directory containing eval result files.
 * @param tier - Tier the previous run must have.
 * @param branch - Preferred branch.
 * @param excludeFile - File to skip (compared by basename); typically the run
 *   being compared.
 * @returns Full path of the chosen file, or `null` when the directory cannot
 *   be read or contains no matching run.
 */
export function findPreviousRun(
  evalDir: string,
  tier: string,
  branch: string,
  excludeFile: string,
): string | null {
  let files: string[];
  try {
    files = fs.readdirSync(evalDir).filter(f => f.endsWith('.json'));
  } catch {
    return null; // dir doesn't exist (or is unreadable)
  }

  const excludedName = path.basename(excludeFile);
  const asText = (value: unknown): string => (typeof value === 'string' ? value : '');

  // Each candidate is parsed in full; only its top-level fields are kept.
  const entries: Array<{ file: string; branch: string; timestamp: string }> = [];
  for (const file of files) {
    if (file === excludedName) continue;
    const fullPath = path.join(evalDir, file);

    let data: unknown;
    try {
      data = JSON.parse(fs.readFileSync(fullPath, 'utf-8'));
    } catch {
      continue; // unreadable or invalid JSON: not a usable run
    }
    if (typeof data !== 'object' || data === null) continue;

    const record = data as Record<string, unknown>;
    if (record.tier !== tier) continue;
    entries.push({
      file: fullPath,
      branch: asText(record.branch),
      timestamp: asText(record.timestamp),
    });
  }

  if (entries.length === 0) return null;

  // Newest first
  entries.sort((a, b) => b.timestamp.localeCompare(a.timestamp));

  // Prefer same branch, otherwise fall back to the newest run on any branch
  const sameBranch = entries.find(e => e.branch === branch);
  return sameBranch ? sameBranch.file : entries[0].file;
}
|
|
184
|
+
|
|
185
|
+
/**
 * Diff two eval runs test-by-test, pairing tests by `name`.
 *
 * - A test present in both runs is 'improved' (fail → pass), 'regressed'
 *   (pass → fail) or 'unchanged'.
 * - A test only in `after` is reported as 'unchanged' with a failing,
 *   zero-cost `before` placeholder.
 * - A test only in `before` is reported as 'unchanged' under the name
 *   `"<name> (removed)"` with a failing, zero-cost `after` placeholder.
 *
 * Tool-call totals are summed from each test's transcript; tests without a
 * transcript contribute zero.
 *
 * @param before - The earlier run.
 * @param after - The later run.
 * @param beforeFile - Copied verbatim into `before_file`.
 * @param afterFile - Copied verbatim into `after_file`.
 * @returns Per-test deltas plus run-level cost, duration, status and
 *   tool-call totals.
 */
export function compareEvalResults(
  before: EvalResult,
  after: EvalResult,
  beforeFile: string,
  afterFile: string,
): ComparisonResult {
  // Tool-call tally for one test; no test or no transcript means an empty tally.
  const toolsOf = (entry: EvalTestEntry | undefined): Record<string, number> =>
    entry?.transcript ? extractToolSummary(entry.transcript) : {};

  // Total number of calls in a tally.
  const callTotal = (summary: Record<string, number>): number => {
    let total = 0;
    for (const n of Object.values(summary)) total += n;
    return total;
  };

  // Field-for-field copy of the values a delta records for one side.
  const rawSnapshot = (entry: EvalTestEntry, tools: Record<string, number>): TestDelta['before'] => ({
    passed: entry.passed,
    cost_usd: entry.cost_usd,
    turns_used: entry.turns_used,
    duration_ms: entry.duration_ms,
    detection_rate: entry.detection_rate,
    tool_summary: tools,
  });

  const statusCounts: Record<TestDelta['status_change'], number> = {
    improved: 0,
    regressed: 0,
    unchanged: 0,
  };
  const toolTotals = { before: 0, after: 0 };
  const deltas: TestDelta[] = [];

  // Earlier-run tests still waiting for a match; whatever is left at the end was removed.
  const pending = new Map<string, EvalTestEntry>();
  for (const t of before.tests) pending.set(t.name, t);

  for (const current of after.tests) {
    const prior = pending.get(current.name);
    const priorTools = toolsOf(prior);
    const currentTools = toolsOf(current);
    toolTotals.before += callTotal(priorTools);
    toolTotals.after += callTotal(currentTools);

    // New tests (no prior data) stay 'unchanged'.
    let status: TestDelta['status_change'] = 'unchanged';
    if (prior) {
      if (!prior.passed && current.passed) status = 'improved';
      else if (prior.passed && !current.passed) status = 'regressed';
    }
    statusCounts[status] += 1;

    deltas.push({
      name: current.name,
      before: {
        passed: prior?.passed ?? false,
        cost_usd: prior?.cost_usd ?? 0,
        turns_used: prior?.turns_used,
        duration_ms: prior?.duration_ms,
        detection_rate: prior?.detection_rate,
        tool_summary: priorTools,
      },
      after: rawSnapshot(current, currentTools),
      status_change: status,
    });

    pending.delete(current.name);
  }

  // Tests that were in before but not in after (removed tests)
  for (const [name, prior] of pending) {
    const priorTools = toolsOf(prior);
    toolTotals.before += callTotal(priorTools);
    statusCounts.unchanged += 1;
    deltas.push({
      name: `${name} (removed)`,
      before: rawSnapshot(prior, priorTools),
      after: { passed: false, cost_usd: 0, tool_summary: {} },
      status_change: 'unchanged',
    });
  }

  return {
    before_file: beforeFile,
    after_file: afterFile,
    before_branch: before.branch,
    after_branch: after.branch,
    before_timestamp: before.timestamp,
    after_timestamp: after.timestamp,
    deltas,
    total_cost_delta: after.total_cost_usd - before.total_cost_usd,
    total_duration_delta: after.total_duration_ms - before.total_duration_ms,
    improved: statusCounts.improved,
    regressed: statusCounts.regressed,
    unchanged: statusCounts.unchanged,
    tool_count_before: toolTotals.before,
    tool_count_after: toolTotals.after,
  };
}
|
|
287
|
+
|
|
288
|
+
/**
|
|
289
|
+
* Format a ComparisonResult as a readable string.
|
|
290
|
+
*/
|
|
291
|
+
export function formatComparison(c: ComparisonResult): string {
|
|
292
|
+
const lines: string[] = [];
|
|
293
|
+
const ts = c.before_timestamp ? c.before_timestamp.replace('T', ' ').slice(0, 16) : 'unknown';
|
|
294
|
+
lines.push(`\nvs previous: ${c.before_branch}/${c.deltas.length ? 'eval' : ''} (${ts})`);
|
|
295
|
+
lines.push('─'.repeat(70));
|
|
296
|
+
|
|
297
|
+
// Per-test deltas
|
|
298
|
+
for (const d of c.deltas) {
|
|
299
|
+
const arrow = d.status_change === 'improved' ? '↑' : d.status_change === 'regressed' ? '↓' : '=';
|
|
300
|
+
const beforeStatus = d.before.passed ? 'PASS' : 'FAIL';
|
|
301
|
+
const afterStatus = d.after.passed ? 'PASS' : 'FAIL';
|
|
302
|
+
|
|
303
|
+
// Turns delta
|
|
304
|
+
let turnsDelta = '';
|
|
305
|
+
if (d.before.turns_used !== undefined && d.after.turns_used !== undefined) {
|
|
306
|
+
const td = d.after.turns_used - d.before.turns_used;
|
|
307
|
+
turnsDelta = ` ${d.before.turns_used}→${d.after.turns_used}t`;
|
|
308
|
+
if (td !== 0) turnsDelta += `(${td > 0 ? '+' : ''}${td})`;
|
|
309
|
+
} else if (d.after.turns_used !== undefined) {
|
|
310
|
+
turnsDelta = ` ${d.after.turns_used}t`;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
// Duration delta
|
|
314
|
+
let durDelta = '';
|
|
315
|
+
if (d.before.duration_ms !== undefined && d.after.duration_ms !== undefined) {
|
|
316
|
+
const bs = Math.round(d.before.duration_ms / 1000);
|
|
317
|
+
const as = Math.round(d.after.duration_ms / 1000);
|
|
318
|
+
const dd = as - bs;
|
|
319
|
+
durDelta = ` ${bs}→${as}s`;
|
|
320
|
+
if (dd !== 0) durDelta += `(${dd > 0 ? '+' : ''}${dd})`;
|
|
321
|
+
} else if (d.after.duration_ms !== undefined) {
|
|
322
|
+
durDelta = ` ${Math.round(d.after.duration_ms / 1000)}s`;
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
let detail = '';
|
|
326
|
+
if (d.before.detection_rate !== undefined || d.after.detection_rate !== undefined) {
|
|
327
|
+
detail = ` ${d.before.detection_rate ?? '?'}→${d.after.detection_rate ?? '?'} det`;
|
|
328
|
+
} else {
|
|
329
|
+
const costBefore = d.before.cost_usd.toFixed(2);
|
|
330
|
+
const costAfter = d.after.cost_usd.toFixed(2);
|
|
331
|
+
detail = ` $${costBefore}→$${costAfter}`;
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
const name = d.name.length > 30 ? d.name.slice(0, 27) + '...' : d.name.padEnd(30);
|
|
335
|
+
lines.push(` ${name} ${beforeStatus.padEnd(5)} → ${afterStatus.padEnd(5)} ${arrow}${detail}${turnsDelta}${durDelta}`);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
lines.push('─'.repeat(70));
|
|
339
|
+
|
|
340
|
+
// Totals
|
|
341
|
+
const parts: string[] = [];
|
|
342
|
+
if (c.improved > 0) parts.push(`${c.improved} improved`);
|
|
343
|
+
if (c.regressed > 0) parts.push(`${c.regressed} regressed`);
|
|
344
|
+
if (c.unchanged > 0) parts.push(`${c.unchanged} unchanged`);
|
|
345
|
+
lines.push(` Status: ${parts.join(', ')}`);
|
|
346
|
+
|
|
347
|
+
const costSign = c.total_cost_delta >= 0 ? '+' : '';
|
|
348
|
+
lines.push(` Cost: ${costSign}$${c.total_cost_delta.toFixed(2)}`);
|
|
349
|
+
|
|
350
|
+
const durDelta = Math.round(c.total_duration_delta / 1000);
|
|
351
|
+
const durSign = durDelta >= 0 ? '+' : '';
|
|
352
|
+
lines.push(` Duration: ${durSign}${durDelta}s`);
|
|
353
|
+
|
|
354
|
+
const toolDelta = c.tool_count_after - c.tool_count_before;
|
|
355
|
+
const toolSign = toolDelta >= 0 ? '+' : '';
|
|
356
|
+
lines.push(` Tool calls: ${c.tool_count_before} → ${c.tool_count_after} (${toolSign}${toolDelta})`);
|
|
357
|
+
|
|
358
|
+
// Tool breakdown (show tools that changed)
|
|
359
|
+
const allTools = new Set<string>();
|
|
360
|
+
for (const d of c.deltas) {
|
|
361
|
+
for (const t of Object.keys(d.before.tool_summary || {})) allTools.add(t);
|
|
362
|
+
for (const t of Object.keys(d.after.tool_summary || {})) allTools.add(t);
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
if (allTools.size > 0) {
|
|
366
|
+
// Aggregate tool counts across all tests
|
|
367
|
+
const totalBefore: Record<string, number> = {};
|
|
368
|
+
const totalAfter: Record<string, number> = {};
|
|
369
|
+
for (const d of c.deltas) {
|
|
370
|
+
for (const [t, n] of Object.entries(d.before.tool_summary || {})) {
|
|
371
|
+
totalBefore[t] = (totalBefore[t] || 0) + n;
|
|
372
|
+
}
|
|
373
|
+
for (const [t, n] of Object.entries(d.after.tool_summary || {})) {
|
|
374
|
+
totalAfter[t] = (totalAfter[t] || 0) + n;
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
for (const tool of [...allTools].sort()) {
|
|
379
|
+
const b = totalBefore[tool] || 0;
|
|
380
|
+
const a = totalAfter[tool] || 0;
|
|
381
|
+
if (b !== a) {
|
|
382
|
+
const d = a - b;
|
|
383
|
+
lines.push(` ${tool}: ${b} → ${a} (${d >= 0 ? '+' : ''}${d})`);
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
// Commentary — interpret what the deltas mean
|
|
389
|
+
const commentary = generateCommentary(c);
|
|
390
|
+
if (commentary.length > 0) {
|
|
391
|
+
lines.push('');
|
|
392
|
+
lines.push(' Takeaway:');
|
|
393
|
+
for (const line of commentary) {
|
|
394
|
+
lines.push(` ${line}`);
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
return lines.join('\n');
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
/**
 * Build plain-English takeaways from a run-to-run comparison.
 *
 * Notes are appended in a fixed order:
 *   1. one line per delta whose `status_change` is `'regressed'`,
 *   2. one line per delta whose `status_change` is `'improved'`,
 *   3. per-test efficiency shifts (turns, duration, detection rate, cost) for
 *      deltas that are `'unchanged'` and pass in the "after" run,
 *   4. an overall summary, only when there are at least three deltas and no
 *      regressions.
 *
 * The function only reads `c` and returns a freshly built array.
 *
 * @param c - Comparison to interpret; `deltas`, `total_cost_delta` and
 *   `total_duration_delta` are read.
 * @returns Commentary lines in display order (possibly empty).
 */
export function generateCommentary(c: ComparisonResult): string[] {
  // Whole-number percentage of `change` relative to `baseline`.
  const percentOf = (change: number, baseline: number): number =>
    Math.round((change / baseline) * 100);

  const notes: string[] = [];

  // 1. Regressions lead the list — they are the most important signal.
  const regressed = c.deltas.filter(delta => delta.status_change === 'regressed');
  for (const { name } of regressed) {
    notes.push(`REGRESSION: "${name}" was passing, now fails. Investigate immediately.`);
  }

  // 2. Tests that started passing.
  for (const delta of c.deltas) {
    if (delta.status_change === 'improved') {
      notes.push(`Fixed: "${delta.name}" now passes.`);
    }
  }

  // 3. Efficiency shifts, restricted to tests whose status did not move and
  //    that pass now (status changes were already reported above).
  for (const delta of c.deltas) {
    if (delta.status_change !== 'unchanged' || !delta.after.passed) continue;

    const { before, after } = delta;
    const findings: string[] = [];

    // Turns: report only when the shift is both >= 20% and >= 2 turns.
    if (before.turns_used !== undefined && after.turns_used !== undefined && before.turns_used > 0) {
      const extraTurns = after.turns_used - before.turns_used;
      const pct = percentOf(extraTurns, before.turns_used);
      if (Math.abs(pct) >= 20 && Math.abs(extraTurns) >= 2) {
        findings.push(
          extraTurns < 0
            ? `${Math.abs(extraTurns)} fewer turns (${Math.abs(pct)}% more efficient)`
            : `${extraTurns} more turns (${pct}% less efficient)`,
        );
      }
    }

    // Duration: report only when the shift is both >= 20% and >= 5 seconds.
    if (before.duration_ms !== undefined && after.duration_ms !== undefined && before.duration_ms > 0) {
      const extraMs = after.duration_ms - before.duration_ms;
      const pct = percentOf(extraMs, before.duration_ms);
      if (Math.abs(pct) >= 20 && Math.abs(extraMs) >= 5000) {
        findings.push(
          extraMs < 0
            ? `${Math.round(Math.abs(extraMs) / 1000)}s faster`
            : `${Math.round(extraMs / 1000)}s slower`,
        );
      }
    }

    // Detection rate: any non-zero change is reported.
    if (before.detection_rate !== undefined && after.detection_rate !== undefined) {
      const gained = after.detection_rate - before.detection_rate;
      if (gained > 0) {
        findings.push(`detecting ${gained} more bug${gained > 1 ? 's' : ''}`);
      } else if (gained !== 0) {
        const lost = Math.abs(gained);
        findings.push(`detecting ${lost} fewer bug${lost > 1 ? 's' : ''} — check prompt quality`);
      }
    }

    // Cost: report only when the shift is both >= 30% and >= $0.05.
    if (before.cost_usd > 0) {
      const extraCost = after.cost_usd - before.cost_usd;
      const pct = percentOf(extraCost, before.cost_usd);
      if (Math.abs(pct) >= 30 && Math.abs(extraCost) >= 0.05) {
        findings.push(extraCost < 0 ? `${Math.abs(pct)}% cheaper` : `${pct}% more expensive`);
      }
    }

    if (findings.length > 0) {
      notes.push(`"${delta.name}": ${findings.join(', ')}.`);
    }
  }

  // 4. Overall summary — skipped for tiny runs and whenever anything regressed.
  if (c.deltas.length < 3 || regressed.length !== 0) {
    return notes;
  }

  // Baselines for the percentages below; missing duration/turn values count as 0.
  let costBefore = 0;
  let durationBefore = 0;
  let turnsBefore = 0;
  let turnsAfter = 0;
  for (const { before, after } of c.deltas) {
    costBefore += before.cost_usd;
    durationBefore += before.duration_ms || 0;
    turnsBefore += before.turns_used || 0;
    turnsAfter += after.turns_used || 0;
  }

  const headline: string[] = [];

  if (costBefore > 0) {
    const pct = percentOf(c.total_cost_delta, costBefore);
    if (Math.abs(pct) >= 10) {
      headline.push(`${Math.abs(pct)}% ${pct < 0 ? 'cheaper' : 'more expensive'} overall`);
    }
  }

  if (durationBefore > 0) {
    const pct = percentOf(c.total_duration_delta, durationBefore);
    if (Math.abs(pct) >= 10) {
      headline.push(`${Math.abs(pct)}% ${pct < 0 ? 'faster' : 'slower'}`);
    }
  }

  if (turnsBefore > 0) {
    const pct = percentOf(turnsAfter - turnsBefore, turnsBefore);
    if (Math.abs(pct) >= 10) {
      headline.push(`${Math.abs(pct)}% ${pct < 0 ? 'fewer' : 'more'} turns`);
    }
  }

  // This point is only reached with zero regressions, so the suffix is fixed.
  notes.push(
    headline.length > 0
      ? `Overall: ${headline.join(', ')}. No regressions.`
      : 'Stable run — no significant efficiency changes, no regressions.',
  );

  return notes;
}
|
|
524
|
+
|
|
525
|
+
// --- EvalCollector ---
|
|
526
|
+
|
|
527
|
+
/**
 * Look up the current git branch name and short commit SHA by running
 * `git rev-parse`.
 *
 * Each field falls back to `'unknown'` when the command yields no output;
 * both fall back if anything throws.
 */
function getGitInfo(): { branch: string; sha: string } {
  const FALLBACK = 'unknown';

  // Runs `git rev-parse <flag> HEAD` (5 s timeout) and returns trimmed stdout.
  const revParse = (flag: string): string => {
    const proc = spawnSync('git', ['rev-parse', flag, 'HEAD'], { stdio: 'pipe', timeout: 5000 });
    return proc.stdout?.toString().trim() || FALLBACK;
  };

  try {
    const branch = revParse('--abbrev-ref');
    const sha = revParse('--short');
    return { branch, sha };
  } catch {
    return { branch: FALLBACK, sha: FALLBACK };
  }
}
|
|
539
|
+
|
|
540
|
+
/**
 * Read the `version` field from the `package.json` located two directories
 * above this module.
 *
 * @returns The version value, or `'unknown'` when the file cannot be read or
 *   parsed, or when the field is missing or empty.
 */
function getVersion(): string {
  const FALLBACK = 'unknown';
  try {
    const manifestPath = path.resolve(__dirname, '..', '..', 'package.json');
    const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
    return manifest.version || FALLBACK;
  } catch {
    return FALLBACK;
  }
}
|
|
549
|
+
|
|
550
|
+
/**
 * Collects per-test eval entries for a single run and persists them.
 *
 * - `addTest` records an entry and immediately rewrites a best-effort partial
 *   snapshot on disk.
 * - `finalize` writes the complete result file, prints a summary table to
 *   stderr, and attempts a comparison against a previous run.
 */
export class EvalCollector {
  private readonly tier: 'e2e' | 'llm-judge';
  private readonly tests: EvalTestEntry[] = [];
  private finalized = false;
  private readonly evalDir: string;
  /** Construction time; used to derive `wall_clock_ms` in `finalize`. */
  private readonly createdAt = Date.now();

  /**
   * @param tier - Which eval tier this run belongs to.
   * @param evalDir - Output directory; falls back to `DEFAULT_EVAL_DIR` when
   *   omitted or empty.
   */
  constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
    this.tier = tier;
    this.evalDir = evalDir || DEFAULT_EVAL_DIR;
  }

  /** Record one test entry and refresh the on-disk partial snapshot. */
  addTest(entry: EvalTestEntry): void {
    this.tests.push(entry);
    this.savePartial();
  }

  /**
   * Write incremental results after each test. The snapshot is written to a
   * `.tmp` sibling and renamed into place. Any failure is deliberately
   * swallowed — partial saves are best-effort and must not fail the run.
   */
  savePartial(): void {
    try {
      const partial: EvalResult = {
        ...this.summarize(getGitInfo(), getVersion(), new Date().toISOString()),
        tests: this.tests,
        _partial: true,
      };

      fs.mkdirSync(this.evalDir, { recursive: true });
      // NOTE(review): the file name is fixed to "e2e" regardless of `tier` —
      // confirm whether llm-judge runs are meant to share this file.
      const partialPath = path.join(this.evalDir, '_partial-e2e.json');
      const tmp = partialPath + '.tmp';
      fs.writeFileSync(tmp, JSON.stringify(partial, null, 2) + '\n');
      fs.renameSync(tmp, partialPath);
    } catch { /* non-fatal — partial saves are best-effort */ }
  }

  /**
   * Write the final result file, print the summary, and compare against a
   * previous run when one is found.
   *
   * @returns Path of the written result file, or `''` if this collector was
   *   already finalized.
   */
  async finalize(): Promise<string> {
    if (this.finalized) return '';
    this.finalized = true;

    const git = getGitInfo();
    const version = getVersion();
    const timestamp = new Date().toISOString();

    const result: EvalResult = {
      ...this.summarize(git, version, timestamp),
      wall_clock_ms: Date.now() - this.createdAt,
      tests: this.tests,
    };

    // Write eval file
    fs.mkdirSync(this.evalDir, { recursive: true });
    // ISO "YYYY-MM-DDTHH:mm:ss.sssZ" -> strip ':' and '.', swap 'T' for '-',
    // keep 15 chars => "YYYY-MM-DD-HHmm" (minute precision).
    // NOTE(review): two runs with the same version/branch/tier inside the same
    // minute resolve to the same file name.
    const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
    const safeBranch = git.branch.replace(/[^a-zA-Z0-9._-]/g, '-');
    const filename = `${version}-${safeBranch}-${this.tier}-${dateStr}.json`;
    const filepath = path.join(this.evalDir, filename);
    fs.writeFileSync(filepath, JSON.stringify(result, null, 2) + '\n');

    // Print summary table
    this.printSummary(result, filepath, git);

    // Auto-compare with previous run; a comparison failure is reported but
    // never prevents the result path from being returned.
    try {
      const prevFile = findPreviousRun(this.evalDir, this.tier, git.branch, filepath);
      if (prevFile) {
        const prevResult: EvalResult = JSON.parse(fs.readFileSync(prevFile, 'utf-8'));
        const comparison = compareEvalResults(prevResult, result, prevFile, filepath);
        process.stderr.write(formatComparison(comparison) + '\n');
      } else {
        process.stderr.write('\nFirst run — no comparison available.\n');
      }
    } catch (err: unknown) {
      // Narrow before reading `.message`: a thrown non-Error value would
      // otherwise be reported as "undefined".
      const reason = err instanceof Error ? err.message : String(err);
      process.stderr.write(`\nCompare error: ${reason}\n`);
    }

    return filepath;
  }

  /**
   * Build the run-level fields shared by partial and final results.
   * `tests` (and `_partial` / `wall_clock_ms`) are added by the caller so the
   * serialized key order of each file stays as before.
   */
  private summarize(
    git: { branch: string; sha: string },
    version: string,
    timestamp: string,
  ): Omit<EvalResult, 'tests'> {
    const totalCost = this.tests.reduce((sum, t) => sum + t.cost_usd, 0);
    const totalDuration = this.tests.reduce((sum, t) => sum + t.duration_ms, 0);
    const passed = this.tests.filter(t => t.passed).length;

    return {
      schema_version: SCHEMA_VERSION,
      version,
      branch: git.branch,
      git_sha: git.sha,
      timestamp,
      hostname: os.hostname(),
      tier: this.tier,
      total_tests: this.tests.length,
      passed,
      failed: this.tests.length - passed,
      // Round the aggregate cost to whole cents.
      total_cost_usd: Math.round(totalCost * 100) / 100,
      total_duration_ms: totalDuration,
    };
  }

  /** Print a fixed-width per-test table plus totals to stderr. */
  private printSummary(result: EvalResult, filepath: string, git: { branch: string; sha: string }): void {
    const lines: string[] = [];
    lines.push('');
    lines.push(`Eval Results — v${result.version} @ ${git.branch} (${git.sha}) — ${this.tier}`);
    lines.push('═'.repeat(70));

    for (const t of this.tests) {
      const status = t.passed ? ' PASS ' : ' FAIL ';
      const cost = `$${t.cost_usd.toFixed(2)}`;
      const dur = t.duration_ms ? `${Math.round(t.duration_ms / 1000)}s` : '';
      const turns = t.turns_used !== undefined ? `${t.turns_used}t` : '';

      // Detail column: detection ratio when available, otherwise judge scores
      // abbreviated to the first letter of each score key.
      let detail = '';
      if (t.detection_rate !== undefined) {
        detail = `${t.detection_rate}/${(t.detected_bugs?.length || 0) + (t.missed_bugs?.length || 0)} det`;
      } else if (t.judge_scores) {
        const scores = Object.entries(t.judge_scores).map(([k, v]) => `${k[0]}:${v}`).join(' ');
        detail = scores;
      }

      // Truncate long names with an ellipsis; pad short ones to a fixed width.
      const name = t.name.length > 35 ? t.name.slice(0, 32) + '...' : t.name.padEnd(35);
      lines.push(` ${name} ${status} ${cost.padStart(6)} ${turns.padStart(4)} ${dur.padStart(5)} ${detail}`);
    }

    lines.push('─'.repeat(70));
    const totalCost = `$${result.total_cost_usd.toFixed(2)}`;
    const totalDur = `${Math.round(result.total_duration_ms / 1000)}s`;
    lines.push(` Total: ${result.passed}/${result.total_tests} passed${' '.repeat(20)}${totalCost.padStart(6)} ${totalDur}`);
    lines.push(`Saved: ${filepath}`);

    process.stderr.write(lines.join('\n') + '\n');
  }
}
|