@runchr/gstack-antigravity 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (297) hide show
  1. package/.agents/rules/ETHOS.md +129 -0
  2. package/.agents/rules/global-gstack.md +117 -0
  3. package/.agents/rules/persona-gstack-autoplan.md +14 -0
  4. package/.agents/rules/persona-gstack-benchmark.md +14 -0
  5. package/.agents/rules/persona-gstack-browse.md +14 -0
  6. package/.agents/rules/persona-gstack-canary.md +14 -0
  7. package/.agents/rules/persona-gstack-careful.md +14 -0
  8. package/.agents/rules/persona-gstack-codex.md +14 -0
  9. package/.agents/rules/persona-gstack-cso.md +14 -0
  10. package/.agents/rules/persona-gstack-design-consultation.md +14 -0
  11. package/.agents/rules/persona-gstack-design-review.md +14 -0
  12. package/.agents/rules/persona-gstack-document-release.md +14 -0
  13. package/.agents/rules/persona-gstack-freeze.md +14 -0
  14. package/.agents/rules/persona-gstack-gstack-upgrade.md +14 -0
  15. package/.agents/rules/persona-gstack-guard.md +14 -0
  16. package/.agents/rules/persona-gstack-investigate.md +14 -0
  17. package/.agents/rules/persona-gstack-land-and-deploy.md +14 -0
  18. package/.agents/rules/persona-gstack-office-hours.md +14 -0
  19. package/.agents/rules/persona-gstack-plan-ceo-review.md +14 -0
  20. package/.agents/rules/persona-gstack-plan-design-review.md +14 -0
  21. package/.agents/rules/persona-gstack-plan-eng-review.md +14 -0
  22. package/.agents/rules/persona-gstack-qa-only.md +14 -0
  23. package/.agents/rules/persona-gstack-qa.md +14 -0
  24. package/.agents/rules/persona-gstack-retro.md +14 -0
  25. package/.agents/rules/persona-gstack-review.md +14 -0
  26. package/.agents/rules/persona-gstack-setup-browser-cookies.md +14 -0
  27. package/.agents/rules/persona-gstack-setup-deploy.md +14 -0
  28. package/.agents/rules/persona-gstack-ship.md +14 -0
  29. package/.agents/rules/persona-gstack-unfreeze.md +14 -0
  30. package/.agents/rules/persona-gstack.md +40 -0
  31. package/.agents/rules/recursive-identities.md +22 -0
  32. package/.agents/workflows/autoplan.md +30 -0
  33. package/.agents/workflows/benchmark.md +31 -0
  34. package/.agents/workflows/browse.md +26 -0
  35. package/.agents/workflows/canary.md +33 -0
  36. package/.agents/workflows/careful.md +22 -0
  37. package/.agents/workflows/codex.md +36 -0
  38. package/.agents/workflows/cso.md +29 -0
  39. package/.agents/workflows/design-consultation.md +28 -0
  40. package/.agents/workflows/design-review.md +28 -0
  41. package/.agents/workflows/document-release.md +32 -0
  42. package/.agents/workflows/freeze.md +17 -0
  43. package/.agents/workflows/gstack-upgrade.md +54 -0
  44. package/.agents/workflows/gstack.md +56 -0
  45. package/.agents/workflows/guard.md +18 -0
  46. package/.agents/workflows/investigate.md +37 -0
  47. package/.agents/workflows/land-and-deploy.md +35 -0
  48. package/.agents/workflows/office-hours.md +27 -0
  49. package/.agents/workflows/plan-ceo-review.md +34 -0
  50. package/.agents/workflows/plan-design-review.md +31 -0
  51. package/.agents/workflows/plan-eng-review.md +28 -0
  52. package/.agents/workflows/qa-only.md +28 -0
  53. package/.agents/workflows/qa.md +73 -0
  54. package/.agents/workflows/retro.md +34 -0
  55. package/.agents/workflows/review.md +30 -0
  56. package/.agents/workflows/setup-browser-cookies.md +15 -0
  57. package/.agents/workflows/setup-cookies.md +8 -0
  58. package/.agents/workflows/setup-deploy.md +21 -0
  59. package/.agents/workflows/ship.md +93 -0
  60. package/.agents/workflows/unfreeze.md +12 -0
  61. package/LICENSE +22 -0
  62. package/README.md +189 -0
  63. package/README_KO.md +191 -0
  64. package/bin/install.js +105 -0
  65. package/gstack-origin/.agents/skills/gstack/SKILL.md +651 -0
  66. package/gstack-origin/.agents/skills/gstack-autoplan/SKILL.md +678 -0
  67. package/gstack-origin/.agents/skills/gstack-benchmark/SKILL.md +482 -0
  68. package/gstack-origin/.agents/skills/gstack-browse/SKILL.md +511 -0
  69. package/gstack-origin/.agents/skills/gstack-canary/SKILL.md +486 -0
  70. package/gstack-origin/.agents/skills/gstack-careful/SKILL.md +50 -0
  71. package/gstack-origin/.agents/skills/gstack-cso/SKILL.md +607 -0
  72. package/gstack-origin/.agents/skills/gstack-design-consultation/SKILL.md +615 -0
  73. package/gstack-origin/.agents/skills/gstack-design-review/SKILL.md +988 -0
  74. package/gstack-origin/.agents/skills/gstack-document-release/SKILL.md +604 -0
  75. package/gstack-origin/.agents/skills/gstack-freeze/SKILL.md +67 -0
  76. package/gstack-origin/.agents/skills/gstack-guard/SKILL.md +62 -0
  77. package/gstack-origin/.agents/skills/gstack-investigate/SKILL.md +415 -0
  78. package/gstack-origin/.agents/skills/gstack-land-and-deploy/SKILL.md +873 -0
  79. package/gstack-origin/.agents/skills/gstack-office-hours/SKILL.md +986 -0
  80. package/gstack-origin/.agents/skills/gstack-plan-ceo-review/SKILL.md +1268 -0
  81. package/gstack-origin/.agents/skills/gstack-plan-design-review/SKILL.md +668 -0
  82. package/gstack-origin/.agents/skills/gstack-plan-eng-review/SKILL.md +826 -0
  83. package/gstack-origin/.agents/skills/gstack-qa/SKILL.md +1006 -0
  84. package/gstack-origin/.agents/skills/gstack-qa-only/SKILL.md +626 -0
  85. package/gstack-origin/.agents/skills/gstack-retro/SKILL.md +1065 -0
  86. package/gstack-origin/.agents/skills/gstack-review/SKILL.md +704 -0
  87. package/gstack-origin/.agents/skills/gstack-setup-browser-cookies/SKILL.md +325 -0
  88. package/gstack-origin/.agents/skills/gstack-setup-deploy/SKILL.md +450 -0
  89. package/gstack-origin/.agents/skills/gstack-ship/SKILL.md +1312 -0
  90. package/gstack-origin/.agents/skills/gstack-unfreeze/SKILL.md +36 -0
  91. package/gstack-origin/.agents/skills/gstack-upgrade/SKILL.md +220 -0
  92. package/gstack-origin/.env.example +5 -0
  93. package/gstack-origin/.github/workflows/skill-docs.yml +17 -0
  94. package/gstack-origin/AGENTS.md +49 -0
  95. package/gstack-origin/ARCHITECTURE.md +359 -0
  96. package/gstack-origin/BROWSER.md +271 -0
  97. package/gstack-origin/CHANGELOG.md +800 -0
  98. package/gstack-origin/CLAUDE.md +284 -0
  99. package/gstack-origin/CONTRIBUTING.md +370 -0
  100. package/gstack-origin/ETHOS.md +129 -0
  101. package/gstack-origin/LICENSE +21 -0
  102. package/gstack-origin/README.md +228 -0
  103. package/gstack-origin/SKILL.md +657 -0
  104. package/gstack-origin/SKILL.md.tmpl +281 -0
  105. package/gstack-origin/TODOS.md +564 -0
  106. package/gstack-origin/VERSION +1 -0
  107. package/gstack-origin/autoplan/SKILL.md +689 -0
  108. package/gstack-origin/autoplan/SKILL.md.tmpl +416 -0
  109. package/gstack-origin/benchmark/SKILL.md +489 -0
  110. package/gstack-origin/benchmark/SKILL.md.tmpl +233 -0
  111. package/gstack-origin/bin/dev-setup +68 -0
  112. package/gstack-origin/bin/dev-teardown +56 -0
  113. package/gstack-origin/bin/gstack-analytics +191 -0
  114. package/gstack-origin/bin/gstack-community-dashboard +113 -0
  115. package/gstack-origin/bin/gstack-config +38 -0
  116. package/gstack-origin/bin/gstack-diff-scope +71 -0
  117. package/gstack-origin/bin/gstack-global-discover.ts +591 -0
  118. package/gstack-origin/bin/gstack-repo-mode +93 -0
  119. package/gstack-origin/bin/gstack-review-log +9 -0
  120. package/gstack-origin/bin/gstack-review-read +12 -0
  121. package/gstack-origin/bin/gstack-slug +15 -0
  122. package/gstack-origin/bin/gstack-telemetry-log +158 -0
  123. package/gstack-origin/bin/gstack-telemetry-sync +127 -0
  124. package/gstack-origin/bin/gstack-update-check +196 -0
  125. package/gstack-origin/browse/SKILL.md +517 -0
  126. package/gstack-origin/browse/SKILL.md.tmpl +141 -0
  127. package/gstack-origin/browse/bin/find-browse +21 -0
  128. package/gstack-origin/browse/bin/remote-slug +14 -0
  129. package/gstack-origin/browse/scripts/build-node-server.sh +48 -0
  130. package/gstack-origin/browse/src/browser-manager.ts +634 -0
  131. package/gstack-origin/browse/src/buffers.ts +137 -0
  132. package/gstack-origin/browse/src/bun-polyfill.cjs +109 -0
  133. package/gstack-origin/browse/src/cli.ts +420 -0
  134. package/gstack-origin/browse/src/commands.ts +111 -0
  135. package/gstack-origin/browse/src/config.ts +150 -0
  136. package/gstack-origin/browse/src/cookie-import-browser.ts +417 -0
  137. package/gstack-origin/browse/src/cookie-picker-routes.ts +207 -0
  138. package/gstack-origin/browse/src/cookie-picker-ui.ts +541 -0
  139. package/gstack-origin/browse/src/find-browse.ts +61 -0
  140. package/gstack-origin/browse/src/meta-commands.ts +269 -0
  141. package/gstack-origin/browse/src/platform.ts +17 -0
  142. package/gstack-origin/browse/src/read-commands.ts +335 -0
  143. package/gstack-origin/browse/src/server.ts +369 -0
  144. package/gstack-origin/browse/src/snapshot.ts +398 -0
  145. package/gstack-origin/browse/src/url-validation.ts +91 -0
  146. package/gstack-origin/browse/src/write-commands.ts +352 -0
  147. package/gstack-origin/browse/test/bun-polyfill.test.ts +72 -0
  148. package/gstack-origin/browse/test/commands.test.ts +1836 -0
  149. package/gstack-origin/browse/test/config.test.ts +250 -0
  150. package/gstack-origin/browse/test/cookie-import-browser.test.ts +397 -0
  151. package/gstack-origin/browse/test/cookie-picker-routes.test.ts +205 -0
  152. package/gstack-origin/browse/test/find-browse.test.ts +50 -0
  153. package/gstack-origin/browse/test/fixtures/basic.html +33 -0
  154. package/gstack-origin/browse/test/fixtures/cursor-interactive.html +22 -0
  155. package/gstack-origin/browse/test/fixtures/dialog.html +15 -0
  156. package/gstack-origin/browse/test/fixtures/empty.html +2 -0
  157. package/gstack-origin/browse/test/fixtures/forms.html +55 -0
  158. package/gstack-origin/browse/test/fixtures/qa-eval-checkout.html +108 -0
  159. package/gstack-origin/browse/test/fixtures/qa-eval-spa.html +98 -0
  160. package/gstack-origin/browse/test/fixtures/qa-eval.html +51 -0
  161. package/gstack-origin/browse/test/fixtures/responsive.html +49 -0
  162. package/gstack-origin/browse/test/fixtures/snapshot.html +55 -0
  163. package/gstack-origin/browse/test/fixtures/spa.html +24 -0
  164. package/gstack-origin/browse/test/fixtures/states.html +17 -0
  165. package/gstack-origin/browse/test/fixtures/upload.html +25 -0
  166. package/gstack-origin/browse/test/gstack-config.test.ts +125 -0
  167. package/gstack-origin/browse/test/gstack-update-check.test.ts +467 -0
  168. package/gstack-origin/browse/test/handoff.test.ts +235 -0
  169. package/gstack-origin/browse/test/path-validation.test.ts +63 -0
  170. package/gstack-origin/browse/test/platform.test.ts +37 -0
  171. package/gstack-origin/browse/test/snapshot.test.ts +467 -0
  172. package/gstack-origin/browse/test/test-server.ts +57 -0
  173. package/gstack-origin/browse/test/url-validation.test.ts +72 -0
  174. package/gstack-origin/canary/SKILL.md +493 -0
  175. package/gstack-origin/canary/SKILL.md.tmpl +220 -0
  176. package/gstack-origin/careful/SKILL.md +59 -0
  177. package/gstack-origin/careful/SKILL.md.tmpl +57 -0
  178. package/gstack-origin/careful/bin/check-careful.sh +112 -0
  179. package/gstack-origin/codex/SKILL.md +677 -0
  180. package/gstack-origin/codex/SKILL.md.tmpl +356 -0
  181. package/gstack-origin/conductor.json +6 -0
  182. package/gstack-origin/cso/SKILL.md +615 -0
  183. package/gstack-origin/cso/SKILL.md.tmpl +376 -0
  184. package/gstack-origin/design-consultation/SKILL.md +625 -0
  185. package/gstack-origin/design-consultation/SKILL.md.tmpl +369 -0
  186. package/gstack-origin/design-review/SKILL.md +998 -0
  187. package/gstack-origin/design-review/SKILL.md.tmpl +262 -0
  188. package/gstack-origin/docs/images/github-2013.png +0 -0
  189. package/gstack-origin/docs/images/github-2026.png +0 -0
  190. package/gstack-origin/docs/skills.md +877 -0
  191. package/gstack-origin/document-release/SKILL.md +613 -0
  192. package/gstack-origin/document-release/SKILL.md.tmpl +357 -0
  193. package/gstack-origin/freeze/SKILL.md +82 -0
  194. package/gstack-origin/freeze/SKILL.md.tmpl +80 -0
  195. package/gstack-origin/freeze/bin/check-freeze.sh +68 -0
  196. package/gstack-origin/gstack-upgrade/SKILL.md +226 -0
  197. package/gstack-origin/gstack-upgrade/SKILL.md.tmpl +224 -0
  198. package/gstack-origin/guard/SKILL.md +82 -0
  199. package/gstack-origin/guard/SKILL.md.tmpl +80 -0
  200. package/gstack-origin/investigate/SKILL.md +435 -0
  201. package/gstack-origin/investigate/SKILL.md.tmpl +196 -0
  202. package/gstack-origin/land-and-deploy/SKILL.md +880 -0
  203. package/gstack-origin/land-and-deploy/SKILL.md.tmpl +575 -0
  204. package/gstack-origin/office-hours/SKILL.md +996 -0
  205. package/gstack-origin/office-hours/SKILL.md.tmpl +624 -0
  206. package/gstack-origin/package.json +55 -0
  207. package/gstack-origin/plan-ceo-review/SKILL.md +1277 -0
  208. package/gstack-origin/plan-ceo-review/SKILL.md.tmpl +838 -0
  209. package/gstack-origin/plan-design-review/SKILL.md +676 -0
  210. package/gstack-origin/plan-design-review/SKILL.md.tmpl +314 -0
  211. package/gstack-origin/plan-eng-review/SKILL.md +836 -0
  212. package/gstack-origin/plan-eng-review/SKILL.md.tmpl +279 -0
  213. package/gstack-origin/qa/SKILL.md +1016 -0
  214. package/gstack-origin/qa/SKILL.md.tmpl +316 -0
  215. package/gstack-origin/qa/references/issue-taxonomy.md +85 -0
  216. package/gstack-origin/qa/templates/qa-report-template.md +126 -0
  217. package/gstack-origin/qa-only/SKILL.md +633 -0
  218. package/gstack-origin/qa-only/SKILL.md.tmpl +101 -0
  219. package/gstack-origin/retro/SKILL.md +1072 -0
  220. package/gstack-origin/retro/SKILL.md.tmpl +833 -0
  221. package/gstack-origin/review/SKILL.md +849 -0
  222. package/gstack-origin/review/SKILL.md.tmpl +259 -0
  223. package/gstack-origin/review/TODOS-format.md +62 -0
  224. package/gstack-origin/review/checklist.md +190 -0
  225. package/gstack-origin/review/design-checklist.md +132 -0
  226. package/gstack-origin/review/greptile-triage.md +220 -0
  227. package/gstack-origin/scripts/analytics.ts +190 -0
  228. package/gstack-origin/scripts/dev-skill.ts +82 -0
  229. package/gstack-origin/scripts/eval-compare.ts +96 -0
  230. package/gstack-origin/scripts/eval-list.ts +116 -0
  231. package/gstack-origin/scripts/eval-select.ts +86 -0
  232. package/gstack-origin/scripts/eval-summary.ts +187 -0
  233. package/gstack-origin/scripts/eval-watch.ts +172 -0
  234. package/gstack-origin/scripts/gen-skill-docs.ts +2414 -0
  235. package/gstack-origin/scripts/skill-check.ts +167 -0
  236. package/gstack-origin/setup +269 -0
  237. package/gstack-origin/setup-browser-cookies/SKILL.md +330 -0
  238. package/gstack-origin/setup-browser-cookies/SKILL.md.tmpl +74 -0
  239. package/gstack-origin/setup-deploy/SKILL.md +459 -0
  240. package/gstack-origin/setup-deploy/SKILL.md.tmpl +220 -0
  241. package/gstack-origin/ship/SKILL.md +1457 -0
  242. package/gstack-origin/ship/SKILL.md.tmpl +528 -0
  243. package/gstack-origin/supabase/config.sh +10 -0
  244. package/gstack-origin/supabase/functions/community-pulse/index.ts +59 -0
  245. package/gstack-origin/supabase/functions/telemetry-ingest/index.ts +135 -0
  246. package/gstack-origin/supabase/functions/update-check/index.ts +37 -0
  247. package/gstack-origin/supabase/migrations/001_telemetry.sql +89 -0
  248. package/gstack-origin/test/analytics.test.ts +277 -0
  249. package/gstack-origin/test/codex-e2e.test.ts +197 -0
  250. package/gstack-origin/test/fixtures/coverage-audit-fixture.ts +76 -0
  251. package/gstack-origin/test/fixtures/eval-baselines.json +7 -0
  252. package/gstack-origin/test/fixtures/qa-eval-checkout-ground-truth.json +43 -0
  253. package/gstack-origin/test/fixtures/qa-eval-ground-truth.json +43 -0
  254. package/gstack-origin/test/fixtures/qa-eval-spa-ground-truth.json +43 -0
  255. package/gstack-origin/test/fixtures/review-eval-design-slop.css +86 -0
  256. package/gstack-origin/test/fixtures/review-eval-design-slop.html +41 -0
  257. package/gstack-origin/test/fixtures/review-eval-enum-diff.rb +30 -0
  258. package/gstack-origin/test/fixtures/review-eval-enum.rb +27 -0
  259. package/gstack-origin/test/fixtures/review-eval-vuln.rb +14 -0
  260. package/gstack-origin/test/gemini-e2e.test.ts +173 -0
  261. package/gstack-origin/test/gen-skill-docs.test.ts +1049 -0
  262. package/gstack-origin/test/global-discover.test.ts +187 -0
  263. package/gstack-origin/test/helpers/codex-session-runner.ts +282 -0
  264. package/gstack-origin/test/helpers/e2e-helpers.ts +239 -0
  265. package/gstack-origin/test/helpers/eval-store.test.ts +548 -0
  266. package/gstack-origin/test/helpers/eval-store.ts +689 -0
  267. package/gstack-origin/test/helpers/gemini-session-runner.test.ts +104 -0
  268. package/gstack-origin/test/helpers/gemini-session-runner.ts +201 -0
  269. package/gstack-origin/test/helpers/llm-judge.ts +130 -0
  270. package/gstack-origin/test/helpers/observability.test.ts +283 -0
  271. package/gstack-origin/test/helpers/session-runner.test.ts +96 -0
  272. package/gstack-origin/test/helpers/session-runner.ts +357 -0
  273. package/gstack-origin/test/helpers/skill-parser.ts +206 -0
  274. package/gstack-origin/test/helpers/touchfiles.ts +260 -0
  275. package/gstack-origin/test/hook-scripts.test.ts +373 -0
  276. package/gstack-origin/test/skill-e2e-browse.test.ts +293 -0
  277. package/gstack-origin/test/skill-e2e-deploy.test.ts +279 -0
  278. package/gstack-origin/test/skill-e2e-design.test.ts +614 -0
  279. package/gstack-origin/test/skill-e2e-plan.test.ts +538 -0
  280. package/gstack-origin/test/skill-e2e-qa-bugs.test.ts +194 -0
  281. package/gstack-origin/test/skill-e2e-qa-workflow.test.ts +412 -0
  282. package/gstack-origin/test/skill-e2e-review.test.ts +535 -0
  283. package/gstack-origin/test/skill-e2e-workflow.test.ts +586 -0
  284. package/gstack-origin/test/skill-e2e.test.ts +3325 -0
  285. package/gstack-origin/test/skill-llm-eval.test.ts +787 -0
  286. package/gstack-origin/test/skill-parser.test.ts +179 -0
  287. package/gstack-origin/test/skill-routing-e2e.test.ts +605 -0
  288. package/gstack-origin/test/skill-validation.test.ts +1520 -0
  289. package/gstack-origin/test/telemetry.test.ts +278 -0
  290. package/gstack-origin/test/touchfiles.test.ts +262 -0
  291. package/gstack-origin/unfreeze/SKILL.md +40 -0
  292. package/gstack-origin/unfreeze/SKILL.md.tmpl +38 -0
  293. package/package.json +38 -0
  294. package/scripts/install-antigravity-skill.ps1 +33 -0
  295. package/scripts/install-antigravity-skill.sh +41 -0
  296. package/scripts/sync-gstack-origin.ps1 +37 -0
  297. package/scripts/sync-gstack-origin.sh +35 -0
@@ -0,0 +1,689 @@
1
+ /**
2
+ * Eval result persistence and comparison.
3
+ *
4
+ * EvalCollector accumulates test results, writes them to
5
+ * ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
6
+ * prints a summary table, and auto-compares with the previous run.
7
+ *
8
+ * Comparison functions are exported for reuse by the eval:compare CLI.
9
+ */
10
+
11
+ import * as fs from 'fs';
12
+ import * as path from 'path';
13
+ import * as os from 'os';
14
+ import { spawnSync } from 'child_process';
15
+
16
/** Schema revision number; presumably the value recorded in `EvalResult.schema_version` (confirm at the write site). */
const SCHEMA_VERSION = 1;

/** Default directory for persisted eval results: `<home>/.gstack-dev/evals`. */
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
18
+
19
+ // --- Interfaces ---
20
+
21
/**
 * Recorded outcome of a single test within an eval run.
 *
 * The first six fields are required. Everything after them is optional and
 * grouped by the kind of diagnostics it carries.
 */
export interface EvalTestEntry {
  name: string;
  suite: string;
  tier: 'e2e' | 'llm-judge';
  passed: boolean;
  duration_ms: number;
  cost_usd: number;

  // ── E2E ──────────────────────────────────────────────────────────────
  transcript?: any[];
  prompt?: string;
  output?: string;
  turns_used?: number;
  browse_errors?: string[];

  // ── LLM judge ────────────────────────────────────────────────────────
  judge_scores?: Record<string, number>;
  judge_reasoning?: string;

  // ── Machine-readable diagnostics ─────────────────────────────────────
  /** 'success' | 'timeout' | 'error_max_turns' | 'exit_code_N' */
  exit_reason?: string;
  /** Which turn was active when the timeout hit. */
  timeout_at_turn?: number;
  /** e.g. "Write(review-output.md)" */
  last_tool_call?: string;

  // ── Model + timing diagnostics (added for Sonnet/Opus split) ─────────
  /** e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6' */
  model?: string;
  /** Time from spawn to first NDJSON line. */
  first_response_ms?: number;
  /** Peak latency between consecutive tool calls. */
  max_inter_turn_ms?: number;

  // ── Outcome eval ─────────────────────────────────────────────────────
  detection_rate?: number;
  false_positives?: number;
  evidence_quality?: number;
  detected_bugs?: string[];
  missed_bugs?: string[];

  error?: string;
}
59
+
60
/**
 * One eval run: run-level metadata, aggregate totals, and the per-test
 * entries.
 */
export interface EvalResult {
  schema_version: number;
  version: string;
  branch: string;
  git_sha: string;
  timestamp: string;
  hostname: string;
  tier: 'e2e' | 'llm-judge';

  total_tests: number;
  passed: number;
  failed: number;
  total_cost_usd: number;
  total_duration_ms: number;
  /** Wall-clock from collector creation to finalization (shows parallelism). */
  wall_clock_ms?: number;

  tests: EvalTestEntry[];

  /** True for incremental saves, absent in the final file. */
  _partial?: boolean;
}
77
+
78
/**
 * Before/after snapshot of one test across two eval runs, plus how its
 * pass/fail status moved between them.
 */
export interface TestDelta {
  name: string;
  /** Metrics taken from the earlier run. */
  before: {
    passed: boolean;
    cost_usd: number;
    turns_used?: number;
    duration_ms?: number;
    detection_rate?: number;
    tool_summary?: Record<string, number>;
  };
  /** Metrics taken from the later run. */
  after: {
    passed: boolean;
    cost_usd: number;
    turns_used?: number;
    duration_ms?: number;
    detection_rate?: number;
    tool_summary?: Record<string, number>;
  };
  status_change: 'improved' | 'regressed' | 'unchanged';
}
86
+
87
/**
 * Result of comparing two eval runs: where each side came from, the
 * per-test deltas, and aggregate movement in cost, duration, status and
 * tool-call counts.
 */
export interface ComparisonResult {
  // Provenance of the two runs
  before_file: string;
  after_file: string;
  before_branch: string;
  after_branch: string;
  before_timestamp: string;
  after_timestamp: string;

  // Per-test detail
  deltas: TestDelta[];

  // Aggregates (each delta is "after minus before")
  total_cost_delta: number;
  total_duration_delta: number;
  improved: number;
  regressed: number;
  unchanged: number;
  tool_count_before: number;
  tool_count_after: number;
}
103
+
104
+ // --- Shared helpers ---
105
+
106
/**
 * Decide whether a planted-bug eval passed by checking the judge's result
 * against the ground-truth thresholds.
 *
 * Centralizes the pass/fail logic so all planted-bug tests use the same criteria.
 * All three conditions must hold:
 *   - `detection_rate`   >= `groundTruth.minimum_detection`
 *   - `false_positives`  <= `groundTruth.max_false_positives`
 *   - `evidence_quality` >= `groundTruth.minimum_evidence_quality` (2 when omitted)
 *
 * @param judgeResult Scores produced by the judge.
 * @param groundTruth Thresholds for the fixture. `minimum_evidence_quality`
 *   is optional and defaults to 2, the floor that used to be hard-coded here.
 * @returns `true` only when every threshold is met. A `NaN` score fails,
 *   because every comparison against `NaN` is false.
 */
export function judgePassed(
  judgeResult: { detection_rate: number; false_positives: number; evidence_quality: number },
  groundTruth: {
    minimum_detection: number;
    max_false_positives: number;
    minimum_evidence_quality?: number;
  },
): boolean {
  const minimumEvidenceQuality = groundTruth.minimum_evidence_quality ?? 2;
  return (
    judgeResult.detection_rate >= groundTruth.minimum_detection &&
    judgeResult.false_positives <= groundTruth.max_false_positives &&
    judgeResult.evidence_quality >= minimumEvidenceQuality
  );
}
118
+
119
+ // --- Comparison functions (exported for eval:compare CLI) ---
120
+
121
/**
 * Count tool calls per tool name in a transcript.
 *
 * Only events with `type === 'assistant'` are inspected. Within each, every
 * item of `message.content` whose `type` is `'tool_use'` adds one to the
 * count for `item.name` (or `'unknown'` when the name is missing/empty).
 *
 * Malformed input is skipped rather than thrown on: null or non-object
 * events and items, and `content` that is not an array (e.g. a plain string).
 *
 * @param transcript Array of transcript events; a non-array yields `{}`.
 * @returns Counts keyed by tool name, e.g. `{ Bash: 8, Read: 3, Write: 1 }`.
 */
export function extractToolSummary(transcript: any[]): Record<string, number> {
  if (!Array.isArray(transcript)) return {};

  // Count in a Map so tool names that collide with Object.prototype members
  // ('constructor', 'toString', ...) start from 0 instead of from an
  // inherited function.
  const counts = new Map<string, number>();
  for (const event of transcript) {
    if (event?.type !== 'assistant') continue;
    const content = event.message?.content;
    if (!Array.isArray(content)) continue;
    for (const item of content) {
      if (item?.type !== 'tool_use') continue;
      const name = String(item.name || 'unknown');
      counts.set(name, (counts.get(name) ?? 0) + 1);
    }
  }
  // fromEntries defines own data properties, so every name stays a plain key.
  return Object.fromEntries(counts);
}
140
+
141
/**
 * Find the most recent prior eval file to compare against.
 *
 * Scans `evalDir` for `*.json` files, keeps those whose top-level `tier`
 * equals `tier`, and orders them by their `timestamp` string, newest first.
 * The newest file recorded on `branch` wins; if there is none, the newest
 * file from any branch is returned.
 *
 * Files that cannot be read or parsed, or whose JSON is not an object, are
 * skipped. A `branch` or `timestamp` that is missing or not a string is
 * treated as `''`, so such a file sorts last instead of breaking the sort.
 *
 * @param evalDir     Directory holding eval result files.
 * @param tier        Tier to match against each file's `tier` field.
 * @param branch      Preferred branch.
 * @param excludeFile File to ignore (typically the run just written); only
 *                    its basename is compared.
 * @returns Full path of the chosen file, or `null` when the directory is
 *          unreadable or holds no matching file.
 */
export function findPreviousRun(
  evalDir: string,
  tier: string,
  branch: string,
  excludeFile: string,
): string | null {
  let files: string[];
  try {
    files = fs.readdirSync(evalDir).filter(f => f.endsWith('.json'));
  } catch {
    return null; // dir doesn't exist (or can't be listed)
  }

  const excludedName = path.basename(excludeFile);
  const candidates: Array<{ file: string; branch: string; timestamp: string }> = [];
  for (const file of files) {
    if (file === excludedName) continue;
    const fullPath = path.join(evalDir, file);

    // Each file is parsed in full; only three top-level fields are used.
    let data: unknown;
    try {
      data = JSON.parse(fs.readFileSync(fullPath, 'utf-8'));
    } catch {
      continue; // unreadable or not valid JSON
    }
    if (typeof data !== 'object' || data === null) continue;

    const record = data as Record<string, unknown>;
    if (record.tier !== tier) continue;
    candidates.push({
      file: fullPath,
      // Normalise to strings so the comparator below can never throw.
      branch: typeof record.branch === 'string' ? record.branch : '',
      timestamp: typeof record.timestamp === 'string' ? record.timestamp : '',
    });
  }

  if (candidates.length === 0) return null;

  // Sort by timestamp descending
  candidates.sort((a, b) => b.timestamp.localeCompare(a.timestamp));

  // Prefer same branch, otherwise fall back to the newest of any branch.
  const sameBranch = candidates.find(e => e.branch === branch);
  return (sameBranch ?? candidates[0])?.file ?? null;
}
184
+
185
/**
 * Diff two eval runs test-by-test, pairing entries on their `name`.
 *
 * - Each test in `after` produces one delta. If a same-named test exists in
 *   `before`, the status is 'improved' (fail → pass), 'regressed'
 *   (pass → fail) or 'unchanged'. A test with no match in `before` is
 *   reported as 'unchanged' with a placeholder "before" side
 *   (`passed: false`, `cost_usd: 0`).
 * - Each test left over in `before` produces a delta named
 *   `"<name> (removed)"`, also counted as 'unchanged', with a placeholder
 *   "after" side.
 * - Tool-call totals are derived from each test's transcript via
 *   `extractToolSummary`; tests without a transcript contribute nothing.
 *
 * @param before     Earlier run.
 * @param after      Later run.
 * @param beforeFile Path recorded as `before_file` in the result.
 * @param afterFile  Path recorded as `after_file` in the result.
 */
export function compareEvalResults(
  before: EvalResult,
  after: EvalResult,
  beforeFile: string,
  afterFile: string,
): ComparisonResult {
  const toolsOf = (test: EvalTestEntry | undefined): Record<string, number> =>
    test?.transcript ? extractToolSummary(test.transcript) : {};
  const callTotal = (summary: Record<string, number>): number =>
    Object.values(summary).reduce((sum, n) => sum + n, 0);

  const tally = { improved: 0, regressed: 0, unchanged: 0 };
  const deltas: TestDelta[] = [];
  let toolCallsBefore = 0;
  let toolCallsAfter = 0;

  // Earlier tests not yet paired with a later one, keyed by name.
  const unmatched = new Map<string, EvalTestEntry>();
  for (const earlier of before.tests) {
    unmatched.set(earlier.name, earlier);
  }

  // Pass 1: every test in the later run.
  for (const current of after.tests) {
    const prior = unmatched.get(current.name);
    const priorTools = toolsOf(prior);
    const currentTools = toolsOf(current);
    toolCallsBefore += callTotal(priorTools);
    toolCallsAfter += callTotal(currentTools);

    // A test with no prior data stays 'unchanged'.
    let status: TestDelta['status_change'] = 'unchanged';
    if (prior) {
      if (!prior.passed && current.passed) status = 'improved';
      else if (prior.passed && !current.passed) status = 'regressed';
    }
    tally[status]++;

    deltas.push({
      name: current.name,
      before: {
        passed: prior?.passed ?? false,
        cost_usd: prior?.cost_usd ?? 0,
        turns_used: prior?.turns_used,
        duration_ms: prior?.duration_ms,
        detection_rate: prior?.detection_rate,
        tool_summary: priorTools,
      },
      after: {
        passed: current.passed,
        cost_usd: current.cost_usd,
        turns_used: current.turns_used,
        duration_ms: current.duration_ms,
        detection_rate: current.detection_rate,
        tool_summary: currentTools,
      },
      status_change: status,
    });

    unmatched.delete(current.name);
  }

  // Pass 2: whatever is still unmatched was removed in the later run.
  for (const [name, removed] of unmatched) {
    const removedTools = toolsOf(removed);
    toolCallsBefore += callTotal(removedTools);
    tally.unchanged++;
    deltas.push({
      name: `${name} (removed)`,
      before: {
        passed: removed.passed,
        cost_usd: removed.cost_usd,
        turns_used: removed.turns_used,
        duration_ms: removed.duration_ms,
        detection_rate: removed.detection_rate,
        tool_summary: removedTools,
      },
      after: { passed: false, cost_usd: 0, tool_summary: {} },
      status_change: 'unchanged',
    });
  }

  return {
    before_file: beforeFile,
    after_file: afterFile,
    before_branch: before.branch,
    after_branch: after.branch,
    before_timestamp: before.timestamp,
    after_timestamp: after.timestamp,
    deltas,
    total_cost_delta: after.total_cost_usd - before.total_cost_usd,
    total_duration_delta: after.total_duration_ms - before.total_duration_ms,
    improved: tally.improved,
    regressed: tally.regressed,
    unchanged: tally.unchanged,
    tool_count_before: toolCallsBefore,
    tool_count_after: toolCallsAfter,
  };
}
287
+
288
/**
 * Render a ComparisonResult as a multi-line, human-readable report.
 *
 * Layout, top to bottom:
 *   1. a "vs previous" header with the earlier run's branch and timestamp,
 *   2. one row per delta: name (truncated/padded to 30 chars), before → after
 *      status, a trend arrow, then either detection rates or costs, followed
 *      by turn and duration changes when available,
 *   3. totals for status, cost, duration and tool calls,
 *   4. a per-tool breakdown listing only tools whose total count changed,
 *   5. a "Takeaway" section with the lines from `generateCommentary`, if any.
 *
 * @returns The report as a single string joined with newlines.
 */
export function formatComparison(c: ComparisonResult): string {
  const rule = '─'.repeat(70);
  const plusIfNonNegative = (n: number): string => (n >= 0 ? '+' : '');

  const previousStamp = c.before_timestamp
    ? c.before_timestamp.replace('T', ' ').slice(0, 16)
    : 'unknown';
  const out: string[] = [
    `\nvs previous: ${c.before_branch}/${c.deltas.length ? 'eval' : ''} (${previousStamp})`,
    rule,
  ];

  // Per-test rows
  for (const delta of c.deltas) {
    const { before, after } = delta;

    let arrow = '=';
    if (delta.status_change === 'improved') arrow = '↑';
    else if (delta.status_change === 'regressed') arrow = '↓';

    const statusBefore = before.passed ? 'PASS' : 'FAIL';
    const statusAfter = after.passed ? 'PASS' : 'FAIL';

    // Turns: "a→bt(+n)" when both sides are known, "bt" when only the later one is.
    let turns = '';
    if (after.turns_used !== undefined) {
      if (before.turns_used !== undefined) {
        const change = after.turns_used - before.turns_used;
        turns = ` ${before.turns_used}→${after.turns_used}t`;
        if (change !== 0) turns += `(${change > 0 ? '+' : ''}${change})`;
      } else {
        turns = ` ${after.turns_used}t`;
      }
    }

    // Duration, in whole seconds, with the same two shapes as turns.
    let duration = '';
    if (after.duration_ms !== undefined) {
      if (before.duration_ms !== undefined) {
        const secondsBefore = Math.round(before.duration_ms / 1000);
        const secondsAfter = Math.round(after.duration_ms / 1000);
        const change = secondsAfter - secondsBefore;
        duration = ` ${secondsBefore}→${secondsAfter}s`;
        if (change !== 0) duration += `(${change > 0 ? '+' : ''}${change})`;
      } else {
        duration = ` ${Math.round(after.duration_ms / 1000)}s`;
      }
    }

    // Detection rate takes precedence over cost when either side reports one.
    const hasDetection =
      before.detection_rate !== undefined || after.detection_rate !== undefined;
    const detail = hasDetection
      ? ` ${before.detection_rate ?? '?'}→${after.detection_rate ?? '?'} det`
      : ` $${before.cost_usd.toFixed(2)}→$${after.cost_usd.toFixed(2)}`;

    const label =
      delta.name.length > 30 ? delta.name.slice(0, 27) + '...' : delta.name.padEnd(30);
    out.push(` ${label} ${statusBefore.padEnd(5)} → ${statusAfter.padEnd(5)} ${arrow}${detail}${turns}${duration}`);
  }

  out.push(rule);

  // Totals
  const statusParts: string[] = [];
  if (c.improved > 0) statusParts.push(`${c.improved} improved`);
  if (c.regressed > 0) statusParts.push(`${c.regressed} regressed`);
  if (c.unchanged > 0) statusParts.push(`${c.unchanged} unchanged`);
  out.push(` Status: ${statusParts.join(', ')}`);

  out.push(` Cost: ${plusIfNonNegative(c.total_cost_delta)}$${c.total_cost_delta.toFixed(2)}`);

  const totalSeconds = Math.round(c.total_duration_delta / 1000);
  out.push(` Duration: ${plusIfNonNegative(totalSeconds)}${totalSeconds}s`);

  const toolChange = c.tool_count_after - c.tool_count_before;
  out.push(` Tool calls: ${c.tool_count_before} → ${c.tool_count_after} (${plusIfNonNegative(toolChange)}${toolChange})`);

  // Tool breakdown: aggregate per-tool counts across all tests, then list
  // only the tools whose total moved.
  const toolNames = new Set<string>();
  const totalsBefore: Record<string, number> = {};
  const totalsAfter: Record<string, number> = {};
  for (const delta of c.deltas) {
    for (const [tool, count] of Object.entries(delta.before.tool_summary || {})) {
      toolNames.add(tool);
      totalsBefore[tool] = (totalsBefore[tool] || 0) + count;
    }
    for (const [tool, count] of Object.entries(delta.after.tool_summary || {})) {
      toolNames.add(tool);
      totalsAfter[tool] = (totalsAfter[tool] || 0) + count;
    }
  }
  for (const tool of [...toolNames].sort()) {
    const countBefore = totalsBefore[tool] || 0;
    const countAfter = totalsAfter[tool] || 0;
    if (countBefore === countAfter) continue;
    const change = countAfter - countBefore;
    out.push(` ${tool}: ${countBefore} → ${countAfter} (${plusIfNonNegative(change)}${change})`);
  }

  // Commentary — interpret what the deltas mean
  const takeaways = generateCommentary(c);
  if (takeaways.length > 0) {
    out.push('', ' Takeaway:');
    for (const takeaway of takeaways) {
      out.push(` ${takeaway}`);
    }
  }

  return out.join('\n');
}
400
+
401
/**
 * Generate human-readable commentary interpreting comparison deltas.
 * Pure function — analyzes the numbers and explains what they mean.
 *
 * Notes are emitted in a fixed order:
 *   1. one "REGRESSION" line per delta whose `status_change` is 'regressed',
 *   2. one "Fixed" line per delta whose `status_change` is 'improved',
 *   3. one line per unchanged, currently-passing test that has at least one
 *      notable efficiency change (turns, duration, detection rate, cost),
 *   4. a single overall line, only when there are at least 3 deltas and no
 *      regressions.
 *
 * @param c - Comparison to interpret; reads `deltas`, `total_cost_delta`
 *            and `total_duration_delta`.
 * @returns The commentary lines, possibly empty.
 */
export function generateCommentary(c: ComparisonResult): string[] {
  const notes: string[] = [];

  // 1. Regressions are the most important signal — call them out first
  const regressions = c.deltas.filter(d => d.status_change === 'regressed');
  for (const d of regressions) {
    notes.push(`REGRESSION: "${d.name}" was passing, now fails. Investigate immediately.`);
  }

  // 2. Improvements
  for (const d of c.deltas) {
    if (d.status_change === 'improved') {
      notes.push(`Fixed: "${d.name}" now passes.`);
    }
  }

  // 3. Per-test efficiency changes (only for unchanged-status tests that pass —
  //    regressions/improvements are already noted above)
  for (const d of c.deltas) {
    if (d.status_change !== 'unchanged' || !d.after.passed) continue;
    const insights = describeEfficiencyChanges(d);
    if (insights.length > 0) {
      notes.push(`"${d.name}": ${insights.join(', ')}.`);
    }
  }

  // 4. Overall summary. The guard already guarantees there are no regressions,
  //    so both outcomes can state that unconditionally.
  if (c.deltas.length >= 3 && regressions.length === 0) {
    const overallParts = describeOverallTrends(c);
    if (overallParts.length > 0) {
      notes.push(`Overall: ${overallParts.join(', ')}. No regressions.`);
    } else {
      notes.push('Stable run — no significant efficiency changes, no regressions.');
    }
  }

  return notes;
}

/** Lists the notable turn/duration/detection/cost changes for a single test delta. */
function describeEfficiencyChanges(d: ComparisonResult['deltas'][number]): string[] {
  const insights: string[] = [];
  const { before, after } = d;

  // Turns: needs a non-zero baseline, a swing of at least 20% and at least 2 turns.
  if (before.turns_used !== undefined && after.turns_used !== undefined && before.turns_used > 0) {
    const turnsDelta = after.turns_used - before.turns_used;
    const turnsPct = Math.round((turnsDelta / before.turns_used) * 100);
    if (Math.abs(turnsPct) >= 20 && Math.abs(turnsDelta) >= 2) {
      insights.push(
        turnsDelta < 0
          ? `${Math.abs(turnsDelta)} fewer turns (${Math.abs(turnsPct)}% more efficient)`
          : `${turnsDelta} more turns (${turnsPct}% less efficient)`,
      );
    }
  }

  // Duration: needs a non-zero baseline, a swing of at least 20% and at least 5000 ms.
  if (before.duration_ms !== undefined && after.duration_ms !== undefined && before.duration_ms > 0) {
    const durDelta = after.duration_ms - before.duration_ms;
    const durPct = Math.round((durDelta / before.duration_ms) * 100);
    if (Math.abs(durPct) >= 20 && Math.abs(durDelta) >= 5000) {
      insights.push(
        durDelta < 0
          ? `${Math.round(Math.abs(durDelta) / 1000)}s faster`
          : `${Math.round(durDelta / 1000)}s slower`,
      );
    }
  }

  // Detection rate: any non-zero change is reported.
  if (before.detection_rate !== undefined && after.detection_rate !== undefined) {
    const detDelta = after.detection_rate - before.detection_rate;
    if (detDelta > 0) {
      insights.push(`detecting ${detDelta} more bug${detDelta > 1 ? 's' : ''}`);
    } else if (detDelta < 0) {
      const fewer = Math.abs(detDelta);
      insights.push(`detecting ${fewer} fewer bug${fewer > 1 ? 's' : ''} — check prompt quality`);
    }
  }

  // Cost: needs a non-zero baseline, a swing of at least 30% and at least $0.05.
  if (before.cost_usd > 0) {
    const costDelta = after.cost_usd - before.cost_usd;
    const costPct = Math.round((costDelta / before.cost_usd) * 100);
    if (Math.abs(costPct) >= 30 && Math.abs(costDelta) >= 0.05) {
      insights.push(costDelta < 0 ? `${Math.abs(costPct)}% cheaper` : `${costPct}% more expensive`);
    }
  }

  return insights;
}

/** Lists run-wide cost/duration/turn shifts of at least 10% relative to the "before" totals. */
function describeOverallTrends(c: ComparisonResult): string[] {
  const parts: string[] = [];

  // Total cost
  const totalCostBefore = c.deltas.reduce((s, d) => s + d.before.cost_usd, 0);
  if (totalCostBefore > 0) {
    const costPct = Math.round((c.total_cost_delta / totalCostBefore) * 100);
    if (Math.abs(costPct) >= 10) {
      parts.push(`${Math.abs(costPct)}% ${costPct < 0 ? 'cheaper' : 'more expensive'} overall`);
    }
  }

  // Total duration (a missing duration counts as 0)
  const totalDurBefore = c.deltas.reduce((s, d) => s + (d.before.duration_ms || 0), 0);
  if (totalDurBefore > 0) {
    const durPct = Math.round((c.total_duration_delta / totalDurBefore) * 100);
    if (Math.abs(durPct) >= 10) {
      parts.push(`${Math.abs(durPct)}% ${durPct < 0 ? 'faster' : 'slower'}`);
    }
  }

  // Total turns (a missing turn count counts as 0)
  const turnsBefore = c.deltas.reduce((s, d) => s + (d.before.turns_used || 0), 0);
  const turnsAfter = c.deltas.reduce((s, d) => s + (d.after.turns_used || 0), 0);
  if (turnsBefore > 0) {
    const turnsPct = Math.round(((turnsAfter - turnsBefore) / turnsBefore) * 100);
    if (Math.abs(turnsPct) >= 10) {
      parts.push(`${Math.abs(turnsPct)}% ${turnsPct < 0 ? 'fewer' : 'more'} turns`);
    }
  }

  return parts;
}
524
+
525
+ // --- EvalCollector ---
526
+
527
/**
 * Read the current git branch name and abbreviated commit SHA.
 *
 * Runs `git rev-parse --abbrev-ref HEAD` and `git rev-parse --short HEAD`
 * synchronously (5 s timeout each) in the process's working directory.
 * Never throws: each field falls back to 'unknown' when git cannot be run,
 * exits non-zero, or prints nothing.
 *
 * @returns `{ branch, sha }`, each either git's trimmed output or 'unknown'.
 */
function getGitInfo(): { branch: string; sha: string } {
  const revParse = (flag: string): string => {
    const res = spawnSync('git', ['rev-parse', flag, 'HEAD'], { stdio: 'pipe', timeout: 5000 });
    // Only trust stdout from a successful run: rev-parse can echo its argument
    // on stdout and still fail (e.g. when HEAD does not resolve to a commit).
    // `status` is null when the child was killed (timeout) or never started.
    if (res.status !== 0) return 'unknown';
    return res.stdout?.toString().trim() || 'unknown';
  };

  try {
    return {
      branch: revParse('--abbrev-ref'),
      sha: revParse('--short'),
    };
  } catch {
    return { branch: 'unknown', sha: 'unknown' };
  }
}
539
+
540
/**
 * Read the `version` field from the `package.json` located two directories
 * above this module's directory.
 *
 * Never throws: returns 'unknown' when the file cannot be read or parsed, or
 * when `version` is missing, empty, or not a string.
 *
 * @returns The version string, or 'unknown'.
 */
function getVersion(): string {
  try {
    const pkgPath = path.resolve(__dirname, '..', '..', 'package.json');
    // Treat the parsed JSON as untrusted and narrow it before use, so a
    // malformed manifest can never leak a non-string out of this function.
    const pkg: unknown = JSON.parse(fs.readFileSync(pkgPath, 'utf-8'));
    if (typeof pkg === 'object' && pkg !== null) {
      const { version } = pkg as { version?: unknown };
      if (typeof version === 'string' && version !== '') {
        return version;
      }
    }
    return 'unknown';
  } catch {
    return 'unknown';
  }
}
549
+
550
/**
 * Collects per-test eval entries during a run and persists them as JSON
 * under `evalDir`.
 *
 * - `addTest` records an entry and immediately rewrites a partial snapshot.
 * - `finalize` writes the final result file, prints a summary table to
 *   stderr, and prints a comparison against the previous run when
 *   `findPreviousRun` returns one.
 */
export class EvalCollector {
  private readonly tier: 'e2e' | 'llm-judge';
  private tests: EvalTestEntry[] = [];
  private finalized = false;
  private readonly evalDir: string;
  // Captured at construction; used to compute `wall_clock_ms` in finalize().
  private readonly createdAt = Date.now();

  /**
   * @param tier - Which eval tier this run belongs to; stored in the result and the filename.
   * @param evalDir - Output directory; falls back to `DEFAULT_EVAL_DIR` when omitted or empty.
   */
  constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
    this.tier = tier;
    this.evalDir = evalDir || DEFAULT_EVAL_DIR;
  }

  /** Record one test entry and refresh the partial snapshot on disk. */
  addTest(entry: EvalTestEntry): void {
    this.tests.push(entry);
    this.savePartial();
  }

  /**
   * Write incremental results after each test. Atomic write, non-fatal.
   *
   * The snapshot is written to a `.tmp` sibling and then renamed into place.
   * The target is always `_partial-e2e.json`, whatever the tier. Any error is
   * swallowed on purpose: partial saves are best-effort.
   */
  savePartial(): void {
    try {
      const partial: EvalResult = {
        ...this.buildSummary(getGitInfo(), new Date().toISOString()),
        tests: this.tests,
        _partial: true,
      };

      fs.mkdirSync(this.evalDir, { recursive: true });
      const partialPath = path.join(this.evalDir, '_partial-e2e.json');
      const tmp = partialPath + '.tmp';
      fs.writeFileSync(tmp, JSON.stringify(partial, null, 2) + '\n');
      fs.renameSync(tmp, partialPath);
    } catch { /* non-fatal — partial saves are best-effort */ }
  }

  /**
   * Write the final eval file, print the summary table, and auto-compare
   * with the previous run.
   *
   * Only the first call does any work; later calls return ''. Errors while
   * writing the eval file propagate; errors while comparing are reported on
   * stderr and do not fail the call.
   *
   * @returns Path of the written eval file, or '' if already finalized.
   */
  async finalize(): Promise<string> {
    if (this.finalized) return '';
    this.finalized = true;

    const git = getGitInfo();
    const timestamp = new Date().toISOString();

    const result: EvalResult = {
      ...this.buildSummary(git, timestamp),
      wall_clock_ms: Date.now() - this.createdAt,
      tests: this.tests,
    };

    // Write eval file
    fs.mkdirSync(this.evalDir, { recursive: true });
    // ISO timestamp with ':' and '.' stripped and 'T' turned into '-', cut to
    // 15 chars: "YYYY-MM-DD-HHMM". Resolution is one minute, so two runs with
    // the same version/branch/tier inside the same minute share a filename.
    const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
    // Branch names may contain '/' and other characters unsafe in filenames.
    const safeBranch = git.branch.replace(/[^a-zA-Z0-9._-]/g, '-');
    const filename = `${result.version}-${safeBranch}-${this.tier}-${dateStr}.json`;
    const filepath = path.join(this.evalDir, filename);
    fs.writeFileSync(filepath, JSON.stringify(result, null, 2) + '\n');

    // Print summary table
    this.printSummary(result, filepath, git);

    // Auto-compare with previous run
    try {
      const prevFile = findPreviousRun(this.evalDir, this.tier, git.branch, filepath);
      if (prevFile) {
        const prevResult: EvalResult = JSON.parse(fs.readFileSync(prevFile, 'utf-8'));
        const comparison = compareEvalResults(prevResult, result, prevFile, filepath);
        process.stderr.write(formatComparison(comparison) + '\n');
      } else {
        process.stderr.write('\nFirst run — no comparison available.\n');
      }
    } catch (err: unknown) {
      // Anything can be thrown; only Error instances are known to carry `.message`.
      const message = err instanceof Error ? err.message : String(err);
      process.stderr.write(`\nCompare error: ${message}\n`);
    }

    return filepath;
  }

  /**
   * Build the summary fields shared by partial and final results.
   * The caller appends `tests` (and `wall_clock_ms` / `_partial`) so the key
   * order of the serialized JSON stays the same for both kinds of file.
   */
  private buildSummary(git: { branch: string; sha: string }, timestamp: string) {
    const totalCost = this.tests.reduce((s, t) => s + t.cost_usd, 0);
    const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
    const passed = this.tests.filter(t => t.passed).length;

    return {
      schema_version: SCHEMA_VERSION,
      version: getVersion(),
      branch: git.branch,
      git_sha: git.sha,
      timestamp,
      hostname: os.hostname(),
      tier: this.tier,
      total_tests: this.tests.length,
      passed,
      failed: this.tests.length - passed,
      // Round to whole cents.
      total_cost_usd: Math.round(totalCost * 100) / 100,
      total_duration_ms: totalDuration,
    };
  }

  /** Print a per-test table plus totals and the saved file path to stderr. */
  private printSummary(result: EvalResult, filepath: string, git: { branch: string; sha: string }): void {
    const lines: string[] = [];
    lines.push('');
    lines.push(`Eval Results — v${result.version} @ ${git.branch} (${git.sha}) — ${this.tier}`);
    lines.push('═'.repeat(70));

    for (const t of this.tests) {
      const status = t.passed ? ' PASS ' : ' FAIL ';
      const cost = `$${t.cost_usd.toFixed(2)}`;
      // A missing or zero duration is shown as an empty column.
      const dur = t.duration_ms ? `${Math.round(t.duration_ms / 1000)}s` : '';
      const turns = t.turns_used !== undefined ? `${t.turns_used}t` : '';

      let detail = '';
      if (t.detection_rate !== undefined) {
        // Denominator is the number of detected plus missed bugs.
        detail = `${t.detection_rate}/${(t.detected_bugs?.length || 0) + (t.missed_bugs?.length || 0)} det`;
      } else if (t.judge_scores) {
        // Each score is abbreviated to the first character of its key.
        const scores = Object.entries(t.judge_scores).map(([k, v]) => `${k[0]}:${v}`).join(' ');
        detail = scores;
      }

      // Fixed-width name column: truncate with an ellipsis or pad to 35 chars.
      const name = t.name.length > 35 ? t.name.slice(0, 32) + '...' : t.name.padEnd(35);
      lines.push(` ${name} ${status} ${cost.padStart(6)} ${turns.padStart(4)} ${dur.padStart(5)} ${detail}`);
    }

    lines.push('─'.repeat(70));
    const totalCost = `$${result.total_cost_usd.toFixed(2)}`;
    const totalDur = `${Math.round(result.total_duration_ms / 1000)}s`;
    lines.push(` Total: ${result.passed}/${result.total_tests} passed${' '.repeat(20)}${totalCost.padStart(6)} ${totalDur}`);
    lines.push(`Saved: ${filepath}`);

    process.stderr.write(lines.join('\n') + '\n');
  }
}