@runchr/gstack-antigravity 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (297) hide show
  1. package/.agents/rules/ETHOS.md +129 -0
  2. package/.agents/rules/global-gstack.md +117 -0
  3. package/.agents/rules/persona-gstack-autoplan.md +14 -0
  4. package/.agents/rules/persona-gstack-benchmark.md +14 -0
  5. package/.agents/rules/persona-gstack-browse.md +14 -0
  6. package/.agents/rules/persona-gstack-canary.md +14 -0
  7. package/.agents/rules/persona-gstack-careful.md +14 -0
  8. package/.agents/rules/persona-gstack-codex.md +14 -0
  9. package/.agents/rules/persona-gstack-cso.md +14 -0
  10. package/.agents/rules/persona-gstack-design-consultation.md +14 -0
  11. package/.agents/rules/persona-gstack-design-review.md +14 -0
  12. package/.agents/rules/persona-gstack-document-release.md +14 -0
  13. package/.agents/rules/persona-gstack-freeze.md +14 -0
  14. package/.agents/rules/persona-gstack-gstack-upgrade.md +14 -0
  15. package/.agents/rules/persona-gstack-guard.md +14 -0
  16. package/.agents/rules/persona-gstack-investigate.md +14 -0
  17. package/.agents/rules/persona-gstack-land-and-deploy.md +14 -0
  18. package/.agents/rules/persona-gstack-office-hours.md +14 -0
  19. package/.agents/rules/persona-gstack-plan-ceo-review.md +14 -0
  20. package/.agents/rules/persona-gstack-plan-design-review.md +14 -0
  21. package/.agents/rules/persona-gstack-plan-eng-review.md +14 -0
  22. package/.agents/rules/persona-gstack-qa-only.md +14 -0
  23. package/.agents/rules/persona-gstack-qa.md +14 -0
  24. package/.agents/rules/persona-gstack-retro.md +14 -0
  25. package/.agents/rules/persona-gstack-review.md +14 -0
  26. package/.agents/rules/persona-gstack-setup-browser-cookies.md +14 -0
  27. package/.agents/rules/persona-gstack-setup-deploy.md +14 -0
  28. package/.agents/rules/persona-gstack-ship.md +14 -0
  29. package/.agents/rules/persona-gstack-unfreeze.md +14 -0
  30. package/.agents/rules/persona-gstack.md +40 -0
  31. package/.agents/rules/recursive-identities.md +22 -0
  32. package/.agents/workflows/autoplan.md +30 -0
  33. package/.agents/workflows/benchmark.md +31 -0
  34. package/.agents/workflows/browse.md +26 -0
  35. package/.agents/workflows/canary.md +33 -0
  36. package/.agents/workflows/careful.md +22 -0
  37. package/.agents/workflows/codex.md +36 -0
  38. package/.agents/workflows/cso.md +29 -0
  39. package/.agents/workflows/design-consultation.md +28 -0
  40. package/.agents/workflows/design-review.md +28 -0
  41. package/.agents/workflows/document-release.md +32 -0
  42. package/.agents/workflows/freeze.md +17 -0
  43. package/.agents/workflows/gstack-upgrade.md +54 -0
  44. package/.agents/workflows/gstack.md +56 -0
  45. package/.agents/workflows/guard.md +18 -0
  46. package/.agents/workflows/investigate.md +37 -0
  47. package/.agents/workflows/land-and-deploy.md +35 -0
  48. package/.agents/workflows/office-hours.md +27 -0
  49. package/.agents/workflows/plan-ceo-review.md +34 -0
  50. package/.agents/workflows/plan-design-review.md +31 -0
  51. package/.agents/workflows/plan-eng-review.md +28 -0
  52. package/.agents/workflows/qa-only.md +28 -0
  53. package/.agents/workflows/qa.md +73 -0
  54. package/.agents/workflows/retro.md +34 -0
  55. package/.agents/workflows/review.md +30 -0
  56. package/.agents/workflows/setup-browser-cookies.md +15 -0
  57. package/.agents/workflows/setup-cookies.md +8 -0
  58. package/.agents/workflows/setup-deploy.md +21 -0
  59. package/.agents/workflows/ship.md +93 -0
  60. package/.agents/workflows/unfreeze.md +12 -0
  61. package/LICENSE +22 -0
  62. package/README.md +189 -0
  63. package/README_KO.md +191 -0
  64. package/bin/install.js +105 -0
  65. package/gstack-origin/.agents/skills/gstack/SKILL.md +651 -0
  66. package/gstack-origin/.agents/skills/gstack-autoplan/SKILL.md +678 -0
  67. package/gstack-origin/.agents/skills/gstack-benchmark/SKILL.md +482 -0
  68. package/gstack-origin/.agents/skills/gstack-browse/SKILL.md +511 -0
  69. package/gstack-origin/.agents/skills/gstack-canary/SKILL.md +486 -0
  70. package/gstack-origin/.agents/skills/gstack-careful/SKILL.md +50 -0
  71. package/gstack-origin/.agents/skills/gstack-cso/SKILL.md +607 -0
  72. package/gstack-origin/.agents/skills/gstack-design-consultation/SKILL.md +615 -0
  73. package/gstack-origin/.agents/skills/gstack-design-review/SKILL.md +988 -0
  74. package/gstack-origin/.agents/skills/gstack-document-release/SKILL.md +604 -0
  75. package/gstack-origin/.agents/skills/gstack-freeze/SKILL.md +67 -0
  76. package/gstack-origin/.agents/skills/gstack-guard/SKILL.md +62 -0
  77. package/gstack-origin/.agents/skills/gstack-investigate/SKILL.md +415 -0
  78. package/gstack-origin/.agents/skills/gstack-land-and-deploy/SKILL.md +873 -0
  79. package/gstack-origin/.agents/skills/gstack-office-hours/SKILL.md +986 -0
  80. package/gstack-origin/.agents/skills/gstack-plan-ceo-review/SKILL.md +1268 -0
  81. package/gstack-origin/.agents/skills/gstack-plan-design-review/SKILL.md +668 -0
  82. package/gstack-origin/.agents/skills/gstack-plan-eng-review/SKILL.md +826 -0
  83. package/gstack-origin/.agents/skills/gstack-qa/SKILL.md +1006 -0
  84. package/gstack-origin/.agents/skills/gstack-qa-only/SKILL.md +626 -0
  85. package/gstack-origin/.agents/skills/gstack-retro/SKILL.md +1065 -0
  86. package/gstack-origin/.agents/skills/gstack-review/SKILL.md +704 -0
  87. package/gstack-origin/.agents/skills/gstack-setup-browser-cookies/SKILL.md +325 -0
  88. package/gstack-origin/.agents/skills/gstack-setup-deploy/SKILL.md +450 -0
  89. package/gstack-origin/.agents/skills/gstack-ship/SKILL.md +1312 -0
  90. package/gstack-origin/.agents/skills/gstack-unfreeze/SKILL.md +36 -0
  91. package/gstack-origin/.agents/skills/gstack-upgrade/SKILL.md +220 -0
  92. package/gstack-origin/.env.example +5 -0
  93. package/gstack-origin/.github/workflows/skill-docs.yml +17 -0
  94. package/gstack-origin/AGENTS.md +49 -0
  95. package/gstack-origin/ARCHITECTURE.md +359 -0
  96. package/gstack-origin/BROWSER.md +271 -0
  97. package/gstack-origin/CHANGELOG.md +800 -0
  98. package/gstack-origin/CLAUDE.md +284 -0
  99. package/gstack-origin/CONTRIBUTING.md +370 -0
  100. package/gstack-origin/ETHOS.md +129 -0
  101. package/gstack-origin/LICENSE +21 -0
  102. package/gstack-origin/README.md +228 -0
  103. package/gstack-origin/SKILL.md +657 -0
  104. package/gstack-origin/SKILL.md.tmpl +281 -0
  105. package/gstack-origin/TODOS.md +564 -0
  106. package/gstack-origin/VERSION +1 -0
  107. package/gstack-origin/autoplan/SKILL.md +689 -0
  108. package/gstack-origin/autoplan/SKILL.md.tmpl +416 -0
  109. package/gstack-origin/benchmark/SKILL.md +489 -0
  110. package/gstack-origin/benchmark/SKILL.md.tmpl +233 -0
  111. package/gstack-origin/bin/dev-setup +68 -0
  112. package/gstack-origin/bin/dev-teardown +56 -0
  113. package/gstack-origin/bin/gstack-analytics +191 -0
  114. package/gstack-origin/bin/gstack-community-dashboard +113 -0
  115. package/gstack-origin/bin/gstack-config +38 -0
  116. package/gstack-origin/bin/gstack-diff-scope +71 -0
  117. package/gstack-origin/bin/gstack-global-discover.ts +591 -0
  118. package/gstack-origin/bin/gstack-repo-mode +93 -0
  119. package/gstack-origin/bin/gstack-review-log +9 -0
  120. package/gstack-origin/bin/gstack-review-read +12 -0
  121. package/gstack-origin/bin/gstack-slug +15 -0
  122. package/gstack-origin/bin/gstack-telemetry-log +158 -0
  123. package/gstack-origin/bin/gstack-telemetry-sync +127 -0
  124. package/gstack-origin/bin/gstack-update-check +196 -0
  125. package/gstack-origin/browse/SKILL.md +517 -0
  126. package/gstack-origin/browse/SKILL.md.tmpl +141 -0
  127. package/gstack-origin/browse/bin/find-browse +21 -0
  128. package/gstack-origin/browse/bin/remote-slug +14 -0
  129. package/gstack-origin/browse/scripts/build-node-server.sh +48 -0
  130. package/gstack-origin/browse/src/browser-manager.ts +634 -0
  131. package/gstack-origin/browse/src/buffers.ts +137 -0
  132. package/gstack-origin/browse/src/bun-polyfill.cjs +109 -0
  133. package/gstack-origin/browse/src/cli.ts +420 -0
  134. package/gstack-origin/browse/src/commands.ts +111 -0
  135. package/gstack-origin/browse/src/config.ts +150 -0
  136. package/gstack-origin/browse/src/cookie-import-browser.ts +417 -0
  137. package/gstack-origin/browse/src/cookie-picker-routes.ts +207 -0
  138. package/gstack-origin/browse/src/cookie-picker-ui.ts +541 -0
  139. package/gstack-origin/browse/src/find-browse.ts +61 -0
  140. package/gstack-origin/browse/src/meta-commands.ts +269 -0
  141. package/gstack-origin/browse/src/platform.ts +17 -0
  142. package/gstack-origin/browse/src/read-commands.ts +335 -0
  143. package/gstack-origin/browse/src/server.ts +369 -0
  144. package/gstack-origin/browse/src/snapshot.ts +398 -0
  145. package/gstack-origin/browse/src/url-validation.ts +91 -0
  146. package/gstack-origin/browse/src/write-commands.ts +352 -0
  147. package/gstack-origin/browse/test/bun-polyfill.test.ts +72 -0
  148. package/gstack-origin/browse/test/commands.test.ts +1836 -0
  149. package/gstack-origin/browse/test/config.test.ts +250 -0
  150. package/gstack-origin/browse/test/cookie-import-browser.test.ts +397 -0
  151. package/gstack-origin/browse/test/cookie-picker-routes.test.ts +205 -0
  152. package/gstack-origin/browse/test/find-browse.test.ts +50 -0
  153. package/gstack-origin/browse/test/fixtures/basic.html +33 -0
  154. package/gstack-origin/browse/test/fixtures/cursor-interactive.html +22 -0
  155. package/gstack-origin/browse/test/fixtures/dialog.html +15 -0
  156. package/gstack-origin/browse/test/fixtures/empty.html +2 -0
  157. package/gstack-origin/browse/test/fixtures/forms.html +55 -0
  158. package/gstack-origin/browse/test/fixtures/qa-eval-checkout.html +108 -0
  159. package/gstack-origin/browse/test/fixtures/qa-eval-spa.html +98 -0
  160. package/gstack-origin/browse/test/fixtures/qa-eval.html +51 -0
  161. package/gstack-origin/browse/test/fixtures/responsive.html +49 -0
  162. package/gstack-origin/browse/test/fixtures/snapshot.html +55 -0
  163. package/gstack-origin/browse/test/fixtures/spa.html +24 -0
  164. package/gstack-origin/browse/test/fixtures/states.html +17 -0
  165. package/gstack-origin/browse/test/fixtures/upload.html +25 -0
  166. package/gstack-origin/browse/test/gstack-config.test.ts +125 -0
  167. package/gstack-origin/browse/test/gstack-update-check.test.ts +467 -0
  168. package/gstack-origin/browse/test/handoff.test.ts +235 -0
  169. package/gstack-origin/browse/test/path-validation.test.ts +63 -0
  170. package/gstack-origin/browse/test/platform.test.ts +37 -0
  171. package/gstack-origin/browse/test/snapshot.test.ts +467 -0
  172. package/gstack-origin/browse/test/test-server.ts +57 -0
  173. package/gstack-origin/browse/test/url-validation.test.ts +72 -0
  174. package/gstack-origin/canary/SKILL.md +493 -0
  175. package/gstack-origin/canary/SKILL.md.tmpl +220 -0
  176. package/gstack-origin/careful/SKILL.md +59 -0
  177. package/gstack-origin/careful/SKILL.md.tmpl +57 -0
  178. package/gstack-origin/careful/bin/check-careful.sh +112 -0
  179. package/gstack-origin/codex/SKILL.md +677 -0
  180. package/gstack-origin/codex/SKILL.md.tmpl +356 -0
  181. package/gstack-origin/conductor.json +6 -0
  182. package/gstack-origin/cso/SKILL.md +615 -0
  183. package/gstack-origin/cso/SKILL.md.tmpl +376 -0
  184. package/gstack-origin/design-consultation/SKILL.md +625 -0
  185. package/gstack-origin/design-consultation/SKILL.md.tmpl +369 -0
  186. package/gstack-origin/design-review/SKILL.md +998 -0
  187. package/gstack-origin/design-review/SKILL.md.tmpl +262 -0
  188. package/gstack-origin/docs/images/github-2013.png +0 -0
  189. package/gstack-origin/docs/images/github-2026.png +0 -0
  190. package/gstack-origin/docs/skills.md +877 -0
  191. package/gstack-origin/document-release/SKILL.md +613 -0
  192. package/gstack-origin/document-release/SKILL.md.tmpl +357 -0
  193. package/gstack-origin/freeze/SKILL.md +82 -0
  194. package/gstack-origin/freeze/SKILL.md.tmpl +80 -0
  195. package/gstack-origin/freeze/bin/check-freeze.sh +68 -0
  196. package/gstack-origin/gstack-upgrade/SKILL.md +226 -0
  197. package/gstack-origin/gstack-upgrade/SKILL.md.tmpl +224 -0
  198. package/gstack-origin/guard/SKILL.md +82 -0
  199. package/gstack-origin/guard/SKILL.md.tmpl +80 -0
  200. package/gstack-origin/investigate/SKILL.md +435 -0
  201. package/gstack-origin/investigate/SKILL.md.tmpl +196 -0
  202. package/gstack-origin/land-and-deploy/SKILL.md +880 -0
  203. package/gstack-origin/land-and-deploy/SKILL.md.tmpl +575 -0
  204. package/gstack-origin/office-hours/SKILL.md +996 -0
  205. package/gstack-origin/office-hours/SKILL.md.tmpl +624 -0
  206. package/gstack-origin/package.json +55 -0
  207. package/gstack-origin/plan-ceo-review/SKILL.md +1277 -0
  208. package/gstack-origin/plan-ceo-review/SKILL.md.tmpl +838 -0
  209. package/gstack-origin/plan-design-review/SKILL.md +676 -0
  210. package/gstack-origin/plan-design-review/SKILL.md.tmpl +314 -0
  211. package/gstack-origin/plan-eng-review/SKILL.md +836 -0
  212. package/gstack-origin/plan-eng-review/SKILL.md.tmpl +279 -0
  213. package/gstack-origin/qa/SKILL.md +1016 -0
  214. package/gstack-origin/qa/SKILL.md.tmpl +316 -0
  215. package/gstack-origin/qa/references/issue-taxonomy.md +85 -0
  216. package/gstack-origin/qa/templates/qa-report-template.md +126 -0
  217. package/gstack-origin/qa-only/SKILL.md +633 -0
  218. package/gstack-origin/qa-only/SKILL.md.tmpl +101 -0
  219. package/gstack-origin/retro/SKILL.md +1072 -0
  220. package/gstack-origin/retro/SKILL.md.tmpl +833 -0
  221. package/gstack-origin/review/SKILL.md +849 -0
  222. package/gstack-origin/review/SKILL.md.tmpl +259 -0
  223. package/gstack-origin/review/TODOS-format.md +62 -0
  224. package/gstack-origin/review/checklist.md +190 -0
  225. package/gstack-origin/review/design-checklist.md +132 -0
  226. package/gstack-origin/review/greptile-triage.md +220 -0
  227. package/gstack-origin/scripts/analytics.ts +190 -0
  228. package/gstack-origin/scripts/dev-skill.ts +82 -0
  229. package/gstack-origin/scripts/eval-compare.ts +96 -0
  230. package/gstack-origin/scripts/eval-list.ts +116 -0
  231. package/gstack-origin/scripts/eval-select.ts +86 -0
  232. package/gstack-origin/scripts/eval-summary.ts +187 -0
  233. package/gstack-origin/scripts/eval-watch.ts +172 -0
  234. package/gstack-origin/scripts/gen-skill-docs.ts +2414 -0
  235. package/gstack-origin/scripts/skill-check.ts +167 -0
  236. package/gstack-origin/setup +269 -0
  237. package/gstack-origin/setup-browser-cookies/SKILL.md +330 -0
  238. package/gstack-origin/setup-browser-cookies/SKILL.md.tmpl +74 -0
  239. package/gstack-origin/setup-deploy/SKILL.md +459 -0
  240. package/gstack-origin/setup-deploy/SKILL.md.tmpl +220 -0
  241. package/gstack-origin/ship/SKILL.md +1457 -0
  242. package/gstack-origin/ship/SKILL.md.tmpl +528 -0
  243. package/gstack-origin/supabase/config.sh +10 -0
  244. package/gstack-origin/supabase/functions/community-pulse/index.ts +59 -0
  245. package/gstack-origin/supabase/functions/telemetry-ingest/index.ts +135 -0
  246. package/gstack-origin/supabase/functions/update-check/index.ts +37 -0
  247. package/gstack-origin/supabase/migrations/001_telemetry.sql +89 -0
  248. package/gstack-origin/test/analytics.test.ts +277 -0
  249. package/gstack-origin/test/codex-e2e.test.ts +197 -0
  250. package/gstack-origin/test/fixtures/coverage-audit-fixture.ts +76 -0
  251. package/gstack-origin/test/fixtures/eval-baselines.json +7 -0
  252. package/gstack-origin/test/fixtures/qa-eval-checkout-ground-truth.json +43 -0
  253. package/gstack-origin/test/fixtures/qa-eval-ground-truth.json +43 -0
  254. package/gstack-origin/test/fixtures/qa-eval-spa-ground-truth.json +43 -0
  255. package/gstack-origin/test/fixtures/review-eval-design-slop.css +86 -0
  256. package/gstack-origin/test/fixtures/review-eval-design-slop.html +41 -0
  257. package/gstack-origin/test/fixtures/review-eval-enum-diff.rb +30 -0
  258. package/gstack-origin/test/fixtures/review-eval-enum.rb +27 -0
  259. package/gstack-origin/test/fixtures/review-eval-vuln.rb +14 -0
  260. package/gstack-origin/test/gemini-e2e.test.ts +173 -0
  261. package/gstack-origin/test/gen-skill-docs.test.ts +1049 -0
  262. package/gstack-origin/test/global-discover.test.ts +187 -0
  263. package/gstack-origin/test/helpers/codex-session-runner.ts +282 -0
  264. package/gstack-origin/test/helpers/e2e-helpers.ts +239 -0
  265. package/gstack-origin/test/helpers/eval-store.test.ts +548 -0
  266. package/gstack-origin/test/helpers/eval-store.ts +689 -0
  267. package/gstack-origin/test/helpers/gemini-session-runner.test.ts +104 -0
  268. package/gstack-origin/test/helpers/gemini-session-runner.ts +201 -0
  269. package/gstack-origin/test/helpers/llm-judge.ts +130 -0
  270. package/gstack-origin/test/helpers/observability.test.ts +283 -0
  271. package/gstack-origin/test/helpers/session-runner.test.ts +96 -0
  272. package/gstack-origin/test/helpers/session-runner.ts +357 -0
  273. package/gstack-origin/test/helpers/skill-parser.ts +206 -0
  274. package/gstack-origin/test/helpers/touchfiles.ts +260 -0
  275. package/gstack-origin/test/hook-scripts.test.ts +373 -0
  276. package/gstack-origin/test/skill-e2e-browse.test.ts +293 -0
  277. package/gstack-origin/test/skill-e2e-deploy.test.ts +279 -0
  278. package/gstack-origin/test/skill-e2e-design.test.ts +614 -0
  279. package/gstack-origin/test/skill-e2e-plan.test.ts +538 -0
  280. package/gstack-origin/test/skill-e2e-qa-bugs.test.ts +194 -0
  281. package/gstack-origin/test/skill-e2e-qa-workflow.test.ts +412 -0
  282. package/gstack-origin/test/skill-e2e-review.test.ts +535 -0
  283. package/gstack-origin/test/skill-e2e-workflow.test.ts +586 -0
  284. package/gstack-origin/test/skill-e2e.test.ts +3325 -0
  285. package/gstack-origin/test/skill-llm-eval.test.ts +787 -0
  286. package/gstack-origin/test/skill-parser.test.ts +179 -0
  287. package/gstack-origin/test/skill-routing-e2e.test.ts +605 -0
  288. package/gstack-origin/test/skill-validation.test.ts +1520 -0
  289. package/gstack-origin/test/telemetry.test.ts +278 -0
  290. package/gstack-origin/test/touchfiles.test.ts +262 -0
  291. package/gstack-origin/unfreeze/SKILL.md +40 -0
  292. package/gstack-origin/unfreeze/SKILL.md.tmpl +38 -0
  293. package/package.json +38 -0
  294. package/scripts/install-antigravity-skill.ps1 +33 -0
  295. package/scripts/install-antigravity-skill.sh +41 -0
  296. package/scripts/sync-gstack-origin.ps1 +37 -0
  297. package/scripts/sync-gstack-origin.sh +35 -0
@@ -0,0 +1,548 @@
1
+ import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
2
+ import * as fs from 'fs';
3
+ import * as path from 'path';
4
+ import * as os from 'os';
5
+ import {
6
+ EvalCollector,
7
+ extractToolSummary,
8
+ findPreviousRun,
9
+ compareEvalResults,
10
+ formatComparison,
11
+ generateCommentary,
12
+ judgePassed,
13
+ } from './eval-store';
14
+ import type { EvalResult, EvalTestEntry, ComparisonResult } from './eval-store';
15
+
16
// Scratch directory for the test currently running: assigned in beforeEach,
// deleted again in afterEach.
let tmpDir: string;

beforeEach(() => {
  // mkdtempSync appends random characters to the prefix, so every test gets
  // a directory of its own.
  const prefix = path.join(os.tmpdir(), 'eval-store-test-');
  tmpDir = fs.mkdtempSync(prefix);
});

afterEach(() => {
  // Best-effort cleanup: failing to remove the scratch directory must not
  // turn a passing test into a failing one.
  try {
    fs.rmSync(tmpDir, { force: true, recursive: true });
  } catch {
    /* intentionally ignored */
  }
});
25
+
26
+ // --- Helper to make a minimal test entry ---
27
+
28
/**
 * Builds an EvalTestEntry for use in tests.
 *
 * The result starts from a fixed passing 'e2e' entry and applies `overrides`
 * on top of it, so a test only has to spell out the fields it cares about.
 *
 * @param overrides - Fields that replace the defaults; anything omitted keeps
 *   its default value.
 * @returns A freshly created entry object.
 */
function makeEntry(overrides?: Partial<EvalTestEntry>): EvalTestEntry {
  const defaults: EvalTestEntry = {
    name: 'test-1',
    suite: 'suite-1',
    tier: 'e2e',
    passed: true,
    duration_ms: 1000,
    cost_usd: 0.05,
  };
  // Spreading `undefined` is a no-op, so a call without arguments yields the defaults.
  return { ...defaults, ...overrides };
}
39
+
40
+ // --- Helper to make a minimal EvalResult ---
41
+
42
/**
 * Builds an EvalResult for use in tests.
 *
 * The baseline describes a single passing 'e2e' run on branch 'main' that
 * contains one default entry from makeEntry(); `overrides` is applied on top.
 *
 * @param overrides - Fields that replace the baseline values; anything
 *   omitted keeps its baseline value.
 * @returns A freshly created result object.
 */
function makeResult(overrides?: Partial<EvalResult>): EvalResult {
  const baseline: EvalResult = {
    schema_version: 1,
    version: '0.3.6',
    branch: 'main',
    git_sha: 'abc1234',
    timestamp: '2026-03-14T12:00:00.000Z',
    hostname: 'test-host',
    tier: 'e2e',
    total_tests: 1,
    passed: 1,
    failed: 0,
    total_cost_usd: 0.05,
    total_duration_ms: 1000,
    tests: [makeEntry()],
  };
  // Later spread wins, so any field present in `overrides` replaces the baseline value.
  return { ...baseline, ...overrides };
}
60
+
61
+ // --- EvalCollector tests ---
62
+
63
describe('EvalCollector', () => {
  /** Reads back and parses the JSON file whose path finalize() returned. */
  const readResult = (filepath: string): EvalResult =>
    JSON.parse(fs.readFileSync(filepath, 'utf-8'));

  test('addTest accumulates entries', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ name: 'a' }));
    collector.addTest(makeEntry({ name: 'b' }));
    collector.addTest(makeEntry({ name: 'c' }));

    // The collector does not expose its entries, so verify the accumulation
    // through the file finalize() writes. Without these assertions the test
    // could never fail.
    const data = readResult(await collector.finalize());
    expect(data.total_tests).toBe(3);
    // Sorted so the check does not depend on the order entries are written in.
    expect(data.tests.map(t => t.name).sort()).toEqual(['a', 'b', 'c']);
  });

  test('finalize writes JSON file to eval dir', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry());
    const filepath = await collector.finalize();

    expect(filepath).toBeTruthy();
    expect(fs.existsSync(filepath)).toBe(true);

    const data = readResult(filepath);
    expect(data.tests).toHaveLength(1);
    expect(data.tests[0].name).toBe('test-1');
  });

  test('written JSON has correct schema fields', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ passed: true, cost_usd: 0.10, duration_ms: 2000 }));
    collector.addTest(makeEntry({ name: 'test-2', passed: false, cost_usd: 0.05, duration_ms: 1000 }));
    const filepath = await collector.finalize();

    const data = readResult(filepath);
    expect(data.schema_version).toBe(1);
    expect(data.tier).toBe('e2e');
    expect(data.total_tests).toBe(2);
    expect(data.passed).toBe(1);
    expect(data.failed).toBe(1);
    // NOTE(review): exact equality on a floating-point sum (0.10 + 0.05);
    // presumably finalize() rounds the total — confirm against eval-store.
    expect(data.total_cost_usd).toBe(0.15);
    expect(data.total_duration_ms).toBe(3000);
    expect(data.timestamp).toBeTruthy();
    expect(data.hostname).toBeTruthy();
  });

  test('finalize creates directory if missing', async () => {
    const nestedDir = path.join(tmpDir, 'nested', 'deep', 'evals');
    const collector = new EvalCollector('e2e', nestedDir);
    collector.addTest(makeEntry());
    const filepath = await collector.finalize();
    expect(fs.existsSync(filepath)).toBe(true);
  });

  test('double finalize does not write twice', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry());
    const filepath1 = await collector.finalize();
    const filepath2 = await collector.finalize();

    expect(filepath1).toBeTruthy();
    expect(filepath2).toBe(''); // second call returns empty

    // Files whose name starts with '_partial' are not counted as result files.
    const resultFiles = fs
      .readdirSync(tmpDir)
      .filter(f => f.endsWith('.json') && !f.startsWith('_partial'));
    expect(resultFiles).toHaveLength(1);
  });

  test('empty collector writes valid file', async () => {
    const collector = new EvalCollector('llm-judge', tmpDir);
    const filepath = await collector.finalize();

    const data = readResult(filepath);
    expect(data.total_tests).toBe(0);
    expect(data.passed).toBe(0);
    expect(data.tests).toHaveLength(0);
    expect(data.tier).toBe('llm-judge');
  });
});
133
+
134
+ // --- judgePassed tests ---
135
+
136
describe('judgePassed', () => {
  // Every case below is judged against the same thresholds.
  const thresholds = { minimum_detection: 2, max_false_positives: 2 };

  // One row per test: the test title, the scores handed to judgePassed, and
  // the verdict it is expected to return.
  const cases = [
    {
      title: 'passes when all thresholds met',
      scores: { detection_rate: 3, false_positives: 1, evidence_quality: 3 },
      expected: true,
    },
    {
      title: 'fails when detection rate below minimum',
      scores: { detection_rate: 1, false_positives: 0, evidence_quality: 3 },
      expected: false,
    },
    {
      title: 'fails when too many false positives',
      scores: { detection_rate: 3, false_positives: 3, evidence_quality: 3 },
      expected: false,
    },
    {
      title: 'fails when evidence quality below 2',
      scores: { detection_rate: 3, false_positives: 0, evidence_quality: 1 },
      expected: false,
    },
    {
      title: 'passes at exact thresholds',
      scores: { detection_rate: 2, false_positives: 2, evidence_quality: 2 },
      expected: true,
    },
  ];

  for (const { title, scores, expected } of cases) {
    test(title, () => {
      // A fresh copy of the thresholds per call keeps the cases independent.
      const verdict = judgePassed(scores, { ...thresholds });
      expect(verdict).toBe(expected);
    });
  }
});
172
+
173
+ // --- extractToolSummary tests ---
174
+
175
describe('extractToolSummary', () => {
  test('counts tool types from transcript events', () => {
    // Creates a tool_use content item for the named tool.
    const toolUse = (name: string) => ({ type: 'tool_use', name, input: {} });

    // Mix of event kinds: only the tool_use items inside assistant messages
    // are expected to show up in the summary.
    const events = [
      { type: 'system', subtype: 'init' },
      { type: 'assistant', message: { content: [toolUse('Bash')] } },
      { type: 'user', tool_use_result: { stdout: '' } },
      {
        type: 'assistant',
        message: { content: [{ type: 'text', text: 'ok' }, toolUse('Read')] },
      },
      {
        type: 'assistant',
        message: { content: [toolUse('Bash'), toolUse('Write')] },
      },
    ];

    expect(extractToolSummary(events)).toEqual({ Bash: 2, Read: 1, Write: 1 });
  });

  test('returns empty object for empty transcript', () => {
    const summary = extractToolSummary([]);
    expect(summary).toEqual({});
  });

  test('handles events with no content array', () => {
    // One event has a message without content, the other has no message at all.
    const events = [
      { type: 'assistant', message: {} },
      { type: 'assistant' },
    ];
    const summary = extractToolSummary(events);
    expect(summary).toEqual({});
  });
});
209
+
210
+ // --- findPreviousRun tests ---
211
+
212
describe('findPreviousRun', () => {
  /** Writes a serialized eval result into the scratch directory as `filename`. */
  const seedRun = (filename: string, overrides: Partial<EvalResult>): void => {
    const payload = JSON.stringify(makeResult(overrides));
    fs.writeFileSync(path.join(tmpDir, filename), payload);
  };

  test('finds correct file — same branch preferred, most recent', () => {
    // Three earlier runs: one on main, two on feature.
    seedRun('0.3.5-main-e2e-20260312-100000.json', { branch: 'main', timestamp: '2026-03-12T10:00:00Z' });
    seedRun('0.3.5-feature-e2e-20260313-100000.json', { branch: 'feature', timestamp: '2026-03-13T10:00:00Z' });
    seedRun('0.3.6-feature-e2e-20260314-100000.json', { branch: 'feature', timestamp: '2026-03-14T10:00:00Z' });

    // The newest run on the requested branch should win.
    const currentFile = path.join(tmpDir, 'current.json');
    const found = findPreviousRun(tmpDir, 'e2e', 'feature', currentFile);
    expect(found).toContain('0.3.6-feature-e2e-20260314');
  });

  test('falls back to different branch when no same-branch match', () => {
    seedRun('0.3.5-main-e2e-20260312-100000.json', { branch: 'main', timestamp: '2026-03-12T10:00:00Z' });

    const currentFile = path.join(tmpDir, 'current.json');
    const found = findPreviousRun(tmpDir, 'e2e', 'new-branch', currentFile);
    expect(found).toContain('0.3.5-main-e2e');
  });

  test('returns null when no prior runs exist', () => {
    const currentFile = path.join(tmpDir, 'current.json');
    const found = findPreviousRun(tmpDir, 'e2e', 'main', currentFile);
    expect(found).toBeNull();
  });

  test('returns null when directory does not exist', () => {
    const found = findPreviousRun('/nonexistent/path', 'e2e', 'main', 'current.json');
    expect(found).toBeNull();
  });

  test('excludes the current file from results', () => {
    const onlyFile = '0.3.6-main-e2e-20260314-100000.json';
    seedRun(onlyFile, { branch: 'main', timestamp: '2026-03-14T10:00:00Z' });

    // The single file on disk is the one passed as "current", so nothing is left.
    const found = findPreviousRun(tmpDir, 'e2e', 'main', path.join(tmpDir, onlyFile));
    expect(found).toBeNull();
  });

  test('filters by tier', () => {
    seedRun('0.3.6-main-llm-judge-20260314-100000.json', {
      tier: 'llm-judge',
      branch: 'main',
      timestamp: '2026-03-14T10:00:00Z',
    });

    // Only an llm-judge run exists while an e2e run is requested.
    const found = findPreviousRun(tmpDir, 'e2e', 'main', 'current.json');
    expect(found).toBeNull();
  });
});
272
+
273
+ // --- compareEvalResults tests ---
274
+
275
describe('compareEvalResults', () => {
  test('detects improved/regressed/unchanged per test', () => {
    // Builds a three-test run whose entries pass or fail as given.
    const runWith = (a: boolean, b: boolean, c: boolean): EvalResult =>
      makeResult({
        tests: [
          makeEntry({ name: 'test-a', passed: a }),
          makeEntry({ name: 'test-b', passed: b }),
          makeEntry({ name: 'test-c', passed: c }),
        ],
        total_tests: 3,
        passed: 2,
        failed: 1,
      });

    const before = runWith(false, true, true);
    // test-a improves, test-b regresses, test-c stays the same.
    const after = runWith(true, false, true);

    const comparison = compareEvalResults(before, after, 'before.json', 'after.json');
    const statusOf = (name: string) =>
      comparison.deltas.find(d => d.name === name)?.status_change;

    expect(comparison.improved).toBe(1);
    expect(comparison.regressed).toBe(1);
    expect(comparison.unchanged).toBe(1);
    expect(statusOf('test-a')).toBe('improved');
    expect(statusOf('test-b')).toBe('regressed');
    expect(statusOf('test-c')).toBe('unchanged');
  });

  test('handles tests present in one run but not the other', () => {
    const before = makeResult({
      tests: [
        makeEntry({ name: 'old-test', passed: true }),
        makeEntry({ name: 'shared', passed: true }),
      ],
    });
    const after = makeResult({
      tests: [
        makeEntry({ name: 'shared', passed: true }),
        makeEntry({ name: 'new-test', passed: true }),
      ],
    });

    const comparison = compareEvalResults(before, after, 'before.json', 'after.json');

    // One delta each for: shared, new-test, and old-test (removed).
    expect(comparison.deltas).toHaveLength(3);
    const dropped = comparison.deltas.find(d => d.name.includes('old-test'));
    expect(dropped?.name).toContain('removed');
  });

  test('computes cost and duration deltas', () => {
    const before = makeResult({ total_cost_usd: 2.00, total_duration_ms: 60000 });
    const after = makeResult({ total_cost_usd: 1.50, total_duration_ms: 45000 });

    const comparison = compareEvalResults(before, after, 'a.json', 'b.json');

    // Both deltas are negative because the later run has the smaller totals.
    expect(comparison.total_cost_delta).toBe(-0.50);
    expect(comparison.total_duration_delta).toBe(-15000);
  });
});
331
+
332
+ // --- formatComparison tests ---
333
+
334
describe('formatComparison', () => {
  test('produces readable output with status arrows', () => {
    // A test that passed in both runs and carries turn/duration data.
    const stableDelta: ComparisonResult['deltas'][number] = {
      name: 'browse basic',
      before: { passed: true, cost_usd: 0.07, turns_used: 6, duration_ms: 24000, tool_summary: { Bash: 3 } },
      after: { passed: true, cost_usd: 0.06, turns_used: 5, duration_ms: 19000, tool_summary: { Bash: 4 } },
      status_change: 'unchanged',
    };
    // A test that went from failing to passing.
    const fixedDelta: ComparisonResult['deltas'][number] = {
      name: 'planted bugs static',
      before: { passed: false, cost_usd: 1.00, detection_rate: 3, tool_summary: {} },
      after: { passed: true, cost_usd: 0.95, detection_rate: 4, tool_summary: {} },
      status_change: 'improved',
    };

    const comparison: ComparisonResult = {
      before_file: 'before.json',
      after_file: 'after.json',
      before_branch: 'main',
      after_branch: 'feature',
      before_timestamp: '2026-03-13T14:30:00Z',
      after_timestamp: '2026-03-14T14:30:00Z',
      deltas: [stableDelta, fixedDelta],
      total_cost_delta: -0.06,
      total_duration_delta: -5000,
      improved: 1,
      regressed: 0,
      unchanged: 1,
      tool_count_before: 3,
      tool_count_after: 4,
    };

    const output = formatComparison(comparison);

    // Checked in order: header text, baseline branch, summary counts,
    // the improved and unchanged markers, then the turns and duration deltas.
    const expectedFragments = [
      'vs previous',
      'main',
      '1 improved',
      '1 unchanged',
      '↑', // improved arrow
      '=', // unchanged arrow
      '6→5t',
      '24→19s',
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  test('includes commentary section', () => {
    // 'test-a' halves its turns and duration; the other two tests are identical
    // across runs.
    const efficiencyGain: ComparisonResult['deltas'][number] = {
      name: 'test-a',
      before: { passed: true, cost_usd: 0.50, turns_used: 20, duration_ms: 120000 },
      after: { passed: true, cost_usd: 0.30, turns_used: 10, duration_ms: 60000 },
      status_change: 'unchanged',
    };
    const steadyB: ComparisonResult['deltas'][number] = {
      name: 'test-b',
      before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
      after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
      status_change: 'unchanged',
    };
    const steadyC: ComparisonResult['deltas'][number] = {
      name: 'test-c',
      before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
      after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
      status_change: 'unchanged',
    };

    const comparison: ComparisonResult = {
      before_file: 'a.json',
      after_file: 'b.json',
      before_branch: 'main',
      after_branch: 'main',
      before_timestamp: '2026-03-13T14:30:00Z',
      after_timestamp: '2026-03-14T14:30:00Z',
      deltas: [efficiencyGain, steadyB, steadyC],
      total_cost_delta: -0.20,
      total_duration_delta: -60000,
      improved: 0,
      regressed: 0,
      unchanged: 3,
      tool_count_before: 30,
      tool_count_after: 20,
    };

    const output = formatComparison(comparison);

    // The formatted report should embed the commentary heading and its notes.
    for (const fragment of ['Takeaway', 'fewer turns', 'faster']) {
      expect(output).toContain(fragment);
    }
  });
});
416
+
417
+ // --- generateCommentary tests ---
418
+
419
describe('generateCommentary', () => {
  /**
   * Builds a ComparisonResult fixture for these tests.
   *
   * Defaults describe a same-branch comparison ('main' vs 'main') with empty
   * timestamps, no deltas, and all totals and counters at zero. Each test
   * passes only the fields it cares about.
   *
   * @param overrides Fields that replace the defaults.
   * @returns A complete ComparisonResult.
   */
  function makeComparison(overrides: Partial<ComparisonResult>): ComparisonResult {
    return {
      before_file: 'a.json', after_file: 'b.json',
      before_branch: 'main', after_branch: 'main',
      before_timestamp: '', after_timestamp: '',
      deltas: [],
      total_cost_delta: 0, total_duration_delta: 0,
      improved: 0, regressed: 0, unchanged: 0,
      tool_count_before: 0, tool_count_after: 0,
      ...overrides,
    };
  }

  test('flags regressions prominently', () => {
    // A single test that passed before and fails now.
    const c = makeComparison({
      deltas: [{
        name: 'critical-test',
        before: { passed: true, cost_usd: 0.10 },
        after: { passed: false, cost_usd: 0.10 },
        status_change: 'regressed',
      }],
      regressed: 1,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('REGRESSION'))).toBe(true);
    expect(notes.some(n => n.includes('critical-test'))).toBe(true);
  });

  test('notes improvements', () => {
    // A single test that failed before and passes now.
    const c = makeComparison({
      deltas: [{
        name: 'fixed-test',
        before: { passed: false, cost_usd: 0.10 },
        after: { passed: true, cost_usd: 0.10 },
        status_change: 'improved',
      }],
      improved: 1,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('Fixed'))).toBe(true);
    expect(notes.some(n => n.includes('fixed-test'))).toBe(true);
  });

  test('reports efficiency gains for stable tests', () => {
    // Pass/fail status is unchanged, but turns, duration and cost are all halved.
    const c = makeComparison({
      deltas: [{
        name: 'fast-test',
        before: { passed: true, cost_usd: 0.50, turns_used: 20, duration_ms: 120000 },
        after: { passed: true, cost_usd: 0.25, turns_used: 10, duration_ms: 60000 },
        status_change: 'unchanged',
      }],
      total_cost_delta: -0.25, total_duration_delta: -60000,
      unchanged: 1,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('fewer turns'))).toBe(true);
    expect(notes.some(n => n.includes('faster'))).toBe(true);
    expect(notes.some(n => n.includes('cheaper'))).toBe(true);
  });

  test('reports detection rate changes', () => {
    // detection_rate goes from 3 to 5, so the note should mention 2 more bugs.
    const c = makeComparison({
      deltas: [{
        name: 'detection-test',
        before: { passed: true, cost_usd: 0.50, detection_rate: 3 },
        after: { passed: true, cost_usd: 0.50, detection_rate: 5 },
        status_change: 'unchanged',
      }],
      unchanged: 1,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('detecting 2 more bugs'))).toBe(true);
  });

  test('produces overall summary for 3+ tests with no regressions', () => {
    // Three passing tests; the fixture totals equal the sum of the per-test
    // cost and duration changes.
    const c = makeComparison({
      deltas: [
        {
          name: 'a',
          before: { passed: true, cost_usd: 0.50, turns_used: 10, duration_ms: 60000 },
          after: { passed: true, cost_usd: 0.30, turns_used: 6, duration_ms: 40000 },
          status_change: 'unchanged',
        },
        {
          name: 'b',
          before: { passed: true, cost_usd: 0.20, turns_used: 5, duration_ms: 30000 },
          after: { passed: true, cost_usd: 0.15, turns_used: 4, duration_ms: 25000 },
          status_change: 'unchanged',
        },
        {
          name: 'c',
          before: { passed: true, cost_usd: 0.10, turns_used: 3, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.08, turns_used: 3, duration_ms: 18000 },
          status_change: 'unchanged',
        },
      ],
      total_cost_delta: -0.27, total_duration_delta: -27000,
      unchanged: 3,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('Overall'))).toBe(true);
    expect(notes.some(n => n.includes('No regressions'))).toBe(true);
  });

  // The title used to say "returns empty", but the assertion requires a
  // 'Stable run' note to be present; the title now matches what is checked.
  test('reports a stable run when there are no significant changes', () => {
    // Only difference between runs: test 'a' takes 1000 ms longer.
    const c = makeComparison({
      deltas: [
        {
          name: 'a',
          before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 21000 },
          status_change: 'unchanged',
        },
        {
          name: 'b',
          before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          status_change: 'unchanged',
        },
        {
          name: 'c',
          before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          status_change: 'unchanged',
        },
      ],
      total_duration_delta: 1000,
      unchanged: 3,
      tool_count_before: 15, tool_count_after: 15,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('Stable run'))).toBe(true);
  });
});