@runchr/gstack-antigravity 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/rules/ETHOS.md +129 -0
- package/.agents/rules/global-gstack.md +117 -0
- package/.agents/rules/persona-gstack-autoplan.md +14 -0
- package/.agents/rules/persona-gstack-benchmark.md +14 -0
- package/.agents/rules/persona-gstack-browse.md +14 -0
- package/.agents/rules/persona-gstack-canary.md +14 -0
- package/.agents/rules/persona-gstack-careful.md +14 -0
- package/.agents/rules/persona-gstack-codex.md +14 -0
- package/.agents/rules/persona-gstack-cso.md +14 -0
- package/.agents/rules/persona-gstack-design-consultation.md +14 -0
- package/.agents/rules/persona-gstack-design-review.md +14 -0
- package/.agents/rules/persona-gstack-document-release.md +14 -0
- package/.agents/rules/persona-gstack-freeze.md +14 -0
- package/.agents/rules/persona-gstack-gstack-upgrade.md +14 -0
- package/.agents/rules/persona-gstack-guard.md +14 -0
- package/.agents/rules/persona-gstack-investigate.md +14 -0
- package/.agents/rules/persona-gstack-land-and-deploy.md +14 -0
- package/.agents/rules/persona-gstack-office-hours.md +14 -0
- package/.agents/rules/persona-gstack-plan-ceo-review.md +14 -0
- package/.agents/rules/persona-gstack-plan-design-review.md +14 -0
- package/.agents/rules/persona-gstack-plan-eng-review.md +14 -0
- package/.agents/rules/persona-gstack-qa-only.md +14 -0
- package/.agents/rules/persona-gstack-qa.md +14 -0
- package/.agents/rules/persona-gstack-retro.md +14 -0
- package/.agents/rules/persona-gstack-review.md +14 -0
- package/.agents/rules/persona-gstack-setup-browser-cookies.md +14 -0
- package/.agents/rules/persona-gstack-setup-deploy.md +14 -0
- package/.agents/rules/persona-gstack-ship.md +14 -0
- package/.agents/rules/persona-gstack-unfreeze.md +14 -0
- package/.agents/rules/persona-gstack.md +40 -0
- package/.agents/rules/recursive-identities.md +22 -0
- package/.agents/workflows/autoplan.md +30 -0
- package/.agents/workflows/benchmark.md +31 -0
- package/.agents/workflows/browse.md +26 -0
- package/.agents/workflows/canary.md +33 -0
- package/.agents/workflows/careful.md +22 -0
- package/.agents/workflows/codex.md +36 -0
- package/.agents/workflows/cso.md +29 -0
- package/.agents/workflows/design-consultation.md +28 -0
- package/.agents/workflows/design-review.md +28 -0
- package/.agents/workflows/document-release.md +32 -0
- package/.agents/workflows/freeze.md +17 -0
- package/.agents/workflows/gstack-upgrade.md +54 -0
- package/.agents/workflows/gstack.md +56 -0
- package/.agents/workflows/guard.md +18 -0
- package/.agents/workflows/investigate.md +37 -0
- package/.agents/workflows/land-and-deploy.md +35 -0
- package/.agents/workflows/office-hours.md +27 -0
- package/.agents/workflows/plan-ceo-review.md +34 -0
- package/.agents/workflows/plan-design-review.md +31 -0
- package/.agents/workflows/plan-eng-review.md +28 -0
- package/.agents/workflows/qa-only.md +28 -0
- package/.agents/workflows/qa.md +73 -0
- package/.agents/workflows/retro.md +34 -0
- package/.agents/workflows/review.md +30 -0
- package/.agents/workflows/setup-browser-cookies.md +15 -0
- package/.agents/workflows/setup-cookies.md +8 -0
- package/.agents/workflows/setup-deploy.md +21 -0
- package/.agents/workflows/ship.md +93 -0
- package/.agents/workflows/unfreeze.md +12 -0
- package/LICENSE +22 -0
- package/README.md +189 -0
- package/README_KO.md +191 -0
- package/bin/install.js +105 -0
- package/gstack-origin/.agents/skills/gstack/SKILL.md +651 -0
- package/gstack-origin/.agents/skills/gstack-autoplan/SKILL.md +678 -0
- package/gstack-origin/.agents/skills/gstack-benchmark/SKILL.md +482 -0
- package/gstack-origin/.agents/skills/gstack-browse/SKILL.md +511 -0
- package/gstack-origin/.agents/skills/gstack-canary/SKILL.md +486 -0
- package/gstack-origin/.agents/skills/gstack-careful/SKILL.md +50 -0
- package/gstack-origin/.agents/skills/gstack-cso/SKILL.md +607 -0
- package/gstack-origin/.agents/skills/gstack-design-consultation/SKILL.md +615 -0
- package/gstack-origin/.agents/skills/gstack-design-review/SKILL.md +988 -0
- package/gstack-origin/.agents/skills/gstack-document-release/SKILL.md +604 -0
- package/gstack-origin/.agents/skills/gstack-freeze/SKILL.md +67 -0
- package/gstack-origin/.agents/skills/gstack-guard/SKILL.md +62 -0
- package/gstack-origin/.agents/skills/gstack-investigate/SKILL.md +415 -0
- package/gstack-origin/.agents/skills/gstack-land-and-deploy/SKILL.md +873 -0
- package/gstack-origin/.agents/skills/gstack-office-hours/SKILL.md +986 -0
- package/gstack-origin/.agents/skills/gstack-plan-ceo-review/SKILL.md +1268 -0
- package/gstack-origin/.agents/skills/gstack-plan-design-review/SKILL.md +668 -0
- package/gstack-origin/.agents/skills/gstack-plan-eng-review/SKILL.md +826 -0
- package/gstack-origin/.agents/skills/gstack-qa/SKILL.md +1006 -0
- package/gstack-origin/.agents/skills/gstack-qa-only/SKILL.md +626 -0
- package/gstack-origin/.agents/skills/gstack-retro/SKILL.md +1065 -0
- package/gstack-origin/.agents/skills/gstack-review/SKILL.md +704 -0
- package/gstack-origin/.agents/skills/gstack-setup-browser-cookies/SKILL.md +325 -0
- package/gstack-origin/.agents/skills/gstack-setup-deploy/SKILL.md +450 -0
- package/gstack-origin/.agents/skills/gstack-ship/SKILL.md +1312 -0
- package/gstack-origin/.agents/skills/gstack-unfreeze/SKILL.md +36 -0
- package/gstack-origin/.agents/skills/gstack-upgrade/SKILL.md +220 -0
- package/gstack-origin/.env.example +5 -0
- package/gstack-origin/.github/workflows/skill-docs.yml +17 -0
- package/gstack-origin/AGENTS.md +49 -0
- package/gstack-origin/ARCHITECTURE.md +359 -0
- package/gstack-origin/BROWSER.md +271 -0
- package/gstack-origin/CHANGELOG.md +800 -0
- package/gstack-origin/CLAUDE.md +284 -0
- package/gstack-origin/CONTRIBUTING.md +370 -0
- package/gstack-origin/ETHOS.md +129 -0
- package/gstack-origin/LICENSE +21 -0
- package/gstack-origin/README.md +228 -0
- package/gstack-origin/SKILL.md +657 -0
- package/gstack-origin/SKILL.md.tmpl +281 -0
- package/gstack-origin/TODOS.md +564 -0
- package/gstack-origin/VERSION +1 -0
- package/gstack-origin/autoplan/SKILL.md +689 -0
- package/gstack-origin/autoplan/SKILL.md.tmpl +416 -0
- package/gstack-origin/benchmark/SKILL.md +489 -0
- package/gstack-origin/benchmark/SKILL.md.tmpl +233 -0
- package/gstack-origin/bin/dev-setup +68 -0
- package/gstack-origin/bin/dev-teardown +56 -0
- package/gstack-origin/bin/gstack-analytics +191 -0
- package/gstack-origin/bin/gstack-community-dashboard +113 -0
- package/gstack-origin/bin/gstack-config +38 -0
- package/gstack-origin/bin/gstack-diff-scope +71 -0
- package/gstack-origin/bin/gstack-global-discover.ts +591 -0
- package/gstack-origin/bin/gstack-repo-mode +93 -0
- package/gstack-origin/bin/gstack-review-log +9 -0
- package/gstack-origin/bin/gstack-review-read +12 -0
- package/gstack-origin/bin/gstack-slug +15 -0
- package/gstack-origin/bin/gstack-telemetry-log +158 -0
- package/gstack-origin/bin/gstack-telemetry-sync +127 -0
- package/gstack-origin/bin/gstack-update-check +196 -0
- package/gstack-origin/browse/SKILL.md +517 -0
- package/gstack-origin/browse/SKILL.md.tmpl +141 -0
- package/gstack-origin/browse/bin/find-browse +21 -0
- package/gstack-origin/browse/bin/remote-slug +14 -0
- package/gstack-origin/browse/scripts/build-node-server.sh +48 -0
- package/gstack-origin/browse/src/browser-manager.ts +634 -0
- package/gstack-origin/browse/src/buffers.ts +137 -0
- package/gstack-origin/browse/src/bun-polyfill.cjs +109 -0
- package/gstack-origin/browse/src/cli.ts +420 -0
- package/gstack-origin/browse/src/commands.ts +111 -0
- package/gstack-origin/browse/src/config.ts +150 -0
- package/gstack-origin/browse/src/cookie-import-browser.ts +417 -0
- package/gstack-origin/browse/src/cookie-picker-routes.ts +207 -0
- package/gstack-origin/browse/src/cookie-picker-ui.ts +541 -0
- package/gstack-origin/browse/src/find-browse.ts +61 -0
- package/gstack-origin/browse/src/meta-commands.ts +269 -0
- package/gstack-origin/browse/src/platform.ts +17 -0
- package/gstack-origin/browse/src/read-commands.ts +335 -0
- package/gstack-origin/browse/src/server.ts +369 -0
- package/gstack-origin/browse/src/snapshot.ts +398 -0
- package/gstack-origin/browse/src/url-validation.ts +91 -0
- package/gstack-origin/browse/src/write-commands.ts +352 -0
- package/gstack-origin/browse/test/bun-polyfill.test.ts +72 -0
- package/gstack-origin/browse/test/commands.test.ts +1836 -0
- package/gstack-origin/browse/test/config.test.ts +250 -0
- package/gstack-origin/browse/test/cookie-import-browser.test.ts +397 -0
- package/gstack-origin/browse/test/cookie-picker-routes.test.ts +205 -0
- package/gstack-origin/browse/test/find-browse.test.ts +50 -0
- package/gstack-origin/browse/test/fixtures/basic.html +33 -0
- package/gstack-origin/browse/test/fixtures/cursor-interactive.html +22 -0
- package/gstack-origin/browse/test/fixtures/dialog.html +15 -0
- package/gstack-origin/browse/test/fixtures/empty.html +2 -0
- package/gstack-origin/browse/test/fixtures/forms.html +55 -0
- package/gstack-origin/browse/test/fixtures/qa-eval-checkout.html +108 -0
- package/gstack-origin/browse/test/fixtures/qa-eval-spa.html +98 -0
- package/gstack-origin/browse/test/fixtures/qa-eval.html +51 -0
- package/gstack-origin/browse/test/fixtures/responsive.html +49 -0
- package/gstack-origin/browse/test/fixtures/snapshot.html +55 -0
- package/gstack-origin/browse/test/fixtures/spa.html +24 -0
- package/gstack-origin/browse/test/fixtures/states.html +17 -0
- package/gstack-origin/browse/test/fixtures/upload.html +25 -0
- package/gstack-origin/browse/test/gstack-config.test.ts +125 -0
- package/gstack-origin/browse/test/gstack-update-check.test.ts +467 -0
- package/gstack-origin/browse/test/handoff.test.ts +235 -0
- package/gstack-origin/browse/test/path-validation.test.ts +63 -0
- package/gstack-origin/browse/test/platform.test.ts +37 -0
- package/gstack-origin/browse/test/snapshot.test.ts +467 -0
- package/gstack-origin/browse/test/test-server.ts +57 -0
- package/gstack-origin/browse/test/url-validation.test.ts +72 -0
- package/gstack-origin/canary/SKILL.md +493 -0
- package/gstack-origin/canary/SKILL.md.tmpl +220 -0
- package/gstack-origin/careful/SKILL.md +59 -0
- package/gstack-origin/careful/SKILL.md.tmpl +57 -0
- package/gstack-origin/careful/bin/check-careful.sh +112 -0
- package/gstack-origin/codex/SKILL.md +677 -0
- package/gstack-origin/codex/SKILL.md.tmpl +356 -0
- package/gstack-origin/conductor.json +6 -0
- package/gstack-origin/cso/SKILL.md +615 -0
- package/gstack-origin/cso/SKILL.md.tmpl +376 -0
- package/gstack-origin/design-consultation/SKILL.md +625 -0
- package/gstack-origin/design-consultation/SKILL.md.tmpl +369 -0
- package/gstack-origin/design-review/SKILL.md +998 -0
- package/gstack-origin/design-review/SKILL.md.tmpl +262 -0
- package/gstack-origin/docs/images/github-2013.png +0 -0
- package/gstack-origin/docs/images/github-2026.png +0 -0
- package/gstack-origin/docs/skills.md +877 -0
- package/gstack-origin/document-release/SKILL.md +613 -0
- package/gstack-origin/document-release/SKILL.md.tmpl +357 -0
- package/gstack-origin/freeze/SKILL.md +82 -0
- package/gstack-origin/freeze/SKILL.md.tmpl +80 -0
- package/gstack-origin/freeze/bin/check-freeze.sh +68 -0
- package/gstack-origin/gstack-upgrade/SKILL.md +226 -0
- package/gstack-origin/gstack-upgrade/SKILL.md.tmpl +224 -0
- package/gstack-origin/guard/SKILL.md +82 -0
- package/gstack-origin/guard/SKILL.md.tmpl +80 -0
- package/gstack-origin/investigate/SKILL.md +435 -0
- package/gstack-origin/investigate/SKILL.md.tmpl +196 -0
- package/gstack-origin/land-and-deploy/SKILL.md +880 -0
- package/gstack-origin/land-and-deploy/SKILL.md.tmpl +575 -0
- package/gstack-origin/office-hours/SKILL.md +996 -0
- package/gstack-origin/office-hours/SKILL.md.tmpl +624 -0
- package/gstack-origin/package.json +55 -0
- package/gstack-origin/plan-ceo-review/SKILL.md +1277 -0
- package/gstack-origin/plan-ceo-review/SKILL.md.tmpl +838 -0
- package/gstack-origin/plan-design-review/SKILL.md +676 -0
- package/gstack-origin/plan-design-review/SKILL.md.tmpl +314 -0
- package/gstack-origin/plan-eng-review/SKILL.md +836 -0
- package/gstack-origin/plan-eng-review/SKILL.md.tmpl +279 -0
- package/gstack-origin/qa/SKILL.md +1016 -0
- package/gstack-origin/qa/SKILL.md.tmpl +316 -0
- package/gstack-origin/qa/references/issue-taxonomy.md +85 -0
- package/gstack-origin/qa/templates/qa-report-template.md +126 -0
- package/gstack-origin/qa-only/SKILL.md +633 -0
- package/gstack-origin/qa-only/SKILL.md.tmpl +101 -0
- package/gstack-origin/retro/SKILL.md +1072 -0
- package/gstack-origin/retro/SKILL.md.tmpl +833 -0
- package/gstack-origin/review/SKILL.md +849 -0
- package/gstack-origin/review/SKILL.md.tmpl +259 -0
- package/gstack-origin/review/TODOS-format.md +62 -0
- package/gstack-origin/review/checklist.md +190 -0
- package/gstack-origin/review/design-checklist.md +132 -0
- package/gstack-origin/review/greptile-triage.md +220 -0
- package/gstack-origin/scripts/analytics.ts +190 -0
- package/gstack-origin/scripts/dev-skill.ts +82 -0
- package/gstack-origin/scripts/eval-compare.ts +96 -0
- package/gstack-origin/scripts/eval-list.ts +116 -0
- package/gstack-origin/scripts/eval-select.ts +86 -0
- package/gstack-origin/scripts/eval-summary.ts +187 -0
- package/gstack-origin/scripts/eval-watch.ts +172 -0
- package/gstack-origin/scripts/gen-skill-docs.ts +2414 -0
- package/gstack-origin/scripts/skill-check.ts +167 -0
- package/gstack-origin/setup +269 -0
- package/gstack-origin/setup-browser-cookies/SKILL.md +330 -0
- package/gstack-origin/setup-browser-cookies/SKILL.md.tmpl +74 -0
- package/gstack-origin/setup-deploy/SKILL.md +459 -0
- package/gstack-origin/setup-deploy/SKILL.md.tmpl +220 -0
- package/gstack-origin/ship/SKILL.md +1457 -0
- package/gstack-origin/ship/SKILL.md.tmpl +528 -0
- package/gstack-origin/supabase/config.sh +10 -0
- package/gstack-origin/supabase/functions/community-pulse/index.ts +59 -0
- package/gstack-origin/supabase/functions/telemetry-ingest/index.ts +135 -0
- package/gstack-origin/supabase/functions/update-check/index.ts +37 -0
- package/gstack-origin/supabase/migrations/001_telemetry.sql +89 -0
- package/gstack-origin/test/analytics.test.ts +277 -0
- package/gstack-origin/test/codex-e2e.test.ts +197 -0
- package/gstack-origin/test/fixtures/coverage-audit-fixture.ts +76 -0
- package/gstack-origin/test/fixtures/eval-baselines.json +7 -0
- package/gstack-origin/test/fixtures/qa-eval-checkout-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/qa-eval-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/qa-eval-spa-ground-truth.json +43 -0
- package/gstack-origin/test/fixtures/review-eval-design-slop.css +86 -0
- package/gstack-origin/test/fixtures/review-eval-design-slop.html +41 -0
- package/gstack-origin/test/fixtures/review-eval-enum-diff.rb +30 -0
- package/gstack-origin/test/fixtures/review-eval-enum.rb +27 -0
- package/gstack-origin/test/fixtures/review-eval-vuln.rb +14 -0
- package/gstack-origin/test/gemini-e2e.test.ts +173 -0
- package/gstack-origin/test/gen-skill-docs.test.ts +1049 -0
- package/gstack-origin/test/global-discover.test.ts +187 -0
- package/gstack-origin/test/helpers/codex-session-runner.ts +282 -0
- package/gstack-origin/test/helpers/e2e-helpers.ts +239 -0
- package/gstack-origin/test/helpers/eval-store.test.ts +548 -0
- package/gstack-origin/test/helpers/eval-store.ts +689 -0
- package/gstack-origin/test/helpers/gemini-session-runner.test.ts +104 -0
- package/gstack-origin/test/helpers/gemini-session-runner.ts +201 -0
- package/gstack-origin/test/helpers/llm-judge.ts +130 -0
- package/gstack-origin/test/helpers/observability.test.ts +283 -0
- package/gstack-origin/test/helpers/session-runner.test.ts +96 -0
- package/gstack-origin/test/helpers/session-runner.ts +357 -0
- package/gstack-origin/test/helpers/skill-parser.ts +206 -0
- package/gstack-origin/test/helpers/touchfiles.ts +260 -0
- package/gstack-origin/test/hook-scripts.test.ts +373 -0
- package/gstack-origin/test/skill-e2e-browse.test.ts +293 -0
- package/gstack-origin/test/skill-e2e-deploy.test.ts +279 -0
- package/gstack-origin/test/skill-e2e-design.test.ts +614 -0
- package/gstack-origin/test/skill-e2e-plan.test.ts +538 -0
- package/gstack-origin/test/skill-e2e-qa-bugs.test.ts +194 -0
- package/gstack-origin/test/skill-e2e-qa-workflow.test.ts +412 -0
- package/gstack-origin/test/skill-e2e-review.test.ts +535 -0
- package/gstack-origin/test/skill-e2e-workflow.test.ts +586 -0
- package/gstack-origin/test/skill-e2e.test.ts +3325 -0
- package/gstack-origin/test/skill-llm-eval.test.ts +787 -0
- package/gstack-origin/test/skill-parser.test.ts +179 -0
- package/gstack-origin/test/skill-routing-e2e.test.ts +605 -0
- package/gstack-origin/test/skill-validation.test.ts +1520 -0
- package/gstack-origin/test/telemetry.test.ts +278 -0
- package/gstack-origin/test/touchfiles.test.ts +262 -0
- package/gstack-origin/unfreeze/SKILL.md +40 -0
- package/gstack-origin/unfreeze/SKILL.md.tmpl +38 -0
- package/package.json +38 -0
- package/scripts/install-antigravity-skill.ps1 +33 -0
- package/scripts/install-antigravity-skill.sh +41 -0
- package/scripts/sync-gstack-origin.ps1 +37 -0
- package/scripts/sync-gstack-origin.sh +35 -0
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import * as path from 'path';
|
|
4
|
+
import * as os from 'os';
|
|
5
|
+
import {
|
|
6
|
+
EvalCollector,
|
|
7
|
+
extractToolSummary,
|
|
8
|
+
findPreviousRun,
|
|
9
|
+
compareEvalResults,
|
|
10
|
+
formatComparison,
|
|
11
|
+
generateCommentary,
|
|
12
|
+
judgePassed,
|
|
13
|
+
} from './eval-store';
|
|
14
|
+
import type { EvalResult, EvalTestEntry, ComparisonResult } from './eval-store';
|
|
15
|
+
|
|
16
|
+
// Scratch directory for the currently running test. A fresh one is created
// before every test and removed again afterwards.
let tmpDir: string;

beforeEach(() => {
  const prefix = path.join(os.tmpdir(), 'eval-store-test-');
  tmpDir = fs.mkdtempSync(prefix);
});

afterEach(() => {
  // Best-effort cleanup: failing to delete the directory must not fail a test.
  try {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  } catch {
    /* intentionally ignored */
  }
});
|
|
25
|
+
|
|
26
|
+
// --- Helper to make a minimal test entry ---
|
|
27
|
+
|
|
28
|
+
/**
 * Builds a minimal, passing e2e test entry to use as a fixture.
 *
 * @param overrides - Fields that replace the corresponding defaults.
 * @returns The default entry with `overrides` applied on top.
 */
function makeEntry(overrides?: Partial<EvalTestEntry>): EvalTestEntry {
  const defaults: EvalTestEntry = {
    name: 'test-1',
    suite: 'suite-1',
    tier: 'e2e',
    passed: true,
    duration_ms: 1000,
    cost_usd: 0.05,
  };
  return { ...defaults, ...overrides };
}
|
|
39
|
+
|
|
40
|
+
// --- Helper to make a minimal EvalResult ---
|
|
41
|
+
|
|
42
|
+
/**
 * Builds a minimal EvalResult fixture describing one passing e2e test.
 *
 * @param overrides - Fields that replace the corresponding defaults.
 * @returns The default result with `overrides` applied on top.
 */
function makeResult(overrides?: Partial<EvalResult>): EvalResult {
  const defaults: EvalResult = {
    schema_version: 1,
    version: '0.3.6',
    branch: 'main',
    git_sha: 'abc1234',
    timestamp: '2026-03-14T12:00:00.000Z',
    hostname: 'test-host',
    tier: 'e2e',
    total_tests: 1,
    passed: 1,
    failed: 0,
    total_cost_usd: 0.05,
    total_duration_ms: 1000,
    tests: [makeEntry()],
  };
  return { ...defaults, ...overrides };
}
|
|
60
|
+
|
|
61
|
+
// --- EvalCollector tests ---
|
|
62
|
+
|
|
63
|
+
// Exercises EvalCollector against the per-test temporary directory: entries
// are added with addTest() and inspected through the JSON file that
// finalize() writes.
describe('EvalCollector', () => {
  test('addTest accumulates entries', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ name: 'a' }));
    collector.addTest(makeEntry({ name: 'b' }));
    collector.addTest(makeEntry({ name: 'c' }));

    // The collector does not expose its entries directly, so verify the
    // accumulation through the file written by finalize().
    const filepath = await collector.finalize();
    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    expect(data.total_tests).toBe(3);
    // Sorted so the assertion does not depend on the order entries are stored in.
    expect(data.tests.map(t => t.name).sort()).toEqual(['a', 'b', 'c']);
  });

  test('finalize writes JSON file to eval dir', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry());
    const filepath = await collector.finalize();

    expect(filepath).toBeTruthy();
    expect(fs.existsSync(filepath)).toBe(true);

    const data = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    expect(data.tests).toHaveLength(1);
    expect(data.tests[0].name).toBe('test-1');
  });

  test('written JSON has correct schema fields', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ passed: true, cost_usd: 0.10, duration_ms: 2000 }));
    collector.addTest(makeEntry({ name: 'test-2', passed: false, cost_usd: 0.05, duration_ms: 1000 }));
    const filepath = await collector.finalize();

    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    expect(data.schema_version).toBe(1);
    expect(data.tier).toBe('e2e');
    expect(data.total_tests).toBe(2);
    expect(data.passed).toBe(1);
    expect(data.failed).toBe(1);
    // 0.10 + 0.05 is not exactly 0.15 in floating point, so compare with a
    // tolerance instead of strict equality.
    expect(data.total_cost_usd).toBeCloseTo(0.15, 10);
    expect(data.total_duration_ms).toBe(3000);
    expect(data.timestamp).toBeTruthy();
    expect(data.hostname).toBeTruthy();
  });

  test('finalize creates directory if missing', async () => {
    const nestedDir = path.join(tmpDir, 'nested', 'deep', 'evals');
    const collector = new EvalCollector('e2e', nestedDir);
    collector.addTest(makeEntry());
    const filepath = await collector.finalize();
    expect(fs.existsSync(filepath)).toBe(true);
  });

  test('double finalize does not write twice', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry());
    const filepath1 = await collector.finalize();
    const filepath2 = await collector.finalize();

    expect(filepath1).toBeTruthy();
    expect(filepath2).toBe(''); // second call returns empty
    // Files prefixed with "_partial" are excluded from the count.
    expect(fs.readdirSync(tmpDir).filter(f => f.endsWith('.json') && !f.startsWith('_partial'))).toHaveLength(1);
  });

  test('empty collector writes valid file', async () => {
    const collector = new EvalCollector('llm-judge', tmpDir);
    const filepath = await collector.finalize();

    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    expect(data.total_tests).toBe(0);
    expect(data.passed).toBe(0);
    expect(data.tests).toHaveLength(0);
    expect(data.tier).toBe('llm-judge');
  });
});
|
|
133
|
+
|
|
134
|
+
// --- judgePassed tests ---
|
|
135
|
+
|
|
136
|
+
// Table-driven checks of judgePassed: every case uses the same thresholds
// and varies only the judge scores.
describe('judgePassed', () => {
  const cases = [
    {
      title: 'passes when all thresholds met',
      scores: { detection_rate: 3, false_positives: 1, evidence_quality: 3 },
      expected: true,
    },
    {
      title: 'fails when detection rate below minimum',
      scores: { detection_rate: 1, false_positives: 0, evidence_quality: 3 },
      expected: false,
    },
    {
      title: 'fails when too many false positives',
      scores: { detection_rate: 3, false_positives: 3, evidence_quality: 3 },
      expected: false,
    },
    {
      title: 'fails when evidence quality below 2',
      scores: { detection_rate: 3, false_positives: 0, evidence_quality: 1 },
      expected: false,
    },
    {
      title: 'passes at exact thresholds',
      scores: { detection_rate: 2, false_positives: 2, evidence_quality: 2 },
      expected: true,
    },
  ];

  for (const { title, scores, expected } of cases) {
    test(title, () => {
      // A fresh thresholds object per case keeps the cases independent.
      const thresholds = { minimum_detection: 2, max_false_positives: 2 };
      expect(judgePassed(scores, thresholds)).toBe(expected);
    });
  }
});
|
|
172
|
+
|
|
173
|
+
// --- extractToolSummary tests ---
|
|
174
|
+
|
|
175
|
+
// Checks that extractToolSummary tallies tool_use items by tool name and
// tolerates events that carry no content array.
describe('extractToolSummary', () => {
  test('counts tool types from transcript events', () => {
    const firstTurn = { type: 'assistant', message: { content: [
      { type: 'tool_use', name: 'Bash', input: {} },
    ] } };
    const secondTurn = { type: 'assistant', message: { content: [
      { type: 'text', text: 'ok' },
      { type: 'tool_use', name: 'Read', input: {} },
    ] } };
    const thirdTurn = { type: 'assistant', message: { content: [
      { type: 'tool_use', name: 'Bash', input: {} },
      { type: 'tool_use', name: 'Write', input: {} },
    ] } };

    const counts = extractToolSummary([
      { type: 'system', subtype: 'init' },
      firstTurn,
      { type: 'user', tool_use_result: { stdout: '' } },
      secondTurn,
      thirdTurn,
    ]);

    expect(counts).toEqual({ Bash: 2, Read: 1, Write: 1 });
  });

  test('returns empty object for empty transcript', () => {
    const counts = extractToolSummary([]);
    expect(counts).toEqual({});
  });

  test('handles events with no content array', () => {
    const counts = extractToolSummary([
      { type: 'assistant', message: {} },
      { type: 'assistant' },
    ]);
    expect(counts).toEqual({});
  });
});
|
|
209
|
+
|
|
210
|
+
// --- findPreviousRun tests ---
|
|
211
|
+
|
|
212
|
+
// Checks how findPreviousRun chooses a prior eval file: branch preference,
// fallback, exclusion of the current file, and tier filtering.
describe('findPreviousRun', () => {
  /** Writes `makeResult(overrides)` as JSON into the temp dir and returns the file's path. */
  const seedRun = (filename: string, overrides: Partial<EvalResult>): string => {
    const target = path.join(tmpDir, filename);
    fs.writeFileSync(target, JSON.stringify(makeResult(overrides)));
    return target;
  };

  test('finds correct file — same branch preferred, most recent', () => {
    // Write three eval files
    seedRun('0.3.5-main-e2e-20260312-100000.json', { branch: 'main', timestamp: '2026-03-12T10:00:00Z' });
    seedRun('0.3.5-feature-e2e-20260313-100000.json', { branch: 'feature', timestamp: '2026-03-13T10:00:00Z' });
    seedRun('0.3.6-feature-e2e-20260314-100000.json', { branch: 'feature', timestamp: '2026-03-14T10:00:00Z' });

    // Should prefer feature branch (most recent on same branch)
    const currentFile = path.join(tmpDir, 'current.json');
    const previous = findPreviousRun(tmpDir, 'e2e', 'feature', currentFile);
    expect(previous).toContain('0.3.6-feature-e2e-20260314');
  });

  test('falls back to different branch when no same-branch match', () => {
    seedRun('0.3.5-main-e2e-20260312-100000.json', { branch: 'main', timestamp: '2026-03-12T10:00:00Z' });

    const currentFile = path.join(tmpDir, 'current.json');
    const previous = findPreviousRun(tmpDir, 'e2e', 'new-branch', currentFile);
    expect(previous).toContain('0.3.5-main-e2e');
  });

  test('returns null when no prior runs exist', () => {
    const currentFile = path.join(tmpDir, 'current.json');
    expect(findPreviousRun(tmpDir, 'e2e', 'main', currentFile)).toBeNull();
  });

  test('returns null when directory does not exist', () => {
    expect(findPreviousRun('/nonexistent/path', 'e2e', 'main', 'current.json')).toBeNull();
  });

  test('excludes the current file from results', () => {
    const currentFile = seedRun(
      '0.3.6-main-e2e-20260314-100000.json',
      { branch: 'main', timestamp: '2026-03-14T10:00:00Z' },
    );

    // The only file on disk is the one passed as the current file.
    expect(findPreviousRun(tmpDir, 'e2e', 'main', currentFile)).toBeNull();
  });

  test('filters by tier', () => {
    seedRun(
      '0.3.6-main-llm-judge-20260314-100000.json',
      { tier: 'llm-judge', branch: 'main', timestamp: '2026-03-14T10:00:00Z' },
    );

    // Only an llm-judge file exists while an e2e run is requested.
    expect(findPreviousRun(tmpDir, 'e2e', 'main', 'current.json')).toBeNull();
  });
});
|
|
272
|
+
|
|
273
|
+
// --- compareEvalResults tests ---
|
|
274
|
+
|
|
275
|
+
// Checks the per-test status changes and the aggregate cost/duration deltas
// reported by compareEvalResults.
describe('compareEvalResults', () => {
  test('detects improved/regressed/unchanged per test', () => {
    const counts = { total_tests: 3, passed: 2, failed: 1 };
    const before = makeResult({
      tests: [
        makeEntry({ name: 'test-a', passed: false }),
        makeEntry({ name: 'test-b', passed: true }),
        makeEntry({ name: 'test-c', passed: true }),
      ],
      ...counts,
    });
    const after = makeResult({
      tests: [
        makeEntry({ name: 'test-a', passed: true }),  // improved
        makeEntry({ name: 'test-b', passed: false }), // regressed
        makeEntry({ name: 'test-c', passed: true }),  // unchanged
      ],
      ...counts,
    });

    const comparison = compareEvalResults(before, after, 'before.json', 'after.json');
    const statusOf = (testName: string) =>
      comparison.deltas.find(d => d.name === testName)?.status_change;

    expect(comparison.improved).toBe(1);
    expect(comparison.regressed).toBe(1);
    expect(comparison.unchanged).toBe(1);
    expect(statusOf('test-a')).toBe('improved');
    expect(statusOf('test-b')).toBe('regressed');
    expect(statusOf('test-c')).toBe('unchanged');
  });

  test('handles tests present in one run but not the other', () => {
    const before = makeResult({
      tests: [
        makeEntry({ name: 'old-test', passed: true }),
        makeEntry({ name: 'shared', passed: true }),
      ],
    });
    const after = makeResult({
      tests: [
        makeEntry({ name: 'shared', passed: true }),
        makeEntry({ name: 'new-test', passed: true }),
      ],
    });

    const comparison = compareEvalResults(before, after, 'before.json', 'after.json');
    // shared + new-test + old-test (removed)
    expect(comparison.deltas).toHaveLength(3);
    const removedDelta = comparison.deltas.find(d => d.name.includes('old-test'));
    expect(removedDelta?.name).toContain('removed');
  });

  test('computes cost and duration deltas', () => {
    const before = makeResult({ total_cost_usd: 2.00, total_duration_ms: 60000 });
    const after = makeResult({ total_cost_usd: 1.50, total_duration_ms: 45000 });

    const comparison = compareEvalResults(before, after, 'a.json', 'b.json');
    expect(comparison.total_cost_delta).toBe(-0.50);
    expect(comparison.total_duration_delta).toBe(-15000);
  });
});
|
|
331
|
+
|
|
332
|
+
// --- formatComparison tests ---
|
|
333
|
+
|
|
334
|
+
describe('formatComparison', () => {
  // Renders a two-test comparison (one unchanged, one improved) and checks
  // that the header, summary counts, status markers and per-test turn/duration
  // deltas all appear in the formatted text.
  test('produces readable output with status arrows', () => {
    const fixture: ComparisonResult = {
      before_file: 'before.json',
      after_file: 'after.json',
      before_branch: 'main',
      after_branch: 'feature',
      before_timestamp: '2026-03-13T14:30:00Z',
      after_timestamp: '2026-03-14T14:30:00Z',
      deltas: [
        {
          name: 'browse basic',
          before: {
            passed: true,
            cost_usd: 0.07,
            turns_used: 6,
            duration_ms: 24000,
            tool_summary: { Bash: 3 },
          },
          after: {
            passed: true,
            cost_usd: 0.06,
            turns_used: 5,
            duration_ms: 19000,
            tool_summary: { Bash: 4 },
          },
          status_change: 'unchanged',
        },
        {
          name: 'planted bugs static',
          before: {
            passed: false,
            cost_usd: 1.00,
            detection_rate: 3,
            tool_summary: {},
          },
          after: {
            passed: true,
            cost_usd: 0.95,
            detection_rate: 4,
            tool_summary: {},
          },
          status_change: 'improved',
        },
      ],
      total_cost_delta: -0.06,
      total_duration_delta: -5000,
      improved: 1,
      regressed: 0,
      unchanged: 1,
      tool_count_before: 3,
      tool_count_after: 4,
    };

    const rendered = formatComparison(fixture);

    // Checked in order; each fragment must appear somewhere in the output.
    const expectedFragments = [
      'vs previous',
      'main',
      '1 improved',
      '1 unchanged',
      '↑', // improved arrow
      '=', // unchanged arrow
      // Turns and duration deltas
      '6→5t',
      '24→19s',
    ];
    for (const fragment of expectedFragments) {
      expect(rendered).toContain(fragment);
    }
  });

  // With three passing tests, one of which halves its turns and duration, the
  // formatted text is expected to include a takeaway section mentioning both.
  test('includes commentary section', () => {
    const fixture: ComparisonResult = {
      before_file: 'a.json',
      after_file: 'b.json',
      before_branch: 'main',
      after_branch: 'main',
      before_timestamp: '2026-03-13T14:30:00Z',
      after_timestamp: '2026-03-14T14:30:00Z',
      deltas: [
        {
          name: 'test-a',
          before: { passed: true, cost_usd: 0.50, turns_used: 20, duration_ms: 120000 },
          after: { passed: true, cost_usd: 0.30, turns_used: 10, duration_ms: 60000 },
          status_change: 'unchanged',
        },
        {
          name: 'test-b',
          before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          status_change: 'unchanged',
        },
        {
          name: 'test-c',
          before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          status_change: 'unchanged',
        },
      ],
      total_cost_delta: -0.20,
      total_duration_delta: -60000,
      improved: 0,
      regressed: 0,
      unchanged: 3,
      tool_count_before: 30,
      tool_count_after: 20,
    };

    const rendered = formatComparison(fixture);

    for (const fragment of ['Takeaway', 'fewer turns', 'faster']) {
      expect(rendered).toContain(fragment);
    }
  });
});
|
|
416
|
+
|
|
417
|
+
// --- generateCommentary tests ---
|
|
418
|
+
|
|
419
|
+
describe('generateCommentary', () => {
  /**
   * Builds the ComparisonResult handed to generateCommentary.
   *
   * Defaults are the neutral values these tests share: 'a.json'/'b.json' file
   * labels, 'main' on both branches, empty timestamps, no deltas, and zeroed
   * totals and counters. Each test overrides only the fields that matter for
   * the behaviour it checks. Keys keep the order of the defaults, so the
   * resulting object has the same shape as a fully spelled-out literal.
   */
  const buildComparison = (overrides: Partial<ComparisonResult>): ComparisonResult => ({
    before_file: 'a.json',
    after_file: 'b.json',
    before_branch: 'main',
    after_branch: 'main',
    before_timestamp: '',
    after_timestamp: '',
    deltas: [],
    total_cost_delta: 0,
    total_duration_delta: 0,
    improved: 0,
    regressed: 0,
    unchanged: 0,
    tool_count_before: 0,
    tool_count_after: 0,
    ...overrides,
  });

  // A pass -> fail transition must be called out and name the affected test.
  test('flags regressions prominently', () => {
    const c = buildComparison({
      deltas: [{
        name: 'critical-test',
        before: { passed: true, cost_usd: 0.10 },
        after: { passed: false, cost_usd: 0.10 },
        status_change: 'regressed',
      }],
      regressed: 1,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('REGRESSION'))).toBe(true);
    expect(notes.some(n => n.includes('critical-test'))).toBe(true);
  });

  // A fail -> pass transition must be reported and name the affected test.
  test('notes improvements', () => {
    const c = buildComparison({
      deltas: [{
        name: 'fixed-test',
        before: { passed: false, cost_usd: 0.10 },
        after: { passed: true, cost_usd: 0.10 },
        status_change: 'improved',
      }],
      improved: 1,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('Fixed'))).toBe(true);
    expect(notes.some(n => n.includes('fixed-test'))).toBe(true);
  });

  // A test that passes in both runs but halves its turns, duration and cost
  // should produce notes about each of those gains.
  test('reports efficiency gains for stable tests', () => {
    const c = buildComparison({
      deltas: [{
        name: 'fast-test',
        before: { passed: true, cost_usd: 0.50, turns_used: 20, duration_ms: 120000 },
        after: { passed: true, cost_usd: 0.25, turns_used: 10, duration_ms: 60000 },
        status_change: 'unchanged',
      }],
      total_cost_delta: -0.25,
      total_duration_delta: -60000,
      unchanged: 1,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('fewer turns'))).toBe(true);
    expect(notes.some(n => n.includes('faster'))).toBe(true);
    expect(notes.some(n => n.includes('cheaper'))).toBe(true);
  });

  // detection_rate going from 3 to 5 should be reported as 2 more bugs found.
  test('reports detection rate changes', () => {
    const c = buildComparison({
      deltas: [{
        name: 'detection-test',
        before: { passed: true, cost_usd: 0.50, detection_rate: 3 },
        after: { passed: true, cost_usd: 0.50, detection_rate: 5 },
        status_change: 'unchanged',
      }],
      unchanged: 1,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('detecting 2 more bugs'))).toBe(true);
  });

  // Three passing tests that all got cheaper/faster and no regressions: the
  // commentary is expected to include an overall summary line.
  test('produces overall summary for 3+ tests with no regressions', () => {
    const c = buildComparison({
      deltas: [
        {
          name: 'a',
          before: { passed: true, cost_usd: 0.50, turns_used: 10, duration_ms: 60000 },
          after: { passed: true, cost_usd: 0.30, turns_used: 6, duration_ms: 40000 },
          status_change: 'unchanged',
        },
        {
          name: 'b',
          before: { passed: true, cost_usd: 0.20, turns_used: 5, duration_ms: 30000 },
          after: { passed: true, cost_usd: 0.15, turns_used: 4, duration_ms: 25000 },
          status_change: 'unchanged',
        },
        {
          name: 'c',
          before: { passed: true, cost_usd: 0.10, turns_used: 3, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.08, turns_used: 3, duration_ms: 18000 },
          status_change: 'unchanged',
        },
      ],
      total_cost_delta: -0.27,
      total_duration_delta: -27000,
      unchanged: 3,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('Overall'))).toBe(true);
    expect(notes.some(n => n.includes('No regressions'))).toBe(true);
  });

  // Near-identical runs (only a 1s duration difference on one test) should be
  // summarised with a 'Stable run' note. The title states what is asserted:
  // a note is present, not an empty result.
  test('reports a stable run when there are no significant changes', () => {
    const c = buildComparison({
      deltas: [
        {
          name: 'a',
          before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 21000 },
          status_change: 'unchanged',
        },
        {
          name: 'b',
          before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          status_change: 'unchanged',
        },
        {
          name: 'c',
          before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 },
          status_change: 'unchanged',
        },
      ],
      total_duration_delta: 1000,
      unchanged: 3,
      tool_count_before: 15,
      tool_count_after: 15,
    });

    const notes = generateCommentary(c);
    expect(notes.some(n => n.includes('Stable run'))).toBe(true);
  });
});
|