opengstack 0.13.10 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +4 -4
- package/CLAUDE.md +127 -110
- package/README.md +10 -5
- package/SKILL.md +500 -70
- package/bin/opengstack.js +69 -69
- package/{skills/land-and-deploy/SKILL.md → commands/autoplan.md} +7 -25
- package/{skills/benchmark/SKILL.md → commands/benchmark.md} +84 -108
- package/{skills/browse/SKILL.md → commands/browse.md} +60 -81
- package/{skills/ship/SKILL.md → commands/canary.md} +7 -27
- package/{skills/careful/SKILL.md → commands/careful.md} +2 -22
- package/{skills/canary/SKILL.md → commands/codex.md} +7 -26
- package/{skills/connect-chrome/SKILL.md → commands/connect-chrome.md} +7 -24
- package/commands/cso.md +70 -0
- package/commands/design-consultation.md +70 -0
- package/commands/design-review.md +70 -0
- package/commands/design-shotgun.md +70 -0
- package/commands/document-release.md +70 -0
- package/{skills/freeze/SKILL.md → commands/freeze.md} +3 -29
- package/{skills/guard/SKILL.md → commands/guard.md} +4 -35
- package/commands/investigate.md +70 -0
- package/commands/land-and-deploy.md +70 -0
- package/commands/office-hours.md +70 -0
- package/{skills/gstack-upgrade/SKILL.md → commands/opengstack-upgrade.md} +64 -79
- package/commands/plan-ceo-review.md +70 -0
- package/commands/plan-design-review.md +70 -0
- package/commands/plan-eng-review.md +70 -0
- package/commands/qa-only.md +70 -0
- package/commands/qa.md +70 -0
- package/commands/retro.md +70 -0
- package/commands/review.md +70 -0
- package/{skills/setup-browser-cookies/SKILL.md → commands/setup-browser-cookies.md} +22 -40
- package/commands/setup-deploy.md +70 -0
- package/commands/ship.md +70 -0
- package/commands/unfreeze.md +25 -0
- package/docs/designs/CHROME_VS_CHROMIUM_EXPLORATION.md +9 -9
- package/docs/designs/CONDUCTOR_CHROME_SIDEBAR_INTEGRATION.md +2 -2
- package/docs/designs/CONDUCTOR_SESSION_API.md +16 -16
- package/docs/designs/DESIGN_SHOTGUN.md +74 -74
- package/docs/designs/DESIGN_TOOLS_V1.md +111 -111
- package/docs/skills.md +483 -202
- package/package.json +42 -43
- package/scripts/analytics.ts +188 -0
- package/scripts/dev-skill.ts +83 -0
- package/scripts/discover-skills.ts +39 -0
- package/scripts/eval-compare.ts +97 -0
- package/scripts/eval-list.ts +117 -0
- package/scripts/eval-select.ts +86 -0
- package/scripts/eval-summary.ts +188 -0
- package/scripts/eval-watch.ts +172 -0
- package/scripts/gen-skill-docs.ts +473 -0
- package/scripts/resolvers/browse.ts +129 -0
- package/scripts/resolvers/codex-helpers.ts +133 -0
- package/scripts/resolvers/composition.ts +48 -0
- package/scripts/resolvers/confidence.ts +37 -0
- package/scripts/resolvers/constants.ts +50 -0
- package/scripts/resolvers/design.ts +950 -0
- package/scripts/resolvers/index.ts +59 -0
- package/scripts/resolvers/learnings.ts +96 -0
- package/scripts/resolvers/preamble.ts +505 -0
- package/scripts/resolvers/review.ts +884 -0
- package/scripts/resolvers/testing.ts +573 -0
- package/scripts/resolvers/types.ts +45 -0
- package/scripts/resolvers/utility.ts +421 -0
- package/scripts/skill-check.ts +190 -0
- package/scripts/cleanup.py +0 -100
- package/scripts/filter-skills.sh +0 -114
- package/scripts/filter_skills.py +0 -164
- package/scripts/install-skills.js +0 -60
- package/skills/autoplan/SKILL.md +0 -96
- package/skills/autoplan/SKILL.md.tmpl +0 -694
- package/skills/benchmark/SKILL.md.tmpl +0 -222
- package/skills/browse/SKILL.md.tmpl +0 -131
- package/skills/browse/bin/find-browse +0 -21
- package/skills/browse/bin/remote-slug +0 -14
- package/skills/browse/scripts/build-node-server.sh +0 -48
- package/skills/browse/src/activity.ts +0 -208
- package/skills/browse/src/browser-manager.ts +0 -959
- package/skills/browse/src/buffers.ts +0 -137
- package/skills/browse/src/bun-polyfill.cjs +0 -109
- package/skills/browse/src/cli.ts +0 -678
- package/skills/browse/src/commands.ts +0 -128
- package/skills/browse/src/config.ts +0 -150
- package/skills/browse/src/cookie-import-browser.ts +0 -625
- package/skills/browse/src/cookie-picker-routes.ts +0 -230
- package/skills/browse/src/cookie-picker-ui.ts +0 -688
- package/skills/browse/src/find-browse.ts +0 -61
- package/skills/browse/src/meta-commands.ts +0 -550
- package/skills/browse/src/platform.ts +0 -17
- package/skills/browse/src/read-commands.ts +0 -358
- package/skills/browse/src/server.ts +0 -1192
- package/skills/browse/src/sidebar-agent.ts +0 -280
- package/skills/browse/src/sidebar-utils.ts +0 -21
- package/skills/browse/src/snapshot.ts +0 -407
- package/skills/browse/src/url-validation.ts +0 -95
- package/skills/browse/src/write-commands.ts +0 -364
- package/skills/browse/test/activity.test.ts +0 -120
- package/skills/browse/test/adversarial-security.test.ts +0 -32
- package/skills/browse/test/browser-manager-unit.test.ts +0 -17
- package/skills/browse/test/bun-polyfill.test.ts +0 -72
- package/skills/browse/test/commands.test.ts +0 -2075
- package/skills/browse/test/compare-board.test.ts +0 -342
- package/skills/browse/test/config.test.ts +0 -316
- package/skills/browse/test/cookie-import-browser.test.ts +0 -519
- package/skills/browse/test/cookie-picker-routes.test.ts +0 -260
- package/skills/browse/test/file-drop.test.ts +0 -271
- package/skills/browse/test/find-browse.test.ts +0 -50
- package/skills/browse/test/findport.test.ts +0 -191
- package/skills/browse/test/fixtures/basic.html +0 -33
- package/skills/browse/test/fixtures/cursor-interactive.html +0 -22
- package/skills/browse/test/fixtures/dialog.html +0 -15
- package/skills/browse/test/fixtures/empty.html +0 -2
- package/skills/browse/test/fixtures/forms.html +0 -55
- package/skills/browse/test/fixtures/iframe.html +0 -30
- package/skills/browse/test/fixtures/network-idle.html +0 -30
- package/skills/browse/test/fixtures/qa-eval-checkout.html +0 -108
- package/skills/browse/test/fixtures/qa-eval-spa.html +0 -98
- package/skills/browse/test/fixtures/qa-eval.html +0 -51
- package/skills/browse/test/fixtures/responsive.html +0 -49
- package/skills/browse/test/fixtures/snapshot.html +0 -55
- package/skills/browse/test/fixtures/spa.html +0 -24
- package/skills/browse/test/fixtures/states.html +0 -17
- package/skills/browse/test/fixtures/upload.html +0 -25
- package/skills/browse/test/gstack-config.test.ts +0 -138
- package/skills/browse/test/gstack-update-check.test.ts +0 -514
- package/skills/browse/test/handoff.test.ts +0 -235
- package/skills/browse/test/path-validation.test.ts +0 -91
- package/skills/browse/test/platform.test.ts +0 -37
- package/skills/browse/test/server-auth.test.ts +0 -65
- package/skills/browse/test/sidebar-agent-roundtrip.test.ts +0 -226
- package/skills/browse/test/sidebar-agent.test.ts +0 -199
- package/skills/browse/test/sidebar-integration.test.ts +0 -320
- package/skills/browse/test/sidebar-unit.test.ts +0 -96
- package/skills/browse/test/snapshot.test.ts +0 -467
- package/skills/browse/test/state-ttl.test.ts +0 -35
- package/skills/browse/test/test-server.ts +0 -57
- package/skills/browse/test/url-validation.test.ts +0 -72
- package/skills/browse/test/watch.test.ts +0 -129
- package/skills/canary/SKILL.md.tmpl +0 -212
- package/skills/careful/SKILL.md.tmpl +0 -56
- package/skills/careful/bin/check-careful.sh +0 -112
- package/skills/codex/SKILL.md +0 -90
- package/skills/codex/SKILL.md.tmpl +0 -417
- package/skills/connect-chrome/SKILL.md.tmpl +0 -195
- package/skills/cso/ACKNOWLEDGEMENTS.md +0 -14
- package/skills/cso/SKILL.md +0 -93
- package/skills/cso/SKILL.md.tmpl +0 -606
- package/skills/design-consultation/SKILL.md +0 -94
- package/skills/design-consultation/SKILL.md.tmpl +0 -415
- package/skills/design-review/SKILL.md +0 -94
- package/skills/design-review/SKILL.md.tmpl +0 -290
- package/skills/design-shotgun/SKILL.md +0 -91
- package/skills/design-shotgun/SKILL.md.tmpl +0 -285
- package/skills/document-release/SKILL.md +0 -91
- package/skills/document-release/SKILL.md.tmpl +0 -359
- package/skills/freeze/SKILL.md.tmpl +0 -77
- package/skills/freeze/bin/check-freeze.sh +0 -79
- package/skills/gstack-upgrade/SKILL.md.tmpl +0 -222
- package/skills/guard/SKILL.md.tmpl +0 -77
- package/skills/investigate/SKILL.md +0 -105
- package/skills/investigate/SKILL.md.tmpl +0 -194
- package/skills/land-and-deploy/SKILL.md.tmpl +0 -881
- package/skills/office-hours/SKILL.md +0 -96
- package/skills/office-hours/SKILL.md.tmpl +0 -645
- package/skills/plan-ceo-review/SKILL.md +0 -94
- package/skills/plan-ceo-review/SKILL.md.tmpl +0 -811
- package/skills/plan-design-review/SKILL.md +0 -92
- package/skills/plan-design-review/SKILL.md.tmpl +0 -446
- package/skills/plan-eng-review/SKILL.md +0 -93
- package/skills/plan-eng-review/SKILL.md.tmpl +0 -303
- package/skills/qa/SKILL.md +0 -95
- package/skills/qa/SKILL.md.tmpl +0 -316
- package/skills/qa/references/issue-taxonomy.md +0 -85
- package/skills/qa/templates/qa-report-template.md +0 -126
- package/skills/qa-only/SKILL.md +0 -89
- package/skills/qa-only/SKILL.md.tmpl +0 -101
- package/skills/retro/SKILL.md +0 -89
- package/skills/retro/SKILL.md.tmpl +0 -820
- package/skills/review/SKILL.md +0 -92
- package/skills/review/SKILL.md.tmpl +0 -281
- package/skills/review/TODOS-format.md +0 -62
- package/skills/review/checklist.md +0 -220
- package/skills/review/design-checklist.md +0 -132
- package/skills/review/greptile-triage.md +0 -220
- package/skills/setup-browser-cookies/SKILL.md.tmpl +0 -81
- package/skills/setup-deploy/SKILL.md +0 -92
- package/skills/setup-deploy/SKILL.md.tmpl +0 -215
- package/skills/ship/SKILL.md.tmpl +0 -636
- package/skills/unfreeze/SKILL.md +0 -37
- package/skills/unfreeze/SKILL.md.tmpl +0 -36
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Show which E2E and LLM-judge tests would run based on the current git diff.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* bun run eval:select # human-readable output
|
|
7
|
+
* bun run eval:select --json # machine-readable JSON
|
|
8
|
+
* bun run eval:select --base main # override base branch
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as path from 'path';
|
|
12
|
+
import {
|
|
13
|
+
selectTests,
|
|
14
|
+
detectBaseBranch,
|
|
15
|
+
getChangedFiles,
|
|
16
|
+
E2E_TOUCHFILES,
|
|
17
|
+
LLM_JUDGE_TOUCHFILES,
|
|
18
|
+
GLOBAL_TOUCHFILES,
|
|
19
|
+
} from '../test/helpers/touchfiles';
|
|
20
|
+
|
|
21
|
+
// Repository root: this script lives one directory below it.
const ROOT = path.resolve(import.meta.dir, '..');

// --- CLI parsing ---
const args = process.argv.slice(2);
const jsonMode = args.includes('--json');
const baseIdx = args.indexOf('--base');
// Only accept a real value after --base. A missing value or another flag
// (e.g. `--base --json`) falls back to auto-detection instead of being
// handed to git as a branch name; git branch names cannot begin with '-'.
const baseCandidate = baseIdx >= 0 ? args[baseIdx + 1] : undefined;
const baseOverride =
  baseCandidate !== undefined && !baseCandidate.startsWith('-') ? baseCandidate : undefined;

/** Result shape of selectTests, derived from the helper so it cannot drift. */
type Selection = ReturnType<typeof selectTests>;

/** Build the machine-readable (--json) payload for one test tier. */
function describeSelection(selection: Selection, total: number) {
  return {
    selected: selection.selected,
    skipped: selection.skipped,
    reason: selection.reason,
    count: `${selection.selected.length}/${total}`,
  };
}

/**
 * Print the human-readable report for one test tier.
 *
 * @param label     Tier name used in the output ("E2E" or "LLM-judge").
 * @param selection Result of selectTests for that tier.
 * @param total     Number of tests registered in the tier's touchfile map.
 */
function printSelection(label: string, selection: Selection, total: number): void {
  const picked = selection.selected.length;
  console.log(`${label} (${selection.reason}): ${picked}/${total} tests`);
  if (picked === 0) {
    console.log(` No ${label} tests affected.`);
  } else if (picked < total) {
    // Only list names when the selection is a strict subset.
    console.log(` Selected: ${selection.selected.join(', ')}`);
    console.log(` Skipped: ${selection.skipped.join(', ')}`);
  } else {
    console.log(` All ${label} tests selected.`);
  }
}

// Detect base branch
const baseBranch = baseOverride || detectBaseBranch(ROOT) || 'main';
const changedFiles = getChangedFiles(baseBranch, ROOT);

if (changedFiles.length === 0) {
  if (jsonMode) {
    console.log(JSON.stringify({ base: baseBranch, changed_files: 0, e2e: 'all', llm_judge: 'all', reason: 'no diff — would run all tests' }));
  } else {
    console.log(`Base: ${baseBranch}`);
    console.log('No changed files detected — all tests would run.');
  }
  process.exit(0);
}

const e2eTotal = Object.keys(E2E_TOUCHFILES).length;
const llmTotal = Object.keys(LLM_JUDGE_TOUCHFILES).length;
const e2eSelection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
const llmSelection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);

if (jsonMode) {
  console.log(JSON.stringify({
    base: baseBranch,
    changed_files: changedFiles,
    e2e: describeSelection(e2eSelection, e2eTotal),
    llm_judge: describeSelection(llmSelection, llmTotal),
  }, null, 2));
} else {
  console.log(`Base: ${baseBranch}`);
  console.log(`Changed files: ${changedFiles.length}`);
  console.log();

  printSelection('E2E', e2eSelection, e2eTotal);
  console.log();

  printSelection('LLM-judge', llmSelection, llmTotal);
}
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Aggregate summary of all eval runs from ~/.opengstack-dev/evals/
|
|
4
|
+
*
|
|
5
|
+
* Usage: bun run eval:summary
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import * as fs from 'fs';
|
|
9
|
+
import * as path from 'path';
|
|
10
|
+
import * as os from 'os';
|
|
11
|
+
import type { EvalResult } from '../test/helpers/eval-store';
|
|
12
|
+
import { getProjectEvalDir } from '../test/helpers/eval-store';
|
|
13
|
+
|
|
14
|
+
// Directory that holds this project's persisted eval results.
const EVAL_DIR = getProjectEvalDir();

let files: string[];
try {
  files = fs.readdirSync(EVAL_DIR).filter(f => f.endsWith('.json'));
} catch {
  // A missing or unreadable directory is treated the same as "no runs yet".
  files = [];
}

if (files.length === 0) {
  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
  process.exit(0);
}

// Load all results
const results: EvalResult[] = [];
for (const file of files) {
  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
  } catch {
    // Unreadable or truncated file (e.g. mid-write): skip it, best effort.
    continue;
  }
  // Every aggregation below iterates `r.tests`, so a JSON value that is not
  // an object carrying a `tests` array would crash the whole summary.
  if (
    typeof parsed === 'object' &&
    parsed !== null &&
    Array.isArray((parsed as { tests?: unknown }).tests)
  ) {
    results.push(parsed as EvalResult);
  }
}
|
|
36
|
+
|
|
37
|
+
// Aggregate stats
const e2eRuns = results.filter(r => r.tier === 'e2e');
const judgeRuns = results.filter(r => r.tier === 'llm-judge');

/** Total cost of a set of runs; a run without total_cost_usd counts as 0. */
const sumCost = (runs: EvalResult[]): number =>
  runs.reduce((s, r) => s + (r.total_cost_usd || 0), 0);

const totalCost = sumCost(results);
// The per-tier averages use the same missing-cost guard as the total, so one
// run without a cost field can no longer turn the average into NaN.
const avgE2ECost = e2eRuns.length > 0 ? sumCost(e2eRuns) / e2eRuns.length : 0;
const avgJudgeCost = judgeRuns.length > 0 ? sumCost(judgeRuns) / judgeRuns.length : 0;

// Duration + turns from E2E runs
const avgE2EDuration = e2eRuns.length > 0
  ? e2eRuns.reduce((s, r) => s + (r.total_duration_ms || 0), 0) / e2eRuns.length
  : 0;

// Total turns per run; runs that recorded no turns are left out of the average.
const e2eTurns: number[] = [];
for (const r of e2eRuns) {
  const runTurns = r.tests.reduce((s, t) => s + (t.turns_used || 0), 0);
  if (runTurns > 0) e2eTurns.push(runTurns);
}
const avgE2ETurns = e2eTurns.length > 0
  ? e2eTurns.reduce((a, b) => a + b, 0) / e2eTurns.length
  : 0;
|
|
56
|
+
|
|
57
|
+
// Per-test samples (turns, durations, costs) gathered across all E2E runs,
// keyed by test name. Averages are computed later, at print time.
const testEfficiency = new Map<string, { turns: number[]; durations: number[]; costs: number[] }>();
for (const run of e2eRuns) {
  for (const test of run.tests) {
    let samples = testEfficiency.get(test.name);
    if (samples === undefined) {
      samples = { turns: [], durations: [], costs: [] };
      testEfficiency.set(test.name, samples);
    }
    // Turns are recorded whenever present; durations and costs only when positive.
    if (test.turns_used !== undefined) samples.turns.push(test.turns_used);
    if (test.duration_ms > 0) samples.durations.push(test.duration_ms);
    if (test.cost_usd > 0) samples.costs.push(test.cost_usd);
  }
}
|
|
70
|
+
|
|
71
|
+
// Detection rates: every defined `detection_rate` found on a test of an E2E run.
const detectionRates: number[] = e2eRuns.flatMap(run =>
  run.tests.flatMap(test => (test.detection_rate !== undefined ? [test.detection_rate] : [])),
);
const detectionTotal = detectionRates.reduce((sum, rate) => sum + rate, 0);
// null (rather than 0) signals "no test reported a detection rate".
const avgDetection = detectionRates.length === 0
  ? null
  : detectionTotal / detectionRates.length;
|
|
83
|
+
|
|
84
|
+
// Flaky tests: a `tier:name` key seen in at least two runs with both a
// passing and a failing outcome.
const testResults = new Map<string, boolean[]>();
for (const run of results) {
  for (const test of run.tests) {
    const key = `${run.tier}:${test.name}`;
    const outcomes = testResults.get(key) ?? [];
    outcomes.push(test.passed);
    testResults.set(key, outcomes);
  }
}
const flakyTests: string[] = [...testResults.entries()]
  .filter(([, outcomes]) =>
    outcomes.length >= 2 &&
    outcomes.some(passed => passed) &&
    outcomes.some(passed => !passed))
  .map(([key]) => key);
|
|
101
|
+
|
|
102
|
+
// Per-branch stats over E2E runs: run count plus every detection rate seen.
const branchStats = new Map<string, { runs: number; avgDetection: number; detections: number[] }>();
for (const run of e2eRuns) {
  let entry = branchStats.get(run.branch);
  if (!entry) {
    entry = { runs: 0, avgDetection: 0, detections: [] };
    branchStats.set(run.branch, entry);
  }
  entry.runs += 1;
  for (const test of run.tests) {
    if (test.detection_rate !== undefined) {
      entry.detections.push(test.detection_rate);
    }
  }
}
// Second pass: fill in the averages once all samples are collected
// (stays 0 for branches without any detection data).
branchStats.forEach(entry => {
  const sampleCount = entry.detections.length;
  entry.avgDetection = sampleCount > 0
    ? entry.detections.reduce((sum, rate) => sum + rate, 0) / sampleCount
    : 0;
});
|
|
121
|
+
|
|
122
|
+
// Print summary

/** Arithmetic mean; 0 for an empty list so sorting and formatting never see NaN. */
const mean = (values: number[]): number =>
  values.length > 0 ? values.reduce((sum, v) => sum + v, 0) / values.length : 0;

console.log('');
console.log('Eval Summary');
console.log('═'.repeat(70));
console.log(` Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
console.log(` Total spend: $${totalCost.toFixed(2)}`);
console.log(` Avg cost/e2e: $${avgE2ECost.toFixed(2)}`);
console.log(` Avg cost/judge: $${avgJudgeCost.toFixed(2)}`);
if (avgE2EDuration > 0) {
  console.log(` Avg duration/e2e: ${Math.round(avgE2EDuration / 1000)}s`);
}
if (avgE2ETurns > 0) {
  console.log(` Avg turns/e2e: ${Math.round(avgE2ETurns)}`);
}
if (avgDetection !== null) {
  console.log(` Avg detection: ${avgDetection.toFixed(1)} bugs`);
}
console.log('─'.repeat(70));

// Per-test efficiency averages (only if we have enough data)
if (testEfficiency.size > 0 && e2eRuns.length >= 2) {
  console.log(' Per-test efficiency (averages across runs):');
  // Rows are kept when a test has >= 2 turn samples, but its cost/duration
  // lists only hold positive values and may be empty. `mean` maps that case
  // to 0, which keeps the comparator consistent (most expensive first) and
  // avoids printing "NaN".
  const sorted = [...testEfficiency.entries()]
    .filter(([, s]) => s.turns.length >= 2)
    .sort((a, b) => mean(b[1].costs) - mean(a[1].costs));
  for (const [name, stats] of sorted) {
    const avgT = Math.round(mean(stats.turns));
    const avgD = Math.round(mean(stats.durations) / 1000);
    const avgC = mean(stats.costs).toFixed(2);
    // Fixed 30-character name column: truncate with an ellipsis or pad.
    const label = name.length > 30 ? name.slice(0, 27) + '...' : name.padEnd(30);
    console.log(` ${label} $${avgC} ${avgT}t ${avgD}s (${stats.turns.length} runs)`);
  }
  console.log('─'.repeat(70));
}

if (flakyTests.length > 0) {
  console.log(` Flaky tests (${flakyTests.length}):`);
  for (const name of flakyTests) {
    console.log(` - ${name}`);
  }
  console.log('─'.repeat(70));
}

if (branchStats.size > 0) {
  console.log(' Branches:');
  // Highest average detection first.
  const sorted = [...branchStats.entries()].sort((a, b) => b[1].avgDetection - a[1].avgDetection);
  for (const [branch, stats] of sorted) {
    const det = stats.detections.length > 0 ? ` avg det: ${stats.avgDetection.toFixed(1)}` : '';
    console.log(` ${branch.padEnd(30)} ${stats.runs} runs${det}`);
  }
  console.log('─'.repeat(70));
}

// Date range
// Timestamps are sorted as strings. NOTE(review): this assumes a lexically
// sortable format such as ISO 8601 — confirm against the eval store.
const timestamps = results.map(r => r.timestamp).filter(Boolean).sort();
if (timestamps.length > 0) {
  const first = timestamps[0].replace('T', ' ').slice(0, 16);
  const last = timestamps[timestamps.length - 1].replace('T', ' ').slice(0, 16);
  console.log(` Date range: ${first} → ${last}`);
}

console.log(` Dir: ${EVAL_DIR}`);
console.log('');
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Live E2E test watcher dashboard.
|
|
3
|
+
*
|
|
4
|
+
* Reads heartbeat (e2e-live.json) for current test status and
|
|
5
|
+
* partial eval results (_partial-e2e.json) for completed tests.
|
|
6
|
+
* Renders a terminal dashboard every 1s.
|
|
7
|
+
*
|
|
8
|
+
* Usage: bun run eval:watch [--tail]
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as fs from 'fs';
|
|
12
|
+
import * as path from 'path';
|
|
13
|
+
import * as os from 'os';
|
|
14
|
+
|
|
15
|
+
const OpenGStack_DEV_DIR = path.join(os.homedir(), '.opengstack-dev');
|
|
16
|
+
const HEARTBEAT_PATH = path.join(OpenGStack_DEV_DIR, 'e2e-live.json');
|
|
17
|
+
const PARTIAL_PATH = path.join(OpenGStack_DEV_DIR, 'evals', '_partial-e2e.json');
|
|
18
|
+
const STALE_THRESHOLD_SEC = 600; // 10 minutes
|
|
19
|
+
|
|
20
|
+
/** Shape of the live heartbeat file (e2e-live.json) as read by this watcher. */
export interface HeartbeatData {
  /** Run identifier; shown in the header and used to locate the run's progress.log. */
  runId: string;
  /** Optional PID; when present the watcher probes it to auto-clear a dead run's heartbeat. */
  pid?: number;
  /** Not read by this file. Presumably the run start timestamp — verify against the writer. */
  startedAt: string;
  /** Name of the test shown on the "running" line. */
  currentTest: string;
  /** Only the value 'running' is checked by this file. */
  status: string;
  /** Turn number shown on the "running" line. */
  turn: number;
  /** Not read by this file. */
  toolCount: number;
  /** Tool name shown after "last:" on the "running" line. */
  lastTool: string;
  /** Parsed with `new Date(...)` to detect a stale run. */
  lastToolAt: string;
  /** Elapsed seconds, rendered through formatDuration. */
  elapsedSec: number;
}
|
|
32
|
+
|
|
33
|
+
/** Shape of the partial eval results file (_partial-e2e.json) as read by this watcher. */
export interface PartialData {
  /** One entry per completed test; each is rendered as one dashboard row. */
  tests: {
    name: string;
    /** Selects the check-mark or cross icon. */
    passed: boolean;
    cost_usd: number;
    /** Milliseconds; shown rounded to whole seconds. */
    duration_ms: number;
    /** Shown as "<n> turns" when defined. */
    turns_used?: number;
    /** Not read by this file. */
    exit_reason?: string;
  }[];
  /** Shown as "Cost" in the summary line. */
  total_cost_usd: number;
  /** Not read by this file. */
  _partial?: boolean;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Load a file and parse it as JSON.
 *
 * @param filePath Path of the file to read (UTF-8).
 * @returns The parsed value, or null when the file cannot be read or does
 *          not contain valid JSON. The result is not validated against T.
 */
function readJSON<T>(filePath: string): T | null {
  let raw: string;
  try {
    raw = fs.readFileSync(filePath, 'utf-8');
  } catch {
    return null;
  }
  try {
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Check whether a process with the given PID exists.
 *
 * Sends signal 0, which performs the existence/permission check without
 * delivering a signal (it does not kill the process).
 *
 * @param pid Process ID to probe.
 * @returns true when the process exists, including when it exists but this
 *          process lacks permission to signal it; false otherwise.
 */
function isProcessAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch (err: unknown) {
    // EPERM means the target exists but is owned by another user. Treating
    // it as dead would make the caller delete the heartbeat of a live run.
    // Any other failure (e.g. ESRCH) is reported as "not alive".
    return typeof err === 'object' && err !== null && (err as { code?: unknown }).code === 'EPERM';
  }
}
|
|
64
|
+
|
|
65
|
+
/**
 * Format a duration in seconds as "Ys" (under a minute) or "Xm Ys".
 *
 * The input is rounded to whole seconds first; negative or non-finite
 * values (NaN, Infinity) are rendered as "0s" instead of output such as
 * "1m 30.5s" or "NaNm NaNs".
 *
 * @param sec Duration in seconds.
 * @returns Human-readable duration string.
 */
function formatDuration(sec: number): string {
  const whole = Number.isFinite(sec) ? Math.max(0, Math.round(sec)) : 0;
  if (whole < 60) return `${whole}s`;
  const minutes = Math.floor(whole / 60);
  const seconds = whole % 60;
  return `${minutes}m ${seconds}s`;
}
|
|
72
|
+
|
|
73
|
+
/**
 * Render the dashboard text from heartbeat + partial data.
 *
 * Performs no file I/O, which keeps it easy to test; it does read the
 * current time (Date.now) for stale-run detection. Records missing a
 * numeric or name field (e.g. from an incompletely written JSON file) are
 * rendered with zero/empty defaults instead of throwing.
 *
 * @param heartbeat Parsed heartbeat file, or null when unavailable.
 * @param partial   Parsed partial-results file, or null when unavailable.
 * @returns The dashboard as a newline-joined string (no trailing newline).
 */
export function renderDashboard(heartbeat: HeartbeatData | null, partial: PartialData | null): string {
  /** Fit a test name into the fixed 30-character column: truncate with "..." or pad. */
  const fitName = (name: string): string =>
    name.length > 30 ? name.slice(0, 27) + '...' : name.padEnd(30);

  const lines: string[] = [];

  if (!heartbeat && !partial) {
    lines.push('E2E Watch — No active run detected');
    lines.push('');
    lines.push(`Heartbeat: ${HEARTBEAT_PATH} (not found)`);
    lines.push(`Partial: ${PARTIAL_PATH} (not found)`);
    lines.push('');
    lines.push('Start a run with: EVALS=1 bun test test/skill-e2e-*.test.ts');
    return lines.join('\n');
  }

  const runId = heartbeat?.runId || 'unknown';
  const elapsed = heartbeat?.elapsedSec || 0;
  lines.push(`E2E Watch \u2014 Run ${runId} \u2014 ${formatDuration(elapsed)}`);
  lines.push('\u2550'.repeat(55));

  // Completed tests from partial
  const completed = partial && Array.isArray(partial.tests) ? partial.tests : [];
  for (const t of completed) {
    const icon = t.passed ? '\u2713' : '\u2717';
    const cost = `$${(t.cost_usd ?? 0).toFixed(2)}`;
    const dur = `${Math.round((t.duration_ms ?? 0) / 1000)}s`;
    const turns = t.turns_used !== undefined ? `${t.turns_used} turns` : '';
    lines.push(` ${icon} ${fitName(t.name ?? '')} ${cost.padStart(6)} ${dur.padStart(5)} ${turns}`);
  }

  // Current test from heartbeat
  if (heartbeat && heartbeat.status === 'running') {
    const name = fitName(heartbeat.currentTest ?? '');
    lines.push(` \u29D6 ${name} ${formatDuration(elapsed).padStart(6)} turn ${heartbeat.turn} last: ${heartbeat.lastTool}`);

    // Stale detection: warn when the last tool call is older than the
    // threshold. An unparsable lastToolAt yields NaN, which never exceeds
    // the threshold, so no warning is shown in that case.
    const lastToolTime = new Date(heartbeat.lastToolAt).getTime();
    const staleSec = Math.round((Date.now() - lastToolTime) / 1000);
    if (staleSec > STALE_THRESHOLD_SEC) {
      lines.push(` \u26A0 STALE: last tool call was ${formatDuration(staleSec)} ago \u2014 run may have crashed`);
    }
  }

  lines.push('\u2500'.repeat(55));

  // Summary
  const completedCount = completed.length;
  const totalCost = partial?.total_cost_usd || 0;
  const running = heartbeat?.status === 'running' ? 1 : 0;
  lines.push(` Completed: ${completedCount} Running: ${running} Cost: $${totalCost.toFixed(2)} Elapsed: ${formatDuration(elapsed)}`);

  if (heartbeat?.runId) {
    const logPath = path.join(OpenGStack_DEV_DIR, 'e2e-runs', heartbeat.runId, 'progress.log');
    lines.push(` Logs: ${logPath}`);
  }

  return lines.join('\n');
}
|
|
134
|
+
|
|
135
|
+
// --- Main ---
|
|
136
|
+
|
|
137
|
+
// Runs only when this file is executed directly, not when it is imported
// (e.g. by tests that exercise renderDashboard).
if (import.meta.main) {
  const showTail = process.argv.includes('--tail');

  /** Read both state files and redraw the whole screen once. */
  const render = () => {
    let heartbeat = readJSON<HeartbeatData>(HEARTBEAT_PATH);
    const partial = readJSON<PartialData>(PARTIAL_PATH);

    // Auto-clear heartbeat if the process is dead. The notice is kept and
    // written after the screen clear below; writing it before the clear
    // would erase it immediately, so it would never be visible.
    let notice = '';
    if (heartbeat?.pid && !isProcessAlive(heartbeat.pid)) {
      try { fs.unlinkSync(HEARTBEAT_PATH); } catch { /* already gone */ }
      notice = `Cleared stale heartbeat — PID ${heartbeat.pid} is no longer running.\n\n`;
      heartbeat = null;
    }

    // Clear screen (ANSI: erase display, move cursor home), then draw.
    process.stdout.write('\x1B[2J\x1B[H');
    process.stdout.write(notice + renderDashboard(heartbeat, partial) + '\n');

    // --tail: show last 10 non-blank lines of progress.log
    if (showTail && heartbeat?.runId) {
      const logPath = path.join(OpenGStack_DEV_DIR, 'e2e-runs', heartbeat.runId, 'progress.log');
      try {
        const content = fs.readFileSync(logPath, 'utf-8');
        const tail = content.split('\n').filter(l => l.trim()).slice(-10);
        process.stdout.write('\nRecent progress:\n');
        for (const line of tail) {
          process.stdout.write(line + '\n');
        }
      } catch { /* log file may not exist yet */ }
    }
  };

  // Draw immediately, then refresh once per second.
  render();
  setInterval(render, 1000);
}
|