opengstack 0.14.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +4 -4
- package/CLAUDE.md +127 -110
- package/README.md +10 -5
- package/SKILL.md +500 -70
- package/bin/opengstack.js +69 -69
- package/commands/autoplan.md +7 -9
- package/commands/benchmark.md +84 -91
- package/commands/browse.md +60 -64
- package/commands/canary.md +7 -9
- package/commands/careful.md +2 -2
- package/commands/codex.md +7 -9
- package/commands/connect-chrome.md +7 -9
- package/commands/cso.md +7 -9
- package/commands/design-consultation.md +7 -9
- package/commands/design-review.md +7 -9
- package/commands/design-shotgun.md +7 -9
- package/commands/document-release.md +7 -9
- package/commands/freeze.md +3 -3
- package/commands/guard.md +4 -4
- package/commands/investigate.md +7 -9
- package/commands/land-and-deploy.md +7 -9
- package/commands/office-hours.md +7 -9
- package/commands/{gstack-upgrade.md → opengstack-upgrade.md} +64 -65
- package/commands/plan-ceo-review.md +7 -9
- package/commands/plan-design-review.md +7 -9
- package/commands/plan-eng-review.md +7 -9
- package/commands/qa-only.md +7 -9
- package/commands/qa.md +7 -9
- package/commands/retro.md +7 -9
- package/commands/review.md +7 -9
- package/commands/setup-browser-cookies.md +22 -26
- package/commands/setup-deploy.md +7 -9
- package/commands/ship.md +7 -9
- package/commands/unfreeze.md +7 -7
- package/docs/designs/CHROME_VS_CHROMIUM_EXPLORATION.md +9 -9
- package/docs/designs/CONDUCTOR_CHROME_SIDEBAR_INTEGRATION.md +2 -2
- package/docs/designs/CONDUCTOR_SESSION_API.md +16 -16
- package/docs/designs/DESIGN_SHOTGUN.md +74 -74
- package/docs/designs/DESIGN_TOOLS_V1.md +111 -111
- package/docs/skills.md +483 -202
- package/package.json +42 -43
- package/scripts/analytics.ts +188 -0
- package/scripts/dev-skill.ts +83 -0
- package/scripts/discover-skills.ts +39 -0
- package/scripts/eval-compare.ts +97 -0
- package/scripts/eval-list.ts +117 -0
- package/scripts/eval-select.ts +86 -0
- package/scripts/eval-summary.ts +188 -0
- package/scripts/eval-watch.ts +172 -0
- package/scripts/gen-skill-docs.ts +473 -0
- package/scripts/resolvers/browse.ts +129 -0
- package/scripts/resolvers/codex-helpers.ts +133 -0
- package/scripts/resolvers/composition.ts +48 -0
- package/scripts/resolvers/confidence.ts +37 -0
- package/scripts/resolvers/constants.ts +50 -0
- package/scripts/resolvers/design.ts +950 -0
- package/scripts/resolvers/index.ts +59 -0
- package/scripts/resolvers/learnings.ts +96 -0
- package/scripts/resolvers/preamble.ts +505 -0
- package/scripts/resolvers/review.ts +884 -0
- package/scripts/resolvers/testing.ts +573 -0
- package/scripts/resolvers/types.ts +45 -0
- package/scripts/resolvers/utility.ts +421 -0
- package/scripts/skill-check.ts +190 -0
- package/scripts/cleanup.py +0 -100
- package/scripts/filter-skills.sh +0 -114
- package/scripts/filter_skills.py +0 -164
- package/scripts/install-commands.js +0 -45
- package/scripts/install-skills.js +0 -60
package/package.json
CHANGED
|
@@ -1,47 +1,46 @@
|
|
|
1
1
|
{
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
"postinstall": "node scripts/install-commands.js",
|
|
2
|
+
"name": "opengstack",
|
|
3
|
+
"version": "0.14.2",
|
|
4
|
+
"private": false,
|
|
5
|
+
"description": "AI Engineering Workflow - Native slash commands for OpenCode. Open source AI engineering workflow. No telemetry. No tracking.",
|
|
6
|
+
"keywords": [
|
|
7
|
+
"ai-agents",
|
|
8
|
+
"claude",
|
|
9
|
+
"workflow",
|
|
10
|
+
"skills",
|
|
11
|
+
"agent-instructions",
|
|
12
|
+
"opengstack",
|
|
13
|
+
"opengstack"
|
|
14
|
+
],
|
|
15
|
+
"homepage": "https://github.com/Ambisphaeric/opengstack#readme",
|
|
16
|
+
"bugs": {
|
|
17
|
+
"url": "https://github.com/Ambisphaeric/opengstack/issues"
|
|
18
|
+
},
|
|
19
|
+
"repository": {
|
|
20
|
+
"type": "git",
|
|
21
|
+
"url": "git+https://github.com/Ambisphaeric/OpenGStack.git"
|
|
22
|
+
},
|
|
23
|
+
"license": "MIT",
|
|
24
|
+
"author": "Ambisphaeric",
|
|
25
|
+
"type": "commonjs",
|
|
26
|
+
"main": "SKILL.md",
|
|
27
|
+
"bin": {
|
|
28
|
+
"opengstack": "./bin/opengstack.js"
|
|
29
|
+
},
|
|
30
|
+
"directories": {
|
|
31
|
+
"doc": "docs"
|
|
32
|
+
},
|
|
33
|
+
"files": [
|
|
34
|
+
"SKILL.md",
|
|
35
|
+
"CLAUDE.md",
|
|
36
|
+
"AGENTS.md",
|
|
37
|
+
"README.md",
|
|
38
|
+
"commands/",
|
|
39
|
+
"scripts/",
|
|
40
|
+
"docs/",
|
|
41
|
+
"bin/"
|
|
42
|
+
],
|
|
43
|
+
"scripts": {
|
|
45
44
|
"test": "echo \"Error: no test specified\" && exit 1"
|
|
46
45
|
}
|
|
47
46
|
}
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* analytics — CLI for viewing opengstack skill usage statistics.
|
|
4
|
+
*
|
|
5
|
+
* - Top skills by invocation count
|
|
6
|
+
* - Per-repo skill breakdown
|
|
7
|
+
* - Safety hook fire events
|
|
8
|
+
*
|
|
9
|
+
* Usage: bun run scripts/analytics.ts [--period <N>d|all]
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import * as fs from 'fs';
|
|
13
|
+
import * as path from 'path';
|
|
14
|
+
import * as os from 'os';
|
|
15
|
+
|
|
16
|
+
/**
 * One record of the skill-usage log.
 *
 * parseJSONL only checks that `ts` is a string; the remaining fields are
 * taken from the parsed JSON as-is.
 */
export interface AnalyticsEvent {
  /** Skill name; formatReport prints it with a leading "/". */
  skill: string;
  /** Timestamp string; filterByPeriod parses it with `new Date(ts)`. */
  ts: string;
  /** Repository label used to group the "By Repo" section of the report. */
  repo: string;
  /** Event kind; 'hook_fire' marks a safety-hook event, anything else counts as a skill invocation. */
  event?: string;
  /** Hook pattern name; counted only for 'hook_fire' events. */
  pattern?: string;
}
|
|
23
|
+
|
|
24
|
+
// JSONL usage log read by main(): <home>/.OpenGStack/analytics/skill-usage.jsonl
const ANALYTICS_FILE = path.join(os.homedir(), '.OpenGStack', 'analytics', 'skill-usage.jsonl');
|
|
25
|
+
|
|
26
|
+
/**
 * Turn raw JSONL text into a list of analytics events.
 *
 * Blank lines, lines that are not valid JSON, and JSON values that are not
 * objects carrying a string `ts` field are dropped without an error.
 *
 * @param content - Text with one JSON document per line.
 * @returns The accepted events, in input order.
 */
export function parseJSONL(content: string): AnalyticsEvent[] {
  const decode = (raw: string): unknown => {
    try {
      return JSON.parse(raw);
    } catch {
      return undefined; // malformed line: rejected by the shape check below
    }
  };

  return content
    .split('\n')
    .map(row => row.trim())
    .filter(row => row.length > 0)
    .map(row => decode(row))
    .filter((value): value is AnalyticsEvent =>
      typeof value === 'object' &&
      value !== null &&
      typeof (value as { ts?: unknown }).ts === 'string');
}
|
|
45
|
+
|
|
46
|
+
/**
 * Keep only the events that fall inside a trailing time window.
 *
 * @param events - Events to filter.
 * @param period - 'all', or '<N>d' for the last N days (e.g. '7d', '30d').
 *   Any other value is treated like 'all'.
 * @returns The same array for 'all'/unrecognised periods; otherwise a new
 *   array of events whose `ts` parses to a date at or after the cutoff.
 *   Events with an unparseable `ts` are excluded.
 */
export function filterByPeriod(events: AnalyticsEvent[], period: string): AnalyticsEvent[] {
  const parsed = /^(\d+)d$/.exec(period);
  if (period === 'all' || parsed === null) {
    return events;
  }

  const windowDays = parseInt(parsed[1], 10);
  // Going through Date keeps an out-of-range cutoff as NaN, which matches nothing.
  const cutoffMs = new Date(Date.now() - windowDays * 24 * 60 * 60 * 1000).getTime();

  return events.filter(({ ts }) => {
    const stampMs = new Date(ts).getTime();
    return !isNaN(stampMs) && stampMs >= cutoffMs;
  });
}
|
|
63
|
+
|
|
64
|
+
/**
 * Render a plain-text usage report for a list of events.
 *
 * Events whose `event` is 'hook_fire' are reported under "Safety Hook Events"
 * (grouped by `pattern`); every other event counts as a skill invocation and
 * feeds "Top Skills" and "By Repo". Sections with no data are omitted.
 *
 * parseJSONL only guarantees a string `ts`, so a missing or non-string
 * `skill`/`repo` is reported as "unknown" rather than throwing.
 *
 * @param events - Events to summarise (already filtered by period).
 * @param period - 'all' or e.g. '7d'; only affects the "Period:" line.
 * @returns The report as one newline-joined string.
 */
export function formatReport(events: AnalyticsEvent[], period: string = 'all'): string {
  // Grouping keys can be absent at runtime even though the interface requires them.
  const keyOf = (value: unknown): string =>
    typeof value === 'string' && value !== '' ? value : 'unknown';

  const bump = (counts: Map<string, number>, key: string): void => {
    counts.set(key, (counts.get(key) || 0) + 1);
  };

  const byCountDesc = (a: [string, number], b: [string, number]): number => b[1] - a[1];

  // "label ..... suffix": the dot run shrinks as the text grows, but never below 2.
  const dottedRow = (label: string, suffix: string): string => {
    const dotLen = Math.max(2, 25 - label.length - suffix.length);
    return ` ${label} ${'.'.repeat(dotLen)} ${suffix}`;
  };

  const skillEvents = events.filter(e => e.event !== 'hook_fire');
  const hookEvents = events.filter(e => e.event === 'hook_fire');

  const lines: string[] = [];
  lines.push('opengstack skill usage analytics');
  lines.push('\u2550'.repeat(39));
  lines.push('');

  const periodLabel = period === 'all' ? 'all time' : `last ${period.replace('d', ' days')}`;
  lines.push(`Period: ${periodLabel}`);

  // Top Skills
  const skillCounts = new Map<string, number>();
  for (const e of skillEvents) {
    bump(skillCounts, keyOf(e.skill));
  }

  if (skillCounts.size > 0) {
    lines.push('');
    lines.push('Top Skills');

    for (const [name, count] of [...skillCounts.entries()].sort(byCountDesc)) {
      lines.push(dottedRow(`/${name}`, `${count} invocation${count === 1 ? '' : 's'}`));
    }
  }

  // By Repo
  const repoSkills = new Map<string, Map<string, number>>();
  for (const e of skillEvents) {
    const repo = keyOf(e.repo);
    if (!repoSkills.has(repo)) repoSkills.set(repo, new Map());
    bump(repoSkills.get(repo)!, keyOf(e.skill));
  }

  if (repoSkills.size > 0) {
    lines.push('');
    lines.push('By Repo');

    const sortedRepos = [...repoSkills.entries()].sort((a, b) => a[0].localeCompare(b[0]));
    for (const [repo, skills] of sortedRepos) {
      const parts = [...skills.entries()]
        .sort(byCountDesc)
        .map(([s, c]) => `${s}(${c})`);
      lines.push(` ${repo}: ${parts.join(' ')}`);
    }
  }

  // Safety Hook Events (hook events without a pattern are counted in the total only)
  const hookCounts = new Map<string, number>();
  for (const e of hookEvents) {
    if (e.pattern) {
      bump(hookCounts, e.pattern);
    }
  }

  if (hookCounts.size > 0) {
    lines.push('');
    lines.push('Safety Hook Events');

    for (const [pattern, count] of [...hookCounts.entries()].sort(byCountDesc)) {
      lines.push(dottedRow(pattern, `${count} fire${count === 1 ? '' : 's'}`));
    }
  }

  // Total
  const totalSkills = skillEvents.length;
  const totalHooks = hookEvents.length;
  lines.push('');
  lines.push(`Total: ${totalSkills} skill invocation${totalSkills === 1 ? '' : 's'}, ${totalHooks} hook fire${totalHooks === 1 ? '' : 's'}`);

  return lines.join('\n');
}
|
|
152
|
+
|
|
153
|
+
/**
 * CLI entry point: read the analytics log, apply the optional
 * `--period <value>` filter, and print the formatted report.
 *
 * Prints "No analytics data found." and exits with status 0 when the log
 * file is missing, empty, or contains no usable events.
 */
function main() {
  const argv = process.argv.slice(2);

  // The last "--period <value>" pair wins; a trailing "--period" with no value is ignored.
  let period = 'all';
  let idx = 0;
  while (idx < argv.length) {
    const hasValue = idx + 1 < argv.length;
    if (argv[idx] === '--period' && hasValue) {
      period = argv[idx + 1];
      idx += 2;
    } else {
      idx += 1;
    }
  }

  const exitNoData = (): never => {
    console.log('No analytics data found.');
    process.exit(0);
  };

  if (!fs.existsSync(ANALYTICS_FILE)) exitNoData();

  const raw = fs.readFileSync(ANALYTICS_FILE, 'utf-8').trim();
  if (!raw) exitNoData();

  const events = parseJSONL(raw);
  if (events.length === 0) exitNoData();

  console.log(formatReport(filterByPeriod(events, period), period));
}

// Run only when this file is the entry script, so the exports can be imported without side effects.
if (import.meta.main) {
  main();
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* dev:skill — Watch mode for SKILL.md template development.
|
|
4
|
+
*
|
|
5
|
+
* Watches .tmpl files, regenerates SKILL.md files on change,
|
|
6
|
+
* validates all $B commands immediately.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { validateSkill } from '../test/helpers/skill-parser';
|
|
10
|
+
import { discoverTemplates } from './discover-skills';
|
|
11
|
+
import { execSync } from 'child_process';
|
|
12
|
+
import * as fs from 'fs';
|
|
13
|
+
import * as path from 'path';
|
|
14
|
+
|
|
15
|
+
// Parent of the directory reported by import.meta.dir.
const ROOT = path.resolve(import.meta.dir, '..');

// Discovered templates: `tmpl` made absolute under ROOT, `output` left as returned by discoverTemplates.
const TEMPLATES = discoverTemplates(ROOT).map(({ tmpl, output }) => {
  return { tmpl: path.join(ROOT, tmpl), output };
});
|
|
21
|
+
|
|
22
|
+
/**
 * Regenerate the skill docs, then validate every generated file.
 *
 * Runs `bun run scripts/gen-skill-docs.ts` from ROOT. If that command fails,
 * its stderr (or the error message) is logged and validation is skipped.
 * Otherwise each template output that exists on disk is passed to
 * validateSkill and a one-line pass/fail summary is printed, followed by one
 * line per unknown command or snapshot-flag error.
 */
function regenerateAndValidate() {
  try {
    execSync('bun run scripts/gen-skill-docs.ts', { cwd: ROOT, stdio: 'pipe' });
  } catch (err: any) {
    const reason = err.stderr?.toString().trim() || err.message;
    console.log(` [gen] ERROR: ${reason}`);
    return;
  }

  TEMPLATES.forEach(({ output }) => {
    const generated = path.join(ROOT, output);
    if (!fs.existsSync(generated)) return;

    const { valid, invalid, snapshotFlagErrors } = validateSkill(generated);
    const validCount = valid.length;
    const hasProblems = invalid.length > 0 || snapshotFlagErrors.length > 0;

    if (!hasProblems) {
      console.log(` [check] \u2705 ${output} — ${validCount} commands, all valid`);
      return;
    }

    console.log(` [check] \u274c ${output} (${validCount} valid)`);
    invalid.forEach(bad => {
      console.log(` Unknown command: '${bad.command}' at line ${bad.line}`);
    });
    snapshotFlagErrors.forEach(flagErr => {
      console.log(` ${flagErr.error} at line ${flagErr.command.line}`);
    });
  });
}
|
|
54
|
+
|
|
55
|
+
// Initial run: regenerate and validate once at startup.
console.log(' [watch] Watching *.md.tmpl files...');
regenerateAndValidate();

/** Re-run the pipeline whenever `file` changes; a file missing at startup is not watched. */
function watchIfPresent(file: string): void {
  if (!fs.existsSync(file)) return;
  fs.watch(file, () => {
    console.log(`\n [watch] ${path.relative(ROOT, file)} changed`);
    regenerateAndValidate();
  });
}

// Watch every discovered template.
TEMPLATES.forEach(({ tmpl }) => watchIfPresent(tmpl));

// Also watch commands.ts and snapshot.ts (source of truth changes).
const SOURCE_FILES = ['commands.ts', 'snapshot.ts'].map(file => path.join(ROOT, 'browse', 'src', file));
SOURCE_FILES.forEach(src => watchIfPresent(src));

// Keep alive: presumably the active watchers hold the process open (fs.watch is persistent by default).
console.log(' [watch] Press Ctrl+C to stop\n');
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared discovery for SKILL.md and .tmpl files.
|
|
3
|
+
* Scans root + one level of subdirs, skipping node_modules, dist, and every dot-directory (including .git).
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import * as fs from 'fs';
|
|
7
|
+
import * as path from 'path';
|
|
8
|
+
|
|
9
|
+
// Directory names that are never scanned.
const SKIP = new Set(['node_modules', '.git', 'dist']);

/**
 * List the names of `root`'s immediate child directories that should be
 * scanned: dot-directories and names in SKIP are left out.
 *
 * @param root - Directory whose children are listed.
 * @returns Directory names (not paths), in readdir order.
 */
function subdirs(root: string): string[] {
  const names: string[] = [];
  for (const entry of fs.readdirSync(root, { withFileTypes: true })) {
    const hidden = entry.name.startsWith('.');
    if (entry.isDirectory() && !hidden && !SKIP.has(entry.name)) {
      names.push(entry.name);
    }
  }
  return names;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Find SKILL.md.tmpl files at `root` and in each scannable immediate subdirectory.
 *
 * @param root - Directory to scan.
 * @returns One entry per template found, root first. `tmpl` is the template
 *   path relative to `root` (joined with "/"), and `output` is that same path
 *   with the trailing ".tmpl" removed.
 */
export function discoverTemplates(root: string): Array<{ tmpl: string; output: string }> {
  const TEMPLATE_NAME = 'SKILL.md.tmpl';
  const candidates = ['', ...subdirs(root)].map(dir =>
    dir === '' ? TEMPLATE_NAME : `${dir}/${TEMPLATE_NAME}`);

  return candidates
    .filter(rel => fs.existsSync(path.join(root, rel)))
    .map(rel => ({ tmpl: rel, output: rel.replace(/\.tmpl$/, '') }));
}
|
|
28
|
+
|
|
29
|
+
/**
 * Find SKILL.md files at `root` and in each scannable immediate subdirectory.
 *
 * @param root - Directory to scan.
 * @returns Paths relative to `root` (joined with "/"), root first.
 */
export function discoverSkillFiles(root: string): string[] {
  const found: string[] = [];
  ['', ...subdirs(root)].forEach(dir => {
    const candidate = dir === '' ? 'SKILL.md' : `${dir}/SKILL.md`;
    if (fs.existsSync(path.join(root, candidate))) {
      found.push(candidate);
    }
  });
  return found;
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Compare two eval runs from ~/.opengstack-dev/evals/
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* bun run eval:compare # compare two most recent of same tier
|
|
7
|
+
* bun run eval:compare <file> # compare file against its predecessor
|
|
8
|
+
* bun run eval:compare <file-a> <file-b> # compare two specific files
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as fs from 'fs';
|
|
12
|
+
import * as path from 'path';
|
|
13
|
+
import * as os from 'os';
|
|
14
|
+
import {
|
|
15
|
+
findPreviousRun,
|
|
16
|
+
compareEvalResults,
|
|
17
|
+
formatComparison,
|
|
18
|
+
getProjectEvalDir,
|
|
19
|
+
} from '../test/helpers/eval-store';
|
|
20
|
+
import type { EvalResult } from '../test/helpers/eval-store';
|
|
21
|
+
|
|
22
|
+
// Eval results directory, as returned by getProjectEvalDir().
const EVAL_DIR = getProjectEvalDir();

/**
 * Read and JSON-parse one eval result file.
 *
 * A relative `filepath` is resolved against EVAL_DIR. If the file does not
 * exist, an error is printed to stderr and the process exits with status 1.
 *
 * @param filepath - Absolute path, or a path relative to EVAL_DIR.
 * @returns The parsed file contents.
 */
function loadResult(filepath: string): EvalResult {
  const target = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath);
  if (fs.existsSync(target)) {
    return JSON.parse(fs.readFileSync(target, 'utf-8'));
  }
  console.error(`File not found: ${target}`);
  process.exit(1);
}
|
|
33
|
+
|
|
34
|
+
const args = process.argv.slice(2);

/**
 * Look up the run that precedes `afterPath` (same tier and branch are passed
 * to findPreviousRun). Prints `missingMsg` and exits 0 when there is none.
 */
function predecessorOrExit(afterPath: string, missingMsg: string): string {
  const { tier, branch } = loadResult(afterPath);
  const previous = findPreviousRun(EVAL_DIR, tier, branch, afterPath);
  if (!previous) {
    console.log(missingMsg);
    process.exit(0);
  }
  return previous;
}

/**
 * .json file names in EVAL_DIR sorted by name, descending (the first entry is
 * treated as the most recent run). Exits 0 if the directory cannot be read.
 */
function runFilesNewestFirst(): string[] {
  try {
    const names = fs.readdirSync(EVAL_DIR).filter(name => name.endsWith('.json'));
    names.sort();
    names.reverse();
    return names;
  } catch {
    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
    process.exit(0);
  }
}

let beforeFile: string;
let afterFile: string;

switch (args.length) {
  case 2:
    // Two explicit files
    beforeFile = args[0];
    afterFile = args[1];
    break;

  case 1: {
    // One file: compare it against its predecessor
    afterFile = args[0];
    const absolute = path.isAbsolute(afterFile) ? afterFile : path.join(EVAL_DIR, afterFile);
    beforeFile = predecessorOrExit(absolute, 'No previous run found to compare against.');
    break;
  }

  default: {
    // No args (or more than two): take the most recent file and its predecessor
    const files = runFilesNewestFirst();
    if (files.length < 2) {
      console.log('Need at least 2 eval runs to compare. Run evals again.');
      process.exit(0);
    }
    afterFile = path.join(EVAL_DIR, files[0]);
    beforeFile = predecessorOrExit(afterFile, 'No previous run of the same tier found to compare against.');
  }
}

const beforeResult = loadResult(beforeFile);
const afterResult = loadResult(afterFile);

// Mismatched tier or schema is reported but does not stop the comparison.
if (beforeResult.tier !== afterResult.tier) {
  console.warn(`Warning: comparing different tiers (${beforeResult.tier} vs ${afterResult.tier})`);
}
if (beforeResult.schema_version !== afterResult.schema_version) {
  console.warn(`Warning: schema version mismatch (${beforeResult.schema_version} vs ${afterResult.schema_version})`);
}

const comparison = compareEvalResults(beforeResult, afterResult, beforeFile, afterFile);
console.log(formatComparison(comparison));
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* List eval runs from ~/.opengstack-dev/evals/
|
|
4
|
+
*
|
|
5
|
+
* Usage: bun run eval:list [--branch <name>] [--tier e2e|llm-judge] [--limit N]
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import * as fs from 'fs';
|
|
9
|
+
import * as path from 'path';
|
|
10
|
+
import * as os from 'os';
|
|
11
|
+
import { getProjectEvalDir } from '../test/helpers/eval-store';
|
|
12
|
+
|
|
13
|
+
// Eval results directory, as returned by getProjectEvalDir(); also printed in the footer as "Dir:".
const EVAL_DIR = getProjectEvalDir();
|
|
14
|
+
|
|
15
|
+
// Parse args: --branch <name>, --tier <tier>, --limit <N>. Unknown arguments are ignored.
const args = process.argv.slice(2);
let filterBranch: string | null = null;
let filterTier: string | null = null;
let limit = 20;

/**
 * Interpret a --limit value.
 *
 * A value that does not parse to a non-negative integer keeps `fallback`:
 * NaN would make `runs.slice(0, limit)` show nothing, and a negative number
 * would drop rows from the end of the list instead of capping it.
 *
 * @param raw - Text that followed --limit.
 * @param fallback - Limit to keep when `raw` is unusable.
 * @returns The parsed limit, or `fallback`.
 */
function parseLimit(raw: string, fallback: number): number {
  const parsed = parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < 0) {
    console.error(`Ignoring invalid --limit value "${raw}"; using ${fallback}.`);
    return fallback;
  }
  return parsed;
}

for (let i = 0; i < args.length; i++) {
  // Each flag consumes the following argument as its value.
  if (args[i] === '--branch' && args[i + 1]) { filterBranch = args[++i]; }
  else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
  else if (args[i] === '--limit' && args[i + 1]) { limit = parseLimit(args[++i], limit); }
}
|
|
26
|
+
|
|
27
|
+
// Read eval files: an unreadable directory is treated the same as an empty one.
const NO_RUNS_HINT = 'No eval runs yet. Run: EVALS=1 bun run test:evals';

let files: string[] = [];
try {
  files = fs.readdirSync(EVAL_DIR).filter(name => name.endsWith('.json'));
} catch {
  files = [];
}

if (files.length === 0) {
  console.log(NO_RUNS_HINT);
  process.exit(0);
}
|
|
40
|
+
|
|
41
|
+
// Top-level fields pulled from each eval file for the listing.
interface RunSummary {
  file: string;
  timestamp: string;
  branch: string;
  tier: string;
  version: string;
  passed: number;
  total: number;
  cost: number;
  duration: number;
  turns: number;
}

/**
 * Summarise one eval file, or return null when it is filtered out by
 * --branch/--tier or cannot be read and parsed.
 */
function summarizeRun(file: string): RunSummary | null {
  try {
    const data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
    if (filterBranch && data.branch !== filterBranch) return null;
    if (filterTier && data.tier !== filterTier) return null;

    // Turns are summed across the file's tests; a test without turns_used adds 0.
    const turns = (data.tests || []).reduce(
      (sum: number, test: any) => sum + (test.turns_used || 0),
      0,
    );

    return {
      file,
      timestamp: data.timestamp || '',
      branch: data.branch || 'unknown',
      tier: data.tier || 'unknown',
      version: data.version || '?',
      passed: data.passed || 0,
      total: data.total_tests || 0,
      cost: data.total_cost_usd || 0,
      duration: data.total_duration_ms || 0,
      turns,
    };
  } catch {
    return null;
  }
}

const runs: RunSummary[] = files
  .map(file => summarizeRun(file))
  .filter((run): run is RunSummary => run !== null);
|
|
76
|
+
|
|
77
|
+
// Sort by timestamp descending (string comparison of the stored timestamp)
runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));

// Apply limit
const displayed = runs.slice(0, limit);

/**
 * Fit a branch name into the 25-character Branch column.
 *
 * Names longer than 23 characters are cut to 20 characters plus "...". The
 * result is always padded to the full column width; without the padding a
 * truncated name is only 23 wide and every later column in that row shifts
 * left by two.
 */
function branchCell(name: string): string {
  const shown = name.length > 23 ? name.slice(0, 20) + '...' : name;
  return shown.padEnd(25);
}

// Print table
console.log('');
console.log(`Eval History (${runs.length} total runs)`);
console.log('═'.repeat(105));
console.log(
  ' ' +
  'Date'.padEnd(17) +
  'Branch'.padEnd(25) +
  'Tier'.padEnd(12) +
  'Pass'.padEnd(8) +
  'Cost'.padEnd(8) +
  'Turns'.padEnd(7) +
  'Duration'.padEnd(10) +
  'Version'
);
console.log('─'.repeat(105));

for (const run of displayed) {
  // "YYYY-MM-DDTHH:MM..." becomes "YYYY-MM-DD HH:MM" (first 16 characters).
  const date = run.timestamp.replace('T', ' ').slice(0, 16);
  const branch = branchCell(run.branch);
  const pass = `${run.passed}/${run.total}`.padEnd(8);
  const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
  // Zero turns/duration are shown as blank cells.
  const turns = run.turns > 0 ? `${run.turns}t`.padEnd(7) : ''.padEnd(7);
  const dur = run.duration > 0 ? `${Math.round(run.duration / 1000)}s`.padEnd(10) : ''.padEnd(10);
  console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}${turns}${dur}v${run.version}`);
}

console.log('─'.repeat(105));

// Footer totals cover every matching run, not just the displayed rows.
const totalCost = runs.reduce((s, r) => s + r.cost, 0);
const totalDur = runs.reduce((s, r) => s + r.duration, 0);
const totalTurns = runs.reduce((s, r) => s + r.turns, 0);
console.log(` ${runs.length} runs | $${totalCost.toFixed(2)} total | ${totalTurns} turns | ${Math.round(totalDur / 1000)}s | Showing: ${displayed.length}`);
console.log(` Dir: ${EVAL_DIR}`);
console.log('');
|