@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* aggregate-benchmark-stats.js - Aggregate job-fair results into benchmark statistics
|
|
4
|
+
*
|
|
5
|
+
* Scans consolidated job-fair results and generates unified statistics:
|
|
6
|
+
* - Per-role aggregate statistics (mean, std_dev, n, min, max)
|
|
7
|
+
* - Theme rankings per role
|
|
8
|
+
* - Control baseline comparisons
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* aggregate-benchmark-stats.js [--dry-run] [--theme THEME]
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { readdirSync, writeFileSync, existsSync } from 'fs';
|
|
15
|
+
import { join, dirname } from 'path';
|
|
16
|
+
import { fileURLToPath } from 'url';
|
|
17
|
+
import { execSync } from 'child_process';
|
|
18
|
+
|
|
19
|
+
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Project root: three directory levels above the directory that holds this script.
const PROJECT_DIR = join(__dirname, '..', '..', '..');

// Input directory (one sub-directory per theme) and the generated output file.
const CONSOLIDATED_DIR = join(PROJECT_DIR, 'internal', 'results', 'job-fair', 'consolidated');
const OUTPUT_FILE = join(PROJECT_DIR, 'internal', 'results', 'aggregate-stats.yaml');

// Control-theme baselines, one row per standard job-fair role: [role, mean, std].
// Values are hardcoded from job-fair-runner.sh; every baseline uses n = 10.
const BASELINE_ROWS = [
  ['dev-codegen', 85.8, 7.3],
  ['dev-debug', 77.5, 8.54],
  ['reviewer', 78.5, 1.8],
  ['tea', 72.1, 2.3],
  ['sm', 80.3, 1.9],
  ['architect', 87.2, 3.25],
];

// Standard roles from job-fair, in reporting order.
const STANDARD_ROLES = BASELINE_ROWS.map(([role]) => role);

// Baselines from control theme, keyed by role.
const BASELINES = Object.fromEntries(
  BASELINE_ROWS.map(([role, mean, std]) => [role, { mean, std, n: 10 }]),
);
|
|
37
|
+
|
|
38
|
+
/**
 * Parse command line arguments.
 *
 * Recognized options:
 *   --dry-run       sets `dryRun` to true
 *   --theme THEME   stores the following argument in `theme`
 *   --help, -h      prints usage via showUsage() and exits with status 0
 * Any other argument is ignored.
 *
 * @param {string[]} argv - Full argument vector (e.g. process.argv); the first
 *   two entries are skipped.
 * @returns {{dryRun: boolean, theme: (string|null)}} Parsed options.
 * @throws {Error} If --theme is not followed by a value.
 */
function parseArgs(argv) {
  const args = {
    dryRun: false,
    theme: null,
  };

  for (let i = 2; i < argv.length; i++) {
    switch (argv[i]) {
      case '--dry-run':
        args.dryRun = true;
        break;
      case '--theme': {
        const value = argv[i + 1];
        // A trailing `--theme` used to leave theme undefined (every theme was
        // then processed silently) and `--theme --dry-run` consumed the next
        // flag as the theme name. Reject both explicitly.
        if (value === undefined || value.startsWith('-')) {
          throw new Error('--theme requires a THEME value');
        }
        args.theme = value;
        i++;
        break;
      }
      case '--help':
      case '-h':
        showUsage();
        process.exit(0);
        break;
      default:
        // Unknown arguments are ignored.
        break;
    }
  }

  return args;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Print the command-line help text to stdout as a single console.log call.
 */
function showUsage() {
  const helpLines = [
    'Usage: aggregate-benchmark-stats.js [OPTIONS]',
    '',
    'Options:',
    '--dry-run Output to stdout instead of writing file',
    '--theme THEME Only process specific theme',
    '--help, -h Show this help message',
    '',
    'Output: internal/results/aggregate-stats.yaml',
  ];
  console.log(helpLines.join('\n'));
}
|
|
76
|
+
|
|
77
|
+
/**
 * Quote a string so a POSIX shell passes it to a command as one literal
 * argument: wrap it in single quotes and rewrite each embedded single quote
 * as '\''.
 *
 * @param {string} text - Raw argument text.
 * @returns {string} Shell-safe single-quoted argument.
 */
function shellQuote(text) {
  return `'${String(text).replace(/'/g, `'\\''`)}'`;
}

/**
 * Extract a YAML field using the external `yq` command.
 *
 * Both the expression and the file path are shell-quoted, so quotes, `$`,
 * backticks or `;` in either one cannot break out of the command line.
 *
 * @param {string} filePath - YAML file to read.
 * @param {string} field - Field path without the leading dot (e.g. `theme`).
 * @returns {string|null} Trimmed yq output; null when yq prints `null` or the
 *   command fails for any reason.
 */
function yqGet(filePath, field) {
  try {
    const command = `yq -r ${shellQuote(`.${field}`)} ${shellQuote(filePath)}`;
    const result = execSync(command, {
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    const trimmed = result.trim();
    return trimmed === 'null' ? null : trimmed;
  } catch {
    // Best-effort lookup: any yq failure is reported as "no value".
    return null;
  }
}

/**
 * Get all keys of a YAML object using the external `yq` command.
 *
 * @param {string} filePath - YAML file to read.
 * @param {string} field - Path of the object, without the leading dot.
 * @returns {string[]} One entry per non-empty output line; an empty array when
 *   the command fails.
 */
function yqKeys(filePath, field) {
  try {
    const command = `yq -r ${shellQuote(`.${field} | keys | .[]`)} ${shellQuote(filePath)}`;
    const result = execSync(command, {
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    return result.trim().split('\n').filter(Boolean);
  } catch {
    return [];
  }
}

/**
 * Get a numeric value from YAML.
 *
 * @param {string} filePath - YAML file to read.
 * @param {string} field - Field path without the leading dot.
 * @returns {number|null} The parsed number; null when the field is missing or
 *   its text does not start with a number (previously NaN was returned).
 */
function yqNumber(filePath, field) {
  const val = yqGet(filePath, field);
  if (val === null) {
    return null;
  }
  const parsed = parseFloat(val);
  return Number.isNaN(parsed) ? null : parsed;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Calculate summary statistics from an array of numbers.
 *
 * The standard deviation is the population form (variance divided by n).
 * mean, std_dev, min and max are rounded to two decimal places.
 *
 * Min and max are accumulated in a loop instead of `Math.min(...values)` /
 * `Math.max(...values)`, because spreading a very large array into a call can
 * exceed the engine's argument limit and throw a RangeError.
 *
 * @param {number[]} values - Samples; may be empty or null/undefined.
 * @returns {{mean: number, std_dev: number, n: number, min: number, max: number}}
 *   All zeros when there are no samples.
 */
function calculateStats(values) {
  if (!values || values.length === 0) {
    return { mean: 0, std_dev: 0, n: 0, min: 0, max: 0 };
  }

  const round2 = (x) => Math.round(x * 100) / 100;
  const n = values.length;

  let sum = 0;
  let min = Infinity;
  let max = -Infinity;
  for (const value of values) {
    sum += value;
    min = Math.min(min, value);
    max = Math.max(max, value);
  }
  const mean = sum / n;

  let squaredDiffs = 0;
  for (const value of values) {
    squaredDiffs += (value - mean) ** 2;
  }
  const std_dev = Math.sqrt(squaredDiffs / n);

  return {
    mean: round2(mean),
    std_dev: round2(std_dev),
    n,
    min: round2(min),
    max: round2(max),
  };
}
|
|
131
|
+
|
|
132
|
+
/**
 * Load theme data from a consolidated summary.
 *
 * Reads `<themePath>/summary.yaml`, takes its `theme` field, and for every key
 * under `matrix` collects the `matrix.<key>.<role>.mean` value of each standard
 * role. Only means that are present and greater than zero are kept.
 *
 * @param {string} themePath - Directory expected to contain summary.yaml.
 * @returns {{theme: string, roleScores: Object<string, number[]>}|null}
 *   null when summary.yaml does not exist or its `theme` field cannot be read.
 */
function loadThemeData(themePath) {
  const summaryPath = join(themePath, 'summary.yaml');
  if (!existsSync(summaryPath)) return null;

  const theme = yqGet(summaryPath, 'theme');
  if (!theme) return null;

  const characters = yqKeys(summaryPath, 'matrix');

  // One (initially empty) score list per standard role.
  const roleScores = Object.fromEntries(STANDARD_ROLES.map((role) => [role, []]));

  characters.forEach((character) => {
    STANDARD_ROLES.forEach((role) => {
      const score = yqNumber(summaryPath, `matrix.${character}.${role}.mean`);
      // Skip missing values and anything that is not strictly positive.
      if (score === null || !(score > 0)) return;
      roleScores[role].push(score);
    });
  });

  return { theme, roleScores };
}
|
|
162
|
+
|
|
163
|
+
/**
 * Main aggregation logic.
 *
 * Loads every consolidated theme (or only `args.theme`), pools the per-character
 * role means across themes, and builds:
 *   - `roles`:    calculateStats() over the pooled scores of each role
 *   - `rankings`: per role, themes sorted by their average score, descending
 *   - `control`:  the control theme's average per role next to the hardcoded
 *                 baseline; the baseline mean is used when no control data exists
 *
 * Compared with the earlier version this tolerates a missing consolidated
 * directory (it used to throw ENOENT from readdirSync) and reuses the control
 * theme data when the main loop has already loaded it.
 *
 * @param {{theme: (string|null)}} args - Parsed options; `theme` limits
 *   processing to one theme directory.
 * @returns {{metadata: Object, roles: Object, rankings: Object, control: Object}}
 *   Plain object ready to be serialized.
 */
function aggregate(args) {
  const round2 = (x) => Math.round(x * 100) / 100;
  const average = (nums) => nums.reduce((a, b) => a + b, 0) / nums.length;

  // Get list of consolidated themes
  let themes;
  if (args.theme) {
    themes = [args.theme];
  } else if (existsSync(CONSOLIDATED_DIR)) {
    themes = readdirSync(CONSOLIDATED_DIR, { withFileTypes: true })
      .filter((d) => d.isDirectory())
      .map((d) => d.name);
  } else {
    // Nothing consolidated yet: produce an empty report instead of crashing.
    themes = [];
  }

  // Collect all scores per role across themes
  const allRoleScores = {};
  const themeRoleAverages = {}; // theme -> role -> mean
  for (const role of STANDARD_ROLES) {
    allRoleScores[role] = [];
  }

  let themesProcessed = 0;
  // undefined means "control was not visited by the loop below".
  let controlData;

  for (const themeName of themes) {
    const data = loadThemeData(join(CONSOLIDATED_DIR, themeName));
    if (themeName === 'control') {
      controlData = data;
    }
    if (!data) continue;

    themesProcessed++;
    themeRoleAverages[data.theme] = {};

    for (const role of STANDARD_ROLES) {
      const scores = data.roleScores[role];
      // Add all character scores for this theme/role to the global pool
      allRoleScores[role].push(...scores);

      // Theme average, used for rankings
      if (scores.length > 0) {
        themeRoleAverages[data.theme][role] = round2(average(scores));
      }
    }
  }

  // Aggregate statistics per role
  const roles = {};
  for (const role of STANDARD_ROLES) {
    roles[role] = calculateStats(allRoleScores[role]);
  }

  // Rankings per role (sorted by mean descending)
  const rankings = {};
  for (const role of STANDARD_ROLES) {
    const themeList = [];
    for (const [theme, roleAvgs] of Object.entries(themeRoleAverages)) {
      if (roleAvgs[role] !== undefined) {
        themeList.push({ theme, mean: roleAvgs[role] });
      }
    }
    themeList.sort((a, b) => b.mean - a.mean);
    rankings[role] = themeList;
  }

  // Control baseline comparison
  const control = {
    baseline: 'from job-fair-runner.sh',
  };

  // Load the control theme only if the loop above did not already do so
  // (for example when --theme selected a different theme).
  if (controlData === undefined) {
    controlData = loadThemeData(join(CONSOLIDATED_DIR, 'control'));
  }

  for (const role of STANDARD_ROLES) {
    const baseline = BASELINES[role];
    let controlMean = null;

    if (controlData && controlData.roleScores[role].length > 0) {
      controlMean = round2(average(controlData.roleScores[role]));
    }

    control[role] = {
      mean: controlMean !== null ? controlMean : baseline.mean,
      baseline_mean: baseline.mean,
      vs_baseline: controlMean !== null ? round2(controlMean - baseline.mean) : 0,
    };
  }

  // Build output object
  return {
    metadata: {
      generated_at: new Date().toISOString(),
      themes_processed: themesProcessed,
      source: 'internal/results/job-fair/consolidated/',
    },
    roles,
    rankings,
    control,
  };
}
|
|
269
|
+
|
|
270
|
+
/**
 * Render one scalar (or key) as YAML text.
 *
 * Strings are emitted plain when that is unambiguous and double-quoted (JSON
 * string syntax) when a plain scalar would be misread: empty strings,
 * leading/trailing whitespace, a leading indicator character, `: ` / ` #`
 * sequences, flow punctuation, or text that looks like null, a boolean or a
 * number.
 *
 * @param {*} value - Scalar to render.
 * @returns {string} YAML text for the scalar.
 */
function yamlScalar(value) {
  if (value === null || value === undefined) return 'null';
  if (typeof value !== 'string') return String(value);

  const needsQuotes =
    value === '' ||
    value !== value.trim() ||
    /^[-?:,\[\]{}#&*!|>'"%@`]/.test(value) ||
    /: |:$| #|[,\[\]{}\n\r\t]/.test(value) ||
    /^(null|~|true|false|yes|no|on|off)$/i.test(value) ||
    !Number.isNaN(Number(value));

  return needsQuotes ? JSON.stringify(value) : value;
}

/**
 * Render a value in YAML flow (inline) style; used for array items.
 *
 * @param {*} value - Scalar, array or plain object.
 * @returns {string} Inline YAML text.
 */
function yamlFlowValue(value) {
  if (value === null || typeof value !== 'object') {
    return yamlScalar(value);
  }
  if (Array.isArray(value)) {
    return `[${value.map((item) => yamlFlowValue(item)).join(', ')}]`;
  }
  const parts = Object.entries(value).map(([k, v]) => `${yamlScalar(k)}: ${yamlFlowValue(v)}`);
  return `{${parts.join(', ')}}`;
}

/**
 * Format an object as YAML.
 *
 * Nested objects become nested blocks indented one space per level; arrays
 * become `- item` lines with object items written inline as `{k: v, ...}`.
 *
 * Fixes over the earlier version: a `null` array item no longer throws, strings
 * that are not safe as plain scalars are quoted, and empty arrays/objects are
 * written as `[]` / `{}` instead of a bare key (which parses as null).
 *
 * @param {Object} obj - Object to serialize.
 * @param {number} [indent=0] - Number of leading spaces for this level.
 * @returns {string} YAML text, one line per entry, each ending in a newline.
 */
function toYaml(obj, indent = 0) {
  const spaces = ' '.repeat(indent);
  let yaml = '';

  for (const [key, value] of Object.entries(obj)) {
    const label = `${spaces}${yamlScalar(key)}`;

    if (value === null || value === undefined) {
      yaml += `${label}: null\n`;
    } else if (Array.isArray(value)) {
      if (value.length === 0) {
        yaml += `${label}: []\n`;
        continue;
      }
      yaml += `${label}:\n`;
      for (const item of value) {
        yaml += `${spaces} - ${yamlFlowValue(item)}\n`;
      }
    } else if (typeof value === 'object') {
      if (Object.keys(value).length === 0) {
        yaml += `${label}: {}\n`;
        continue;
      }
      yaml += `${label}:\n`;
      yaml += toYaml(value, indent + 1);
    } else {
      yaml += `${label}: ${yamlScalar(value)}\n`;
    }
  }

  return yaml;
}
|
|
299
|
+
|
|
300
|
+
/**
 * Main entry point: parse the command line, aggregate the results, and either
 * print the YAML report (--dry-run) or write it to OUTPUT_FILE and log a short
 * confirmation.
 */
function main() {
  const args = parseArgs(process.argv);
  const result = aggregate(args);

  const header = '# Aggregate Benchmark Statistics\n# Generated from job-fair consolidated results\n\n';
  const yamlOutput = header + toYaml(result);

  if (args.dryRun) {
    console.log(yamlOutput);
    return;
  }

  writeFileSync(OUTPUT_FILE, yamlOutput);
  console.log(`Wrote aggregate stats to ${OUTPUT_FILE}`);
  console.log(`Themes processed: ${result.metadata.themes_processed}`);
}
|
|
314
|
+
|
|
315
|
+
main();
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
#!/bin/bash
# aggregate-benchmark-stats.sh - Shell wrapper for aggregate-benchmark-stats.js
# Aggregates job-fair results into unified benchmark statistics
#
# All logic is implemented in aggregate-benchmark-stats.js (Node.js); this
# wrapper resolves its own directory and hands every argument to that script.

script_dir="$(cd "$(dirname "$0")" && pwd)"

# exec replaces this shell, so node's exit status becomes the wrapper's status.
exec node "${script_dir}/aggregate-benchmark-stats.js" "$@"
|
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* benchmark-runner.js - Unified entry point for benchmark execution
|
|
4
|
+
* Dispatches to solo-runner.sh and job-fair-runner.sh as appropriate
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* benchmark-runner.js --mode catalog [--category CAT] [--format json|text]
|
|
8
|
+
* benchmark-runner.js --mode info --case CASE_ID [--format json|text]
|
|
9
|
+
* benchmark-runner.js --mode solo --case CASE_ID --agent AGENT_SPEC [--dry-run] [OUTPUT_DIR]
|
|
10
|
+
* benchmark-runner.js --mode suite --category CAT --agent AGENT_SPEC [--dry-run] [OUTPUT_DIR]
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { readFileSync, readdirSync, existsSync } from 'fs';
|
|
14
|
+
import { join, dirname } from 'path';
|
|
15
|
+
import { fileURLToPath } from 'url';
|
|
16
|
+
import { spawn, execSync } from 'child_process';
|
|
17
|
+
|
|
18
|
+
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Runner scripts live next to this file.
const SCRIPTS_DIR = __dirname;

// Project root: three directory levels above the scripts directory.
const PROJECT_DIR = join(SCRIPTS_DIR, '..', '..', '..');

// Root of the YAML test-case tree.
const TEST_CASES_DIR = join(PROJECT_DIR, 'benchmarks', 'test-cases');
|
|
23
|
+
|
|
24
|
+
/**
 * Simple YAML field extractor using the external `yq` command.
 *
 * The expression and the file path are single-quoted for the shell, so quotes,
 * `$`, backticks or `;` in either one cannot alter the command that is run.
 *
 * @param {string} filePath - YAML file to read.
 * @param {string} field - Field path without the leading dot (e.g. `id`).
 * @returns {string} Trimmed yq output; an empty string when yq prints `null`
 *   or the command fails for any reason.
 */
function parseYamlField(filePath, field) {
  // POSIX single-quoting: wrap in '...' and rewrite each embedded ' as '\''.
  const quote = (text) => `'${String(text).replace(/'/g, `'\\''`)}'`;

  try {
    const result = execSync(`yq -r ${quote(`.${field}`)} ${quote(filePath)}`, {
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    const trimmed = result.trim();
    return trimmed === 'null' ? '' : trimmed;
  } catch {
    // Best-effort lookup: any yq failure is reported as "no value".
    return '';
  }
}
|
|
37
|
+
|
|
38
|
+
/**
 * Load test case metadata from a YAML file.
 *
 * Each field is read with a separate parseYamlField() call. The result also
 * records the source path under `_filePath`.
 *
 * @param {string} filePath - Path of the test-case YAML file.
 * @returns {Object|null} Metadata object with id, name, category, difficulty,
 *   agent, version, description and _filePath; null when the file has no `id`
 *   or reading it throws.
 */
function loadTestCase(filePath) {
  try {
    const id = parseYamlField(filePath, 'id');
    if (!id) return null;

    const testCase = { id };
    const otherFields = ['name', 'category', 'difficulty', 'agent', 'version', 'description'];
    for (const field of otherFields) {
      testCase[field] = parseYamlField(filePath, field);
    }
    testCase._filePath = filePath;
    return testCase;
  } catch {
    return null;
  }
}
|
|
58
|
+
|
|
59
|
+
/**
 * Parse command line arguments.
 *
 * Value options (--mode, --category, --case, --agent, --format) store the
 * following argument. --dry-run sets `dryRun`. --help / -h prints usage via
 * showUsage() and exits with status 0. The first argument that does not start
 * with `-` becomes `outputDir`; everything else is ignored.
 *
 * @param {string[]} argv - Full argument vector (e.g. process.argv); the first
 *   two entries are skipped.
 * @returns {{mode: (string|null), category: (string|null), caseId: (string|null),
 *   agent: (string|null), format: string, dryRun: boolean, outputDir: (string|null)}}
 *   Parsed options; `format` defaults to 'text'.
 * @throws {Error} If a value option is last on the command line or is followed
 *   by another option.
 */
function parseArgs(argv) {
  const args = {
    mode: null,
    category: null,
    caseId: null,
    agent: null,
    format: 'text',
    dryRun: false,
    outputDir: null,
  };

  // Option flag -> property that receives its value.
  const valueOptions = new Map([
    ['--mode', 'mode'],
    ['--category', 'category'],
    ['--case', 'caseId'],
    ['--agent', 'agent'],
    ['--format', 'format'],
  ]);

  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    const property = valueOptions.get(arg);

    if (property !== undefined) {
      const value = argv[i + 1];
      // Previously a missing value stored undefined (e.g. `--format` alone
      // wiped the 'text' default) and `--case --dry-run` swallowed the flag.
      if (value === undefined || value.startsWith('-')) {
        throw new Error(`${arg} requires a value`);
      }
      args[property] = value;
      i++;
    } else if (arg === '--dry-run') {
      args.dryRun = true;
    } else if (arg === '--help' || arg === '-h') {
      showUsage();
      process.exit(0);
    } else if (!arg.startsWith('-') && !args.outputDir) {
      // First bare argument is the output directory.
      args.outputDir = arg;
    }
  }

  return args;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Print the command-line help text (modes, options and examples) to stdout as
 * a single console.log call.
 */
function showUsage() {
  const helpLines = [
    'Usage: benchmark-runner.js [OPTIONS]',
    '',
    'Modes:',
    '--mode catalog List available test cases',
    '--mode info Show details of a specific test case',
    '--mode solo Run single agent on single test case',
    '--mode suite Run agent on all test cases in a category',
    '',
    'Options:',
    '--category CAT Filter by category (dev, architecture, code-review, etc.)',
    '--case CASE_ID Test case ID (e.g., dev-001)',
    '--agent SPEC Agent specification (theme:role, e.g., rome:dev)',
    '--format FORMAT Output format: text (default) or json',
    '--dry-run Show what would be run without executing',
    '',
    'Examples:',
    'benchmark-runner.js --mode catalog',
    'benchmark-runner.js --mode catalog --category dev --format json',
    'benchmark-runner.js --mode info --case dev-001',
    'benchmark-runner.js --mode solo --case dev-001 --agent rome:dev --dry-run',
    'benchmark-runner.js --mode suite --category dev --agent rome:dev --dry-run',
  ];
  console.log(helpLines.join('\n'));
}
|
|
133
|
+
|
|
134
|
+
/**
 * Report a fatal error: write `Error: <message>` to stderr and terminate the
 * process with exit status 1.
 *
 * @param {string} message - Text to show after the `Error: ` prefix.
 */
function error(message) {
  const line = `Error: ${message}`;
  console.error(line);
  process.exit(1);
}
|
|
138
|
+
|
|
139
|
+
/**
 * Recursively find all `.yaml` files below a directory.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Sorted full paths of every regular file whose name ends
 *   in `.yaml`; an empty array when `dir` does not exist.
 */
function findYamlFiles(dir) {
  if (!existsSync(dir)) return [];

  const found = readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
    const fullPath = join(dir, entry.name);
    if (entry.isDirectory()) {
      return findYamlFiles(fullPath);
    }
    const isYaml = entry.isFile() && entry.name.endsWith('.yaml');
    return isYaml ? [fullPath] : [];
  });

  return found.sort();
}
|
|
155
|
+
|
|
156
|
+
/**
 * Get all test cases found under TEST_CASES_DIR.
 *
 * @param {string|null} [categoryFilter=null] - When set, keep only test cases
 *   whose `category` field equals this value.
 * @returns {Object[]} Loaded test cases that have an id, in file-path order.
 */
function getAllTestCases(categoryFilter = null) {
  return findYamlFiles(TEST_CASES_DIR)
    .map((file) => loadTestCase(file))
    .filter((tc) => tc && tc.id)
    .filter((tc) => !categoryFilter || tc.category === categoryFilter);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Find a specific test case by ID.
 *
 * Only the `id` field of each file is read while searching; the full metadata
 * is loaded for a matching file alone. Loading every candidate in full ran
 * seven yq lookups per file, including files that could never match.
 *
 * @param {string} caseId - Test case ID to look for.
 * @returns {Object|null} The first test case (in file-path order) whose id
 *   equals `caseId`, or null when none matches.
 */
function findTestCase(caseId) {
  for (const file of findYamlFiles(TEST_CASES_DIR)) {
    if (parseYamlField(file, 'id') !== caseId) continue;

    const tc = loadTestCase(file);
    if (tc && tc.id === caseId) {
      return tc;
    }
  }
  return null;
}
|
|
184
|
+
|
|
185
|
+
/**
 * Get categories: the sorted names of the directories directly inside
 * TEST_CASES_DIR.
 *
 * @returns {string[]} Sorted directory names; an empty array when
 *   TEST_CASES_DIR does not exist.
 */
function getCategories() {
  if (!existsSync(TEST_CASES_DIR)) return [];

  const names = [];
  for (const entry of readdirSync(TEST_CASES_DIR, { withFileTypes: true })) {
    if (entry.isDirectory()) {
      names.push(entry.name);
    }
  }
  names.sort();
  return names;
}
|
|
193
|
+
|
|
194
|
+
/**
 * Catalog mode: list the available test cases, optionally limited to
 * `args.category`.
 *
 * JSON format prints one compact array of {id, name, category, difficulty}.
 * Text format prints a `[category]` heading followed by one line per case.
 *
 * The text listing groups by the union of the directory names and the
 * `category` values found in the loaded cases. Grouping by directory names
 * alone silently dropped any case whose `category` field did not match a
 * directory, although the JSON output included it.
 *
 * @param {{category: (string|null), format: string}} args - Parsed options.
 */
function doCatalog(args) {
  const testCases = getAllTestCases(args.category);

  if (args.format === 'json') {
    const output = testCases.map(({ id, name, category, difficulty }) => ({
      id,
      name,
      category,
      difficulty,
    }));
    console.log(JSON.stringify(output));
    return;
  }

  console.log('Available Test Cases:');
  console.log('');

  const categories = [
    ...new Set([...getCategories(), ...testCases.map((tc) => tc.category)]),
  ].sort();

  for (const cat of categories) {
    if (args.category && cat !== args.category) continue;

    const casesInCat = testCases.filter((tc) => tc.category === cat);
    // Empty categories are hidden unless one was requested explicitly.
    if (casesInCat.length === 0 && !args.category) continue;

    console.log(`[${cat}]`);
    for (const tc of casesInCat) {
      const id = (tc.id || '').padEnd(12);
      const name = (tc.name || '').substring(0, 40).padEnd(40);
      const diff = tc.difficulty || 'unknown';
      console.log(` ${id} ${name} (${diff})`);
    }
    console.log('');
  }
}
|
|
228
|
+
|
|
229
|
+
/**
 * Info mode: show the details of one test case.
 *
 * Calls error() (which exits) when --case is missing or the case cannot be
 * found. JSON format prints the metadata pretty-printed with two-space
 * indentation; text format prints labelled lines followed by the description,
 * one output line per description line.
 *
 * @param {{caseId: (string|null), format: string}} args - Parsed options.
 */
function doInfo(args) {
  if (!args.caseId) {
    error('Info mode requires --case CASE_ID');
  }

  const tc = findTestCase(args.caseId);
  if (!tc) {
    error(`Test case not found: ${args.caseId}`);
  }

  if (args.format === 'json') {
    const { id, name, category, difficulty, agent, version, description } = tc;
    const output = { id, name, category, difficulty, agent, version, description };
    console.log(JSON.stringify(output, null, 2));
    return;
  }

  const headerLines = [
    `Test Case: ${tc.id}`,
    '',
    `Name: ${tc.name}`,
    `Category: ${tc.category}`,
    `Difficulty: ${tc.difficulty}`,
    `Agent: ${tc.agent}`,
    `Version: ${tc.version}`,
    '',
    'Description:',
  ];
  for (const line of headerLines) {
    console.log(line);
  }

  for (const line of (tc.description || '').split('\n')) {
    console.log(` ${line}`);
  }
}
|
|
265
|
+
|
|
266
|
+
/**
 * Solo mode: run a single agent on a single test case.
 *
 * Calls error() (which exits) when --case or --agent is missing or the case
 * cannot be found. With --dry-run it prints a JSON description of the command
 * that would run. Otherwise it spawns solo-runner.sh with
 * [agent, caseId, outputDir] and exits with the child's exit status.
 *
 * Fixes over the earlier version:
 *   - a child terminated by a signal reports a null exit code, which used to be
 *     turned into exit status 0 (success); it now exits with status 1
 *   - a failure to start the runner is reported instead of surfacing as an
 *     unhandled 'error' event
 *
 * @param {{caseId: (string|null), agent: (string|null), dryRun: boolean,
 *   outputDir: (string|null)}} args - Parsed options; `outputDir` defaults to
 *   /tmp/benchmark-results.
 */
function doSolo(args) {
  if (!args.caseId) {
    error('Solo mode requires --case CASE_ID');
  }
  if (!args.agent) {
    error('Solo mode requires --agent AGENT_SPEC');
  }

  const tc = findTestCase(args.caseId);
  if (!tc) {
    error(`Test case not found: ${args.caseId}`);
  }

  const outputDir = args.outputDir || '/tmp/benchmark-results';

  if (args.dryRun) {
    const output = {
      mode: 'solo',
      test_case: args.caseId,
      agent: args.agent,
      dry_run: true,
      would_execute: `solo-runner.sh ${args.agent} ${args.caseId} ${outputDir}`,
    };
    console.log(JSON.stringify(output, null, 2));
    return;
  }

  const soloRunner = join(SCRIPTS_DIR, 'solo-runner.sh');
  const child = spawn(soloRunner, [args.agent, args.caseId, outputDir], {
    stdio: 'inherit',
  });

  child.on('error', (err) => {
    console.error(`Error: failed to start ${soloRunner}: ${err.message}`);
    process.exit(1);
  });

  // `code` is null when the child was killed by a signal; treat that as failure.
  child.on('close', (code) => process.exit(code === null ? 1 : code));
}
|
|
299
|
+
|
|
300
|
+
/**
 * Suite mode: run one agent on every test case of a category, one at a time.
 *
 * Calls error() (which exits) when --category or --agent is missing. With
 * --dry-run it prints a JSON summary of the cases that would run. Otherwise it
 * spawns solo-runner.sh per case, counts exit status 0 as passed and anything
 * else as failed, prints a summary, and exits with status 1 if any case failed.
 *
 * Fix over the earlier version: a runner that cannot be started (for example
 * a missing or non-executable solo-runner.sh) is logged and counted as a failed
 * case; previously the unhandled 'error' event crashed the whole suite.
 *
 * @param {{category: (string|null), agent: (string|null), dryRun: boolean,
 *   outputDir: (string|null)}} args - Parsed options; `outputDir` defaults to
 *   /tmp/benchmark-results.
 */
function doSuite(args) {
  if (!args.category) {
    error('Suite mode requires --category CAT');
  }
  if (!args.agent) {
    error('Suite mode requires --agent AGENT_SPEC');
  }

  const testCases = getAllTestCases(args.category);
  const caseIds = testCases.map((tc) => tc.id);
  const total = caseIds.length;

  if (args.dryRun) {
    const output = {
      mode: 'suite',
      category: args.category,
      agent: args.agent,
      dry_run: true,
      total_cases: total,
      cases: caseIds,
      summary: `Would run ${total} test cases in category '${args.category}'`,
    };
    console.log(JSON.stringify(output, null, 2));
    return;
  }

  const outputDir = args.outputDir || '/tmp/benchmark-results';
  const soloRunner = join(SCRIPTS_DIR, 'solo-runner.sh');
  console.log(`Running suite: ${args.category} (${total} cases)`);
  console.log('');

  let passed = 0;
  let failed = 0;

  const runNext = (index) => {
    if (index >= caseIds.length) {
      console.log('');
      console.log(`Suite complete: ${passed} passed, ${failed} failed (total: ${total})`);
      process.exit(failed > 0 ? 1 : 0);
      return;
    }

    const caseId = caseIds[index];
    console.log(`[${caseId}] Running...`);

    const child = spawn(soloRunner, [args.agent, caseId, outputDir], {
      stdio: 'inherit',
    });

    // 'error' and 'close' can both fire for one child; record the result once.
    let recorded = false;
    const record = (ok) => {
      if (recorded) return;
      recorded = true;
      if (ok) {
        passed++;
      } else {
        failed++;
      }
      runNext(index + 1);
    };

    child.on('error', (err) => {
      console.error(`[${caseId}] Failed to start ${soloRunner}: ${err.message}`);
      record(false);
    });
    child.on('close', (code) => record(code === 0));
  };

  runNext(0);
}
|
|
361
|
+
|
|
362
|
+
/**
 * Main: parse the command line and dispatch to the handler for the requested
 * mode. Prints usage and exits with status 0 when no mode is given; calls
 * error() (which exits) for an unknown mode.
 */
function main() {
  const args = parseArgs(process.argv);

  if (!args.mode) {
    showUsage();
    process.exit(0);
  }

  // Mode name -> handler, in the order shown in the "Valid modes" message.
  const handlers = new Map([
    ['catalog', doCatalog],
    ['info', doInfo],
    ['solo', doSolo],
    ['suite', doSuite],
  ]);

  const validModes = [...handlers.keys()];
  if (!validModes.includes(args.mode)) {
    error(`Unknown mode: ${args.mode}. Valid modes: ${validModes.join(', ')}`);
  }

  const handler = handlers.get(args.mode);
  if (handler) {
    handler(args);
  }
}
|
|
391
|
+
|
|
392
|
+
main();
|