@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,315 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * aggregate-benchmark-stats.js - Aggregate job-fair results into benchmark statistics
4
+ *
5
+ * Scans consolidated job-fair results and generates unified statistics:
6
+ * - Per-role aggregate statistics (mean, std_dev, n, min, max)
7
+ * - Theme rankings per role
8
+ * - Control baseline comparisons
9
+ *
10
+ * Usage:
11
+ * aggregate-benchmark-stats.js [--dry-run] [--theme THEME]
12
+ */
13
+
14
+ import { readdirSync, writeFileSync, existsSync } from 'fs';
15
+ import { join, dirname } from 'path';
16
+ import { fileURLToPath } from 'url';
17
+ import { execSync } from 'child_process';
18
+
19
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Project root: three directory levels above the directory containing this script.
const PROJECT_DIR = join(__dirname, '..', '..', '..');

// Input directory (one sub-directory per consolidated theme) and output file.
const CONSOLIDATED_DIR = join(PROJECT_DIR, 'internal', 'results', 'job-fair', 'consolidated');
const OUTPUT_FILE = join(PROJECT_DIR, 'internal', 'results', 'aggregate-stats.yaml');

// Standard roles from job-fair
const STANDARD_ROLES = [
  'dev-codegen',
  'dev-debug',
  'reviewer',
  'tea',
  'sm',
  'architect',
];

// Baselines from control theme (hardcoded from job-fair-runner.sh)
const BASELINES = {
  'dev-codegen': { mean: 85.8, std: 7.3, n: 10 },
  'dev-debug': { mean: 77.5, std: 8.54, n: 10 },
  reviewer: { mean: 78.5, std: 1.8, n: 10 },
  tea: { mean: 72.1, std: 2.3, n: 10 },
  sm: { mean: 80.3, std: 1.9, n: 10 },
  architect: { mean: 87.2, std: 3.25, n: 10 },
};
37
+
38
/**
 * Parse the command line of the aggregate-stats script.
 *
 * Recognised options:
 *   --dry-run       set `dryRun` to true
 *   --theme THEME   set `theme` to THEME
 *   --help, -h      print usage and exit with status 0
 * Any other token is ignored.
 *
 * @param {string[]} argv - Full argument vector (e.g. `process.argv`); the
 *   first two entries are skipped.
 * @returns {{dryRun: boolean, theme: (string|null)}} The parsed options.
 * @throws {Error} When `--theme` is not followed by a theme name.
 */
function parseArgs(argv) {
  const args = {
    dryRun: false,
    theme: null,
  };

  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    switch (arg) {
      case '--dry-run':
        args.dryRun = true;
        break;
      case '--theme': {
        const value = argv[i + 1];
        // Without this check `--theme` at the end silently meant "all themes",
        // and `--theme --dry-run` used "--dry-run" as the theme name while
        // leaving dry-run mode off.
        if (!value || value.startsWith('-')) {
          throw new Error('--theme requires a THEME value');
        }
        args.theme = value;
        i++; // the value has been consumed
        break;
      }
      case '--help':
      case '-h':
        showUsage();
        process.exit(0);
        break;
      default:
        break;
    }
  }
  return args;
}
65
+
66
/**
 * Print the help text of the aggregate-stats script to stdout
 * (a single console.log call).
 */
function showUsage() {
  const helpLines = [
    'Usage: aggregate-benchmark-stats.js [OPTIONS]',
    '',
    'Options:',
    '--dry-run Output to stdout instead of writing file',
    '--theme THEME Only process specific theme',
    '--help, -h Show this help message',
    '',
    'Output: internal/results/aggregate-stats.yaml',
  ];
  console.log(helpLines.join('\n'));
}
76
+
77
/**
 * Extract one field from a YAML file by running `yq -r`.
 *
 * @param {string} filePath - Path of the YAML file to query.
 * @param {string} field - yq path expression without its leading dot
 *   (a `.` is prepended before the expression is passed to yq).
 * @returns {string|null} The trimmed output of yq, or `null` when that output
 *   is the literal text `null` or when the command fails for any reason.
 */
function yqGet(filePath, field) {
  // Both words are placed inside single quotes with embedded single quotes
  // escaped, so the shell passes them to yq verbatim. The previous form left
  // `"`, `$` and backticks in the path (and `'` in the expression) open to
  // interpretation by the shell.
  const shellQuote = (word) => `'${String(word).replace(/'/g, "'\\''")}'`;

  try {
    const result = execSync(`yq -r ${shellQuote(`.${field}`)} ${shellQuote(filePath)}`, {
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    const trimmed = result.trim();
    return trimmed === 'null' ? null : trimmed;
  } catch {
    // Best-effort lookup: any failure of the command is reported as "no value".
    return null;
  }
}
90
+
91
/**
 * List the keys of a YAML mapping by running `yq -r '.<field> | keys | .[]'`.
 *
 * @param {string} filePath - Path of the YAML file to query.
 * @param {string} field - yq path expression (without leading dot) of the
 *   mapping whose keys are wanted.
 * @returns {string[]} One entry per non-empty output line; an empty array
 *   when the command fails for any reason.
 */
function yqKeys(filePath, field) {
  // Single-quote both words (escaping embedded single quotes) so the shell
  // hands them to yq verbatim instead of interpreting `"`, `$`, backticks or `'`.
  const shellQuote = (word) => `'${String(word).replace(/'/g, "'\\''")}'`;

  try {
    const expression = `.${field} | keys | .[]`;
    const result = execSync(`yq -r ${shellQuote(expression)} ${shellQuote(filePath)}`, {
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    return result.trim().split('\n').filter(Boolean);
  } catch {
    // Best-effort lookup: any failure of the command is reported as "no keys".
    return [];
  }
}
103
+
104
/**
 * Read a field through yqGet and convert it to a number.
 *
 * @param {string} filePath - Path of the YAML file to query.
 * @param {string} field - yq path expression, passed through to yqGet.
 * @returns {number|null} The value parsed with parseFloat, or `null` when
 *   yqGet returned `null` or the text does not start with a number
 *   (previously that case leaked `NaN` to the caller).
 */
function yqNumber(filePath, field) {
  const raw = yqGet(filePath, field);
  if (raw === null) {
    return null;
  }
  const parsed = parseFloat(raw);
  return Number.isNaN(parsed) ? null : parsed;
}
109
+
110
/**
 * Summarise an array of numbers.
 *
 * The standard deviation is the population form (squared deviations divided
 * by n). `mean`, `std_dev`, `min` and `max` are rounded to two decimals.
 *
 * @param {number[]|null|undefined} values - Numbers to summarise.
 * @returns {{mean: number, std_dev: number, n: number, min: number, max: number}}
 *   The statistics; every field is 0 when `values` is missing or empty.
 */
function calculateStats(values) {
  if (!values || values.length === 0) {
    return { mean: 0, std_dev: 0, n: 0, min: 0, max: 0 };
  }

  const round2 = (x) => Math.round(x * 100) / 100;

  const n = values.length;
  const mean = values.reduce((a, b) => a + b, 0) / n;

  // One pass for the squared deviations and the extremes. Spreading the array
  // into Math.min/Math.max passes one call argument per element, which throws
  // a RangeError once the array is large enough.
  let squaredDiffSum = 0;
  let min = Infinity;
  let max = -Infinity;
  for (const value of values) {
    squaredDiffSum += Math.pow(value - mean, 2);
    if (value < min) min = value;
    if (value > max) max = value;
  }
  const std_dev = Math.sqrt(squaredDiffSum / n);

  return {
    mean: round2(mean),
    std_dev: round2(std_dev),
    n,
    min: round2(min),
    max: round2(max),
  };
}
131
+
132
/**
 * Load the per-role scores of one theme from `<themePath>/summary.yaml`.
 *
 * For every key under `matrix` and every standard role, the value at
 * `matrix.<key>.<role>.mean` is read; values that are null or not greater
 * than zero are left out.
 *
 * @param {string} themePath - Directory of the consolidated theme.
 * @returns {{theme: string, roleScores: Object<string, number[]>}|null}
 *   The theme name and its scores per role, or `null` when summary.yaml does
 *   not exist or has no `theme` value.
 */
function loadThemeData(themePath) {
  const summaryPath = join(themePath, 'summary.yaml');

  const theme = existsSync(summaryPath) ? yqGet(summaryPath, 'theme') : null;
  if (!theme) {
    return null;
  }

  const characters = yqKeys(summaryPath, 'matrix');

  // Start every standard role with an empty score list.
  const roleScores = Object.fromEntries(STANDARD_ROLES.map((role) => [role, []]));

  characters.forEach((character) => {
    STANDARD_ROLES.forEach((role) => {
      const score = yqNumber(summaryPath, `matrix.${character}.${role}.mean`);
      const usable = score !== null && score > 0;
      if (usable) {
        roleScores[role].push(score);
      }
    });
  });

  return { theme, roleScores };
}
162
+
163
// Round to two decimal places (the rounding used for every reported mean).
function roundTo2(x) {
  return Math.round(x * 100) / 100;
}

// Arithmetic mean of a non-empty array of numbers.
function meanOf(numbers) {
  return numbers.reduce((a, b) => a + b, 0) / numbers.length;
}

// Theme directory names to process: the single requested theme, or every
// sub-directory of CONSOLIDATED_DIR.
function listThemeNames(args) {
  if (args.theme) {
    return [args.theme];
  }
  // Fail with a readable message instead of a raw ENOENT from readdirSync.
  if (!existsSync(CONSOLIDATED_DIR)) {
    throw new Error(`Consolidated results directory not found: ${CONSOLIDATED_DIR}`);
  }
  return readdirSync(CONSOLIDATED_DIR, { withFileTypes: true })
    .filter((entry) => entry.isDirectory())
    .map((entry) => entry.name);
}

// Per role, the list of { theme, mean } sorted by mean, highest first.
function buildRankings(themeRoleAverages) {
  const rankings = {};
  for (const role of STANDARD_ROLES) {
    rankings[role] = Object.entries(themeRoleAverages)
      .filter(([, roleAvgs]) => roleAvgs[role] !== undefined)
      .map(([theme, roleAvgs]) => ({ theme, mean: roleAvgs[role] }))
      .sort((a, b) => b.mean - a.mean);
  }
  return rankings;
}

// Per role, the measured control-theme mean next to the hard-coded baseline.
// Without control data the baseline mean is reported and vs_baseline is 0.
function buildControlComparison(controlData) {
  const control = {
    baseline: 'from job-fair-runner.sh',
  };
  for (const role of STANDARD_ROLES) {
    const baseline = BASELINES[role];
    const scores = controlData ? controlData.roleScores[role] : [];
    const controlMean = scores.length > 0 ? roundTo2(meanOf(scores)) : null;

    control[role] = {
      mean: controlMean !== null ? controlMean : baseline.mean,
      baseline_mean: baseline.mean,
      vs_baseline: controlMean !== null ? roundTo2(controlMean - baseline.mean) : 0,
    };
  }
  return control;
}

/**
 * Aggregate the consolidated job-fair results into one statistics object.
 *
 * @param {{theme: (string|null)}} args - Parsed options; when `theme` is set
 *   only that theme directory is processed, otherwise every sub-directory of
 *   CONSOLIDATED_DIR is.
 * @returns {{metadata: Object, roles: Object, rankings: Object, control: Object}}
 *   `metadata` (generation time, number of themes loaded, source path),
 *   `roles` (calculateStats over all scores pooled per role),
 *   `rankings` (themes ordered by their mean per role) and
 *   `control` (control-theme mean vs. baseline per role).
 * @throws {Error} When no theme was requested and CONSOLIDATED_DIR does not exist.
 */
function aggregate(args) {
  const allRoleScores = {};
  const themeRoleAverages = {}; // theme -> role -> mean
  for (const role of STANDARD_ROLES) {
    allRoleScores[role] = [];
  }

  let themesProcessed = 0;
  // Stays undefined until the "control" directory has been loaded, so it is
  // not read a second time when it is already part of the theme list.
  let controlData;

  for (const themeName of listThemeNames(args)) {
    const data = loadThemeData(join(CONSOLIDATED_DIR, themeName));
    if (themeName === 'control') {
      controlData = data;
    }
    if (!data) continue;

    themesProcessed++;
    themeRoleAverages[data.theme] = {};

    for (const role of STANDARD_ROLES) {
      const scores = data.roleScores[role];
      // Every character score of this theme/role joins the global pool.
      allRoleScores[role].push(...scores);
      // The theme's own average feeds the rankings.
      if (scores.length > 0) {
        themeRoleAverages[data.theme][role] = roundTo2(meanOf(scores));
      }
    }
  }

  if (controlData === undefined) {
    controlData = loadThemeData(join(CONSOLIDATED_DIR, 'control'));
  }

  const roles = {};
  for (const role of STANDARD_ROLES) {
    roles[role] = calculateStats(allRoleScores[role]);
  }

  return {
    metadata: {
      generated_at: new Date().toISOString(),
      themes_processed: themesProcessed,
      source: 'internal/results/job-fair/consolidated/',
    },
    roles,
    rankings: buildRankings(themeRoleAverages),
    control: buildControlComparison(controlData),
  };
}
269
+
270
/**
 * Serialise a plain object as block-style YAML text.
 *
 * Nested objects are indented one step per level; array items that are plain
 * objects are written as one-line flow mappings (`- {k: v, ...}`).
 *
 * Differences from a naive string interpolation:
 *   - empty arrays/objects are written as `[]` / `{}` (a bare `key:` would be
 *     read back as null);
 *   - `null` array items are written as `null` instead of throwing;
 *   - strings that YAML would misread (containing `: `, ` #`, flow
 *     punctuation, leading indicators, or looking like null/boolean/number)
 *     are double-quoted; all other strings stay unquoted;
 *   - non-finite numbers use the YAML spellings `.nan`, `.inf`, `-.inf`.
 *
 * @param {Object} obj - Object to serialise.
 * @param {number} [indent=0] - Nesting depth of `obj`.
 * @returns {string} YAML text, one line per entry, each ending in `\n`.
 */
function toYaml(obj, indent = 0) {
  const spaces = ' '.repeat(indent);

  // Render a non-container value; `inFlow` is true inside `{...}` items,
  // where commas and brackets are structural as well.
  const formatScalar = (value, inFlow) => {
    if (value === null || value === undefined) return 'null';
    if (typeof value === 'number' && !Number.isFinite(value)) {
      if (Number.isNaN(value)) return '.nan';
      return value > 0 ? '.inf' : '-.inf';
    }
    // Containers nested where only a scalar fits: JSON is valid YAML flow syntax.
    if (typeof value === 'object') return JSON.stringify(value);
    if (typeof value !== 'string') return String(value);

    const ambiguous =
      value === '' ||
      value !== value.trim() ||
      /[\n\r\t]/.test(value) ||
      /^[-?:,[\]{}#&*!|>'"%@`]/.test(value) ||
      /: |:$| #/.test(value) ||
      /^(null|~|true|false|yes|no|on|off)$/i.test(value) ||
      !Number.isNaN(Number(value)) ||
      (inFlow && /[,[\]{}]/.test(value));
    return ambiguous ? JSON.stringify(value) : value;
  };

  let yaml = '';

  for (const [key, value] of Object.entries(obj)) {
    if (value === null || value === undefined) {
      yaml += `${spaces}${key}: null\n`;
    } else if (Array.isArray(value)) {
      if (value.length === 0) {
        yaml += `${spaces}${key}: []\n`;
        continue;
      }
      yaml += `${spaces}${key}:\n`;
      for (const item of value) {
        const isPlainObject = typeof item === 'object' && item !== null && !Array.isArray(item);
        if (isPlainObject) {
          // Inline object format for array items
          const parts = Object.entries(item).map(([k, v]) => `${k}: ${formatScalar(v, true)}`);
          yaml += `${spaces} - {${parts.join(', ')}}\n`;
        } else {
          yaml += `${spaces} - ${formatScalar(item, false)}\n`;
        }
      }
    } else if (typeof value === 'object') {
      if (Object.keys(value).length === 0) {
        yaml += `${spaces}${key}: {}\n`;
        continue;
      }
      yaml += `${spaces}${key}:\n`;
      yaml += toYaml(value, indent + 1);
    } else {
      yaml += `${spaces}${key}: ${formatScalar(value, false)}\n`;
    }
  }

  return yaml;
}
299
+
300
/**
 * Script entry point: parse the command line, aggregate the results and
 * either print the YAML (dry run) or write it to OUTPUT_FILE and report
 * what was written.
 */
function main() {
  const options = parseArgs(process.argv);
  const stats = aggregate(options);

  const header = '# Aggregate Benchmark Statistics\n# Generated from job-fair consolidated results\n\n';
  const yamlOutput = header + toYaml(stats);

  if (!options.dryRun) {
    writeFileSync(OUTPUT_FILE, yamlOutput);
    console.log(`Wrote aggregate stats to ${OUTPUT_FILE}`);
    console.log(`Themes processed: ${stats.metadata.themes_processed}`);
    return;
  }

  console.log(yamlOutput);
}

main();
@@ -0,0 +1,8 @@
1
#!/bin/bash
# aggregate-benchmark-stats.sh - launcher for aggregate-benchmark-stats.js
#
# Aggregates job-fair results into unified benchmark statistics. The work is
# done by the Node.js script of the same name that sits next to this file;
# this wrapper only locates that script and hands control to it.

# Absolute path of the directory containing this wrapper.
script_dir="$(cd "$(dirname "$0")" && pwd)"

# Replace this shell with node, forwarding every argument unchanged.
exec node "${script_dir}/aggregate-benchmark-stats.js" "$@"
@@ -0,0 +1,392 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * benchmark-runner.js - Unified entry point for benchmark execution
4
+ * Dispatches to solo-runner.sh and job-fair-runner.sh as appropriate
5
+ *
6
+ * Usage:
7
+ * benchmark-runner.js --mode catalog [--category CAT] [--format json|text]
8
+ * benchmark-runner.js --mode info --case CASE_ID [--format json|text]
9
+ * benchmark-runner.js --mode solo --case CASE_ID --agent AGENT_SPEC [--dry-run] [OUTPUT_DIR]
10
+ * benchmark-runner.js --mode suite --category CAT --agent AGENT_SPEC [--dry-run] [OUTPUT_DIR]
11
+ */
12
+
13
+ import { readFileSync, readdirSync, existsSync } from 'fs';
14
+ import { join, dirname } from 'path';
15
+ import { fileURLToPath } from 'url';
16
+ import { spawn, execSync } from 'child_process';
17
+
18
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Directory holding this script; the runner scripts it spawns are resolved against it.
const SCRIPTS_DIR = __dirname;

// Project root: three directory levels above the scripts directory.
const PROJECT_DIR = join(SCRIPTS_DIR, '..', '..', '..');

// Root of the YAML test-case tree.
const TEST_CASES_DIR = join(PROJECT_DIR, 'benchmarks', 'test-cases');
23
+
24
/**
 * Extract one field from a YAML file by running `yq -r`.
 *
 * @param {string} filePath - Path of the YAML file to query.
 * @param {string} field - yq path expression without its leading dot
 *   (a `.` is prepended before the expression is passed to yq).
 * @returns {string} The trimmed output of yq, or an empty string when that
 *   output is the literal text `null` or when the command fails for any reason.
 */
function parseYamlField(filePath, field) {
  // Both words are placed inside single quotes with embedded single quotes
  // escaped, so the shell passes them to yq verbatim. The previous form left
  // `"`, `$` and backticks in the path (and `'` in the expression) open to
  // interpretation by the shell.
  const shellQuote = (word) => `'${String(word).replace(/'/g, "'\\''")}'`;

  try {
    const result = execSync(`yq -r ${shellQuote(`.${field}`)} ${shellQuote(filePath)}`, {
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    const trimmed = result.trim();
    return trimmed === 'null' ? '' : trimmed;
  } catch {
    // Best-effort lookup: any failure of the command is reported as "no value".
    return '';
  }
}
37
+
38
/**
 * Read the metadata of one test case from its YAML file.
 *
 * The `id` field is read first; the remaining fields are only read when it
 * is non-empty.
 *
 * @param {string} filePath - Path of the test-case YAML file.
 * @returns {Object|null} An object with `id`, `name`, `category`,
 *   `difficulty`, `agent`, `version`, `description` and `_filePath`, or
 *   `null` when the file has no `id` or reading it throws.
 */
function loadTestCase(filePath) {
  const otherFields = ['name', 'category', 'difficulty', 'agent', 'version', 'description'];

  try {
    const id = parseYamlField(filePath, 'id');
    if (!id) {
      return null;
    }

    const testCase = { id };
    for (const field of otherFields) {
      testCase[field] = parseYamlField(filePath, field);
    }
    testCase._filePath = filePath;
    return testCase;
  } catch {
    return null;
  }
}
58
+
59
/**
 * Parse the command line of the benchmark runner.
 *
 * Options taking a value: --mode, --category, --case, --agent, --format.
 * Flags: --dry-run; --help / -h print usage and exit with status 0.
 * The first token that does not start with `-` and is not an option value
 * becomes `outputDir`; any other token is ignored.
 *
 * A value option that is last on the line, or is followed by another option,
 * is reported through error(). Previously `--agent --dry-run` stored
 * "--dry-run" as the agent and left dry-run mode off.
 *
 * @param {string[]} argv - Full argument vector (e.g. `process.argv`); the
 *   first two entries are skipped.
 * @returns {{mode: (string|null), category: (string|null), caseId: (string|null),
 *   agent: (string|null), format: string, dryRun: boolean, outputDir: (string|null)}}
 *   The parsed options; `format` defaults to 'text'.
 */
function parseArgs(argv) {
  const args = {
    mode: null,
    category: null,
    caseId: null,
    agent: null,
    format: 'text',
    dryRun: false,
    outputDir: null,
  };

  // Options that consume the next token, mapped to the property they set.
  const valueOptions = new Map([
    ['--mode', 'mode'],
    ['--category', 'category'],
    ['--case', 'caseId'],
    ['--agent', 'agent'],
    ['--format', 'format'],
  ]);

  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];

    if (valueOptions.has(arg)) {
      const value = argv[i + 1];
      if (value === undefined || value.startsWith('-')) {
        error(`Option ${arg} requires a value`);
      }
      args[valueOptions.get(arg)] = value;
      i++; // the value has been consumed
    } else if (arg === '--dry-run') {
      args.dryRun = true;
    } else if (arg === '--help' || arg === '-h') {
      showUsage();
      process.exit(0);
    } else if (!arg.startsWith('-') && !args.outputDir) {
      args.outputDir = arg;
    }
  }

  return args;
}
109
+
110
/**
 * Print the help text of the benchmark runner to stdout
 * (a single console.log call).
 */
function showUsage() {
  const helpLines = [
    'Usage: benchmark-runner.js [OPTIONS]',
    '',
    'Modes:',
    '--mode catalog List available test cases',
    '--mode info Show details of a specific test case',
    '--mode solo Run single agent on single test case',
    '--mode suite Run agent on all test cases in a category',
    '',
    'Options:',
    '--category CAT Filter by category (dev, architecture, code-review, etc.)',
    '--case CASE_ID Test case ID (e.g., dev-001)',
    '--agent SPEC Agent specification (theme:role, e.g., rome:dev)',
    '--format FORMAT Output format: text (default) or json',
    '--dry-run Show what would be run without executing',
    '',
    'Examples:',
    'benchmark-runner.js --mode catalog',
    'benchmark-runner.js --mode catalog --category dev --format json',
    'benchmark-runner.js --mode info --case dev-001',
    'benchmark-runner.js --mode solo --case dev-001 --agent rome:dev --dry-run',
    'benchmark-runner.js --mode suite --category dev --agent rome:dev --dry-run',
  ];
  console.log(helpLines.join('\n'));
}
133
+
134
/**
 * Report a fatal problem: write `Error: <message>` to stderr and terminate
 * the process with exit status 1.
 *
 * @param {string} message - Description of the problem.
 */
function error(message) {
  const text = `Error: ${message}`;
  console.error(text);
  process.exit(1);
}
138
+
139
/**
 * Collect every `.yaml` file below a directory, descending into
 * sub-directories.
 *
 * @param {string} dir - Directory to search.
 * @returns {string[]} Sorted paths of the files found (each built by joining
 *   onto `dir`); an empty array when `dir` does not exist.
 */
function findYamlFiles(dir) {
  if (!existsSync(dir)) {
    return [];
  }

  const collected = readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
    const fullPath = join(dir, entry.name);
    if (entry.isDirectory()) {
      return findYamlFiles(fullPath);
    }
    const isYamlFile = entry.isFile() && entry.name.endsWith('.yaml');
    return isYamlFile ? [fullPath] : [];
  });

  return collected.sort();
}
155
+
156
/**
 * Load every test case found under TEST_CASES_DIR.
 *
 * Files that fail to load or have no id are left out.
 *
 * @param {string|null} [categoryFilter=null] - When set, only test cases
 *   whose `category` equals this value are returned.
 * @returns {Object[]} The loaded test cases, in the order the files were found.
 */
function getAllTestCases(categoryFilter = null) {
  return findYamlFiles(TEST_CASES_DIR)
    .map((file) => loadTestCase(file))
    .filter((tc) => tc && tc.id)
    .filter((tc) => !categoryFilter || tc.category === categoryFilter);
}
172
+
173
/**
 * Look up a test case by its id.
 *
 * Files under TEST_CASES_DIR are loaded one at a time and the search stops
 * at the first match.
 *
 * @param {string} caseId - Id to look for.
 * @returns {Object|null} The matching test case, or `null` when none matches.
 */
function findTestCase(caseId) {
  for (const file of findYamlFiles(TEST_CASES_DIR)) {
    const candidate = loadTestCase(file);
    if (!candidate || candidate.id !== caseId) {
      continue;
    }
    return candidate;
  }
  return null;
}
184
+
185
/**
 * List the test-case categories, i.e. the names of the directories directly
 * under TEST_CASES_DIR.
 *
 * @returns {string[]} Sorted directory names; an empty array when
 *   TEST_CASES_DIR does not exist.
 */
function getCategories() {
  if (!existsSync(TEST_CASES_DIR)) {
    return [];
  }

  const names = [];
  for (const entry of readdirSync(TEST_CASES_DIR, { withFileTypes: true })) {
    if (entry.isDirectory()) {
      names.push(entry.name);
    }
  }
  names.sort();
  return names;
}
193
+
194
/**
 * Catalog mode: print the available test cases.
 *
 * With `format === 'json'` a single JSON array of
 * `{id, name, category, difficulty}` is printed. Otherwise the test cases
 * are listed per category directory; categories without test cases are
 * skipped unless a category was requested explicitly.
 *
 * @param {{category: (string|null), format: string}} args - Parsed options.
 */
function doCatalog(args) {
  const testCases = getAllTestCases(args.category);

  if (args.format === 'json') {
    const summary = testCases.map(({ id, name, category, difficulty }) => ({
      id,
      name,
      category,
      difficulty,
    }));
    console.log(JSON.stringify(summary));
    return;
  }

  console.log('Available Test Cases:');
  console.log('');

  const shownCategories = getCategories().filter(
    (cat) => !args.category || cat === args.category,
  );

  for (const cat of shownCategories) {
    const casesInCat = testCases.filter((tc) => tc.category === cat);
    const skipEmpty = casesInCat.length === 0 && !args.category;
    if (skipEmpty) {
      continue;
    }

    console.log(`[${cat}]`);
    casesInCat.forEach((tc) => {
      // Fixed-width id and name columns; names are cut at 40 characters.
      const id = (tc.id || '').padEnd(12);
      const name = (tc.name || '').substring(0, 40).padEnd(40);
      const diff = tc.difficulty || 'unknown';
      console.log(` ${id} ${name} (${diff})`);
    });
    console.log('');
  }
}
228
+
229
/**
 * Info mode: print the details of one test case.
 *
 * Reports through error() when no case id was given or the id is unknown.
 * With `format === 'json'` the metadata is printed as pretty-printed JSON;
 * otherwise as labelled text lines followed by the description.
 *
 * @param {{caseId: (string|null), format: string}} args - Parsed options.
 */
function doInfo(args) {
  if (!args.caseId) {
    error('Info mode requires --case CASE_ID');
  }

  const tc = findTestCase(args.caseId);
  if (!tc) {
    error(`Test case not found: ${args.caseId}`);
  }

  if (args.format === 'json') {
    const { id, name, category, difficulty, agent, version, description } = tc;
    const output = { id, name, category, difficulty, agent, version, description };
    console.log(JSON.stringify(output, null, 2));
    return;
  }

  const headerLines = [
    `Test Case: ${tc.id}`,
    '',
    `Name: ${tc.name}`,
    `Category: ${tc.category}`,
    `Difficulty: ${tc.difficulty}`,
    `Agent: ${tc.agent}`,
    `Version: ${tc.version}`,
    '',
    'Description:',
  ];
  for (const line of headerLines) {
    console.log(line);
  }

  const desc = tc.description || '';
  for (const line of desc.split('\n')) {
    console.log(` ${line}`);
  }
}
265
+
266
/**
 * Solo mode: run one agent on one test case through solo-runner.sh.
 *
 * Reports through error() when the case id or agent is missing or the case
 * id is unknown. With `dryRun` set, a JSON description of the command is
 * printed and nothing is executed. Otherwise solo-runner.sh (from
 * SCRIPTS_DIR) is spawned with the agent, case id and output directory, and
 * this process exits with the runner's status.
 *
 * @param {{caseId: (string|null), agent: (string|null), dryRun: boolean,
 *   outputDir: (string|null)}} args - Parsed options; `outputDir` defaults to
 *   `/tmp/benchmark-results`.
 */
function doSolo(args) {
  if (!args.caseId) {
    error('Solo mode requires --case CASE_ID');
  }
  if (!args.agent) {
    error('Solo mode requires --agent AGENT_SPEC');
  }

  const tc = findTestCase(args.caseId);
  if (!tc) {
    error(`Test case not found: ${args.caseId}`);
  }

  const outputDir = args.outputDir || '/tmp/benchmark-results';

  if (args.dryRun) {
    const output = {
      mode: 'solo',
      test_case: args.caseId,
      agent: args.agent,
      dry_run: true,
      would_execute: `solo-runner.sh ${args.agent} ${args.caseId} ${outputDir}`,
    };
    console.log(JSON.stringify(output, null, 2));
    return;
  }

  const soloRunner = join(SCRIPTS_DIR, 'solo-runner.sh');
  const child = spawn(soloRunner, [args.agent, args.caseId, outputDir], {
    stdio: 'inherit',
  });

  // Without a listener, a failure to start the runner (missing or
  // non-executable script) surfaces as an uncaught 'error' event.
  child.on('error', (err) => {
    error(`Failed to start ${soloRunner}: ${err.message}`);
  });

  child.on('close', (code, signal) => {
    // `code` is null when the runner was terminated by a signal; that must
    // not be reported as success.
    if (signal) {
      console.error(`solo-runner.sh was terminated by signal ${signal}`);
      process.exit(1);
    }
    process.exit(code || 0);
  });
}
299
+
300
/**
 * Suite mode: run one agent on every test case of a category.
 *
 * Reports through error() when the category or agent is missing. With
 * `dryRun` set, a JSON description of the run is printed and nothing is
 * executed. Otherwise solo-runner.sh (from SCRIPTS_DIR) is spawned once per
 * test case, one after another; a case counts as passed when the runner
 * exits with status 0. When all cases are done a summary is printed and the
 * process exits with status 1 if any case failed, 0 otherwise.
 *
 * @param {{category: (string|null), agent: (string|null), dryRun: boolean,
 *   outputDir: (string|null)}} args - Parsed options; `outputDir` defaults to
 *   `/tmp/benchmark-results`.
 */
function doSuite(args) {
  if (!args.category) {
    error('Suite mode requires --category CAT');
  }
  if (!args.agent) {
    error('Suite mode requires --agent AGENT_SPEC');
  }

  const testCases = getAllTestCases(args.category);
  const caseIds = testCases.map((tc) => tc.id);
  const total = caseIds.length;

  if (args.dryRun) {
    const output = {
      mode: 'suite',
      category: args.category,
      agent: args.agent,
      dry_run: true,
      total_cases: total,
      cases: caseIds,
      summary: `Would run ${total} test cases in category '${args.category}'`,
    };
    console.log(JSON.stringify(output, null, 2));
    return;
  }

  const outputDir = args.outputDir || '/tmp/benchmark-results';
  const soloRunner = join(SCRIPTS_DIR, 'solo-runner.sh');

  console.log(`Running suite: ${args.category} (${total} cases)`);
  console.log('');

  let passed = 0;
  let failed = 0;

  const runNext = (index) => {
    if (index >= caseIds.length) {
      console.log('');
      console.log(`Suite complete: ${passed} passed, ${failed} failed (total: ${total})`);
      process.exit(failed > 0 ? 1 : 0);
      return;
    }

    const caseId = caseIds[index];
    console.log(`[${caseId}] Running...`);

    const child = spawn(soloRunner, [args.agent, caseId, outputDir], {
      stdio: 'inherit',
    });

    // A child may report both 'error' and 'close'; record the case only once.
    let recorded = false;
    const record = (ok) => {
      if (recorded) return;
      recorded = true;
      if (ok) {
        passed++;
      } else {
        failed++;
      }
      runNext(index + 1);
    };

    // Without a listener, a failure to start the runner surfaces as an
    // uncaught 'error' event and aborts the whole suite.
    child.on('error', (err) => {
      console.error(`[${caseId}] Failed to start solo-runner.sh: ${err.message}`);
      record(false);
    });
    child.on('close', (code) => record(code === 0));
  };

  runNext(0);
}
361
+
362
/**
 * Script entry point: parse the command line and dispatch to the handler of
 * the requested mode. Prints usage and exits with status 0 when no mode was
 * given; reports an unknown mode through error().
 */
function main() {
  const args = parseArgs(process.argv);

  if (!args.mode) {
    showUsage();
    process.exit(0);
  }

  // Handler per mode; the key order is also the order shown in the error message.
  const handlers = {
    catalog: doCatalog,
    info: doInfo,
    solo: doSolo,
    suite: doSuite,
  };
  const validModes = Object.keys(handlers);

  if (!validModes.includes(args.mode)) {
    error(`Unknown mode: ${args.mode}. Valid modes: ${validModes.join(', ')}`);
  } else {
    handlers[args.mode](args);
  }
}

main();