tarsk 0.5.41 → 0.5.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bundled-skills/skill-creator/LICENSE.txt +202 -0
- package/dist/bundled-skills/skill-creator/SKILL.md +510 -0
- package/dist/bundled-skills/skill-creator/agents/analyzer.md +283 -0
- package/dist/bundled-skills/skill-creator/agents/comparator.md +203 -0
- package/dist/bundled-skills/skill-creator/agents/grader.md +227 -0
- package/dist/bundled-skills/skill-creator/assets/eval_review.html +292 -0
- package/dist/bundled-skills/skill-creator/eval-viewer/generate_review.js +544 -0
- package/dist/bundled-skills/skill-creator/eval-viewer/viewer.html +1478 -0
- package/dist/bundled-skills/skill-creator/package.json +3 -0
- package/dist/bundled-skills/skill-creator/references/schemas.md +423 -0
- package/dist/bundled-skills/skill-creator/scripts/aggregate_benchmark.js +377 -0
- package/dist/bundled-skills/skill-creator/scripts/generate_report.js +345 -0
- package/dist/bundled-skills/skill-creator/scripts/improve_description.js +263 -0
- package/dist/bundled-skills/skill-creator/scripts/package_skill.js +147 -0
- package/dist/bundled-skills/skill-creator/scripts/quick_validate.js +132 -0
- package/dist/bundled-skills/skill-creator/scripts/run_eval.js +345 -0
- package/dist/bundled-skills/skill-creator/scripts/run_loop.js +411 -0
- package/dist/bundled-skills/skill-creator/scripts/utils.js +60 -0
- package/dist/index.js +8817 -6340
- package/dist/public/assets/{account-view-D-dJ0y-D.js → account-view-xKotpUyx.js} +1 -1
- package/dist/public/assets/api-D6uLdHBQ.js +1 -0
- package/dist/public/assets/browser-tab-DxigYzoT.js +1 -0
- package/dist/public/assets/commit-dialog-CLQM9ah3.js +1 -0
- package/dist/public/assets/context-menu-rC7iWcty.js +1 -0
- package/dist/public/assets/create-repo-dialog-C6k5wZPW.js +1 -0
- package/dist/public/assets/{dialogs-config-B-LZ4nOb.js → dialogs-config-CjKh5Rl2.js} +14 -14
- package/dist/public/assets/diff-view-DWDWI5nl.js +3 -0
- package/dist/public/assets/explorer-tab-view-B0kT8Hl6.js +2 -0
- package/dist/public/assets/explorer-tree-BC4fBpxi.js +1 -0
- package/dist/public/assets/explorer-view-DIM08sdy.js +1 -0
- package/dist/public/assets/git-history-dialog-CuxOTngT.js +1 -0
- package/dist/public/assets/git-ops-button-C04zFAnF.js +2 -0
- package/dist/public/assets/history-view-ar7GLZ-R.js +9 -0
- package/dist/public/assets/index--HY4BbcM.js +90 -0
- package/dist/public/assets/index-DKOXV50p.css +1 -0
- package/dist/public/assets/mcp-server-card-Cy4RU2_Q.js +1 -0
- package/dist/public/assets/merged-pr-dialog-Bo07VouF.js +1 -0
- package/dist/public/assets/model-star-rating-BmkpdXfr.js +1 -0
- package/dist/public/assets/onboarding-ClZrOxX7.js +1 -0
- package/dist/public/assets/project-settings-view-Dm9pQAp_.js +1 -0
- package/dist/public/assets/providers-list-view-D5gHsjl_.js +1 -0
- package/dist/public/assets/pull-request-dialog-8AYlOUNX.js +1 -0
- package/dist/public/assets/pull-with-changes-dialog-CSa5OE-d.js +1 -0
- package/dist/public/assets/push-before-pr-dialog-D5W_xsqv.js +1 -0
- package/dist/public/assets/radio-group-CbatNaj1.js +1 -0
- package/dist/public/assets/react-vendor-DwQYi7es.js +16 -0
- package/dist/public/assets/settings-general-view-BP5ULy9A.js +1 -0
- package/dist/public/assets/settings-instructions-view-DMAjbi6E.js +1 -0
- package/dist/public/assets/settings-list-B8hiBkBz.js +1 -0
- package/dist/public/assets/settings-mcp-servers-view-OimQz-Rd.js +5 -0
- package/dist/public/assets/{settings-models-skeleton-ClrbJy_p.js → settings-models-skeleton-DPnYbg69.js} +1 -1
- package/dist/public/assets/settings-models-view-Fq3WtdKG.js +1 -0
- package/dist/public/assets/settings-rules-view-DBk7DzV2.js +8 -0
- package/dist/public/assets/settings-skills-view-CmOw-WMM.js +2 -0
- package/dist/public/assets/settings-slash-commands-view-FsrF5FkK.js +1 -0
- package/dist/public/assets/settings-subagents-view-D98Nxoly.js +2 -0
- package/dist/public/assets/{settings-system-prompt-view-Dl66VFaj.js → settings-system-prompt-view-B6Hy9ZyK.js} +1 -1
- package/dist/public/assets/settings-view-J-rjoRcU.js +2 -0
- package/dist/public/assets/skeleton-BHhGML7J.js +1 -0
- package/dist/public/assets/slug-utils-DyRUJ1NS.js +1 -0
- package/dist/public/assets/terminal-panel-DTOx74_o.js +1 -0
- package/dist/public/assets/{ui-components-C4RrfJEJ.js → ui-components-Jc6oi6bz.js} +1 -1
- package/dist/public/assets/use-deferred-search-B7EdyRbt.js +1 -0
- package/dist/public/assets/{utils-B7FQXlI6.js → utils-tgi5ym_d.js} +1 -1
- package/dist/public/assets/web-C3vJZ_3_.js +1 -0
- package/dist/public/assets/web-CUAWBWPy.js +1 -0
- package/dist/public/assets/{whisper-wasm-EGutPGND.js → whisper-wasm-CWcbC1MB.js} +1 -1
- package/dist/public/browser-preview-rpc.js +484 -0
- package/dist/public/index.html +8 -8
- package/package.json +4 -3
- package/dist/public/assets/api-DJaJqkc6.js +0 -1
- package/dist/public/assets/browser-tab-D7wEj-BD.js +0 -1
- package/dist/public/assets/commit-dialog-DdhGDH4F.js +0 -1
- package/dist/public/assets/context-menu-asf2g-KX.js +0 -1
- package/dist/public/assets/create-repo-dialog-Bltp6PKZ.js +0 -1
- package/dist/public/assets/diff-view-Bi545EPj.js +0 -3
- package/dist/public/assets/explorer-tab-view-B-P555GE.js +0 -2
- package/dist/public/assets/explorer-tree-CyXhVrI7.js +0 -1
- package/dist/public/assets/explorer-view-BAHDhIGN.js +0 -1
- package/dist/public/assets/git-history-dialog-Bci_iQmi.js +0 -1
- package/dist/public/assets/git-ops-button-1lum9QXI.js +0 -2
- package/dist/public/assets/history-view-CAkN8PCo.js +0 -9
- package/dist/public/assets/index-BLO68CQl.js +0 -69
- package/dist/public/assets/index-jIBJk8xl.css +0 -1
- package/dist/public/assets/mcp-server-card-DQUpkDFV.js +0 -1
- package/dist/public/assets/merged-pr-dialog-tHPrJ2CK.js +0 -1
- package/dist/public/assets/onboarding-Bbvb7kO4.js +0 -1
- package/dist/public/assets/project-settings-view-BRkHOq_q.js +0 -1
- package/dist/public/assets/providers-list-view-Dex879vv.js +0 -1
- package/dist/public/assets/pull-request-dialog-WHdmrW83.js +0 -1
- package/dist/public/assets/pull-with-changes-dialog-Ck3OwINV.js +0 -1
- package/dist/public/assets/push-before-pr-dialog-Bvqvz04U.js +0 -1
- package/dist/public/assets/radio-group-B0xvu5B9.js +0 -1
- package/dist/public/assets/react-vendor-D8PTA4EX.js +0 -16
- package/dist/public/assets/settings-general-view-lUxshNA9.js +0 -1
- package/dist/public/assets/settings-instructions-view-C57XGLha.js +0 -1
- package/dist/public/assets/settings-list-CHGKmGl_.js +0 -1
- package/dist/public/assets/settings-mcp-servers-view-DpqkhrgB.js +0 -5
- package/dist/public/assets/settings-models-view-QPEdnibD.js +0 -1
- package/dist/public/assets/settings-rules-view-WJU--cRq.js +0 -8
- package/dist/public/assets/settings-skills-view-mgHy4G_g.js +0 -2
- package/dist/public/assets/settings-slash-commands-view-LB5tVqy1.js +0 -1
- package/dist/public/assets/settings-subagents-view-QR2qlA_y.js +0 -2
- package/dist/public/assets/settings-view-BlVJv4Pz.js +0 -2
- package/dist/public/assets/skeleton-K-fVduHt.js +0 -1
- package/dist/public/assets/terminal-panel-BGQxckfH.js +0 -2
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
 * Aggregate individual run results into benchmark summary statistics.
 *
 * Usage:
 *   node aggregate_benchmark.js <benchmark_dir> [--skill-name NAME] [--skill-path PATH] [-o OUTPUT]
 */
|
|
8
|
+
|
|
9
|
+
const fs = require("fs");
|
|
10
|
+
const path = require("path");
|
|
11
|
+
|
|
12
|
+
/**
 * Compute summary statistics for a list of numbers.
 *
 * The standard deviation is the sample standard deviation (divides by
 * `n - 1`) and is 0 when fewer than two values are given. Every field of the
 * result is rounded to four decimal places.
 *
 * @param {number[]} values - Samples to summarise.
 * @returns {{mean: number, stddev: number, min: number, max: number}}
 *   The statistics; all zeros when `values` is empty.
 */
function calculateStats(values) {
  if (!values.length) {
    return { mean: 0.0, stddev: 0.0, min: 0.0, max: 0.0 };
  }

  const round4 = (x) => Math.round(x * 10000) / 10000;

  const n = values.length;
  const mean = values.reduce((sum, x) => sum + x, 0) / n;

  let stddev = 0.0;
  if (n > 1) {
    const variance = values.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (n - 1);
    stddev = Math.sqrt(variance);
  }

  // Accumulate min/max in a loop instead of Math.min(...values): spreading a
  // very large array as call arguments throws a RangeError.
  let min = Infinity;
  let max = -Infinity;
  for (const x of values) {
    min = Math.min(min, x);
    max = Math.max(max, x);
  }

  return {
    mean: round4(mean),
    stddev: round4(stddev),
    min: round4(min),
    max: round4(max),
  };
}
|
|
32
|
+
|
|
33
|
+
/**
 * List the immediate subdirectories of `dir` whose names start with "eval-".
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} `dir` joined with each matching entry name, in the
 *   order `fs.readdirSync` reports them. Empty when `dir` does not exist or
 *   is not a directory.
 */
function globEvalDirs(dir) {
  // A path that exists but is a regular file would make readdirSync throw
  // ENOTDIR; treat it the same as a missing directory.
  if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) {
    return [];
  }

  return fs
    .readdirSync(dir, { withFileTypes: true })
    .filter((entry) => entry.isDirectory() && entry.name.startsWith("eval-"))
    .map((entry) => path.join(dir, entry.name));
}
|
|
40
|
+
|
|
41
|
+
/**
 * Load every graded run found beneath a benchmark directory.
 *
 * Eval directories (`eval-*`) are looked up in `<benchmarkDir>/runs` when that
 * directory exists, otherwise directly in `benchmarkDir`. Inside each eval
 * directory, every subdirectory is treated as a configuration, and every
 * `run-*` entry inside a configuration as one run described by its
 * `grading.json` (with `timing.json` as a fallback source for timing/tokens).
 *
 * Runs whose `grading.json` is missing, is not valid JSON, or does not contain
 * a JSON object are skipped with a warning on stdout.
 *
 * @param {string} benchmarkDir - Root directory of the benchmark.
 * @returns {Object<string, Array<object>>} Map from configuration name to its
 *   run records. Each record has `eval_id`, `run_number`, `pass_rate`,
 *   `passed`, `failed`, `total`, `time_seconds`, `tokens`, `tool_calls`,
 *   `errors`, `expectations` and `notes`. Empty object when no eval
 *   directories are found.
 */
function loadRunResults(benchmarkDir) {
  const runsDir = path.join(benchmarkDir, "runs");
  let searchDir;
  if (fs.existsSync(runsDir) && fs.statSync(runsDir).isDirectory()) {
    searchDir = runsDir;
  } else if (globEvalDirs(benchmarkDir).length > 0) {
    searchDir = benchmarkDir;
  } else {
    console.log(
      `No eval directories found in ${benchmarkDir} or ${path.join(benchmarkDir, "runs")}`,
    );
    return {};
  }

  const results = {};

  const evalDirs = globEvalDirs(searchDir).sort();
  for (let evalIdx = 0; evalIdx < evalDirs.length; evalIdx++) {
    const evalDir = evalDirs[evalIdx];

    // Eval id priority: eval_metadata.json, then the number in the directory
    // name ("eval-<n>"), then the position in the sorted listing.
    const metadataPath = path.join(evalDir, "eval_metadata.json");
    let evalId = evalIdx;
    if (fs.existsSync(metadataPath)) {
      try {
        const metadata = JSON.parse(fs.readFileSync(metadataPath, "utf-8"));
        evalId = metadata.eval_id ?? evalIdx;
      } catch {
        // Unreadable or malformed metadata: fall back to the positional index.
        evalId = evalIdx;
      }
    } else {
      const parsed = parseInt(path.basename(evalDir).split("-")[1], 10);
      evalId = Number.isNaN(parsed) ? evalIdx : parsed;
    }

    for (const configName of fs.readdirSync(evalDir).sort()) {
      const configDir = path.join(evalDir, configName);
      if (!fs.statSync(configDir).isDirectory()) continue;

      const runDirs = fs
        .readdirSync(configDir)
        .filter((name) => name.startsWith("run-"))
        .map((name) => path.join(configDir, name));
      if (runDirs.length === 0) continue;

      if (!results[configName]) {
        results[configName] = [];
      }

      for (const runDir of runDirs.sort()) {
        const runNumber = parseInt(path.basename(runDir).split("-")[1], 10);
        const gradingFile = path.join(runDir, "grading.json");
        if (!fs.existsSync(gradingFile)) {
          console.log(`Warning: grading.json not found in ${runDir}`);
          continue;
        }

        let grading;
        try {
          grading = JSON.parse(fs.readFileSync(gradingFile, "utf-8"));
        } catch (error) {
          console.log(`Warning: Invalid JSON in ${gradingFile}: ${error.message}`);
          continue;
        }

        // Valid JSON is not necessarily an object (e.g. `null`, `5`, `[]`);
        // property access on `null` would throw, and other values would
        // silently yield an all-zero run that skews the statistics.
        if (grading === null || typeof grading !== "object" || Array.isArray(grading)) {
          console.log(`Warning: ${gradingFile} does not contain a JSON object`);
          continue;
        }

        const result = {
          eval_id: evalId,
          run_number: runNumber,
          pass_rate: grading.summary?.pass_rate ?? 0.0,
          passed: grading.summary?.passed ?? 0,
          failed: grading.summary?.failed ?? 0,
          total: grading.summary?.total ?? 0,
        };

        // Timing: prefer grading.json; consult the sibling timing.json only
        // when grading.json reports no duration.
        const timing = grading.timing ?? {};
        result.time_seconds = timing.total_duration_seconds ?? 0.0;
        const timingFile = path.join(runDir, "timing.json");
        if (result.time_seconds === 0.0 && fs.existsSync(timingFile)) {
          try {
            const timingData = JSON.parse(fs.readFileSync(timingFile, "utf-8"));
            result.time_seconds = timingData.total_duration_seconds ?? 0.0;
            result.tokens = timingData.total_tokens ?? 0;
          } catch {
            // ignore invalid timing.json
          }
        }

        const metrics = grading.execution_metrics ?? {};
        result.tool_calls = metrics.total_tool_calls ?? 0;
        if (!result.tokens) {
          // No token count from timing.json: fall back to output_chars.
          result.tokens = metrics.output_chars ?? 0;
        }
        result.errors = metrics.errors_encountered ?? 0;

        let rawExpectations = grading.expectations ?? [];
        if (!Array.isArray(rawExpectations)) {
          console.log(`Warning: expectations in ${gradingFile} is not an array; ignoring it`);
          rawExpectations = [];
        }
        for (const exp of rawExpectations) {
          // The `in` operator throws on primitives, so check the type first.
          const isObject = exp !== null && typeof exp === "object";
          if (!isObject || !("text" in exp) || !("passed" in exp)) {
            console.log(
              `Warning: expectation in ${gradingFile} missing required fields (text, passed, evidence): ${JSON.stringify(exp)}`,
            );
          }
        }
        result.expectations = rawExpectations;

        const notesSummary = grading.user_notes_summary ?? {};
        result.notes = [
          ...(notesSummary.uncertainties ?? []),
          ...(notesSummary.needs_review ?? []),
          ...(notesSummary.workarounds ?? []),
        ];

        results[configName].push(result);
      }
    }
  }

  return results;
}
|
|
158
|
+
|
|
159
|
+
/**
 * Summarise run records per configuration and compute the delta between the
 * first two configurations.
 *
 * @param {Object<string, Array<object>>} results - Map from configuration name
 *   to run records carrying `pass_rate`, `time_seconds` and `tokens`.
 * @returns {object} One entry per configuration holding `pass_rate`,
 *   `time_seconds` and `tokens` statistics (`mean`, `stddev`, `min`, `max`),
 *   plus a `delta` entry with signed, pre-formatted strings for
 *   (first configuration mean - second configuration mean). A missing
 *   configuration counts as 0.
 */
function aggregateResults(results) {
  const emptyStats = () => ({ mean: 0.0, stddev: 0.0, min: 0.0, max: 0.0 });
  const signed = (value, text) => `${value >= 0 ? "+" : ""}${text}`;

  const runSummary = {};
  const configs = Object.keys(results);

  for (const config of configs) {
    const runs = results[config] ?? [];
    if (!runs.length) {
      runSummary[config] = {
        pass_rate: emptyStats(),
        time_seconds: emptyStats(),
        tokens: emptyStats(),
      };
      continue;
    }

    runSummary[config] = {
      pass_rate: calculateStats(runs.map((r) => r.pass_rate)),
      time_seconds: calculateStats(runs.map((r) => r.time_seconds)),
      tokens: calculateStats(runs.map((r) => r.tokens ?? 0)),
    };
  }

  // The first configuration is the primary and the second the baseline, in
  // the key order of `results`.
  const primary = configs.length >= 1 ? (runSummary[configs[0]] ?? {}) : {};
  const baseline = configs.length >= 2 ? (runSummary[configs[1]] ?? {}) : {};

  const deltaPassRate = (primary.pass_rate?.mean ?? 0) - (baseline.pass_rate?.mean ?? 0);
  const deltaTime = (primary.time_seconds?.mean ?? 0) - (baseline.time_seconds?.mean ?? 0);
  const deltaTokens = (primary.tokens?.mean ?? 0) - (baseline.tokens?.mean ?? 0);

  runSummary.delta = {
    pass_rate: signed(deltaPassRate, deltaPassRate.toFixed(2)),
    time_seconds: signed(deltaTime, deltaTime.toFixed(1)),
    tokens: signed(deltaTokens, Math.round(deltaTokens)),
  };

  return runSummary;
}
|
|
202
|
+
|
|
203
|
+
/**
 * Build the complete benchmark document for a benchmark directory.
 *
 * @param {string} benchmarkDir - Directory containing the eval runs.
 * @param {string} [skillName=""] - Skill name recorded in the metadata; a
 *   placeholder is used when empty.
 * @param {string} [skillPath=""] - Skill path recorded in the metadata; a
 *   placeholder is used when empty.
 * @returns {{metadata: object, runs: object[], run_summary: object, notes: string[]}}
 *   The benchmark: metadata, the flat list of runs, per-configuration
 *   statistics, and an empty `notes` list. The model fields in the metadata
 *   are emitted as "<model-name>" placeholders.
 */
function generateBenchmark(benchmarkDir, skillName = "", skillPath = "") {
  // Reported when there are no runs to count.
  const DEFAULT_RUNS_PER_CONFIGURATION = 3;

  const results = loadRunResults(benchmarkDir);
  const runSummary = aggregateResults(results);

  const runs = [];
  for (const config of Object.keys(results)) {
    for (const result of results[config]) {
      runs.push({
        eval_id: result.eval_id,
        configuration: config,
        run_number: result.run_number,
        result: {
          pass_rate: result.pass_rate,
          passed: result.passed,
          failed: result.failed,
          total: result.total,
          time_seconds: result.time_seconds,
          tokens: result.tokens ?? 0,
          tool_calls: result.tool_calls ?? 0,
          errors: result.errors ?? 0,
        },
        expectations: result.expectations,
        notes: result.notes,
      });
    }
  }

  // Eval ids normally are numbers, but eval_metadata.json may supply strings;
  // plain subtraction would yield NaN for those and leave the order undefined.
  const compareEvalIds = (a, b) =>
    typeof a === "number" && typeof b === "number"
      ? a - b
      : String(a).localeCompare(String(b), undefined, { numeric: true });
  const evalIds = [...new Set(runs.map((r) => r.eval_id))].sort(compareEvalIds);

  // Derive the run count from the data: the largest number of runs recorded
  // for any single (configuration, eval) pair.
  const runCounts = new Map();
  let runsPerConfiguration = 0;
  for (const run of runs) {
    const key = `${run.configuration}\u0000${String(run.eval_id)}`;
    const count = (runCounts.get(key) ?? 0) + 1;
    runCounts.set(key, count);
    if (count > runsPerConfiguration) runsPerConfiguration = count;
  }
  if (runsPerConfiguration === 0) {
    runsPerConfiguration = DEFAULT_RUNS_PER_CONFIGURATION;
  }

  return {
    metadata: {
      skill_name: skillName || "<skill-name>",
      skill_path: skillPath || "<path/to/skill>",
      executor_model: "<model-name>",
      analyzer_model: "<model-name>",
      // ISO-8601 timestamp with the milliseconds dropped.
      timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, "Z"),
      evals_run: evalIds,
      runs_per_configuration: runsPerConfiguration,
    },
    runs,
    run_summary: runSummary,
    notes: [],
  };
}
|
|
253
|
+
|
|
254
|
+
/**
 * Render a benchmark document as a Markdown report.
 *
 * The summary table compares the first two configurations found in
 * `benchmark.run_summary` (ignoring the `delta` entry); missing
 * configurations are shown as "Config A" / "Config B" with zero values.
 *
 * @param {object} benchmark - Benchmark with `metadata` (`skill_name`,
 *   `executor_model`, `timestamp`, `evals_run`, `runs_per_configuration`),
 *   `run_summary` and optional `notes`.
 * @returns {string} The Markdown text, lines joined with "\n".
 */
function generateMarkdown(benchmark) {
  const metadata = benchmark.metadata;
  const runSummary = benchmark.run_summary;

  const titleCase = (name) => name.replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
  // Fixed precision keeps binary floating-point noise out of the report
  // (0.57 * 100 evaluates to 56.99999999999999).
  const percent = (fraction) => `${((fraction ?? 0) * 100).toFixed(1)}%`;
  const seconds = (value) => `${(value ?? 0).toFixed(1)}s`;
  const whole = (value) => Math.round(value ?? 0);

  const configs = Object.keys(runSummary).filter((k) => k !== "delta");
  const configA = configs[0] ?? "config_a";
  const configB = configs[1] ?? "config_b";
  const labelA = titleCase(configA);
  const labelB = titleCase(configB);

  const lines = [
    `# Skill Benchmark: ${metadata.skill_name}`,
    "",
    `**Model**: ${metadata.executor_model}`,
    `**Date**: ${metadata.timestamp}`,
    `**Evals**: ${metadata.evals_run.join(", ")} (${metadata.runs_per_configuration} runs each per configuration)`,
    "",
    "## Summary",
    "",
    `| Metric | ${labelA} | ${labelB} | Delta |`,
    "|--------|------------|---------------|-------|",
  ];

  const aSummary = runSummary[configA] ?? {};
  const bSummary = runSummary[configB] ?? {};
  const delta = runSummary.delta ?? {};

  const aPr = aSummary.pass_rate ?? {};
  const bPr = bSummary.pass_rate ?? {};
  lines.push(
    `| Pass Rate | ${percent(aPr.mean)} ± ${percent(aPr.stddev)} | ${percent(bPr.mean)} ± ${percent(bPr.stddev)} | ${delta.pass_rate ?? "—"} |`,
  );

  const aTime = aSummary.time_seconds ?? {};
  const bTime = bSummary.time_seconds ?? {};
  lines.push(
    `| Time | ${seconds(aTime.mean)} ± ${seconds(aTime.stddev)} | ${seconds(bTime.mean)} ± ${seconds(bTime.stddev)} | ${delta.time_seconds ?? "—"}s |`,
  );

  const aTokens = aSummary.tokens ?? {};
  const bTokens = bSummary.tokens ?? {};
  lines.push(
    `| Tokens | ${whole(aTokens.mean)} ± ${whole(aTokens.stddev)} | ${whole(bTokens.mean)} ± ${whole(bTokens.stddev)} | ${delta.tokens ?? "—"} |`,
  );

  if (benchmark.notes?.length) {
    lines.push("", "## Notes", "");
    for (const note of benchmark.notes) {
      lines.push(`- ${note}`);
    }
  }

  return lines.join("\n");
}
|
|
307
|
+
|
|
308
|
+
/**
 * Parse the command-line arguments of the benchmark aggregator.
 *
 * Recognised options are `--skill-name NAME`, `--skill-path PATH` and
 * `--output OUTPUT` (alias `-o`); each consumes the following argument. The
 * first argument that does not start with "-" becomes the benchmark
 * directory. Unrecognised options are ignored.
 *
 * @param {string[]} argv - Full argument vector (as `process.argv`); the
 *   first two entries are skipped.
 * @returns {{benchmarkDir: (string|null), skillName: string, skillPath: string, output: (string|null)}}
 *   The parsed arguments.
 */
function parseArgs(argv) {
  const args = { benchmarkDir: null, skillName: "", skillPath: "", output: null };
  const positional = [];

  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    switch (arg) {
      case "--skill-name":
        args.skillName = argv[++i] ?? "";
        break;
      case "--skill-path":
        args.skillPath = argv[++i] ?? "";
        break;
      case "--output":
      case "-o":
        args.output = argv[++i] ?? null;
        break;
      default:
        if (!arg.startsWith("-")) {
          positional.push(arg);
        }
    }
  }

  args.benchmarkDir = positional[0] ?? null;
  return args;
}
|
|
326
|
+
|
|
327
|
+
/**
 * Command-line entry point.
 *
 * Aggregates the benchmark directory named on the command line, writes the
 * benchmark as JSON (default `<benchmark_dir>/benchmark.json`, or the `-o`
 * path) and as a Markdown report next to it, then prints a short pass-rate
 * summary. Exits with status 1 when the directory argument is missing or the
 * directory does not exist.
 */
function main() {
  const args = parseArgs(process.argv);
  if (!args.benchmarkDir) {
    console.error(
      "Usage: node aggregate_benchmark.js <benchmark_dir> [--skill-name NAME] [--skill-path PATH] [-o OUTPUT]",
    );
    process.exit(1);
  }

  const benchmarkDir = path.resolve(args.benchmarkDir);
  if (!fs.existsSync(benchmarkDir)) {
    console.error(`Directory not found: ${benchmarkDir}`);
    process.exit(1);
  }

  const benchmark = generateBenchmark(benchmarkDir, args.skillName, args.skillPath);
  const outputJson = args.output
    ? path.resolve(args.output)
    : path.join(benchmarkDir, "benchmark.json");

  // Swap the extension for ".md". If that yields the JSON path itself (the
  // output was named "*.md"), append instead so the report never overwrites
  // the JSON file.
  const { dir, name } = path.parse(outputJson);
  let outputMd = path.join(dir, `${name}.md`);
  if (outputMd === outputJson) {
    outputMd = `${outputJson}.md`;
  }

  fs.writeFileSync(outputJson, JSON.stringify(benchmark, null, 2));
  console.log(`Generated: ${outputJson}`);

  const markdown = generateMarkdown(benchmark);
  fs.writeFileSync(outputMd, markdown);
  console.log(`Generated: ${outputMd}`);

  const runSummary = benchmark.run_summary;
  const configs = Object.keys(runSummary).filter((k) => k !== "delta");
  const delta = runSummary.delta ?? {};
  console.log("\nSummary:");
  for (const config of configs) {
    const pr = runSummary[config].pass_rate.mean;
    const label = config.replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
    console.log(` ${label}: ${(pr * 100).toFixed(1)}% pass rate`);
  }
  console.log(` Delta: ${delta.pass_rate ?? "—"}`);
}
|
|
366
|
+
|
|
367
|
+
if (require.main === module) {
|
|
368
|
+
main();
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
module.exports = {
|
|
372
|
+
calculateStats,
|
|
373
|
+
loadRunResults,
|
|
374
|
+
aggregateResults,
|
|
375
|
+
generateBenchmark,
|
|
376
|
+
generateMarkdown,
|
|
377
|
+
};
|