@cydm/pie 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +162 -9
  2. package/dist/builtin/extensions/ask-user/index.js +10 -2911
  3. package/dist/builtin/extensions/changelog/index.js +3 -8
  4. package/dist/builtin/extensions/deploy/index.js +1 -1
  5. package/dist/builtin/extensions/document-attachments/index.js +1 -0
  6. package/dist/builtin/extensions/files/index.js +1 -1
  7. package/dist/builtin/extensions/init/index.js +1 -3
  8. package/dist/builtin/extensions/kimi-attachments/index.js +4 -3
  9. package/dist/builtin/extensions/plan-mode/index.js +96 -165
  10. package/dist/builtin/extensions/subagent/index.js +88 -10991
  11. package/dist/builtin/extensions/todo/index.js +55 -2734
  12. package/dist/builtin/skills/browser-tools/CHANGELOG.md +2 -44
  13. package/dist/builtin/skills/browser-tools/README.md +10 -99
  14. package/dist/builtin/skills/browser-tools/SKILL.md +21 -174
  15. package/dist/builtin/skills/browser-tools/package.json +6 -13
  16. package/dist/builtin/skills/browser-tools/playwright-cli.js +24 -0
  17. package/dist/builtin/skills/pie-unity-rpc/SKILL.md +121 -0
  18. package/dist/builtin/skills/pie-unity-rpc/pie-unity-rpc.js +417 -0
  19. package/dist/builtin/skills/skill-creator/SKILL.md +17 -17
  20. package/dist/builtin/skills/skill-creator/eval-viewer/generate_review.mjs +285 -0
  21. package/dist/builtin/skills/skill-creator/eval-viewer/viewer.html +1 -1
  22. package/dist/builtin/skills/skill-creator/scripts/aggregate_benchmark.mjs +271 -0
  23. package/dist/builtin/skills/skill-creator/scripts/claude_cli.mjs +115 -0
  24. package/dist/builtin/skills/skill-creator/scripts/generate_report.mjs +224 -0
  25. package/dist/builtin/skills/skill-creator/scripts/improve_description.mjs +198 -0
  26. package/dist/builtin/skills/skill-creator/scripts/package_skill.mjs +132 -0
  27. package/dist/builtin/skills/skill-creator/scripts/pie_runner.mjs +115 -0
  28. package/dist/builtin/skills/skill-creator/scripts/quick_validate.mjs +44 -0
  29. package/dist/builtin/skills/skill-creator/scripts/run_eval.mjs +169 -0
  30. package/dist/builtin/skills/skill-creator/scripts/run_loop.mjs +297 -0
  31. package/dist/builtin/skills/skill-creator/scripts/skill_metadata.mjs +134 -0
  32. package/dist/chunks/chunk-A5JSJAPK.js +9994 -0
  33. package/dist/chunks/chunk-BHNULR7U.js +7991 -0
  34. package/dist/chunks/chunk-GDTN4UPJ.js +701 -0
  35. package/dist/chunks/chunk-TG2EQLX2.js +43 -0
  36. package/dist/chunks/src-3X3HBT2G.js +12 -0
  37. package/dist/chunks/typescript-GSKWJIO4.js +210747 -0
  38. package/dist/cli.js +21519 -33379
  39. package/models.schema.json +238 -0
  40. package/package.json +36 -11
  41. package/dist/builtin/extensions/questionnaire/index.js +0 -2753
  42. package/dist/builtin/skills/browser-tools/browser-content.js +0 -103
  43. package/dist/builtin/skills/browser-tools/browser-cookies.js +0 -35
  44. package/dist/builtin/skills/browser-tools/browser-eval.js +0 -49
  45. package/dist/builtin/skills/browser-tools/browser-hn-scraper.js +0 -108
  46. package/dist/builtin/skills/browser-tools/browser-nav.js +0 -44
  47. package/dist/builtin/skills/browser-tools/browser-pick.js +0 -162
  48. package/dist/builtin/skills/browser-tools/browser-screenshot.js +0 -34
  49. package/dist/builtin/skills/browser-tools/browser-start.js +0 -86
  50. package/dist/builtin/skills/skill-creator/eval-viewer/generate_review.py +0 -471
  51. package/dist/builtin/skills/skill-creator/scripts/__init__.py +0 -0
  52. package/dist/builtin/skills/skill-creator/scripts/aggregate_benchmark.py +0 -401
  53. package/dist/builtin/skills/skill-creator/scripts/generate_report.py +0 -326
  54. package/dist/builtin/skills/skill-creator/scripts/improve_description.py +0 -247
  55. package/dist/builtin/skills/skill-creator/scripts/package_skill.py +0 -136
  56. package/dist/builtin/skills/skill-creator/scripts/quick_validate.py +0 -103
  57. package/dist/builtin/skills/skill-creator/scripts/run_eval.py +0 -310
  58. package/dist/builtin/skills/skill-creator/scripts/run_loop.py +0 -328
  59. package/dist/builtin/skills/skill-creator/scripts/utils.py +0 -47
@@ -0,0 +1,285 @@
1
+ #!/usr/bin/env node
2
+
3
+ import path from "node:path";
4
+ import { createServer } from "node:http";
5
+ import { readFile, readdir, stat, writeFile } from "node:fs/promises";
6
+ import { fileURLToPath } from "node:url";
7
+ import { execFile } from "node:child_process";
8
+ import { promisify } from "node:util";
9
+ import { openInBrowser } from "../scripts/claude_cli.mjs";
10
+
11
// Promise-returning wrapper around child_process.execFile.
const execFileAsync = promisify(execFile);

// File names inside a run's `outputs/` directory that are skipped when output files are embedded.
const METADATA_FILES = new Set(["transcript.md", "user_notes.md", "metrics.json"]);

// Extensions whose files are embedded as UTF-8 text.
const TEXT_EXTENSIONS = new Set([".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", ".mjs"]);

// Extensions whose files are embedded as inline images.
const IMAGE_EXTENSIONS = new Set([".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"]);

// Explicit MIME types that take precedence over the common table below.
const MIME_OVERRIDES = {
  ".svg": "image/svg+xml",
  ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
};

// Common types for files that are embedded as data URIs. Without these, images and
// PDFs were labelled "application/octet-stream", which viewers may refuse to render.
const COMMON_MIME_TYPES = {
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".gif": "image/gif",
  ".webp": "image/webp",
  ".pdf": "application/pdf",
  ".zip": "application/zip",
  ".mp3": "audio/mpeg",
  ".wav": "audio/wav",
  ".mp4": "video/mp4",
  ".webm": "video/webm",
};

/**
 * Resolve the MIME type for a file from its extension (case-insensitive).
 *
 * @param {string} targetPath - File path or file name.
 * @returns {string} The matching MIME type, or "application/octet-stream" when the
 *   extension is unknown or absent.
 */
function getMimeType(targetPath) {
  const extension = path.extname(targetPath).toLowerCase();
  return MIME_OVERRIDES[extension] ?? COMMON_MIME_TYPES[extension] ?? "application/octet-stream";
}
26
+
27
/**
 * Report whether `stat` succeeds for the given path.
 *
 * @param {string} targetPath - Path to probe.
 * @returns {Promise<boolean>} true when `stat` resolves, false when it rejects.
 */
async function pathExists(targetPath) {
  return stat(targetPath).then(
    () => true,
    () => false,
  );
}
35
+
36
/**
 * Read and parse a JSON file, treating every failure as "no data".
 *
 * @param {string} targetPath - Path of the JSON file.
 * @returns {Promise<any|null>} The parsed value, or null when the file is missing,
 *   unreadable, or not valid JSON.
 */
async function readJsonIfExists(targetPath) {
  let text;
  try {
    text = await readFile(targetPath, "utf8");
  } catch {
    // Missing or unreadable file.
    return null;
  }
  try {
    return JSON.parse(text);
  } catch {
    // Malformed JSON.
    return null;
  }
}
44
+
45
/**
 * Collect every run found under a workspace, ordered by eval id and then by run id.
 * Runs without an eval id sort after all runs that have one.
 *
 * @param {string} workspace - Root directory to search.
 * @returns {Promise<object[]>} Sorted run records.
 */
export async function findRuns(workspace) {
  const collected = [];
  await findRunsRecursive(workspace, workspace, collected);

  const evalRank = (run) => run.eval_id ?? Number.MAX_SAFE_INTEGER;
  collected.sort((left, right) => {
    const byEval = evalRank(left) - evalRank(right);
    return byEval || left.id.localeCompare(right.id);
  });
  return collected;
}
51
+
52
// Directory names that are never descended into while searching for runs.
const SKIPPED_DIRECTORY_NAMES = new Set(["node_modules", ".git", "__pycache__", "skill", "inputs"]);

/**
 * Depth-first search for run directories. A directory that contains an `outputs`
 * entry is treated as a run and is not searched any further.
 *
 * @param {string} root - Workspace root, passed through to `buildRun`.
 * @param {string} current - Directory currently being inspected.
 * @param {object[]} runs - Accumulator that discovered runs are pushed onto.
 * @returns {Promise<void>}
 */
async function findRunsRecursive(root, current, runs) {
  // A single stat answers both "does it exist" and "is it a directory".
  const currentStat = await stat(current).catch(() => null);
  if (!currentStat?.isDirectory()) return;

  if (await pathExists(path.join(current, "outputs"))) {
    const builtRun = await buildRun(root, current);
    if (builtRun) {
      runs.push(builtRun);
    }
    return;
  }

  const entries = await readdir(current, { withFileTypes: true });
  const childNames = entries
    .filter((entry) => entry.isDirectory() && !SKIPPED_DIRECTORY_NAMES.has(entry.name))
    .map((entry) => entry.name)
    .sort((a, b) => a.localeCompare(b));
  for (const childName of childNames) {
    await findRunsRecursive(root, path.join(current, childName), runs);
  }
}
71
+
72
/**
 * Build the record for one run directory.
 *
 * The prompt is taken from `eval_metadata.json` (run directory first, then its
 * parent). If neither provides one, it is extracted from the "## Eval Prompt"
 * section of a transcript. If that also fails, a placeholder is used.
 *
 * @param {string} root - Workspace root; the run id is the path relative to it.
 * @param {string} runDir - Directory of the run.
 * @returns {Promise<{id: string, prompt: string, eval_id: any, outputs: object[], grading: any}>}
 */
async function buildRun(root, runDir) {
  const parentDir = path.dirname(runDir);
  let prompt = "";
  let evalId = null;

  for (const candidate of [path.join(runDir, "eval_metadata.json"), path.join(parentDir, "eval_metadata.json")]) {
    const metadata = await readJsonIfExists(candidate);
    if (metadata?.prompt) {
      prompt = metadata.prompt;
      evalId = metadata.eval_id ?? null;
      break;
    }
  }

  if (!prompt) {
    for (const candidate of [path.join(runDir, "transcript.md"), path.join(runDir, "outputs", "transcript.md")]) {
      if (!(await pathExists(candidate))) continue;
      const text = await readFile(candidate, "utf8");
      // `\r?\n` lets transcripts with CRLF line endings match as well.
      const match = text.match(/## Eval Prompt\r?\n\r?\n([\s\S]*?)(?=\r?\n##|$)/);
      if (match?.[1]) {
        prompt = match[1].trim();
        break;
      }
    }
  }

  if (!prompt) {
    prompt = "(No prompt found)";
  }

  const outputFiles = [];
  const outputsDir = path.join(runDir, "outputs");
  if (await pathExists(outputsDir)) {
    const entries = await readdir(outputsDir, { withFileTypes: true });
    for (const entry of entries.sort((a, b) => a.name.localeCompare(b.name))) {
      // Only regular files that are not run metadata are embedded.
      if (!entry.isFile() || METADATA_FILES.has(entry.name)) continue;
      outputFiles.push(await embedFile(path.join(outputsDir, entry.name)));
    }
  }

  // Grading may live next to the run or one level up.
  const grading = await readJsonIfExists(path.join(runDir, "grading.json"))
    ?? await readJsonIfExists(path.join(parentDir, "grading.json"));

  return {
    id: path.relative(root, runDir).replaceAll(path.sep, "-"),
    prompt,
    eval_id: evalId,
    outputs: outputFiles,
    grading,
  };
}
115
+
116
/**
 * Turn a file into an embeddable record. Text files carry their content, other
 * files carry base64 data, and unreadable files yield an error record or an
 * error placeholder as content.
 *
 * @param {string} targetPath - File to embed.
 * @returns {Promise<object>} Record with `name`, `type`, and type-specific fields.
 */
async function embedFile(targetPath) {
  const name = path.basename(targetPath);
  const extension = path.extname(targetPath).toLowerCase();

  if (TEXT_EXTENSIONS.has(extension)) {
    const content = await readFile(targetPath, "utf8").catch(() => "(Error reading file)");
    return { name, type: "text", content };
  }

  const raw = await readFile(targetPath).catch(() => null);
  if (!raw) {
    return { name, type: "error", content: "(Error reading file)" };
  }

  const mime = getMimeType(targetPath);
  const data = raw.toString("base64");
  const dataUri = `data:${mime};base64,${data}`;

  if (IMAGE_EXTENSIONS.has(extension)) {
    return { name, type: "image", mime, data_uri: dataUri };
  }
  switch (extension) {
    case ".pdf":
      return { name, type: "pdf", data_uri: dataUri };
    case ".xlsx":
      return { name, type: "xlsx", data_b64: data };
    default:
      return { name, type: "binary", mime, data_uri: dataUri };
  }
}
138
+
139
/**
 * Load feedback and outputs from an earlier workspace, keyed by run id.
 *
 * Every run found in the workspace gets an entry. Feedback whose run id does not
 * match any found run still gets an entry, with empty outputs.
 *
 * @param {string} workspace - Directory of the previous iteration.
 * @returns {Promise<Record<string, {feedback: string, outputs: object[]}>>}
 */
export async function loadPreviousIteration(workspace) {
  const feedback = await readJsonIfExists(path.join(workspace, "feedback.json"));

  // Keep only reviews that name a run and have non-blank feedback.
  const feedbackByRun = new Map();
  for (const review of feedback?.reviews ?? []) {
    if (!review?.run_id) continue;
    if (!String(review.feedback ?? "").trim()) continue;
    feedbackByRun.set(review.run_id, review.feedback);
  }

  const result = {};
  const runs = await findRuns(workspace);
  for (const run of runs) {
    result[run.id] = {
      feedback: feedbackByRun.get(run.id) ?? "",
      outputs: run.outputs ?? [],
    };
  }

  feedbackByRun.forEach((reviewFeedback, runId) => {
    if (!result[runId]) {
      result[runId] = { feedback: reviewFeedback, outputs: [] };
    }
  });
  return result;
}
161
+
162
/**
 * Render the viewer page by injecting run data into `viewer.html`, which is read
 * from the directory of this module.
 *
 * @param {object} options
 * @param {string} options.workspace - Workspace searched for runs.
 * @param {string} options.skillName - Name stored as `skill_name` in the embedded data.
 * @param {object|null} [options.previous] - Map of run id to `{ feedback, outputs }`.
 * @param {object|null} [options.benchmark] - Benchmark data, embedded when truthy.
 * @returns {Promise<string>} The complete HTML document.
 */
export async function generateHtml({ workspace, skillName, previous = null, benchmark = null }) {
  const templatePath = path.join(path.dirname(fileURLToPath(import.meta.url)), "viewer.html");
  const template = await readFile(templatePath, "utf8");
  const runs = await findRuns(workspace);

  const previousFeedback = {};
  const previousOutputs = {};
  if (previous) {
    for (const [runId, data] of Object.entries(previous)) {
      if (data?.feedback) previousFeedback[runId] = data.feedback;
      if (Array.isArray(data?.outputs) && data.outputs.length > 0) previousOutputs[runId] = data.outputs;
    }
  }

  const embedded = { skill_name: skillName, runs, previous_feedback: previousFeedback, previous_outputs: previousOutputs };
  if (benchmark) {
    embedded.benchmark = benchmark;
  }

  // Escape "<" so embedded text such as "</script>" cannot end the inline script;
  // "\u003c" decodes back to "<" when the browser parses the literal.
  const payload = JSON.stringify(embedded).replace(/</g, "\\u003c");

  // A replacer function is used because a replacement *string* would interpret
  // "$&", "$'", "$`" and "$$" sequences occurring in the embedded data.
  return template.replace("/*__EMBEDDED_DATA__*/", () => `const EMBEDDED_DATA = ${payload};`);
}
180
+
181
/**
 * Best effort: send SIGTERM to every process id that `lsof -ti :<port>` prints.
 * Any failure (lsof missing, timeout, no such process) is ignored.
 *
 * @param {number} port - TCP port to free.
 * @returns {Promise<void>}
 */
async function killPort(port) {
  let listing;
  try {
    ({ stdout: listing } = await execFileAsync("lsof", ["-ti", `:${port}`], { timeout: 5000 }));
  } catch {
    return;
  }

  for (const line of listing.split("\n")) {
    const pid = line.trim();
    if (!pid) continue;
    try {
      process.kill(Number(pid), "SIGTERM");
    } catch {}
  }
}
191
+
192
/**
 * Minimal command-line parser.
 *
 * - Arguments that do not start with "-" are collected in `_`.
 * - `--key value` / `-k value` store the value under `key` / `k`.
 * - `--key=value` / `-k=value` are accepted as an equivalent spelling.
 * - A flag followed by nothing, or by another flag, is stored as `true`.
 *
 * @param {string[]} args - Arguments without the node binary and script path.
 * @returns {{_: string[], [flag: string]: string | true | string[]}}
 */
function parseArgs(args) {
  const flags = { _: [] };
  for (let i = 0; i < args.length; i += 1) {
    const item = args[i];
    if (!item.startsWith("-")) {
      flags._.push(item);
      continue;
    }
    const key = item.startsWith("--") ? item.slice(2) : item.slice(1);

    const equalsAt = key.indexOf("=");
    if (equalsAt > 0) {
      flags[key.slice(0, equalsAt)] = key.slice(equalsAt + 1);
      continue;
    }

    const next = args[i + 1];
    // Compare against undefined so that an explicit empty-string value is kept as
    // the flag's value instead of leaking into the positional arguments.
    if (next === undefined || next.startsWith("-")) {
      flags[key] = true;
      continue;
    }
    flags[key] = next;
    i += 1;
  }
  return flags;
}
211
+
212
/**
 * CLI entry point: either writes a static viewer file (`--static` / `-s`) or
 * starts a local HTTP server that serves the viewer and reads/writes
 * `feedback.json` inside the workspace.
 *
 * @param {string[]} [argv] - Full argument vector including node binary and script.
 * @returns {Promise<void>}
 * @throws {Error} When the workspace argument is missing, an option lacks its
 *   value, or the port is not a valid TCP port.
 */
async function main(argv = process.argv) {
  const args = parseArgs(argv.slice(2));

  // Validate before resolving: path.resolve("") yields the current directory, so a
  // check on the resolved path can never detect a missing argument.
  const workspaceArg = args._[0];
  if (!workspaceArg) {
    throw new Error("Usage: node generate_review.mjs <workspace-path> [--port PORT] [--skill-name NAME] [--static output.html]");
  }
  const workspace = path.resolve(workspaceArg);

  // Return the first supplied option among `names`; reject flags given without a value.
  const optionValue = (...names) => {
    for (const name of names) {
      const value = args[name];
      if (value === undefined) continue;
      if (typeof value !== "string") {
        throw new Error(`Option ${name.length === 1 ? "-" : "--"}${name} requires a value`);
      }
      return value;
    }
    return undefined;
  };

  const skillName = optionValue("skill-name") ?? path.basename(workspace).replace(/-workspace$/, "");
  const previousWorkspace = optionValue("previous-workspace");
  const previous = previousWorkspace ? await loadPreviousIteration(path.resolve(previousWorkspace)) : {};
  const benchmarkPath = optionValue("benchmark");
  const benchmark = benchmarkPath ? await readJsonIfExists(path.resolve(benchmarkPath)) : null;

  const staticTarget = optionValue("static", "s");
  if (staticTarget) {
    const outputPath = path.resolve(staticTarget);
    await writeFile(outputPath, await generateHtml({ workspace, skillName, previous, benchmark }));
    process.stdout.write(`Static viewer written to: ${outputPath}\n`);
    return;
  }

  const portText = optionValue("port", "p");
  let port = portText === undefined ? 3117 : Number(portText);
  if (!Number.isInteger(port) || port < 0 || port > 65535) {
    throw new Error(`Invalid port: ${portText}`);
  }
  await killPort(port);

  const feedbackPath = path.join(workspace, "feedback.json");
  const server = createServer(async (req, res) => {
    try {
      if (req.method === "GET" && (req.url === "/" || req.url === "/index.html")) {
        // Regenerated on every request, so a reload picks up new outputs.
        const html = await generateHtml({ workspace, skillName, previous, benchmark });
        res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" });
        res.end(html);
        return;
      }
      if (req.method === "GET" && req.url === "/api/feedback") {
        const feedback = await readFile(feedbackPath, "utf8").catch(() => "{}");
        res.writeHead(200, { "Content-Type": "application/json" });
        res.end(feedback);
        return;
      }
      if (req.method === "POST" && req.url === "/api/feedback") {
        let body = "";
        req.setEncoding("utf8");
        for await (const chunk of req) {
          body += chunk;
        }
        const parsed = JSON.parse(body);
        if (!parsed || typeof parsed !== "object" || !Array.isArray(parsed.reviews)) {
          throw new Error("Expected JSON object with 'reviews' key");
        }
        await writeFile(feedbackPath, `${JSON.stringify(parsed, null, 2)}\n`);
        res.writeHead(200, { "Content-Type": "application/json" });
        res.end("{\"ok\":true}");
        return;
      }
      res.writeHead(404);
      res.end("Not found");
    } catch (error) {
      res.writeHead(500, { "Content-Type": "application/json" });
      res.end(JSON.stringify({ error: error instanceof Error ? error.message : String(error) }));
    }
  });

  await new Promise((resolve) => {
    // If the requested port cannot be bound, fall back to an OS-assigned port.
    server.once("error", () => {
      server.listen(0, "127.0.0.1", resolve);
    });
    server.listen(port, "127.0.0.1", resolve);
  });

  const address = server.address();
  port = typeof address === "object" && address ? address.port : port;
  const url = `http://localhost:${port}`;
  process.stdout.write(`Eval Viewer\nURL: ${url}\nWorkspace: ${workspace}\nFeedback: ${feedbackPath}\n`);
  await openInBrowser(url);
}
278
+
279
// Run main() only when this module is the script passed to node, not when imported.
const entryPath = fileURLToPath(import.meta.url);
const invokedScript = process.argv[1];

/** Print the failure message to stderr and mark the process as failed. */
function reportFatalError(error) {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`${message}\n`);
  process.exitCode = 1;
}

if (invokedScript && path.resolve(invokedScript) === entryPath) {
  main().catch(reportFatalError);
}
@@ -645,7 +645,7 @@
645
645
  <div class="toast" id="toast"></div>
646
646
 
647
647
  <script>
648
- // ---- Embedded data (injected by generate_review.py) ----
648
+ // ---- Embedded data (injected by generate_review.mjs) ----
649
649
  /*__EMBEDDED_DATA__*/
650
650
 
651
651
  // ---- State ----
@@ -0,0 +1,271 @@
1
+ #!/usr/bin/env node
2
+
3
+ import path from "node:path";
4
+ import { readdir, readFile, stat, writeFile } from "node:fs/promises";
5
+ import { fileURLToPath } from "node:url";
6
+
7
/**
 * Round a number to a fixed count of decimal places.
 *
 * @param {number} value - Number to round.
 * @param {number} [places=4] - Decimal places to keep.
 * @returns {number}
 */
function round(value, places = 4) {
  const factor = 10 ** places;
  return Math.round(value * factor) / factor;
}

/**
 * Compute mean, sample standard deviation (n - 1 denominator), minimum and
 * maximum of a list of numbers, each rounded to 4 decimal places.
 *
 * @param {number[]} values - Samples; anything that is not a non-empty array
 *   yields all zeros.
 * @returns {{mean: number, stddev: number, min: number, max: number}}
 */
export function calculateStats(values) {
  if (!Array.isArray(values) || values.length === 0) {
    return { mean: 0, stddev: 0, min: 0, max: 0 };
  }

  // One pass instead of Math.min(...values) / Math.max(...values): spreading a very
  // large array into call arguments throws a RangeError.
  let total = 0;
  let min = Infinity;
  let max = -Infinity;
  for (const value of values) {
    total += value;
    min = Math.min(min, value);
    max = Math.max(max, value);
  }
  const mean = total / values.length;

  let stddev = 0;
  if (values.length > 1) {
    const squaredError = values.reduce((sum, value) => sum + ((value - mean) ** 2), 0);
    stddev = Math.sqrt(squaredError / (values.length - 1));
  }

  return {
    mean: round(mean, 4),
    stddev: round(stddev, 4),
    min: round(min, 4),
    max: round(max, 4),
  };
}
29
+
30
/**
 * Report whether `stat` succeeds for the given path.
 *
 * @param {string} targetPath - Path to probe.
 * @returns {Promise<boolean>} true when `stat` resolves, false when it rejects.
 */
async function pathExists(targetPath) {
  const info = await stat(targetPath).catch(() => null);
  return info !== null;
}
38
+
39
/**
 * Read and parse a JSON file, returning null instead of failing.
 *
 * @param {string} targetPath - Path of the JSON file.
 * @returns {Promise<any|null>} The parsed value, or null when the file is missing,
 *   unreadable, or not valid JSON.
 */
async function readJsonIfExists(targetPath) {
  try {
    const text = await readFile(targetPath, "utf8");
    return JSON.parse(text);
  } catch {
    return null;
  }
}
49
+
50
/**
 * List the names of the immediate sub-directories of a directory, sorted with the
 * default string ordering.
 *
 * @param {string} targetPath - Directory to read.
 * @returns {Promise<string[]>} Sorted sub-directory names.
 */
async function listDirectories(targetPath) {
  const names = [];
  for (const entry of await readdir(targetPath, { withFileTypes: true })) {
    if (entry.isDirectory()) {
      names.push(entry.name);
    }
  }
  names.sort();
  return names;
}
54
+
55
+ export async function loadRunResults(benchmarkDir) {
56
+ const runsDir = path.join(benchmarkDir, "runs");
57
+ const benchmarkEvalDirs = (await pathExists(benchmarkDir))
58
+ ? (await listDirectories(benchmarkDir)).filter((name) => name.startsWith("eval-"))
59
+ : [];
60
+ let searchDir = null;
61
+ if (await pathExists(runsDir)) {
62
+ searchDir = runsDir;
63
+ } else if (benchmarkEvalDirs.length > 0) {
64
+ searchDir = benchmarkDir;
65
+ } else {
66
+ return {};
67
+ }
68
+
69
+ const results = {};
70
+ const evalDirs = (await listDirectories(searchDir)).filter((name) => name.startsWith("eval-"));
71
+ for (const [evalIdx, evalDirName] of evalDirs.entries()) {
72
+ const evalDir = path.join(searchDir, evalDirName);
73
+ const metadata = await readJsonIfExists(path.join(evalDir, "eval_metadata.json"));
74
+ const parsedEvalId = Number.parseInt(evalDirName.split("-")[1], 10);
75
+ const evalId = metadata?.eval_id ?? (Number.isNaN(parsedEvalId) ? evalIdx : parsedEvalId);
76
+ for (const configName of await listDirectories(evalDir)) {
77
+ const configDir = path.join(evalDir, configName);
78
+ const childDirs = await listDirectories(configDir);
79
+ const runDirs = childDirs.filter((name) => name.startsWith("run-"));
80
+ if (runDirs.length === 0) {
81
+ continue;
82
+ }
83
+ if (!results[configName]) {
84
+ results[configName] = [];
85
+ }
86
+ for (const runDirName of runDirs) {
87
+ const runDir = path.join(configDir, runDirName);
88
+ const grading = await readJsonIfExists(path.join(runDir, "grading.json"));
89
+ if (!grading) {
90
+ continue;
91
+ }
92
+ const timing = grading.timing ?? {};
93
+ let timeSeconds = Number(timing.total_duration_seconds ?? 0);
94
+ let tokens = 0;
95
+ const timingJson = await readJsonIfExists(path.join(runDir, "timing.json"));
96
+ if (timeSeconds === 0 && timingJson) {
97
+ timeSeconds = Number(timingJson.total_duration_seconds ?? 0);
98
+ tokens = Number(timingJson.total_tokens ?? 0);
99
+ }
100
+ const executionMetrics = grading.execution_metrics ?? {};
101
+ if (tokens === 0) {
102
+ tokens = Number(executionMetrics.output_chars ?? 0);
103
+ }
104
+ results[configName].push({
105
+ eval_id: evalId,
106
+ run_number: Number.parseInt(runDirName.split("-")[1], 10) || 0,
107
+ pass_rate: Number(grading.summary?.pass_rate ?? 0),
108
+ passed: Number(grading.summary?.passed ?? 0),
109
+ failed: Number(grading.summary?.failed ?? 0),
110
+ total: Number(grading.summary?.total ?? 0),
111
+ time_seconds: timeSeconds,
112
+ tokens,
113
+ tool_calls: Number(executionMetrics.total_tool_calls ?? 0),
114
+ errors: Number(executionMetrics.errors_encountered ?? 0),
115
+ expectations: Array.isArray(grading.expectations) ? grading.expectations : [],
116
+ notes: [
117
+ ...(grading.user_notes_summary?.uncertainties ?? []),
118
+ ...(grading.user_notes_summary?.needs_review ?? []),
119
+ ...(grading.user_notes_summary?.workarounds ?? []),
120
+ ],
121
+ });
122
+ }
123
+ }
124
+ }
125
+ return results;
126
+ }
127
+
128
/**
 * Summarise run results per configuration and add a `delta` entry comparing the
 * first configuration (primary) against the second (baseline).
 *
 * @param {Record<string, object[]>} results - Run records grouped by configuration.
 * @returns {object} One `{ pass_rate, time_seconds, tokens }` stats entry per
 *   configuration, plus `delta` holding signed, formatted differences of the means.
 */
export function aggregateResults(results) {
  const runSummary = {};
  const configs = Object.keys(results);
  for (const config of configs) {
    const runs = results[config] ?? [];
    const statsFor = (metricName) => calculateStats(runs.map((run) => Number(run[metricName] ?? 0)));
    runSummary[config] = {
      pass_rate: statsFor("pass_rate"),
      time_seconds: statsFor("time_seconds"),
      tokens: statsFor("tokens"),
    };
  }

  const [primaryConfig, baselineConfig] = configs;
  const primary = runSummary[primaryConfig] ?? {};
  const baseline = runSummary[baselineConfig] ?? {};

  // The sign is chosen from the *rounded* difference so that a tiny negative
  // difference that rounds to zero is shown as "+0.00" like any other zero.
  const signedDelta = (metricName, places) => {
    const difference = (primary[metricName]?.mean ?? 0) - (baseline[metricName]?.mean ?? 0);
    const rounded = round(difference, places);
    return `${rounded >= 0 ? "+" : ""}${rounded.toFixed(places)}`;
  };

  runSummary.delta = {
    pass_rate: signedDelta("pass_rate", 2),
    time_seconds: signedDelta("time_seconds", 1),
    tokens: signedDelta("tokens", 0),
  };
  return runSummary;
}
150
+
151
/**
 * Assemble the complete benchmark document for a benchmark directory.
 *
 * @param {string} benchmarkDir - Directory containing the eval runs.
 * @param {object} [options]
 * @param {string} [options.skillName] - Skill name; a placeholder is used when empty.
 * @param {string} [options.skillPath] - Skill path; a placeholder is used when empty.
 * @returns {Promise<object>} Object with `metadata`, `runs`, `run_summary` and `notes`.
 */
export async function generateBenchmark(benchmarkDir, { skillName = "", skillPath = "" } = {}) {
  const results = await loadRunResults(benchmarkDir);
  const runSummary = aggregateResults(results);

  // Flatten the per-configuration lists into one list of run entries.
  const runs = Object.entries(results).flatMap(([configuration, entries]) =>
    entries.map((result) => ({
      eval_id: result.eval_id,
      configuration,
      run_number: result.run_number,
      result: {
        pass_rate: result.pass_rate,
        passed: result.passed,
        failed: result.failed,
        total: result.total,
        time_seconds: result.time_seconds,
        tokens: result.tokens,
        tool_calls: result.tool_calls,
        errors: result.errors,
      },
      expectations: result.expectations,
      notes: result.notes,
    })),
  );

  const evalIds = Array.from(new Set(runs.map((run) => run.eval_id)));
  evalIds.sort((a, b) => a - b);

  // ISO timestamp with the millisecond part removed.
  const timestamp = new Date().toISOString().replace(/\.\d{3}Z$/, "Z");

  return {
    metadata: {
      skill_name: skillName || "<skill-name>",
      skill_path: skillPath || "<path/to/skill>",
      executor_model: "<model-name>",
      analyzer_model: "<model-name>",
      timestamp,
      evals_run: evalIds,
      runs_per_configuration: 3,
    },
    runs,
    run_summary: runSummary,
    notes: [],
  };
}
192
+
193
/**
 * Render a benchmark document as a Markdown summary comparing the first two
 * configurations found in `run_summary`.
 *
 * @param {object} benchmark - Document with optional `metadata`, `run_summary`
 *   and `notes` fields.
 * @returns {string} Markdown text without a trailing newline.
 */
export function generateMarkdown(benchmark) {
  const metadata = benchmark.metadata ?? {};
  const runSummary = benchmark.run_summary ?? {};
  const configs = Object.keys(runSummary).filter((key) => key !== "delta");
  const configA = configs[0] ?? "config_a";
  const configB = configs[1] ?? "config_b";
  const aSummary = runSummary[configA] ?? {};
  const bSummary = runSummary[configB] ?? {};
  const delta = runSummary.delta ?? {};

  // "with_skill" -> "With Skill"
  const toLabel = (config) => config.replaceAll("_", " ").replace(/\b\w/g, (char) => char.toUpperCase());

  // Fixed-point formatting; values that are not finite numbers are shown as given.
  const fixed = (value, digits) => {
    const number = value ?? 0;
    return typeof number === "number" && Number.isFinite(number) ? number.toFixed(digits) : String(number);
  };
  // Fractions become whole percentages; without rounding, float multiplication
  // leaks values such as "58.330000000000005%" into the table.
  const percentCell = (stats) => `${fixed((stats?.mean ?? 0) * 100, 0)}% ± ${fixed((stats?.stddev ?? 0) * 100, 0)}%`;
  const secondsCell = (stats) => `${fixed(stats?.mean, 1)}s ± ${fixed(stats?.stddev, 1)}s`;
  const countCell = (stats) => `${Math.round(stats?.mean ?? 0)} ± ${Math.round(stats?.stddev ?? 0)}`;

  const lines = [
    `# Skill Benchmark: ${metadata.skill_name ?? "<skill-name>"}`,
    "",
    `**Model**: ${metadata.executor_model ?? "<model-name>"}`,
    `**Date**: ${metadata.timestamp ?? ""}`,
    `**Evals**: ${(metadata.evals_run ?? []).join(", ")} (${metadata.runs_per_configuration ?? 0} runs each per configuration)`,
    "",
    "## Summary",
    "",
    `| Metric | ${toLabel(configA)} | ${toLabel(configB)} | Delta |`,
    "|--------|------------|---------------|-------|",
    `| Pass Rate | ${percentCell(aSummary.pass_rate)} | ${percentCell(bSummary.pass_rate)} | ${delta.pass_rate ?? "—"} |`,
    `| Time | ${secondsCell(aSummary.time_seconds)} | ${secondsCell(bSummary.time_seconds)} | ${delta.time_seconds ?? "—"}s |`,
    `| Tokens | ${countCell(aSummary.tokens)} | ${countCell(bSummary.tokens)} | ${delta.tokens ?? "—"} |`,
  ];

  if (Array.isArray(benchmark.notes) && benchmark.notes.length > 0) {
    lines.push("", "## Notes", "");
    for (const note of benchmark.notes) {
      lines.push(`- ${note}`);
    }
  }
  return lines.join("\n");
}
227
+
228
/**
 * Minimal command-line parser.
 *
 * - Arguments that do not start with "-" are collected in `_`.
 * - `--key value` / `-k value` store the value under `key` / `k`.
 * - `--key=value` / `-k=value` are accepted as an equivalent spelling.
 * - A flag followed by nothing, or by another flag, is stored as `true`.
 *
 * @param {string[]} args - Arguments without the node binary and script path.
 * @returns {{_: string[], [flag: string]: string | true | string[]}}
 */
function parseArgs(args) {
  const flags = { _: [] };
  for (let i = 0; i < args.length; i += 1) {
    const item = args[i];
    if (!item.startsWith("-")) {
      flags._.push(item);
      continue;
    }
    const key = item.startsWith("--") ? item.slice(2) : item.slice(1);

    const equalsAt = key.indexOf("=");
    if (equalsAt > 0) {
      flags[key.slice(0, equalsAt)] = key.slice(equalsAt + 1);
      continue;
    }

    const next = args[i + 1];
    // Compare against undefined so that an explicit empty-string value is kept as
    // the flag's value instead of leaking into the positional arguments.
    if (next === undefined || next.startsWith("-")) {
      flags[key] = true;
      continue;
    }
    flags[key] = next;
    i += 1;
  }
  return flags;
}
247
+
248
/**
 * CLI entry point: aggregate a benchmark directory into a JSON file and a
 * Markdown report written next to it.
 *
 * @param {string[]} [argv] - Full argument vector including node binary and script.
 * @returns {Promise<void>}
 * @throws {Error} When the benchmark directory argument is missing or an option
 *   is given without a value.
 */
async function main(argv = process.argv) {
  const args = parseArgs(argv.slice(2));
  const benchmarkDir = args._[0];
  if (!benchmarkDir) {
    throw new Error("Usage: node aggregate_benchmark.mjs <benchmark_dir> [--skill-name name] [--skill-path path] [-o output.json]");
  }

  // Return the first supplied option among `names`; reject flags given without a value.
  const optionValue = (...names) => {
    for (const name of names) {
      const value = args[name];
      if (value === undefined) continue;
      if (typeof value !== "string") {
        throw new Error(`Option ${name.length === 1 ? "-" : "--"}${name} requires a value`);
      }
      return value;
    }
    return undefined;
  };

  const outputJsonPath = path.resolve(optionValue("output", "o") ?? path.join(benchmarkDir, "benchmark.json"));
  // When the output name does not end in ".json" a plain suffix swap would leave the
  // path unchanged and the Markdown report would overwrite the JSON file.
  const outputMarkdownPath = /\.json$/i.test(outputJsonPath)
    ? outputJsonPath.replace(/\.json$/i, ".md")
    : `${outputJsonPath}.md`;

  const benchmark = await generateBenchmark(path.resolve(benchmarkDir), {
    skillName: optionValue("skill-name") ?? "",
    skillPath: optionValue("skill-path") ?? "",
  });

  await writeFile(outputJsonPath, `${JSON.stringify(benchmark, null, 2)}\n`);
  await writeFile(outputMarkdownPath, `${generateMarkdown(benchmark)}\n`);
  process.stdout.write(`Generated: ${outputJsonPath}\n`);
  process.stdout.write(`Generated: ${outputMarkdownPath}\n`);
}
264
+
265
// Run main() only when this module is the script passed to node, not when imported.
const entryPath = fileURLToPath(import.meta.url);
const invokedScript = process.argv[1];

/** Print the failure message to stderr and mark the process as failed. */
function reportFatalError(error) {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`${message}\n`);
  process.exitCode = 1;
}

if (invokedScript && path.resolve(invokedScript) === entryPath) {
  main().catch(reportFatalError);
}
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env node
2
+
3
import { spawn, spawnSync } from "node:child_process";
import { statSync } from "node:fs";
import os from "node:os";
import path from "node:path";
6
+
7
/**
 * Walk upward from `start` to find the project root: the nearest directory that
 * either contains a `.claude` directory or is itself named `.claude` (in which
 * case its parent is returned).
 *
 * @param {string} [start] - Directory to start from; defaults to the current
 *   working directory.
 * @returns {string} The project root, or the resolved `start` when no `.claude`
 *   directory exists on the way to the filesystem root.
 */
export function findProjectRoot(start = process.cwd()) {
  const current = path.resolve(start);
  let cursor = current;
  while (true) {
    if (path.basename(cursor) === ".claude") {
      return path.dirname(cursor);
    }
    if (existsDir(path.join(cursor, ".claude"))) {
      return cursor;
    }
    const parent = path.dirname(cursor);
    if (parent === cursor) {
      // Reached the filesystem root without finding a marker.
      return current;
    }
    cursor = parent;
  }
}

/**
 * Check whether a path is an existing directory.
 *
 * Uses the filesystem API instead of spawning the external `test` command, so it
 * does not start a process per directory level and does not depend on that
 * binary being installed.
 *
 * @param {string} dir - Path to check.
 * @returns {boolean} true only when the path exists and is a directory.
 */
function existsDir(dir) {
  try {
    return statSync(dir).isDirectory();
  } catch {
    // Missing path or permission problem: treat as "not a directory".
    return false;
  }
}
29
+
30
/**
 * Create the environment for spawned `claude` processes: a copy of the current
 * environment without the `CLAUDECODE` variable. `process.env` is not modified.
 *
 * @returns {Record<string, string | undefined>}
 */
export function buildClaudeEnv() {
  const { CLAUDECODE: _removed, ...env } = process.env;
  return env;
}
35
+
36
/**
 * Send a prompt to `claude -p` via stdin and return its text output.
 *
 * @param {string} prompt - Prompt text written to the process's stdin.
 * @param {object} [options]
 * @param {string} [options.model] - Passed as `--model` when truthy.
 * @param {number} [options.timeout=300000] - Timeout in milliseconds.
 * @returns {Promise<string>} Captured standard output.
 */
export async function callClaudeText(prompt, { model, timeout = 300_000 } = {}) {
  const modelArgs = model ? ["--model", model] : [];
  return runClaude(["-p", "--output-format", "text", ...modelArgs], { stdin: prompt, timeout });
}
41
+
42
/**
 * Spawn `claude` with piped stdout (stdin and stderr ignored) and return the
 * child process. The child is sent SIGKILL if it has not exited within `timeout`.
 *
 * @param {string[]} args - Arguments for the `claude` executable.
 * @param {object} [options]
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {number} [options.timeout=30000] - Kill timeout in milliseconds.
 * @returns {import("node:child_process").ChildProcess}
 */
export function runClaudeStream(args, { cwd, timeout = 30_000 } = {}) {
  const spawnOptions = {
    cwd,
    env: buildClaudeEnv(),
    stdio: ["ignore", "pipe", "ignore"],
  };
  const child = spawn("claude", args, spawnOptions);

  const killTimer = setTimeout(() => {
    child.kill("SIGKILL");
  }, timeout);
  const cancelKill = () => clearTimeout(killTimer);

  child.once("exit", cancelKill);
  child.once("error", cancelKill);
  return child;
}
57
+
58
/**
 * Run `claude` to completion and resolve with its standard output.
 *
 * @param {string[]} args - Arguments for the `claude` executable.
 * @param {object} [options]
 * @param {string} [options.stdin] - Text written to the child's stdin before it is closed.
 * @param {number} [options.timeout=300000] - Milliseconds before the child is
 *   killed with SIGKILL and the promise rejects.
 * @param {string} [options.cwd] - Working directory for the child.
 * @returns {Promise<string>} Captured stdout.
 * @throws {Error} Rejects on spawn failure, timeout, or non-zero exit (the
 *   message then includes the captured stderr).
 */
async function runClaude(args, { stdin, timeout = 300_000, cwd } = {}) {
  return new Promise((resolve, reject) => {
    const child = spawn("claude", args, {
      cwd,
      env: buildClaudeEnv(),
      stdio: ["pipe", "pipe", "pipe"],
    });

    let stdout = "";
    let stderr = "";

    // Decode at the stream level: converting each Buffer chunk separately can split
    // a multi-byte UTF-8 character that straddles two chunks.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    const timer = setTimeout(() => {
      child.kill("SIGKILL");
      reject(new Error(`claude -p timed out after ${Math.round(timeout / 1000)}s`));
    }, timeout);

    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", (error) => {
      clearTimeout(timer);
      reject(error);
    });
    child.on("close", (code) => {
      clearTimeout(timer);
      if (code !== 0) {
        reject(new Error(`claude -p exited ${code}\nstderr: ${stderr}`));
        return;
      }
      resolve(stdout);
    });

    // If the child exits before reading its input, the write fails with EPIPE.
    // Without a listener that stream error would crash the process; the failure
    // is already reported through the "close"/"error" handlers above.
    child.stdin.on("error", () => {});
    if (stdin) {
      child.stdin.write(stdin);
    }
    child.stdin.end();
  });
}
99
+
100
/**
 * Produce a short hexadecimal identifier: up to 8 hex digits derived from
 * Math.random() followed by the last 4 hex digits of the current timestamp.
 * Not suitable for security-sensitive use.
 *
 * @returns {string}
 */
export function uuidFragment() {
  const randomPart = Math.random().toString(16).substring(2, 10);
  const timePart = Date.now().toString(16).slice(-4);
  return `${randomPart}${timePart}`;
}
103
+
104
/**
 * Open a URL or file with the platform's default handler (`open` on macOS,
 * `cmd /c start` on Windows, `xdg-open` elsewhere). The launcher is detached and
 * not awaited.
 *
 * @param {string} target - URL or path to open.
 * @returns {void}
 */
export function openInBrowser(target) {
  const platform = os.platform();
  let command = "xdg-open";
  let args = [target];
  if (platform === "darwin") {
    command = "open";
  } else if (platform === "win32") {
    command = "cmd";
    // The empty string is the window-title argument of `start`.
    args = ["/c", "start", "", target];
  }

  const child = spawn(command, args, { detached: true, stdio: "ignore" });
  // A missing launcher (for example no xdg-open on a headless machine) is reported
  // through an "error" event; without a listener that event would crash the caller.
  child.on("error", (error) => {
    process.stderr.write(`Could not open ${target} automatically: ${error.message}\n`);
  });
  child.unref();
}