@cydm/pie 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +162 -9
- package/dist/builtin/extensions/ask-user/index.js +10 -2911
- package/dist/builtin/extensions/changelog/index.js +3 -8
- package/dist/builtin/extensions/deploy/index.js +1 -1
- package/dist/builtin/extensions/document-attachments/index.js +1 -0
- package/dist/builtin/extensions/files/index.js +1 -1
- package/dist/builtin/extensions/init/index.js +1 -3
- package/dist/builtin/extensions/kimi-attachments/index.js +4 -3
- package/dist/builtin/extensions/plan-mode/index.js +96 -165
- package/dist/builtin/extensions/subagent/index.js +88 -10991
- package/dist/builtin/extensions/todo/index.js +55 -2734
- package/dist/builtin/skills/browser-tools/CHANGELOG.md +2 -44
- package/dist/builtin/skills/browser-tools/README.md +10 -99
- package/dist/builtin/skills/browser-tools/SKILL.md +21 -174
- package/dist/builtin/skills/browser-tools/package.json +6 -13
- package/dist/builtin/skills/browser-tools/playwright-cli.js +24 -0
- package/dist/builtin/skills/pie-unity-rpc/SKILL.md +121 -0
- package/dist/builtin/skills/pie-unity-rpc/pie-unity-rpc.js +417 -0
- package/dist/builtin/skills/skill-creator/SKILL.md +17 -17
- package/dist/builtin/skills/skill-creator/eval-viewer/generate_review.mjs +285 -0
- package/dist/builtin/skills/skill-creator/eval-viewer/viewer.html +1 -1
- package/dist/builtin/skills/skill-creator/scripts/aggregate_benchmark.mjs +271 -0
- package/dist/builtin/skills/skill-creator/scripts/claude_cli.mjs +115 -0
- package/dist/builtin/skills/skill-creator/scripts/generate_report.mjs +224 -0
- package/dist/builtin/skills/skill-creator/scripts/improve_description.mjs +198 -0
- package/dist/builtin/skills/skill-creator/scripts/package_skill.mjs +132 -0
- package/dist/builtin/skills/skill-creator/scripts/pie_runner.mjs +115 -0
- package/dist/builtin/skills/skill-creator/scripts/quick_validate.mjs +44 -0
- package/dist/builtin/skills/skill-creator/scripts/run_eval.mjs +169 -0
- package/dist/builtin/skills/skill-creator/scripts/run_loop.mjs +297 -0
- package/dist/builtin/skills/skill-creator/scripts/skill_metadata.mjs +134 -0
- package/dist/chunks/chunk-A5JSJAPK.js +9994 -0
- package/dist/chunks/chunk-BHNULR7U.js +7991 -0
- package/dist/chunks/chunk-GDTN4UPJ.js +701 -0
- package/dist/chunks/chunk-TG2EQLX2.js +43 -0
- package/dist/chunks/src-3X3HBT2G.js +12 -0
- package/dist/chunks/typescript-GSKWJIO4.js +210747 -0
- package/dist/cli.js +21519 -33379
- package/models.schema.json +238 -0
- package/package.json +36 -11
- package/dist/builtin/extensions/questionnaire/index.js +0 -2753
- package/dist/builtin/skills/browser-tools/browser-content.js +0 -103
- package/dist/builtin/skills/browser-tools/browser-cookies.js +0 -35
- package/dist/builtin/skills/browser-tools/browser-eval.js +0 -49
- package/dist/builtin/skills/browser-tools/browser-hn-scraper.js +0 -108
- package/dist/builtin/skills/browser-tools/browser-nav.js +0 -44
- package/dist/builtin/skills/browser-tools/browser-pick.js +0 -162
- package/dist/builtin/skills/browser-tools/browser-screenshot.js +0 -34
- package/dist/builtin/skills/browser-tools/browser-start.js +0 -86
- package/dist/builtin/skills/skill-creator/eval-viewer/generate_review.py +0 -471
- package/dist/builtin/skills/skill-creator/scripts/__init__.py +0 -0
- package/dist/builtin/skills/skill-creator/scripts/aggregate_benchmark.py +0 -401
- package/dist/builtin/skills/skill-creator/scripts/generate_report.py +0 -326
- package/dist/builtin/skills/skill-creator/scripts/improve_description.py +0 -247
- package/dist/builtin/skills/skill-creator/scripts/package_skill.py +0 -136
- package/dist/builtin/skills/skill-creator/scripts/quick_validate.py +0 -103
- package/dist/builtin/skills/skill-creator/scripts/run_eval.py +0 -310
- package/dist/builtin/skills/skill-creator/scripts/run_loop.py +0 -328
- package/dist/builtin/skills/skill-creator/scripts/utils.py +0 -47
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { access } from "node:fs/promises";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
import { parseSkillFile, validateFrontmatter } from "./skill_metadata.mjs";
|
|
7
|
+
|
|
8
|
+
/**
 * Validate the frontmatter of a skill's SKILL.md file.
 *
 * @param {string} skillPath - Path to the skill directory, or directly to its
 *   SKILL.md file (matched case-insensitively on the file name).
 * @returns {Promise<{valid: boolean, message: string}>} The validation verdict.
 *   Failures to locate, read or parse the file are reported as `valid: false`
 *   with the error message rather than being thrown.
 */
export async function validateSkill(skillPath) {
  const resolved = path.resolve(skillPath);
  // Accept a direct SKILL.md path as well as the directory, so this entry
  // point agrees with the other scripts' "<skill-dir|SKILL.md>" convention.
  const pointsAtSkillFile = path.basename(resolved).toLowerCase() === "skill.md";
  const skillDir = pointsAtSkillFile ? path.dirname(resolved) : resolved;
  const skillMdPath = path.join(skillDir, "SKILL.md");

  try {
    await access(skillMdPath);
  } catch {
    return { valid: false, message: "SKILL.md not found" };
  }

  try {
    const { frontmatter } = await parseSkillFile(skillDir);
    return validateFrontmatter(frontmatter);
  } catch (error) {
    return { valid: false, message: error instanceof Error ? error.message : String(error) };
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * Command-line entry point: validate one skill directory and report the result.
 *
 * Expects exactly one user argument (so `argv` has three entries). Prints the
 * validation message to stdout and sets the process exit code to 0 when the
 * skill is valid, 1 otherwise. On wrong usage it prints a usage line to stderr
 * and sets exit code 1.
 *
 * @param {string[]} [argv=process.argv] - Full argument vector.
 * @returns {Promise<void>}
 */
async function main(argv = process.argv) {
  const hasSingleArgument = argv.length === 3;
  if (!hasSingleArgument) {
    process.stderr.write("Usage: node quick_validate.mjs <skill_directory>\n");
    process.exitCode = 1;
    return;
  }

  const { valid, message } = await validateSkill(argv[2]);
  process.stdout.write(`${message}\n`);
  if (valid) {
    process.exitCode = 0;
  } else {
    process.exitCode = 1;
  }
}
|
|
37
|
+
|
|
38
|
+
// Run the CLI only when this module is the script Node was started with
// (process.argv[1] resolves to this file); importing it has no side effects.
const entryPath = fileURLToPath(import.meta.url);
const invokedScript = process.argv[1];
if (invokedScript && path.resolve(invokedScript) === entryPath) {
  main().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    process.stderr.write(`${message}\n`);
    process.exitCode = 1;
  });
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
|
+
import os from "node:os";
|
|
5
|
+
import path from "node:path";
|
|
6
|
+
import { fileURLToPath } from "node:url";
|
|
7
|
+
import { parseSkillFile, resolveSkillPath } from "./skill_metadata.mjs";
|
|
8
|
+
import { callPieText, findProjectRoot, uuidFragment } from "./pie_runner.mjs";
|
|
9
|
+
|
|
10
|
+
/**
 * Run one query against Pie inside a throwaway project that contains an
 * instrumented copy of the skill, and report whether the skill triggered.
 *
 * The temporary skill's description asks the model to append a unique marker
 * when it decides the skill applies; the result is whether that marker occurs
 * in the response text.
 *
 * @param {string} query - The user prompt to send.
 * @param {string} skillName - Skill name; used for the temp skill's name and heading.
 * @param {string} skillDescription - Description under evaluation.
 * @param {number} timeout - Timeout in seconds (passed on multiplied by 1000).
 * @param {string} projectRoot - Accepted for interface compatibility; not used here.
 * @param {string|null} model - Accepted for interface compatibility; not forwarded.
 *   NOTE(review): confirm whether callPieText supports a model option.
 * @returns {Promise<boolean>} True when the response contains the trigger marker.
 */
export async function runSingleQuery(query, skillName, skillDescription, timeout, projectRoot, model) {
  const cleanName = `${skillName}-skill-${uuidFragment()}`.toLowerCase().replace(/[^a-z0-9-_]/g, "-");
  const triggerToken = `PIE_SKILL_TRIGGERED_${uuidFragment().toUpperCase()}`;
  const instrumentedDescription = `${skillDescription} For evaluation only: if you decide this skill applies to the user's request, append this exact marker on its own final line: ${triggerToken}`;
  const skillMarkdown = [
    "---",
    `name: ${cleanName}`,
    // The description contains ": " (and may contain "#", quotes or newlines),
    // none of which are safe in an unquoted YAML scalar. A JSON string literal
    // is a valid YAML double-quoted scalar, so emit it in that form.
    `description: ${JSON.stringify(instrumentedDescription)}`,
    "---",
    "",
    `# ${skillName}`,
    "",
    "This temporary skill exists only to evaluate whether the description triggers in Pie.",
    "",
  ].join("\n");

  const tempProject = await mkdtemp(path.join(os.tmpdir(), "pie-skill-eval-"));
  try {
    // Everything after mkdtemp runs inside the try so the temp project is
    // removed even when writing the skill file fails.
    const skillDir = path.join(tempProject, ".pie", "skills", cleanName);
    await mkdir(skillDir, { recursive: true });
    await writeFile(path.join(skillDir, "SKILL.md"), skillMarkdown, "utf8");

    const response = await callPieText(query, {
      cwd: tempProject,
      timeout: timeout * 1000,
      sessionId: `skill-eval-${uuidFragment()}`,
    });
    return response.includes(triggerToken);
  } finally {
    await rm(tempProject, { recursive: true, force: true });
  }
}
|
|
34
|
+
|
|
35
|
+
/**
 * Evaluate a skill description against a set of queries.
 *
 * Each eval item is run `runsPerQuery` times through `runSingleQuery` by a pool
 * of concurrent workers. Runs are grouped by query text; a query passes when
 * its trigger rate is at or above `triggerThreshold` (for `should_trigger`
 * items) or below it (for the others). A run that throws is logged to stderr
 * and counted as "did not trigger".
 *
 * @param {Array<{query: string, should_trigger: boolean}>} evalSet - Items to evaluate.
 * @param {string} skillName - Skill name, echoed in the output.
 * @param {string} description - Description under evaluation.
 * @param {number} numWorkers - Requested concurrency; clamped to 1..number of runs.
 * @param {number} timeout - Per-query timeout, forwarded to `runSingleQuery`.
 * @param {string} projectRoot - Forwarded to `runSingleQuery`.
 * @param {number} [runsPerQuery=1] - Runs per eval item.
 * @param {number} [triggerThreshold=0.5] - Trigger-rate pass threshold.
 * @param {string|null} [model=null] - Forwarded to `runSingleQuery`.
 * @returns {Promise<{skill_name: string, description: string, results: object[], summary: {total: number, passed: number, failed: number}}>}
 */
export async function runEval(evalSet, skillName, description, numWorkers, timeout, projectRoot, runsPerQuery = 1, triggerThreshold = 0.5, model = null) {
  const tasks = [];
  for (const item of evalSet) {
    for (let runIdx = 0; runIdx < runsPerQuery; runIdx += 1) {
      tasks.push({ item, runIdx });
    }
  }

  const queryTriggers = new Map();
  let nextIndex = 0;

  // Each worker claims the next unclaimed task index; the claim is synchronous,
  // so no task is picked up twice.
  async function worker() {
    while (nextIndex < tasks.length) {
      const currentIndex = nextIndex;
      nextIndex += 1;
      const { item } = tasks[currentIndex];
      const query = item.query;
      if (!queryTriggers.has(query)) {
        queryTriggers.set(query, { item, triggers: [] });
      }
      try {
        const result = await runSingleQuery(query, skillName, description, timeout, projectRoot, model);
        queryTriggers.get(query).triggers.push(!!result);
      } catch (error) {
        process.stderr.write(`Warning: query failed: ${error instanceof Error ? error.message : String(error)}\n`);
        queryTriggers.get(query).triggers.push(false);
      }
    }
  }

  // Math.max(1, NaN) is NaN, which would start zero workers and silently yield
  // no results; Infinity would make Array.from throw. Clamp to a sane range.
  const requestedWorkers = Number(numWorkers);
  const wholeWorkers = Number.isNaN(requestedWorkers) ? 1 : Math.floor(requestedWorkers);
  const workerCount = Math.max(1, Math.min(wholeWorkers, tasks.length));
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  const results = [];
  for (const [query, payload] of queryTriggers.entries()) {
    const { item, triggers } = payload;
    const triggerCount = triggers.filter(Boolean).length;
    const triggerRate = triggers.length > 0 ? triggerCount / triggers.length : 0;
    const didPass = item.should_trigger ? triggerRate >= triggerThreshold : triggerRate < triggerThreshold;
    results.push({
      query,
      should_trigger: item.should_trigger,
      trigger_rate: triggerRate,
      triggers: triggerCount,
      runs: triggers.length,
      pass: didPass,
    });
  }

  const passed = results.filter((item) => item.pass).length;
  return {
    skill_name: skillName,
    description,
    results,
    summary: {
      total: results.length,
      passed,
      failed: results.length - passed,
    },
  };
}
|
|
95
|
+
|
|
96
|
+
/**
 * Command-line entry point for a single evaluation run.
 *
 * Parses the flags, loads the eval set (JSON) and the skill's metadata, runs
 * `runEval`, and prints the result as pretty-printed JSON on stdout. Prints
 * help and sets exit code 1 when `--skill-path` or `--eval-set` is missing.
 *
 * @param {string[]} [argv=process.argv] - Full argument vector.
 * @returns {Promise<void>}
 */
async function main(argv = process.argv) {
  const args = parseArgs(argv.slice(2));
  if (args.help || args.h) {
    printHelp();
    return;
  }
  if (!args["skill-path"] || !args["eval-set"]) {
    printHelp();
    process.exitCode = 1;
    return;
  }
  const evalSet = JSON.parse(await readFile(args["eval-set"], "utf8"));
  const skillPath = resolveSkillPath(args["skill-path"]);
  // Read and parse SKILL.md once; both the name and the fallback description
  // come from the same parse result.
  const skill = await parseSkillFile(skillPath);
  const description = args.description ?? skill.description;

  const output = await runEval(
    evalSet,
    skill.name,
    description,
    Number(args["num-workers"] ?? 10),
    Number(args.timeout ?? 30),
    findProjectRoot(),
    Number(args["runs-per-query"] ?? 1),
    Number(args["trigger-threshold"] ?? 0.5),
    args.model ?? null,
  );
  process.stdout.write(`${JSON.stringify(output, null, 2)}\n`);
}
|
|
125
|
+
|
|
126
|
+
/**
 * Write the run_eval usage text to stdout.
 *
 * @returns {void}
 */
function printHelp() {
  const helpLines = [
    "Usage: node run_eval.mjs --skill-path <skill-dir|SKILL.md> --eval-set <eval-set.json> [options]",
    "",
    "Options:",
    " --description <text> Override the skill description for evaluation",
    " --num-workers <n> Number of concurrent workers (default: 10)",
    " --timeout <seconds> Timeout per query (default: 30)",
    " --runs-per-query <n> Number of runs per query (default: 1)",
    " --trigger-threshold <ratio> Pass threshold for positive prompts (default: 0.5)",
    " --model <id> Optional model override",
    " -h, --help Show this help",
    "",
    "The --skill-path argument may point to either the skill directory or its SKILL.md file.",
    "",
  ];
  const helpText = helpLines.join("\n");
  process.stdout.write(helpText);
}
|
|
145
|
+
|
|
146
|
+
/**
 * Parse command-line flags into a plain object.
 *
 * Recognizes `--name` long flags and single-letter `-x` short flags (so the
 * advertised `-h` works). A flag followed by a value stores that value as a
 * string; a flag followed by another flag, or by nothing, stores `true`.
 * Anything else (positional arguments) is ignored.
 *
 * @param {string[]} args - Arguments after the script path.
 * @returns {Object<string, string|boolean>} Map of flag name (without dashes) to value.
 */
function parseArgs(args) {
  const isShortFlag = (token) => /^-[A-Za-z]$/.test(token);
  const isFlag = (token) => token.startsWith("--") || isShortFlag(token);

  const flags = {};
  for (let i = 0; i < args.length; i += 1) {
    const token = args[i];
    if (!isFlag(token)) continue;
    const key = token.startsWith("--") ? token.slice(2) : token.slice(1);
    const next = args[i + 1];
    // Only a missing argument or another flag means "no value"; an explicit
    // empty string is a real value and must not be turned into `true`.
    if (next === undefined || isFlag(next)) {
      flags[key] = true;
      continue;
    }
    flags[key] = next;
    i += 1;
  }
  return flags;
}
|
|
162
|
+
|
|
163
|
+
// Run the CLI only when this module is the script Node was started with
// (process.argv[1] resolves to this file); importing it has no side effects.
const entryPath = fileURLToPath(import.meta.url);
const invokedScript = process.argv[1];
if (invokedScript && path.resolve(invokedScript) === entryPath) {
  main().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    process.stderr.write(`${message}\n`);
    process.exitCode = 1;
  });
}
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
6
|
+
import { fileURLToPath } from "node:url";
|
|
7
|
+
import { generateHtml } from "./generate_report.mjs";
|
|
8
|
+
import { improveDescription } from "./improve_description.mjs";
|
|
9
|
+
import { findProjectRoot, openInBrowser } from "./claude_cli.mjs";
|
|
10
|
+
import { runEval } from "./run_eval.mjs";
|
|
11
|
+
import { parseSkillFile, resolveSkillPath } from "./skill_metadata.mjs";
|
|
12
|
+
|
|
13
|
+
/**
 * Return a shuffled copy of `values` using a deterministic pseudo-random
 * sequence derived from `seed`; the input is not modified.
 *
 * The generator is a 32-bit linear congruential recurrence
 * (multiplier 1664525, increment 1013904223), and the shuffle walks the array
 * from the last index down, swapping each slot with a random earlier-or-equal one.
 *
 * @param {Iterable<*>} values - Items to shuffle.
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {Array<*>} New array with the same items in shuffled order.
 */
function seededShuffle(values, seed) {
  let lcgState = seed >>> 0;
  const nextUnit = () => {
    // Math.imul keeps the low 32 bits of the product, which is all that
    // survives the unsigned wrap below.
    lcgState = (Math.imul(1664525, lcgState) + 1013904223) >>> 0;
    return lcgState / 0x100000000;
  };

  const shuffled = [...values];
  let upper = shuffled.length - 1;
  while (upper > 0) {
    const pick = Math.floor(nextUnit() * (upper + 1));
    const held = shuffled[upper];
    shuffled[upper] = shuffled[pick];
    shuffled[pick] = held;
    upper -= 1;
  }
  return shuffled;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Split an eval set into train and test portions, stratified by `should_trigger`.
 *
 * Each class is shuffled deterministically (the negative class uses `seed + 1`)
 * and its first `max(1, floor(size * holdout))` items go to the test set; the
 * rest go to the train set. Positive items come first in both returned sets.
 *
 * @param {Array<{should_trigger: boolean}>} evalSet - Items to split.
 * @param {number} holdout - Fraction of each class reserved for testing.
 * @param {number} [seed=42] - Shuffle seed.
 * @returns {{trainSet: object[], testSet: object[]}}
 */
export function splitEvalSet(evalSet, holdout, seed = 42) {
  const positives = [];
  const negatives = [];
  for (const entry of evalSet) {
    if (entry.should_trigger) {
      positives.push(entry);
    } else {
      negatives.push(entry);
    }
  }

  const shuffledPositives = seededShuffle(positives, seed);
  const shuffledNegatives = seededShuffle(negatives, seed + 1);
  const heldOutCount = (group) => Math.max(1, Math.floor(group.length * holdout));
  const positiveCut = heldOutCount(shuffledPositives);
  const negativeCut = heldOutCount(shuffledNegatives);

  const testSet = shuffledPositives.slice(0, positiveCut).concat(shuffledNegatives.slice(0, negativeCut));
  const trainSet = shuffledPositives.slice(positiveCut).concat(shuffledNegatives.slice(negativeCut));
  return { trainSet, testSet };
}
|
|
37
|
+
|
|
38
|
+
/**
 * Count passing and failing entries in a list of per-query results.
 *
 * @param {Array<{pass: boolean}>} results - Per-query results.
 * @returns {{passed: number, failed: number, total: number}}
 */
function summarizeResults(results) {
  const total = results.length;
  let passed = 0;
  for (const entry of results) {
    if (entry.pass) {
      passed += 1;
    }
  }
  return { passed, failed: total - passed, total };
}
|
|
42
|
+
|
|
43
|
+
/**
 * Total up individual runs across results and count how many were correct.
 *
 * A run is correct when it triggered for a `should_trigger` item, or did not
 * trigger for any other item. Missing `runs`/`triggers` count as 0.
 *
 * @param {Array<{runs?: number, triggers?: number, should_trigger: boolean}>} results
 * @returns {{correct: number, total: number}}
 */
function aggregateRuns(results) {
  return results.reduce(
    (tally, entry) => {
      const runCount = Number(entry.runs ?? 0);
      const triggerCount = Number(entry.triggers ?? 0);
      const correctRuns = entry.should_trigger ? triggerCount : runCount - triggerCount;
      return { correct: tally.correct + correctRuns, total: tally.total + runCount };
    },
    { correct: 0, total: 0 },
  );
}
|
|
54
|
+
|
|
55
|
+
/**
 * Write a run-level summary and one line per query to stderr.
 *
 * Counts are over individual runs: triggers on `should_trigger` items are true
 * positives, triggers on the other items are false positives. Precision and
 * recall are reported as 100% when their denominator is zero; accuracy as 0%.
 *
 * @param {string} label - Prefix for the summary line.
 * @param {Array<{query: *, should_trigger: boolean, triggers?: number, runs?: number, pass: boolean}>} results
 * @param {number} [elapsedSeconds=0] - Elapsed time shown with one decimal.
 * @returns {void}
 */
function printEvalStats(label, results, elapsedSeconds = 0) {
  const sumOf = (items, field) => items.reduce((sum, item) => sum + Number(item[field] ?? 0), 0);

  const positives = results.filter((item) => item.should_trigger);
  const negatives = results.filter((item) => !item.should_trigger);

  const tp = sumOf(positives, "triggers");
  const fn = sumOf(positives, "runs") - tp;
  const fp = sumOf(negatives, "triggers");
  const tn = sumOf(negatives, "runs") - fp;
  const total = tp + tn + fp + fn;

  let precision = 1;
  if (tp + fp > 0) {
    precision = tp / (tp + fp);
  }
  let recall = 1;
  if (tp + fn > 0) {
    recall = tp / (tp + fn);
  }
  let accuracy = 0;
  if (total > 0) {
    accuracy = (tp + tn) / total;
  }

  process.stderr.write(`${label}: ${tp + tn}/${total} correct, precision=${Math.round(precision * 100)}% recall=${Math.round(recall * 100)}% accuracy=${Math.round(accuracy * 100)}% (${elapsedSeconds.toFixed(1)}s)\n`);
  results.forEach((result) => {
    process.stderr.write(` [${result.pass ? "PASS" : "FAIL"}] rate=${result.triggers}/${result.runs} expected=${result.should_trigger}: ${String(result.query).slice(0, 60)}\n`);
  });
}
|
|
73
|
+
|
|
74
|
+
/**
 * Iteratively evaluate and improve a skill description.
 *
 * Each iteration evaluates the current description on the train and test
 * queries together, records the outcome in `history`, optionally refreshes a
 * live HTML report, and then asks `improveDescription` for a new description
 * using train results only (keys starting with `test_` are stripped from the
 * history it sees). The loop stops when every train query passes or after
 * `maxIterations`.
 *
 * The returned "best" entry is the iteration with the most passing test
 * queries (or train queries when there is no test set); ties keep the earliest.
 *
 * @param {object} options
 * @param {object[]} options.evalSet - Eval items (`query`, `should_trigger`).
 * @param {string} options.skillPath - Skill location passed to `parseSkillFile`.
 * @param {string|null} [options.descriptionOverride=null] - Initial description override.
 * @param {number} [options.numWorkers=10]
 * @param {number} [options.timeout=30]
 * @param {number} [options.maxIterations=5]
 * @param {number} [options.runsPerQuery=3]
 * @param {number} [options.triggerThreshold=0.5]
 * @param {number} [options.holdout=0.4] - Test fraction; no split when not > 0.
 * @param {string} [options.model]
 * @param {boolean} [options.verbose=false] - Print per-iteration stats to stderr.
 * @param {string|null} [options.liveReportPath=null] - File rewritten after each iteration.
 * @param {string|null} [options.logDir=null] - Forwarded to `improveDescription`.
 * @returns {Promise<object>} Summary with exit reason, best/final descriptions, scores and history.
 */
export async function runLoop({
  evalSet,
  skillPath,
  descriptionOverride = null,
  numWorkers = 10,
  timeout = 30,
  maxIterations = 5,
  runsPerQuery = 3,
  triggerThreshold = 0.5,
  holdout = 0.4,
  model,
  verbose = false,
  liveReportPath = null,
  logDir = null,
}) {
  const { name, description: originalDescription, content } = await parseSkillFile(skillPath);
  let currentDescription = descriptionOverride ?? originalDescription;
  const { trainSet, testSet } = holdout > 0 ? splitEvalSet(evalSet, holdout) : { trainSet: evalSet, testSet: [] };
  const hasTestSet = testSet.length > 0;
  // The query list and the train-membership lookup never change between
  // iterations, so build them once.
  const allQueries = [...trainSet, ...testSet];
  const trainQuerySet = new Set(trainSet.map((item) => item.query));
  const history = [];
  let exitReason = "unknown";

  for (let iteration = 1; iteration <= maxIterations; iteration += 1) {
    if (verbose) {
      process.stderr.write(`\n${"=".repeat(60)}\nIteration ${iteration}/${maxIterations}\nDescription: ${currentDescription}\n${"=".repeat(60)}\n`);
    }
    const evalStart = Date.now();
    const allResults = await runEval(
      allQueries,
      name,
      currentDescription,
      numWorkers,
      timeout,
      findProjectRoot(),
      runsPerQuery,
      triggerThreshold,
      model ?? null,
    );
    const evalElapsed = (Date.now() - evalStart) / 1000;
    // Results are attributed to train or test by query text.
    const trainResultList = allResults.results.filter((item) => trainQuerySet.has(item.query));
    const testResultList = allResults.results.filter((item) => !trainQuerySet.has(item.query));
    const trainSummary = summarizeResults(trainResultList);
    const testSummary = hasTestSet ? summarizeResults(testResultList) : null;
    history.push({
      iteration,
      description: currentDescription,
      train_passed: trainSummary.passed,
      train_failed: trainSummary.failed,
      train_total: trainSummary.total,
      train_results: trainResultList,
      test_passed: testSummary?.passed ?? null,
      test_failed: testSummary?.failed ?? null,
      test_total: testSummary?.total ?? null,
      test_results: testSummary ? testResultList : null,
      passed: trainSummary.passed,
      failed: trainSummary.failed,
      total: trainSummary.total,
      results: trainResultList,
    });
    if (liveReportPath) {
      const partialOutput = {
        original_description: originalDescription,
        best_description: currentDescription,
        best_score: "in progress",
        iterations_run: history.length,
        holdout,
        train_size: trainSet.length,
        test_size: testSet.length,
        history,
      };
      await writeFile(liveReportPath, generateHtml(partialOutput, { autoRefresh: true, skillName: name }));
    }
    if (verbose) {
      printEvalStats("Train", trainResultList, evalElapsed);
      if (testSummary) {
        printEvalStats("Test ", testResultList, 0);
      }
    }
    if (trainSummary.failed === 0) {
      exitReason = `all_passed (iteration ${iteration})`;
      break;
    }
    if (iteration === maxIterations) {
      exitReason = `max_iterations (${maxIterations})`;
      break;
    }
    // Hide test-set outcomes from the improver so it cannot tune against them.
    const blindedHistory = history.map((item) => Object.fromEntries(Object.entries(item).filter(([key]) => !key.startsWith("test_"))));
    currentDescription = await improveDescription({
      skillName: name,
      skillContent: content,
      currentDescription,
      evalResults: { results: trainResultList, summary: trainSummary },
      history: blindedHistory,
      model,
      logDir,
      iteration,
    });
  }

  const scoreKey = hasTestSet ? "test_passed" : "train_passed";
  const best = history.reduce((bestItem, item) => ((item[scoreKey] ?? -1) > (bestItem?.[scoreKey] ?? -1) ? item : bestItem), null);
  // Fall back to 0/0 when no iteration ran, instead of "undefined/undefined".
  const bestTrainScore = `${best?.train_passed ?? 0}/${best?.train_total ?? 0}`;
  const bestTestScore = hasTestSet ? `${best?.test_passed ?? 0}/${best?.test_total ?? 0}` : null;
  return {
    exit_reason: exitReason,
    original_description: originalDescription,
    best_description: best?.description ?? currentDescription,
    best_score: bestTestScore ?? bestTrainScore,
    best_train_score: bestTrainScore,
    best_test_score: bestTestScore,
    final_description: currentDescription,
    iterations_run: history.length,
    holdout,
    train_size: trainSet.length,
    test_size: testSet.length,
    history,
  };
}
|
|
193
|
+
|
|
194
|
+
/**
 * Parse command-line flags into a plain object.
 *
 * Recognizes `--name` long flags and single-letter `-x` short flags (so the
 * advertised `-h` works). A flag followed by a value stores that value as a
 * string; a flag followed by another flag, or by nothing, stores `true`.
 * Anything else (positional arguments) is ignored.
 *
 * @param {string[]} args - Arguments after the script path.
 * @returns {Object<string, string|boolean>} Map of flag name (without dashes) to value.
 */
function parseArgs(args) {
  const isShortFlag = (token) => /^-[A-Za-z]$/.test(token);
  const isFlag = (token) => token.startsWith("--") || isShortFlag(token);

  const flags = {};
  for (let i = 0; i < args.length; i += 1) {
    const token = args[i];
    if (!isFlag(token)) continue;
    const key = token.startsWith("--") ? token.slice(2) : token.slice(1);
    const next = args[i + 1];
    // Only a missing argument or another flag means "no value"; an explicit
    // empty string is a real value and must not be turned into `true`.
    if (next === undefined || isFlag(next)) {
      flags[key] = true;
      continue;
    }
    flags[key] = next;
    i += 1;
  }
  return flags;
}
|
|
210
|
+
|
|
211
|
+
/**
 * Command-line entry point for the description optimization loop.
 *
 * Parses flags, loads the eval set and skill metadata, optionally creates a
 * live HTML report and opens it in the browser, runs `runLoop`, prints the
 * result as JSON on stdout, and writes `results.json` / `report.html` into a
 * timestamped sub-directory when `--results-dir` is given. Prints help and
 * sets exit code 1 when `--skill-path` or `--eval-set` is missing.
 *
 * @param {string[]} [argv=process.argv] - Full argument vector.
 * @returns {Promise<void>}
 */
async function main(argv = process.argv) {
  const args = parseArgs(argv.slice(2));
  if (args.help || args.h) {
    printHelp();
    return;
  }
  if (!args["skill-path"] || !args["eval-set"]) {
    printHelp();
    process.exitCode = 1;
    return;
  }
  const evalSet = JSON.parse(await readFile(args["eval-set"], "utf8"));
  const skillPath = resolveSkillPath(args["skill-path"]);
  const { name } = await parseSkillFile(skillPath);
  // A bare `--report` parses to `true`; passing that to path.resolve would
  // throw, so treat anything that is not a string as the default mode.
  const reportMode = typeof args.report === "string" ? args.report : "auto";
  let liveReportPath = null;
  if (reportMode !== "none") {
    liveReportPath = reportMode === "auto"
      ? path.join(os.tmpdir(), `skill_description_report_${path.basename(skillPath)}_${Date.now()}.html`)
      : path.resolve(reportMode);
    await writeFile(liveReportPath, "<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>");
    await openInBrowser(liveReportPath);
  }
  // Colons are replaced because the timestamp is used as a directory name.
  const resultsDir = args["results-dir"] ? path.join(path.resolve(args["results-dir"]), new Date().toISOString().replaceAll(":", "-")) : null;
  if (resultsDir) {
    await mkdir(resultsDir, { recursive: true });
  }
  const output = await runLoop({
    evalSet,
    skillPath,
    descriptionOverride: args.description ?? null,
    numWorkers: Number(args["num-workers"] ?? 10),
    timeout: Number(args.timeout ?? 30),
    maxIterations: Number(args["max-iterations"] ?? 5),
    runsPerQuery: Number(args["runs-per-query"] ?? 3),
    triggerThreshold: Number(args["trigger-threshold"] ?? 0.5),
    holdout: Number(args.holdout ?? 0.4),
    model: args.model,
    verbose: Boolean(args.verbose),
    liveReportPath,
    logDir: resultsDir ? path.join(resultsDir, "logs") : null,
  });
  const outputJson = `${JSON.stringify(output, null, 2)}\n`;
  process.stdout.write(outputJson);
  if (resultsDir) {
    await writeFile(path.join(resultsDir, "results.json"), outputJson);
  }
  if (liveReportPath || resultsDir) {
    // Render the final report once and reuse it for both destinations.
    const finalHtml = generateHtml(output, { autoRefresh: false, skillName: name });
    if (liveReportPath) {
      await writeFile(liveReportPath, finalHtml);
    }
    if (resultsDir) {
      await writeFile(path.join(resultsDir, "report.html"), `${finalHtml}\n`);
    }
  }
}
|
|
265
|
+
|
|
266
|
+
/**
 * Write the run_loop usage text to stdout.
 *
 * @returns {void}
 */
function printHelp() {
  const helpLines = [
    "Usage: node run_loop.mjs --skill-path <skill-dir|SKILL.md> --eval-set <eval-set.json> [options]",
    "",
    "Options:",
    " --description <text> Override the initial description",
    " --num-workers <n> Number of concurrent workers (default: 10)",
    " --timeout <seconds> Timeout per query (default: 30)",
    " --max-iterations <n> Maximum optimization iterations (default: 5)",
    " --runs-per-query <n> Number of runs per query (default: 3)",
    " --trigger-threshold <ratio> Pass threshold for positive prompts (default: 0.5)",
    " --holdout <ratio> Fraction reserved for test set (default: 0.4)",
    " --report <auto|none|path> Live report mode or output path (default: auto)",
    " --results-dir <dir> Directory for loop artifacts",
    " --model <id> Optional model override",
    " --verbose Print per-iteration stats",
    " -h, --help Show this help",
    "",
    "The --skill-path argument may point to either the skill directory or its SKILL.md file.",
    "",
  ];
  const helpText = helpLines.join("\n");
  process.stdout.write(helpText);
}
|
|
290
|
+
|
|
291
|
+
// Run the CLI only when this module is the script Node was started with
// (process.argv[1] resolves to this file); importing it has no side effects.
const entryPath = fileURLToPath(import.meta.url);
const invokedScript = process.argv[1];
if (invokedScript && path.resolve(invokedScript) === entryPath) {
  main().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    process.stderr.write(`${message}\n`);
    process.exitCode = 1;
  });
}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readFile } from "node:fs/promises";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import YAML from "yaml";
|
|
6
|
+
|
|
7
|
+
// Top-level keys permitted in SKILL.md frontmatter; any other key makes
// validateFrontmatter report the file as invalid.
const ALLOWED_PROPERTIES = new Set(["name", "description", "license", "allowed-tools", "metadata", "compatibility"]);
|
|
15
|
+
|
|
16
|
+
/**
 * Read the SKILL.md file located directly inside a skill directory.
 *
 * @param {string} skillPath - Path to the skill directory.
 * @returns {Promise<string>} The file contents decoded as UTF-8.
 */
export async function readSkillMarkdown(skillPath) {
  const markdown = await readFile(path.join(skillPath, "SKILL.md"), "utf8");
  return markdown;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Normalize a user-supplied skill location to an absolute skill directory.
 *
 * When the path's final component is `SKILL.md` (compared case-insensitively),
 * the containing directory is returned; otherwise the resolved path itself.
 *
 * @param {string} skillPath - Skill directory or path to its SKILL.md file.
 * @returns {string} Absolute path of the skill directory.
 */
export function resolveSkillPath(skillPath) {
  const absolutePath = path.resolve(skillPath);
  const pointsAtSkillFile = path.basename(absolutePath).toLowerCase() === "skill.md";
  return pointsAtSkillFile ? path.dirname(absolutePath) : absolutePath;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Extract and parse the YAML frontmatter at the top of a SKILL.md document.
 *
 * The frontmatter is the text between an opening `---` line at the very start
 * of the content and the next line beginning with `---`. Both LF and CRLF line
 * endings are accepted.
 *
 * @param {string} content - Full SKILL.md text.
 * @returns {object} The parsed frontmatter mapping.
 * @throws {Error} When the content does not start with `---`, when no closing
 *   delimiter is found, or when the parsed YAML is not a mapping. Errors
 *   raised by the YAML parser itself also propagate.
 */
export function parseSkillFrontmatter(content) {
  if (!content.startsWith("---")) {
    throw new Error("No YAML frontmatter found");
  }

  // `\r?\n` so files saved with Windows line endings are not rejected.
  const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);
  if (!match) {
    throw new Error("Invalid frontmatter format");
  }

  const frontmatter = YAML.parse(match[1]);
  if (!frontmatter || typeof frontmatter !== "object" || Array.isArray(frontmatter)) {
    throw new Error("Frontmatter must be a YAML dictionary");
  }

  return frontmatter;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Load a skill's SKILL.md and return its text together with parsed metadata.
 *
 * @param {string} skillPath - Skill directory or path to its SKILL.md file.
 * @returns {Promise<{content: string, frontmatter: object, name: string, description: string}>}
 *   `name` and `description` are taken from the frontmatter and fall back to
 *   an empty string when the value there is not a string.
 */
export async function parseSkillFile(skillPath) {
  const skillDir = resolveSkillPath(skillPath);
  const content = await readSkillMarkdown(skillDir);
  const frontmatter = parseSkillFrontmatter(content);
  const stringOrEmpty = (value) => (typeof value === "string" ? value : "");

  return {
    content,
    frontmatter,
    name: stringOrEmpty(frontmatter.name),
    description: stringOrEmpty(frontmatter.description),
  };
}
|
|
58
|
+
|
|
59
|
+
/**
 * Check parsed SKILL.md frontmatter against the skill metadata rules.
 *
 * Checks run in this order and the first failure is returned:
 *   1. only allowed top-level keys are present;
 *   2. `name` and `description` keys exist;
 *   3. `name` is a string and, when non-blank after trimming, is kebab-case
 *      with no leading/trailing/double hyphens and at most 64 characters;
 *   4. `description` is a string and, when non-blank after trimming, has no
 *      angle brackets and at most 1024 characters;
 *   5. `compatibility`, when present and not null/empty, is a string of at
 *      most 500 characters.
 *
 * @param {object} frontmatter - Parsed frontmatter mapping.
 * @returns {{valid: boolean, message: string}} Verdict with a human-readable message.
 */
export function validateFrontmatter(frontmatter) {
  const invalid = (message) => ({ valid: false, message });

  const unexpectedKeys = [];
  for (const key of Object.keys(frontmatter)) {
    if (!ALLOWED_PROPERTIES.has(key)) {
      unexpectedKeys.push(key);
    }
  }
  if (unexpectedKeys.length > 0) {
    return invalid(
      `Unexpected key(s) in SKILL.md frontmatter: ${unexpectedKeys.sort().join(", ")}. ` +
        `Allowed properties are: ${Array.from(ALLOWED_PROPERTIES).sort().join(", ")}`,
    );
  }

  if (!("name" in frontmatter)) {
    return invalid("Missing 'name' in frontmatter");
  }
  if (!("description" in frontmatter)) {
    return invalid("Missing 'description' in frontmatter");
  }

  const { name } = frontmatter;
  if (typeof name !== "string") {
    return invalid(`Name must be a string, got ${typeof name}`);
  }
  const trimmedName = name.trim();
  // The format rules are applied only to a non-blank name.
  if (trimmedName) {
    if (!/^[a-z0-9-]+$/.test(trimmedName)) {
      return invalid(`Name '${trimmedName}' should be kebab-case (lowercase letters, digits, and hyphens only)`);
    }
    const badHyphens = trimmedName.startsWith("-") || trimmedName.endsWith("-") || trimmedName.includes("--");
    if (badHyphens) {
      return invalid(`Name '${trimmedName}' cannot start/end with hyphen or contain consecutive hyphens`);
    }
    if (trimmedName.length > 64) {
      return invalid(`Name is too long (${trimmedName.length} characters). Maximum is 64 characters.`);
    }
  }

  const { description } = frontmatter;
  if (typeof description !== "string") {
    return invalid(`Description must be a string, got ${typeof description}`);
  }
  const trimmedDescription = description.trim();
  if (trimmedDescription) {
    if (trimmedDescription.includes("<") || trimmedDescription.includes(">")) {
      return invalid("Description cannot contain angle brackets (< or >)");
    }
    if (trimmedDescription.length > 1024) {
      return invalid(`Description is too long (${trimmedDescription.length} characters). Maximum is 1024 characters.`);
    }
  }

  const { compatibility } = frontmatter;
  const hasCompatibility = compatibility !== undefined && compatibility !== null && compatibility !== "";
  if (hasCompatibility) {
    if (typeof compatibility !== "string") {
      return invalid(`Compatibility must be a string, got ${typeof compatibility}`);
    }
    if (compatibility.length > 500) {
      return invalid(`Compatibility is too long (${compatibility.length} characters). Maximum is 500 characters.`);
    }
  }

  return { valid: true, message: "Skill is valid!" };
}
|