@cydm/pie 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +151 -9
- package/dist/builtin/extensions/ask-user/index.js +2 -3
- package/dist/builtin/extensions/kimi-attachments/index.js +3 -3
- package/dist/builtin/extensions/plan-mode/index.js +85 -87
- package/dist/builtin/extensions/subagent/index.js +73 -8
- package/dist/builtin/extensions/todo/index.js +51 -22
- package/dist/builtin/skills/browser-tools/CHANGELOG.md +2 -44
- package/dist/builtin/skills/browser-tools/README.md +10 -99
- package/dist/builtin/skills/browser-tools/SKILL.md +21 -174
- package/dist/builtin/skills/browser-tools/package.json +6 -13
- package/dist/builtin/skills/browser-tools/playwright-cli.js +24 -0
- package/dist/builtin/skills/pie-unity-rpc/SKILL.md +121 -0
- package/dist/builtin/skills/pie-unity-rpc/pie-unity-rpc.js +417 -0
- package/dist/builtin/skills/skill-creator/SKILL.md +17 -17
- package/dist/builtin/skills/skill-creator/eval-viewer/generate_review.mjs +285 -0
- package/dist/builtin/skills/skill-creator/eval-viewer/viewer.html +1 -1
- package/dist/builtin/skills/skill-creator/scripts/aggregate_benchmark.mjs +271 -0
- package/dist/builtin/skills/skill-creator/scripts/claude_cli.mjs +115 -0
- package/dist/builtin/skills/skill-creator/scripts/generate_report.mjs +224 -0
- package/dist/builtin/skills/skill-creator/scripts/improve_description.mjs +198 -0
- package/dist/builtin/skills/skill-creator/scripts/package_skill.mjs +132 -0
- package/dist/builtin/skills/skill-creator/scripts/pie_runner.mjs +115 -0
- package/dist/builtin/skills/skill-creator/scripts/quick_validate.mjs +44 -0
- package/dist/builtin/skills/skill-creator/scripts/run_eval.mjs +169 -0
- package/dist/builtin/skills/skill-creator/scripts/run_loop.mjs +297 -0
- package/dist/builtin/skills/skill-creator/scripts/skill_metadata.mjs +134 -0
- package/dist/chunks/{chunk-MWFBYJOI.js → chunk-A5JSJAPK.js} +3973 -1313
- package/dist/chunks/chunk-BHNULR7U.js +7991 -0
- package/dist/chunks/chunk-GDTN4UPJ.js +701 -0
- package/dist/chunks/{src-EGWRDMLB.js → src-3X3HBT2G.js} +1 -2
- package/dist/chunks/typescript-GSKWJIO4.js +210747 -0
- package/dist/cli.js +14664 -11973
- package/models.schema.json +238 -0
- package/package.json +34 -8
- package/dist/builtin/skills/browser-tools/browser-content.js +0 -103
- package/dist/builtin/skills/browser-tools/browser-cookies.js +0 -35
- package/dist/builtin/skills/browser-tools/browser-eval.js +0 -49
- package/dist/builtin/skills/browser-tools/browser-hn-scraper.js +0 -108
- package/dist/builtin/skills/browser-tools/browser-nav.js +0 -44
- package/dist/builtin/skills/browser-tools/browser-pick.js +0 -162
- package/dist/builtin/skills/browser-tools/browser-screenshot.js +0 -34
- package/dist/builtin/skills/browser-tools/browser-start.js +0 -86
- package/dist/builtin/skills/skill-creator/eval-viewer/generate_review.py +0 -471
- package/dist/builtin/skills/skill-creator/scripts/__init__.py +0 -0
- package/dist/builtin/skills/skill-creator/scripts/aggregate_benchmark.py +0 -401
- package/dist/builtin/skills/skill-creator/scripts/generate_report.py +0 -326
- package/dist/builtin/skills/skill-creator/scripts/improve_description.py +0 -247
- package/dist/builtin/skills/skill-creator/scripts/package_skill.py +0 -136
- package/dist/builtin/skills/skill-creator/scripts/quick_validate.py +0 -103
- package/dist/builtin/skills/skill-creator/scripts/run_eval.py +0 -310
- package/dist/builtin/skills/skill-creator/scripts/run_loop.py +0 -328
- package/dist/builtin/skills/skill-creator/scripts/utils.py +0 -47
- package/dist/chunks/capabilities-FENCOHVA.js +0 -9
- package/dist/chunks/chunk-JYBXCEJJ.js +0 -315
- package/dist/chunks/chunk-RID3574D.js +0 -2718
- package/dist/chunks/chunk-TBJ25UWB.js +0 -3657
- package/dist/chunks/chunk-XZXLO7YB.js +0 -322
- package/dist/chunks/file-logger-AL5VVZHH.js +0 -22
- package/dist/chunks/src-WRUACRN2.js +0 -132
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readFileSync, existsSync } from "node:fs";
|
|
4
|
+
import { homedir } from "node:os";
|
|
5
|
+
import path from "node:path";
|
|
6
|
+
import { pathToFileURL } from "node:url";
|
|
7
|
+
|
|
8
|
+
// Registry file listing pie-unity instances, located under the user's home directory.
const REGISTRY_FILE = path.join(homedir(), ".pie-unity", "registry.json");

// Default pause between request attempts, in milliseconds.
const DEFAULT_WAIT_MS = 600;

// Default number of request attempts.
const DEFAULT_RETRIES = 30;

// An instance whose lastSeenUnix is within this many seconds of "now" counts as active.
const ACTIVE_INSTANCE_MAX_AGE_SEC = 120;
|
|
12
|
+
|
|
13
|
+
/**
 * Error thrown when a single pie-unity instance cannot be chosen.
 *
 * Besides the human-readable message it carries a machine-readable `code`
 * and a `details` object that the CLI entry point serializes to stderr.
 */
class InstanceSelectionError extends Error {
  name = "InstanceSelectionError";

  /**
   * @param {string} code - Machine-readable error code.
   * @param {string} message - Human-readable description.
   * @param {object} [details] - Extra context attached to the error.
   */
  constructor(code, message, details = {}) {
    super(message);
    Object.assign(this, { code, details });
  }
}
|
|
21
|
+
|
|
22
|
+
/**
 * Parse a process-style argv array into a command name and a flag map.
 *
 * The first two entries of `argv` (runtime and script path) are skipped.
 * The first remaining entry is the command; when it is missing, empty, or
 * is itself a `--flag`, the command defaults to "instances" and flag parsing
 * starts at that entry so the flag is not lost.
 *
 * Flags are accepted as `--key value`, `--key=value`, or bare `--key`
 * (stored as the string "true"). The `=` form allows values that themselves
 * start with `--`. Entries that do not start with `--` and are not consumed
 * as a value are ignored.
 *
 * @param {string[]} argv - Full argument vector, e.g. `process.argv`.
 * @returns {{ command: string, flags: Record<string, string> }}
 */
export function parseArgs(argv) {
  const args = argv.slice(2);
  const first = args[0] ?? "";
  // A leading flag means the command was omitted, not that the flag is the command.
  const commandOmitted = first.startsWith("--");
  const command = commandOmitted ? "instances" : first || "instances";
  const flags = {};

  for (let i = commandOmitted ? 0 : 1; i < args.length; i += 1) {
    const item = args[i];
    if (!item.startsWith("--")) continue;

    const key = item.slice(2);
    const equalsAt = key.indexOf("=");
    if (equalsAt > 0) {
      // `--key=value`: everything after the first "=" is the value, verbatim.
      flags[key.slice(0, equalsAt)] = key.slice(equalsAt + 1);
      continue;
    }

    const next = args[i + 1];
    if (!next || next.startsWith("--")) {
      flags[key] = "true";
      continue;
    }
    flags[key] = next;
    i += 1; // The value was consumed together with its flag.
  }

  return { command, flags };
}
|
|
40
|
+
|
|
41
|
+
/**
 * Read the instance registry from disk.
 *
 * Returns an empty list when the file does not exist, cannot be read, or does
 * not contain valid JSON. A leading byte-order mark is stripped before
 * parsing. Falsy entries are dropped and the rest is passed through
 * `dedupeInstances`.
 *
 * @param {string} [filePath] - Registry file to read; defaults to REGISTRY_FILE.
 * @returns {object[]} The de-duplicated instance records.
 */
export function loadRegistry(filePath = REGISTRY_FILE) {
  if (!existsSync(filePath)) return [];

  try {
    const text = readFileSync(filePath, "utf8");
    const parsed = JSON.parse(text.replace(/^\uFEFF/, ""));
    const entries = Array.isArray(parsed?.instances) ? parsed.instances : [];
    return dedupeInstances(entries.filter(Boolean));
  } catch {
    // Best effort: an unreadable or malformed registry is treated as empty.
    return [];
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Narrow a list of instances to the recently seen ones.
 *
 * An instance is recent when its `lastSeenUnix` is positive and no more than
 * ACTIVE_INSTANCE_MAX_AGE_SEC seconds before `nowUnix`. When nothing is
 * recent, the original list is returned unchanged.
 *
 * @param {object[]} instances - Instance records.
 * @param {number} [nowUnix] - Current time in Unix seconds.
 * @returns {object[]} The recent instances, or `instances` itself if none are recent.
 */
export function getActiveInstances(instances, nowUnix = Math.floor(Date.now() / 1000)) {
  const isRecent = (entry) => {
    const seenAt = Number(entry?.lastSeenUnix || 0);
    if (!(seenAt > 0)) return false;
    return nowUnix - seenAt <= ACTIVE_INSTANCE_MAX_AGE_SEC;
  };

  const fresh = instances.filter(isRecent);
  return fresh.length === 0 ? instances : fresh;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Report whether two paths end in the same, non-empty final segment.
 *
 * @param {string} left - First path.
 * @param {string} right - Second path.
 * @returns {boolean} True when both basenames are equal and not empty.
 */
function basenameMatch(left, right) {
  const leftBase = path.basename(left || "");
  if (leftBase === "") return false;
  return leftBase === path.basename(right || "");
}
|
|
67
|
+
|
|
68
|
+
/**
 * Score how well a candidate project path matches a requested project hint.
 *
 * Both paths are normalized with `normalizePath` and have trailing slashes
 * removed before comparison, so "C:/proj/" and "C:/proj" count as the same
 * path rather than only as a basename match.
 *
 * Scores:
 * - 1000 when the paths are identical;
 * - 700 when only the final path segment matches;
 * - 100–500 when one path is a directory prefix of the other (closer in
 *   length scores higher);
 * - 0 when either path is empty or nothing matches.
 *
 * @param {string} requestedProject - Project hint supplied by the caller.
 * @param {string} candidateProject - Project path of a registered instance.
 * @returns {number} The match score; higher is better, 0 means no match.
 */
function computeProjectMatchScore(requestedProject, candidateProject) {
  // Drop trailing slashes but keep a lone "/" so the root path stays non-empty.
  const canonical = (value) => {
    const normalized = normalizePath(value || "");
    let end = normalized.length;
    while (end > 1 && normalized[end - 1] === "/") end -= 1;
    return normalized.slice(0, end);
  };

  const requested = canonical(requestedProject);
  const candidate = canonical(candidateProject);
  if (!requested || !candidate) return 0;
  if (requested === candidate) return 1000;
  if (basenameMatch(requested, candidate)) return 700;

  // Compare with a trailing slash so "/a/bc" is not treated as inside "/a/b".
  const requestedWithSlash = requested.endsWith("/") ? requested : `${requested}/`;
  const candidateWithSlash = candidate.endsWith("/") ? candidate : `${candidate}/`;
  if (requestedWithSlash.startsWith(candidateWithSlash) || candidateWithSlash.startsWith(requestedWithSlash)) {
    const distance = Math.abs(requested.length - candidate.length);
    return 500 - Math.min(distance, 400);
  }

  return 0;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Reduce an instance record to the fields reported in selection errors.
 *
 * Missing or falsy fields become "" (strings) or 0 (numbers); the project
 * path is run through `normalizePath`.
 *
 * @param {object} item - Instance record; may be null or undefined.
 * @returns {{ instanceId: string, projectPath: string, projectName: string, mode: string, port: number, lastSeenUnix: number }}
 */
function formatCandidate(item) {
  const text = (value) => String(value || "");
  const numeric = (value) => Number(value || 0);

  return {
    instanceId: text(item?.instanceId),
    projectPath: normalizePath(item?.projectPath || ""),
    projectName: text(item?.projectName),
    mode: text(item?.mode),
    port: numeric(item?.port),
    lastSeenUnix: numeric(item?.lastSeenUnix),
  };
}
|
|
95
|
+
|
|
96
|
+
/**
 * Build an InstanceSelectionError describing a failed instance lookup.
 *
 * The error's details record what was requested (project, instance, port)
 * and the candidates that were considered, each passed through
 * `formatCandidate`. `reason` is "ambiguous" for the
 * PIE_UNITY_INSTANCE_AMBIGUOUS code and "no_match" for any other code.
 *
 * @param {string} code - Machine-readable error code.
 * @param {string} message - Human-readable message.
 * @param {object} flags - Parsed CLI flags (`project`, `instance`, `port` are read).
 * @param {object[]} candidates - Instance records to list in the error.
 * @returns {InstanceSelectionError} The populated error (not thrown here).
 */
function buildSelectionError(code, message, flags, candidates) {
  const isAmbiguous = code === "PIE_UNITY_INSTANCE_AMBIGUOUS";
  const details = {
    reason: isAmbiguous ? "ambiguous" : "no_match",
    requestedProject: normalizePath(flags.project || ""),
    requestedInstance: String(flags.instance || "").trim(),
    requestedPort: Number(flags.port || 0) || 0,
    candidates: candidates.map(formatCandidate),
  };
  return new InstanceSelectionError(code, message, details);
}
|
|
105
|
+
|
|
106
|
+
/**
 * Choose the pie-unity instance a command should talk to.
 *
 * Candidates are first narrowed with `getActiveInstances`. Selection then
 * proceeds in this order:
 * 1. `--port` greater than 0: return a synthetic record for that port without
 *    consulting the registry.
 * 2. `--instance`: return the candidate with that id, or throw NOT_FOUND.
 * 3. `--project`: rank candidates with `computeProjectMatchScore`; return the
 *    single best match, or throw AMBIGUOUS when several share the top score.
 *    When nothing scores above 0, fall through to step 4.
 * 4. No usable hint: return the only candidate if there is exactly one,
 *    throw AMBIGUOUS if there are several, and NOT_FOUND if there are none.
 *
 * @param {object[]} instances - Instance records, e.g. from `loadRegistry`.
 * @param {object} flags - Parsed CLI flags (`project`, `instance`, `port` are read).
 * @returns {object} The selected instance record.
 * @throws {InstanceSelectionError} When no single instance can be chosen.
 */
export function selectInstance(instances, flags) {
  const candidates = getActiveInstances(instances);
  const projectHint = normalizePath(flags.project || "");
  const wantedId = String(flags.instance || "").trim();
  const port = Number(flags.port || 0);

  // An explicit port bypasses the registry entirely.
  if (port > 0) {
    return {
      instanceId: wantedId || `port_${port}`,
      projectPath: projectHint,
      port,
      mode: "unknown",
    };
  }

  if (wantedId) {
    const found = candidates.find((entry) => String(entry.instanceId || "") === wantedId);
    if (!found) {
      throw buildSelectionError(
        "PIE_UNITY_INSTANCE_NOT_FOUND",
        `No pie-unity instance found for --instance ${wantedId}.`,
        flags,
        candidates,
      );
    }
    return found;
  }

  if (projectHint && candidates.length > 0) {
    // Highest score first; ties are ordered by most recently seen.
    const ranked = candidates
      .map((item) => ({ item, score: computeProjectMatchScore(projectHint, item.projectPath || "") }))
      .filter(({ score }) => score > 0)
      .sort((a, b) => b.score - a.score || Number(b.item.lastSeenUnix || 0) - Number(a.item.lastSeenUnix || 0));

    if (ranked.length > 0) {
      const topScore = ranked[0].score;
      const top = ranked.filter(({ score }) => score === topScore).map(({ item }) => item);
      if (top.length > 1) {
        throw buildSelectionError(
          "PIE_UNITY_INSTANCE_AMBIGUOUS",
          `Multiple pie-unity instances match project hint ${projectHint}. Pass --instance or --port.`,
          flags,
          top,
        );
      }
      return top[0];
    }
  }

  if (candidates.length === 1) {
    return candidates[0];
  }

  if (candidates.length > 1) {
    throw buildSelectionError(
      "PIE_UNITY_INSTANCE_AMBIGUOUS",
      "Multiple pie-unity instances found. Pass --instance, --port, or a more specific --project.",
      flags,
      candidates,
    );
  }

  const notFoundMessage = projectHint
    ? `No matching pie-unity instance found for project hint ${projectHint}.`
    : "No matching pie-unity instance found.";
  throw buildSelectionError("PIE_UNITY_INSTANCE_NOT_FOUND", notFoundMessage, flags, candidates);
}
|
|
179
|
+
|
|
180
|
+
/**
 * Fetch a URL and return its JSON body, retrying on failure.
 *
 * Each attempt fetches `url`, reads the body as text, and parses it as JSON
 * (an empty body yields `{}`). The HTTP status is checked before parsing, so
 * a non-2xx response is always reported as `HTTP <status>: <body>` even when
 * the body is not JSON. When `extractAvailability` returns a value for the
 * parsed body, the attempt is retried; on the final attempt that body is
 * returned as-is. Any thrown error is also retried until attempts run out.
 *
 * `options.waitMs` and `options.retries` may be numbers or numeric strings
 * (CLI flags are passed straight through). Missing, zero, or non-numeric
 * values fall back to DEFAULT_WAIT_MS / DEFAULT_RETRIES, and at least one
 * attempt is always made.
 *
 * @param {string} url - Request URL.
 * @param {object} [init] - Options forwarded to `fetch`.
 * @param {{ waitMs?: number|string, retries?: number|string }} [options]
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} The last error seen once all attempts have failed.
 */
export async function requestJson(url, init = {}, options = {}) {
  // Non-numeric input (e.g. `--retries abc`) would otherwise become NaN and skip the loop entirely.
  const pickNumber = (value, fallback) => {
    const parsed = Number(value || fallback);
    return Number.isFinite(parsed) ? parsed : fallback;
  };
  const waitMs = Math.max(0, pickNumber(options.waitMs, DEFAULT_WAIT_MS));
  const retries = Math.max(1, Math.floor(pickNumber(options.retries, DEFAULT_RETRIES)));

  let lastError;
  for (let attempt = 0; attempt < retries; attempt += 1) {
    const isLastAttempt = attempt >= retries - 1;
    try {
      const response = await fetch(url, init);
      const text = await response.text();

      // Check the status first so a non-JSON error page does not hide it behind a SyntaxError.
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${text}`);
      }

      const json = text ? JSON.parse(text) : {};

      const unavailable = extractAvailability(json);
      if (unavailable && !isLastAttempt) {
        lastError = new Error(unavailable.message || "pie-unity temporarily unavailable");
        await sleep(waitMs);
        continue;
      }

      // On the last attempt an "unavailable" body is still handed back to the caller.
      return json;
    } catch (error) {
      lastError = error;
      if (isLastAttempt) break;
      await sleep(waitMs);
    }
  }
  throw lastError || new Error("Request failed");
}
|
|
212
|
+
|
|
213
|
+
/**
 * Entry point for the command-line interface.
 *
 * Commands:
 * - `instances`: print the active registry entries, optionally filtered by
 *   `--project`; no instance is contacted.
 * - `health`, `manifest`: GET the matching endpoint (`manifest` forwards
 *   `--namespace` and `--name` as query parameters).
 * - `query`, `inspect`, `edit`, `log-read`, `script-run`: POST `--data` to a
 *   fixed tool endpoint.
 * - `tool`: POST `--data` to the tool named by `--tool`.
 * - `rpc`: POST `--data` to the RPC method named by `--method`; after
 *   `pie_chat.set_config` it waits via `settleChatConfig` before printing.
 *
 * Unknown commands are rejected before any instance is selected, so a typo
 * is reported as such rather than as an instance-selection failure.
 *
 * @param {string[]} [argv] - Argument vector; defaults to `process.argv`.
 * @returns {Promise<void>} Resolves once the result has been written to stdout.
 * @throws {Error} For an unknown command or a missing required flag.
 * @throws {InstanceSelectionError} When no single instance can be selected.
 */
export async function runCli(argv = process.argv) {
  const { command, flags } = parseArgs(argv);

  if (command === "instances") {
    const instances = getActiveInstances(loadRegistry());
    const filtered = flags.project
      ? instances.filter((item) => computeProjectMatchScore(flags.project, item.projectPath || "") > 0)
      : instances;
    writeJson({ service: "pie-unity", instances: filtered });
    return;
  }

  // Shorthand commands that POST --data to a fixed tool endpoint.
  const toolAliases = new Map([
    ["query", "unity_scene_query"],
    ["inspect", "unity_scene_object_inspect"],
    ["edit", "unity_scene_object_edit"],
    ["log-read", "unity_log_read"],
    ["script-run", "unity_script_run"],
  ]);
  const knownCommands = new Set(["health", "manifest", "tool", "rpc", ...toolAliases.keys()]);
  if (!knownCommands.has(command)) {
    throw new Error(`Unknown command: ${command}`);
  }

  const instance = selectInstance(loadRegistry(), flags);
  const baseUrl = `http://127.0.0.1:${instance.port}`;

  // The parsed flags double as request options (waitMs / retries).
  const call = async (route, init = {}) =>
    normalizeEnvelope(await requestJson(`${baseUrl}${route}`, init, flags));
  const callTool = (toolName) =>
    call(`/tool/${encodeURIComponent(toolName)}`, postJson(parseData(flags.data)));

  if (toolAliases.has(command)) {
    writeJson(await callTool(toolAliases.get(command)));
    return;
  }

  switch (command) {
    case "health": {
      writeJson(await call("/health"));
      return;
    }
    case "manifest": {
      const query = new URLSearchParams();
      if (flags.namespace) query.set("namespace", flags.namespace);
      if (flags.name) query.set("name", flags.name);
      const suffix = query.toString() ? `?${query.toString()}` : "";
      writeJson(await call(`/manifest${suffix}`));
      return;
    }
    case "tool": {
      const name = String(flags.tool || "").trim();
      if (!name) throw new Error("--tool is required");
      writeJson(await callTool(name));
      return;
    }
    case "rpc": {
      const name = String(flags.method || "").trim();
      if (!name) throw new Error("--method is required");
      const response = await call(`/rpc/${encodeURIComponent(name)}`, postJson(parseData(flags.data)));
      if (name === "pie_chat.set_config") {
        await settleChatConfig(baseUrl, flags, Number(response?.result?.configAppliedVersion || 0));
      }
      writeJson(response);
      return;
    }
    default:
      // Unreachable while knownCommands and the cases above stay in sync.
      throw new Error(`Unknown command: ${command}`);
  }
}
|
|
282
|
+
|
|
283
|
+
/**
 * Pull the availability payload out of a response body.
 *
 * Reads `serverAvailability`, falling back to `availability`. A string value
 * is parsed as JSON; if that fails it is wrapped as `{ message: <string> }`.
 * Non-string values are returned unchanged.
 *
 * @param {object} json - Parsed response body; may be null or undefined.
 * @returns {any} The availability payload, or null when neither field is set.
 */
function extractAvailability(json) {
  const value = json?.serverAvailability || json?.availability || "";
  if (!value) return null;
  if (typeof value !== "string") return value;

  try {
    return JSON.parse(value);
  } catch {
    return { message: value };
  }
}
|
|
295
|
+
|
|
296
|
+
/**
 * Decode JSON that arrives as strings inside a response envelope.
 *
 * Values that are not objects, or that have neither a `result` nor a
 * `serverAvailability` key, are returned untouched. Otherwise a shallow copy
 * is returned in which:
 * - a string `result` equal to "null" (after trimming) becomes null;
 * - a string `result` that looks like a JSON object or array is parsed;
 * - a non-blank string `serverAvailability` is parsed.
 * A string that fails to parse is left as it was.
 *
 * @param {any} json - Parsed response body.
 * @returns {any} The input itself, or a shallow copy with decoded fields.
 */
function normalizeEnvelope(json) {
  if (!json || typeof json !== "object") return json;

  const hasEnvelopeKey = "result" in json || "serverAvailability" in json;
  if (!hasEnvelopeKey) return json;

  // Parse `text`, keeping `fallback` when it is not valid JSON.
  const parseOr = (text, fallback) => {
    try {
      return JSON.parse(text);
    } catch {
      return fallback;
    }
  };

  const envelope = { ...json };
  const { result, serverAvailability } = envelope;

  if (typeof result === "string") {
    const body = result.trim();
    const looksLikeObject = body.startsWith("{") && body.endsWith("}");
    const looksLikeArray = body.startsWith("[") && body.endsWith("]");
    if (body === "null") {
      envelope.result = null;
    } else if (looksLikeObject || looksLikeArray) {
      envelope.result = parseOr(body, result);
    }
  }

  if (typeof serverAvailability === "string" && serverAvailability.trim()) {
    envelope.serverAvailability = parseOr(serverAvailability, serverAvailability);
  }

  return envelope;
}
|
|
324
|
+
|
|
325
|
+
/**
 * Parse the `--data` flag as JSON.
 *
 * @param {string} [input] - Raw flag value; a falsy value yields `{}`.
 * @returns {any} The parsed JSON value.
 * @throws {SyntaxError} When `input` is not valid JSON. The message names the
 *   `--data` flag and the original parser error is attached as `cause`.
 */
function parseData(input) {
  if (!input) return {};
  try {
    return JSON.parse(input);
  } catch (error) {
    // Keep the SyntaxError type, but say which flag was malformed.
    throw new SyntaxError(`--data must be valid JSON: ${error.message}`, { cause: error });
  }
}
|
|
329
|
+
|
|
330
|
+
/**
 * Build `fetch` options for a JSON POST request.
 *
 * @param {any} body - Payload to send; a falsy value is sent as `{}`.
 * @returns {{ method: string, headers: object, body: string }}
 */
function postJson(body) {
  const payload = JSON.stringify(body || {});
  const headers = { "Content-Type": "application/json" };
  return { method: "POST", headers, body: payload };
}
|
|
337
|
+
|
|
338
|
+
/**
 * Wait for a chat configuration change to take effect.
 *
 * Polls the `chat_get_state` tool endpoint until the reported
 * `configAppliedVersion` reaches `expectedVersion` (only when that is
 * positive), the state no longer reports `isBusy`, or the deadline passes.
 * In every case it then sleeps for the settle interval when that is positive.
 * Polling failures are ignored and simply lead to another poll.
 *
 * Timing is derived from `flags.configSettleMs` (default 500): the poll
 * interval is clamped to 100–250 ms and the deadline is four settle
 * intervals, with a minimum of 1500 ms.
 *
 * @param {string} baseUrl - Base URL of the selected instance.
 * @param {object} flags - Parsed CLI flags (`configSettleMs` is read).
 * @param {number} [expectedVersion] - Config version to wait for; 0 disables the version check.
 * @returns {Promise<void>}
 */
async function settleChatConfig(baseUrl, flags, expectedVersion = 0) {
  const settleMs = Math.max(0, Number(flags.configSettleMs || 500));
  const pollMs = Math.max(100, Math.min(settleMs || 200, 250));
  const deadline = Date.now() + Math.max(1500, settleMs * 4 || 1500);
  const stateUrl = `${baseUrl}/tool/${encodeURIComponent("chat_get_state")}`;

  while (Date.now() < deadline) {
    let settled = false;
    try {
      // NOTE(review): this request passes an empty init (no method or body), while the other
      // /tool/ requests in this file go through postJson — confirm the bridge accepts it.
      const raw = await requestJson(stateUrl, {}, { waitMs: pollMs, retries: 1 });
      const state = normalizeEnvelope(raw);
      const appliedVersion = Number(state?.result?.configAppliedVersion || 0);
      const versionApplied = expectedVersion > 0 && appliedVersion >= expectedVersion;
      settled = versionApplied || !state?.result?.isBusy;
    } catch {
      // Ignore transient polling failures while the bridge settles.
    }

    if (settled) break;
    await sleep(pollMs);
  }

  // Reached both on success and on timeout.
  if (settleMs > 0) {
    await sleep(settleMs);
  }
}
|
|
369
|
+
|
|
370
|
+
/**
 * Return a promise that resolves after a delay.
 *
 * @param {number} ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
373
|
+
|
|
374
|
+
/**
 * Convert a path to forward-slash form.
 *
 * @param {any} value - Path to convert; a falsy value is treated as "".
 * @returns {string} The path with every backslash replaced by "/".
 */
function normalizePath(value) {
  const text = String(value || "");
  return text.split("\\").join("/");
}
|
|
377
|
+
|
|
378
|
+
/**
 * Write a value to stdout as 2-space-indented JSON followed by a newline.
 *
 * @param {any} payload - Value to serialize.
 * @returns {void}
 */
function writeJson(payload) {
  const text = JSON.stringify(payload, null, 2);
  process.stdout.write(`${text}\n`);
}
|
|
381
|
+
|
|
382
|
+
/**
 * Remove duplicate instance records, keeping the most recently seen one.
 *
 * Records are duplicates when they share the same instance id, normalized
 * project path, and mode. The result is ordered by `lastSeenUnix`, newest
 * first; the input array is not modified.
 *
 * @param {object[]} items - Instance records.
 * @returns {object[]} A new array without duplicates.
 */
function dedupeInstances(items) {
  const seenAt = (entry) => Number(entry?.lastSeenUnix || 0);
  const newestFirst = [...items].sort((a, b) => seenAt(b) - seenAt(a));

  // A Map keeps insertion order, so the first (newest) record per key wins.
  const byKey = new Map();
  for (const entry of newestFirst) {
    const id = String(entry?.instanceId || "");
    const project = normalizePath(entry?.projectPath || "");
    const mode = String(entry?.mode || "");
    const key = `${id}|${project}|${mode}`;
    if (!byKey.has(key)) {
      byKey.set(key, entry);
    }
  }
  return [...byKey.values()];
}
|
|
402
|
+
|
|
403
|
+
// Run the CLI only when this file is the script Node was started with,
// not when it is imported as a module.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  runCli().catch((error) => {
    // Selection failures are reported as structured JSON with exit code 2;
    // everything else is printed as a plain message with exit code 1.
    const isSelectionError = error instanceof InstanceSelectionError;
    const report = isSelectionError
      ? JSON.stringify({
        error: error.message,
        code: error.code,
        ...error.details,
      }, null, 2)
      : `${error?.message || error}`;

    process.stderr.write(`${report}\n`);
    process.exitCode = isSelectionError ? 2 : 1;
  });
}
|
|
@@ -14,7 +14,7 @@ At a high level, the process of creating a skill goes like this:
|
|
|
14
14
|
- Create a few test prompts and run claude-with-access-to-the-skill on them
|
|
15
15
|
- Help the user evaluate the results both qualitatively and quantitatively
|
|
16
16
|
- While the runs happen in the background, draft some quantitative evals if there aren't any (if there are some, you can either use as is or modify if you feel something needs to change about them). Then explain them to the user (or if they already existed, explain the ones that already exist)
|
|
17
|
-
- Use the `eval-viewer/generate_review.
|
|
17
|
+
- Use the `eval-viewer/generate_review.mjs` script to show the user the results for them to look at, and also let them look at the quantitative metrics
|
|
18
18
|
- Rewrite the skill based on feedback from the user's evaluation of the results (and also if there are any glaring flaws that become apparent from the quantitative benchmarks)
|
|
19
19
|
- Repeat until you're satisfied
|
|
20
20
|
- Expand the test set and try again at larger scale
|
|
@@ -226,7 +226,7 @@ Once all runs are done:
|
|
|
226
226
|
|
|
227
227
|
2. **Aggregate into benchmark** — run the aggregation script from the skill-creator directory:
|
|
228
228
|
```bash
|
|
229
|
-
|
|
229
|
+
node <skill-creator-path>/scripts/aggregate_benchmark.mjs <workspace>/iteration-N --skill-name <name>
|
|
230
230
|
```
|
|
231
231
|
This produces `benchmark.json` and `benchmark.md` with pass_rate, time, and tokens for each configuration, with mean ± stddev and the delta. If generating benchmark.json manually, see `references/schemas.md` for the exact schema the viewer expects.
|
|
232
232
|
Put each with_skill version before its baseline counterpart.
|
|
@@ -235,7 +235,7 @@ Put each with_skill version before its baseline counterpart.
|
|
|
235
235
|
|
|
236
236
|
4. **Launch the viewer** with both qualitative outputs and quantitative data:
|
|
237
237
|
```bash
|
|
238
|
-
nohup
|
|
238
|
+
nohup node <skill-creator-path>/eval-viewer/generate_review.mjs \
|
|
239
239
|
<workspace>/iteration-N \
|
|
240
240
|
--skill-name "my-skill" \
|
|
241
241
|
--benchmark <workspace>/iteration-N/benchmark.json \
|
|
@@ -246,7 +246,7 @@ Put each with_skill version before its baseline counterpart.
|
|
|
246
246
|
|
|
247
247
|
**Cowork / headless environments:** If the viewer cannot open a browser automatically or the environment has no display, use `--static <output_path>` to write a standalone HTML file instead of starting a server. Feedback will be downloaded as a `feedback.json` file when the user clicks "Submit All Reviews". After download, copy `feedback.json` into the workspace directory for the next iteration to pick up.
|
|
248
248
|
|
|
249
|
-
Note: please use generate_review.
|
|
249
|
+
Note: please use `generate_review.mjs` to create the viewer; there's no need to write custom HTML.
|
|
250
250
|
|
|
251
251
|
5. **Tell the user** something like: "I've opened the results in your browser. There are two tabs — 'Outputs' lets you click through each test case and leave feedback, 'Benchmark' shows the quantitative comparison. When you're done, come back here and let me know."
|
|
252
252
|
|
|
@@ -301,7 +301,7 @@ This is the heart of the loop. You've run the test cases, the user has reviewed
|
|
|
301
301
|
|
|
302
302
|
3. **Explain the why.** Try hard to explain the **why** behind everything you're asking the model to do. Today's LLMs are *smart*. They have good theory of mind and when given a good harness can go beyond rote instructions and really make things happen. Even if the feedback from the user is terse or frustrated, try to actually understand the task and why the user is writing what they wrote, and what they actually wrote, and then transmit this understanding into the instructions. If you find yourself writing ALWAYS or NEVER in all caps, or using super rigid structures, that's a yellow flag — if possible, reframe and explain the reasoning so that the model understands why the thing you're asking for is important. That's a more humane, powerful, and effective approach.
|
|
303
303
|
|
|
304
|
-
4. **Look for repeated work across test cases.** Read the transcripts from the test runs and notice if the subagents all independently wrote similar helper scripts or took the same multi-step approach to something. If all 3 test cases resulted in the subagent writing a `create_docx.
|
|
304
|
+
4. **Look for repeated work across test cases.** Read the transcripts from the test runs and notice if the subagents all independently wrote similar helper scripts or took the same multi-step approach to something. If all 3 test cases resulted in the subagent writing a `create_docx.mjs` or a `build_chart.mjs`, that's a strong signal the skill should bundle that script. Write it once, put it in `scripts/`, and tell the skill to use it. This saves every future invocation from reinventing the wheel.
|
|
305
305
|
|
|
306
306
|
This task is pretty important (we are trying to create billions a year in economic value here!) and your thinking time is not the blocker; take your time and really mull things over. I'd suggest writing a draft revision and then looking at it anew and making improvements. Really do your best to get into the head of the user and understand what they want and need.
|
|
307
307
|
|
|
@@ -332,7 +332,7 @@ This is optional, requires subagents, and most users won't need it. The human re
|
|
|
332
332
|
|
|
333
333
|
## Description Optimization
|
|
334
334
|
|
|
335
|
-
The description field in SKILL.md frontmatter is the primary mechanism that determines whether
|
|
335
|
+
The description field in SKILL.md frontmatter is the primary mechanism that determines whether Pie invokes a skill. After creating or improving a skill, offer to optimize the description for better triggering accuracy.
|
|
336
336
|
|
|
337
337
|
### Step 1: Generate trigger eval queries
|
|
338
338
|
|
|
@@ -379,7 +379,7 @@ Tell the user: "This will take some time — I'll run the optimization loop in t
|
|
|
379
379
|
Save the eval set to the workspace, then run in the background:
|
|
380
380
|
|
|
381
381
|
```bash
|
|
382
|
-
|
|
382
|
+
node <skill-creator-path>/scripts/run_loop.mjs \
|
|
383
383
|
--eval-set <path-to-trigger-eval.json> \
|
|
384
384
|
--skill-path <path-to-skill> \
|
|
385
385
|
--model <model-id-powering-this-session> \
|
|
@@ -391,11 +391,11 @@ Use the model ID from your system prompt (the one powering the current session)
|
|
|
391
391
|
|
|
392
392
|
While it runs, periodically tail the output to give the user updates on which iteration it's on and what the scores look like.
|
|
393
393
|
|
|
394
|
-
This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description (running each query 3 times to get a reliable trigger rate), then
|
|
394
|
+
This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description with Pie (running each query 3 times to get a reliable trigger rate), then asks Pie to propose improvements based on what failed. It re-evaluates each new description on both train and test, iterating up to 5 times. When it's done, it opens an HTML report in the browser showing the results per iteration and returns JSON with `best_description` — selected by test score rather than train score to avoid overfitting.
|
|
395
395
|
|
|
396
396
|
### How skill triggering works
|
|
397
397
|
|
|
398
|
-
Understanding the triggering mechanism helps design better eval queries. Skills appear in
|
|
398
|
+
Understanding the triggering mechanism helps design better eval queries. Skills appear in Pie's `available_skills` list with their name + description, and Pie decides whether to consult a skill based on that description. The important thing to know is that Pie only consults skills for tasks it can't easily handle on its own — simple, one-step queries like "read this PDF" may not trigger a skill even if the description matches perfectly, because Pie can handle them directly with basic tools. Complex, multi-step, or specialized queries reliably trigger skills when the description matches.
|
|
399
399
|
|
|
400
400
|
This means your eval queries should be substantive enough that Claude would actually benefit from consulting a skill. Simple queries like "read file X" are poor test cases — they won't trigger skills regardless of description quality.
|
|
401
401
|
|
|
@@ -410,7 +410,7 @@ Take `best_description` from the JSON output and update the skill's SKILL.md fro
|
|
|
410
410
|
Check whether you have access to the `present_files` tool. If you don't, skip this step. If you do, package the skill and present the .skill file to the user:
|
|
411
411
|
|
|
412
412
|
```bash
|
|
413
|
-
|
|
413
|
+
node <skill-creator-path>/scripts/package_skill.mjs <path/to/skill-folder>
|
|
414
414
|
```
|
|
415
415
|
|
|
416
416
|
After packaging, direct the user to the resulting `.skill` file path so they can install it.
|
|
@@ -429,11 +429,11 @@ In Claude.ai, the core workflow is the same (draft → test → review → impro
|
|
|
429
429
|
|
|
430
430
|
**The iteration loop**: Same as before — improve the skill, rerun the test cases, ask for feedback — just without the browser reviewer in the middle. You can still organize results into iteration directories on the filesystem if you have one.
|
|
431
431
|
|
|
432
|
-
**Description optimization**: This section
|
|
432
|
+
**Description optimization**: This section now defaults to Pie's own non-interactive runner. It does not require `claude -p`.
|
|
433
433
|
|
|
434
434
|
**Blind comparison**: Requires subagents. Skip it.
|
|
435
435
|
|
|
436
|
-
**Packaging**: The `package_skill.
|
|
436
|
+
**Packaging**: The `package_skill.mjs` script works anywhere with Node.js and a filesystem. On Claude.ai or Codex CLI, you can run it and the user can download the resulting `.skill` file.
|
|
437
437
|
|
|
438
438
|
**Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. In this case:
|
|
439
439
|
- **Preserve the original name.** Note the skill's directory name and `name` frontmatter field -- use them unchanged. E.g., if the installed skill is `research-helper`, output `research-helper.skill` (not `research-helper-v2`).
|
|
@@ -448,10 +448,10 @@ If you're in Cowork, the main things to know are:
|
|
|
448
448
|
|
|
449
449
|
- You have subagents, so the main workflow (spawn test cases in parallel, run baselines, grade, etc.) all works. (However, if you run into severe problems with timeouts, it's OK to run the test prompts in series rather than parallel.)
|
|
450
450
|
- You don't have a browser or display, so when generating the eval viewer, use `--static <output_path>` to write a standalone HTML file instead of starting a server. Then proffer a link that the user can click to open the HTML in their browser.
|
|
451
|
-
- For whatever reason, the Cowork setup seems to disincline Claude from generating the eval viewer after running the tests, so just to reiterate: whether you're in Cowork or in Claude Code, after running tests, you should always generate the eval viewer for the human to look at examples before revising the skill yourself and trying to make corrections, using `generate_review.
|
|
451
|
+
- For whatever reason, the Cowork setup seems to disincline Claude from generating the eval viewer after running the tests, so just to reiterate: whether you're in Cowork or in Claude Code, after running tests, you should always generate the eval viewer for the human to look at examples before revising the skill yourself and trying to make corrections, using `generate_review.mjs` (not writing your own boutique html code). Sorry in advance but I'm gonna go all caps here: GENERATE THE EVAL VIEWER *BEFORE* evaluating inputs yourself. You want to get them in front of the human ASAP!
|
|
452
452
|
- Feedback works differently: since there's no running server, the viewer's "Submit All Reviews" button will download `feedback.json` as a file. You can then read it from there (you may have to request access first).
|
|
453
|
-
- Packaging works — `package_skill.
|
|
454
|
-
- Description optimization (`run_loop.
|
|
453
|
+
- Packaging works — `package_skill.mjs` just needs Node.js and a filesystem.
|
|
454
|
+
- Description optimization (`run_loop.mjs` / `run_eval.mjs`) should work in Cowork just fine since it uses Pie's non-interactive runner, not a browser, but please save it until you've fully finished making the skill and the user agrees it's in good shape.
|
|
455
455
|
- **Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. Follow the update guidance in the claude.ai section above.
|
|
456
456
|
|
|
457
457
|
---
|
|
@@ -475,11 +475,11 @@ Repeating one more time the core loop here for emphasis:
|
|
|
475
475
|
- Draft or edit the skill
|
|
476
476
|
- Run claude-with-access-to-the-skill on test prompts
|
|
477
477
|
- With the user, evaluate the outputs:
|
|
478
|
-
- Create benchmark.json and run `eval-viewer/generate_review.
|
|
478
|
+
- Create benchmark.json and run `eval-viewer/generate_review.mjs` to help the user review them
|
|
479
479
|
- Run quantitative evals
|
|
480
480
|
- Repeat until you and the user are satisfied
|
|
481
481
|
- Package the final skill and return it to the user.
|
|
482
482
|
|
|
483
|
-
Please add steps to your TodoList, if you have such a thing, to make sure you don't forget. If you're in Cowork, please specifically put "Create evals JSON and run `eval-viewer/generate_review.
|
|
483
|
+
Please add steps to your TodoList, if you have such a thing, to make sure you don't forget. If you're in Cowork, please specifically put "Create evals JSON and run `eval-viewer/generate_review.mjs` so human can review test cases" in your TodoList to make sure it happens.
|
|
484
484
|
|
|
485
485
|
Good luck!
|