selftune 0.2.30 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/registry/github-install.ts +256 -0
- package/cli/selftune/registry/index.ts +1 -1
- package/cli/selftune/registry/install.ts +58 -7
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/packages/dashboard-core/src/routes/manifest.ts +2 -2
- package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
- package/packages/ui/src/primitives/button.tsx +5 -0
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Registry.md +19 -13
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +0 -1
- package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +0 -15
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
import { parseArgs } from "node:util";
|
|
2
|
+
import { readFileSync } from "node:fs";
|
|
3
|
+
|
|
4
|
+
import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
|
|
5
|
+
import { emitDashboardStepProgress } from "../dashboard-action-instrumentation.js";
|
|
6
|
+
import { writeGradingBaseline } from "../localdb/direct-write.js";
|
|
7
|
+
import type {
|
|
8
|
+
BaselineResult,
|
|
9
|
+
EvalEntry,
|
|
10
|
+
RuntimeReplayAggregateMetrics,
|
|
11
|
+
TokenUsageMetrics,
|
|
12
|
+
} from "../types.js";
|
|
13
|
+
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
14
|
+
import { detectLlmAgent } from "../utils/llm-call.js";
|
|
15
|
+
import { measureBaseline } from "../eval/baseline.js";
|
|
16
|
+
import { readCanonicalPackageEvaluationArtifact } from "../testing-readiness.js";
|
|
17
|
+
import { readCreateSkillContext } from "./readiness.js";
|
|
18
|
+
import { computeCreatePackageFingerprint } from "./package-fingerprint.js";
|
|
19
|
+
import {
|
|
20
|
+
loadCreateEvalSet,
|
|
21
|
+
runCreateReplay,
|
|
22
|
+
type CreateReplayResult,
|
|
23
|
+
type CreateReplayMode,
|
|
24
|
+
type RunCreateReplayOptions,
|
|
25
|
+
} from "./replay.js";
|
|
26
|
+
|
|
27
|
+
/**
 * Result of comparing replay outcomes with and without the target skill.
 *
 * Pass rates and `lift` are fractions; the CLI formatter in this file
 * multiplies the pass rates by 100 when printing percentages.
 */
export interface CreateBaselineResult {
  /** Name of the skill that was measured. */
  skill_name: string;
  /** Replay mode that produced this measurement. */
  mode: CreateReplayMode;
  /** Pass rate observed without the skill. */
  baseline_pass_rate: number;
  /** Pass rate observed with the skill enabled. */
  with_skill_pass_rate: number;
  /** Difference between the with-skill and baseline pass rates. */
  lift: number;
  /** Whether the lift was judged large enough to matter (package mode uses a 0.05 threshold). */
  adds_value: boolean;
  /** Per-query rows; package mode lists without-skill rows first, then with-skill rows. */
  per_entry: BaselineResult[];
  /** Timestamp of the measurement (package mode writes an ISO-8601 string). */
  measured_at: string;
  /** Aggregate runtime metrics for both replays; only populated by package mode. */
  runtime_metrics?: {
    with_skill: RuntimeReplayAggregateMetrics;
    without_skill: RuntimeReplayAggregateMetrics;
  };
}
|
|
41
|
+
|
|
42
|
+
/**
 * Optional collaborator overrides for the baseline flow.
 *
 * Every member is optional; callers in this file fall back to the imported
 * implementation with `deps.x ?? x` when an override is not supplied.
 */
export interface CreateBaselineDeps {
  /** Replacement for the replay runner used in package mode. */
  runCreateReplay?: (
    options: RunCreateReplayOptions,
  ) => Promise<Awaited<ReturnType<typeof runCreateReplay>>>;
  /** Replacement for the routing-mode baseline measurement. */
  measureBaseline?: typeof measureBaseline;
  /** Replacement for the dashboard step-progress emitter. */
  emitDashboardStepProgress?: typeof emitDashboardStepProgress;
  /** Replacement for the reader of the cached package evaluation artifact. */
  readCanonicalPackageEvaluationArtifact?: typeof readCanonicalPackageEvaluationArtifact;
  /** Replacement for the package fingerprint computation. */
  computeCreatePackageFingerprint?: typeof computeCreatePackageFingerprint;
}
|
|
51
|
+
|
|
52
|
+
/** Inputs accepted by `runCreateBaseline`. */
export interface RunCreateBaselineOptions {
  /** Path handed to `readCreateSkillContext` to locate the skill. */
  skillPath: string;
  /** "routing" delegates to `measureBaseline`; any other mode runs the package replays. */
  mode: CreateReplayMode;
  /** Agent to use; when omitted an installed agent CLI is auto-detected. */
  agent?: string;
  /** Optional eval-set path forwarded to the eval-set loader and the replay runner. */
  evalSetPath?: string;
  /** Pre-computed with-skill replay; when present it is used instead of replaying or reading the cache. */
  withSkillReplayResult?: CreateReplayResult;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Resolve which agent the baseline should run against.
 *
 * @param requestedAgent - Agent explicitly requested by the caller, if any.
 * @returns The requested agent when it is non-empty, otherwise the detected one.
 * @throws CLIError with code `AGENT_NOT_FOUND` when nothing was requested and
 *   detection finds no agent.
 */
function chooseBaselineAgent(requestedAgent?: string): string {
  // Detection only runs when the caller did not name an agent.
  const agent = requestedAgent || detectLlmAgent();
  if (agent) {
    return agent;
  }

  throw new CLIError(
    "No supported agent CLI was found in PATH.",
    "AGENT_NOT_FOUND",
    "Install Claude Code, Codex, OpenCode, or Pi, or pass --agent explicitly.",
  );
}
|
|
72
|
+
|
|
73
|
+
/**
 * Convert one replay result's runtime metrics into token-usage metrics.
 *
 * @param result - A single entry from a replay's `results` array.
 * @returns Token usage (with `estimated_cost_usd` when a numeric cost is
 *   present), or `undefined` unless both input and output token counts are
 *   numbers.
 */
function buildReplayTokenUsage(
  result: CreateReplayResult["results"][number],
): TokenUsageMetrics | undefined {
  const metrics = result.runtime_metrics;
  const input = metrics?.input_tokens;
  const output = metrics?.output_tokens;

  if (typeof input === "number" && typeof output === "number") {
    const cost = metrics?.total_cost_usd;
    const costField = typeof cost === "number" ? { estimated_cost_usd: cost } : {};
    return {
      input_tokens: input,
      output_tokens: output,
      total_tokens: input + output,
      ...costField,
    };
  }

  return undefined;
}
|
|
90
|
+
|
|
91
|
+
/** Non-null shape returned by `readCanonicalPackageEvaluationArtifact`. */
type PackageEvaluationArtifact = NonNullable<ReturnType<typeof readCanonicalPackageEvaluationArtifact>>;
|
|
94
|
+
|
|
95
|
+
/**
 * Emit one dashboard progress event for a baseline step.
 *
 * Uses `deps.emitDashboardStepProgress` when supplied, otherwise the imported
 * emitter. The event always carries `unit: "step"`; a missing `passed` or
 * `evidence` is sent as `null`.
 *
 * @param deps - Optional collaborator overrides.
 * @param options - Step position, status, phase/label and optional outcome.
 */
function emitBaselineStepProgress(
  deps: CreateBaselineDeps,
  options: {
    current: number;
    total: number;
    status: "started" | "finished";
    phase: string;
    label: string;
    passed?: boolean | null;
    evidence?: string | null;
  },
): void {
  const emit = deps.emitDashboardStepProgress ?? emitDashboardStepProgress;
  const { current, total, status, phase, label } = options;

  emit({
    current,
    total,
    status,
    phase,
    label,
    unit: "step",
    passed: options.passed ?? null,
    evidence: options.evidence ?? null,
  });
}
|
|
118
|
+
|
|
119
|
+
/**
 * Check that a replay covers exactly the given eval set, position by position.
 *
 * @param replay - Replay whose `results` are compared.
 * @param evalSet - Eval entries the replay is expected to correspond to.
 * @returns `true` when both have the same length and every result has the
 *   same `query` and `should_trigger` as the eval entry at the same index.
 */
function replayMatchesEvalSet(replay: CreateReplayResult, evalSet: EvalEntry[]): boolean {
  const { results } = replay;
  if (results.length !== evalSet.length) {
    return false;
  }

  const hasMismatch = results.some((result, index) => {
    const entry = evalSet[index];
    return (
      entry == null ||
      result.query !== entry.query ||
      result.should_trigger !== entry.should_trigger
    );
  });
  return !hasMismatch;
}
|
|
131
|
+
|
|
132
|
+
/**
 * Decide whether a cached package evaluation can stand in for a fresh
 * with-skill replay.
 *
 * The cache is accepted only when it exists, a fingerprint is available, the
 * summary is a package-mode `host_replay` evaluation for the same skill name,
 * skill path, fingerprint and agent, and its package-mode replay matches the
 * eval set entry by entry.
 *
 * @returns `true` (narrowing `cached` to non-null) when every check passes.
 */
function canReuseCachedWithSkillReplay(
  cached: PackageEvaluationArtifact | null,
  agent: string,
  skillName: string,
  skillPath: string,
  packageFingerprint: string | null,
  evalSet: EvalEntry[],
): cached is PackageEvaluationArtifact {
  if (!cached || !packageFingerprint) {
    return false;
  }

  const { summary } = cached;
  // Checks run in the same order as before; the eval-set comparison stays last.
  return (
    summary.mode === "package" &&
    summary.skill_name === skillName &&
    summary.skill_path === skillPath &&
    summary.package_fingerprint === packageFingerprint &&
    summary.replay.agent === agent &&
    summary.replay.validation_mode === "host_replay" &&
    cached.replay.mode === "package" &&
    replayMatchesEvalSet(cached.replay, evalSet)
  );
}
|
|
151
|
+
|
|
152
|
+
/**
 * Try to obtain a with-skill replay from the cached package evaluation.
 *
 * Computes the package fingerprint, loads the eval set, reads the cached
 * artifact, and returns its replay only when `canReuseCachedWithSkillReplay`
 * accepts it. This is best-effort: any error raised along the way yields
 * `null` so the caller runs a fresh replay instead.
 *
 * @returns The cached replay, or `null` when it is missing, unusable, or
 *   could not be read.
 */
function readReusableWithSkillReplay(
  options: RunCreateBaselineOptions,
  skillName: string,
  skillPath: string,
  agent: string,
  deps: CreateBaselineDeps,
): CreateReplayResult | null {
  try {
    const fingerprintOf = deps.computeCreatePackageFingerprint ?? computeCreatePackageFingerprint;
    const readArtifact =
      deps.readCanonicalPackageEvaluationArtifact ?? readCanonicalPackageEvaluationArtifact;

    const packageFingerprint = fingerprintOf(skillPath);
    const evalSet = loadCreateEvalSet(skillName, options.evalSetPath);
    const cached = readArtifact(skillName);

    return canReuseCachedWithSkillReplay(
      cached,
      agent,
      skillName,
      skillPath,
      packageFingerprint,
      evalSet,
    )
      ? cached.replay
      : null;
  } catch {
    // Deliberately swallowed: an unreadable cache only costs a fresh replay.
    return null;
  }
}
|
|
184
|
+
|
|
185
|
+
/**
 * Combine a with-skill replay and a without-skill replay into a package-mode
 * baseline result.
 *
 * `per_entry` lists the without-skill rows first, then the with-skill rows,
 * all stamped with the same `measured_at`. `lift` is the raw difference of the
 * two pass rates.
 *
 * @param skillName - Skill name written onto the result and every row.
 * @param withSkillResults - Replay run with the target skill available.
 * @param baselineResults - Replay run with the target skill hidden.
 * @returns The summarized baseline, including both replays' runtime metrics.
 */
export function summarizePackageBaselineResults(
  skillName: string,
  withSkillResults: CreateReplayResult,
  baselineResults: CreateReplayResult,
): CreateBaselineResult {
  const MIN_LIFT_TO_ADD_VALUE = 0.05;
  // Pass rates are floating-point fractions, so an exact 5-point lift can land
  // just below the threshold (0.85 - 0.8 === 0.04999999999999993). Compare
  // with a small tolerance so that case still counts as adding value.
  const LIFT_TOLERANCE = 1e-9;

  const measuredAt = new Date().toISOString();

  // Build the per-query rows for one replay.
  const toEntries = (replay: CreateReplayResult, withSkill: boolean): BaselineResult[] =>
    replay.results.map((result) => {
      const durationMs = result.runtime_metrics?.duration_ms;
      // Compute token usage once per row instead of once to test and once to use.
      const tokens = buildReplayTokenUsage(result);
      return {
        skill_name: skillName,
        query: result.query,
        with_skill: withSkill,
        triggered: result.triggered,
        pass: result.passed,
        evidence: result.evidence,
        ...(typeof durationMs === "number" ? { latency_ms: durationMs } : {}),
        ...(tokens ? { tokens } : {}),
        measured_at: measuredAt,
      };
    });

  const perEntry: BaselineResult[] = [
    ...toEntries(baselineResults, false),
    ...toEntries(withSkillResults, true),
  ];

  const lift = withSkillResults.pass_rate - baselineResults.pass_rate;
  return {
    skill_name: skillName,
    mode: "package",
    baseline_pass_rate: baselineResults.pass_rate,
    with_skill_pass_rate: withSkillResults.pass_rate,
    lift,
    adds_value: lift >= MIN_LIFT_TO_ADD_VALUE - LIFT_TOLERANCE,
    per_entry: perEntry,
    measured_at: measuredAt,
    runtime_metrics: {
      with_skill: withSkillResults.runtime_metrics,
      without_skill: baselineResults.runtime_metrics,
    },
  };
}
|
|
239
|
+
|
|
240
|
+
/**
 * Measure whether a draft skill improves results over a baseline.
 *
 * In "routing" mode the eval set and the skill file's text are handed to
 * `measureBaseline` and its result is returned with `mode: "routing"`.
 *
 * Otherwise two package replays are compared: one with the skill (taken from
 * `options.withSkillReplayResult`, else a reusable cached replay, else a fresh
 * run) and one with `includeTargetSkill: false`. Each step emits "started" and
 * "finished" dashboard progress events; a failing step emits a failed
 * "finished" event and rethrows the original error.
 *
 * @param options - Skill location, mode and optional agent / eval set / replay.
 * @param deps - Optional collaborator overrides.
 * @returns The baseline comparison for the skill.
 */
export async function runCreateBaseline(
  options: RunCreateBaselineOptions,
  deps: CreateBaselineDeps = {},
): Promise<CreateBaselineResult> {
  const context = readCreateSkillContext(options.skillPath);
  const agent = chooseBaselineAgent(options.agent);
  const skillName = context.skill_name;
  const resolvedSkillPath = context.skill_path;

  if (options.mode === "routing") {
    const evalSet = loadCreateEvalSet(skillName, options.evalSetPath);
    const measure = deps.measureBaseline ?? measureBaseline;
    const {
      skill_name,
      baseline_pass_rate,
      with_skill_pass_rate,
      lift,
      adds_value,
      per_entry,
      measured_at,
    } = await measure({
      evalSet,
      skillDescription: readFileSync(resolvedSkillPath, "utf-8"),
      skillName,
      agent,
    });
    return {
      skill_name,
      mode: "routing",
      baseline_pass_rate,
      with_skill_pass_rate,
      lift,
      adds_value,
      per_entry,
      measured_at,
    };
  }

  const replay = deps.runCreateReplay ?? runCreateReplay;
  const describeFailure = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);
  const passRateText = (result: CreateReplayResult): string =>
    `${(result.pass_rate * 100).toFixed(1)}% pass rate`;

  // The cache is only consulted when the caller did not supply a replay.
  const cachedReplay =
    options.withSkillReplayResult == null
      ? readReusableWithSkillReplay(options, skillName, resolvedSkillPath, agent, deps)
      : null;
  const reusedCache = cachedReplay != null;

  // Step 1 of 2: replay with the draft package enabled.
  const withSkillStep = {
    current: 1,
    total: 2,
    phase: "with_skill_replay",
    label: "Replay with draft package enabled",
  };
  emitBaselineStepProgress(deps, {
    ...withSkillStep,
    status: "started",
    evidence: reusedCache ? "Reusing fresh package replay from the canonical artifact" : null,
  });
  let withSkillResults: CreateReplayResult;
  try {
    withSkillResults =
      options.withSkillReplayResult ??
      cachedReplay ??
      (await replay({
        skillPath: resolvedSkillPath,
        mode: "package",
        agent,
        evalSetPath: options.evalSetPath,
      }));
    emitBaselineStepProgress(deps, {
      ...withSkillStep,
      status: "finished",
      passed: true,
      evidence: reusedCache
        ? `Reused fresh package replay at ${passRateText(withSkillResults)}`
        : `Finished with ${passRateText(withSkillResults)}`,
    });
  } catch (error) {
    emitBaselineStepProgress(deps, {
      ...withSkillStep,
      status: "finished",
      passed: false,
      evidence: describeFailure(error),
    });
    throw error;
  }

  // Step 2 of 2: replay with the target skill hidden.
  const baselineStep = {
    current: 2,
    total: 2,
    phase: "without_skill_replay",
    label: "Replay with the target skill hidden",
  };
  emitBaselineStepProgress(deps, { ...baselineStep, status: "started" });
  let baselineResults: CreateReplayResult;
  try {
    baselineResults = await replay({
      skillPath: resolvedSkillPath,
      mode: "package",
      agent,
      evalSetPath: options.evalSetPath,
      includeTargetSkill: false,
    });
    emitBaselineStepProgress(deps, {
      ...baselineStep,
      status: "finished",
      passed: true,
      evidence: `Finished with ${passRateText(baselineResults)}`,
    });
  } catch (error) {
    emitBaselineStepProgress(deps, {
      ...baselineStep,
      status: "finished",
      passed: false,
      evidence: describeFailure(error),
    });
    throw error;
  }

  return summarizePackageBaselineResults(skillName, withSkillResults, baselineResults);
}
|
|
362
|
+
|
|
363
|
+
/**
 * Render a baseline result as a short multi-line, human-readable report.
 *
 * Pass rates are printed as percentages with one decimal; lift is printed as
 * a raw number with three decimals.
 *
 * @param result - Baseline result to render.
 * @returns Six newline-separated `Label: value` lines.
 */
function formatBaselineResult(result: CreateBaselineResult): string {
  const asPercent = (rate: number): string => `${(rate * 100).toFixed(1)}%`;
  const verdict = result.adds_value ? "yes" : "no";

  const lines = [
    `Skill: ${result.skill_name}`,
    `Mode: ${result.mode}`,
    `Baseline pass rate: ${asPercent(result.baseline_pass_rate)}`,
    `With-skill pass rate: ${asPercent(result.with_skill_pass_rate)}`,
    `Lift: ${result.lift.toFixed(3)}`,
    `Adds value: ${verdict}`,
  ];
  return lines.join("\n");
}
|
|
373
|
+
|
|
374
|
+
/**
 * CLI entry point for the create-baseline command.
 *
 * Parses flags, runs the baseline, records it with `writeGradingBaseline`,
 * prints the result (JSON when `--json` is set or stdout is not a TTY,
 * otherwise the text report), and exits with 0 when the skill adds value and
 * 1 when it does not. `--help` prints the command help and exits with 0.
 *
 * @throws CLIError with code `INVALID_FLAG` when `--mode` is neither
 *   "routing" nor "package".
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      "skill-path": { type: "string" },
      mode: { type: "string", default: "routing" },
      agent: { type: "string" },
      "eval-set": { type: "string" },
      json: { type: "boolean", default: false },
      help: { type: "boolean", short: "h", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.createBaseline));
    process.exit(0);
  }

  const { mode } = values;
  const isSupportedMode = mode === "routing" || mode === "package";
  if (!isSupportedMode) {
    throw new CLIError(
      `Unsupported --mode value "${mode}".`,
      "INVALID_FLAG",
      "Use --mode routing or --mode package.",
    );
  }

  const result = await runCreateBaseline({
    skillPath: values["skill-path"] ?? "",
    mode: mode as CreateReplayMode,
    agent: values.agent,
    evalSetPath: values["eval-set"],
  });

  // Only the with-skill rows count towards the recorded sample size.
  const withSkillRows = result.per_entry.filter((entry) => entry.with_skill);
  writeGradingBaseline({
    skill_name: result.skill_name,
    proposal_id: null,
    measured_at: result.measured_at,
    pass_rate: result.with_skill_pass_rate,
    mean_score: null,
    sample_size: withSkillRows.length,
    grading_results_json: JSON.stringify(result),
  });

  const wantsJson = values.json || !process.stdout.isTTY;
  console.log(wantsJson ? JSON.stringify(result, null, 2) : formatBaselineResult(result));

  const exitCode = result.adds_value ? 0 : 1;
  process.exit(exitCode);
}
|
|
426
|
+
|
|
427
|
+
// Run the command only when `import.meta.main` is set; CLI errors are routed
// to the shared handler.
if (import.meta.main) {
  cliMain().catch((error) => handleCLIError(error));
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { parseArgs } from "node:util";
|
|
2
|
+
|
|
3
|
+
import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
|
|
4
|
+
import { handleCLIError } from "../utils/cli-error.js";
|
|
5
|
+
import { computeCreateCheckResult, formatCreateCheckResult } from "./readiness.js";
|
|
6
|
+
|
|
7
|
+
/**
 * CLI entry point for the create-check command.
 *
 * Parses flags, computes the check result for `--skill-path` (an empty string
 * when the flag is omitted), prints it as JSON when `--json` is set or stdout
 * is not a TTY and as formatted text otherwise, then exits with 0 when
 * `result.ok` is truthy and 1 when it is not. `--help` prints the command
 * help and exits with 0.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      "skill-path": { type: "string" },
      json: { type: "boolean", default: false },
      help: { type: "boolean", short: "h", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.createCheck));
    process.exit(0);
  }

  const skillPath = values["skill-path"] ?? "";
  const result = await computeCreateCheckResult(skillPath);

  const wantsJson = values.json || !process.stdout.isTTY;
  console.log(wantsJson ? JSON.stringify(result, null, 2) : formatCreateCheckResult(result));

  const exitCode = result.ok ? 0 : 1;
  process.exit(exitCode);
}
|
|
32
|
+
|
|
33
|
+
// Run the command only when `import.meta.main` is set; CLI errors are routed
// to the shared handler.
if (import.meta.main) {
  cliMain().catch((error) => handleCLIError(error));
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { parseArgs } from "node:util";
|
|
3
|
+
|
|
4
|
+
import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
|
|
5
|
+
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
6
|
+
import {
|
|
7
|
+
buildCreateSkillDraft,
|
|
8
|
+
formatCreateSkillDraft,
|
|
9
|
+
slugifyCreateSkillName,
|
|
10
|
+
type CreateSkillDraft,
|
|
11
|
+
} from "./templates.js";
|
|
12
|
+
|
|
13
|
+
/** A skill draft plus a record of what writing it to disk did. */
export interface CreateSkillInitResult extends CreateSkillDraft {
  /** `true` when the draft's `skill_dir` already existed before writing. */
  overwritten: boolean;
  /** Absolute paths of the draft files that were written. */
  written_paths: string[];
}
|
|
17
|
+
|
|
18
|
+
/**
 * Write a skill draft to disk.
 *
 * Creates every directory in `draft.directories` (recursively), then writes
 * every file in `draft.files` as UTF-8.
 *
 * @param draft - Draft describing the directories and files to create.
 * @param options - `force: true` permits writing into an existing `skill_dir`.
 * @returns The draft extended with `overwritten` and `written_paths`.
 * @throws CLIError with code `FILE_EXISTS` when `draft.skill_dir` already
 *   exists and `force` is not exactly `true`.
 */
export function writeCreateSkillDraft(
  draft: CreateSkillDraft,
  options: { force?: boolean } = {},
): CreateSkillInitResult {
  const overwritten = existsSync(draft.skill_dir);
  const mayOverwrite = options.force === true;

  if (overwritten && !mayOverwrite) {
    throw new CLIError(
      `Refusing to overwrite existing skill package at ${draft.skill_dir}.`,
      "FILE_EXISTS",
      "Re-run with --force to overwrite the scaffold files.",
    );
  }

  for (const directory of draft.directories) {
    mkdirSync(directory, { recursive: true });
  }

  const written_paths: string[] = [];
  for (const { absolute_path, content } of draft.files) {
    writeFileSync(absolute_path, content, "utf-8");
    written_paths.push(absolute_path);
  }

  return { ...draft, overwritten, written_paths };
}
|
|
45
|
+
|
|
46
|
+
/**
 * Render the outcome of `create init` for terminal output.
 *
 * @param result - Draft plus write outcome.
 * @returns The formatted draft, a blank line, the initialized directory, the
 *   write mode (overwrite or new package) and a next-step hint, joined by
 *   newlines.
 */
function formatInitResult(result: CreateSkillInitResult): string {
  const modeLine = result.overwritten ? "Mode: overwrite" : "Mode: new package";
  const lines = [
    formatCreateSkillDraft(result),
    "",
    `Initialized: ${result.skill_dir}`,
    modeLine,
    "Next step: replace the placeholders in SKILL.md and workflows/default.md before distribution.",
  ];
  return lines.join("\n");
}
|
|
55
|
+
|
|
56
|
+
/**
 * CLI entry point for the create-init command.
 *
 * Validates `--name` and `--description`, builds the skill draft, writes it
 * to disk (honouring `--force`), and prints the result as JSON when `--json`
 * is set or stdout is not a TTY, otherwise as formatted text. `--help` prints
 * the command help and exits with 0.
 *
 * @throws CLIError with code `MISSING_FLAG` when `--name` or `--description`
 *   is absent or blank, and `INVALID_FLAG` when the name slugifies to nothing.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      name: { type: "string" },
      description: { type: "string" },
      "output-dir": { type: "string" },
      force: { type: "boolean", default: false },
      json: { type: "boolean", default: false },
      help: { type: "boolean", short: "h", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.createInit));
    process.exit(0);
  }

  // Every validation failure carries the same usage hint.
  const usage = "selftune create init --name <name> --description <text>";
  const { name, description } = values;

  if (!name?.trim()) {
    throw new CLIError("--name <name> is required", "MISSING_FLAG", usage);
  }

  if (!description?.trim()) {
    throw new CLIError("--description <text> is required", "MISSING_FLAG", usage);
  }

  if (!slugifyCreateSkillName(name)) {
    throw new CLIError("--name must contain at least one letter or number", "INVALID_FLAG", usage);
  }

  const draft = buildCreateSkillDraft({
    name,
    description,
    outputDir: values["output-dir"],
  });
  const result = writeCreateSkillDraft(draft, { force: values.force });

  const wantsJson = values.json || !process.stdout.isTTY;
  console.log(wantsJson ? JSON.stringify(result, null, 2) : formatInitResult(result));
}
|
|
112
|
+
|
|
113
|
+
// Run the command only when `import.meta.main` is set; CLI errors are routed
// to the shared handler.
if (import.meta.main) {
  cliMain().catch((error) => handleCLIError(error));
}
|