selftune 0.2.14 → 0.2.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +16 -0
  2. package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +12 -0
  3. package/apps/local-dashboard/dist/index.html +2 -2
  4. package/bin/run-hook.cjs +36 -0
  5. package/cli/selftune/analytics.ts +13 -11
  6. package/cli/selftune/badge/badge.ts +13 -9
  7. package/cli/selftune/canonical-export.ts +6 -6
  8. package/cli/selftune/contribute/contribute.ts +2 -1
  9. package/cli/selftune/cron/setup.ts +3 -1
  10. package/cli/selftune/dashboard-contract.ts +10 -0
  11. package/cli/selftune/dashboard.ts +10 -5
  12. package/cli/selftune/eval/baseline.ts +20 -30
  13. package/cli/selftune/eval/hooks-to-evals.ts +22 -12
  14. package/cli/selftune/eval/import-skillsbench.ts +21 -8
  15. package/cli/selftune/eval/unit-test-cli.ts +22 -11
  16. package/cli/selftune/evolution/description-quality.ts +224 -0
  17. package/cli/selftune/evolution/evolve-body.ts +17 -10
  18. package/cli/selftune/evolution/evolve.ts +94 -59
  19. package/cli/selftune/evolution/rollback.ts +7 -6
  20. package/cli/selftune/evolution/unblock-suggestions.ts +159 -0
  21. package/cli/selftune/grading/auto-grade.ts +24 -22
  22. package/cli/selftune/grading/grade-session.ts +21 -17
  23. package/cli/selftune/hooks/auto-activate.ts +12 -3
  24. package/cli/selftune/hooks/prompt-log.ts +7 -1
  25. package/cli/selftune/index.ts +66 -69
  26. package/cli/selftune/ingestors/claude-replay.ts +29 -14
  27. package/cli/selftune/ingestors/codex-rollout.ts +6 -1
  28. package/cli/selftune/init.ts +212 -36
  29. package/cli/selftune/monitoring/watch.ts +32 -16
  30. package/cli/selftune/orchestrate.ts +18 -17
  31. package/cli/selftune/routes/skill-report.ts +17 -0
  32. package/cli/selftune/schedule.ts +23 -9
  33. package/cli/selftune/sync.ts +7 -3
  34. package/cli/selftune/types.ts +45 -10
  35. package/cli/selftune/utils/cli-error.ts +102 -0
  36. package/cli/selftune/utils/hooks.ts +12 -2
  37. package/cli/selftune/workflows/workflows.ts +23 -17
  38. package/package.json +1 -1
  39. package/skill/SKILL.md +1 -1
  40. package/skill/Workflows/AutoActivation.md +1 -1
  41. package/skill/Workflows/Evolve.md +4 -0
  42. package/skill/Workflows/Initialize.md +8 -8
  43. package/skill/settings_snippet.json +35 -12
  44. package/apps/local-dashboard/dist/assets/index-DIrdlu2_.js +0 -16
  45. package/apps/local-dashboard/dist/assets/vendor-ui-7xD7fNEU.js +0 -12
@@ -0,0 +1,224 @@
1
+ /**
2
+ * description-quality.ts
3
+ *
4
+ * Pure, deterministic scoring function that evaluates the quality of a skill
5
+ * description for routing accuracy. No LLM calls — heuristic-only.
6
+ *
7
+ * Inspired by OpenAI's finding that "writing better skill descriptions improved
8
+ * routing accuracy more than any change to the underlying skill logic itself."
9
+ */
10
+
11
+ import type { DescriptionQualityScore } from "../types.js";
12
+
13
+ // ---------------------------------------------------------------------------
14
+ // Constants
15
+ // ---------------------------------------------------------------------------
16
+
17
/**
 * Description length thresholds, in characters.
 *
 * Lengths within [IDEAL_MIN, IDEAL_MAX] receive the full length score. The
 * length scorer applies separate falloff segments between the ideal range and
 * the MIN_LENGTH / MAX_LENGTH bounds, and beyond those bounds.
 */
const MIN_LENGTH = 40;
const IDEAL_MIN = 80;
const IDEAL_MAX = 300;
const MAX_LENGTH = 500;
22
+
23
/**
 * Words and phrases that signal trigger context, i.e. the description states
 * *when* the skill should fire.
 */
const TRIGGER_CONTEXT_WORDS: string[] = [
  // Temporal / conditional markers
  "when", "if", "after", "before", "during", "while", "upon", "whenever",
  // Explicit routing phrasing
  "use when", "trigger", "activate",
];
37
+
38
/** Vague words and phrases whose presence weakens routing precision. */
const VAGUE_WORDS: string[] = [
  // Catch-all nouns and adjectives
  "various", "general", "misc", "miscellaneous", "stuff", "things",
  // Open-ended list terminators
  "etc", "and more", "and so on",
  // Unspecific quantifiers and qualifiers
  "other", "multiple", "several", "many", "some", "certain", "related",
];
57
+
58
/**
 * Common filler phrases that carry no routing signal.
 * All entries are lowercase.
 */
const FILLER_PHRASES: string[] = [
  "this skill", "a tool for", "a tool that", "helps with",
  "is used for", "can be used", "is designed to",
];
68
+
69
/** Action verbs whose presence signals that the description names concrete behavior. */
const ACTION_VERBS: string[] = [
  "run", "execute", "analyze", "generate", "create", "deploy", "validate", "check",
  "build", "test", "scan", "extract", "transform", "monitor", "grade", "evolve",
  "sync", "watch", "review", "audit", "parse", "format", "search", "fetch",
  "publish", "install", "configure", "diagnose", "debug", "fix", "optimize", "measure",
];
104
+
105
+ // ---------------------------------------------------------------------------
106
+ // Pre-compiled word-boundary patterns
107
+ // ---------------------------------------------------------------------------
108
+
109
/**
 * Compile a word list into case-insensitive, word-boundary-anchored patterns.
 *
 * Regex metacharacters in each entry are escaped so entries always match
 * literally (an unescaped entry such as "a.b" would also match "axb", and
 * "c++" would throw while the pattern is being constructed). Any run of
 * whitespace inside an entry matches one or more whitespace characters in
 * the scanned text.
 *
 * @param words - Literal words or phrases to match.
 * @returns One RegExp per entry, in the same order as `words`.
 */
function compileWordPatterns(words: readonly string[]): RegExp[] {
  return words.map((word) => {
    // Escape first: the escape set contains no whitespace characters, so the
    // whitespace rewrite below still sees the entry's original gaps.
    const literal = word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    return new RegExp(`\\b${literal.replace(/\s+/g, "\\s+")}\\b`, "i");
  });
}
113
+
114
// Each word list is compiled exactly once, at module load, so the scorers
// never rebuild a RegExp per call.

/** Patterns for TRIGGER_CONTEXT_WORDS. */
const TRIGGER_PATTERNS: RegExp[] = compileWordPatterns(TRIGGER_CONTEXT_WORDS);

/** Patterns for VAGUE_WORDS. */
const VAGUE_PATTERNS: RegExp[] = compileWordPatterns(VAGUE_WORDS);

/** Patterns for ACTION_VERBS. */
const ACTION_PATTERNS: RegExp[] = compileWordPatterns(ACTION_VERBS);
117
+
118
/**
 * Count how many of the given patterns match `text`.
 *
 * Each pattern contributes at most 1 to the total, no matter how many times
 * it occurs in the text.
 *
 * @param text - Text to scan.
 * @param patterns - Pre-compiled patterns to test against the text.
 * @returns Number of patterns with at least one match.
 */
function countWordMatches(text: string, patterns: RegExp[]): number {
  return patterns.reduce((hits, pattern) => (pattern.test(text) ? hits + 1 : hits), 0);
}
126
+
127
+ // ---------------------------------------------------------------------------
128
+ // Criterion scorers
129
+ // ---------------------------------------------------------------------------
130
+
131
/**
 * Score description length on a 0.0-1.0 scale.
 *
 * The curve is piecewise linear and continuous:
 * - below MIN_LENGTH: ramps from 0.0 (empty) up to 0.7
 * - MIN_LENGTH up to IDEAL_MIN: ramps from 0.7 to 1.0
 * - IDEAL_MIN through IDEAL_MAX: 1.0
 * - IDEAL_MAX up to MAX_LENGTH: falls from 1.0 to 0.7
 * - above MAX_LENGTH: keeps falling from 0.7, floored at 0.3
 *
 * @param description - Description text; length is `String.prototype.length`.
 * @returns Length score in [0.0, 1.0].
 */
export function scoreLengthCriterion(description: string): number {
  const len = description.length;

  // Too short: ramp to 0.7 so this segment meets the next one at MIN_LENGTH.
  // Ramping to 1.0 here would score a description one character under
  // MIN_LENGTH (0.975) above one exactly at MIN_LENGTH (0.7).
  if (len < MIN_LENGTH) return 0.7 * (len / MIN_LENGTH);

  if (len >= IDEAL_MIN && len <= IDEAL_MAX) return 1.0;

  // Acceptable but shorter than ideal.
  if (len < IDEAL_MIN) return 0.7 + 0.3 * ((len - MIN_LENGTH) / (IDEAL_MIN - MIN_LENGTH));

  // Acceptable but longer than ideal.
  if (len <= MAX_LENGTH) return 0.7 + 0.3 * ((MAX_LENGTH - len) / (MAX_LENGTH - IDEAL_MAX));

  // Too long: linear decay that bottoms out at 0.3.
  return Math.max(0.3, 0.7 - 0.4 * ((len - MAX_LENGTH) / MAX_LENGTH));
}
140
+
141
/**
 * Score the presence of trigger-context words (when / if / before / after ...).
 *
 * @param description - Description text to evaluate.
 * @returns 0.0 when no trigger pattern matches, 0.7 for one matching pattern,
 *   and +0.15 for each additional matching pattern, capped at 1.0.
 */
export function scoreTriggerContextCriterion(description: string): number {
  const hits = countWordMatches(description.toLowerCase(), TRIGGER_PATTERNS);
  if (hits === 0) return 0.0;
  // With a single hit the bonus term is zero, giving the 0.7 base score.
  return Math.min(1.0, 0.7 + 0.15 * (hits - 1));
}
148
+
149
/**
 * Score the absence of vague words; more vague terms give a lower score.
 *
 * @param description - Description text to evaluate.
 * @returns 1.0 when no vague pattern matches, 0.6 for one matching pattern,
 *   and -0.15 for each additional matching pattern, floored at 0.1.
 */
export function scoreVaguenessCriterion(description: string): number {
  const hits = countWordMatches(description.toLowerCase(), VAGUE_PATTERNS);
  // With a single hit the penalty term is zero, giving the 0.6 base score.
  return hits === 0 ? 1.0 : Math.max(0.1, 0.6 - 0.15 * (hits - 1));
}
156
+
157
/**
 * Score whether the description names at least one concrete action.
 *
 * @param description - Description text to evaluate.
 * @returns 0.2 when no action verb matches. Otherwise 1.0 reduced by 0.3 per
 *   unit of filler ratio (distinct filler phrases found, divided by the
 *   larger of 1 and a tenth of the whitespace-separated token count),
 *   floored at 0.3.
 */
export function scoreSpecificityCriterion(description: string): number {
  const lowered = description.toLowerCase();

  // Without an action verb the filler ratio is irrelevant.
  if (!ACTION_PATTERNS.some((pattern) => pattern.test(lowered))) return 0.2;

  let fillerHits = 0;
  for (const phrase of FILLER_PHRASES) {
    if (lowered.includes(phrase)) fillerHits += 1;
  }
  if (fillerHits === 0) return 1.0;

  // Longer descriptions tolerate more filler before being penalized.
  const tokenCount = description.split(/\s+/).length;
  const fillerRatio = fillerHits / Math.max(1, tokenCount / 10);
  return Math.max(0.3, 1.0 - fillerRatio * 0.3);
}
169
+
170
/**
 * Score whether the description is more than the skill name restated.
 *
 * @param description - Description text to evaluate.
 * @param skillName - Skill name to compare against; when omitted or empty the
 *   check is skipped.
 * @returns 1.0 when there is no skill name or the description is clearly
 *   longer than the name, 0.0 when the normalized description equals the
 *   name, and 0.3 when it is fewer than 10 characters longer than the
 *   normalized name.
 */
export function scoreNotJustNameCriterion(description: string, skillName?: string): number {
  if (!skillName) return 1.0;

  // Lowercase, trim, and drop everything except a-z, 0-9 and whitespace.
  const normalize = (value: string): string =>
    value.toLowerCase().trim().replace(/[^a-z0-9\s]/g, "");

  const normalizedDescription = normalize(description);
  const normalizedName = normalize(skillName);
  // Variant of the name with hyphens/underscores read as word separators.
  const spacedName = skillName.replace(/[-_]/g, " ").toLowerCase().trim();

  const restatesName = [normalizedName, spacedName].includes(normalizedDescription);
  if (restatesName) return 0.0;

  return normalizedDescription.length < normalizedName.length + 10 ? 0.3 : 1.0;
}
187
+
188
+ // ---------------------------------------------------------------------------
189
+ // Main scoring function
190
+ // ---------------------------------------------------------------------------
191
+
192
/**
 * Per-criterion weights applied when computing the composite score.
 * Trigger context carries the largest weight, per OpenAI's finding.
 */
const WEIGHTS = {
  length: 0.15,
  trigger_context: 0.30,
  vagueness: 0.20,
  specificity: 0.20,
  not_just_name: 0.15,
} as const;
200
+
201
/**
 * Score a skill description against the heuristic quality criteria.
 *
 * Each criterion scorer runs on the description, and the composite is the
 * WEIGHTS-weighted sum of the per-criterion scores, rounded to three decimals.
 *
 * @param description - Skill description text to evaluate.
 * @param skillName - Optional skill name, used only by the "not just the
 *   name" criterion.
 * @returns The rounded composite score together with the per-criterion
 *   breakdown.
 */
export function scoreDescription(description: string, skillName?: string): DescriptionQualityScore {
  const criteria = {
    length: scoreLengthCriterion(description),
    trigger_context: scoreTriggerContextCriterion(description),
    vagueness: scoreVaguenessCriterion(description),
    specificity: scoreSpecificityCriterion(description),
    not_just_name: scoreNotJustNameCriterion(description, skillName),
  };

  // Accumulate in WEIGHTS key order.
  let weightedSum = 0;
  for (const key of Object.keys(WEIGHTS) as (keyof typeof WEIGHTS)[]) {
    weightedSum += criteria[key] * WEIGHTS[key];
  }

  return {
    composite: Number(weightedSum.toFixed(3)),
    criteria,
  };
}
@@ -25,6 +25,7 @@ import type {
25
25
  QueryLogRecord,
26
26
  SkillUsageRecord,
27
27
  } from "../types.js";
28
+ import { CLIError, handleCLIError } from "../utils/cli-error.js";
28
29
  import type { EffortLevel, SubagentCallOptions } from "../utils/llm-call.js";
29
30
  import { callViaSubagent } from "../utils/llm-call.js";
30
31
  import { appendAuditEntry } from "./audit.js";
@@ -710,8 +711,11 @@ Options:
710
711
  }
711
712
 
712
713
  if (!values.skill || !values["skill-path"]) {
713
- console.error("[ERROR] --skill and --skill-path are required");
714
- process.exit(1);
714
+ throw new CLIError(
715
+ "--skill and --skill-path are required",
716
+ "MISSING_FLAG",
717
+ "selftune evolve body --skill <name> --skill-path <path>",
718
+ );
715
719
  }
716
720
 
717
721
  const { detectAgent } = await import("../utils/llm-call.js");
@@ -719,15 +723,21 @@ Options:
719
723
  const studentAgent = values["student-agent"] ?? teacherAgent;
720
724
 
721
725
  if (!teacherAgent) {
722
- console.error("[ERROR] No agent CLI found. Install Claude Code, Codex, or OpenCode.");
723
- process.exit(1);
726
+ throw new CLIError(
727
+ "No agent CLI found. Install Claude Code, Codex, or OpenCode.",
728
+ "AGENT_NOT_FOUND",
729
+ "Install Claude Code, Codex, or OpenCode.",
730
+ );
724
731
  }
725
732
 
726
733
  // Parse target
727
734
  const targetStr = values.target ?? "body";
728
735
  if (targetStr !== "body" && targetStr !== "routing") {
729
- console.error("[ERROR] --target must be 'body' or 'routing'");
730
- process.exit(1);
736
+ throw new CLIError(
737
+ "--target must be 'body' or 'routing'",
738
+ "INVALID_FLAG",
739
+ "Use --target body or --target routing",
740
+ );
731
741
  }
732
742
 
733
743
  // Parse few-shot examples
@@ -763,8 +773,5 @@ Options:
763
773
  }
764
774
 
765
775
  if (import.meta.main) {
766
- cliMain().catch((err) => {
767
- console.error(`[FATAL] ${err}`);
768
- process.exit(1);
769
- });
776
+ cliMain().catch(handleCLIError);
770
777
  }
@@ -36,10 +36,12 @@ import type {
36
36
  SessionTelemetryRecord,
37
37
  SkillUsageRecord,
38
38
  } from "../types.js";
39
+ import { CLIError, handleCLIError } from "../utils/cli-error.js";
39
40
  import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
40
41
  import { createEvolveTUI } from "../utils/tui.js";
41
42
  import { appendAuditEntry } from "./audit.js";
42
43
  import { checkConstitution } from "./constitutional.js";
44
+ import { scoreDescription } from "./description-quality.js";
43
45
  import { appendEvidenceEntry } from "./evidence.js";
44
46
  import { extractFailurePatterns } from "./extract-patterns.js";
45
47
  import {
@@ -49,6 +51,7 @@ import {
49
51
  selectFromFrontier,
50
52
  } from "./pareto.js";
51
53
  import { generateMultipleProposals, generateProposal } from "./propose-description.js";
54
+ import { buildUnblockSuggestions } from "./unblock-suggestions.js";
52
55
  import type { ValidationResult } from "./validate-proposal.js";
53
56
  import {
54
57
  TRIGGER_CHECK_BATCH_SIZE,
@@ -94,6 +97,8 @@ export interface EvolveResult {
94
97
  baselineResult?: BaselineMeasurement;
95
98
  gateValidation?: ValidationResult;
96
99
  sync_result?: SyncResult;
100
+ descriptionQualityBefore?: number;
101
+ descriptionQualityAfter?: number;
97
102
  }
98
103
 
99
104
  /**
@@ -247,16 +252,26 @@ export async function evolve(
247
252
  );
248
253
 
249
254
  /** Stamp every return with pipeline stats so callers always get them. */
250
- const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => ({
251
- ...r,
252
- llmCallCount,
253
- elapsedMs: Date.now() - pipelineStart,
254
- ...(syncResult ? { sync_result: syncResult } : {}),
255
- });
255
+ const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => {
256
+ const descQualityAfterScore = r.proposal
257
+ ? scoreDescription(r.proposal.proposed_description, options.skillName).composite
258
+ : undefined;
259
+ return {
260
+ ...r,
261
+ llmCallCount,
262
+ elapsedMs: Date.now() - pipelineStart,
263
+ ...(syncResult ? { sync_result: syncResult } : {}),
264
+ ...(descQualityBeforeScore != null
265
+ ? { descriptionQualityBefore: descQualityBeforeScore }
266
+ : {}),
267
+ ...(descQualityAfterScore != null ? { descriptionQualityAfter: descQualityAfterScore } : {}),
268
+ };
269
+ };
256
270
 
257
- // Hoisted so catch block can preserve partial results on error
271
+ // Hoisted so catch block and withStats can preserve partial results on error
258
272
  let lastProposal: EvolutionProposal | null = null;
259
273
  let lastValidation: ValidationResult | null = null;
274
+ let descQualityBeforeScore: number | undefined;
260
275
 
261
276
  try {
262
277
  // -----------------------------------------------------------------------
@@ -281,7 +296,11 @@ export async function evolve(
281
296
  const versionTag = skillVersion ? `, v${skillVersion}` : "";
282
297
  const createdAuditDetails = (message: string) =>
283
298
  `original_description:${rawContent}\n${message}`;
284
- tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
299
+ const descQualityBefore = scoreDescription(currentDescription, skillName);
300
+ descQualityBeforeScore = descQualityBefore.composite;
301
+ tui.done(
302
+ `Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag}, quality: ${descQualityBefore.composite})`,
303
+ );
285
304
 
286
305
  if (options.syncFirst) {
287
306
  tui.step(`Syncing source-truth telemetry${options.syncForce ? " (force)" : ""}...`);
@@ -1111,38 +1130,36 @@ Options:
1111
1130
  }
1112
1131
 
1113
1132
  if (!values.skill || !values["skill-path"]) {
1114
- console.error("[ERROR] --skill and --skill-path are required");
1115
- process.exit(1);
1133
+ throw new CLIError(
1134
+ "--skill and --skill-path are required",
1135
+ "MISSING_FLAG",
1136
+ "selftune evolve --skill <name> --skill-path <path>",
1137
+ );
1116
1138
  }
1117
1139
  if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
1118
- console.error("[ERROR] --sync-force requires --sync-first");
1119
- process.exit(1);
1140
+ throw new CLIError(
1141
+ "--sync-force requires --sync-first",
1142
+ "INVALID_FLAG",
1143
+ "Add --sync-first when using --sync-force",
1144
+ );
1120
1145
  }
1121
1146
 
1122
1147
  const { detectAgent } = await import("../utils/llm-call.js");
1123
1148
  const requestedAgent = values.agent;
1124
1149
  if (requestedAgent && !Bun.which(requestedAgent)) {
1125
- console.error(
1126
- JSON.stringify({
1127
- level: "error",
1128
- code: "agent_not_in_path",
1129
- message: `Agent CLI '${requestedAgent}' not found in PATH.`,
1130
- action: "Install it or omit --agent to use auto-detection.",
1131
- }),
1150
+ throw new CLIError(
1151
+ `Agent CLI '${requestedAgent}' not found in PATH.`,
1152
+ "AGENT_NOT_FOUND",
1153
+ "Install it or omit --agent to use auto-detection.",
1132
1154
  );
1133
- process.exit(1);
1134
1155
  }
1135
1156
  const agent = requestedAgent ?? detectAgent();
1136
1157
  if (!agent) {
1137
- console.error(
1138
- JSON.stringify({
1139
- level: "error",
1140
- code: "agent_not_found",
1141
- message: "No agent CLI (claude/codex/opencode) found in PATH.",
1142
- action: "Install Claude Code, Codex, or OpenCode.",
1143
- }),
1158
+ throw new CLIError(
1159
+ "No agent CLI (claude/codex/opencode) found in PATH.",
1160
+ "AGENT_NOT_FOUND",
1161
+ "Install Claude Code, Codex, or OpenCode.",
1144
1162
  );
1145
- process.exit(1);
1146
1163
  }
1147
1164
 
1148
1165
  // -------------------------------------------------------------------------
@@ -1150,20 +1167,27 @@ Options:
1150
1167
  // -------------------------------------------------------------------------
1151
1168
  const skillPath = values["skill-path"];
1152
1169
  if (!skillPath) {
1153
- console.error("[ERROR] --skill-path is required.");
1154
- process.exit(1);
1170
+ throw new CLIError(
1171
+ "--skill-path is required.",
1172
+ "MISSING_FLAG",
1173
+ "selftune evolve --skill <name> --skill-path <path>",
1174
+ );
1155
1175
  }
1156
1176
  if (!existsSync(skillPath)) {
1157
- console.error(`[ERROR] SKILL.md not found at: ${skillPath}`);
1158
- console.error(" Verify the --skill-path argument points to an existing SKILL.md file.");
1159
- process.exit(1);
1177
+ throw new CLIError(
1178
+ `SKILL.md not found at: ${skillPath}`,
1179
+ "FILE_NOT_FOUND",
1180
+ "Verify the --skill-path argument points to an existing SKILL.md file.",
1181
+ );
1160
1182
  }
1161
1183
 
1162
1184
  const evalSetPath = values["eval-set"];
1163
1185
  if (evalSetPath && !existsSync(evalSetPath)) {
1164
- console.error(`[ERROR] Eval set file not found at: ${evalSetPath}`);
1165
- console.error(" Verify the --eval-set argument points to an existing JSON file.");
1166
- process.exit(1);
1186
+ throw new CLIError(
1187
+ `Eval set file not found at: ${evalSetPath}`,
1188
+ "FILE_NOT_FOUND",
1189
+ "Verify the --eval-set argument points to an existing JSON file.",
1190
+ );
1167
1191
  }
1168
1192
 
1169
1193
  // If no eval-set provided, check that log files exist for auto-generation
@@ -1172,12 +1196,11 @@ Options:
1172
1196
  const hasSkillLog = querySkillUsageRecords(dbCheck).length > 0;
1173
1197
  const hasQueryLog = existsSync(QUERY_LOG);
1174
1198
  if (!hasSkillLog && !hasQueryLog) {
1175
- console.error("[ERROR] No eval set provided and no telemetry logs found.");
1176
- console.error(
1177
- " Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
1199
+ throw new CLIError(
1200
+ `No eval set provided and no telemetry logs found. Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`,
1201
+ "MISSING_DATA",
1202
+ "Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
1178
1203
  );
1179
- console.error(` Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`);
1180
- process.exit(1);
1181
1204
  }
1182
1205
  }
1183
1206
 
@@ -1244,11 +1267,22 @@ Options:
1244
1267
  rationale: result.proposal?.rationale ?? "",
1245
1268
  ...(result.skillVersion ? { version: result.skillVersion } : {}),
1246
1269
  dashboard_url: `http://localhost:3141/report/${encodeURIComponent(values.skill)}`,
1270
+ ...(result.descriptionQualityBefore != null
1271
+ ? { description_quality_before: result.descriptionQualityBefore }
1272
+ : {}),
1273
+ ...(result.descriptionQualityAfter != null
1274
+ ? { description_quality_after: result.descriptionQualityAfter }
1275
+ : {}),
1276
+ ...(!result.deployed
1277
+ ? {
1278
+ suggestions: buildUnblockSuggestions(result, values.skill),
1279
+ }
1280
+ : {}),
1247
1281
  };
1248
1282
  console.log(JSON.stringify(summary, null, 2));
1249
1283
  }
1250
1284
 
1251
- // Print human-readable status to stderr so users always see outcome
1285
+ // Print human-readable status to stderr so agents always see outcome + next steps
1252
1286
  if (!result.deployed) {
1253
1287
  console.error(`\n[NOT DEPLOYED] ${result.reason}`);
1254
1288
  if (result.validation && !result.validation.improved) {
@@ -1267,29 +1301,30 @@ Options:
1267
1301
  ` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
1268
1302
  );
1269
1303
  }
1270
- console.error(" Re-run with --verbose for full diagnostic output.");
1304
+ // Targeted suggestions based on specific failure reason
1305
+ const suggestions = buildUnblockSuggestions(result, values.skill);
1306
+ if (suggestions.length > 0) {
1307
+ console.error("\n Next steps:");
1308
+ for (const s of suggestions) {
1309
+ console.error(` → ${s}`);
1310
+ }
1311
+ }
1271
1312
  } else {
1272
1313
  console.error(`\n[DEPLOYED] ${result.reason}`);
1314
+ // Show quality improvement if available
1315
+ if (result.descriptionQualityBefore != null && result.descriptionQualityAfter != null) {
1316
+ const delta = result.descriptionQualityAfter - result.descriptionQualityBefore;
1317
+ if (delta !== 0) {
1318
+ console.error(
1319
+ ` Description quality: ${Math.round(result.descriptionQualityBefore * 100)}% → ${Math.round(result.descriptionQualityAfter * 100)}% (${delta >= 0 ? "+" : ""}${Math.round(delta * 100)}%)`,
1320
+ );
1321
+ }
1322
+ }
1273
1323
  }
1274
1324
 
1275
1325
  process.exit(result.deployed ? 0 : 1);
1276
1326
  }
1277
1327
 
1278
1328
  if (import.meta.main) {
1279
- cliMain().catch((err) => {
1280
- const message = err instanceof Error ? err.message : String(err);
1281
- const stack = err instanceof Error ? err.stack : undefined;
1282
- console.error(`[FATAL] ${message}`);
1283
- if (stack && process.env.SELFTUNE_VERBOSE === "1") {
1284
- console.error(stack);
1285
- }
1286
- console.error(
1287
- "\nTroubleshooting:\n" +
1288
- " - Verify --skill-path points to a valid SKILL.md file\n" +
1289
- " - Ensure eval data exists (run `selftune eval generate` first) or pass --eval-set\n" +
1290
- " - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
1291
- " - Re-run with --verbose for full diagnostic output",
1292
- );
1293
- process.exit(1);
1294
- });
1329
+ cliMain().catch(handleCLIError);
1295
1330
  }
@@ -13,6 +13,7 @@ import { parseArgs } from "node:util";
13
13
 
14
14
  import { updateContextAfterRollback } from "../memory/writer.js";
15
15
  import type { EvolutionAuditEntry } from "../types.js";
16
+ import { CLIError, handleCLIError } from "../utils/cli-error.js";
16
17
  import { replaceDescription } from "../utils/frontmatter.js";
17
18
  import { appendAuditEntry, getLastDeployedProposal, readAuditTrail } from "./audit.js";
18
19
 
@@ -233,8 +234,11 @@ Options:
233
234
  }
234
235
 
235
236
  if (!values.skill || !values["skill-path"]) {
236
- console.error("[ERROR] --skill and --skill-path are required");
237
- process.exit(1);
237
+ throw new CLIError(
238
+ "--skill and --skill-path are required",
239
+ "MISSING_FLAG",
240
+ "selftune evolve rollback --skill <name> --skill-path <path>",
241
+ );
238
242
  }
239
243
 
240
244
  const result = await rollback({
@@ -248,8 +252,5 @@ Options:
248
252
  }
249
253
 
250
254
  if (import.meta.main) {
251
- cliMain().catch((err) => {
252
- console.error(`[FATAL] ${err}`);
253
- process.exit(1);
254
- });
255
+ cliMain().catch(handleCLIError);
255
256
  }