selftune 0.2.14 → 0.2.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +16 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +12 -0
- package/apps/local-dashboard/dist/index.html +2 -2
- package/cli/selftune/analytics.ts +13 -11
- package/cli/selftune/badge/badge.ts +13 -9
- package/cli/selftune/canonical-export.ts +6 -6
- package/cli/selftune/contribute/contribute.ts +2 -1
- package/cli/selftune/cron/setup.ts +3 -1
- package/cli/selftune/dashboard-contract.ts +10 -0
- package/cli/selftune/dashboard.ts +10 -5
- package/cli/selftune/eval/baseline.ts +20 -30
- package/cli/selftune/eval/hooks-to-evals.ts +22 -12
- package/cli/selftune/eval/import-skillsbench.ts +21 -8
- package/cli/selftune/eval/unit-test-cli.ts +22 -11
- package/cli/selftune/evolution/description-quality.ts +224 -0
- package/cli/selftune/evolution/evolve-body.ts +17 -10
- package/cli/selftune/evolution/evolve.ts +70 -57
- package/cli/selftune/evolution/rollback.ts +7 -6
- package/cli/selftune/grading/auto-grade.ts +24 -22
- package/cli/selftune/grading/grade-session.ts +21 -17
- package/cli/selftune/hooks/auto-activate.ts +12 -3
- package/cli/selftune/hooks/prompt-log.ts +7 -1
- package/cli/selftune/index.ts +66 -69
- package/cli/selftune/ingestors/claude-replay.ts +29 -14
- package/cli/selftune/ingestors/codex-rollout.ts +6 -1
- package/cli/selftune/init.ts +14 -9
- package/cli/selftune/monitoring/watch.ts +32 -16
- package/cli/selftune/orchestrate.ts +18 -17
- package/cli/selftune/routes/skill-report.ts +17 -0
- package/cli/selftune/schedule.ts +23 -9
- package/cli/selftune/sync.ts +7 -3
- package/cli/selftune/types.ts +44 -10
- package/cli/selftune/utils/cli-error.ts +102 -0
- package/cli/selftune/workflows/workflows.ts +23 -17
- package/package.json +1 -1
- package/skill/SKILL.md +1 -1
- package/skill/Workflows/Evolve.md +4 -0
- package/skill/Workflows/Initialize.md +8 -8
- package/skill/settings_snippet.json +29 -6
- package/apps/local-dashboard/dist/assets/index-DIrdlu2_.js +0 -16
- package/apps/local-dashboard/dist/assets/vendor-ui-7xD7fNEU.js +0 -12
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* description-quality.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure, deterministic scoring function that evaluates the quality of a skill
|
|
5
|
+
* description for routing accuracy. No LLM calls — heuristic-only.
|
|
6
|
+
*
|
|
7
|
+
* Inspired by OpenAI's finding that "writing better skill descriptions improved
|
|
8
|
+
* routing accuracy more than any change to the underlying skill logic itself."
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { DescriptionQualityScore } from "../types.js";
|
|
12
|
+
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Constants
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
/** Optimal description length range (characters). */
|
|
18
|
+
const MIN_LENGTH = 40;
|
|
19
|
+
const MAX_LENGTH = 500;
|
|
20
|
+
const IDEAL_MIN = 80;
|
|
21
|
+
const IDEAL_MAX = 300;
|
|
22
|
+
|
|
23
|
+
/** Words that indicate trigger context — the description says *when* the skill fires. */
|
|
24
|
+
const TRIGGER_CONTEXT_WORDS = [
|
|
25
|
+
"when",
|
|
26
|
+
"if",
|
|
27
|
+
"after",
|
|
28
|
+
"before",
|
|
29
|
+
"during",
|
|
30
|
+
"while",
|
|
31
|
+
"upon",
|
|
32
|
+
"whenever",
|
|
33
|
+
"use when",
|
|
34
|
+
"trigger",
|
|
35
|
+
"activate",
|
|
36
|
+
];
|
|
37
|
+
|
|
38
|
+
/** Vague words that weaken routing precision. */
|
|
39
|
+
const VAGUE_WORDS = [
|
|
40
|
+
"various",
|
|
41
|
+
"general",
|
|
42
|
+
"misc",
|
|
43
|
+
"miscellaneous",
|
|
44
|
+
"stuff",
|
|
45
|
+
"things",
|
|
46
|
+
"etc",
|
|
47
|
+
"and more",
|
|
48
|
+
"and so on",
|
|
49
|
+
"other",
|
|
50
|
+
"multiple",
|
|
51
|
+
"several",
|
|
52
|
+
"many",
|
|
53
|
+
"some",
|
|
54
|
+
"certain",
|
|
55
|
+
"related",
|
|
56
|
+
];
|
|
57
|
+
|
|
58
|
+
/** Common filler phrases that add no routing signal. */
|
|
59
|
+
const FILLER_PHRASES = [
|
|
60
|
+
"this skill",
|
|
61
|
+
"a tool for",
|
|
62
|
+
"a tool that",
|
|
63
|
+
"helps with",
|
|
64
|
+
"is used for",
|
|
65
|
+
"can be used",
|
|
66
|
+
"is designed to",
|
|
67
|
+
];
|
|
68
|
+
|
|
69
|
+
/** Action verbs that signal concrete behavior. */
|
|
70
|
+
const ACTION_VERBS = [
|
|
71
|
+
"run",
|
|
72
|
+
"execute",
|
|
73
|
+
"analyze",
|
|
74
|
+
"generate",
|
|
75
|
+
"create",
|
|
76
|
+
"deploy",
|
|
77
|
+
"validate",
|
|
78
|
+
"check",
|
|
79
|
+
"build",
|
|
80
|
+
"test",
|
|
81
|
+
"scan",
|
|
82
|
+
"extract",
|
|
83
|
+
"transform",
|
|
84
|
+
"monitor",
|
|
85
|
+
"grade",
|
|
86
|
+
"evolve",
|
|
87
|
+
"sync",
|
|
88
|
+
"watch",
|
|
89
|
+
"review",
|
|
90
|
+
"audit",
|
|
91
|
+
"parse",
|
|
92
|
+
"format",
|
|
93
|
+
"search",
|
|
94
|
+
"fetch",
|
|
95
|
+
"publish",
|
|
96
|
+
"install",
|
|
97
|
+
"configure",
|
|
98
|
+
"diagnose",
|
|
99
|
+
"debug",
|
|
100
|
+
"fix",
|
|
101
|
+
"optimize",
|
|
102
|
+
"measure",
|
|
103
|
+
];
|
|
104
|
+
|
|
105
|
+
// ---------------------------------------------------------------------------
|
|
106
|
+
// Pre-compiled word-boundary patterns
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
|
|
109
|
+
/**
 * Compile a list of words or phrases into case-insensitive, word-boundary
 * anchored RegExp patterns.
 *
 * Regex metacharacters inside an entry are escaped so every entry is matched
 * literally (an unescaped "." would act as a wildcard and an unbalanced "("
 * would make the RegExp constructor throw). Any run of whitespace inside an
 * entry matches one or more whitespace characters in the scanned text.
 *
 * @param words - Literal words or phrases to compile.
 * @returns One RegExp per entry, in the same order as `words`.
 */
function compileWordPatterns(words: readonly string[]): RegExp[] {
  return words.map((word) => {
    // Escape first, then loosen whitespace: the escape set contains no
    // whitespace characters, so the two steps cannot interfere.
    const literal = word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const source = literal.replace(/\s+/g, "\\s+");
    return new RegExp(`\\b${source}\\b`, "i");
  });
}
|
|
113
|
+
|
|
114
|
+
const TRIGGER_PATTERNS = compileWordPatterns(TRIGGER_CONTEXT_WORDS);
|
|
115
|
+
const VAGUE_PATTERNS = compileWordPatterns(VAGUE_WORDS);
|
|
116
|
+
const ACTION_PATTERNS = compileWordPatterns(ACTION_VERBS);
|
|
117
|
+
|
|
118
|
+
/**
 * Count how many of the given patterns match the text at least once.
 *
 * Each pattern contributes at most 1 to the total, regardless of how many
 * times it occurs in `text`.
 *
 * @param text - Text to scan.
 * @param patterns - Patterns to test against `text`.
 * @returns Number of patterns with at least one match.
 */
function countWordMatches(text: string, patterns: readonly RegExp[]): number {
  let matched = 0;
  for (const pattern of patterns) {
    // RegExp.test() resumes from lastIndex for global/sticky patterns; reset it
    // so the count does not depend on earlier calls with the same RegExp.
    pattern.lastIndex = 0;
    if (pattern.test(text)) matched += 1;
  }
  return matched;
}
|
|
126
|
+
|
|
127
|
+
// ---------------------------------------------------------------------------
|
|
128
|
+
// Criterion scorers
|
|
129
|
+
// ---------------------------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
/**
 * Score description length on a 0.0-1.0 scale.
 *
 * Piecewise-linear in the character count `len`:
 * - below MIN_LENGTH: ramps from 0.0 up to 0.7
 * - MIN_LENGTH up to IDEAL_MIN: ramps from 0.7 up to 1.0
 * - IDEAL_MIN through IDEAL_MAX: 1.0
 * - above IDEAL_MAX through MAX_LENGTH: falls from 1.0 back to 0.7
 * - above MAX_LENGTH: keeps falling from 0.7, floored at 0.3
 *
 * @param description - Description text; only its length is inspected.
 * @returns Length score between 0.0 and 1.0.
 */
export function scoreLengthCriterion(description: string): number {
  const len = description.length;

  // Too short: scale so the ramp meets the 0.7 produced at MIN_LENGTH. Using a
  // plain len / MIN_LENGTH here would score a description one character short
  // of MIN_LENGTH higher than one that reaches it.
  if (len < MIN_LENGTH) return 0.7 * (len / MIN_LENGTH);

  if (len >= IDEAL_MIN && len <= IDEAL_MAX) return 1.0;

  // Acceptable but shorter than ideal.
  if (len < IDEAL_MIN) return 0.7 + 0.3 * ((len - MIN_LENGTH) / (IDEAL_MIN - MIN_LENGTH));

  // Acceptable but longer than ideal.
  if (len <= MAX_LENGTH) return 0.7 + 0.3 * ((MAX_LENGTH - len) / (MAX_LENGTH - IDEAL_MAX));

  // Over the maximum: continue the decline, never dropping below 0.3.
  return Math.max(0.3, 0.7 - 0.4 * ((len - MAX_LENGTH) / MAX_LENGTH));
}
|
|
140
|
+
|
|
141
|
+
/**
 * Score how much trigger context the description carries.
 *
 * Counts how many TRIGGER_PATTERNS match the lowercased description:
 * none scores 0.0, one scores 0.7, and each additional matching pattern adds
 * 0.15 up to a ceiling of 1.0.
 *
 * @param description - Description text to inspect.
 * @returns Trigger-context score between 0.0 and 1.0.
 */
export function scoreTriggerContextCriterion(description: string): number {
  const hits = countWordMatches(description.toLowerCase(), TRIGGER_PATTERNS);
  if (hits === 0) return 0.0;
  return hits === 1 ? 0.7 : Math.min(1.0, 0.7 + 0.15 * (hits - 1));
}
|
|
148
|
+
|
|
149
|
+
/**
 * Score the absence of vague wording; a higher score means less vagueness.
 *
 * Counts how many VAGUE_PATTERNS match the lowercased description:
 * none scores 1.0, one scores 0.6, and each additional matching pattern
 * subtracts 0.15 down to a floor of 0.1.
 *
 * @param description - Description text to inspect.
 * @returns Vagueness score between 0.1 and 1.0.
 */
export function scoreVaguenessCriterion(description: string): number {
  const hits = countWordMatches(description.toLowerCase(), VAGUE_PATTERNS);
  if (hits === 0) return 1.0;
  return hits === 1 ? 0.6 : Math.max(0.1, 0.6 - 0.15 * (hits - 1));
}
|
|
156
|
+
|
|
157
|
+
/**
 * Score whether the description names a concrete action without leaning on
 * filler phrases.
 *
 * A description that matches none of ACTION_PATTERNS scores a flat 0.2.
 * Otherwise the score starts at 1.0 and is reduced by 0.3 times the filler
 * ratio (distinct FILLER_PHRASES present, divided by the word count in tens
 * with a minimum divisor of 1), floored at 0.3.
 *
 * @param description - Description text to inspect.
 * @returns Specificity score between 0.2 and 1.0.
 */
export function scoreSpecificityCriterion(description: string): number {
  const normalized = description.toLowerCase();

  // No recognized action verb: filler is irrelevant, the score is fixed.
  if (!ACTION_PATTERNS.some((pattern) => pattern.test(normalized))) return 0.2;

  let fillerHits = 0;
  for (const phrase of FILLER_PHRASES) {
    if (normalized.includes(phrase)) fillerHits += 1;
  }
  if (fillerHits === 0) return 1.0;

  // Word count is taken from the original text, split on whitespace runs.
  const wordCount = description.split(/\s+/).length;
  const fillerRatio = fillerHits / Math.max(1, wordCount / 10);
  return Math.max(0.3, 1.0 - fillerRatio * 0.3);
}
|
|
169
|
+
|
|
170
|
+
/**
 * Score whether the description is more than the skill name restated.
 *
 * Description and name are compared after normalization: lowercased, stripped
 * of every character that is not a-z, 0-9 or whitespace, whitespace runs
 * collapsed to a single space, then trimmed. The name is compared in two
 * forms: with separators removed ("my-skill" -> "myskill") and with "-"/"_"
 * turned into spaces ("my-skill" -> "my skill").
 *
 * @param description - Description text to inspect.
 * @param skillName - Skill name to compare against; when omitted or empty the
 *   check is skipped.
 * @returns 1.0 when no name is given; 0.0 when the normalized description
 *   equals either form of the name; 0.3 when it is fewer than 10 characters
 *   longer than the separator-free name; 1.0 otherwise.
 */
export function scoreNotJustNameCriterion(description: string, skillName?: string): number {
  if (!skillName) return 1.0;

  // Strip punctuation before trimming and collapse whitespace, so decorations
  // such as "- My Skill -" or "My   Skill" still reduce to the bare name.
  const normalize = (text: string): string =>
    text
      .toLowerCase()
      .replace(/[^a-z0-9\s]/g, "")
      .replace(/\s+/g, " ")
      .trim();

  const descNorm = normalize(description);
  const nameNorm = normalize(skillName);
  const nameFromKebab = normalize(skillName.replace(/[-_]/g, " "));

  if (descNorm === nameNorm || descNorm === nameFromKebab) return 0.0;
  if (descNorm.length < nameNorm.length + 10) return 0.3;
  return 1.0;
}
|
|
187
|
+
|
|
188
|
+
// ---------------------------------------------------------------------------
|
|
189
|
+
// Main scoring function
|
|
190
|
+
// ---------------------------------------------------------------------------
|
|
191
|
+
|
|
192
|
+
/** Criterion weights — trigger context is weighted highest per OpenAI's finding. */
|
|
193
|
+
const WEIGHTS = {
|
|
194
|
+
length: 0.15,
|
|
195
|
+
trigger_context: 0.3,
|
|
196
|
+
vagueness: 0.2,
|
|
197
|
+
specificity: 0.2,
|
|
198
|
+
not_just_name: 0.15,
|
|
199
|
+
} as const;
|
|
200
|
+
|
|
201
|
+
/**
 * Compute the heuristic quality score for a skill description.
 *
 * Runs the five criterion scorers (length, trigger context, vagueness,
 * specificity, not-just-name) and combines them into a weighted sum using
 * WEIGHTS, rounded to three decimal places.
 *
 * @param description - Description text to score.
 * @param skillName - Optional skill name, forwarded to the not-just-name check.
 * @returns The rounded composite score together with the per-criterion scores.
 */
export function scoreDescription(description: string, skillName?: string): DescriptionQualityScore {
  const criteria = {
    length: scoreLengthCriterion(description),
    trigger_context: scoreTriggerContextCriterion(description),
    vagueness: scoreVaguenessCriterion(description),
    specificity: scoreSpecificityCriterion(description),
    not_just_name: scoreNotJustNameCriterion(description, skillName),
  };

  // Accumulate in WEIGHTS key order so the floating-point sum is reproducible.
  let weightedTotal = 0;
  for (const key of Object.keys(WEIGHTS) as (keyof typeof WEIGHTS)[]) {
    weightedTotal += criteria[key] * WEIGHTS[key];
  }

  return { composite: Number(weightedTotal.toFixed(3)), criteria };
}
|
|
@@ -25,6 +25,7 @@ import type {
|
|
|
25
25
|
QueryLogRecord,
|
|
26
26
|
SkillUsageRecord,
|
|
27
27
|
} from "../types.js";
|
|
28
|
+
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
28
29
|
import type { EffortLevel, SubagentCallOptions } from "../utils/llm-call.js";
|
|
29
30
|
import { callViaSubagent } from "../utils/llm-call.js";
|
|
30
31
|
import { appendAuditEntry } from "./audit.js";
|
|
@@ -710,8 +711,11 @@ Options:
|
|
|
710
711
|
}
|
|
711
712
|
|
|
712
713
|
if (!values.skill || !values["skill-path"]) {
|
|
713
|
-
|
|
714
|
-
|
|
714
|
+
throw new CLIError(
|
|
715
|
+
"--skill and --skill-path are required",
|
|
716
|
+
"MISSING_FLAG",
|
|
717
|
+
"selftune evolve body --skill <name> --skill-path <path>",
|
|
718
|
+
);
|
|
715
719
|
}
|
|
716
720
|
|
|
717
721
|
const { detectAgent } = await import("../utils/llm-call.js");
|
|
@@ -719,15 +723,21 @@ Options:
|
|
|
719
723
|
const studentAgent = values["student-agent"] ?? teacherAgent;
|
|
720
724
|
|
|
721
725
|
if (!teacherAgent) {
|
|
722
|
-
|
|
723
|
-
|
|
726
|
+
throw new CLIError(
|
|
727
|
+
"No agent CLI found. Install Claude Code, Codex, or OpenCode.",
|
|
728
|
+
"AGENT_NOT_FOUND",
|
|
729
|
+
"Install Claude Code, Codex, or OpenCode.",
|
|
730
|
+
);
|
|
724
731
|
}
|
|
725
732
|
|
|
726
733
|
// Parse target
|
|
727
734
|
const targetStr = values.target ?? "body";
|
|
728
735
|
if (targetStr !== "body" && targetStr !== "routing") {
|
|
729
|
-
|
|
730
|
-
|
|
736
|
+
throw new CLIError(
|
|
737
|
+
"--target must be 'body' or 'routing'",
|
|
738
|
+
"INVALID_FLAG",
|
|
739
|
+
"Use --target body or --target routing",
|
|
740
|
+
);
|
|
731
741
|
}
|
|
732
742
|
|
|
733
743
|
// Parse few-shot examples
|
|
@@ -763,8 +773,5 @@ Options:
|
|
|
763
773
|
}
|
|
764
774
|
|
|
765
775
|
if (import.meta.main) {
|
|
766
|
-
cliMain().catch(
|
|
767
|
-
console.error(`[FATAL] ${err}`);
|
|
768
|
-
process.exit(1);
|
|
769
|
-
});
|
|
776
|
+
cliMain().catch(handleCLIError);
|
|
770
777
|
}
|
|
@@ -36,10 +36,12 @@ import type {
|
|
|
36
36
|
SessionTelemetryRecord,
|
|
37
37
|
SkillUsageRecord,
|
|
38
38
|
} from "../types.js";
|
|
39
|
+
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
39
40
|
import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
|
|
40
41
|
import { createEvolveTUI } from "../utils/tui.js";
|
|
41
42
|
import { appendAuditEntry } from "./audit.js";
|
|
42
43
|
import { checkConstitution } from "./constitutional.js";
|
|
44
|
+
import { scoreDescription } from "./description-quality.js";
|
|
43
45
|
import { appendEvidenceEntry } from "./evidence.js";
|
|
44
46
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
45
47
|
import {
|
|
@@ -94,6 +96,8 @@ export interface EvolveResult {
|
|
|
94
96
|
baselineResult?: BaselineMeasurement;
|
|
95
97
|
gateValidation?: ValidationResult;
|
|
96
98
|
sync_result?: SyncResult;
|
|
99
|
+
descriptionQualityBefore?: number;
|
|
100
|
+
descriptionQualityAfter?: number;
|
|
97
101
|
}
|
|
98
102
|
|
|
99
103
|
/**
|
|
@@ -247,16 +251,26 @@ export async function evolve(
|
|
|
247
251
|
);
|
|
248
252
|
|
|
249
253
|
/** Stamp every return with pipeline stats so callers always get them. */
|
|
250
|
-
const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult =>
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
254
|
+
const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => {
|
|
255
|
+
const descQualityAfterScore = r.proposal
|
|
256
|
+
? scoreDescription(r.proposal.proposed_description, options.skillName).composite
|
|
257
|
+
: undefined;
|
|
258
|
+
return {
|
|
259
|
+
...r,
|
|
260
|
+
llmCallCount,
|
|
261
|
+
elapsedMs: Date.now() - pipelineStart,
|
|
262
|
+
...(syncResult ? { sync_result: syncResult } : {}),
|
|
263
|
+
...(descQualityBeforeScore != null
|
|
264
|
+
? { descriptionQualityBefore: descQualityBeforeScore }
|
|
265
|
+
: {}),
|
|
266
|
+
...(descQualityAfterScore != null ? { descriptionQualityAfter: descQualityAfterScore } : {}),
|
|
267
|
+
};
|
|
268
|
+
};
|
|
256
269
|
|
|
257
|
-
// Hoisted so catch block can preserve partial results on error
|
|
270
|
+
// Hoisted so catch block and withStats can preserve partial results on error
|
|
258
271
|
let lastProposal: EvolutionProposal | null = null;
|
|
259
272
|
let lastValidation: ValidationResult | null = null;
|
|
273
|
+
let descQualityBeforeScore: number | undefined;
|
|
260
274
|
|
|
261
275
|
try {
|
|
262
276
|
// -----------------------------------------------------------------------
|
|
@@ -281,7 +295,11 @@ export async function evolve(
|
|
|
281
295
|
const versionTag = skillVersion ? `, v${skillVersion}` : "";
|
|
282
296
|
const createdAuditDetails = (message: string) =>
|
|
283
297
|
`original_description:${rawContent}\n${message}`;
|
|
284
|
-
|
|
298
|
+
const descQualityBefore = scoreDescription(currentDescription, skillName);
|
|
299
|
+
descQualityBeforeScore = descQualityBefore.composite;
|
|
300
|
+
tui.done(
|
|
301
|
+
`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag}, quality: ${descQualityBefore.composite})`,
|
|
302
|
+
);
|
|
285
303
|
|
|
286
304
|
if (options.syncFirst) {
|
|
287
305
|
tui.step(`Syncing source-truth telemetry${options.syncForce ? " (force)" : ""}...`);
|
|
@@ -1111,38 +1129,36 @@ Options:
|
|
|
1111
1129
|
}
|
|
1112
1130
|
|
|
1113
1131
|
if (!values.skill || !values["skill-path"]) {
|
|
1114
|
-
|
|
1115
|
-
|
|
1132
|
+
throw new CLIError(
|
|
1133
|
+
"--skill and --skill-path are required",
|
|
1134
|
+
"MISSING_FLAG",
|
|
1135
|
+
"selftune evolve --skill <name> --skill-path <path>",
|
|
1136
|
+
);
|
|
1116
1137
|
}
|
|
1117
1138
|
if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
|
|
1118
|
-
|
|
1119
|
-
|
|
1139
|
+
throw new CLIError(
|
|
1140
|
+
"--sync-force requires --sync-first",
|
|
1141
|
+
"INVALID_FLAG",
|
|
1142
|
+
"Add --sync-first when using --sync-force",
|
|
1143
|
+
);
|
|
1120
1144
|
}
|
|
1121
1145
|
|
|
1122
1146
|
const { detectAgent } = await import("../utils/llm-call.js");
|
|
1123
1147
|
const requestedAgent = values.agent;
|
|
1124
1148
|
if (requestedAgent && !Bun.which(requestedAgent)) {
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
message: `Agent CLI '${requestedAgent}' not found in PATH.`,
|
|
1130
|
-
action: "Install it or omit --agent to use auto-detection.",
|
|
1131
|
-
}),
|
|
1149
|
+
throw new CLIError(
|
|
1150
|
+
`Agent CLI '${requestedAgent}' not found in PATH.`,
|
|
1151
|
+
"AGENT_NOT_FOUND",
|
|
1152
|
+
"Install it or omit --agent to use auto-detection.",
|
|
1132
1153
|
);
|
|
1133
|
-
process.exit(1);
|
|
1134
1154
|
}
|
|
1135
1155
|
const agent = requestedAgent ?? detectAgent();
|
|
1136
1156
|
if (!agent) {
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
message: "No agent CLI (claude/codex/opencode) found in PATH.",
|
|
1142
|
-
action: "Install Claude Code, Codex, or OpenCode.",
|
|
1143
|
-
}),
|
|
1157
|
+
throw new CLIError(
|
|
1158
|
+
"No agent CLI (claude/codex/opencode) found in PATH.",
|
|
1159
|
+
"AGENT_NOT_FOUND",
|
|
1160
|
+
"Install Claude Code, Codex, or OpenCode.",
|
|
1144
1161
|
);
|
|
1145
|
-
process.exit(1);
|
|
1146
1162
|
}
|
|
1147
1163
|
|
|
1148
1164
|
// -------------------------------------------------------------------------
|
|
@@ -1150,20 +1166,27 @@ Options:
|
|
|
1150
1166
|
// -------------------------------------------------------------------------
|
|
1151
1167
|
const skillPath = values["skill-path"];
|
|
1152
1168
|
if (!skillPath) {
|
|
1153
|
-
|
|
1154
|
-
|
|
1169
|
+
throw new CLIError(
|
|
1170
|
+
"--skill-path is required.",
|
|
1171
|
+
"MISSING_FLAG",
|
|
1172
|
+
"selftune evolve --skill <name> --skill-path <path>",
|
|
1173
|
+
);
|
|
1155
1174
|
}
|
|
1156
1175
|
if (!existsSync(skillPath)) {
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1176
|
+
throw new CLIError(
|
|
1177
|
+
`SKILL.md not found at: ${skillPath}`,
|
|
1178
|
+
"FILE_NOT_FOUND",
|
|
1179
|
+
"Verify the --skill-path argument points to an existing SKILL.md file.",
|
|
1180
|
+
);
|
|
1160
1181
|
}
|
|
1161
1182
|
|
|
1162
1183
|
const evalSetPath = values["eval-set"];
|
|
1163
1184
|
if (evalSetPath && !existsSync(evalSetPath)) {
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1185
|
+
throw new CLIError(
|
|
1186
|
+
`Eval set file not found at: ${evalSetPath}`,
|
|
1187
|
+
"FILE_NOT_FOUND",
|
|
1188
|
+
"Verify the --eval-set argument points to an existing JSON file.",
|
|
1189
|
+
);
|
|
1167
1190
|
}
|
|
1168
1191
|
|
|
1169
1192
|
// If no eval-set provided, check that log files exist for auto-generation
|
|
@@ -1172,12 +1195,11 @@ Options:
|
|
|
1172
1195
|
const hasSkillLog = querySkillUsageRecords(dbCheck).length > 0;
|
|
1173
1196
|
const hasQueryLog = existsSync(QUERY_LOG);
|
|
1174
1197
|
if (!hasSkillLog && !hasQueryLog) {
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
"
|
|
1198
|
+
throw new CLIError(
|
|
1199
|
+
`No eval set provided and no telemetry logs found. Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`,
|
|
1200
|
+
"MISSING_DATA",
|
|
1201
|
+
"Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
|
|
1178
1202
|
);
|
|
1179
|
-
console.error(` Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`);
|
|
1180
|
-
process.exit(1);
|
|
1181
1203
|
}
|
|
1182
1204
|
}
|
|
1183
1205
|
|
|
@@ -1244,6 +1266,12 @@ Options:
|
|
|
1244
1266
|
rationale: result.proposal?.rationale ?? "",
|
|
1245
1267
|
...(result.skillVersion ? { version: result.skillVersion } : {}),
|
|
1246
1268
|
dashboard_url: `http://localhost:3141/report/${encodeURIComponent(values.skill)}`,
|
|
1269
|
+
...(result.descriptionQualityBefore != null
|
|
1270
|
+
? { description_quality_before: result.descriptionQualityBefore }
|
|
1271
|
+
: {}),
|
|
1272
|
+
...(result.descriptionQualityAfter != null
|
|
1273
|
+
? { description_quality_after: result.descriptionQualityAfter }
|
|
1274
|
+
: {}),
|
|
1247
1275
|
};
|
|
1248
1276
|
console.log(JSON.stringify(summary, null, 2));
|
|
1249
1277
|
}
|
|
@@ -1276,20 +1304,5 @@ Options:
|
|
|
1276
1304
|
}
|
|
1277
1305
|
|
|
1278
1306
|
if (import.meta.main) {
|
|
1279
|
-
cliMain().catch(
|
|
1280
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
1281
|
-
const stack = err instanceof Error ? err.stack : undefined;
|
|
1282
|
-
console.error(`[FATAL] ${message}`);
|
|
1283
|
-
if (stack && process.env.SELFTUNE_VERBOSE === "1") {
|
|
1284
|
-
console.error(stack);
|
|
1285
|
-
}
|
|
1286
|
-
console.error(
|
|
1287
|
-
"\nTroubleshooting:\n" +
|
|
1288
|
-
" - Verify --skill-path points to a valid SKILL.md file\n" +
|
|
1289
|
-
" - Ensure eval data exists (run `selftune eval generate` first) or pass --eval-set\n" +
|
|
1290
|
-
" - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
|
|
1291
|
-
" - Re-run with --verbose for full diagnostic output",
|
|
1292
|
-
);
|
|
1293
|
-
process.exit(1);
|
|
1294
|
-
});
|
|
1307
|
+
cliMain().catch(handleCLIError);
|
|
1295
1308
|
}
|
|
@@ -13,6 +13,7 @@ import { parseArgs } from "node:util";
|
|
|
13
13
|
|
|
14
14
|
import { updateContextAfterRollback } from "../memory/writer.js";
|
|
15
15
|
import type { EvolutionAuditEntry } from "../types.js";
|
|
16
|
+
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
16
17
|
import { replaceDescription } from "../utils/frontmatter.js";
|
|
17
18
|
import { appendAuditEntry, getLastDeployedProposal, readAuditTrail } from "./audit.js";
|
|
18
19
|
|
|
@@ -233,8 +234,11 @@ Options:
|
|
|
233
234
|
}
|
|
234
235
|
|
|
235
236
|
if (!values.skill || !values["skill-path"]) {
|
|
236
|
-
|
|
237
|
-
|
|
237
|
+
throw new CLIError(
|
|
238
|
+
"--skill and --skill-path are required",
|
|
239
|
+
"MISSING_FLAG",
|
|
240
|
+
"selftune evolve rollback --skill <name> --skill-path <path>",
|
|
241
|
+
);
|
|
238
242
|
}
|
|
239
243
|
|
|
240
244
|
const result = await rollback({
|
|
@@ -248,8 +252,5 @@ Options:
|
|
|
248
252
|
}
|
|
249
253
|
|
|
250
254
|
if (import.meta.main) {
|
|
251
|
-
cliMain().catch(
|
|
252
|
-
console.error(`[FATAL] ${err}`);
|
|
253
|
-
process.exit(1);
|
|
254
|
-
});
|
|
255
|
+
cliMain().catch(handleCLIError);
|
|
255
256
|
}
|
|
@@ -17,6 +17,7 @@ import { AGENT_CANDIDATES, TELEMETRY_LOG } from "../constants.js";
|
|
|
17
17
|
import { getDb } from "../localdb/db.js";
|
|
18
18
|
import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
|
|
19
19
|
import type { GradingResult, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
|
|
20
|
+
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
20
21
|
import { detectAgent as _detectAgent } from "../utils/llm-call.js";
|
|
21
22
|
import { readExcerpt } from "../utils/transcript.js";
|
|
22
23
|
import {
|
|
@@ -62,8 +63,7 @@ Options:
|
|
|
62
63
|
|
|
63
64
|
const skill = values.skill;
|
|
64
65
|
if (!skill) {
|
|
65
|
-
|
|
66
|
-
process.exit(1);
|
|
66
|
+
throw new CLIError("--skill is required", "MISSING_FLAG", "selftune auto-grade --skill <name>");
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
// --- Determine agent ---
|
|
@@ -71,10 +71,11 @@ Options:
|
|
|
71
71
|
const validAgents = [...AGENT_CANDIDATES];
|
|
72
72
|
if (values.agent) {
|
|
73
73
|
if (!validAgents.includes(values.agent)) {
|
|
74
|
-
|
|
75
|
-
`
|
|
74
|
+
throw new CLIError(
|
|
75
|
+
`Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
|
|
76
|
+
"INVALID_FLAG",
|
|
77
|
+
`selftune auto-grade --skill <name> --agent ${validAgents[0]}`,
|
|
76
78
|
);
|
|
77
|
-
process.exit(1);
|
|
78
79
|
}
|
|
79
80
|
agent = values.agent;
|
|
80
81
|
} else {
|
|
@@ -82,11 +83,11 @@ Options:
|
|
|
82
83
|
}
|
|
83
84
|
|
|
84
85
|
if (!agent) {
|
|
85
|
-
|
|
86
|
-
`
|
|
87
|
-
|
|
86
|
+
throw new CLIError(
|
|
87
|
+
`No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH`,
|
|
88
|
+
"AGENT_NOT_FOUND",
|
|
89
|
+
"Install one of the supported agent CLIs",
|
|
88
90
|
);
|
|
89
|
-
process.exit(1);
|
|
90
91
|
}
|
|
91
92
|
|
|
92
93
|
console.error(`[INFO] Auto-grade via agent: ${agent}`);
|
|
@@ -104,21 +105,22 @@ Options:
|
|
|
104
105
|
sessionId = values["session-id"];
|
|
105
106
|
const resolved = resolveSessionById(telRecords, sessionId);
|
|
106
107
|
if (!resolved) {
|
|
107
|
-
|
|
108
|
-
`
|
|
109
|
-
|
|
108
|
+
throw new CLIError(
|
|
109
|
+
`Session '${sessionId}' not found in telemetry or recoverable transcript data`,
|
|
110
|
+
"MISSING_DATA",
|
|
111
|
+
"Check the session ID or omit --session-id to auto-select the latest matching session",
|
|
110
112
|
);
|
|
111
|
-
process.exit(1);
|
|
112
113
|
}
|
|
113
114
|
telemetry = resolved.telemetry;
|
|
114
115
|
transcriptPath = resolved.transcriptPath;
|
|
115
116
|
} else {
|
|
116
117
|
const resolved = resolveLatestSessionForSkill(telRecords, skillUsageRecords, skill);
|
|
117
118
|
if (!resolved) {
|
|
118
|
-
|
|
119
|
-
`
|
|
119
|
+
throw new CLIError(
|
|
120
|
+
`No session found for skill '${skill}'`,
|
|
121
|
+
"MISSING_DATA",
|
|
122
|
+
"Run the skill first, or pass --session-id",
|
|
120
123
|
);
|
|
121
|
-
process.exit(1);
|
|
122
124
|
}
|
|
123
125
|
telemetry = resolved.telemetry;
|
|
124
126
|
sessionId = resolved.sessionId ?? "unknown";
|
|
@@ -159,8 +161,11 @@ Options:
|
|
|
159
161
|
agent,
|
|
160
162
|
});
|
|
161
163
|
} catch (err) {
|
|
162
|
-
|
|
163
|
-
|
|
164
|
+
throw new CLIError(
|
|
165
|
+
`Grading failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
166
|
+
"OPERATION_FAILED",
|
|
167
|
+
"Check agent availability and try again",
|
|
168
|
+
);
|
|
164
169
|
}
|
|
165
170
|
|
|
166
171
|
const outputPath = values.output ?? buildDefaultGradingOutputPath(sessionId);
|
|
@@ -193,8 +198,5 @@ Options:
|
|
|
193
198
|
|
|
194
199
|
// Guard: only run when invoked directly
|
|
195
200
|
if (import.meta.main) {
|
|
196
|
-
cliMain().catch(
|
|
197
|
-
console.error(`[FATAL] ${err}`);
|
|
198
|
-
process.exit(1);
|
|
199
|
-
});
|
|
201
|
+
cliMain().catch(handleCLIError);
|
|
200
202
|
}
|