selftune 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +156 -0
- package/.claude/agents/evolution-reviewer.md +180 -0
- package/.claude/agents/integration-guide.md +212 -0
- package/.claude/agents/pattern-analyst.md +160 -0
- package/CHANGELOG.md +46 -1
- package/README.md +105 -257
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +99 -0
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +103 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +1049 -0
- package/cli/selftune/dashboard.ts +43 -156
- package/cli/selftune/eval/baseline.ts +248 -0
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +101 -16
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +586 -0
- package/cli/selftune/evolution/evolve.ts +825 -116
- package/cli/selftune/evolution/extract-patterns.ts +105 -16
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +21 -4
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +513 -42
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +415 -48
- package/cli/selftune/ingestors/claude-replay.ts +377 -0
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +376 -16
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +90 -16
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +59 -33
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +525 -5
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +103 -19
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +307 -26
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +28 -11
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +180 -33
- package/skill/Workflows/AutoActivation.md +145 -0
- package/skill/Workflows/Badge.md +124 -0
- package/skill/Workflows/Baseline.md +144 -0
- package/skill/Workflows/Composability.md +107 -0
- package/skill/Workflows/Contribute.md +94 -0
- package/skill/Workflows/Cron.md +132 -0
- package/skill/Workflows/Dashboard.md +214 -0
- package/skill/Workflows/Doctor.md +63 -14
- package/skill/Workflows/Evals.md +110 -18
- package/skill/Workflows/EvolutionMemory.md +154 -0
- package/skill/Workflows/Evolve.md +181 -21
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +117 -0
- package/skill/Workflows/Ingest.md +142 -21
- package/skill/Workflows/Initialize.md +91 -23
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +91 -0
- package/skill/Workflows/Rollback.md +23 -4
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +150 -0
- package/skill/Workflows/Watch.md +33 -1
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
- package/dashboard/index.html +0 -1119
|
@@ -6,25 +6,50 @@
|
|
|
6
6
|
* logic and comprehensive audit tracking.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
9
|
+
import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
10
10
|
import { parseArgs } from "node:util";
|
|
11
11
|
|
|
12
|
-
import { QUERY_LOG, SKILL_LOG } from "../constants.js";
|
|
12
|
+
import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
13
|
+
import type { BaselineMeasurement } from "../eval/baseline.js";
|
|
14
|
+
import { measureBaseline } from "../eval/baseline.js";
|
|
13
15
|
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
16
|
+
import { readGradingResultsForSkill } from "../grading/results.js";
|
|
17
|
+
import { updateContextAfterEvolve } from "../memory/writer.js";
|
|
18
|
+
import type { SyncResult } from "../sync.js";
|
|
14
19
|
import type {
|
|
15
20
|
EvalEntry,
|
|
16
21
|
EvalPassRate,
|
|
17
22
|
EvolutionAuditEntry,
|
|
23
|
+
EvolutionEvidenceEntry,
|
|
18
24
|
EvolutionProposal,
|
|
25
|
+
EvolveResultSummary,
|
|
26
|
+
FailurePattern,
|
|
27
|
+
GradingResult,
|
|
28
|
+
ParetoCandidate,
|
|
19
29
|
QueryLogRecord,
|
|
30
|
+
SessionTelemetryRecord,
|
|
20
31
|
SkillUsageRecord,
|
|
21
32
|
} from "../types.js";
|
|
33
|
+
import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
|
|
22
34
|
import { readJsonl } from "../utils/jsonl.js";
|
|
35
|
+
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
36
|
+
import { createEvolveTUI } from "../utils/tui.js";
|
|
23
37
|
import { appendAuditEntry } from "./audit.js";
|
|
38
|
+
import { appendEvidenceEntry } from "./evidence.js";
|
|
24
39
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
25
|
-
import {
|
|
40
|
+
import {
|
|
41
|
+
computeInvocationScores,
|
|
42
|
+
computeParetoFrontier,
|
|
43
|
+
computeTokenEfficiencyScore,
|
|
44
|
+
selectFromFrontier,
|
|
45
|
+
} from "./pareto.js";
|
|
46
|
+
import { generateMultipleProposals, generateProposal } from "./propose-description.js";
|
|
26
47
|
import type { ValidationResult } from "./validate-proposal.js";
|
|
27
|
-
import {
|
|
48
|
+
import {
|
|
49
|
+
TRIGGER_CHECK_BATCH_SIZE,
|
|
50
|
+
VALIDATION_RUNS,
|
|
51
|
+
validateProposal,
|
|
52
|
+
} from "./validate-proposal.js";
|
|
28
53
|
|
|
29
54
|
// ---------------------------------------------------------------------------
|
|
30
55
|
// Types
|
|
@@ -38,6 +63,18 @@ export interface EvolveOptions {
|
|
|
38
63
|
dryRun: boolean;
|
|
39
64
|
confidenceThreshold: number; // default 0.6
|
|
40
65
|
maxIterations: number; // default 3
|
|
66
|
+
gradingResults?: GradingResult[];
|
|
67
|
+
paretoEnabled?: boolean;
|
|
68
|
+
candidateCount?: number;
|
|
69
|
+
tokenEfficiencyEnabled?: boolean;
|
|
70
|
+
telemetryRecords?: SessionTelemetryRecord[];
|
|
71
|
+
withBaseline?: boolean;
|
|
72
|
+
validationModel?: string;
|
|
73
|
+
cheapLoop?: boolean;
|
|
74
|
+
gateModel?: string;
|
|
75
|
+
proposalModel?: string;
|
|
76
|
+
syncFirst?: boolean;
|
|
77
|
+
syncForce?: boolean;
|
|
41
78
|
}
|
|
42
79
|
|
|
43
80
|
export interface EvolveResult {
|
|
@@ -46,6 +83,12 @@ export interface EvolveResult {
|
|
|
46
83
|
deployed: boolean;
|
|
47
84
|
auditEntries: EvolutionAuditEntry[];
|
|
48
85
|
reason: string;
|
|
86
|
+
skillVersion?: string;
|
|
87
|
+
llmCallCount: number;
|
|
88
|
+
elapsedMs: number;
|
|
89
|
+
baselineResult?: BaselineMeasurement;
|
|
90
|
+
gateValidation?: ValidationResult;
|
|
91
|
+
sync_result?: SyncResult;
|
|
49
92
|
}
|
|
50
93
|
|
|
51
94
|
/**
|
|
@@ -53,11 +96,22 @@ export interface EvolveResult {
|
|
|
53
96
|
* imports are used. Pass overrides in tests to avoid mock.module().
|
|
54
97
|
*/
|
|
55
98
|
export interface EvolveDeps {
|
|
56
|
-
extractFailurePatterns?:
|
|
99
|
+
extractFailurePatterns?: (
|
|
100
|
+
evalEntries: EvalEntry[],
|
|
101
|
+
skillUsage: SkillUsageRecord[],
|
|
102
|
+
skillName: string,
|
|
103
|
+
gradingResults?: GradingResult[],
|
|
104
|
+
) => FailurePattern[];
|
|
57
105
|
generateProposal?: typeof import("./propose-description.js").generateProposal;
|
|
58
106
|
validateProposal?: typeof import("./validate-proposal.js").validateProposal;
|
|
107
|
+
gateValidateProposal?: typeof import("./validate-proposal.js").validateProposal;
|
|
59
108
|
appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
|
|
109
|
+
appendEvidenceEntry?: typeof import("./evidence.js").appendEvidenceEntry;
|
|
60
110
|
buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
|
|
111
|
+
updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
|
|
112
|
+
measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
|
|
113
|
+
readSkillUsageLog?: () => SkillUsageRecord[];
|
|
114
|
+
syncSources?: typeof import("../sync.js").syncSources;
|
|
61
115
|
}
|
|
62
116
|
|
|
63
117
|
// ---------------------------------------------------------------------------
|
|
@@ -69,16 +123,45 @@ function createAuditEntry(
|
|
|
69
123
|
action: EvolutionAuditEntry["action"],
|
|
70
124
|
details: string,
|
|
71
125
|
evalSnapshot?: EvalPassRate,
|
|
126
|
+
skillName?: string,
|
|
72
127
|
): EvolutionAuditEntry {
|
|
73
128
|
return {
|
|
74
129
|
timestamp: new Date().toISOString(),
|
|
75
130
|
proposal_id: proposalId,
|
|
76
131
|
action,
|
|
77
132
|
details,
|
|
133
|
+
...(skillName ? { skill_name: skillName } : {}),
|
|
78
134
|
...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
|
|
79
135
|
};
|
|
80
136
|
}
|
|
81
137
|
|
|
138
|
+
// ---------------------------------------------------------------------------
|
|
139
|
+
// Diff helper
|
|
140
|
+
// ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
/**
 * Render a minimal, ANSI-colored, position-by-position diff of two texts.
 *
 * Both inputs are split on "\n" and compared row by row at the same index.
 * Rows whose old and new lines are strictly equal are omitted. For every
 * other row the old line (when one exists at that index) is emitted in red
 * with a "- " prefix, followed by the new line (when one exists) in green
 * with a "+ " prefix.
 *
 * Note: the comparison is purely positional, so a line inserted near the
 * top makes every following row show up as changed.
 *
 * @param oldText - The previous text.
 * @param newText - The updated text.
 * @returns The colored diff rows joined with "\n"; an empty string when no
 *   row differs.
 */
function formatSimpleDiff(oldText: string, newText: string): string {
  const before = oldText.split("\n");
  const after = newText.split("\n");
  const rowCount = Math.max(before.length, after.length);

  return Array.from({ length: rowCount }, (_, row) => row)
    .flatMap((row) => {
      const removed = before[row];
      const added = after[row];
      const rendered: string[] = [];
      if (removed === added) {
        return rendered;
      }
      // A side is undefined when that text has fewer lines than the other.
      if (removed !== undefined) {
        rendered.push(`\x1b[31m- ${removed}\x1b[0m`);
      }
      if (added !== undefined) {
        rendered.push(`\x1b[32m+ ${added}\x1b[0m`);
      }
      return rendered;
    })
    .join("\n");
}
|
|
164
|
+
|
|
82
165
|
// ---------------------------------------------------------------------------
|
|
83
166
|
// Main orchestrator
|
|
84
167
|
// ---------------------------------------------------------------------------
|
|
@@ -90,14 +173,27 @@ export async function evolve(
|
|
|
90
173
|
const { skillName, skillPath, evalSetPath, agent, dryRun, confidenceThreshold, maxIterations } =
|
|
91
174
|
options;
|
|
92
175
|
|
|
176
|
+
// Apply cheap-loop defaults: cheap models for proposal/validation, expensive for gate
|
|
177
|
+
if (options.cheapLoop) {
|
|
178
|
+
if (!options.proposalModel) options.proposalModel = "haiku";
|
|
179
|
+
if (!options.validationModel) options.validationModel = "haiku";
|
|
180
|
+
if (!options.gateModel) options.gateModel = "sonnet";
|
|
181
|
+
}
|
|
182
|
+
|
|
93
183
|
// Resolve injectable dependencies with real-import fallbacks
|
|
94
184
|
const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
|
|
95
185
|
const _generateProposal = _deps.generateProposal ?? generateProposal;
|
|
96
186
|
const _validateProposal = _deps.validateProposal ?? validateProposal;
|
|
187
|
+
const _gateValidateProposal = _deps.gateValidateProposal ?? validateProposal;
|
|
97
188
|
const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
|
|
189
|
+
const _appendEvidenceEntry = _deps.appendEvidenceEntry ?? appendEvidenceEntry;
|
|
98
190
|
const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
|
|
191
|
+
const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
|
|
192
|
+
const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
|
|
193
|
+
const _readSkillUsageLog = _deps.readSkillUsageLog ?? (() => readEffectiveSkillUsageRecords());
|
|
99
194
|
|
|
100
195
|
const auditEntries: EvolutionAuditEntry[] = [];
|
|
196
|
+
let syncResult: SyncResult | undefined;
|
|
101
197
|
|
|
102
198
|
function recordAudit(
|
|
103
199
|
proposalId: string,
|
|
@@ -105,7 +201,7 @@ export async function evolve(
|
|
|
105
201
|
details: string,
|
|
106
202
|
evalSnapshot?: EvalPassRate,
|
|
107
203
|
): void {
|
|
108
|
-
const entry = createAuditEntry(proposalId, action, details, evalSnapshot);
|
|
204
|
+
const entry = createAuditEntry(proposalId, action, details, evalSnapshot, skillName);
|
|
109
205
|
auditEntries.push(entry);
|
|
110
206
|
try {
|
|
111
207
|
_appendAuditEntry(entry);
|
|
@@ -114,58 +210,171 @@ export async function evolve(
|
|
|
114
210
|
}
|
|
115
211
|
}
|
|
116
212
|
|
|
213
|
+
function recordEvidence(entry: EvolutionEvidenceEntry): void {
|
|
214
|
+
try {
|
|
215
|
+
_appendEvidenceEntry(entry);
|
|
216
|
+
} catch {
|
|
217
|
+
// Fail-open: evidence should not block the pipeline
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
const pipelineStart = Date.now();
|
|
222
|
+
let llmCallCount = 0;
|
|
223
|
+
const tui = createEvolveTUI({ skillName, model: options.proposalModel ?? "(default)" });
|
|
224
|
+
const finishTui = () =>
|
|
225
|
+
tui.finish(
|
|
226
|
+
`${llmCallCount} LLM calls \u00b7 ${((Date.now() - pipelineStart) / 1000).toFixed(1)}s elapsed`,
|
|
227
|
+
);
|
|
228
|
+
|
|
229
|
+
/** Stamp every return with pipeline stats so callers always get them. */
|
|
230
|
+
const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => ({
|
|
231
|
+
...r,
|
|
232
|
+
llmCallCount,
|
|
233
|
+
elapsedMs: Date.now() - pipelineStart,
|
|
234
|
+
...(syncResult ? { sync_result: syncResult } : {}),
|
|
235
|
+
});
|
|
236
|
+
|
|
237
|
+
// Hoisted so catch block can preserve partial results on error
|
|
238
|
+
let lastProposal: EvolutionProposal | null = null;
|
|
239
|
+
let lastValidation: ValidationResult | null = null;
|
|
240
|
+
|
|
117
241
|
try {
|
|
118
242
|
// -----------------------------------------------------------------------
|
|
119
243
|
// Step 1: Read current SKILL.md
|
|
120
244
|
// -----------------------------------------------------------------------
|
|
121
245
|
if (!existsSync(skillPath)) {
|
|
122
|
-
|
|
246
|
+
tui.fail(`SKILL.md not found at ${skillPath}`);
|
|
247
|
+
finishTui();
|
|
248
|
+
return withStats({
|
|
123
249
|
proposal: null,
|
|
124
250
|
validation: null,
|
|
125
251
|
deployed: false,
|
|
126
252
|
auditEntries,
|
|
127
253
|
reason: `SKILL.md not found at ${skillPath}`,
|
|
128
|
-
};
|
|
254
|
+
});
|
|
129
255
|
}
|
|
130
256
|
|
|
131
|
-
const
|
|
257
|
+
const rawContent = readFileSync(skillPath, "utf-8");
|
|
258
|
+
const frontmatter = parseFrontmatter(rawContent);
|
|
259
|
+
const currentDescription = frontmatter.description || rawContent;
|
|
260
|
+
const skillVersion = frontmatter.version || undefined;
|
|
261
|
+
const versionTag = skillVersion ? `, v${skillVersion}` : "";
|
|
262
|
+
const createdAuditDetails = (message: string) =>
|
|
263
|
+
`original_description:${rawContent}\n${message}`;
|
|
264
|
+
tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
|
|
265
|
+
|
|
266
|
+
if (options.syncFirst) {
|
|
267
|
+
tui.step(`Syncing source-truth telemetry${options.syncForce ? " (force)" : ""}...`);
|
|
268
|
+
const { createDefaultSyncOptions, syncSources: realSyncSources } = await import("../sync.js");
|
|
269
|
+
const syncRunner = _deps.syncSources ?? realSyncSources;
|
|
270
|
+
syncResult = syncRunner(
|
|
271
|
+
createDefaultSyncOptions({
|
|
272
|
+
force: options.syncForce ?? false,
|
|
273
|
+
}),
|
|
274
|
+
);
|
|
275
|
+
const sourceSynced = Object.values(syncResult.sources).reduce(
|
|
276
|
+
(sum, source) => sum + source.synced,
|
|
277
|
+
0,
|
|
278
|
+
);
|
|
279
|
+
tui.done(
|
|
280
|
+
`Source sync complete (${sourceSynced} source sessions, ${syncResult.repair.repaired_records} repaired records)`,
|
|
281
|
+
);
|
|
282
|
+
}
|
|
132
283
|
|
|
133
284
|
// -----------------------------------------------------------------------
|
|
134
285
|
// Step 2: Load eval set
|
|
135
286
|
// -----------------------------------------------------------------------
|
|
287
|
+
const skillUsage = _readSkillUsageLog();
|
|
136
288
|
let evalSet: EvalEntry[];
|
|
137
289
|
|
|
138
290
|
if (evalSetPath && existsSync(evalSetPath)) {
|
|
139
|
-
|
|
140
|
-
|
|
291
|
+
try {
|
|
292
|
+
const raw = readFileSync(evalSetPath, "utf-8");
|
|
293
|
+
evalSet = JSON.parse(raw) as EvalEntry[];
|
|
294
|
+
} catch (parseErr) {
|
|
295
|
+
const msg = parseErr instanceof Error ? parseErr.message : String(parseErr);
|
|
296
|
+
tui.fail(`Failed to load eval set from ${evalSetPath}: ${msg}`);
|
|
297
|
+
finishTui();
|
|
298
|
+
return withStats({
|
|
299
|
+
proposal: null,
|
|
300
|
+
validation: null,
|
|
301
|
+
deployed: false,
|
|
302
|
+
auditEntries,
|
|
303
|
+
reason: `Failed to load eval set: ${msg}`,
|
|
304
|
+
});
|
|
305
|
+
}
|
|
306
|
+
if (!Array.isArray(evalSet)) {
|
|
307
|
+
tui.fail(`Eval set at ${evalSetPath} is not an array`);
|
|
308
|
+
finishTui();
|
|
309
|
+
return withStats({
|
|
310
|
+
proposal: null,
|
|
311
|
+
validation: null,
|
|
312
|
+
deployed: false,
|
|
313
|
+
auditEntries,
|
|
314
|
+
reason: `Eval set at ${evalSetPath} is not a JSON array`,
|
|
315
|
+
});
|
|
316
|
+
}
|
|
141
317
|
} else {
|
|
142
318
|
// Build from logs
|
|
143
|
-
const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
|
|
144
319
|
const queryRecords = readJsonl<QueryLogRecord>(QUERY_LOG);
|
|
145
|
-
evalSet = _buildEvalSet(
|
|
320
|
+
evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
|
|
146
321
|
}
|
|
147
322
|
|
|
323
|
+
const posCount = evalSet.filter((e) => e.should_trigger).length;
|
|
324
|
+
const negCount = evalSet.filter((e) => !e.should_trigger).length;
|
|
325
|
+
tui.done(`Loaded eval set (${evalSet.length} entries: ${posCount}+, ${negCount}-)`);
|
|
326
|
+
|
|
148
327
|
// -----------------------------------------------------------------------
|
|
149
328
|
// Step 3: Load skill usage records
|
|
150
329
|
// -----------------------------------------------------------------------
|
|
151
|
-
const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);
|
|
152
|
-
|
|
153
330
|
// -----------------------------------------------------------------------
|
|
154
331
|
// Step 4: Extract failure patterns
|
|
155
332
|
// -----------------------------------------------------------------------
|
|
156
|
-
const failurePatterns = _extractFailurePatterns(
|
|
333
|
+
const failurePatterns = _extractFailurePatterns(
|
|
334
|
+
evalSet,
|
|
335
|
+
skillUsage,
|
|
336
|
+
skillName,
|
|
337
|
+
options.gradingResults,
|
|
338
|
+
);
|
|
339
|
+
|
|
340
|
+
const totalMissed = failurePatterns.reduce((sum, p) => sum + p.missed_queries.length, 0);
|
|
341
|
+
tui.done(
|
|
342
|
+
`Extracted ${failurePatterns.length} failure pattern(s) (${totalMissed} missed queries)`,
|
|
343
|
+
);
|
|
157
344
|
|
|
158
345
|
// -----------------------------------------------------------------------
|
|
159
|
-
// Step 5:
|
|
346
|
+
// Step 5: Cold-start bootstrap or early exit if no patterns
|
|
160
347
|
// -----------------------------------------------------------------------
|
|
161
348
|
if (failurePatterns.length === 0) {
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
349
|
+
// Cold-start: if the eval set has positive entries that the skill should
|
|
350
|
+
// match but there are zero skill usage records, treat the positive eval
|
|
351
|
+
// entries themselves as "missed queries" — they ARE the failure signal.
|
|
352
|
+
const positiveEvals = evalSet.filter((e) => e.should_trigger);
|
|
353
|
+
const hasSkillUsageHistory = skillUsage.some((record) => record.skill_name === skillName);
|
|
354
|
+
if (positiveEvals.length > 0 && !hasSkillUsageHistory) {
|
|
355
|
+
const coldStartPattern: FailurePattern = {
|
|
356
|
+
pattern_id: `fp-${skillName}-coldstart`,
|
|
357
|
+
skill_name: skillName,
|
|
358
|
+
invocation_type: "implicit",
|
|
359
|
+
missed_queries: positiveEvals.map((e) => e.query),
|
|
360
|
+
frequency: positiveEvals.length,
|
|
361
|
+
sample_sessions: [],
|
|
362
|
+
extracted_at: new Date().toISOString(),
|
|
363
|
+
};
|
|
364
|
+
failurePatterns.push(coldStartPattern);
|
|
365
|
+
tui.done(
|
|
366
|
+
`Cold-start bootstrap: ${positiveEvals.length} positive eval entries used as missed queries`,
|
|
367
|
+
);
|
|
368
|
+
} else {
|
|
369
|
+
finishTui();
|
|
370
|
+
return withStats({
|
|
371
|
+
proposal: null,
|
|
372
|
+
validation: null,
|
|
373
|
+
deployed: false,
|
|
374
|
+
auditEntries,
|
|
375
|
+
reason: "No failure patterns found",
|
|
376
|
+
});
|
|
377
|
+
}
|
|
169
378
|
}
|
|
170
379
|
|
|
171
380
|
// -----------------------------------------------------------------------
|
|
@@ -174,156 +383,502 @@ export async function evolve(
|
|
|
174
383
|
const missedQueries = failurePatterns.flatMap((p) => p.missed_queries);
|
|
175
384
|
|
|
176
385
|
// -----------------------------------------------------------------------
|
|
177
|
-
// Steps 7-12:
|
|
386
|
+
// Steps 7-12: Proposal generation and validation
|
|
178
387
|
// -----------------------------------------------------------------------
|
|
179
|
-
let lastProposal: EvolutionProposal | null = null;
|
|
180
|
-
let lastValidation: ValidationResult | null = null;
|
|
181
|
-
let feedbackReason = "";
|
|
182
388
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
389
|
+
// -----------------------------------------------------------------------
|
|
390
|
+
// Pareto multi-candidate path
|
|
391
|
+
// -----------------------------------------------------------------------
|
|
392
|
+
const paretoEnabled = options.paretoEnabled ?? false;
|
|
393
|
+
const candidateCount = options.candidateCount ?? 3;
|
|
394
|
+
const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
|
|
395
|
+
const telemetryRecords =
|
|
396
|
+
options.telemetryRecords ??
|
|
397
|
+
(tokenEfficiencyEnabled ? readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG) : undefined);
|
|
398
|
+
|
|
399
|
+
// Compute token efficiency score if enabled and telemetry is available
|
|
400
|
+
let tokenEffScore: number | undefined;
|
|
401
|
+
if (tokenEfficiencyEnabled && telemetryRecords && telemetryRecords.length > 0) {
|
|
402
|
+
tokenEffScore = computeTokenEfficiencyScore(skillName, telemetryRecords);
|
|
403
|
+
recordAudit(
|
|
404
|
+
"system",
|
|
405
|
+
"created",
|
|
406
|
+
`Token efficiency score for ${skillName}: ${tokenEffScore.toFixed(3)}`,
|
|
407
|
+
);
|
|
408
|
+
}
|
|
188
409
|
|
|
189
|
-
|
|
410
|
+
if (paretoEnabled && candidateCount > 1) {
|
|
411
|
+
// Generate N candidates in parallel
|
|
412
|
+
const candidates = await generateMultipleProposals(
|
|
190
413
|
currentDescription,
|
|
191
414
|
failurePatterns,
|
|
192
|
-
|
|
415
|
+
missedQueries,
|
|
193
416
|
skillName,
|
|
194
417
|
skillPath,
|
|
195
418
|
agent,
|
|
419
|
+
candidateCount,
|
|
420
|
+
options.proposalModel,
|
|
196
421
|
);
|
|
197
422
|
|
|
198
|
-
|
|
423
|
+
// Filter by confidence threshold
|
|
424
|
+
const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
|
|
199
425
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
426
|
+
if (viableCandidates.length === 0) {
|
|
427
|
+
finishTui();
|
|
428
|
+
return withStats({
|
|
429
|
+
proposal: candidates[0] ?? null,
|
|
430
|
+
validation: null,
|
|
431
|
+
deployed: false,
|
|
432
|
+
auditEntries,
|
|
433
|
+
reason: `No candidates met confidence threshold ${confidenceThreshold}`,
|
|
434
|
+
});
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
// Validate each candidate
|
|
438
|
+
const paretoCandidates: ParetoCandidate[] = [];
|
|
439
|
+
for (const proposal of viableCandidates) {
|
|
440
|
+
recordAudit(
|
|
441
|
+
proposal.proposal_id,
|
|
442
|
+
"created",
|
|
443
|
+
createdAuditDetails(`Pareto candidate for ${skillName}`),
|
|
444
|
+
);
|
|
445
|
+
recordEvidence({
|
|
446
|
+
timestamp: new Date().toISOString(),
|
|
447
|
+
proposal_id: proposal.proposal_id,
|
|
448
|
+
skill_name: skillName,
|
|
449
|
+
skill_path: skillPath,
|
|
450
|
+
target: "description",
|
|
451
|
+
stage: "created",
|
|
452
|
+
rationale: proposal.rationale,
|
|
453
|
+
confidence: proposal.confidence,
|
|
454
|
+
details: `Pareto candidate for ${skillName}`,
|
|
455
|
+
original_text: proposal.original_description,
|
|
456
|
+
proposed_text: proposal.proposed_description,
|
|
457
|
+
eval_set: evalSet,
|
|
458
|
+
});
|
|
206
459
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
460
|
+
const validation = await _validateProposal(
|
|
461
|
+
proposal,
|
|
462
|
+
evalSet,
|
|
463
|
+
agent,
|
|
464
|
+
options.validationModel,
|
|
465
|
+
);
|
|
210
466
|
recordAudit(
|
|
211
467
|
proposal.proposal_id,
|
|
212
|
-
"
|
|
213
|
-
`
|
|
468
|
+
"validated",
|
|
469
|
+
`Pareto validation: improved=${validation.improved}`,
|
|
214
470
|
);
|
|
471
|
+
recordEvidence({
|
|
472
|
+
timestamp: new Date().toISOString(),
|
|
473
|
+
proposal_id: proposal.proposal_id,
|
|
474
|
+
skill_name: skillName,
|
|
475
|
+
skill_path: skillPath,
|
|
476
|
+
target: "description",
|
|
477
|
+
stage: "validated",
|
|
478
|
+
rationale: proposal.rationale,
|
|
479
|
+
confidence: proposal.confidence,
|
|
480
|
+
details: `Pareto validation: improved=${validation.improved}`,
|
|
481
|
+
validation: {
|
|
482
|
+
improved: validation.improved,
|
|
483
|
+
before_pass_rate: validation.before_pass_rate,
|
|
484
|
+
after_pass_rate: validation.after_pass_rate,
|
|
485
|
+
net_change: validation.net_change,
|
|
486
|
+
regressions: validation.regressions,
|
|
487
|
+
new_passes: validation.new_passes,
|
|
488
|
+
per_entry_results: validation.per_entry_results,
|
|
489
|
+
},
|
|
490
|
+
});
|
|
215
491
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
proposal
|
|
220
|
-
validation
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
492
|
+
if (validation.improved && validation.per_entry_results) {
|
|
493
|
+
const invocationScores = computeInvocationScores(validation.per_entry_results);
|
|
494
|
+
const candidate: ParetoCandidate = {
|
|
495
|
+
proposal,
|
|
496
|
+
validation,
|
|
497
|
+
invocation_scores: invocationScores,
|
|
498
|
+
dominates_on: [],
|
|
224
499
|
};
|
|
500
|
+
if (tokenEffScore !== undefined) {
|
|
501
|
+
candidate.token_efficiency_score = tokenEffScore;
|
|
502
|
+
}
|
|
503
|
+
paretoCandidates.push(candidate);
|
|
225
504
|
}
|
|
505
|
+
}
|
|
226
506
|
|
|
227
|
-
|
|
507
|
+
if (paretoCandidates.length === 0) {
|
|
508
|
+
finishTui();
|
|
509
|
+
return withStats({
|
|
510
|
+
proposal: viableCandidates[0],
|
|
511
|
+
validation: null,
|
|
512
|
+
deployed: false,
|
|
513
|
+
auditEntries,
|
|
514
|
+
reason: "No Pareto candidates improved validation",
|
|
515
|
+
});
|
|
228
516
|
}
|
|
229
517
|
|
|
230
|
-
//
|
|
231
|
-
const
|
|
232
|
-
|
|
518
|
+
// Compute Pareto frontier
|
|
519
|
+
const frontier = computeParetoFrontier(paretoCandidates);
|
|
520
|
+
const { best } = selectFromFrontier(frontier);
|
|
233
521
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
522
|
+
lastProposal = best.proposal;
|
|
523
|
+
lastValidation = best.validation;
|
|
524
|
+
|
|
525
|
+
// Skip the standard retry loop — we already have our result
|
|
526
|
+
} else {
|
|
527
|
+
// Standard single-candidate retry loop
|
|
528
|
+
let feedbackReason = "";
|
|
529
|
+
|
|
530
|
+
for (let iteration = 0; iteration < maxIterations; iteration++) {
|
|
531
|
+
// Step 7: Generate proposal
|
|
532
|
+
const effectiveMissedQueries = feedbackReason
|
|
533
|
+
? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
|
|
534
|
+
: missedQueries;
|
|
247
535
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
536
|
+
tui.step(`Generating proposal (iteration ${iteration + 1}/${maxIterations})...`);
|
|
537
|
+
const proposal = await _generateProposal(
|
|
538
|
+
currentDescription,
|
|
539
|
+
failurePatterns,
|
|
540
|
+
effectiveMissedQueries,
|
|
541
|
+
skillName,
|
|
542
|
+
skillPath,
|
|
543
|
+
agent,
|
|
544
|
+
options.proposalModel,
|
|
545
|
+
);
|
|
546
|
+
llmCallCount++;
|
|
547
|
+
|
|
548
|
+
lastProposal = proposal;
|
|
549
|
+
tui.done(`Proposal generated (conf: ${proposal.confidence.toFixed(2)})`);
|
|
550
|
+
|
|
551
|
+
// Step 8: Audit "created"
|
|
251
552
|
recordAudit(
|
|
252
553
|
proposal.proposal_id,
|
|
253
|
-
"
|
|
254
|
-
`
|
|
554
|
+
"created",
|
|
555
|
+
createdAuditDetails(`Proposal created for ${skillName} (iteration ${iteration + 1})`),
|
|
255
556
|
);
|
|
557
|
+
recordEvidence({
|
|
558
|
+
timestamp: new Date().toISOString(),
|
|
559
|
+
proposal_id: proposal.proposal_id,
|
|
560
|
+
skill_name: skillName,
|
|
561
|
+
skill_path: skillPath,
|
|
562
|
+
target: "description",
|
|
563
|
+
stage: "created",
|
|
564
|
+
rationale: proposal.rationale,
|
|
565
|
+
confidence: proposal.confidence,
|
|
566
|
+
details: `Proposal created for ${skillName} (iteration ${iteration + 1})`,
|
|
567
|
+
original_text: proposal.original_description,
|
|
568
|
+
proposed_text: proposal.proposed_description,
|
|
569
|
+
eval_set: evalSet,
|
|
570
|
+
});
|
|
256
571
|
|
|
257
|
-
//
|
|
258
|
-
if (
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
572
|
+
// Step 9: Check confidence threshold
|
|
573
|
+
if (proposal.confidence < confidenceThreshold) {
|
|
574
|
+
feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
|
|
575
|
+
recordAudit(
|
|
576
|
+
proposal.proposal_id,
|
|
577
|
+
"rejected",
|
|
578
|
+
`Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
579
|
+
);
|
|
580
|
+
recordEvidence({
|
|
581
|
+
timestamp: new Date().toISOString(),
|
|
582
|
+
proposal_id: proposal.proposal_id,
|
|
583
|
+
skill_name: skillName,
|
|
584
|
+
skill_path: skillPath,
|
|
585
|
+
target: "description",
|
|
586
|
+
stage: "rejected",
|
|
587
|
+
rationale: proposal.rationale,
|
|
588
|
+
confidence: proposal.confidence,
|
|
589
|
+
details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
590
|
+
});
|
|
591
|
+
|
|
592
|
+
// If this is the last iteration, return early with rejection
|
|
593
|
+
if (iteration === maxIterations - 1) {
|
|
594
|
+
finishTui();
|
|
595
|
+
return withStats({
|
|
596
|
+
proposal: lastProposal,
|
|
597
|
+
validation: null,
|
|
598
|
+
deployed: false,
|
|
599
|
+
auditEntries,
|
|
600
|
+
reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
601
|
+
});
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
continue;
|
|
266
605
|
}
|
|
267
606
|
|
|
268
|
-
|
|
269
|
-
|
|
607
|
+
// Step 10: Validate against eval set
|
|
608
|
+
const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
|
|
609
|
+
tui.step(
|
|
610
|
+
`Validating ${evalSet.length} entries (${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
|
|
611
|
+
);
|
|
612
|
+
const validation = await _validateProposal(
|
|
613
|
+
proposal,
|
|
614
|
+
evalSet,
|
|
615
|
+
agent,
|
|
616
|
+
options.validationModel,
|
|
617
|
+
);
|
|
618
|
+
lastValidation = validation;
|
|
619
|
+
llmCallCount += batchCount * 2 * VALIDATION_RUNS;
|
|
620
|
+
tui.done(
|
|
621
|
+
`Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
|
|
622
|
+
);
|
|
623
|
+
|
|
624
|
+
// Step 11: Audit "validated"
|
|
625
|
+
const evalSnapshot: EvalPassRate = {
|
|
626
|
+
total: evalSet.length,
|
|
627
|
+
passed: Math.round(validation.after_pass_rate * evalSet.length),
|
|
628
|
+
failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
|
|
629
|
+
pass_rate: validation.after_pass_rate,
|
|
630
|
+
};
|
|
631
|
+
recordAudit(
|
|
632
|
+
proposal.proposal_id,
|
|
633
|
+
"validated",
|
|
634
|
+
`Validation complete: improved=${validation.improved}`,
|
|
635
|
+
evalSnapshot,
|
|
636
|
+
);
|
|
637
|
+
recordEvidence({
|
|
638
|
+
timestamp: new Date().toISOString(),
|
|
639
|
+
proposal_id: proposal.proposal_id,
|
|
640
|
+
skill_name: skillName,
|
|
641
|
+
skill_path: skillPath,
|
|
642
|
+
target: "description",
|
|
643
|
+
stage: "validated",
|
|
644
|
+
rationale: proposal.rationale,
|
|
645
|
+
confidence: proposal.confidence,
|
|
646
|
+
details: `Validation complete: improved=${validation.improved}`,
|
|
647
|
+
validation: {
|
|
648
|
+
improved: validation.improved,
|
|
649
|
+
before_pass_rate: validation.before_pass_rate,
|
|
650
|
+
after_pass_rate: validation.after_pass_rate,
|
|
651
|
+
net_change: validation.net_change,
|
|
652
|
+
regressions: validation.regressions,
|
|
653
|
+
new_passes: validation.new_passes,
|
|
654
|
+
per_entry_results: validation.per_entry_results,
|
|
655
|
+
},
|
|
656
|
+
});
|
|
270
657
|
|
|
271
|
-
|
|
272
|
-
|
|
658
|
+
// Step 12: Check validation result
|
|
659
|
+
if (!validation.improved) {
|
|
660
|
+
feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
|
|
661
|
+
recordAudit(
|
|
662
|
+
proposal.proposal_id,
|
|
663
|
+
"rejected",
|
|
664
|
+
`Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
665
|
+
);
|
|
666
|
+
recordEvidence({
|
|
667
|
+
timestamp: new Date().toISOString(),
|
|
668
|
+
proposal_id: proposal.proposal_id,
|
|
669
|
+
skill_name: skillName,
|
|
670
|
+
skill_path: skillPath,
|
|
671
|
+
target: "description",
|
|
672
|
+
stage: "rejected",
|
|
673
|
+
rationale: proposal.rationale,
|
|
674
|
+
confidence: proposal.confidence,
|
|
675
|
+
details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
676
|
+
validation: {
|
|
677
|
+
improved: validation.improved,
|
|
678
|
+
before_pass_rate: validation.before_pass_rate,
|
|
679
|
+
after_pass_rate: validation.after_pass_rate,
|
|
680
|
+
net_change: validation.net_change,
|
|
681
|
+
regressions: validation.regressions,
|
|
682
|
+
new_passes: validation.new_passes,
|
|
683
|
+
per_entry_results: validation.per_entry_results,
|
|
684
|
+
},
|
|
685
|
+
});
|
|
686
|
+
|
|
687
|
+
// If this is the last iteration, return with rejection
|
|
688
|
+
if (iteration === maxIterations - 1) {
|
|
689
|
+
finishTui();
|
|
690
|
+
return withStats({
|
|
691
|
+
proposal: lastProposal,
|
|
692
|
+
validation: lastValidation,
|
|
693
|
+
deployed: false,
|
|
694
|
+
auditEntries,
|
|
695
|
+
reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
|
|
696
|
+
});
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
continue;
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
// Validation passed - break out of retry loop
|
|
703
|
+
break;
|
|
704
|
+
}
|
|
273
705
|
}
|
|
274
706
|
|
|
275
707
|
// -----------------------------------------------------------------------
|
|
276
708
|
// Step 13: Dry run check
|
|
277
709
|
// -----------------------------------------------------------------------
|
|
278
710
|
if (dryRun) {
|
|
279
|
-
|
|
711
|
+
finishTui();
|
|
712
|
+
return withStats({
|
|
280
713
|
proposal: lastProposal,
|
|
281
714
|
validation: lastValidation,
|
|
282
715
|
deployed: false,
|
|
283
716
|
auditEntries,
|
|
284
717
|
reason: "Dry run - proposal validated but not deployed",
|
|
285
|
-
};
|
|
718
|
+
});
|
|
286
719
|
}
|
|
287
720
|
|
|
288
721
|
// -----------------------------------------------------------------------
|
|
289
|
-
// Step
|
|
722
|
+
// Step 13b: Baseline gate (--with-baseline)
|
|
290
723
|
// -----------------------------------------------------------------------
|
|
291
|
-
|
|
724
|
+
let baselineResult: BaselineMeasurement | undefined;
|
|
725
|
+
if (options.withBaseline && lastProposal) {
|
|
726
|
+
tui.step("Measuring baseline...");
|
|
727
|
+
baselineResult = await _measureBaseline({
|
|
728
|
+
evalSet,
|
|
729
|
+
skillDescription: currentDescription,
|
|
730
|
+
skillName,
|
|
731
|
+
agent,
|
|
732
|
+
modelFlag: options.validationModel,
|
|
733
|
+
});
|
|
734
|
+
tui.done(
|
|
735
|
+
`Baseline: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
|
|
736
|
+
);
|
|
737
|
+
|
|
292
738
|
recordAudit(
|
|
293
739
|
lastProposal.proposal_id,
|
|
294
|
-
"
|
|
295
|
-
`
|
|
296
|
-
lastValidation
|
|
297
|
-
? {
|
|
298
|
-
total: evalSet.length,
|
|
299
|
-
passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
300
|
-
failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
301
|
-
pass_rate: lastValidation.after_pass_rate,
|
|
302
|
-
}
|
|
303
|
-
: undefined,
|
|
740
|
+
"validated",
|
|
741
|
+
`Baseline check: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
|
|
304
742
|
);
|
|
743
|
+
|
|
744
|
+
if (!baselineResult.adds_value) {
|
|
745
|
+
finishTui();
|
|
746
|
+
return withStats({
|
|
747
|
+
proposal: lastProposal,
|
|
748
|
+
validation: lastValidation,
|
|
749
|
+
deployed: false,
|
|
750
|
+
auditEntries,
|
|
751
|
+
reason: `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
|
|
752
|
+
baselineResult,
|
|
753
|
+
});
|
|
754
|
+
}
|
|
305
755
|
}
|
|
306
756
|
|
|
307
757
|
// -----------------------------------------------------------------------
|
|
308
|
-
// Step
|
|
758
|
+
// Step 13c: Gate validation (--cheap-loop / --gate-model)
|
|
309
759
|
// -----------------------------------------------------------------------
|
|
310
|
-
|
|
760
|
+
let gateValidation: ValidationResult | undefined;
|
|
761
|
+
if (options.gateModel && lastProposal && lastValidation?.improved) {
|
|
762
|
+
tui.step(`Gate validation (${options.gateModel})...`);
|
|
763
|
+
gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
|
|
764
|
+
tui.done(
|
|
765
|
+
`Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
766
|
+
);
|
|
767
|
+
|
|
768
|
+
recordAudit(
|
|
769
|
+
lastProposal.proposal_id,
|
|
770
|
+
"validated",
|
|
771
|
+
`Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
772
|
+
);
|
|
773
|
+
|
|
774
|
+
if (!gateValidation.improved) {
|
|
775
|
+
finishTui();
|
|
776
|
+
return withStats({
|
|
777
|
+
proposal: lastProposal,
|
|
778
|
+
validation: lastValidation,
|
|
779
|
+
deployed: false,
|
|
780
|
+
auditEntries,
|
|
781
|
+
reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
782
|
+
gateValidation,
|
|
783
|
+
...(baselineResult ? { baselineResult } : {}),
|
|
784
|
+
});
|
|
785
|
+
}
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
// -----------------------------------------------------------------------
|
|
789
|
+
// Step 14: Deploy — write updated description to SKILL.md
|
|
790
|
+
// -----------------------------------------------------------------------
|
|
791
|
+
if (lastProposal && lastValidation?.improved) {
|
|
792
|
+
// Create backup before modifying
|
|
793
|
+
const backupPath = `${skillPath}.bak`;
|
|
794
|
+
copyFileSync(skillPath, backupPath);
|
|
795
|
+
tui.done(`Backup created at ${backupPath}`);
|
|
796
|
+
|
|
797
|
+
// Replace the frontmatter description
|
|
798
|
+
const updatedContent = replaceFrontmatterDescription(
|
|
799
|
+
rawContent,
|
|
800
|
+
lastProposal.proposed_description,
|
|
801
|
+
);
|
|
802
|
+
writeFileSync(skillPath, updatedContent, "utf-8");
|
|
803
|
+
tui.done(`Deployed updated description to ${skillPath}`);
|
|
804
|
+
|
|
805
|
+
// Show what changed in the skill file
|
|
806
|
+
const diffOutput = formatSimpleDiff(rawContent, updatedContent);
|
|
807
|
+
if (diffOutput) {
|
|
808
|
+
console.error("\n--- Skill description diff ---");
|
|
809
|
+
console.error(diffOutput);
|
|
810
|
+
console.error("------------------------------\n");
|
|
811
|
+
}
|
|
812
|
+
|
|
813
|
+
recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, {
|
|
814
|
+
total: evalSet.length,
|
|
815
|
+
passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
816
|
+
failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
817
|
+
pass_rate: lastValidation.after_pass_rate,
|
|
818
|
+
});
|
|
819
|
+
recordEvidence({
|
|
820
|
+
timestamp: new Date().toISOString(),
|
|
821
|
+
proposal_id: lastProposal.proposal_id,
|
|
822
|
+
skill_name: skillName,
|
|
823
|
+
skill_path: skillPath,
|
|
824
|
+
target: "description",
|
|
825
|
+
stage: "deployed",
|
|
826
|
+
rationale: lastProposal.rationale,
|
|
827
|
+
confidence: lastProposal.confidence,
|
|
828
|
+
details: `Deployed proposal for ${skillName}`,
|
|
829
|
+
validation: {
|
|
830
|
+
improved: lastValidation.improved,
|
|
831
|
+
before_pass_rate: lastValidation.before_pass_rate,
|
|
832
|
+
after_pass_rate: lastValidation.after_pass_rate,
|
|
833
|
+
net_change: lastValidation.net_change,
|
|
834
|
+
regressions: lastValidation.regressions,
|
|
835
|
+
new_passes: lastValidation.new_passes,
|
|
836
|
+
per_entry_results: lastValidation.per_entry_results,
|
|
837
|
+
},
|
|
838
|
+
});
|
|
839
|
+
}
|
|
840
|
+
|
|
841
|
+
// -----------------------------------------------------------------------
|
|
842
|
+
// Step 15: Update evolution memory
|
|
843
|
+
// -----------------------------------------------------------------------
|
|
844
|
+
const wasDeployed = lastProposal !== null && lastValidation !== null && lastValidation.improved;
|
|
845
|
+
const evolveResult: EvolveResult = withStats({
|
|
311
846
|
proposal: lastProposal,
|
|
312
847
|
validation: lastValidation,
|
|
313
|
-
deployed:
|
|
848
|
+
deployed: wasDeployed,
|
|
314
849
|
auditEntries,
|
|
315
|
-
reason:
|
|
316
|
-
|
|
850
|
+
reason: wasDeployed
|
|
851
|
+
? "Evolution deployed successfully"
|
|
852
|
+
: "Evolution not deployed: proposal or validation missing",
|
|
853
|
+
...(skillVersion ? { skillVersion } : {}),
|
|
854
|
+
...(baselineResult ? { baselineResult } : {}),
|
|
855
|
+
...(gateValidation ? { gateValidation } : {}),
|
|
856
|
+
});
|
|
857
|
+
|
|
858
|
+
if (lastProposal) {
|
|
859
|
+
try {
|
|
860
|
+
_updateContextAfterEvolve(skillName, lastProposal, evolveResult);
|
|
861
|
+
} catch {
|
|
862
|
+
// Memory writes should never fail the main operation
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
// -----------------------------------------------------------------------
|
|
867
|
+
// Step 16: Return complete result
|
|
868
|
+
// -----------------------------------------------------------------------
|
|
869
|
+
finishTui();
|
|
870
|
+
return evolveResult;
|
|
317
871
|
} catch (error) {
|
|
318
|
-
|
|
872
|
+
tui.destroy();
|
|
873
|
+
// Robust error handling: preserve partial results so callers can inspect progress
|
|
319
874
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
320
|
-
return {
|
|
321
|
-
proposal:
|
|
322
|
-
validation:
|
|
875
|
+
return withStats({
|
|
876
|
+
proposal: lastProposal,
|
|
877
|
+
validation: lastValidation,
|
|
323
878
|
deployed: false,
|
|
324
879
|
auditEntries,
|
|
325
880
|
reason: `Error during evolution: ${errorMessage}`,
|
|
326
|
-
};
|
|
881
|
+
});
|
|
327
882
|
}
|
|
328
883
|
}
|
|
329
884
|
|
|
@@ -341,6 +896,18 @@ export async function cliMain(): Promise<void> {
|
|
|
341
896
|
"dry-run": { type: "boolean", default: false },
|
|
342
897
|
confidence: { type: "string", default: "0.6" },
|
|
343
898
|
"max-iterations": { type: "string", default: "3" },
|
|
899
|
+
pareto: { type: "boolean", default: false },
|
|
900
|
+
candidates: { type: "string", default: "3" },
|
|
901
|
+
"token-efficiency": { type: "boolean", default: false },
|
|
902
|
+
"with-baseline": { type: "boolean", default: false },
|
|
903
|
+
"validation-model": { type: "string", default: "haiku" },
|
|
904
|
+
"cheap-loop": { type: "boolean", default: true },
|
|
905
|
+
"full-model": { type: "boolean", default: false },
|
|
906
|
+
"gate-model": { type: "string" },
|
|
907
|
+
"proposal-model": { type: "string" },
|
|
908
|
+
"sync-first": { type: "boolean", default: false },
|
|
909
|
+
"sync-force": { type: "boolean", default: false },
|
|
910
|
+
verbose: { type: "boolean", default: false },
|
|
344
911
|
help: { type: "boolean", default: false },
|
|
345
912
|
},
|
|
346
913
|
strict: true,
|
|
@@ -360,6 +927,18 @@ Options:
|
|
|
360
927
|
--dry-run Validate proposal without deploying
|
|
361
928
|
--confidence Confidence threshold 0.0-1.0 (default: 0.6)
|
|
362
929
|
--max-iterations Max retry iterations (default: 3)
|
|
930
|
+
--pareto Enable Pareto multi-candidate selection
|
|
931
|
+
--candidates Number of candidates to generate (default: 3, max: 5)
|
|
932
|
+
--token-efficiency Enable 5D Pareto with token efficiency scoring
|
|
933
|
+
--with-baseline Gate deployment on baseline lift > 0.05
|
|
934
|
+
--validation-model Model for trigger-check validation calls (default: haiku)
|
|
935
|
+
--cheap-loop Use cheap models for loop, expensive for gate (default: on)
|
|
936
|
+
--full-model Use same model for all stages (disables cheap-loop)
|
|
937
|
+
--gate-model Model for final gate validation (default: sonnet)
|
|
938
|
+
--proposal-model Model for proposal generation LLM calls
|
|
939
|
+
--sync-first Refresh source-truth telemetry before building evals/failure patterns
|
|
940
|
+
--sync-force Force a full rescan during --sync-first
|
|
941
|
+
--verbose Output full EvolveResult JSON (default: compact summary)
|
|
363
942
|
--help Show this help message`);
|
|
364
943
|
process.exit(0);
|
|
365
944
|
}
|
|
@@ -368,6 +947,10 @@ Options:
|
|
|
368
947
|
console.error("[ERROR] --skill and --skill-path are required");
|
|
369
948
|
process.exit(1);
|
|
370
949
|
}
|
|
950
|
+
if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
|
|
951
|
+
console.error("[ERROR] --sync-force requires --sync-first");
|
|
952
|
+
process.exit(1);
|
|
953
|
+
}
|
|
371
954
|
|
|
372
955
|
const { detectAgent } = await import("../utils/llm-call.js");
|
|
373
956
|
const requestedAgent = values.agent;
|
|
@@ -395,6 +978,61 @@ Options:
|
|
|
395
978
|
process.exit(1);
|
|
396
979
|
}
|
|
397
980
|
|
|
981
|
+
// -------------------------------------------------------------------------
|
|
982
|
+
// Pre-flight validation: catch common misconfigurations before evolve()
|
|
983
|
+
// -------------------------------------------------------------------------
|
|
984
|
+
const skillPath = values["skill-path"];
|
|
985
|
+
if (!skillPath) {
|
|
986
|
+
console.error("[ERROR] --skill-path is required.");
|
|
987
|
+
process.exit(1);
|
|
988
|
+
}
|
|
989
|
+
if (!existsSync(skillPath)) {
|
|
990
|
+
console.error(`[ERROR] SKILL.md not found at: ${skillPath}`);
|
|
991
|
+
console.error(" Verify the --skill-path argument points to an existing SKILL.md file.");
|
|
992
|
+
process.exit(1);
|
|
993
|
+
}
|
|
994
|
+
|
|
995
|
+
const evalSetPath = values["eval-set"];
|
|
996
|
+
if (evalSetPath && !existsSync(evalSetPath)) {
|
|
997
|
+
console.error(`[ERROR] Eval set file not found at: ${evalSetPath}`);
|
|
998
|
+
console.error(" Verify the --eval-set argument points to an existing JSON file.");
|
|
999
|
+
process.exit(1);
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
// If no eval-set provided, check that log files exist for auto-generation
|
|
1003
|
+
if (!evalSetPath && !(values["sync-first"] ?? false)) {
|
|
1004
|
+
const hasSkillLog = readEffectiveSkillUsageRecords().length > 0;
|
|
1005
|
+
const hasQueryLog = existsSync(QUERY_LOG);
|
|
1006
|
+
if (!hasSkillLog && !hasQueryLog) {
|
|
1007
|
+
console.error("[ERROR] No eval set provided and no telemetry logs found.");
|
|
1008
|
+
console.error(
|
|
1009
|
+
" Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
|
|
1010
|
+
);
|
|
1011
|
+
console.error(` Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`);
|
|
1012
|
+
process.exit(1);
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
|
|
1017
|
+
let telemetryRecords: SessionTelemetryRecord[] | undefined;
|
|
1018
|
+
if (tokenEfficiencyEnabled && !(values["sync-first"] ?? false)) {
|
|
1019
|
+
telemetryRecords = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
|
|
1020
|
+
}
|
|
1021
|
+
const gradingResults = readGradingResultsForSkill(values.skill);
|
|
1022
|
+
|
|
1023
|
+
if (values.verbose) {
|
|
1024
|
+
console.error("[verbose] Pre-flight checks passed");
|
|
1025
|
+
console.error(`[verbose] Skill: ${values.skill}`);
|
|
1026
|
+
console.error(`[verbose] Skill path: ${skillPath}`);
|
|
1027
|
+
console.error(`[verbose] Agent: ${agent}`);
|
|
1028
|
+
console.error(`[verbose] Eval set: ${evalSetPath ?? "(auto-generated from logs)"}`);
|
|
1029
|
+
console.error(`[verbose] Loaded grading results: ${gradingResults.length}`);
|
|
1030
|
+
console.error(`[verbose] Cheap loop: ${values["cheap-loop"] ?? false}`);
|
|
1031
|
+
console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
|
|
1032
|
+
console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
|
|
1033
|
+
console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
|
|
1034
|
+
}
|
|
1035
|
+
|
|
398
1036
|
const result = await evolve({
|
|
399
1037
|
skillName: values.skill,
|
|
400
1038
|
skillPath: values["skill-path"],
|
|
@@ -403,15 +1041,86 @@ Options:
|
|
|
403
1041
|
dryRun: values["dry-run"] ?? false,
|
|
404
1042
|
confidenceThreshold: Number.parseFloat(values.confidence ?? "0.6"),
|
|
405
1043
|
maxIterations: Number.parseInt(values["max-iterations"] ?? "3", 10),
|
|
1044
|
+
paretoEnabled: values.pareto ?? false,
|
|
1045
|
+
candidateCount: Number.parseInt(values.candidates ?? "3", 10),
|
|
1046
|
+
tokenEfficiencyEnabled,
|
|
1047
|
+
telemetryRecords,
|
|
1048
|
+
withBaseline: values["with-baseline"] ?? false,
|
|
1049
|
+
validationModel: values["validation-model"],
|
|
1050
|
+
cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
|
|
1051
|
+
gateModel: values["gate-model"],
|
|
1052
|
+
proposalModel: values["proposal-model"],
|
|
1053
|
+
gradingResults,
|
|
1054
|
+
syncFirst: values["sync-first"] ?? false,
|
|
1055
|
+
syncForce: values["sync-force"] ?? false,
|
|
406
1056
|
});
|
|
407
1057
|
|
|
408
|
-
|
|
1058
|
+
if (values.verbose) {
|
|
1059
|
+
console.log(JSON.stringify(result, null, 2));
|
|
1060
|
+
} else {
|
|
1061
|
+
const summary: EvolveResultSummary = {
|
|
1062
|
+
skill: values.skill,
|
|
1063
|
+
deployed: result.deployed,
|
|
1064
|
+
reason: result.reason,
|
|
1065
|
+
before: result.validation?.before_pass_rate ?? 0,
|
|
1066
|
+
after: result.validation?.after_pass_rate ?? 0,
|
|
1067
|
+
net_change: result.validation?.net_change ?? 0,
|
|
1068
|
+
improved: result.validation?.improved ?? false,
|
|
1069
|
+
regressions: result.validation?.regressions.length ?? 0,
|
|
1070
|
+
new_passes: result.validation?.new_passes.length ?? 0,
|
|
1071
|
+
confidence: result.proposal?.confidence ?? 0,
|
|
1072
|
+
llm_calls: result.llmCallCount,
|
|
1073
|
+
elapsed_s: +(result.elapsedMs / 1000).toFixed(1),
|
|
1074
|
+
proposal_id: result.proposal?.proposal_id ?? "",
|
|
1075
|
+
rationale: result.proposal?.rationale ?? "",
|
|
1076
|
+
...(result.skillVersion ? { version: result.skillVersion } : {}),
|
|
1077
|
+
dashboard_url: `http://localhost:3141/report/${encodeURIComponent(values.skill)}`,
|
|
1078
|
+
};
|
|
1079
|
+
console.log(JSON.stringify(summary, null, 2));
|
|
1080
|
+
}
|
|
1081
|
+
|
|
1082
|
+
// Print human-readable status to stderr so users always see outcome
|
|
1083
|
+
if (!result.deployed) {
|
|
1084
|
+
console.error(`\n[NOT DEPLOYED] ${result.reason}`);
|
|
1085
|
+
if (result.validation && !result.validation.improved) {
|
|
1086
|
+
console.error(
|
|
1087
|
+
` Pass rate: ${(result.validation.before_pass_rate * 100).toFixed(1)}% -> ${(result.validation.after_pass_rate * 100).toFixed(1)}% (net: ${result.validation.net_change >= 0 ? "+" : ""}${(result.validation.net_change * 100).toFixed(1)}%)`,
|
|
1088
|
+
);
|
|
1089
|
+
if (result.validation.regressions.length > 0) {
|
|
1090
|
+
console.error(` Regressions: ${result.validation.regressions.length} entries`);
|
|
1091
|
+
}
|
|
1092
|
+
}
|
|
1093
|
+
if (
|
|
1094
|
+
result.proposal &&
|
|
1095
|
+
result.proposal.confidence < Number.parseFloat(values.confidence ?? "0.6")
|
|
1096
|
+
) {
|
|
1097
|
+
console.error(
|
|
1098
|
+
` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
|
|
1099
|
+
);
|
|
1100
|
+
}
|
|
1101
|
+
console.error(" Re-run with --verbose for full diagnostic output.");
|
|
1102
|
+
} else {
|
|
1103
|
+
console.error(`\n[DEPLOYED] ${result.reason}`);
|
|
1104
|
+
}
|
|
1105
|
+
|
|
409
1106
|
process.exit(result.deployed ? 0 : 1);
|
|
410
1107
|
}
|
|
411
1108
|
|
|
412
1109
|
if (import.meta.main) {
|
|
413
1110
|
cliMain().catch((err) => {
|
|
414
|
-
|
|
1111
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1112
|
+
const stack = err instanceof Error ? err.stack : undefined;
|
|
1113
|
+
console.error(`[FATAL] ${message}`);
|
|
1114
|
+
if (stack && process.env.SELFTUNE_VERBOSE === "1") {
|
|
1115
|
+
console.error(stack);
|
|
1116
|
+
}
|
|
1117
|
+
console.error(
|
|
1118
|
+
"\nTroubleshooting:\n" +
|
|
1119
|
+
" - Verify --skill-path points to a valid SKILL.md file\n" +
|
|
1120
|
+
" - Ensure eval data exists (run `selftune evals` first) or pass --eval-set\n" +
|
|
1121
|
+
" - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
|
|
1122
|
+
" - Re-run with --verbose for full diagnostic output",
|
|
1123
|
+
);
|
|
415
1124
|
process.exit(1);
|
|
416
1125
|
});
|
|
417
1126
|
}
|