selftune 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +146 -0
- package/.claude/agents/evolution-reviewer.md +167 -0
- package/.claude/agents/integration-guide.md +200 -0
- package/.claude/agents/pattern-analyst.md +147 -0
- package/CHANGELOG.md +37 -0
- package/README.md +96 -256
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +103 -0
- package/cli/selftune/constants.ts +75 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-server.ts +582 -0
- package/cli/selftune/dashboard.ts +25 -3
- package/cli/selftune/eval/baseline.ts +247 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +68 -2
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evolve-body.ts +492 -0
- package/cli/selftune/evolution/evolve.ts +466 -103
- package/cli/selftune/evolution/extract-patterns.ts +32 -1
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +19 -2
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/grade-session.ts +138 -18
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/index.ts +88 -0
- package/cli/selftune/ingestors/claude-replay.ts +351 -0
- package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
- package/cli/selftune/init.ts +150 -3
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +25 -2
- package/cli/selftune/status.ts +17 -13
- package/cli/selftune/types.ts +377 -5
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/llm-call.ts +29 -3
- package/cli/selftune/utils/transcript.ts +35 -0
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/dashboard/index.html +569 -8
- package/package.json +8 -4
- package/skill/SKILL.md +124 -8
- package/skill/Workflows/AutoActivation.md +144 -0
- package/skill/Workflows/Badge.md +118 -0
- package/skill/Workflows/Baseline.md +121 -0
- package/skill/Workflows/Composability.md +100 -0
- package/skill/Workflows/Contribute.md +91 -0
- package/skill/Workflows/Cron.md +155 -0
- package/skill/Workflows/Dashboard.md +203 -0
- package/skill/Workflows/Doctor.md +37 -1
- package/skill/Workflows/Evals.md +69 -1
- package/skill/Workflows/EvolutionMemory.md +152 -0
- package/skill/Workflows/Evolve.md +111 -6
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/ImportSkillsBench.md +111 -0
- package/skill/Workflows/Ingest.md +117 -3
- package/skill/Workflows/Initialize.md +57 -3
- package/skill/Workflows/Replay.md +70 -0
- package/skill/Workflows/Rollback.md +20 -1
- package/skill/Workflows/UnitTest.md +138 -0
- package/skill/Workflows/Watch.md +22 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
|
@@ -6,25 +6,45 @@
|
|
|
6
6
|
* logic and comprehensive audit tracking.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
9
|
+
import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
10
10
|
import { parseArgs } from "node:util";
|
|
11
11
|
|
|
12
|
-
import { QUERY_LOG, SKILL_LOG } from "../constants.js";
|
|
12
|
+
import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
13
|
+
import type { BaselineMeasurement } from "../eval/baseline.js";
|
|
14
|
+
import { measureBaseline } from "../eval/baseline.js";
|
|
13
15
|
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
16
|
+
import { updateContextAfterEvolve } from "../memory/writer.js";
|
|
14
17
|
import type {
|
|
15
18
|
EvalEntry,
|
|
16
19
|
EvalPassRate,
|
|
17
20
|
EvolutionAuditEntry,
|
|
18
21
|
EvolutionProposal,
|
|
22
|
+
EvolveResultSummary,
|
|
23
|
+
FailurePattern,
|
|
24
|
+
GradingResult,
|
|
25
|
+
ParetoCandidate,
|
|
19
26
|
QueryLogRecord,
|
|
27
|
+
SessionTelemetryRecord,
|
|
20
28
|
SkillUsageRecord,
|
|
21
29
|
} from "../types.js";
|
|
30
|
+
import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
|
|
22
31
|
import { readJsonl } from "../utils/jsonl.js";
|
|
32
|
+
import { createEvolveTUI } from "../utils/tui.js";
|
|
23
33
|
import { appendAuditEntry } from "./audit.js";
|
|
24
34
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
25
|
-
import {
|
|
35
|
+
import {
|
|
36
|
+
computeInvocationScores,
|
|
37
|
+
computeParetoFrontier,
|
|
38
|
+
computeTokenEfficiencyScore,
|
|
39
|
+
selectFromFrontier,
|
|
40
|
+
} from "./pareto.js";
|
|
41
|
+
import { generateMultipleProposals, generateProposal } from "./propose-description.js";
|
|
26
42
|
import type { ValidationResult } from "./validate-proposal.js";
|
|
27
|
-
import {
|
|
43
|
+
import {
|
|
44
|
+
TRIGGER_CHECK_BATCH_SIZE,
|
|
45
|
+
VALIDATION_RUNS,
|
|
46
|
+
validateProposal,
|
|
47
|
+
} from "./validate-proposal.js";
|
|
28
48
|
|
|
29
49
|
// ---------------------------------------------------------------------------
|
|
30
50
|
// Types
|
|
@@ -38,6 +58,16 @@ export interface EvolveOptions {
|
|
|
38
58
|
dryRun: boolean;
|
|
39
59
|
confidenceThreshold: number; // default 0.6
|
|
40
60
|
maxIterations: number; // default 3
|
|
61
|
+
gradingResults?: GradingResult[];
|
|
62
|
+
paretoEnabled?: boolean;
|
|
63
|
+
candidateCount?: number;
|
|
64
|
+
tokenEfficiencyEnabled?: boolean;
|
|
65
|
+
telemetryRecords?: SessionTelemetryRecord[];
|
|
66
|
+
withBaseline?: boolean;
|
|
67
|
+
validationModel?: string;
|
|
68
|
+
cheapLoop?: boolean;
|
|
69
|
+
gateModel?: string;
|
|
70
|
+
proposalModel?: string;
|
|
41
71
|
}
|
|
42
72
|
|
|
43
73
|
export interface EvolveResult {
|
|
@@ -46,6 +76,11 @@ export interface EvolveResult {
|
|
|
46
76
|
deployed: boolean;
|
|
47
77
|
auditEntries: EvolutionAuditEntry[];
|
|
48
78
|
reason: string;
|
|
79
|
+
skillVersion?: string;
|
|
80
|
+
llmCallCount: number;
|
|
81
|
+
elapsedMs: number;
|
|
82
|
+
baselineResult?: BaselineMeasurement;
|
|
83
|
+
gateValidation?: ValidationResult;
|
|
49
84
|
}
|
|
50
85
|
|
|
51
86
|
/**
|
|
@@ -53,11 +88,19 @@ export interface EvolveResult {
|
|
|
53
88
|
* imports are used. Pass overrides in tests to avoid mock.module().
|
|
54
89
|
*/
|
|
55
90
|
export interface EvolveDeps {
|
|
56
|
-
extractFailurePatterns?:
|
|
91
|
+
extractFailurePatterns?: (
|
|
92
|
+
evalEntries: EvalEntry[],
|
|
93
|
+
skillUsage: SkillUsageRecord[],
|
|
94
|
+
skillName: string,
|
|
95
|
+
gradingResults?: GradingResult[],
|
|
96
|
+
) => FailurePattern[];
|
|
57
97
|
generateProposal?: typeof import("./propose-description.js").generateProposal;
|
|
58
98
|
validateProposal?: typeof import("./validate-proposal.js").validateProposal;
|
|
99
|
+
gateValidateProposal?: typeof import("./validate-proposal.js").validateProposal;
|
|
59
100
|
appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
|
|
60
101
|
buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
|
|
102
|
+
updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
|
|
103
|
+
measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
|
|
61
104
|
}
|
|
62
105
|
|
|
63
106
|
// ---------------------------------------------------------------------------
|
|
@@ -69,12 +112,14 @@ function createAuditEntry(
|
|
|
69
112
|
action: EvolutionAuditEntry["action"],
|
|
70
113
|
details: string,
|
|
71
114
|
evalSnapshot?: EvalPassRate,
|
|
115
|
+
skillName?: string,
|
|
72
116
|
): EvolutionAuditEntry {
|
|
73
117
|
return {
|
|
74
118
|
timestamp: new Date().toISOString(),
|
|
75
119
|
proposal_id: proposalId,
|
|
76
120
|
action,
|
|
77
121
|
details,
|
|
122
|
+
...(skillName ? { skill_name: skillName } : {}),
|
|
78
123
|
...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
|
|
79
124
|
};
|
|
80
125
|
}
|
|
@@ -90,12 +135,22 @@ export async function evolve(
|
|
|
90
135
|
const { skillName, skillPath, evalSetPath, agent, dryRun, confidenceThreshold, maxIterations } =
|
|
91
136
|
options;
|
|
92
137
|
|
|
138
|
+
// Apply cheap-loop defaults: cheap models for proposal/validation, expensive for gate
|
|
139
|
+
if (options.cheapLoop) {
|
|
140
|
+
if (!options.proposalModel) options.proposalModel = "haiku";
|
|
141
|
+
if (!options.validationModel) options.validationModel = "haiku";
|
|
142
|
+
if (!options.gateModel) options.gateModel = "sonnet";
|
|
143
|
+
}
|
|
144
|
+
|
|
93
145
|
// Resolve injectable dependencies with real-import fallbacks
|
|
94
146
|
const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
|
|
95
147
|
const _generateProposal = _deps.generateProposal ?? generateProposal;
|
|
96
148
|
const _validateProposal = _deps.validateProposal ?? validateProposal;
|
|
149
|
+
const _gateValidateProposal = _deps.gateValidateProposal ?? validateProposal;
|
|
97
150
|
const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
|
|
98
151
|
const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
|
|
152
|
+
const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
|
|
153
|
+
const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
|
|
99
154
|
|
|
100
155
|
const auditEntries: EvolutionAuditEntry[] = [];
|
|
101
156
|
|
|
@@ -105,7 +160,7 @@ export async function evolve(
|
|
|
105
160
|
details: string,
|
|
106
161
|
evalSnapshot?: EvalPassRate,
|
|
107
162
|
): void {
|
|
108
|
-
const entry = createAuditEntry(proposalId, action, details, evalSnapshot);
|
|
163
|
+
const entry = createAuditEntry(proposalId, action, details, evalSnapshot, skillName);
|
|
109
164
|
auditEntries.push(entry);
|
|
110
165
|
try {
|
|
111
166
|
_appendAuditEntry(entry);
|
|
@@ -114,21 +169,47 @@ export async function evolve(
|
|
|
114
169
|
}
|
|
115
170
|
}
|
|
116
171
|
|
|
172
|
+
const pipelineStart = Date.now();
|
|
173
|
+
let llmCallCount = 0;
|
|
174
|
+
const tui = createEvolveTUI({ skillName, model: options.proposalModel ?? "(default)" });
|
|
175
|
+
const finishTui = () =>
|
|
176
|
+
tui.finish(
|
|
177
|
+
`${llmCallCount} LLM calls \u00b7 ${((Date.now() - pipelineStart) / 1000).toFixed(1)}s elapsed`,
|
|
178
|
+
);
|
|
179
|
+
|
|
180
|
+
/** Stamp every return with pipeline stats so callers always get them. */
|
|
181
|
+
const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => ({
|
|
182
|
+
...r,
|
|
183
|
+
llmCallCount,
|
|
184
|
+
elapsedMs: Date.now() - pipelineStart,
|
|
185
|
+
});
|
|
186
|
+
|
|
187
|
+
// Hoisted so catch block can preserve partial results on error
|
|
188
|
+
let lastProposal: EvolutionProposal | null = null;
|
|
189
|
+
let lastValidation: ValidationResult | null = null;
|
|
190
|
+
|
|
117
191
|
try {
|
|
118
192
|
// -----------------------------------------------------------------------
|
|
119
193
|
// Step 1: Read current SKILL.md
|
|
120
194
|
// -----------------------------------------------------------------------
|
|
121
195
|
if (!existsSync(skillPath)) {
|
|
122
|
-
|
|
196
|
+
tui.fail(`SKILL.md not found at ${skillPath}`);
|
|
197
|
+
finishTui();
|
|
198
|
+
return withStats({
|
|
123
199
|
proposal: null,
|
|
124
200
|
validation: null,
|
|
125
201
|
deployed: false,
|
|
126
202
|
auditEntries,
|
|
127
203
|
reason: `SKILL.md not found at ${skillPath}`,
|
|
128
|
-
};
|
|
204
|
+
});
|
|
129
205
|
}
|
|
130
206
|
|
|
131
|
-
const
|
|
207
|
+
const rawContent = readFileSync(skillPath, "utf-8");
|
|
208
|
+
const frontmatter = parseFrontmatter(rawContent);
|
|
209
|
+
const currentDescription = frontmatter.description || rawContent;
|
|
210
|
+
const skillVersion = frontmatter.version || undefined;
|
|
211
|
+
const versionTag = skillVersion ? `, v${skillVersion}` : "";
|
|
212
|
+
tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
|
|
132
213
|
|
|
133
214
|
// -----------------------------------------------------------------------
|
|
134
215
|
// Step 2: Load eval set
|
|
@@ -145,6 +226,10 @@ export async function evolve(
|
|
|
145
226
|
evalSet = _buildEvalSet(skillRecords, queryRecords, skillName);
|
|
146
227
|
}
|
|
147
228
|
|
|
229
|
+
const posCount = evalSet.filter((e) => e.should_trigger).length;
|
|
230
|
+
const negCount = evalSet.filter((e) => !e.should_trigger).length;
|
|
231
|
+
tui.done(`Loaded eval set (${evalSet.length} entries: ${posCount}+, ${negCount}-)`);
|
|
232
|
+
|
|
148
233
|
// -----------------------------------------------------------------------
|
|
149
234
|
// Step 3: Load skill usage records
|
|
150
235
|
// -----------------------------------------------------------------------
|
|
@@ -153,19 +238,30 @@ export async function evolve(
|
|
|
153
238
|
// -----------------------------------------------------------------------
|
|
154
239
|
// Step 4: Extract failure patterns
|
|
155
240
|
// -----------------------------------------------------------------------
|
|
156
|
-
const failurePatterns = _extractFailurePatterns(
|
|
241
|
+
const failurePatterns = _extractFailurePatterns(
|
|
242
|
+
evalSet,
|
|
243
|
+
skillUsage,
|
|
244
|
+
skillName,
|
|
245
|
+
options.gradingResults,
|
|
246
|
+
);
|
|
247
|
+
|
|
248
|
+
const totalMissed = failurePatterns.reduce((sum, p) => sum + p.missed_queries.length, 0);
|
|
249
|
+
tui.done(
|
|
250
|
+
`Extracted ${failurePatterns.length} failure pattern(s) (${totalMissed} missed queries)`,
|
|
251
|
+
);
|
|
157
252
|
|
|
158
253
|
// -----------------------------------------------------------------------
|
|
159
254
|
// Step 5: Early exit if no patterns
|
|
160
255
|
// -----------------------------------------------------------------------
|
|
161
256
|
if (failurePatterns.length === 0) {
|
|
162
|
-
|
|
257
|
+
finishTui();
|
|
258
|
+
return withStats({
|
|
163
259
|
proposal: null,
|
|
164
260
|
validation: null,
|
|
165
261
|
deployed: false,
|
|
166
262
|
auditEntries,
|
|
167
263
|
reason: "No failure patterns found",
|
|
168
|
-
};
|
|
264
|
+
});
|
|
169
265
|
}
|
|
170
266
|
|
|
171
267
|
// -----------------------------------------------------------------------
|
|
@@ -174,156 +270,368 @@ export async function evolve(
|
|
|
174
270
|
const missedQueries = failurePatterns.flatMap((p) => p.missed_queries);
|
|
175
271
|
|
|
176
272
|
// -----------------------------------------------------------------------
|
|
177
|
-
// Steps 7-12:
|
|
273
|
+
// Steps 7-12: Proposal generation and validation
|
|
178
274
|
// -----------------------------------------------------------------------
|
|
179
|
-
let lastProposal: EvolutionProposal | null = null;
|
|
180
|
-
let lastValidation: ValidationResult | null = null;
|
|
181
|
-
let feedbackReason = "";
|
|
182
275
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
276
|
+
// -----------------------------------------------------------------------
|
|
277
|
+
// Pareto multi-candidate path
|
|
278
|
+
// -----------------------------------------------------------------------
|
|
279
|
+
const paretoEnabled = options.paretoEnabled ?? false;
|
|
280
|
+
const candidateCount = options.candidateCount ?? 3;
|
|
281
|
+
const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
|
|
282
|
+
|
|
283
|
+
// Compute token efficiency score if enabled and telemetry is available
|
|
284
|
+
let tokenEffScore: number | undefined;
|
|
285
|
+
if (tokenEfficiencyEnabled && options.telemetryRecords && options.telemetryRecords.length > 0) {
|
|
286
|
+
tokenEffScore = computeTokenEfficiencyScore(skillName, options.telemetryRecords);
|
|
287
|
+
recordAudit(
|
|
288
|
+
"system",
|
|
289
|
+
"created",
|
|
290
|
+
`Token efficiency score for ${skillName}: ${tokenEffScore.toFixed(3)}`,
|
|
291
|
+
);
|
|
292
|
+
}
|
|
188
293
|
|
|
189
|
-
|
|
294
|
+
if (paretoEnabled && candidateCount > 1) {
|
|
295
|
+
// Generate N candidates in parallel
|
|
296
|
+
const candidates = await generateMultipleProposals(
|
|
190
297
|
currentDescription,
|
|
191
298
|
failurePatterns,
|
|
192
|
-
|
|
299
|
+
missedQueries,
|
|
193
300
|
skillName,
|
|
194
301
|
skillPath,
|
|
195
302
|
agent,
|
|
303
|
+
candidateCount,
|
|
304
|
+
options.proposalModel,
|
|
196
305
|
);
|
|
197
306
|
|
|
198
|
-
|
|
307
|
+
// Filter by confidence threshold
|
|
308
|
+
const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
|
|
309
|
+
|
|
310
|
+
if (viableCandidates.length === 0) {
|
|
311
|
+
finishTui();
|
|
312
|
+
return withStats({
|
|
313
|
+
proposal: candidates[0] ?? null,
|
|
314
|
+
validation: null,
|
|
315
|
+
deployed: false,
|
|
316
|
+
auditEntries,
|
|
317
|
+
reason: `No candidates met confidence threshold ${confidenceThreshold}`,
|
|
318
|
+
});
|
|
319
|
+
}
|
|
199
320
|
|
|
200
|
-
//
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
"created",
|
|
204
|
-
`Proposal created for ${skillName} (iteration ${iteration + 1})`,
|
|
205
|
-
);
|
|
321
|
+
// Validate each candidate
|
|
322
|
+
const paretoCandidates: ParetoCandidate[] = [];
|
|
323
|
+
for (const proposal of viableCandidates) {
|
|
324
|
+
recordAudit(proposal.proposal_id, "created", `Pareto candidate for ${skillName}`);
|
|
206
325
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
326
|
+
const validation = await _validateProposal(
|
|
327
|
+
proposal,
|
|
328
|
+
evalSet,
|
|
329
|
+
agent,
|
|
330
|
+
options.validationModel,
|
|
331
|
+
);
|
|
210
332
|
recordAudit(
|
|
211
333
|
proposal.proposal_id,
|
|
212
|
-
"
|
|
213
|
-
`
|
|
334
|
+
"validated",
|
|
335
|
+
`Pareto validation: improved=${validation.improved}`,
|
|
214
336
|
);
|
|
215
337
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
proposal
|
|
220
|
-
validation
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
338
|
+
if (validation.improved && validation.per_entry_results) {
|
|
339
|
+
const invocationScores = computeInvocationScores(validation.per_entry_results);
|
|
340
|
+
const candidate: ParetoCandidate = {
|
|
341
|
+
proposal,
|
|
342
|
+
validation,
|
|
343
|
+
invocation_scores: invocationScores,
|
|
344
|
+
dominates_on: [],
|
|
224
345
|
};
|
|
346
|
+
if (tokenEffScore !== undefined) {
|
|
347
|
+
candidate.token_efficiency_score = tokenEffScore;
|
|
348
|
+
}
|
|
349
|
+
paretoCandidates.push(candidate);
|
|
225
350
|
}
|
|
351
|
+
}
|
|
226
352
|
|
|
227
|
-
|
|
353
|
+
if (paretoCandidates.length === 0) {
|
|
354
|
+
finishTui();
|
|
355
|
+
return withStats({
|
|
356
|
+
proposal: viableCandidates[0],
|
|
357
|
+
validation: null,
|
|
358
|
+
deployed: false,
|
|
359
|
+
auditEntries,
|
|
360
|
+
reason: "No Pareto candidates improved validation",
|
|
361
|
+
});
|
|
228
362
|
}
|
|
229
363
|
|
|
230
|
-
//
|
|
231
|
-
const
|
|
232
|
-
|
|
364
|
+
// Compute Pareto frontier
|
|
365
|
+
const frontier = computeParetoFrontier(paretoCandidates);
|
|
366
|
+
const { best } = selectFromFrontier(frontier);
|
|
233
367
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
total: evalSet.length,
|
|
237
|
-
passed: Math.round(validation.after_pass_rate * evalSet.length),
|
|
238
|
-
failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
|
|
239
|
-
pass_rate: validation.after_pass_rate,
|
|
240
|
-
};
|
|
241
|
-
recordAudit(
|
|
242
|
-
proposal.proposal_id,
|
|
243
|
-
"validated",
|
|
244
|
-
`Validation complete: improved=${validation.improved}`,
|
|
245
|
-
evalSnapshot,
|
|
246
|
-
);
|
|
368
|
+
lastProposal = best.proposal;
|
|
369
|
+
lastValidation = best.validation;
|
|
247
370
|
|
|
248
|
-
//
|
|
249
|
-
|
|
250
|
-
|
|
371
|
+
// Skip the standard retry loop — we already have our result
|
|
372
|
+
} else {
|
|
373
|
+
// Standard single-candidate retry loop
|
|
374
|
+
let feedbackReason = "";
|
|
375
|
+
|
|
376
|
+
for (let iteration = 0; iteration < maxIterations; iteration++) {
|
|
377
|
+
// Step 7: Generate proposal
|
|
378
|
+
const effectiveMissedQueries = feedbackReason
|
|
379
|
+
? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
|
|
380
|
+
: missedQueries;
|
|
381
|
+
|
|
382
|
+
tui.step(`Generating proposal (iteration ${iteration + 1}/${maxIterations})...`);
|
|
383
|
+
const proposal = await _generateProposal(
|
|
384
|
+
currentDescription,
|
|
385
|
+
failurePatterns,
|
|
386
|
+
effectiveMissedQueries,
|
|
387
|
+
skillName,
|
|
388
|
+
skillPath,
|
|
389
|
+
agent,
|
|
390
|
+
options.proposalModel,
|
|
391
|
+
);
|
|
392
|
+
llmCallCount++;
|
|
393
|
+
|
|
394
|
+
lastProposal = proposal;
|
|
395
|
+
tui.done(`Proposal generated (conf: ${proposal.confidence.toFixed(2)})`);
|
|
396
|
+
|
|
397
|
+
// Step 8: Audit "created"
|
|
251
398
|
recordAudit(
|
|
252
399
|
proposal.proposal_id,
|
|
253
|
-
"
|
|
254
|
-
`
|
|
400
|
+
"created",
|
|
401
|
+
`Proposal created for ${skillName} (iteration ${iteration + 1})`,
|
|
255
402
|
);
|
|
256
403
|
|
|
257
|
-
//
|
|
258
|
-
if (
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
404
|
+
// Step 9: Check confidence threshold
|
|
405
|
+
if (proposal.confidence < confidenceThreshold) {
|
|
406
|
+
feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
|
|
407
|
+
recordAudit(
|
|
408
|
+
proposal.proposal_id,
|
|
409
|
+
"rejected",
|
|
410
|
+
`Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
411
|
+
);
|
|
412
|
+
|
|
413
|
+
// If this is the last iteration, return early with rejection
|
|
414
|
+
if (iteration === maxIterations - 1) {
|
|
415
|
+
finishTui();
|
|
416
|
+
return withStats({
|
|
417
|
+
proposal: lastProposal,
|
|
418
|
+
validation: null,
|
|
419
|
+
deployed: false,
|
|
420
|
+
auditEntries,
|
|
421
|
+
reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
422
|
+
});
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
continue;
|
|
266
426
|
}
|
|
267
427
|
|
|
268
|
-
|
|
269
|
-
|
|
428
|
+
// Step 10: Validate against eval set
|
|
429
|
+
const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
|
|
430
|
+
tui.step(
|
|
431
|
+
`Validating ${evalSet.length} entries (${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
|
|
432
|
+
);
|
|
433
|
+
const validation = await _validateProposal(
|
|
434
|
+
proposal,
|
|
435
|
+
evalSet,
|
|
436
|
+
agent,
|
|
437
|
+
options.validationModel,
|
|
438
|
+
);
|
|
439
|
+
lastValidation = validation;
|
|
440
|
+
llmCallCount += batchCount * 2 * VALIDATION_RUNS;
|
|
441
|
+
tui.done(
|
|
442
|
+
`Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
|
|
443
|
+
);
|
|
444
|
+
|
|
445
|
+
// Step 11: Audit "validated"
|
|
446
|
+
const evalSnapshot: EvalPassRate = {
|
|
447
|
+
total: evalSet.length,
|
|
448
|
+
passed: Math.round(validation.after_pass_rate * evalSet.length),
|
|
449
|
+
failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
|
|
450
|
+
pass_rate: validation.after_pass_rate,
|
|
451
|
+
};
|
|
452
|
+
recordAudit(
|
|
453
|
+
proposal.proposal_id,
|
|
454
|
+
"validated",
|
|
455
|
+
`Validation complete: improved=${validation.improved}`,
|
|
456
|
+
evalSnapshot,
|
|
457
|
+
);
|
|
270
458
|
|
|
271
|
-
|
|
272
|
-
|
|
459
|
+
// Step 12: Check validation result
|
|
460
|
+
if (!validation.improved) {
|
|
461
|
+
feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
|
|
462
|
+
recordAudit(
|
|
463
|
+
proposal.proposal_id,
|
|
464
|
+
"rejected",
|
|
465
|
+
`Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
466
|
+
);
|
|
467
|
+
|
|
468
|
+
// If this is the last iteration, return with rejection
|
|
469
|
+
if (iteration === maxIterations - 1) {
|
|
470
|
+
finishTui();
|
|
471
|
+
return withStats({
|
|
472
|
+
proposal: lastProposal,
|
|
473
|
+
validation: lastValidation,
|
|
474
|
+
deployed: false,
|
|
475
|
+
auditEntries,
|
|
476
|
+
reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
|
|
477
|
+
});
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
continue;
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
// Validation passed - break out of retry loop
|
|
484
|
+
break;
|
|
485
|
+
}
|
|
273
486
|
}
|
|
274
487
|
|
|
275
488
|
// -----------------------------------------------------------------------
|
|
276
489
|
// Step 13: Dry run check
|
|
277
490
|
// -----------------------------------------------------------------------
|
|
278
491
|
if (dryRun) {
|
|
279
|
-
|
|
492
|
+
finishTui();
|
|
493
|
+
return withStats({
|
|
280
494
|
proposal: lastProposal,
|
|
281
495
|
validation: lastValidation,
|
|
282
496
|
deployed: false,
|
|
283
497
|
auditEntries,
|
|
284
498
|
reason: "Dry run - proposal validated but not deployed",
|
|
285
|
-
};
|
|
499
|
+
});
|
|
286
500
|
}
|
|
287
501
|
|
|
288
502
|
// -----------------------------------------------------------------------
|
|
289
|
-
// Step
|
|
503
|
+
// Step 13b: Baseline gate (--with-baseline)
|
|
290
504
|
// -----------------------------------------------------------------------
|
|
291
|
-
|
|
505
|
+
let baselineResult: BaselineMeasurement | undefined;
|
|
506
|
+
if (options.withBaseline && lastProposal) {
|
|
507
|
+
tui.step("Measuring baseline...");
|
|
508
|
+
baselineResult = await _measureBaseline({
|
|
509
|
+
evalSet,
|
|
510
|
+
skillDescription: currentDescription,
|
|
511
|
+
skillName,
|
|
512
|
+
agent,
|
|
513
|
+
modelFlag: options.validationModel,
|
|
514
|
+
});
|
|
515
|
+
tui.done(
|
|
516
|
+
`Baseline: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
|
|
517
|
+
);
|
|
518
|
+
|
|
292
519
|
recordAudit(
|
|
293
520
|
lastProposal.proposal_id,
|
|
294
|
-
"
|
|
295
|
-
`
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
:
|
|
521
|
+
"validated",
|
|
522
|
+
`Baseline check: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
|
|
523
|
+
);
|
|
524
|
+
|
|
525
|
+
if (!baselineResult.adds_value) {
|
|
526
|
+
finishTui();
|
|
527
|
+
return withStats({
|
|
528
|
+
proposal: lastProposal,
|
|
529
|
+
validation: lastValidation,
|
|
530
|
+
deployed: false,
|
|
531
|
+
auditEntries,
|
|
532
|
+
reason: `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
|
|
533
|
+
baselineResult,
|
|
534
|
+
});
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
// -----------------------------------------------------------------------
|
|
539
|
+
// Step 13c: Gate validation (--cheap-loop / --gate-model)
|
|
540
|
+
// -----------------------------------------------------------------------
|
|
541
|
+
let gateValidation: ValidationResult | undefined;
|
|
542
|
+
if (options.gateModel && lastProposal && lastValidation?.improved) {
|
|
543
|
+
tui.step(`Gate validation (${options.gateModel})...`);
|
|
544
|
+
gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
|
|
545
|
+
tui.done(
|
|
546
|
+
`Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
547
|
+
);
|
|
548
|
+
|
|
549
|
+
recordAudit(
|
|
550
|
+
lastProposal.proposal_id,
|
|
551
|
+
"validated",
|
|
552
|
+
`Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
553
|
+
);
|
|
554
|
+
|
|
555
|
+
if (!gateValidation.improved) {
|
|
556
|
+
finishTui();
|
|
557
|
+
return withStats({
|
|
558
|
+
proposal: lastProposal,
|
|
559
|
+
validation: lastValidation,
|
|
560
|
+
deployed: false,
|
|
561
|
+
auditEntries,
|
|
562
|
+
reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
563
|
+
gateValidation,
|
|
564
|
+
...(baselineResult ? { baselineResult } : {}),
|
|
565
|
+
});
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
// -----------------------------------------------------------------------
|
|
570
|
+
// Step 14: Deploy — write updated description to SKILL.md
|
|
571
|
+
// -----------------------------------------------------------------------
|
|
572
|
+
if (lastProposal && lastValidation?.improved) {
|
|
573
|
+
// Create backup before modifying
|
|
574
|
+
const backupPath = `${skillPath}.bak`;
|
|
575
|
+
copyFileSync(skillPath, backupPath);
|
|
576
|
+
tui.done(`Backup created at ${backupPath}`);
|
|
577
|
+
|
|
578
|
+
// Replace the frontmatter description
|
|
579
|
+
const updatedContent = replaceFrontmatterDescription(
|
|
580
|
+
rawContent,
|
|
581
|
+
lastProposal.proposed_description,
|
|
304
582
|
);
|
|
583
|
+
writeFileSync(skillPath, updatedContent, "utf-8");
|
|
584
|
+
tui.done(`Deployed updated description to ${skillPath}`);
|
|
585
|
+
|
|
586
|
+
recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, {
|
|
587
|
+
total: evalSet.length,
|
|
588
|
+
passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
589
|
+
failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
|
|
590
|
+
pass_rate: lastValidation.after_pass_rate,
|
|
591
|
+
});
|
|
305
592
|
}
|
|
306
593
|
|
|
307
594
|
// -----------------------------------------------------------------------
|
|
308
|
-
// Step 15
|
|
595
|
+
// Step 15: Update evolution memory
|
|
309
596
|
// -----------------------------------------------------------------------
|
|
310
|
-
|
|
597
|
+
const wasDeployed = lastProposal !== null && lastValidation !== null && lastValidation.improved;
|
|
598
|
+
const evolveResult: EvolveResult = withStats({
|
|
311
599
|
proposal: lastProposal,
|
|
312
600
|
validation: lastValidation,
|
|
313
|
-
deployed:
|
|
601
|
+
deployed: wasDeployed,
|
|
314
602
|
auditEntries,
|
|
315
|
-
reason:
|
|
316
|
-
|
|
603
|
+
reason: wasDeployed
|
|
604
|
+
? "Evolution deployed successfully"
|
|
605
|
+
: "Evolution not deployed: proposal or validation missing",
|
|
606
|
+
...(skillVersion ? { skillVersion } : {}),
|
|
607
|
+
...(baselineResult ? { baselineResult } : {}),
|
|
608
|
+
...(gateValidation ? { gateValidation } : {}),
|
|
609
|
+
});
|
|
610
|
+
|
|
611
|
+
if (lastProposal) {
|
|
612
|
+
try {
|
|
613
|
+
_updateContextAfterEvolve(skillName, lastProposal, evolveResult);
|
|
614
|
+
} catch {
|
|
615
|
+
// Memory writes should never fail the main operation
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
// -----------------------------------------------------------------------
|
|
620
|
+
// Step 16: Return complete result
|
|
621
|
+
// -----------------------------------------------------------------------
|
|
622
|
+
finishTui();
|
|
623
|
+
return evolveResult;
|
|
317
624
|
} catch (error) {
|
|
318
|
-
|
|
625
|
+
tui.destroy();
|
|
626
|
+
// Robust error handling: preserve partial results so callers can inspect progress
|
|
319
627
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
320
|
-
return {
|
|
321
|
-
proposal:
|
|
322
|
-
validation:
|
|
628
|
+
return withStats({
|
|
629
|
+
proposal: lastProposal,
|
|
630
|
+
validation: lastValidation,
|
|
323
631
|
deployed: false,
|
|
324
632
|
auditEntries,
|
|
325
633
|
reason: `Error during evolution: ${errorMessage}`,
|
|
326
|
-
};
|
|
634
|
+
});
|
|
327
635
|
}
|
|
328
636
|
}
|
|
329
637
|
|
|
@@ -341,6 +649,15 @@ export async function cliMain(): Promise<void> {
|
|
|
341
649
|
"dry-run": { type: "boolean", default: false },
|
|
342
650
|
confidence: { type: "string", default: "0.6" },
|
|
343
651
|
"max-iterations": { type: "string", default: "3" },
|
|
652
|
+
pareto: { type: "boolean", default: false },
|
|
653
|
+
candidates: { type: "string", default: "3" },
|
|
654
|
+
"token-efficiency": { type: "boolean", default: false },
|
|
655
|
+
"with-baseline": { type: "boolean", default: false },
|
|
656
|
+
"validation-model": { type: "string", default: "haiku" },
|
|
657
|
+
"cheap-loop": { type: "boolean", default: false },
|
|
658
|
+
"gate-model": { type: "string" },
|
|
659
|
+
"proposal-model": { type: "string" },
|
|
660
|
+
verbose: { type: "boolean", default: false },
|
|
344
661
|
help: { type: "boolean", default: false },
|
|
345
662
|
},
|
|
346
663
|
strict: true,
|
|
@@ -360,6 +677,15 @@ Options:
|
|
|
360
677
|
--dry-run Validate proposal without deploying
|
|
361
678
|
--confidence Confidence threshold 0.0-1.0 (default: 0.6)
|
|
362
679
|
--max-iterations Max retry iterations (default: 3)
|
|
680
|
+
--pareto Enable Pareto multi-candidate selection
|
|
681
|
+
--candidates Number of candidates to generate (default: 3, max: 5)
|
|
682
|
+
--token-efficiency Enable 5D Pareto with token efficiency scoring
|
|
683
|
+
--with-baseline Gate deployment on baseline lift > 0.05
|
|
684
|
+
--validation-model Model for trigger-check validation calls (default: haiku)
|
|
685
|
+
--cheap-loop Use cheap models for loop, expensive model for final gate
|
|
686
|
+
--gate-model Model for final gate validation (default: sonnet when --cheap-loop)
|
|
687
|
+
--proposal-model Model for proposal generation LLM calls
|
|
688
|
+
--verbose Output full EvolveResult JSON (default: compact summary)
|
|
363
689
|
--help Show this help message`);
|
|
364
690
|
process.exit(0);
|
|
365
691
|
}
|
|
@@ -395,6 +721,12 @@ Options:
|
|
|
395
721
|
process.exit(1);
|
|
396
722
|
}
|
|
397
723
|
|
|
724
|
+
const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
|
|
725
|
+
let telemetryRecords: SessionTelemetryRecord[] | undefined;
|
|
726
|
+
if (tokenEfficiencyEnabled) {
|
|
727
|
+
telemetryRecords = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
|
|
728
|
+
}
|
|
729
|
+
|
|
398
730
|
const result = await evolve({
|
|
399
731
|
skillName: values.skill,
|
|
400
732
|
skillPath: values["skill-path"],
|
|
@@ -403,9 +735,40 @@ Options:
|
|
|
403
735
|
dryRun: values["dry-run"] ?? false,
|
|
404
736
|
confidenceThreshold: Number.parseFloat(values.confidence ?? "0.6"),
|
|
405
737
|
maxIterations: Number.parseInt(values["max-iterations"] ?? "3", 10),
|
|
738
|
+
paretoEnabled: values.pareto ?? false,
|
|
739
|
+
candidateCount: Number.parseInt(values.candidates ?? "3", 10),
|
|
740
|
+
tokenEfficiencyEnabled,
|
|
741
|
+
telemetryRecords,
|
|
742
|
+
withBaseline: values["with-baseline"] ?? false,
|
|
743
|
+
validationModel: values["validation-model"],
|
|
744
|
+
cheapLoop: values["cheap-loop"] ?? false,
|
|
745
|
+
gateModel: values["gate-model"],
|
|
746
|
+
proposalModel: values["proposal-model"],
|
|
406
747
|
});
|
|
407
748
|
|
|
408
|
-
|
|
749
|
+
if (values.verbose) {
|
|
750
|
+
console.log(JSON.stringify(result, null, 2));
|
|
751
|
+
} else {
|
|
752
|
+
const summary: EvolveResultSummary = {
|
|
753
|
+
skill: values.skill,
|
|
754
|
+
deployed: result.deployed,
|
|
755
|
+
reason: result.reason,
|
|
756
|
+
before: result.validation?.before_pass_rate ?? 0,
|
|
757
|
+
after: result.validation?.after_pass_rate ?? 0,
|
|
758
|
+
net_change: result.validation?.net_change ?? 0,
|
|
759
|
+
improved: result.validation?.improved ?? false,
|
|
760
|
+
regressions: result.validation?.regressions.length ?? 0,
|
|
761
|
+
new_passes: result.validation?.new_passes.length ?? 0,
|
|
762
|
+
confidence: result.proposal?.confidence ?? 0,
|
|
763
|
+
llm_calls: result.llmCallCount,
|
|
764
|
+
elapsed_s: +(result.elapsedMs / 1000).toFixed(1),
|
|
765
|
+
proposal_id: result.proposal?.proposal_id ?? "",
|
|
766
|
+
rationale: result.proposal?.rationale ?? "",
|
|
767
|
+
...(result.skillVersion ? { version: result.skillVersion } : {}),
|
|
768
|
+
dashboard_url: `http://localhost:3141/report/${encodeURIComponent(values.skill)}`,
|
|
769
|
+
};
|
|
770
|
+
console.log(JSON.stringify(summary, null, 2));
|
|
771
|
+
}
|
|
409
772
|
process.exit(result.deployed ? 0 : 1);
|
|
410
773
|
}
|
|
411
774
|
|