selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +37 -0
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +25 -3
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +466 -103
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +19 -2
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +138 -18
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  51. package/cli/selftune/init.ts +150 -3
  52. package/cli/selftune/memory/writer.ts +447 -0
  53. package/cli/selftune/monitoring/watch.ts +25 -2
  54. package/cli/selftune/status.ts +17 -13
  55. package/cli/selftune/types.ts +377 -5
  56. package/cli/selftune/utils/frontmatter.ts +217 -0
  57. package/cli/selftune/utils/llm-call.ts +29 -3
  58. package/cli/selftune/utils/transcript.ts +35 -0
  59. package/cli/selftune/utils/trigger-check.ts +89 -0
  60. package/cli/selftune/utils/tui.ts +156 -0
  61. package/dashboard/index.html +569 -8
  62. package/package.json +8 -4
  63. package/skill/SKILL.md +124 -8
  64. package/skill/Workflows/AutoActivation.md +144 -0
  65. package/skill/Workflows/Badge.md +118 -0
  66. package/skill/Workflows/Baseline.md +121 -0
  67. package/skill/Workflows/Composability.md +100 -0
  68. package/skill/Workflows/Contribute.md +91 -0
  69. package/skill/Workflows/Cron.md +155 -0
  70. package/skill/Workflows/Dashboard.md +203 -0
  71. package/skill/Workflows/Doctor.md +37 -1
  72. package/skill/Workflows/Evals.md +69 -1
  73. package/skill/Workflows/EvolutionMemory.md +152 -0
  74. package/skill/Workflows/Evolve.md +111 -6
  75. package/skill/Workflows/EvolveBody.md +159 -0
  76. package/skill/Workflows/ImportSkillsBench.md +111 -0
  77. package/skill/Workflows/Ingest.md +117 -3
  78. package/skill/Workflows/Initialize.md +57 -3
  79. package/skill/Workflows/Replay.md +70 -0
  80. package/skill/Workflows/Rollback.md +20 -1
  81. package/skill/Workflows/UnitTest.md +138 -0
  82. package/skill/Workflows/Watch.md +22 -0
  83. package/skill/settings_snippet.json +23 -0
  84. package/templates/activation-rules-default.json +27 -0
  85. package/templates/multi-skill-settings.json +64 -0
  86. package/templates/single-skill-settings.json +58 -0
@@ -6,25 +6,45 @@
6
6
  * logic and comprehensive audit tracking.
7
7
  */
8
8
 
9
- import { existsSync, readFileSync } from "node:fs";
9
+ import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
10
10
  import { parseArgs } from "node:util";
11
11
 
12
- import { QUERY_LOG, SKILL_LOG } from "../constants.js";
12
+ import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
13
+ import type { BaselineMeasurement } from "../eval/baseline.js";
14
+ import { measureBaseline } from "../eval/baseline.js";
13
15
  import { buildEvalSet } from "../eval/hooks-to-evals.js";
16
+ import { updateContextAfterEvolve } from "../memory/writer.js";
14
17
  import type {
15
18
  EvalEntry,
16
19
  EvalPassRate,
17
20
  EvolutionAuditEntry,
18
21
  EvolutionProposal,
22
+ EvolveResultSummary,
23
+ FailurePattern,
24
+ GradingResult,
25
+ ParetoCandidate,
19
26
  QueryLogRecord,
27
+ SessionTelemetryRecord,
20
28
  SkillUsageRecord,
21
29
  } from "../types.js";
30
+ import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
22
31
  import { readJsonl } from "../utils/jsonl.js";
32
+ import { createEvolveTUI } from "../utils/tui.js";
23
33
  import { appendAuditEntry } from "./audit.js";
24
34
  import { extractFailurePatterns } from "./extract-patterns.js";
25
- import { generateProposal } from "./propose-description.js";
35
+ import {
36
+ computeInvocationScores,
37
+ computeParetoFrontier,
38
+ computeTokenEfficiencyScore,
39
+ selectFromFrontier,
40
+ } from "./pareto.js";
41
+ import { generateMultipleProposals, generateProposal } from "./propose-description.js";
26
42
  import type { ValidationResult } from "./validate-proposal.js";
27
- import { validateProposal } from "./validate-proposal.js";
43
+ import {
44
+ TRIGGER_CHECK_BATCH_SIZE,
45
+ VALIDATION_RUNS,
46
+ validateProposal,
47
+ } from "./validate-proposal.js";
28
48
 
29
49
  // ---------------------------------------------------------------------------
30
50
  // Types
@@ -38,6 +58,16 @@ export interface EvolveOptions {
38
58
  dryRun: boolean;
39
59
  confidenceThreshold: number; // default 0.6
40
60
  maxIterations: number; // default 3
61
+ gradingResults?: GradingResult[];
62
+ paretoEnabled?: boolean;
63
+ candidateCount?: number;
64
+ tokenEfficiencyEnabled?: boolean;
65
+ telemetryRecords?: SessionTelemetryRecord[];
66
+ withBaseline?: boolean;
67
+ validationModel?: string;
68
+ cheapLoop?: boolean;
69
+ gateModel?: string;
70
+ proposalModel?: string;
41
71
  }
42
72
 
43
73
  export interface EvolveResult {
@@ -46,6 +76,11 @@ export interface EvolveResult {
46
76
  deployed: boolean;
47
77
  auditEntries: EvolutionAuditEntry[];
48
78
  reason: string;
79
+ skillVersion?: string;
80
+ llmCallCount: number;
81
+ elapsedMs: number;
82
+ baselineResult?: BaselineMeasurement;
83
+ gateValidation?: ValidationResult;
49
84
  }
50
85
 
51
86
  /**
@@ -53,11 +88,19 @@ export interface EvolveResult {
53
88
  * imports are used. Pass overrides in tests to avoid mock.module().
54
89
  */
55
90
  export interface EvolveDeps {
56
- extractFailurePatterns?: typeof import("./extract-patterns.js").extractFailurePatterns;
91
+ extractFailurePatterns?: (
92
+ evalEntries: EvalEntry[],
93
+ skillUsage: SkillUsageRecord[],
94
+ skillName: string,
95
+ gradingResults?: GradingResult[],
96
+ ) => FailurePattern[];
57
97
  generateProposal?: typeof import("./propose-description.js").generateProposal;
58
98
  validateProposal?: typeof import("./validate-proposal.js").validateProposal;
99
+ gateValidateProposal?: typeof import("./validate-proposal.js").validateProposal;
59
100
  appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
60
101
  buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
102
+ updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
103
+ measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
61
104
  }
62
105
 
63
106
  // ---------------------------------------------------------------------------
@@ -69,12 +112,14 @@ function createAuditEntry(
69
112
  action: EvolutionAuditEntry["action"],
70
113
  details: string,
71
114
  evalSnapshot?: EvalPassRate,
115
+ skillName?: string,
72
116
  ): EvolutionAuditEntry {
73
117
  return {
74
118
  timestamp: new Date().toISOString(),
75
119
  proposal_id: proposalId,
76
120
  action,
77
121
  details,
122
+ ...(skillName ? { skill_name: skillName } : {}),
78
123
  ...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
79
124
  };
80
125
  }
@@ -90,12 +135,22 @@ export async function evolve(
90
135
  const { skillName, skillPath, evalSetPath, agent, dryRun, confidenceThreshold, maxIterations } =
91
136
  options;
92
137
 
138
+ // Apply cheap-loop defaults: cheap models for proposal/validation, expensive for gate
139
+ if (options.cheapLoop) {
140
+ if (!options.proposalModel) options.proposalModel = "haiku";
141
+ if (!options.validationModel) options.validationModel = "haiku";
142
+ if (!options.gateModel) options.gateModel = "sonnet";
143
+ }
144
+
93
145
  // Resolve injectable dependencies with real-import fallbacks
94
146
  const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
95
147
  const _generateProposal = _deps.generateProposal ?? generateProposal;
96
148
  const _validateProposal = _deps.validateProposal ?? validateProposal;
149
+ const _gateValidateProposal = _deps.gateValidateProposal ?? validateProposal;
97
150
  const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
98
151
  const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
152
+ const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
153
+ const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
99
154
 
100
155
  const auditEntries: EvolutionAuditEntry[] = [];
101
156
 
@@ -105,7 +160,7 @@ export async function evolve(
105
160
  details: string,
106
161
  evalSnapshot?: EvalPassRate,
107
162
  ): void {
108
- const entry = createAuditEntry(proposalId, action, details, evalSnapshot);
163
+ const entry = createAuditEntry(proposalId, action, details, evalSnapshot, skillName);
109
164
  auditEntries.push(entry);
110
165
  try {
111
166
  _appendAuditEntry(entry);
@@ -114,21 +169,47 @@ export async function evolve(
114
169
  }
115
170
  }
116
171
 
172
+ const pipelineStart = Date.now();
173
+ let llmCallCount = 0;
174
+ const tui = createEvolveTUI({ skillName, model: options.proposalModel ?? "(default)" });
175
+ const finishTui = () =>
176
+ tui.finish(
177
+ `${llmCallCount} LLM calls \u00b7 ${((Date.now() - pipelineStart) / 1000).toFixed(1)}s elapsed`,
178
+ );
179
+
180
+ /** Stamp every return with pipeline stats so callers always get them. */
181
+ const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => ({
182
+ ...r,
183
+ llmCallCount,
184
+ elapsedMs: Date.now() - pipelineStart,
185
+ });
186
+
187
+ // Hoisted so catch block can preserve partial results on error
188
+ let lastProposal: EvolutionProposal | null = null;
189
+ let lastValidation: ValidationResult | null = null;
190
+
117
191
  try {
118
192
  // -----------------------------------------------------------------------
119
193
  // Step 1: Read current SKILL.md
120
194
  // -----------------------------------------------------------------------
121
195
  if (!existsSync(skillPath)) {
122
- return {
196
+ tui.fail(`SKILL.md not found at ${skillPath}`);
197
+ finishTui();
198
+ return withStats({
123
199
  proposal: null,
124
200
  validation: null,
125
201
  deployed: false,
126
202
  auditEntries,
127
203
  reason: `SKILL.md not found at ${skillPath}`,
128
- };
204
+ });
129
205
  }
130
206
 
131
- const currentDescription = readFileSync(skillPath, "utf-8");
207
+ const rawContent = readFileSync(skillPath, "utf-8");
208
+ const frontmatter = parseFrontmatter(rawContent);
209
+ const currentDescription = frontmatter.description || rawContent;
210
+ const skillVersion = frontmatter.version || undefined;
211
+ const versionTag = skillVersion ? `, v${skillVersion}` : "";
212
+ tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
132
213
 
133
214
  // -----------------------------------------------------------------------
134
215
  // Step 2: Load eval set
@@ -145,6 +226,10 @@ export async function evolve(
145
226
  evalSet = _buildEvalSet(skillRecords, queryRecords, skillName);
146
227
  }
147
228
 
229
+ const posCount = evalSet.filter((e) => e.should_trigger).length;
230
+ const negCount = evalSet.filter((e) => !e.should_trigger).length;
231
+ tui.done(`Loaded eval set (${evalSet.length} entries: ${posCount}+, ${negCount}-)`);
232
+
148
233
  // -----------------------------------------------------------------------
149
234
  // Step 3: Load skill usage records
150
235
  // -----------------------------------------------------------------------
@@ -153,19 +238,30 @@ export async function evolve(
153
238
  // -----------------------------------------------------------------------
154
239
  // Step 4: Extract failure patterns
155
240
  // -----------------------------------------------------------------------
156
- const failurePatterns = _extractFailurePatterns(evalSet, skillUsage, skillName);
241
+ const failurePatterns = _extractFailurePatterns(
242
+ evalSet,
243
+ skillUsage,
244
+ skillName,
245
+ options.gradingResults,
246
+ );
247
+
248
+ const totalMissed = failurePatterns.reduce((sum, p) => sum + p.missed_queries.length, 0);
249
+ tui.done(
250
+ `Extracted ${failurePatterns.length} failure pattern(s) (${totalMissed} missed queries)`,
251
+ );
157
252
 
158
253
  // -----------------------------------------------------------------------
159
254
  // Step 5: Early exit if no patterns
160
255
  // -----------------------------------------------------------------------
161
256
  if (failurePatterns.length === 0) {
162
- return {
257
+ finishTui();
258
+ return withStats({
163
259
  proposal: null,
164
260
  validation: null,
165
261
  deployed: false,
166
262
  auditEntries,
167
263
  reason: "No failure patterns found",
168
- };
264
+ });
169
265
  }
170
266
 
171
267
  // -----------------------------------------------------------------------
@@ -174,156 +270,368 @@ export async function evolve(
174
270
  const missedQueries = failurePatterns.flatMap((p) => p.missed_queries);
175
271
 
176
272
  // -----------------------------------------------------------------------
177
- // Steps 7-12: Retry loop for proposal generation and validation
273
+ // Steps 7-12: Proposal generation and validation
178
274
  // -----------------------------------------------------------------------
179
- let lastProposal: EvolutionProposal | null = null;
180
- let lastValidation: ValidationResult | null = null;
181
- let feedbackReason = "";
182
275
 
183
- for (let iteration = 0; iteration < maxIterations; iteration++) {
184
- // Step 7: Generate proposal
185
- const effectiveMissedQueries = feedbackReason
186
- ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
187
- : missedQueries;
276
+ // -----------------------------------------------------------------------
277
+ // Pareto multi-candidate path
278
+ // -----------------------------------------------------------------------
279
+ const paretoEnabled = options.paretoEnabled ?? false;
280
+ const candidateCount = options.candidateCount ?? 3;
281
+ const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
282
+
283
+ // Compute token efficiency score if enabled and telemetry is available
284
+ let tokenEffScore: number | undefined;
285
+ if (tokenEfficiencyEnabled && options.telemetryRecords && options.telemetryRecords.length > 0) {
286
+ tokenEffScore = computeTokenEfficiencyScore(skillName, options.telemetryRecords);
287
+ recordAudit(
288
+ "system",
289
+ "created",
290
+ `Token efficiency score for ${skillName}: ${tokenEffScore.toFixed(3)}`,
291
+ );
292
+ }
188
293
 
189
- const proposal = await _generateProposal(
294
+ if (paretoEnabled && candidateCount > 1) {
295
+ // Generate N candidates in parallel
296
+ const candidates = await generateMultipleProposals(
190
297
  currentDescription,
191
298
  failurePatterns,
192
- effectiveMissedQueries,
299
+ missedQueries,
193
300
  skillName,
194
301
  skillPath,
195
302
  agent,
303
+ candidateCount,
304
+ options.proposalModel,
196
305
  );
197
306
 
198
- lastProposal = proposal;
307
+ // Filter by confidence threshold
308
+ const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
309
+
310
+ if (viableCandidates.length === 0) {
311
+ finishTui();
312
+ return withStats({
313
+ proposal: candidates[0] ?? null,
314
+ validation: null,
315
+ deployed: false,
316
+ auditEntries,
317
+ reason: `No candidates met confidence threshold ${confidenceThreshold}`,
318
+ });
319
+ }
199
320
 
200
- // Step 8: Audit "created"
201
- recordAudit(
202
- proposal.proposal_id,
203
- "created",
204
- `Proposal created for ${skillName} (iteration ${iteration + 1})`,
205
- );
321
+ // Validate each candidate
322
+ const paretoCandidates: ParetoCandidate[] = [];
323
+ for (const proposal of viableCandidates) {
324
+ recordAudit(proposal.proposal_id, "created", `Pareto candidate for ${skillName}`);
206
325
 
207
- // Step 9: Check confidence threshold
208
- if (proposal.confidence < confidenceThreshold) {
209
- feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
326
+ const validation = await _validateProposal(
327
+ proposal,
328
+ evalSet,
329
+ agent,
330
+ options.validationModel,
331
+ );
210
332
  recordAudit(
211
333
  proposal.proposal_id,
212
- "rejected",
213
- `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
334
+ "validated",
335
+ `Pareto validation: improved=${validation.improved}`,
214
336
  );
215
337
 
216
- // If this is the last iteration, return early with rejection
217
- if (iteration === maxIterations - 1) {
218
- return {
219
- proposal: lastProposal,
220
- validation: null,
221
- deployed: false,
222
- auditEntries,
223
- reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
338
+ if (validation.improved && validation.per_entry_results) {
339
+ const invocationScores = computeInvocationScores(validation.per_entry_results);
340
+ const candidate: ParetoCandidate = {
341
+ proposal,
342
+ validation,
343
+ invocation_scores: invocationScores,
344
+ dominates_on: [],
224
345
  };
346
+ if (tokenEffScore !== undefined) {
347
+ candidate.token_efficiency_score = tokenEffScore;
348
+ }
349
+ paretoCandidates.push(candidate);
225
350
  }
351
+ }
226
352
 
227
- continue;
353
+ if (paretoCandidates.length === 0) {
354
+ finishTui();
355
+ return withStats({
356
+ proposal: viableCandidates[0],
357
+ validation: null,
358
+ deployed: false,
359
+ auditEntries,
360
+ reason: "No Pareto candidates improved validation",
361
+ });
228
362
  }
229
363
 
230
- // Step 10: Validate against eval set
231
- const validation = await _validateProposal(proposal, evalSet, agent);
232
- lastValidation = validation;
364
+ // Compute Pareto frontier
365
+ const frontier = computeParetoFrontier(paretoCandidates);
366
+ const { best } = selectFromFrontier(frontier);
233
367
 
234
- // Step 11: Audit "validated"
235
- const evalSnapshot: EvalPassRate = {
236
- total: evalSet.length,
237
- passed: Math.round(validation.after_pass_rate * evalSet.length),
238
- failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
239
- pass_rate: validation.after_pass_rate,
240
- };
241
- recordAudit(
242
- proposal.proposal_id,
243
- "validated",
244
- `Validation complete: improved=${validation.improved}`,
245
- evalSnapshot,
246
- );
368
+ lastProposal = best.proposal;
369
+ lastValidation = best.validation;
247
370
 
248
- // Step 12: Check validation result
249
- if (!validation.improved) {
250
- feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
371
+ // Skip the standard retry loop — we already have our result
372
+ } else {
373
+ // Standard single-candidate retry loop
374
+ let feedbackReason = "";
375
+
376
+ for (let iteration = 0; iteration < maxIterations; iteration++) {
377
+ // Step 7: Generate proposal
378
+ const effectiveMissedQueries = feedbackReason
379
+ ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
380
+ : missedQueries;
381
+
382
+ tui.step(`Generating proposal (iteration ${iteration + 1}/${maxIterations})...`);
383
+ const proposal = await _generateProposal(
384
+ currentDescription,
385
+ failurePatterns,
386
+ effectiveMissedQueries,
387
+ skillName,
388
+ skillPath,
389
+ agent,
390
+ options.proposalModel,
391
+ );
392
+ llmCallCount++;
393
+
394
+ lastProposal = proposal;
395
+ tui.done(`Proposal generated (conf: ${proposal.confidence.toFixed(2)})`);
396
+
397
+ // Step 8: Audit "created"
251
398
  recordAudit(
252
399
  proposal.proposal_id,
253
- "rejected",
254
- `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
400
+ "created",
401
+ `Proposal created for ${skillName} (iteration ${iteration + 1})`,
255
402
  );
256
403
 
257
- // If this is the last iteration, return with rejection
258
- if (iteration === maxIterations - 1) {
259
- return {
260
- proposal: lastProposal,
261
- validation: lastValidation,
262
- deployed: false,
263
- auditEntries,
264
- reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
265
- };
404
+ // Step 9: Check confidence threshold
405
+ if (proposal.confidence < confidenceThreshold) {
406
+ feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
407
+ recordAudit(
408
+ proposal.proposal_id,
409
+ "rejected",
410
+ `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
411
+ );
412
+
413
+ // If this is the last iteration, return early with rejection
414
+ if (iteration === maxIterations - 1) {
415
+ finishTui();
416
+ return withStats({
417
+ proposal: lastProposal,
418
+ validation: null,
419
+ deployed: false,
420
+ auditEntries,
421
+ reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
422
+ });
423
+ }
424
+
425
+ continue;
266
426
  }
267
427
 
268
- continue;
269
- }
428
+ // Step 10: Validate against eval set
429
+ const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
430
+ tui.step(
431
+ `Validating ${evalSet.length} entries (${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
432
+ );
433
+ const validation = await _validateProposal(
434
+ proposal,
435
+ evalSet,
436
+ agent,
437
+ options.validationModel,
438
+ );
439
+ lastValidation = validation;
440
+ llmCallCount += batchCount * 2 * VALIDATION_RUNS;
441
+ tui.done(
442
+ `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
443
+ );
444
+
445
+ // Step 11: Audit "validated"
446
+ const evalSnapshot: EvalPassRate = {
447
+ total: evalSet.length,
448
+ passed: Math.round(validation.after_pass_rate * evalSet.length),
449
+ failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
450
+ pass_rate: validation.after_pass_rate,
451
+ };
452
+ recordAudit(
453
+ proposal.proposal_id,
454
+ "validated",
455
+ `Validation complete: improved=${validation.improved}`,
456
+ evalSnapshot,
457
+ );
270
458
 
271
- // Validation passed - break out of retry loop
272
- break;
459
+ // Step 12: Check validation result
460
+ if (!validation.improved) {
461
+ feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
462
+ recordAudit(
463
+ proposal.proposal_id,
464
+ "rejected",
465
+ `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
466
+ );
467
+
468
+ // If this is the last iteration, return with rejection
469
+ if (iteration === maxIterations - 1) {
470
+ finishTui();
471
+ return withStats({
472
+ proposal: lastProposal,
473
+ validation: lastValidation,
474
+ deployed: false,
475
+ auditEntries,
476
+ reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
477
+ });
478
+ }
479
+
480
+ continue;
481
+ }
482
+
483
+ // Validation passed - break out of retry loop
484
+ break;
485
+ }
273
486
  }
274
487
 
275
488
  // -----------------------------------------------------------------------
276
489
  // Step 13: Dry run check
277
490
  // -----------------------------------------------------------------------
278
491
  if (dryRun) {
279
- return {
492
+ finishTui();
493
+ return withStats({
280
494
  proposal: lastProposal,
281
495
  validation: lastValidation,
282
496
  deployed: false,
283
497
  auditEntries,
284
498
  reason: "Dry run - proposal validated but not deployed",
285
- };
499
+ });
286
500
  }
287
501
 
288
502
  // -----------------------------------------------------------------------
289
- // Step 14: Deploy (actual deploy wired in TASK-14)
503
+ // Step 13b: Baseline gate (--with-baseline)
290
504
  // -----------------------------------------------------------------------
291
- if (lastProposal) {
505
+ let baselineResult: BaselineMeasurement | undefined;
506
+ if (options.withBaseline && lastProposal) {
507
+ tui.step("Measuring baseline...");
508
+ baselineResult = await _measureBaseline({
509
+ evalSet,
510
+ skillDescription: currentDescription,
511
+ skillName,
512
+ agent,
513
+ modelFlag: options.validationModel,
514
+ });
515
+ tui.done(
516
+ `Baseline: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
517
+ );
518
+
292
519
  recordAudit(
293
520
  lastProposal.proposal_id,
294
- "deployed",
295
- `Deployed proposal for ${skillName}`,
296
- lastValidation
297
- ? {
298
- total: evalSet.length,
299
- passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
300
- failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
301
- pass_rate: lastValidation.after_pass_rate,
302
- }
303
- : undefined,
521
+ "validated",
522
+ `Baseline check: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
523
+ );
524
+
525
+ if (!baselineResult.adds_value) {
526
+ finishTui();
527
+ return withStats({
528
+ proposal: lastProposal,
529
+ validation: lastValidation,
530
+ deployed: false,
531
+ auditEntries,
532
+ reason: `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
533
+ baselineResult,
534
+ });
535
+ }
536
+ }
537
+
538
+ // -----------------------------------------------------------------------
539
+ // Step 13c: Gate validation (--cheap-loop / --gate-model)
540
+ // -----------------------------------------------------------------------
541
+ let gateValidation: ValidationResult | undefined;
542
+ if (options.gateModel && lastProposal && lastValidation?.improved) {
543
+ tui.step(`Gate validation (${options.gateModel})...`);
544
+ gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
545
+ tui.done(
546
+ `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
547
+ );
548
+
549
+ recordAudit(
550
+ lastProposal.proposal_id,
551
+ "validated",
552
+ `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
553
+ );
554
+
555
+ if (!gateValidation.improved) {
556
+ finishTui();
557
+ return withStats({
558
+ proposal: lastProposal,
559
+ validation: lastValidation,
560
+ deployed: false,
561
+ auditEntries,
562
+ reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
563
+ gateValidation,
564
+ ...(baselineResult ? { baselineResult } : {}),
565
+ });
566
+ }
567
+ }
568
+
569
+ // -----------------------------------------------------------------------
570
+ // Step 14: Deploy — write updated description to SKILL.md
571
+ // -----------------------------------------------------------------------
572
+ if (lastProposal && lastValidation?.improved) {
573
+ // Create backup before modifying
574
+ const backupPath = `${skillPath}.bak`;
575
+ copyFileSync(skillPath, backupPath);
576
+ tui.done(`Backup created at ${backupPath}`);
577
+
578
+ // Replace the frontmatter description
579
+ const updatedContent = replaceFrontmatterDescription(
580
+ rawContent,
581
+ lastProposal.proposed_description,
304
582
  );
583
+ writeFileSync(skillPath, updatedContent, "utf-8");
584
+ tui.done(`Deployed updated description to ${skillPath}`);
585
+
586
+ recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, {
587
+ total: evalSet.length,
588
+ passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
589
+ failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
590
+ pass_rate: lastValidation.after_pass_rate,
591
+ });
305
592
  }
306
593
 
307
594
  // -----------------------------------------------------------------------
308
- // Step 15-16: Return complete result
595
+ // Step 15: Update evolution memory
309
596
  // -----------------------------------------------------------------------
310
- return {
597
+ const wasDeployed = lastProposal !== null && lastValidation !== null && lastValidation.improved;
598
+ const evolveResult: EvolveResult = withStats({
311
599
  proposal: lastProposal,
312
600
  validation: lastValidation,
313
- deployed: true,
601
+ deployed: wasDeployed,
314
602
  auditEntries,
315
- reason: "Evolution deployed successfully",
316
- };
603
+ reason: wasDeployed
604
+ ? "Evolution deployed successfully"
605
+ : "Evolution not deployed: proposal or validation missing",
606
+ ...(skillVersion ? { skillVersion } : {}),
607
+ ...(baselineResult ? { baselineResult } : {}),
608
+ ...(gateValidation ? { gateValidation } : {}),
609
+ });
610
+
611
+ if (lastProposal) {
612
+ try {
613
+ _updateContextAfterEvolve(skillName, lastProposal, evolveResult);
614
+ } catch {
615
+ // Memory writes should never fail the main operation
616
+ }
617
+ }
618
+
619
+ // -----------------------------------------------------------------------
620
+ // Step 16: Return complete result
621
+ // -----------------------------------------------------------------------
622
+ finishTui();
623
+ return evolveResult;
317
624
  } catch (error) {
318
- // Robust error handling: catch any unexpected errors and return gracefully
625
+ tui.destroy();
626
+ // Robust error handling: preserve partial results so callers can inspect progress
319
627
  const errorMessage = error instanceof Error ? error.message : String(error);
320
- return {
321
- proposal: null,
322
- validation: null,
628
+ return withStats({
629
+ proposal: lastProposal,
630
+ validation: lastValidation,
323
631
  deployed: false,
324
632
  auditEntries,
325
633
  reason: `Error during evolution: ${errorMessage}`,
326
- };
634
+ });
327
635
  }
328
636
  }
329
637
 
@@ -341,6 +649,15 @@ export async function cliMain(): Promise<void> {
341
649
  "dry-run": { type: "boolean", default: false },
342
650
  confidence: { type: "string", default: "0.6" },
343
651
  "max-iterations": { type: "string", default: "3" },
652
+ pareto: { type: "boolean", default: false },
653
+ candidates: { type: "string", default: "3" },
654
+ "token-efficiency": { type: "boolean", default: false },
655
+ "with-baseline": { type: "boolean", default: false },
656
+ "validation-model": { type: "string", default: "haiku" },
657
+ "cheap-loop": { type: "boolean", default: false },
658
+ "gate-model": { type: "string" },
659
+ "proposal-model": { type: "string" },
660
+ verbose: { type: "boolean", default: false },
344
661
  help: { type: "boolean", default: false },
345
662
  },
346
663
  strict: true,
@@ -360,6 +677,15 @@ Options:
360
677
  --dry-run Validate proposal without deploying
361
678
  --confidence Confidence threshold 0.0-1.0 (default: 0.6)
362
679
  --max-iterations Max retry iterations (default: 3)
680
+ --pareto Enable Pareto multi-candidate selection
681
+ --candidates Number of candidates to generate (default: 3, max: 5)
682
+ --token-efficiency Enable 5D Pareto with token efficiency scoring
683
+ --with-baseline Gate deployment on baseline lift > 0.05
684
+ --validation-model Model for trigger-check validation calls (default: haiku)
685
+ --cheap-loop Use cheap models for loop, expensive model for final gate
686
+ --gate-model Model for final gate validation (default: sonnet when --cheap-loop)
687
+ --proposal-model Model for proposal generation LLM calls
688
+ --verbose Output full EvolveResult JSON (default: compact summary)
363
689
  --help Show this help message`);
364
690
  process.exit(0);
365
691
  }
@@ -395,6 +721,12 @@ Options:
395
721
  process.exit(1);
396
722
  }
397
723
 
724
+ const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
725
+ let telemetryRecords: SessionTelemetryRecord[] | undefined;
726
+ if (tokenEfficiencyEnabled) {
727
+ telemetryRecords = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
728
+ }
729
+
398
730
  const result = await evolve({
399
731
  skillName: values.skill,
400
732
  skillPath: values["skill-path"],
@@ -403,9 +735,40 @@ Options:
403
735
  dryRun: values["dry-run"] ?? false,
404
736
  confidenceThreshold: Number.parseFloat(values.confidence ?? "0.6"),
405
737
  maxIterations: Number.parseInt(values["max-iterations"] ?? "3", 10),
738
+ paretoEnabled: values.pareto ?? false,
739
+ candidateCount: Number.parseInt(values.candidates ?? "3", 10),
740
+ tokenEfficiencyEnabled,
741
+ telemetryRecords,
742
+ withBaseline: values["with-baseline"] ?? false,
743
+ validationModel: values["validation-model"],
744
+ cheapLoop: values["cheap-loop"] ?? false,
745
+ gateModel: values["gate-model"],
746
+ proposalModel: values["proposal-model"],
406
747
  });
407
748
 
408
- console.log(JSON.stringify(result, null, 2));
749
+ if (values.verbose) {
750
+ console.log(JSON.stringify(result, null, 2));
751
+ } else {
752
+ const summary: EvolveResultSummary = {
753
+ skill: values.skill,
754
+ deployed: result.deployed,
755
+ reason: result.reason,
756
+ before: result.validation?.before_pass_rate ?? 0,
757
+ after: result.validation?.after_pass_rate ?? 0,
758
+ net_change: result.validation?.net_change ?? 0,
759
+ improved: result.validation?.improved ?? false,
760
+ regressions: result.validation?.regressions.length ?? 0,
761
+ new_passes: result.validation?.new_passes.length ?? 0,
762
+ confidence: result.proposal?.confidence ?? 0,
763
+ llm_calls: result.llmCallCount,
764
+ elapsed_s: +(result.elapsedMs / 1000).toFixed(1),
765
+ proposal_id: result.proposal?.proposal_id ?? "",
766
+ rationale: result.proposal?.rationale ?? "",
767
+ ...(result.skillVersion ? { version: result.skillVersion } : {}),
768
+ dashboard_url: `http://localhost:3141/report/${encodeURIComponent(values.skill)}`,
769
+ };
770
+ console.log(JSON.stringify(summary, null, 2));
771
+ }
409
772
  process.exit(result.deployed ? 0 : 1);
410
773
  }
411
774