selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
@@ -6,25 +6,50 @@
6
6
  * logic and comprehensive audit tracking.
7
7
  */
8
8
 
9
- import { existsSync, readFileSync } from "node:fs";
9
+ import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
10
10
  import { parseArgs } from "node:util";
11
11
 
12
- import { QUERY_LOG, SKILL_LOG } from "../constants.js";
12
+ import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
13
+ import type { BaselineMeasurement } from "../eval/baseline.js";
14
+ import { measureBaseline } from "../eval/baseline.js";
13
15
  import { buildEvalSet } from "../eval/hooks-to-evals.js";
16
+ import { readGradingResultsForSkill } from "../grading/results.js";
17
+ import { updateContextAfterEvolve } from "../memory/writer.js";
18
+ import type { SyncResult } from "../sync.js";
14
19
  import type {
15
20
  EvalEntry,
16
21
  EvalPassRate,
17
22
  EvolutionAuditEntry,
23
+ EvolutionEvidenceEntry,
18
24
  EvolutionProposal,
25
+ EvolveResultSummary,
26
+ FailurePattern,
27
+ GradingResult,
28
+ ParetoCandidate,
19
29
  QueryLogRecord,
30
+ SessionTelemetryRecord,
20
31
  SkillUsageRecord,
21
32
  } from "../types.js";
33
+ import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
22
34
  import { readJsonl } from "../utils/jsonl.js";
35
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
36
+ import { createEvolveTUI } from "../utils/tui.js";
23
37
  import { appendAuditEntry } from "./audit.js";
38
+ import { appendEvidenceEntry } from "./evidence.js";
24
39
  import { extractFailurePatterns } from "./extract-patterns.js";
25
- import { generateProposal } from "./propose-description.js";
40
+ import {
41
+ computeInvocationScores,
42
+ computeParetoFrontier,
43
+ computeTokenEfficiencyScore,
44
+ selectFromFrontier,
45
+ } from "./pareto.js";
46
+ import { generateMultipleProposals, generateProposal } from "./propose-description.js";
26
47
  import type { ValidationResult } from "./validate-proposal.js";
27
- import { validateProposal } from "./validate-proposal.js";
48
+ import {
49
+ TRIGGER_CHECK_BATCH_SIZE,
50
+ VALIDATION_RUNS,
51
+ validateProposal,
52
+ } from "./validate-proposal.js";
28
53
 
29
54
  // ---------------------------------------------------------------------------
30
55
  // Types
@@ -38,6 +63,18 @@ export interface EvolveOptions {
38
63
  dryRun: boolean;
39
64
  confidenceThreshold: number; // default 0.6
40
65
  maxIterations: number; // default 3
66
+ gradingResults?: GradingResult[];
67
+ paretoEnabled?: boolean;
68
+ candidateCount?: number;
69
+ tokenEfficiencyEnabled?: boolean;
70
+ telemetryRecords?: SessionTelemetryRecord[];
71
+ withBaseline?: boolean;
72
+ validationModel?: string;
73
+ cheapLoop?: boolean;
74
+ gateModel?: string;
75
+ proposalModel?: string;
76
+ syncFirst?: boolean;
77
+ syncForce?: boolean;
41
78
  }
42
79
 
43
80
  export interface EvolveResult {
@@ -46,6 +83,12 @@ export interface EvolveResult {
46
83
  deployed: boolean;
47
84
  auditEntries: EvolutionAuditEntry[];
48
85
  reason: string;
86
+ skillVersion?: string;
87
+ llmCallCount: number;
88
+ elapsedMs: number;
89
+ baselineResult?: BaselineMeasurement;
90
+ gateValidation?: ValidationResult;
91
+ sync_result?: SyncResult;
49
92
  }
50
93
 
51
94
  /**
@@ -53,11 +96,22 @@ export interface EvolveResult {
53
96
  * imports are used. Pass overrides in tests to avoid mock.module().
54
97
  */
55
98
  export interface EvolveDeps {
56
- extractFailurePatterns?: typeof import("./extract-patterns.js").extractFailurePatterns;
99
+ extractFailurePatterns?: ( // explicit signature; evolve() passes options.gradingResults as the optional fourth argument
100
+ evalEntries: EvalEntry[],
101
+ skillUsage: SkillUsageRecord[],
102
+ skillName: string,
103
+ gradingResults?: GradingResult[],
104
+ ) => FailurePattern[];
57
105
  generateProposal?: typeof import("./propose-description.js").generateProposal;
58
106
  validateProposal?: typeof import("./validate-proposal.js").validateProposal;
107
+ gateValidateProposal?: typeof import("./validate-proposal.js").validateProposal; // when omitted, evolve() falls back to the imported validateProposal, not to the validateProposal override above
59
108
  appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
109
+ appendEvidenceEntry?: typeof import("./evidence.js").appendEvidenceEntry; // called through recordEvidence(), which swallows any exception it throws (fail-open)
60
110
  buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
111
+ updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
112
+ measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
113
+ readSkillUsageLog?: () => SkillUsageRecord[]; // when omitted, evolve() calls readEffectiveSkillUsageRecords()
114
+ syncSources?: typeof import("../sync.js").syncSources; // only consulted when options.syncFirst is set; otherwise-unset value falls back to syncSources loaded via dynamic import of ../sync.js
61
115
  }
62
116
 
63
117
  // ---------------------------------------------------------------------------
@@ -69,16 +123,45 @@ function createAuditEntry(
69
123
  action: EvolutionAuditEntry["action"],
70
124
  details: string,
71
125
  evalSnapshot?: EvalPassRate,
126
+ skillName?: string,
72
127
  ): EvolutionAuditEntry {
73
128
  return {
74
129
  timestamp: new Date().toISOString(),
75
130
  proposal_id: proposalId,
76
131
  action,
77
132
  details,
133
+ ...(skillName ? { skill_name: skillName } : {}),
78
134
  ...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
79
135
  };
80
136
  }
81
137
 
138
+ // ---------------------------------------------------------------------------
139
+ // Diff helper
140
+ // ---------------------------------------------------------------------------
141
+
142
+ /**
143
+ * Produce a simple colored diff between two text strings by comparing line i of the old text with line i of the new text (purely positional, with no alignment step, so lines after an insertion or deletion are generally reported as changed).
144
+ * Red "- " lines (old) / Green "+ " lines (new) via ANSI escape codes, skipping lines that are identical at the same index; returns the emitted lines joined with "\n" (an empty string when no line differs).
145
+ */
146
+ function formatSimpleDiff(oldText: string, newText: string): string {
147
+ const oldLines = oldText.split("\n");
148
+ const newLines = newText.split("\n");
149
+ const output: string[] = [];
150
+ const maxLen = Math.max(oldLines.length, newLines.length); // walk as far as the longer of the two texts
151
+ for (let i = 0; i < maxLen; i++) {
152
+ const oldLine = oldLines[i];
153
+ const newLine = newLines[i];
154
+ if (oldLine === newLine) continue; // identical at this index, so omitted from the output
155
+ if (oldLine !== undefined) { // undefined means the old text has no line at this index
156
+ output.push(`\x1b[31m- ${oldLine}\x1b[0m`);
157
+ }
158
+ if (newLine !== undefined) { // undefined means the new text has no line at this index
159
+ output.push(`\x1b[32m+ ${newLine}\x1b[0m`);
160
+ }
161
+ }
162
+ return output.join("\n");
163
+ }
164
+
82
165
  // ---------------------------------------------------------------------------
83
166
  // Main orchestrator
84
167
  // ---------------------------------------------------------------------------
@@ -90,14 +173,27 @@ export async function evolve(
90
173
  const { skillName, skillPath, evalSetPath, agent, dryRun, confidenceThreshold, maxIterations } =
91
174
  options;
92
175
 
176
+ // Apply cheap-loop defaults: cheap models for proposal/validation, expensive for gate
177
+ if (options.cheapLoop) {
178
+ if (!options.proposalModel) options.proposalModel = "haiku";
179
+ if (!options.validationModel) options.validationModel = "haiku";
180
+ if (!options.gateModel) options.gateModel = "sonnet";
181
+ }
182
+
93
183
  // Resolve injectable dependencies with real-import fallbacks
94
184
  const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
95
185
  const _generateProposal = _deps.generateProposal ?? generateProposal;
96
186
  const _validateProposal = _deps.validateProposal ?? validateProposal;
187
+ const _gateValidateProposal = _deps.gateValidateProposal ?? validateProposal;
97
188
  const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
189
+ const _appendEvidenceEntry = _deps.appendEvidenceEntry ?? appendEvidenceEntry;
98
190
  const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
191
+ const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
192
+ const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
193
+ const _readSkillUsageLog = _deps.readSkillUsageLog ?? (() => readEffectiveSkillUsageRecords());
99
194
 
100
195
  const auditEntries: EvolutionAuditEntry[] = [];
196
+ let syncResult: SyncResult | undefined;
101
197
 
102
198
  function recordAudit(
103
199
  proposalId: string,
@@ -105,7 +201,7 @@ export async function evolve(
105
201
  details: string,
106
202
  evalSnapshot?: EvalPassRate,
107
203
  ): void {
108
- const entry = createAuditEntry(proposalId, action, details, evalSnapshot);
204
+ const entry = createAuditEntry(proposalId, action, details, evalSnapshot, skillName);
109
205
  auditEntries.push(entry);
110
206
  try {
111
207
  _appendAuditEntry(entry);
@@ -114,58 +210,171 @@ export async function evolve(
114
210
  }
115
211
  }
116
212
 
213
+ function recordEvidence(entry: EvolutionEvidenceEntry): void {
214
+ try {
215
+ _appendEvidenceEntry(entry);
216
+ } catch {
217
+ // Fail-open: evidence should not block the pipeline
218
+ }
219
+ }
220
+
221
+ const pipelineStart = Date.now();
222
+ let llmCallCount = 0;
223
+ const tui = createEvolveTUI({ skillName, model: options.proposalModel ?? "(default)" });
224
+ const finishTui = () =>
225
+ tui.finish(
226
+ `${llmCallCount} LLM calls \u00b7 ${((Date.now() - pipelineStart) / 1000).toFixed(1)}s elapsed`,
227
+ );
228
+
229
+ /** Stamp every return with pipeline stats so callers always get them. */
230
+ const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => ({
231
+ ...r,
232
+ llmCallCount,
233
+ elapsedMs: Date.now() - pipelineStart,
234
+ ...(syncResult ? { sync_result: syncResult } : {}),
235
+ });
236
+
237
+ // Hoisted so catch block can preserve partial results on error
238
+ let lastProposal: EvolutionProposal | null = null;
239
+ let lastValidation: ValidationResult | null = null;
240
+
117
241
  try {
118
242
  // -----------------------------------------------------------------------
119
243
  // Step 1: Read current SKILL.md
120
244
  // -----------------------------------------------------------------------
121
245
  if (!existsSync(skillPath)) {
122
- return {
246
+ tui.fail(`SKILL.md not found at ${skillPath}`);
247
+ finishTui();
248
+ return withStats({
123
249
  proposal: null,
124
250
  validation: null,
125
251
  deployed: false,
126
252
  auditEntries,
127
253
  reason: `SKILL.md not found at ${skillPath}`,
128
- };
254
+ });
129
255
  }
130
256
 
131
- const currentDescription = readFileSync(skillPath, "utf-8");
257
+ const rawContent = readFileSync(skillPath, "utf-8");
258
+ const frontmatter = parseFrontmatter(rawContent);
259
+ const currentDescription = frontmatter.description || rawContent;
260
+ const skillVersion = frontmatter.version || undefined;
261
+ const versionTag = skillVersion ? `, v${skillVersion}` : "";
262
+ const createdAuditDetails = (message: string) =>
263
+ `original_description:${rawContent}\n${message}`;
264
+ tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
265
+
266
+ if (options.syncFirst) {
267
+ tui.step(`Syncing source-truth telemetry${options.syncForce ? " (force)" : ""}...`);
268
+ const { createDefaultSyncOptions, syncSources: realSyncSources } = await import("../sync.js");
269
+ const syncRunner = _deps.syncSources ?? realSyncSources;
270
+ syncResult = syncRunner(
271
+ createDefaultSyncOptions({
272
+ force: options.syncForce ?? false,
273
+ }),
274
+ );
275
+ const sourceSynced = Object.values(syncResult.sources).reduce(
276
+ (sum, source) => sum + source.synced,
277
+ 0,
278
+ );
279
+ tui.done(
280
+ `Source sync complete (${sourceSynced} source sessions, ${syncResult.repair.repaired_records} repaired records)`,
281
+ );
282
+ }
132
283
 
133
284
  // -----------------------------------------------------------------------
134
285
  // Step 2: Load eval set
135
286
  // -----------------------------------------------------------------------
287
+ const skillUsage = _readSkillUsageLog();
136
288
  let evalSet: EvalEntry[];
137
289
 
138
290
  if (evalSetPath && existsSync(evalSetPath)) {
139
- const raw = readFileSync(evalSetPath, "utf-8");
140
- evalSet = JSON.parse(raw) as EvalEntry[];
291
+ try {
292
+ const raw = readFileSync(evalSetPath, "utf-8");
293
+ evalSet = JSON.parse(raw) as EvalEntry[];
294
+ } catch (parseErr) {
295
+ const msg = parseErr instanceof Error ? parseErr.message : String(parseErr);
296
+ tui.fail(`Failed to load eval set from ${evalSetPath}: ${msg}`);
297
+ finishTui();
298
+ return withStats({
299
+ proposal: null,
300
+ validation: null,
301
+ deployed: false,
302
+ auditEntries,
303
+ reason: `Failed to load eval set: ${msg}`,
304
+ });
305
+ }
306
+ if (!Array.isArray(evalSet)) {
307
+ tui.fail(`Eval set at ${evalSetPath} is not an array`);
308
+ finishTui();
309
+ return withStats({
310
+ proposal: null,
311
+ validation: null,
312
+ deployed: false,
313
+ auditEntries,
314
+ reason: `Eval set at ${evalSetPath} is not a JSON array`,
315
+ });
316
+ }
141
317
  } else {
142
318
  // Build from logs
143
- const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
144
319
  const queryRecords = readJsonl<QueryLogRecord>(QUERY_LOG);
145
- evalSet = _buildEvalSet(skillRecords, queryRecords, skillName);
320
+ evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
146
321
  }
147
322
 
323
+ const posCount = evalSet.filter((e) => e.should_trigger).length;
324
+ const negCount = evalSet.filter((e) => !e.should_trigger).length;
325
+ tui.done(`Loaded eval set (${evalSet.length} entries: ${posCount}+, ${negCount}-)`);
326
+
148
327
  // -----------------------------------------------------------------------
149
328
  // Step 3: Load skill usage records
150
329
  // -----------------------------------------------------------------------
151
- const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);
152
-
153
330
  // -----------------------------------------------------------------------
154
331
  // Step 4: Extract failure patterns
155
332
  // -----------------------------------------------------------------------
156
- const failurePatterns = _extractFailurePatterns(evalSet, skillUsage, skillName);
333
+ const failurePatterns = _extractFailurePatterns(
334
+ evalSet,
335
+ skillUsage,
336
+ skillName,
337
+ options.gradingResults,
338
+ );
339
+
340
+ const totalMissed = failurePatterns.reduce((sum, p) => sum + p.missed_queries.length, 0);
341
+ tui.done(
342
+ `Extracted ${failurePatterns.length} failure pattern(s) (${totalMissed} missed queries)`,
343
+ );
157
344
 
158
345
  // -----------------------------------------------------------------------
159
- // Step 5: Early exit if no patterns
346
+ // Step 5: Cold-start bootstrap or early exit if no patterns
160
347
  // -----------------------------------------------------------------------
161
348
  if (failurePatterns.length === 0) {
162
- return {
163
- proposal: null,
164
- validation: null,
165
- deployed: false,
166
- auditEntries,
167
- reason: "No failure patterns found",
168
- };
349
+ // Cold-start: if the eval set has positive entries that the skill should
350
+ // match but there are zero skill usage records, treat the positive eval
351
+ // entries themselves as "missed queries" — they ARE the failure signal.
352
+ const positiveEvals = evalSet.filter((e) => e.should_trigger);
353
+ const hasSkillUsageHistory = skillUsage.some((record) => record.skill_name === skillName);
354
+ if (positiveEvals.length > 0 && !hasSkillUsageHistory) {
355
+ const coldStartPattern: FailurePattern = {
356
+ pattern_id: `fp-${skillName}-coldstart`,
357
+ skill_name: skillName,
358
+ invocation_type: "implicit",
359
+ missed_queries: positiveEvals.map((e) => e.query),
360
+ frequency: positiveEvals.length,
361
+ sample_sessions: [],
362
+ extracted_at: new Date().toISOString(),
363
+ };
364
+ failurePatterns.push(coldStartPattern);
365
+ tui.done(
366
+ `Cold-start bootstrap: ${positiveEvals.length} positive eval entries used as missed queries`,
367
+ );
368
+ } else {
369
+ finishTui();
370
+ return withStats({
371
+ proposal: null,
372
+ validation: null,
373
+ deployed: false,
374
+ auditEntries,
375
+ reason: "No failure patterns found",
376
+ });
377
+ }
169
378
  }
170
379
 
171
380
  // -----------------------------------------------------------------------
@@ -174,156 +383,502 @@ export async function evolve(
174
383
  const missedQueries = failurePatterns.flatMap((p) => p.missed_queries);
175
384
 
176
385
  // -----------------------------------------------------------------------
177
- // Steps 7-12: Retry loop for proposal generation and validation
386
+ // Steps 7-12: Proposal generation and validation
178
387
  // -----------------------------------------------------------------------
179
- let lastProposal: EvolutionProposal | null = null;
180
- let lastValidation: ValidationResult | null = null;
181
- let feedbackReason = "";
182
388
 
183
- for (let iteration = 0; iteration < maxIterations; iteration++) {
184
- // Step 7: Generate proposal
185
- const effectiveMissedQueries = feedbackReason
186
- ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
187
- : missedQueries;
389
+ // -----------------------------------------------------------------------
390
+ // Pareto multi-candidate path
391
+ // -----------------------------------------------------------------------
392
+ const paretoEnabled = options.paretoEnabled ?? false;
393
+ const candidateCount = options.candidateCount ?? 3;
394
+ const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
395
+ const telemetryRecords =
396
+ options.telemetryRecords ??
397
+ (tokenEfficiencyEnabled ? readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG) : undefined);
398
+
399
+ // Compute token efficiency score if enabled and telemetry is available
400
+ let tokenEffScore: number | undefined;
401
+ if (tokenEfficiencyEnabled && telemetryRecords && telemetryRecords.length > 0) {
402
+ tokenEffScore = computeTokenEfficiencyScore(skillName, telemetryRecords);
403
+ recordAudit(
404
+ "system",
405
+ "created",
406
+ `Token efficiency score for ${skillName}: ${tokenEffScore.toFixed(3)}`,
407
+ );
408
+ }
188
409
 
189
- const proposal = await _generateProposal(
410
+ if (paretoEnabled && candidateCount > 1) {
411
+ // Generate N candidates in parallel
412
+ const candidates = await generateMultipleProposals(
190
413
  currentDescription,
191
414
  failurePatterns,
192
- effectiveMissedQueries,
415
+ missedQueries,
193
416
  skillName,
194
417
  skillPath,
195
418
  agent,
419
+ candidateCount,
420
+ options.proposalModel,
196
421
  );
197
422
 
198
- lastProposal = proposal;
423
+ // Filter by confidence threshold
424
+ const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
199
425
 
200
- // Step 8: Audit "created"
201
- recordAudit(
202
- proposal.proposal_id,
203
- "created",
204
- `Proposal created for ${skillName} (iteration ${iteration + 1})`,
205
- );
426
+ if (viableCandidates.length === 0) {
427
+ finishTui();
428
+ return withStats({
429
+ proposal: candidates[0] ?? null,
430
+ validation: null,
431
+ deployed: false,
432
+ auditEntries,
433
+ reason: `No candidates met confidence threshold ${confidenceThreshold}`,
434
+ });
435
+ }
436
+
437
+ // Validate each candidate
438
+ const paretoCandidates: ParetoCandidate[] = [];
439
+ for (const proposal of viableCandidates) {
440
+ recordAudit(
441
+ proposal.proposal_id,
442
+ "created",
443
+ createdAuditDetails(`Pareto candidate for ${skillName}`),
444
+ );
445
+ recordEvidence({
446
+ timestamp: new Date().toISOString(),
447
+ proposal_id: proposal.proposal_id,
448
+ skill_name: skillName,
449
+ skill_path: skillPath,
450
+ target: "description",
451
+ stage: "created",
452
+ rationale: proposal.rationale,
453
+ confidence: proposal.confidence,
454
+ details: `Pareto candidate for ${skillName}`,
455
+ original_text: proposal.original_description,
456
+ proposed_text: proposal.proposed_description,
457
+ eval_set: evalSet,
458
+ });
206
459
 
207
- // Step 9: Check confidence threshold
208
- if (proposal.confidence < confidenceThreshold) {
209
- feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
460
+ const validation = await _validateProposal(
461
+ proposal,
462
+ evalSet,
463
+ agent,
464
+ options.validationModel,
465
+ );
210
466
  recordAudit(
211
467
  proposal.proposal_id,
212
- "rejected",
213
- `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
468
+ "validated",
469
+ `Pareto validation: improved=${validation.improved}`,
214
470
  );
471
+ recordEvidence({
472
+ timestamp: new Date().toISOString(),
473
+ proposal_id: proposal.proposal_id,
474
+ skill_name: skillName,
475
+ skill_path: skillPath,
476
+ target: "description",
477
+ stage: "validated",
478
+ rationale: proposal.rationale,
479
+ confidence: proposal.confidence,
480
+ details: `Pareto validation: improved=${validation.improved}`,
481
+ validation: {
482
+ improved: validation.improved,
483
+ before_pass_rate: validation.before_pass_rate,
484
+ after_pass_rate: validation.after_pass_rate,
485
+ net_change: validation.net_change,
486
+ regressions: validation.regressions,
487
+ new_passes: validation.new_passes,
488
+ per_entry_results: validation.per_entry_results,
489
+ },
490
+ });
215
491
 
216
- // If this is the last iteration, return early with rejection
217
- if (iteration === maxIterations - 1) {
218
- return {
219
- proposal: lastProposal,
220
- validation: null,
221
- deployed: false,
222
- auditEntries,
223
- reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
492
+ if (validation.improved && validation.per_entry_results) {
493
+ const invocationScores = computeInvocationScores(validation.per_entry_results);
494
+ const candidate: ParetoCandidate = {
495
+ proposal,
496
+ validation,
497
+ invocation_scores: invocationScores,
498
+ dominates_on: [],
224
499
  };
500
+ if (tokenEffScore !== undefined) {
501
+ candidate.token_efficiency_score = tokenEffScore;
502
+ }
503
+ paretoCandidates.push(candidate);
225
504
  }
505
+ }
226
506
 
227
- continue;
507
+ if (paretoCandidates.length === 0) {
508
+ finishTui();
509
+ return withStats({
510
+ proposal: viableCandidates[0],
511
+ validation: null,
512
+ deployed: false,
513
+ auditEntries,
514
+ reason: "No Pareto candidates improved validation",
515
+ });
228
516
  }
229
517
 
230
- // Step 10: Validate against eval set
231
- const validation = await _validateProposal(proposal, evalSet, agent);
232
- lastValidation = validation;
518
+ // Compute Pareto frontier
519
+ const frontier = computeParetoFrontier(paretoCandidates);
520
+ const { best } = selectFromFrontier(frontier);
233
521
 
234
- // Step 11: Audit "validated"
235
- const evalSnapshot: EvalPassRate = {
236
- total: evalSet.length,
237
- passed: Math.round(validation.after_pass_rate * evalSet.length),
238
- failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
239
- pass_rate: validation.after_pass_rate,
240
- };
241
- recordAudit(
242
- proposal.proposal_id,
243
- "validated",
244
- `Validation complete: improved=${validation.improved}`,
245
- evalSnapshot,
246
- );
522
+ lastProposal = best.proposal;
523
+ lastValidation = best.validation;
524
+
525
+ // Skip the standard retry loop — we already have our result
526
+ } else {
527
+ // Standard single-candidate retry loop
528
+ let feedbackReason = "";
529
+
530
+ for (let iteration = 0; iteration < maxIterations; iteration++) {
531
+ // Step 7: Generate proposal
532
+ const effectiveMissedQueries = feedbackReason
533
+ ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
534
+ : missedQueries;
247
535
 
248
- // Step 12: Check validation result
249
- if (!validation.improved) {
250
- feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
536
+ tui.step(`Generating proposal (iteration ${iteration + 1}/${maxIterations})...`);
537
+ const proposal = await _generateProposal(
538
+ currentDescription,
539
+ failurePatterns,
540
+ effectiveMissedQueries,
541
+ skillName,
542
+ skillPath,
543
+ agent,
544
+ options.proposalModel,
545
+ );
546
+ llmCallCount++;
547
+
548
+ lastProposal = proposal;
549
+ tui.done(`Proposal generated (conf: ${proposal.confidence.toFixed(2)})`);
550
+
551
+ // Step 8: Audit "created"
251
552
  recordAudit(
252
553
  proposal.proposal_id,
253
- "rejected",
254
- `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
554
+ "created",
555
+ createdAuditDetails(`Proposal created for ${skillName} (iteration ${iteration + 1})`),
255
556
  );
557
+ recordEvidence({
558
+ timestamp: new Date().toISOString(),
559
+ proposal_id: proposal.proposal_id,
560
+ skill_name: skillName,
561
+ skill_path: skillPath,
562
+ target: "description",
563
+ stage: "created",
564
+ rationale: proposal.rationale,
565
+ confidence: proposal.confidence,
566
+ details: `Proposal created for ${skillName} (iteration ${iteration + 1})`,
567
+ original_text: proposal.original_description,
568
+ proposed_text: proposal.proposed_description,
569
+ eval_set: evalSet,
570
+ });
256
571
 
257
- // If this is the last iteration, return with rejection
258
- if (iteration === maxIterations - 1) {
259
- return {
260
- proposal: lastProposal,
261
- validation: lastValidation,
262
- deployed: false,
263
- auditEntries,
264
- reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
265
- };
572
+ // Step 9: Check confidence threshold
573
+ if (proposal.confidence < confidenceThreshold) {
574
+ feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
575
+ recordAudit(
576
+ proposal.proposal_id,
577
+ "rejected",
578
+ `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
579
+ );
580
+ recordEvidence({
581
+ timestamp: new Date().toISOString(),
582
+ proposal_id: proposal.proposal_id,
583
+ skill_name: skillName,
584
+ skill_path: skillPath,
585
+ target: "description",
586
+ stage: "rejected",
587
+ rationale: proposal.rationale,
588
+ confidence: proposal.confidence,
589
+ details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
590
+ });
591
+
592
+ // If this is the last iteration, return early with rejection
593
+ if (iteration === maxIterations - 1) {
594
+ finishTui();
595
+ return withStats({
596
+ proposal: lastProposal,
597
+ validation: null,
598
+ deployed: false,
599
+ auditEntries,
600
+ reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
601
+ });
602
+ }
603
+
604
+ continue;
266
605
  }
267
606
 
268
- continue;
269
- }
607
+ // Step 10: Validate against eval set
608
+ const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
609
+ tui.step(
610
+ `Validating ${evalSet.length} entries (${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
611
+ );
612
+ const validation = await _validateProposal(
613
+ proposal,
614
+ evalSet,
615
+ agent,
616
+ options.validationModel,
617
+ );
618
+ lastValidation = validation;
619
+ llmCallCount += batchCount * 2 * VALIDATION_RUNS;
620
+ tui.done(
621
+ `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
622
+ );
623
+
624
+ // Step 11: Audit "validated"
625
+ const evalSnapshot: EvalPassRate = {
626
+ total: evalSet.length,
627
+ passed: Math.round(validation.after_pass_rate * evalSet.length),
628
+ failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
629
+ pass_rate: validation.after_pass_rate,
630
+ };
631
+ recordAudit(
632
+ proposal.proposal_id,
633
+ "validated",
634
+ `Validation complete: improved=${validation.improved}`,
635
+ evalSnapshot,
636
+ );
637
+ recordEvidence({
638
+ timestamp: new Date().toISOString(),
639
+ proposal_id: proposal.proposal_id,
640
+ skill_name: skillName,
641
+ skill_path: skillPath,
642
+ target: "description",
643
+ stage: "validated",
644
+ rationale: proposal.rationale,
645
+ confidence: proposal.confidence,
646
+ details: `Validation complete: improved=${validation.improved}`,
647
+ validation: {
648
+ improved: validation.improved,
649
+ before_pass_rate: validation.before_pass_rate,
650
+ after_pass_rate: validation.after_pass_rate,
651
+ net_change: validation.net_change,
652
+ regressions: validation.regressions,
653
+ new_passes: validation.new_passes,
654
+ per_entry_results: validation.per_entry_results,
655
+ },
656
+ });
270
657
 
271
- // Validation passed - break out of retry loop
272
- break;
658
+ // Step 12: Check validation result
659
+ if (!validation.improved) {
660
+ feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
661
+ recordAudit(
662
+ proposal.proposal_id,
663
+ "rejected",
664
+ `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
665
+ );
666
+ recordEvidence({
667
+ timestamp: new Date().toISOString(),
668
+ proposal_id: proposal.proposal_id,
669
+ skill_name: skillName,
670
+ skill_path: skillPath,
671
+ target: "description",
672
+ stage: "rejected",
673
+ rationale: proposal.rationale,
674
+ confidence: proposal.confidence,
675
+ details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
676
+ validation: {
677
+ improved: validation.improved,
678
+ before_pass_rate: validation.before_pass_rate,
679
+ after_pass_rate: validation.after_pass_rate,
680
+ net_change: validation.net_change,
681
+ regressions: validation.regressions,
682
+ new_passes: validation.new_passes,
683
+ per_entry_results: validation.per_entry_results,
684
+ },
685
+ });
686
+
687
+ // If this is the last iteration, return with rejection
688
+ if (iteration === maxIterations - 1) {
689
+ finishTui();
690
+ return withStats({
691
+ proposal: lastProposal,
692
+ validation: lastValidation,
693
+ deployed: false,
694
+ auditEntries,
695
+ reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
696
+ });
697
+ }
698
+
699
+ continue;
700
+ }
701
+
702
+ // Validation passed - break out of retry loop
703
+ break;
704
+ }
273
705
  }
274
706
 
275
707
  // -----------------------------------------------------------------------
276
708
  // Step 13: Dry run check
277
709
  // -----------------------------------------------------------------------
278
710
  if (dryRun) {
279
- return {
711
+ finishTui();
712
+ return withStats({
280
713
  proposal: lastProposal,
281
714
  validation: lastValidation,
282
715
  deployed: false,
283
716
  auditEntries,
284
717
  reason: "Dry run - proposal validated but not deployed",
285
- };
718
+ });
286
719
  }
287
720
 
288
721
  // -----------------------------------------------------------------------
289
- // Step 14: Deploy (actual deploy wired in TASK-14)
722
+ // Step 13b: Baseline gate (--with-baseline)
290
723
  // -----------------------------------------------------------------------
291
- if (lastProposal) {
724
+ let baselineResult: BaselineMeasurement | undefined;
725
+ if (options.withBaseline && lastProposal) {
726
+ tui.step("Measuring baseline...");
727
+ baselineResult = await _measureBaseline({
728
+ evalSet,
729
+ skillDescription: currentDescription,
730
+ skillName,
731
+ agent,
732
+ modelFlag: options.validationModel,
733
+ });
734
+ tui.done(
735
+ `Baseline: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
736
+ );
737
+
292
738
  recordAudit(
293
739
  lastProposal.proposal_id,
294
- "deployed",
295
- `Deployed proposal for ${skillName}`,
296
- lastValidation
297
- ? {
298
- total: evalSet.length,
299
- passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
300
- failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
301
- pass_rate: lastValidation.after_pass_rate,
302
- }
303
- : undefined,
740
+ "validated",
741
+ `Baseline check: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
304
742
  );
743
+
744
+ if (!baselineResult.adds_value) {
745
+ finishTui();
746
+ return withStats({
747
+ proposal: lastProposal,
748
+ validation: lastValidation,
749
+ deployed: false,
750
+ auditEntries,
751
+ reason: `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
752
+ baselineResult,
753
+ });
754
+ }
305
755
  }
306
756
 
307
757
  // -----------------------------------------------------------------------
308
- // Step 15-16: Return complete result
758
+ // Step 13c: Gate validation (--cheap-loop / --gate-model)
309
759
  // -----------------------------------------------------------------------
310
- return {
760
+ let gateValidation: ValidationResult | undefined;
761
+ if (options.gateModel && lastProposal && lastValidation?.improved) {
762
+ tui.step(`Gate validation (${options.gateModel})...`);
763
+ gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
764
+ tui.done(
765
+ `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
766
+ );
767
+
768
+ recordAudit(
769
+ lastProposal.proposal_id,
770
+ "validated",
771
+ `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
772
+ );
773
+
774
+ if (!gateValidation.improved) {
775
+ finishTui();
776
+ return withStats({
777
+ proposal: lastProposal,
778
+ validation: lastValidation,
779
+ deployed: false,
780
+ auditEntries,
781
+ reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
782
+ gateValidation,
783
+ ...(baselineResult ? { baselineResult } : {}),
784
+ });
785
+ }
786
+ }
787
+
788
+ // -----------------------------------------------------------------------
789
+ // Step 14: Deploy — write updated description to SKILL.md
790
+ // -----------------------------------------------------------------------
791
+ if (lastProposal && lastValidation?.improved) {
792
+ // Create backup before modifying
793
+ const backupPath = `${skillPath}.bak`;
794
+ copyFileSync(skillPath, backupPath);
795
+ tui.done(`Backup created at ${backupPath}`);
796
+
797
+ // Replace the frontmatter description
798
+ const updatedContent = replaceFrontmatterDescription(
799
+ rawContent,
800
+ lastProposal.proposed_description,
801
+ );
802
+ writeFileSync(skillPath, updatedContent, "utf-8");
803
+ tui.done(`Deployed updated description to ${skillPath}`);
804
+
805
+ // Show what changed in the skill file
806
+ const diffOutput = formatSimpleDiff(rawContent, updatedContent);
807
+ if (diffOutput) {
808
+ console.error("\n--- Skill description diff ---");
809
+ console.error(diffOutput);
810
+ console.error("------------------------------\n");
811
+ }
812
+
813
+ recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, {
814
+ total: evalSet.length,
815
+ passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
816
+ failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
817
+ pass_rate: lastValidation.after_pass_rate,
818
+ });
819
+ recordEvidence({
820
+ timestamp: new Date().toISOString(),
821
+ proposal_id: lastProposal.proposal_id,
822
+ skill_name: skillName,
823
+ skill_path: skillPath,
824
+ target: "description",
825
+ stage: "deployed",
826
+ rationale: lastProposal.rationale,
827
+ confidence: lastProposal.confidence,
828
+ details: `Deployed proposal for ${skillName}`,
829
+ validation: {
830
+ improved: lastValidation.improved,
831
+ before_pass_rate: lastValidation.before_pass_rate,
832
+ after_pass_rate: lastValidation.after_pass_rate,
833
+ net_change: lastValidation.net_change,
834
+ regressions: lastValidation.regressions,
835
+ new_passes: lastValidation.new_passes,
836
+ per_entry_results: lastValidation.per_entry_results,
837
+ },
838
+ });
839
+ }
840
+
841
+ // -----------------------------------------------------------------------
842
+ // Step 15: Update evolution memory
843
+ // -----------------------------------------------------------------------
844
+ const wasDeployed = lastProposal !== null && lastValidation !== null && lastValidation.improved;
845
+ const evolveResult: EvolveResult = withStats({
311
846
  proposal: lastProposal,
312
847
  validation: lastValidation,
313
- deployed: true,
848
+ deployed: wasDeployed,
314
849
  auditEntries,
315
- reason: "Evolution deployed successfully",
316
- };
850
+ reason: wasDeployed
851
+ ? "Evolution deployed successfully"
852
+ : "Evolution not deployed: proposal or validation missing",
853
+ ...(skillVersion ? { skillVersion } : {}),
854
+ ...(baselineResult ? { baselineResult } : {}),
855
+ ...(gateValidation ? { gateValidation } : {}),
856
+ });
857
+
858
+ if (lastProposal) {
859
+ try {
860
+ _updateContextAfterEvolve(skillName, lastProposal, evolveResult);
861
+ } catch {
862
+ // Memory writes should never fail the main operation
863
+ }
864
+ }
865
+
866
+ // -----------------------------------------------------------------------
867
+ // Step 16: Return complete result
868
+ // -----------------------------------------------------------------------
869
+ finishTui();
870
+ return evolveResult;
317
871
  } catch (error) {
318
- // Robust error handling: catch any unexpected errors and return gracefully
872
+ tui.destroy();
873
+ // Robust error handling: preserve partial results so callers can inspect progress
319
874
  const errorMessage = error instanceof Error ? error.message : String(error);
320
- return {
321
- proposal: null,
322
- validation: null,
875
+ return withStats({
876
+ proposal: lastProposal,
877
+ validation: lastValidation,
323
878
  deployed: false,
324
879
  auditEntries,
325
880
  reason: `Error during evolution: ${errorMessage}`,
326
- };
881
+ });
327
882
  }
328
883
  }
329
884
 
@@ -341,6 +896,18 @@ export async function cliMain(): Promise<void> {
341
896
  "dry-run": { type: "boolean", default: false },
342
897
  confidence: { type: "string", default: "0.6" },
343
898
  "max-iterations": { type: "string", default: "3" },
899
+ pareto: { type: "boolean", default: false },
900
+ candidates: { type: "string", default: "3" },
901
+ "token-efficiency": { type: "boolean", default: false },
902
+ "with-baseline": { type: "boolean", default: false },
903
+ "validation-model": { type: "string", default: "haiku" },
904
+ "cheap-loop": { type: "boolean", default: true },
905
+ "full-model": { type: "boolean", default: false },
906
+ "gate-model": { type: "string" },
907
+ "proposal-model": { type: "string" },
908
+ "sync-first": { type: "boolean", default: false },
909
+ "sync-force": { type: "boolean", default: false },
910
+ verbose: { type: "boolean", default: false },
344
911
  help: { type: "boolean", default: false },
345
912
  },
346
913
  strict: true,
@@ -360,6 +927,18 @@ Options:
360
927
  --dry-run Validate proposal without deploying
361
928
  --confidence Confidence threshold 0.0-1.0 (default: 0.6)
362
929
  --max-iterations Max retry iterations (default: 3)
930
+ --pareto Enable Pareto multi-candidate selection
931
+ --candidates Number of candidates to generate (default: 3, max: 5)
932
+ --token-efficiency Enable 5D Pareto with token efficiency scoring
933
+ --with-baseline Gate deployment on baseline lift > 0.05
934
+ --validation-model Model for trigger-check validation calls (default: haiku)
935
+ --cheap-loop Use cheap models for loop, expensive for gate (default: on)
936
+ --full-model Use same model for all stages (disables cheap-loop)
937
+ --gate-model Model for final gate validation (default: sonnet)
938
+ --proposal-model Model for proposal generation LLM calls
939
+ --sync-first Refresh source-truth telemetry before building evals/failure patterns
940
+ --sync-force Force a full rescan during --sync-first
941
+ --verbose Output full EvolveResult JSON (default: compact summary)
363
942
  --help Show this help message`);
364
943
  process.exit(0);
365
944
  }
@@ -368,6 +947,10 @@ Options:
368
947
  console.error("[ERROR] --skill and --skill-path are required");
369
948
  process.exit(1);
370
949
  }
950
+ if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
951
+ console.error("[ERROR] --sync-force requires --sync-first");
952
+ process.exit(1);
953
+ }
371
954
 
372
955
  const { detectAgent } = await import("../utils/llm-call.js");
373
956
  const requestedAgent = values.agent;
@@ -395,6 +978,61 @@ Options:
395
978
  process.exit(1);
396
979
  }
397
980
 
981
+ // -------------------------------------------------------------------------
982
+ // Pre-flight validation: catch common misconfigurations before evolve()
983
+ // -------------------------------------------------------------------------
984
+ const skillPath = values["skill-path"];
985
+ if (!skillPath) {
986
+ console.error("[ERROR] --skill-path is required.");
987
+ process.exit(1);
988
+ }
989
+ if (!existsSync(skillPath)) {
990
+ console.error(`[ERROR] SKILL.md not found at: ${skillPath}`);
991
+ console.error(" Verify the --skill-path argument points to an existing SKILL.md file.");
992
+ process.exit(1);
993
+ }
994
+
995
+ const evalSetPath = values["eval-set"];
996
+ if (evalSetPath && !existsSync(evalSetPath)) {
997
+ console.error(`[ERROR] Eval set file not found at: ${evalSetPath}`);
998
+ console.error(" Verify the --eval-set argument points to an existing JSON file.");
999
+ process.exit(1);
1000
+ }
1001
+
1002
+ // If no eval-set provided, check that log files exist for auto-generation
1003
+ if (!evalSetPath && !(values["sync-first"] ?? false)) {
1004
+ const hasSkillLog = readEffectiveSkillUsageRecords().length > 0;
1005
+ const hasQueryLog = existsSync(QUERY_LOG);
1006
+ if (!hasSkillLog && !hasQueryLog) {
1007
+ console.error("[ERROR] No eval set provided and no telemetry logs found.");
1008
+ console.error(
1009
+ " Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
1010
+ );
1011
+ console.error(` Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`);
1012
+ process.exit(1);
1013
+ }
1014
+ }
1015
+
1016
+ const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
1017
+ let telemetryRecords: SessionTelemetryRecord[] | undefined;
1018
+ if (tokenEfficiencyEnabled && !(values["sync-first"] ?? false)) {
1019
+ telemetryRecords = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
1020
+ }
1021
+ const gradingResults = readGradingResultsForSkill(values.skill);
1022
+
1023
+ if (values.verbose) {
1024
+ console.error("[verbose] Pre-flight checks passed");
1025
+ console.error(`[verbose] Skill: ${values.skill}`);
1026
+ console.error(`[verbose] Skill path: ${skillPath}`);
1027
+ console.error(`[verbose] Agent: ${agent}`);
1028
+ console.error(`[verbose] Eval set: ${evalSetPath ?? "(auto-generated from logs)"}`);
1029
+ console.error(`[verbose] Loaded grading results: ${gradingResults.length}`);
1030
+ console.error(`[verbose] Cheap loop: ${values["cheap-loop"] ?? false}`);
1031
+ console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
1032
+ console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
1033
+ console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
1034
+ }
1035
+
398
1036
  const result = await evolve({
399
1037
  skillName: values.skill,
400
1038
  skillPath: values["skill-path"],
@@ -403,15 +1041,86 @@ Options:
403
1041
  dryRun: values["dry-run"] ?? false,
404
1042
  confidenceThreshold: Number.parseFloat(values.confidence ?? "0.6"),
405
1043
  maxIterations: Number.parseInt(values["max-iterations"] ?? "3", 10),
1044
+ paretoEnabled: values.pareto ?? false,
1045
+ candidateCount: Number.parseInt(values.candidates ?? "3", 10),
1046
+ tokenEfficiencyEnabled,
1047
+ telemetryRecords,
1048
+ withBaseline: values["with-baseline"] ?? false,
1049
+ validationModel: values["validation-model"],
1050
+ cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
1051
+ gateModel: values["gate-model"],
1052
+ proposalModel: values["proposal-model"],
1053
+ gradingResults,
1054
+ syncFirst: values["sync-first"] ?? false,
1055
+ syncForce: values["sync-force"] ?? false,
406
1056
  });
407
1057
 
408
- console.log(JSON.stringify(result, null, 2));
1058
+ if (values.verbose) {
1059
+ console.log(JSON.stringify(result, null, 2));
1060
+ } else {
1061
+ const summary: EvolveResultSummary = {
1062
+ skill: values.skill,
1063
+ deployed: result.deployed,
1064
+ reason: result.reason,
1065
+ before: result.validation?.before_pass_rate ?? 0,
1066
+ after: result.validation?.after_pass_rate ?? 0,
1067
+ net_change: result.validation?.net_change ?? 0,
1068
+ improved: result.validation?.improved ?? false,
1069
+ regressions: result.validation?.regressions.length ?? 0,
1070
+ new_passes: result.validation?.new_passes.length ?? 0,
1071
+ confidence: result.proposal?.confidence ?? 0,
1072
+ llm_calls: result.llmCallCount,
1073
+ elapsed_s: +(result.elapsedMs / 1000).toFixed(1),
1074
+ proposal_id: result.proposal?.proposal_id ?? "",
1075
+ rationale: result.proposal?.rationale ?? "",
1076
+ ...(result.skillVersion ? { version: result.skillVersion } : {}),
1077
+ dashboard_url: `http://localhost:3141/report/${encodeURIComponent(values.skill)}`,
1078
+ };
1079
+ console.log(JSON.stringify(summary, null, 2));
1080
+ }
1081
+
1082
+ // Print human-readable status to stderr so users always see outcome
1083
+ if (!result.deployed) {
1084
+ console.error(`\n[NOT DEPLOYED] ${result.reason}`);
1085
+ if (result.validation && !result.validation.improved) {
1086
+ console.error(
1087
+ ` Pass rate: ${(result.validation.before_pass_rate * 100).toFixed(1)}% -> ${(result.validation.after_pass_rate * 100).toFixed(1)}% (net: ${result.validation.net_change >= 0 ? "+" : ""}${(result.validation.net_change * 100).toFixed(1)}%)`,
1088
+ );
1089
+ if (result.validation.regressions.length > 0) {
1090
+ console.error(` Regressions: ${result.validation.regressions.length} entries`);
1091
+ }
1092
+ }
1093
+ if (
1094
+ result.proposal &&
1095
+ result.proposal.confidence < Number.parseFloat(values.confidence ?? "0.6")
1096
+ ) {
1097
+ console.error(
1098
+ ` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
1099
+ );
1100
+ }
1101
+ console.error(" Re-run with --verbose for full diagnostic output.");
1102
+ } else {
1103
+ console.error(`\n[DEPLOYED] ${result.reason}`);
1104
+ }
1105
+
409
1106
  process.exit(result.deployed ? 0 : 1);
410
1107
  }
411
1108
 
412
1109
  if (import.meta.main) {
413
1110
  cliMain().catch((err) => {
414
- console.error(`[FATAL] ${err}`);
1111
+ const message = err instanceof Error ? err.message : String(err);
1112
+ const stack = err instanceof Error ? err.stack : undefined;
1113
+ console.error(`[FATAL] ${message}`);
1114
+ if (stack && process.env.SELFTUNE_VERBOSE === "1") {
1115
+ console.error(stack);
1116
+ }
1117
+ console.error(
1118
+ "\nTroubleshooting:\n" +
1119
+ " - Verify --skill-path points to a valid SKILL.md file\n" +
1120
+ " - Ensure eval data exists (run `selftune evals` first) or pass --eval-set\n" +
1121
+ " - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
1122
+ " - Re-run with --verbose for full diagnostic output",
1123
+ );
415
1124
  process.exit(1);
416
1125
  });
417
1126
  }