npm - selftune - Versions diffs - 0.1.4 → 0.2.0 - Mend

selftune 0.1.4 → 0.2.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (86)

package/.claude/agents/diagnosis-analyst.md +146 -0
package/.claude/agents/evolution-reviewer.md +167 -0
package/.claude/agents/integration-guide.md +200 -0
package/.claude/agents/pattern-analyst.md +147 -0
package/CHANGELOG.md +37 -0
package/README.md +96 -256
package/assets/BeforeAfter.gif +0 -0
package/assets/FeedbackLoop.gif +0 -0
package/assets/logo.svg +9 -0
package/assets/skill-health-badge.svg +20 -0
package/cli/selftune/activation-rules.ts +171 -0
package/cli/selftune/badge/badge-data.ts +108 -0
package/cli/selftune/badge/badge-svg.ts +212 -0
package/cli/selftune/badge/badge.ts +103 -0
package/cli/selftune/constants.ts +75 -1
package/cli/selftune/contribute/bundle.ts +314 -0
package/cli/selftune/contribute/contribute.ts +214 -0
package/cli/selftune/contribute/sanitize.ts +162 -0
package/cli/selftune/cron/setup.ts +266 -0
package/cli/selftune/dashboard-server.ts +582 -0
package/cli/selftune/dashboard.ts +25 -3
package/cli/selftune/eval/baseline.ts +247 -0
package/cli/selftune/eval/composability.ts +117 -0
package/cli/selftune/eval/generate-unit-tests.ts +143 -0
package/cli/selftune/eval/hooks-to-evals.ts +68 -2
package/cli/selftune/eval/import-skillsbench.ts +221 -0
package/cli/selftune/eval/synthetic-evals.ts +172 -0
package/cli/selftune/eval/unit-test-cli.ts +152 -0
package/cli/selftune/eval/unit-test.ts +196 -0
package/cli/selftune/evolution/deploy-proposal.ts +142 -1
package/cli/selftune/evolution/evolve-body.ts +492 -0
package/cli/selftune/evolution/evolve.ts +466 -103
package/cli/selftune/evolution/extract-patterns.ts +32 -1
package/cli/selftune/evolution/pareto.ts +314 -0
package/cli/selftune/evolution/propose-body.ts +171 -0
package/cli/selftune/evolution/propose-description.ts +100 -2
package/cli/selftune/evolution/propose-routing.ts +166 -0
package/cli/selftune/evolution/refine-body.ts +141 -0
package/cli/selftune/evolution/rollback.ts +19 -2
package/cli/selftune/evolution/validate-body.ts +254 -0
package/cli/selftune/evolution/validate-proposal.ts +257 -35
package/cli/selftune/evolution/validate-routing.ts +177 -0
package/cli/selftune/grading/grade-session.ts +138 -18
package/cli/selftune/grading/pre-gates.ts +104 -0
package/cli/selftune/hooks/auto-activate.ts +185 -0
package/cli/selftune/hooks/evolution-guard.ts +165 -0
package/cli/selftune/hooks/skill-change-guard.ts +112 -0
package/cli/selftune/index.ts +88 -0
package/cli/selftune/ingestors/claude-replay.ts +351 -0
package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
package/cli/selftune/init.ts +150 -3
package/cli/selftune/memory/writer.ts +447 -0
package/cli/selftune/monitoring/watch.ts +25 -2
package/cli/selftune/status.ts +17 -13
package/cli/selftune/types.ts +377 -5
package/cli/selftune/utils/frontmatter.ts +217 -0
package/cli/selftune/utils/llm-call.ts +29 -3
package/cli/selftune/utils/transcript.ts +35 -0
package/cli/selftune/utils/trigger-check.ts +89 -0
package/cli/selftune/utils/tui.ts +156 -0
package/dashboard/index.html +569 -8
package/package.json +8 -4
package/skill/SKILL.md +124 -8
package/skill/Workflows/AutoActivation.md +144 -0
package/skill/Workflows/Badge.md +118 -0
package/skill/Workflows/Baseline.md +121 -0
package/skill/Workflows/Composability.md +100 -0
package/skill/Workflows/Contribute.md +91 -0
package/skill/Workflows/Cron.md +155 -0
package/skill/Workflows/Dashboard.md +203 -0
package/skill/Workflows/Doctor.md +37 -1
package/skill/Workflows/Evals.md +69 -1
package/skill/Workflows/EvolutionMemory.md +152 -0
package/skill/Workflows/Evolve.md +111 -6
package/skill/Workflows/EvolveBody.md +159 -0
package/skill/Workflows/ImportSkillsBench.md +111 -0
package/skill/Workflows/Ingest.md +117 -3
package/skill/Workflows/Initialize.md +57 -3
package/skill/Workflows/Replay.md +70 -0
package/skill/Workflows/Rollback.md +20 -1
package/skill/Workflows/UnitTest.md +138 -0
package/skill/Workflows/Watch.md +22 -0
package/skill/settings_snippet.json +23 -0
package/templates/activation-rules-default.json +27 -0
package/templates/multi-skill-settings.json +64 -0
package/templates/single-skill-settings.json +58 -0

package/cli/selftune/evolution/evolve.ts (changed)

@@ -6,25 +6,45 @@
  * logic and comprehensive audit tracking.
  */
-import { existsSync, readFileSync } from "node:fs";
+import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
 import { parseArgs } from "node:util";
-import { QUERY_LOG, SKILL_LOG } from "../constants.js";
+import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
+import type { BaselineMeasurement } from "../eval/baseline.js";
+import { measureBaseline } from "../eval/baseline.js";
 import { buildEvalSet } from "../eval/hooks-to-evals.js";
+import { updateContextAfterEvolve } from "../memory/writer.js";
 import type {
   EvalEntry,
   EvalPassRate,
   EvolutionAuditEntry,
   EvolutionProposal,
+  EvolveResultSummary,
+  FailurePattern,
+  GradingResult,
+  ParetoCandidate,
   QueryLogRecord,
+  SessionTelemetryRecord,
   SkillUsageRecord,
 } from "../types.js";
+import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontmatter.js";
 import { readJsonl } from "../utils/jsonl.js";
+import { createEvolveTUI } from "../utils/tui.js";
 import { appendAuditEntry } from "./audit.js";
 import { extractFailurePatterns } from "./extract-patterns.js";
-import { generateProposal } from "./propose-description.js";
+import {
+  computeInvocationScores,
+  computeParetoFrontier,
+  computeTokenEfficiencyScore,
+  selectFromFrontier,
+} from "./pareto.js";
+import { generateMultipleProposals, generateProposal } from "./propose-description.js";
 import type { ValidationResult } from "./validate-proposal.js";
-import { validateProposal } from "./validate-proposal.js";
+import {
+  TRIGGER_CHECK_BATCH_SIZE,
+  VALIDATION_RUNS,
+  validateProposal,
+} from "./validate-proposal.js";
 // ---------------------------------------------------------------------------
 // Types
@@ -38,6 +58,16 @@ export interface EvolveOptions {
   dryRun: boolean;
   confidenceThreshold: number; // default 0.6
   maxIterations: number; // default 3
+  gradingResults?: GradingResult[];
+  paretoEnabled?: boolean;
+  candidateCount?: number;
+  tokenEfficiencyEnabled?: boolean;
+  telemetryRecords?: SessionTelemetryRecord[];
+  withBaseline?: boolean;
+  validationModel?: string;
+  cheapLoop?: boolean;
+  gateModel?: string;
+  proposalModel?: string;
 }
 export interface EvolveResult {
@@ -46,6 +76,11 @@ export interface EvolveResult {
   deployed: boolean;
   auditEntries: EvolutionAuditEntry[];
   reason: string;
+  skillVersion?: string;
+  llmCallCount: number;
+  elapsedMs: number;
+  baselineResult?: BaselineMeasurement;
+  gateValidation?: ValidationResult;
 }
 /**
@@ -53,11 +88,19 @@ export interface EvolveResult {
  * imports are used. Pass overrides in tests to avoid mock.module().
  */
 export interface EvolveDeps {
-  extractFailurePatterns?: typeof import("./extract-patterns.js").extractFailurePatterns;
+  extractFailurePatterns?: (
+    evalEntries: EvalEntry[],
+    skillUsage: SkillUsageRecord[],
+    skillName: string,
+    gradingResults?: GradingResult[],
+  ) => FailurePattern[];
   generateProposal?: typeof import("./propose-description.js").generateProposal;
   validateProposal?: typeof import("./validate-proposal.js").validateProposal;
+  gateValidateProposal?: typeof import("./validate-proposal.js").validateProposal;
   appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
   buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
+  updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
+  measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
 }
 // ---------------------------------------------------------------------------
@@ -69,12 +112,14 @@ function createAuditEntry(
   action: EvolutionAuditEntry["action"],
   details: string,
   evalSnapshot?: EvalPassRate,
+  skillName?: string,
 ): EvolutionAuditEntry {
   return {
     timestamp: new Date().toISOString(),
     proposal_id: proposalId,
     action,
     details,
+    ...(skillName ? { skill_name: skillName } : {}),
     ...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
   };
 }
@@ -90,12 +135,22 @@ export async function evolve(
   const { skillName, skillPath, evalSetPath, agent, dryRun, confidenceThreshold, maxIterations } =
     options;
+  // Apply cheap-loop defaults: cheap models for proposal/validation, expensive for gate
+  if (options.cheapLoop) {
+    if (!options.proposalModel) options.proposalModel = "haiku";
+    if (!options.validationModel) options.validationModel = "haiku";
+    if (!options.gateModel) options.gateModel = "sonnet";
+  }
   // Resolve injectable dependencies with real-import fallbacks
   const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
   const _generateProposal = _deps.generateProposal ?? generateProposal;
   const _validateProposal = _deps.validateProposal ?? validateProposal;
+  const _gateValidateProposal = _deps.gateValidateProposal ?? validateProposal;
   const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
   const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
+  const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
+  const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
   const auditEntries: EvolutionAuditEntry[] = [];
@@ -105,7 +160,7 @@ export async function evolve(
     details: string,
     evalSnapshot?: EvalPassRate,
   ): void {
-    const entry = createAuditEntry(proposalId, action, details, evalSnapshot);
+    const entry = createAuditEntry(proposalId, action, details, evalSnapshot, skillName);
     auditEntries.push(entry);
     try {
       _appendAuditEntry(entry);
@@ -114,21 +169,47 @@ export async function evolve(
     }
   }
+  const pipelineStart = Date.now();
+  let llmCallCount = 0;
+  const tui = createEvolveTUI({ skillName, model: options.proposalModel ?? "(default)" });
+  const finishTui = () =>
+    tui.finish(
+      `${llmCallCount} LLM calls \u00b7 ${((Date.now() - pipelineStart) / 1000).toFixed(1)}s elapsed`,
+    );
+  /** Stamp every return with pipeline stats so callers always get them. */
+  const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => ({
+    ...r,
+    llmCallCount,
+    elapsedMs: Date.now() - pipelineStart,
+  });
+  // Hoisted so catch block can preserve partial results on error
+  let lastProposal: EvolutionProposal | null = null;
+  let lastValidation: ValidationResult | null = null;
   try {
     // -----------------------------------------------------------------------
     // Step 1: Read current SKILL.md
     // -----------------------------------------------------------------------
     if (!existsSync(skillPath)) {
-      return {
+      tui.fail(`SKILL.md not found at ${skillPath}`);
+      finishTui();
+      return withStats({
         proposal: null,
         validation: null,
         deployed: false,
         auditEntries,
         reason: `SKILL.md not found at ${skillPath}`,
-      };
+      });
     }
-    const currentDescription = readFileSync(skillPath, "utf-8");
+    const rawContent = readFileSync(skillPath, "utf-8");
+    const frontmatter = parseFrontmatter(rawContent);
+    const currentDescription = frontmatter.description || rawContent;
+    const skillVersion = frontmatter.version || undefined;
+    const versionTag = skillVersion ? `, v${skillVersion}` : "";
+    tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
     // -----------------------------------------------------------------------
     // Step 2: Load eval set
@@ -145,6 +226,10 @@ export async function evolve(
       evalSet = _buildEvalSet(skillRecords, queryRecords, skillName);
     }
+    const posCount = evalSet.filter((e) => e.should_trigger).length;
+    const negCount = evalSet.filter((e) => !e.should_trigger).length;
+    tui.done(`Loaded eval set (${evalSet.length} entries: ${posCount}+, ${negCount}-)`);
     // -----------------------------------------------------------------------
     // Step 3: Load skill usage records
     // -----------------------------------------------------------------------
@@ -153,19 +238,30 @@ export async function evolve(
     // -----------------------------------------------------------------------
     // Step 4: Extract failure patterns
     // -----------------------------------------------------------------------
-    const failurePatterns = _extractFailurePatterns(evalSet, skillUsage, skillName);
+    const failurePatterns = _extractFailurePatterns(
+      evalSet,
+      skillUsage,
+      skillName,
+      options.gradingResults,
+    );
+    const totalMissed = failurePatterns.reduce((sum, p) => sum + p.missed_queries.length, 0);
+    tui.done(
+      `Extracted ${failurePatterns.length} failure pattern(s) (${totalMissed} missed queries)`,
+    );
     // -----------------------------------------------------------------------
     // Step 5: Early exit if no patterns
     // -----------------------------------------------------------------------
     if (failurePatterns.length === 0) {
-      return {
+      finishTui();
+      return withStats({
         proposal: null,
         validation: null,
         deployed: false,
         auditEntries,
         reason: "No failure patterns found",
-      };
+      });
     }
     // -----------------------------------------------------------------------
@@ -174,156 +270,368 @@ export async function evolve(
     const missedQueries = failurePatterns.flatMap((p) => p.missed_queries);
     // -----------------------------------------------------------------------
-    // Steps 7-12: Retry loop for proposal generation and validation
+    // Steps 7-12: Proposal generation and validation
     // -----------------------------------------------------------------------
-    let lastProposal: EvolutionProposal | null = null;
-    let lastValidation: ValidationResult | null = null;
-    let feedbackReason = "";
-    for (let iteration = 0; iteration < maxIterations; iteration++) {
-      // Step 7: Generate proposal
-      const effectiveMissedQueries = feedbackReason
-        ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
-        : missedQueries;
+    // -----------------------------------------------------------------------
+    // Pareto multi-candidate path
+    // -----------------------------------------------------------------------
+    const paretoEnabled = options.paretoEnabled ?? false;
+    const candidateCount = options.candidateCount ?? 3;
+    const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
+    // Compute token efficiency score if enabled and telemetry is available
+    let tokenEffScore: number | undefined;
+    if (tokenEfficiencyEnabled && options.telemetryRecords && options.telemetryRecords.length > 0) {
+      tokenEffScore = computeTokenEfficiencyScore(skillName, options.telemetryRecords);
+      recordAudit(
+        "system",
+        "created",
+        `Token efficiency score for ${skillName}: ${tokenEffScore.toFixed(3)}`,
+      );
+    }
-      const proposal = await _generateProposal(
+    if (paretoEnabled && candidateCount > 1) {
+      // Generate N candidates in parallel
+      const candidates = await generateMultipleProposals(
         currentDescription,
         failurePatterns,
-        effectiveMissedQueries,
+        missedQueries,
         skillName,
         skillPath,
         agent,
+        candidateCount,
+        options.proposalModel,
       );
-      lastProposal = proposal;
+      // Filter by confidence threshold
+      const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
+      if (viableCandidates.length === 0) {
+        finishTui();
+        return withStats({
+          proposal: candidates[0] ?? null,
+          validation: null,
+          deployed: false,
+          auditEntries,
+          reason: `No candidates met confidence threshold ${confidenceThreshold}`,
+        });
+      }
-      // Step 8: Audit "created"
-      recordAudit(
-        proposal.proposal_id,
-        "created",
-        `Proposal created for ${skillName} (iteration ${iteration + 1})`,
-      );
+      // Validate each candidate
+      const paretoCandidates: ParetoCandidate[] = [];
+      for (const proposal of viableCandidates) {
+        recordAudit(proposal.proposal_id, "created", `Pareto candidate for ${skillName}`);
-      // Step 9: Check confidence threshold
-      if (proposal.confidence < confidenceThreshold) {
-        feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
+        const validation = await _validateProposal(
+          proposal,
+          evalSet,
+          agent,
+          options.validationModel,
+        );
         recordAudit(
           proposal.proposal_id,
-          "rejected",
-          `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
+          "validated",
+          `Pareto validation: improved=${validation.improved}`,
         );
-        // If this is the last iteration, return early with rejection
-        if (iteration === maxIterations - 1) {
-          return {
-            proposal: lastProposal,
-            validation: null,
-            deployed: false,
-            auditEntries,
-            reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
+        if (validation.improved && validation.per_entry_results) {
+          const invocationScores = computeInvocationScores(validation.per_entry_results);
+          const candidate: ParetoCandidate = {
+            proposal,
+            validation,
+            invocation_scores: invocationScores,
+            dominates_on: [],
           };
+          if (tokenEffScore !== undefined) {
+            candidate.token_efficiency_score = tokenEffScore;
+          }
+          paretoCandidates.push(candidate);
         }
+      }
-        continue;
+      if (paretoCandidates.length === 0) {
+        finishTui();
+        return withStats({
+          proposal: viableCandidates[0],
+          validation: null,
+          deployed: false,
+          auditEntries,
+          reason: "No Pareto candidates improved validation",
+        });
       }
-      // Step 10: Validate against eval set
-      const validation = await _validateProposal(proposal, evalSet, agent);
-      lastValidation = validation;
+      // Compute Pareto frontier
+      const frontier = computeParetoFrontier(paretoCandidates);
+      const { best } = selectFromFrontier(frontier);
-      // Step 11: Audit "validated"
-      const evalSnapshot: EvalPassRate = {
-        total: evalSet.length,
-        passed: Math.round(validation.after_pass_rate * evalSet.length),
-        failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
-        pass_rate: validation.after_pass_rate,
-      };
-      recordAudit(
-        proposal.proposal_id,
-        "validated",
-        `Validation complete: improved=${validation.improved}`,
-        evalSnapshot,
-      );
+      lastProposal = best.proposal;
+      lastValidation = best.validation;
-      // Step 12: Check validation result
-      if (!validation.improved) {
-        feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
+      // Skip the standard retry loop — we already have our result
+    } else {
+      // Standard single-candidate retry loop
+      let feedbackReason = "";
+      for (let iteration = 0; iteration < maxIterations; iteration++) {
+        // Step 7: Generate proposal
+        const effectiveMissedQueries = feedbackReason
+          ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
+          : missedQueries;
+        tui.step(`Generating proposal (iteration ${iteration + 1}/${maxIterations})...`);
+        const proposal = await _generateProposal(
+          currentDescription,
+          failurePatterns,
+          effectiveMissedQueries,
+          skillName,
+          skillPath,
+          agent,
+          options.proposalModel,
+        );
+        llmCallCount++;
+        lastProposal = proposal;
+        tui.done(`Proposal generated (conf: ${proposal.confidence.toFixed(2)})`);
+        // Step 8: Audit "created"
         recordAudit(
           proposal.proposal_id,
-          "rejected",
-          `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
+          "created",
+          `Proposal created for ${skillName} (iteration ${iteration + 1})`,
         );
-        // If this is the last iteration, return with rejection
-        if (iteration === maxIterations - 1) {
-          return {
-            proposal: lastProposal,
-            validation: lastValidation,
-            deployed: false,
-            auditEntries,
-            reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
-          };
+        // Step 9: Check confidence threshold
+        if (proposal.confidence < confidenceThreshold) {
+          feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
+          recordAudit(
+            proposal.proposal_id,
+            "rejected",
+            `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
+          );
+          // If this is the last iteration, return early with rejection
+          if (iteration === maxIterations - 1) {
+            finishTui();
+            return withStats({
+              proposal: lastProposal,
+              validation: null,
+              deployed: false,
+              auditEntries,
+              reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
+            });
+          }
+          continue;
         }
-        continue;
-      }
+        // Step 10: Validate against eval set
+        const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
+        tui.step(
+          `Validating ${evalSet.length} entries (${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
+        );
+        const validation = await _validateProposal(
+          proposal,
+          evalSet,
+          agent,
+          options.validationModel,
+        );
+        lastValidation = validation;
+        llmCallCount += batchCount * 2 * VALIDATION_RUNS;
+        tui.done(
+          `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
+        );
+        // Step 11: Audit "validated"
+        const evalSnapshot: EvalPassRate = {
+          total: evalSet.length,
+          passed: Math.round(validation.after_pass_rate * evalSet.length),
+          failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
+          pass_rate: validation.after_pass_rate,
+        };
+        recordAudit(
+          proposal.proposal_id,
+          "validated",
+          `Validation complete: improved=${validation.improved}`,
+          evalSnapshot,
+        );
-      // Validation passed - break out of retry loop
-      break;
+        // Step 12: Check validation result
+        if (!validation.improved) {
+          feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
+          recordAudit(
+            proposal.proposal_id,
+            "rejected",
+            `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
+          );
+          // If this is the last iteration, return with rejection
+          if (iteration === maxIterations - 1) {
+            finishTui();
+            return withStats({
+              proposal: lastProposal,
+              validation: lastValidation,
+              deployed: false,
+              auditEntries,
+              reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
+            });
+          }
+          continue;
+        }
+        // Validation passed - break out of retry loop
+        break;
+      }
     }
     // -----------------------------------------------------------------------
     // Step 13: Dry run check
     // -----------------------------------------------------------------------
     if (dryRun) {
-      return {
+      finishTui();
+      return withStats({
         proposal: lastProposal,
         validation: lastValidation,
         deployed: false,
         auditEntries,
         reason: "Dry run - proposal validated but not deployed",
-      };
+      });
     }
     // -----------------------------------------------------------------------
-    // Step 14: Deploy (actual deploy wired in TASK-14)
+    // Step 13b: Baseline gate (--with-baseline)
     // -----------------------------------------------------------------------
-    if (lastProposal) {
+    let baselineResult: BaselineMeasurement | undefined;
+    if (options.withBaseline && lastProposal) {
+      tui.step("Measuring baseline...");
+      baselineResult = await _measureBaseline({
+        evalSet,
+        skillDescription: currentDescription,
+        skillName,
+        agent,
+        modelFlag: options.validationModel,
+      });
+      tui.done(
+        `Baseline: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
+      );
       recordAudit(
         lastProposal.proposal_id,
-        "deployed",
-        `Deployed proposal for ${skillName}`,
-        lastValidation
-          ? {
-              total: evalSet.length,
-              passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
-              failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
-              pass_rate: lastValidation.after_pass_rate,
-            }
-          : undefined,
+        "validated",
+        `Baseline check: lift=${baselineResult.lift.toFixed(3)}, adds_value=${baselineResult.adds_value}`,
+      );
+      if (!baselineResult.adds_value) {
+        finishTui();
+        return withStats({
+          proposal: lastProposal,
+          validation: lastValidation,
+          deployed: false,
+          auditEntries,
+          reason: `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`,
+          baselineResult,
+        });
+      }
+    }
+    // -----------------------------------------------------------------------
+    // Step 13c: Gate validation (--cheap-loop / --gate-model)
+    // -----------------------------------------------------------------------
+    let gateValidation: ValidationResult | undefined;
+    if (options.gateModel && lastProposal && lastValidation?.improved) {
+      tui.step(`Gate validation (${options.gateModel})...`);
+      gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
+      tui.done(
+        `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
+      );
+      recordAudit(
+        lastProposal.proposal_id,
+        "validated",
+        `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
+      );
+      if (!gateValidation.improved) {
+        finishTui();
+        return withStats({
+          proposal: lastProposal,
+          validation: lastValidation,
+          deployed: false,
+          auditEntries,
+          reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
+          gateValidation,
+          ...(baselineResult ? { baselineResult } : {}),
+        });
+      }
+    }
+    // -----------------------------------------------------------------------
+    // Step 14: Deploy — write updated description to SKILL.md
+    // -----------------------------------------------------------------------
+    if (lastProposal && lastValidation?.improved) {
+      // Create backup before modifying
+      const backupPath = `${skillPath}.bak`;
+      copyFileSync(skillPath, backupPath);
+      tui.done(`Backup created at ${backupPath}`);
+      // Replace the frontmatter description
+      const updatedContent = replaceFrontmatterDescription(
+        rawContent,
+        lastProposal.proposed_description,
       );
+      writeFileSync(skillPath, updatedContent, "utf-8");
+      tui.done(`Deployed updated description to ${skillPath}`);
+      recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, {
+        total: evalSet.length,
+        passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
+        failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length),
+        pass_rate: lastValidation.after_pass_rate,
+      });
     }
     // -----------------------------------------------------------------------
-    // Step 15-16: Return complete result
+    // Step 15: Update evolution memory
     // -----------------------------------------------------------------------
-    return {
+    const wasDeployed = lastProposal !== null && lastValidation !== null && lastValidation.improved;
+    const evolveResult: EvolveResult = withStats({
       proposal: lastProposal,
       validation: lastValidation,
-      deployed: true,
+      deployed: wasDeployed,
       auditEntries,
-      reason: "Evolution deployed successfully",
-    };
+      reason: wasDeployed
+        ? "Evolution deployed successfully"
+        : "Evolution not deployed: proposal or validation missing",
+      ...(skillVersion ? { skillVersion } : {}),
+      ...(baselineResult ? { baselineResult } : {}),
+      ...(gateValidation ? { gateValidation } : {}),
+    });
+    if (lastProposal) {
+      try {
+        _updateContextAfterEvolve(skillName, lastProposal, evolveResult);
+      } catch {
+        // Memory writes should never fail the main operation
+      }
+    }
+    // -----------------------------------------------------------------------
+    // Step 16: Return complete result
+    // -----------------------------------------------------------------------
+    finishTui();
+    return evolveResult;
   } catch (error) {
-    // Robust error handling: catch any unexpected errors and return gracefully
+    tui.destroy();
+    // Robust error handling: preserve partial results so callers can inspect progress
     const errorMessage = error instanceof Error ? error.message : String(error);
-    return {
-      proposal: null,
-      validation: null,
+    return withStats({
+      proposal: lastProposal,
+      validation: lastValidation,
       deployed: false,
       auditEntries,
       reason: `Error during evolution: ${errorMessage}`,
-    };
+    });
   }
 }
@@ -341,6 +649,15 @@ export async function cliMain(): Promise<void> {
       "dry-run": { type: "boolean", default: false },
       confidence: { type: "string", default: "0.6" },
       "max-iterations": { type: "string", default: "3" },
+      pareto: { type: "boolean", default: false },
+      candidates: { type: "string", default: "3" },
+      "token-efficiency": { type: "boolean", default: false },
+      "with-baseline": { type: "boolean", default: false },
+      "validation-model": { type: "string", default: "haiku" },
+      "cheap-loop": { type: "boolean", default: false },
+      "gate-model": { type: "string" },
+      "proposal-model": { type: "string" },
+      verbose: { type: "boolean", default: false },
       help: { type: "boolean", default: false },
     },
     strict: true,
@@ -360,6 +677,15 @@ Options:
   --dry-run           Validate proposal without deploying
   --confidence        Confidence threshold 0.0-1.0 (default: 0.6)
   --max-iterations    Max retry iterations (default: 3)
+  --pareto            Enable Pareto multi-candidate selection
+  --candidates        Number of candidates to generate (default: 3, max: 5)
+  --token-efficiency  Enable 5D Pareto with token efficiency scoring
+  --with-baseline     Gate deployment on baseline lift > 0.05
+  --validation-model  Model for trigger-check validation calls (default: haiku)
+  --cheap-loop        Use cheap models for loop, expensive model for final gate
+  --gate-model        Model for final gate validation (default: sonnet when --cheap-loop)
+  --proposal-model    Model for proposal generation LLM calls
+  --verbose           Output full EvolveResult JSON (default: compact summary)
   --help              Show this help message`);
     process.exit(0);
   }
@@ -395,6 +721,12 @@ Options:
     process.exit(1);
   }
+  const tokenEfficiencyEnabled = values["token-efficiency"] ?? false;
+  let telemetryRecords: SessionTelemetryRecord[] | undefined;
+  if (tokenEfficiencyEnabled) {
+    telemetryRecords = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
+  }
   const result = await evolve({
     skillName: values.skill,
     skillPath: values["skill-path"],
@@ -403,9 +735,40 @@ Options:
     dryRun: values["dry-run"] ?? false,
     confidenceThreshold: Number.parseFloat(values.confidence ?? "0.6"),
     maxIterations: Number.parseInt(values["max-iterations"] ?? "3", 10),
+    paretoEnabled: values.pareto ?? false,
+    candidateCount: Number.parseInt(values.candidates ?? "3", 10),
+    tokenEfficiencyEnabled,
+    telemetryRecords,
+    withBaseline: values["with-baseline"] ?? false,
+    validationModel: values["validation-model"],
+    cheapLoop: values["cheap-loop"] ?? false,
+    gateModel: values["gate-model"],
+    proposalModel: values["proposal-model"],
   });
-  console.log(JSON.stringify(result, null, 2));
+  if (values.verbose) {
+    console.log(JSON.stringify(result, null, 2));
+  } else {
+    const summary: EvolveResultSummary = {
+      skill: values.skill,
+      deployed: result.deployed,
+      reason: result.reason,
+      before: result.validation?.before_pass_rate ?? 0,
+      after: result.validation?.after_pass_rate ?? 0,
+      net_change: result.validation?.net_change ?? 0,
+      improved: result.validation?.improved ?? false,
+      regressions: result.validation?.regressions.length ?? 0,
+      new_passes: result.validation?.new_passes.length ?? 0,
+      confidence: result.proposal?.confidence ?? 0,
+      llm_calls: result.llmCallCount,
+      elapsed_s: +(result.elapsedMs / 1000).toFixed(1),
+      proposal_id: result.proposal?.proposal_id ?? "",
+      rationale: result.proposal?.rationale ?? "",
+      ...(result.skillVersion ? { version: result.skillVersion } : {}),
+      dashboard_url: `http://localhost:3141/report/${encodeURIComponent(values.skill)}`,
+    };
+    console.log(JSON.stringify(summary, null, 2));
+  }
   process.exit(result.deployed ? 0 : 1);
 }