npm - selftune - Versions diffs - 0.2.16 → 0.2.19 - Mend

selftune 0.2.16 → 0.2.19

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (91)

package/README.md +32 -22
package/apps/local-dashboard/dist/assets/index-DnhnXQm6.js +60 -0
package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
package/apps/local-dashboard/dist/index.html +5 -5
package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
package/cli/selftune/alpha-upload/client.ts +51 -1
package/cli/selftune/alpha-upload/flush.ts +46 -5
package/cli/selftune/alpha-upload/stage-canonical.ts +32 -10
package/cli/selftune/alpha-upload-contract.ts +9 -0
package/cli/selftune/constants.ts +92 -5
package/cli/selftune/contribute/contribute.ts +30 -2
package/cli/selftune/contribute/sanitize.ts +52 -5
package/cli/selftune/contribution-config.ts +249 -0
package/cli/selftune/contribution-relay.ts +177 -0
package/cli/selftune/contribution-signals.ts +219 -0
package/cli/selftune/contribution-staging.ts +147 -0
package/cli/selftune/contributions.ts +532 -0
package/cli/selftune/creator-contributions.ts +333 -0
package/cli/selftune/dashboard-contract.ts +305 -1
package/cli/selftune/dashboard-server.ts +47 -13
package/cli/selftune/eval/family-overlap.ts +395 -0
package/cli/selftune/eval/hooks-to-evals.ts +182 -28
package/cli/selftune/eval/synthetic-evals.ts +298 -11
package/cli/selftune/evolution/description-quality.ts +12 -11
package/cli/selftune/evolution/evolve.ts +214 -51
package/cli/selftune/evolution/validate-proposal.ts +9 -6
package/cli/selftune/export.ts +2 -2
package/cli/selftune/grading/grade-session.ts +20 -0
package/cli/selftune/hooks/commit-track.ts +188 -0
package/cli/selftune/hooks/prompt-log.ts +10 -1
package/cli/selftune/hooks/session-stop.ts +2 -2
package/cli/selftune/hooks/skill-eval.ts +15 -1
package/cli/selftune/hooks/stdin-preview.ts +32 -0
package/cli/selftune/index.ts +41 -5
package/cli/selftune/ingestors/codex-rollout.ts +31 -35
package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
package/cli/selftune/localdb/db.ts +2 -2
package/cli/selftune/localdb/direct-write.ts +69 -6
package/cli/selftune/localdb/queries.ts +1253 -37
package/cli/selftune/localdb/schema.ts +66 -0
package/cli/selftune/orchestrate.ts +32 -4
package/cli/selftune/recover.ts +153 -0
package/cli/selftune/repair/skill-usage.ts +363 -4
package/cli/selftune/routes/actions.ts +35 -1
package/cli/selftune/routes/analytics.ts +14 -0
package/cli/selftune/routes/index.ts +1 -0
package/cli/selftune/routes/overview.ts +150 -4
package/cli/selftune/routes/skill-report.ts +648 -18
package/cli/selftune/status.ts +81 -2
package/cli/selftune/sync.ts +56 -2
package/cli/selftune/trust-model.ts +66 -0
package/cli/selftune/types.ts +80 -0
package/cli/selftune/utils/skill-detection.ts +43 -0
package/cli/selftune/utils/transcript.ts +210 -1
package/cli/selftune/watchlist.ts +65 -0
package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
package/package.json +1 -1
package/packages/telemetry-contract/src/types.ts +11 -0
package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
package/packages/ui/src/components/EvidenceViewer.tsx +335 -144
package/packages/ui/src/components/EvolutionTimeline.tsx +58 -28
package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
package/packages/ui/src/components/section-cards.tsx +12 -9
package/packages/ui/src/primitives/card.tsx +1 -1
package/skill/SKILL.md +40 -2
package/skill/Workflows/AlphaUpload.md +4 -0
package/skill/Workflows/Composability.md +64 -0
package/skill/Workflows/Contribute.md +6 -3
package/skill/Workflows/Contributions.md +97 -0
package/skill/Workflows/CreatorContributions.md +74 -0
package/skill/Workflows/Dashboard.md +31 -0
package/skill/Workflows/Evals.md +57 -8
package/skill/Workflows/Evolve.md +31 -13
package/skill/Workflows/ExportCanonical.md +121 -0
package/skill/Workflows/Hook.md +131 -0
package/skill/Workflows/Ingest.md +7 -0
package/skill/Workflows/Initialize.md +29 -9
package/skill/Workflows/Orchestrate.md +27 -5
package/skill/Workflows/Quickstart.md +94 -0
package/skill/Workflows/Recover.md +84 -0
package/skill/Workflows/RepairSkillUsage.md +95 -0
package/skill/Workflows/Sync.md +18 -12
package/skill/Workflows/Uninstall.md +82 -0
package/skill/settings_snippet.json +11 -0
package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12

package/cli/selftune/evolution/description-quality.ts CHANGED Viewed

@@ -139,27 +139,27 @@ export function scoreLengthCriterion(description: string): number {
 }
 /** Score presence of trigger context words (when/if/before/after etc). */
-export function scoreTriggerContextCriterion(description: string): number {
-  const matches = countWordMatches(description.toLowerCase(), TRIGGER_PATTERNS);
+export function scoreTriggerContextCriterion(description: string, lower?: string): number {
+  const matches = countWordMatches(lower ?? description.toLowerCase(), TRIGGER_PATTERNS);
   if (matches === 0) return 0.0;
   if (matches === 1) return 0.7;
   return Math.min(1.0, 0.7 + 0.15 * (matches - 1));
 }
 /** Score absence of vague words (lower is worse). */
-export function scoreVaguenessCriterion(description: string): number {
-  const matches = countWordMatches(description.toLowerCase(), VAGUE_PATTERNS);
+export function scoreVaguenessCriterion(description: string, lower?: string): number {
+  const matches = countWordMatches(lower ?? description.toLowerCase(), VAGUE_PATTERNS);
   if (matches === 0) return 1.0;
   if (matches === 1) return 0.6;
   return Math.max(0.1, 0.6 - 0.15 * (matches - 1));
 }
 /** Score whether description specifies at least one concrete action or domain. */
-export function scoreSpecificityCriterion(description: string): number {
-  const lower = description.toLowerCase();
-  const hasAction = ACTION_PATTERNS.some((p) => p.test(lower));
+export function scoreSpecificityCriterion(description: string, lower?: string): number {
+  const l = lower ?? description.toLowerCase();
+  const hasAction = ACTION_PATTERNS.some((p) => p.test(l));
-  const fillerCount = FILLER_PHRASES.filter((f) => lower.includes(f)).length;
+  const fillerCount = FILLER_PHRASES.filter((f) => l.includes(f)).length;
   const words = description.split(/\s+/).length;
   const fillerRatio = fillerCount > 0 ? fillerCount / Math.max(1, words / 10) : 0;
@@ -204,11 +204,12 @@ const WEIGHTS = {
  * Pure function — no I/O, no LLM calls.
  */
 export function scoreDescription(description: string, skillName?: string): DescriptionQualityScore {
+  const lower = description.toLowerCase();
   const criteria = {
     length: scoreLengthCriterion(description),
-    trigger_context: scoreTriggerContextCriterion(description),
-    vagueness: scoreVaguenessCriterion(description),
-    specificity: scoreSpecificityCriterion(description),
+    trigger_context: scoreTriggerContextCriterion(description, lower),
+    vagueness: scoreVaguenessCriterion(description, lower),
+    specificity: scoreSpecificityCriterion(description, lower),
     not_just_name: scoreNotJustNameCriterion(description, skillName),
   };

package/cli/selftune/evolution/evolve.ts CHANGED Viewed

@@ -38,6 +38,7 @@ import type {
 } from "../types.js";
 import { CLIError, handleCLIError } from "../utils/cli-error.js";
 import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
+import type { EffortLevel } from "../utils/llm-call.js";
 import { createEvolveTUI } from "../utils/tui.js";
 import { appendAuditEntry } from "./audit.js";
 import { checkConstitution } from "./constitutional.js";
@@ -51,6 +52,7 @@ import {
   selectFromFrontier,
 } from "./pareto.js";
 import { generateMultipleProposals, generateProposal } from "./propose-description.js";
+import { evaluateStoppingCriteria } from "./stopping-criteria.js";
 import { buildUnblockSuggestions } from "./unblock-suggestions.js";
 import type { ValidationResult } from "./validate-proposal.js";
 import {
@@ -80,7 +82,9 @@ export interface EvolveOptions {
   validationModel?: string;
   cheapLoop?: boolean;
   gateModel?: string;
+  gateEffort?: EffortLevel;
   proposalModel?: string;
+  adaptiveGate?: boolean;
   syncFirst?: boolean;
   syncForce?: boolean;
 }
@@ -174,6 +178,73 @@ function formatSimpleDiff(oldText: string, newText: string): string {
   return output.join("\n");
 }
+function countValidationLlmCalls(evalSetSize: number): number {
+  if (evalSetSize === 0) return 0;
+  return Math.ceil(evalSetSize / TRIGGER_CHECK_BATCH_SIZE) * 2 * VALIDATION_RUNS;
+}
+interface GateDecision {
+  model: string;
+  effort?: EffortLevel;
+  riskSignals: string[];
+}
+function countWords(text: string): number {
+  return text
+    .trim()
+    .split(/\s+/)
+    .filter((token) => token.length > 0).length;
+}
+function resolveGateDecision(
+  options: EvolveOptions,
+  proposal: EvolutionProposal,
+  validation: ValidationResult,
+  currentDescription: string,
+  confidenceThreshold: number,
+): GateDecision | undefined {
+  const baseModel = options.gateModel;
+  if (!baseModel) return undefined;
+  const baseDecision: GateDecision = {
+    model: baseModel,
+    effort: options.gateEffort,
+    riskSignals: [],
+  };
+  if (!options.adaptiveGate) return baseDecision;
+  const riskSignals: string[] = [];
+  const originalWords = countWords(currentDescription);
+  const proposedWords = countWords(proposal.proposed_description);
+  const wordGrowth = originalWords === 0 ? 1 : proposedWords / originalWords;
+  const lowLift = validation.net_change < 0.15;
+  const hasRegressions = validation.regressions.length > 0;
+  const lowConfidence = proposal.confidence < Math.max(confidenceThreshold + 0.05, 0.75);
+  const broadeningRisk = wordGrowth > 1.8 || proposedWords - originalWords > 32;
+  const notYetStrong = validation.after_pass_rate < 0.9;
+  if (hasRegressions) riskSignals.push(`regressions=${validation.regressions.length}`);
+  if (lowLift) riskSignals.push(`low_lift=${validation.net_change.toFixed(3)}`);
+  if (lowConfidence) riskSignals.push(`confidence=${proposal.confidence.toFixed(2)}`);
+  if (broadeningRisk) riskSignals.push(`word_growth=${wordGrowth.toFixed(2)}x`);
+  if (notYetStrong) riskSignals.push(`after_pass_rate=${validation.after_pass_rate.toFixed(2)}`);
+  const shouldEscalate = hasRegressions || validation.net_change < 0.1 || riskSignals.length >= 2;
+  if (!shouldEscalate) {
+    return {
+      ...baseDecision,
+      riskSignals,
+    };
+  }
+  return {
+    model: "opus",
+    effort: options.gateEffort === "max" ? "max" : "high",
+    riskSignals,
+  };
+}
 // ---------------------------------------------------------------------------
 // Main orchestrator
 // ---------------------------------------------------------------------------
@@ -456,7 +527,7 @@ export async function evolve(
     // -----------------------------------------------------------------------
     // Pareto multi-candidate path
     // -----------------------------------------------------------------------
-    const paretoEnabled = options.paretoEnabled ?? false;
+    const paretoEnabled = options.paretoEnabled ?? true;
     const candidateCount = options.candidateCount ?? 3;
     const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
     const telemetryRecords =
@@ -494,6 +565,7 @@ export async function evolve(
         options.proposalModel,
         aggregateMetrics,
       );
+      llmCallCount += candidateCount;
       // Filter by confidence threshold
       const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
@@ -564,6 +636,7 @@ export async function evolve(
           agent,
           options.validationModel,
         );
+        llmCallCount += countValidationLlmCalls(evalSet.length);
         recordAudit(
           proposal.proposal_id,
           "validated",
@@ -628,6 +701,7 @@ export async function evolve(
     } else {
       // Standard single-candidate retry loop
       let feedbackReason = "";
+      const previousPassRates: number[] = [];
       for (let iteration = 0; iteration < maxIterations; iteration++) {
         iterationsCompleted = iteration + 1;
@@ -681,7 +755,24 @@ export async function evolve(
         );
         if (!constitution.passed) {
           feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`;
-          recordAudit(proposal.proposal_id, "rejected", feedbackReason);
+          // Re-evaluate stopping after a constitutional rejection by treating the
+          // last entry in previousPassRates as the currentPassRate (or 0 on the
+          // first iteration) and slicing it out of history before calling
+          // evaluateStoppingCriteria() with the current iteration/maxIterations,
+          // confidenceThreshold, and proposal.confidence.
+          const constitutionStop = evaluateStoppingCriteria(
+            previousPassRates.at(-1) ?? 0,
+            previousPassRates.slice(0, -1),
+            iteration + 1,
+            maxIterations,
+            confidenceThreshold,
+            proposal.confidence,
+          );
+          recordAudit(
+            proposal.proposal_id,
+            "rejected",
+            `${feedbackReason} (stopping: ${constitutionStop.reason})`,
+          );
           recordEvidence({
             timestamp: new Date().toISOString(),
             proposal_id: proposal.proposal_id,
@@ -691,54 +782,64 @@ export async function evolve(
             stage: "rejected",
             rationale: proposal.rationale,
             confidence: proposal.confidence,
-            details: feedbackReason,
+            details: `${feedbackReason} (stopping: ${constitutionStop.reason})`,
           });
-          if (iteration === maxIterations - 1) {
+          if (constitutionStop.shouldStop) {
             finishTui();
             return withStats({
               proposal: lastProposal,
               validation: null,
               deployed: false,
               auditEntries,
-              reason: feedbackReason,
+              reason: `${feedbackReason} (${constitutionStop.reason})`,
             });
           }
           continue;
         }
-        // Step 9: Check confidence threshold
-        if (proposal.confidence < confidenceThreshold) {
-          feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
-          recordAudit(
-            proposal.proposal_id,
-            "rejected",
-            `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
+        // Step 9: Check confidence threshold via stopping criteria
+        {
+          const preValidationStop = evaluateStoppingCriteria(
+            previousPassRates.at(-1) ?? 0,
+            previousPassRates.slice(0, -1),
+            iteration + 1,
+            maxIterations,
+            confidenceThreshold,
+            proposal.confidence,
           );
-          recordEvidence({
-            timestamp: new Date().toISOString(),
-            proposal_id: proposal.proposal_id,
-            skill_name: skillName,
-            skill_path: skillPath,
-            target: "description",
-            stage: "rejected",
-            rationale: proposal.rationale,
-            confidence: proposal.confidence,
-            details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
-          });
-          // If this is the last iteration, return early with rejection
-          if (iteration === maxIterations - 1) {
-            finishTui();
-            return withStats({
-              proposal: lastProposal,
-              validation: null,
-              deployed: false,
-              auditEntries,
-              reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
+          if (proposal.confidence < confidenceThreshold) {
+            feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
+            recordAudit(
+              proposal.proposal_id,
+              "rejected",
+              `${feedbackReason} (stopping: ${preValidationStop.reason})`,
+            );
+            recordEvidence({
+              timestamp: new Date().toISOString(),
+              proposal_id: proposal.proposal_id,
+              skill_name: skillName,
+              skill_path: skillPath,
+              target: "description",
+              stage: "rejected",
+              rationale: proposal.rationale,
+              confidence: proposal.confidence,
+              details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
             });
-          }
-          continue;
+            // Use stopping criteria to decide whether to return or retry
+            if (preValidationStop.shouldStop) {
+              finishTui();
+              return withStats({
+                proposal: lastProposal,
+                validation: null,
+                deployed: false,
+                auditEntries,
+                reason: `${feedbackReason} (${preValidationStop.reason})`,
+              });
+            }
+            continue;
+          }
         }
         // Step 10: Validate against eval set
@@ -753,7 +854,7 @@ export async function evolve(
           options.validationModel,
         );
         lastValidation = validation;
-        llmCallCount += batchCount * 2 * VALIDATION_RUNS;
+        llmCallCount += countValidationLlmCalls(evalSet.length);
         tui.done(
           `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
         );
@@ -792,13 +893,23 @@ export async function evolve(
           },
         });
-        // Step 12: Check validation result
+        // Step 12: Evaluate stopping criteria after validation
+        const stopping = evaluateStoppingCriteria(
+          validation.after_pass_rate,
+          previousPassRates,
+          iteration + 1,
+          maxIterations,
+          confidenceThreshold,
+          proposal.confidence,
+        );
+        previousPassRates.push(validation.after_pass_rate);
         if (!validation.improved) {
           feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
           recordAudit(
             proposal.proposal_id,
             "rejected",
-            `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
+            `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
           );
           recordEvidence({
             timestamp: new Date().toISOString(),
@@ -809,7 +920,7 @@ export async function evolve(
             stage: "rejected",
             rationale: proposal.rationale,
             confidence: proposal.confidence,
-            details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
+            details: `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
             validation: {
               improved: validation.improved,
               before_pass_rate: validation.before_pass_rate,
@@ -821,21 +932,26 @@ export async function evolve(
             },
           });
-          // If this is the last iteration, return with rejection
-          if (iteration === maxIterations - 1) {
+          // Use stopping criteria to decide whether to return or retry
+          if (stopping.shouldStop) {
             finishTui();
             return withStats({
               proposal: lastProposal,
               validation: lastValidation,
               deployed: false,
               auditEntries,
-              reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
+              reason: `Validation failed (${stopping.reason}): net_change=${validation.net_change.toFixed(3)}`,
             });
           }
           continue;
         }
+        // Validation passed — check if converged or continue
+        if (stopping.shouldStop && stopping.reason.includes("Converged")) {
+          recordAudit(proposal.proposal_id, "validated", `Stopping early: ${stopping.reason}`);
+        }
         // Validation passed - break out of retry loop
         break;
       }
@@ -916,18 +1032,39 @@ export async function evolve(
     // -----------------------------------------------------------------------
     let gateValidation: ValidationResult | undefined;
     if (options.gateModel && lastProposal && lastValidation?.improved) {
-      tui.step(`Gate validation (${options.gateModel})...`);
-      gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
-      llmCallCount++;
+      const gateDecision = resolveGateDecision(
+        options,
+        lastProposal,
+        lastValidation,
+        currentDescription,
+        confidenceThreshold,
+      );
+      const gateLabel = gateDecision?.effort
+        ? `${gateDecision.model}, effort=${gateDecision.effort}`
+        : (gateDecision?.model ?? options.gateModel);
+      tui.step(`Gate validation (${gateLabel})...`);
+      gateValidation = await _gateValidateProposal(
+        lastProposal,
+        evalSet,
+        agent,
+        gateDecision?.model ?? options.gateModel,
+        gateDecision?.effort,
+      );
+      llmCallCount += countValidationLlmCalls(evalSet.length);
       tui.done(
-        `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
+        `Gate (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
       );
+      const gatePrefix =
+        gateDecision && gateDecision.riskSignals.length > 0
+          ? `Adaptive gate [${gateDecision.riskSignals.join(", ")}]`
+          : "Gate validation";
       if (!gateValidation.improved) {
         recordAudit(
           lastProposal.proposal_id,
           "rejected",
-          `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
+          `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
         );
         recordEvidence({
           timestamp: new Date().toISOString(),
@@ -938,7 +1075,7 @@ export async function evolve(
           stage: "rejected",
           rationale: lastProposal.rationale,
           confidence: lastProposal.confidence,
-          details: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
+          details: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
           validation: {
             improved: gateValidation.improved,
             before_pass_rate: gateValidation.before_pass_rate,
@@ -955,7 +1092,7 @@ export async function evolve(
           validation: lastValidation,
           deployed: false,
           auditEntries,
-          reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
+          reason: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
           gateValidation,
           ...(baselineResult ? { baselineResult } : {}),
         });
@@ -964,7 +1101,7 @@ export async function evolve(
       recordAudit(
         lastProposal.proposal_id,
         "validated",
-        `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
+        `${gatePrefix} (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
       );
     }
@@ -1082,7 +1219,7 @@ export async function cliMain(): Promise<void> {
       "dry-run": { type: "boolean", default: false },
       confidence: { type: "string", default: "0.6" },
       "max-iterations": { type: "string", default: "3" },
-      pareto: { type: "boolean", default: false },
+      pareto: { type: "boolean", default: true },
       candidates: { type: "string", default: "3" },
       "token-efficiency": { type: "boolean", default: false },
       "with-baseline": { type: "boolean", default: false },
@@ -1090,7 +1227,9 @@ export async function cliMain(): Promise<void> {
       "cheap-loop": { type: "boolean", default: true },
       "full-model": { type: "boolean", default: false },
       "gate-model": { type: "string" },
+      "gate-effort": { type: "string" },
       "proposal-model": { type: "string" },
+      "adaptive-gate": { type: "boolean", default: false },
       "sync-first": { type: "boolean", default: false },
       "sync-force": { type: "boolean", default: false },
       verbose: { type: "boolean", default: false },
@@ -1121,6 +1260,8 @@ Options:
   --cheap-loop        Use cheap models for loop, expensive for gate (default: on)
   --full-model        Use same model for all stages (disables cheap-loop)
   --gate-model        Model for final gate validation (default: sonnet)
+  --gate-effort       Thinking effort for final gate (low|medium|high|max)
+  --adaptive-gate     Escalate risky gate checks to opus + high effort
   --proposal-model    Model for proposal generation LLM calls
   --sync-first        Refresh source-truth telemetry before building evals/failure patterns
   --sync-force        Force a full rescan during --sync-first
@@ -1143,6 +1284,24 @@ Options:
       "Add --sync-first when using --sync-force",
     );
   }
+  if (values["gate-effort"] && !["low", "medium", "high", "max"].includes(values["gate-effort"])) {
+    throw new CLIError(
+      `Invalid --gate-effort value: ${values["gate-effort"]}`,
+      "INVALID_FLAG",
+      "Use one of: low, medium, high, max",
+    );
+  }
+  if (
+    (values["gate-effort"] || values["adaptive-gate"]) &&
+    (values["full-model"] ?? false) &&
+    !values["gate-model"]
+  ) {
+    throw new CLIError(
+      "--gate-effort and --adaptive-gate require --gate-model when --full-model is set",
+      "INVALID_FLAG",
+      "Add --gate-model <model> or drop --full-model",
+    );
+  }
   const { detectAgent } = await import("../utils/llm-call.js");
   const requestedAgent = values.agent;
@@ -1223,6 +1382,8 @@ Options:
     console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
     console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
     console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
+    console.error(`[verbose] Adaptive gate: ${values["adaptive-gate"] ?? false}`);
+    console.error(`[verbose] Gate effort: ${values["gate-effort"] ?? "(default)"}`);
   }
   const result = await evolve({
@@ -1241,7 +1402,9 @@ Options:
     validationModel: values["validation-model"],
     cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
     gateModel: values["gate-model"],
+    gateEffort: values["gate-effort"] as EffortLevel | undefined,
     proposalModel: values["proposal-model"],
+    adaptiveGate: values["adaptive-gate"] ?? false,
     gradingResults,
     syncFirst: values["sync-first"] ?? false,
     syncForce: values["sync-force"] ?? false,

package/cli/selftune/evolution/validate-proposal.ts CHANGED Viewed

@@ -7,7 +7,7 @@
  */
 import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
-import { callLlm } from "../utils/llm-call.js";
+import { callLlm, type EffortLevel } from "../utils/llm-call.js";
 import {
   buildBatchTriggerCheckPrompt,
   buildTriggerCheckPrompt,
@@ -52,6 +52,7 @@ export async function validateProposalSequential(
   evalSet: EvalEntry[],
   agent: string,
   modelFlag?: string,
+  effort?: EffortLevel,
 ): Promise<ValidationResult> {
   if (evalSet.length === 0) {
     return {
@@ -76,14 +77,14 @@ export async function validateProposalSequential(
   for (const entry of evalSet) {
     // Check with original description
     const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
-    const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
+    const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag, effort);
     const beforeTriggered = parseTriggerResponse(beforeRaw);
     const beforePass =
       (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
     // Check with proposed description
     const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
-    const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
+    const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag, effort);
     const afterTriggered = parseTriggerResponse(afterRaw);
     const afterPass =
       (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
@@ -208,6 +209,7 @@ export async function validateProposalBatched(
   evalSet: EvalEntry[],
   agent: string,
   modelFlag?: string,
+  effort?: EffortLevel,
 ): Promise<ValidationResult> {
   if (evalSet.length === 0) {
     return {
@@ -242,8 +244,8 @@ export async function validateProposalBatched(
     // Run VALIDATION_RUNS times in parallel and majority-vote to reduce LLM variance
     const allCalls: Promise<string>[] = [];
     for (let r = 0; r < VALIDATION_RUNS; r++) {
-      allCalls.push(callLlm(systemPrompt, beforePrompt, agent, modelFlag));
-      allCalls.push(callLlm(systemPrompt, afterPrompt, agent, modelFlag));
+      allCalls.push(callLlm(systemPrompt, beforePrompt, agent, modelFlag, effort));
+      allCalls.push(callLlm(systemPrompt, afterPrompt, agent, modelFlag, effort));
     }
     const allRaw = await Promise.all(allCalls);
@@ -353,6 +355,7 @@ export async function validateProposal(
   evalSet: EvalEntry[],
   agent: string,
   modelFlag?: string,
+  effort?: EffortLevel,
 ): Promise<ValidationResult> {
-  return validateProposalBatched(proposal, evalSet, agent, modelFlag);
+  return validateProposalBatched(proposal, evalSet, agent, modelFlag, effort);
 }

package/cli/selftune/export.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
  * Export SQLite data to JSONL format.
- * Replaces the removed JSONL write path -- use this when you need
- * JSONL files for debugging, the contribute workflow, or external tools.
+ * Use this only when you explicitly need portable/debuggable JSONL snapshots
+ * for recovery, the contribute workflow, or external tools.
  */
 import { mkdirSync, writeFileSync } from "node:fs";
 import { join } from "node:path";

package/cli/selftune/grading/grade-session.ts CHANGED Viewed

@@ -26,6 +26,7 @@ import type {
   GradingExpectation,
   GradingResult,
   SessionTelemetryRecord,
+  SessionType,
   SkillUsageRecord,
 } from "../types.js";
 import { CLIError, handleCLIError } from "../utils/cli-error.js";
@@ -420,6 +421,8 @@ export function buildExecutionMetrics(telemetry: SessionTelemetryRecord): Execut
     errors_encountered: telemetry.errors_encountered ?? 0,
     skills_triggered: telemetry.skills_triggered ?? [],
     transcript_chars: telemetry.transcript_chars ?? 0,
+    artifact_count: telemetry.artifact_count,
+    session_type: telemetry.session_type,
   };
 }
@@ -481,13 +484,30 @@ export function buildGradingPrompt(
       ? transcriptExcerpt.slice(0, MAX_TRANSCRIPT_LENGTH)
       : transcriptExcerpt;
+  const sessionType: SessionType = (telemetry.session_type as SessionType) ?? "mixed";
+  const SESSION_TYPE_CONTEXT: Record<SessionType, string> = {
+    dev: "This is a development session — code output and commits are expected productivity signals.",
+    research:
+      "This is a research session — information gathering and synthesis are the primary outputs, not code changes.",
+    content:
+      "This is a content/writing session — document creation is the primary output, not code commits.",
+    mixed:
+      "This is a mixed session — evaluate based on what was actually accomplished, not code-specific metrics.",
+  };
+  const sessionTypeContext = SESSION_TYPE_CONTEXT[sessionType] ?? SESSION_TYPE_CONTEXT.mixed;
   return `Skill: ${skillName}
+=== SESSION CONTEXT ===
+Session type: ${sessionType}
+${sessionTypeContext}
 === PROCESS TELEMETRY ===
 Skills triggered: ${JSON.stringify(telemetry.skills_triggered ?? [])}
 Assistant turns: ${telemetry.assistant_turns ?? "?"}
 Errors: ${telemetry.errors_encountered ?? "?"}
 Total tool calls: ${telemetry.total_tool_calls ?? "?"}
+Artifacts produced: ${telemetry.artifact_count ?? "?"}
 Tool breakdown:
 ${toolSummary}