selftune 0.2.15 → 0.2.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +24 -19
  2. package/bin/run-hook.cjs +36 -0
  3. package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
  4. package/cli/selftune/alpha-upload/client.ts +51 -1
  5. package/cli/selftune/alpha-upload/flush.ts +46 -5
  6. package/cli/selftune/alpha-upload/stage-canonical.ts +25 -4
  7. package/cli/selftune/alpha-upload-contract.ts +9 -0
  8. package/cli/selftune/constants.ts +82 -5
  9. package/cli/selftune/contribute/sanitize.ts +52 -5
  10. package/cli/selftune/dashboard-contract.ts +100 -0
  11. package/cli/selftune/dashboard-server.ts +2 -2
  12. package/cli/selftune/evolution/description-quality.ts +12 -11
  13. package/cli/selftune/evolution/evolve.ts +238 -53
  14. package/cli/selftune/evolution/unblock-suggestions.ts +159 -0
  15. package/cli/selftune/evolution/validate-proposal.ts +9 -6
  16. package/cli/selftune/grading/grade-session.ts +20 -0
  17. package/cli/selftune/hooks/commit-track.ts +188 -0
  18. package/cli/selftune/hooks/prompt-log.ts +10 -1
  19. package/cli/selftune/hooks/session-stop.ts +2 -2
  20. package/cli/selftune/hooks/skill-eval.ts +15 -1
  21. package/cli/selftune/hooks/stdin-preview.ts +32 -0
  22. package/cli/selftune/init.ts +198 -27
  23. package/cli/selftune/localdb/direct-write.ts +69 -6
  24. package/cli/selftune/localdb/queries.ts +552 -7
  25. package/cli/selftune/localdb/schema.ts +46 -0
  26. package/cli/selftune/orchestrate.ts +32 -4
  27. package/cli/selftune/routes/overview.ts +41 -3
  28. package/cli/selftune/routes/skill-report.ts +88 -17
  29. package/cli/selftune/types.ts +32 -0
  30. package/cli/selftune/utils/hooks.ts +12 -2
  31. package/cli/selftune/utils/transcript.ts +210 -1
  32. package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
  33. package/package.json +1 -1
  34. package/packages/telemetry-contract/src/types.ts +11 -0
  35. package/skill/SKILL.md +29 -1
  36. package/skill/Workflows/AutoActivation.md +1 -1
  37. package/skill/Workflows/Evolve.md +31 -13
  38. package/skill/Workflows/ExportCanonical.md +121 -0
  39. package/skill/Workflows/Hook.md +131 -0
  40. package/skill/Workflows/Initialize.md +9 -8
  41. package/skill/Workflows/Orchestrate.md +27 -5
  42. package/skill/Workflows/Quickstart.md +94 -0
  43. package/skill/Workflows/RepairSkillUsage.md +87 -0
  44. package/skill/Workflows/Uninstall.md +82 -0
  45. package/skill/settings_snippet.json +19 -8
@@ -38,6 +38,7 @@ import type {
38
38
  } from "../types.js";
39
39
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
40
40
  import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
41
+ import type { EffortLevel } from "../utils/llm-call.js";
41
42
  import { createEvolveTUI } from "../utils/tui.js";
42
43
  import { appendAuditEntry } from "./audit.js";
43
44
  import { checkConstitution } from "./constitutional.js";
@@ -51,6 +52,8 @@ import {
51
52
  selectFromFrontier,
52
53
  } from "./pareto.js";
53
54
  import { generateMultipleProposals, generateProposal } from "./propose-description.js";
55
+ import { evaluateStoppingCriteria } from "./stopping-criteria.js";
56
+ import { buildUnblockSuggestions } from "./unblock-suggestions.js";
54
57
  import type { ValidationResult } from "./validate-proposal.js";
55
58
  import {
56
59
  TRIGGER_CHECK_BATCH_SIZE,
@@ -79,7 +82,9 @@ export interface EvolveOptions {
79
82
  validationModel?: string;
80
83
  cheapLoop?: boolean;
81
84
  gateModel?: string;
85
+ gateEffort?: EffortLevel;
82
86
  proposalModel?: string;
87
+ adaptiveGate?: boolean;
83
88
  syncFirst?: boolean;
84
89
  syncForce?: boolean;
85
90
  }
@@ -173,6 +178,73 @@ function formatSimpleDiff(oldText: string, newText: string): string {
173
178
  return output.join("\n");
174
179
  }
175
180
 
181
/**
 * Estimate the number of LLM calls made by one validation pass.
 *
 * The eval set is divided into batches of `TRIGGER_CHECK_BATCH_SIZE`; each
 * batch is counted twice (presumably once per description being compared —
 * confirm against the validator) and the total is scaled by `VALIDATION_RUNS`.
 *
 * @param evalSetSize Number of entries in the eval set.
 * @returns The estimated call count; `0` for an empty eval set.
 */
function countValidationLlmCalls(evalSetSize: number): number {
  if (evalSetSize === 0) {
    return 0;
  }
  const batchTotal = Math.ceil(evalSetSize / TRIGGER_CHECK_BATCH_SIZE);
  const callsPerRun = batchTotal * 2;
  return callsPerRun * VALIDATION_RUNS;
}
185
+
186
/** How the final gate validation should be run for a given proposal. */
interface GateDecision {
  /** Model identifier passed to the gate validation call. */
  model: string;
  /** Effort level passed to the gate validation call; absent when none was chosen. */
  effort?: EffortLevel;
  /** `name=value` markers describing why the proposal looked risky (may be empty). */
  riskSignals: string[];
}
191
+
192
/**
 * Count whitespace-delimited words in `text`.
 *
 * A word is any maximal run of non-whitespace characters, so leading,
 * trailing, and repeated whitespace never produce empty words.
 *
 * @param text Arbitrary text, possibly empty or whitespace-only.
 * @returns The number of words; `0` when there are none.
 */
function countWords(text: string): number {
  const words = text.match(/\S+/g);
  return words === null ? 0 : words.length;
}
198
+
199
/**
 * Choose the model and effort for the final gate validation.
 *
 * - Without `options.gateModel` there is no gate, so `undefined` is returned.
 * - Without `options.adaptiveGate` the configured model/effort are returned
 *   unchanged with no risk signals.
 * - With the adaptive gate on, risk signals are collected from the proposal and
 *   its validation result. The gate escalates to `"opus"` with `"high"` effort
 *   (or `"max"` if that was requested) when there are regressions, when
 *   `net_change` is below 0.1, or when at least two signals fired. Otherwise the
 *   configured model/effort are kept and the signals are attached for reporting.
 *
 * @param options Evolve options; reads `gateModel`, `gateEffort`, `adaptiveGate`.
 * @param proposal Proposal under review; reads `proposed_description`, `confidence`.
 * @param validation Loop validation result; reads `net_change`, `regressions`, `after_pass_rate`.
 * @param currentDescription Description the proposal would replace.
 * @param confidenceThreshold Minimum confidence configured for the run.
 * @returns The gate decision, or `undefined` when no gate model is configured.
 */
function resolveGateDecision(
  options: EvolveOptions,
  proposal: EvolutionProposal,
  validation: ValidationResult,
  currentDescription: string,
  confidenceThreshold: number,
): GateDecision | undefined {
  const { gateModel, gateEffort, adaptiveGate } = options;

  if (!gateModel) {
    return undefined;
  }
  if (!adaptiveGate) {
    return { model: gateModel, effort: gateEffort, riskSignals: [] };
  }

  const wordsBefore = countWords(currentDescription);
  const wordsAfter = countWords(proposal.proposed_description);
  // An empty original has no meaningful ratio; treat it as "no growth" and let
  // the absolute word delta below catch large additions.
  const growthRatio = wordsBefore === 0 ? 1 : wordsAfter / wordsBefore;
  const regressionCount = validation.regressions.length;
  const confidenceFloor = Math.max(confidenceThreshold + 0.05, 0.75);

  // Each rule pairs a trigger with a lazily-built label so formatting only
  // happens for signals that actually fired. Order determines output order.
  const rules: Array<[boolean, () => string]> = [
    [regressionCount > 0, () => `regressions=${regressionCount}`],
    [validation.net_change < 0.15, () => `low_lift=${validation.net_change.toFixed(3)}`],
    [proposal.confidence < confidenceFloor, () => `confidence=${proposal.confidence.toFixed(2)}`],
    [
      growthRatio > 1.8 || wordsAfter - wordsBefore > 32,
      () => `word_growth=${growthRatio.toFixed(2)}x`,
    ],
    [
      validation.after_pass_rate < 0.9,
      () => `after_pass_rate=${validation.after_pass_rate.toFixed(2)}`,
    ],
  ];

  const riskSignals: string[] = [];
  for (const [fired, describe] of rules) {
    if (fired) {
      riskSignals.push(describe());
    }
  }

  const escalate = regressionCount > 0 || validation.net_change < 0.1 || riskSignals.length >= 2;
  if (escalate) {
    return {
      model: "opus",
      effort: gateEffort === "max" ? "max" : "high",
      riskSignals,
    };
  }

  return { model: gateModel, effort: gateEffort, riskSignals };
}
247
+
176
248
  // ---------------------------------------------------------------------------
177
249
  // Main orchestrator
178
250
  // ---------------------------------------------------------------------------
@@ -455,7 +527,7 @@ export async function evolve(
455
527
  // -----------------------------------------------------------------------
456
528
  // Pareto multi-candidate path
457
529
  // -----------------------------------------------------------------------
458
- const paretoEnabled = options.paretoEnabled ?? false;
530
+ const paretoEnabled = options.paretoEnabled ?? true;
459
531
  const candidateCount = options.candidateCount ?? 3;
460
532
  const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
461
533
  const telemetryRecords =
@@ -493,6 +565,7 @@ export async function evolve(
493
565
  options.proposalModel,
494
566
  aggregateMetrics,
495
567
  );
568
+ llmCallCount += candidateCount;
496
569
 
497
570
  // Filter by confidence threshold
498
571
  const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
@@ -563,6 +636,7 @@ export async function evolve(
563
636
  agent,
564
637
  options.validationModel,
565
638
  );
639
+ llmCallCount += countValidationLlmCalls(evalSet.length);
566
640
  recordAudit(
567
641
  proposal.proposal_id,
568
642
  "validated",
@@ -627,6 +701,7 @@ export async function evolve(
627
701
  } else {
628
702
  // Standard single-candidate retry loop
629
703
  let feedbackReason = "";
704
+ const previousPassRates: number[] = [];
630
705
 
631
706
  for (let iteration = 0; iteration < maxIterations; iteration++) {
632
707
  iterationsCompleted = iteration + 1;
@@ -680,7 +755,24 @@ export async function evolve(
680
755
  );
681
756
  if (!constitution.passed) {
682
757
  feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`;
683
- recordAudit(proposal.proposal_id, "rejected", feedbackReason);
758
+ // Re-evaluate stopping after a constitutional rejection by treating the
759
+ // last entry in previousPassRates as the currentPassRate (or 0 on the
760
+ // first iteration) and slicing it out of history before calling
761
+ // evaluateStoppingCriteria() with the current iteration/maxIterations,
762
+ // confidenceThreshold, and proposal.confidence.
763
+ const constitutionStop = evaluateStoppingCriteria(
764
+ previousPassRates.at(-1) ?? 0,
765
+ previousPassRates.slice(0, -1),
766
+ iteration + 1,
767
+ maxIterations,
768
+ confidenceThreshold,
769
+ proposal.confidence,
770
+ );
771
+ recordAudit(
772
+ proposal.proposal_id,
773
+ "rejected",
774
+ `${feedbackReason} (stopping: ${constitutionStop.reason})`,
775
+ );
684
776
  recordEvidence({
685
777
  timestamp: new Date().toISOString(),
686
778
  proposal_id: proposal.proposal_id,
@@ -690,54 +782,64 @@ export async function evolve(
690
782
  stage: "rejected",
691
783
  rationale: proposal.rationale,
692
784
  confidence: proposal.confidence,
693
- details: feedbackReason,
785
+ details: `${feedbackReason} (stopping: ${constitutionStop.reason})`,
694
786
  });
695
- if (iteration === maxIterations - 1) {
787
+ if (constitutionStop.shouldStop) {
696
788
  finishTui();
697
789
  return withStats({
698
790
  proposal: lastProposal,
699
791
  validation: null,
700
792
  deployed: false,
701
793
  auditEntries,
702
- reason: feedbackReason,
794
+ reason: `${feedbackReason} (${constitutionStop.reason})`,
703
795
  });
704
796
  }
705
797
  continue;
706
798
  }
707
799
 
708
- // Step 9: Check confidence threshold
709
- if (proposal.confidence < confidenceThreshold) {
710
- feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
711
- recordAudit(
712
- proposal.proposal_id,
713
- "rejected",
714
- `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
800
+ // Step 9: Check confidence threshold via stopping criteria
801
+ {
802
+ const preValidationStop = evaluateStoppingCriteria(
803
+ previousPassRates.at(-1) ?? 0,
804
+ previousPassRates.slice(0, -1),
805
+ iteration + 1,
806
+ maxIterations,
807
+ confidenceThreshold,
808
+ proposal.confidence,
715
809
  );
716
- recordEvidence({
717
- timestamp: new Date().toISOString(),
718
- proposal_id: proposal.proposal_id,
719
- skill_name: skillName,
720
- skill_path: skillPath,
721
- target: "description",
722
- stage: "rejected",
723
- rationale: proposal.rationale,
724
- confidence: proposal.confidence,
725
- details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
726
- });
727
-
728
- // If this is the last iteration, return early with rejection
729
- if (iteration === maxIterations - 1) {
730
- finishTui();
731
- return withStats({
732
- proposal: lastProposal,
733
- validation: null,
734
- deployed: false,
735
- auditEntries,
736
- reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
810
+ if (proposal.confidence < confidenceThreshold) {
811
+ feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
812
+ recordAudit(
813
+ proposal.proposal_id,
814
+ "rejected",
815
+ `${feedbackReason} (stopping: ${preValidationStop.reason})`,
816
+ );
817
+ recordEvidence({
818
+ timestamp: new Date().toISOString(),
819
+ proposal_id: proposal.proposal_id,
820
+ skill_name: skillName,
821
+ skill_path: skillPath,
822
+ target: "description",
823
+ stage: "rejected",
824
+ rationale: proposal.rationale,
825
+ confidence: proposal.confidence,
826
+ details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
737
827
  });
738
- }
739
828
 
740
- continue;
829
+ // Use stopping criteria to decide whether to return or retry
830
+ if (preValidationStop.shouldStop) {
831
+ finishTui();
832
+ return withStats({
833
+ proposal: lastProposal,
834
+ validation: null,
835
+ deployed: false,
836
+ auditEntries,
837
+ reason: `${feedbackReason} (${preValidationStop.reason})`,
838
+ });
839
+ }
840
+
841
+ continue;
842
+ }
741
843
  }
742
844
 
743
845
  // Step 10: Validate against eval set
@@ -752,7 +854,7 @@ export async function evolve(
752
854
  options.validationModel,
753
855
  );
754
856
  lastValidation = validation;
755
- llmCallCount += batchCount * 2 * VALIDATION_RUNS;
857
+ llmCallCount += countValidationLlmCalls(evalSet.length);
756
858
  tui.done(
757
859
  `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
758
860
  );
@@ -791,13 +893,23 @@ export async function evolve(
791
893
  },
792
894
  });
793
895
 
794
- // Step 12: Check validation result
896
+ // Step 12: Evaluate stopping criteria after validation
897
+ const stopping = evaluateStoppingCriteria(
898
+ validation.after_pass_rate,
899
+ previousPassRates,
900
+ iteration + 1,
901
+ maxIterations,
902
+ confidenceThreshold,
903
+ proposal.confidence,
904
+ );
905
+ previousPassRates.push(validation.after_pass_rate);
906
+
795
907
  if (!validation.improved) {
796
908
  feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
797
909
  recordAudit(
798
910
  proposal.proposal_id,
799
911
  "rejected",
800
- `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
912
+ `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
801
913
  );
802
914
  recordEvidence({
803
915
  timestamp: new Date().toISOString(),
@@ -808,7 +920,7 @@ export async function evolve(
808
920
  stage: "rejected",
809
921
  rationale: proposal.rationale,
810
922
  confidence: proposal.confidence,
811
- details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
923
+ details: `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
812
924
  validation: {
813
925
  improved: validation.improved,
814
926
  before_pass_rate: validation.before_pass_rate,
@@ -820,21 +932,26 @@ export async function evolve(
820
932
  },
821
933
  });
822
934
 
823
- // If this is the last iteration, return with rejection
824
- if (iteration === maxIterations - 1) {
935
+ // Use stopping criteria to decide whether to return or retry
936
+ if (stopping.shouldStop) {
825
937
  finishTui();
826
938
  return withStats({
827
939
  proposal: lastProposal,
828
940
  validation: lastValidation,
829
941
  deployed: false,
830
942
  auditEntries,
831
- reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
943
+ reason: `Validation failed (${stopping.reason}): net_change=${validation.net_change.toFixed(3)}`,
832
944
  });
833
945
  }
834
946
 
835
947
  continue;
836
948
  }
837
949
 
950
+ // Validation passed — check if converged or continue
951
+ if (stopping.shouldStop && stopping.reason.includes("Converged")) {
952
+ recordAudit(proposal.proposal_id, "validated", `Stopping early: ${stopping.reason}`);
953
+ }
954
+
838
955
  // Validation passed - break out of retry loop
839
956
  break;
840
957
  }
@@ -915,18 +1032,39 @@ export async function evolve(
915
1032
  // -----------------------------------------------------------------------
916
1033
  let gateValidation: ValidationResult | undefined;
917
1034
  if (options.gateModel && lastProposal && lastValidation?.improved) {
918
- tui.step(`Gate validation (${options.gateModel})...`);
919
- gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
920
- llmCallCount++;
1035
+ const gateDecision = resolveGateDecision(
1036
+ options,
1037
+ lastProposal,
1038
+ lastValidation,
1039
+ currentDescription,
1040
+ confidenceThreshold,
1041
+ );
1042
+ const gateLabel = gateDecision?.effort
1043
+ ? `${gateDecision.model}, effort=${gateDecision.effort}`
1044
+ : (gateDecision?.model ?? options.gateModel);
1045
+ tui.step(`Gate validation (${gateLabel})...`);
1046
+ gateValidation = await _gateValidateProposal(
1047
+ lastProposal,
1048
+ evalSet,
1049
+ agent,
1050
+ gateDecision?.model ?? options.gateModel,
1051
+ gateDecision?.effort,
1052
+ );
1053
+ llmCallCount += countValidationLlmCalls(evalSet.length);
921
1054
  tui.done(
922
- `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
1055
+ `Gate (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
923
1056
  );
924
1057
 
1058
+ const gatePrefix =
1059
+ gateDecision && gateDecision.riskSignals.length > 0
1060
+ ? `Adaptive gate [${gateDecision.riskSignals.join(", ")}]`
1061
+ : "Gate validation";
1062
+
925
1063
  if (!gateValidation.improved) {
926
1064
  recordAudit(
927
1065
  lastProposal.proposal_id,
928
1066
  "rejected",
929
- `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1067
+ `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
930
1068
  );
931
1069
  recordEvidence({
932
1070
  timestamp: new Date().toISOString(),
@@ -937,7 +1075,7 @@ export async function evolve(
937
1075
  stage: "rejected",
938
1076
  rationale: lastProposal.rationale,
939
1077
  confidence: lastProposal.confidence,
940
- details: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1078
+ details: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
941
1079
  validation: {
942
1080
  improved: gateValidation.improved,
943
1081
  before_pass_rate: gateValidation.before_pass_rate,
@@ -954,7 +1092,7 @@ export async function evolve(
954
1092
  validation: lastValidation,
955
1093
  deployed: false,
956
1094
  auditEntries,
957
- reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1095
+ reason: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
958
1096
  gateValidation,
959
1097
  ...(baselineResult ? { baselineResult } : {}),
960
1098
  });
@@ -963,7 +1101,7 @@ export async function evolve(
963
1101
  recordAudit(
964
1102
  lastProposal.proposal_id,
965
1103
  "validated",
966
- `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
1104
+ `${gatePrefix} (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
967
1105
  );
968
1106
  }
969
1107
 
@@ -1081,7 +1219,7 @@ export async function cliMain(): Promise<void> {
1081
1219
  "dry-run": { type: "boolean", default: false },
1082
1220
  confidence: { type: "string", default: "0.6" },
1083
1221
  "max-iterations": { type: "string", default: "3" },
1084
- pareto: { type: "boolean", default: false },
1222
+ pareto: { type: "boolean", default: true },
1085
1223
  candidates: { type: "string", default: "3" },
1086
1224
  "token-efficiency": { type: "boolean", default: false },
1087
1225
  "with-baseline": { type: "boolean", default: false },
@@ -1089,7 +1227,9 @@ export async function cliMain(): Promise<void> {
1089
1227
  "cheap-loop": { type: "boolean", default: true },
1090
1228
  "full-model": { type: "boolean", default: false },
1091
1229
  "gate-model": { type: "string" },
1230
+ "gate-effort": { type: "string" },
1092
1231
  "proposal-model": { type: "string" },
1232
+ "adaptive-gate": { type: "boolean", default: false },
1093
1233
  "sync-first": { type: "boolean", default: false },
1094
1234
  "sync-force": { type: "boolean", default: false },
1095
1235
  verbose: { type: "boolean", default: false },
@@ -1120,6 +1260,8 @@ Options:
1120
1260
  --cheap-loop Use cheap models for loop, expensive for gate (default: on)
1121
1261
  --full-model Use same model for all stages (disables cheap-loop)
1122
1262
  --gate-model Model for final gate validation (default: sonnet)
1263
+ --gate-effort Thinking effort for final gate (low|medium|high|max)
1264
+ --adaptive-gate Escalate risky gate checks to opus + high effort
1123
1265
  --proposal-model Model for proposal generation LLM calls
1124
1266
  --sync-first Refresh source-truth telemetry before building evals/failure patterns
1125
1267
  --sync-force Force a full rescan during --sync-first
@@ -1142,6 +1284,24 @@ Options:
1142
1284
  "Add --sync-first when using --sync-force",
1143
1285
  );
1144
1286
  }
1287
+ if (values["gate-effort"] && !["low", "medium", "high", "max"].includes(values["gate-effort"])) {
1288
+ throw new CLIError(
1289
+ `Invalid --gate-effort value: ${values["gate-effort"]}`,
1290
+ "INVALID_FLAG",
1291
+ "Use one of: low, medium, high, max",
1292
+ );
1293
+ }
1294
+ if (
1295
+ (values["gate-effort"] || values["adaptive-gate"]) &&
1296
+ (values["full-model"] ?? false) &&
1297
+ !values["gate-model"]
1298
+ ) {
1299
+ throw new CLIError(
1300
+ "--gate-effort and --adaptive-gate require --gate-model when --full-model is set",
1301
+ "INVALID_FLAG",
1302
+ "Add --gate-model <model> or drop --full-model",
1303
+ );
1304
+ }
1145
1305
 
1146
1306
  const { detectAgent } = await import("../utils/llm-call.js");
1147
1307
  const requestedAgent = values.agent;
@@ -1222,6 +1382,8 @@ Options:
1222
1382
  console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
1223
1383
  console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
1224
1384
  console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
1385
+ console.error(`[verbose] Adaptive gate: ${values["adaptive-gate"] ?? false}`);
1386
+ console.error(`[verbose] Gate effort: ${values["gate-effort"] ?? "(default)"}`);
1225
1387
  }
1226
1388
 
1227
1389
  const result = await evolve({
@@ -1240,7 +1402,9 @@ Options:
1240
1402
  validationModel: values["validation-model"],
1241
1403
  cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
1242
1404
  gateModel: values["gate-model"],
1405
+ gateEffort: values["gate-effort"] as EffortLevel | undefined,
1243
1406
  proposalModel: values["proposal-model"],
1407
+ adaptiveGate: values["adaptive-gate"] ?? false,
1244
1408
  gradingResults,
1245
1409
  syncFirst: values["sync-first"] ?? false,
1246
1410
  syncForce: values["sync-force"] ?? false,
@@ -1272,11 +1436,16 @@ Options:
1272
1436
  ...(result.descriptionQualityAfter != null
1273
1437
  ? { description_quality_after: result.descriptionQualityAfter }
1274
1438
  : {}),
1439
+ ...(!result.deployed
1440
+ ? {
1441
+ suggestions: buildUnblockSuggestions(result, values.skill),
1442
+ }
1443
+ : {}),
1275
1444
  };
1276
1445
  console.log(JSON.stringify(summary, null, 2));
1277
1446
  }
1278
1447
 
1279
- // Print human-readable status to stderr so users always see outcome
1448
+ // Print human-readable status to stderr so agents always see outcome + next steps
1280
1449
  if (!result.deployed) {
1281
1450
  console.error(`\n[NOT DEPLOYED] ${result.reason}`);
1282
1451
  if (result.validation && !result.validation.improved) {
@@ -1295,9 +1464,25 @@ Options:
1295
1464
  ` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
1296
1465
  );
1297
1466
  }
1298
- console.error(" Re-run with --verbose for full diagnostic output.");
1467
+ // Targeted suggestions based on specific failure reason
1468
+ const suggestions = buildUnblockSuggestions(result, values.skill);
1469
+ if (suggestions.length > 0) {
1470
+ console.error("\n Next steps:");
1471
+ for (const s of suggestions) {
1472
+ console.error(` → ${s}`);
1473
+ }
1474
+ }
1299
1475
  } else {
1300
1476
  console.error(`\n[DEPLOYED] ${result.reason}`);
1477
+ // Show quality improvement if available
1478
+ if (result.descriptionQualityBefore != null && result.descriptionQualityAfter != null) {
1479
+ const delta = result.descriptionQualityAfter - result.descriptionQualityBefore;
1480
+ if (delta !== 0) {
1481
+ console.error(
1482
+ ` Description quality: ${Math.round(result.descriptionQualityBefore * 100)}% → ${Math.round(result.descriptionQualityAfter * 100)}% (${delta >= 0 ? "+" : ""}${Math.round(delta * 100)}%)`,
1483
+ );
1484
+ }
1485
+ }
1301
1486
  }
1302
1487
 
1303
1488
  process.exit(result.deployed ? 0 : 1);
@@ -0,0 +1,159 @@
1
+ /**
2
+ * unblock-suggestions.ts
3
+ *
4
+ * Generates targeted, per-failure-reason suggestions when evolve doesn't deploy.
5
+ * Each suggestion is a concrete next CLI command or manual action that helps the
6
+ * agent (or user) unblock the evolution pipeline.
7
+ *
8
+ * Pure function — no I/O, no LLM calls. Depends only on EvolveResult fields and
9
+ * the scoreDescription heuristic.
10
+ */
11
+
12
+ import { scoreDescription } from "./description-quality.js";
13
+ import type { EvolveResult } from "./evolve.js";
14
+
15
+ // ---------------------------------------------------------------------------
16
+ // Quality hint helper
17
+ // ---------------------------------------------------------------------------
18
+
19
/**
 * Push a single description-quality hint onto `suggestions` when the
 * description scores poorly.
 *
 * Nothing is added when `descriptionText` is empty (no proposal was
 * generated), when the composite score is 0.7 or higher, or when no individual
 * criterion falls below its own cutoff.
 *
 * @param suggestions List to append to; mutated in place.
 * @param descriptionText Description to score.
 * @param skillName Skill name forwarded to the scorer.
 */
function appendQualityHints(
  suggestions: string[],
  descriptionText: string,
  skillName: string,
): void {
  if (!descriptionText) return;

  const { composite, criteria } = scoreDescription(descriptionText, skillName);
  if (composite >= 0.7) return;

  // [criterion score, cutoff below which it counts as weak, advice]
  const checks: Array<[number, number, string]> = [
    [criteria.trigger_context, 0.5, "add when/if/after trigger context"],
    [criteria.vagueness, 0.7, "remove vague words (various, general, etc)"],
    [criteria.specificity, 0.5, "add concrete action verbs"],
    [criteria.length, 0.7, "adjust length (ideal: 80-300 chars)"],
    [criteria.not_just_name, 0.5, "differentiate from skill name"],
  ];

  const weak = checks.filter(([value, cutoff]) => value < cutoff).map(([, , advice]) => advice);
  if (weak.length === 0) return;

  suggestions.push(
    `Description quality: ${Math.round(composite * 100)}% — improve by: ${weak.join(", ")}`,
  );
}
46
+
47
+ // ---------------------------------------------------------------------------
48
+ // Main suggestion builder
49
+ // ---------------------------------------------------------------------------
50
+
51
/**
 * Generate targeted next-step suggestions for an evolve run that did not deploy.
 *
 * The failure is classified from `result.reason`. Each suggestion is a concrete
 * CLI command or manual action. Unrecognized reasons yield an empty list.
 *
 * Matching notes:
 * - Validation failures are recognized both in the legacy form
 *   (`Validation failed after N iterations: …`) and in the form that embeds a
 *   stopping reason (`Validation failed (<stopping reason>): …`).
 * - Gate failures are recognized for both the plain prefix
 *   (`Gate validation failed …`) and the adaptive prefix
 *   (`Adaptive gate [signals] failed …`).
 * - Validation and constitutional reasons may carry an embedded stopping
 *   reason. They are classified by their leading prefix so that any mention of
 *   a confidence threshold inside that suffix cannot route them to the
 *   confidence advice.
 *
 * @param result Outcome of the evolve run; reads `reason`, `proposal`,
 *   `validation` and `descriptionQualityBefore`.
 * @param skillName Skill name interpolated into suggested commands.
 * @returns Suggestions in display order (possibly empty).
 */
export function buildUnblockSuggestions(result: EvolveResult, skillName: string): string[] {
  const reason = result.reason;
  const suggestions: string[] = [];
  const descText = result.proposal?.original_description ?? "";

  // Quality hints are a no-op without a description, so skip the scorer entirely.
  const addQualityHints = (): void => {
    if (descText) appendQualityHints(suggestions, descText, skillName);
  };

  const isValidationFailure =
    reason.startsWith("Validation failed") || reason.includes("Validation failed after");
  const isConstitutionalRejection = reason.startsWith("Constitutional");
  const isGateFailure =
    reason.includes("Gate validation failed") || /^Adaptive gate \[[^\]]*\] failed/.test(reason);

  // --- Path/config failures ---
  if (reason.includes("SKILL.md not found")) {
    suggestions.push("Verify the --skill-path flag points to a valid SKILL.md file");
    suggestions.push("Run: selftune init (to re-bootstrap config if paths changed)");
    return suggestions;
  }

  if (reason.includes("Failed to load eval set") || reason.includes("not a JSON array")) {
    suggestions.push("Run: selftune sync (to rebuild source-truth telemetry)");
    suggestions.push(`Then: selftune evolve --skill ${skillName} (to retry with fresh evals)`);
    return suggestions;
  }

  // --- No signal failures ---
  if (reason.includes("No failure patterns found")) {
    suggestions.push("This skill may already be routing well — check: selftune status");
    suggestions.push("If undertriggering, add more sessions so evolve has signal to work with");
    if (result.descriptionQualityBefore != null && result.descriptionQualityBefore < 0.5) {
      suggestions.push(
        `Description quality is ${Math.round(result.descriptionQualityBefore * 100)}% — manually improving the description may help generate patterns`,
      );
      addQualityHints();
    }
    return suggestions;
  }

  // --- Confidence failures (specific before general) ---
  if (reason.includes("No candidates met confidence")) {
    suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
    suggestions.push(
      `Or increase candidates: selftune evolve --skill ${skillName} --pareto --candidates 5`,
    );
    addQualityHints();
    return suggestions;
  }
  if (
    !isValidationFailure &&
    !isConstitutionalRejection &&
    reason.toLowerCase().includes("confidence") &&
    reason.includes("threshold")
  ) {
    suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
    suggestions.push("Or add more eval entries so the LLM has more context for proposals");
    addQualityHints();
    return suggestions;
  }

  // --- Validation failures (proposals regressed) ---
  if (isValidationFailure) {
    suggestions.push(
      `The eval set may be contradictory — review with: selftune evolve --skill ${skillName} --verbose`,
    );
    suggestions.push(
      `Try: selftune evolve --skill ${skillName} --pareto --candidates 5 (more diverse proposals)`,
    );
    if (result.validation && result.validation.regressions.length > 0) {
      suggestions.push(
        `${result.validation.regressions.length} regressions detected — check if negative eval entries are too broad`,
      );
    }
    addQualityHints();
    return suggestions;
  }
  if (reason.includes("No Pareto candidates improved")) {
    suggestions.push("All candidates regressed — the eval set may need rebalancing");
    suggestions.push(`Try: selftune sync --force && selftune evolve --skill ${skillName}`);
    return suggestions;
  }

  // --- Gate failures ---
  if (reason.includes("Baseline gate failed")) {
    suggestions.push("Improvement was too marginal to justify deployment");
    suggestions.push("Collect more session data, then retry — small gains compound over time");
    return suggestions;
  }
  if (isGateFailure) {
    suggestions.push("The gate model rejected the proposal — it may be too aggressive");
    suggestions.push(
      `Try: selftune evolve --skill ${skillName} --full-model (disables cheap-loop gate)`,
    );
    return suggestions;
  }

  // --- Constitutional rejection ---
  if (isConstitutionalRejection || reason.includes("Constitutional")) {
    suggestions.push("The proposed description violated safety constraints");
    suggestions.push("Review constitutional rules and manually adjust the description if needed");
    return suggestions;
  }

  // --- Dry run (not really a failure) ---
  if (reason.includes("Dry run")) {
    suggestions.push(`Deploy: selftune evolve --skill ${skillName} (remove --dry-run to deploy)`);
    return suggestions;
  }

  // --- Catch-all for unexpected errors ---
  if (reason.includes("Error during evolution")) {
    suggestions.push("Re-run with --verbose for full stack trace");
    suggestions.push("Run: selftune doctor (to check system health)");
    return suggestions;
  }

  return suggestions;
}