selftune 0.2.16 → 0.2.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +24 -19
  2. package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
  3. package/cli/selftune/alpha-upload/client.ts +51 -1
  4. package/cli/selftune/alpha-upload/flush.ts +46 -5
  5. package/cli/selftune/alpha-upload/stage-canonical.ts +25 -4
  6. package/cli/selftune/alpha-upload-contract.ts +9 -0
  7. package/cli/selftune/constants.ts +82 -5
  8. package/cli/selftune/contribute/sanitize.ts +52 -5
  9. package/cli/selftune/dashboard-contract.ts +100 -0
  10. package/cli/selftune/dashboard-server.ts +2 -2
  11. package/cli/selftune/evolution/description-quality.ts +12 -11
  12. package/cli/selftune/evolution/evolve.ts +214 -51
  13. package/cli/selftune/evolution/validate-proposal.ts +9 -6
  14. package/cli/selftune/grading/grade-session.ts +20 -0
  15. package/cli/selftune/hooks/commit-track.ts +188 -0
  16. package/cli/selftune/hooks/prompt-log.ts +10 -1
  17. package/cli/selftune/hooks/session-stop.ts +2 -2
  18. package/cli/selftune/hooks/skill-eval.ts +15 -1
  19. package/cli/selftune/hooks/stdin-preview.ts +32 -0
  20. package/cli/selftune/localdb/direct-write.ts +69 -6
  21. package/cli/selftune/localdb/queries.ts +552 -7
  22. package/cli/selftune/localdb/schema.ts +46 -0
  23. package/cli/selftune/orchestrate.ts +32 -4
  24. package/cli/selftune/routes/overview.ts +41 -3
  25. package/cli/selftune/routes/skill-report.ts +88 -17
  26. package/cli/selftune/types.ts +31 -0
  27. package/cli/selftune/utils/transcript.ts +210 -1
  28. package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
  29. package/package.json +1 -1
  30. package/packages/telemetry-contract/src/types.ts +11 -0
  31. package/skill/SKILL.md +29 -1
  32. package/skill/Workflows/Evolve.md +31 -13
  33. package/skill/Workflows/ExportCanonical.md +121 -0
  34. package/skill/Workflows/Hook.md +131 -0
  35. package/skill/Workflows/Initialize.md +9 -8
  36. package/skill/Workflows/Orchestrate.md +27 -5
  37. package/skill/Workflows/Quickstart.md +94 -0
  38. package/skill/Workflows/RepairSkillUsage.md +87 -0
  39. package/skill/Workflows/Uninstall.md +82 -0
  40. package/skill/settings_snippet.json +11 -0
@@ -1,3 +1,71 @@
1
+ // -- Cursor-based pagination types -------------------------------------------
2
+
3
/**
 * Position marker used for cursor-based pagination.
 *
 * A cursor is a (timestamp, id) pair. This type does not enforce any
 * particular timestamp format; it only requires a string.
 */
export interface PaginationCursor {
  /** Timestamp component of the cursor. */
  timestamp: string;
  /** Identifier component of the cursor; may be a string or a number. */
  id: string | number;
}
7
+
8
/**
 * One page of results produced by a cursor-paginated query.
 *
 * @typeParam T - Element type of the page.
 */
export interface PaginatedResult<T> {
  /** Records contained in this page. */
  items: Array<T>;
  /** Cursor value, or `null` when none is supplied (presumably: no further page — confirm with the producer). */
  next_cursor: null | PaginationCursor;
  /** Flag reported by the producer indicating whether more records exist. */
  has_more: boolean;
}
13
+
14
/**
 * Decode a pagination cursor supplied as a JSON-encoded string.
 *
 * The input is accepted only when it parses to a JSON object that has a
 * string `timestamp` and an `id` that is either a string or a finite number.
 * Every other case (missing/empty value, malformed JSON, wrong shape) yields
 * `null`, so callers can treat an invalid cursor as "no cursor".
 *
 * @param value - Raw parameter value, e.g. from `URLSearchParams.get()`.
 * @returns The decoded cursor, or `null` when the input is absent or invalid.
 */
export function parseCursorParam(value: string | null | undefined): PaginationCursor | null {
  if (!value) return null;

  let decoded: unknown;
  try {
    decoded = JSON.parse(value);
  } catch {
    // Malformed JSON is treated the same as an absent cursor.
    return null;
  }

  // Reject primitives and null before probing for keys.
  if (!decoded || typeof decoded !== "object") return null;
  if (!("timestamp" in decoded) || !("id" in decoded)) return null;

  const { timestamp, id } = decoded as { timestamp: unknown; id: unknown };
  if (typeof timestamp !== "string") return null;

  if (typeof id === "string") return { timestamp, id };
  // Non-finite numbers cannot come out of JSON.parse, but the check keeps the contract explicit.
  if (typeof id === "number" && Number.isFinite(id)) return { timestamp, id };

  return null;
}
33
+
34
/**
 * Parse an integer query parameter and clamp it into `[min, max]`.
 *
 * Parsing uses `Number.parseInt(value, 10)`, so leading whitespace and
 * trailing non-digit characters are tolerated ("12abc" parses as 12, "1.9"
 * as 1). When the value is missing or does not start with an integer,
 * `defaultValue` is returned as-is — it is NOT clamped.
 *
 * @param value - Raw parameter value, e.g. from `URLSearchParams.get()`.
 * @param defaultValue - Returned unchanged when `value` is null/undefined or not numeric.
 * @param min - Inclusive lower bound applied to a parsed value (default 1).
 * @param max - Inclusive upper bound applied to a parsed value (default 10000).
 * @returns The parsed value clamped into `[min, max]`, or `defaultValue`.
 */
export function parseIntParam(
  value: string | null | undefined,
  defaultValue: number,
  min = 1,
  max = 10000,
): number {
  if (value == null) return defaultValue;
  const parsed = Number.parseInt(value, 10);
  if (Number.isNaN(parsed)) return defaultValue;
  // Upper bound first, then lower bound, so `min` wins if the bounds are inverted.
  return Math.max(min, Math.min(parsed, max));
}
40
+
41
+ // -- Paginated overview payload (returned when cursor params are provided) ----
42
+
43
/**
 * Overview response shape that carries telemetry and skill-usage records as
 * cursor-paginated pages alongside the remaining overview data.
 */
export interface OverviewPaginatedPayload {
  /** One page of telemetry records. */
  telemetry_page: PaginatedResult<TelemetryRecord>;
  /** One page of skill-usage records. */
  skills_page: PaginatedResult<SkillUsageRecord>;
  /** Evolution entries (not paginated in this payload). */
  evolution: Array<EvolutionEntry>;
  /** Same shape as the `counts` member of `OverviewPayload`. */
  counts: OverviewPayload["counts"];
  /** Unmatched queries (not paginated in this payload). */
  unmatched_queries: Array<UnmatchedQuery>;
  /** Pending proposals (not paginated in this payload). */
  pending_proposals: Array<PendingProposal>;
  /** Number of active sessions. */
  active_sessions: number;
  /** Recent activity items (not paginated in this payload). */
  recent_activity: Array<RecentActivityItem>;
}
53
+
54
/**
 * Skill report shape in which the `recent_invocations` member of
 * `SkillReportPayload` is replaced by a cursor-paginated `invocations_page`.
 */
export interface SkillReportPaginatedPayload extends Omit<SkillReportPayload, "recent_invocations"> {
  /** One page of invocation rows for the skill. */
  invocations_page: PaginatedResult<{
    /** Timestamp of the invocation. */
    timestamp: string;
    /** Session the invocation belongs to. */
    session_id: string;
    /** Query text associated with the invocation. */
    query: string;
    /** Whether the skill was triggered. */
    triggered: boolean;
    /** Source label, or `null` when none is recorded. */
    source: null | string;
  }>;
}
66
+
67
+ // -- Core record types -------------------------------------------------------
68
+
1
69
  export interface TelemetryRecord {
2
70
  timestamp: string;
3
71
  session_id: string;
@@ -220,6 +288,36 @@ export interface HealthResponse {
220
288
  // -- Doctor / health check types ----------------------------------------------
221
289
  export type { DoctorResult, HealthCheck, HealthStatus } from "./types.js";
222
290
 
291
+ // -- Execution metrics (aggregated from execution_facts enrichment columns) ---
292
+
293
/**
 * Aggregated execution metrics (per the section header: derived from the
 * `execution_facts` enrichment columns).
 *
 * NOTE(review): per-field meanings below are taken from the field names;
 * confirm units and aggregation windows against the producing query.
 */
export interface ExecutionMetrics {
  /** Average number of files changed. */
  avg_files_changed: number;
  /** Total lines added. */
  total_lines_added: number;
  /** Total lines removed. */
  total_lines_removed: number;
  /** Total cost in USD. */
  total_cost_usd: number;
  /** Average cost in USD. */
  avg_cost_usd: number;
  /** Total cached input tokens. */
  cached_input_tokens_total: number;
  /** Total reasoning output tokens. */
  reasoning_output_tokens_total: number;
  /** Number of artifacts. */
  artifact_count: number;
  /** Count keyed by session type. */
  session_type_distribution: Record<string, number>;
}
304
+
305
+ // -- Commit summary (aggregated from commit_tracking table) -------------------
306
+
307
/**
 * A single tracked commit (per the section header: sourced from the
 * `commit_tracking` table). Only the SHA and timestamp are always present;
 * title, branch and remote may be `null`.
 */
export interface CommitRecord {
  /** Commit SHA. */
  commit_sha: string;
  /** Commit title, or `null` when not recorded. */
  commit_title: null | string;
  /** Branch name, or `null` when not recorded. */
  branch: null | string;
  /** Repository remote, or `null` when not recorded. */
  repo_remote: null | string;
  /** Timestamp associated with the commit record. */
  timestamp: string;
}
314
+
315
/**
 * Aggregate view over tracked commits: totals plus a list of recent commits.
 *
 * NOTE(review): entries in `recent_commits` declare `title` and `branch` as
 * non-nullable strings, whereas `CommitRecord` allows `null` for the
 * corresponding fields — confirm the producer substitutes a value for nulls.
 */
export interface CommitSummary {
  /** Total number of commits. */
  total_commits: number;
  /** Number of distinct branches. */
  unique_branches: number;
  /** Recent commits in abbreviated form. */
  recent_commits: { sha: string; title: string; branch: string; timestamp: string }[];
}
320
+
223
321
  export interface SkillReportResponse extends SkillReportPayload {
224
322
  evolution: EvolutionEntry[];
225
323
  pending_proposals: PendingProposal[];
@@ -242,6 +340,8 @@ export interface SkillReportResponse extends SkillReportPayload {
242
340
  };
243
341
  prompt_samples: PromptSample[];
244
342
  session_metadata: SessionMeta[];
343
+ execution_metrics?: ExecutionMetrics | null;
344
+ commit_summary?: CommitSummary | null;
245
345
  description_quality?: {
246
346
  composite: number;
247
347
  criteria: {
@@ -448,7 +448,7 @@ export async function startDashboardServer(
448
448
  );
449
449
  }
450
450
  refreshV2Data();
451
- return withCors(handleOverview(db, selftuneVersion));
451
+ return withCors(handleOverview(db, selftuneVersion, url.searchParams));
452
452
  }
453
453
 
454
454
  // ---- GET /api/v2/orchestrate-runs ----
@@ -495,7 +495,7 @@ export async function startDashboardServer(
495
495
  );
496
496
  }
497
497
  refreshV2Data();
498
- return withCors(handleSkillReport(db, skillName));
498
+ return withCors(handleSkillReport(db, skillName, url.searchParams));
499
499
  }
500
500
 
501
501
  // ---- SPA fallback ----
@@ -139,27 +139,27 @@ export function scoreLengthCriterion(description: string): number {
139
139
  }
140
140
 
141
141
  /** Score presence of trigger context words (when/if/before/after etc). */
142
- export function scoreTriggerContextCriterion(description: string): number {
143
- const matches = countWordMatches(description.toLowerCase(), TRIGGER_PATTERNS);
142
+ export function scoreTriggerContextCriterion(description: string, lower?: string): number {
143
+ const matches = countWordMatches(lower ?? description.toLowerCase(), TRIGGER_PATTERNS);
144
144
  if (matches === 0) return 0.0;
145
145
  if (matches === 1) return 0.7;
146
146
  return Math.min(1.0, 0.7 + 0.15 * (matches - 1));
147
147
  }
148
148
 
149
149
  /** Score absence of vague words (lower is worse). */
150
- export function scoreVaguenessCriterion(description: string): number {
151
- const matches = countWordMatches(description.toLowerCase(), VAGUE_PATTERNS);
150
+ export function scoreVaguenessCriterion(description: string, lower?: string): number {
151
+ const matches = countWordMatches(lower ?? description.toLowerCase(), VAGUE_PATTERNS);
152
152
  if (matches === 0) return 1.0;
153
153
  if (matches === 1) return 0.6;
154
154
  return Math.max(0.1, 0.6 - 0.15 * (matches - 1));
155
155
  }
156
156
 
157
157
  /** Score whether description specifies at least one concrete action or domain. */
158
- export function scoreSpecificityCriterion(description: string): number {
159
- const lower = description.toLowerCase();
160
- const hasAction = ACTION_PATTERNS.some((p) => p.test(lower));
158
+ export function scoreSpecificityCriterion(description: string, lower?: string): number {
159
+ const l = lower ?? description.toLowerCase();
160
+ const hasAction = ACTION_PATTERNS.some((p) => p.test(l));
161
161
 
162
- const fillerCount = FILLER_PHRASES.filter((f) => lower.includes(f)).length;
162
+ const fillerCount = FILLER_PHRASES.filter((f) => l.includes(f)).length;
163
163
  const words = description.split(/\s+/).length;
164
164
  const fillerRatio = fillerCount > 0 ? fillerCount / Math.max(1, words / 10) : 0;
165
165
 
@@ -204,11 +204,12 @@ const WEIGHTS = {
204
204
  * Pure function — no I/O, no LLM calls.
205
205
  */
206
206
  export function scoreDescription(description: string, skillName?: string): DescriptionQualityScore {
207
+ const lower = description.toLowerCase();
207
208
  const criteria = {
208
209
  length: scoreLengthCriterion(description),
209
- trigger_context: scoreTriggerContextCriterion(description),
210
- vagueness: scoreVaguenessCriterion(description),
211
- specificity: scoreSpecificityCriterion(description),
210
+ trigger_context: scoreTriggerContextCriterion(description, lower),
211
+ vagueness: scoreVaguenessCriterion(description, lower),
212
+ specificity: scoreSpecificityCriterion(description, lower),
212
213
  not_just_name: scoreNotJustNameCriterion(description, skillName),
213
214
  };
214
215
 
@@ -38,6 +38,7 @@ import type {
38
38
  } from "../types.js";
39
39
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
40
40
  import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
41
+ import type { EffortLevel } from "../utils/llm-call.js";
41
42
  import { createEvolveTUI } from "../utils/tui.js";
42
43
  import { appendAuditEntry } from "./audit.js";
43
44
  import { checkConstitution } from "./constitutional.js";
@@ -51,6 +52,7 @@ import {
51
52
  selectFromFrontier,
52
53
  } from "./pareto.js";
53
54
  import { generateMultipleProposals, generateProposal } from "./propose-description.js";
55
+ import { evaluateStoppingCriteria } from "./stopping-criteria.js";
54
56
  import { buildUnblockSuggestions } from "./unblock-suggestions.js";
55
57
  import type { ValidationResult } from "./validate-proposal.js";
56
58
  import {
@@ -80,7 +82,9 @@ export interface EvolveOptions {
80
82
  validationModel?: string;
81
83
  cheapLoop?: boolean;
82
84
  gateModel?: string;
85
+ gateEffort?: EffortLevel;
83
86
  proposalModel?: string;
87
+ adaptiveGate?: boolean;
84
88
  syncFirst?: boolean;
85
89
  syncForce?: boolean;
86
90
  }
@@ -174,6 +178,73 @@ function formatSimpleDiff(oldText: string, newText: string): string {
174
178
  return output.join("\n");
175
179
  }
176
180
 
181
/**
 * Estimate how many LLM calls one validation pass makes for an eval set.
 *
 * The eval set is processed in batches of `TRIGGER_CHECK_BATCH_SIZE`; each
 * batch is counted twice per run (presumably once for the original and once
 * for the proposed description — confirm against the validator), repeated
 * `VALIDATION_RUNS` times.
 *
 * @param evalSetSize - Number of entries in the eval set.
 * @returns The estimated call count; 0 for an empty eval set.
 */
function countValidationLlmCalls(evalSetSize: number): number {
  if (evalSetSize === 0) return 0;
  const batches = Math.ceil(evalSetSize / TRIGGER_CHECK_BATCH_SIZE);
  return batches * 2 * VALIDATION_RUNS;
}
185
+
186
/** Outcome of choosing how the final gate validation should be run. */
interface GateDecision {
  /** Model name handed to the gate validation call. */
  model: string;
  /** Optional effort level handed to the gate validation call. */
  effort?: EffortLevel;
  /** Human-readable risk labels; joined into audit/evidence messages when non-empty. */
  riskSignals: Array<string>;
}
191
+
192
/**
 * Count whitespace-separated words in a string.
 *
 * @param text - Text to measure.
 * @returns Number of non-empty tokens; 0 for an empty or whitespace-only string.
 */
function countWords(text: string): number {
  let total = 0;
  for (const token of text.trim().split(/\s+/)) {
    // Splitting an empty string yields one empty token, which must not count.
    if (token.length > 0) total += 1;
  }
  return total;
}
198
+
199
/**
 * Decide which model and effort level the final gate validation should use.
 *
 * - No `options.gateModel`: returns `undefined`.
 * - `options.adaptiveGate` off: returns the configured model/effort with no risk signals.
 * - Adaptive mode: collects risk signals from the validation result and the
 *   proposal, then escalates to model "opus" (effort "high", or "max" if that
 *   was already requested) when there are regressions, when
 *   `net_change < 0.1`, or when at least two risk signals fired. Otherwise
 *   the configured model/effort is kept and the signals are attached.
 *
 * @param options - Evolve options; reads `gateModel`, `gateEffort`, `adaptiveGate`.
 * @param proposal - Proposal under review; reads `proposed_description` and `confidence`.
 * @param validation - Validation result; reads `net_change`, `regressions`, `after_pass_rate`.
 * @param currentDescription - Description text the proposal would replace.
 * @param confidenceThreshold - Minimum confidence configured for the run.
 * @returns The gate decision, or `undefined` when no gate model is configured.
 */
function resolveGateDecision(
  options: EvolveOptions,
  proposal: EvolutionProposal,
  validation: ValidationResult,
  currentDescription: string,
  confidenceThreshold: number,
): GateDecision | undefined {
  const { gateModel, gateEffort, adaptiveGate } = options;
  if (!gateModel) return undefined;

  if (!adaptiveGate) {
    return { model: gateModel, effort: gateEffort, riskSignals: [] };
  }

  const wordsBefore = countWords(currentDescription);
  const wordsAfter = countWords(proposal.proposed_description);
  // An empty original description is treated as "no relative growth";
  // the absolute word delta below still catches large additions.
  const growthRatio = wordsBefore === 0 ? 1 : wordsAfter / wordsBefore;
  const regressionCount = validation.regressions.length;
  const confidenceFloor = Math.max(confidenceThreshold + 0.05, 0.75);

  // Each entry pairs a risk condition with a lazily built label; order is
  // preserved in the resulting signal list.
  const checks: Array<[boolean, () => string]> = [
    [regressionCount > 0, () => `regressions=${regressionCount}`],
    [validation.net_change < 0.15, () => `low_lift=${validation.net_change.toFixed(3)}`],
    [proposal.confidence < confidenceFloor, () => `confidence=${proposal.confidence.toFixed(2)}`],
    [
      growthRatio > 1.8 || wordsAfter - wordsBefore > 32,
      () => `word_growth=${growthRatio.toFixed(2)}x`,
    ],
    [
      validation.after_pass_rate < 0.9,
      () => `after_pass_rate=${validation.after_pass_rate.toFixed(2)}`,
    ],
  ];
  const riskSignals = checks.filter(([fired]) => fired).map(([, label]) => label());

  const escalate = regressionCount > 0 || validation.net_change < 0.1 || riskSignals.length >= 2;
  if (escalate) {
    return {
      model: "opus",
      effort: gateEffort === "max" ? "max" : "high",
      riskSignals,
    };
  }

  return { model: gateModel, effort: gateEffort, riskSignals };
}
247
+
177
248
  // ---------------------------------------------------------------------------
178
249
  // Main orchestrator
179
250
  // ---------------------------------------------------------------------------
@@ -456,7 +527,7 @@ export async function evolve(
456
527
  // -----------------------------------------------------------------------
457
528
  // Pareto multi-candidate path
458
529
  // -----------------------------------------------------------------------
459
- const paretoEnabled = options.paretoEnabled ?? false;
530
+ const paretoEnabled = options.paretoEnabled ?? true;
460
531
  const candidateCount = options.candidateCount ?? 3;
461
532
  const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
462
533
  const telemetryRecords =
@@ -494,6 +565,7 @@ export async function evolve(
494
565
  options.proposalModel,
495
566
  aggregateMetrics,
496
567
  );
568
+ llmCallCount += candidateCount;
497
569
 
498
570
  // Filter by confidence threshold
499
571
  const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
@@ -564,6 +636,7 @@ export async function evolve(
564
636
  agent,
565
637
  options.validationModel,
566
638
  );
639
+ llmCallCount += countValidationLlmCalls(evalSet.length);
567
640
  recordAudit(
568
641
  proposal.proposal_id,
569
642
  "validated",
@@ -628,6 +701,7 @@ export async function evolve(
628
701
  } else {
629
702
  // Standard single-candidate retry loop
630
703
  let feedbackReason = "";
704
+ const previousPassRates: number[] = [];
631
705
 
632
706
  for (let iteration = 0; iteration < maxIterations; iteration++) {
633
707
  iterationsCompleted = iteration + 1;
@@ -681,7 +755,24 @@ export async function evolve(
681
755
  );
682
756
  if (!constitution.passed) {
683
757
  feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`;
684
- recordAudit(proposal.proposal_id, "rejected", feedbackReason);
758
+ // Re-evaluate stopping after a constitutional rejection by treating the
759
+ // last entry in previousPassRates as the currentPassRate (or 0 on the
760
+ // first iteration) and slicing it out of history before calling
761
+ // evaluateStoppingCriteria() with the current iteration/maxIterations,
762
+ // confidenceThreshold, and proposal.confidence.
763
+ const constitutionStop = evaluateStoppingCriteria(
764
+ previousPassRates.at(-1) ?? 0,
765
+ previousPassRates.slice(0, -1),
766
+ iteration + 1,
767
+ maxIterations,
768
+ confidenceThreshold,
769
+ proposal.confidence,
770
+ );
771
+ recordAudit(
772
+ proposal.proposal_id,
773
+ "rejected",
774
+ `${feedbackReason} (stopping: ${constitutionStop.reason})`,
775
+ );
685
776
  recordEvidence({
686
777
  timestamp: new Date().toISOString(),
687
778
  proposal_id: proposal.proposal_id,
@@ -691,54 +782,64 @@ export async function evolve(
691
782
  stage: "rejected",
692
783
  rationale: proposal.rationale,
693
784
  confidence: proposal.confidence,
694
- details: feedbackReason,
785
+ details: `${feedbackReason} (stopping: ${constitutionStop.reason})`,
695
786
  });
696
- if (iteration === maxIterations - 1) {
787
+ if (constitutionStop.shouldStop) {
697
788
  finishTui();
698
789
  return withStats({
699
790
  proposal: lastProposal,
700
791
  validation: null,
701
792
  deployed: false,
702
793
  auditEntries,
703
- reason: feedbackReason,
794
+ reason: `${feedbackReason} (${constitutionStop.reason})`,
704
795
  });
705
796
  }
706
797
  continue;
707
798
  }
708
799
 
709
- // Step 9: Check confidence threshold
710
- if (proposal.confidence < confidenceThreshold) {
711
- feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
712
- recordAudit(
713
- proposal.proposal_id,
714
- "rejected",
715
- `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
800
+ // Step 9: Check confidence threshold via stopping criteria
801
+ {
802
+ const preValidationStop = evaluateStoppingCriteria(
803
+ previousPassRates.at(-1) ?? 0,
804
+ previousPassRates.slice(0, -1),
805
+ iteration + 1,
806
+ maxIterations,
807
+ confidenceThreshold,
808
+ proposal.confidence,
716
809
  );
717
- recordEvidence({
718
- timestamp: new Date().toISOString(),
719
- proposal_id: proposal.proposal_id,
720
- skill_name: skillName,
721
- skill_path: skillPath,
722
- target: "description",
723
- stage: "rejected",
724
- rationale: proposal.rationale,
725
- confidence: proposal.confidence,
726
- details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
727
- });
728
-
729
- // If this is the last iteration, return early with rejection
730
- if (iteration === maxIterations - 1) {
731
- finishTui();
732
- return withStats({
733
- proposal: lastProposal,
734
- validation: null,
735
- deployed: false,
736
- auditEntries,
737
- reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
810
+ if (proposal.confidence < confidenceThreshold) {
811
+ feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
812
+ recordAudit(
813
+ proposal.proposal_id,
814
+ "rejected",
815
+ `${feedbackReason} (stopping: ${preValidationStop.reason})`,
816
+ );
817
+ recordEvidence({
818
+ timestamp: new Date().toISOString(),
819
+ proposal_id: proposal.proposal_id,
820
+ skill_name: skillName,
821
+ skill_path: skillPath,
822
+ target: "description",
823
+ stage: "rejected",
824
+ rationale: proposal.rationale,
825
+ confidence: proposal.confidence,
826
+ details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
738
827
  });
739
- }
740
828
 
741
- continue;
829
+ // Use stopping criteria to decide whether to return or retry
830
+ if (preValidationStop.shouldStop) {
831
+ finishTui();
832
+ return withStats({
833
+ proposal: lastProposal,
834
+ validation: null,
835
+ deployed: false,
836
+ auditEntries,
837
+ reason: `${feedbackReason} (${preValidationStop.reason})`,
838
+ });
839
+ }
840
+
841
+ continue;
842
+ }
742
843
  }
743
844
 
744
845
  // Step 10: Validate against eval set
@@ -753,7 +854,7 @@ export async function evolve(
753
854
  options.validationModel,
754
855
  );
755
856
  lastValidation = validation;
756
- llmCallCount += batchCount * 2 * VALIDATION_RUNS;
857
+ llmCallCount += countValidationLlmCalls(evalSet.length);
757
858
  tui.done(
758
859
  `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
759
860
  );
@@ -792,13 +893,23 @@ export async function evolve(
792
893
  },
793
894
  });
794
895
 
795
- // Step 12: Check validation result
896
+ // Step 12: Evaluate stopping criteria after validation
897
+ const stopping = evaluateStoppingCriteria(
898
+ validation.after_pass_rate,
899
+ previousPassRates,
900
+ iteration + 1,
901
+ maxIterations,
902
+ confidenceThreshold,
903
+ proposal.confidence,
904
+ );
905
+ previousPassRates.push(validation.after_pass_rate);
906
+
796
907
  if (!validation.improved) {
797
908
  feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
798
909
  recordAudit(
799
910
  proposal.proposal_id,
800
911
  "rejected",
801
- `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
912
+ `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
802
913
  );
803
914
  recordEvidence({
804
915
  timestamp: new Date().toISOString(),
@@ -809,7 +920,7 @@ export async function evolve(
809
920
  stage: "rejected",
810
921
  rationale: proposal.rationale,
811
922
  confidence: proposal.confidence,
812
- details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
923
+ details: `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
813
924
  validation: {
814
925
  improved: validation.improved,
815
926
  before_pass_rate: validation.before_pass_rate,
@@ -821,21 +932,26 @@ export async function evolve(
821
932
  },
822
933
  });
823
934
 
824
- // If this is the last iteration, return with rejection
825
- if (iteration === maxIterations - 1) {
935
+ // Use stopping criteria to decide whether to return or retry
936
+ if (stopping.shouldStop) {
826
937
  finishTui();
827
938
  return withStats({
828
939
  proposal: lastProposal,
829
940
  validation: lastValidation,
830
941
  deployed: false,
831
942
  auditEntries,
832
- reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
943
+ reason: `Validation failed (${stopping.reason}): net_change=${validation.net_change.toFixed(3)}`,
833
944
  });
834
945
  }
835
946
 
836
947
  continue;
837
948
  }
838
949
 
950
+ // Validation passed — check if converged or continue
951
+ if (stopping.shouldStop && stopping.reason.includes("Converged")) {
952
+ recordAudit(proposal.proposal_id, "validated", `Stopping early: ${stopping.reason}`);
953
+ }
954
+
839
955
  // Validation passed - break out of retry loop
840
956
  break;
841
957
  }
@@ -916,18 +1032,39 @@ export async function evolve(
916
1032
  // -----------------------------------------------------------------------
917
1033
  let gateValidation: ValidationResult | undefined;
918
1034
  if (options.gateModel && lastProposal && lastValidation?.improved) {
919
- tui.step(`Gate validation (${options.gateModel})...`);
920
- gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
921
- llmCallCount++;
1035
+ const gateDecision = resolveGateDecision(
1036
+ options,
1037
+ lastProposal,
1038
+ lastValidation,
1039
+ currentDescription,
1040
+ confidenceThreshold,
1041
+ );
1042
+ const gateLabel = gateDecision?.effort
1043
+ ? `${gateDecision.model}, effort=${gateDecision.effort}`
1044
+ : (gateDecision?.model ?? options.gateModel);
1045
+ tui.step(`Gate validation (${gateLabel})...`);
1046
+ gateValidation = await _gateValidateProposal(
1047
+ lastProposal,
1048
+ evalSet,
1049
+ agent,
1050
+ gateDecision?.model ?? options.gateModel,
1051
+ gateDecision?.effort,
1052
+ );
1053
+ llmCallCount += countValidationLlmCalls(evalSet.length);
922
1054
  tui.done(
923
- `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
1055
+ `Gate (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
924
1056
  );
925
1057
 
1058
+ const gatePrefix =
1059
+ gateDecision && gateDecision.riskSignals.length > 0
1060
+ ? `Adaptive gate [${gateDecision.riskSignals.join(", ")}]`
1061
+ : "Gate validation";
1062
+
926
1063
  if (!gateValidation.improved) {
927
1064
  recordAudit(
928
1065
  lastProposal.proposal_id,
929
1066
  "rejected",
930
- `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1067
+ `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
931
1068
  );
932
1069
  recordEvidence({
933
1070
  timestamp: new Date().toISOString(),
@@ -938,7 +1075,7 @@ export async function evolve(
938
1075
  stage: "rejected",
939
1076
  rationale: lastProposal.rationale,
940
1077
  confidence: lastProposal.confidence,
941
- details: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1078
+ details: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
942
1079
  validation: {
943
1080
  improved: gateValidation.improved,
944
1081
  before_pass_rate: gateValidation.before_pass_rate,
@@ -955,7 +1092,7 @@ export async function evolve(
955
1092
  validation: lastValidation,
956
1093
  deployed: false,
957
1094
  auditEntries,
958
- reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1095
+ reason: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
959
1096
  gateValidation,
960
1097
  ...(baselineResult ? { baselineResult } : {}),
961
1098
  });
@@ -964,7 +1101,7 @@ export async function evolve(
964
1101
  recordAudit(
965
1102
  lastProposal.proposal_id,
966
1103
  "validated",
967
- `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
1104
+ `${gatePrefix} (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
968
1105
  );
969
1106
  }
970
1107
 
@@ -1082,7 +1219,7 @@ export async function cliMain(): Promise<void> {
1082
1219
  "dry-run": { type: "boolean", default: false },
1083
1220
  confidence: { type: "string", default: "0.6" },
1084
1221
  "max-iterations": { type: "string", default: "3" },
1085
- pareto: { type: "boolean", default: false },
1222
+ pareto: { type: "boolean", default: true },
1086
1223
  candidates: { type: "string", default: "3" },
1087
1224
  "token-efficiency": { type: "boolean", default: false },
1088
1225
  "with-baseline": { type: "boolean", default: false },
@@ -1090,7 +1227,9 @@ export async function cliMain(): Promise<void> {
1090
1227
  "cheap-loop": { type: "boolean", default: true },
1091
1228
  "full-model": { type: "boolean", default: false },
1092
1229
  "gate-model": { type: "string" },
1230
+ "gate-effort": { type: "string" },
1093
1231
  "proposal-model": { type: "string" },
1232
+ "adaptive-gate": { type: "boolean", default: false },
1094
1233
  "sync-first": { type: "boolean", default: false },
1095
1234
  "sync-force": { type: "boolean", default: false },
1096
1235
  verbose: { type: "boolean", default: false },
@@ -1121,6 +1260,8 @@ Options:
1121
1260
  --cheap-loop Use cheap models for loop, expensive for gate (default: on)
1122
1261
  --full-model Use same model for all stages (disables cheap-loop)
1123
1262
  --gate-model Model for final gate validation (default: sonnet)
1263
+ --gate-effort Thinking effort for final gate (low|medium|high|max)
1264
+ --adaptive-gate Escalate risky gate checks to opus + high effort
1124
1265
  --proposal-model Model for proposal generation LLM calls
1125
1266
  --sync-first Refresh source-truth telemetry before building evals/failure patterns
1126
1267
  --sync-force Force a full rescan during --sync-first
@@ -1143,6 +1284,24 @@ Options:
1143
1284
  "Add --sync-first when using --sync-force",
1144
1285
  );
1145
1286
  }
1287
+ if (values["gate-effort"] && !["low", "medium", "high", "max"].includes(values["gate-effort"])) {
1288
+ throw new CLIError(
1289
+ `Invalid --gate-effort value: ${values["gate-effort"]}`,
1290
+ "INVALID_FLAG",
1291
+ "Use one of: low, medium, high, max",
1292
+ );
1293
+ }
1294
+ if (
1295
+ (values["gate-effort"] || values["adaptive-gate"]) &&
1296
+ (values["full-model"] ?? false) &&
1297
+ !values["gate-model"]
1298
+ ) {
1299
+ throw new CLIError(
1300
+ "--gate-effort and --adaptive-gate require --gate-model when --full-model is set",
1301
+ "INVALID_FLAG",
1302
+ "Add --gate-model <model> or drop --full-model",
1303
+ );
1304
+ }
1146
1305
 
1147
1306
  const { detectAgent } = await import("../utils/llm-call.js");
1148
1307
  const requestedAgent = values.agent;
@@ -1223,6 +1382,8 @@ Options:
1223
1382
  console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
1224
1383
  console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
1225
1384
  console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
1385
+ console.error(`[verbose] Adaptive gate: ${values["adaptive-gate"] ?? false}`);
1386
+ console.error(`[verbose] Gate effort: ${values["gate-effort"] ?? "(default)"}`);
1226
1387
  }
1227
1388
 
1228
1389
  const result = await evolve({
@@ -1241,7 +1402,9 @@ Options:
1241
1402
  validationModel: values["validation-model"],
1242
1403
  cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
1243
1404
  gateModel: values["gate-model"],
1405
+ gateEffort: values["gate-effort"] as EffortLevel | undefined,
1244
1406
  proposalModel: values["proposal-model"],
1407
+ adaptiveGate: values["adaptive-gate"] ?? false,
1245
1408
  gradingResults,
1246
1409
  syncFirst: values["sync-first"] ?? false,
1247
1410
  syncForce: values["sync-force"] ?? false,