selftune 0.2.16 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/README.md +32 -22
  2. package/apps/local-dashboard/dist/assets/index-DnhnXQm6.js +60 -0
  3. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
  5. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
  6. package/apps/local-dashboard/dist/index.html +5 -5
  7. package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
  8. package/cli/selftune/alpha-upload/client.ts +51 -1
  9. package/cli/selftune/alpha-upload/flush.ts +46 -5
  10. package/cli/selftune/alpha-upload/stage-canonical.ts +32 -10
  11. package/cli/selftune/alpha-upload-contract.ts +9 -0
  12. package/cli/selftune/constants.ts +92 -5
  13. package/cli/selftune/contribute/contribute.ts +30 -2
  14. package/cli/selftune/contribute/sanitize.ts +52 -5
  15. package/cli/selftune/contribution-config.ts +249 -0
  16. package/cli/selftune/contribution-relay.ts +177 -0
  17. package/cli/selftune/contribution-signals.ts +219 -0
  18. package/cli/selftune/contribution-staging.ts +147 -0
  19. package/cli/selftune/contributions.ts +532 -0
  20. package/cli/selftune/creator-contributions.ts +333 -0
  21. package/cli/selftune/dashboard-contract.ts +305 -1
  22. package/cli/selftune/dashboard-server.ts +47 -13
  23. package/cli/selftune/eval/family-overlap.ts +395 -0
  24. package/cli/selftune/eval/hooks-to-evals.ts +182 -28
  25. package/cli/selftune/eval/synthetic-evals.ts +298 -11
  26. package/cli/selftune/evolution/description-quality.ts +12 -11
  27. package/cli/selftune/evolution/evolve.ts +214 -51
  28. package/cli/selftune/evolution/validate-proposal.ts +9 -6
  29. package/cli/selftune/export.ts +2 -2
  30. package/cli/selftune/grading/grade-session.ts +20 -0
  31. package/cli/selftune/hooks/commit-track.ts +188 -0
  32. package/cli/selftune/hooks/prompt-log.ts +10 -1
  33. package/cli/selftune/hooks/session-stop.ts +2 -2
  34. package/cli/selftune/hooks/skill-eval.ts +15 -1
  35. package/cli/selftune/hooks/stdin-preview.ts +32 -0
  36. package/cli/selftune/index.ts +41 -5
  37. package/cli/selftune/ingestors/codex-rollout.ts +31 -35
  38. package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
  39. package/cli/selftune/localdb/db.ts +2 -2
  40. package/cli/selftune/localdb/direct-write.ts +69 -6
  41. package/cli/selftune/localdb/queries.ts +1253 -37
  42. package/cli/selftune/localdb/schema.ts +66 -0
  43. package/cli/selftune/orchestrate.ts +32 -4
  44. package/cli/selftune/recover.ts +153 -0
  45. package/cli/selftune/repair/skill-usage.ts +363 -4
  46. package/cli/selftune/routes/actions.ts +35 -1
  47. package/cli/selftune/routes/analytics.ts +14 -0
  48. package/cli/selftune/routes/index.ts +1 -0
  49. package/cli/selftune/routes/overview.ts +150 -4
  50. package/cli/selftune/routes/skill-report.ts +648 -18
  51. package/cli/selftune/status.ts +81 -2
  52. package/cli/selftune/sync.ts +56 -2
  53. package/cli/selftune/trust-model.ts +66 -0
  54. package/cli/selftune/types.ts +80 -0
  55. package/cli/selftune/utils/skill-detection.ts +43 -0
  56. package/cli/selftune/utils/transcript.ts +210 -1
  57. package/cli/selftune/watchlist.ts +65 -0
  58. package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
  59. package/package.json +1 -1
  60. package/packages/telemetry-contract/src/types.ts +11 -0
  61. package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
  62. package/packages/ui/src/components/EvidenceViewer.tsx +335 -144
  63. package/packages/ui/src/components/EvolutionTimeline.tsx +58 -28
  64. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
  65. package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
  66. package/packages/ui/src/components/section-cards.tsx +12 -9
  67. package/packages/ui/src/primitives/card.tsx +1 -1
  68. package/skill/SKILL.md +40 -2
  69. package/skill/Workflows/AlphaUpload.md +4 -0
  70. package/skill/Workflows/Composability.md +64 -0
  71. package/skill/Workflows/Contribute.md +6 -3
  72. package/skill/Workflows/Contributions.md +97 -0
  73. package/skill/Workflows/CreatorContributions.md +74 -0
  74. package/skill/Workflows/Dashboard.md +31 -0
  75. package/skill/Workflows/Evals.md +57 -8
  76. package/skill/Workflows/Evolve.md +31 -13
  77. package/skill/Workflows/ExportCanonical.md +121 -0
  78. package/skill/Workflows/Hook.md +131 -0
  79. package/skill/Workflows/Ingest.md +7 -0
  80. package/skill/Workflows/Initialize.md +29 -9
  81. package/skill/Workflows/Orchestrate.md +27 -5
  82. package/skill/Workflows/Quickstart.md +94 -0
  83. package/skill/Workflows/Recover.md +84 -0
  84. package/skill/Workflows/RepairSkillUsage.md +95 -0
  85. package/skill/Workflows/Sync.md +18 -12
  86. package/skill/Workflows/Uninstall.md +82 -0
  87. package/skill/settings_snippet.json +11 -0
  88. package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
  89. package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
  90. package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
  91. package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
@@ -139,27 +139,27 @@ export function scoreLengthCriterion(description: string): number {
139
139
  }
140
140
 
141
141
  /** Score presence of trigger context words (when/if/before/after etc). */
142
- export function scoreTriggerContextCriterion(description: string): number {
143
- const matches = countWordMatches(description.toLowerCase(), TRIGGER_PATTERNS);
142
+ export function scoreTriggerContextCriterion(description: string, lower?: string): number {
143
+ const matches = countWordMatches(lower ?? description.toLowerCase(), TRIGGER_PATTERNS);
144
144
  if (matches === 0) return 0.0;
145
145
  if (matches === 1) return 0.7;
146
146
  return Math.min(1.0, 0.7 + 0.15 * (matches - 1));
147
147
  }
148
148
 
149
149
  /** Score absence of vague words (lower is worse). */
150
- export function scoreVaguenessCriterion(description: string): number {
151
- const matches = countWordMatches(description.toLowerCase(), VAGUE_PATTERNS);
150
+ export function scoreVaguenessCriterion(description: string, lower?: string): number {
151
+ const matches = countWordMatches(lower ?? description.toLowerCase(), VAGUE_PATTERNS);
152
152
  if (matches === 0) return 1.0;
153
153
  if (matches === 1) return 0.6;
154
154
  return Math.max(0.1, 0.6 - 0.15 * (matches - 1));
155
155
  }
156
156
 
157
157
  /** Score whether description specifies at least one concrete action or domain. */
158
- export function scoreSpecificityCriterion(description: string): number {
159
- const lower = description.toLowerCase();
160
- const hasAction = ACTION_PATTERNS.some((p) => p.test(lower));
158
+ export function scoreSpecificityCriterion(description: string, lower?: string): number {
159
+ const l = lower ?? description.toLowerCase();
160
+ const hasAction = ACTION_PATTERNS.some((p) => p.test(l));
161
161
 
162
- const fillerCount = FILLER_PHRASES.filter((f) => lower.includes(f)).length;
162
+ const fillerCount = FILLER_PHRASES.filter((f) => l.includes(f)).length;
163
163
  const words = description.split(/\s+/).length;
164
164
  const fillerRatio = fillerCount > 0 ? fillerCount / Math.max(1, words / 10) : 0;
165
165
 
@@ -204,11 +204,12 @@ const WEIGHTS = {
204
204
  * Pure function — no I/O, no LLM calls.
205
205
  */
206
206
  export function scoreDescription(description: string, skillName?: string): DescriptionQualityScore {
207
+ const lower = description.toLowerCase();
207
208
  const criteria = {
208
209
  length: scoreLengthCriterion(description),
209
- trigger_context: scoreTriggerContextCriterion(description),
210
- vagueness: scoreVaguenessCriterion(description),
211
- specificity: scoreSpecificityCriterion(description),
210
+ trigger_context: scoreTriggerContextCriterion(description, lower),
211
+ vagueness: scoreVaguenessCriterion(description, lower),
212
+ specificity: scoreSpecificityCriterion(description, lower),
212
213
  not_just_name: scoreNotJustNameCriterion(description, skillName),
213
214
  };
214
215
 
@@ -38,6 +38,7 @@ import type {
38
38
  } from "../types.js";
39
39
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
40
40
  import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
41
+ import type { EffortLevel } from "../utils/llm-call.js";
41
42
  import { createEvolveTUI } from "../utils/tui.js";
42
43
  import { appendAuditEntry } from "./audit.js";
43
44
  import { checkConstitution } from "./constitutional.js";
@@ -51,6 +52,7 @@ import {
51
52
  selectFromFrontier,
52
53
  } from "./pareto.js";
53
54
  import { generateMultipleProposals, generateProposal } from "./propose-description.js";
55
+ import { evaluateStoppingCriteria } from "./stopping-criteria.js";
54
56
  import { buildUnblockSuggestions } from "./unblock-suggestions.js";
55
57
  import type { ValidationResult } from "./validate-proposal.js";
56
58
  import {
@@ -80,7 +82,9 @@ export interface EvolveOptions {
80
82
  validationModel?: string;
81
83
  cheapLoop?: boolean;
82
84
  gateModel?: string;
85
+ gateEffort?: EffortLevel;
83
86
  proposalModel?: string;
87
+ adaptiveGate?: boolean;
84
88
  syncFirst?: boolean;
85
89
  syncForce?: boolean;
86
90
  }
@@ -174,6 +178,73 @@ function formatSimpleDiff(oldText: string, newText: string): string {
174
178
  return output.join("\n");
175
179
  }
176
180
 
181
+ function countValidationLlmCalls(evalSetSize: number): number {
182
+ if (evalSetSize === 0) return 0;
183
+ return Math.ceil(evalSetSize / TRIGGER_CHECK_BATCH_SIZE) * 2 * VALIDATION_RUNS;
184
+ }
185
+
186
+ interface GateDecision {
187
+ model: string;
188
+ effort?: EffortLevel;
189
+ riskSignals: string[];
190
+ }
191
+
192
+ function countWords(text: string): number {
193
+ return text
194
+ .trim()
195
+ .split(/\s+/)
196
+ .filter((token) => token.length > 0).length;
197
+ }
198
+
199
+ function resolveGateDecision(
200
+ options: EvolveOptions,
201
+ proposal: EvolutionProposal,
202
+ validation: ValidationResult,
203
+ currentDescription: string,
204
+ confidenceThreshold: number,
205
+ ): GateDecision | undefined {
206
+ const baseModel = options.gateModel;
207
+ if (!baseModel) return undefined;
208
+
209
+ const baseDecision: GateDecision = {
210
+ model: baseModel,
211
+ effort: options.gateEffort,
212
+ riskSignals: [],
213
+ };
214
+
215
+ if (!options.adaptiveGate) return baseDecision;
216
+
217
+ const riskSignals: string[] = [];
218
+ const originalWords = countWords(currentDescription);
219
+ const proposedWords = countWords(proposal.proposed_description);
220
+ const wordGrowth = originalWords === 0 ? 1 : proposedWords / originalWords;
221
+ const lowLift = validation.net_change < 0.15;
222
+ const hasRegressions = validation.regressions.length > 0;
223
+ const lowConfidence = proposal.confidence < Math.max(confidenceThreshold + 0.05, 0.75);
224
+ const broadeningRisk = wordGrowth > 1.8 || proposedWords - originalWords > 32;
225
+ const notYetStrong = validation.after_pass_rate < 0.9;
226
+
227
+ if (hasRegressions) riskSignals.push(`regressions=${validation.regressions.length}`);
228
+ if (lowLift) riskSignals.push(`low_lift=${validation.net_change.toFixed(3)}`);
229
+ if (lowConfidence) riskSignals.push(`confidence=${proposal.confidence.toFixed(2)}`);
230
+ if (broadeningRisk) riskSignals.push(`word_growth=${wordGrowth.toFixed(2)}x`);
231
+ if (notYetStrong) riskSignals.push(`after_pass_rate=${validation.after_pass_rate.toFixed(2)}`);
232
+
233
+ const shouldEscalate = hasRegressions || validation.net_change < 0.1 || riskSignals.length >= 2;
234
+ if (!shouldEscalate) {
235
+ return {
236
+ ...baseDecision,
237
+ riskSignals,
238
+ };
239
+ }
240
+
241
+ return {
242
+ model: "opus",
243
+ effort: options.gateEffort === "max" ? "max" : "high",
244
+ riskSignals,
245
+ };
246
+ }
247
+
177
248
  // ---------------------------------------------------------------------------
178
249
  // Main orchestrator
179
250
  // ---------------------------------------------------------------------------
@@ -456,7 +527,7 @@ export async function evolve(
456
527
  // -----------------------------------------------------------------------
457
528
  // Pareto multi-candidate path
458
529
  // -----------------------------------------------------------------------
459
- const paretoEnabled = options.paretoEnabled ?? false;
530
+ const paretoEnabled = options.paretoEnabled ?? true;
460
531
  const candidateCount = options.candidateCount ?? 3;
461
532
  const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
462
533
  const telemetryRecords =
@@ -494,6 +565,7 @@ export async function evolve(
494
565
  options.proposalModel,
495
566
  aggregateMetrics,
496
567
  );
568
+ llmCallCount += candidateCount;
497
569
 
498
570
  // Filter by confidence threshold
499
571
  const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
@@ -564,6 +636,7 @@ export async function evolve(
564
636
  agent,
565
637
  options.validationModel,
566
638
  );
639
+ llmCallCount += countValidationLlmCalls(evalSet.length);
567
640
  recordAudit(
568
641
  proposal.proposal_id,
569
642
  "validated",
@@ -628,6 +701,7 @@ export async function evolve(
628
701
  } else {
629
702
  // Standard single-candidate retry loop
630
703
  let feedbackReason = "";
704
+ const previousPassRates: number[] = [];
631
705
 
632
706
  for (let iteration = 0; iteration < maxIterations; iteration++) {
633
707
  iterationsCompleted = iteration + 1;
@@ -681,7 +755,24 @@ export async function evolve(
681
755
  );
682
756
  if (!constitution.passed) {
683
757
  feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`;
684
- recordAudit(proposal.proposal_id, "rejected", feedbackReason);
758
+ // Re-evaluate stopping after a constitutional rejection by treating the
759
+ // last entry in previousPassRates as the currentPassRate (or 0 on the
760
+ // first iteration) and slicing it out of history before calling
761
+ // evaluateStoppingCriteria() with the current iteration/maxIterations,
762
+ // confidenceThreshold, and proposal.confidence.
763
+ const constitutionStop = evaluateStoppingCriteria(
764
+ previousPassRates.at(-1) ?? 0,
765
+ previousPassRates.slice(0, -1),
766
+ iteration + 1,
767
+ maxIterations,
768
+ confidenceThreshold,
769
+ proposal.confidence,
770
+ );
771
+ recordAudit(
772
+ proposal.proposal_id,
773
+ "rejected",
774
+ `${feedbackReason} (stopping: ${constitutionStop.reason})`,
775
+ );
685
776
  recordEvidence({
686
777
  timestamp: new Date().toISOString(),
687
778
  proposal_id: proposal.proposal_id,
@@ -691,54 +782,64 @@ export async function evolve(
691
782
  stage: "rejected",
692
783
  rationale: proposal.rationale,
693
784
  confidence: proposal.confidence,
694
- details: feedbackReason,
785
+ details: `${feedbackReason} (stopping: ${constitutionStop.reason})`,
695
786
  });
696
- if (iteration === maxIterations - 1) {
787
+ if (constitutionStop.shouldStop) {
697
788
  finishTui();
698
789
  return withStats({
699
790
  proposal: lastProposal,
700
791
  validation: null,
701
792
  deployed: false,
702
793
  auditEntries,
703
- reason: feedbackReason,
794
+ reason: `${feedbackReason} (${constitutionStop.reason})`,
704
795
  });
705
796
  }
706
797
  continue;
707
798
  }
708
799
 
709
- // Step 9: Check confidence threshold
710
- if (proposal.confidence < confidenceThreshold) {
711
- feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
712
- recordAudit(
713
- proposal.proposal_id,
714
- "rejected",
715
- `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
800
+ // Step 9: Check confidence threshold via stopping criteria
801
+ {
802
+ const preValidationStop = evaluateStoppingCriteria(
803
+ previousPassRates.at(-1) ?? 0,
804
+ previousPassRates.slice(0, -1),
805
+ iteration + 1,
806
+ maxIterations,
807
+ confidenceThreshold,
808
+ proposal.confidence,
716
809
  );
717
- recordEvidence({
718
- timestamp: new Date().toISOString(),
719
- proposal_id: proposal.proposal_id,
720
- skill_name: skillName,
721
- skill_path: skillPath,
722
- target: "description",
723
- stage: "rejected",
724
- rationale: proposal.rationale,
725
- confidence: proposal.confidence,
726
- details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
727
- });
728
-
729
- // If this is the last iteration, return early with rejection
730
- if (iteration === maxIterations - 1) {
731
- finishTui();
732
- return withStats({
733
- proposal: lastProposal,
734
- validation: null,
735
- deployed: false,
736
- auditEntries,
737
- reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
810
+ if (proposal.confidence < confidenceThreshold) {
811
+ feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
812
+ recordAudit(
813
+ proposal.proposal_id,
814
+ "rejected",
815
+ `${feedbackReason} (stopping: ${preValidationStop.reason})`,
816
+ );
817
+ recordEvidence({
818
+ timestamp: new Date().toISOString(),
819
+ proposal_id: proposal.proposal_id,
820
+ skill_name: skillName,
821
+ skill_path: skillPath,
822
+ target: "description",
823
+ stage: "rejected",
824
+ rationale: proposal.rationale,
825
+ confidence: proposal.confidence,
826
+ details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
738
827
  });
739
- }
740
828
 
741
- continue;
829
+ // Use stopping criteria to decide whether to return or retry
830
+ if (preValidationStop.shouldStop) {
831
+ finishTui();
832
+ return withStats({
833
+ proposal: lastProposal,
834
+ validation: null,
835
+ deployed: false,
836
+ auditEntries,
837
+ reason: `${feedbackReason} (${preValidationStop.reason})`,
838
+ });
839
+ }
840
+
841
+ continue;
842
+ }
742
843
  }
743
844
 
744
845
  // Step 10: Validate against eval set
@@ -753,7 +854,7 @@ export async function evolve(
753
854
  options.validationModel,
754
855
  );
755
856
  lastValidation = validation;
756
- llmCallCount += batchCount * 2 * VALIDATION_RUNS;
857
+ llmCallCount += countValidationLlmCalls(evalSet.length);
757
858
  tui.done(
758
859
  `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
759
860
  );
@@ -792,13 +893,23 @@ export async function evolve(
792
893
  },
793
894
  });
794
895
 
795
- // Step 12: Check validation result
896
+ // Step 12: Evaluate stopping criteria after validation
897
+ const stopping = evaluateStoppingCriteria(
898
+ validation.after_pass_rate,
899
+ previousPassRates,
900
+ iteration + 1,
901
+ maxIterations,
902
+ confidenceThreshold,
903
+ proposal.confidence,
904
+ );
905
+ previousPassRates.push(validation.after_pass_rate);
906
+
796
907
  if (!validation.improved) {
797
908
  feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
798
909
  recordAudit(
799
910
  proposal.proposal_id,
800
911
  "rejected",
801
- `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
912
+ `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
802
913
  );
803
914
  recordEvidence({
804
915
  timestamp: new Date().toISOString(),
@@ -809,7 +920,7 @@ export async function evolve(
809
920
  stage: "rejected",
810
921
  rationale: proposal.rationale,
811
922
  confidence: proposal.confidence,
812
- details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
923
+ details: `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
813
924
  validation: {
814
925
  improved: validation.improved,
815
926
  before_pass_rate: validation.before_pass_rate,
@@ -821,21 +932,26 @@ export async function evolve(
821
932
  },
822
933
  });
823
934
 
824
- // If this is the last iteration, return with rejection
825
- if (iteration === maxIterations - 1) {
935
+ // Use stopping criteria to decide whether to return or retry
936
+ if (stopping.shouldStop) {
826
937
  finishTui();
827
938
  return withStats({
828
939
  proposal: lastProposal,
829
940
  validation: lastValidation,
830
941
  deployed: false,
831
942
  auditEntries,
832
- reason: `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
943
+ reason: `Validation failed (${stopping.reason}): net_change=${validation.net_change.toFixed(3)}`,
833
944
  });
834
945
  }
835
946
 
836
947
  continue;
837
948
  }
838
949
 
950
+ // Validation passed — check if converged or continue
951
+ if (stopping.shouldStop && stopping.reason.includes("Converged")) {
952
+ recordAudit(proposal.proposal_id, "validated", `Stopping early: ${stopping.reason}`);
953
+ }
954
+
839
955
  // Validation passed - break out of retry loop
840
956
  break;
841
957
  }
@@ -916,18 +1032,39 @@ export async function evolve(
916
1032
  // -----------------------------------------------------------------------
917
1033
  let gateValidation: ValidationResult | undefined;
918
1034
  if (options.gateModel && lastProposal && lastValidation?.improved) {
919
- tui.step(`Gate validation (${options.gateModel})...`);
920
- gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel);
921
- llmCallCount++;
1035
+ const gateDecision = resolveGateDecision(
1036
+ options,
1037
+ lastProposal,
1038
+ lastValidation,
1039
+ currentDescription,
1040
+ confidenceThreshold,
1041
+ );
1042
+ const gateLabel = gateDecision?.effort
1043
+ ? `${gateDecision.model}, effort=${gateDecision.effort}`
1044
+ : (gateDecision?.model ?? options.gateModel);
1045
+ tui.step(`Gate validation (${gateLabel})...`);
1046
+ gateValidation = await _gateValidateProposal(
1047
+ lastProposal,
1048
+ evalSet,
1049
+ agent,
1050
+ gateDecision?.model ?? options.gateModel,
1051
+ gateDecision?.effort,
1052
+ );
1053
+ llmCallCount += countValidationLlmCalls(evalSet.length);
922
1054
  tui.done(
923
- `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
1055
+ `Gate (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
924
1056
  );
925
1057
 
1058
+ const gatePrefix =
1059
+ gateDecision && gateDecision.riskSignals.length > 0
1060
+ ? `Adaptive gate [${gateDecision.riskSignals.join(", ")}]`
1061
+ : "Gate validation";
1062
+
926
1063
  if (!gateValidation.improved) {
927
1064
  recordAudit(
928
1065
  lastProposal.proposal_id,
929
1066
  "rejected",
930
- `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1067
+ `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
931
1068
  );
932
1069
  recordEvidence({
933
1070
  timestamp: new Date().toISOString(),
@@ -938,7 +1075,7 @@ export async function evolve(
938
1075
  stage: "rejected",
939
1076
  rationale: lastProposal.rationale,
940
1077
  confidence: lastProposal.confidence,
941
- details: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1078
+ details: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
942
1079
  validation: {
943
1080
  improved: gateValidation.improved,
944
1081
  before_pass_rate: gateValidation.before_pass_rate,
@@ -955,7 +1092,7 @@ export async function evolve(
955
1092
  validation: lastValidation,
956
1093
  deployed: false,
957
1094
  auditEntries,
958
- reason: `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`,
1095
+ reason: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
959
1096
  gateValidation,
960
1097
  ...(baselineResult ? { baselineResult } : {}),
961
1098
  });
@@ -964,7 +1101,7 @@ export async function evolve(
964
1101
  recordAudit(
965
1102
  lastProposal.proposal_id,
966
1103
  "validated",
967
- `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
1104
+ `${gatePrefix} (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
968
1105
  );
969
1106
  }
970
1107
 
@@ -1082,7 +1219,7 @@ export async function cliMain(): Promise<void> {
1082
1219
  "dry-run": { type: "boolean", default: false },
1083
1220
  confidence: { type: "string", default: "0.6" },
1084
1221
  "max-iterations": { type: "string", default: "3" },
1085
- pareto: { type: "boolean", default: false },
1222
+ pareto: { type: "boolean", default: true },
1086
1223
  candidates: { type: "string", default: "3" },
1087
1224
  "token-efficiency": { type: "boolean", default: false },
1088
1225
  "with-baseline": { type: "boolean", default: false },
@@ -1090,7 +1227,9 @@ export async function cliMain(): Promise<void> {
1090
1227
  "cheap-loop": { type: "boolean", default: true },
1091
1228
  "full-model": { type: "boolean", default: false },
1092
1229
  "gate-model": { type: "string" },
1230
+ "gate-effort": { type: "string" },
1093
1231
  "proposal-model": { type: "string" },
1232
+ "adaptive-gate": { type: "boolean", default: false },
1094
1233
  "sync-first": { type: "boolean", default: false },
1095
1234
  "sync-force": { type: "boolean", default: false },
1096
1235
  verbose: { type: "boolean", default: false },
@@ -1121,6 +1260,8 @@ Options:
1121
1260
  --cheap-loop Use cheap models for loop, expensive for gate (default: on)
1122
1261
  --full-model Use same model for all stages (disables cheap-loop)
1123
1262
  --gate-model Model for final gate validation (default: sonnet)
1263
+ --gate-effort Thinking effort for final gate (low|medium|high|max)
1264
+ --adaptive-gate Escalate risky gate checks to opus + high effort
1124
1265
  --proposal-model Model for proposal generation LLM calls
1125
1266
  --sync-first Refresh source-truth telemetry before building evals/failure patterns
1126
1267
  --sync-force Force a full rescan during --sync-first
@@ -1143,6 +1284,24 @@ Options:
1143
1284
  "Add --sync-first when using --sync-force",
1144
1285
  );
1145
1286
  }
1287
+ if (values["gate-effort"] && !["low", "medium", "high", "max"].includes(values["gate-effort"])) {
1288
+ throw new CLIError(
1289
+ `Invalid --gate-effort value: ${values["gate-effort"]}`,
1290
+ "INVALID_FLAG",
1291
+ "Use one of: low, medium, high, max",
1292
+ );
1293
+ }
1294
+ if (
1295
+ (values["gate-effort"] || values["adaptive-gate"]) &&
1296
+ (values["full-model"] ?? false) &&
1297
+ !values["gate-model"]
1298
+ ) {
1299
+ throw new CLIError(
1300
+ "--gate-effort and --adaptive-gate require --gate-model when --full-model is set",
1301
+ "INVALID_FLAG",
1302
+ "Add --gate-model <model> or drop --full-model",
1303
+ );
1304
+ }
1146
1305
 
1147
1306
  const { detectAgent } = await import("../utils/llm-call.js");
1148
1307
  const requestedAgent = values.agent;
@@ -1223,6 +1382,8 @@ Options:
1223
1382
  console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
1224
1383
  console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
1225
1384
  console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
1385
+ console.error(`[verbose] Adaptive gate: ${values["adaptive-gate"] ?? false}`);
1386
+ console.error(`[verbose] Gate effort: ${values["gate-effort"] ?? "(default)"}`);
1226
1387
  }
1227
1388
 
1228
1389
  const result = await evolve({
@@ -1241,7 +1402,9 @@ Options:
1241
1402
  validationModel: values["validation-model"],
1242
1403
  cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
1243
1404
  gateModel: values["gate-model"],
1405
+ gateEffort: values["gate-effort"] as EffortLevel | undefined,
1244
1406
  proposalModel: values["proposal-model"],
1407
+ adaptiveGate: values["adaptive-gate"] ?? false,
1245
1408
  gradingResults,
1246
1409
  syncFirst: values["sync-first"] ?? false,
1247
1410
  syncForce: values["sync-force"] ?? false,
@@ -7,7 +7,7 @@
7
7
  */
8
8
 
9
9
  import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
10
- import { callLlm } from "../utils/llm-call.js";
10
+ import { callLlm, type EffortLevel } from "../utils/llm-call.js";
11
11
  import {
12
12
  buildBatchTriggerCheckPrompt,
13
13
  buildTriggerCheckPrompt,
@@ -52,6 +52,7 @@ export async function validateProposalSequential(
52
52
  evalSet: EvalEntry[],
53
53
  agent: string,
54
54
  modelFlag?: string,
55
+ effort?: EffortLevel,
55
56
  ): Promise<ValidationResult> {
56
57
  if (evalSet.length === 0) {
57
58
  return {
@@ -76,14 +77,14 @@ export async function validateProposalSequential(
76
77
  for (const entry of evalSet) {
77
78
  // Check with original description
78
79
  const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
79
- const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
80
+ const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag, effort);
80
81
  const beforeTriggered = parseTriggerResponse(beforeRaw);
81
82
  const beforePass =
82
83
  (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
83
84
 
84
85
  // Check with proposed description
85
86
  const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
86
- const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
87
+ const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag, effort);
87
88
  const afterTriggered = parseTriggerResponse(afterRaw);
88
89
  const afterPass =
89
90
  (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
@@ -208,6 +209,7 @@ export async function validateProposalBatched(
208
209
  evalSet: EvalEntry[],
209
210
  agent: string,
210
211
  modelFlag?: string,
212
+ effort?: EffortLevel,
211
213
  ): Promise<ValidationResult> {
212
214
  if (evalSet.length === 0) {
213
215
  return {
@@ -242,8 +244,8 @@ export async function validateProposalBatched(
242
244
  // Run VALIDATION_RUNS times in parallel and majority-vote to reduce LLM variance
243
245
  const allCalls: Promise<string>[] = [];
244
246
  for (let r = 0; r < VALIDATION_RUNS; r++) {
245
- allCalls.push(callLlm(systemPrompt, beforePrompt, agent, modelFlag));
246
- allCalls.push(callLlm(systemPrompt, afterPrompt, agent, modelFlag));
247
+ allCalls.push(callLlm(systemPrompt, beforePrompt, agent, modelFlag, effort));
248
+ allCalls.push(callLlm(systemPrompt, afterPrompt, agent, modelFlag, effort));
247
249
  }
248
250
  const allRaw = await Promise.all(allCalls);
249
251
 
@@ -353,6 +355,7 @@ export async function validateProposal(
353
355
  evalSet: EvalEntry[],
354
356
  agent: string,
355
357
  modelFlag?: string,
358
+ effort?: EffortLevel,
356
359
  ): Promise<ValidationResult> {
357
- return validateProposalBatched(proposal, evalSet, agent, modelFlag);
360
+ return validateProposalBatched(proposal, evalSet, agent, modelFlag, effort);
358
361
  }
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * Export SQLite data to JSONL format.
3
- * Replaces the removed JSONL write path -- use this when you need
4
- * JSONL files for debugging, the contribute workflow, or external tools.
3
+ * Use this only when you explicitly need portable/debuggable JSONL snapshots
4
+ * for recovery, the contribute workflow, or external tools.
5
5
  */
6
6
  import { mkdirSync, writeFileSync } from "node:fs";
7
7
  import { join } from "node:path";
@@ -26,6 +26,7 @@ import type {
26
26
  GradingExpectation,
27
27
  GradingResult,
28
28
  SessionTelemetryRecord,
29
+ SessionType,
29
30
  SkillUsageRecord,
30
31
  } from "../types.js";
31
32
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
@@ -420,6 +421,8 @@ export function buildExecutionMetrics(telemetry: SessionTelemetryRecord): Execut
420
421
  errors_encountered: telemetry.errors_encountered ?? 0,
421
422
  skills_triggered: telemetry.skills_triggered ?? [],
422
423
  transcript_chars: telemetry.transcript_chars ?? 0,
424
+ artifact_count: telemetry.artifact_count,
425
+ session_type: telemetry.session_type,
423
426
  };
424
427
  }
425
428
 
@@ -481,13 +484,30 @@ export function buildGradingPrompt(
481
484
  ? transcriptExcerpt.slice(0, MAX_TRANSCRIPT_LENGTH)
482
485
  : transcriptExcerpt;
483
486
 
487
+ const sessionType: SessionType = (telemetry.session_type as SessionType) ?? "mixed";
488
+ const SESSION_TYPE_CONTEXT: Record<SessionType, string> = {
489
+ dev: "This is a development session — code output and commits are expected productivity signals.",
490
+ research:
491
+ "This is a research session — information gathering and synthesis are the primary outputs, not code changes.",
492
+ content:
493
+ "This is a content/writing session — document creation is the primary output, not code commits.",
494
+ mixed:
495
+ "This is a mixed session — evaluate based on what was actually accomplished, not code-specific metrics.",
496
+ };
497
+ const sessionTypeContext = SESSION_TYPE_CONTEXT[sessionType] ?? SESSION_TYPE_CONTEXT.mixed;
498
+
484
499
  return `Skill: ${skillName}
485
500
 
501
+ === SESSION CONTEXT ===
502
+ Session type: ${sessionType}
503
+ ${sessionTypeContext}
504
+
486
505
  === PROCESS TELEMETRY ===
487
506
  Skills triggered: ${JSON.stringify(telemetry.skills_triggered ?? [])}
488
507
  Assistant turns: ${telemetry.assistant_turns ?? "?"}
489
508
  Errors: ${telemetry.errors_encountered ?? "?"}
490
509
  Total tool calls: ${telemetry.total_tool_calls ?? "?"}
510
+ Artifacts produced: ${telemetry.artifact_count ?? "?"}
491
511
 
492
512
  Tool breakdown:
493
513
  ${toolSummary}