selftune 0.2.15 → 0.2.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -19
- package/bin/run-hook.cjs +36 -0
- package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
- package/cli/selftune/alpha-upload/client.ts +51 -1
- package/cli/selftune/alpha-upload/flush.ts +46 -5
- package/cli/selftune/alpha-upload/stage-canonical.ts +25 -4
- package/cli/selftune/alpha-upload-contract.ts +9 -0
- package/cli/selftune/constants.ts +82 -5
- package/cli/selftune/contribute/sanitize.ts +52 -5
- package/cli/selftune/dashboard-contract.ts +100 -0
- package/cli/selftune/dashboard-server.ts +2 -2
- package/cli/selftune/evolution/description-quality.ts +12 -11
- package/cli/selftune/evolution/evolve.ts +238 -53
- package/cli/selftune/evolution/unblock-suggestions.ts +159 -0
- package/cli/selftune/evolution/validate-proposal.ts +9 -6
- package/cli/selftune/grading/grade-session.ts +20 -0
- package/cli/selftune/hooks/commit-track.ts +188 -0
- package/cli/selftune/hooks/prompt-log.ts +10 -1
- package/cli/selftune/hooks/session-stop.ts +2 -2
- package/cli/selftune/hooks/skill-eval.ts +15 -1
- package/cli/selftune/hooks/stdin-preview.ts +32 -0
- package/cli/selftune/init.ts +198 -27
- package/cli/selftune/localdb/direct-write.ts +69 -6
- package/cli/selftune/localdb/queries.ts +552 -7
- package/cli/selftune/localdb/schema.ts +46 -0
- package/cli/selftune/orchestrate.ts +32 -4
- package/cli/selftune/routes/overview.ts +41 -3
- package/cli/selftune/routes/skill-report.ts +88 -17
- package/cli/selftune/types.ts +32 -0
- package/cli/selftune/utils/hooks.ts +12 -2
- package/cli/selftune/utils/transcript.ts +210 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
- package/package.json +1 -1
- package/packages/telemetry-contract/src/types.ts +11 -0
- package/skill/SKILL.md +29 -1
- package/skill/Workflows/AutoActivation.md +1 -1
- package/skill/Workflows/Evolve.md +31 -13
- package/skill/Workflows/ExportCanonical.md +121 -0
- package/skill/Workflows/Hook.md +131 -0
- package/skill/Workflows/Initialize.md +9 -8
- package/skill/Workflows/Orchestrate.md +27 -5
- package/skill/Workflows/Quickstart.md +94 -0
- package/skill/Workflows/RepairSkillUsage.md +87 -0
- package/skill/Workflows/Uninstall.md +82 -0
- package/skill/settings_snippet.json +19 -8
|
@@ -38,6 +38,7 @@ import type {
|
|
|
38
38
|
} from "../types.js";
|
|
39
39
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
40
40
|
import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
|
|
41
|
+
import type { EffortLevel } from "../utils/llm-call.js";
|
|
41
42
|
import { createEvolveTUI } from "../utils/tui.js";
|
|
42
43
|
import { appendAuditEntry } from "./audit.js";
|
|
43
44
|
import { checkConstitution } from "./constitutional.js";
|
|
@@ -51,6 +52,8 @@ import {
|
|
|
51
52
|
selectFromFrontier,
|
|
52
53
|
} from "./pareto.js";
|
|
53
54
|
import { generateMultipleProposals, generateProposal } from "./propose-description.js";
|
|
55
|
+
import { evaluateStoppingCriteria } from "./stopping-criteria.js";
|
|
56
|
+
import { buildUnblockSuggestions } from "./unblock-suggestions.js";
|
|
54
57
|
import type { ValidationResult } from "./validate-proposal.js";
|
|
55
58
|
import {
|
|
56
59
|
TRIGGER_CHECK_BATCH_SIZE,
|
|
@@ -79,7 +82,9 @@ export interface EvolveOptions {
|
|
|
79
82
|
validationModel?: string;
|
|
80
83
|
cheapLoop?: boolean;
|
|
81
84
|
gateModel?: string;
|
|
85
|
+
gateEffort?: EffortLevel;
|
|
82
86
|
proposalModel?: string;
|
|
87
|
+
adaptiveGate?: boolean;
|
|
83
88
|
syncFirst?: boolean;
|
|
84
89
|
syncForce?: boolean;
|
|
85
90
|
}
|
|
@@ -173,6 +178,73 @@ function formatSimpleDiff(oldText: string, newText: string): string {
|
|
|
173
178
|
return output.join("\n");
|
|
174
179
|
}
|
|
175
180
|
|
|
181
|
+
/**
 * Estimate how many LLM calls one validation pass over the eval set performs.
 *
 * The eval set is processed in batches of `TRIGGER_CHECK_BATCH_SIZE`; every
 * batch is counted twice and the whole pass is repeated `VALIDATION_RUNS`
 * times.
 * NOTE(review): the factor of two presumably covers checking both the original
 * and the proposed description — confirm against the validator.
 *
 * @param evalSetSize Number of entries in the eval set.
 * @returns The estimated call count, or 0 for an empty (or non-positive) size.
 */
function countValidationLlmCalls(evalSetSize: number): number {
  // Guard non-positive sizes so the estimate can never be negative or -0.
  if (evalSetSize <= 0) return 0;

  const batchCount = Math.ceil(evalSetSize / TRIGGER_CHECK_BATCH_SIZE);
  return batchCount * 2 * VALIDATION_RUNS;
}
|
|
185
|
+
|
|
186
|
+
/**
 * Outcome of choosing how the final gate validation should be run.
 */
interface GateDecision {
  /** Model name handed to the gate validator. */
  model: string;
  /** Optional effort level handed to the gate validator alongside the model. */
  effort?: EffortLevel;
  /**
   * Human-readable `key=value` labels for each risk heuristic that fired
   * (for example `regressions=2`); empty when none fired or when the
   * adaptive gate is disabled.
   */
  riskSignals: string[];
}
|
|
191
|
+
|
|
192
|
+
/**
 * Count whitespace-delimited words in `text`.
 *
 * Leading/trailing whitespace and runs of consecutive whitespace are ignored,
 * so an empty or whitespace-only string yields 0.
 *
 * @param text Text to measure.
 * @returns Number of non-empty whitespace-separated tokens.
 */
function countWords(text: string): number {
  // Each maximal run of non-whitespace characters is one word; `match`
  // returns null when there is none.
  return text.match(/\S+/g)?.length ?? 0;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Decide which model and effort level the final gate validation should use.
 *
 * - Returns `undefined` when no gate model is configured.
 * - Without `options.adaptiveGate`, returns the configured model/effort with
 *   no risk signals.
 * - With `options.adaptiveGate`, evaluates a set of risk heuristics over the
 *   proposal and its validation result. If the proposal has regressions, very
 *   low lift, or at least two risk signals, the gate is escalated to the
 *   "opus" model with "high" effort (or "max" if that was requested).
 *   Otherwise the configured model/effort is kept and the collected signals
 *   are returned for reporting.
 *
 * @param options Evolve options; reads `gateModel`, `gateEffort`, `adaptiveGate`.
 * @param proposal Proposal under review; reads `proposed_description`, `confidence`.
 * @param validation Loop validation result; reads `net_change`, `regressions`,
 *   `after_pass_rate`.
 * @param currentDescription Description currently deployed, used as the
 *   baseline for word-growth checks.
 * @param confidenceThreshold Minimum confidence configured for the run.
 * @returns The gate decision, or `undefined` when no gate model is set.
 */
function resolveGateDecision(
  options: EvolveOptions,
  proposal: EvolutionProposal,
  validation: ValidationResult,
  currentDescription: string,
  confidenceThreshold: number,
): GateDecision | undefined {
  // Heuristic thresholds, named so the escalation policy reads in one place.
  const LOW_LIFT_THRESHOLD = 0.15;
  const ESCALATE_LIFT_THRESHOLD = 0.1;
  const CONFIDENCE_MARGIN = 0.05;
  const CONFIDENCE_FLOOR = 0.75;
  const MAX_WORD_GROWTH_RATIO = 1.8;
  const MAX_ADDED_WORDS = 32;
  const STRONG_PASS_RATE = 0.9;
  const MIN_SIGNALS_TO_ESCALATE = 2;

  const baseModel = options.gateModel;
  if (!baseModel) return undefined;

  const baseDecision: GateDecision = {
    model: baseModel,
    effort: options.gateEffort,
    riskSignals: [],
  };
  if (!options.adaptiveGate) return baseDecision;

  const originalWords = countWords(currentDescription);
  const proposedWords = countWords(proposal.proposed_description);
  // An empty original has no meaningful ratio; treat it as "no growth" and
  // rely on the absolute added-word check below.
  const wordGrowth = originalWords === 0 ? 1 : proposedWords / originalWords;

  const hasRegressions = validation.regressions.length > 0;
  const lowLift = validation.net_change < LOW_LIFT_THRESHOLD;
  const lowConfidence =
    proposal.confidence < Math.max(confidenceThreshold + CONFIDENCE_MARGIN, CONFIDENCE_FLOOR);
  const broadeningRisk =
    wordGrowth > MAX_WORD_GROWTH_RATIO || proposedWords - originalWords > MAX_ADDED_WORDS;
  const notYetStrong = validation.after_pass_rate < STRONG_PASS_RATE;

  const riskSignals: string[] = [];
  if (hasRegressions) riskSignals.push(`regressions=${validation.regressions.length}`);
  if (lowLift) riskSignals.push(`low_lift=${validation.net_change.toFixed(3)}`);
  if (lowConfidence) riskSignals.push(`confidence=${proposal.confidence.toFixed(2)}`);
  if (broadeningRisk) riskSignals.push(`word_growth=${wordGrowth.toFixed(2)}x`);
  if (notYetStrong) riskSignals.push(`after_pass_rate=${validation.after_pass_rate.toFixed(2)}`);

  const shouldEscalate =
    hasRegressions ||
    validation.net_change < ESCALATE_LIFT_THRESHOLD ||
    riskSignals.length >= MIN_SIGNALS_TO_ESCALATE;

  if (!shouldEscalate) {
    // Keep the configured gate but surface whatever signals fired.
    return { ...baseDecision, riskSignals };
  }

  return {
    model: "opus",
    effort: options.gateEffort === "max" ? "max" : "high",
    riskSignals,
  };
}
|
|
247
|
+
|
|
176
248
|
// ---------------------------------------------------------------------------
|
|
177
249
|
// Main orchestrator
|
|
178
250
|
// ---------------------------------------------------------------------------
|
|
@@ -455,7 +527,7 @@ export async function evolve(
|
|
|
455
527
|
// -----------------------------------------------------------------------
|
|
456
528
|
// Pareto multi-candidate path
|
|
457
529
|
// -----------------------------------------------------------------------
|
|
458
|
-
const paretoEnabled = options.paretoEnabled ??
|
|
530
|
+
const paretoEnabled = options.paretoEnabled ?? true;
|
|
459
531
|
const candidateCount = options.candidateCount ?? 3;
|
|
460
532
|
const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
|
|
461
533
|
const telemetryRecords =
|
|
@@ -493,6 +565,7 @@ export async function evolve(
|
|
|
493
565
|
options.proposalModel,
|
|
494
566
|
aggregateMetrics,
|
|
495
567
|
);
|
|
568
|
+
llmCallCount += candidateCount;
|
|
496
569
|
|
|
497
570
|
// Filter by confidence threshold
|
|
498
571
|
const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
|
|
@@ -563,6 +636,7 @@ export async function evolve(
|
|
|
563
636
|
agent,
|
|
564
637
|
options.validationModel,
|
|
565
638
|
);
|
|
639
|
+
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
566
640
|
recordAudit(
|
|
567
641
|
proposal.proposal_id,
|
|
568
642
|
"validated",
|
|
@@ -627,6 +701,7 @@ export async function evolve(
|
|
|
627
701
|
} else {
|
|
628
702
|
// Standard single-candidate retry loop
|
|
629
703
|
let feedbackReason = "";
|
|
704
|
+
const previousPassRates: number[] = [];
|
|
630
705
|
|
|
631
706
|
for (let iteration = 0; iteration < maxIterations; iteration++) {
|
|
632
707
|
iterationsCompleted = iteration + 1;
|
|
@@ -680,7 +755,24 @@ export async function evolve(
|
|
|
680
755
|
);
|
|
681
756
|
if (!constitution.passed) {
|
|
682
757
|
feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`;
|
|
683
|
-
|
|
758
|
+
// Re-evaluate stopping after a constitutional rejection by treating the
|
|
759
|
+
// last entry in previousPassRates as the currentPassRate (or 0 on the
|
|
760
|
+
// first iteration) and slicing it out of history before calling
|
|
761
|
+
// evaluateStoppingCriteria() with the current iteration/maxIterations,
|
|
762
|
+
// confidenceThreshold, and proposal.confidence.
|
|
763
|
+
const constitutionStop = evaluateStoppingCriteria(
|
|
764
|
+
previousPassRates.at(-1) ?? 0,
|
|
765
|
+
previousPassRates.slice(0, -1),
|
|
766
|
+
iteration + 1,
|
|
767
|
+
maxIterations,
|
|
768
|
+
confidenceThreshold,
|
|
769
|
+
proposal.confidence,
|
|
770
|
+
);
|
|
771
|
+
recordAudit(
|
|
772
|
+
proposal.proposal_id,
|
|
773
|
+
"rejected",
|
|
774
|
+
`${feedbackReason} (stopping: ${constitutionStop.reason})`,
|
|
775
|
+
);
|
|
684
776
|
recordEvidence({
|
|
685
777
|
timestamp: new Date().toISOString(),
|
|
686
778
|
proposal_id: proposal.proposal_id,
|
|
@@ -690,54 +782,64 @@ export async function evolve(
|
|
|
690
782
|
stage: "rejected",
|
|
691
783
|
rationale: proposal.rationale,
|
|
692
784
|
confidence: proposal.confidence,
|
|
693
|
-
details: feedbackReason
|
|
785
|
+
details: `${feedbackReason} (stopping: ${constitutionStop.reason})`,
|
|
694
786
|
});
|
|
695
|
-
if (
|
|
787
|
+
if (constitutionStop.shouldStop) {
|
|
696
788
|
finishTui();
|
|
697
789
|
return withStats({
|
|
698
790
|
proposal: lastProposal,
|
|
699
791
|
validation: null,
|
|
700
792
|
deployed: false,
|
|
701
793
|
auditEntries,
|
|
702
|
-
reason: feedbackReason
|
|
794
|
+
reason: `${feedbackReason} (${constitutionStop.reason})`,
|
|
703
795
|
});
|
|
704
796
|
}
|
|
705
797
|
continue;
|
|
706
798
|
}
|
|
707
799
|
|
|
708
|
-
// Step 9: Check confidence threshold
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
800
|
+
// Step 9: Check confidence threshold via stopping criteria
|
|
801
|
+
{
|
|
802
|
+
const preValidationStop = evaluateStoppingCriteria(
|
|
803
|
+
previousPassRates.at(-1) ?? 0,
|
|
804
|
+
previousPassRates.slice(0, -1),
|
|
805
|
+
iteration + 1,
|
|
806
|
+
maxIterations,
|
|
807
|
+
confidenceThreshold,
|
|
808
|
+
proposal.confidence,
|
|
715
809
|
);
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
validation: null,
|
|
734
|
-
deployed: false,
|
|
735
|
-
auditEntries,
|
|
736
|
-
reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
810
|
+
if (proposal.confidence < confidenceThreshold) {
|
|
811
|
+
feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
|
|
812
|
+
recordAudit(
|
|
813
|
+
proposal.proposal_id,
|
|
814
|
+
"rejected",
|
|
815
|
+
`${feedbackReason} (stopping: ${preValidationStop.reason})`,
|
|
816
|
+
);
|
|
817
|
+
recordEvidence({
|
|
818
|
+
timestamp: new Date().toISOString(),
|
|
819
|
+
proposal_id: proposal.proposal_id,
|
|
820
|
+
skill_name: skillName,
|
|
821
|
+
skill_path: skillPath,
|
|
822
|
+
target: "description",
|
|
823
|
+
stage: "rejected",
|
|
824
|
+
rationale: proposal.rationale,
|
|
825
|
+
confidence: proposal.confidence,
|
|
826
|
+
details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
|
|
737
827
|
});
|
|
738
|
-
}
|
|
739
828
|
|
|
740
|
-
|
|
829
|
+
// Use stopping criteria to decide whether to return or retry
|
|
830
|
+
if (preValidationStop.shouldStop) {
|
|
831
|
+
finishTui();
|
|
832
|
+
return withStats({
|
|
833
|
+
proposal: lastProposal,
|
|
834
|
+
validation: null,
|
|
835
|
+
deployed: false,
|
|
836
|
+
auditEntries,
|
|
837
|
+
reason: `${feedbackReason} (${preValidationStop.reason})`,
|
|
838
|
+
});
|
|
839
|
+
}
|
|
840
|
+
|
|
841
|
+
continue;
|
|
842
|
+
}
|
|
741
843
|
}
|
|
742
844
|
|
|
743
845
|
// Step 10: Validate against eval set
|
|
@@ -752,7 +854,7 @@ export async function evolve(
|
|
|
752
854
|
options.validationModel,
|
|
753
855
|
);
|
|
754
856
|
lastValidation = validation;
|
|
755
|
-
llmCallCount +=
|
|
857
|
+
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
756
858
|
tui.done(
|
|
757
859
|
`Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
|
|
758
860
|
);
|
|
@@ -791,13 +893,23 @@ export async function evolve(
|
|
|
791
893
|
},
|
|
792
894
|
});
|
|
793
895
|
|
|
794
|
-
// Step 12:
|
|
896
|
+
// Step 12: Evaluate stopping criteria after validation
|
|
897
|
+
const stopping = evaluateStoppingCriteria(
|
|
898
|
+
validation.after_pass_rate,
|
|
899
|
+
previousPassRates,
|
|
900
|
+
iteration + 1,
|
|
901
|
+
maxIterations,
|
|
902
|
+
confidenceThreshold,
|
|
903
|
+
proposal.confidence,
|
|
904
|
+
);
|
|
905
|
+
previousPassRates.push(validation.after_pass_rate);
|
|
906
|
+
|
|
795
907
|
if (!validation.improved) {
|
|
796
908
|
feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
|
|
797
909
|
recordAudit(
|
|
798
910
|
proposal.proposal_id,
|
|
799
911
|
"rejected",
|
|
800
|
-
`Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
912
|
+
`Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
|
|
801
913
|
);
|
|
802
914
|
recordEvidence({
|
|
803
915
|
timestamp: new Date().toISOString(),
|
|
@@ -808,7 +920,7 @@ export async function evolve(
|
|
|
808
920
|
stage: "rejected",
|
|
809
921
|
rationale: proposal.rationale,
|
|
810
922
|
confidence: proposal.confidence,
|
|
811
|
-
details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
923
|
+
details: `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
|
|
812
924
|
validation: {
|
|
813
925
|
improved: validation.improved,
|
|
814
926
|
before_pass_rate: validation.before_pass_rate,
|
|
@@ -820,21 +932,26 @@ export async function evolve(
|
|
|
820
932
|
},
|
|
821
933
|
});
|
|
822
934
|
|
|
823
|
-
//
|
|
824
|
-
if (
|
|
935
|
+
// Use stopping criteria to decide whether to return or retry
|
|
936
|
+
if (stopping.shouldStop) {
|
|
825
937
|
finishTui();
|
|
826
938
|
return withStats({
|
|
827
939
|
proposal: lastProposal,
|
|
828
940
|
validation: lastValidation,
|
|
829
941
|
deployed: false,
|
|
830
942
|
auditEntries,
|
|
831
|
-
reason: `Validation failed
|
|
943
|
+
reason: `Validation failed (${stopping.reason}): net_change=${validation.net_change.toFixed(3)}`,
|
|
832
944
|
});
|
|
833
945
|
}
|
|
834
946
|
|
|
835
947
|
continue;
|
|
836
948
|
}
|
|
837
949
|
|
|
950
|
+
// Validation passed — check if converged or continue
|
|
951
|
+
if (stopping.shouldStop && stopping.reason.includes("Converged")) {
|
|
952
|
+
recordAudit(proposal.proposal_id, "validated", `Stopping early: ${stopping.reason}`);
|
|
953
|
+
}
|
|
954
|
+
|
|
838
955
|
// Validation passed - break out of retry loop
|
|
839
956
|
break;
|
|
840
957
|
}
|
|
@@ -915,18 +1032,39 @@ export async function evolve(
|
|
|
915
1032
|
// -----------------------------------------------------------------------
|
|
916
1033
|
let gateValidation: ValidationResult | undefined;
|
|
917
1034
|
if (options.gateModel && lastProposal && lastValidation?.improved) {
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
1035
|
+
const gateDecision = resolveGateDecision(
|
|
1036
|
+
options,
|
|
1037
|
+
lastProposal,
|
|
1038
|
+
lastValidation,
|
|
1039
|
+
currentDescription,
|
|
1040
|
+
confidenceThreshold,
|
|
1041
|
+
);
|
|
1042
|
+
const gateLabel = gateDecision?.effort
|
|
1043
|
+
? `${gateDecision.model}, effort=${gateDecision.effort}`
|
|
1044
|
+
: (gateDecision?.model ?? options.gateModel);
|
|
1045
|
+
tui.step(`Gate validation (${gateLabel})...`);
|
|
1046
|
+
gateValidation = await _gateValidateProposal(
|
|
1047
|
+
lastProposal,
|
|
1048
|
+
evalSet,
|
|
1049
|
+
agent,
|
|
1050
|
+
gateDecision?.model ?? options.gateModel,
|
|
1051
|
+
gateDecision?.effort,
|
|
1052
|
+
);
|
|
1053
|
+
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
921
1054
|
tui.done(
|
|
922
|
-
`Gate (${
|
|
1055
|
+
`Gate (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
923
1056
|
);
|
|
924
1057
|
|
|
1058
|
+
const gatePrefix =
|
|
1059
|
+
gateDecision && gateDecision.riskSignals.length > 0
|
|
1060
|
+
? `Adaptive gate [${gateDecision.riskSignals.join(", ")}]`
|
|
1061
|
+
: "Gate validation";
|
|
1062
|
+
|
|
925
1063
|
if (!gateValidation.improved) {
|
|
926
1064
|
recordAudit(
|
|
927
1065
|
lastProposal.proposal_id,
|
|
928
1066
|
"rejected",
|
|
929
|
-
|
|
1067
|
+
`${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
930
1068
|
);
|
|
931
1069
|
recordEvidence({
|
|
932
1070
|
timestamp: new Date().toISOString(),
|
|
@@ -937,7 +1075,7 @@ export async function evolve(
|
|
|
937
1075
|
stage: "rejected",
|
|
938
1076
|
rationale: lastProposal.rationale,
|
|
939
1077
|
confidence: lastProposal.confidence,
|
|
940
|
-
details:
|
|
1078
|
+
details: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
941
1079
|
validation: {
|
|
942
1080
|
improved: gateValidation.improved,
|
|
943
1081
|
before_pass_rate: gateValidation.before_pass_rate,
|
|
@@ -954,7 +1092,7 @@ export async function evolve(
|
|
|
954
1092
|
validation: lastValidation,
|
|
955
1093
|
deployed: false,
|
|
956
1094
|
auditEntries,
|
|
957
|
-
reason:
|
|
1095
|
+
reason: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
958
1096
|
gateValidation,
|
|
959
1097
|
...(baselineResult ? { baselineResult } : {}),
|
|
960
1098
|
});
|
|
@@ -963,7 +1101,7 @@ export async function evolve(
|
|
|
963
1101
|
recordAudit(
|
|
964
1102
|
lastProposal.proposal_id,
|
|
965
1103
|
"validated",
|
|
966
|
-
|
|
1104
|
+
`${gatePrefix} (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
967
1105
|
);
|
|
968
1106
|
}
|
|
969
1107
|
|
|
@@ -1081,7 +1219,7 @@ export async function cliMain(): Promise<void> {
|
|
|
1081
1219
|
"dry-run": { type: "boolean", default: false },
|
|
1082
1220
|
confidence: { type: "string", default: "0.6" },
|
|
1083
1221
|
"max-iterations": { type: "string", default: "3" },
|
|
1084
|
-
pareto: { type: "boolean", default:
|
|
1222
|
+
pareto: { type: "boolean", default: true },
|
|
1085
1223
|
candidates: { type: "string", default: "3" },
|
|
1086
1224
|
"token-efficiency": { type: "boolean", default: false },
|
|
1087
1225
|
"with-baseline": { type: "boolean", default: false },
|
|
@@ -1089,7 +1227,9 @@ export async function cliMain(): Promise<void> {
|
|
|
1089
1227
|
"cheap-loop": { type: "boolean", default: true },
|
|
1090
1228
|
"full-model": { type: "boolean", default: false },
|
|
1091
1229
|
"gate-model": { type: "string" },
|
|
1230
|
+
"gate-effort": { type: "string" },
|
|
1092
1231
|
"proposal-model": { type: "string" },
|
|
1232
|
+
"adaptive-gate": { type: "boolean", default: false },
|
|
1093
1233
|
"sync-first": { type: "boolean", default: false },
|
|
1094
1234
|
"sync-force": { type: "boolean", default: false },
|
|
1095
1235
|
verbose: { type: "boolean", default: false },
|
|
@@ -1120,6 +1260,8 @@ Options:
|
|
|
1120
1260
|
--cheap-loop Use cheap models for loop, expensive for gate (default: on)
|
|
1121
1261
|
--full-model Use same model for all stages (disables cheap-loop)
|
|
1122
1262
|
--gate-model Model for final gate validation (default: sonnet)
|
|
1263
|
+
--gate-effort Thinking effort for final gate (low|medium|high|max)
|
|
1264
|
+
--adaptive-gate Escalate risky gate checks to opus + high effort
|
|
1123
1265
|
--proposal-model Model for proposal generation LLM calls
|
|
1124
1266
|
--sync-first Refresh source-truth telemetry before building evals/failure patterns
|
|
1125
1267
|
--sync-force Force a full rescan during --sync-first
|
|
@@ -1142,6 +1284,24 @@ Options:
|
|
|
1142
1284
|
"Add --sync-first when using --sync-force",
|
|
1143
1285
|
);
|
|
1144
1286
|
}
|
|
1287
|
+
if (values["gate-effort"] && !["low", "medium", "high", "max"].includes(values["gate-effort"])) {
|
|
1288
|
+
throw new CLIError(
|
|
1289
|
+
`Invalid --gate-effort value: ${values["gate-effort"]}`,
|
|
1290
|
+
"INVALID_FLAG",
|
|
1291
|
+
"Use one of: low, medium, high, max",
|
|
1292
|
+
);
|
|
1293
|
+
}
|
|
1294
|
+
if (
|
|
1295
|
+
(values["gate-effort"] || values["adaptive-gate"]) &&
|
|
1296
|
+
(values["full-model"] ?? false) &&
|
|
1297
|
+
!values["gate-model"]
|
|
1298
|
+
) {
|
|
1299
|
+
throw new CLIError(
|
|
1300
|
+
"--gate-effort and --adaptive-gate require --gate-model when --full-model is set",
|
|
1301
|
+
"INVALID_FLAG",
|
|
1302
|
+
"Add --gate-model <model> or drop --full-model",
|
|
1303
|
+
);
|
|
1304
|
+
}
|
|
1145
1305
|
|
|
1146
1306
|
const { detectAgent } = await import("../utils/llm-call.js");
|
|
1147
1307
|
const requestedAgent = values.agent;
|
|
@@ -1222,6 +1382,8 @@ Options:
|
|
|
1222
1382
|
console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
|
|
1223
1383
|
console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
|
|
1224
1384
|
console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
|
|
1385
|
+
console.error(`[verbose] Adaptive gate: ${values["adaptive-gate"] ?? false}`);
|
|
1386
|
+
console.error(`[verbose] Gate effort: ${values["gate-effort"] ?? "(default)"}`);
|
|
1225
1387
|
}
|
|
1226
1388
|
|
|
1227
1389
|
const result = await evolve({
|
|
@@ -1240,7 +1402,9 @@ Options:
|
|
|
1240
1402
|
validationModel: values["validation-model"],
|
|
1241
1403
|
cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
|
|
1242
1404
|
gateModel: values["gate-model"],
|
|
1405
|
+
gateEffort: values["gate-effort"] as EffortLevel | undefined,
|
|
1243
1406
|
proposalModel: values["proposal-model"],
|
|
1407
|
+
adaptiveGate: values["adaptive-gate"] ?? false,
|
|
1244
1408
|
gradingResults,
|
|
1245
1409
|
syncFirst: values["sync-first"] ?? false,
|
|
1246
1410
|
syncForce: values["sync-force"] ?? false,
|
|
@@ -1272,11 +1436,16 @@ Options:
|
|
|
1272
1436
|
...(result.descriptionQualityAfter != null
|
|
1273
1437
|
? { description_quality_after: result.descriptionQualityAfter }
|
|
1274
1438
|
: {}),
|
|
1439
|
+
...(!result.deployed
|
|
1440
|
+
? {
|
|
1441
|
+
suggestions: buildUnblockSuggestions(result, values.skill),
|
|
1442
|
+
}
|
|
1443
|
+
: {}),
|
|
1275
1444
|
};
|
|
1276
1445
|
console.log(JSON.stringify(summary, null, 2));
|
|
1277
1446
|
}
|
|
1278
1447
|
|
|
1279
|
-
// Print human-readable status to stderr so
|
|
1448
|
+
// Print human-readable status to stderr so agents always see outcome + next steps
|
|
1280
1449
|
if (!result.deployed) {
|
|
1281
1450
|
console.error(`\n[NOT DEPLOYED] ${result.reason}`);
|
|
1282
1451
|
if (result.validation && !result.validation.improved) {
|
|
@@ -1295,9 +1464,25 @@ Options:
|
|
|
1295
1464
|
` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
|
|
1296
1465
|
);
|
|
1297
1466
|
}
|
|
1298
|
-
|
|
1467
|
+
// Targeted suggestions based on specific failure reason
|
|
1468
|
+
const suggestions = buildUnblockSuggestions(result, values.skill);
|
|
1469
|
+
if (suggestions.length > 0) {
|
|
1470
|
+
console.error("\n Next steps:");
|
|
1471
|
+
for (const s of suggestions) {
|
|
1472
|
+
console.error(` → ${s}`);
|
|
1473
|
+
}
|
|
1474
|
+
}
|
|
1299
1475
|
} else {
|
|
1300
1476
|
console.error(`\n[DEPLOYED] ${result.reason}`);
|
|
1477
|
+
// Show quality improvement if available
|
|
1478
|
+
if (result.descriptionQualityBefore != null && result.descriptionQualityAfter != null) {
|
|
1479
|
+
const delta = result.descriptionQualityAfter - result.descriptionQualityBefore;
|
|
1480
|
+
if (delta !== 0) {
|
|
1481
|
+
console.error(
|
|
1482
|
+
` Description quality: ${Math.round(result.descriptionQualityBefore * 100)}% → ${Math.round(result.descriptionQualityAfter * 100)}% (${delta >= 0 ? "+" : ""}${Math.round(delta * 100)}%)`,
|
|
1483
|
+
);
|
|
1484
|
+
}
|
|
1485
|
+
}
|
|
1301
1486
|
}
|
|
1302
1487
|
|
|
1303
1488
|
process.exit(result.deployed ? 0 : 1);
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* unblock-suggestions.ts
|
|
3
|
+
*
|
|
4
|
+
* Generates targeted, per-failure-reason suggestions when evolve doesn't deploy.
|
|
5
|
+
* Each suggestion is a concrete next CLI command or manual action that helps the
|
|
6
|
+
* agent (or user) unblock the evolution pipeline.
|
|
7
|
+
*
|
|
8
|
+
* Pure function — no I/O, no LLM calls. Depends only on EvolveResult fields and
|
|
9
|
+
* the scoreDescription heuristic.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { scoreDescription } from "./description-quality.js";
|
|
13
|
+
import type { EvolveResult } from "./evolve.js";
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Quality hint helper
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
/**
 * Append one description-quality hint to `suggestions` when the heuristic
 * score reveals weak criteria.
 *
 * Does nothing when `descriptionText` is empty (no proposal was generated),
 * when the composite score is already >= 0.7 (to avoid noise on good
 * descriptions), or when no individual criterion falls below its cutoff.
 *
 * @param suggestions List to append to; mutated in place.
 * @param descriptionText Description to score.
 * @param skillName Skill name passed through to the scorer.
 */
function appendQualityHints(
  suggestions: string[],
  descriptionText: string,
  skillName: string,
): void {
  if (!descriptionText) return;

  const score = scoreDescription(descriptionText, skillName);
  if (score.composite >= 0.7) return;

  const { criteria } = score;
  // Each entry pairs "is this criterion weak?" with the advice to show.
  const checks: ReadonlyArray<readonly [boolean, string]> = [
    [criteria.trigger_context < 0.5, "add when/if/after trigger context"],
    [criteria.vagueness < 0.7, "remove vague words (various, general, etc)"],
    [criteria.specificity < 0.5, "add concrete action verbs"],
    [criteria.length < 0.7, "adjust length (ideal: 80-300 chars)"],
    [criteria.not_just_name < 0.5, "differentiate from skill name"],
  ];
  const weak = checks.filter(([isWeak]) => isWeak).map(([, hint]) => hint);
  if (weak.length === 0) return;

  suggestions.push(
    `Description quality: ${Math.round(score.composite * 100)}% — improve by: ${weak.join(", ")}`,
  );
}
|
|
46
|
+
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
// Main suggestion builder
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
/**
 * Generate targeted suggestions based on the specific failure reason.
 * Each suggestion is a concrete next CLI command or manual action.
 *
 * Classification is driven by `result.reason`. Stage-prefixed reasons
 * ("Constitutional…", "Validation failed…", gate failures) may carry an
 * appended stopping-criteria explanation, so those stages are recognised by
 * their leading prefix and are never re-classified by keywords that happen to
 * appear in the appended text.
 *
 * @param result Outcome of an evolve run; reads `reason`, `proposal`,
 *   `validation` and `descriptionQualityBefore`.
 * @param skillName Skill name interpolated into suggested commands.
 * @returns Ordered suggestions; empty when the reason is not recognised.
 */
export function buildUnblockSuggestions(result: EvolveResult, skillName: string): string[] {
  const reason = result.reason;
  const suggestions: string[] = [];
  const descText = result.proposal?.original_description ?? "";

  // Stage detection by prefix. The legacy substring forms are kept as
  // fallbacks so previously recognised reasons still match.
  const isConstitutional = reason.startsWith("Constitutional");
  const isValidationFailure =
    reason.startsWith("Validation failed") || reason.includes("Validation failed after");
  const isBaselineGateFailure = reason.includes("Baseline gate failed");
  // Adaptive gate failures are reported as "Adaptive gate [signals] failed (...)".
  const isGateFailure =
    reason.includes("Gate validation failed") ||
    (reason.startsWith("Adaptive gate") && reason.includes("failed"));
  const hasStagePrefix =
    isConstitutional || isValidationFailure || isBaselineGateFailure || isGateFailure;

  // --- Path/config failures ---
  if (reason.includes("SKILL.md not found")) {
    suggestions.push("Verify the --skill-path flag points to a valid SKILL.md file");
    suggestions.push("Run: selftune init (to re-bootstrap config if paths changed)");
    return suggestions;
  }

  if (reason.includes("Failed to load eval set") || reason.includes("not a JSON array")) {
    suggestions.push("Run: selftune sync (to rebuild source-truth telemetry)");
    suggestions.push(`Then: selftune evolve --skill ${skillName} (to retry with fresh evals)`);
    return suggestions;
  }

  // --- No signal failures ---
  if (reason.includes("No failure patterns found")) {
    suggestions.push("This skill may already be routing well — check: selftune status");
    suggestions.push("If undertriggering, add more sessions so evolve has signal to work with");
    if (result.descriptionQualityBefore != null && result.descriptionQualityBefore < 0.5) {
      suggestions.push(
        `Description quality is ${Math.round(result.descriptionQualityBefore * 100)}% — manually improving the description may help generate patterns`,
      );
      appendQualityHints(suggestions, descText, skillName);
    }
    return suggestions;
  }

  // --- Confidence failures (specific before general) ---
  if (reason.includes("No candidates met confidence")) {
    suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
    suggestions.push(
      `Or increase candidates: selftune evolve --skill ${skillName} --pareto --candidates 5`,
    );
    appendQualityHints(suggestions, descText, skillName);
    return suggestions;
  }
  // Skip the keyword match when the reason already names another stage; its
  // appended stopping text may mention confidence/threshold incidentally.
  if (
    !hasStagePrefix &&
    reason.toLowerCase().includes("confidence") &&
    reason.includes("threshold")
  ) {
    suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
    suggestions.push("Or add more eval entries so the LLM has more context for proposals");
    appendQualityHints(suggestions, descText, skillName);
    return suggestions;
  }

  // --- Validation failures (proposals regressed) ---
  if (isValidationFailure) {
    suggestions.push(
      `The eval set may be contradictory — review with: selftune evolve --skill ${skillName} --verbose`,
    );
    suggestions.push(
      `Try: selftune evolve --skill ${skillName} --pareto --candidates 5 (more diverse proposals)`,
    );
    if (result.validation && result.validation.regressions.length > 0) {
      suggestions.push(
        `${result.validation.regressions.length} regressions detected — check if negative eval entries are too broad`,
      );
    }
    appendQualityHints(suggestions, descText, skillName);
    return suggestions;
  }
  if (reason.includes("No Pareto candidates improved")) {
    suggestions.push("All candidates regressed — the eval set may need rebalancing");
    suggestions.push(`Try: selftune sync --force && selftune evolve --skill ${skillName}`);
    return suggestions;
  }

  // --- Gate failures ---
  if (isBaselineGateFailure) {
    suggestions.push("Improvement was too marginal to justify deployment");
    suggestions.push("Collect more session data, then retry — small gains compound over time");
    return suggestions;
  }
  if (isGateFailure) {
    suggestions.push("The gate model rejected the proposal — it may be too aggressive");
    suggestions.push(
      `Try: selftune evolve --skill ${skillName} --full-model (disables cheap-loop gate)`,
    );
    return suggestions;
  }

  // --- Constitutional rejection ---
  if (isConstitutional || reason.includes("Constitutional")) {
    suggestions.push("The proposed description violated safety constraints");
    suggestions.push("Review constitutional rules and manually adjust the description if needed");
    return suggestions;
  }

  // --- Dry run (not really a failure) ---
  if (reason.includes("Dry run")) {
    suggestions.push(`Deploy: selftune evolve --skill ${skillName} (remove --dry-run to deploy)`);
    return suggestions;
  }

  // --- Catch-all for unexpected errors ---
  if (reason.includes("Error during evolution")) {
    suggestions.push("Re-run with --verbose for full stack trace");
    suggestions.push("Run: selftune doctor (to check system health)");
    return suggestions;
  }

  return suggestions;
}
|