selftune 0.2.16 → 0.2.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -19
- package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
- package/cli/selftune/alpha-upload/client.ts +51 -1
- package/cli/selftune/alpha-upload/flush.ts +46 -5
- package/cli/selftune/alpha-upload/stage-canonical.ts +25 -4
- package/cli/selftune/alpha-upload-contract.ts +9 -0
- package/cli/selftune/constants.ts +82 -5
- package/cli/selftune/contribute/sanitize.ts +52 -5
- package/cli/selftune/dashboard-contract.ts +100 -0
- package/cli/selftune/dashboard-server.ts +2 -2
- package/cli/selftune/evolution/description-quality.ts +12 -11
- package/cli/selftune/evolution/evolve.ts +214 -51
- package/cli/selftune/evolution/validate-proposal.ts +9 -6
- package/cli/selftune/grading/grade-session.ts +20 -0
- package/cli/selftune/hooks/commit-track.ts +188 -0
- package/cli/selftune/hooks/prompt-log.ts +10 -1
- package/cli/selftune/hooks/session-stop.ts +2 -2
- package/cli/selftune/hooks/skill-eval.ts +15 -1
- package/cli/selftune/hooks/stdin-preview.ts +32 -0
- package/cli/selftune/localdb/direct-write.ts +69 -6
- package/cli/selftune/localdb/queries.ts +552 -7
- package/cli/selftune/localdb/schema.ts +46 -0
- package/cli/selftune/orchestrate.ts +32 -4
- package/cli/selftune/routes/overview.ts +41 -3
- package/cli/selftune/routes/skill-report.ts +88 -17
- package/cli/selftune/types.ts +31 -0
- package/cli/selftune/utils/transcript.ts +210 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
- package/package.json +1 -1
- package/packages/telemetry-contract/src/types.ts +11 -0
- package/skill/SKILL.md +29 -1
- package/skill/Workflows/Evolve.md +31 -13
- package/skill/Workflows/ExportCanonical.md +121 -0
- package/skill/Workflows/Hook.md +131 -0
- package/skill/Workflows/Initialize.md +9 -8
- package/skill/Workflows/Orchestrate.md +27 -5
- package/skill/Workflows/Quickstart.md +94 -0
- package/skill/Workflows/RepairSkillUsage.md +87 -0
- package/skill/Workflows/Uninstall.md +82 -0
- package/skill/settings_snippet.json +11 -0
|
@@ -1,3 +1,71 @@
|
|
|
1
|
+
// -- Cursor-based pagination types -------------------------------------------
|
|
2
|
+
|
|
3
|
+
export interface PaginationCursor {
|
|
4
|
+
timestamp: string;
|
|
5
|
+
id: number | string;
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
export interface PaginatedResult<T> {
|
|
9
|
+
items: T[];
|
|
10
|
+
next_cursor: PaginationCursor | null;
|
|
11
|
+
has_more: boolean;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
/** Parse a JSON cursor param from a URL search string. Returns null on invalid input. */
|
|
15
|
+
export function parseCursorParam(value: string | null | undefined): PaginationCursor | null {
|
|
16
|
+
if (!value) return null;
|
|
17
|
+
try {
|
|
18
|
+
const parsed: unknown = JSON.parse(value);
|
|
19
|
+
if (parsed && typeof parsed === "object" && "timestamp" in parsed && "id" in parsed) {
|
|
20
|
+
const { timestamp, id } = parsed as { timestamp: unknown; id: unknown };
|
|
21
|
+
if (
|
|
22
|
+
typeof timestamp === "string" &&
|
|
23
|
+
(typeof id === "string" || (typeof id === "number" && Number.isFinite(id)))
|
|
24
|
+
) {
|
|
25
|
+
return { timestamp, id };
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
} catch {
|
|
29
|
+
// Invalid cursor JSON — treat as no cursor
|
|
30
|
+
}
|
|
31
|
+
return null;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/** Parse an integer query param with bounds clamping. */
|
|
35
|
+
export function parseIntParam(value: string | null | undefined, defaultValue: number): number {
|
|
36
|
+
if (value == null) return defaultValue;
|
|
37
|
+
const n = Number.parseInt(value, 10);
|
|
38
|
+
return Number.isNaN(n) ? defaultValue : Math.max(1, Math.min(n, 10000));
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// -- Paginated overview payload (returned when cursor params are provided) ----
|
|
42
|
+
|
|
43
|
+
export interface OverviewPaginatedPayload {
|
|
44
|
+
telemetry_page: PaginatedResult<TelemetryRecord>;
|
|
45
|
+
skills_page: PaginatedResult<SkillUsageRecord>;
|
|
46
|
+
evolution: EvolutionEntry[];
|
|
47
|
+
counts: OverviewPayload["counts"];
|
|
48
|
+
unmatched_queries: UnmatchedQuery[];
|
|
49
|
+
pending_proposals: PendingProposal[];
|
|
50
|
+
active_sessions: number;
|
|
51
|
+
recent_activity: RecentActivityItem[];
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
export interface SkillReportPaginatedPayload extends Omit<
|
|
55
|
+
SkillReportPayload,
|
|
56
|
+
"recent_invocations"
|
|
57
|
+
> {
|
|
58
|
+
invocations_page: PaginatedResult<{
|
|
59
|
+
timestamp: string;
|
|
60
|
+
session_id: string;
|
|
61
|
+
query: string;
|
|
62
|
+
triggered: boolean;
|
|
63
|
+
source: string | null;
|
|
64
|
+
}>;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// -- Core record types -------------------------------------------------------
|
|
68
|
+
|
|
1
69
|
export interface TelemetryRecord {
|
|
2
70
|
timestamp: string;
|
|
3
71
|
session_id: string;
|
|
@@ -220,6 +288,36 @@ export interface HealthResponse {
|
|
|
220
288
|
// -- Doctor / health check types ----------------------------------------------
|
|
221
289
|
export type { DoctorResult, HealthCheck, HealthStatus } from "./types.js";
|
|
222
290
|
|
|
291
|
+
// -- Execution metrics (aggregated from execution_facts enrichment columns) ---
|
|
292
|
+
|
|
293
|
+
export interface ExecutionMetrics {
|
|
294
|
+
avg_files_changed: number;
|
|
295
|
+
total_lines_added: number;
|
|
296
|
+
total_lines_removed: number;
|
|
297
|
+
total_cost_usd: number;
|
|
298
|
+
avg_cost_usd: number;
|
|
299
|
+
cached_input_tokens_total: number;
|
|
300
|
+
reasoning_output_tokens_total: number;
|
|
301
|
+
artifact_count: number;
|
|
302
|
+
session_type_distribution: Record<string, number>;
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
// -- Commit summary (aggregated from commit_tracking table) -------------------
|
|
306
|
+
|
|
307
|
+
export interface CommitRecord {
|
|
308
|
+
commit_sha: string;
|
|
309
|
+
commit_title: string | null;
|
|
310
|
+
branch: string | null;
|
|
311
|
+
repo_remote: string | null;
|
|
312
|
+
timestamp: string;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
export interface CommitSummary {
|
|
316
|
+
total_commits: number;
|
|
317
|
+
unique_branches: number;
|
|
318
|
+
recent_commits: Array<{ sha: string; title: string; branch: string; timestamp: string }>;
|
|
319
|
+
}
|
|
320
|
+
|
|
223
321
|
export interface SkillReportResponse extends SkillReportPayload {
|
|
224
322
|
evolution: EvolutionEntry[];
|
|
225
323
|
pending_proposals: PendingProposal[];
|
|
@@ -242,6 +340,8 @@ export interface SkillReportResponse extends SkillReportPayload {
|
|
|
242
340
|
};
|
|
243
341
|
prompt_samples: PromptSample[];
|
|
244
342
|
session_metadata: SessionMeta[];
|
|
343
|
+
execution_metrics?: ExecutionMetrics | null;
|
|
344
|
+
commit_summary?: CommitSummary | null;
|
|
245
345
|
description_quality?: {
|
|
246
346
|
composite: number;
|
|
247
347
|
criteria: {
|
|
@@ -448,7 +448,7 @@ export async function startDashboardServer(
|
|
|
448
448
|
);
|
|
449
449
|
}
|
|
450
450
|
refreshV2Data();
|
|
451
|
-
return withCors(handleOverview(db, selftuneVersion));
|
|
451
|
+
return withCors(handleOverview(db, selftuneVersion, url.searchParams));
|
|
452
452
|
}
|
|
453
453
|
|
|
454
454
|
// ---- GET /api/v2/orchestrate-runs ----
|
|
@@ -495,7 +495,7 @@ export async function startDashboardServer(
|
|
|
495
495
|
);
|
|
496
496
|
}
|
|
497
497
|
refreshV2Data();
|
|
498
|
-
return withCors(handleSkillReport(db, skillName));
|
|
498
|
+
return withCors(handleSkillReport(db, skillName, url.searchParams));
|
|
499
499
|
}
|
|
500
500
|
|
|
501
501
|
// ---- SPA fallback ----
|
|
@@ -139,27 +139,27 @@ export function scoreLengthCriterion(description: string): number {
|
|
|
139
139
|
}
|
|
140
140
|
|
|
141
141
|
/** Score presence of trigger context words (when/if/before/after etc). */
|
|
142
|
-
export function scoreTriggerContextCriterion(description: string): number {
|
|
143
|
-
const matches = countWordMatches(description.toLowerCase(), TRIGGER_PATTERNS);
|
|
142
|
+
export function scoreTriggerContextCriterion(description: string, lower?: string): number {
|
|
143
|
+
const matches = countWordMatches(lower ?? description.toLowerCase(), TRIGGER_PATTERNS);
|
|
144
144
|
if (matches === 0) return 0.0;
|
|
145
145
|
if (matches === 1) return 0.7;
|
|
146
146
|
return Math.min(1.0, 0.7 + 0.15 * (matches - 1));
|
|
147
147
|
}
|
|
148
148
|
|
|
149
149
|
/** Score absence of vague words (lower is worse). */
|
|
150
|
-
export function scoreVaguenessCriterion(description: string): number {
|
|
151
|
-
const matches = countWordMatches(description.toLowerCase(), VAGUE_PATTERNS);
|
|
150
|
+
export function scoreVaguenessCriterion(description: string, lower?: string): number {
|
|
151
|
+
const matches = countWordMatches(lower ?? description.toLowerCase(), VAGUE_PATTERNS);
|
|
152
152
|
if (matches === 0) return 1.0;
|
|
153
153
|
if (matches === 1) return 0.6;
|
|
154
154
|
return Math.max(0.1, 0.6 - 0.15 * (matches - 1));
|
|
155
155
|
}
|
|
156
156
|
|
|
157
157
|
/** Score whether description specifies at least one concrete action or domain. */
|
|
158
|
-
export function scoreSpecificityCriterion(description: string): number {
|
|
159
|
-
const
|
|
160
|
-
const hasAction = ACTION_PATTERNS.some((p) => p.test(
|
|
158
|
+
export function scoreSpecificityCriterion(description: string, lower?: string): number {
|
|
159
|
+
const l = lower ?? description.toLowerCase();
|
|
160
|
+
const hasAction = ACTION_PATTERNS.some((p) => p.test(l));
|
|
161
161
|
|
|
162
|
-
const fillerCount = FILLER_PHRASES.filter((f) =>
|
|
162
|
+
const fillerCount = FILLER_PHRASES.filter((f) => l.includes(f)).length;
|
|
163
163
|
const words = description.split(/\s+/).length;
|
|
164
164
|
const fillerRatio = fillerCount > 0 ? fillerCount / Math.max(1, words / 10) : 0;
|
|
165
165
|
|
|
@@ -204,11 +204,12 @@ const WEIGHTS = {
|
|
|
204
204
|
* Pure function — no I/O, no LLM calls.
|
|
205
205
|
*/
|
|
206
206
|
export function scoreDescription(description: string, skillName?: string): DescriptionQualityScore {
|
|
207
|
+
const lower = description.toLowerCase();
|
|
207
208
|
const criteria = {
|
|
208
209
|
length: scoreLengthCriterion(description),
|
|
209
|
-
trigger_context: scoreTriggerContextCriterion(description),
|
|
210
|
-
vagueness: scoreVaguenessCriterion(description),
|
|
211
|
-
specificity: scoreSpecificityCriterion(description),
|
|
210
|
+
trigger_context: scoreTriggerContextCriterion(description, lower),
|
|
211
|
+
vagueness: scoreVaguenessCriterion(description, lower),
|
|
212
|
+
specificity: scoreSpecificityCriterion(description, lower),
|
|
212
213
|
not_just_name: scoreNotJustNameCriterion(description, skillName),
|
|
213
214
|
};
|
|
214
215
|
|
|
@@ -38,6 +38,7 @@ import type {
|
|
|
38
38
|
} from "../types.js";
|
|
39
39
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
40
40
|
import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
|
|
41
|
+
import type { EffortLevel } from "../utils/llm-call.js";
|
|
41
42
|
import { createEvolveTUI } from "../utils/tui.js";
|
|
42
43
|
import { appendAuditEntry } from "./audit.js";
|
|
43
44
|
import { checkConstitution } from "./constitutional.js";
|
|
@@ -51,6 +52,7 @@ import {
|
|
|
51
52
|
selectFromFrontier,
|
|
52
53
|
} from "./pareto.js";
|
|
53
54
|
import { generateMultipleProposals, generateProposal } from "./propose-description.js";
|
|
55
|
+
import { evaluateStoppingCriteria } from "./stopping-criteria.js";
|
|
54
56
|
import { buildUnblockSuggestions } from "./unblock-suggestions.js";
|
|
55
57
|
import type { ValidationResult } from "./validate-proposal.js";
|
|
56
58
|
import {
|
|
@@ -80,7 +82,9 @@ export interface EvolveOptions {
|
|
|
80
82
|
validationModel?: string;
|
|
81
83
|
cheapLoop?: boolean;
|
|
82
84
|
gateModel?: string;
|
|
85
|
+
gateEffort?: EffortLevel;
|
|
83
86
|
proposalModel?: string;
|
|
87
|
+
adaptiveGate?: boolean;
|
|
84
88
|
syncFirst?: boolean;
|
|
85
89
|
syncForce?: boolean;
|
|
86
90
|
}
|
|
@@ -174,6 +178,73 @@ function formatSimpleDiff(oldText: string, newText: string): string {
|
|
|
174
178
|
return output.join("\n");
|
|
175
179
|
}
|
|
176
180
|
|
|
181
|
+
function countValidationLlmCalls(evalSetSize: number): number {
|
|
182
|
+
if (evalSetSize === 0) return 0;
|
|
183
|
+
return Math.ceil(evalSetSize / TRIGGER_CHECK_BATCH_SIZE) * 2 * VALIDATION_RUNS;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
interface GateDecision {
|
|
187
|
+
model: string;
|
|
188
|
+
effort?: EffortLevel;
|
|
189
|
+
riskSignals: string[];
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
function countWords(text: string): number {
|
|
193
|
+
return text
|
|
194
|
+
.trim()
|
|
195
|
+
.split(/\s+/)
|
|
196
|
+
.filter((token) => token.length > 0).length;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
function resolveGateDecision(
|
|
200
|
+
options: EvolveOptions,
|
|
201
|
+
proposal: EvolutionProposal,
|
|
202
|
+
validation: ValidationResult,
|
|
203
|
+
currentDescription: string,
|
|
204
|
+
confidenceThreshold: number,
|
|
205
|
+
): GateDecision | undefined {
|
|
206
|
+
const baseModel = options.gateModel;
|
|
207
|
+
if (!baseModel) return undefined;
|
|
208
|
+
|
|
209
|
+
const baseDecision: GateDecision = {
|
|
210
|
+
model: baseModel,
|
|
211
|
+
effort: options.gateEffort,
|
|
212
|
+
riskSignals: [],
|
|
213
|
+
};
|
|
214
|
+
|
|
215
|
+
if (!options.adaptiveGate) return baseDecision;
|
|
216
|
+
|
|
217
|
+
const riskSignals: string[] = [];
|
|
218
|
+
const originalWords = countWords(currentDescription);
|
|
219
|
+
const proposedWords = countWords(proposal.proposed_description);
|
|
220
|
+
const wordGrowth = originalWords === 0 ? 1 : proposedWords / originalWords;
|
|
221
|
+
const lowLift = validation.net_change < 0.15;
|
|
222
|
+
const hasRegressions = validation.regressions.length > 0;
|
|
223
|
+
const lowConfidence = proposal.confidence < Math.max(confidenceThreshold + 0.05, 0.75);
|
|
224
|
+
const broadeningRisk = wordGrowth > 1.8 || proposedWords - originalWords > 32;
|
|
225
|
+
const notYetStrong = validation.after_pass_rate < 0.9;
|
|
226
|
+
|
|
227
|
+
if (hasRegressions) riskSignals.push(`regressions=${validation.regressions.length}`);
|
|
228
|
+
if (lowLift) riskSignals.push(`low_lift=${validation.net_change.toFixed(3)}`);
|
|
229
|
+
if (lowConfidence) riskSignals.push(`confidence=${proposal.confidence.toFixed(2)}`);
|
|
230
|
+
if (broadeningRisk) riskSignals.push(`word_growth=${wordGrowth.toFixed(2)}x`);
|
|
231
|
+
if (notYetStrong) riskSignals.push(`after_pass_rate=${validation.after_pass_rate.toFixed(2)}`);
|
|
232
|
+
|
|
233
|
+
const shouldEscalate = hasRegressions || validation.net_change < 0.1 || riskSignals.length >= 2;
|
|
234
|
+
if (!shouldEscalate) {
|
|
235
|
+
return {
|
|
236
|
+
...baseDecision,
|
|
237
|
+
riskSignals,
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
return {
|
|
242
|
+
model: "opus",
|
|
243
|
+
effort: options.gateEffort === "max" ? "max" : "high",
|
|
244
|
+
riskSignals,
|
|
245
|
+
};
|
|
246
|
+
}
|
|
247
|
+
|
|
177
248
|
// ---------------------------------------------------------------------------
|
|
178
249
|
// Main orchestrator
|
|
179
250
|
// ---------------------------------------------------------------------------
|
|
@@ -456,7 +527,7 @@ export async function evolve(
|
|
|
456
527
|
// -----------------------------------------------------------------------
|
|
457
528
|
// Pareto multi-candidate path
|
|
458
529
|
// -----------------------------------------------------------------------
|
|
459
|
-
const paretoEnabled = options.paretoEnabled ??
|
|
530
|
+
const paretoEnabled = options.paretoEnabled ?? true;
|
|
460
531
|
const candidateCount = options.candidateCount ?? 3;
|
|
461
532
|
const tokenEfficiencyEnabled = options.tokenEfficiencyEnabled ?? false;
|
|
462
533
|
const telemetryRecords =
|
|
@@ -494,6 +565,7 @@ export async function evolve(
|
|
|
494
565
|
options.proposalModel,
|
|
495
566
|
aggregateMetrics,
|
|
496
567
|
);
|
|
568
|
+
llmCallCount += candidateCount;
|
|
497
569
|
|
|
498
570
|
// Filter by confidence threshold
|
|
499
571
|
const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
|
|
@@ -564,6 +636,7 @@ export async function evolve(
|
|
|
564
636
|
agent,
|
|
565
637
|
options.validationModel,
|
|
566
638
|
);
|
|
639
|
+
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
567
640
|
recordAudit(
|
|
568
641
|
proposal.proposal_id,
|
|
569
642
|
"validated",
|
|
@@ -628,6 +701,7 @@ export async function evolve(
|
|
|
628
701
|
} else {
|
|
629
702
|
// Standard single-candidate retry loop
|
|
630
703
|
let feedbackReason = "";
|
|
704
|
+
const previousPassRates: number[] = [];
|
|
631
705
|
|
|
632
706
|
for (let iteration = 0; iteration < maxIterations; iteration++) {
|
|
633
707
|
iterationsCompleted = iteration + 1;
|
|
@@ -681,7 +755,24 @@ export async function evolve(
|
|
|
681
755
|
);
|
|
682
756
|
if (!constitution.passed) {
|
|
683
757
|
feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`;
|
|
684
|
-
|
|
758
|
+
// Re-evaluate stopping after a constitutional rejection by treating the
|
|
759
|
+
// last entry in previousPassRates as the currentPassRate (or 0 on the
|
|
760
|
+
// first iteration) and slicing it out of history before calling
|
|
761
|
+
// evaluateStoppingCriteria() with the current iteration/maxIterations,
|
|
762
|
+
// confidenceThreshold, and proposal.confidence.
|
|
763
|
+
const constitutionStop = evaluateStoppingCriteria(
|
|
764
|
+
previousPassRates.at(-1) ?? 0,
|
|
765
|
+
previousPassRates.slice(0, -1),
|
|
766
|
+
iteration + 1,
|
|
767
|
+
maxIterations,
|
|
768
|
+
confidenceThreshold,
|
|
769
|
+
proposal.confidence,
|
|
770
|
+
);
|
|
771
|
+
recordAudit(
|
|
772
|
+
proposal.proposal_id,
|
|
773
|
+
"rejected",
|
|
774
|
+
`${feedbackReason} (stopping: ${constitutionStop.reason})`,
|
|
775
|
+
);
|
|
685
776
|
recordEvidence({
|
|
686
777
|
timestamp: new Date().toISOString(),
|
|
687
778
|
proposal_id: proposal.proposal_id,
|
|
@@ -691,54 +782,64 @@ export async function evolve(
|
|
|
691
782
|
stage: "rejected",
|
|
692
783
|
rationale: proposal.rationale,
|
|
693
784
|
confidence: proposal.confidence,
|
|
694
|
-
details: feedbackReason
|
|
785
|
+
details: `${feedbackReason} (stopping: ${constitutionStop.reason})`,
|
|
695
786
|
});
|
|
696
|
-
if (
|
|
787
|
+
if (constitutionStop.shouldStop) {
|
|
697
788
|
finishTui();
|
|
698
789
|
return withStats({
|
|
699
790
|
proposal: lastProposal,
|
|
700
791
|
validation: null,
|
|
701
792
|
deployed: false,
|
|
702
793
|
auditEntries,
|
|
703
|
-
reason: feedbackReason
|
|
794
|
+
reason: `${feedbackReason} (${constitutionStop.reason})`,
|
|
704
795
|
});
|
|
705
796
|
}
|
|
706
797
|
continue;
|
|
707
798
|
}
|
|
708
799
|
|
|
709
|
-
// Step 9: Check confidence threshold
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
800
|
+
// Step 9: Check confidence threshold via stopping criteria
|
|
801
|
+
{
|
|
802
|
+
const preValidationStop = evaluateStoppingCriteria(
|
|
803
|
+
previousPassRates.at(-1) ?? 0,
|
|
804
|
+
previousPassRates.slice(0, -1),
|
|
805
|
+
iteration + 1,
|
|
806
|
+
maxIterations,
|
|
807
|
+
confidenceThreshold,
|
|
808
|
+
proposal.confidence,
|
|
716
809
|
);
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
validation: null,
|
|
735
|
-
deployed: false,
|
|
736
|
-
auditEntries,
|
|
737
|
-
reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
|
|
810
|
+
if (proposal.confidence < confidenceThreshold) {
|
|
811
|
+
feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
|
|
812
|
+
recordAudit(
|
|
813
|
+
proposal.proposal_id,
|
|
814
|
+
"rejected",
|
|
815
|
+
`${feedbackReason} (stopping: ${preValidationStop.reason})`,
|
|
816
|
+
);
|
|
817
|
+
recordEvidence({
|
|
818
|
+
timestamp: new Date().toISOString(),
|
|
819
|
+
proposal_id: proposal.proposal_id,
|
|
820
|
+
skill_name: skillName,
|
|
821
|
+
skill_path: skillPath,
|
|
822
|
+
target: "description",
|
|
823
|
+
stage: "rejected",
|
|
824
|
+
rationale: proposal.rationale,
|
|
825
|
+
confidence: proposal.confidence,
|
|
826
|
+
details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
|
|
738
827
|
});
|
|
739
|
-
}
|
|
740
828
|
|
|
741
|
-
|
|
829
|
+
// Use stopping criteria to decide whether to return or retry
|
|
830
|
+
if (preValidationStop.shouldStop) {
|
|
831
|
+
finishTui();
|
|
832
|
+
return withStats({
|
|
833
|
+
proposal: lastProposal,
|
|
834
|
+
validation: null,
|
|
835
|
+
deployed: false,
|
|
836
|
+
auditEntries,
|
|
837
|
+
reason: `${feedbackReason} (${preValidationStop.reason})`,
|
|
838
|
+
});
|
|
839
|
+
}
|
|
840
|
+
|
|
841
|
+
continue;
|
|
842
|
+
}
|
|
742
843
|
}
|
|
743
844
|
|
|
744
845
|
// Step 10: Validate against eval set
|
|
@@ -753,7 +854,7 @@ export async function evolve(
|
|
|
753
854
|
options.validationModel,
|
|
754
855
|
);
|
|
755
856
|
lastValidation = validation;
|
|
756
|
-
llmCallCount +=
|
|
857
|
+
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
757
858
|
tui.done(
|
|
758
859
|
`Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
|
|
759
860
|
);
|
|
@@ -792,13 +893,23 @@ export async function evolve(
|
|
|
792
893
|
},
|
|
793
894
|
});
|
|
794
895
|
|
|
795
|
-
// Step 12:
|
|
896
|
+
// Step 12: Evaluate stopping criteria after validation
|
|
897
|
+
const stopping = evaluateStoppingCriteria(
|
|
898
|
+
validation.after_pass_rate,
|
|
899
|
+
previousPassRates,
|
|
900
|
+
iteration + 1,
|
|
901
|
+
maxIterations,
|
|
902
|
+
confidenceThreshold,
|
|
903
|
+
proposal.confidence,
|
|
904
|
+
);
|
|
905
|
+
previousPassRates.push(validation.after_pass_rate);
|
|
906
|
+
|
|
796
907
|
if (!validation.improved) {
|
|
797
908
|
feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
|
|
798
909
|
recordAudit(
|
|
799
910
|
proposal.proposal_id,
|
|
800
911
|
"rejected",
|
|
801
|
-
`Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
912
|
+
`Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
|
|
802
913
|
);
|
|
803
914
|
recordEvidence({
|
|
804
915
|
timestamp: new Date().toISOString(),
|
|
@@ -809,7 +920,7 @@ export async function evolve(
|
|
|
809
920
|
stage: "rejected",
|
|
810
921
|
rationale: proposal.rationale,
|
|
811
922
|
confidence: proposal.confidence,
|
|
812
|
-
details: `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
|
|
923
|
+
details: `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
|
|
813
924
|
validation: {
|
|
814
925
|
improved: validation.improved,
|
|
815
926
|
before_pass_rate: validation.before_pass_rate,
|
|
@@ -821,21 +932,26 @@ export async function evolve(
|
|
|
821
932
|
},
|
|
822
933
|
});
|
|
823
934
|
|
|
824
|
-
//
|
|
825
|
-
if (
|
|
935
|
+
// Use stopping criteria to decide whether to return or retry
|
|
936
|
+
if (stopping.shouldStop) {
|
|
826
937
|
finishTui();
|
|
827
938
|
return withStats({
|
|
828
939
|
proposal: lastProposal,
|
|
829
940
|
validation: lastValidation,
|
|
830
941
|
deployed: false,
|
|
831
942
|
auditEntries,
|
|
832
|
-
reason: `Validation failed
|
|
943
|
+
reason: `Validation failed (${stopping.reason}): net_change=${validation.net_change.toFixed(3)}`,
|
|
833
944
|
});
|
|
834
945
|
}
|
|
835
946
|
|
|
836
947
|
continue;
|
|
837
948
|
}
|
|
838
949
|
|
|
950
|
+
// Validation passed — check if converged or continue
|
|
951
|
+
if (stopping.shouldStop && stopping.reason.includes("Converged")) {
|
|
952
|
+
recordAudit(proposal.proposal_id, "validated", `Stopping early: ${stopping.reason}`);
|
|
953
|
+
}
|
|
954
|
+
|
|
839
955
|
// Validation passed - break out of retry loop
|
|
840
956
|
break;
|
|
841
957
|
}
|
|
@@ -916,18 +1032,39 @@ export async function evolve(
|
|
|
916
1032
|
// -----------------------------------------------------------------------
|
|
917
1033
|
let gateValidation: ValidationResult | undefined;
|
|
918
1034
|
if (options.gateModel && lastProposal && lastValidation?.improved) {
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
1035
|
+
const gateDecision = resolveGateDecision(
|
|
1036
|
+
options,
|
|
1037
|
+
lastProposal,
|
|
1038
|
+
lastValidation,
|
|
1039
|
+
currentDescription,
|
|
1040
|
+
confidenceThreshold,
|
|
1041
|
+
);
|
|
1042
|
+
const gateLabel = gateDecision?.effort
|
|
1043
|
+
? `${gateDecision.model}, effort=${gateDecision.effort}`
|
|
1044
|
+
: (gateDecision?.model ?? options.gateModel);
|
|
1045
|
+
tui.step(`Gate validation (${gateLabel})...`);
|
|
1046
|
+
gateValidation = await _gateValidateProposal(
|
|
1047
|
+
lastProposal,
|
|
1048
|
+
evalSet,
|
|
1049
|
+
agent,
|
|
1050
|
+
gateDecision?.model ?? options.gateModel,
|
|
1051
|
+
gateDecision?.effort,
|
|
1052
|
+
);
|
|
1053
|
+
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
922
1054
|
tui.done(
|
|
923
|
-
`Gate (${
|
|
1055
|
+
`Gate (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
924
1056
|
);
|
|
925
1057
|
|
|
1058
|
+
const gatePrefix =
|
|
1059
|
+
gateDecision && gateDecision.riskSignals.length > 0
|
|
1060
|
+
? `Adaptive gate [${gateDecision.riskSignals.join(", ")}]`
|
|
1061
|
+
: "Gate validation";
|
|
1062
|
+
|
|
926
1063
|
if (!gateValidation.improved) {
|
|
927
1064
|
recordAudit(
|
|
928
1065
|
lastProposal.proposal_id,
|
|
929
1066
|
"rejected",
|
|
930
|
-
|
|
1067
|
+
`${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
931
1068
|
);
|
|
932
1069
|
recordEvidence({
|
|
933
1070
|
timestamp: new Date().toISOString(),
|
|
@@ -938,7 +1075,7 @@ export async function evolve(
|
|
|
938
1075
|
stage: "rejected",
|
|
939
1076
|
rationale: lastProposal.rationale,
|
|
940
1077
|
confidence: lastProposal.confidence,
|
|
941
|
-
details:
|
|
1078
|
+
details: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
942
1079
|
validation: {
|
|
943
1080
|
improved: gateValidation.improved,
|
|
944
1081
|
before_pass_rate: gateValidation.before_pass_rate,
|
|
@@ -955,7 +1092,7 @@ export async function evolve(
|
|
|
955
1092
|
validation: lastValidation,
|
|
956
1093
|
deployed: false,
|
|
957
1094
|
auditEntries,
|
|
958
|
-
reason:
|
|
1095
|
+
reason: `${gatePrefix} failed (${gateLabel}): net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
959
1096
|
gateValidation,
|
|
960
1097
|
...(baselineResult ? { baselineResult } : {}),
|
|
961
1098
|
});
|
|
@@ -964,7 +1101,7 @@ export async function evolve(
|
|
|
964
1101
|
recordAudit(
|
|
965
1102
|
lastProposal.proposal_id,
|
|
966
1103
|
"validated",
|
|
967
|
-
|
|
1104
|
+
`${gatePrefix} (${gateLabel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`,
|
|
968
1105
|
);
|
|
969
1106
|
}
|
|
970
1107
|
|
|
@@ -1082,7 +1219,7 @@ export async function cliMain(): Promise<void> {
|
|
|
1082
1219
|
"dry-run": { type: "boolean", default: false },
|
|
1083
1220
|
confidence: { type: "string", default: "0.6" },
|
|
1084
1221
|
"max-iterations": { type: "string", default: "3" },
|
|
1085
|
-
pareto: { type: "boolean", default:
|
|
1222
|
+
pareto: { type: "boolean", default: true },
|
|
1086
1223
|
candidates: { type: "string", default: "3" },
|
|
1087
1224
|
"token-efficiency": { type: "boolean", default: false },
|
|
1088
1225
|
"with-baseline": { type: "boolean", default: false },
|
|
@@ -1090,7 +1227,9 @@ export async function cliMain(): Promise<void> {
|
|
|
1090
1227
|
"cheap-loop": { type: "boolean", default: true },
|
|
1091
1228
|
"full-model": { type: "boolean", default: false },
|
|
1092
1229
|
"gate-model": { type: "string" },
|
|
1230
|
+
"gate-effort": { type: "string" },
|
|
1093
1231
|
"proposal-model": { type: "string" },
|
|
1232
|
+
"adaptive-gate": { type: "boolean", default: false },
|
|
1094
1233
|
"sync-first": { type: "boolean", default: false },
|
|
1095
1234
|
"sync-force": { type: "boolean", default: false },
|
|
1096
1235
|
verbose: { type: "boolean", default: false },
|
|
@@ -1121,6 +1260,8 @@ Options:
|
|
|
1121
1260
|
--cheap-loop Use cheap models for loop, expensive for gate (default: on)
|
|
1122
1261
|
--full-model Use same model for all stages (disables cheap-loop)
|
|
1123
1262
|
--gate-model Model for final gate validation (default: sonnet)
|
|
1263
|
+
--gate-effort Thinking effort for final gate (low|medium|high|max)
|
|
1264
|
+
--adaptive-gate Escalate risky gate checks to opus + high effort
|
|
1124
1265
|
--proposal-model Model for proposal generation LLM calls
|
|
1125
1266
|
--sync-first Refresh source-truth telemetry before building evals/failure patterns
|
|
1126
1267
|
--sync-force Force a full rescan during --sync-first
|
|
@@ -1143,6 +1284,24 @@ Options:
|
|
|
1143
1284
|
"Add --sync-first when using --sync-force",
|
|
1144
1285
|
);
|
|
1145
1286
|
}
|
|
1287
|
+
if (values["gate-effort"] && !["low", "medium", "high", "max"].includes(values["gate-effort"])) {
|
|
1288
|
+
throw new CLIError(
|
|
1289
|
+
`Invalid --gate-effort value: ${values["gate-effort"]}`,
|
|
1290
|
+
"INVALID_FLAG",
|
|
1291
|
+
"Use one of: low, medium, high, max",
|
|
1292
|
+
);
|
|
1293
|
+
}
|
|
1294
|
+
if (
|
|
1295
|
+
(values["gate-effort"] || values["adaptive-gate"]) &&
|
|
1296
|
+
(values["full-model"] ?? false) &&
|
|
1297
|
+
!values["gate-model"]
|
|
1298
|
+
) {
|
|
1299
|
+
throw new CLIError(
|
|
1300
|
+
"--gate-effort and --adaptive-gate require --gate-model when --full-model is set",
|
|
1301
|
+
"INVALID_FLAG",
|
|
1302
|
+
"Add --gate-model <model> or drop --full-model",
|
|
1303
|
+
);
|
|
1304
|
+
}
|
|
1146
1305
|
|
|
1147
1306
|
const { detectAgent } = await import("../utils/llm-call.js");
|
|
1148
1307
|
const requestedAgent = values.agent;
|
|
@@ -1223,6 +1382,8 @@ Options:
|
|
|
1223
1382
|
console.error(`[verbose] Dry run: ${values["dry-run"] ?? false}`);
|
|
1224
1383
|
console.error(`[verbose] Sync first: ${values["sync-first"] ?? false}`);
|
|
1225
1384
|
console.error(`[verbose] Sync force: ${values["sync-force"] ?? false}`);
|
|
1385
|
+
console.error(`[verbose] Adaptive gate: ${values["adaptive-gate"] ?? false}`);
|
|
1386
|
+
console.error(`[verbose] Gate effort: ${values["gate-effort"] ?? "(default)"}`);
|
|
1226
1387
|
}
|
|
1227
1388
|
|
|
1228
1389
|
const result = await evolve({
|
|
@@ -1241,7 +1402,9 @@ Options:
|
|
|
1241
1402
|
validationModel: values["validation-model"],
|
|
1242
1403
|
cheapLoop: (values["cheap-loop"] ?? true) && !(values["full-model"] ?? false),
|
|
1243
1404
|
gateModel: values["gate-model"],
|
|
1405
|
+
gateEffort: values["gate-effort"] as EffortLevel | undefined,
|
|
1244
1406
|
proposalModel: values["proposal-model"],
|
|
1407
|
+
adaptiveGate: values["adaptive-gate"] ?? false,
|
|
1245
1408
|
gradingResults,
|
|
1246
1409
|
syncFirst: values["sync-first"] ?? false,
|
|
1247
1410
|
syncForce: values["sync-force"] ?? false,
|