selftune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +23 -0
  2. package/README.md +259 -0
  3. package/bin/selftune.cjs +29 -0
  4. package/cli/selftune/constants.ts +71 -0
  5. package/cli/selftune/eval/hooks-to-evals.ts +422 -0
  6. package/cli/selftune/evolution/audit.ts +44 -0
  7. package/cli/selftune/evolution/deploy-proposal.ts +244 -0
  8. package/cli/selftune/evolution/evolve.ts +406 -0
  9. package/cli/selftune/evolution/extract-patterns.ts +145 -0
  10. package/cli/selftune/evolution/propose-description.ts +146 -0
  11. package/cli/selftune/evolution/rollback.ts +242 -0
  12. package/cli/selftune/evolution/stopping-criteria.ts +69 -0
  13. package/cli/selftune/evolution/validate-proposal.ts +137 -0
  14. package/cli/selftune/grading/grade-session.ts +459 -0
  15. package/cli/selftune/hooks/prompt-log.ts +52 -0
  16. package/cli/selftune/hooks/session-stop.ts +54 -0
  17. package/cli/selftune/hooks/skill-eval.ts +73 -0
  18. package/cli/selftune/index.ts +104 -0
  19. package/cli/selftune/ingestors/codex-rollout.ts +416 -0
  20. package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
  21. package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
  22. package/cli/selftune/init.ts +297 -0
  23. package/cli/selftune/monitoring/watch.ts +328 -0
  24. package/cli/selftune/observability.ts +255 -0
  25. package/cli/selftune/types.ts +255 -0
  26. package/cli/selftune/utils/jsonl.ts +75 -0
  27. package/cli/selftune/utils/llm-call.ts +192 -0
  28. package/cli/selftune/utils/logging.ts +40 -0
  29. package/cli/selftune/utils/schema-validator.ts +47 -0
  30. package/cli/selftune/utils/seeded-random.ts +31 -0
  31. package/cli/selftune/utils/transcript.ts +260 -0
  32. package/package.json +29 -0
  33. package/skill/SKILL.md +120 -0
  34. package/skill/Workflows/Doctor.md +145 -0
  35. package/skill/Workflows/Evals.md +193 -0
  36. package/skill/Workflows/Evolve.md +159 -0
  37. package/skill/Workflows/Grade.md +157 -0
  38. package/skill/Workflows/Ingest.md +159 -0
  39. package/skill/Workflows/Initialize.md +125 -0
  40. package/skill/Workflows/Rollback.md +131 -0
  41. package/skill/Workflows/Watch.md +128 -0
  42. package/skill/references/grading-methodology.md +176 -0
  43. package/skill/references/invocation-taxonomy.md +144 -0
  44. package/skill/references/logs.md +168 -0
  45. package/skill/settings_snippet.json +41 -0
@@ -0,0 +1,145 @@
1
+ /**
2
+ * extract-patterns.ts
3
+ *
4
+ * Identifies failure patterns by cross-referencing eval entries with actual
5
+ * skill usage records. Groups missed queries by invocation type and clusters
6
+ * similar queries together using Jaccard similarity.
7
+ */
8
+
9
+ import type { EvalEntry, FailurePattern, InvocationType, SkillUsageRecord } from "../types.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // Jaccard similarity
13
+ // ---------------------------------------------------------------------------
14
+
/**
 * Split a string on whitespace and collect its non-empty words, lowercased,
 * into a set. Punctuation is left attached to the word it touches.
 */
function tokenize(s: string): Set<string> {
  const tokens = new Set<string>();
  for (const word of s.split(/\s+/)) {
    // split() yields "" for leading/trailing whitespace; skip those.
    if (word) tokens.add(word.toLowerCase());
  }
  return tokens;
}

/**
 * Jaccard similarity of the word sets of two strings.
 *
 * @param a - First query.
 * @param b - Second query.
 * @returns |A ∩ B| / |A ∪ B| in the range 0.0-1.0. Returns 0 when neither
 *   string contains a word, where the ratio would otherwise be 0/0.
 */
export function computeQuerySimilarity(a: string, b: string): number {
  const setA = tokenize(a);
  const setB = tokenize(b);

  // Probe the larger set while walking the smaller one, so the loop is
  // bounded by min(|A|, |B|).
  const [smaller, larger] = setA.size <= setB.size ? [setA, setB] : [setB, setA];

  let intersection = 0;
  for (const token of smaller) {
    if (larger.has(token)) intersection++;
  }

  // The union is empty only when both sets are empty, so this single guard
  // covers the "no words at all" case and prevents a 0/0 division.
  const union = setA.size + setB.size - intersection;
  return union === 0 ? 0 : intersection / union;
}
42
+
43
+ // ---------------------------------------------------------------------------
44
+ // Single-linkage clustering
45
+ // ---------------------------------------------------------------------------
46
+
/**
 * Group queries with single-linkage clustering.
 *
 * Queries are processed in input order. A query joins every existing cluster
 * that has at least one member whose similarity to it is >= `threshold`; if
 * it links several clusters, they are merged into the earliest one.
 *
 * @param queries - Queries to cluster.
 * @param threshold - Minimum similarity (inclusive) for a link; default 0.3.
 * @returns The clusters; an empty array for empty input.
 */
export function clusterQueries(queries: string[], threshold = 0.3): string[][] {
  const groups: string[][] = [];

  for (const candidate of queries) {
    // Positions (ascending) of every group that has a member linked to the candidate.
    const linked: number[] = [];
    groups.forEach((group, position) => {
      const isLinked = group.some(
        (member) => computeQuerySimilarity(candidate, member) >= threshold,
      );
      if (isLinked) linked.push(position);
    });

    if (linked.length === 0) {
      groups.push([candidate]);
      continue;
    }

    // Fold the later groups into the earliest one. Going from the highest
    // position down keeps the remaining positions valid after each splice.
    const survivor = groups[linked[0]];
    for (const position of linked.slice(1).reverse()) {
      survivor.push(...groups[position]);
      groups.splice(position, 1);
    }
    survivor.push(candidate);
  }

  return groups;
}
82
+
83
+ // ---------------------------------------------------------------------------
84
+ // Failure pattern extraction
85
+ // ---------------------------------------------------------------------------
86
+
/**
 * Derive failure patterns for one skill.
 *
 * An eval entry counts as missed when it has `should_trigger` set and its
 * query does not exactly match the query of any usage record for `skillName`
 * with `triggered` set. Missed queries are bucketed by invocation type
 * (entries without one fall under "implicit"), each bucket is clustered with
 * `clusterQueries`, and every cluster becomes one pattern.
 *
 * @param evalEntries - Eval entries describing expected routing.
 * @param skillUsage - Recorded skill usage.
 * @param skillName - Skill to analyse.
 * @returns Patterns ordered by frequency, highest first. `sample_sessions` is
 *   always empty and every pattern shares one `extracted_at` timestamp.
 */
export function extractFailurePatterns(
  evalEntries: EvalEntry[],
  skillUsage: SkillUsageRecord[],
  skillName: string,
): FailurePattern[] {
  // Queries that did trigger the skill.
  const fired = new Set<string>(
    skillUsage
      .filter((usage) => usage.skill_name === skillName && usage.triggered)
      .map((usage) => usage.query),
  );

  // Missed queries per invocation type, in first-seen order.
  const missesByType = new Map<InvocationType, string[]>();
  for (const { should_trigger, query, invocation_type } of evalEntries) {
    if (!should_trigger || fired.has(query)) continue;

    const kind = invocation_type ?? "implicit";
    const bucket = missesByType.get(kind);
    if (bucket) {
      bucket.push(query);
    } else {
      missesByType.set(kind, [query]);
    }
  }

  const extractedAt = new Date().toISOString();
  const patterns: FailurePattern[] = [];

  for (const [kind, misses] of missesByType) {
    for (const cluster of clusterQueries(misses)) {
      patterns.push({
        // The running count doubles as the pattern's sequence number.
        pattern_id: `fp-${skillName}-${patterns.length}`,
        skill_name: skillName,
        invocation_type: kind,
        missed_queries: cluster,
        frequency: cluster.length,
        sample_sessions: [],
        extracted_at: extractedAt,
      });
    }
  }

  // Most frequent patterns first.
  return patterns.sort((x, y) => y.frequency - x.frequency);
}
@@ -0,0 +1,146 @@
1
+ /**
2
+ * propose-description.ts
3
+ *
4
+ * Generates improved skill description proposals using LLM analysis of failure
5
+ * patterns. Takes the current description, identified failure patterns, and
6
+ * missed queries, then produces a structured EvolutionProposal with an
7
+ * improved description, rationale, and confidence score.
8
+ */
9
+
10
+ import type { EvolutionProposal, FailurePattern } from "../types.js";
11
+ import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
12
+
13
+ // ---------------------------------------------------------------------------
14
+ // System prompt
15
+ // ---------------------------------------------------------------------------
16
+
/**
 * System prompt for the proposal generator LLM.
 *
 * Tells the model to answer with a single JSON object holding
 * "proposed_description", "rationale" and "confidence". generateProposal
 * passes it to callLlm as the system prompt, and parseProposalResponse
 * checks the reply for those three fields.
 */
export const PROPOSER_SYSTEM = `You are a skill description optimizer for an AI agent routing system.

Your task is to analyze the current skill description and its failure patterns,
then propose an improved description that would catch the missed queries while
preserving correct routing for existing queries.

Rules:
- The description must be concise and specific.
- It must cover the semantic space of the missed queries without being too broad.
- Maintain the original intent and scope of the skill.
- Output ONLY valid JSON with exactly these fields:
- "proposed_description" (string): the improved skill description
- "rationale" (string): explanation of what changed and why
- "confidence" (number): 0.0-1.0 how confident you are this improves routing

Do NOT include any text outside the JSON object.`;
34
+
35
+ // ---------------------------------------------------------------------------
36
+ // Prompt builder
37
+ // ---------------------------------------------------------------------------
38
+
/**
 * Build the user prompt that gives the LLM the failure context.
 *
 * Each query is rendered as a JSON string literal. Plain queries look the
 * same as they would between ordinary double quotes, while embedded quotes,
 * backslashes and line breaks are escaped. A query therefore always stays on
 * its own bullet line and cannot imitate a section of the prompt.
 *
 * @param currentDescription - The skill description currently in use.
 * @param failurePatterns - Patterns to list, each with its missed queries.
 * @param missedQueries - Flat list of all missed queries.
 * @param skillName - Name of the skill being improved.
 * @returns The assembled prompt text.
 */
export function buildProposalPrompt(
  currentDescription: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
): string {
  // JSON.stringify supplies the surrounding quotes and escapes ", \ and control characters.
  const bullet = (q: string): string => ` - ${JSON.stringify(q)}`;

  const patternLines = failurePatterns.map((p) => {
    const queries = p.missed_queries.map((q) => bullet(q)).join("\n");
    return ` Pattern ${p.pattern_id} (frequency: ${p.frequency}, type: ${p.invocation_type}):\n${queries}`;
  });

  const missedLines = missedQueries.map((q) => bullet(q)).join("\n");

  return `Skill Name: ${skillName}

Current Description:
${currentDescription}

Failure Patterns:
${patternLines.join("\n\n")}

All Missed Queries:
${missedLines}

Propose an improved description for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_description", "rationale", and "confidence" fields.`;
}
66
+
67
+ // ---------------------------------------------------------------------------
68
+ // Response parser
69
+ // ---------------------------------------------------------------------------
70
+
/**
 * Turn the raw LLM reply into validated proposal fields.
 *
 * Markdown fences are stripped first, then the text is parsed as JSON and
 * checked for the three expected fields.
 *
 * @param raw - Text returned by the LLM.
 * @returns The proposed description, the rationale, and the confidence
 *   clamped into 0.0-1.0.
 * @throws Error when the text is not valid JSON, is not an object, or lacks a
 *   string `proposed_description`, a string `rationale`, or a numeric
 *   `confidence`.
 */
export function parseProposalResponse(raw: string): {
  proposed_description: string;
  rationale: string;
  confidence: number;
} {
  const text = stripMarkdownFences(raw);

  let decoded: unknown;
  try {
    decoded = JSON.parse(text);
  } catch {
    // Quote only the start of the reply so the message stays short.
    throw new Error(`Failed to parse LLM response as JSON: ${text.slice(0, 200)}`);
  }

  if (decoded === null || typeof decoded !== "object") {
    throw new Error("LLM response is not a JSON object");
  }

  const { proposed_description, rationale, confidence } = decoded as Record<string, unknown>;

  if (typeof proposed_description !== "string") {
    throw new Error("Missing or invalid 'proposed_description' field in LLM response");
  }
  if (typeof rationale !== "string") {
    throw new Error("Missing or invalid 'rationale' field in LLM response");
  }
  if (typeof confidence !== "number") {
    throw new Error("Missing or invalid 'confidence' field in LLM response");
  }

  return {
    proposed_description,
    rationale,
    // Out-of-range values are pulled back to the nearest bound of 0.0-1.0.
    confidence: Math.max(0.0, Math.min(1.0, confidence)),
  };
}
111
+
112
+ // ---------------------------------------------------------------------------
113
+ // Proposal generator
114
+ // ---------------------------------------------------------------------------
115
+
/**
 * Ask the LLM for an improved description and wrap the answer in a pending
 * EvolutionProposal.
 *
 * @param currentDescription - Description currently in use; stored as `original_description`.
 * @param failurePatterns - Patterns shown to the LLM; their ids are recorded on the proposal.
 * @param missedQueries - Missed queries shown to the LLM.
 * @param skillName - Skill name; also part of the generated proposal id.
 * @param skillPath - Stored on the proposal as `skill_path`.
 * @param mode - Forwarded to callLlm.
 * @param agent - Forwarded to callLlm.
 * @returns A proposal with status "pending" and zeroed before/after eval results.
 * @throws Error when parseProposalResponse rejects the LLM reply.
 */
export async function generateProposal(
  currentDescription: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  skillPath: string,
  mode: "agent" | "api",
  agent?: string,
): Promise<EvolutionProposal> {
  const userPrompt = buildProposalPrompt(
    currentDescription,
    failurePatterns,
    missedQueries,
    skillName,
  );
  const reply = await callLlm(PROPOSER_SYSTEM, userPrompt, mode, agent);
  const suggestion = parseProposalResponse(reply);

  // No evaluation has run at this point, so both summaries start out zeroed.
  const emptySummary = () => ({ total: 0, passed: 0, failed: 0, pass_rate: 0 });

  const proposal: EvolutionProposal = {
    proposal_id: `evo-${skillName}-${Date.now()}`,
    skill_name: skillName,
    skill_path: skillPath,
    original_description: currentDescription,
    proposed_description: suggestion.proposed_description,
    rationale: suggestion.rationale,
    failure_patterns: failurePatterns.map((pattern) => pattern.pattern_id),
    eval_results: {
      before: emptySummary(),
      after: emptySummary(),
    },
    confidence: suggestion.confidence,
    created_at: new Date().toISOString(),
    status: "pending",
  };
  return proposal;
}
@@ -0,0 +1,242 @@
1
+ /**
2
+ * Evolution rollback mechanism (TASK-15).
3
+ *
4
+ * Restores a skill's SKILL.md to its pre-evolution state by:
5
+ * 1. Checking for a .bak backup file at the skill path
6
+ * 2. Falling back to the audit trail's "created" entry for original_description
7
+ * 3. Recording a "rolled_back" entry in the audit trail
8
+ */
9
+
10
+ import { existsSync, readFileSync, readdirSync, unlinkSync, writeFileSync } from "node:fs";
11
+ import { basename, dirname, join } from "node:path";
12
+ import { parseArgs } from "node:util";
13
+
14
+ import type { EvolutionAuditEntry } from "../types.js";
15
+ import { appendAuditEntry, getLastDeployedProposal, readAuditTrail } from "./audit.js";
16
+ import { replaceDescription } from "./deploy-proposal.js";
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Types
20
+ // ---------------------------------------------------------------------------
21
+
/** Input for rollback(). */
export interface RollbackOptions {
  /** Name of the skill whose deployed proposals are looked up in the audit trail. */
  skillName: string;
  /** Path to the SKILL.md file that gets restored. */
  skillPath: string;
  /** Roll back this specific proposal; when omitted, the last deployed proposal is used. */
  proposalId?: string; // rollback specific proposal, or last deployed
  /** Audit log path override, intended for tests. */
  logPath?: string; // optional override for audit log path (testing)
}
28
+
/** Outcome reported by rollback(). */
export interface RollbackResult {
  /** True when SKILL.md was rewritten; false when nothing was changed. */
  rolledBack: boolean;
  /**
   * What was restored: the full backup file content when restoring from a
   * .bak file, the original description when restoring from the audit trail,
   * and "" when no rollback happened.
   */
  restoredDescription: string;
  /** Human-readable explanation of the outcome. */
  reason: string;
}
34
+
35
+ // ---------------------------------------------------------------------------
36
+ // Helpers
37
+ // ---------------------------------------------------------------------------
38
+
/**
 * Marker that findOriginalFromAudit strips from the start of a "created"
 * audit entry's details to obtain the original description.
 */
const ORIGINAL_DESC_PREFIX = "original_description:";
40
+
/**
 * Locate the newest backup that sits next to `skillPath`.
 *
 * Both the legacy `<file>.bak` and the stamped `<file>.<stamp>.bak` names are
 * candidates. Candidates are ordered by their stamp, highest first; the
 * legacy name counts as an empty stamp and so ranks last.
 *
 * @param skillPath - Path of the file whose backups are wanted.
 * @returns Path of the newest backup, or null when the directory is missing
 *   or contains no backup.
 */
function findLatestBackup(skillPath: string): string | null {
  const parentDir = dirname(skillPath);
  if (!existsSync(parentDir)) return null;

  const fileName = basename(skillPath);
  const legacyName = `${fileName}.bak`;
  const stampedPrefix = `${fileName}.`;

  // The stamp is whatever sits between "<file>." and the trailing ".bak".
  const stampOf = (name: string): string =>
    name === legacyName ? "" : name.slice(stampedPrefix.length, -".bak".length);

  const candidates = readdirSync(parentDir).filter(
    (name) => name === legacyName || (name.startsWith(stampedPrefix) && name.endsWith(".bak")),
  );
  // Descending stamp order puts the newest backup at index 0.
  candidates.sort((left, right) => stampOf(right).localeCompare(stampOf(left)));

  if (candidates.length === 0) return null;
  return join(parentDir, candidates[0]);
}
68
+
/**
 * Recover the original description recorded when a proposal was created.
 *
 * Takes the first audit entry with action "created" and the given proposal
 * id. If its details start with the `original_description:` marker, the text
 * after the marker is returned; otherwise any non-empty details string is
 * taken to be the description itself.
 *
 * @param proposalId - Proposal whose "created" entry is wanted.
 * @param logPath - Optional audit log path override.
 * @returns The original description, or null when there is no such entry or
 *   its details are empty.
 */
function findOriginalFromAudit(proposalId: string, logPath?: string): string | null {
  // The first argument is left undefined here, unlike findDeployedEntry, which
  // passes the skill name; presumably that reads entries for all skills.
  const trail = readAuditTrail(undefined, logPath);
  const created = trail.find(
    (entry) => entry.action === "created" && entry.proposal_id === proposalId,
  );
  if (!created) return null;

  const recorded = created.details;
  if (recorded.startsWith(ORIGINAL_DESC_PREFIX)) {
    return recorded.slice(ORIGINAL_DESC_PREFIX.length);
  }
  // A non-empty string without the marker is accepted as the description as-is.
  return recorded.length > 0 ? recorded : null;
}
88
+
/**
 * Look up the "deployed" audit entry of one proposal.
 *
 * @param proposalId - Proposal to look for.
 * @param skillName - Passed to readAuditTrail when reading the trail.
 * @param logPath - Optional audit log path override.
 * @returns The first matching entry, or null when the proposal has no "deployed" entry.
 */
function findDeployedEntry(
  proposalId: string,
  skillName: string,
  logPath?: string,
): EvolutionAuditEntry | null {
  for (const entry of readAuditTrail(skillName, logPath)) {
    if (entry.proposal_id === proposalId && entry.action === "deployed") {
      return entry;
    }
  }
  return null;
}
100
+
101
+ // ---------------------------------------------------------------------------
102
+ // Main rollback function
103
+ // ---------------------------------------------------------------------------
104
+
/**
 * Restore a skill's SKILL.md to its state before an evolution was deployed.
 *
 * The target is `proposalId` when given (it must have a "deployed" audit
 * entry), otherwise the last deployed proposal for the skill. Two sources
 * are tried in order:
 *  1. The newest .bak file next to SKILL.md, only when no explicit proposal
 *     id was supplied. The whole file is restored and the backup is deleted.
 *  2. The original description from the proposal's "created" audit entry,
 *     which replaces only the description inside the current SKILL.md.
 * A successful restore appends a "rolled_back" entry to the audit trail.
 *
 * @param options - Skill name and path, optional proposal id and audit log path.
 * @returns The outcome. `rolledBack` is false, with a reason, when SKILL.md is
 *   missing, no deployed proposal is found, or no restoration source exists.
 */
export async function rollback(options: RollbackOptions): Promise<RollbackResult> {
  const { skillName, skillPath, proposalId, logPath } = options;

  const refuse = (reason: string): RollbackResult => ({
    rolledBack: false,
    restoredDescription: "",
    reason,
  });

  // Nothing can be restored if SKILL.md is not there.
  if (!existsSync(skillPath)) {
    return refuse(`SKILL.md not found at ${skillPath}`);
  }

  // Resolve the proposal to undo: the requested one, or the most recent deploy.
  const deployed = proposalId
    ? findDeployedEntry(proposalId, skillName, logPath)
    : getLastDeployedProposal(skillName, logPath);
  if (!deployed) {
    return refuse(
      proposalId
        ? `Proposal ${proposalId} not found as deployed entry in audit trail`
        : `No deployed proposal found for skill "${skillName}"`,
    );
  }
  const targetProposalId = proposalId || deployed.proposal_id;

  // Shared tail of both strategies: write the audit entry, then report success.
  const succeed = (source: "backup file" | "audit trail", restored: string): RollbackResult => {
    const auditEntry: EvolutionAuditEntry = {
      timestamp: new Date().toISOString(),
      proposal_id: targetProposalId,
      action: "rolled_back",
      details: `Rolled back ${skillName} from ${source}`,
    };
    appendAuditEntry(auditEntry, logPath);

    return {
      rolledBack: true,
      restoredDescription: restored,
      reason: `Restored from ${source}`,
    };
  };

  // Strategy 1: the backup file, used only when rolling back the latest deploy
  // (no explicit proposal id was supplied).
  const backupPath = proposalId ? null : findLatestBackup(skillPath);
  if (backupPath) {
    const backupContent = readFileSync(backupPath, "utf-8");
    writeFileSync(skillPath, backupContent, "utf-8");
    unlinkSync(backupPath);
    return succeed("backup file", backupContent);
  }

  // Strategy 2: the description stored in the audit trail. Only the description
  // section is swapped; the rest of SKILL.md is kept.
  const auditDescription = findOriginalFromAudit(targetProposalId, logPath);
  if (auditDescription) {
    const currentContent = readFileSync(skillPath, "utf-8");
    writeFileSync(skillPath, replaceDescription(currentContent, auditDescription), "utf-8");
    return succeed("audit trail", auditDescription);
  }

  return refuse(
    `No restoration source found for proposal ${targetProposalId} (no .bak file and no original_description in audit trail)`,
  );
}
192
+
193
+ // ---------------------------------------------------------------------------
194
+ // CLI entry point
195
+ // ---------------------------------------------------------------------------
196
+
/**
 * Command-line entry point for `selftune rollback`.
 *
 * Parses --skill, --skill-path, --proposal-id and --help in strict mode,
 * prints the rollback result as JSON, and ends the process: exit code 0
 * after --help or a successful rollback, 1 when a required flag is missing
 * or nothing was rolled back.
 */
export async function cliMain(): Promise<void> {
  const parsed = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      "proposal-id": { type: "string" },
      help: { type: "boolean", default: false },
    },
    strict: true,
  });
  const { skill, help } = parsed.values;
  const skillPath = parsed.values["skill-path"];
  const proposalId = parsed.values["proposal-id"];

  if (help) {
    console.log(`selftune rollback — Rollback a skill to its pre-evolution state

Usage:
selftune rollback --skill <name> --skill-path <path> [options]

Options:
--skill Skill name (required)
--skill-path Path to SKILL.md (required)
--proposal-id Specific proposal ID to rollback (optional, uses latest if omitted)
--help Show this help message`);
    process.exit(0);
  }

  if (!skill || !skillPath) {
    console.error("[ERROR] --skill and --skill-path are required");
    process.exit(1);
  }

  const outcome = await rollback({
    skillName: skill,
    skillPath,
    proposalId,
  });

  console.log(JSON.stringify(outcome, null, 2));
  // The exit code mirrors whether anything was actually rolled back.
  process.exit(outcome.rolledBack ? 0 : 1);
}
236
+
// Start the CLI only when import.meta.main is set, i.e. when the runtime
// reports this module as the program's entry point. Any rejection from
// cliMain is logged with a [FATAL] prefix and turned into exit code 1.
if (import.meta.main) {
  cliMain().catch((err) => {
    console.error(`[FATAL] ${err}`);
    process.exit(1);
  });
}
@@ -0,0 +1,69 @@
1
+ /**
2
+ * stopping-criteria.ts
3
+ *
4
+ * Evaluates whether the evolution loop should stop based on convergence,
5
+ * iteration limits, confidence thresholds, and plateau detection.
6
+ * Pure function module with no external dependencies.
7
+ */
8
+
9
+ // ---------------------------------------------------------------------------
10
+ // Types
11
+ // ---------------------------------------------------------------------------
12
+
/** Verdict returned by evaluateStoppingCriteria. */
export interface StoppingDecision {
  /** True when the evolution loop should end. */
  shouldStop: boolean;
  /** Human-readable explanation of which criterion decided the outcome. */
  reason: string;
}
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Stopping criteria evaluator
20
+ // ---------------------------------------------------------------------------
21
+
/**
 * Evaluate whether the evolution loop should stop.
 *
 * Conditions are checked in priority order and the first match wins:
 *  1. Converged (pass rate >= `convergenceThreshold`)
 *  2. Max iterations reached (`iterationCount >= maxIterations`)
 *  3. Low confidence (`proposalConfidence < confidenceThreshold`)
 *  4. Plateau (the current rate and the two most recent previous rates span
 *     less than `plateauTolerance`)
 *  5. Continue (none of the above)
 *
 * @param currentPassRate - Pass rate of the current iteration.
 * @param previousPassRates - Pass rates of earlier iterations, oldest first.
 * @param iterationCount - Number of iterations completed so far.
 * @param maxIterations - Iteration budget.
 * @param confidenceThreshold - Minimum acceptable proposal confidence.
 * @param proposalConfidence - Confidence of the current proposal.
 * @param convergenceThreshold - Pass rate (inclusive) that counts as converged; default 0.95.
 * @param plateauTolerance - Spread (exclusive) below which the last three rates count as a plateau; default 0.01.
 * @returns The decision together with the reason for it.
 */
export function evaluateStoppingCriteria(
  currentPassRate: number,
  previousPassRates: number[],
  iterationCount: number,
  maxIterations: number,
  confidenceThreshold: number,
  proposalConfidence: number,
  convergenceThreshold = 0.95,
  plateauTolerance = 0.01,
): StoppingDecision {
  // 1. Converged
  if (currentPassRate >= convergenceThreshold) {
    // Round away float noise (0.95 * 100 is 94.99999999999999) before printing.
    const percent = Number((convergenceThreshold * 100).toFixed(2));
    return { shouldStop: true, reason: `Converged: pass rate \u2265 ${percent}%` };
  }

  // 2. Max iterations
  if (iterationCount >= maxIterations) {
    return { shouldStop: true, reason: "Max iterations reached" };
  }

  // 3. Low confidence
  if (proposalConfidence < confidenceThreshold) {
    return { shouldStop: true, reason: "Confidence below threshold" };
  }

  // 4. Plateau detection: two previous rates plus the current one give the
  //    three data points the check needs.
  if (previousPassRates.length >= 2) {
    const window = [...previousPassRates.slice(-2), currentPassRate];
    const spread = Math.max(...window) - Math.min(...window);

    if (spread < plateauTolerance) {
      return { shouldStop: true, reason: "Plateau: no improvement in last 3 iterations" };
    }
  }

  // 5. Continue
  return { shouldStop: false, reason: "Continuing: improvement possible" };
}