selftune 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +259 -0
- package/bin/selftune.cjs +29 -0
- package/cli/selftune/constants.ts +71 -0
- package/cli/selftune/eval/hooks-to-evals.ts +422 -0
- package/cli/selftune/evolution/audit.ts +44 -0
- package/cli/selftune/evolution/deploy-proposal.ts +244 -0
- package/cli/selftune/evolution/evolve.ts +406 -0
- package/cli/selftune/evolution/extract-patterns.ts +145 -0
- package/cli/selftune/evolution/propose-description.ts +146 -0
- package/cli/selftune/evolution/rollback.ts +242 -0
- package/cli/selftune/evolution/stopping-criteria.ts +69 -0
- package/cli/selftune/evolution/validate-proposal.ts +137 -0
- package/cli/selftune/grading/grade-session.ts +459 -0
- package/cli/selftune/hooks/prompt-log.ts +52 -0
- package/cli/selftune/hooks/session-stop.ts +54 -0
- package/cli/selftune/hooks/skill-eval.ts +73 -0
- package/cli/selftune/index.ts +104 -0
- package/cli/selftune/ingestors/codex-rollout.ts +416 -0
- package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
- package/cli/selftune/init.ts +297 -0
- package/cli/selftune/monitoring/watch.ts +328 -0
- package/cli/selftune/observability.ts +255 -0
- package/cli/selftune/types.ts +255 -0
- package/cli/selftune/utils/jsonl.ts +75 -0
- package/cli/selftune/utils/llm-call.ts +192 -0
- package/cli/selftune/utils/logging.ts +40 -0
- package/cli/selftune/utils/schema-validator.ts +47 -0
- package/cli/selftune/utils/seeded-random.ts +31 -0
- package/cli/selftune/utils/transcript.ts +260 -0
- package/package.json +29 -0
- package/skill/SKILL.md +120 -0
- package/skill/Workflows/Doctor.md +145 -0
- package/skill/Workflows/Evals.md +193 -0
- package/skill/Workflows/Evolve.md +159 -0
- package/skill/Workflows/Grade.md +157 -0
- package/skill/Workflows/Ingest.md +159 -0
- package/skill/Workflows/Initialize.md +125 -0
- package/skill/Workflows/Rollback.md +131 -0
- package/skill/Workflows/Watch.md +128 -0
- package/skill/references/grading-methodology.md +176 -0
- package/skill/references/invocation-taxonomy.md +144 -0
- package/skill/references/logs.md +168 -0
- package/skill/settings_snippet.json +41 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* deploy-proposal.ts
|
|
3
|
+
*
|
|
4
|
+
* Deploys a validated evolution proposal by updating SKILL.md, creating a
|
|
5
|
+
* backup, building a commit message with metrics, and optionally creating
|
|
6
|
+
* a git branch and PR via `gh pr create`.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
10
|
+
import type { EvolutionProposal } from "../types.js";
|
|
11
|
+
import type { ValidationResult } from "./validate-proposal.js";
|
|
12
|
+
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Types
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
/** Inputs accepted by `deployProposal()`. */
export interface DeployOptions {
  /** Proposal whose `proposed_description` is written into SKILL.md. */
  proposal: EvolutionProposal;
  /** Validation outcome; its `net_change` is reported in the commit message and PR body. */
  validation: ValidationResult;
  /** Path of the SKILL.md file to back up and update. */
  skillPath: string;
  /** When true, a git branch, commit, push and `gh pr create` are attempted. */
  createPr: boolean;
  /** Prefix of the generated branch name; default "selftune/evolve". */
  branchPrefix?: string;
}
|
|
24
|
+
|
|
25
|
+
/** Outcome reported by `deployProposal()`. */
export interface DeployResult {
  /** Whether SKILL.md was rewritten with the proposed description. */
  skillMdUpdated: boolean;
  /** Location of the backup copy of SKILL.md, or null when none was made. */
  backupPath: string | null;
  /** Name of the git branch generated for the PR, or null when no PR was requested. */
  branchName: string | null;
  /** Commit message built for this deploy. */
  commitMessage: string;
}
|
|
31
|
+
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// SKILL.md reading
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
/**
 * Read the contents of a SKILL.md file.
 *
 * The file is read directly instead of being probed with `existsSync` first,
 * so a file that disappears between a check and the read still produces the
 * friendly "not found" error rather than a raw filesystem error.
 *
 * @param skillPath - Path to the SKILL.md file.
 * @returns The file contents decoded as UTF-8.
 * @throws Error with message `SKILL.md not found at <path>` when the path does
 *   not exist (ENOENT / ENOTDIR); any other filesystem error is rethrown as-is.
 */
export function readSkillMd(skillPath: string): string {
  try {
    return readFileSync(skillPath, "utf-8");
  } catch (err: unknown) {
    const code =
      typeof err === "object" && err !== null && "code" in err
        ? (err as { code?: unknown }).code
        : undefined;
    if (code === "ENOENT" || code === "ENOTDIR") {
      throw new Error(`SKILL.md not found at ${skillPath}`);
    }
    throw err;
  }
}
|
|
43
|
+
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// Description replacement
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
/** Index of the first line after a leading YAML front matter block, or 0 when there is none. */
function frontMatterEnd(lines: readonly string[]): number {
  if (lines[0]?.trimEnd() !== "---") return 0;
  const closing = lines.findIndex((line, i) => i > 0 && line.trimEnd() === "---");
  return closing === -1 ? 0 : closing + 1;
}

/**
 * Index of the first line at or after `start` that satisfies `isMatch` and is
 * not inside a fenced code block, or -1 when there is none. `start` must itself
 * lie outside any fence.
 */
function findMarkdownLine(
  lines: readonly string[],
  start: number,
  isMatch: (line: string) => boolean,
): number {
  let openFence: string | null = null;
  for (let i = start; i < lines.length; i++) {
    const line = lines[i] ?? "";
    const trimmed = line.trimStart();
    const fence = trimmed.startsWith("```") ? "```" : trimmed.startsWith("~~~") ? "~~~" : null;

    if (openFence !== null) {
      // Inside a fence: only the matching marker closes it; nothing here is a heading.
      if (fence === openFence) openFence = null;
      continue;
    }
    if (fence !== null) {
      openFence = fence;
      continue;
    }
    if (isMatch(line)) return i;
  }
  return -1;
}

/**
 * Replace the description section of a SKILL.md file.
 *
 * The description is the content between the first level-1 (`# `) heading and
 * the first level-2 (`## `) heading after it. If no `## ` heading exists, the
 * entire body after the first heading is replaced. If the document has no
 * `# ` heading at all, the new description is prepended to the content.
 *
 * Heading detection skips a leading YAML front matter block (`---` ... `---`)
 * and fenced code blocks, so a YAML comment or a shell comment such as
 * `# install` is not mistaken for a heading.
 *
 * @param currentContent - Full current text of the SKILL.md file.
 * @param newDescription - Replacement description text; may be empty.
 * @returns The document text with the description section replaced.
 */
export function replaceDescription(currentContent: string, newDescription: string): string {
  const lines = currentContent.split("\n");

  // Locate the first real "# " heading.
  const headingIndex = findMarkdownLine(lines, frontMatterEnd(lines), (line) =>
    line.startsWith("# "),
  );

  // No heading: just prepend the description.
  if (headingIndex === -1) {
    return `${newDescription}\n${currentContent}`;
  }

  // Locate the first real "## " heading after the main heading.
  const subHeadingIndex = findMarkdownLine(lines, headingIndex + 1, (line) =>
    line.startsWith("## "),
  );

  // Everything before the heading (front matter, leading text) is preserved.
  const preamble = headingIndex > 0 ? `${lines.slice(0, headingIndex).join("\n")}\n` : "";
  const headingLine = lines[headingIndex];
  const descriptionBlock = newDescription.length > 0 ? `\n${newDescription}\n` : "\n";

  if (subHeadingIndex === -1) {
    // No sub-heading: preamble + heading + new description + trailing newline.
    return `${preamble}${headingLine}\n${descriptionBlock}\n`;
  }

  // Preamble + heading + description + everything from the first "## " onward.
  const afterSubHeading = lines.slice(subHeadingIndex).join("\n");
  return `${preamble}${headingLine}\n${descriptionBlock}\n${afterSubHeading}`;
}
|
|
95
|
+
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
// Commit message builder
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
/**
 * Compose the one-line commit message for an evolution deploy.
 *
 * Format: `evolve(<skill_name>): <signed percent>% pass rate`, where the
 * percent is `validation.net_change * 100` rounded to the nearest integer and
 * non-negative values carry an explicit `+`.
 */
export function buildCommitMessage(
  proposal: EvolutionProposal,
  validation: ValidationResult,
): string {
  const roundedPercent = Math.round(validation.net_change * 100);

  let signPrefix = "";
  if (roundedPercent >= 0) {
    signPrefix = "+";
  }

  return `evolve(${proposal.skill_name}): ${signPrefix}${roundedPercent}% pass rate`;
}
|
|
111
|
+
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
// Git/GH operations (PR creation)
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
|
|
116
|
+
/**
 * Sanitize a string for use inside a git branch name.
 *
 * Characters outside `[a-zA-Z0-9._-]` become `-`, runs of `-` and runs of `.`
 * are collapsed to a single character, and every leading or trailing `.` / `-`
 * is removed. Collapsing happens before trimming so inputs such as
 * `"  name  "` or `"--name.."` do not keep a stray separator at either end.
 *
 * @param name - Arbitrary text, e.g. a skill name.
 * @returns The sanitized text; may be empty when nothing usable remains.
 */
function sanitizeForGitRef(name: string): string {
  return name
    .replace(/[^a-zA-Z0-9._-]/g, "-")
    .replace(/-{2,}/g, "-")
    .replace(/\.{2,}/g, ".")
    .replace(/^[.-]+|[.-]+$/g, "");
}
|
|
124
|
+
|
|
125
|
+
/**
 * Build a branch name of the form `<prefix>/<sanitized skill name>-<epoch ms>`.
 * A skill name that sanitizes to an empty string is replaced by "untitled".
 */
function makeBranchName(prefix: string, skillName: string): string {
  const createdAtMs = Date.now();
  const sanitized = sanitizeForGitRef(skillName);
  const slug = sanitized || "untitled";
  return [prefix, `${slug}-${createdAtMs}`].join("/");
}
|
|
131
|
+
|
|
132
|
+
/**
 * Spawn a command with `Bun.spawn` and resolve with its trimmed stdout.
 *
 * @param args - Executable followed by its arguments.
 * @param cwd - Optional working directory passed to `Bun.spawn`.
 * @param timeoutMs - Delay after which the child is killed (default 30 000 ms).
 * @returns Trimmed stdout of the command.
 * @throws Error when the timeout fired or the exit code is non-zero (the
 *   latter message includes stderr).
 */
async function runCommand(args: string[], cwd?: string, timeoutMs = 30_000): Promise<string> {
  const child = Bun.spawn(args, { cwd, stdout: "pipe", stderr: "pipe" });

  // Arm the kill timer; the flag lets us report a timeout instead of a bare exit code.
  let deadlineHit = false;
  const killTimer = setTimeout(() => {
    deadlineHit = true;
    child.kill();
  }, timeoutMs);

  try {
    // Both pipes are drained at the same time so neither can block the child.
    const pendingOut = new Response(child.stdout).text();
    const pendingErr = new Response(child.stderr).text();
    const [stdout, stderr] = await Promise.all([pendingOut, pendingErr]);
    const exitCode = await child.exited;

    if (deadlineHit) {
      throw new Error(`Command timed out after ${timeoutMs}ms: ${args.join(" ")}`);
    }
    if (exitCode !== 0) {
      throw new Error(`Command failed (exit ${exitCode}): ${args.join(" ")}\n${stderr}`);
    }
    return stdout.trim();
  } finally {
    clearTimeout(killTimer);
  }
}
|
|
170
|
+
|
|
171
|
+
// ---------------------------------------------------------------------------
|
|
172
|
+
// Main deploy function
|
|
173
|
+
// ---------------------------------------------------------------------------
|
|
174
|
+
|
|
175
|
+
/**
 * Deploy a validated evolution proposal to SKILL.md and optionally create a PR.
 *
 * Steps: read SKILL.md, copy it to a timestamped `.bak` file next to it,
 * rewrite the description section, build the commit message, and — when
 * `createPr` is set — create a branch, commit, push and open a PR via `gh`.
 *
 * Git/GH failures are logged as a warning and do not throw; in that case
 * `branchName` is still the generated name even though the branch or PR may
 * not exist.
 *
 * @param options - Proposal, validation result, target path and PR settings.
 * @returns What was written: backup path, branch name (or null) and commit message.
 * @throws Error when SKILL.md cannot be read, backed up or written.
 */
export async function deployProposal(options: DeployOptions): Promise<DeployResult> {
  const { proposal, validation, skillPath, createPr, branchPrefix = "selftune/evolve" } = options;

  // Step 1: Read current SKILL.md
  const currentContent = readSkillMd(skillPath);

  // Step 2: Create backup (unique per deploy to avoid overwriting previous backups)
  const backupTimestamp = new Date().toISOString().replace(/[:.]/g, "-");
  const backupPath = `${skillPath}.${backupTimestamp}.bak`;
  copyFileSync(skillPath, backupPath);

  // Step 3: Replace description and write
  const updatedContent = replaceDescription(currentContent, proposal.proposed_description);
  writeFileSync(skillPath, updatedContent, "utf-8");

  // Step 4: Build commit message
  const commitMessage = buildCommitMessage(proposal, validation);

  // Step 5: Optionally create branch and PR
  let branchName: string | null = null;

  if (createPr) {
    branchName = makeBranchName(branchPrefix, proposal.skill_name);

    // Sign is derived from the rounded percent, exactly as in the commit
    // message, so the PR title and body never disagree (e.g. "+0%" vs "0%").
    const netChangePercent = Math.round(validation.net_change * 100);
    const netChangeSign = netChangePercent >= 0 ? "+" : "";
    const prBody = `Proposal: ${proposal.proposal_id}\nRationale: ${proposal.rationale}\nNet change: ${netChangeSign}${netChangePercent}%`;

    try {
      // Create and checkout branch
      await runCommand(["git", "checkout", "-b", branchName]);

      // Stage the SKILL.md
      await runCommand(["git", "add", skillPath]);

      // Commit
      await runCommand(["git", "commit", "-m", commitMessage]);

      // Push
      await runCommand(["git", "push", "-u", "origin", branchName]);

      // Create PR
      await runCommand(["gh", "pr", "create", "--title", commitMessage, "--body", prBody]);
    } catch (err) {
      // Git/GH operations are best-effort in test environments.
      // The branch name is still returned for tracking.
      console.error(`[WARN] Git/GH operation failed: ${err instanceof Error ? err.message : err}`);
    }
  }

  return {
    skillMdUpdated: true,
    backupPath,
    branchName,
    commitMessage,
  };
}
|
|
237
|
+
|
|
238
|
+
// ---------------------------------------------------------------------------
|
|
239
|
+
// CLI entry guard
|
|
240
|
+
// ---------------------------------------------------------------------------
|
|
241
|
+
|
|
242
|
+
// Executed directly rather than imported: there is no standalone CLI for this
// module, so only point the user at the supported entry points.
if (import.meta.main) {
  const hint = "deploy-proposal: use deployProposal() programmatically or via evolve CLI";
  console.log(hint);
}
|
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evolve.ts
|
|
3
|
+
*
|
|
4
|
+
* Evolution orchestrator: coordinates failure pattern extraction, proposal
|
|
5
|
+
* generation, validation, and deployment into a single pipeline with retry
|
|
6
|
+
* logic and comprehensive audit tracking.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
10
|
+
import { parseArgs } from "node:util";
|
|
11
|
+
|
|
12
|
+
import { QUERY_LOG, SKILL_LOG } from "../constants.js";
|
|
13
|
+
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
14
|
+
import type {
|
|
15
|
+
EvalEntry,
|
|
16
|
+
EvalPassRate,
|
|
17
|
+
EvolutionAuditEntry,
|
|
18
|
+
EvolutionProposal,
|
|
19
|
+
QueryLogRecord,
|
|
20
|
+
SkillUsageRecord,
|
|
21
|
+
} from "../types.js";
|
|
22
|
+
import { readJsonl } from "../utils/jsonl.js";
|
|
23
|
+
import { appendAuditEntry } from "./audit.js";
|
|
24
|
+
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
25
|
+
import { generateProposal } from "./propose-description.js";
|
|
26
|
+
import { validateProposal } from "./validate-proposal.js";
|
|
27
|
+
import type { ValidationResult } from "./validate-proposal.js";
|
|
28
|
+
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// Types
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
/** Configuration for a single `evolve()` run. */
export interface EvolveOptions {
  /** Name of the skill being evolved. */
  skillName: string;
  /** Path to the skill's SKILL.md file. */
  skillPath: string;
  /** Optional path to a JSON eval set; when unset or missing on disk, the eval set is built from logs. */
  evalSetPath?: string;
  /** Execution mode forwarded to proposal generation and validation. */
  mode: "agent" | "api";
  /** Agent identifier forwarded alongside `mode`. */
  agent?: string;
  /** When true, the run stops after validation without recording a deploy. */
  dryRun: boolean;
  /** Proposals with confidence below this value are rejected; default 0.6. */
  confidenceThreshold: number;
  /** Maximum number of generate/validate attempts; default 3. */
  maxIterations: number;
}
|
|
43
|
+
|
|
44
|
+
/** Outcome of an `evolve()` run. */
export interface EvolveResult {
  /** Last proposal generated, or null when none was produced. */
  proposal: EvolutionProposal | null;
  /** Validation result reported for the run, or null when validation did not run. */
  validation: ValidationResult | null;
  /** True only when the run recorded a deploy. */
  deployed: boolean;
  /** Audit entries recorded during this run, in order. */
  auditEntries: EvolutionAuditEntry[];
  /** Human-readable explanation of the outcome. */
  reason: string;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Optional overrides for the collaborators `evolve()` calls. Any member left
 * undefined falls back to the real module import, so tests can substitute
 * individual functions without resorting to mock.module().
 */
export interface EvolveDeps {
  /** Replacement for the failure-pattern extractor. */
  extractFailurePatterns?: typeof import("./extract-patterns.js").extractFailurePatterns;
  /** Replacement for the proposal generator. */
  generateProposal?: typeof import("./propose-description.js").generateProposal;
  /** Replacement for the proposal validator. */
  validateProposal?: typeof import("./validate-proposal.js").validateProposal;
  /** Replacement for the audit-log writer. */
  appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
  /** Replacement for the log-based eval set builder. */
  buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
}
|
|
63
|
+
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
// Audit helper
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
/**
 * Build an audit entry stamped with the current time (ISO 8601).
 * `eval_snapshot` is only present on the entry when a snapshot is supplied.
 */
function createAuditEntry(
  proposalId: string,
  action: EvolutionAuditEntry["action"],
  details: string,
  evalSnapshot?: EvalPassRate,
): EvolutionAuditEntry {
  const stampedAt = new Date().toISOString();
  const snapshotPart = evalSnapshot ? { eval_snapshot: evalSnapshot } : {};

  return {
    timestamp: stampedAt,
    proposal_id: proposalId,
    action,
    details,
    ...snapshotPart,
  };
}
|
|
82
|
+
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Main orchestrator
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
/** Convert a validation result into the pass/fail snapshot attached to audit entries. */
function snapshotFromValidation(validation: ValidationResult, total: number): EvalPassRate {
  const passed = Math.round(validation.after_pass_rate * total);
  return {
    total,
    passed,
    failed: total - passed,
    pass_rate: validation.after_pass_rate,
  };
}

/**
 * Run the evolution pipeline for one skill.
 *
 * Pipeline: read SKILL.md, load the eval set (from `evalSetPath` when it
 * exists, otherwise built from the skill/query logs), extract failure
 * patterns, then generate and validate proposals for up to `maxIterations`
 * attempts. A proposal is rejected when its confidence is below
 * `confidenceThreshold` or when validation reports no improvement; the
 * rejection reason is fed into the next attempt as an extra missed query.
 *
 * This function never throws: unexpected errors are returned as a result with
 * `deployed: false` and the error message in `reason`.
 *
 * NOTE: the "deploy" step only records a "deployed" audit entry; this function
 * does not itself modify SKILL.md.
 *
 * @param options - Run configuration.
 * @param _deps - Optional overrides for collaborators (used by tests).
 * @returns The last proposal/validation, whether a deploy was recorded, the
 *   audit trail of this run and a human-readable reason.
 */
export async function evolve(
  options: EvolveOptions,
  _deps: EvolveDeps = {},
): Promise<EvolveResult> {
  const {
    skillName,
    skillPath,
    evalSetPath,
    mode,
    agent,
    dryRun,
    confidenceThreshold,
    maxIterations,
  } = options;

  // Resolve injectable dependencies with real-import fallbacks
  const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
  const _generateProposal = _deps.generateProposal ?? generateProposal;
  const _validateProposal = _deps.validateProposal ?? validateProposal;
  const _appendAuditEntry = _deps.appendAuditEntry ?? appendAuditEntry;
  const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;

  const auditEntries: EvolutionAuditEntry[] = [];

  function recordAudit(
    proposalId: string,
    action: EvolutionAuditEntry["action"],
    details: string,
    evalSnapshot?: EvalPassRate,
  ): void {
    const entry = createAuditEntry(proposalId, action, details, evalSnapshot);
    auditEntries.push(entry);
    try {
      _appendAuditEntry(entry);
    } catch (err) {
      // Fail-open: audit write failures must not break the pipeline, but they
      // are surfaced on stderr so a broken audit log does not go unnoticed.
      console.error(
        `[WARN] Failed to write audit entry: ${err instanceof Error ? err.message : err}`,
      );
    }
  }

  /** Result for every path that ends without a deploy. */
  const notDeployed = (
    reason: string,
    proposal: EvolutionProposal | null = null,
    validation: ValidationResult | null = null,
  ): EvolveResult => ({
    proposal,
    validation,
    deployed: false,
    auditEntries,
    reason,
  });

  try {
    // Step 1: Read current SKILL.md
    if (!existsSync(skillPath)) {
      return notDeployed(`SKILL.md not found at ${skillPath}`);
    }

    const currentDescription = readFileSync(skillPath, "utf-8");

    // Step 2: Load eval set
    let evalSet: EvalEntry[];

    if (evalSetPath && existsSync(evalSetPath)) {
      const parsed: unknown = JSON.parse(readFileSync(evalSetPath, "utf-8"));
      // Everything below relies on array semantics (length, iteration).
      if (!Array.isArray(parsed)) {
        throw new Error(`Eval set at ${evalSetPath} must be a JSON array`);
      }
      evalSet = parsed as EvalEntry[];
    } else {
      // Build from logs
      const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
      const queryRecords = readJsonl<QueryLogRecord>(QUERY_LOG);
      evalSet = _buildEvalSet(skillRecords, queryRecords, skillName);
    }

    // Step 3: Load skill usage records
    const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);

    // Step 4: Extract failure patterns
    const failurePatterns = _extractFailurePatterns(evalSet, skillUsage, skillName);

    // Step 5: Early exit if no patterns
    if (failurePatterns.length === 0) {
      return notDeployed("No failure patterns found");
    }

    // Step 6: Collect all missed queries
    const missedQueries = failurePatterns.flatMap((p) => p.missed_queries);

    // Steps 7-12: Retry loop for proposal generation and validation
    let lastProposal: EvolutionProposal | null = null;
    let lastValidation: ValidationResult | null = null;
    let feedbackReason = "";
    let accepted = false;

    for (let iteration = 0; iteration < maxIterations; iteration++) {
      const isLastIteration = iteration === maxIterations - 1;

      // Step 7: Generate proposal (previous failure, if any, is passed as feedback)
      const effectiveMissedQueries = feedbackReason
        ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`]
        : missedQueries;

      const proposal = await _generateProposal(
        currentDescription,
        failurePatterns,
        effectiveMissedQueries,
        skillName,
        skillPath,
        mode,
        agent,
      );

      lastProposal = proposal;

      // Step 8: Audit "created"
      recordAudit(
        proposal.proposal_id,
        "created",
        `Proposal created for ${skillName} (iteration ${iteration + 1})`,
      );

      // Step 9: Check confidence threshold
      if (proposal.confidence < confidenceThreshold) {
        feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
        recordAudit(
          proposal.proposal_id,
          "rejected",
          `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
        );

        if (isLastIteration) {
          return notDeployed(
            `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
            lastProposal,
            null,
          );
        }

        continue;
      }

      // Step 10: Validate against eval set
      const validation = await _validateProposal(proposal, evalSet, mode, agent);
      lastValidation = validation;

      // Step 11: Audit "validated"
      recordAudit(
        proposal.proposal_id,
        "validated",
        `Validation complete: improved=${validation.improved}`,
        snapshotFromValidation(validation, evalSet.length),
      );

      // Step 12: Check validation result
      if (!validation.improved) {
        feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
        recordAudit(
          proposal.proposal_id,
          "rejected",
          `Validation failed: net_change=${validation.net_change.toFixed(3)}`,
        );

        if (isLastIteration) {
          return notDeployed(
            `Validation failed after ${maxIterations} iterations: net_change=${validation.net_change.toFixed(3)}`,
            lastProposal,
            lastValidation,
          );
        }

        continue;
      }

      // Validation passed - break out of retry loop
      accepted = true;
      break;
    }

    // The loop can end without an accepted proposal when maxIterations is 0,
    // NaN or fractional; never report such a run as validated or deployed.
    if (!accepted || !lastProposal || !lastValidation) {
      const why = feedbackReason || `no iterations were run (maxIterations=${maxIterations})`;
      return notDeployed(`No proposal passed validation: ${why}`, lastProposal, null);
    }

    // Step 13: Dry run check
    if (dryRun) {
      return notDeployed(
        "Dry run - proposal validated but not deployed",
        lastProposal,
        lastValidation,
      );
    }

    // Step 14: Deploy (actual deploy wired in TASK-14)
    recordAudit(
      lastProposal.proposal_id,
      "deployed",
      `Deployed proposal for ${skillName}`,
      snapshotFromValidation(lastValidation, evalSet.length),
    );

    // Step 15-16: Return complete result
    return {
      proposal: lastProposal,
      validation: lastValidation,
      deployed: true,
      auditEntries,
      reason: "Evolution deployed successfully",
    };
  } catch (error) {
    // Robust error handling: catch any unexpected errors and return gracefully
    const errorMessage = error instanceof Error ? error.message : String(error);
    return notDeployed(`Error during evolution: ${errorMessage}`);
  }
}
|
|
339
|
+
|
|
340
|
+
// ---------------------------------------------------------------------------
|
|
341
|
+
// CLI entry point
|
|
342
|
+
// ---------------------------------------------------------------------------
|
|
343
|
+
|
|
344
|
+
/**
 * CLI entry point for `selftune evolve`.
 *
 * Parses command-line flags, validates them, runs `evolve()` and prints the
 * result as JSON on stdout. Exits with status 0 when the run reports
 * `deployed`, and 1 otherwise (including invalid arguments).
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      "eval-set": { type: "string" },
      mode: { type: "string", default: "agent" },
      agent: { type: "string" },
      "dry-run": { type: "boolean", default: false },
      confidence: { type: "string", default: "0.6" },
      "max-iterations": { type: "string", default: "3" },
      help: { type: "boolean", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(`selftune evolve — Evolve a skill description via failure patterns

Usage:
selftune evolve --skill <name> --skill-path <path> [options]

Options:
--skill Skill name (required)
--skill-path Path to SKILL.md (required)
--eval-set Path to eval set JSON (optional, builds from logs if omitted)
--mode Execution mode: "agent" or "api" (default: "agent")
--agent Agent CLI to use (claude, codex, opencode)
--dry-run Validate proposal without deploying
--confidence Confidence threshold 0.0-1.0 (default: 0.6)
--max-iterations Max retry iterations (default: 3)
--help Show this help message`);
    process.exit(0);
  }

  if (!values.skill || !values["skill-path"]) {
    console.error("[ERROR] --skill and --skill-path are required");
    process.exit(1);
  }

  // Reject unparsable or out-of-range numbers up front: NaN would otherwise
  // disable the confidence check and make the retry loop run zero times.
  const confidenceThreshold = Number.parseFloat(values.confidence ?? "0.6");
  if (
    !Number.isFinite(confidenceThreshold) ||
    confidenceThreshold < 0 ||
    confidenceThreshold > 1
  ) {
    console.error("[ERROR] --confidence must be a number between 0.0 and 1.0");
    process.exit(1);
  }

  const maxIterations = Number.parseInt(values["max-iterations"] ?? "3", 10);
  if (!Number.isInteger(maxIterations) || maxIterations < 1) {
    console.error("[ERROR] --max-iterations must be a positive integer");
    process.exit(1);
  }

  // Any value other than "api" selects agent mode.
  const mode = values.mode === "api" ? "api" : "agent";

  const result = await evolve({
    skillName: values.skill,
    skillPath: values["skill-path"],
    evalSetPath: values["eval-set"],
    mode,
    agent: values.agent,
    dryRun: values["dry-run"] ?? false,
    confidenceThreshold,
    maxIterations,
  });

  console.log(JSON.stringify(result, null, 2));
  process.exit(result.deployed ? 0 : 1);
}
|
|
400
|
+
|
|
401
|
+
// Run the CLI only when this file is the program entry point; any rejection
// escaping cliMain() is reported and turned into a non-zero exit status.
if (import.meta.main) {
  cliMain().catch((fatal) => {
    console.error(`[FATAL] ${fatal}`);
    process.exit(1);
  });
}
|