selftune 0.2.31 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -56
- package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
- package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/command-surface.ts +613 -2
- package/cli/selftune/create/baseline.ts +429 -0
- package/cli/selftune/create/check.ts +35 -0
- package/cli/selftune/create/init.ts +115 -0
- package/cli/selftune/create/package-candidate-state.ts +771 -0
- package/cli/selftune/create/package-evaluator.ts +710 -0
- package/cli/selftune/create/package-fingerprint.ts +142 -0
- package/cli/selftune/create/package-search.ts +377 -0
- package/cli/selftune/create/publish.ts +431 -0
- package/cli/selftune/create/readiness.ts +495 -0
- package/cli/selftune/create/replay.ts +330 -0
- package/cli/selftune/create/report.ts +74 -0
- package/cli/selftune/create/scaffold.ts +121 -0
- package/cli/selftune/create/skills-ref-adapter.ts +177 -0
- package/cli/selftune/create/status.ts +33 -0
- package/cli/selftune/create/templates.ts +249 -0
- package/cli/selftune/cron/setup.ts +1 -1
- package/cli/selftune/dashboard-action-events.ts +4 -1
- package/cli/selftune/dashboard-action-result.ts +789 -24
- package/cli/selftune/dashboard-action-stream.ts +80 -0
- package/cli/selftune/dashboard-contract.ts +146 -3
- package/cli/selftune/dashboard-server.ts +5 -4
- package/cli/selftune/eval/hooks-to-evals.ts +58 -35
- package/cli/selftune/eval/synthetic-evals.ts +145 -17
- package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
- package/cli/selftune/evolution/evolve-body.ts +9 -36
- package/cli/selftune/evolution/evolve.ts +8 -72
- package/cli/selftune/evolution/stopping-criteria.ts +5 -13
- package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
- package/cli/selftune/evolution/validate-host-replay.ts +115 -15
- package/cli/selftune/improve.ts +206 -0
- package/cli/selftune/index.ts +123 -6
- package/cli/selftune/init.ts +1 -1
- package/cli/selftune/localdb/queries/dashboard.ts +30 -0
- package/cli/selftune/localdb/schema.ts +52 -0
- package/cli/selftune/monitoring/watch.ts +257 -23
- package/cli/selftune/orchestrate/execute.ts +300 -1
- package/cli/selftune/orchestrate/finalize.ts +14 -0
- package/cli/selftune/orchestrate/plan.ts +22 -5
- package/cli/selftune/orchestrate/prepare.ts +59 -4
- package/cli/selftune/orchestrate/report.ts +1 -1
- package/cli/selftune/orchestrate.ts +34 -1
- package/cli/selftune/publish.ts +35 -0
- package/cli/selftune/routes/actions.ts +81 -15
- package/cli/selftune/routes/overview.ts +1 -1
- package/cli/selftune/routes/skill-report.ts +147 -2
- package/cli/selftune/run.ts +18 -0
- package/cli/selftune/schedule.ts +3 -3
- package/cli/selftune/search-run.ts +703 -0
- package/cli/selftune/status.ts +35 -11
- package/cli/selftune/testing-readiness.ts +431 -40
- package/cli/selftune/types.ts +316 -0
- package/cli/selftune/utils/eval-readiness.ts +1 -0
- package/cli/selftune/utils/json-output.ts +11 -0
- package/cli/selftune/utils/lifecycle-surface.ts +48 -0
- package/cli/selftune/utils/query-filter.ts +82 -1
- package/cli/selftune/utils/tui.ts +85 -2
- package/cli/selftune/verify.ts +205 -0
- package/cli/selftune/workflows/proposals.ts +1 -1
- package/cli/selftune/workflows/skill-scaffold.ts +141 -63
- package/cli/selftune/workflows/workflows.ts +4 -4
- package/package.json +1 -1
- package/skill/SKILL.md +148 -85
- package/skill/references/cli-quick-reference.md +16 -1
- package/skill/references/creator-playbook.md +31 -10
- package/skill/workflows/Baseline.md +8 -9
- package/skill/workflows/Contributions.md +4 -4
- package/skill/workflows/Create.md +173 -0
- package/skill/workflows/CreateTestDeploy.md +34 -30
- package/skill/workflows/Cron.md +2 -2
- package/skill/workflows/Dashboard.md +3 -3
- package/skill/workflows/Evals.md +13 -7
- package/skill/workflows/Evolve.md +75 -32
- package/skill/workflows/EvolveBody.md +22 -15
- package/skill/workflows/Hook.md +1 -1
- package/skill/workflows/Improve.md +168 -0
- package/skill/workflows/Initialize.md +3 -3
- package/skill/workflows/Orchestrate.md +49 -12
- package/skill/workflows/Publish.md +100 -0
- package/skill/workflows/Run.md +72 -0
- package/skill/workflows/Schedule.md +2 -2
- package/skill/workflows/SearchRun.md +89 -0
- package/skill/workflows/SignalsDashboard.md +2 -2
- package/skill/workflows/UnitTest.md +13 -4
- package/skill/workflows/Verify.md +136 -0
- package/skill/workflows/Watch.md +114 -47
- package/skill/workflows/Workflows.md +13 -8
- package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
- package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
|
@@ -0,0 +1,1045 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* bounded-mutations.ts
|
|
3
|
+
*
|
|
4
|
+
* Bounded mutation primitives for package search. Generates routing and body
|
|
5
|
+
* variants of a skill file that a package search runner can evaluate.
|
|
6
|
+
*
|
|
7
|
+
* Each mutation produces a complete, self-contained SKILL.md variant written
|
|
8
|
+
* to a temporary directory. The caller (package evaluator) is responsible for
|
|
9
|
+
* cleanup via `cleanupVariants()`.
|
|
10
|
+
*
|
|
11
|
+
* The bounded strategies are deterministic permutations — no LLM calls (only the reflective generators further down delegate to the LLM proposal helpers). This keeps variant
|
|
12
|
+
* generation fast and predictable. LLM-driven evolution remains in
|
|
13
|
+
* propose-body.ts / propose-routing.ts for the existing evolution pipeline.
|
|
14
|
+
*
|
|
15
|
+
* Phase 2 adds eval-informed targeted mutations that use measured weaknesses
|
|
16
|
+
* from replay failures and grading results to focus mutations on the specific
|
|
17
|
+
* patterns that failed.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import type { Database } from "bun:sqlite";
|
|
21
|
+
import { createHash } from "node:crypto";
|
|
22
|
+
import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
|
|
23
|
+
import { tmpdir } from "node:os";
|
|
24
|
+
import { basename, dirname, join } from "node:path";
|
|
25
|
+
|
|
26
|
+
import { parseSkillSections, replaceSection } from "./deploy-proposal.js";
|
|
27
|
+
import { generateBodyProposal } from "./propose-body.js";
|
|
28
|
+
import { generateRoutingProposal } from "./propose-routing.js";
|
|
29
|
+
import type { EffortLevel } from "../utils/llm-call.js";
|
|
30
|
+
import type { FailurePattern } from "../types.js";
|
|
31
|
+
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Types (local — do NOT add to shared types.ts)
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
/** Options accepted by the deterministic routing/body mutation generators. */
export interface BoundedMutationOptions {
  /** Upper bound on the number of variants to generate; the generators fall back to 3 when omitted. */
  maxVariants?: number;
  /** Which part of the skill the caller wants mutated. */
  mutationSurface: "routing" | "body" | "both";
  /** Path of the parent skill file (NOTE(review): not read by the generators visible in this file — confirm the consumer). */
  parentSkillPath: string;
  /** Optional agent identifier (NOTE(review): not read by the deterministic generators — confirm the consumer). */
  agent?: string;
}
|
|
42
|
+
|
|
43
|
+
/** One generated variant, as returned by every mutation generator in this module. */
export interface BoundedMutationResult {
  /** Path of the temporary SKILL.md that holds the variant. */
  variantSkillPath: string;
  /** Surface that was actually mutated for this variant. */
  mutationSurface: "routing" | "body";
  /** Human-readable summary of what changed. */
  mutationDescription: string;
  /** Short content fingerprint of the parent skill file the variant was derived from. */
  parentFingerprint: string;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Measured weaknesses of a frontier candidate, taken from its evaluation data
 * and used to steer the reflective mutation generators.
 */
export interface MutationWeaknesses {
  /** Queries that should have triggered the skill during replay but did not. */
  replayFailureSamples: string[];
  /** Queries that were routed to the wrong place. */
  routingFailureSamples: string[];
  /** Body quality score from evaluation, in the range 0.0-1.0 (higher is better). */
  bodyQualityScore: number;
  /** Grading pass-rate change relative to the previous candidate (negative means a decline). */
  gradingPassRateDelta: number;
  /** Free-text descriptions of the grading failure patterns, when available. */
  gradingFailurePatterns?: string[];
}
|
|
63
|
+
|
|
64
|
+
/** Options for the reflective (evidence-driven) mutation generators. */
export interface ReflectiveMutationOptions {
  /** Cap on returned variants; the reflective generators fall back to 1 when omitted. */
  maxVariants?: number;
  /** Skill name, forwarded to the proposal helper and embedded in the synthesized failure pattern ids. */
  skillName: string;
  /** Agent forwarded to the proposal helper; when empty the reflective generators return no variants. */
  agent: string;
  /** Optional model flag forwarded unchanged to the proposal helper. */
  modelFlag?: string;
  /** Optional effort level forwarded unchanged to the proposal helper. */
  effort?: EffortLevel;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Injection seam for the reflective generators: each entry, when supplied,
 * is called instead of the imported proposal helper of the same name.
 */
interface ReflectiveMutationDeps {
  /** Replacement for the imported `generateBodyProposal`. */
  generateBodyProposal?: typeof generateBodyProposal;
  /** Replacement for the imported `generateRoutingProposal`. */
  generateRoutingProposal?: typeof generateRoutingProposal;
}
|
|
76
|
+
|
|
77
|
+
// ---------------------------------------------------------------------------
|
|
78
|
+
// Fingerprinting
|
|
79
|
+
// ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
/**
 * Derive a short fingerprint for a skill file's content.
 *
 * @param content - Full text to fingerprint.
 * @returns The first 12 hex characters of the content's SHA-256 digest.
 */
function fingerprintContent(content: string): string {
  const hasher = createHash("sha256");
  hasher.update(content);
  const fullDigest = hasher.digest("hex");
  return fullDigest.substring(0, 12);
}
|
|
85
|
+
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
// Temp directory helpers
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
/**
 * Create a fresh temp directory for a variant and return the SKILL.md path within.
 *
 * The directory is named `selftune-variant-<stem>-<timestamp>-<index>`, where
 * `<stem>` is the name of the directory containing the parent skill file.
 * Each call is guaranteed its own directory: if that name is already taken
 * (for example two generators running within the same millisecond with the
 * same index), a numeric suffix is appended instead of silently sharing the
 * directory and overwriting the other variant's SKILL.md.
 *
 * @param parentPath - Path of the parent skill file.
 * @param index - Variant index, used as part of the directory name.
 * @returns Path of the (not yet written) SKILL.md inside the new directory.
 * @throws If the directory cannot be created for a reason other than a name clash.
 */
function createVariantDir(parentPath: string, index: number): string {
  const stem = basename(dirname(parentPath)) || "skill";
  const root = tmpdir();
  // Keep the original tolerance for a missing temp root; a no-op when it exists.
  mkdirSync(root, { recursive: true });

  const baseDir = join(root, `selftune-variant-${stem}-${Date.now()}-${index}`);
  const maxAttempts = 1000;

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const dir = attempt === 0 ? baseDir : `${baseDir}-${attempt}`;
    try {
      // Non-recursive mkdir fails with EEXIST when the name is taken, which
      // makes claiming the directory atomic.
      mkdirSync(dir);
      return join(dir, "SKILL.md");
    } catch (error: unknown) {
      if ((error as { code?: unknown }).code !== "EEXIST") {
        throw error;
      }
    }
  }

  throw new Error(`Unable to create a unique variant directory for ${parentPath}`);
}
|
|
97
|
+
|
|
98
|
+
/**
 * Translate measured weaknesses into the failure-pattern records that the
 * proposal helpers take as input.
 *
 * Up to three patterns are produced, in this order:
 *  - `reflective-<skill>-replay` ("explicit") when replay failures exist,
 *  - `reflective-<skill>-routing` ("contextual") when routing failures exist,
 *  - `reflective-<skill>-grading` ("implicit") when grading failure patterns
 *    exist; its missed queries reuse the replay samples, else the routing
 *    samples, else the first three grading pattern texts.
 *
 * Sample lists are trimmed, de-duplicated, stripped of blanks and capped at 8.
 *
 * @param skillName - Skill name stored on each pattern and embedded in its id.
 * @param weaknesses - Evaluation-derived weaknesses to convert.
 * @returns The synthesized patterns (possibly empty).
 */
function buildFailurePatternsFromWeaknesses(
  skillName: string,
  weaknesses: MutationWeaknesses,
): FailurePattern[] {
  const normalizeSamples = (samples: string[]): string[] => {
    const trimmed = samples.map((sample) => sample.trim());
    return Array.from(new Set(trimmed)).filter(Boolean).slice(0, 8);
  };

  const extractedAt = new Date().toISOString();
  const replayFailures = normalizeSamples(weaknesses.replayFailureSamples);
  const routingFailures = normalizeSamples(weaknesses.routingFailureSamples);
  const gradingNotes = weaknesses.gradingFailurePatterns ?? [];
  const collected: FailurePattern[] = [];

  if (replayFailures.length > 0) {
    collected.push({
      pattern_id: `reflective-${skillName}-replay`,
      skill_name: skillName,
      invocation_type: "explicit",
      missed_queries: replayFailures,
      frequency: replayFailures.length,
      sample_sessions: [],
      extracted_at: extractedAt,
    });
  }

  if (routingFailures.length > 0) {
    collected.push({
      pattern_id: `reflective-${skillName}-routing`,
      skill_name: skillName,
      invocation_type: "contextual",
      missed_queries: routingFailures,
      frequency: routingFailures.length,
      sample_sessions: [],
      extracted_at: extractedAt,
    });
  }

  if (gradingNotes.length > 0) {
    // Prefer real failing queries; fall back to the grading texts themselves.
    let missedQueries: string[];
    if (replayFailures.length > 0) {
      missedQueries = replayFailures;
    } else if (routingFailures.length > 0) {
      missedQueries = routingFailures;
    } else {
      missedQueries = gradingNotes.slice(0, 3);
    }

    collected.push({
      pattern_id: `reflective-${skillName}-grading`,
      skill_name: skillName,
      invocation_type: "implicit",
      missed_queries: missedQueries,
      frequency: gradingNotes.length,
      sample_sessions: [],
      extracted_at: extractedAt,
      feedback: gradingNotes.slice(0, 5).map((note, position) => ({
        query: missedQueries[position] ?? `quality-review-${position + 1}`,
        failure_reason: note,
        improvement_hint: note,
        invocation_type: "implicit",
      })),
    });
  }

  return collected;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Assemble a skill file from the parsed frontmatter and title plus a
 * replacement body.
 *
 * The frontmatter (when present) is right-trimmed and followed by a blank
 * line; the title, a blank line, the trimmed body and a final newline follow.
 * The parsed description and sections are not emitted — only `proposedBody`.
 *
 * @param parsed - Parsed parent skill; only `frontmatter` and `title` are read.
 * @param proposedBody - New body text placed after the title.
 * @returns The assembled skill file content.
 */
function rebuildSkillWithBody(
  parsed: ReturnType<typeof parseSkillSections>,
  proposedBody: string,
): string {
  const header = parsed.frontmatter ? [parsed.frontmatter.trimEnd(), ""] : [];
  return [...header, parsed.title, "", proposedBody.trim(), ""].join("\n");
}
|
|
177
|
+
|
|
178
|
+
// ---------------------------------------------------------------------------
|
|
179
|
+
// Routing mutation strategies
|
|
180
|
+
// ---------------------------------------------------------------------------
|
|
181
|
+
|
|
182
|
+
/**
|
|
183
|
+
* Deterministic routing table mutations. Each strategy modifies the routing
|
|
184
|
+
* table in a different way to explore the search space:
|
|
185
|
+
*
|
|
186
|
+
* 1. Synonym expansion — adds synonym triggers for existing rows
|
|
187
|
+
* 2. Granularity split — splits broad triggers into more specific ones
|
|
188
|
+
* 3. Coverage broadening — adds catch-all/fuzzy trigger rows
|
|
189
|
+
*/
|
|
190
|
+
|
|
191
|
+
/** One data row of a skill's markdown routing table. */
interface RoutingRow {
  /** Text of the row's first cell. */
  trigger: string;
  /** Text of the row's second cell. */
  workflow: string;
}
|
|
195
|
+
|
|
196
|
+
/**
 * Parse a markdown routing table into rows.
 *
 * Only lines that start and end with `|` (after trimming) are treated as
 * table lines. The first two are taken to be the header and separator and
 * are skipped; every remaining line yields one row from its first two cells.
 *
 * Cells are addressed by position: only the outer pipes are stripped, so an
 * empty first cell produces an empty `trigger` rather than letting the
 * workflow text slide into the trigger column.
 *
 * @param tableContent - Section text that may contain a markdown table.
 * @returns Parsed rows, or an empty array when there is no header, separator
 *   and at least one data line.
 */
function parseRoutingTable(tableContent: string): RoutingRow[] {
  const tableLines = tableContent
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.startsWith("|") && line.endsWith("|"));

  // header + separator + at least one data row
  if (tableLines.length < 3) return [];

  return tableLines.slice(2).map((line) => {
    // Drop only the leading and trailing pipe so interior empty cells keep their position.
    const cells = line
      .slice(1, -1)
      .split("|")
      .map((cell) => cell.trim());
    return {
      trigger: cells[0] ?? "",
      workflow: cells[1] ?? "",
    };
  });
}
|
|
217
|
+
|
|
218
|
+
/**
 * Serialize routing rows as a two-column markdown table.
 *
 * The output always starts with the fixed `Trigger` / `Workflow` header and
 * separator, followed by one line per row; there is no trailing newline.
 *
 * @param rows - Rows to render, in output order.
 * @returns The markdown table text.
 */
function renderRoutingTable(rows: RoutingRow[]): string {
  let table = "| Trigger | Workflow |\n| --- | --- |";
  for (const { trigger, workflow } of rows) {
    table += `\n| ${trigger} | ${workflow} |`;
  }
  return table;
}
|
|
227
|
+
|
|
228
|
+
/**
 * Strategy 1: Add synonym triggers for each existing row.
 *
 * For every whitespace-separated word of a trigger that has an entry in the
 * synonym table, one new row is appended in which that word is replaced by
 * the first synonym that does not produce an already-present trigger
 * (compared case-insensitively). New rows keep the workflow of their source.
 *
 * The synonym table is a `Map`, so trigger words that happen to match
 * `Object.prototype` members (for example "constructor") are simply treated
 * as having no synonyms instead of crashing the lookup.
 *
 * @param rows - Current routing rows; not mutated.
 * @returns The original rows followed by the added ones, plus a summary.
 */
function synonymExpansion(rows: RoutingRow[]): { rows: RoutingRow[]; description: string } {
  const synonymMap = new Map<string, readonly string[]>([
    ["create", ["add", "new", "make"]],
    ["list", ["show", "display", "view"]],
    ["update", ["edit", "modify", "change"]],
    ["delete", ["remove", "drop", "destroy"]],
    ["get", ["fetch", "retrieve", "read"]],
    ["search", ["find", "lookup", "query"]],
    ["run", ["execute", "start", "launch"]],
    ["stop", ["halt", "end", "terminate"]],
    ["deploy", ["publish", "release", "ship"]],
    ["test", ["verify", "check", "validate"]],
  ]);

  const expanded: RoutingRow[] = [...rows];
  // Lower-cased triggers already in the table, including ones added below.
  const knownTriggers = new Set(rows.map((row) => row.trigger.toLowerCase()));
  const addedTriggers: string[] = [];

  for (const row of rows) {
    for (const word of row.trigger.toLowerCase().split(/\s+/)) {
      const synonyms = synonymMap.get(word);
      if (!synonyms) continue;

      // `word` is one of the plain table keys here, so it is safe inside a pattern.
      const wordPattern = new RegExp(`\\b${word}\\b`, "i");

      // Pick the first synonym not already present.
      for (const synonym of synonyms) {
        const newTrigger = row.trigger.replace(wordPattern, synonym);
        const key = newTrigger.toLowerCase();
        if (knownTriggers.has(key)) continue;

        expanded.push({ trigger: newTrigger, workflow: row.workflow });
        knownTriggers.add(key);
        addedTriggers.push(newTrigger);
        break;
      }
    }
  }

  return {
    rows: expanded,
    description:
      addedTriggers.length > 0
        ? `Synonym expansion: added triggers [${addedTriggers.join(", ")}]`
        : "Synonym expansion: no new synonyms found",
  };
}
|
|
273
|
+
|
|
274
|
+
/**
 * Strategy 2: Split triggers into more specific forms.
 *
 * Each row is kept and immediately followed by a more specific copy whose
 * trigger has a qualifier appended. Qualifiers rotate by row position
 * ("by name", "by id", "all", "recent", then repeat), so all four are used.
 * Selecting by the size of the growing result list instead would only ever
 * see odd sizes and skip "by name" and "all" entirely.
 *
 * @param rows - Current routing rows; not mutated.
 * @returns The interleaved original and specific rows, plus a summary.
 */
function granularitySplit(rows: RoutingRow[]): { rows: RoutingRow[]; description: string } {
  const qualifiers = ["by name", "by id", "all", "recent"];
  const result: RoutingRow[] = [];
  const splits: string[] = [];

  rows.forEach((row, rowIndex) => {
    result.push(row);
    // Rotate by the row's own position so consecutive rows get different qualifiers.
    const qualifier = qualifiers[rowIndex % qualifiers.length];
    const specificTrigger = `${row.trigger} ${qualifier}`;
    result.push({ trigger: specificTrigger, workflow: row.workflow });
    splits.push(specificTrigger);
  });

  return {
    rows: result,
    description: `Granularity split: added specific triggers [${splits.join(", ")}]`,
  };
}
|
|
294
|
+
|
|
295
|
+
/**
 * Strategy 3: Add broader catch-all patterns.
 *
 * Rows are grouped by workflow. For each workflow, the first trigger word
 * that is not one of the known action verbs is used to build a
 * `help with <word>` trigger, which is appended unless an identical trigger
 * (case-insensitive) already exists.
 *
 * Empty tokens (from empty or whitespace-padded triggers) are ignored, so no
 * dangling `help with ` row is produced for them.
 *
 * @param rows - Current routing rows; not mutated.
 * @returns The original rows followed by the added ones, plus a summary.
 */
function coverageBroadening(rows: RoutingRow[]): { rows: RoutingRow[]; description: string } {
  const actionVerbs = new Set(["create", "list", "update", "delete", "get", "set", "run", "stop"]);

  // Group triggers by workflow, preserving first-seen workflow order.
  const workflowGroups = new Map<string, string[]>();
  for (const row of rows) {
    const triggers = workflowGroups.get(row.workflow);
    if (triggers) {
      triggers.push(row.trigger);
    } else {
      workflowGroups.set(row.workflow, [row.trigger]);
    }
  }

  const broadened: RoutingRow[] = [...rows];
  const knownTriggers = new Set(rows.map((row) => row.trigger.toLowerCase()));
  const added: string[] = [];

  for (const [workflow, triggers] of workflowGroups) {
    // First real word that is not an action verb stands in for the workflow's subject.
    const subject = triggers
      .flatMap((trigger) => trigger.split(/\s+/))
      .find((word) => word.length > 0 && !actionVerbs.has(word.toLowerCase()));
    if (subject === undefined) continue;

    const helpTrigger = `help with ${subject}`;
    const key = helpTrigger.toLowerCase();
    if (knownTriggers.has(key)) continue;

    broadened.push({ trigger: helpTrigger, workflow });
    knownTriggers.add(key);
    added.push(helpTrigger);
  }

  return {
    rows: broadened,
    description:
      added.length > 0
        ? `Coverage broadening: added catch-all triggers [${added.join(", ")}]`
        : "Coverage broadening: no new patterns added",
  };
}
|
|
334
|
+
|
|
335
|
+
/** Deterministic routing strategies; the routing generator picks one per variant by index modulo the list length. */
const ROUTING_STRATEGIES: Array<
  (rows: RoutingRow[]) => { rows: RoutingRow[]; description: string }
> = [synonymExpansion, granularitySplit, coverageBroadening];
|
|
336
|
+
|
|
337
|
+
// ---------------------------------------------------------------------------
|
|
338
|
+
// Body mutation strategies
|
|
339
|
+
// ---------------------------------------------------------------------------
|
|
340
|
+
|
|
341
|
+
/**
|
|
342
|
+
* Deterministic body mutations. Each strategy modifies the skill body
|
|
343
|
+
* in a different way:
|
|
344
|
+
*
|
|
345
|
+
* 1. Instruction emphasis — reorders and highlights key instructions
|
|
346
|
+
* 2. Example enrichment — adds generated example phrases
|
|
347
|
+
* 3. Description expansion — expands the description paragraph
|
|
348
|
+
*/
|
|
349
|
+
|
|
350
|
+
/**
 * Strategy 1: Reorder instructions to emphasize different aspects.
 *
 * When the "Instructions" section holds more than one numbered line
 * (`<digits>.` after optional indentation), those lines are reversed and
 * renumbered from 1, and every non-numbered line is moved ahead of them.
 * Renumbering keeps each line's own indentation, so indented steps are
 * renumbered as well instead of keeping their old, now descending, numbers.
 *
 * The returned description always gets a sentence appended stating that the
 * skill prioritizes the final steps of its workflow.
 *
 * @param parsed - Parsed parent skill; `sections` and `description` are read.
 * @param _fullContent - Unused; kept so all body strategies share one signature.
 * @returns Updated sections, a mutation summary and the new description text.
 */
function instructionEmphasis(
  parsed: ReturnType<typeof parseSkillSections>,
  _fullContent: string,
): { sections: Record<string, string>; description: string; desc: string } {
  const newSections = { ...parsed.sections };
  const instructions = newSections["Instructions"] || "";

  if (instructions) {
    const isNumbered = (line: string): boolean => /^\d+\./.test(line.trim());
    const lines = instructions.split("\n");
    const numbered = lines.filter(isNumbered);

    if (numbered.length > 1) {
      const nonNumbered = lines.filter((line) => !isNumbered(line));
      // Copy before reversing (no dependency on ES2023 `toReversed`), then
      // renumber while preserving each line's leading whitespace.
      const reversed = [...numbered]
        .reverse()
        .map((line, position) =>
          line.replace(/^(\s*)\d+\./, (_match, indent: string) => `${indent}${position + 1}.`),
        );
      newSections["Instructions"] = [...nonNumbered, ...reversed].join("\n").trim();
    }
  }

  return {
    sections: newSections,
    description: "Instruction emphasis: reordered instruction steps",
    desc: `${parsed.description}\n\nThis skill prioritizes the final steps of its workflow.`,
  };
}
|
|
378
|
+
|
|
379
|
+
/**
 * Strategy 2: Enrich examples section.
 *
 * Builds one `- "I need to <trigger>"` line per non-empty trigger in the
 * "Workflow Routing" table and appends the lines that are not already
 * present to the "Examples" section (creating the section when needed).
 *
 * When there is nothing new to add the sections are returned unchanged — no
 * empty "Examples" section is created — and the summary says so.
 *
 * @param parsed - Parsed parent skill; `sections` and `description` are read.
 * @param _fullContent - Unused; kept so all body strategies share one signature.
 * @returns Updated sections, a mutation summary and the unchanged description.
 */
function exampleEnrichment(
  parsed: ReturnType<typeof parseSkillSections>,
  _fullContent: string,
): { sections: Record<string, string>; description: string; desc: string } {
  const newSections = { ...parsed.sections };
  const examples = newSections["Examples"] || "";

  // Generate additional example patterns from existing triggers
  const routing = newSections["Workflow Routing"] || "";
  const knownLines = new Set(examples.split("\n").map((line) => line.trim()));
  const newExamples: string[] = [];

  for (const row of parseRoutingTable(routing)) {
    if (!row.trigger) continue;
    const exampleLine = `- "I need to ${row.trigger}"`;
    // Skip phrases already listed (or produced by a duplicate trigger).
    if (knownLines.has(exampleLine)) continue;
    knownLines.add(exampleLine);
    newExamples.push(exampleLine);
  }

  if (newExamples.length === 0) {
    return {
      sections: newSections,
      description: "Example enrichment: no new example phrases to add",
      desc: parsed.description,
    };
  }

  newSections["Examples"] = examples
    ? `${examples}\n${newExamples.join("\n")}`
    : newExamples.join("\n");

  return {
    sections: newSections,
    description: "Example enrichment: added example phrases from routing triggers",
    desc: parsed.description,
  };
}
|
|
402
|
+
|
|
403
|
+
/**
 * Strategy 3: Expand the description paragraph.
 *
 * Appends a "Capabilities include: ..." sentence listing every trigger from
 * the "Workflow Routing" table; when the table yields no trigger text, a
 * generic sentence about workflow automation is appended instead. Sections
 * are returned as an unmodified shallow copy.
 *
 * @param parsed - Parsed parent skill; `sections` and `description` are read.
 * @param _fullContent - Unused; kept so all body strategies share one signature.
 * @returns Copied sections, a mutation summary and the expanded description.
 */
function descriptionExpansion(
  parsed: ReturnType<typeof parseSkillSections>,
  _fullContent: string,
): { sections: Record<string, string>; description: string; desc: string } {
  const routingTable = parsed.sections["Workflow Routing"] || "";
  const triggerList = parseRoutingTable(routingTable)
    .map(({ trigger }) => trigger)
    .join(", ");

  let expandedDesc: string;
  if (triggerList) {
    expandedDesc = `${parsed.description} Capabilities include: ${triggerList}.`;
  } else {
    expandedDesc = `${parsed.description} This skill provides comprehensive workflow automation.`;
  }

  return {
    sections: { ...parsed.sections },
    description: "Description expansion: added capability summary from routing table",
    desc: expandedDesc,
  };
}
|
|
422
|
+
|
|
423
|
+
/** Deterministic body strategies; the body generator picks one per variant by index modulo the list length. */
const BODY_STRATEGIES: Array<
  (
    parsed: ReturnType<typeof parseSkillSections>,
    _fullContent: string,
  ) => { sections: Record<string, string>; description: string; desc: string }
> = [instructionEmphasis, exampleEnrichment, descriptionExpansion];
|
|
424
|
+
|
|
425
|
+
// ---------------------------------------------------------------------------
|
|
426
|
+
// Reassembly
|
|
427
|
+
// ---------------------------------------------------------------------------
|
|
428
|
+
|
|
429
|
+
/**
 * Reassemble a SKILL.md from parsed sections.
 *
 * Output order: frontmatter (if any), title (if any), description, then each
 * section as a `## <name>` heading followed by its content. Every part is
 * separated by a blank line, and the result ends with exactly one newline.
 *
 * @param parsed - Parsed parent skill.
 * @param sectionOverrides - Sections to emit instead of `parsed.sections`.
 * @param descriptionOverride - Description to emit instead of `parsed.description`.
 * @returns The reassembled skill file content.
 */
function reassembleSkill(
  parsed: ReturnType<typeof parseSkillSections>,
  sectionOverrides?: Record<string, string>,
  descriptionOverride?: string,
): string {
  const preamble: string[] = [];
  if (parsed.frontmatter) {
    preamble.push(parsed.frontmatter, "");
  }
  if (parsed.title) {
    preamble.push(parsed.title, "");
  }
  preamble.push(descriptionOverride ?? parsed.description, "");

  const sectionLines = Object.entries(sectionOverrides ?? parsed.sections).flatMap(
    ([name, content]) => [`## ${name}`, "", content, ""],
  );

  const joined = [...preamble, ...sectionLines].join("\n");
  return `${joined.trimEnd()}\n`;
}
|
|
459
|
+
|
|
460
|
+
// ---------------------------------------------------------------------------
|
|
461
|
+
// Public API
|
|
462
|
+
// ---------------------------------------------------------------------------
|
|
463
|
+
|
|
464
|
+
/**
 * Generate N routing variants of a skill. Each variant has a modified
 * Workflow Routing table while preserving all other content.
 *
 * Variant `i` applies routing strategy `i % <number of strategies>` to the
 * parent's rows, so asking for more variants than there are strategies
 * cycles through them again. Every variant is written to its own temporary
 * SKILL.md.
 *
 * @param skillPath - Path of the parent skill file.
 * @param options - Only `maxVariants` is read here (default 3).
 * @returns One result per generated variant, in generation order.
 * @throws If the skill file does not exist or has no parsable routing rows.
 */
export async function generateRoutingMutations(
  skillPath: string,
  options?: BoundedMutationOptions,
): Promise<BoundedMutationResult[]> {
  if (!existsSync(skillPath)) {
    throw new Error(`Skill file not found: ${skillPath}`);
  }

  const variantCount = options?.maxVariants ?? 3;
  const parentContent = readFileSync(skillPath, "utf-8");
  const parentFingerprint = fingerprintContent(parentContent);
  const routingSection = parseSkillSections(parentContent).sections["Workflow Routing"] || "";
  const parentRows = parseRoutingTable(routingSection);

  if (parentRows.length === 0) {
    throw new Error(`No routing table found in ${skillPath}`);
  }

  const variants: BoundedMutationResult[] = [];
  let variantIndex = 0;

  while (variantIndex < variantCount) {
    const applyStrategy = ROUTING_STRATEGIES[variantIndex % ROUTING_STRATEGIES.length];
    const mutation = applyStrategy(parentRows);

    // Swap in only the routing section; the rest of the file is untouched.
    const variantContent = replaceSection(
      parentContent,
      "Workflow Routing",
      renderRoutingTable(mutation.rows),
    );
    const variantSkillPath = createVariantDir(skillPath, variantIndex);
    writeFileSync(variantSkillPath, variantContent, "utf-8");

    variants.push({
      variantSkillPath,
      mutationSurface: "routing",
      mutationDescription: mutation.description,
      parentFingerprint,
    });
    variantIndex += 1;
  }

  return variants;
}
|
|
510
|
+
|
|
511
|
+
/**
 * Generate N body variants of a skill. Each variant has modified
 * body content (instructions, examples, description) while preserving
 * the overall SKILL.md structure.
 *
 * Variant `i` applies body strategy `i % <number of strategies>` to the
 * parsed parent and is reassembled and written to its own temporary SKILL.md.
 *
 * @param skillPath - Path of the parent skill file.
 * @param options - Only `maxVariants` is read here (default 3).
 * @returns One result per generated variant, in generation order.
 * @throws If the skill file does not exist.
 */
export async function generateBodyMutations(
  skillPath: string,
  options?: BoundedMutationOptions,
): Promise<BoundedMutationResult[]> {
  if (!existsSync(skillPath)) {
    throw new Error(`Skill file not found: ${skillPath}`);
  }

  const variantCount = options?.maxVariants ?? 3;
  const parentContent = readFileSync(skillPath, "utf-8");
  const parentFingerprint = fingerprintContent(parentContent);
  const parentSections = parseSkillSections(parentContent);

  const variants: BoundedMutationResult[] = [];

  for (let variantIndex = 0; variantIndex < variantCount; variantIndex += 1) {
    const applyStrategy = BODY_STRATEGIES[variantIndex % BODY_STRATEGIES.length];
    const mutation = applyStrategy(parentSections, parentContent);

    const variantContent = reassembleSkill(parentSections, mutation.sections, mutation.desc);
    const variantSkillPath = createVariantDir(skillPath, variantIndex);
    writeFileSync(variantSkillPath, variantContent, "utf-8");

    variants.push({
      variantSkillPath,
      mutationSurface: "body",
      mutationDescription: mutation.description,
      parentFingerprint,
    });
  }

  return variants;
}
|
|
550
|
+
|
|
551
|
+
// ---------------------------------------------------------------------------
|
|
552
|
+
// Reflective routing/body mutations (measured evidence -> LLM proposal)
|
|
553
|
+
// ---------------------------------------------------------------------------
|
|
554
|
+
|
|
555
|
+
/**
 * Produce at most one routing variant from a proposal that is driven by the
 * measured replay and routing failures.
 *
 * Returns an empty list, without requesting a proposal or touching the
 * filesystem, when any of the following holds:
 *  - `options.maxVariants` is given and is below 1,
 *  - the skill file does not exist or `options.agent` is empty,
 *  - there are no non-blank failure samples,
 *  - the skill has no "Workflow Routing" section content.
 *
 * Checking the variant limit up front matters: applying it only to the final
 * array would still pay for a proposal and leave behind a temp directory that
 * the caller never hears about.
 *
 * @param skillPath - Path of the parent skill file.
 * @param weaknesses - Evaluation-derived weaknesses; the failure samples drive the proposal.
 * @param options - Skill name, agent and optional model/effort settings.
 * @param deps - Optional replacement for the routing proposal helper.
 * @returns Zero or one variant written to a temporary SKILL.md.
 */
export async function generateReflectiveRoutingMutations(
  skillPath: string,
  weaknesses: MutationWeaknesses,
  options: ReflectiveMutationOptions,
  deps: ReflectiveMutationDeps = {},
): Promise<BoundedMutationResult[]> {
  // Mirrors `slice(0, maxVariants)` on a one-element array: anything below 1 yields nothing.
  const variantLimit = options.maxVariants ?? 1;
  if (!(variantLimit >= 1)) {
    return [];
  }

  if (!existsSync(skillPath) || !options.agent) {
    return [];
  }

  const allFailures = [...weaknesses.replayFailureSamples, ...weaknesses.routingFailureSamples]
    .map((sample) => sample.trim())
    .filter(Boolean);
  if (allFailures.length === 0) {
    return [];
  }

  const content = readFileSync(skillPath, "utf-8");
  const parsed = parseSkillSections(content);
  const currentRouting = parsed.sections["Workflow Routing"] ?? "";
  if (!currentRouting.trim()) {
    return [];
  }

  const proposeRouting = deps.generateRoutingProposal ?? generateRoutingProposal;
  const proposal = await proposeRouting(
    currentRouting,
    content,
    buildFailurePatternsFromWeaknesses(options.skillName, weaknesses),
    [...new Set(allFailures)],
    options.skillName,
    skillPath,
    options.agent,
    options.modelFlag,
    options.effort,
  );

  // Only the routing section is replaced; everything else stays as in the parent.
  const variantContent = replaceSection(content, "Workflow Routing", proposal.proposed_body.trim());
  const variantPath = createVariantDir(skillPath, 0);
  writeFileSync(variantPath, variantContent, "utf-8");

  return [
    {
      variantSkillPath: variantPath,
      mutationSurface: "routing",
      mutationDescription: `Reflective: ${proposal.rationale}`,
      parentFingerprint: fingerprintContent(content),
    },
  ];
}
|
|
604
|
+
|
|
605
|
+
/**
 * Produce at most one body variant from a proposal that is driven by the
 * measured body-quality and grading weaknesses.
 *
 * Returns an empty list, without requesting a proposal or touching the
 * filesystem, when any of the following holds:
 *  - `options.maxVariants` is given and is below 1,
 *  - the skill file does not exist or `options.agent` is empty,
 *  - none of the weakness signals fire: body quality score below 0.8, grading
 *    pass-rate delta below -0.05, or at least one grading failure pattern.
 *
 * Failure samples are trimmed, de-blanked and de-duplicated before being
 * handed to the proposal helper, matching the routing counterpart.
 *
 * @param skillPath - Path of the parent skill file.
 * @param weaknesses - Evaluation-derived weaknesses that gate and drive the proposal.
 * @param options - Skill name, agent and optional model/effort settings.
 * @param deps - Optional replacement for the body proposal helper.
 * @returns Zero or one variant written to a temporary SKILL.md.
 */
export async function generateReflectiveBodyMutations(
  skillPath: string,
  weaknesses: MutationWeaknesses,
  options: ReflectiveMutationOptions,
  deps: ReflectiveMutationDeps = {},
): Promise<BoundedMutationResult[]> {
  // Mirrors `slice(0, maxVariants)` on a one-element array: anything below 1 yields nothing.
  // Deciding this first avoids a wasted proposal and an orphaned temp directory.
  const variantLimit = options.maxVariants ?? 1;
  if (!(variantLimit >= 1)) {
    return [];
  }

  if (!existsSync(skillPath) || !options.agent) {
    return [];
  }

  const hasBodyWeakness = weaknesses.bodyQualityScore < 0.8;
  const hasGradingDecline = weaknesses.gradingPassRateDelta < -0.05;
  const hasFailurePatterns = (weaknesses.gradingFailurePatterns?.length ?? 0) > 0;
  if (!hasBodyWeakness && !hasGradingDecline && !hasFailurePatterns) {
    return [];
  }

  const failureSamples = [
    ...new Set(
      [...weaknesses.replayFailureSamples, ...weaknesses.routingFailureSamples]
        .map((sample) => sample.trim())
        .filter(Boolean),
    ),
  ];

  const content = readFileSync(skillPath, "utf-8");
  const parsed = parseSkillSections(content);
  const proposeBody = deps.generateBodyProposal ?? generateBodyProposal;
  const proposal = await proposeBody(
    content,
    buildFailurePatternsFromWeaknesses(options.skillName, weaknesses),
    failureSamples,
    options.skillName,
    skillPath,
    options.agent,
    options.modelFlag,
    undefined,
    undefined,
    options.effort,
  );

  const variantPath = createVariantDir(skillPath, 0);
  writeFileSync(variantPath, rebuildSkillWithBody(parsed, proposal.proposed_body), "utf-8");

  return [
    {
      variantSkillPath: variantPath,
      mutationSurface: "body",
      mutationDescription: `Reflective: ${proposal.rationale}`,
      parentFingerprint: fingerprintContent(content),
    },
  ];
}
|
|
649
|
+
|
|
650
|
+
// ---------------------------------------------------------------------------
|
|
651
|
+
// Targeted routing mutations (eval-informed)
|
|
652
|
+
// ---------------------------------------------------------------------------
|
|
653
|
+
|
|
654
|
+
/**
 * Extract the most frequent meaningful words from a list of queries.
 *
 * Each query is lower-cased and split on whitespace, and every token is reduced
 * to ASCII letters, digits and hyphens. A token is counted only if it is at
 * least two characters long, contains a letter or digit, and is not a stop word.
 *
 * @param queries - Raw query strings.
 * @returns Up to ten keywords, most frequent first; ties keep first-seen order
 *   (the sort is stable).
 */
function extractKeywords(queries: string[]): string[] {
  const stopWords = new Set([
    "a",
    "an",
    "the",
    "to",
    "for",
    "of",
    "in",
    "on",
    "at",
    "is",
    "it",
    "my",
    "me",
    "i",
    "do",
    "can",
    "you",
    "how",
    "what",
    "this",
    "that",
    "with",
    "and",
    "or",
    "but",
  ]);
  const counts = new Map<string, number>();
  for (const query of queries) {
    for (const token of query.toLowerCase().split(/\s+/)) {
      const cleaned = token.replace(/[^a-z0-9-]/g, "");
      // Hyphens survive the cleaning step, so a bare "--" would otherwise pass the
      // length check and be counted as a keyword; require a letter or digit.
      if (cleaned.length > 1 && /[a-z0-9]/.test(cleaned) && !stopWords.has(cleaned)) {
        counts.set(cleaned, (counts.get(cleaned) ?? 0) + 1);
      }
    }
  }
  return [...counts.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10)
    .map(([word]) => word);
}
|
|
697
|
+
|
|
698
|
+
/**
 * Drop routing rows whose trigger repeats an earlier one.
 *
 * Triggers are compared case-insensitively; the first row seen for a trigger
 * wins and the relative order of the surviving rows is preserved.
 */
function deduplicateRows(rows: RoutingRow[]): RoutingRow[] {
  // A Map iterates in insertion order, so the values come back in first-seen order.
  const firstByTrigger = new Map<string, RoutingRow>();
  for (const row of rows) {
    const normalizedTrigger = row.trigger.toLowerCase();
    if (!firstByTrigger.has(normalizedTrigger)) {
      firstByTrigger.set(normalizedTrigger, row);
    }
  }
  return [...firstByTrigger.values()];
}
|
|
708
|
+
|
|
709
|
+
/**
 * Build skill variants aimed at the queries that failed replay or routing.
 *
 * Strategies run in order while fewer than `maxVariants` (default 3) variants
 * have been staged:
 *   1. append one routing row per failed query,
 *   2. append "<verb> <keyword>" rows built from the failure keywords,
 *   3. append an "Also handles: ..." note to the skill description.
 * Added rows point at the first existing row's workflow, or at "Default" when
 * the routing table has no rows. Every variant is written to disk.
 *
 * @param skillPath - Path of the skill file to mutate.
 * @param weaknesses - Supplies the replay and routing failure samples.
 * @param options - Optional `maxVariants` cap.
 * @returns The staged variants; empty (and the skill file is not read) when
 *   there are no failure samples.
 */
export function generateTargetedRoutingMutations(
  skillPath: string,
  weaknesses: MutationWeaknesses,
  options?: { maxVariants?: number },
): BoundedMutationResult[] {
  const failedQueries = [...weaknesses.replayFailureSamples, ...weaknesses.routingFailureSamples];
  if (failedQueries.length === 0) {
    // Nothing failed, so there is nothing to target.
    return [];
  }

  const content = readFileSync(skillPath, "utf-8");
  const fingerprint = fingerprintContent(content);
  const parsed = parseSkillSections(content);
  const existingRows = parseRoutingTable(parsed.sections["Workflow Routing"] ?? "");
  const maxVariants = options?.maxVariants ?? 3;
  const results: BoundedMutationResult[] = [];
  const keywords = extractKeywords(failedQueries);
  const fallbackWorkflow = existingRows.length > 0 ? existingRows[0].workflow : "Default";

  const hasBudget = (): boolean => results.length < maxVariants;

  // Writes one variant file and records it; the variant index is the current result count.
  const stageVariant = (variantContent: string, mutationDescription: string): void => {
    const variantPath = createVariantDir(skillPath, results.length);
    writeFileSync(variantPath, variantContent, "utf-8");
    results.push({
      variantSkillPath: variantPath,
      mutationSurface: "routing",
      mutationDescription,
      parentFingerprint: fingerprint,
    });
  };

  // Re-renders the routing section with the extra rows appended and duplicates removed.
  const withExtraRows = (extraRows: typeof existingRows): string => {
    const mergedRows = deduplicateRows([...existingRows, ...extraRows]);
    return replaceSection(content, "Workflow Routing", renderRoutingTable(mergedRows));
  };

  // Strategy 1: one routing row per failed query (failures are known to be non-empty here).
  if (hasBudget()) {
    const failureRows = failedQueries.map((query) => ({
      trigger: query.toLowerCase().trim(),
      workflow: fallbackWorkflow,
    }));
    stageVariant(withExtraRows(failureRows), "Targeted: added failure-derived routing rows");
  }

  // Strategy 2: pair the first two leading words of existing triggers with the top five keywords.
  if (hasBudget() && keywords.length > 0) {
    const leadingWords = existingRows.map((row) => row.trigger.split(/\s+/)[0]).filter(Boolean);
    const verbs = (leadingWords.length > 0 ? leadingWords : ["manage"]).slice(0, 2);
    const keywordRows = keywords.slice(0, 5).flatMap((keyword) =>
      verbs.map((verb) => ({
        trigger: `${verb} ${keyword}`,
        workflow: fallbackWorkflow,
      })),
    );
    stageVariant(withExtraRows(keywordRows), "Targeted: added keyword-expanded routing rows");
  }

  // Strategy 3: mention the failure keywords in the description for broader matching.
  if (hasBudget() && keywords.length > 0) {
    const keywordNote = `Also handles: ${keywords.join(", ")}.`;
    const augmentedDescription = parsed.description
      ? `${parsed.description}\n\n${keywordNote}`
      : keywordNote;
    stageVariant(
      reassembleSkill(parsed, undefined, augmentedDescription),
      "Targeted: augmented description with failure keywords",
    );
  }

  return results.slice(0, maxVariants);
}
|
|
802
|
+
|
|
803
|
+
// ---------------------------------------------------------------------------
|
|
804
|
+
// Targeted body mutations (eval-informed)
|
|
805
|
+
// ---------------------------------------------------------------------------
|
|
806
|
+
|
|
807
|
+
/**
 * Build skill-body variants aimed at grading failures and low body quality.
 *
 * Nothing is produced (and the skill file is not read) unless at least one
 * signal is present: body quality score < 0.8, pass-rate delta < -0.05, or a
 * non-empty list of grading failure patterns. Strategies run in order while
 * fewer than `maxVariants` (default 3) variants have been staged:
 *   1. append an emphasis note to the "Instructions" section
 *      (on grading decline or failure patterns),
 *   2. append failure-informed bullet points to the "Examples" section
 *      (on failure patterns),
 *   3. set a "Quality Checklist" section (on low body quality).
 * Every variant is written to disk.
 *
 * @param skillPath - Path of the skill file to mutate.
 * @param weaknesses - Quality score, pass-rate delta and optional failure patterns.
 * @param options - Optional `maxVariants` cap.
 * @returns The staged variants.
 */
export function generateTargetedBodyMutations(
  skillPath: string,
  weaknesses: MutationWeaknesses,
  options?: { maxVariants?: number },
): BoundedMutationResult[] {
  const failurePatterns = weaknesses.gradingFailurePatterns ?? [];
  const hasBodyWeakness = weaknesses.bodyQualityScore < 0.8;
  const hasGradingDecline = weaknesses.gradingPassRateDelta < -0.05;
  const hasFailurePatterns = failurePatterns.length > 0;

  if (!hasBodyWeakness && !hasGradingDecline && !hasFailurePatterns) {
    return [];
  }

  const content = readFileSync(skillPath, "utf-8");
  const fingerprint = fingerprintContent(content);
  const parsed = parseSkillSections(content);
  const maxVariants = options?.maxVariants ?? 3;
  const results: BoundedMutationResult[] = [];

  // Writes one variant file and records it; the variant index is the current result count.
  const stageVariant = (variantContent: string, mutationDescription: string): void => {
    const variantPath = createVariantDir(skillPath, results.length);
    writeFileSync(variantPath, variantContent, "utf-8");
    results.push({
      variantSkillPath: variantPath,
      mutationSurface: "body",
      mutationDescription,
      parentFingerprint: fingerprint,
    });
  };

  // Strategy 1: Strengthen instructions based on failure patterns
  if (results.length < maxVariants && (hasGradingDecline || hasFailurePatterns)) {
    const instructions = parsed.sections["Instructions"] ?? "";
    // Pick the fallback by length: joining an empty list yields "", which a `??`
    // fallback would not replace, leaving an empty "Pay special attention to: ." note.
    const failureContext = hasFailurePatterns
      ? failurePatterns.join("; ")
      : "execution quality declined";
    const strengthened = instructions
      ? `${instructions}\n\n**Important:** Pay special attention to: ${failureContext}. Ensure all steps are followed precisely.`
      : `Follow these steps carefully. ${failureContext}. Ensure all steps are followed precisely.`;
    stageVariant(
      reassembleSkill(parsed, { ...parsed.sections, Instructions: strengthened }),
      "Targeted: strengthened instructions from failure patterns",
    );
  }

  // Strategy 2: Expand examples from failure patterns
  if (results.length < maxVariants && hasFailurePatterns) {
    const examples = parsed.sections["Examples"] ?? "";
    const failureExamples = failurePatterns.map((pattern) => `- Address: "${pattern}"`).join("\n");
    const expanded = examples
      ? `${examples}\n\n### Failure-informed examples\n\n${failureExamples}`
      : `### Failure-informed examples\n\n${failureExamples}`;
    stageVariant(
      reassembleSkill(parsed, { ...parsed.sections, Examples: expanded }),
      "Targeted: expanded examples from failure patterns",
    );
  }

  // Strategy 3: Add quality guard section
  if (results.length < maxVariants && hasBodyWeakness) {
    const qualityGuard = `Before completing, verify:\n- All required steps were followed\n- Output matches expected format\n- No errors were silently ignored`;
    stageVariant(
      reassembleSkill(parsed, { ...parsed.sections, "Quality Checklist": qualityGuard }),
      "Targeted: added quality guard checklist",
    );
  }

  return results.slice(0, maxVariants);
}
|
|
894
|
+
|
|
895
|
+
// ---------------------------------------------------------------------------
|
|
896
|
+
// Weakness extraction
|
|
897
|
+
// ---------------------------------------------------------------------------
|
|
898
|
+
|
|
899
|
+
/**
 * Collect mutation weaknesses for a skill from the local database.
 *
 * Reads the five most recent `evolution_evidence` rows that carry validation
 * JSON, and the ten most recent `grading_results` rows, and derives:
 * - replay failure samples: queries with `should_trigger` set that did not trigger,
 * - routing failure samples: queries that triggered but did not pass,
 * - body quality score: the latest grading row's `pass_rate` (1.0 when absent),
 * - grading pass-rate delta: latest minus previous `pass_rate`, left at 0 unless
 *   both rows actually have a numeric rate,
 * - grading failure patterns: text of failed expectations and failure feedback.
 *
 * Query errors (for example a table that does not exist yet) and malformed JSON
 * are tolerated; the affected source simply contributes nothing.
 *
 * @param skillName - Skill whose rows are read.
 * @param db - Database handle; only `db.query(sql).all(skillName)` is used.
 * @returns The collected weaknesses; `gradingFailurePatterns` is `undefined`
 *   when no pattern was found.
 */
export function extractMutationWeaknesses(skillName: string, db: Database): MutationWeaknesses {
  // Sets iterate in insertion order, so they de-duplicate while keeping first-seen order.
  const replayFailures = new Set<string>();
  const routingFailures = new Set<string>();
  const failurePatterns = new Set<string>();
  let bodyQualityScore = 1.0;
  let gradingPassRateDelta = 0;

  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === "string" && value.length > 0;

  // --- Extract replay/routing failures from evolution evidence ---
  try {
    const evidenceRows = db
      .query(
        `SELECT validation_json FROM evolution_evidence
         WHERE skill_name = ? AND validation_json IS NOT NULL
         ORDER BY timestamp DESC LIMIT 5`,
      )
      .all(skillName) as Array<{ validation_json: string }>;

    for (const row of evidenceRows) {
      try {
        const validation = JSON.parse(row.validation_json);
        const entryResults = Array.isArray(validation?.per_entry_results)
          ? validation.per_entry_results
          : [];
        for (const entry of entryResults) {
          // Skip a malformed entry on its own rather than losing the rest of the row,
          // and only accept string queries so the sample lists really are string[].
          if (entry === null || typeof entry !== "object") continue;
          if (!entry.should_trigger || !isNonEmptyString(entry.query)) continue;
          if (!entry.triggered) {
            replayFailures.add(entry.query);
          } else if (!entry.passed) {
            routingFailures.add(entry.query);
          }
        }
      } catch {
        // Skip malformed validation JSON
      }
    }
  } catch {
    // Table may not exist yet
  }

  // --- Extract grading pass rate trend ---
  try {
    const gradingRows = db
      .query(
        `SELECT pass_rate, expectations_json, failure_feedback_json, graded_at FROM grading_results
         WHERE skill_name = ?
         ORDER BY graded_at DESC LIMIT 10`,
      )
      .all(skillName) as Array<{
      pass_rate: number | null;
      expectations_json: string | null;
      failure_feedback_json: string | null;
      graded_at: string;
    }>;

    const [latest, previous] = gradingRows;
    if (latest) {
      const latestRate = typeof latest.pass_rate === "number" ? latest.pass_rate : undefined;
      const previousRate =
        previous && typeof previous.pass_rate === "number" ? previous.pass_rate : undefined;
      bodyQualityScore = latestRate ?? 1.0;
      // Only report a trend when both rates exist; substituting 1.0 for a missing
      // rate would fabricate a decline (e.g. 0.9 - 1.0) that never happened.
      if (latestRate !== undefined && previousRate !== undefined) {
        gradingPassRateDelta = latestRate - previousRate;
      }
    }

    // Extract failure patterns from failed expectations and failure feedback.
    for (const row of gradingRows) {
      try {
        const expectations = row.expectations_json ? JSON.parse(row.expectations_json) : [];
        if (Array.isArray(expectations)) {
          for (const exp of expectations) {
            if (exp?.passed === false) {
              const pattern = exp.text ?? exp.name ?? exp.description;
              if (isNonEmptyString(pattern)) {
                failurePatterns.add(pattern);
              }
            }
          }
        }
      } catch {
        // Skip malformed expectations JSON
      }

      try {
        const feedback = row.failure_feedback_json ? JSON.parse(row.failure_feedback_json) : [];
        if (Array.isArray(feedback)) {
          for (const item of feedback) {
            const pattern = item?.improvement_hint ?? item?.failure_reason ?? item?.query;
            if (isNonEmptyString(pattern)) {
              failurePatterns.add(pattern);
            }
          }
        }
      } catch {
        // Skip malformed failure feedback JSON
      }
    }
  } catch {
    // Table may not exist yet
  }

  const gradingFailurePatterns = [...failurePatterns];
  return {
    replayFailureSamples: [...replayFailures],
    routingFailureSamples: [...routingFailures],
    bodyQualityScore,
    gradingPassRateDelta,
    gradingFailurePatterns: gradingFailurePatterns.length > 0 ? gradingFailurePatterns : undefined,
  };
}
|
|
1028
|
+
|
|
1029
|
+
// ---------------------------------------------------------------------------
|
|
1030
|
+
// Cleanup
|
|
1031
|
+
// ---------------------------------------------------------------------------
|
|
1032
|
+
|
|
1033
|
+
/**
 * Delete the temporary variants once evaluation has finished.
 *
 * For each result, the directory containing `variantSkillPath` is removed
 * recursively. Removal is best-effort: a failure for one variant is ignored
 * and the remaining variants are still processed.
 *
 * @param results - Variants whose containing directories should be removed.
 */
export function cleanupVariants(results: BoundedMutationResult[]): void {
  for (let index = 0; index < results.length; index += 1) {
    try {
      const variantDir = dirname(results[index].variantSkillPath);
      // `force` makes an already-missing directory a no-op instead of an error.
      rmSync(variantDir, { force: true, recursive: true });
    } catch {
      // best-effort cleanup
    }
  }
}
|