selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +37 -0
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +25 -3
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +466 -103
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +19 -2
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +138 -18
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  51. package/cli/selftune/init.ts +150 -3
  52. package/cli/selftune/memory/writer.ts +447 -0
  53. package/cli/selftune/monitoring/watch.ts +25 -2
  54. package/cli/selftune/status.ts +17 -13
  55. package/cli/selftune/types.ts +377 -5
  56. package/cli/selftune/utils/frontmatter.ts +217 -0
  57. package/cli/selftune/utils/llm-call.ts +29 -3
  58. package/cli/selftune/utils/transcript.ts +35 -0
  59. package/cli/selftune/utils/trigger-check.ts +89 -0
  60. package/cli/selftune/utils/tui.ts +156 -0
  61. package/dashboard/index.html +569 -8
  62. package/package.json +8 -4
  63. package/skill/SKILL.md +124 -8
  64. package/skill/Workflows/AutoActivation.md +144 -0
  65. package/skill/Workflows/Badge.md +118 -0
  66. package/skill/Workflows/Baseline.md +121 -0
  67. package/skill/Workflows/Composability.md +100 -0
  68. package/skill/Workflows/Contribute.md +91 -0
  69. package/skill/Workflows/Cron.md +155 -0
  70. package/skill/Workflows/Dashboard.md +203 -0
  71. package/skill/Workflows/Doctor.md +37 -1
  72. package/skill/Workflows/Evals.md +69 -1
  73. package/skill/Workflows/EvolutionMemory.md +152 -0
  74. package/skill/Workflows/Evolve.md +111 -6
  75. package/skill/Workflows/EvolveBody.md +159 -0
  76. package/skill/Workflows/ImportSkillsBench.md +111 -0
  77. package/skill/Workflows/Ingest.md +117 -3
  78. package/skill/Workflows/Initialize.md +57 -3
  79. package/skill/Workflows/Replay.md +70 -0
  80. package/skill/Workflows/Rollback.md +20 -1
  81. package/skill/Workflows/UnitTest.md +138 -0
  82. package/skill/Workflows/Watch.md +22 -0
  83. package/skill/settings_snippet.json +23 -0
  84. package/templates/activation-rules-default.json +27 -0
  85. package/templates/multi-skill-settings.json +64 -0
  86. package/templates/single-skill-settings.json +58 -0
@@ -6,7 +6,14 @@
6
6
  * similar queries together using Jaccard similarity.
7
7
  */
8
8
 
9
- import type { EvalEntry, FailurePattern, InvocationType, SkillUsageRecord } from "../types.js";
9
+ import type {
10
+ EvalEntry,
11
+ FailureFeedback,
12
+ FailurePattern,
13
+ GradingResult,
14
+ InvocationType,
15
+ SkillUsageRecord,
16
+ } from "../types.js";
10
17
 
11
18
  // ---------------------------------------------------------------------------
12
19
  // Jaccard similarity
@@ -93,6 +100,7 @@ export function extractFailurePatterns(
93
100
  evalEntries: EvalEntry[],
94
101
  skillUsage: SkillUsageRecord[],
95
102
  skillName: string,
103
+ gradingResults?: GradingResult[],
96
104
  ): FailurePattern[] {
97
105
  // 1. Build a set of triggered queries from skillUsage for the given skillName
98
106
  const triggeredQueries = new Set<string>();
@@ -138,6 +146,29 @@ export function extractFailurePatterns(
138
146
  }
139
147
  }
140
148
 
149
+ // 3.5. Attach failure feedback from grading results if available
150
+ if (gradingResults && gradingResults.length > 0) {
151
+ const feedbackMap = new Map<string, FailureFeedback>();
152
+ for (const gr of gradingResults) {
153
+ if (gr.failure_feedback) {
154
+ for (const fb of gr.failure_feedback) {
155
+ feedbackMap.set(fb.query, fb);
156
+ }
157
+ }
158
+ }
159
+
160
+ for (const pattern of allPatterns) {
161
+ const matchingFeedback: FailureFeedback[] = [];
162
+ for (const query of pattern.missed_queries) {
163
+ const fb = feedbackMap.get(query);
164
+ if (fb) matchingFeedback.push(fb);
165
+ }
166
+ if (matchingFeedback.length > 0) {
167
+ pattern.feedback = matchingFeedback;
168
+ }
169
+ }
170
+ }
171
+
141
172
  // 4. Sort by frequency descending
142
173
  allPatterns.sort((a, b) => b.frequency - a.frequency);
143
174
 
@@ -0,0 +1,314 @@
1
+ /**
2
+ * pareto.ts
3
+ *
4
+ * Pareto frontier computation for multi-candidate evolution.
5
+ * All functions are pure — no I/O, no LLM calls.
6
+ */
7
+
8
+ import type {
9
+ InvocationType,
10
+ InvocationTypeScores,
11
+ ParetoCandidate,
12
+ SessionTelemetryRecord,
13
+ TokenUsageMetrics,
14
+ } from "../types.js";
15
+
16
+ // ---------------------------------------------------------------------------
17
+ // Score computation
18
+ // ---------------------------------------------------------------------------
19
+
20
/**
 * Compute per-invocation-type scores from per-entry validation results.
 *
 * Each result is tallied under its entry's `invocation_type`; entries that do
 * not declare one are counted as "implicit". Entries whose type is not one of
 * the four known dimensions are ignored rather than crashing the tally.
 *
 * @param perEntryResults - One item per evaluated entry: the entry (with its
 *   optional invocation type) and whether it passed after the change.
 * @returns For each of "explicit", "implicit", "contextual" and "negative" the
 *   `passed` and `total` counts plus `pass_rate` (0 when `total` is 0).
 */
export function computeInvocationScores(
  perEntryResults: Array<{ entry: { invocation_type?: InvocationType }; after_pass: boolean }>,
): InvocationTypeScores {
  const dims: InvocationType[] = ["explicit", "implicit", "contextual", "negative"];

  // A Map (instead of a plain object) keeps unknown keys such as "constructor"
  // from resolving to inherited Object.prototype members.
  const counts = new Map<InvocationType, { passed: number; total: number }>();
  for (const dim of dims) {
    counts.set(dim, { passed: 0, total: 0 });
  }

  for (const r of perEntryResults) {
    const bucket = counts.get(r.entry.invocation_type ?? "implicit");
    if (bucket === undefined) continue; // unrecognised invocation type
    bucket.total++;
    if (r.after_pass) bucket.passed++;
  }

  const result: Record<string, { passed: number; total: number; pass_rate: number }> = {};
  for (const [dim, { passed, total }] of counts) {
    result[dim] = { passed, total, pass_rate: total > 0 ? passed / total : 0 };
  }

  return result as unknown as InvocationTypeScores;
}
47
+
48
+ // ---------------------------------------------------------------------------
49
+ // Token efficiency scoring
50
+ // ---------------------------------------------------------------------------
51
+
52
/**
 * Restrict `value` to the closed interval [min, max].
 */
function clamp(value: number, min: number, max: number): number {
  // Apply the upper bound first, then the lower bound.
  const cappedAbove = Math.min(max, value);
  return Math.max(min, cappedAbove);
}
58
+
59
/**
 * Sum the token counts of a set of telemetry records.
 *
 * A record without `input_tokens` or `output_tokens` contributes 0 for that
 * field. `total_tokens` is the sum of the two aggregated counts.
 */
export function computeTokenUsageMetrics(records: SessionTelemetryRecord[]): TokenUsageMetrics {
  const sums = records.reduce(
    (acc, record) => {
      acc.input_tokens += record.input_tokens ?? 0;
      acc.output_tokens += record.output_tokens ?? 0;
      return acc;
    },
    { input_tokens: 0, output_tokens: 0 },
  );

  return {
    input_tokens: sums.input_tokens,
    output_tokens: sums.output_tokens,
    total_tokens: sums.input_tokens + sums.output_tokens,
  };
}
75
+
76
/**
 * Compute a token efficiency score for a skill.
 *
 * Compares the average total tokens (input + output) of sessions in which the
 * skill was triggered against sessions in which it was not, and returns
 * `clamp(baseline_avg / with_skill_avg, 0, 1)`:
 * - 1.0 when sessions with the skill use no more tokens than the baseline;
 * - values approaching 0.0 the more tokens the skill sessions use relative
 *   to the baseline.
 *
 * Sessions with no usable token data (total not finite or <= 0) are ignored.
 * A record without a `skills_triggered` list is treated as a baseline session.
 *
 * @param skillName - Skill whose sessions form the "with skill" group.
 * @param telemetry - Session telemetry records to partition and average.
 * @returns A score in [0, 1], or 0.5 (neutral) when either group is empty.
 */
export function computeTokenEfficiencyScore(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
): number {
  const withSkill: number[] = [];
  const withoutSkill: number[] = [];

  for (const record of telemetry) {
    const total = (record.input_tokens ?? 0) + (record.output_tokens ?? 0);
    // NaN or infinite totals would poison the averages; zero means "no data".
    if (!Number.isFinite(total) || total <= 0) continue;

    if (record.skills_triggered?.includes(skillName)) {
      withSkill.push(total);
    } else {
      withoutSkill.push(total);
    }
  }

  if (withSkill.length === 0 || withoutSkill.length === 0) {
    return 0.5; // neutral when insufficient data
  }

  const avgWithSkill = withSkill.reduce((a, b) => a + b, 0) / withSkill.length;
  const avgBaseline = withoutSkill.reduce((a, b) => a + b, 0) / withoutSkill.length;

  // Every collected total is > 0, so avgWithSkill is strictly positive and the
  // division is safe without a zero guard.
  return clamp(avgBaseline / avgWithSkill, 0, 1);
}
116
+
117
+ // ---------------------------------------------------------------------------
118
+ // Pareto dominance
119
+ // ---------------------------------------------------------------------------
120
+
121
/**
 * Invocation-type dimensions compared by the dominance helpers, in a fixed
 * order. Declared readonly so the shared list cannot be mutated by accident.
 */
const DIMS: readonly InvocationType[] = ["explicit", "implicit", "contextual", "negative"];
122
+
123
/**
 * Pareto dominance test: does candidate A dominate candidate B?
 *
 * A dominates B when A's pass rate is at least B's on every invocation-type
 * dimension and strictly higher on at least one of the compared dimensions.
 *
 * Token efficiency is compared as an additional dimension only when a score
 * is supplied for BOTH candidates; otherwise it is ignored.
 */
export function dominates(
  a: InvocationTypeScores,
  b: InvocationTypeScores,
  aTokenEfficiency?: number,
  bTokenEfficiency?: number,
): boolean {
  let sawStrictGain = false;

  // Returns false as soon as A loses a comparison; remembers any strict win.
  const holdsAgainst = (mine: number, theirs: number): boolean => {
    if (mine < theirs) return false;
    if (mine > theirs) sawStrictGain = true;
    return true;
  };

  for (const dim of DIMS) {
    if (!holdsAgainst(a[dim].pass_rate, b[dim].pass_rate)) return false;
  }

  if (aTokenEfficiency !== undefined && bTokenEfficiency !== undefined) {
    if (!holdsAgainst(aTokenEfficiency, bTokenEfficiency)) return false;
  }

  return sawStrictGain;
}
154
+
155
/**
 * List the invocation-type dimensions on which candidate A has a strictly
 * higher pass rate than candidate B, in the order of `DIMS`.
 */
export function getDominatedDimensions(
  a: InvocationTypeScores,
  b: InvocationTypeScores,
): InvocationType[] {
  return DIMS.filter((dim) => a[dim].pass_rate > b[dim].pass_rate);
}
170
+
171
+ // ---------------------------------------------------------------------------
172
+ // Pareto frontier
173
+ // ---------------------------------------------------------------------------
174
+
175
/**
 * Reduce a list of candidates to its Pareto frontier, i.e. the candidates that
 * no other candidate dominates.
 *
 * Dominance is decided by `dominates`, passing each candidate's
 * `token_efficiency_score` along with its invocation scores. Every returned
 * candidate has its `dominates_on` field overwritten with the dimensions on
 * which it strictly beats at least one other frontier member.
 */
export function computeParetoFrontier(candidates: ParetoCandidate[]): ParetoCandidate[] {
  if (candidates.length === 0) return [];

  const beats = (winner: ParetoCandidate, loser: ParetoCandidate): boolean =>
    dominates(
      winner.invocation_scores,
      loser.invocation_scores,
      winner.token_efficiency_score,
      loser.token_efficiency_score,
    );

  let frontier: ParetoCandidate[] = [];
  for (const candidate of candidates) {
    // A candidate beaten by a current frontier member never enters.
    if (frontier.some((member) => beats(member, candidate))) continue;

    // Otherwise it evicts every member it dominates and joins the frontier.
    frontier = frontier.filter((member) => !beats(candidate, member));
    frontier.push(candidate);
  }

  // Record, per member, the dimensions it wins against any other member.
  for (const member of frontier) {
    const wonDims = frontier
      .filter((other) => other !== member)
      .flatMap((other) =>
        getDominatedDimensions(member.invocation_scores, other.invocation_scores),
      );
    member.dominates_on = [...new Set<InvocationType>(wonDims)];
  }

  return frontier;
}
236
+
237
+ // ---------------------------------------------------------------------------
238
+ // Merge prompt
239
+ // ---------------------------------------------------------------------------
240
+
241
/**
 * Build the LLM prompt that asks for a merge of several frontier candidates.
 *
 * @returns `null` when the frontier holds at most one candidate or when no
 *   candidate has any entry in `dominates_on`; otherwise the prompt text
 *   listing the original description and every candidate.
 */
export function buildMergePrompt(
  frontier: ParetoCandidate[],
  originalDescription: string,
): string | null {
  if (frontier.length <= 1) return null;

  // Without at least one candidate that wins somewhere there is nothing to combine.
  if (frontier.every((candidate) => candidate.dominates_on.length === 0)) return null;

  const describeCandidate = (candidate: ParetoCandidate, index: number): string => {
    const strengthLine =
      candidate.dominates_on.length === 0
        ? "No unique strengths"
        : `Strengths: ${candidate.dominates_on.join(", ")}`;
    return [
      `Candidate ${index + 1} (${candidate.proposal.proposal_id}):`,
      `Description: ${candidate.proposal.proposed_description}`,
      strengthLine,
      `Overall pass rate: ${(candidate.validation.after_pass_rate * 100).toFixed(1)}%`,
    ].join("\n");
  };

  const candidateDescriptions = frontier
    .map((candidate, index) => describeCandidate(candidate, index))
    .join("\n\n");

  return [
    "You are merging multiple skill descriptions that each excel on different invocation types.",
    "",
    "Original description:",
    `${originalDescription}`,
    "",
    "Candidates:",
    candidateDescriptions,
    "",
    "Create a single merged description that combines the strengths of all candidates.",
    "Output ONLY valid JSON with:",
    '- "proposed_description": the merged description',
    '- "rationale": explanation of what was combined',
    '- "confidence": 0.0-1.0',
  ].join("\n");
}
279
+
280
+ // ---------------------------------------------------------------------------
281
+ // Selection
282
+ // ---------------------------------------------------------------------------
283
+
284
/**
 * Pick the strongest candidate of a Pareto frontier and decide on merging.
 *
 * Candidates are ranked by `validation.after_pass_rate` (highest first); rates
 * within 0.001 of each other are treated as tied and ranked by the number of
 * `validation.new_passes`. A merge is suggested when the frontier has more
 * than one member and at least one member has a non-empty `dominates_on`.
 *
 * @throws Error when `frontier` is empty.
 */
export function selectFromFrontier(frontier: ParetoCandidate[]): {
  best: ParetoCandidate;
  shouldMerge: boolean;
  mergePrompt: string | null;
} {
  if (frontier.length === 0) {
    throw new Error("Cannot select from empty frontier");
  }

  const byQuality = (left: ParetoCandidate, right: ParetoCandidate): number => {
    const rateGap = right.validation.after_pass_rate - left.validation.after_pass_rate;
    return Math.abs(rateGap) > 0.001
      ? rateGap
      : right.validation.new_passes.length - left.validation.new_passes.length;
  };

  // Sort a copy so the caller's frontier order is left untouched.
  const ranked = [...frontier].sort(byQuality);
  const best = ranked[0];

  const shouldMerge =
    frontier.length > 1 && frontier.some((candidate) => candidate.dominates_on.length > 0);

  let mergePrompt: string | null = null;
  if (shouldMerge) {
    mergePrompt = buildMergePrompt(frontier, best.proposal.original_description);
  }

  return { best, shouldMerge, mergePrompt };
}
@@ -0,0 +1,171 @@
1
+ /**
2
+ * propose-body.ts
3
+ *
4
+ * Generates full body proposals for SKILL.md files using a teacher LLM.
5
+ * The teacher analyzes current content, failure patterns, and missed queries
6
+ * to produce an improved skill body.
7
+ */
8
+
9
+ import type { BodyEvolutionProposal, EvolutionTarget, FailurePattern } from "../types.js";
10
+ import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // System prompt
14
+ // ---------------------------------------------------------------------------
15
+
16
/**
 * System prompt for the body generator (teacher) LLM.
 *
 * Tells the model to produce an improved SKILL.md body (everything after the
 * title line) while preserving the document structure, and to reply with a
 * JSON object containing exactly "proposed_body", "rationale" and
 * "confidence".
 */
export const BODY_GENERATOR_SYSTEM = `You are an expert skill document author for an AI agent routing system.

Your task is to generate an improved SKILL.md body that better covers the semantic
space of queries that the skill should handle. The body includes everything after
the title line: the description, workflow routing table, instructions, examples, etc.

Rules:
- Preserve the overall structure: description paragraph, ## Workflow Routing table, and other ## sections.
- The ## Workflow Routing table must be a valid markdown table with | Trigger | Workflow | columns.
- Cover the semantic space of the missed queries without being too broad.
- Maintain the original intent and scope of the skill.
- Be specific and actionable in instructions.
- Output ONLY valid JSON with exactly these fields:
- "proposed_body" (string): the complete improved skill body (markdown, everything below the title)
- "rationale" (string): explanation of what changed and why
- "confidence" (number): 0.0-1.0 how confident you are this improves the skill

Do NOT include any text outside the JSON object.`;
35
+
36
+ // ---------------------------------------------------------------------------
37
+ // Prompt builder
38
+ // ---------------------------------------------------------------------------
39
+
40
/**
 * Build the user prompt for full body generation.
 *
 * The prompt contains the skill name, the current skill content, one section
 * per failure pattern with its missed queries, and the flat list of all missed
 * queries. Two optional sections are appended when data is present:
 * - "Structured Failure Analysis": one entry per feedback item attached to a
 *   pattern (query, failure reason, improvement hint and, when set, the
 *   invocation type, matching the description-proposal prompt);
 * - "Reference Examples": the supplied few-shot examples, numbered from 1.
 *
 * @param currentContent - Current skill content shown to the model.
 * @param failurePatterns - Patterns to list; their `feedback` is optional.
 * @param missedQueries - All missed queries, listed verbatim.
 * @param skillName - Name of the skill being improved.
 * @param fewShotExamples - Optional example skills to include as references.
 * @returns The complete prompt text.
 */
export function buildBodyGenerationPrompt(
  currentContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  fewShotExamples?: string[],
): string {
  const patternLines = failurePatterns.map((p) => {
    const queries = p.missed_queries.map((q) => ` - "${q}"`).join("\n");
    return ` Pattern ${p.pattern_id} (frequency: ${p.frequency}, type: ${p.invocation_type}):\n${queries}`;
  });

  const missedLines = missedQueries.map((q) => ` - "${q}"`).join("\n");

  // Build failure feedback section if any patterns have feedback
  const feedbackLines: string[] = [];
  for (const p of failurePatterns) {
    for (const fb of p.feedback ?? []) {
      feedbackLines.push(` Query: "${fb.query}"`);
      feedbackLines.push(` Failure reason: ${fb.failure_reason}`);
      feedbackLines.push(` Improvement hint: ${fb.improvement_hint}`);
      // Keep parity with the description prompt, which also reports the type.
      if (fb.invocation_type) {
        feedbackLines.push(` Invocation type: ${fb.invocation_type}`);
      }
    }
  }
  const feedbackSection =
    feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";

  // Build few-shot examples section if provided
  const fewShotSection =
    fewShotExamples && fewShotExamples.length > 0
      ? `\n\nReference Examples (other well-written skills):\n${fewShotExamples.map((ex, i) => `--- Example ${i + 1} ---\n${ex}`).join("\n\n")}`
      : "";

  return `Skill Name: ${skillName}

Current Skill Content:
${currentContent}

Failure Patterns:
${patternLines.join("\n\n")}

All Missed Queries:
${missedLines}${feedbackSection}${fewShotSection}

Generate an improved full body for the "${skillName}" skill that would correctly handle the missed queries listed above. The body should include everything below the # Title line: description, ## Workflow Routing table, and any other sections. Output ONLY a JSON object with "proposed_body", "rationale", and "confidence" fields.`;
}
88
+
89
+ // ---------------------------------------------------------------------------
90
+ // Response parser
91
+ // ---------------------------------------------------------------------------
92
+
93
/**
 * Turn the raw LLM reply into validated body-proposal fields.
 *
 * The text is passed through `stripMarkdownFences`, parsed as JSON, and checked
 * for a string `proposed_body`, a string `rationale` and a numeric
 * `confidence`, which is bounded to the range [0, 1].
 *
 * @throws Error when the text is not valid JSON, is not a JSON object, or a
 *   required field is missing or has the wrong type.
 */
export function parseBodyProposalResponse(raw: string): {
  proposed_body: string;
  rationale: string;
  confidence: number;
} {
  const jsonText = stripMarkdownFences(raw);

  let decoded: unknown;
  try {
    decoded = JSON.parse(jsonText);
  } catch {
    // Include only the head of the payload to keep the message short.
    throw new Error(`Failed to parse LLM response as JSON: ${jsonText.slice(0, 200)}`);
  }

  if (decoded === null || typeof decoded !== "object") {
    throw new Error("LLM response is not a JSON object");
  }

  const { proposed_body, rationale, confidence } = decoded as Record<string, unknown>;

  if (typeof proposed_body !== "string") {
    throw new Error("Missing or invalid 'proposed_body' field in LLM response");
  }
  if (typeof rationale !== "string") {
    throw new Error("Missing or invalid 'rationale' field in LLM response");
  }
  if (typeof confidence !== "number") {
    throw new Error("Missing or invalid 'confidence' field in LLM response");
  }

  const cappedAbove = Math.min(1.0, confidence);
  const boundedConfidence = Math.max(0.0, cappedAbove);

  return { proposed_body, rationale, confidence: boundedConfidence };
}
132
+
133
+ // ---------------------------------------------------------------------------
134
+ // Proposal generator
135
+ // ---------------------------------------------------------------------------
136
+
137
/**
 * Ask the teacher LLM for a full body rewrite and wrap it as a proposal.
 *
 * Builds the user prompt with `buildBodyGenerationPrompt`, sends it with
 * `BODY_GENERATOR_SYSTEM` through `callLlm`, parses the reply with
 * `parseBodyProposalResponse`, and returns a "pending" proposal targeting
 * "body" whose id embeds the skill name and the current timestamp.
 */
export async function generateBodyProposal(
  currentContent: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  skillPath: string,
  agent: string,
  modelFlag?: string,
  fewShotExamples?: string[],
): Promise<BodyEvolutionProposal> {
  const userPrompt = buildBodyGenerationPrompt(
    currentContent,
    failurePatterns,
    missedQueries,
    skillName,
    fewShotExamples,
  );
  const llmOutput = await callLlm(BODY_GENERATOR_SYSTEM, userPrompt, agent, modelFlag);
  const parsedReply = parseBodyProposalResponse(llmOutput);

  const proposalId = `evo-body-${skillName}-${Date.now()}`;
  const patternIds = failurePatterns.map((pattern) => pattern.pattern_id);
  const createdAt = new Date().toISOString();

  return {
    proposal_id: proposalId,
    skill_name: skillName,
    skill_path: skillPath,
    original_body: currentContent,
    proposed_body: parsedReply.proposed_body,
    rationale: parsedReply.rationale,
    target: "body" as EvolutionTarget,
    failure_patterns: patternIds,
    confidence: parsedReply.confidence,
    created_at: createdAt,
    status: "pending",
  };
}
@@ -50,6 +50,23 @@ export function buildProposalPrompt(
50
50
 
51
51
  const missedLines = missedQueries.map((q) => ` - "${q}"`).join("\n");
52
52
 
53
+ // Build failure feedback section if any patterns have feedback
54
+ const feedbackLines: string[] = [];
55
+ for (const p of failurePatterns) {
56
+ if (p.feedback && p.feedback.length > 0) {
57
+ for (const fb of p.feedback) {
58
+ feedbackLines.push(` Query: "${fb.query}"`);
59
+ feedbackLines.push(` Failure reason: ${fb.failure_reason}`);
60
+ feedbackLines.push(` Improvement hint: ${fb.improvement_hint}`);
61
+ if (fb.invocation_type) {
62
+ feedbackLines.push(` Invocation type: ${fb.invocation_type}`);
63
+ }
64
+ }
65
+ }
66
+ }
67
+ const feedbackSection =
68
+ feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";
69
+
53
70
  return `Skill Name: ${skillName}
54
71
 
55
72
  Current Description:
@@ -59,7 +76,7 @@ Failure Patterns:
59
76
  ${patternLines.join("\n\n")}
60
77
 
61
78
  All Missed Queries:
62
- ${missedLines}
79
+ ${missedLines}${feedbackSection}
63
80
 
64
81
  Propose an improved description for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_description", "rationale", and "confidence" fields.`;
65
82
  }
@@ -113,6 +130,86 @@ export function parseProposalResponse(raw: string): {
113
130
  // Proposal generator
114
131
  // ---------------------------------------------------------------------------
115
132
 
133
/**
 * Request several description proposals concurrently, one per prompt variation
 * produced by `buildPromptVariations`.
 *
 * Each variation is sent to the LLM with `PROPOSER_SYSTEM`; the parsed replies
 * are returned as "pending" proposals with zeroed `eval_results`. Because the
 * calls are combined with `Promise.all`, a single failing call or unparsable
 * reply rejects the whole batch.
 */
export async function generateMultipleProposals(
  currentDescription: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  skillPath: string,
  agent: string,
  count = 3,
  modelFlag?: string,
): Promise<EvolutionProposal[]> {
  const prompts = buildPromptVariations(
    currentDescription,
    failurePatterns,
    missedQueries,
    skillName,
    count,
  );

  const proposeVariant = async (prompt: string, index: number) => {
    const llmOutput = await callLlm(PROPOSER_SYSTEM, prompt, agent, modelFlag);
    const parsedReply = parseProposalResponse(llmOutput);

    return {
      // The index suffix keeps ids distinct when calls finish in the same millisecond.
      proposal_id: `evo-${skillName}-${Date.now()}-${index}`,
      skill_name: skillName,
      skill_path: skillPath,
      original_description: currentDescription,
      proposed_description: parsedReply.proposed_description,
      rationale: parsedReply.rationale,
      failure_patterns: failurePatterns.map((pattern) => pattern.pattern_id),
      eval_results: {
        before: { total: 0, passed: 0, failed: 0, pass_rate: 0 },
        after: { total: 0, passed: 0, failed: 0, pass_rate: 0 },
      },
      confidence: parsedReply.confidence,
      created_at: new Date().toISOString(),
      status: "pending" as const,
    };
  };

  const proposals = await Promise.all(prompts.map((prompt, index) => proposeVariant(prompt, index)));
  return proposals;
}
180
+
181
/**
 * Produce `count` prompts that share the base proposal prompt and differ only
 * in an appended "Additional focus" hint.
 *
 * The hints cover explicit, implicit and contextual invocation and are reused
 * cyclically when `count` exceeds the number of hints.
 */
export function buildPromptVariations(
  currentDescription: string,
  failurePatterns: FailurePattern[],
  missedQueries: string[],
  skillName: string,
  count: number,
): string[] {
  const focusHints: string[] = [
    "Focus especially on improving explicit invocation (direct mentions of the skill).",
    "Focus especially on improving implicit invocation (indirect references to skill capabilities).",
    "Focus especially on improving contextual invocation (where the context implies the skill is needed).",
  ];

  const sharedPrompt = buildProposalPrompt(
    currentDescription,
    failurePatterns,
    missedQueries,
    skillName,
  );

  const prompts: string[] = [];
  let index = 0;
  while (index < count) {
    const hint = focusHints[index % focusHints.length];
    prompts.push(`${sharedPrompt}\n\nAdditional focus: ${hint}`);
    index += 1;
  }

  return prompts;
}
212
+
116
213
  /** Generate a complete evolution proposal using LLM. */
117
214
  export async function generateProposal(
118
215
  currentDescription: string,
@@ -121,9 +218,10 @@ export async function generateProposal(
121
218
  skillName: string,
122
219
  skillPath: string,
123
220
  agent: string,
221
+ modelFlag?: string,
124
222
  ): Promise<EvolutionProposal> {
125
223
  const prompt = buildProposalPrompt(currentDescription, failurePatterns, missedQueries, skillName);
126
- const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt, agent);
224
+ const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt, agent, modelFlag);
127
225
  const { proposed_description, rationale, confidence } = parseProposalResponse(rawResponse);
128
226
 
129
227
  return {