npm - selftune - Versions diffs - 0.1.4 → 0.2.0 - Mend

selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

package/.claude/agents/diagnosis-analyst.md +146 -0
package/.claude/agents/evolution-reviewer.md +167 -0
package/.claude/agents/integration-guide.md +200 -0
package/.claude/agents/pattern-analyst.md +147 -0
package/CHANGELOG.md +37 -0
package/README.md +96 -256
package/assets/BeforeAfter.gif +0 -0
package/assets/FeedbackLoop.gif +0 -0
package/assets/logo.svg +9 -0
package/assets/skill-health-badge.svg +20 -0
package/cli/selftune/activation-rules.ts +171 -0
package/cli/selftune/badge/badge-data.ts +108 -0
package/cli/selftune/badge/badge-svg.ts +212 -0
package/cli/selftune/badge/badge.ts +103 -0
package/cli/selftune/constants.ts +75 -1
package/cli/selftune/contribute/bundle.ts +314 -0
package/cli/selftune/contribute/contribute.ts +214 -0
package/cli/selftune/contribute/sanitize.ts +162 -0
package/cli/selftune/cron/setup.ts +266 -0
package/cli/selftune/dashboard-server.ts +582 -0
package/cli/selftune/dashboard.ts +25 -3
package/cli/selftune/eval/baseline.ts +247 -0
package/cli/selftune/eval/composability.ts +117 -0
package/cli/selftune/eval/generate-unit-tests.ts +143 -0
package/cli/selftune/eval/hooks-to-evals.ts +68 -2
package/cli/selftune/eval/import-skillsbench.ts +221 -0
package/cli/selftune/eval/synthetic-evals.ts +172 -0
package/cli/selftune/eval/unit-test-cli.ts +152 -0
package/cli/selftune/eval/unit-test.ts +196 -0
package/cli/selftune/evolution/deploy-proposal.ts +142 -1
package/cli/selftune/evolution/evolve-body.ts +492 -0
package/cli/selftune/evolution/evolve.ts +466 -103
package/cli/selftune/evolution/extract-patterns.ts +32 -1
package/cli/selftune/evolution/pareto.ts +314 -0
package/cli/selftune/evolution/propose-body.ts +171 -0
package/cli/selftune/evolution/propose-description.ts +100 -2
package/cli/selftune/evolution/propose-routing.ts +166 -0
package/cli/selftune/evolution/refine-body.ts +141 -0
package/cli/selftune/evolution/rollback.ts +19 -2
package/cli/selftune/evolution/validate-body.ts +254 -0
package/cli/selftune/evolution/validate-proposal.ts +257 -35
package/cli/selftune/evolution/validate-routing.ts +177 -0
package/cli/selftune/grading/grade-session.ts +138 -18
package/cli/selftune/grading/pre-gates.ts +104 -0
package/cli/selftune/hooks/auto-activate.ts +185 -0
package/cli/selftune/hooks/evolution-guard.ts +165 -0
package/cli/selftune/hooks/skill-change-guard.ts +112 -0
package/cli/selftune/index.ts +88 -0
package/cli/selftune/ingestors/claude-replay.ts +351 -0
package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
package/cli/selftune/init.ts +150 -3
package/cli/selftune/memory/writer.ts +447 -0
package/cli/selftune/monitoring/watch.ts +25 -2
package/cli/selftune/status.ts +17 -13
package/cli/selftune/types.ts +377 -5
package/cli/selftune/utils/frontmatter.ts +217 -0
package/cli/selftune/utils/llm-call.ts +29 -3
package/cli/selftune/utils/transcript.ts +35 -0
package/cli/selftune/utils/trigger-check.ts +89 -0
package/cli/selftune/utils/tui.ts +156 -0
package/dashboard/index.html +569 -8
package/package.json +8 -4
package/skill/SKILL.md +124 -8
package/skill/Workflows/AutoActivation.md +144 -0
package/skill/Workflows/Badge.md +118 -0
package/skill/Workflows/Baseline.md +121 -0
package/skill/Workflows/Composability.md +100 -0
package/skill/Workflows/Contribute.md +91 -0
package/skill/Workflows/Cron.md +155 -0
package/skill/Workflows/Dashboard.md +203 -0
package/skill/Workflows/Doctor.md +37 -1
package/skill/Workflows/Evals.md +69 -1
package/skill/Workflows/EvolutionMemory.md +152 -0
package/skill/Workflows/Evolve.md +111 -6
package/skill/Workflows/EvolveBody.md +159 -0
package/skill/Workflows/ImportSkillsBench.md +111 -0
package/skill/Workflows/Ingest.md +117 -3
package/skill/Workflows/Initialize.md +57 -3
package/skill/Workflows/Replay.md +70 -0
package/skill/Workflows/Rollback.md +20 -1
package/skill/Workflows/UnitTest.md +138 -0
package/skill/Workflows/Watch.md +22 -0
package/skill/settings_snippet.json +23 -0
package/templates/activation-rules-default.json +27 -0
package/templates/multi-skill-settings.json +64 -0
package/templates/single-skill-settings.json +58 -0

package/cli/selftune/evolution/extract-patterns.ts CHANGED Viewed

@@ -6,7 +6,14 @@
  * similar queries together using Jaccard similarity.
  */
-import type { EvalEntry, FailurePattern, InvocationType, SkillUsageRecord } from "../types.js";
+import type {
+  EvalEntry,
+  FailureFeedback,
+  FailurePattern,
+  GradingResult,
+  InvocationType,
+  SkillUsageRecord,
+} from "../types.js";
 // ---------------------------------------------------------------------------
 // Jaccard similarity
@@ -93,6 +100,7 @@ export function extractFailurePatterns(
   evalEntries: EvalEntry[],
   skillUsage: SkillUsageRecord[],
   skillName: string,
+  gradingResults?: GradingResult[],
 ): FailurePattern[] {
   // 1. Build a set of triggered queries from skillUsage for the given skillName
   const triggeredQueries = new Set<string>();
@@ -138,6 +146,29 @@ export function extractFailurePatterns(
     }
   }
+  // 3.5. Attach failure feedback from grading results if available
+  if (gradingResults && gradingResults.length > 0) {
+    const feedbackMap = new Map<string, FailureFeedback>();
+    for (const gr of gradingResults) {
+      if (gr.failure_feedback) {
+        for (const fb of gr.failure_feedback) {
+          feedbackMap.set(fb.query, fb);
+        }
+      }
+    }
+    for (const pattern of allPatterns) {
+      const matchingFeedback: FailureFeedback[] = [];
+      for (const query of pattern.missed_queries) {
+        const fb = feedbackMap.get(query);
+        if (fb) matchingFeedback.push(fb);
+      }
+      if (matchingFeedback.length > 0) {
+        pattern.feedback = matchingFeedback;
+      }
+    }
+  }
   // 4. Sort by frequency descending
   allPatterns.sort((a, b) => b.frequency - a.frequency);

package/cli/selftune/evolution/pareto.ts ADDED Viewed

@@ -0,0 +1,314 @@
+/**
+ * pareto.ts
+ *
+ * Pareto frontier computation for multi-candidate evolution.
+ * All functions are pure — no I/O, no LLM calls.
+ */
+import type {
+  InvocationType,
+  InvocationTypeScores,
+  ParetoCandidate,
+  SessionTelemetryRecord,
+  TokenUsageMetrics,
+} from "../types.js";
+// ---------------------------------------------------------------------------
+// Score computation
+// ---------------------------------------------------------------------------
+/**
+ * Tally validation outcomes per invocation type and derive a pass rate for each.
+ *
+ * Entries without an `invocation_type` are counted as "implicit". All four known
+ * types are always present in the result; a type with no entries gets a
+ * `pass_rate` of 0.
+ *
+ * @param perEntryResults - per-entry validation results; only
+ *   `entry.invocation_type` and `after_pass` are read
+ * @returns passed / total / pass_rate for explicit, implicit, contextual and negative
+ */
+export function computeInvocationScores(
+  perEntryResults: Array<{ entry: { invocation_type?: InvocationType }; after_pass: boolean }>,
+): InvocationTypeScores {
+  const knownTypes: InvocationType[] = ["explicit", "implicit", "contextual", "negative"];
+  const tally: Record<string, { passed: number; total: number }> = {};
+  knownTypes.forEach((kind) => {
+    tally[kind] = { passed: 0, total: 0 };
+  });
+  for (const { entry, after_pass } of perEntryResults) {
+    // NOTE: a type outside the four known values has no bucket and throws here.
+    const bucket = tally[entry.invocation_type ?? "implicit"];
+    bucket.total += 1;
+    if (after_pass) {
+      bucket.passed += 1;
+    }
+  }
+  const scores: Record<string, { passed: number; total: number; pass_rate: number }> = {};
+  knownTypes.forEach((kind) => {
+    const { passed, total } = tally[kind];
+    const pass_rate = total > 0 ? passed / total : 0;
+    scores[kind] = { passed, total, pass_rate };
+  });
+  return scores as unknown as InvocationTypeScores;
+}
+// ---------------------------------------------------------------------------
+// Token efficiency scoring
+// ---------------------------------------------------------------------------
+/**
+ * Restrict `value` to the closed interval [min, max].
+ */
+function clamp(value: number, min: number, max: number): number {
+  const cappedAbove = Math.min(max, value);
+  return Math.max(min, cappedAbove);
+}
+/**
+ * Sum input and output token counts across a set of telemetry records.
+ *
+ * A record that lacks a token field contributes 0 for that field.
+ *
+ * @param records - telemetry records to aggregate
+ * @returns summed `input_tokens`, `output_tokens`, and their combined `total_tokens`
+ */
+export function computeTokenUsageMetrics(records: SessionTelemetryRecord[]): TokenUsageMetrics {
+  const input_tokens = records.reduce((sum, rec) => sum + (rec.input_tokens ?? 0), 0);
+  const output_tokens = records.reduce((sum, rec) => sum + (rec.output_tokens ?? 0), 0);
+  return {
+    input_tokens,
+    output_tokens,
+    total_tokens: input_tokens + output_tokens,
+  };
+}
+/**
+ * Compute a token efficiency score for a skill.
+ *
+ * Compares the average total tokens (input + output) of sessions WITH the skill
+ * triggered against sessions WITHOUT it and returns
+ * `clamp(baseline_avg / with_skill_avg, 0, 1)`:
+ * - 1.0 means sessions with the skill use no more tokens than the baseline.
+ * - Values below 1.0 mean sessions with the skill use more tokens; the score
+ *   approaches 0.0 as that overhead grows.
+ *
+ * Sessions whose total token count is zero or negative are ignored.
+ *
+ * @param skillName - skill looked up in each record's `skills_triggered`
+ * @param telemetry - session telemetry records to compare
+ * @returns a score in [0, 1]; 0.5 (neutral) when either group has no usable sessions
+ */
+export function computeTokenEfficiencyScore(
+  skillName: string,
+  telemetry: SessionTelemetryRecord[],
+): number {
+  const withSkill: number[] = [];
+  const withoutSkill: number[] = [];
+  for (const record of telemetry) {
+    const total = (record.input_tokens ?? 0) + (record.output_tokens ?? 0);
+    if (total <= 0) continue; // skip sessions with a non-positive token total
+    if (record.skills_triggered.includes(skillName)) {
+      withSkill.push(total);
+    } else {
+      withoutSkill.push(total);
+    }
+  }
+  if (withSkill.length === 0 || withoutSkill.length === 0) {
+    return 0.5; // neutral when insufficient data
+  }
+  const average = (values: number[]): number =>
+    values.reduce((sum, v) => sum + v, 0) / values.length;
+  // Totals <= 0 were skipped above, so the with-skill average can never be 0;
+  // the former `avgWithSkill === 0` special case was unreachable and is gone.
+  return clamp(average(withoutSkill) / average(withSkill), 0, 1);
+}
+// ---------------------------------------------------------------------------
+// Pareto dominance
+// ---------------------------------------------------------------------------
+const DIMS: InvocationType[] = ["explicit", "implicit", "contextual", "negative"];
+/**
+ * Pareto dominance test: A dominates B when A is at least as good as B on every
+ * compared dimension and strictly better on at least one.
+ *
+ * The four invocation-type pass rates are always compared. Token efficiency
+ * joins as a 5th dimension only when a score is supplied for BOTH candidates.
+ *
+ * @param a - invocation scores of candidate A
+ * @param b - invocation scores of candidate B
+ * @param aTokenEfficiency - optional token efficiency score of A
+ * @param bTokenEfficiency - optional token efficiency score of B
+ * @returns true if A dominates B; false otherwise, including when they tie everywhere
+ */
+export function dominates(
+  a: InvocationTypeScores,
+  b: InvocationTypeScores,
+  aTokenEfficiency?: number,
+  bTokenEfficiency?: number,
+): boolean {
+  // -1 when x is worse than y, 1 when it is better, 0 otherwise.
+  const compare = (x: number, y: number): number => (x < y ? -1 : x > y ? 1 : 0);
+  let hasStrictGain = false;
+  for (const dim of DIMS) {
+    const outcome = compare(a[dim].pass_rate, b[dim].pass_rate);
+    if (outcome < 0) return false; // being worse anywhere rules out dominance
+    if (outcome > 0) hasStrictGain = true;
+  }
+  // Token efficiency only counts when both candidates carry a score.
+  if (aTokenEfficiency !== undefined && bTokenEfficiency !== undefined) {
+    const outcome = compare(aTokenEfficiency, bTokenEfficiency);
+    if (outcome < 0) return false;
+    if (outcome > 0) hasStrictGain = true;
+  }
+  return hasStrictGain;
+}
+/**
+ * List the invocation types on which candidate A's pass rate is strictly
+ * higher than candidate B's. Ties are not included.
+ *
+ * @param a - invocation scores of candidate A
+ * @param b - invocation scores of candidate B
+ * @returns the invocation types where A beats B, in `DIMS` order
+ */
+export function getDominatedDimensions(
+  a: InvocationTypeScores,
+  b: InvocationTypeScores,
+): InvocationType[] {
+  return DIMS.filter((dim) => a[dim].pass_rate > b[dim].pass_rate);
+}
+// ---------------------------------------------------------------------------
+// Pareto frontier
+// ---------------------------------------------------------------------------
+/**
+ * Filter candidates to the Pareto frontier (non-dominated set).
+ * Also sets `dominates_on` for each frontier member.
+ *
+ * The frontier is built incrementally in input order: a candidate is skipped
+ * when a current frontier member dominates it; otherwise every member it
+ * dominates is removed and the candidate is appended.
+ *
+ * When candidates have `token_efficiency_score` set, the 5th dimension
+ * is used in dominance checks. `dominates_on` is derived from the invocation
+ * pass rates only: it is the union of the invocation types on which the member
+ * is strictly better than at least one other frontier member.
+ *
+ * The returned array holds the same candidate objects that were passed in;
+ * `dominates_on` is assigned on them in place. Candidates that do not end up
+ * on the frontier are not modified.
+ *
+ * @param candidates - candidates to filter
+ * @returns the non-dominated candidates (empty array for empty input)
+ */
+export function computeParetoFrontier(candidates: ParetoCandidate[]): ParetoCandidate[] {
+  if (candidates.length === 0) return [];
+  const frontier: ParetoCandidate[] = [];
+  for (const candidate of candidates) {
+    // Check if any existing frontier member dominates this candidate
+    let isDominated = false;
+    for (const member of frontier) {
+      if (
+        dominates(
+          member.invocation_scores,
+          candidate.invocation_scores,
+          member.token_efficiency_score,
+          candidate.token_efficiency_score,
+        )
+      ) {
+        isDominated = true;
+        break;
+      }
+    }
+    if (!isDominated) {
+      // Remove frontier members that this candidate dominates
+      for (let i = frontier.length - 1; i >= 0; i--) { // backwards, so splice never shifts an unvisited index
+        if (
+          dominates(
+            candidate.invocation_scores,
+            frontier[i].invocation_scores,
+            candidate.token_efficiency_score,
+            frontier[i].token_efficiency_score,
+          )
+        ) {
+          frontier.splice(i, 1);
+        }
+      }
+      frontier.push(candidate);
+    }
+  }
+  // Set dominates_on for each frontier member (compared to others in frontier)
+  for (const member of frontier) {
+    const allDominatedDims = new Set<InvocationType>();
+    for (const other of frontier) {
+      if (other === member) continue;
+      for (const dim of getDominatedDimensions(member.invocation_scores, other.invocation_scores)) {
+        allDominatedDims.add(dim);
+      }
+    }
+    member.dominates_on = [...allDominatedDims]; // overwrites any previous value on the candidate
+  }
+  return frontier;
+}
+// ---------------------------------------------------------------------------
+// Merge prompt
+// ---------------------------------------------------------------------------
+/**
+ * Build a merge prompt for complementary frontier candidates.
+ * Returns null if <= 1 candidate or no complementarity detected.
+ *
+ * Each candidate is rendered with a 1-based index, its proposal id, its
+ * proposed description, its strengths (`dominates_on`, or "No unique
+ * strengths" when that list is empty) and its overall pass rate as a
+ * percentage with one decimal place. The prompt asks the model for a JSON
+ * object with "proposed_description", "rationale" and "confidence".
+ *
+ * @param frontier - frontier candidates; `dominates_on` must already be set
+ * @param originalDescription - the description the candidates were derived from
+ * @returns the prompt text, or null when no merge is warranted
+ */
+export function buildMergePrompt(
+  frontier: ParetoCandidate[],
+  originalDescription: string,
+): string | null {
+  if (frontier.length <= 1) return null;
+  // Complementarity check: at least one candidate must have a non-empty dominates_on
+  const hasComplementarity = frontier.some((c) => c.dominates_on.length > 0);
+  if (!hasComplementarity) return null;
+  const candidateDescriptions = frontier
+    .map((c, i) => {
+      const strengths =
+        c.dominates_on.length > 0
+          ? `Strengths: ${c.dominates_on.join(", ")}`
+          : "No unique strengths";
+      return `Candidate ${i + 1} (${c.proposal.proposal_id}):\nDescription: ${c.proposal.proposed_description}\n${strengths}\nOverall pass rate: ${(c.validation.after_pass_rate * 100).toFixed(1)}%`;
+    })
+    .join("\n\n");
+  return `You are merging multiple skill descriptions that each excel on different invocation types.
+Original description:
+${originalDescription}
+Candidates:
+${candidateDescriptions}
+Create a single merged description that combines the strengths of all candidates.
+Output ONLY valid JSON with:
+- "proposed_description": the merged description
+- "rationale": explanation of what was combined
+- "confidence": 0.0-1.0`;
+}
+// ---------------------------------------------------------------------------
+// Selection
+// ---------------------------------------------------------------------------
+/**
+ * Pick the single best candidate from a Pareto frontier and decide whether a
+ * merge of the frontier should be attempted.
+ *
+ * Candidates are ranked by `validation.after_pass_rate`, highest first. Rates
+ * that differ by no more than 0.001 count as a tie, which is broken by the
+ * larger number of `validation.new_passes`. A merge is suggested when the
+ * frontier has more than one member and at least one has a non-empty
+ * `dominates_on`.
+ *
+ * @param frontier - non-empty Pareto frontier (not reordered by this call)
+ * @returns the best candidate, the merge decision, and the merge prompt (or null)
+ * @throws Error when `frontier` is empty
+ */
+export function selectFromFrontier(frontier: ParetoCandidate[]): {
+  best: ParetoCandidate;
+  shouldMerge: boolean;
+  mergePrompt: string | null;
+} {
+  if (frontier.length === 0) {
+    throw new Error("Cannot select from empty frontier");
+  }
+  // Rank a copy so the caller's array keeps its order.
+  const ranked = frontier.slice().sort((left, right) => {
+    const rateGap = right.validation.after_pass_rate - left.validation.after_pass_rate;
+    return Math.abs(rateGap) > 0.001
+      ? rateGap
+      : right.validation.new_passes.length - left.validation.new_passes.length;
+  });
+  const [best] = ranked;
+  const shouldMerge =
+    frontier.length > 1 && frontier.some((member) => member.dominates_on.length > 0);
+  let mergePrompt: string | null = null;
+  if (shouldMerge) {
+    mergePrompt = buildMergePrompt(frontier, best.proposal.original_description);
+  }
+  return { best, shouldMerge, mergePrompt };
+}

package/cli/selftune/evolution/propose-body.ts ADDED Viewed

@@ -0,0 +1,171 @@
+/**
+ * propose-body.ts
+ *
+ * Generates full body proposals for SKILL.md files using a teacher LLM.
+ * The teacher analyzes current content, failure patterns, and missed queries
+ * to produce an improved skill body.
+ */
+import type { BodyEvolutionProposal, EvolutionTarget, FailurePattern } from "../types.js";
+import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
+// ---------------------------------------------------------------------------
+// System prompt
+// ---------------------------------------------------------------------------
+/**
+ * System prompt for the body generator (teacher) LLM.
+ *
+ * Tells the model to keep the skill's structure (description paragraph,
+ * `## Workflow Routing` table, other `##` sections) and to answer with a
+ * single JSON object holding "proposed_body", "rationale" and "confidence" —
+ * the three fields `parseBodyProposalResponse` validates.
+ */
+export const BODY_GENERATOR_SYSTEM = `You are an expert skill document author for an AI agent routing system.
+Your task is to generate an improved SKILL.md body that better covers the semantic
+space of queries that the skill should handle. The body includes everything after
+the title line: the description, workflow routing table, instructions, examples, etc.
+Rules:
+- Preserve the overall structure: description paragraph, ## Workflow Routing table, and other ## sections.
+- The ## Workflow Routing table must be a valid markdown table with | Trigger | Workflow | columns.
+- Cover the semantic space of the missed queries without being too broad.
+- Maintain the original intent and scope of the skill.
+- Be specific and actionable in instructions.
+- Output ONLY valid JSON with exactly these fields:
+  - "proposed_body" (string): the complete improved skill body (markdown, everything below the title)
+  - "rationale" (string): explanation of what changed and why
+  - "confidence" (number): 0.0-1.0 how confident you are this improves the skill
+Do NOT include any text outside the JSON object.`;
+// ---------------------------------------------------------------------------
+// Prompt builder
+// ---------------------------------------------------------------------------
+/**
+ * Build the user prompt for full body generation.
+ *
+ * The prompt lists the current skill content, each failure pattern with its
+ * missed queries, and the flat list of all missed queries. When available it
+ * also appends a "Structured Failure Analysis" section built from the patterns'
+ * feedback and a "Reference Examples" section built from `fewShotExamples`.
+ *
+ * @param currentContent - current skill content shown to the model
+ * @param failurePatterns - failure patterns; their `feedback` entries, if any, are rendered
+ * @param missedQueries - all missed queries, rendered one per line
+ * @param skillName - skill name interpolated into the prompt
+ * @param fewShotExamples - optional example skill documents included as references
+ * @returns the complete user prompt string
+ */
+export function buildBodyGenerationPrompt(
+  currentContent: string,
+  failurePatterns: FailurePattern[],
+  missedQueries: string[],
+  skillName: string,
+  fewShotExamples?: string[],
+): string {
+  const patternLines = failurePatterns.map((p) => {
+    const queries = p.missed_queries.map((q) => `    - "${q}"`).join("\n");
+    return `  Pattern ${p.pattern_id} (frequency: ${p.frequency}, type: ${p.invocation_type}):\n${queries}`;
+  });
+  const missedLines = missedQueries.map((q) => `  - "${q}"`).join("\n");
+  // Build failure feedback section if any patterns have feedback
+  const feedbackLines: string[] = [];
+  for (const p of failurePatterns) {
+    for (const fb of p.feedback ?? []) {
+      feedbackLines.push(`  Query: "${fb.query}"`);
+      feedbackLines.push(`    Failure reason: ${fb.failure_reason}`);
+      feedbackLines.push(`    Improvement hint: ${fb.improvement_hint}`);
+      // Same line buildProposalPrompt (propose-description.ts) emits, so both
+      // proposers hand the model identical failure context.
+      if (fb.invocation_type) {
+        feedbackLines.push(`    Invocation type: ${fb.invocation_type}`);
+      }
+    }
+  }
+  const feedbackSection =
+    feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";
+  // Build few-shot examples section if provided
+  const fewShotSection =
+    fewShotExamples && fewShotExamples.length > 0
+      ? `\n\nReference Examples (other well-written skills):\n${fewShotExamples.map((ex, i) => `--- Example ${i + 1} ---\n${ex}`).join("\n\n")}`
+      : "";
+  return `Skill Name: ${skillName}
+Current Skill Content:
+${currentContent}
+Failure Patterns:
+${patternLines.join("\n\n")}
+All Missed Queries:
+${missedLines}${feedbackSection}${fewShotSection}
+Generate an improved full body for the "${skillName}" skill that would correctly handle the missed queries listed above. The body should include everything below the # Title line: description, ## Workflow Routing table, and any other sections. Output ONLY a JSON object with "proposed_body", "rationale", and "confidence" fields.`;
+}
+// ---------------------------------------------------------------------------
+// Response parser
+// ---------------------------------------------------------------------------
+/**
+ * Turn the raw LLM reply into a validated body proposal.
+ *
+ * Markdown code fences are stripped before JSON parsing. The parsed value must
+ * be an object with string `proposed_body`, string `rationale` and numeric
+ * `confidence`; the confidence is clamped to [0, 1].
+ *
+ * @param raw - raw text returned by the LLM
+ * @returns the three validated fields
+ * @throws Error when the text is not JSON, is not an object, or a field is
+ *   missing or of the wrong type
+ */
+export function parseBodyProposalResponse(raw: string): {
+  proposed_body: string;
+  rationale: string;
+  confidence: number;
+} {
+  const text = stripMarkdownFences(raw);
+  let decoded: unknown;
+  try {
+    decoded = JSON.parse(text);
+  } catch {
+    throw new Error(`Failed to parse LLM response as JSON: ${text.slice(0, 200)}`);
+  }
+  if (decoded === null || typeof decoded !== "object") {
+    throw new Error("LLM response is not a JSON object");
+  }
+  const { proposed_body, rationale, confidence } = decoded as Record<string, unknown>;
+  if (typeof proposed_body !== "string") {
+    throw new Error("Missing or invalid 'proposed_body' field in LLM response");
+  }
+  if (typeof rationale !== "string") {
+    throw new Error("Missing or invalid 'rationale' field in LLM response");
+  }
+  if (typeof confidence !== "number") {
+    throw new Error("Missing or invalid 'confidence' field in LLM response");
+  }
+  return {
+    proposed_body,
+    rationale,
+    confidence: Math.min(1.0, Math.max(0.0, confidence)),
+  };
+}
+// ---------------------------------------------------------------------------
+// Proposal generator
+// ---------------------------------------------------------------------------
+/**
+ * Ask the teacher LLM for an improved skill body and wrap the answer in a
+ * pending `BodyEvolutionProposal`.
+ *
+ * The proposal id has the form `evo-body-<skillName>-<ms timestamp>`, and
+ * `failure_patterns` records the ids of the patterns that were supplied.
+ *
+ * @param currentContent - current skill content; stored as `original_body`
+ * @param failurePatterns - failure patterns forwarded to the prompt builder
+ * @param missedQueries - missed queries forwarded to the prompt builder
+ * @param skillName - name of the skill being evolved
+ * @param skillPath - path of the skill, copied onto the proposal
+ * @param agent - agent identifier passed through to `callLlm`
+ * @param modelFlag - optional model selector passed through to `callLlm`
+ * @param fewShotExamples - optional reference examples for the prompt
+ * @returns the new proposal with status "pending"
+ */
+export async function generateBodyProposal(
+  currentContent: string,
+  failurePatterns: FailurePattern[],
+  missedQueries: string[],
+  skillName: string,
+  skillPath: string,
+  agent: string,
+  modelFlag?: string,
+  fewShotExamples?: string[],
+): Promise<BodyEvolutionProposal> {
+  const userPrompt = buildBodyGenerationPrompt(
+    currentContent,
+    failurePatterns,
+    missedQueries,
+    skillName,
+    fewShotExamples,
+  );
+  const llmOutput = await callLlm(BODY_GENERATOR_SYSTEM, userPrompt, agent, modelFlag);
+  const parsed = parseBodyProposalResponse(llmOutput);
+  return {
+    proposal_id: `evo-body-${skillName}-${Date.now()}`,
+    skill_name: skillName,
+    skill_path: skillPath,
+    original_body: currentContent,
+    proposed_body: parsed.proposed_body,
+    rationale: parsed.rationale,
+    target: "body" as EvolutionTarget,
+    failure_patterns: failurePatterns.map((pattern) => pattern.pattern_id),
+    confidence: parsed.confidence,
+    created_at: new Date().toISOString(),
+    status: "pending",
+  };
+}

package/cli/selftune/evolution/propose-description.ts CHANGED Viewed

@@ -50,6 +50,23 @@ export function buildProposalPrompt(
   const missedLines = missedQueries.map((q) => `  - "${q}"`).join("\n");
+  // Build failure feedback section if any patterns have feedback
+  const feedbackLines: string[] = [];
+  for (const p of failurePatterns) {
+    if (p.feedback && p.feedback.length > 0) {
+      for (const fb of p.feedback) {
+        feedbackLines.push(`  Query: "${fb.query}"`);
+        feedbackLines.push(`    Failure reason: ${fb.failure_reason}`);
+        feedbackLines.push(`    Improvement hint: ${fb.improvement_hint}`);
+        if (fb.invocation_type) {
+          feedbackLines.push(`    Invocation type: ${fb.invocation_type}`);
+        }
+      }
+    }
+  }
+  const feedbackSection =
+    feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : "";
   return `Skill Name: ${skillName}
 Current Description:
@@ -59,7 +76,7 @@ Failure Patterns:
 ${patternLines.join("\n\n")}
 All Missed Queries:
-${missedLines}
+${missedLines}${feedbackSection}
 Propose an improved description for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_description", "rationale", and "confidence" fields.`;
 }
@@ -113,6 +130,86 @@ export function parseProposalResponse(raw: string): {
 // Proposal generator
 // ---------------------------------------------------------------------------
+/**
+ * Generate several description proposals concurrently, one per prompt
+ * variation produced by `buildPromptVariations`.
+ *
+ * All LLM calls run in parallel through `Promise.all`, so a single failed call
+ * or unparsable reply rejects the whole batch. Each proposal starts with
+ * zeroed `eval_results` and status "pending"; its id ends with the index of
+ * the variation it came from.
+ *
+ * @param currentDescription - description being evolved; stored as `original_description`
+ * @param failurePatterns - failure patterns forwarded to the prompt builder
+ * @param missedQueries - missed queries forwarded to the prompt builder
+ * @param skillName - name of the skill being evolved
+ * @param skillPath - path of the skill, copied onto each proposal
+ * @param agent - agent identifier passed through to `callLlm`
+ * @param count - number of proposals to request (default 3)
+ * @param modelFlag - optional model selector passed through to `callLlm`
+ * @returns one proposal per variation, in variation order
+ */
+export async function generateMultipleProposals(
+  currentDescription: string,
+  failurePatterns: FailurePattern[],
+  missedQueries: string[],
+  skillName: string,
+  skillPath: string,
+  agent: string,
+  count = 3,
+  modelFlag?: string,
+): Promise<EvolutionProposal[]> {
+  const prompts = buildPromptVariations(
+    currentDescription,
+    failurePatterns,
+    missedQueries,
+    skillName,
+    count,
+  );
+  // Turns one prompt variation into a pending proposal.
+  const toProposal = async (prompt: string, index: number) => {
+    const llmOutput = await callLlm(PROPOSER_SYSTEM, prompt, agent, modelFlag);
+    const parsed = parseProposalResponse(llmOutput);
+    return {
+      proposal_id: `evo-${skillName}-${Date.now()}-${index}`,
+      skill_name: skillName,
+      skill_path: skillPath,
+      original_description: currentDescription,
+      proposed_description: parsed.proposed_description,
+      rationale: parsed.rationale,
+      failure_patterns: failurePatterns.map((pattern) => pattern.pattern_id),
+      eval_results: {
+        before: { total: 0, passed: 0, failed: 0, pass_rate: 0 },
+        after: { total: 0, passed: 0, failed: 0, pass_rate: 0 },
+      },
+      confidence: parsed.confidence,
+      created_at: new Date().toISOString(),
+      status: "pending" as const,
+    };
+  };
+  const proposals = await Promise.all(prompts.map((prompt, index) => toProposal(prompt, index)));
+  return proposals;
+}
+/**
+ * Produce `count` variants of the base proposal prompt, each ending with an
+ * "Additional focus" line.
+ *
+ * Three focus lines exist (explicit, implicit, contextual invocation); they
+ * are used in that order and repeat cyclically when `count` exceeds three.
+ *
+ * @param currentDescription - description forwarded to `buildProposalPrompt`
+ * @param failurePatterns - failure patterns forwarded to `buildProposalPrompt`
+ * @param missedQueries - missed queries forwarded to `buildProposalPrompt`
+ * @param skillName - skill name forwarded to `buildProposalPrompt`
+ * @param count - number of variations to produce
+ * @returns the prompt variations, in focus order
+ */
+export function buildPromptVariations(
+  currentDescription: string,
+  failurePatterns: FailurePattern[],
+  missedQueries: string[],
+  skillName: string,
+  count: number,
+): string[] {
+  const focusLines: string[] = [
+    "Focus especially on improving explicit invocation (direct mentions of the skill).",
+    "Focus especially on improving implicit invocation (indirect references to skill capabilities).",
+    "Focus especially on improving contextual invocation (where the context implies the skill is needed).",
+  ];
+  const sharedPrompt = buildProposalPrompt(
+    currentDescription,
+    failurePatterns,
+    missedQueries,
+    skillName,
+  );
+  const withFocus = (focus: string): string => `${sharedPrompt}\n\nAdditional focus: ${focus}`;
+  const prompts: string[] = [];
+  for (let n = 0; n < count; n += 1) {
+    prompts.push(withFocus(focusLines[n % focusLines.length]));
+  }
+  return prompts;
+}
 /** Generate a complete evolution proposal using LLM. */
 export async function generateProposal(
   currentDescription: string,
@@ -121,9 +218,10 @@ export async function generateProposal(
   skillName: string,
   skillPath: string,
   agent: string,
+  modelFlag?: string,
 ): Promise<EvolutionProposal> {
   const prompt = buildProposalPrompt(currentDescription, failurePatterns, missedQueries, skillName);
-  const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt, agent);
+  const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt, agent, modelFlag);
   const { proposed_description, rationale, confidence } = parseProposalResponse(rawResponse);
   return {