npm - selftune - Versions diffs - 0.1.4 → 0.2.1 - Mend

selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153) hide show

package/.claude/agents/diagnosis-analyst.md +156 -0
package/.claude/agents/evolution-reviewer.md +180 -0
package/.claude/agents/integration-guide.md +212 -0
package/.claude/agents/pattern-analyst.md +160 -0
package/CHANGELOG.md +46 -1
package/README.md +105 -257
package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
package/apps/local-dashboard/dist/favicon.png +0 -0
package/apps/local-dashboard/dist/index.html +17 -0
package/apps/local-dashboard/dist/logo.png +0 -0
package/apps/local-dashboard/dist/logo.svg +9 -0
package/assets/BeforeAfter.gif +0 -0
package/assets/FeedbackLoop.gif +0 -0
package/assets/logo.svg +9 -0
package/assets/skill-health-badge.svg +20 -0
package/cli/selftune/activation-rules.ts +171 -0
package/cli/selftune/badge/badge-data.ts +108 -0
package/cli/selftune/badge/badge-svg.ts +212 -0
package/cli/selftune/badge/badge.ts +99 -0
package/cli/selftune/canonical-export.ts +183 -0
package/cli/selftune/constants.ts +103 -1
package/cli/selftune/contribute/bundle.ts +314 -0
package/cli/selftune/contribute/contribute.ts +214 -0
package/cli/selftune/contribute/sanitize.ts +162 -0
package/cli/selftune/cron/setup.ts +266 -0
package/cli/selftune/dashboard-contract.ts +202 -0
package/cli/selftune/dashboard-server.ts +1049 -0
package/cli/selftune/dashboard.ts +43 -156
package/cli/selftune/eval/baseline.ts +248 -0
package/cli/selftune/eval/composability-v2.ts +273 -0
package/cli/selftune/eval/composability.ts +117 -0
package/cli/selftune/eval/generate-unit-tests.ts +143 -0
package/cli/selftune/eval/hooks-to-evals.ts +101 -16
package/cli/selftune/eval/import-skillsbench.ts +221 -0
package/cli/selftune/eval/synthetic-evals.ts +172 -0
package/cli/selftune/eval/unit-test-cli.ts +152 -0
package/cli/selftune/eval/unit-test.ts +196 -0
package/cli/selftune/evolution/deploy-proposal.ts +142 -1
package/cli/selftune/evolution/evidence.ts +26 -0
package/cli/selftune/evolution/evolve-body.ts +586 -0
package/cli/selftune/evolution/evolve.ts +825 -116
package/cli/selftune/evolution/extract-patterns.ts +105 -16
package/cli/selftune/evolution/pareto.ts +314 -0
package/cli/selftune/evolution/propose-body.ts +171 -0
package/cli/selftune/evolution/propose-description.ts +100 -2
package/cli/selftune/evolution/propose-routing.ts +166 -0
package/cli/selftune/evolution/refine-body.ts +141 -0
package/cli/selftune/evolution/rollback.ts +21 -4
package/cli/selftune/evolution/validate-body.ts +254 -0
package/cli/selftune/evolution/validate-proposal.ts +257 -35
package/cli/selftune/evolution/validate-routing.ts +177 -0
package/cli/selftune/grading/auto-grade.ts +200 -0
package/cli/selftune/grading/grade-session.ts +513 -42
package/cli/selftune/grading/pre-gates.ts +104 -0
package/cli/selftune/grading/results.ts +42 -0
package/cli/selftune/hooks/auto-activate.ts +185 -0
package/cli/selftune/hooks/evolution-guard.ts +165 -0
package/cli/selftune/hooks/prompt-log.ts +172 -2
package/cli/selftune/hooks/session-stop.ts +123 -3
package/cli/selftune/hooks/skill-change-guard.ts +112 -0
package/cli/selftune/hooks/skill-eval.ts +119 -3
package/cli/selftune/index.ts +415 -48
package/cli/selftune/ingestors/claude-replay.ts +377 -0
package/cli/selftune/ingestors/codex-rollout.ts +345 -46
package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
package/cli/selftune/init.ts +376 -16
package/cli/selftune/last.ts +14 -5
package/cli/selftune/localdb/db.ts +63 -0
package/cli/selftune/localdb/materialize.ts +428 -0
package/cli/selftune/localdb/queries.ts +376 -0
package/cli/selftune/localdb/schema.ts +204 -0
package/cli/selftune/memory/writer.ts +447 -0
package/cli/selftune/monitoring/watch.ts +90 -16
package/cli/selftune/normalization.ts +682 -0
package/cli/selftune/observability.ts +19 -44
package/cli/selftune/orchestrate.ts +1073 -0
package/cli/selftune/quickstart.ts +203 -0
package/cli/selftune/repair/skill-usage.ts +576 -0
package/cli/selftune/schedule.ts +561 -0
package/cli/selftune/status.ts +59 -33
package/cli/selftune/sync.ts +627 -0
package/cli/selftune/types.ts +525 -5
package/cli/selftune/utils/canonical-log.ts +45 -0
package/cli/selftune/utils/frontmatter.ts +217 -0
package/cli/selftune/utils/hooks.ts +41 -0
package/cli/selftune/utils/html.ts +27 -0
package/cli/selftune/utils/llm-call.ts +103 -19
package/cli/selftune/utils/math.ts +10 -0
package/cli/selftune/utils/query-filter.ts +139 -0
package/cli/selftune/utils/skill-discovery.ts +340 -0
package/cli/selftune/utils/skill-log.ts +68 -0
package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
package/cli/selftune/utils/transcript.ts +307 -26
package/cli/selftune/utils/trigger-check.ts +89 -0
package/cli/selftune/utils/tui.ts +156 -0
package/cli/selftune/workflows/discover.ts +254 -0
package/cli/selftune/workflows/skill-md-writer.ts +288 -0
package/cli/selftune/workflows/workflows.ts +188 -0
package/package.json +28 -11
package/packages/telemetry-contract/README.md +11 -0
package/packages/telemetry-contract/fixtures/golden.json +87 -0
package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
package/packages/telemetry-contract/index.ts +1 -0
package/packages/telemetry-contract/package.json +19 -0
package/packages/telemetry-contract/src/index.ts +2 -0
package/packages/telemetry-contract/src/types.ts +163 -0
package/packages/telemetry-contract/src/validators.ts +109 -0
package/skill/SKILL.md +180 -33
package/skill/Workflows/AutoActivation.md +145 -0
package/skill/Workflows/Badge.md +124 -0
package/skill/Workflows/Baseline.md +144 -0
package/skill/Workflows/Composability.md +107 -0
package/skill/Workflows/Contribute.md +94 -0
package/skill/Workflows/Cron.md +132 -0
package/skill/Workflows/Dashboard.md +214 -0
package/skill/Workflows/Doctor.md +63 -14
package/skill/Workflows/Evals.md +110 -18
package/skill/Workflows/EvolutionMemory.md +154 -0
package/skill/Workflows/Evolve.md +181 -21
package/skill/Workflows/EvolveBody.md +159 -0
package/skill/Workflows/Grade.md +36 -31
package/skill/Workflows/ImportSkillsBench.md +117 -0
package/skill/Workflows/Ingest.md +142 -21
package/skill/Workflows/Initialize.md +91 -23
package/skill/Workflows/Orchestrate.md +139 -0
package/skill/Workflows/Replay.md +91 -0
package/skill/Workflows/Rollback.md +23 -4
package/skill/Workflows/Schedule.md +61 -0
package/skill/Workflows/Sync.md +88 -0
package/skill/Workflows/UnitTest.md +150 -0
package/skill/Workflows/Watch.md +33 -1
package/skill/Workflows/Workflows.md +129 -0
package/skill/assets/activation-rules-default.json +26 -0
package/skill/assets/multi-skill-settings.json +63 -0
package/skill/assets/single-skill-settings.json +57 -0
package/skill/references/invocation-taxonomy.md +2 -2
package/skill/references/logs.md +164 -2
package/skill/references/setup-patterns.md +65 -0
package/skill/references/version-history.md +40 -0
package/skill/settings_snippet.json +23 -0
package/templates/activation-rules-default.json +27 -0
package/templates/multi-skill-settings.json +64 -0
package/templates/single-skill-settings.json +58 -0
package/dashboard/index.html +0 -1119

package/cli/selftune/eval/hooks-to-evals.ts CHANGED Viewed

@@ -26,7 +26,15 @@ import type {
   SkillUsageRecord,
 } from "../types.js";
 import { readJsonl } from "../utils/jsonl.js";
+import { detectAgent } from "../utils/llm-call.js";
+import {
+  filterActionableQueryRecords,
+  filterActionableSkillUsageRecords,
+} from "../utils/query-filter.js";
 import { seededShuffle } from "../utils/seeded-random.js";
+import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
+import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
+import { generateSyntheticEvals } from "./synthetic-evals.js";
 // ---------------------------------------------------------------------------
 // Query truncation
@@ -114,14 +122,16 @@ export function buildEvalSet(
   seed = 42,
   annotateTaxonomy = true,
 ): EvalEntry[] {
+  const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
+  const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
   const effectiveMaxPerSide = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
   const effectiveSeed = Number.isNaN(seed) ? 42 : seed;
   // Build set of positive query texts (for exclusion from negatives)
   const positiveQueries = new Set<string>();
-  for (const r of skillRecords) {
+  for (const r of actionableSkillRecords) {
     if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
-    if (r.skill_name === skillName) {
+    if (isHighConfidencePositiveSkillRecord(r, skillName)) {
       const q = (r.query ?? "").trim();
       if (q && q !== "(query not found)") {
         positiveQueries.add(q);
@@ -132,9 +142,9 @@ export function buildEvalSet(
   // Build deduplicated positives with taxonomy classification
   const seen = new Set<string>();
   const positives: EvalEntry[] = [];
-  for (const r of skillRecords) {
+  for (const r of actionableSkillRecords) {
     if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
-    if (r.skill_name !== skillName) continue;
+    if (!isHighConfidencePositiveSkillRecord(r, skillName)) continue;
     const q = (r.query ?? "").trim();
     if (!q || q === "(query not found)" || seen.has(q)) continue;
     seen.add(q);
@@ -151,7 +161,7 @@ export function buildEvalSet(
   if (includeNegatives) {
     const negCandidates: string[] = [];
     const negSeen = new Set<string>();
-    for (const r of queryRecords) {
+    for (const r of actionableQueryRecords) {
       if (!r || typeof r.query !== "string") continue;
       const q = (r.query ?? "").trim();
       if (!q || positiveQueries.has(q) || negSeen.has(q)) continue;
@@ -196,13 +206,17 @@ export function listSkills(
   queryRecords: QueryLogRecord[],
   telemetryRecords: SessionTelemetryRecord[],
 ): void {
+  const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
+  const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
   const counts = new Map<string, number>();
-  for (const r of skillRecords) {
+  for (const r of actionableSkillRecords) {
     const name = r.skill_name ?? "unknown";
     counts.set(name, (counts.get(name) ?? 0) + 1);
   }
-  console.log(`Skill triggers in skill_usage_log (${skillRecords.length} total records):`);
+  console.log(
+    `Skill triggers in skill_usage_log (${actionableSkillRecords.length} actionable records):`,
+  );
   if (counts.size > 0) {
     const sorted = [...counts.entries()].sort((a, b) => b[1] - a[1]);
     for (const [name, count] of sorted) {
@@ -212,8 +226,8 @@ export function listSkills(
     console.log("  (none yet -- trigger some skills in Claude Code to populate)");
   }
-  console.log(`\nAll queries in all_queries_log: ${queryRecords.length}`);
-  if (queryRecords.length === 0) {
+  console.log(`\nActionable queries in all_queries_log: ${actionableQueryRecords.length}`);
+  if (actionableQueryRecords.length === 0) {
     console.log("  (none yet -- make sure prompt_log_hook is installed)");
   }
@@ -301,14 +315,16 @@ export function printEvalStats(
 ): void {
   const pos = evalSet.filter((e) => e.should_trigger);
   const neg = evalSet.filter((e) => !e.should_trigger);
-  const totalTriggers = skillRecords.filter((r) => r.skill_name === skillName).length;
+  const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
+  const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
+  const totalTriggers = actionableSkillRecords.filter((r) => r.skill_name === skillName).length;
   console.log(`Wrote ${evalSet.length} eval entries to ${outputPath}`);
   console.log(
     `  Positives (should_trigger=true) : ${pos.length}  (from ${totalTriggers} logged triggers)`,
   );
   console.log(
-    `  Negatives (should_trigger=false): ${neg.length}  (from ${queryRecords.length} total logged queries)`,
+    `  Negatives (should_trigger=false): ${neg.length}  (from ${actionableQueryRecords.length} actionable logged queries)`,
   );
   if (annotateTaxonomy && pos.length > 0) {
@@ -334,7 +350,7 @@ export function printEvalStats(
   console.log();
   if (pos.length === 0) {
     console.log(`[WARN] No positives for skill '${skillName}'.`);
-    const names = [...new Set(skillRecords.map((r) => r.skill_name))].sort();
+    const names = [...new Set(actionableSkillRecords.map((r) => r.skill_name))].sort();
     if (names.length > 0) {
       console.log(`       Known skills: ${names.join(", ")}`);
     }
@@ -359,11 +375,12 @@ export function printEvalStats(
 // CLI entry point
 // ---------------------------------------------------------------------------
-export function cliMain(): void {
+export async function cliMain(): Promise<void> {
   const { values } = parseArgs({
     options: {
       skill: { type: "string" },
       output: { type: "string" },
+      out: { type: "string" },
       max: { type: "string", default: "50" },
       seed: { type: "string", default: "42" },
       "list-skills": { type: "boolean", default: false },
@@ -373,11 +390,76 @@ export function cliMain(): void {
       "skill-log": { type: "string", default: SKILL_LOG },
       "query-log": { type: "string", default: QUERY_LOG },
       "telemetry-log": { type: "string", default: TELEMETRY_LOG },
+      synthetic: { type: "boolean", default: false },
+      "skill-path": { type: "string" },
+      model: { type: "string" },
     },
     strict: true,
   });
-  const skillRecords = readJsonl<SkillUsageRecord>(values["skill-log"] ?? SKILL_LOG);
+  // --- Synthetic mode: generate evals from SKILL.md via LLM ---
+  if (values.synthetic) {
+    if (!values.skill) {
+      console.error("[ERROR] --skill required with --synthetic");
+      process.exit(1);
+    }
+    if (!values["skill-path"]) {
+      console.error("[ERROR] --skill-path required with --synthetic");
+      process.exit(1);
+    }
+    const agent = detectAgent();
+    if (!agent) {
+      console.error("[ERROR] No agent CLI found (claude/codex/opencode). Install one first.");
+      process.exit(1);
+    }
+    const maxPerSide = Number.parseInt(values.max ?? "50", 10);
+    const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
+    console.log(`Generating synthetic evals for skill '${values.skill}'...`);
+    const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
+      maxPositives: effectiveMax,
+      maxNegatives: effectiveMax,
+      modelFlag: values.model,
+    });
+    const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
+    writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
+    const pos = evalSet.filter((e) => e.should_trigger);
+    const neg = evalSet.filter((e) => !e.should_trigger);
+    console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
+    console.log(`  Positives (should_trigger=true) : ${pos.length}`);
+    console.log(`  Negatives (should_trigger=false): ${neg.length}`);
+    if (pos.length > 0) {
+      const types = new Map<string, number>();
+      for (const e of pos) {
+        const t = e.invocation_type ?? "?";
+        types.set(t, (types.get(t) ?? 0) + 1);
+      }
+      console.log("\n  Positive invocation types:");
+      for (const [t, c] of [...types.entries()].sort()) {
+        console.log(`    ${t.padEnd(15)}  ${c}`);
+      }
+    }
+    console.log("\nNext steps:");
+    console.log("  bun run cli/selftune/eval/run-eval.ts \\");
+    console.log(`    --eval-set ${outputPath} \\`);
+    console.log(`    --skill-path ${values["skill-path"]} \\`);
+    console.log("    --runs-per-query 3 --verbose");
+    return;
+  }
+  // --- Log-based mode (original behavior) ---
+  const skillLogPath = values["skill-log"] ?? SKILL_LOG;
+  const skillRecords =
+    skillLogPath === SKILL_LOG
+      ? readEffectiveSkillUsageRecords()
+      : readJsonl<SkillUsageRecord>(skillLogPath);
   const queryRecords = readJsonl<QueryLogRecord>(values["query-log"] ?? QUERY_LOG);
   const telemetryRecords = readJsonl<SessionTelemetryRecord>(
     values["telemetry-log"] ?? TELEMETRY_LOG,
@@ -412,11 +494,14 @@ export function cliMain(): void {
     annotateTaxonomy,
   );
-  const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
+  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
   writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
   printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
 }
 if (import.meta.main) {
-  cliMain();
+  cliMain().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
 }

package/cli/selftune/eval/import-skillsbench.ts ADDED Viewed

@@ -0,0 +1,221 @@
+#!/usr/bin/env bun
+/**
+ * import-skillsbench.ts
+ *
+ * Imports task definitions from a SkillsBench-style corpus directory and
+ * converts them into EvalEntry arrays for use with selftune eval/grading.
+ *
+ * Expected directory structure:
+ *   <dir>/tasks/<task-id>/instruction.md   — task description (query text)
+ *   <dir>/tasks/<task-id>/task.toml        — metadata (difficulty, category, tags, etc.)
+ */
+import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+import { parseArgs } from "node:util";
+import type { EvalEntry, SkillsBenchTask } from "../types.js";
+// ---------------------------------------------------------------------------
+// Minimal TOML parser (handles the subset used by SkillsBench task.toml files)
+// ---------------------------------------------------------------------------
/** Strips one leading and/or one trailing quote character from a TOML token. */
const TOML_QUOTE_EDGES = /^["']|["']$/g;

/** Removes a trailing `# comment` from a TOML value, ignoring `#` inside quotes. */
function stripTomlInlineComment(value: string): string {
  let openQuote = "";
  for (let i = 0; i < value.length; i++) {
    const ch = value.charAt(i);
    if (openQuote) {
      if (ch === openQuote) openQuote = "";
    } else if (ch === '"' || ch === "'") {
      openQuote = ch;
    } else if (ch === "#") {
      return value.slice(0, i);
    }
  }
  return value;
}

/** Splits `text` on `separator`, leaving separators inside quoted runs untouched. */
function splitOutsideQuotes(text: string, separator: string): string[] {
  const parts: string[] = [];
  let openQuote = "";
  let current = "";
  for (const ch of text) {
    if (openQuote) {
      if (ch === openQuote) openQuote = "";
      current += ch;
    } else if (ch === '"' || ch === "'") {
      openQuote = ch;
      current += ch;
    } else if (ch === separator) {
      parts.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  parts.push(current);
  return parts;
}

/**
 * Parse the subset of TOML used by SkillsBench task.toml files.
 *
 * Supports: single-line `key = "value"`, flat string arrays `["a", "b"]`,
 * bare values (numbers, booleans — kept as their raw string form), and a
 * trailing `# comment` after a value.
 *
 * Does NOT support: multi-line / triple-quoted strings, inline tables,
 * nested arrays, escape sequences, or section headers (`[table]`). Lines
 * without an `=` (including section headers) are skipped.
 *
 * @param content - Raw text of a task.toml file.
 * @returns Map from key to either a string or a string[] value.
 */
function parseSimpleToml(content: string): Record<string, unknown> {
  const result: Record<string, unknown> = {};
  for (const rawLine of content.split("\n")) {
    // trim() also drops the "\r" left behind by CRLF line endings.
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) continue;
    const eqIdx = line.indexOf("=");
    if (eqIdx === -1) continue;
    const key = line.slice(0, eqIdx).trim();
    if (!key) continue;
    // Drop trailing comments first so they cannot leak into the value or
    // confuse the search for the closing bracket below.
    const rawValue = stripTomlInlineComment(line.slice(eqIdx + 1)).trim();
    if (rawValue.startsWith("[")) {
      // Array value — tolerate a missing "]" instead of eating the last char.
      const closeIdx = rawValue.lastIndexOf("]");
      const arrayContent = rawValue.slice(1, closeIdx === -1 ? rawValue.length : closeIdx);
      const items: string[] = [];
      for (const item of splitOutsideQuotes(arrayContent, ",")) {
        const trimmed = item.trim().replace(TOML_QUOTE_EDGES, "");
        if (trimmed) items.push(trimmed);
      }
      result[key] = items;
    } else if (rawValue.startsWith('"') || rawValue.startsWith("'")) {
      // Quoted string value
      result[key] = rawValue.replace(TOML_QUOTE_EDGES, "");
    } else {
      // Bare value (number, boolean, etc.) — intentionally left as a string.
      result[key] = rawValue;
    }
  }
  return result;
}
+// ---------------------------------------------------------------------------
+// Parse SkillsBench directory
+// ---------------------------------------------------------------------------
/**
 * Load every task found under `<dirPath>/tasks/<task-id>/`.
 *
 * A task directory is included only when it contains a non-empty
 * `instruction.md`; that file's trimmed text becomes the task query.
 * An optional `task.toml` supplies metadata. Metadata values of the wrong
 * type are ignored rather than cast, so `category` falls back to "general"
 * and `difficulty` to "medium" when missing or invalid.
 *
 * Tasks are returned sorted by task id so the result does not depend on the
 * order in which the filesystem enumerates directories.
 *
 * @param dirPath - Root of the SkillsBench corpus (the parent of `tasks/`).
 * @returns Parsed tasks, or an empty array when `tasks/` is missing or unreadable.
 */
export function parseSkillsBenchDir(dirPath: string): SkillsBenchTask[] {
  const tasksDir = join(dirPath, "tasks");
  if (!existsSync(tasksDir)) return [];

  let taskIds: string[];
  try {
    taskIds = readdirSync(tasksDir, { withFileTypes: true })
      .filter((entry) => entry.isDirectory())
      .map((entry) => entry.name)
      .sort();
  } catch {
    // Best-effort: an unreadable tasks directory is treated like an empty one.
    return [];
  }

  const tasks: SkillsBenchTask[] = [];
  for (const taskId of taskIds) {
    const taskDir = join(tasksDir, taskId);
    const instructionPath = join(taskDir, "instruction.md");
    if (!existsSync(instructionPath)) continue;
    const query = readFileSync(instructionPath, "utf-8").trim();
    if (!query) continue;

    // Parse optional task.toml
    const tomlPath = join(taskDir, "task.toml");
    const metadata: Record<string, unknown> = existsSync(tomlPath)
      ? parseSimpleToml(readFileSync(tomlPath, "utf-8"))
      : {};

    const rawDifficulty = metadata.difficulty;
    const task: SkillsBenchTask = {
      task_id: taskId,
      category: typeof metadata.category === "string" ? metadata.category : "general",
      query,
      difficulty:
        rawDifficulty === "easy" || rawDifficulty === "medium" || rawDifficulty === "hard"
          ? rawDifficulty
          : "medium",
    };
    // Only accept a real, non-empty string; an array-valued key must not be
    // smuggled into a string field.
    if (typeof metadata.expected_skill === "string" && metadata.expected_skill) {
      task.expected_skill = metadata.expected_skill;
    }
    if (Array.isArray(metadata.expected_tools)) {
      task.expected_tools = metadata.expected_tools as string[];
    }
    if (Array.isArray(metadata.tags)) {
      task.tags = metadata.tags as string[];
    }
    tasks.push(task);
  }
  return tasks;
}
+// ---------------------------------------------------------------------------
+// Convert tasks to EvalEntries
+// ---------------------------------------------------------------------------
/**
 * Turn the tasks that belong to `targetSkill` into positive eval entries.
 *
 * - "exact": a task matches when its `expected_skill` equals `targetSkill`.
 * - "fuzzy": case-insensitive substring match, in either direction, between
 *   `targetSkill` and the task's category, expected_skill, or any tag.
 *
 * @param tasks - Tasks parsed from a SkillsBench corpus.
 * @param targetSkill - Skill name to select tasks for.
 * @param matchStrategy - Matching mode; defaults to "exact".
 * @returns One `{ query, should_trigger: true }` entry per matching task, in input order.
 */
export function convertToEvalEntries(
  tasks: SkillsBenchTask[],
  targetSkill: string,
  matchStrategy: "exact" | "fuzzy" = "exact",
): EvalEntry[] {
  const needle = targetSkill.trim().toLowerCase();
  // The empty string is a substring of everything, so a blank skill name
  // would otherwise mark every task as a fuzzy match.
  if (matchStrategy !== "exact" && !needle) return [];

  const matchesTask = (task: SkillsBenchTask): boolean => {
    if (matchStrategy === "exact") {
      return task.expected_skill === targetSkill;
    }
    const haystacks = [task.category, task.expected_skill, ...(task.tags ?? [])]
      .filter((s): s is string => typeof s === "string" && s.length > 0)
      .map((s) => s.toLowerCase());
    return haystacks.some((s) => s.includes(needle) || needle.includes(s));
  };

  return tasks.filter(matchesTask).map((task) => ({
    query: task.query,
    should_trigger: true,
  }));
}
+// ---------------------------------------------------------------------------
+// CLI entry point
+// ---------------------------------------------------------------------------
/**
 * CLI entry point: import a SkillsBench corpus and write the matching eval
 * entries for one skill to a JSON file.
 *
 * Flags: `--dir` (required), `--skill` (required), `--output` (defaults to
 * `<skill>_skillsbench_eval.json`), `--match-strategy` (`exact` | `fuzzy`,
 * defaults to `exact`).
 *
 * Exits with status 1 when a required flag is missing, when
 * `--match-strategy` is not a recognised value, or when the corpus contains
 * no tasks. When tasks exist but none match, a warning is printed and an
 * empty array is still written.
 */
export function cliMain(): void {
  const { values } = parseArgs({
    options: {
      dir: { type: "string" },
      skill: { type: "string" },
      output: { type: "string" },
      "match-strategy": { type: "string", default: "exact" },
    },
    strict: true,
  });
  if (!values.dir) {
    console.error("[ERROR] --dir required (path to SkillsBench corpus directory)");
    process.exit(1);
  }
  if (!values.skill) {
    console.error("[ERROR] --skill required (target skill name)");
    process.exit(1);
  }
  // Reject unknown strategies instead of silently running an exact match,
  // which would hide typos such as `--match-strategy fuzy`.
  const requestedStrategy = values["match-strategy"] ?? "exact";
  if (requestedStrategy !== "exact" && requestedStrategy !== "fuzzy") {
    console.error(
      `[ERROR] --match-strategy must be 'exact' or 'fuzzy' (got '${requestedStrategy}')`,
    );
    process.exit(1);
  }
  const matchStrategy: "exact" | "fuzzy" = requestedStrategy;
  const tasks = parseSkillsBenchDir(values.dir);
  if (tasks.length === 0) {
    console.error(`[WARN] No tasks found in ${values.dir}/tasks/`);
    console.error("Expected structure: <dir>/tasks/<task-id>/instruction.md");
    process.exit(1);
  }
  console.log(`Parsed ${tasks.length} tasks from ${values.dir}`);
  const entries = convertToEvalEntries(tasks, values.skill, matchStrategy);
  if (entries.length === 0) {
    console.log(
      `[WARN] No tasks matched skill '${values.skill}' with strategy '${matchStrategy}'.`,
    );
    // List what the corpus does offer so the user can correct the skill name.
    console.log("Available expected_skills:");
    const skills = [...new Set(tasks.map((t) => t.expected_skill).filter(Boolean))].sort();
    for (const s of skills) {
      console.log(`  ${s}`);
    }
    if (matchStrategy === "exact") {
      console.log("\nTip: try --match-strategy fuzzy for keyword-based matching.");
    }
  }
  const outputPath = values.output ?? `${values.skill}_skillsbench_eval.json`;
  writeFileSync(outputPath, JSON.stringify(entries, null, 2), "utf-8");
  console.log(`Wrote ${entries.length} eval entries to ${outputPath}`);
}

// Run the CLI only when this file is executed directly, not when imported.
if (import.meta.main) {
  cliMain();
}

package/cli/selftune/eval/synthetic-evals.ts ADDED Viewed

@@ -0,0 +1,172 @@
+/**
+ * synthetic-evals.ts
+ *
+ * Generates eval queries from a SKILL.md using an LLM, without requiring
+ * real session logs. Solves the cold-start problem for new skills that
+ * have no telemetry data yet.
+ */
+import { readFileSync } from "node:fs";
+import type { EvalEntry, InvocationType } from "../types.js";
+import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
+import { classifyInvocation } from "./hooks-to-evals.js";
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
/** Optional tuning knobs accepted by `generateSyntheticEvals`. */
export interface SyntheticEvalOptions {
  /** Value forwarded to `callLlm` as its final argument. */
  modelFlag?: string;
  /** Number of should-trigger queries requested from the LLM. */
  maxPositives?: number;
  /** Number of should-not-trigger queries requested from the LLM. */
  maxNegatives?: number;
}
/** Shape of one element of the JSON array the LLM is asked to return. */
interface RawSyntheticEntry {
  /** Whether the query is meant to trigger the skill. */
  should_trigger: boolean;
  /** The generated user query text. */
  query: string;
  /** Type label supplied by the LLM. */
  invocation_type?: string;
}
+// ---------------------------------------------------------------------------
+// Prompt building
+// ---------------------------------------------------------------------------
/**
 * Assemble the system and user prompts that ask an LLM to invent eval
 * queries for a skill.
 *
 * @param skillContent - Full text of the skill's SKILL.md.
 * @param skillName - Skill name, interpolated into both prompts.
 * @param maxPositives - Number of should-trigger queries to request.
 * @param maxNegatives - Number of should-not-trigger queries to request.
 * @returns The `system` and `user` prompt strings.
 */
export function buildSyntheticPrompt(
  skillContent: string,
  skillName: string,
  maxPositives: number,
  maxNegatives: number,
): { system: string; user: string } {
  // Each array element is one line of the final prompt.
  const systemLines = [
    "You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries.",
    "For POSITIVE queries (should trigger this skill):",
    "- Generate a mix of:",
    "  - Explicit: directly names the skill or uses $" + skillName + " syntax",
    "  - Implicit: describes the task without naming the skill",
    "  - Contextual: natural language with domain context, proper nouns, dates, filenames",
    "- Vary phrasing, formality, and specificity",
    "For NEGATIVE queries (should NOT trigger this skill):",
    "- Queries that are topically adjacent but wrong intent",
    "- Queries for different skills that share keywords",
    "- Generic queries unrelated to this skill",
    "Output as JSON array with no surrounding text:",
    '[{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]',
  ];
  const userLines = [
    "Skill name: " + skillName,
    "Skill content:",
    skillContent,
    "Generate exactly " +
      maxPositives +
      " positive queries (should_trigger: true) and " +
      maxNegatives +
      " negative queries (should_trigger: false). Return ONLY the JSON array.",
  ];
  return {
    system: systemLines.join("\n"),
    user: userLines.join("\n"),
  };
}
+// ---------------------------------------------------------------------------
+// Response parsing
+// ---------------------------------------------------------------------------
/**
 * Extract eval entries from a raw LLM response.
 *
 * The response is expected to contain a JSON array of
 * `{ query, should_trigger, invocation_type? }` objects, optionally wrapped
 * in a markdown code fence and/or surrounded by prose. Elements that are not
 * objects, lack a string `query` or boolean `should_trigger`, or have a
 * blank query are dropped. The LLM-supplied `invocation_type` is ignored:
 * positives are re-classified with `classifyInvocation`, negatives are
 * always labelled "negative".
 *
 * @param raw - Raw text returned by the LLM.
 * @param skillName - Skill name passed to `classifyInvocation` for positives.
 * @returns The validated, trimmed eval entries in response order.
 * @throws Error when no JSON array can be located or parsed.
 */
export function parseSyntheticResponse(raw: string, skillName: string): EvalEntry[] {
  let text = raw.trim();
  // Strip markdown fences manually for array-first JSON
  // (stripMarkdownFences slices to first '{' which breaks '[' arrays).
  // `\r?` lets a CRLF-terminated opening fence be recognised as well.
  const fenceMatch = text.match(/^```\w*\r?\n/);
  if (fenceMatch) {
    text = text.slice(fenceMatch[0].length);
    const closingIdx = text.lastIndexOf("```");
    if (closingIdx >= 0) {
      text = text.slice(0, closingIdx);
    }
    text = text.trim();
  }

  // Find the JSON array start
  let bracketIdx = text.indexOf("[");
  if (bracketIdx < 0) {
    // No array found — try stripMarkdownFences as fallback for edge cases
    const cleaned = stripMarkdownFences(raw);
    const retryIdx = cleaned.indexOf("[");
    if (retryIdx < 0) {
      throw new Error(`Failed to parse synthetic eval response as JSON: ${text.slice(0, 200)}`);
    }
    text = cleaned;
    bracketIdx = retryIdx;
  }

  // Trim leading content before the first '[' and trailing content after the last ']'.
  const tail = text.slice(bracketIdx);
  const lastBracket = tail.lastIndexOf("]");
  const jsonText = lastBracket >= 0 ? tail.slice(0, lastBracket + 1) : tail;

  // First try from the first '[' (the historical behaviour). If that fails,
  // prose before the array probably contained a bracket (e.g. "Here are
  // [some] queries: [...]"), so retry from the first '[' that actually opens
  // an array of objects or an empty array.
  const candidates = [jsonText];
  const objectArrayStart = jsonText.search(/\[\s*[{\]]/);
  if (objectArrayStart > 0) {
    candidates.push(jsonText.slice(objectArrayStart));
  }

  let entries: unknown;
  let parsed = false;
  for (const candidate of candidates) {
    try {
      entries = JSON.parse(candidate);
      parsed = true;
      break;
    } catch {
      // Fall through to the next candidate; failure is reported below.
    }
  }
  if (!parsed) {
    throw new Error(`Failed to parse synthetic eval response as JSON: ${jsonText.slice(0, 200)}`);
  }
  if (!Array.isArray(entries)) {
    throw new Error("Synthetic eval response is not a JSON array");
  }

  const result: EvalEntry[] = [];
  for (const entry of entries as unknown[]) {
    if (typeof entry !== "object" || entry === null) continue;
    const { query: rawQuery, should_trigger: shouldTrigger } = entry as {
      query?: unknown;
      should_trigger?: unknown;
    };
    if (typeof rawQuery !== "string" || typeof shouldTrigger !== "boolean") continue;
    const query = rawQuery.trim();
    if (!query) continue;
    // For positives, use classifyInvocation to verify/override the LLM's type
    const invocationType: InvocationType = shouldTrigger
      ? classifyInvocation(query, skillName)
      : "negative";
    result.push({
      query,
      should_trigger: shouldTrigger,
      invocation_type: invocationType,
    });
  }
  return result;
}
+// ---------------------------------------------------------------------------
+// Main entry point
+// ---------------------------------------------------------------------------
/**
 * Generate eval entries for a skill by prompting an LLM with its SKILL.md.
 *
 * Reads the file at `skillPath`, builds the prompts with
 * `buildSyntheticPrompt`, sends them through `callLlm`, and parses the reply
 * with `parseSyntheticResponse`.
 *
 * @param skillPath - Path to the skill file whose text is given to the LLM.
 * @param skillName - Skill name used in the prompt and when classifying positives.
 * @param agent - Agent identifier passed through to `callLlm`.
 * @param options - Optional counts (default 15 positives / 10 negatives) and model flag.
 * @returns The parsed eval entries.
 */
export async function generateSyntheticEvals(
  skillPath: string,
  skillName: string,
  agent: string,
  options: SyntheticEvalOptions = {},
): Promise<EvalEntry[]> {
  const positiveCount = options.maxPositives ?? 15;
  const negativeCount = options.maxNegatives ?? 10;
  const skillText = readFileSync(skillPath, "utf-8");
  const prompts = buildSyntheticPrompt(skillText, skillName, positiveCount, negativeCount);
  const llmReply = await callLlm(prompts.system, prompts.user, agent, options.modelFlag);
  return parseSyntheticResponse(llmReply, skillName);
}