npm - selftune - Versions diffs - 0.2.16 → 0.2.19 - Mend

selftune 0.2.16 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91)

package/README.md +32 -22
package/apps/local-dashboard/dist/assets/index-DnhnXQm6.js +60 -0
package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
package/apps/local-dashboard/dist/index.html +5 -5
package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
package/cli/selftune/alpha-upload/client.ts +51 -1
package/cli/selftune/alpha-upload/flush.ts +46 -5
package/cli/selftune/alpha-upload/stage-canonical.ts +32 -10
package/cli/selftune/alpha-upload-contract.ts +9 -0
package/cli/selftune/constants.ts +92 -5
package/cli/selftune/contribute/contribute.ts +30 -2
package/cli/selftune/contribute/sanitize.ts +52 -5
package/cli/selftune/contribution-config.ts +249 -0
package/cli/selftune/contribution-relay.ts +177 -0
package/cli/selftune/contribution-signals.ts +219 -0
package/cli/selftune/contribution-staging.ts +147 -0
package/cli/selftune/contributions.ts +532 -0
package/cli/selftune/creator-contributions.ts +333 -0
package/cli/selftune/dashboard-contract.ts +305 -1
package/cli/selftune/dashboard-server.ts +47 -13
package/cli/selftune/eval/family-overlap.ts +395 -0
package/cli/selftune/eval/hooks-to-evals.ts +182 -28
package/cli/selftune/eval/synthetic-evals.ts +298 -11
package/cli/selftune/evolution/description-quality.ts +12 -11
package/cli/selftune/evolution/evolve.ts +214 -51
package/cli/selftune/evolution/validate-proposal.ts +9 -6
package/cli/selftune/export.ts +2 -2
package/cli/selftune/grading/grade-session.ts +20 -0
package/cli/selftune/hooks/commit-track.ts +188 -0
package/cli/selftune/hooks/prompt-log.ts +10 -1
package/cli/selftune/hooks/session-stop.ts +2 -2
package/cli/selftune/hooks/skill-eval.ts +15 -1
package/cli/selftune/hooks/stdin-preview.ts +32 -0
package/cli/selftune/index.ts +41 -5
package/cli/selftune/ingestors/codex-rollout.ts +31 -35
package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
package/cli/selftune/localdb/db.ts +2 -2
package/cli/selftune/localdb/direct-write.ts +69 -6
package/cli/selftune/localdb/queries.ts +1253 -37
package/cli/selftune/localdb/schema.ts +66 -0
package/cli/selftune/orchestrate.ts +32 -4
package/cli/selftune/recover.ts +153 -0
package/cli/selftune/repair/skill-usage.ts +363 -4
package/cli/selftune/routes/actions.ts +35 -1
package/cli/selftune/routes/analytics.ts +14 -0
package/cli/selftune/routes/index.ts +1 -0
package/cli/selftune/routes/overview.ts +150 -4
package/cli/selftune/routes/skill-report.ts +648 -18
package/cli/selftune/status.ts +81 -2
package/cli/selftune/sync.ts +56 -2
package/cli/selftune/trust-model.ts +66 -0
package/cli/selftune/types.ts +80 -0
package/cli/selftune/utils/skill-detection.ts +43 -0
package/cli/selftune/utils/transcript.ts +210 -1
package/cli/selftune/watchlist.ts +65 -0
package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
package/package.json +1 -1
package/packages/telemetry-contract/src/types.ts +11 -0
package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
package/packages/ui/src/components/EvidenceViewer.tsx +335 -144
package/packages/ui/src/components/EvolutionTimeline.tsx +58 -28
package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
package/packages/ui/src/components/section-cards.tsx +12 -9
package/packages/ui/src/primitives/card.tsx +1 -1
package/skill/SKILL.md +40 -2
package/skill/Workflows/AlphaUpload.md +4 -0
package/skill/Workflows/Composability.md +64 -0
package/skill/Workflows/Contribute.md +6 -3
package/skill/Workflows/Contributions.md +97 -0
package/skill/Workflows/CreatorContributions.md +74 -0
package/skill/Workflows/Dashboard.md +31 -0
package/skill/Workflows/Evals.md +57 -8
package/skill/Workflows/Evolve.md +31 -13
package/skill/Workflows/ExportCanonical.md +121 -0
package/skill/Workflows/Hook.md +131 -0
package/skill/Workflows/Ingest.md +7 -0
package/skill/Workflows/Initialize.md +29 -9
package/skill/Workflows/Orchestrate.md +27 -5
package/skill/Workflows/Quickstart.md +94 -0
package/skill/Workflows/Recover.md +84 -0
package/skill/Workflows/RepairSkillUsage.md +95 -0
package/skill/Workflows/Sync.md +18 -12
package/skill/Workflows/Uninstall.md +82 -0
package/skill/settings_snippet.json +11 -0
package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12

package/cli/selftune/eval/hooks-to-evals.ts CHANGED

@@ -2,7 +2,8 @@
 /**
  * hooks-to-evals.ts
  *
- * Converts hook logs into trigger eval sets compatible with run_eval / run_loop.
+ * Converts hook logs into trigger eval sets compatible with the current
+ * eval-generate -> evolve --dry-run validation loop.
  *
  * Default read path is SQLite (via localdb/queries). JSONL fallback is used only
  * when custom --skill-log / --query-log / --telemetry-log paths are supplied
@@ -43,6 +44,13 @@ import {
   filterActionableSkillUsageRecords,
 } from "../utils/query-filter.js";
 import { seededShuffle } from "../utils/seeded-random.js";
+import {
+  escapeRegExp,
+  findInstalledSkillNames,
+  findInstalledSkillPath,
+  findRepositoryClaudeSkillDirs,
+  findRepositorySkillDirs,
+} from "../utils/skill-discovery.js";
 import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
 import { generateSyntheticEvals } from "./synthetic-evals.js";
@@ -78,14 +86,14 @@ export function classifyInvocation(query: string, skillName: string): Invocation
   // Handle hyphenated skill names: check if all parts appear
   if (skillLower.includes("-")) {
     const parts = skillLower.split("-");
-    if (parts.every((part) => qLower.includes(part))) {
+    if (parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query))) {
       return "explicit";
     }
   }
   // Convert skill-name to camelCase and check
   const camelCase = skillLower.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
-  if (camelCase !== skillLower && qLower.includes(camelCase)) {
+  if (camelCase !== skillLower && qLower.includes(camelCase.toLowerCase())) {
     return "explicit";
   }
@@ -207,6 +215,78 @@ export function buildEvalSet(
   return [...shuffledPositives, ...negatives];
 }
+// ---------------------------------------------------------------------------
+// Installed skill discovery / readiness
+// ---------------------------------------------------------------------------
+export interface EvalSkillReadiness {
+  name: string;
+  trusted_trigger_count: number;
+  raw_trigger_count: number;
+  trusted_session_count: number;
+  raw_session_count: number;
+  installed: boolean;
+  skill_path?: string;
+  readiness: "log_ready" | "cold_start_ready" | "telemetry_only";
+}
+function getEvalSkillSearchDirs(): string[] {
+  const cwd = process.cwd();
+  const homeDir = process.env.HOME ?? "";
+  const codexHome = process.env.CODEX_HOME ?? `${homeDir}/.codex`;
+  return [
+    ...findRepositorySkillDirs(cwd),
+    ...findRepositoryClaudeSkillDirs(cwd),
+    `${homeDir}/.agents/skills`,
+    `${homeDir}/.claude/skills`,
+    `${codexHome}/skills`,
+  ];
+}
+export function listEvalSkillReadiness(
+  skillRecords: SkillUsageRecord[],
+  searchDirs: string[] = getEvalSkillSearchDirs(),
+): EvalSkillReadiness[] {
+  const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
+  const rawTriggerCounts = new Map<string, number>();
+  const rawSessionCounts = new Map<string, Set<string>>();
+  const trustedTriggerCounts = new Map<string, number>();
+  const trustedSessionCounts = new Map<string, Set<string>>();
+  for (const r of actionableSkillRecords) {
+    const name = r.skill_name ?? "unknown";
+    rawTriggerCounts.set(name, (rawTriggerCounts.get(name) ?? 0) + 1);
+    if (!rawSessionCounts.has(name)) rawSessionCounts.set(name, new Set<string>());
+    if (r.session_id) rawSessionCounts.get(name)?.add(r.session_id);
+    if (!isHighConfidencePositiveSkillRecord(r, name)) continue;
+    trustedTriggerCounts.set(name, (trustedTriggerCounts.get(name) ?? 0) + 1);
+    if (!trustedSessionCounts.has(name)) trustedSessionCounts.set(name, new Set<string>());
+    if (r.session_id) trustedSessionCounts.get(name)?.add(r.session_id);
+  }
+  const installedNames = findInstalledSkillNames(searchDirs);
+  const allNames = new Set<string>([...rawTriggerCounts.keys(), ...installedNames]);
+  return [...allNames]
+    .sort((a, b) => a.localeCompare(b))
+    .map((name) => {
+      const trustedTriggerCount = trustedTriggerCounts.get(name) ?? 0;
+      const rawTriggerCount = rawTriggerCounts.get(name) ?? 0;
+      const installed = installedNames.has(name);
+      return {
+        name,
+        trusted_trigger_count: trustedTriggerCount,
+        raw_trigger_count: rawTriggerCount,
+        trusted_session_count: trustedSessionCounts.get(name)?.size ?? 0,
+        raw_session_count: rawSessionCounts.get(name)?.size ?? 0,
+        installed,
+        skill_path: installed ? findInstalledSkillPath(name, searchDirs) : undefined,
+        readiness:
+          trustedTriggerCount > 0 ? "log_ready" : installed ? "cold_start_ready" : "telemetry_only",
+      } satisfies EvalSkillReadiness;
+    });
+}
 // ---------------------------------------------------------------------------
 // List skills
 // ---------------------------------------------------------------------------
@@ -216,24 +296,37 @@ export function listSkills(
   queryRecords: QueryLogRecord[],
   telemetryRecords: SessionTelemetryRecord[],
 ): void {
-  const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
   const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
-  const counts = new Map<string, number>();
-  for (const r of actionableSkillRecords) {
-    const name = r.skill_name ?? "unknown";
-    counts.set(name, (counts.get(name) ?? 0) + 1);
-  }
-  console.log(
-    `Skill triggers in skill_usage_log (${actionableSkillRecords.length} actionable records):`,
-  );
-  if (counts.size > 0) {
-    const sorted = [...counts.entries()].sort((a, b) => b[1] - a[1]);
-    for (const [name, count] of sorted) {
-      console.log(`  ${name.padEnd(30)}  ${String(count).padStart(4)} triggers`);
+  const readiness = listEvalSkillReadiness(skillRecords);
+  console.log(`Skills with eval readiness (${readiness.length} total):`);
+  if (readiness.length > 0) {
+    for (const skill of readiness) {
+      const readinessLabel =
+        skill.readiness === "log_ready"
+          ? "log-ready"
+          : skill.readiness === "cold_start_ready"
+            ? "cold-start"
+            : "telemetry-only";
+      const installLabel = skill.installed ? "installed" : "not installed";
+      const trustedLabel = `${String(skill.trusted_trigger_count).padStart(3)} trusted`;
+      const rawLabel =
+        skill.raw_trigger_count !== skill.trusted_trigger_count
+          ? ` / ${String(skill.raw_trigger_count).padStart(3)} raw`
+          : "";
+      console.log(
+        `  ${skill.name.padEnd(30)}  ${trustedLabel}${rawLabel}  ${String(skill.trusted_session_count).padStart(3)} trusted sessions  ${readinessLabel} / ${installLabel}`,
+      );
     }
+    console.log("");
+    console.log("Legend:");
+    console.log("  log-ready    real triggers exist; run eval generate normally");
+    console.log(
+      "  cold-start   installed locally but no trusted triggers yet; use --auto-synthetic",
+    );
+    console.log("  telemetry-only  trigger data exists but local SKILL.md was not found");
   } else {
-    console.log("  (none yet -- trigger some skills in Claude Code to populate)");
+    console.log("  (none yet -- install skills or sync source data first)");
   }
   console.log(`\nActionable queries in all_queries_log: ${actionableQueryRecords.length}`);
@@ -370,15 +463,25 @@ export function printEvalStats(
   }
   console.log("Next steps:");
-  console.log("  bun run cli/selftune/eval/run-eval.ts \\");
+  console.log(`  selftune evolve --skill ${skillName} \\`);
+  console.log(`    --skill-path /path/to/skills/${skillName}/SKILL.md \\`);
   console.log(`    --eval-set ${outputPath} \\`);
-  console.log(`    --skill-path /path/to/skills/${skillName} \\`);
-  console.log("    --runs-per-query 3 --verbose");
+  console.log("    --dry-run --verbose");
   console.log();
-  console.log("  bun run cli/selftune/eval/run-loop.ts \\");
-  console.log(`    --eval-set ${outputPath} \\`);
-  console.log(`    --skill-path /path/to/skills/${skillName} \\`);
-  console.log("    --max-iterations 5 --verbose");
+  console.log(`  selftune evolve --skill ${skillName} \\`);
+  console.log(`    --skill-path /path/to/skills/${skillName}/SKILL.md \\`);
+  console.log(`    --eval-set ${outputPath}`);
+}
+function printSyntheticFallbackHint(skillName: string, skillPath: string): void {
+  console.log("");
+  console.log(`[TIP] No trusted trigger data found yet for '${skillName}'.`);
+  console.log(
+    "      This skill is installed locally, so you can still generate a cold-start eval set:",
+  );
+  console.log(
+    `      selftune eval generate --skill ${skillName} --auto-synthetic --skill-path ${skillPath}`,
+  );
 }
 // ---------------------------------------------------------------------------
@@ -401,6 +504,7 @@ export async function cliMain(): Promise<void> {
       "query-log": { type: "string", default: QUERY_LOG },
       "telemetry-log": { type: "string", default: TELEMETRY_LOG },
       synthetic: { type: "boolean", default: false },
+      "auto-synthetic": { type: "boolean", default: false },
       "skill-path": { type: "string" },
       model: { type: "string" },
     },
@@ -466,10 +570,10 @@ export async function cliMain(): Promise<void> {
     }
     console.log("\nNext steps:");
-    console.log("  bun run cli/selftune/eval/run-eval.ts \\");
-    console.log(`    --eval-set ${outputPath} \\`);
+    console.log(`  selftune evolve --skill ${values.skill} \\`);
     console.log(`    --skill-path ${values["skill-path"]} \\`);
-    console.log("    --runs-per-query 3 --verbose");
+    console.log(`    --eval-set ${outputPath} \\`);
+    console.log("    --dry-run --verbose");
     return;
   }
@@ -504,6 +608,8 @@ export async function cliMain(): Promise<void> {
   const maxPerSide = Number.parseInt(values.max ?? "50", 10);
   const seed = Number.parseInt(values.seed ?? "42", 10);
   const annotateTaxonomy = !values["no-taxonomy"];
+  const searchDirs = getEvalSkillSearchDirs();
+  const detectedSkillPath = findInstalledSkillPath(values.skill, searchDirs);
   const evalSet = buildEvalSet(
     skillRecords,
@@ -515,9 +621,57 @@ export async function cliMain(): Promise<void> {
     annotateTaxonomy,
   );
+  const positiveCount = evalSet.filter((entry) => entry.should_trigger).length;
+  if (positiveCount === 0 && values["auto-synthetic"]) {
+    const skillPath = values["skill-path"] ?? detectedSkillPath;
+    if (!skillPath) {
+      throw new CLIError(
+        `No trusted triggers found for '${values.skill}', and no SKILL.md path could be resolved for synthetic fallback.`,
+        "FILE_NOT_FOUND",
+        `Run 'selftune eval generate --list-skills' or rerun with --skill-path /path/to/SKILL.md`,
+      );
+    }
+    const agent = detectAgent();
+    if (!agent) {
+      throw new CLIError(
+        "No agent CLI found (claude/codex/opencode)",
+        "AGENT_NOT_FOUND",
+        "Install one of the supported agent CLIs",
+      );
+    }
+    console.log(
+      `No trusted triggers found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
+    );
+    const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
+    const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
+      maxPositives: effectiveMax,
+      maxNegatives: effectiveMax,
+      modelFlag: values.model,
+    });
+    const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
+    writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
+    const pos = syntheticEvalSet.filter((e) => e.should_trigger);
+    const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
+    console.log(`Wrote ${syntheticEvalSet.length} synthetic eval entries to ${outputPath}`);
+    console.log(`  Positives (should_trigger=true) : ${pos.length}`);
+    console.log(`  Negatives (should_trigger=false): ${neg.length}`);
+    console.log("\nNext steps:");
+    console.log(`  selftune evolve --skill ${values.skill} \\`);
+    console.log(`    --skill-path ${skillPath} \\`);
+    console.log(`    --eval-set ${outputPath} \\`);
+    console.log("    --dry-run --verbose");
+    return;
+  }
   const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
   writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
   printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
+  if (positiveCount === 0 && detectedSkillPath) {
+    printSyntheticFallbackHint(values.skill, detectedSkillPath);
+  }
 }
 if (import.meta.main) {

package/cli/selftune/eval/synthetic-evals.ts CHANGED

@@ -10,6 +10,7 @@ import { readFileSync } from "node:fs";
 import type { EvalEntry, InvocationType } from "../types.js";
 import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
+import { findInstalledSkillNames } from "../utils/skill-discovery.js";
 import { classifyInvocation } from "./hooks-to-evals.js";
 // ---------------------------------------------------------------------------
@@ -28,6 +29,181 @@ interface RawSyntheticEntry {
   invocation_type?: string;
 }
+interface SyntheticPromptRealExamples {
+  positive: string[];
+  negative: string[];
+}
+interface PromptFamilyTargets {
+  explicitCount: number;
+  implicitCount: number;
+  contextualCount: number;
+  siblingNegativeCount: number;
+  adjacentNegativeCount: number;
+  unrelatedNegativeCount: number;
+}
+function getSyntheticSkillSearchDirs(): string[] {
+  const cwd = process.cwd();
+  const homeDir = process.env.HOME ?? "";
+  const codexHome = process.env.CODEX_HOME ?? `${homeDir}/.codex`;
+  return [
+    `${cwd}/.agents/skills`,
+    `${cwd}/.claude/skills`,
+    `${homeDir}/.agents/skills`,
+    `${homeDir}/.claude/skills`,
+    `${codexHome}/skills`,
+  ];
+}
+function inferSiblingSkills(
+  skillName: string,
+  searchDirs: string[] = getSyntheticSkillSearchDirs(),
+): string[] {
+  const normalized = skillName.trim().toLowerCase();
+  if (!normalized) return [];
+  const familyPrefix = normalized.includes("-") ? normalized.split("-")[0] : "";
+  const installedNames = [...findInstalledSkillNames(searchDirs)];
+  const sameFamily = installedNames
+    .filter((name) => name.toLowerCase() !== normalized)
+    .filter((name) => familyPrefix && name.toLowerCase().startsWith(`${familyPrefix}-`))
+    .sort((a, b) => a.localeCompare(b));
+  if (sameFamily.length >= 5) return sameFamily.slice(0, 5);
+  const adjacent = installedNames
+    .filter((name) => name.toLowerCase() !== normalized)
+    .filter((name) => !sameFamily.includes(name))
+    .sort((a, b) => a.localeCompare(b));
+  return [...sameFamily, ...adjacent].slice(0, 5);
+}
+function buildPromptFamilyTargets(
+  maxPositives: number,
+  maxNegatives: number,
+  hasSiblingSkills: boolean,
+): PromptFamilyTargets {
+  const explicitCount = Math.max(1, Math.round(maxPositives * 0.2));
+  const contextualCount = Math.max(1, Math.round(maxPositives * 0.4));
+  const implicitCount = Math.max(1, maxPositives - explicitCount - contextualCount);
+  const siblingNegativeCount =
+    hasSiblingSkills && maxNegatives > 0 ? Math.max(1, Math.round(maxNegatives * 0.4)) : 0;
+  const adjacentNegativeCount = Math.max(
+    1,
+    maxNegatives - siblingNegativeCount - Math.max(1, Math.round(maxNegatives * 0.2)),
+  );
+  const unrelatedNegativeCount = Math.max(
+    1,
+    maxNegatives - siblingNegativeCount - adjacentNegativeCount,
+  );
+  return {
+    explicitCount,
+    implicitCount,
+    contextualCount,
+    siblingNegativeCount,
+    adjacentNegativeCount,
+    unrelatedNegativeCount,
+  };
+}
+function normalizeEvalQuery(query: string): string {
+  return query.trim().toLowerCase().replace(/\s+/g, " ");
+}
+function dedupeEvalEntries(entries: EvalEntry[]): EvalEntry[] {
+  const seen = new Set<string>();
+  const deduped: EvalEntry[] = [];
+  for (const entry of entries) {
+    const key = `${entry.should_trigger ? "p" : "n"}:${normalizeEvalQuery(entry.query)}`;
+    if (seen.has(key)) continue;
+    seen.add(key);
+    deduped.push(entry);
+  }
+  return deduped;
+}
+function takeEntries(entries: EvalEntry[], count: number): EvalEntry[] {
+  if (count <= 0) return [];
+  return entries.slice(0, count);
+}
+export function selectBalancedEvalEntries(
+  entries: EvalEntry[],
+  maxPositives: number,
+  maxNegatives: number,
+  siblingSkills: string[] | boolean,
+): EvalEntry[] {
+  const normalizedSiblingSkills = Array.isArray(siblingSkills)
+    ? siblingSkills.map((skill) => skill.trim().toLowerCase()).filter(Boolean)
+    : [];
+  const hasSiblingSkills = normalizedSiblingSkills.length > 0;
+  const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, hasSiblingSkills);
+  const positives = entries.filter((entry) => entry.should_trigger);
+  const negatives = entries.filter((entry) => !entry.should_trigger);
+  const explicit = positives.filter((entry) => entry.invocation_type === "explicit");
+  const implicit = positives.filter((entry) => entry.invocation_type === "implicit");
+  const contextual = positives.filter((entry) => entry.invocation_type === "contextual");
+  const remainingPositive = positives.filter(
+    (entry) => !["explicit", "implicit", "contextual"].includes(entry.invocation_type ?? ""),
+  );
+  const selectedPositives = [
+    ...takeEntries(explicit, targets.explicitCount),
+    ...takeEntries(implicit, targets.implicitCount),
+    ...takeEntries(contextual, targets.contextualCount),
+  ];
+  const selectedPositiveKeys = new Set(
+    selectedPositives.map((entry) => normalizeEvalQuery(entry.query)),
+  );
+  for (const entry of [...positives, ...remainingPositive]) {
+    if (selectedPositives.length >= maxPositives) break;
+    const key = normalizeEvalQuery(entry.query);
+    if (selectedPositiveKeys.has(key)) continue;
+    selectedPositiveKeys.add(key);
+    selectedPositives.push(entry);
+  }
+  const siblingMentions = hasSiblingSkills
+    ? negatives.filter((entry) => {
+        const normalizedQuery = entry.query.toLowerCase();
+        return normalizedSiblingSkills.some((skill) => normalizedQuery.includes(skill));
+      })
+    : siblingSkills === true
+      ? negatives.filter((entry) =>
+          /(^|[\s/$-])(sc-[a-z0-9-]+|mentor cli|State Change mentor CLI|resource\s+\d+|mental model)/i.test(
+            entry.query,
+          ),
+        )
+      : [];
+  const nonSiblingNegatives = negatives.filter((entry) => !siblingMentions.includes(entry));
+  const selectedNegatives = [
+    ...takeEntries(siblingMentions, targets.siblingNegativeCount),
+    ...takeEntries(
+      nonSiblingNegatives,
+      maxNegatives - Math.min(targets.siblingNegativeCount, siblingMentions.length),
+    ),
+  ];
+  const selectedNegativeKeys = new Set(
+    selectedNegatives.map((entry) => normalizeEvalQuery(entry.query)),
+  );
+  for (const entry of negatives) {
+    if (selectedNegatives.length >= maxNegatives) break;
+    const key = normalizeEvalQuery(entry.query);
+    if (selectedNegativeKeys.has(key)) continue;
+    selectedNegativeKeys.add(key);
+    selectedNegatives.push(entry);
+  }
+  return [...selectedPositives.slice(0, maxPositives), ...selectedNegatives.slice(0, maxNegatives)];
+}
 // ---------------------------------------------------------------------------
 // Prompt building
 // ---------------------------------------------------------------------------
@@ -37,21 +213,38 @@ export function buildSyntheticPrompt(
   skillName: string,
   maxPositives: number,
   maxNegatives: number,
-  realExamples?: { positive: string[]; negative: string[] },
+  realExamples?: SyntheticPromptRealExamples,
+  siblingSkills: string[] = [],
 ): { system: string; user: string } {
+  const {
+    explicitCount,
+    implicitCount,
+    contextualCount,
+    siblingNegativeCount,
+    adjacentNegativeCount,
+    unrelatedNegativeCount,
+  } = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
   const system = `You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries.
+Your job is to create a SMALL, TARGETED benchmark for cold-start routing quality.
 For POSITIVE queries (should trigger this skill):
-- Generate a mix of:
+- Generate a balanced mix of:
   - Explicit: directly names the skill or uses $${skillName} syntax
   - Implicit: describes the task without naming the skill
-  - Contextual: natural language with domain context, proper nouns, dates, filenames
-- Vary phrasing, formality, and specificity
+  - Contextual: realistic natural language with domain context, proper nouns, filenames, or setup noise
+- Avoid merely paraphrasing bullet points from the skill
+- Prefer realistic user phrasing over polished product copy
+- Include at least a few prompts that test the edge of the skill's scope, not just the obvious center
 For NEGATIVE queries (should NOT trigger this skill):
-- Queries that are topically adjacent but wrong intent
-- Queries for different skills that share keywords
-- Generic queries unrelated to this skill
+- Include hard negative controls:
+  - sibling-skill confusion cases
+  - topically adjacent but wrong-intent cases
+  - clearly unrelated cases
+- Make the hard negatives plausible, not cartoonishly unrelated
+- If a query belongs to another installed skill, make that obvious from the task itself
 Output as JSON array with no surrounding text:
 [{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]`;
@@ -61,7 +254,19 @@ Output as JSON array with no surrounding text:
 Skill content:
 ${skillContent}
-Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false). Return ONLY the JSON array.`;
+Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false).
+Required positive mix:
+- ${explicitCount} explicit
+- ${implicitCount} implicit
+- ${contextualCount} contextual
+Required negative mix:
+- ${siblingNegativeCount} sibling-skill confusion cases
+- ${adjacentNegativeCount} adjacent but wrong-intent cases
+- ${unrelatedNegativeCount} clearly unrelated cases
+Return ONLY the JSON array.`;
   if (realExamples && (realExamples.positive.length > 0 || realExamples.negative.length > 0)) {
     const parts: string[] = ["\n\nReal user queries for style and phrasing reference:"];
@@ -77,6 +282,61 @@ Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${m
     user += parts.join("\n");
   }
+  if (siblingSkills.length > 0) {
+    user += `\n\nNearby installed skills to use for boundary-setting hard negatives:\n${siblingSkills
+      .map((skill) => `- ${skill}`)
+      .join(
+        "\n",
+      )}\n\nAt least ${siblingNegativeCount} negative queries should clearly belong to one of these sibling skills instead of ${skillName}.`;
+  }
+  return { system, user };
+}
+export function buildSyntheticRefinementPrompt(
+  skillContent: string,
+  skillName: string,
+  candidates: EvalEntry[],
+  maxPositives: number,
+  maxNegatives: number,
+  siblingSkills: string[] = [],
+): { system: string; user: string } {
+  const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
+  const system = `You are refining a cold-start eval benchmark for a coding agent skill.
+Your job is to critique and prune a candidate pool into a SMALL, SHARP benchmark.
+For each candidate, reason using binary questions:
+- Is this realistic user phrasing?
+- Is this more than a trivial paraphrase of the skill bullets?
+- Does this clearly test in-scope behavior, or clearly test a boundary?
+- For negatives: does it clearly belong elsewhere or represent a plausible wrong-intent adjacent request?
+- Is it sufficiently distinct from the other selected prompts?
+Return ONLY a JSON array with the final benchmark.`;
+  const user = `Skill name: ${skillName}
+Skill content:
+${skillContent}
+Target final benchmark:
+- ${maxPositives} positives
+- ${maxNegatives} negatives
+- Positive mix: ${targets.explicitCount} explicit, ${targets.implicitCount} implicit, ${targets.contextualCount} contextual
+- Negative mix: ${targets.siblingNegativeCount} sibling-skill confusion, ${targets.adjacentNegativeCount} adjacent wrong-intent, ${targets.unrelatedNegativeCount} unrelated
+${siblingSkills.length > 0 ? `Sibling skills for hard-negative boundaries:\n${siblingSkills.map((skill) => `- ${skill}`).join("\n")}\n` : ""}
+Candidate pool:
+${JSON.stringify(candidates, null, 2)}
+Instructions:
+- Remove duplicates and near-duplicates
+- Prefer prompts that test trigger boundaries, not just center-of-mass obvious usage
+- Keep sibling-skill negatives if they are strong boundary tests
+- Keep the final set compact, diverse, and realistic
+- Return ONLY the final JSON array`;
   return { system, user };
 }
@@ -172,8 +432,10 @@ export async function generateSyntheticEvals(
 ): Promise<EvalEntry[]> {
   const maxPositives = options.maxPositives ?? 15;
   const maxNegatives = options.maxNegatives ?? 10;
+  const oversampleFactor = 2;
   const skillContent = readFileSync(skillPath, "utf-8");
+  const siblingSkills = inferSiblingSkills(skillName);
   // Load real query examples from the database for few-shot style guidance.
   // Uses dynamic imports since SQLite may not be available in all contexts.
@@ -214,11 +476,36 @@ export async function generateSyntheticEvals(
   const { system, user } = buildSyntheticPrompt(
     skillContent,
     skillName,
-    maxPositives,
-    maxNegatives,
+    maxPositives * oversampleFactor,
+    maxNegatives * oversampleFactor,
     realExamples,
+    siblingSkills,
   );
   const raw = await callLlm(system, user, agent, options.modelFlag);
-  return parseSyntheticResponse(raw, skillName);
+  const firstPass = dedupeEvalEntries(parseSyntheticResponse(raw, skillName));
+  try {
+    const refinement = buildSyntheticRefinementPrompt(
+      skillContent,
+      skillName,
+      firstPass,
+      maxPositives,
+      maxNegatives,
+      siblingSkills,
+    );
+    const refinedRaw = await callLlm(refinement.system, refinement.user, agent, options.modelFlag);
+    const refined = dedupeEvalEntries(parseSyntheticResponse(refinedRaw, skillName));
+    const selected = selectBalancedEvalEntries(refined, maxPositives, maxNegatives, siblingSkills);
+    if (
+      selected.filter((entry) => entry.should_trigger).length >= maxPositives &&
+      selected.filter((entry) => !entry.should_trigger).length >= maxNegatives
+    ) {
+      return selected;
+    }
+  } catch {
+    // fall through to first-pass selection
+  }
+  return selectBalancedEvalEntries(firstPass, maxPositives, maxNegatives, siblingSkills);
 }