selftune 0.2.18 → 0.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77):
  1. package/README.md +9 -4
  2. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +60 -0
  3. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
  5. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
  6. package/apps/local-dashboard/dist/index.html +5 -5
  7. package/cli/selftune/alpha-upload/stage-canonical.ts +7 -6
  8. package/cli/selftune/constants.ts +10 -0
  9. package/cli/selftune/contribute/contribute.ts +30 -2
  10. package/cli/selftune/contribution-config.ts +249 -0
  11. package/cli/selftune/contribution-relay.ts +177 -0
  12. package/cli/selftune/contribution-signals.ts +219 -0
  13. package/cli/selftune/contribution-staging.ts +147 -0
  14. package/cli/selftune/contributions.ts +532 -0
  15. package/cli/selftune/creator-contributions.ts +333 -0
  16. package/cli/selftune/dashboard-contract.ts +209 -1
  17. package/cli/selftune/dashboard-server.ts +45 -11
  18. package/cli/selftune/eval/family-overlap.ts +714 -0
  19. package/cli/selftune/eval/hooks-to-evals.ts +182 -28
  20. package/cli/selftune/eval/synthetic-evals.ts +298 -11
  21. package/cli/selftune/evolution/evidence.ts +5 -0
  22. package/cli/selftune/evolution/evolve-body.ts +62 -2
  23. package/cli/selftune/evolution/evolve.ts +58 -1
  24. package/cli/selftune/evolution/validate-body.ts +10 -0
  25. package/cli/selftune/evolution/validate-host-replay.ts +236 -0
  26. package/cli/selftune/evolution/validate-proposal.ts +10 -0
  27. package/cli/selftune/evolution/validate-routing.ts +112 -5
  28. package/cli/selftune/export.ts +2 -2
  29. package/cli/selftune/index.ts +41 -5
  30. package/cli/selftune/ingestors/codex-rollout.ts +31 -35
  31. package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
  32. package/cli/selftune/localdb/db.ts +2 -2
  33. package/cli/selftune/localdb/direct-write.ts +8 -3
  34. package/cli/selftune/localdb/materialize.ts +7 -2
  35. package/cli/selftune/localdb/queries.ts +712 -31
  36. package/cli/selftune/localdb/schema.ts +30 -1
  37. package/cli/selftune/recover.ts +153 -0
  38. package/cli/selftune/repair/skill-usage.ts +363 -4
  39. package/cli/selftune/routes/actions.ts +35 -1
  40. package/cli/selftune/routes/analytics.ts +14 -0
  41. package/cli/selftune/routes/index.ts +1 -0
  42. package/cli/selftune/routes/overview.ts +112 -4
  43. package/cli/selftune/routes/skill-report.ts +575 -11
  44. package/cli/selftune/status.ts +81 -2
  45. package/cli/selftune/sync.ts +56 -2
  46. package/cli/selftune/trust-model.ts +66 -0
  47. package/cli/selftune/types.ts +103 -0
  48. package/cli/selftune/utils/skill-detection.ts +43 -0
  49. package/cli/selftune/utils/text-similarity.ts +73 -0
  50. package/cli/selftune/watchlist.ts +65 -0
  51. package/package.json +1 -1
  52. package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
  53. package/packages/ui/src/components/EvidenceViewer.tsx +419 -145
  54. package/packages/ui/src/components/EvolutionTimeline.tsx +81 -29
  55. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
  56. package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
  57. package/packages/ui/src/components/section-cards.tsx +12 -9
  58. package/packages/ui/src/primitives/card.tsx +1 -1
  59. package/packages/ui/src/types.ts +4 -0
  60. package/skill/SKILL.md +11 -1
  61. package/skill/Workflows/AlphaUpload.md +4 -0
  62. package/skill/Workflows/Composability.md +78 -0
  63. package/skill/Workflows/Contribute.md +6 -3
  64. package/skill/Workflows/Contributions.md +97 -0
  65. package/skill/Workflows/CreatorContributions.md +74 -0
  66. package/skill/Workflows/Dashboard.md +31 -0
  67. package/skill/Workflows/Evals.md +57 -8
  68. package/skill/Workflows/Evolve.md +23 -0
  69. package/skill/Workflows/Ingest.md +7 -0
  70. package/skill/Workflows/Initialize.md +20 -1
  71. package/skill/Workflows/Recover.md +84 -0
  72. package/skill/Workflows/RepairSkillUsage.md +12 -4
  73. package/skill/Workflows/Sync.md +18 -12
  74. package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
  75. package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
  76. package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
  77. package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
@@ -2,7 +2,8 @@
2
2
  /**
3
3
  * hooks-to-evals.ts
4
4
  *
5
- * Converts hook logs into trigger eval sets compatible with run_eval / run_loop.
5
+ * Converts hook logs into trigger eval sets compatible with the current
6
+ * eval-generate -> evolve --dry-run validation loop.
6
7
  *
7
8
  * Default read path is SQLite (via localdb/queries). JSONL fallback is used only
8
9
  * when custom --skill-log / --query-log / --telemetry-log paths are supplied
@@ -43,6 +44,13 @@ import {
43
44
  filterActionableSkillUsageRecords,
44
45
  } from "../utils/query-filter.js";
45
46
  import { seededShuffle } from "../utils/seeded-random.js";
47
+ import {
48
+ escapeRegExp,
49
+ findInstalledSkillNames,
50
+ findInstalledSkillPath,
51
+ findRepositoryClaudeSkillDirs,
52
+ findRepositorySkillDirs,
53
+ } from "../utils/skill-discovery.js";
46
54
  import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
47
55
  import { generateSyntheticEvals } from "./synthetic-evals.js";
48
56
 
@@ -78,14 +86,14 @@ export function classifyInvocation(query: string, skillName: string): Invocation
78
86
  // Handle hyphenated skill names: check if all parts appear
79
87
  if (skillLower.includes("-")) {
80
88
  const parts = skillLower.split("-");
81
- if (parts.every((part) => qLower.includes(part))) {
89
+ if (parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query))) {
82
90
  return "explicit";
83
91
  }
84
92
  }
85
93
 
86
94
  // Convert skill-name to camelCase and check
87
95
  const camelCase = skillLower.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
88
- if (camelCase !== skillLower && qLower.includes(camelCase)) {
96
+ if (camelCase !== skillLower && qLower.includes(camelCase.toLowerCase())) {
89
97
  return "explicit";
90
98
  }
91
99
 
@@ -207,6 +215,78 @@ export function buildEvalSet(
207
215
  return [...shuffledPositives, ...negatives];
208
216
  }
209
217
 
218
+ // ---------------------------------------------------------------------------
219
+ // Installed skill discovery / readiness
220
+ // ---------------------------------------------------------------------------
221
+
222
+ export interface EvalSkillReadiness {
223
+ name: string;
224
+ trusted_trigger_count: number;
225
+ raw_trigger_count: number;
226
+ trusted_session_count: number;
227
+ raw_session_count: number;
228
+ installed: boolean;
229
+ skill_path?: string;
230
+ readiness: "log_ready" | "cold_start_ready" | "telemetry_only";
231
+ }
232
+
233
+ function getEvalSkillSearchDirs(): string[] {
234
+ const cwd = process.cwd();
235
+ const homeDir = process.env.HOME ?? "";
236
+ const codexHome = process.env.CODEX_HOME ?? `${homeDir}/.codex`;
237
+ return [
238
+ ...findRepositorySkillDirs(cwd),
239
+ ...findRepositoryClaudeSkillDirs(cwd),
240
+ `${homeDir}/.agents/skills`,
241
+ `${homeDir}/.claude/skills`,
242
+ `${codexHome}/skills`,
243
+ ];
244
+ }
245
+
246
+ export function listEvalSkillReadiness(
247
+ skillRecords: SkillUsageRecord[],
248
+ searchDirs: string[] = getEvalSkillSearchDirs(),
249
+ ): EvalSkillReadiness[] {
250
+ const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
251
+ const rawTriggerCounts = new Map<string, number>();
252
+ const rawSessionCounts = new Map<string, Set<string>>();
253
+ const trustedTriggerCounts = new Map<string, number>();
254
+ const trustedSessionCounts = new Map<string, Set<string>>();
255
+ for (const r of actionableSkillRecords) {
256
+ const name = r.skill_name ?? "unknown";
257
+ rawTriggerCounts.set(name, (rawTriggerCounts.get(name) ?? 0) + 1);
258
+ if (!rawSessionCounts.has(name)) rawSessionCounts.set(name, new Set<string>());
259
+ if (r.session_id) rawSessionCounts.get(name)?.add(r.session_id);
260
+
261
+ if (!isHighConfidencePositiveSkillRecord(r, name)) continue;
262
+ trustedTriggerCounts.set(name, (trustedTriggerCounts.get(name) ?? 0) + 1);
263
+ if (!trustedSessionCounts.has(name)) trustedSessionCounts.set(name, new Set<string>());
264
+ if (r.session_id) trustedSessionCounts.get(name)?.add(r.session_id);
265
+ }
266
+
267
+ const installedNames = findInstalledSkillNames(searchDirs);
268
+ const allNames = new Set<string>([...rawTriggerCounts.keys(), ...installedNames]);
269
+
270
+ return [...allNames]
271
+ .sort((a, b) => a.localeCompare(b))
272
+ .map((name) => {
273
+ const trustedTriggerCount = trustedTriggerCounts.get(name) ?? 0;
274
+ const rawTriggerCount = rawTriggerCounts.get(name) ?? 0;
275
+ const installed = installedNames.has(name);
276
+ return {
277
+ name,
278
+ trusted_trigger_count: trustedTriggerCount,
279
+ raw_trigger_count: rawTriggerCount,
280
+ trusted_session_count: trustedSessionCounts.get(name)?.size ?? 0,
281
+ raw_session_count: rawSessionCounts.get(name)?.size ?? 0,
282
+ installed,
283
+ skill_path: installed ? findInstalledSkillPath(name, searchDirs) : undefined,
284
+ readiness:
285
+ trustedTriggerCount > 0 ? "log_ready" : installed ? "cold_start_ready" : "telemetry_only",
286
+ } satisfies EvalSkillReadiness;
287
+ });
288
+ }
289
+
210
290
  // ---------------------------------------------------------------------------
211
291
  // List skills
212
292
  // ---------------------------------------------------------------------------
@@ -216,24 +296,37 @@ export function listSkills(
216
296
  queryRecords: QueryLogRecord[],
217
297
  telemetryRecords: SessionTelemetryRecord[],
218
298
  ): void {
219
- const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
220
299
  const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
221
- const counts = new Map<string, number>();
222
- for (const r of actionableSkillRecords) {
223
- const name = r.skill_name ?? "unknown";
224
- counts.set(name, (counts.get(name) ?? 0) + 1);
225
- }
226
-
227
- console.log(
228
- `Skill triggers in skill_usage_log (${actionableSkillRecords.length} actionable records):`,
229
- );
230
- if (counts.size > 0) {
231
- const sorted = [...counts.entries()].sort((a, b) => b[1] - a[1]);
232
- for (const [name, count] of sorted) {
233
- console.log(` ${name.padEnd(30)} ${String(count).padStart(4)} triggers`);
300
+ const readiness = listEvalSkillReadiness(skillRecords);
301
+
302
+ console.log(`Skills with eval readiness (${readiness.length} total):`);
303
+ if (readiness.length > 0) {
304
+ for (const skill of readiness) {
305
+ const readinessLabel =
306
+ skill.readiness === "log_ready"
307
+ ? "log-ready"
308
+ : skill.readiness === "cold_start_ready"
309
+ ? "cold-start"
310
+ : "telemetry-only";
311
+ const installLabel = skill.installed ? "installed" : "not installed";
312
+ const trustedLabel = `${String(skill.trusted_trigger_count).padStart(3)} trusted`;
313
+ const rawLabel =
314
+ skill.raw_trigger_count !== skill.trusted_trigger_count
315
+ ? ` / ${String(skill.raw_trigger_count).padStart(3)} raw`
316
+ : "";
317
+ console.log(
318
+ ` ${skill.name.padEnd(30)} ${trustedLabel}${rawLabel} ${String(skill.trusted_session_count).padStart(3)} trusted sessions ${readinessLabel} / ${installLabel}`,
319
+ );
234
320
  }
321
+ console.log("");
322
+ console.log("Legend:");
323
+ console.log(" log-ready real triggers exist; run eval generate normally");
324
+ console.log(
325
+ " cold-start installed locally but no trusted triggers yet; use --auto-synthetic",
326
+ );
327
+ console.log(" telemetry-only trigger data exists but local SKILL.md was not found");
235
328
  } else {
236
- console.log(" (none yet -- trigger some skills in Claude Code to populate)");
329
+ console.log(" (none yet -- install skills or sync source data first)");
237
330
  }
238
331
 
239
332
  console.log(`\nActionable queries in all_queries_log: ${actionableQueryRecords.length}`);
@@ -370,15 +463,25 @@ export function printEvalStats(
370
463
  }
371
464
 
372
465
  console.log("Next steps:");
373
- console.log(" bun run cli/selftune/eval/run-eval.ts \\");
466
+ console.log(` selftune evolve --skill ${skillName} \\`);
467
+ console.log(` --skill-path /path/to/skills/${skillName}/SKILL.md \\`);
374
468
  console.log(` --eval-set ${outputPath} \\`);
375
- console.log(` --skill-path /path/to/skills/${skillName} \\`);
376
- console.log(" --runs-per-query 3 --verbose");
469
+ console.log(" --dry-run --verbose");
377
470
  console.log();
378
- console.log(" bun run cli/selftune/eval/run-loop.ts \\");
379
- console.log(` --eval-set ${outputPath} \\`);
380
- console.log(` --skill-path /path/to/skills/${skillName} \\`);
381
- console.log(" --max-iterations 5 --verbose");
471
+ console.log(` selftune evolve --skill ${skillName} \\`);
472
+ console.log(` --skill-path /path/to/skills/${skillName}/SKILL.md \\`);
473
+ console.log(` --eval-set ${outputPath}`);
474
+ }
475
+
476
+ function printSyntheticFallbackHint(skillName: string, skillPath: string): void {
477
+ console.log("");
478
+ console.log(`[TIP] No trusted trigger data found yet for '${skillName}'.`);
479
+ console.log(
480
+ " This skill is installed locally, so you can still generate a cold-start eval set:",
481
+ );
482
+ console.log(
483
+ ` selftune eval generate --skill ${skillName} --auto-synthetic --skill-path ${skillPath}`,
484
+ );
382
485
  }
383
486
 
384
487
  // ---------------------------------------------------------------------------
@@ -401,6 +504,7 @@ export async function cliMain(): Promise<void> {
401
504
  "query-log": { type: "string", default: QUERY_LOG },
402
505
  "telemetry-log": { type: "string", default: TELEMETRY_LOG },
403
506
  synthetic: { type: "boolean", default: false },
507
+ "auto-synthetic": { type: "boolean", default: false },
404
508
  "skill-path": { type: "string" },
405
509
  model: { type: "string" },
406
510
  },
@@ -466,10 +570,10 @@ export async function cliMain(): Promise<void> {
466
570
  }
467
571
 
468
572
  console.log("\nNext steps:");
469
- console.log(" bun run cli/selftune/eval/run-eval.ts \\");
470
- console.log(` --eval-set ${outputPath} \\`);
573
+ console.log(` selftune evolve --skill ${values.skill} \\`);
471
574
  console.log(` --skill-path ${values["skill-path"]} \\`);
472
- console.log(" --runs-per-query 3 --verbose");
575
+ console.log(` --eval-set ${outputPath} \\`);
576
+ console.log(" --dry-run --verbose");
473
577
  return;
474
578
  }
475
579
 
@@ -504,6 +608,8 @@ export async function cliMain(): Promise<void> {
504
608
  const maxPerSide = Number.parseInt(values.max ?? "50", 10);
505
609
  const seed = Number.parseInt(values.seed ?? "42", 10);
506
610
  const annotateTaxonomy = !values["no-taxonomy"];
611
+ const searchDirs = getEvalSkillSearchDirs();
612
+ const detectedSkillPath = findInstalledSkillPath(values.skill, searchDirs);
507
613
 
508
614
  const evalSet = buildEvalSet(
509
615
  skillRecords,
@@ -515,9 +621,57 @@ export async function cliMain(): Promise<void> {
515
621
  annotateTaxonomy,
516
622
  );
517
623
 
624
+ const positiveCount = evalSet.filter((entry) => entry.should_trigger).length;
625
+ if (positiveCount === 0 && values["auto-synthetic"]) {
626
+ const skillPath = values["skill-path"] ?? detectedSkillPath;
627
+ if (!skillPath) {
628
+ throw new CLIError(
629
+ `No trusted triggers found for '${values.skill}', and no SKILL.md path could be resolved for synthetic fallback.`,
630
+ "FILE_NOT_FOUND",
631
+ `Run 'selftune eval generate --list-skills' or rerun with --skill-path /path/to/SKILL.md`,
632
+ );
633
+ }
634
+
635
+ const agent = detectAgent();
636
+ if (!agent) {
637
+ throw new CLIError(
638
+ "No agent CLI found (claude/codex/opencode)",
639
+ "AGENT_NOT_FOUND",
640
+ "Install one of the supported agent CLIs",
641
+ );
642
+ }
643
+
644
+ console.log(
645
+ `No trusted triggers found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
646
+ );
647
+ const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
648
+ const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
649
+ maxPositives: effectiveMax,
650
+ maxNegatives: effectiveMax,
651
+ modelFlag: values.model,
652
+ });
653
+ const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
654
+ writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
655
+ const pos = syntheticEvalSet.filter((e) => e.should_trigger);
656
+ const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
657
+
658
+ console.log(`Wrote ${syntheticEvalSet.length} synthetic eval entries to ${outputPath}`);
659
+ console.log(` Positives (should_trigger=true) : ${pos.length}`);
660
+ console.log(` Negatives (should_trigger=false): ${neg.length}`);
661
+ console.log("\nNext steps:");
662
+ console.log(` selftune evolve --skill ${values.skill} \\`);
663
+ console.log(` --skill-path ${skillPath} \\`);
664
+ console.log(` --eval-set ${outputPath} \\`);
665
+ console.log(" --dry-run --verbose");
666
+ return;
667
+ }
668
+
518
669
  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
519
670
  writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
520
671
  printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
672
+ if (positiveCount === 0 && detectedSkillPath) {
673
+ printSyntheticFallbackHint(values.skill, detectedSkillPath);
674
+ }
521
675
  }
522
676
 
523
677
  if (import.meta.main) {
@@ -10,6 +10,7 @@ import { readFileSync } from "node:fs";
10
10
 
11
11
  import type { EvalEntry, InvocationType } from "../types.js";
12
12
  import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
13
+ import { findInstalledSkillNames } from "../utils/skill-discovery.js";
13
14
  import { classifyInvocation } from "./hooks-to-evals.js";
14
15
 
15
16
  // ---------------------------------------------------------------------------
@@ -28,6 +29,181 @@ interface RawSyntheticEntry {
28
29
  invocation_type?: string;
29
30
  }
30
31
 
32
+ interface SyntheticPromptRealExamples {
33
+ positive: string[];
34
+ negative: string[];
35
+ }
36
+
37
+ interface PromptFamilyTargets {
38
+ explicitCount: number;
39
+ implicitCount: number;
40
+ contextualCount: number;
41
+ siblingNegativeCount: number;
42
+ adjacentNegativeCount: number;
43
+ unrelatedNegativeCount: number;
44
+ }
45
+
46
+ function getSyntheticSkillSearchDirs(): string[] {
47
+ const cwd = process.cwd();
48
+ const homeDir = process.env.HOME ?? "";
49
+ const codexHome = process.env.CODEX_HOME ?? `${homeDir}/.codex`;
50
+ return [
51
+ `${cwd}/.agents/skills`,
52
+ `${cwd}/.claude/skills`,
53
+ `${homeDir}/.agents/skills`,
54
+ `${homeDir}/.claude/skills`,
55
+ `${codexHome}/skills`,
56
+ ];
57
+ }
58
+
59
+ function inferSiblingSkills(
60
+ skillName: string,
61
+ searchDirs: string[] = getSyntheticSkillSearchDirs(),
62
+ ): string[] {
63
+ const normalized = skillName.trim().toLowerCase();
64
+ if (!normalized) return [];
65
+
66
+ const familyPrefix = normalized.includes("-") ? normalized.split("-")[0] : "";
67
+ const installedNames = [...findInstalledSkillNames(searchDirs)];
68
+
69
+ const sameFamily = installedNames
70
+ .filter((name) => name.toLowerCase() !== normalized)
71
+ .filter((name) => familyPrefix && name.toLowerCase().startsWith(`${familyPrefix}-`))
72
+ .sort((a, b) => a.localeCompare(b));
73
+
74
+ if (sameFamily.length >= 5) return sameFamily.slice(0, 5);
75
+
76
+ const adjacent = installedNames
77
+ .filter((name) => name.toLowerCase() !== normalized)
78
+ .filter((name) => !sameFamily.includes(name))
79
+ .sort((a, b) => a.localeCompare(b));
80
+
81
+ return [...sameFamily, ...adjacent].slice(0, 5);
82
+ }
83
+
84
+ function buildPromptFamilyTargets(
85
+ maxPositives: number,
86
+ maxNegatives: number,
87
+ hasSiblingSkills: boolean,
88
+ ): PromptFamilyTargets {
89
+ const explicitCount = Math.max(1, Math.round(maxPositives * 0.2));
90
+ const contextualCount = Math.max(1, Math.round(maxPositives * 0.4));
91
+ const implicitCount = Math.max(1, maxPositives - explicitCount - contextualCount);
92
+
93
+ const siblingNegativeCount =
94
+ hasSiblingSkills && maxNegatives > 0 ? Math.max(1, Math.round(maxNegatives * 0.4)) : 0;
95
+ const adjacentNegativeCount = Math.max(
96
+ 1,
97
+ maxNegatives - siblingNegativeCount - Math.max(1, Math.round(maxNegatives * 0.2)),
98
+ );
99
+ const unrelatedNegativeCount = Math.max(
100
+ 1,
101
+ maxNegatives - siblingNegativeCount - adjacentNegativeCount,
102
+ );
103
+
104
+ return {
105
+ explicitCount,
106
+ implicitCount,
107
+ contextualCount,
108
+ siblingNegativeCount,
109
+ adjacentNegativeCount,
110
+ unrelatedNegativeCount,
111
+ };
112
+ }
113
+
114
+ function normalizeEvalQuery(query: string): string {
115
+ return query.trim().toLowerCase().replace(/\s+/g, " ");
116
+ }
117
+
118
+ function dedupeEvalEntries(entries: EvalEntry[]): EvalEntry[] {
119
+ const seen = new Set<string>();
120
+ const deduped: EvalEntry[] = [];
121
+ for (const entry of entries) {
122
+ const key = `${entry.should_trigger ? "p" : "n"}:${normalizeEvalQuery(entry.query)}`;
123
+ if (seen.has(key)) continue;
124
+ seen.add(key);
125
+ deduped.push(entry);
126
+ }
127
+ return deduped;
128
+ }
129
+
130
+ function takeEntries(entries: EvalEntry[], count: number): EvalEntry[] {
131
+ if (count <= 0) return [];
132
+ return entries.slice(0, count);
133
+ }
134
+
135
+ export function selectBalancedEvalEntries(
136
+ entries: EvalEntry[],
137
+ maxPositives: number,
138
+ maxNegatives: number,
139
+ siblingSkills: string[] | boolean,
140
+ ): EvalEntry[] {
141
+ const normalizedSiblingSkills = Array.isArray(siblingSkills)
142
+ ? siblingSkills.map((skill) => skill.trim().toLowerCase()).filter(Boolean)
143
+ : [];
144
+ const hasSiblingSkills = normalizedSiblingSkills.length > 0;
145
+ const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, hasSiblingSkills);
146
+ const positives = entries.filter((entry) => entry.should_trigger);
147
+ const negatives = entries.filter((entry) => !entry.should_trigger);
148
+
149
+ const explicit = positives.filter((entry) => entry.invocation_type === "explicit");
150
+ const implicit = positives.filter((entry) => entry.invocation_type === "implicit");
151
+ const contextual = positives.filter((entry) => entry.invocation_type === "contextual");
152
+ const remainingPositive = positives.filter(
153
+ (entry) => !["explicit", "implicit", "contextual"].includes(entry.invocation_type ?? ""),
154
+ );
155
+
156
+ const selectedPositives = [
157
+ ...takeEntries(explicit, targets.explicitCount),
158
+ ...takeEntries(implicit, targets.implicitCount),
159
+ ...takeEntries(contextual, targets.contextualCount),
160
+ ];
161
+ const selectedPositiveKeys = new Set(
162
+ selectedPositives.map((entry) => normalizeEvalQuery(entry.query)),
163
+ );
164
+ for (const entry of [...positives, ...remainingPositive]) {
165
+ if (selectedPositives.length >= maxPositives) break;
166
+ const key = normalizeEvalQuery(entry.query);
167
+ if (selectedPositiveKeys.has(key)) continue;
168
+ selectedPositiveKeys.add(key);
169
+ selectedPositives.push(entry);
170
+ }
171
+
172
+ const siblingMentions = hasSiblingSkills
173
+ ? negatives.filter((entry) => {
174
+ const normalizedQuery = entry.query.toLowerCase();
175
+ return normalizedSiblingSkills.some((skill) => normalizedQuery.includes(skill));
176
+ })
177
+ : siblingSkills === true
178
+ ? negatives.filter((entry) =>
179
+ /(^|[\s/$-])(sc-[a-z0-9-]+|mentor cli|State Change mentor CLI|resource\s+\d+|mental model)/i.test(
180
+ entry.query,
181
+ ),
182
+ )
183
+ : [];
184
+ const nonSiblingNegatives = negatives.filter((entry) => !siblingMentions.includes(entry));
185
+ const selectedNegatives = [
186
+ ...takeEntries(siblingMentions, targets.siblingNegativeCount),
187
+ ...takeEntries(
188
+ nonSiblingNegatives,
189
+ maxNegatives - Math.min(targets.siblingNegativeCount, siblingMentions.length),
190
+ ),
191
+ ];
192
+
193
+ const selectedNegativeKeys = new Set(
194
+ selectedNegatives.map((entry) => normalizeEvalQuery(entry.query)),
195
+ );
196
+ for (const entry of negatives) {
197
+ if (selectedNegatives.length >= maxNegatives) break;
198
+ const key = normalizeEvalQuery(entry.query);
199
+ if (selectedNegativeKeys.has(key)) continue;
200
+ selectedNegativeKeys.add(key);
201
+ selectedNegatives.push(entry);
202
+ }
203
+
204
+ return [...selectedPositives.slice(0, maxPositives), ...selectedNegatives.slice(0, maxNegatives)];
205
+ }
206
+
31
207
  // ---------------------------------------------------------------------------
32
208
  // Prompt building
33
209
  // ---------------------------------------------------------------------------
@@ -37,21 +213,38 @@ export function buildSyntheticPrompt(
37
213
  skillName: string,
38
214
  maxPositives: number,
39
215
  maxNegatives: number,
40
- realExamples?: { positive: string[]; negative: string[] },
216
+ realExamples?: SyntheticPromptRealExamples,
217
+ siblingSkills: string[] = [],
41
218
  ): { system: string; user: string } {
219
+ const {
220
+ explicitCount,
221
+ implicitCount,
222
+ contextualCount,
223
+ siblingNegativeCount,
224
+ adjacentNegativeCount,
225
+ unrelatedNegativeCount,
226
+ } = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
227
+
42
228
  const system = `You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries.
43
229
 
230
+ Your job is to create a SMALL, TARGETED benchmark for cold-start routing quality.
231
+
44
232
  For POSITIVE queries (should trigger this skill):
45
- - Generate a mix of:
233
+ - Generate a balanced mix of:
46
234
  - Explicit: directly names the skill or uses $${skillName} syntax
47
235
  - Implicit: describes the task without naming the skill
48
- - Contextual: natural language with domain context, proper nouns, dates, filenames
49
- - Vary phrasing, formality, and specificity
236
+ - Contextual: realistic natural language with domain context, proper nouns, filenames, or setup noise
237
+ - Avoid merely paraphrasing bullet points from the skill
238
+ - Prefer realistic user phrasing over polished product copy
239
+ - Include at least a few prompts that test the edge of the skill's scope, not just the obvious center
50
240
 
51
241
  For NEGATIVE queries (should NOT trigger this skill):
52
- - Queries that are topically adjacent but wrong intent
53
- - Queries for different skills that share keywords
54
- - Generic queries unrelated to this skill
242
+ - Include hard negative controls:
243
+ - sibling-skill confusion cases
244
+ - topically adjacent but wrong-intent cases
245
+ - clearly unrelated cases
246
+ - Make the hard negatives plausible, not cartoonishly unrelated
247
+ - If a query belongs to another installed skill, make that obvious from the task itself
55
248
 
56
249
  Output as JSON array with no surrounding text:
57
250
  [{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]`;
@@ -61,7 +254,19 @@ Output as JSON array with no surrounding text:
61
254
  Skill content:
62
255
  ${skillContent}
63
256
 
64
- Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false). Return ONLY the JSON array.`;
257
+ Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false).
258
+
259
+ Required positive mix:
260
+ - ${explicitCount} explicit
261
+ - ${implicitCount} implicit
262
+ - ${contextualCount} contextual
263
+
264
+ Required negative mix:
265
+ - ${siblingNegativeCount} sibling-skill confusion cases
266
+ - ${adjacentNegativeCount} adjacent but wrong-intent cases
267
+ - ${unrelatedNegativeCount} clearly unrelated cases
268
+
269
+ Return ONLY the JSON array.`;
65
270
 
66
271
  if (realExamples && (realExamples.positive.length > 0 || realExamples.negative.length > 0)) {
67
272
  const parts: string[] = ["\n\nReal user queries for style and phrasing reference:"];
@@ -77,6 +282,61 @@ Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${m
77
282
  user += parts.join("\n");
78
283
  }
79
284
 
285
+ if (siblingSkills.length > 0) {
286
+ user += `\n\nNearby installed skills to use for boundary-setting hard negatives:\n${siblingSkills
287
+ .map((skill) => `- ${skill}`)
288
+ .join(
289
+ "\n",
290
+ )}\n\nAt least ${siblingNegativeCount} negative queries should clearly belong to one of these sibling skills instead of ${skillName}.`;
291
+ }
292
+
293
+ return { system, user };
294
+ }
295
+
296
+ export function buildSyntheticRefinementPrompt(
297
+ skillContent: string,
298
+ skillName: string,
299
+ candidates: EvalEntry[],
300
+ maxPositives: number,
301
+ maxNegatives: number,
302
+ siblingSkills: string[] = [],
303
+ ): { system: string; user: string } {
304
+ const targets = buildPromptFamilyTargets(maxPositives, maxNegatives, siblingSkills.length > 0);
305
+ const system = `You are refining a cold-start eval benchmark for a coding agent skill.
306
+
307
+ Your job is to critique and prune a candidate pool into a SMALL, SHARP benchmark.
308
+
309
+ For each candidate, reason using binary questions:
310
+ - Is this realistic user phrasing?
311
+ - Is this more than a trivial paraphrase of the skill bullets?
312
+ - Does this clearly test in-scope behavior, or clearly test a boundary?
313
+ - For negatives: does it clearly belong elsewhere or represent a plausible wrong-intent adjacent request?
314
+ - Is it sufficiently distinct from the other selected prompts?
315
+
316
+ Return ONLY a JSON array with the final benchmark.`;
317
+
318
+ const user = `Skill name: ${skillName}
319
+
320
+ Skill content:
321
+ ${skillContent}
322
+
323
+ Target final benchmark:
324
+ - ${maxPositives} positives
325
+ - ${maxNegatives} negatives
326
+ - Positive mix: ${targets.explicitCount} explicit, ${targets.implicitCount} implicit, ${targets.contextualCount} contextual
327
+ - Negative mix: ${targets.siblingNegativeCount} sibling-skill confusion, ${targets.adjacentNegativeCount} adjacent wrong-intent, ${targets.unrelatedNegativeCount} unrelated
328
+
329
+ ${siblingSkills.length > 0 ? `Sibling skills for hard-negative boundaries:\n${siblingSkills.map((skill) => `- ${skill}`).join("\n")}\n` : ""}
330
+ Candidate pool:
331
+ ${JSON.stringify(candidates, null, 2)}
332
+
333
+ Instructions:
334
+ - Remove duplicates and near-duplicates
335
+ - Prefer prompts that test trigger boundaries, not just center-of-mass obvious usage
336
+ - Keep sibling-skill negatives if they are strong boundary tests
337
+ - Keep the final set compact, diverse, and realistic
338
+ - Return ONLY the final JSON array`;
339
+
80
340
  return { system, user };
81
341
  }
82
342
 
@@ -172,8 +432,10 @@ export async function generateSyntheticEvals(
172
432
  ): Promise<EvalEntry[]> {
173
433
  const maxPositives = options.maxPositives ?? 15;
174
434
  const maxNegatives = options.maxNegatives ?? 10;
435
+ const oversampleFactor = 2;
175
436
 
176
437
  const skillContent = readFileSync(skillPath, "utf-8");
438
+ const siblingSkills = inferSiblingSkills(skillName);
177
439
 
178
440
  // Load real query examples from the database for few-shot style guidance.
179
441
  // Uses dynamic imports since SQLite may not be available in all contexts.
@@ -214,11 +476,36 @@ export async function generateSyntheticEvals(
214
476
  const { system, user } = buildSyntheticPrompt(
215
477
  skillContent,
216
478
  skillName,
217
- maxPositives,
218
- maxNegatives,
479
+ maxPositives * oversampleFactor,
480
+ maxNegatives * oversampleFactor,
219
481
  realExamples,
482
+ siblingSkills,
220
483
  );
221
484
 
222
485
  const raw = await callLlm(system, user, agent, options.modelFlag);
223
- return parseSyntheticResponse(raw, skillName);
486
+ const firstPass = dedupeEvalEntries(parseSyntheticResponse(raw, skillName));
487
+
488
+ try {
489
+ const refinement = buildSyntheticRefinementPrompt(
490
+ skillContent,
491
+ skillName,
492
+ firstPass,
493
+ maxPositives,
494
+ maxNegatives,
495
+ siblingSkills,
496
+ );
497
+ const refinedRaw = await callLlm(refinement.system, refinement.user, agent, options.modelFlag);
498
+ const refined = dedupeEvalEntries(parseSyntheticResponse(refinedRaw, skillName));
499
+ const selected = selectBalancedEvalEntries(refined, maxPositives, maxNegatives, siblingSkills);
500
+ if (
501
+ selected.filter((entry) => entry.should_trigger).length >= maxPositives &&
502
+ selected.filter((entry) => !entry.should_trigger).length >= maxNegatives
503
+ ) {
504
+ return selected;
505
+ }
506
+ } catch {
507
+ // fall through to first-pass selection
508
+ }
509
+
510
+ return selectBalancedEvalEntries(firstPass, maxPositives, maxNegatives, siblingSkills);
224
511
  }
@@ -29,3 +29,8 @@ export function readEvidenceTrail(skillName?: string, _logPath?: string): Evolut
29
29
  const db = getDb();
30
30
  return queryEvolutionEvidence(db, skillName) as EvolutionEvidenceEntry[];
31
31
  }
32
+
33
+ /** Build the stable evidence key used to connect audit entries to validation artifacts. */
34
+ export function buildValidationEvidenceRef(proposalId: string, stage: string): string {
35
+ return `evolution_evidence:${proposalId}:${stage}`;
36
+ }