selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
@@ -26,7 +26,15 @@ import type {
26
26
  SkillUsageRecord,
27
27
  } from "../types.js";
28
28
  import { readJsonl } from "../utils/jsonl.js";
29
+ import { detectAgent } from "../utils/llm-call.js";
30
+ import {
31
+ filterActionableQueryRecords,
32
+ filterActionableSkillUsageRecords,
33
+ } from "../utils/query-filter.js";
29
34
  import { seededShuffle } from "../utils/seeded-random.js";
35
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
36
+ import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
37
+ import { generateSyntheticEvals } from "./synthetic-evals.js";
30
38
 
31
39
  // ---------------------------------------------------------------------------
32
40
  // Query truncation
@@ -114,14 +122,16 @@ export function buildEvalSet(
114
122
  seed = 42,
115
123
  annotateTaxonomy = true,
116
124
  ): EvalEntry[] {
125
+ const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
126
+ const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
117
127
  const effectiveMaxPerSide = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
118
128
  const effectiveSeed = Number.isNaN(seed) ? 42 : seed;
119
129
 
120
130
  // Build set of positive query texts (for exclusion from negatives)
121
131
  const positiveQueries = new Set<string>();
122
- for (const r of skillRecords) {
132
+ for (const r of actionableSkillRecords) {
123
133
  if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
124
- if (r.skill_name === skillName) {
134
+ if (isHighConfidencePositiveSkillRecord(r, skillName)) {
125
135
  const q = (r.query ?? "").trim();
126
136
  if (q && q !== "(query not found)") {
127
137
  positiveQueries.add(q);
@@ -132,9 +142,9 @@ export function buildEvalSet(
132
142
  // Build deduplicated positives with taxonomy classification
133
143
  const seen = new Set<string>();
134
144
  const positives: EvalEntry[] = [];
135
- for (const r of skillRecords) {
145
+ for (const r of actionableSkillRecords) {
136
146
  if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
137
- if (r.skill_name !== skillName) continue;
147
+ if (!isHighConfidencePositiveSkillRecord(r, skillName)) continue;
138
148
  const q = (r.query ?? "").trim();
139
149
  if (!q || q === "(query not found)" || seen.has(q)) continue;
140
150
  seen.add(q);
@@ -151,7 +161,7 @@ export function buildEvalSet(
151
161
  if (includeNegatives) {
152
162
  const negCandidates: string[] = [];
153
163
  const negSeen = new Set<string>();
154
- for (const r of queryRecords) {
164
+ for (const r of actionableQueryRecords) {
155
165
  if (!r || typeof r.query !== "string") continue;
156
166
  const q = (r.query ?? "").trim();
157
167
  if (!q || positiveQueries.has(q) || negSeen.has(q)) continue;
@@ -196,13 +206,17 @@ export function listSkills(
196
206
  queryRecords: QueryLogRecord[],
197
207
  telemetryRecords: SessionTelemetryRecord[],
198
208
  ): void {
209
+ const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
210
+ const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
199
211
  const counts = new Map<string, number>();
200
- for (const r of skillRecords) {
212
+ for (const r of actionableSkillRecords) {
201
213
  const name = r.skill_name ?? "unknown";
202
214
  counts.set(name, (counts.get(name) ?? 0) + 1);
203
215
  }
204
216
 
205
- console.log(`Skill triggers in skill_usage_log (${skillRecords.length} total records):`);
217
+ console.log(
218
+ `Skill triggers in skill_usage_log (${actionableSkillRecords.length} actionable records):`,
219
+ );
206
220
  if (counts.size > 0) {
207
221
  const sorted = [...counts.entries()].sort((a, b) => b[1] - a[1]);
208
222
  for (const [name, count] of sorted) {
@@ -212,8 +226,8 @@ export function listSkills(
212
226
  console.log(" (none yet -- trigger some skills in Claude Code to populate)");
213
227
  }
214
228
 
215
- console.log(`\nAll queries in all_queries_log: ${queryRecords.length}`);
216
- if (queryRecords.length === 0) {
229
+ console.log(`\nActionable queries in all_queries_log: ${actionableQueryRecords.length}`);
230
+ if (actionableQueryRecords.length === 0) {
217
231
  console.log(" (none yet -- make sure prompt_log_hook is installed)");
218
232
  }
219
233
 
@@ -301,14 +315,16 @@ export function printEvalStats(
301
315
  ): void {
302
316
  const pos = evalSet.filter((e) => e.should_trigger);
303
317
  const neg = evalSet.filter((e) => !e.should_trigger);
304
- const totalTriggers = skillRecords.filter((r) => r.skill_name === skillName).length;
318
+ const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
319
+ const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
320
+ const totalTriggers = actionableSkillRecords.filter((r) => r.skill_name === skillName).length;
305
321
 
306
322
  console.log(`Wrote ${evalSet.length} eval entries to ${outputPath}`);
307
323
  console.log(
308
324
  ` Positives (should_trigger=true) : ${pos.length} (from ${totalTriggers} logged triggers)`,
309
325
  );
310
326
  console.log(
311
- ` Negatives (should_trigger=false): ${neg.length} (from ${queryRecords.length} total logged queries)`,
327
+ ` Negatives (should_trigger=false): ${neg.length} (from ${actionableQueryRecords.length} actionable logged queries)`,
312
328
  );
313
329
 
314
330
  if (annotateTaxonomy && pos.length > 0) {
@@ -334,7 +350,7 @@ export function printEvalStats(
334
350
  console.log();
335
351
  if (pos.length === 0) {
336
352
  console.log(`[WARN] No positives for skill '${skillName}'.`);
337
- const names = [...new Set(skillRecords.map((r) => r.skill_name))].sort();
353
+ const names = [...new Set(actionableSkillRecords.map((r) => r.skill_name))].sort();
338
354
  if (names.length > 0) {
339
355
  console.log(` Known skills: ${names.join(", ")}`);
340
356
  }
@@ -359,11 +375,12 @@ export function printEvalStats(
359
375
  // CLI entry point
360
376
  // ---------------------------------------------------------------------------
361
377
 
362
- export function cliMain(): void {
378
+ export async function cliMain(): Promise<void> {
363
379
  const { values } = parseArgs({
364
380
  options: {
365
381
  skill: { type: "string" },
366
382
  output: { type: "string" },
383
+ out: { type: "string" },
367
384
  max: { type: "string", default: "50" },
368
385
  seed: { type: "string", default: "42" },
369
386
  "list-skills": { type: "boolean", default: false },
@@ -373,11 +390,76 @@ export function cliMain(): void {
373
390
  "skill-log": { type: "string", default: SKILL_LOG },
374
391
  "query-log": { type: "string", default: QUERY_LOG },
375
392
  "telemetry-log": { type: "string", default: TELEMETRY_LOG },
393
+ synthetic: { type: "boolean", default: false },
394
+ "skill-path": { type: "string" },
395
+ model: { type: "string" },
376
396
  },
377
397
  strict: true,
378
398
  });
379
399
 
380
- const skillRecords = readJsonl<SkillUsageRecord>(values["skill-log"] ?? SKILL_LOG);
400
+ // --- Synthetic mode: generate evals from SKILL.md via LLM ---
401
+ if (values.synthetic) {
402
+ if (!values.skill) {
403
+ console.error("[ERROR] --skill required with --synthetic");
404
+ process.exit(1);
405
+ }
406
+ if (!values["skill-path"]) {
407
+ console.error("[ERROR] --skill-path required with --synthetic");
408
+ process.exit(1);
409
+ }
410
+
411
+ const agent = detectAgent();
412
+ if (!agent) {
413
+ console.error("[ERROR] No agent CLI found (claude/codex/opencode). Install one first.");
414
+ process.exit(1);
415
+ }
416
+
417
+ const maxPerSide = Number.parseInt(values.max ?? "50", 10);
418
+ const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
419
+
420
+ console.log(`Generating synthetic evals for skill '${values.skill}'...`);
421
+ const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
422
+ maxPositives: effectiveMax,
423
+ maxNegatives: effectiveMax,
424
+ modelFlag: values.model,
425
+ });
426
+
427
+ const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
428
+ writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
429
+
430
+ const pos = evalSet.filter((e) => e.should_trigger);
431
+ const neg = evalSet.filter((e) => !e.should_trigger);
432
+
433
+ console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
434
+ console.log(` Positives (should_trigger=true) : ${pos.length}`);
435
+ console.log(` Negatives (should_trigger=false): ${neg.length}`);
436
+
437
+ if (pos.length > 0) {
438
+ const types = new Map<string, number>();
439
+ for (const e of pos) {
440
+ const t = e.invocation_type ?? "?";
441
+ types.set(t, (types.get(t) ?? 0) + 1);
442
+ }
443
+ console.log("\n Positive invocation types:");
444
+ for (const [t, c] of [...types.entries()].sort()) {
445
+ console.log(` ${t.padEnd(15)} ${c}`);
446
+ }
447
+ }
448
+
449
+ console.log("\nNext steps:");
450
+ console.log(" bun run cli/selftune/eval/run-eval.ts \\");
451
+ console.log(` --eval-set ${outputPath} \\`);
452
+ console.log(` --skill-path ${values["skill-path"]} \\`);
453
+ console.log(" --runs-per-query 3 --verbose");
454
+ return;
455
+ }
456
+
457
+ // --- Log-based mode (original behavior) ---
458
+ const skillLogPath = values["skill-log"] ?? SKILL_LOG;
459
+ const skillRecords =
460
+ skillLogPath === SKILL_LOG
461
+ ? readEffectiveSkillUsageRecords()
462
+ : readJsonl<SkillUsageRecord>(skillLogPath);
381
463
  const queryRecords = readJsonl<QueryLogRecord>(values["query-log"] ?? QUERY_LOG);
382
464
  const telemetryRecords = readJsonl<SessionTelemetryRecord>(
383
465
  values["telemetry-log"] ?? TELEMETRY_LOG,
@@ -412,11 +494,14 @@ export function cliMain(): void {
412
494
  annotateTaxonomy,
413
495
  );
414
496
 
415
- const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
497
+ const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
416
498
  writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
417
499
  printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
418
500
  }
419
501
 
420
502
  if (import.meta.main) {
421
- cliMain();
503
+ cliMain().catch((err) => {
504
+ console.error(err);
505
+ process.exit(1);
506
+ });
422
507
  }
@@ -0,0 +1,221 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * import-skillsbench.ts
4
+ *
5
+ * Imports task definitions from a SkillsBench-style corpus directory and
6
+ * converts them into EvalEntry arrays for use with selftune eval/grading.
7
+ *
8
+ * Expected directory structure:
9
+ * <dir>/tasks/<task-id>/instruction.md — task description (query text)
10
+ * <dir>/tasks/<task-id>/task.toml — metadata (difficulty, category, tags, etc.)
11
+ */
12
+
13
+ import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
14
+ import { join } from "node:path";
15
+ import { parseArgs } from "node:util";
16
+ import type { EvalEntry, SkillsBenchTask } from "../types.js";
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Minimal TOML parser (handles the subset used by SkillsBench task.toml files)
20
+ // ---------------------------------------------------------------------------
21
+
/**
 * Parse the subset of TOML used by SkillsBench task.toml files.
 *
 * Supported syntax (one entry per line):
 *   - `key = "value"` / `key = 'value'`  quoted strings
 *   - `key = ["a", "b"]`                 flat, single-line arrays
 *   - `key = 42` / `key = true`          bare values, returned as the raw string
 *   - `# ...`                            whole-line comments and comments that
 *                                        trail a value
 *
 * Lines without an `=` are skipped. Escape sequences inside strings are kept
 * verbatim (they are not decoded).
 *
 * Does NOT support: multi-line / triple-quoted strings, inline tables,
 * nested arrays, or section headers ([table]).
 *
 * @param content - Raw text of a task.toml file.
 * @returns Map from key to either a string or an array of strings.
 */
function parseSimpleToml(content: string): Record<string, unknown> {
  const isQuote = (ch: string): boolean => ch === '"' || ch === "'";

  // Reads the quoted string opening at text[start]; `next` is the index just
  // past the closing quote (or past the end when the quote is never closed).
  const readQuoted = (text: string, start: number): { value: string; next: number } => {
    const quote = text.charAt(start);
    let i = start + 1;
    while (i < text.length) {
      const ch = text.charAt(i);
      // Only double-quoted TOML strings have backslash escapes; skip the
      // escaped character so `\"` does not terminate the string.
      if (ch === "\\" && quote === '"') {
        i += 2;
        continue;
      }
      if (ch === quote) break;
      i += 1;
    }
    const close = Math.min(i, text.length);
    return { value: text.slice(start + 1, close), next: close + 1 };
  };

  // Reads a single-line array whose "[" is at text[0]. Empty items are dropped.
  const readArray = (text: string): string[] => {
    const items: string[] = [];
    let i = 1;
    while (i < text.length) {
      const ch = text.charAt(i);
      if (ch === "]") break;
      if (ch === "," || /\s/.test(ch)) {
        i += 1;
        continue;
      }
      if (isQuote(ch)) {
        // Quote-aware so that commas and brackets inside an item survive.
        const { value, next } = readQuoted(text, i);
        if (value) items.push(value);
        i = next;
        continue;
      }
      let j = i;
      while (j < text.length && text.charAt(j) !== "," && text.charAt(j) !== "]") j += 1;
      const bare = text.slice(i, j).trim();
      if (bare) items.push(bare);
      i = j;
    }
    return items;
  };

  const result: Record<string, unknown> = {};
  for (const rawLine of content.split("\n")) {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) continue;

    const eqIdx = line.indexOf("=");
    if (eqIdx === -1) continue;

    const key = line.slice(0, eqIdx).trim();
    const rawValue = line.slice(eqIdx + 1).trim();

    if (rawValue.startsWith("[")) {
      result[key] = readArray(rawValue);
    } else if (isQuote(rawValue.charAt(0))) {
      // Anything after the closing quote (e.g. a trailing comment) is ignored.
      result[key] = readQuoted(rawValue, 0).value;
    } else {
      // Bare value (number, boolean, etc.) kept as text; a bare TOML value
      // cannot contain "#", so everything from "#" on is a comment.
      const hashIdx = rawValue.indexOf("#");
      result[key] = (hashIdx >= 0 ? rawValue.slice(0, hashIdx) : rawValue).trim();
    }
  }
  return result;
}
62
+
63
+ // ---------------------------------------------------------------------------
64
+ // Parse SkillsBench directory
65
+ // ---------------------------------------------------------------------------
66
+
/**
 * Load every task found under `<dirPath>/tasks`.
 *
 * Each sub-directory that contains a non-empty `instruction.md` becomes one
 * task: the directory name is the task id and the trimmed file content is the
 * query. An optional `task.toml` next to it supplies metadata. Metadata values
 * of the wrong type are ignored, so only real strings / arrays reach the task:
 *   - `category`       string, defaults to "general"
 *   - `difficulty`     "easy" | "medium" | "hard", defaults to "medium"
 *   - `expected_skill` non-empty string, otherwise left unset
 *   - `expected_tools` / `tags`  arrays, otherwise left unset
 *
 * @param dirPath - Root of the SkillsBench-style corpus.
 * @returns Parsed tasks; an empty array when the tasks directory is missing
 *          or cannot be listed.
 */
export function parseSkillsBenchDir(dirPath: string): SkillsBenchTask[] {
  const tasksDir = join(dirPath, "tasks");
  if (!existsSync(tasksDir)) return [];

  let entries: ReturnType<typeof readdirSync>;
  try {
    entries = readdirSync(tasksDir, { withFileTypes: true });
  } catch {
    return [];
  }

  const tasks: SkillsBenchTask[] = [];

  for (const entry of entries) {
    if (!entry.isDirectory()) continue;

    const taskDir = join(tasksDir, entry.name);
    const instructionPath = join(taskDir, "instruction.md");
    if (!existsSync(instructionPath)) continue;

    const query = readFileSync(instructionPath, "utf-8").trim();
    if (!query) continue;

    // Parse optional task.toml
    const tomlPath = join(taskDir, "task.toml");
    const metadata: Record<string, unknown> = existsSync(tomlPath)
      ? parseSimpleToml(readFileSync(tomlPath, "utf-8"))
      : {};

    // Narrow by value instead of casting: the parser can hand back an array
    // for any key, and downstream matching calls string methods on these.
    const rawDifficulty = metadata.difficulty;
    const rawCategory = metadata.category;
    const rawSkill = metadata.expected_skill;

    const task: SkillsBenchTask = {
      task_id: entry.name,
      category: typeof rawCategory === "string" ? rawCategory : "general",
      query,
      difficulty: rawDifficulty === "easy" || rawDifficulty === "hard" ? rawDifficulty : "medium",
    };

    if (typeof rawSkill === "string" && rawSkill) {
      task.expected_skill = rawSkill;
    }
    if (Array.isArray(metadata.expected_tools)) {
      task.expected_tools = metadata.expected_tools as string[];
    }
    if (Array.isArray(metadata.tags)) {
      task.tags = metadata.tags as string[];
    }

    tasks.push(task);
  }

  return tasks;
}
123
+
124
+ // ---------------------------------------------------------------------------
125
+ // Convert tasks to EvalEntries
126
+ // ---------------------------------------------------------------------------
127
+
/**
 * Turn the tasks that belong to `targetSkill` into positive eval entries.
 *
 * - "exact": a task matches when its `expected_skill` equals `targetSkill`.
 * - "fuzzy": a task matches when, ignoring case, `targetSkill` contains or is
 *   contained in the task's category, expected_skill, or any of its tags.
 *
 * @param tasks - Parsed SkillsBench tasks.
 * @param targetSkill - Skill name to match against.
 * @param matchStrategy - Matching mode; defaults to "exact".
 * @returns One `{ query, should_trigger: true }` entry per matching task, in
 *          input order.
 */
export function convertToEvalEntries(
  tasks: SkillsBenchTask[],
  targetSkill: string,
  matchStrategy: "exact" | "fuzzy" = "exact",
): EvalEntry[] {
  const belongsToSkill = (task: SkillsBenchTask): boolean => {
    if (matchStrategy === "exact") {
      return task.expected_skill === targetSkill;
    }

    const needle = targetSkill.toLowerCase();
    const haystacks = [task.category, task.expected_skill, ...(task.tags ?? [])]
      .filter(Boolean)
      .map((field) => (field as string).toLowerCase());

    // Substring match in either direction.
    return haystacks.some((field) => field.includes(needle) || needle.includes(field));
  };

  return tasks.filter(belongsToSkill).map((task) => ({
    query: task.query,
    should_trigger: true,
  }));
}
160
+
161
+ // ---------------------------------------------------------------------------
162
+ // CLI entry point
163
+ // ---------------------------------------------------------------------------
164
+
/**
 * Command-line entry point for the SkillsBench importer.
 *
 * Flags: --dir (corpus root, required), --skill (target skill, required),
 * --output (destination file), --match-strategy ("exact" by default; any
 * value other than "fuzzy" is treated as "exact").
 *
 * Exits with status 1 when a required flag is missing or no tasks are found.
 * When tasks exist but none match, a warning listing the known
 * expected_skills is printed and the (empty) result is still written.
 */
export function cliMain(): void {
  const { values } = parseArgs({
    options: {
      dir: { type: "string" },
      skill: { type: "string" },
      output: { type: "string" },
      "match-strategy": { type: "string", default: "exact" },
    },
    strict: true,
  });

  const { dir, skill } = values;

  if (!dir) {
    console.error("[ERROR] --dir required (path to SkillsBench corpus directory)");
    process.exit(1);
  }
  if (!skill) {
    console.error("[ERROR] --skill required (target skill name)");
    process.exit(1);
  }

  const matchStrategy: "exact" | "fuzzy" =
    values["match-strategy"] === "fuzzy" ? "fuzzy" : "exact";

  const tasks = parseSkillsBenchDir(dir);
  if (tasks.length === 0) {
    console.error(`[WARN] No tasks found in ${dir}/tasks/`);
    console.error("Expected structure: <dir>/tasks/<task-id>/instruction.md");
    process.exit(1);
  }
  console.log(`Parsed ${tasks.length} tasks from ${dir}`);

  const entries = convertToEvalEntries(tasks, skill, matchStrategy);
  if (entries.length === 0) {
    console.log(`[WARN] No tasks matched skill '${skill}' with strategy '${matchStrategy}'.`);
    console.log("Available expected_skills:");
    const knownSkills = [...new Set(tasks.map((t) => t.expected_skill).filter(Boolean))].sort();
    knownSkills.forEach((name) => {
      console.log(` ${name}`);
    });
    if (matchStrategy === "exact") {
      console.log("\nTip: try --match-strategy fuzzy for keyword-based matching.");
    }
  }

  const outputPath = values.output ?? `${skill}_skillsbench_eval.json`;
  writeFileSync(outputPath, JSON.stringify(entries, null, 2), "utf-8");
  console.log(`Wrote ${entries.length} eval entries to ${outputPath}`);
}

// Run the CLI only when this file is executed directly.
if (import.meta.main) {
  cliMain();
}
@@ -0,0 +1,172 @@
1
+ /**
2
+ * synthetic-evals.ts
3
+ *
4
+ * Generates eval queries from a SKILL.md using an LLM, without requiring
5
+ * real session logs. Solves the cold-start problem for new skills that
6
+ * have no telemetry data yet.
7
+ */
8
+
9
+ import { readFileSync } from "node:fs";
10
+
11
+ import type { EvalEntry, InvocationType } from "../types.js";
12
+ import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
13
+ import { classifyInvocation } from "./hooks-to-evals.js";
14
+
15
+ // ---------------------------------------------------------------------------
16
+ // Types
17
+ // ---------------------------------------------------------------------------
18
+
/** Optional knobs accepted by `generateSyntheticEvals`. */
export interface SyntheticEvalOptions {
  /** Number of positive queries to request from the LLM. */
  maxPositives?: number;
  /** Number of negative queries to request from the LLM. */
  maxNegatives?: number;
  /** Passed through unchanged as the last argument of `callLlm`. */
  modelFlag?: string;
}
24
+
/** Shape of one element of the JSON array the LLM is asked to return. */
interface RawSyntheticEntry {
  /** The generated user query text. */
  query: string;
  /** Whether the query is meant to trigger the skill. */
  should_trigger: boolean;
  /** Type label suggested by the LLM, if any. */
  invocation_type?: string;
}
30
+
31
+ // ---------------------------------------------------------------------------
32
+ // Prompt building
33
+ // ---------------------------------------------------------------------------
34
+
/**
 * Assemble the system and user prompts used to ask an LLM for synthetic
 * trigger queries.
 *
 * @param skillContent - Text of the skill description, embedded verbatim.
 * @param skillName - Skill name; also used for the `$<name>` syntax example.
 * @param maxPositives - Count of should-trigger queries to request.
 * @param maxNegatives - Count of should-not-trigger queries to request.
 * @returns The `system` and `user` prompt strings.
 */
export function buildSyntheticPrompt(
  skillContent: string,
  skillName: string,
  maxPositives: number,
  maxNegatives: number,
): { system: string; user: string } {
  const systemLines = [
    "You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries.",
    "",
    "For POSITIVE queries (should trigger this skill):",
    "- Generate a mix of:",
    "- Explicit: directly names the skill or uses $" + skillName + " syntax",
    "- Implicit: describes the task without naming the skill",
    "- Contextual: natural language with domain context, proper nouns, dates, filenames",
    "- Vary phrasing, formality, and specificity",
    "",
    "For NEGATIVE queries (should NOT trigger this skill):",
    "- Queries that are topically adjacent but wrong intent",
    "- Queries for different skills that share keywords",
    "- Generic queries unrelated to this skill",
    "",
    "Output as JSON array with no surrounding text:",
    '[{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]',
  ];

  const userLines = [
    "Skill name: " + skillName,
    "",
    "Skill content:",
    skillContent,
    "",
    "Generate exactly " +
      maxPositives +
      " positive queries (should_trigger: true) and " +
      maxNegatives +
      " negative queries (should_trigger: false). Return ONLY the JSON array.",
  ];

  return { system: systemLines.join("\n"), user: userLines.join("\n") };
}
67
+
68
+ // ---------------------------------------------------------------------------
69
+ // Response parsing
70
+ // ---------------------------------------------------------------------------
71
+
/**
 * Extract eval entries from an LLM response that should contain a JSON array
 * of `{ query, should_trigger, invocation_type? }` objects.
 *
 * The response may be wrapped in a markdown fence and may carry text before
 * or after the array. Parsing is attempted from each "[" in turn (always
 * ending at the last "]"), so a bracket in the preamble does not prevent the
 * real array from being found.
 *
 * Elements without a string `query` and boolean `should_trigger`, or whose
 * query is blank, are dropped. The LLM-provided `invocation_type` is ignored:
 * positives are re-classified with `classifyInvocation`, negatives are always
 * labelled "negative".
 *
 * @param raw - Raw LLM output.
 * @param skillName - Skill name passed to `classifyInvocation`.
 * @returns The validated entries in response order.
 * @throws Error when no JSON array can be parsed from the response.
 */
export function parseSyntheticResponse(raw: string, skillName: string): EvalEntry[] {
  let text = raw.trim();

  // Strip markdown fences manually for array-first JSON
  // (stripMarkdownFences slices to first '{' which breaks '[' arrays)
  const fenceMatch = text.match(/^```\w*\r?\n/);
  if (fenceMatch) {
    text = text.slice(fenceMatch[0].length);
    const closingIdx = text.lastIndexOf("```");
    if (closingIdx >= 0) {
      text = text.slice(0, closingIdx);
    }
    text = text.trim();
  }

  if (!text.includes("[")) {
    // No array found — try stripMarkdownFences as fallback for edge cases
    const cleaned = stripMarkdownFences(raw);
    if (!cleaned.includes("[")) {
      throw new Error(`Failed to parse synthetic eval response as JSON: ${text.slice(0, 200)}`);
    }
    text = cleaned;
  }

  // Trailing content after the last "]" is never part of the array.
  const lastBracket = text.lastIndexOf("]");
  const end = lastBracket >= 0 ? lastBracket + 1 : text.length;

  let parsed: unknown;
  let parsedOk = false;
  let firstCandidate = "";
  for (
    let start = text.indexOf("[");
    start >= 0 && start < end;
    start = text.indexOf("[", start + 1)
  ) {
    const candidate = text.slice(start, end);
    if (!firstCandidate) firstCandidate = candidate;
    try {
      parsed = JSON.parse(candidate);
      parsedOk = true;
      break;
    } catch {
      // This "[" belonged to surrounding prose; retry from the next one.
    }
  }

  if (!parsedOk) {
    throw new Error(
      `Failed to parse synthetic eval response as JSON: ${firstCandidate.slice(0, 200)}`,
    );
  }
  if (!Array.isArray(parsed)) {
    throw new Error("Synthetic eval response is not a JSON array");
  }

  const result: EvalEntry[] = [];
  for (const entry of parsed as unknown[]) {
    if (typeof entry !== "object" || entry === null) continue;

    const { query: rawQuery, should_trigger: shouldTrigger } = entry as {
      query?: unknown;
      should_trigger?: unknown;
    };
    if (typeof rawQuery !== "string" || typeof shouldTrigger !== "boolean") continue;

    const query = rawQuery.trim();
    if (!query) continue;

    // For positives, use classifyInvocation to verify/override the LLM's type
    const invocationType: InvocationType = shouldTrigger
      ? classifyInvocation(query, skillName)
      : "negative";

    result.push({
      query,
      should_trigger: shouldTrigger,
      invocation_type: invocationType,
    });
  }

  return result;
}
147
+
148
+ // ---------------------------------------------------------------------------
149
+ // Main entry point
150
+ // ---------------------------------------------------------------------------
151
+
/**
 * Generate eval entries for a skill by prompting an LLM with the skill file.
 *
 * Reads the file at `skillPath`, builds the prompts with
 * `buildSyntheticPrompt`, sends them through `callLlm`, and parses the reply
 * with `parseSyntheticResponse`.
 *
 * @param skillPath - Path of the skill file to read.
 * @param skillName - Name of the skill the queries are generated for.
 * @param agent - Agent identifier forwarded to `callLlm`.
 * @param options - Query counts (defaults: 15 positives, 10 negatives) and
 *                  optional model flag.
 * @returns The parsed eval entries.
 */
export async function generateSyntheticEvals(
  skillPath: string,
  skillName: string,
  agent: string,
  options: SyntheticEvalOptions = {},
): Promise<EvalEntry[]> {
  const positiveCount = options.maxPositives ?? 15;
  const negativeCount = options.maxNegatives ?? 10;

  const prompt = buildSyntheticPrompt(
    readFileSync(skillPath, "utf-8"),
    skillName,
    positiveCount,
    negativeCount,
  );

  const response = await callLlm(prompt.system, prompt.user, agent, options.modelFlag);
  return parseSyntheticResponse(response, skillName);
}