selftune 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +156 -0
- package/.claude/agents/evolution-reviewer.md +180 -0
- package/.claude/agents/integration-guide.md +212 -0
- package/.claude/agents/pattern-analyst.md +160 -0
- package/CHANGELOG.md +46 -1
- package/README.md +105 -257
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +99 -0
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +103 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +1049 -0
- package/cli/selftune/dashboard.ts +43 -156
- package/cli/selftune/eval/baseline.ts +248 -0
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +101 -16
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +586 -0
- package/cli/selftune/evolution/evolve.ts +825 -116
- package/cli/selftune/evolution/extract-patterns.ts +105 -16
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +21 -4
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +513 -42
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +415 -48
- package/cli/selftune/ingestors/claude-replay.ts +377 -0
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +376 -16
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +90 -16
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +59 -33
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +525 -5
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +103 -19
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +307 -26
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +28 -11
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +180 -33
- package/skill/Workflows/AutoActivation.md +145 -0
- package/skill/Workflows/Badge.md +124 -0
- package/skill/Workflows/Baseline.md +144 -0
- package/skill/Workflows/Composability.md +107 -0
- package/skill/Workflows/Contribute.md +94 -0
- package/skill/Workflows/Cron.md +132 -0
- package/skill/Workflows/Dashboard.md +214 -0
- package/skill/Workflows/Doctor.md +63 -14
- package/skill/Workflows/Evals.md +110 -18
- package/skill/Workflows/EvolutionMemory.md +154 -0
- package/skill/Workflows/Evolve.md +181 -21
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +117 -0
- package/skill/Workflows/Ingest.md +142 -21
- package/skill/Workflows/Initialize.md +91 -23
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +91 -0
- package/skill/Workflows/Rollback.md +23 -4
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +150 -0
- package/skill/Workflows/Watch.md +33 -1
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
- package/dashboard/index.html +0 -1119
|
@@ -26,7 +26,15 @@ import type {
|
|
|
26
26
|
SkillUsageRecord,
|
|
27
27
|
} from "../types.js";
|
|
28
28
|
import { readJsonl } from "../utils/jsonl.js";
|
|
29
|
+
import { detectAgent } from "../utils/llm-call.js";
|
|
30
|
+
import {
|
|
31
|
+
filterActionableQueryRecords,
|
|
32
|
+
filterActionableSkillUsageRecords,
|
|
33
|
+
} from "../utils/query-filter.js";
|
|
29
34
|
import { seededShuffle } from "../utils/seeded-random.js";
|
|
35
|
+
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
36
|
+
import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
|
|
37
|
+
import { generateSyntheticEvals } from "./synthetic-evals.js";
|
|
30
38
|
|
|
31
39
|
// ---------------------------------------------------------------------------
|
|
32
40
|
// Query truncation
|
|
@@ -114,14 +122,16 @@ export function buildEvalSet(
|
|
|
114
122
|
seed = 42,
|
|
115
123
|
annotateTaxonomy = true,
|
|
116
124
|
): EvalEntry[] {
|
|
125
|
+
const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
|
|
126
|
+
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
117
127
|
const effectiveMaxPerSide = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
118
128
|
const effectiveSeed = Number.isNaN(seed) ? 42 : seed;
|
|
119
129
|
|
|
120
130
|
// Build set of positive query texts (for exclusion from negatives)
|
|
121
131
|
const positiveQueries = new Set<string>();
|
|
122
|
-
for (const r of
|
|
132
|
+
for (const r of actionableSkillRecords) {
|
|
123
133
|
if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
|
|
124
|
-
if (r
|
|
134
|
+
if (isHighConfidencePositiveSkillRecord(r, skillName)) {
|
|
125
135
|
const q = (r.query ?? "").trim();
|
|
126
136
|
if (q && q !== "(query not found)") {
|
|
127
137
|
positiveQueries.add(q);
|
|
@@ -132,9 +142,9 @@ export function buildEvalSet(
|
|
|
132
142
|
// Build deduplicated positives with taxonomy classification
|
|
133
143
|
const seen = new Set<string>();
|
|
134
144
|
const positives: EvalEntry[] = [];
|
|
135
|
-
for (const r of
|
|
145
|
+
for (const r of actionableSkillRecords) {
|
|
136
146
|
if (!r || typeof r.skill_name !== "string" || typeof r.query !== "string") continue;
|
|
137
|
-
if (r
|
|
147
|
+
if (!isHighConfidencePositiveSkillRecord(r, skillName)) continue;
|
|
138
148
|
const q = (r.query ?? "").trim();
|
|
139
149
|
if (!q || q === "(query not found)" || seen.has(q)) continue;
|
|
140
150
|
seen.add(q);
|
|
@@ -151,7 +161,7 @@ export function buildEvalSet(
|
|
|
151
161
|
if (includeNegatives) {
|
|
152
162
|
const negCandidates: string[] = [];
|
|
153
163
|
const negSeen = new Set<string>();
|
|
154
|
-
for (const r of
|
|
164
|
+
for (const r of actionableQueryRecords) {
|
|
155
165
|
if (!r || typeof r.query !== "string") continue;
|
|
156
166
|
const q = (r.query ?? "").trim();
|
|
157
167
|
if (!q || positiveQueries.has(q) || negSeen.has(q)) continue;
|
|
@@ -196,13 +206,17 @@ export function listSkills(
|
|
|
196
206
|
queryRecords: QueryLogRecord[],
|
|
197
207
|
telemetryRecords: SessionTelemetryRecord[],
|
|
198
208
|
): void {
|
|
209
|
+
const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
|
|
210
|
+
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
199
211
|
const counts = new Map<string, number>();
|
|
200
|
-
for (const r of
|
|
212
|
+
for (const r of actionableSkillRecords) {
|
|
201
213
|
const name = r.skill_name ?? "unknown";
|
|
202
214
|
counts.set(name, (counts.get(name) ?? 0) + 1);
|
|
203
215
|
}
|
|
204
216
|
|
|
205
|
-
console.log(
|
|
217
|
+
console.log(
|
|
218
|
+
`Skill triggers in skill_usage_log (${actionableSkillRecords.length} actionable records):`,
|
|
219
|
+
);
|
|
206
220
|
if (counts.size > 0) {
|
|
207
221
|
const sorted = [...counts.entries()].sort((a, b) => b[1] - a[1]);
|
|
208
222
|
for (const [name, count] of sorted) {
|
|
@@ -212,8 +226,8 @@ export function listSkills(
|
|
|
212
226
|
console.log(" (none yet -- trigger some skills in Claude Code to populate)");
|
|
213
227
|
}
|
|
214
228
|
|
|
215
|
-
console.log(`\
|
|
216
|
-
if (
|
|
229
|
+
console.log(`\nActionable queries in all_queries_log: ${actionableQueryRecords.length}`);
|
|
230
|
+
if (actionableQueryRecords.length === 0) {
|
|
217
231
|
console.log(" (none yet -- make sure prompt_log_hook is installed)");
|
|
218
232
|
}
|
|
219
233
|
|
|
@@ -301,14 +315,16 @@ export function printEvalStats(
|
|
|
301
315
|
): void {
|
|
302
316
|
const pos = evalSet.filter((e) => e.should_trigger);
|
|
303
317
|
const neg = evalSet.filter((e) => !e.should_trigger);
|
|
304
|
-
const
|
|
318
|
+
const actionableSkillRecords = filterActionableSkillUsageRecords(skillRecords);
|
|
319
|
+
const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
|
|
320
|
+
const totalTriggers = actionableSkillRecords.filter((r) => r.skill_name === skillName).length;
|
|
305
321
|
|
|
306
322
|
console.log(`Wrote ${evalSet.length} eval entries to ${outputPath}`);
|
|
307
323
|
console.log(
|
|
308
324
|
` Positives (should_trigger=true) : ${pos.length} (from ${totalTriggers} logged triggers)`,
|
|
309
325
|
);
|
|
310
326
|
console.log(
|
|
311
|
-
` Negatives (should_trigger=false): ${neg.length} (from ${
|
|
327
|
+
` Negatives (should_trigger=false): ${neg.length} (from ${actionableQueryRecords.length} actionable logged queries)`,
|
|
312
328
|
);
|
|
313
329
|
|
|
314
330
|
if (annotateTaxonomy && pos.length > 0) {
|
|
@@ -334,7 +350,7 @@ export function printEvalStats(
|
|
|
334
350
|
console.log();
|
|
335
351
|
if (pos.length === 0) {
|
|
336
352
|
console.log(`[WARN] No positives for skill '${skillName}'.`);
|
|
337
|
-
const names = [...new Set(
|
|
353
|
+
const names = [...new Set(actionableSkillRecords.map((r) => r.skill_name))].sort();
|
|
338
354
|
if (names.length > 0) {
|
|
339
355
|
console.log(` Known skills: ${names.join(", ")}`);
|
|
340
356
|
}
|
|
@@ -359,11 +375,12 @@ export function printEvalStats(
|
|
|
359
375
|
// CLI entry point
|
|
360
376
|
// ---------------------------------------------------------------------------
|
|
361
377
|
|
|
362
|
-
export function cliMain(): void {
|
|
378
|
+
export async function cliMain(): Promise<void> {
|
|
363
379
|
const { values } = parseArgs({
|
|
364
380
|
options: {
|
|
365
381
|
skill: { type: "string" },
|
|
366
382
|
output: { type: "string" },
|
|
383
|
+
out: { type: "string" },
|
|
367
384
|
max: { type: "string", default: "50" },
|
|
368
385
|
seed: { type: "string", default: "42" },
|
|
369
386
|
"list-skills": { type: "boolean", default: false },
|
|
@@ -373,11 +390,76 @@ export function cliMain(): void {
|
|
|
373
390
|
"skill-log": { type: "string", default: SKILL_LOG },
|
|
374
391
|
"query-log": { type: "string", default: QUERY_LOG },
|
|
375
392
|
"telemetry-log": { type: "string", default: TELEMETRY_LOG },
|
|
393
|
+
synthetic: { type: "boolean", default: false },
|
|
394
|
+
"skill-path": { type: "string" },
|
|
395
|
+
model: { type: "string" },
|
|
376
396
|
},
|
|
377
397
|
strict: true,
|
|
378
398
|
});
|
|
379
399
|
|
|
380
|
-
|
|
400
|
+
// --- Synthetic mode: generate evals from SKILL.md via LLM ---
|
|
401
|
+
if (values.synthetic) {
|
|
402
|
+
if (!values.skill) {
|
|
403
|
+
console.error("[ERROR] --skill required with --synthetic");
|
|
404
|
+
process.exit(1);
|
|
405
|
+
}
|
|
406
|
+
if (!values["skill-path"]) {
|
|
407
|
+
console.error("[ERROR] --skill-path required with --synthetic");
|
|
408
|
+
process.exit(1);
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
const agent = detectAgent();
|
|
412
|
+
if (!agent) {
|
|
413
|
+
console.error("[ERROR] No agent CLI found (claude/codex/opencode). Install one first.");
|
|
414
|
+
process.exit(1);
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
const maxPerSide = Number.parseInt(values.max ?? "50", 10);
|
|
418
|
+
const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
419
|
+
|
|
420
|
+
console.log(`Generating synthetic evals for skill '${values.skill}'...`);
|
|
421
|
+
const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
|
|
422
|
+
maxPositives: effectiveMax,
|
|
423
|
+
maxNegatives: effectiveMax,
|
|
424
|
+
modelFlag: values.model,
|
|
425
|
+
});
|
|
426
|
+
|
|
427
|
+
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
428
|
+
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
|
|
429
|
+
|
|
430
|
+
const pos = evalSet.filter((e) => e.should_trigger);
|
|
431
|
+
const neg = evalSet.filter((e) => !e.should_trigger);
|
|
432
|
+
|
|
433
|
+
console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
|
|
434
|
+
console.log(` Positives (should_trigger=true) : ${pos.length}`);
|
|
435
|
+
console.log(` Negatives (should_trigger=false): ${neg.length}`);
|
|
436
|
+
|
|
437
|
+
if (pos.length > 0) {
|
|
438
|
+
const types = new Map<string, number>();
|
|
439
|
+
for (const e of pos) {
|
|
440
|
+
const t = e.invocation_type ?? "?";
|
|
441
|
+
types.set(t, (types.get(t) ?? 0) + 1);
|
|
442
|
+
}
|
|
443
|
+
console.log("\n Positive invocation types:");
|
|
444
|
+
for (const [t, c] of [...types.entries()].sort()) {
|
|
445
|
+
console.log(` ${t.padEnd(15)} ${c}`);
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
console.log("\nNext steps:");
|
|
450
|
+
console.log(" bun run cli/selftune/eval/run-eval.ts \\");
|
|
451
|
+
console.log(` --eval-set ${outputPath} \\`);
|
|
452
|
+
console.log(` --skill-path ${values["skill-path"]} \\`);
|
|
453
|
+
console.log(" --runs-per-query 3 --verbose");
|
|
454
|
+
return;
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
// --- Log-based mode (original behavior) ---
|
|
458
|
+
const skillLogPath = values["skill-log"] ?? SKILL_LOG;
|
|
459
|
+
const skillRecords =
|
|
460
|
+
skillLogPath === SKILL_LOG
|
|
461
|
+
? readEffectiveSkillUsageRecords()
|
|
462
|
+
: readJsonl<SkillUsageRecord>(skillLogPath);
|
|
381
463
|
const queryRecords = readJsonl<QueryLogRecord>(values["query-log"] ?? QUERY_LOG);
|
|
382
464
|
const telemetryRecords = readJsonl<SessionTelemetryRecord>(
|
|
383
465
|
values["telemetry-log"] ?? TELEMETRY_LOG,
|
|
@@ -412,11 +494,14 @@ export function cliMain(): void {
|
|
|
412
494
|
annotateTaxonomy,
|
|
413
495
|
);
|
|
414
496
|
|
|
415
|
-
const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
|
|
497
|
+
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
416
498
|
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
|
|
417
499
|
printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
|
|
418
500
|
}
|
|
419
501
|
|
|
420
502
|
if (import.meta.main) {
|
|
421
|
-
cliMain()
|
|
503
|
+
cliMain().catch((err) => {
|
|
504
|
+
console.error(err);
|
|
505
|
+
process.exit(1);
|
|
506
|
+
});
|
|
422
507
|
}
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* import-skillsbench.ts
|
|
4
|
+
*
|
|
5
|
+
* Imports task definitions from a SkillsBench-style corpus directory and
|
|
6
|
+
* converts them into EvalEntry arrays for use with selftune eval/grading.
|
|
7
|
+
*
|
|
8
|
+
* Expected directory structure:
|
|
9
|
+
* <dir>/tasks/<task-id>/instruction.md — task description (query text)
|
|
10
|
+
* <dir>/tasks/<task-id>/task.toml — metadata (difficulty, category, tags, etc.)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
14
|
+
import { join } from "node:path";
|
|
15
|
+
import { parseArgs } from "node:util";
|
|
16
|
+
import type { EvalEntry, SkillsBenchTask } from "../types.js";
|
|
17
|
+
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Minimal TOML parser (handles the subset used by SkillsBench task.toml files)
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
/**
 * Parse the subset of TOML used by SkillsBench task.toml files.
 *
 * Supports: single-line key = "value", flat string arrays ["a", "b"],
 * bare values (numbers, booleans — kept as their raw text), and trailing
 * `# comments` that sit outside of quoted strings.
 *
 * Does NOT support: multi-line / triple-quoted strings, inline tables,
 * nested arrays, or section headers ([table]). Header lines contain no `=`
 * and are skipped, so keys below them land in the same flat result.
 *
 * @param content Raw text of a task.toml file.
 * @returns Flat map from key to either a string or a string array.
 */
function parseSimpleToml(content: string): Record<string, unknown> {
  const result: Record<string, unknown> = {};

  for (const rawLine of content.split("\n")) {
    // Dropping the comment first also turns full-line comments into "".
    const line = stripTomlComment(rawLine).trim();
    if (!line) continue;

    const eqIdx = line.indexOf("=");
    if (eqIdx === -1) continue;

    const key = line.slice(0, eqIdx).trim();
    const rawValue = line.slice(eqIdx + 1).trim();

    if (rawValue.startsWith("[")) {
      // Array value — parse simple string arrays like ["a", "b", "c"].
      // A missing "]" means "read to end of line" instead of silently
      // chopping the final character off with slice(1, -1).
      const closeIdx = rawValue.lastIndexOf("]");
      const arrayContent = closeIdx === -1 ? rawValue.slice(1) : rawValue.slice(1, closeIdx);
      const items: string[] = [];
      for (const item of splitOutsideQuotes(arrayContent, ",")) {
        const trimmed = item.trim().replace(/^["']|["']$/g, "");
        if (trimmed) items.push(trimmed);
      }
      result[key] = items;
    } else if (rawValue.startsWith('"') || rawValue.startsWith("'")) {
      // String value
      result[key] = rawValue.replace(/^["']|["']$/g, "");
    } else {
      // Bare value (number, boolean, etc.) — stored as text
      result[key] = rawValue;
    }
  }

  return result;
}

/** Indices of every `target` character in `text` that is not inside a quoted string. */
function unquotedIndices(text: string, target: string): number[] {
  const hits: number[] = [];
  let quote = "";
  for (let i = 0; i < text.length; i++) {
    const ch = text.charAt(i);
    if (quote) {
      // Backslash escapes only exist in double-quoted (basic) strings.
      if (ch === "\\" && quote === '"') i++;
      else if (ch === quote) quote = "";
    } else if (ch === '"' || ch === "'") {
      quote = ch;
    } else if (ch === target) {
      hits.push(i);
    }
  }
  return hits;
}

/** Remove a trailing `# comment`, leaving `#` characters inside quotes untouched. */
function stripTomlComment(line: string): string {
  const [hashIdx] = unquotedIndices(line, "#");
  return hashIdx === undefined ? line : line.slice(0, hashIdx);
}

/** Split `text` on `separator`, ignoring separators that appear inside quotes. */
function splitOutsideQuotes(text: string, separator: string): string[] {
  const parts: string[] = [];
  let start = 0;
  for (const cut of unquotedIndices(text, separator)) {
    parts.push(text.slice(start, cut));
    start = cut + 1;
  }
  parts.push(text.slice(start));
  return parts;
}
|
|
62
|
+
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
// Parse SkillsBench directory
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
/**
 * Load every task found under `<dirPath>/tasks/`.
 *
 * A task is a sub-directory containing a non-empty `instruction.md` (the query
 * text). An optional `task.toml` next to it supplies metadata; missing or
 * invalid fields fall back to category "general" and difficulty "medium".
 *
 * Tasks are returned sorted by directory name so the output does not depend on
 * the platform's directory-listing order. A task whose files cannot be read is
 * reported on stderr and skipped instead of aborting the whole import.
 *
 * @param dirPath Root of the SkillsBench corpus (the directory that contains `tasks/`).
 * @returns Parsed tasks, or an empty array when `tasks/` is missing or unreadable.
 */
export function parseSkillsBenchDir(dirPath: string): SkillsBenchTask[] {
  const tasksDir = join(dirPath, "tasks");
  if (!existsSync(tasksDir)) return [];

  let taskIds: string[];
  try {
    taskIds = readdirSync(tasksDir, { withFileTypes: true })
      .filter((entry) => entry.isDirectory())
      .map((entry) => entry.name)
      .sort();
  } catch {
    return [];
  }

  const difficultyLevels: readonly string[] = ["easy", "medium", "hard"];
  const tasks: SkillsBenchTask[] = [];

  for (const taskId of taskIds) {
    const taskDir = join(tasksDir, taskId);
    const instructionPath = join(taskDir, "instruction.md");
    if (!existsSync(instructionPath)) continue;

    let query: string;
    let metadata: Record<string, unknown> = {};
    try {
      query = readFileSync(instructionPath, "utf-8").trim();
      if (!query) continue;

      // Parse optional task.toml
      const tomlPath = join(taskDir, "task.toml");
      if (existsSync(tomlPath)) {
        metadata = parseSimpleToml(readFileSync(tomlPath, "utf-8"));
      }
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`[WARN] Skipping task '${taskId}': ${reason}`);
      continue;
    }

    // Metadata values are `unknown`; check them at runtime instead of casting,
    // otherwise an array-valued field would flow into string-only code paths.
    const rawDifficulty = metadata.difficulty;
    const task: SkillsBenchTask = {
      task_id: taskId,
      category: nonEmptyString(metadata.category) ?? "general",
      query,
      difficulty:
        typeof rawDifficulty === "string" && difficultyLevels.includes(rawDifficulty)
          ? (rawDifficulty as SkillsBenchTask["difficulty"])
          : "medium",
    };

    const expectedSkill = nonEmptyString(metadata.expected_skill);
    if (expectedSkill !== undefined) task.expected_skill = expectedSkill;

    const expectedTools = stringItems(metadata.expected_tools);
    if (expectedTools !== undefined) task.expected_tools = expectedTools;

    const tags = stringItems(metadata.tags);
    if (tags !== undefined) task.tags = tags;

    tasks.push(task);
  }

  return tasks;
}

/** Returns `value` when it is a non-empty string, otherwise `undefined`. */
function nonEmptyString(value: unknown): string | undefined {
  return typeof value === "string" && value !== "" ? value : undefined;
}

/** Returns the string members of `value` when it is an array, otherwise `undefined`. */
function stringItems(value: unknown): string[] | undefined {
  if (!Array.isArray(value)) return undefined;
  return value.filter((item): item is string => typeof item === "string");
}
|
|
123
|
+
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
// Convert tasks to EvalEntries
|
|
126
|
+
// ---------------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
/**
 * Turn SkillsBench tasks into positive eval entries for one skill.
 *
 * - "exact": a task matches when its `expected_skill` equals `targetSkill`
 *   (case-sensitive).
 * - "fuzzy": a task matches when `targetSkill` and any of category,
 *   expected_skill or tags contain one another, ignoring case. A blank
 *   `targetSkill` matches nothing — an empty string is a substring of every
 *   field and would otherwise select the whole corpus.
 *
 * @param tasks Parsed SkillsBench tasks.
 * @param targetSkill Skill name to build the eval set for.
 * @param matchStrategy Matching mode; defaults to "exact".
 * @returns One `{ query, should_trigger: true }` entry per matching task, in input order.
 */
export function convertToEvalEntries(
  tasks: SkillsBenchTask[],
  targetSkill: string,
  matchStrategy: "exact" | "fuzzy" = "exact",
): EvalEntry[] {
  const needle = targetSkill.toLowerCase();

  const isMatch = (task: SkillsBenchTask): boolean => {
    if (matchStrategy === "exact") {
      return task.expected_skill === targetSkill;
    }
    if (needle.trim() === "") return false;

    const fields: unknown[] = [task.category, task.expected_skill, ...(task.tags ?? [])];
    return fields.some((field) => {
      // Skip anything that is not a usable string rather than crash on it.
      if (typeof field !== "string" || field === "") return false;
      const candidate = field.toLowerCase();
      return candidate.includes(needle) || needle.includes(candidate);
    });
  };

  const entries: EvalEntry[] = [];
  for (const task of tasks) {
    if (isMatch(task)) {
      entries.push({ query: task.query, should_trigger: true });
    }
  }
  return entries;
}
|
|
160
|
+
|
|
161
|
+
// ---------------------------------------------------------------------------
|
|
162
|
+
// CLI entry point
|
|
163
|
+
// ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
/**
 * CLI entry point: import a SkillsBench corpus and write an eval-set JSON file.
 *
 * Flags:
 *   --dir <path>             corpus root containing `tasks/` (required)
 *   --skill <name>           target skill name (required)
 *   --output / --out <path>  destination file (default `<skill>_skillsbench_eval.json`)
 *   --match-strategy <mode>  "exact" (default) or "fuzzy"
 *
 * Exits with status 1 when a required flag is missing or no tasks are found.
 * When tasks exist but none match, a warning is printed and an empty array is
 * still written to the output file.
 */
export function cliMain(): void {
  const { values } = parseArgs({
    options: {
      dir: { type: "string" },
      skill: { type: "string" },
      output: { type: "string" },
      // Alias for --output, matching the flag the hooks-to-evals CLI accepts.
      out: { type: "string" },
      "match-strategy": { type: "string", default: "exact" },
    },
    strict: true,
  });

  if (!values.dir) {
    console.error("[ERROR] --dir required (path to SkillsBench corpus directory)");
    process.exit(1);
  }

  if (!values.skill) {
    console.error("[ERROR] --skill required (target skill name)");
    process.exit(1);
  }

  const requestedStrategy = values["match-strategy"] ?? "exact";
  if (requestedStrategy !== "exact" && requestedStrategy !== "fuzzy") {
    // Still fall back to exact matching, but say so: a typo should not
    // silently change which tasks end up in the eval set.
    console.error(
      `[WARN] Unknown --match-strategy '${requestedStrategy}' (expected 'exact' or 'fuzzy'); using 'exact'.`,
    );
  }
  const matchStrategy = requestedStrategy === "fuzzy" ? "fuzzy" : "exact";

  const tasks = parseSkillsBenchDir(values.dir);

  if (tasks.length === 0) {
    console.error(`[WARN] No tasks found in ${values.dir}/tasks/`);
    console.error("Expected structure: <dir>/tasks/<task-id>/instruction.md");
    process.exit(1);
  }

  console.log(`Parsed ${tasks.length} tasks from ${values.dir}`);

  const entries = convertToEvalEntries(tasks, values.skill, matchStrategy);

  if (entries.length === 0) {
    console.log(
      `[WARN] No tasks matched skill '${values.skill}' with strategy '${matchStrategy}'.`,
    );
    console.log("Available expected_skills:");
    const skills = [...new Set(tasks.map((t) => t.expected_skill).filter(Boolean))].sort();
    for (const s of skills) {
      console.log(` ${s}`);
    }
    if (matchStrategy === "exact") {
      console.log("\nTip: try --match-strategy fuzzy for keyword-based matching.");
    }
  }

  const outputPath = values.output ?? values.out ?? `${values.skill}_skillsbench_eval.json`;
  writeFileSync(outputPath, JSON.stringify(entries, null, 2), "utf-8");
  console.log(`Wrote ${entries.length} eval entries to ${outputPath}`);
}
|
|
218
|
+
|
|
219
|
+
if (import.meta.main) {
|
|
220
|
+
cliMain();
|
|
221
|
+
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* synthetic-evals.ts
|
|
3
|
+
*
|
|
4
|
+
* Generates eval queries from a SKILL.md using an LLM, without requiring
|
|
5
|
+
* real session logs. Solves the cold-start problem for new skills that
|
|
6
|
+
* have no telemetry data yet.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { readFileSync } from "node:fs";
|
|
10
|
+
|
|
11
|
+
import type { EvalEntry, InvocationType } from "../types.js";
|
|
12
|
+
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
13
|
+
import { classifyInvocation } from "./hooks-to-evals.js";
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Types
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
/** Optional settings accepted by `generateSyntheticEvals`. */
export interface SyntheticEvalOptions {
  /** Number of should-trigger queries to request from the model. */
  maxPositives?: number;
  /** Number of should-not-trigger queries to request from the model. */
  maxNegatives?: number;
  /** Forwarded unchanged to `callLlm` as its model argument. */
  modelFlag?: string;
}
|
|
24
|
+
|
|
25
|
+
/** Shape of one array element the model is asked to emit, before validation. */
interface RawSyntheticEntry {
  /** Query text as written by the model. */
  query: string;
  /** Whether the query should trigger the skill. */
  should_trigger: boolean;
  /** Category label suggested by the model. */
  invocation_type?: string;
}
|
|
30
|
+
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Prompt building
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
/**
 * Build the system and user prompts that ask an LLM for synthetic eval queries.
 *
 * The requested counts are normalized before being written into the prompt:
 * fractional values are floored, and negative or non-finite values (NaN,
 * Infinity) become 0, so the model is never asked for "NaN" or "-3" queries.
 *
 * @param skillContent Full text of the skill's SKILL.md.
 * @param skillName Skill name; also shown in the `$<name>` explicit-syntax hint.
 * @param maxPositives Number of should-trigger queries to request.
 * @param maxNegatives Number of should-not-trigger queries to request.
 * @returns The `{ system, user }` prompt pair.
 */
export function buildSyntheticPrompt(
  skillContent: string,
  skillName: string,
  maxPositives: number,
  maxNegatives: number,
): { system: string; user: string } {
  const toCount = (value: number): number =>
    Number.isFinite(value) && value > 0 ? Math.floor(value) : 0;
  const positiveCount = toCount(maxPositives);
  const negativeCount = toCount(maxNegatives);

  // `$${skillName}` renders as a literal "$" followed by the skill name.
  const system = `You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries.

For POSITIVE queries (should trigger this skill):
- Generate a mix of:
- Explicit: directly names the skill or uses $${skillName} syntax
- Implicit: describes the task without naming the skill
- Contextual: natural language with domain context, proper nouns, dates, filenames
- Vary phrasing, formality, and specificity

For NEGATIVE queries (should NOT trigger this skill):
- Queries that are topically adjacent but wrong intent
- Queries for different skills that share keywords
- Generic queries unrelated to this skill

Output as JSON array with no surrounding text:
[{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]`;

  const user = `Skill name: ${skillName}

Skill content:
${skillContent}

Generate exactly ${positiveCount} positive queries (should_trigger: true) and ${negativeCount} negative queries (should_trigger: false). Return ONLY the JSON array.`;

  return { system, user };
}
|
|
67
|
+
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
// Response parsing
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
/**
 * Extract validated eval entries from a raw LLM response.
 *
 * Steps:
 *  1. Strip a leading markdown fence (LF or CRLF) and the last closing fence.
 *  2. Locate the JSON array. Each "[" is tried in turn as the array start
 *     (always ending at the last "]"), so a preamble such as
 *     "Here are [2] queries:" no longer makes parsing fail.
 *  3. Keep elements with a string `query` and boolean `should_trigger`; trim
 *     the query, drop empty ones, and drop repeats of an earlier query.
 *  4. Label positives with `classifyInvocation` (the model's own
 *     `invocation_type` is ignored) and negatives with "negative".
 *
 * @param raw Text returned by the model.
 * @param skillName Skill name passed to `classifyInvocation`.
 * @returns Entries in response order.
 * @throws Error when no parseable JSON array is found in the response.
 */
export function parseSyntheticResponse(raw: string, skillName: string): EvalEntry[] {
  let text = raw.trim();

  // Strip markdown fences manually for array-first JSON
  // (stripMarkdownFences slices to first '{' which breaks '[' arrays)
  const fenceMatch = text.match(/^```\w*\r?\n/);
  if (fenceMatch) {
    text = text.slice(fenceMatch[0].length);
    const closingIdx = text.lastIndexOf("```");
    if (closingIdx >= 0) {
      text = text.slice(0, closingIdx);
    }
    text = text.trim();
  }

  if (!text.includes("[")) {
    // No array found — try stripMarkdownFences as fallback for edge cases
    const cleaned = stripMarkdownFences(raw);
    if (!cleaned.includes("[")) {
      throw new Error(`Failed to parse synthetic eval response as JSON: ${text.slice(0, 200)}`);
    }
    text = cleaned;
  }

  // Everything after the last "]" is trailing chatter.
  const endIdx = text.lastIndexOf("]");
  let parsed: unknown;
  let parsedOk = false;
  let firstCandidate = "";

  for (let start = text.indexOf("["); start !== -1; start = text.indexOf("[", start + 1)) {
    if (endIdx !== -1 && endIdx < start) break;
    const candidate = endIdx === -1 ? text.slice(start) : text.slice(start, endIdx + 1);
    if (!firstCandidate) firstCandidate = candidate;
    try {
      parsed = JSON.parse(candidate);
      parsedOk = true;
      break;
    } catch {
      // Not valid JSON from this "[" — try the next one.
    }
  }

  if (!parsedOk) {
    throw new Error(
      `Failed to parse synthetic eval response as JSON: ${firstCandidate.slice(0, 200)}`,
    );
  }

  if (!Array.isArray(parsed)) {
    throw new Error("Synthetic eval response is not a JSON array");
  }

  const result: EvalEntry[] = [];
  const seenQueries = new Set<string>();

  for (const item of parsed as unknown[]) {
    if (typeof item !== "object" || item === null) continue;

    const entry = item as Partial<RawSyntheticEntry>;
    if (typeof entry.query !== "string" || typeof entry.should_trigger !== "boolean") {
      continue;
    }

    const query = entry.query.trim();
    if (!query || seenQueries.has(query)) continue;
    seenQueries.add(query);

    // For positives, use classifyInvocation to verify/override the LLM's type
    const invocationType: InvocationType = entry.should_trigger
      ? classifyInvocation(query, skillName)
      : "negative";

    result.push({
      query,
      should_trigger: entry.should_trigger,
      invocation_type: invocationType,
    });
  }

  return result;
}
|
|
147
|
+
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
// Main entry point
|
|
150
|
+
// ---------------------------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
/**
 * Generate synthetic eval entries for a skill by prompting an LLM with its SKILL.md.
 *
 * Reads the skill file, builds the prompt, calls the model, parses the reply,
 * and then enforces the requested limits: at most `maxPositives` should-trigger
 * and `maxNegatives` should-not-trigger entries are returned, keeping the
 * earliest ones. The prompt only *asks* for exact counts, so a model that
 * over-generates would otherwise exceed the caller's limits.
 *
 * @param skillPath Path to the skill's SKILL.md.
 * @param skillName Skill name used in the prompt and for invocation classification.
 * @param agent Agent identifier passed to `callLlm`.
 * @param options Limits (defaults: 15 positives, 10 negatives) and optional model flag.
 * @returns Parsed entries in the order the model produced them.
 * @throws If the skill file cannot be read, the LLM call fails, or the reply is not a JSON array.
 */
export async function generateSyntheticEvals(
  skillPath: string,
  skillName: string,
  agent: string,
  options: SyntheticEvalOptions = {},
): Promise<EvalEntry[]> {
  const maxPositives = options.maxPositives ?? 15;
  const maxNegatives = options.maxNegatives ?? 10;

  const skillContent = readFileSync(skillPath, "utf-8");

  const { system, user } = buildSyntheticPrompt(
    skillContent,
    skillName,
    maxPositives,
    maxNegatives,
  );

  const raw = await callLlm(system, user, agent, options.modelFlag);
  const parsed = parseSyntheticResponse(raw, skillName);

  // A non-finite limit (NaN / Infinity) cannot be compared meaningfully, so it
  // leaves that side uncapped rather than discarding every entry.
  const underLimit = (count: number, limit: number): boolean =>
    !Number.isFinite(limit) || count < limit;

  const capped: EvalEntry[] = [];
  let positives = 0;
  let negatives = 0;
  for (const entry of parsed) {
    if (entry.should_trigger) {
      if (!underLimit(positives, maxPositives)) continue;
      positives++;
    } else {
      if (!underLimit(negatives, maxNegatives)) continue;
      negatives++;
    }
    capped.push(entry);
  }

  return capped;
}
|