selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +37 -0
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +25 -3
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +466 -103
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +19 -2
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +138 -18
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  51. package/cli/selftune/init.ts +150 -3
  52. package/cli/selftune/memory/writer.ts +447 -0
  53. package/cli/selftune/monitoring/watch.ts +25 -2
  54. package/cli/selftune/status.ts +17 -13
  55. package/cli/selftune/types.ts +377 -5
  56. package/cli/selftune/utils/frontmatter.ts +217 -0
  57. package/cli/selftune/utils/llm-call.ts +29 -3
  58. package/cli/selftune/utils/transcript.ts +35 -0
  59. package/cli/selftune/utils/trigger-check.ts +89 -0
  60. package/cli/selftune/utils/tui.ts +156 -0
  61. package/dashboard/index.html +569 -8
  62. package/package.json +8 -4
  63. package/skill/SKILL.md +124 -8
  64. package/skill/Workflows/AutoActivation.md +144 -0
  65. package/skill/Workflows/Badge.md +118 -0
  66. package/skill/Workflows/Baseline.md +121 -0
  67. package/skill/Workflows/Composability.md +100 -0
  68. package/skill/Workflows/Contribute.md +91 -0
  69. package/skill/Workflows/Cron.md +155 -0
  70. package/skill/Workflows/Dashboard.md +203 -0
  71. package/skill/Workflows/Doctor.md +37 -1
  72. package/skill/Workflows/Evals.md +69 -1
  73. package/skill/Workflows/EvolutionMemory.md +152 -0
  74. package/skill/Workflows/Evolve.md +111 -6
  75. package/skill/Workflows/EvolveBody.md +159 -0
  76. package/skill/Workflows/ImportSkillsBench.md +111 -0
  77. package/skill/Workflows/Ingest.md +117 -3
  78. package/skill/Workflows/Initialize.md +57 -3
  79. package/skill/Workflows/Replay.md +70 -0
  80. package/skill/Workflows/Rollback.md +20 -1
  81. package/skill/Workflows/UnitTest.md +138 -0
  82. package/skill/Workflows/Watch.md +22 -0
  83. package/skill/settings_snippet.json +23 -0
  84. package/templates/activation-rules-default.json +27 -0
  85. package/templates/multi-skill-settings.json +64 -0
  86. package/templates/single-skill-settings.json +58 -0
@@ -0,0 +1,247 @@
1
+ /**
2
+ * baseline.ts
3
+ *
4
+ * Measures the value a skill adds over a no-skill baseline.
5
+ *
6
+ * Runs trigger checks against an EMPTY string description (no-skill baseline)
7
+ * and against the current description (with-skill), then computes lift.
8
+ * A skill "adds value" when lift >= 0.05 (5 percentage points).
9
+ */
10
+
11
+ import { parseArgs } from "node:util";
12
+
13
+ import type { BaselineResult, EvalEntry } from "../types.js";
14
+ import { callLlm } from "../utils/llm-call.js";
15
+ import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
16
+
17
+ // ---------------------------------------------------------------------------
18
+ // Types
19
+ // ---------------------------------------------------------------------------
20
+
21
/** Inputs accepted by measureBaseline(). */
export interface BaselineOptions {
  /** Eval entries to check; each entry's `query` and `should_trigger` are read. */
  evalSet: EvalEntry[];
  /** Description text used for the with-skill trigger check. */
  skillDescription: string;
  /** Skill name copied into the measurement and into every per-entry result. */
  skillName: string;
  /** Agent identifier forwarded to the LLM caller. */
  agent: string;
  /** Optional model flag forwarded to the LLM caller. */
  modelFlag?: string;
}
28
+
29
/** Aggregate result returned by measureBaseline(). */
export interface BaselineMeasurement {
  /** Name of the measured skill. */
  skill_name: string;
  /** Fraction of eval entries that passed when checked against an empty description. */
  baseline_pass_rate: number;
  /** Fraction of eval entries that passed when checked against the skill description. */
  with_skill_pass_rate: number;
  /** With-skill pass rate minus baseline pass rate. */
  lift: number;
  /** True when lift is at least LIFT_THRESHOLD. */
  adds_value: boolean;
  /** Individual check results: for each eval entry, the baseline result followed by the with-skill result. */
  per_entry: BaselineResult[];
  /** ISO-8601 timestamp taken when the measurement object was built. */
  measured_at: string;
}
38
+
39
/**
 * Injectable dependencies for measureBaseline(). When omitted, the real
 * module imports are used. Pass overrides in tests to avoid real LLM calls.
 */
export interface BaselineDeps {
  /** Replacement for the imported callLlm; invoked as (systemPrompt, userPrompt, agent, modelFlag). */
  callLlm?: typeof callLlm;
}
46
+
47
+ // ---------------------------------------------------------------------------
48
+ // Constants
49
+ // ---------------------------------------------------------------------------
50
+
51
/** Minimum lift (with-skill pass rate minus baseline pass rate) for a skill to count as adding value. */
const LIFT_THRESHOLD = 0.05;
/** System prompt sent with every trigger-check LLM call. */
const SYSTEM_PROMPT = "You are an evaluation assistant. Answer only YES or NO.";
53
+
54
+ // ---------------------------------------------------------------------------
55
+ // Core measurement
56
+ // ---------------------------------------------------------------------------
57
+
58
/**
 * Measure baseline vs. with-skill trigger accuracy across an eval set.
 *
 * Each eval entry is checked twice, sequentially: first against an empty
 * description (the no-skill baseline), then against `options.skillDescription`.
 * A check passes when the parsed trigger decision agrees with
 * `entry.should_trigger`. Both results are appended to `per_entry`, baseline
 * first.
 *
 * @param options - Eval set, skill description/name, agent and optional model flag.
 * @param _deps - Optional overrides; `callLlm` replaces the imported LLM caller.
 * @returns Pass rates, lift (with-skill minus baseline) and whether the lift
 *   reaches LIFT_THRESHOLD. An empty eval set yields all-zero rates and
 *   `adds_value: false` without calling the LLM.
 */
export async function measureBaseline(
  options: BaselineOptions,
  _deps: BaselineDeps = {},
): Promise<BaselineMeasurement> {
  const { evalSet, skillDescription, skillName, agent, modelFlag } = options;
  const _callLlm = _deps.callLlm ?? callLlm;

  if (evalSet.length === 0) {
    return {
      skill_name: skillName,
      baseline_pass_rate: 0,
      with_skill_pass_rate: 0,
      lift: 0,
      adds_value: false,
      per_entry: [],
      measured_at: new Date().toISOString(),
    };
  }

  const perEntry: BaselineResult[] = [];

  // Runs one trigger check, records it in perEntry and reports whether it passed.
  const runCheck = async (
    entry: EvalEntry,
    description: string,
    withSkill: boolean,
  ): Promise<boolean> => {
    const prompt = buildTriggerCheckPrompt(description, entry.query);
    const raw = await _callLlm(SYSTEM_PROMPT, prompt, agent, modelFlag);
    const triggered = parseTriggerResponse(raw);
    const pass = (entry.should_trigger && triggered) || (!entry.should_trigger && !triggered);

    perEntry.push({
      skill_name: skillName,
      query: entry.query,
      with_skill: withSkill,
      triggered,
      pass,
      measured_at: new Date().toISOString(),
    });
    return pass;
  };

  let baselinePassed = 0;
  let withSkillPassed = 0;

  for (const entry of evalSet) {
    // Baseline check (empty description) always precedes the with-skill check.
    if (await runCheck(entry, "", false)) baselinePassed++;
    if (await runCheck(entry, skillDescription, true)) withSkillPassed++;
  }

  const total = evalSet.length;
  const baselinePassRate = baselinePassed / total;
  const withSkillPassRate = withSkillPassed / total;
  // Derive lift from the integer pass counts. Subtracting the two rates loses
  // precision (0.35 - 0.3 === 0.04999999999999999), which wrongly rejected a
  // lift of exactly LIFT_THRESHOLD.
  const lift = (withSkillPassed - baselinePassed) / total;

  return {
    skill_name: skillName,
    baseline_pass_rate: baselinePassRate,
    with_skill_pass_rate: withSkillPassRate,
    lift,
    adds_value: lift >= LIFT_THRESHOLD,
    per_entry: perEntry,
    measured_at: new Date().toISOString(),
  };
}
136
+
137
+ // ---------------------------------------------------------------------------
138
+ // CLI entry point
139
+ // ---------------------------------------------------------------------------
140
+
141
/**
 * CLI entry point for `selftune baseline`.
 *
 * Parses --skill, --skill-path, --eval-set, --agent and --help, loads the
 * skill file and the eval set (from --eval-set, or built from the query and
 * skill logs when the flag is omitted), resolves the agent CLI, runs
 * measureBaseline() and prints the result as JSON.
 *
 * Exits 0 when the skill adds value and 1 otherwise, including on any
 * argument, file or agent error.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      "eval-set": { type: "string" },
      agent: { type: "string" },
      help: { type: "boolean", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(`selftune baseline — Measure skill value vs. no-skill baseline

Usage:
selftune baseline --skill <name> --skill-path <path> [options]

Options:
--skill Skill name (required)
--skill-path Path to SKILL.md (required)
--eval-set Path to eval set JSON (optional, builds from logs if omitted)
--agent Agent CLI to use (claude, codex, opencode)
--help Show this help message`);
    process.exit(0);
  }

  if (!values.skill || !values["skill-path"]) {
    console.error("[ERROR] --skill and --skill-path are required");
    process.exit(1);
  }

  const { existsSync, readFileSync } = await import("node:fs");

  // Read skill description
  const skillPath = values["skill-path"];
  if (!existsSync(skillPath)) {
    console.error(`[ERROR] SKILL.md not found at ${skillPath}`);
    process.exit(1);
  }
  const skillDescription = readFileSync(skillPath, "utf-8");

  // Load eval set
  let evalSet: EvalEntry[];
  const evalSetPath = values["eval-set"];
  if (evalSetPath) {
    // An explicitly requested eval set must exist; silently falling back to
    // logs would measure a different data set than the caller asked for.
    if (!existsSync(evalSetPath)) {
      console.error(`[ERROR] Eval set not found at ${evalSetPath}`);
      process.exit(1);
    }
    const parsed: unknown = JSON.parse(readFileSync(evalSetPath, "utf-8"));
    if (!Array.isArray(parsed)) {
      console.error(`[ERROR] Eval set at ${evalSetPath} must be a JSON array`);
      process.exit(1);
    }
    evalSet = parsed as EvalEntry[];
  } else {
    // Build from logs
    const { QUERY_LOG, SKILL_LOG } = await import("../constants.js");
    const { readJsonl } = await import("../utils/jsonl.js");
    const { buildEvalSet } = await import("./hooks-to-evals.js");
    const skillRecords = readJsonl(SKILL_LOG);
    const queryRecords = readJsonl(QUERY_LOG);
    evalSet = buildEvalSet(skillRecords, queryRecords, values.skill);
  }

  // Detect agent
  const { detectAgent } = await import("../utils/llm-call.js");
  const requestedAgent = values.agent;
  if (requestedAgent && !Bun.which(requestedAgent)) {
    console.error(
      JSON.stringify({
        level: "error",
        code: "agent_not_in_path",
        message: `Agent CLI '${requestedAgent}' not found in PATH.`,
        action: "Install it or omit --agent to use auto-detection.",
      }),
    );
    process.exit(1);
  }
  const agent = requestedAgent ?? detectAgent();
  if (!agent) {
    console.error(
      JSON.stringify({
        level: "error",
        code: "agent_not_found",
        message: "No agent CLI (claude/codex/opencode) found in PATH.",
        action: "Install Claude Code, Codex, or OpenCode.",
      }),
    );
    process.exit(1);
  }

  const result = await measureBaseline({
    evalSet,
    skillDescription,
    skillName: values.skill,
    agent,
  });

  console.log(JSON.stringify(result, null, 2));
  // Exit status mirrors the verdict so the command can gate scripts.
  process.exit(result.adds_value ? 0 : 1);
}
235
+
236
// Run the CLI only when this module is the program entry point; any
// rejection from cliMain() is reported as a JSON "fatal" record and exits 1.
if (import.meta.main) {
  const reportFatal = (err: unknown): never => {
    const isError = err instanceof Error;
    const record = {
      level: "fatal",
      message: isError ? err.message : String(err),
      stack: isError ? err.stack : undefined,
    };
    console.error(JSON.stringify(record));
    process.exit(1);
  };

  cliMain().catch(reportFatal);
}
@@ -0,0 +1,117 @@
1
+ /**
2
+ * composability.ts
3
+ *
4
+ * Analyzes co-occurrence patterns between skills in session telemetry
5
+ * to detect composability conflicts. A conflict is flagged when two
6
+ * skills used together produce more errors than either skill used alone.
7
+ *
8
+ * Pure function -- no I/O. CLI wrapper handles reading JSONL.
9
+ */
10
+
11
+ import type { ComposabilityReport, CoOccurrencePair, SessionTelemetryRecord } from "../types.js";
12
+
13
/**
 * Restrict `value` to the range [min, max]: the upper bound is applied
 * first, then the lower bound.
 */
function clamp(value: number, min: number, max: number): number {
  const capped = Math.min(max, value);
  const floored = Math.max(min, capped);
  return floored;
}
19
+
20
/**
 * Analyze composability of a target skill against all co-occurring skills.
 *
 * For every skill that appears in a session together with `skillName`, the
 * average error count of the joint sessions is compared with a baseline: the
 * larger of (a) the average for sessions where the target skill ran alone and
 * (b) the average for sessions that used the other skill without the target.
 * conflict_score = clamp((together - baseline) / (baseline + 1), 0, 1); a
 * pair is flagged as a conflict when the score exceeds 0.3.
 *
 * @param skillName - The skill to analyze
 * @param telemetry - All session telemetry records; entries without a
 *   `skills_triggered` array are ignored
 * @param window - Optional: only consider the N most recent sessions (by timestamp)
 * @returns ComposabilityReport with co-occurrence pairs (sorted by
 *   co-occurrence count, descending) and conflict detection
 */
export function analyzeComposability(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
  window?: number,
): ComposabilityReport {
  // filter() yields a fresh array, so the in-place sort below never reorders
  // the caller's telemetry.
  let sessions = telemetry.filter((r) => r && Array.isArray(r.skills_triggered));

  // Apply window: sort by timestamp descending, keep the first N (most recent)
  if (window && window > 0) {
    sessions = sessions
      .sort((a, b) => (b.timestamp ?? "").localeCompare(a.timestamp ?? ""))
      .slice(0, window);
  }

  // Mean of errors_encountered over `records`, or `fallback` when there are none.
  const averageErrors = (records: SessionTelemetryRecord[], fallback: number): number =>
    records.length > 0
      ? records.reduce((sum, r) => sum + (r.errors_encountered ?? 0), 0) / records.length
      : fallback;

  // Sessions where the target skill was triggered
  const skillSessions = sessions.filter((r) => r.skills_triggered.includes(skillName));

  // Sessions where the target skill was triggered ALONE (no other skills)
  const aloneSessions = skillSessions.filter((r) => r.skills_triggered.length === 1);
  const errorsAlone = averageErrors(aloneSessions, 0);

  // Find all co-occurring skills
  const coSkills = new Set<string>();
  for (const r of skillSessions) {
    for (const s of r.skills_triggered) {
      if (s !== skillName) coSkills.add(s);
    }
  }

  // For each co-occurring skill, compute conflict score
  const pairs: CoOccurrencePair[] = [];
  for (const coSkill of coSkills) {
    // Sessions where BOTH skills are triggered together
    const togetherSessions = skillSessions.filter((r) => r.skills_triggered.includes(coSkill));
    const errorsTogether = averageErrors(togetherSessions, 0);

    // Baseline should consider BOTH skills alone to avoid false positives.
    // Without any co-skill-only session, fall back to the target's own average.
    const coSkillAloneSessions = sessions.filter(
      (r) => r.skills_triggered.includes(coSkill) && !r.skills_triggered.includes(skillName),
    );
    const errorsCoSkillAlone = averageErrors(coSkillAloneSessions, errorsAlone);
    const baselineAlone = Math.max(errorsAlone, errorsCoSkillAlone);

    // conflict_score = clamp((errors_together - baseline) / (baseline + 1), 0, 1)
    const conflictScore = clamp((errorsTogether - baselineAlone) / (baselineAlone + 1), 0, 1);
    const conflictDetected = conflictScore > 0.3;

    const pair: CoOccurrencePair = {
      skill_a: skillName,
      skill_b: coSkill,
      co_occurrence_count: togetherSessions.length,
      conflict_detected: conflictDetected,
    };

    if (conflictDetected) {
      // Report the baseline the score was actually computed against; printing
      // errorsAlone here could contradict the score when the co-skill's own
      // average is the larger baseline.
      pair.conflict_reason = `conflict_score=${conflictScore.toFixed(3)} (avg errors together=${errorsTogether.toFixed(1)} vs alone=${baselineAlone.toFixed(1)})`;
    }

    pairs.push(pair);
  }

  // Sort by co-occurrence count descending for readability
  pairs.sort((a, b) => b.co_occurrence_count - a.co_occurrence_count);

  return {
    pairs,
    total_sessions_analyzed: skillSessions.length,
    conflict_count: pairs.filter((p) => p.conflict_detected).length,
    generated_at: new Date().toISOString(),
  };
}
@@ -0,0 +1,143 @@
1
+ /**
2
+ * Skill unit test generator.
3
+ *
4
+ * Uses an LLM to generate unit test cases from skill content and eval failures.
5
+ * Tests are output as SkillUnitTest[] JSON arrays.
6
+ */
7
+
8
+ import type { EvalEntry, SkillUnitTest } from "../types.js";
9
+
10
+ // Note: we don't use stripMarkdownFences from llm-call.ts because it
11
+ // assumes JSON objects (looks for `{`), but we return JSON arrays.
12
+
13
/**
 * Strip markdown fences and surrounding chatter, returning the JSON array text.
 *
 * A response that is entirely one fenced block is unwrapped first. The result
 * is then cut down to the span from the first `[` to the last `]`, so a
 * preamble before the array and any closing fence or commentary after it are
 * both removed. Text without `[` is returned trimmed but otherwise unchanged;
 * text with `[` but no later `]` is returned from the `[` onwards.
 */
function stripArrayFences(raw: string): string {
  let text = raw.trim();

  // Strip markdown code fences (accepting CRLF line endings as well)
  const fenceMatch = text.match(/^```\w*\r?\n([\s\S]*?)\r?\n```$/);
  if (fenceMatch?.[1] !== undefined) {
    text = fenceMatch[1].trim();
  }

  // Find first [ in case there's preamble text
  const start = text.indexOf("[");
  if (start < 0) {
    return text;
  }

  // Drop anything after the last ] (closing fence, trailing explanation)
  const end = text.lastIndexOf("]");
  return end > start ? text.slice(start, end + 1) : text.slice(start);
}
31
+
32
+ // ---------------------------------------------------------------------------
33
+ // LLM caller type (injectable for testing)
34
+ // ---------------------------------------------------------------------------
35
+
36
/**
 * Function that sends a system prompt and a user prompt to an LLM and
 * resolves with the raw text response. Supplied by the caller of
 * generateUnitTests(), which makes the LLM replaceable in tests.
 */
export type LlmCaller = (systemPrompt: string, userPrompt: string) => Promise<string>;
37
+
38
+ // ---------------------------------------------------------------------------
39
+ // Prompt building
40
+ // ---------------------------------------------------------------------------
41
+
42
/**
 * System prompt for unit-test generation: describes the required shape of
 * each test object and instructs the model to answer with a bare JSON array.
 */
const SYSTEM_PROMPT = `You are a test engineer generating skill unit tests.
Given a skill name, its content/description, and optionally some eval failures,
generate unit test cases as a JSON array of objects.

Each test object must have:
- id: unique string (e.g. "gen-1", "gen-2")
- skill_name: the skill name provided
- query: a user query that would test this skill
- assertions: array of assertion objects, each with:
- type: one of "contains", "not_contains", "regex", "tool_called", "tool_not_called", "json_path"
- value: the value to check for
- description: (optional) human-readable description of what this checks
- tags: (optional) array of tag strings like ["generated", "smoke"]

Focus on:
1. Covering different invocation patterns (explicit, implicit, contextual)
2. Testing edge cases from eval failures if provided
3. Verifying expected tools are called
4. Checking output contains expected content

Respond with ONLY a JSON array. No explanation.`;
63
+
64
/**
 * Assemble the user prompt for test generation.
 *
 * The prompt lists the skill name and content, then (only when failures are
 * supplied) one line per failed eval query, then a pretty-printed example
 * test case and the closing instruction. Sections are joined with newlines.
 */
export function buildGenerationPrompt(
  skillName: string,
  skillContent: string,
  evalFailures: EvalEntry[],
): string {
  const header: string[] = [`Skill name: ${skillName}`, "", "Skill content:", skillContent, ""];

  const failureSection: string[] =
    evalFailures.length === 0
      ? []
      : [
          "Eval failures (queries that failed trigger checks):",
          ...evalFailures.map(
            (f) =>
              ` - query: "${f.query}" (should_trigger=${f.should_trigger}, type=${f.invocation_type ?? "unknown"})`,
          ),
          "",
        ];

  // Sample shown to the model so it mirrors the expected JSON structure.
  const exampleTests = [
    {
      id: "example-1",
      skill_name: skillName,
      query: "example query for this skill",
      assertions: [
        {
          type: "contains",
          value: "expected output",
          description: "checks for expected content",
        },
        { type: "tool_called", value: "Write", description: "verifies Write tool was used" },
      ],
      tags: ["generated"],
    },
  ];

  const footer: string[] = [
    "Example test case format:",
    JSON.stringify(exampleTests, null, 2),
    "",
    "Generate 5-10 diverse test cases covering the skill's functionality.",
  ];

  return [...header, ...failureSection, ...footer].join("\n");
}
111
+
112
+ // ---------------------------------------------------------------------------
113
+ // Generate unit tests
114
+ // ---------------------------------------------------------------------------
115
+
116
/**
 * Generate unit tests for a skill using an LLM.
 *
 * Builds the generation prompt, calls `llmCaller`, extracts the JSON array
 * from the response and fills in `skill_name` on entries that lack one.
 * Entries that are not plain objects (null, primitives, nested arrays) are
 * skipped with a warning so one malformed element does not discard the
 * whole batch.
 *
 * @returns The generated tests, or an empty array when the LLM call fails,
 *   the response is not valid JSON, or the parsed value is not an array.
 */
export async function generateUnitTests(
  skillName: string,
  skillContent: string,
  evalFailures: EvalEntry[],
  llmCaller: LlmCaller,
): Promise<SkillUnitTest[]> {
  try {
    const userPrompt = buildGenerationPrompt(skillName, skillContent, evalFailures);
    const raw = await llmCaller(SYSTEM_PROMPT, userPrompt);
    const cleaned = stripArrayFences(raw);

    const parsed: unknown = JSON.parse(cleaned);
    if (!Array.isArray(parsed)) {
      console.warn("[WARN] LLM did not return a JSON array for unit test generation");
      return [];
    }

    const tests: SkillUnitTest[] = [];
    for (const candidate of parsed as unknown[]) {
      if (candidate === null || typeof candidate !== "object" || Array.isArray(candidate)) {
        console.warn("[WARN] Skipping non-object entry in generated unit tests");
        continue;
      }
      const test = candidate as SkillUnitTest;
      // Ensure skill_name is set correctly on each test
      tests.push({ ...test, skill_name: test.skill_name || skillName });
    }
    return tests;
  } catch (err) {
    console.warn("[WARN] Failed to generate unit tests:", err);
    return [];
  }
}
@@ -26,7 +26,9 @@ import type {
26
26
  SkillUsageRecord,
27
27
  } from "../types.js";
28
28
  import { readJsonl } from "../utils/jsonl.js";
29
+ import { detectAgent } from "../utils/llm-call.js";
29
30
  import { seededShuffle } from "../utils/seeded-random.js";
31
+ import { generateSyntheticEvals } from "./synthetic-evals.js";
30
32
 
31
33
  // ---------------------------------------------------------------------------
32
34
  // Query truncation
@@ -359,7 +361,7 @@ export function printEvalStats(
359
361
  // CLI entry point
360
362
  // ---------------------------------------------------------------------------
361
363
 
362
- export function cliMain(): void {
364
+ export async function cliMain(): Promise<void> {
363
365
  const { values } = parseArgs({
364
366
  options: {
365
367
  skill: { type: "string" },
@@ -373,10 +375,71 @@ export function cliMain(): void {
373
375
  "skill-log": { type: "string", default: SKILL_LOG },
374
376
  "query-log": { type: "string", default: QUERY_LOG },
375
377
  "telemetry-log": { type: "string", default: TELEMETRY_LOG },
378
+ synthetic: { type: "boolean", default: false },
379
+ "skill-path": { type: "string" },
380
+ model: { type: "string" },
376
381
  },
377
382
  strict: true,
378
383
  });
379
384
 
385
+ // --- Synthetic mode: generate evals from SKILL.md via LLM ---
386
+ if (values.synthetic) {
387
+ if (!values.skill) {
388
+ console.error("[ERROR] --skill required with --synthetic");
389
+ process.exit(1);
390
+ }
391
+ if (!values["skill-path"]) {
392
+ console.error("[ERROR] --skill-path required with --synthetic");
393
+ process.exit(1);
394
+ }
395
+
396
+ const agent = detectAgent();
397
+ if (!agent) {
398
+ console.error("[ERROR] No agent CLI found (claude/codex/opencode). Install one first.");
399
+ process.exit(1);
400
+ }
401
+
402
+ const maxPerSide = Number.parseInt(values.max ?? "50", 10);
403
+ const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
404
+
405
+ console.log(`Generating synthetic evals for skill '${values.skill}'...`);
406
+ const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
407
+ maxPositives: effectiveMax,
408
+ maxNegatives: effectiveMax,
409
+ modelFlag: values.model,
410
+ });
411
+
412
+ const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
413
+ writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
414
+
415
+ const pos = evalSet.filter((e) => e.should_trigger);
416
+ const neg = evalSet.filter((e) => !e.should_trigger);
417
+
418
+ console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
419
+ console.log(` Positives (should_trigger=true) : ${pos.length}`);
420
+ console.log(` Negatives (should_trigger=false): ${neg.length}`);
421
+
422
+ if (pos.length > 0) {
423
+ const types = new Map<string, number>();
424
+ for (const e of pos) {
425
+ const t = e.invocation_type ?? "?";
426
+ types.set(t, (types.get(t) ?? 0) + 1);
427
+ }
428
+ console.log("\n Positive invocation types:");
429
+ for (const [t, c] of [...types.entries()].sort()) {
430
+ console.log(` ${t.padEnd(15)} ${c}`);
431
+ }
432
+ }
433
+
434
+ console.log("\nNext steps:");
435
+ console.log(" bun run cli/selftune/eval/run-eval.ts \\");
436
+ console.log(` --eval-set ${outputPath} \\`);
437
+ console.log(` --skill-path ${values["skill-path"]} \\`);
438
+ console.log(" --runs-per-query 3 --verbose");
439
+ return;
440
+ }
441
+
442
+ // --- Log-based mode (original behavior) ---
380
443
  const skillRecords = readJsonl<SkillUsageRecord>(values["skill-log"] ?? SKILL_LOG);
381
444
  const queryRecords = readJsonl<QueryLogRecord>(values["query-log"] ?? QUERY_LOG);
382
445
  const telemetryRecords = readJsonl<SessionTelemetryRecord>(
@@ -418,5 +481,8 @@ export function cliMain(): void {
418
481
  }
419
482
 
420
483
  if (import.meta.main) {
421
- cliMain();
484
+ cliMain().catch((err) => {
485
+ console.error(err);
486
+ process.exit(1);
487
+ });
422
488
  }