selftune 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +146 -0
- package/.claude/agents/evolution-reviewer.md +167 -0
- package/.claude/agents/integration-guide.md +200 -0
- package/.claude/agents/pattern-analyst.md +147 -0
- package/CHANGELOG.md +37 -0
- package/README.md +96 -256
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +103 -0
- package/cli/selftune/constants.ts +75 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-server.ts +582 -0
- package/cli/selftune/dashboard.ts +25 -3
- package/cli/selftune/eval/baseline.ts +247 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +68 -2
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evolve-body.ts +492 -0
- package/cli/selftune/evolution/evolve.ts +466 -103
- package/cli/selftune/evolution/extract-patterns.ts +32 -1
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +19 -2
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/grade-session.ts +138 -18
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/index.ts +88 -0
- package/cli/selftune/ingestors/claude-replay.ts +351 -0
- package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
- package/cli/selftune/init.ts +150 -3
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +25 -2
- package/cli/selftune/status.ts +17 -13
- package/cli/selftune/types.ts +377 -5
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/llm-call.ts +29 -3
- package/cli/selftune/utils/transcript.ts +35 -0
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/dashboard/index.html +569 -8
- package/package.json +8 -4
- package/skill/SKILL.md +124 -8
- package/skill/Workflows/AutoActivation.md +144 -0
- package/skill/Workflows/Badge.md +118 -0
- package/skill/Workflows/Baseline.md +121 -0
- package/skill/Workflows/Composability.md +100 -0
- package/skill/Workflows/Contribute.md +91 -0
- package/skill/Workflows/Cron.md +155 -0
- package/skill/Workflows/Dashboard.md +203 -0
- package/skill/Workflows/Doctor.md +37 -1
- package/skill/Workflows/Evals.md +69 -1
- package/skill/Workflows/EvolutionMemory.md +152 -0
- package/skill/Workflows/Evolve.md +111 -6
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/ImportSkillsBench.md +111 -0
- package/skill/Workflows/Ingest.md +117 -3
- package/skill/Workflows/Initialize.md +57 -3
- package/skill/Workflows/Replay.md +70 -0
- package/skill/Workflows/Rollback.md +20 -1
- package/skill/Workflows/UnitTest.md +138 -0
- package/skill/Workflows/Watch.md +22 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* import-skillsbench.ts
|
|
4
|
+
*
|
|
5
|
+
* Imports task definitions from a SkillsBench-style corpus directory and
|
|
6
|
+
* converts them into EvalEntry arrays for use with selftune eval/grading.
|
|
7
|
+
*
|
|
8
|
+
* Expected directory structure:
|
|
9
|
+
* <dir>/tasks/<task-id>/instruction.md — task description (query text)
|
|
10
|
+
* <dir>/tasks/<task-id>/task.toml — metadata (difficulty, category, tags, etc.)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
14
|
+
import { join } from "node:path";
|
|
15
|
+
import { parseArgs } from "node:util";
|
|
16
|
+
import type { EvalEntry, SkillsBenchTask } from "../types.js";
|
|
17
|
+
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Minimal TOML parser (handles the subset used by SkillsBench task.toml files)
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* Parse the subset of TOML used by SkillsBench task.toml files.
|
|
24
|
+
*
|
|
25
|
+
* Supports: single-line key = "value", flat string arrays ["a", "b"],
|
|
26
|
+
* bare values (numbers, booleans).
|
|
27
|
+
*
|
|
28
|
+
* Does NOT support: multi-line / triple-quoted strings, inline tables,
|
|
29
|
+
* nested arrays, or section headers ([table]).
|
|
30
|
+
*/
|
|
31
|
+
function parseSimpleToml(content: string): Record<string, unknown> {
|
|
32
|
+
const result: Record<string, unknown> = {};
|
|
33
|
+
for (const rawLine of content.split("\n")) {
|
|
34
|
+
const line = rawLine.trim();
|
|
35
|
+
if (!line || line.startsWith("#")) continue;
|
|
36
|
+
|
|
37
|
+
const eqIdx = line.indexOf("=");
|
|
38
|
+
if (eqIdx === -1) continue;
|
|
39
|
+
|
|
40
|
+
const key = line.slice(0, eqIdx).trim();
|
|
41
|
+
const rawValue = line.slice(eqIdx + 1).trim();
|
|
42
|
+
|
|
43
|
+
if (rawValue.startsWith("[")) {
|
|
44
|
+
// Array value — parse simple string arrays like ["a", "b", "c"]
|
|
45
|
+
const arrayContent = rawValue.slice(1, rawValue.lastIndexOf("]"));
|
|
46
|
+
const items: string[] = [];
|
|
47
|
+
for (const item of arrayContent.split(",")) {
|
|
48
|
+
const trimmed = item.trim().replace(/^["']|["']$/g, "");
|
|
49
|
+
if (trimmed) items.push(trimmed);
|
|
50
|
+
}
|
|
51
|
+
result[key] = items;
|
|
52
|
+
} else if (rawValue.startsWith('"') || rawValue.startsWith("'")) {
|
|
53
|
+
// String value
|
|
54
|
+
result[key] = rawValue.replace(/^["']|["']$/g, "");
|
|
55
|
+
} else {
|
|
56
|
+
// Bare value (number, boolean, etc.)
|
|
57
|
+
result[key] = rawValue;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
return result;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
// Parse SkillsBench directory
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
export function parseSkillsBenchDir(dirPath: string): SkillsBenchTask[] {
|
|
68
|
+
const tasksDir = join(dirPath, "tasks");
|
|
69
|
+
if (!existsSync(tasksDir)) return [];
|
|
70
|
+
|
|
71
|
+
const tasks: SkillsBenchTask[] = [];
|
|
72
|
+
|
|
73
|
+
let entries: ReturnType<typeof readdirSync>;
|
|
74
|
+
try {
|
|
75
|
+
entries = readdirSync(tasksDir, { withFileTypes: true });
|
|
76
|
+
} catch {
|
|
77
|
+
return [];
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
for (const entry of entries) {
|
|
81
|
+
if (!entry.isDirectory()) continue;
|
|
82
|
+
|
|
83
|
+
const taskDir = join(tasksDir, entry.name);
|
|
84
|
+
const instructionPath = join(taskDir, "instruction.md");
|
|
85
|
+
|
|
86
|
+
if (!existsSync(instructionPath)) continue;
|
|
87
|
+
|
|
88
|
+
const query = readFileSync(instructionPath, "utf-8").trim();
|
|
89
|
+
if (!query) continue;
|
|
90
|
+
|
|
91
|
+
// Parse optional task.toml
|
|
92
|
+
const tomlPath = join(taskDir, "task.toml");
|
|
93
|
+
let metadata: Record<string, unknown> = {};
|
|
94
|
+
if (existsSync(tomlPath)) {
|
|
95
|
+
metadata = parseSimpleToml(readFileSync(tomlPath, "utf-8"));
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
const difficulty = metadata.difficulty as SkillsBenchTask["difficulty"] | undefined;
|
|
99
|
+
|
|
100
|
+
const task: SkillsBenchTask = {
|
|
101
|
+
task_id: entry.name,
|
|
102
|
+
category: (metadata.category as string) ?? "general",
|
|
103
|
+
query,
|
|
104
|
+
difficulty:
|
|
105
|
+
difficulty && ["easy", "medium", "hard"].includes(difficulty) ? difficulty : "medium",
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
if (metadata.expected_skill) {
|
|
109
|
+
task.expected_skill = metadata.expected_skill as string;
|
|
110
|
+
}
|
|
111
|
+
if (metadata.expected_tools && Array.isArray(metadata.expected_tools)) {
|
|
112
|
+
task.expected_tools = metadata.expected_tools as string[];
|
|
113
|
+
}
|
|
114
|
+
if (metadata.tags && Array.isArray(metadata.tags)) {
|
|
115
|
+
task.tags = metadata.tags as string[];
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
tasks.push(task);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
return tasks;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
// Convert tasks to EvalEntries
|
|
126
|
+
// ---------------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
export function convertToEvalEntries(
|
|
129
|
+
tasks: SkillsBenchTask[],
|
|
130
|
+
targetSkill: string,
|
|
131
|
+
matchStrategy: "exact" | "fuzzy" = "exact",
|
|
132
|
+
): EvalEntry[] {
|
|
133
|
+
const entries: EvalEntry[] = [];
|
|
134
|
+
|
|
135
|
+
for (const task of tasks) {
|
|
136
|
+
let matches = false;
|
|
137
|
+
|
|
138
|
+
if (matchStrategy === "exact") {
|
|
139
|
+
matches = task.expected_skill === targetSkill;
|
|
140
|
+
} else {
|
|
141
|
+
// Fuzzy: check if targetSkill appears as substring in category, tags, or expected_skill
|
|
142
|
+
const skillLower = targetSkill.toLowerCase();
|
|
143
|
+
const searchable = [task.category, task.expected_skill, ...(task.tags ?? [])]
|
|
144
|
+
.filter(Boolean)
|
|
145
|
+
.map((s) => (s as string).toLowerCase());
|
|
146
|
+
|
|
147
|
+
matches = searchable.some((s) => s.includes(skillLower) || skillLower.includes(s));
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
if (matches) {
|
|
151
|
+
entries.push({
|
|
152
|
+
query: task.query,
|
|
153
|
+
should_trigger: true,
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
return entries;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// ---------------------------------------------------------------------------
// CLI entry point
// ---------------------------------------------------------------------------

/**
 * CLI: convert a SkillsBench corpus into a selftune eval set.
 *
 * Flags:
 *   --dir            (required) corpus root; expects <dir>/tasks/<task-id>/instruction.md
 *   --skill          (required) target skill name to match tasks against
 *   --output         output JSON path (default: <skill>_skillsbench_eval.json)
 *   --match-strategy "exact" (default) or "fuzzy"
 *
 * Exits 1 on missing flags or when the corpus yields no tasks at all. Note
 * that a run where tasks exist but none match the skill still writes an
 * (empty) output file after listing the corpus's expected_skill labels.
 */
export function cliMain(): void {
  const { values } = parseArgs({
    options: {
      dir: { type: "string" },
      skill: { type: "string" },
      output: { type: "string" },
      "match-strategy": { type: "string", default: "exact" },
    },
    strict: true,
  });

  if (!values.dir) {
    console.error("[ERROR] --dir required (path to SkillsBench corpus directory)");
    process.exit(1);
  }

  if (!values.skill) {
    console.error("[ERROR] --skill required (target skill name)");
    process.exit(1);
  }

  // Any value other than an explicit "fuzzy" silently falls back to exact.
  const matchStrategy = values["match-strategy"] === "fuzzy" ? "fuzzy" : "exact";

  const tasks = parseSkillsBenchDir(values.dir);

  if (tasks.length === 0) {
    console.error(`[WARN] No tasks found in ${values.dir}/tasks/`);
    console.error("Expected structure: <dir>/tasks/<task-id>/instruction.md");
    process.exit(1);
  }

  console.log(`Parsed ${tasks.length} tasks from ${values.dir}`);

  const entries = convertToEvalEntries(tasks, values.skill, matchStrategy);

  if (entries.length === 0) {
    console.log(
      `[WARN] No tasks matched skill '${values.skill}' with strategy '${matchStrategy}'.`,
    );
    // Surface the labels actually present in the corpus so the user can retry.
    console.log("Available expected_skills:");
    const skills = [...new Set(tasks.map((t) => t.expected_skill).filter(Boolean))].sort();
    for (const s of skills) {
      console.log(`  ${s}`);
    }
    if (matchStrategy === "exact") {
      console.log("\nTip: try --match-strategy fuzzy for keyword-based matching.");
    }
  }

  // Intentionally writes even when entries is empty (valid empty eval file).
  const outputPath = values.output ?? `${values.skill}_skillsbench_eval.json`;
  writeFileSync(outputPath, JSON.stringify(entries, null, 2), "utf-8");
  console.log(`Wrote ${entries.length} eval entries to ${outputPath}`);
}

// Run directly only when invoked as a script (bun's import.meta.main).
if (import.meta.main) {
  cliMain();
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* synthetic-evals.ts
|
|
3
|
+
*
|
|
4
|
+
* Generates eval queries from a SKILL.md using an LLM, without requiring
|
|
5
|
+
* real session logs. Solves the cold-start problem for new skills that
|
|
6
|
+
* have no telemetry data yet.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { readFileSync } from "node:fs";
|
|
10
|
+
|
|
11
|
+
import type { EvalEntry, InvocationType } from "../types.js";
|
|
12
|
+
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
13
|
+
import { classifyInvocation } from "./hooks-to-evals.js";
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** Tuning knobs for synthetic eval generation. */
export interface SyntheticEvalOptions {
  /** Number of should_trigger: true queries to request (default 15). */
  maxPositives?: number;
  /** Number of should_trigger: false queries to request (default 10). */
  maxNegatives?: number;
  /** Optional model flag forwarded to the LLM call. */
  modelFlag?: string;
}

/** Shape of one entry in the raw LLM JSON response, before validation. */
interface RawSyntheticEntry {
  query: string;
  should_trigger: boolean;
  /** LLM-suggested type; re-derived locally for positives, so advisory only. */
  invocation_type?: string;
}
|
|
30
|
+
|
|
31
|
+
// ---------------------------------------------------------------------------
// Prompt building
// ---------------------------------------------------------------------------

/**
 * Build the system/user prompt pair that asks an LLM to generate synthetic
 * eval queries — positives that should trigger the skill and negatives that
 * should not — from the skill's SKILL.md content.
 *
 * @param skillContent - Full skill file text, inlined into the user prompt.
 * @param skillName - Skill name; also interpolated after a literal "$" for
 *   the explicit-invocation example.
 * @param maxPositives - Exact number of should_trigger: true queries to request.
 * @param maxNegatives - Exact number of should_trigger: false queries to request.
 * @returns Prompts instructing the model to reply with a bare JSON array.
 */
export function buildSyntheticPrompt(
  skillContent: string,
  skillName: string,
  maxPositives: number,
  maxNegatives: number,
): { system: string; user: string } {
  // NOTE: `$${skillName}` below renders a literal "$" followed by the skill
  // name (e.g. "$my-skill") — the explicit skill-invocation syntax.
  const system = `You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries.

For POSITIVE queries (should trigger this skill):
- Generate a mix of:
  - Explicit: directly names the skill or uses $${skillName} syntax
  - Implicit: describes the task without naming the skill
  - Contextual: natural language with domain context, proper nouns, dates, filenames
- Vary phrasing, formality, and specificity

For NEGATIVE queries (should NOT trigger this skill):
- Queries that are topically adjacent but wrong intent
- Queries for different skills that share keywords
- Generic queries unrelated to this skill

Output as JSON array with no surrounding text:
[{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]`;

  const user = `Skill name: ${skillName}

Skill content:
${skillContent}

Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false). Return ONLY the JSON array.`;

  return { system, user };
}
|
|
67
|
+
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
// Response parsing
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
export function parseSyntheticResponse(raw: string, skillName: string): EvalEntry[] {
|
|
73
|
+
let text = raw.trim();
|
|
74
|
+
|
|
75
|
+
// Strip markdown fences manually for array-first JSON
|
|
76
|
+
// (stripMarkdownFences slices to first '{' which breaks '[' arrays)
|
|
77
|
+
const fenceMatch = text.match(/^```\w*\n/);
|
|
78
|
+
if (fenceMatch) {
|
|
79
|
+
text = text.slice(fenceMatch[0].length);
|
|
80
|
+
const closingIdx = text.lastIndexOf("```");
|
|
81
|
+
if (closingIdx >= 0) {
|
|
82
|
+
text = text.slice(0, closingIdx);
|
|
83
|
+
}
|
|
84
|
+
text = text.trim();
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
// Find the JSON array start
|
|
88
|
+
const bracketIdx = text.indexOf("[");
|
|
89
|
+
if (bracketIdx < 0) {
|
|
90
|
+
// No array found — try stripMarkdownFences as fallback for edge cases
|
|
91
|
+
const cleaned = stripMarkdownFences(raw);
|
|
92
|
+
const retryIdx = cleaned.indexOf("[");
|
|
93
|
+
if (retryIdx >= 0) {
|
|
94
|
+
text = cleaned.slice(retryIdx);
|
|
95
|
+
} else {
|
|
96
|
+
throw new Error(`Failed to parse synthetic eval response as JSON: ${text.slice(0, 200)}`);
|
|
97
|
+
}
|
|
98
|
+
} else {
|
|
99
|
+
text = text.slice(bracketIdx);
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
// Trim trailing content after the array closes
|
|
103
|
+
const lastBracket = text.lastIndexOf("]");
|
|
104
|
+
if (lastBracket >= 0) {
|
|
105
|
+
text = text.slice(0, lastBracket + 1);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
const jsonText = text;
|
|
109
|
+
|
|
110
|
+
let entries: RawSyntheticEntry[];
|
|
111
|
+
try {
|
|
112
|
+
entries = JSON.parse(jsonText);
|
|
113
|
+
} catch {
|
|
114
|
+
throw new Error(`Failed to parse synthetic eval response as JSON: ${jsonText.slice(0, 200)}`);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
if (!Array.isArray(entries)) {
|
|
118
|
+
throw new Error("Synthetic eval response is not a JSON array");
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
const result: EvalEntry[] = [];
|
|
122
|
+
for (const entry of entries) {
|
|
123
|
+
if (!entry || typeof entry.query !== "string" || typeof entry.should_trigger !== "boolean") {
|
|
124
|
+
continue;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
const query = entry.query.trim();
|
|
128
|
+
if (!query) continue;
|
|
129
|
+
|
|
130
|
+
// For positives, use classifyInvocation to verify/override the LLM's type
|
|
131
|
+
let invocationType: InvocationType;
|
|
132
|
+
if (entry.should_trigger) {
|
|
133
|
+
invocationType = classifyInvocation(query, skillName);
|
|
134
|
+
} else {
|
|
135
|
+
invocationType = "negative";
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
result.push({
|
|
139
|
+
query,
|
|
140
|
+
should_trigger: entry.should_trigger,
|
|
141
|
+
invocation_type: invocationType,
|
|
142
|
+
});
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
return result;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
// Main entry point
|
|
150
|
+
// ---------------------------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
export async function generateSyntheticEvals(
|
|
153
|
+
skillPath: string,
|
|
154
|
+
skillName: string,
|
|
155
|
+
agent: string,
|
|
156
|
+
options: SyntheticEvalOptions = {},
|
|
157
|
+
): Promise<EvalEntry[]> {
|
|
158
|
+
const maxPositives = options.maxPositives ?? 15;
|
|
159
|
+
const maxNegatives = options.maxNegatives ?? 10;
|
|
160
|
+
|
|
161
|
+
const skillContent = readFileSync(skillPath, "utf-8");
|
|
162
|
+
|
|
163
|
+
const { system, user } = buildSyntheticPrompt(
|
|
164
|
+
skillContent,
|
|
165
|
+
skillName,
|
|
166
|
+
maxPositives,
|
|
167
|
+
maxNegatives,
|
|
168
|
+
);
|
|
169
|
+
|
|
170
|
+
const raw = await callLlm(system, user, agent, options.modelFlag);
|
|
171
|
+
return parseSyntheticResponse(raw, skillName);
|
|
172
|
+
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI entrypoint for skill unit tests.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* selftune unit-test --skill <name> --tests <path> [--run-agent] [--generate]
|
|
6
|
+
*
|
|
7
|
+
* --skill <name> Skill name (required)
|
|
8
|
+
* --tests <path> Path to unit test JSON file (default: ~/.selftune/unit-tests/<skill>.json)
|
|
9
|
+
* --run-agent Actually run tests through an agent (otherwise dry-run with static checks)
|
|
10
|
+
* --generate Generate tests from skill content using LLM (requires agent)
|
|
11
|
+
* --skill-path <p> Path to skill file (used with --generate for content)
|
|
12
|
+
* --eval-set <p> Path to eval set JSON (used with --generate for failure context)
|
|
13
|
+
* --model <m> Model flag for LLM calls
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
17
|
+
import { dirname, join } from "node:path";
|
|
18
|
+
import { parseArgs } from "node:util";
|
|
19
|
+
|
|
20
|
+
import { SELFTUNE_CONFIG_DIR } from "../constants.js";
|
|
21
|
+
import type { EvalEntry } from "../types.js";
|
|
22
|
+
import { callLlm, detectAgent } from "../utils/llm-call.js";
|
|
23
|
+
import { generateUnitTests } from "./generate-unit-tests.js";
|
|
24
|
+
import type { AgentRunner } from "./unit-test.js";
|
|
25
|
+
import { loadUnitTests, runUnitTestSuite } from "./unit-test.js";
|
|
26
|
+
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// CLI
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
export async function cliMain(): Promise<void> {
|
|
32
|
+
const { values } = parseArgs({
|
|
33
|
+
options: {
|
|
34
|
+
skill: { type: "string" },
|
|
35
|
+
tests: { type: "string" },
|
|
36
|
+
"run-agent": { type: "boolean", default: false },
|
|
37
|
+
generate: { type: "boolean", default: false },
|
|
38
|
+
"skill-path": { type: "string" },
|
|
39
|
+
"eval-set": { type: "string" },
|
|
40
|
+
model: { type: "string" },
|
|
41
|
+
},
|
|
42
|
+
strict: true,
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
if (!values.skill) {
|
|
46
|
+
console.error("[ERROR] --skill <name> is required.");
|
|
47
|
+
process.exit(1);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
const skillName = values.skill;
|
|
51
|
+
const unitTestDir = join(SELFTUNE_CONFIG_DIR, "unit-tests");
|
|
52
|
+
const defaultTestsPath = join(unitTestDir, `${skillName}.json`);
|
|
53
|
+
const testsPath = values.tests ?? defaultTestsPath;
|
|
54
|
+
|
|
55
|
+
// --generate: create tests from skill content
|
|
56
|
+
if (values.generate) {
|
|
57
|
+
const agent = detectAgent();
|
|
58
|
+
if (!agent) {
|
|
59
|
+
console.error("[ERROR] No agent CLI found (claude/codex/opencode). Cannot generate tests.");
|
|
60
|
+
process.exit(1);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
let skillContent = `Skill: ${skillName}`;
|
|
64
|
+
if (values["skill-path"] && existsSync(values["skill-path"])) {
|
|
65
|
+
skillContent = readFileSync(values["skill-path"], "utf-8");
|
|
66
|
+
} else if (values["skill-path"]) {
|
|
67
|
+
console.warn(`[WARN] Skill path not found: ${values["skill-path"]}. Using skill name only.`);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
let evalFailures: EvalEntry[] = [];
|
|
71
|
+
if (values["eval-set"] && existsSync(values["eval-set"])) {
|
|
72
|
+
try {
|
|
73
|
+
const raw = readFileSync(values["eval-set"], "utf-8");
|
|
74
|
+
const entries: EvalEntry[] = JSON.parse(raw);
|
|
75
|
+
evalFailures = entries.filter((e) => e.should_trigger);
|
|
76
|
+
} catch {
|
|
77
|
+
console.warn("[WARN] Failed to parse eval set. Proceeding without failure context.");
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
const modelFlag = values.model;
|
|
82
|
+
const llmCaller = (systemPrompt: string, userPrompt: string) =>
|
|
83
|
+
callLlm(systemPrompt, userPrompt, agent, modelFlag);
|
|
84
|
+
|
|
85
|
+
console.log(`Generating unit tests for skill '${skillName}'...`);
|
|
86
|
+
const tests = await generateUnitTests(skillName, skillContent, evalFailures, llmCaller);
|
|
87
|
+
|
|
88
|
+
if (tests.length === 0) {
|
|
89
|
+
console.error("[ERROR] No tests generated. Check agent/LLM availability.");
|
|
90
|
+
process.exit(1);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
// Ensure output directory exists
|
|
94
|
+
mkdirSync(unitTestDir, { recursive: true });
|
|
95
|
+
writeFileSync(testsPath, JSON.stringify(tests, null, 2), "utf-8");
|
|
96
|
+
console.log(`Generated ${tests.length} unit tests -> ${testsPath}`);
|
|
97
|
+
return;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
// Load and run tests
|
|
101
|
+
const tests = loadUnitTests(testsPath);
|
|
102
|
+
if (tests.length === 0) {
|
|
103
|
+
console.error(`[ERROR] No tests found at ${testsPath}`);
|
|
104
|
+
console.error(" Use --generate to create tests, or provide --tests <path>.");
|
|
105
|
+
process.exit(1);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
console.log(`Loaded ${tests.length} unit tests for skill '${skillName}'`);
|
|
109
|
+
|
|
110
|
+
let agentRunner: AgentRunner;
|
|
111
|
+
|
|
112
|
+
if (values["run-agent"]) {
|
|
113
|
+
const agent = detectAgent();
|
|
114
|
+
if (!agent) {
|
|
115
|
+
console.error("[ERROR] No agent CLI found. Cannot run agent-based tests.");
|
|
116
|
+
process.exit(1);
|
|
117
|
+
}
|
|
118
|
+
const modelFlag = values.model;
|
|
119
|
+
agentRunner = async (query: string): Promise<string> => {
|
|
120
|
+
return callLlm("You are a helpful assistant.", query, agent, modelFlag);
|
|
121
|
+
};
|
|
122
|
+
} else {
|
|
123
|
+
// Dry-run: use query as transcript (only static assertions like contains work meaningfully)
|
|
124
|
+
console.log("(dry-run mode — use --run-agent for full agent execution)\n");
|
|
125
|
+
agentRunner = async (query: string): Promise<string> => query;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
const suite = await runUnitTestSuite(tests, skillName, agentRunner);
|
|
129
|
+
|
|
130
|
+
// Print results
|
|
131
|
+
console.log(`\nResults for '${suite.skill_name}':`);
|
|
132
|
+
console.log(` Total: ${suite.total} Passed: ${suite.passed} Failed: ${suite.failed}`);
|
|
133
|
+
console.log(` Pass rate: ${(suite.pass_rate * 100).toFixed(1)}%`);
|
|
134
|
+
|
|
135
|
+
if (suite.failed > 0) {
|
|
136
|
+
console.log("\nFailed tests:");
|
|
137
|
+
for (const r of suite.results.filter((r) => !r.passed)) {
|
|
138
|
+
console.log(` [FAIL] ${r.test_id} (${r.duration_ms}ms)`);
|
|
139
|
+
if (r.error) {
|
|
140
|
+
console.log(` Error: ${r.error}`);
|
|
141
|
+
}
|
|
142
|
+
for (const a of r.assertion_results.filter((a) => !a.passed)) {
|
|
143
|
+
console.log(
|
|
144
|
+
` - ${a.assertion.type}: expected "${a.assertion.value}", got "${a.actual}"`,
|
|
145
|
+
);
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
console.log(`\n${JSON.stringify(suite, null, 2)}`);
|
|
151
|
+
process.exit(suite.failed > 0 ? 1 : 0);
|
|
152
|
+
}
|