selftune 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +146 -0
- package/.claude/agents/evolution-reviewer.md +167 -0
- package/.claude/agents/integration-guide.md +200 -0
- package/.claude/agents/pattern-analyst.md +147 -0
- package/CHANGELOG.md +37 -0
- package/README.md +96 -256
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +103 -0
- package/cli/selftune/constants.ts +75 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-server.ts +582 -0
- package/cli/selftune/dashboard.ts +25 -3
- package/cli/selftune/eval/baseline.ts +247 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +68 -2
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evolve-body.ts +492 -0
- package/cli/selftune/evolution/evolve.ts +466 -103
- package/cli/selftune/evolution/extract-patterns.ts +32 -1
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +19 -2
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/grade-session.ts +138 -18
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/index.ts +88 -0
- package/cli/selftune/ingestors/claude-replay.ts +351 -0
- package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
- package/cli/selftune/init.ts +150 -3
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +25 -2
- package/cli/selftune/status.ts +17 -13
- package/cli/selftune/types.ts +377 -5
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/llm-call.ts +29 -3
- package/cli/selftune/utils/transcript.ts +35 -0
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/dashboard/index.html +569 -8
- package/package.json +8 -4
- package/skill/SKILL.md +124 -8
- package/skill/Workflows/AutoActivation.md +144 -0
- package/skill/Workflows/Badge.md +118 -0
- package/skill/Workflows/Baseline.md +121 -0
- package/skill/Workflows/Composability.md +100 -0
- package/skill/Workflows/Contribute.md +91 -0
- package/skill/Workflows/Cron.md +155 -0
- package/skill/Workflows/Dashboard.md +203 -0
- package/skill/Workflows/Doctor.md +37 -1
- package/skill/Workflows/Evals.md +69 -1
- package/skill/Workflows/EvolutionMemory.md +152 -0
- package/skill/Workflows/Evolve.md +111 -6
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/ImportSkillsBench.md +111 -0
- package/skill/Workflows/Ingest.md +117 -3
- package/skill/Workflows/Initialize.md +57 -3
- package/skill/Workflows/Replay.md +70 -0
- package/skill/Workflows/Rollback.md +20 -1
- package/skill/Workflows/UnitTest.md +138 -0
- package/skill/Workflows/Watch.md +22 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* baseline.ts
|
|
3
|
+
*
|
|
4
|
+
* Measures the value a skill adds over a no-skill baseline.
|
|
5
|
+
*
|
|
6
|
+
* Runs trigger checks against an EMPTY string description (no-skill baseline)
|
|
7
|
+
* and against the current description (with-skill), then computes lift.
|
|
8
|
+
* A skill "adds value" when lift >= 0.05 (5 percentage points).
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { parseArgs } from "node:util";
|
|
12
|
+
|
|
13
|
+
import type { BaselineResult, EvalEntry } from "../types.js";
|
|
14
|
+
import { callLlm } from "../utils/llm-call.js";
|
|
15
|
+
import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
|
|
16
|
+
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Types
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
/** Inputs consumed by measureBaseline(). */
export interface BaselineOptions {
  /** Eval entries to check; each provides a `query` and the expected `should_trigger`. */
  evalSet: EvalEntry[];
  /** Description text used for the with-skill trigger check. */
  skillDescription: string;
  /** Skill name copied onto the measurement and every per-entry record. */
  skillName: string;
  /** Agent identifier forwarded unchanged to callLlm. */
  agent: string;
  /** Optional model flag forwarded unchanged to callLlm. */
  modelFlag?: string;
}
|
|
28
|
+
|
|
29
|
+
/** Aggregate result returned by measureBaseline(). */
export interface BaselineMeasurement {
  /** Name of the measured skill. */
  skill_name: string;
  /** Fraction of eval entries that passed with an empty description. */
  baseline_pass_rate: number;
  /** Fraction of eval entries that passed with the real skill description. */
  with_skill_pass_rate: number;
  /** With-skill pass rate minus baseline pass rate. */
  lift: number;
  /** True when `lift` reaches the lift threshold. */
  adds_value: boolean;
  /** Individual check records: a baseline record and a with-skill record per eval entry. */
  per_entry: BaselineResult[];
  /** ISO-8601 timestamp taken when the measurement object was built. */
  measured_at: string;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Optional dependency overrides for measureBaseline().
 *
 * Any member left undefined falls back to the module-level import, so tests
 * can supply a stub here and avoid issuing real LLM calls.
 */
export interface BaselineDeps {
  /** Replacement for the imported callLlm. */
  callLlm?: typeof callLlm;
}
|
|
46
|
+
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
// Constants
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
/** Minimum lift (with-skill pass rate minus baseline pass rate) for a skill to count as adding value. */
const LIFT_THRESHOLD = 0.05;

/** System prompt sent with every trigger-check LLM call. */
const SYSTEM_PROMPT = "You are an evaluation assistant. Answer only YES or NO.";
|
|
53
|
+
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
// Core measurement
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
/**
 * Measure baseline vs. with-skill trigger accuracy across an eval set.
 *
 * For every eval entry two trigger checks run in sequence: first against an
 * empty description (the no-skill baseline), then against `skillDescription`.
 * A check passes when the parsed trigger decision matches
 * `entry.should_trigger`.
 *
 * @param options - Eval set, skill description/name, and agent settings.
 * @param _deps - Optional overrides (e.g. a stubbed `callLlm`) for tests.
 * @returns Pass rates for both runs, their difference (`lift`), whether the
 *   lift reaches LIFT_THRESHOLD, and two per-entry records per eval entry
 *   (baseline first, then with-skill). An empty eval set yields all-zero
 *   rates and `adds_value: false` without calling the LLM.
 */
export async function measureBaseline(
  options: BaselineOptions,
  _deps: BaselineDeps = {},
): Promise<BaselineMeasurement> {
  const { evalSet, skillDescription, skillName, agent, modelFlag } = options;
  const _callLlm = _deps.callLlm ?? callLlm;

  const total = evalSet.length;
  if (total === 0) {
    // Nothing to measure; also avoids dividing by zero below.
    return {
      skill_name: skillName,
      baseline_pass_rate: 0,
      with_skill_pass_rate: 0,
      lift: 0,
      adds_value: false,
      per_entry: [],
      measured_at: new Date().toISOString(),
    };
  }

  const perEntry: BaselineResult[] = [];

  // Runs one trigger check, records it in perEntry, and reports whether it passed.
  const runCheck = async (entry: EvalEntry, withSkill: boolean): Promise<boolean> => {
    const prompt = buildTriggerCheckPrompt(withSkill ? skillDescription : "", entry.query);
    const raw = await _callLlm(SYSTEM_PROMPT, prompt, agent, modelFlag);
    const triggered = parseTriggerResponse(raw);
    // Pass when the decision agrees with the expectation.
    const pass = entry.should_trigger ? triggered : !triggered;

    perEntry.push({
      skill_name: skillName,
      query: entry.query,
      with_skill: withSkill,
      triggered,
      pass,
      measured_at: new Date().toISOString(),
    });
    return pass;
  };

  let baselinePassed = 0;
  let withSkillPassed = 0;

  // Checks stay strictly sequential so the LLM call order is deterministic.
  for (const entry of evalSet) {
    if (await runCheck(entry, false)) baselinePassed++;
    if (await runCheck(entry, true)) withSkillPassed++;
  }

  // Divide the integer difference once instead of subtracting two rounded
  // quotients: 3/20 - 2/20 evaluates to 0.04999999999999999 and would miss
  // the 0.05 threshold, whereas (3 - 2) / 20 is exactly 0.05.
  const lift = (withSkillPassed - baselinePassed) / total;

  return {
    skill_name: skillName,
    baseline_pass_rate: baselinePassed / total,
    with_skill_pass_rate: withSkillPassed / total,
    lift,
    adds_value: lift >= LIFT_THRESHOLD,
    per_entry: perEntry,
    measured_at: new Date().toISOString(),
  };
}
|
|
136
|
+
|
|
137
|
+
// ---------------------------------------------------------------------------
|
|
138
|
+
// CLI entry point
|
|
139
|
+
// ---------------------------------------------------------------------------
|
|
140
|
+
|
|
141
|
+
/**
 * CLI entry point for `selftune baseline`.
 *
 * Parses the command line, loads the skill file and an eval set, resolves the
 * agent CLI, runs measureBaseline(), and prints the result as JSON.
 *
 * Exit codes: 0 after printing help or when the skill adds value; 1 on any
 * usage/configuration error or when the skill does not add value.
 *
 * The eval set comes from `--eval-set` when that flag is given, otherwise it
 * is built from the skill/query logs. A supplied `--eval-set` path that does
 * not exist, or that does not contain a JSON array, is reported as an error
 * instead of silently measuring a different data source.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      "eval-set": { type: "string" },
      agent: { type: "string" },
      help: { type: "boolean", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(`selftune baseline — Measure skill value vs. no-skill baseline

Usage:
selftune baseline --skill <name> --skill-path <path> [options]

Options:
--skill Skill name (required)
--skill-path Path to SKILL.md (required)
--eval-set Path to eval set JSON (optional, builds from logs if omitted)
--agent Agent CLI to use (claude, codex, opencode)
--help Show this help message`);
    process.exit(0);
  }

  if (!values.skill || !values["skill-path"]) {
    console.error("[ERROR] --skill and --skill-path are required");
    process.exit(1);
  }

  const { existsSync, readFileSync } = await import("node:fs");

  // Read skill description (the whole file content is used as the description).
  const skillPath = values["skill-path"];
  if (!existsSync(skillPath)) {
    console.error(`[ERROR] SKILL.md not found at ${skillPath}`);
    process.exit(1);
  }
  const skillDescription = readFileSync(skillPath, "utf-8");

  // Load eval set
  const evalSetPath = values["eval-set"];
  let evalSet: EvalEntry[];
  if (evalSetPath) {
    // An explicitly requested file must exist; falling back to the logs here
    // would hide a mistyped path behind plausible-looking results.
    if (!existsSync(evalSetPath)) {
      console.error(`[ERROR] Eval set not found at ${evalSetPath}`);
      process.exit(1);
    }
    const parsed: unknown = JSON.parse(readFileSync(evalSetPath, "utf-8"));
    if (!Array.isArray(parsed)) {
      console.error(`[ERROR] Eval set at ${evalSetPath} must be a JSON array`);
      process.exit(1);
    }
    evalSet = parsed as EvalEntry[];
  } else {
    // Build from logs
    const { QUERY_LOG, SKILL_LOG } = await import("../constants.js");
    const { readJsonl } = await import("../utils/jsonl.js");
    const { buildEvalSet } = await import("./hooks-to-evals.js");
    const skillRecords = readJsonl(SKILL_LOG);
    const queryRecords = readJsonl(QUERY_LOG);
    evalSet = buildEvalSet(skillRecords, queryRecords, values.skill);
  }

  // Detect agent: an explicit --agent must be on PATH, otherwise auto-detect.
  const { detectAgent } = await import("../utils/llm-call.js");
  const requestedAgent = values.agent;
  if (requestedAgent && !Bun.which(requestedAgent)) {
    console.error(
      JSON.stringify({
        level: "error",
        code: "agent_not_in_path",
        message: `Agent CLI '${requestedAgent}' not found in PATH.`,
        action: "Install it or omit --agent to use auto-detection.",
      }),
    );
    process.exit(1);
  }
  const agent = requestedAgent ?? detectAgent();
  if (!agent) {
    console.error(
      JSON.stringify({
        level: "error",
        code: "agent_not_found",
        message: "No agent CLI (claude/codex/opencode) found in PATH.",
        action: "Install Claude Code, Codex, or OpenCode.",
      }),
    );
    process.exit(1);
  }

  const result = await measureBaseline({
    evalSet,
    skillDescription,
    skillName: values.skill,
    agent,
  });

  console.log(JSON.stringify(result, null, 2));
  // Exit status mirrors the verdict so shell scripts can branch on it.
  process.exit(result.adds_value ? 0 : 1);
}
|
|
235
|
+
|
|
236
|
+
// Run the CLI only when `import.meta.main` is set; any rejection from
// cliMain() is reported as a single JSON line on stderr and exits with 1.
if (import.meta.main) {
  cliMain().catch((err: unknown) => {
    let message: string;
    let stack: string | undefined;
    if (err instanceof Error) {
      message = err.message;
      stack = err.stack;
    } else {
      message = String(err);
      stack = undefined;
    }
    console.error(JSON.stringify({ level: "fatal", message, stack }));
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* composability.ts
|
|
3
|
+
*
|
|
4
|
+
* Analyzes co-occurrence patterns between skills in session telemetry
|
|
5
|
+
* to detect composability conflicts. A conflict is flagged when two
|
|
6
|
+
* skills used together produce more errors than either skill used alone.
|
|
7
|
+
*
|
|
8
|
+
* Pure function -- no I/O. CLI wrapper handles reading JSONL.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { ComposabilityReport, CoOccurrencePair, SessionTelemetryRecord } from "../types.js";
|
|
12
|
+
|
|
13
|
+
/**
 * Restrict `value` to the range bounded below by `min` and above by `max`.
 */
function clamp(value: number, min: number, max: number): number {
  const capped = Math.min(max, value);
  return Math.max(min, capped);
}
|
|
19
|
+
|
|
20
|
+
/** Mean `errors_encountered` over `records` (a missing count is treated as 0), or `fallback` when `records` is empty. */
function averageErrors(records: SessionTelemetryRecord[], fallback: number): number {
  if (records.length === 0) return fallback;
  const totalErrors = records.reduce((sum, r) => sum + (r.errors_encountered ?? 0), 0);
  return totalErrors / records.length;
}

/**
 * Analyze composability of a target skill against all co-occurring skills.
 *
 * For each skill that appears in a session together with `skillName`, the
 * average error count of the shared sessions is compared with a baseline: the
 * larger of (a) the target skill's average when it is the only skill in the
 * session and (b) the co-skill's average in sessions without the target
 * skill (falling back to (a) when there are none). The score is
 * `clamp((together - baseline) / (baseline + 1), 0, 1)` and a conflict is
 * flagged when it exceeds 0.3.
 *
 * @param skillName - The skill to analyze
 * @param telemetry - All session telemetry records; records without a
 *   `skills_triggered` array are ignored. The input array is not mutated.
 * @param window - Optional: only consider the N most recent sessions
 *   (ordered by comparing `timestamp` strings)
 * @returns ComposabilityReport with co-occurrence pairs (sorted by
 *   co-occurrence count, descending) and conflict detection
 */
export function analyzeComposability(
  skillName: string,
  telemetry: SessionTelemetryRecord[],
  window?: number,
): ComposabilityReport {
  // filter() returns a fresh array, so the in-place sort below never touches the caller's data.
  let sessions = telemetry.filter((r) => r && Array.isArray(r.skills_triggered));

  // Apply window: sort by timestamp descending, keep the first (most recent) N.
  if (window && window > 0) {
    sessions = sessions
      .sort((a, b) => (b.timestamp ?? "").localeCompare(a.timestamp ?? ""))
      .slice(0, window);
  }

  // Sessions where the target skill was triggered
  const skillSessions = sessions.filter((r) => r.skills_triggered.includes(skillName));

  // Average errors when the target skill is the only skill in the session
  const errorsAlone = averageErrors(
    skillSessions.filter((r) => r.skills_triggered.length === 1),
    0,
  );

  // Every other skill seen in a session with the target skill
  const coSkills = new Set<string>();
  for (const r of skillSessions) {
    for (const s of r.skills_triggered) {
      if (s !== skillName) coSkills.add(s);
    }
  }

  const pairs: CoOccurrencePair[] = [];
  for (const coSkill of coSkills) {
    // Sessions where BOTH skills are triggered together
    const togetherSessions = skillSessions.filter((r) => r.skills_triggered.includes(coSkill));
    const errorsTogether = averageErrors(togetherSessions, 0);

    // Baseline considers both skills on their own to avoid false positives:
    // a co-skill that is error-prone by itself should not be blamed on the pairing.
    const errorsCoSkillAlone = averageErrors(
      sessions.filter(
        (r) => r.skills_triggered.includes(coSkill) && !r.skills_triggered.includes(skillName),
      ),
      errorsAlone,
    );
    const baselineAlone = Math.max(errorsAlone, errorsCoSkillAlone);

    // conflict_score = clamp((errors_together - baseline) / (baseline + 1), 0, 1)
    const conflictScore = clamp((errorsTogether - baselineAlone) / (baselineAlone + 1), 0, 1);
    const conflictDetected = conflictScore > 0.3;

    const pair: CoOccurrencePair = {
      skill_a: skillName,
      skill_b: coSkill,
      co_occurrence_count: togetherSessions.length,
      conflict_detected: conflictDetected,
    };

    if (conflictDetected) {
      // Report the baseline that actually produced the score, so the numbers
      // in the message are consistent with conflict_score.
      pair.conflict_reason = `conflict_score=${conflictScore.toFixed(3)} (avg errors together=${errorsTogether.toFixed(1)} vs alone=${baselineAlone.toFixed(1)})`;
    }

    pairs.push(pair);
  }

  // Sort by co-occurrence count descending for readability
  pairs.sort((a, b) => b.co_occurrence_count - a.co_occurrence_count);

  return {
    pairs,
    total_sessions_analyzed: skillSessions.length,
    conflict_count: pairs.filter((p) => p.conflict_detected).length,
    generated_at: new Date().toISOString(),
  };
}
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Skill unit test generator.
|
|
3
|
+
*
|
|
4
|
+
* Uses an LLM to generate unit test cases from skill content and eval failures.
|
|
5
|
+
* Tests are output as SkillUnitTest[] JSON arrays.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { EvalEntry, SkillUnitTest } from "../types.js";
|
|
9
|
+
|
|
10
|
+
// Note: we don't use stripMarkdownFences from llm-call.ts because it
|
|
11
|
+
// assumes JSON objects (looks for `{`), but we return JSON arrays.
|
|
12
|
+
|
|
13
|
+
/**
 * Strip markdown fences and isolate JSON array content from an LLM reply.
 *
 * Steps:
 *  1. Trim the reply.
 *  2. If it contains a fenced code block, keep only the block's content. A
 *     reply that is entirely one fence is preferred; otherwise the first
 *     fence found anywhere in the text is used, so preamble or trailing
 *     chatter around the fence is tolerated (CRLF line endings included).
 *  3. Slice from the first `[` to the last `]`, dropping text on either side.
 *     When there is no closing `]` after the opening bracket, everything from
 *     the first `[` onwards is kept; when there is no `[`, the text is
 *     returned as-is.
 *
 * The result is not validated; callers still need to JSON.parse it.
 */
function stripArrayFences(raw: string): string {
  let text = raw.trim();

  // Strip markdown code fences
  const fenced =
    /^```\w*\n([\s\S]*?)\n```$/.exec(text) ??
    /```\w*[ \t]*\r?\n([\s\S]*?)\r?\n[ \t]*```/.exec(text);
  const fencedBody = fenced?.[1];
  if (fencedBody !== undefined) {
    text = fencedBody.trim();
  }

  // Keep only the bracketed span in case there is text before or after it
  const start = text.indexOf("[");
  if (start >= 0) {
    const end = text.lastIndexOf("]");
    text = end > start ? text.slice(start, end + 1) : text.slice(start);
  }

  return text;
}
|
|
31
|
+
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// LLM caller type (injectable for testing)
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
export type LlmCaller = (systemPrompt: string, userPrompt: string) => Promise<string>;
|
|
37
|
+
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
// Prompt building
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
const SYSTEM_PROMPT = `You are a test engineer generating skill unit tests.
|
|
43
|
+
Given a skill name, its content/description, and optionally some eval failures,
|
|
44
|
+
generate unit test cases as a JSON array of objects.
|
|
45
|
+
|
|
46
|
+
Each test object must have:
|
|
47
|
+
- id: unique string (e.g. "gen-1", "gen-2")
|
|
48
|
+
- skill_name: the skill name provided
|
|
49
|
+
- query: a user query that would test this skill
|
|
50
|
+
- assertions: array of assertion objects, each with:
|
|
51
|
+
- type: one of "contains", "not_contains", "regex", "tool_called", "tool_not_called", "json_path"
|
|
52
|
+
- value: the value to check for
|
|
53
|
+
- description: (optional) human-readable description of what this checks
|
|
54
|
+
- tags: (optional) array of tag strings like ["generated", "smoke"]
|
|
55
|
+
|
|
56
|
+
Focus on:
|
|
57
|
+
1. Covering different invocation patterns (explicit, implicit, contextual)
|
|
58
|
+
2. Testing edge cases from eval failures if provided
|
|
59
|
+
3. Verifying expected tools are called
|
|
60
|
+
4. Checking output contains expected content
|
|
61
|
+
|
|
62
|
+
Respond with ONLY a JSON array. No explanation.`;
|
|
63
|
+
|
|
64
|
+
/**
 * Build the user prompt for test generation.
 *
 * The prompt lists the skill name and content, then (only when
 * `evalFailures` is non-empty) one bullet per failing eval query, then an
 * example test case serialized as JSON, then the final instruction.
 *
 * Each failing query is embedded with JSON.stringify so that quotes and line
 * breaks inside a query are escaped and every failure stays on a single
 * bullet line; for plain queries the output is the query wrapped in double
 * quotes.
 *
 * @param skillName - Name of the skill under test
 * @param skillContent - Skill content/description included verbatim
 * @param evalFailures - Eval entries to list as failures (may be empty)
 * @returns The prompt text, with sections joined by newlines
 */
export function buildGenerationPrompt(
  skillName: string,
  skillContent: string,
  evalFailures: EvalEntry[],
): string {
  const parts: string[] = [`Skill name: ${skillName}`, "", "Skill content:", skillContent, ""];

  if (evalFailures.length > 0) {
    parts.push("Eval failures (queries that failed trigger checks):");
    for (const f of evalFailures) {
      const quotedQuery = JSON.stringify(f.query);
      parts.push(
        ` - query: ${quotedQuery} (should_trigger=${f.should_trigger}, type=${f.invocation_type ?? "unknown"})`,
      );
    }
    parts.push("");
  }

  // One worked example so the model sees the exact object shape expected back.
  const exampleTests = [
    {
      id: "example-1",
      skill_name: skillName,
      query: "example query for this skill",
      assertions: [
        {
          type: "contains",
          value: "expected output",
          description: "checks for expected content",
        },
        { type: "tool_called", value: "Write", description: "verifies Write tool was used" },
      ],
      tags: ["generated"],
    },
  ];

  parts.push("Example test case format:");
  parts.push(JSON.stringify(exampleTests, null, 2));
  parts.push("");
  parts.push("Generate 5-10 diverse test cases covering the skill's functionality.");

  return parts.join("\n");
}
|
|
111
|
+
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
// Generate unit tests
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
|
|
116
|
+
/**
 * Generate unit tests for a skill using an LLM. Returns empty array on error.
 *
 * The LLM reply is cleaned with stripArrayFences() and parsed as JSON. A
 * reply that is not a JSON array yields `[]`. Array elements that are not
 * plain objects (null, primitives, nested arrays) are dropped with a warning
 * instead of aborting the whole batch. Each remaining test keeps its own
 * `skill_name` when that is truthy and otherwise receives `skillName`.
 *
 * Any failure (LLM call rejected, invalid JSON, ...) is logged with
 * console.warn and results in `[]`; this function does not throw.
 *
 * @param skillName - Skill the tests are generated for
 * @param skillContent - Skill content/description passed into the prompt
 * @param evalFailures - Failing eval entries passed into the prompt
 * @param llmCaller - Function that sends (systemPrompt, userPrompt) to an LLM
 * @returns The generated tests, or an empty array on error
 */
export async function generateUnitTests(
  skillName: string,
  skillContent: string,
  evalFailures: EvalEntry[],
  llmCaller: LlmCaller,
): Promise<SkillUnitTest[]> {
  try {
    const userPrompt = buildGenerationPrompt(skillName, skillContent, evalFailures);
    const raw = await llmCaller(SYSTEM_PROMPT, userPrompt);
    const cleaned = stripArrayFences(raw);

    const parsed = JSON.parse(cleaned);
    if (!Array.isArray(parsed)) {
      console.warn("[WARN] LLM did not return a JSON array for unit test generation");
      return [];
    }

    // Keep only plain objects: a stray null would otherwise throw on property
    // access below and discard every valid test along with it.
    const objects = parsed.filter(
      (item) => typeof item === "object" && item !== null && !Array.isArray(item),
    );
    const dropped = parsed.length - objects.length;
    if (dropped > 0) {
      console.warn(`[WARN] Dropped ${dropped} non-object entries from generated unit tests`);
    }

    // Ensure skill_name is populated on each test
    return objects.map((t: SkillUnitTest) => ({
      ...t,
      skill_name: t.skill_name || skillName,
    }));
  } catch (err) {
    console.warn("[WARN] Failed to generate unit tests:", err);
    return [];
  }
}
|
|
@@ -26,7 +26,9 @@ import type {
|
|
|
26
26
|
SkillUsageRecord,
|
|
27
27
|
} from "../types.js";
|
|
28
28
|
import { readJsonl } from "../utils/jsonl.js";
|
|
29
|
+
import { detectAgent } from "../utils/llm-call.js";
|
|
29
30
|
import { seededShuffle } from "../utils/seeded-random.js";
|
|
31
|
+
import { generateSyntheticEvals } from "./synthetic-evals.js";
|
|
30
32
|
|
|
31
33
|
// ---------------------------------------------------------------------------
|
|
32
34
|
// Query truncation
|
|
@@ -359,7 +361,7 @@ export function printEvalStats(
|
|
|
359
361
|
// CLI entry point
|
|
360
362
|
// ---------------------------------------------------------------------------
|
|
361
363
|
|
|
362
|
-
export function cliMain(): void {
|
|
364
|
+
export async function cliMain(): Promise<void> {
|
|
363
365
|
const { values } = parseArgs({
|
|
364
366
|
options: {
|
|
365
367
|
skill: { type: "string" },
|
|
@@ -373,10 +375,71 @@ export function cliMain(): void {
|
|
|
373
375
|
"skill-log": { type: "string", default: SKILL_LOG },
|
|
374
376
|
"query-log": { type: "string", default: QUERY_LOG },
|
|
375
377
|
"telemetry-log": { type: "string", default: TELEMETRY_LOG },
|
|
378
|
+
synthetic: { type: "boolean", default: false },
|
|
379
|
+
"skill-path": { type: "string" },
|
|
380
|
+
model: { type: "string" },
|
|
376
381
|
},
|
|
377
382
|
strict: true,
|
|
378
383
|
});
|
|
379
384
|
|
|
385
|
+
// --- Synthetic mode: generate evals from SKILL.md via LLM ---
|
|
386
|
+
if (values.synthetic) {
|
|
387
|
+
if (!values.skill) {
|
|
388
|
+
console.error("[ERROR] --skill required with --synthetic");
|
|
389
|
+
process.exit(1);
|
|
390
|
+
}
|
|
391
|
+
if (!values["skill-path"]) {
|
|
392
|
+
console.error("[ERROR] --skill-path required with --synthetic");
|
|
393
|
+
process.exit(1);
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
const agent = detectAgent();
|
|
397
|
+
if (!agent) {
|
|
398
|
+
console.error("[ERROR] No agent CLI found (claude/codex/opencode). Install one first.");
|
|
399
|
+
process.exit(1);
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
const maxPerSide = Number.parseInt(values.max ?? "50", 10);
|
|
403
|
+
const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
404
|
+
|
|
405
|
+
console.log(`Generating synthetic evals for skill '${values.skill}'...`);
|
|
406
|
+
const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
|
|
407
|
+
maxPositives: effectiveMax,
|
|
408
|
+
maxNegatives: effectiveMax,
|
|
409
|
+
modelFlag: values.model,
|
|
410
|
+
});
|
|
411
|
+
|
|
412
|
+
const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
|
|
413
|
+
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
|
|
414
|
+
|
|
415
|
+
const pos = evalSet.filter((e) => e.should_trigger);
|
|
416
|
+
const neg = evalSet.filter((e) => !e.should_trigger);
|
|
417
|
+
|
|
418
|
+
console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
|
|
419
|
+
console.log(` Positives (should_trigger=true) : ${pos.length}`);
|
|
420
|
+
console.log(` Negatives (should_trigger=false): ${neg.length}`);
|
|
421
|
+
|
|
422
|
+
if (pos.length > 0) {
|
|
423
|
+
const types = new Map<string, number>();
|
|
424
|
+
for (const e of pos) {
|
|
425
|
+
const t = e.invocation_type ?? "?";
|
|
426
|
+
types.set(t, (types.get(t) ?? 0) + 1);
|
|
427
|
+
}
|
|
428
|
+
console.log("\n Positive invocation types:");
|
|
429
|
+
for (const [t, c] of [...types.entries()].sort()) {
|
|
430
|
+
console.log(` ${t.padEnd(15)} ${c}`);
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
console.log("\nNext steps:");
|
|
435
|
+
console.log(" bun run cli/selftune/eval/run-eval.ts \\");
|
|
436
|
+
console.log(` --eval-set ${outputPath} \\`);
|
|
437
|
+
console.log(` --skill-path ${values["skill-path"]} \\`);
|
|
438
|
+
console.log(" --runs-per-query 3 --verbose");
|
|
439
|
+
return;
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
// --- Log-based mode (original behavior) ---
|
|
380
443
|
const skillRecords = readJsonl<SkillUsageRecord>(values["skill-log"] ?? SKILL_LOG);
|
|
381
444
|
const queryRecords = readJsonl<QueryLogRecord>(values["query-log"] ?? QUERY_LOG);
|
|
382
445
|
const telemetryRecords = readJsonl<SessionTelemetryRecord>(
|
|
@@ -418,5 +481,8 @@ export function cliMain(): void {
|
|
|
418
481
|
}
|
|
419
482
|
|
|
420
483
|
if (import.meta.main) {
|
|
421
|
-
cliMain()
|
|
484
|
+
cliMain().catch((err) => {
|
|
485
|
+
console.error(err);
|
|
486
|
+
process.exit(1);
|
|
487
|
+
});
|
|
422
488
|
}
|