selftune 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +156 -0
- package/.claude/agents/evolution-reviewer.md +180 -0
- package/.claude/agents/integration-guide.md +212 -0
- package/.claude/agents/pattern-analyst.md +160 -0
- package/CHANGELOG.md +46 -1
- package/README.md +105 -257
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +99 -0
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +103 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +1049 -0
- package/cli/selftune/dashboard.ts +43 -156
- package/cli/selftune/eval/baseline.ts +248 -0
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +101 -16
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +586 -0
- package/cli/selftune/evolution/evolve.ts +825 -116
- package/cli/selftune/evolution/extract-patterns.ts +105 -16
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +21 -4
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +513 -42
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +415 -48
- package/cli/selftune/ingestors/claude-replay.ts +377 -0
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +376 -16
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +90 -16
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +59 -33
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +525 -5
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +103 -19
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +307 -26
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +28 -11
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +180 -33
- package/skill/Workflows/AutoActivation.md +145 -0
- package/skill/Workflows/Badge.md +124 -0
- package/skill/Workflows/Baseline.md +144 -0
- package/skill/Workflows/Composability.md +107 -0
- package/skill/Workflows/Contribute.md +94 -0
- package/skill/Workflows/Cron.md +132 -0
- package/skill/Workflows/Dashboard.md +214 -0
- package/skill/Workflows/Doctor.md +63 -14
- package/skill/Workflows/Evals.md +110 -18
- package/skill/Workflows/EvolutionMemory.md +154 -0
- package/skill/Workflows/Evolve.md +181 -21
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +117 -0
- package/skill/Workflows/Ingest.md +142 -21
- package/skill/Workflows/Initialize.md +91 -23
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +91 -0
- package/skill/Workflows/Rollback.md +23 -4
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +150 -0
- package/skill/Workflows/Watch.md +33 -1
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
- package/dashboard/index.html +0 -1119
|
@@ -1,124 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* selftune dashboard —
|
|
2
|
+
* selftune dashboard — Start the local React SPA dashboard server.
|
|
3
3
|
*
|
|
4
4
|
* Usage:
|
|
5
|
-
* selftune dashboard —
|
|
6
|
-
* selftune dashboard --
|
|
7
|
-
* selftune dashboard --
|
|
5
|
+
* selftune dashboard — Start server on port 3141 and open browser
|
|
6
|
+
* selftune dashboard --port 8080 — Start on custom port
|
|
7
|
+
* selftune dashboard --serve — Deprecated alias for the default behavior
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
11
|
-
import { homedir } from "node:os";
|
|
12
|
-
import { dirname, join, resolve } from "node:path";
|
|
13
|
-
import { EVOLUTION_AUDIT_LOG, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "./constants.js";
|
|
14
|
-
import { getLastDeployedProposal, readAuditTrail } from "./evolution/audit.js";
|
|
15
|
-
import { computeMonitoringSnapshot } from "./monitoring/watch.js";
|
|
16
|
-
import type {
|
|
17
|
-
EvolutionAuditEntry,
|
|
18
|
-
QueryLogRecord,
|
|
19
|
-
SessionTelemetryRecord,
|
|
20
|
-
SkillUsageRecord,
|
|
21
|
-
} from "./types.js";
|
|
22
|
-
import { readJsonl } from "./utils/jsonl.js";
|
|
23
|
-
|
|
24
|
-
/**
 * Locate the static dashboard viewer page on disk.
 *
 * Candidates are probed in this order and the first existing one wins:
 *   1. `<parent of this module's dir>/../dashboard/index.html`
 *   2. `<parent of this module's dir>/dashboard/index.html`
 *   3. `dashboard/index.html` resolved against the current working directory
 *
 * @returns Path of the first candidate that exists.
 * @throws Error when none of the candidates exists.
 */
function findViewerHTML(): string {
  const moduleParent = dirname(import.meta.dir);
  const match = [
    join(moduleParent, "..", "dashboard", "index.html"),
    join(moduleParent, "dashboard", "index.html"),
    resolve("dashboard", "index.html"),
  ].find((candidate) => existsSync(candidate));

  if (match === undefined) {
    throw new Error("Could not find dashboard/index.html. Ensure it exists in the selftune repo.");
  }
  return match;
}
|
|
36
|
-
|
|
37
|
-
/**
 * Build a self-contained dashboard page by embedding all log data into the
 * viewer HTML template as a `<script type="application/json">` element.
 *
 * Reads the telemetry, skill-usage, query and evolution-audit logs, derives
 * per-skill monitoring snapshots, unmatched queries and pending proposals,
 * and injects the combined payload right before `</body>`.
 *
 * Side effect: prints the checked log paths to stderr and exits the process
 * with code 1 when all four logs are empty.
 *
 * @returns The template HTML with the embedded data script added.
 * @throws Error (from findViewerHTML) when the template cannot be located.
 */
function buildEmbeddedHTML(): string {
  const template = readFileSync(findViewerHTML(), "utf-8");

  const telemetry = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
  const skills = readJsonl<SkillUsageRecord>(SKILL_LOG);
  const queries = readJsonl<QueryLogRecord>(QUERY_LOG);
  const evolution = readJsonl<EvolutionAuditEntry>(EVOLUTION_AUDIT_LOG);

  const totalRecords = telemetry.length + skills.length + queries.length + evolution.length;

  if (totalRecords === 0) {
    console.error("No log data found. Run some sessions first.");
    console.error(`  Checked: ${TELEMETRY_LOG}`);
    console.error(`           ${SKILL_LOG}`);
    console.error(`           ${QUERY_LOG}`);
    console.error(`           ${EVOLUTION_AUDIT_LOG}`);
    process.exit(1);
  }

  // Compute per-skill monitoring snapshots
  const skillNames = [...new Set(skills.map((r) => r.skill_name))];
  const snapshots: Record<string, ReturnType<typeof computeMonitoringSnapshot>> = {};
  for (const name of skillNames) {
    const lastDeployed = getLastDeployedProposal(name);
    // Fall back to 0.5 when no deployed proposal carries an eval snapshot.
    const baselinePassRate = lastDeployed?.eval_snapshot?.pass_rate ?? 0.5;
    snapshots[name] = computeMonitoringSnapshot(
      name,
      telemetry,
      skills,
      queries,
      telemetry.length,
      baselinePassRate,
    );
  }

  // Compute unmatched queries: logged queries that never triggered any skill
  // (compared case-insensitively, ignoring surrounding whitespace).
  const triggeredQueries = new Set(
    skills.filter((r) => r.triggered).map((r) => r.query.toLowerCase().trim()),
  );
  const unmatched = queries
    .filter((q) => !triggeredQueries.has(q.query.toLowerCase().trim()))
    .map((q) => ({
      timestamp: q.timestamp,
      session_id: q.session_id,
      query: q.query,
    }));

  // Compute pending proposals. A Map (not a plain object) keeps lookups safe
  // for ids that collide with Object.prototype keys such as "constructor".
  const auditTrail = readAuditTrail();
  const actionsByProposal = new Map<string, string[]>();
  for (const e of auditTrail) {
    const actions = actionsByProposal.get(e.proposal_id);
    if (actions) {
      actions.push(e.action);
    } else {
      actionsByProposal.set(e.proposal_id, [e.action]);
    }
  }
  // Deduplicate by proposal_id: one entry per pending proposal
  const terminalActions = new Set(["deployed", "rejected", "rolled_back"]);
  const seenProposals = new Set<string>();
  const pendingProposals = auditTrail.filter((e) => {
    if (e.action !== "created" && e.action !== "validated") return false;
    if (seenProposals.has(e.proposal_id)) return false;
    const actions = actionsByProposal.get(e.proposal_id) ?? [];
    const isPending = !actions.some((a) => terminalActions.has(a));
    if (isPending) seenProposals.add(e.proposal_id);
    return isPending;
  });

  const data = {
    telemetry,
    skills,
    queries,
    evolution,
    computed: {
      snapshots,
      unmatched,
      pendingProposals,
    },
  };

  // Escape every "<" as \u003c (still valid JSON) so logged text can neither
  // close the script element ("</script>") nor open an HTML comment or nested
  // script ("<!--", "<script") inside it.
  const safeJson = JSON.stringify(data).replace(/</g, "\\u003c");
  const dataScript = `<script id="embedded-data" type="application/json">${safeJson}</script>`;

  const bodyClose = "</body>";
  if (!template.includes(bodyClose)) {
    // No closing body tag to anchor on: append rather than silently drop the data.
    return `${template}\n${dataScript}\n`;
  }
  // Use a replacer function: with a string replacement, "$&", "$'", "$`" and
  // "$$" occurring in the logged data would be expanded as replacement patterns.
  return template.replace(bodyClose, () => `${dataScript}\n${bodyClose}`);
}
|
|
121
|
-
|
|
122
10
|
export async function cliMain(): Promise<void> {
|
|
123
11
|
const args = process.argv.slice(2);
|
|
124
12
|
|
|
@@ -126,51 +14,50 @@ export async function cliMain(): Promise<void> {
|
|
|
126
14
|
console.log(`selftune dashboard — Visual data dashboard
|
|
127
15
|
|
|
128
16
|
Usage:
|
|
129
|
-
selftune dashboard
|
|
130
|
-
selftune dashboard --
|
|
131
|
-
selftune dashboard --
|
|
17
|
+
selftune dashboard Start dashboard server (port 3141)
|
|
18
|
+
selftune dashboard --port 8080 Start on custom port
|
|
19
|
+
selftune dashboard --serve Deprecated alias for default behavior
|
|
20
|
+
selftune dashboard --no-open Start server without opening browser`);
|
|
132
21
|
process.exit(0);
|
|
133
22
|
}
|
|
134
23
|
|
|
135
|
-
if (args.includes("--export")) {
|
|
136
|
-
|
|
137
|
-
|
|
24
|
+
if (args.includes("--export") || args.includes("--out")) {
|
|
25
|
+
console.error("Legacy dashboard export was removed.");
|
|
26
|
+
console.error(
|
|
27
|
+
"Use `selftune dashboard` to run the SPA locally, then share a route or screenshot instead.",
|
|
28
|
+
);
|
|
29
|
+
process.exit(1);
|
|
138
30
|
}
|
|
139
31
|
|
|
140
|
-
const
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
32
|
+
const portIdx = args.indexOf("--port");
|
|
33
|
+
let port: number | undefined;
|
|
34
|
+
if (portIdx !== -1) {
|
|
35
|
+
const parsed = Number.parseInt(args[portIdx + 1], 10);
|
|
36
|
+
if (!Number.isInteger(parsed) || parsed < 1 || parsed > 65535) {
|
|
37
|
+
console.error(`Invalid port "${args[portIdx + 1]}": must be an integer between 1 and 65535.`);
|
|
145
38
|
process.exit(1);
|
|
146
39
|
}
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
const
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
await proc.exited;
|
|
171
|
-
if (proc.exitCode !== 0) throw new Error(`Failed to launch ${cmd}`);
|
|
172
|
-
} catch {
|
|
173
|
-
console.log(`Open manually: file://${tmpPath}`);
|
|
174
|
-
}
|
|
175
|
-
process.exit(0);
|
|
40
|
+
port = parsed;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
if (args.includes("--serve")) {
|
|
44
|
+
console.warn("`selftune dashboard --serve` is deprecated; use `selftune dashboard` instead.");
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
const openBrowser = !args.includes("--no-open");
|
|
48
|
+
const { startDashboardServer } = await import("./dashboard-server.js");
|
|
49
|
+
const { stop } = await startDashboardServer({ port, openBrowser });
|
|
50
|
+
await new Promise<void>((resolve) => {
|
|
51
|
+
let closed = false;
|
|
52
|
+
const keepAlive = setInterval(() => {}, 1 << 30);
|
|
53
|
+
const shutdown = () => {
|
|
54
|
+
if (closed) return;
|
|
55
|
+
closed = true;
|
|
56
|
+
clearInterval(keepAlive);
|
|
57
|
+
stop();
|
|
58
|
+
resolve();
|
|
59
|
+
};
|
|
60
|
+
process.on("SIGINT", shutdown);
|
|
61
|
+
process.on("SIGTERM", shutdown);
|
|
62
|
+
});
|
|
176
63
|
}
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* baseline.ts
|
|
3
|
+
*
|
|
4
|
+
* Measures the value a skill adds over a no-skill baseline.
|
|
5
|
+
*
|
|
6
|
+
* Runs trigger checks against an EMPTY string description (no-skill baseline)
|
|
7
|
+
* and against the current description (with-skill), then computes lift.
|
|
8
|
+
* A skill "adds value" when lift >= 0.05 (5 percentage points).
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { parseArgs } from "node:util";
|
|
12
|
+
|
|
13
|
+
import type { BaselineResult, EvalEntry } from "../types.js";
|
|
14
|
+
import { callLlm } from "../utils/llm-call.js";
|
|
15
|
+
import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
|
|
16
|
+
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Types
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
/** Inputs for a single measureBaseline() run. */
export interface BaselineOptions {
  /** Eval entries to check; each entry's `query` and `should_trigger` are read. */
  evalSet: EvalEntry[];
  /** Description text used for the with-skill trigger check. */
  skillDescription: string;
  /** Skill name copied into the measurement and every per-entry result. */
  skillName: string;
  /** Agent identifier forwarded to the LLM call. */
  agent: string;
  /** Optional model flag forwarded to the LLM call. */
  modelFlag?: string;
}
|
|
28
|
+
|
|
29
|
+
/** Aggregate outcome of comparing a skill against the empty-description baseline. */
export interface BaselineMeasurement {
  /** Name of the measured skill. */
  skill_name: string;
  /** Fraction of eval entries that passed with an empty description. */
  baseline_pass_rate: number;
  /** Fraction of eval entries that passed with the real description. */
  with_skill_pass_rate: number;
  /** with_skill_pass_rate minus baseline_pass_rate. */
  lift: number;
  /** True when lift reaches the lift threshold. */
  adds_value: boolean;
  /** Per-check results: a baseline result followed by a with-skill result for each eval entry. */
  per_entry: BaselineResult[];
  /** ISO-8601 timestamp taken when the measurement was assembled. */
  measured_at: string;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Optional dependency overrides for measureBaseline().
 *
 * Any member left out falls back to the real module import; tests supply
 * replacements here so that no real LLM call is made.
 */
export interface BaselineDeps {
  /** Replacement for the LLM call used by the trigger checks. */
  callLlm?: typeof callLlm;
}
|
|
46
|
+
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
// Constants
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
/** Minimum lift (with-skill pass rate minus baseline pass rate) for a skill to count as adding value. */
const LIFT_THRESHOLD = 0.05;

/** System prompt sent with every trigger-check LLM call. */
const SYSTEM_PROMPT = "You are an evaluation assistant. Answer only YES or NO.";
|
|
53
|
+
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
// Core measurement
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
/**
 * Run one trigger check for `entry` against `description` and record the outcome.
 *
 * A check passes when the parsed trigger decision matches `entry.should_trigger`.
 */
async function runTriggerCheck(
  entry: EvalEntry,
  description: string,
  withSkill: boolean,
  skillName: string,
  agent: string,
  modelFlag: string | undefined,
  llm: typeof callLlm,
): Promise<BaselineResult> {
  const prompt = buildTriggerCheckPrompt(description, entry.query);
  const raw = await llm(SYSTEM_PROMPT, prompt, agent, modelFlag);
  const triggered = parseTriggerResponse(raw);
  return {
    skill_name: skillName,
    query: entry.query,
    with_skill: withSkill,
    triggered,
    pass: entry.should_trigger ? triggered : !triggered,
    measured_at: new Date().toISOString(),
  };
}

/**
 * Measure baseline vs. with-skill trigger accuracy across an eval set.
 *
 * For every eval entry, two trigger checks are run one after the other: first
 * with an empty description (the no-skill baseline), then with
 * `options.skillDescription`. Both results are appended to `per_entry` in that
 * order.
 *
 * @param options Eval set, skill description/name, and agent settings.
 * @param _deps   Optional overrides (e.g. a stubbed `callLlm` for tests).
 * @returns Pass rates, lift, the `adds_value` verdict, and per-check results.
 *          An empty eval set yields all-zero rates and `adds_value: false`
 *          without making any LLM call.
 */
export async function measureBaseline(
  options: BaselineOptions,
  _deps: BaselineDeps = {},
): Promise<BaselineMeasurement> {
  const { evalSet, skillDescription, skillName, agent, modelFlag } = options;
  const _callLlm = _deps.callLlm ?? callLlm;

  if (evalSet.length === 0) {
    return {
      skill_name: skillName,
      baseline_pass_rate: 0,
      with_skill_pass_rate: 0,
      lift: 0,
      adds_value: false,
      per_entry: [],
      measured_at: new Date().toISOString(),
    };
  }

  const perEntry: BaselineResult[] = [];
  let baselinePassed = 0;
  let withSkillPassed = 0;

  for (const entry of evalSet) {
    // --- Baseline check (empty description) ---
    const baseline = await runTriggerCheck(
      entry,
      "",
      false,
      skillName,
      agent,
      modelFlag,
      _callLlm,
    );
    if (baseline.pass) baselinePassed++;
    perEntry.push(baseline);

    // --- With-skill check (actual description) ---
    const withSkill = await runTriggerCheck(
      entry,
      skillDescription,
      true,
      skillName,
      agent,
      modelFlag,
      _callLlm,
    );
    if (withSkill.pass) withSkillPassed++;
    perEntry.push(withSkill);
  }

  const total = evalSet.length;
  const baselinePassRate = baselinePassed / total;
  const withSkillPassRate = withSkillPassed / total;
  // Derive lift from the integer pass counts with a single division. Subtracting
  // the two already-rounded rates can land just below the threshold for an exact
  // 5-point lift (7/20 - 6/20 evaluates to 0.04999999999999999).
  const lift = (withSkillPassed - baselinePassed) / total;

  return {
    skill_name: skillName,
    baseline_pass_rate: baselinePassRate,
    with_skill_pass_rate: withSkillPassRate,
    lift,
    adds_value: lift >= LIFT_THRESHOLD,
    per_entry: perEntry,
    measured_at: new Date().toISOString(),
  };
}
|
|
136
|
+
|
|
137
|
+
// ---------------------------------------------------------------------------
|
|
138
|
+
// CLI entry point
|
|
139
|
+
// ---------------------------------------------------------------------------
|
|
140
|
+
|
|
141
|
+
/**
 * CLI entry point for the baseline measurement.
 *
 * Parses `--skill`, `--skill-path`, `--eval-set`, `--agent` and `--help`,
 * loads the skill file and the eval set (from `--eval-set` when given,
 * otherwise built from the logs), picks an agent CLI, runs measureBaseline()
 * and prints the result as pretty-printed JSON.
 *
 * Exit codes: 0 when the skill adds value (and after `--help`); 1 when it
 * does not, or on any usage/lookup error.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      "eval-set": { type: "string" },
      agent: { type: "string" },
      help: { type: "boolean", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(`selftune grade baseline — Measure skill value vs. no-skill baseline

Usage:
  selftune grade baseline --skill <name> --skill-path <path> [options]

Options:
  --skill             Skill name (required)
  --skill-path        Path to SKILL.md (required)
  --eval-set          Path to eval set JSON (optional, builds from logs if omitted)
  --agent             Agent CLI to use (claude, codex, opencode)
  --help              Show this help message`);
    process.exit(0);
  }

  if (!values.skill || !values["skill-path"]) {
    console.error("[ERROR] --skill and --skill-path are required");
    process.exit(1);
  }

  const { existsSync, readFileSync } = await import("node:fs");

  // Read skill description (the full contents of the file are used as-is)
  const skillPath = values["skill-path"];
  if (!existsSync(skillPath)) {
    console.error(`[ERROR] SKILL.md not found at ${skillPath}`);
    process.exit(1);
  }
  const skillDescription = readFileSync(skillPath, "utf-8");

  // Load eval set
  const evalSetPath = values["eval-set"];
  let evalSet: EvalEntry[];
  if (evalSetPath) {
    // An explicitly requested eval set must exist; quietly falling back to the
    // logs would measure something other than what the caller asked for.
    if (!existsSync(evalSetPath)) {
      console.error(`[ERROR] Eval set not found at ${evalSetPath}`);
      process.exit(1);
    }
    const parsed: unknown = JSON.parse(readFileSync(evalSetPath, "utf-8"));
    if (!Array.isArray(parsed)) {
      console.error(`[ERROR] Eval set at ${evalSetPath} must be a JSON array of eval entries`);
      process.exit(1);
    }
    evalSet = parsed as EvalEntry[];
  } else {
    // Build from logs
    const { QUERY_LOG } = await import("../constants.js");
    const { readJsonl } = await import("../utils/jsonl.js");
    const { readEffectiveSkillUsageRecords } = await import("../utils/skill-log.js");
    const { buildEvalSet } = await import("./hooks-to-evals.js");
    const skillRecords = readEffectiveSkillUsageRecords();
    const queryRecords = readJsonl(QUERY_LOG);
    evalSet = buildEvalSet(skillRecords, queryRecords, values.skill);
  }

  // Detect agent: an explicit --agent must be on PATH, otherwise auto-detect
  const { detectAgent } = await import("../utils/llm-call.js");
  const requestedAgent = values.agent;
  if (requestedAgent && !Bun.which(requestedAgent)) {
    console.error(
      JSON.stringify({
        level: "error",
        code: "agent_not_in_path",
        message: `Agent CLI '${requestedAgent}' not found in PATH.`,
        action: "Install it or omit --agent to use auto-detection.",
      }),
    );
    process.exit(1);
  }
  const agent = requestedAgent ?? detectAgent();
  if (!agent) {
    console.error(
      JSON.stringify({
        level: "error",
        code: "agent_not_found",
        message: "No agent CLI (claude/codex/opencode) found in PATH.",
        action: "Install Claude Code, Codex, or OpenCode.",
      }),
    );
    process.exit(1);
  }

  const result = await measureBaseline({
    evalSet,
    skillDescription,
    skillName: values.skill,
    agent,
  });

  console.log(JSON.stringify(result, null, 2));
  // Exit status mirrors the verdict so scripts can gate on it
  process.exit(result.adds_value ? 0 : 1);
}
|
|
236
|
+
|
|
237
|
+
// Run the CLI only when this module is the program entry point. Any rejection
// from cliMain() is reported as a single JSON line on stderr, then exit code 1.
if (import.meta.main) {
  const reportFatal = (err: unknown): void => {
    const message = err instanceof Error ? err.message : String(err);
    const stack = err instanceof Error ? err.stack : undefined;
    console.error(JSON.stringify({ level: "fatal", message, stack }));
    process.exit(1);
  };

  cliMain().catch(reportFatal);
}
|