selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
@@ -1,124 +1,12 @@
1
1
  /**
2
- * selftune dashboard — Exports JSONL data into a standalone HTML viewer.
2
+ * selftune dashboard — Start the local React SPA dashboard server.
3
3
  *
4
4
  * Usage:
5
- * selftune dashboard — Open dashboard in default browser
6
- * selftune dashboard --export Export data-embedded HTML to stdout
7
- * selftune dashboard --out FILE Write data-embedded HTML to FILE
5
+ * selftune dashboard — Start server on port 3141 and open browser
6
+ * selftune dashboard --port 8080 Start on custom port
7
+ * selftune dashboard --serve Deprecated alias for the default behavior
8
8
  */
9
9
 
10
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
11
- import { homedir } from "node:os";
12
- import { dirname, join, resolve } from "node:path";
13
- import { EVOLUTION_AUDIT_LOG, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "./constants.js";
14
- import { getLastDeployedProposal, readAuditTrail } from "./evolution/audit.js";
15
- import { computeMonitoringSnapshot } from "./monitoring/watch.js";
16
- import type {
17
- EvolutionAuditEntry,
18
- QueryLogRecord,
19
- SessionTelemetryRecord,
20
- SkillUsageRecord,
21
- } from "./types.js";
22
- import { readJsonl } from "./utils/jsonl.js";
23
-
24
- function findViewerHTML(): string {
25
- // Try relative to this module first (works for both dev and installed)
26
- const candidates = [
27
- join(dirname(import.meta.dir), "..", "dashboard", "index.html"),
28
- join(dirname(import.meta.dir), "dashboard", "index.html"),
29
- resolve("dashboard", "index.html"),
30
- ];
31
- for (const c of candidates) {
32
- if (existsSync(c)) return c;
33
- }
34
- throw new Error("Could not find dashboard/index.html. Ensure it exists in the selftune repo.");
35
- }
36
-
37
- function buildEmbeddedHTML(): string {
38
- const template = readFileSync(findViewerHTML(), "utf-8");
39
-
40
- const telemetry = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
41
- const skills = readJsonl<SkillUsageRecord>(SKILL_LOG);
42
- const queries = readJsonl<QueryLogRecord>(QUERY_LOG);
43
- const evolution = readJsonl<EvolutionAuditEntry>(EVOLUTION_AUDIT_LOG);
44
-
45
- const totalRecords = telemetry.length + skills.length + queries.length + evolution.length;
46
-
47
- if (totalRecords === 0) {
48
- console.error("No log data found. Run some sessions first.");
49
- console.error(` Checked: ${TELEMETRY_LOG}`);
50
- console.error(` ${SKILL_LOG}`);
51
- console.error(` ${QUERY_LOG}`);
52
- console.error(` ${EVOLUTION_AUDIT_LOG}`);
53
- process.exit(1);
54
- }
55
-
56
- // Compute per-skill monitoring snapshots
57
- const skillNames = [...new Set(skills.map((r) => r.skill_name))];
58
- const snapshots: Record<string, ReturnType<typeof computeMonitoringSnapshot>> = {};
59
- for (const name of skillNames) {
60
- const lastDeployed = getLastDeployedProposal(name);
61
- const baselinePassRate = lastDeployed?.eval_snapshot?.pass_rate ?? 0.5;
62
- snapshots[name] = computeMonitoringSnapshot(
63
- name,
64
- telemetry,
65
- skills,
66
- queries,
67
- telemetry.length,
68
- baselinePassRate,
69
- );
70
- }
71
-
72
- // Compute unmatched queries
73
- const triggeredQueries = new Set(
74
- skills.filter((r) => r.triggered).map((r) => r.query.toLowerCase().trim()),
75
- );
76
- const unmatched = queries
77
- .filter((q) => !triggeredQueries.has(q.query.toLowerCase().trim()))
78
- .map((q) => ({
79
- timestamp: q.timestamp,
80
- session_id: q.session_id,
81
- query: q.query,
82
- }));
83
-
84
- // Compute pending proposals
85
- const auditTrail = readAuditTrail();
86
- const proposalStatus: Record<string, string[]> = {};
87
- for (const e of auditTrail) {
88
- if (!proposalStatus[e.proposal_id]) proposalStatus[e.proposal_id] = [];
89
- proposalStatus[e.proposal_id].push(e.action);
90
- }
91
- // Deduplicate by proposal_id: one entry per pending proposal
92
- const terminalActions = new Set(["deployed", "rejected", "rolled_back"]);
93
- const seenProposals = new Set<string>();
94
- const pendingProposals = auditTrail.filter((e) => {
95
- if (e.action !== "created" && e.action !== "validated") return false;
96
- if (seenProposals.has(e.proposal_id)) return false;
97
- const actions = proposalStatus[e.proposal_id] || [];
98
- const isPending = !actions.some((a: string) => terminalActions.has(a));
99
- if (isPending) seenProposals.add(e.proposal_id);
100
- return isPending;
101
- });
102
-
103
- const data = {
104
- telemetry,
105
- skills,
106
- queries,
107
- evolution,
108
- computed: {
109
- snapshots,
110
- unmatched,
111
- pendingProposals,
112
- },
113
- };
114
-
115
- // Inject embedded data right before </body>
116
- // Escape </script> sequences to prevent XSS via embedded JSON
117
- const safeJson = JSON.stringify(data).replace(/<\/script>/gi, "<\\/script>");
118
- const dataScript = `<script id="embedded-data" type="application/json">${safeJson}</script>`;
119
- return template.replace("</body>", `${dataScript}\n</body>`);
120
- }
121
-
122
10
  export async function cliMain(): Promise<void> {
123
11
  const args = process.argv.slice(2);
124
12
 
@@ -126,51 +14,50 @@ export async function cliMain(): Promise<void> {
126
14
  console.log(`selftune dashboard — Visual data dashboard
127
15
 
128
16
  Usage:
129
- selftune dashboard Open dashboard in default browser
130
- selftune dashboard --export Export data-embedded HTML to stdout
131
- selftune dashboard --out FILE Write data-embedded HTML to FILE`);
17
+ selftune dashboard Start dashboard server (port 3141)
18
+ selftune dashboard --port 8080 Start on custom port
19
+ selftune dashboard --serve Deprecated alias for default behavior
20
+ selftune dashboard --no-open Start server without opening browser`);
132
21
  process.exit(0);
133
22
  }
134
23
 
135
- if (args.includes("--export")) {
136
- process.stdout.write(buildEmbeddedHTML());
137
- return;
24
+ if (args.includes("--export") || args.includes("--out")) {
25
+ console.error("Legacy dashboard export was removed.");
26
+ console.error(
27
+ "Use `selftune dashboard` to run the SPA locally, then share a route or screenshot instead.",
28
+ );
29
+ process.exit(1);
138
30
  }
139
31
 
140
- const outIdx = args.indexOf("--out");
141
- if (outIdx !== -1) {
142
- const outPath = args[outIdx + 1];
143
- if (!outPath) {
144
- console.error("--out requires a file path argument");
32
+ const portIdx = args.indexOf("--port");
33
+ let port: number | undefined;
34
+ if (portIdx !== -1) {
35
+ const parsed = Number.parseInt(args[portIdx + 1], 10);
36
+ if (!Number.isInteger(parsed) || parsed < 1 || parsed > 65535) {
37
+ console.error(`Invalid port "${args[portIdx + 1]}": must be an integer between 1 and 65535.`);
145
38
  process.exit(1);
146
39
  }
147
- const html = buildEmbeddedHTML();
148
- writeFileSync(outPath, html, "utf-8");
149
- console.log(`Dashboard written to ${outPath}`);
150
- return;
151
- }
152
-
153
- // Default: write to temp file and open in browser
154
- const tmpDir = join(homedir(), ".selftune");
155
- if (!existsSync(tmpDir)) {
156
- mkdirSync(tmpDir, { recursive: true });
157
- }
158
- const tmpPath = join(tmpDir, "dashboard.html");
159
- const html = buildEmbeddedHTML();
160
- writeFileSync(tmpPath, html, "utf-8");
161
-
162
- console.log(`Dashboard saved to ${tmpPath}`);
163
- console.log("Opening in browser...");
164
-
165
- try {
166
- const platform = process.platform;
167
- const cmd = platform === "darwin" ? "open" : platform === "linux" ? "xdg-open" : null;
168
- if (!cmd) throw new Error("Unsupported platform");
169
- const proc = Bun.spawn([cmd, tmpPath], { stdio: ["ignore", "ignore", "ignore"] });
170
- await proc.exited;
171
- if (proc.exitCode !== 0) throw new Error(`Failed to launch ${cmd}`);
172
- } catch {
173
- console.log(`Open manually: file://${tmpPath}`);
174
- }
175
- process.exit(0);
40
+ port = parsed;
41
+ }
42
+
43
+ if (args.includes("--serve")) {
44
+ console.warn("`selftune dashboard --serve` is deprecated; use `selftune dashboard` instead.");
45
+ }
46
+
47
+ const openBrowser = !args.includes("--no-open");
48
+ const { startDashboardServer } = await import("./dashboard-server.js");
49
+ const { stop } = await startDashboardServer({ port, openBrowser });
50
+ await new Promise<void>((resolve) => {
51
+ let closed = false;
52
+ const keepAlive = setInterval(() => {}, 1 << 30);
53
+ const shutdown = () => {
54
+ if (closed) return;
55
+ closed = true;
56
+ clearInterval(keepAlive);
57
+ stop();
58
+ resolve();
59
+ };
60
+ process.on("SIGINT", shutdown);
61
+ process.on("SIGTERM", shutdown);
62
+ });
176
63
  }
@@ -0,0 +1,248 @@
1
+ /**
2
+ * baseline.ts
3
+ *
4
+ * Measures the value a skill adds over a no-skill baseline.
5
+ *
6
+ * Runs trigger checks against an EMPTY string description (no-skill baseline)
7
+ * and against the current description (with-skill), then computes lift.
8
+ * A skill "adds value" when lift >= 0.05 (5 percentage points).
9
+ */
10
+
11
+ import { parseArgs } from "node:util";
12
+
13
+ import type { BaselineResult, EvalEntry } from "../types.js";
14
+ import { callLlm } from "../utils/llm-call.js";
15
+ import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
16
+
17
+ // ---------------------------------------------------------------------------
18
+ // Types
19
+ // ---------------------------------------------------------------------------
20
+
/** Inputs for one baseline measurement run. */
export interface BaselineOptions {
  /** Eval entries to check; each one is checked without and with the skill description. */
  evalSet: EvalEntry[];
  /** Description text used for the with-skill trigger check. */
  skillDescription: string;
  /** Skill name copied onto the measurement and onto every per-entry record. */
  skillName: string;
  /** Agent identifier forwarded to the LLM call. */
  agent: string;
  /** Optional model flag forwarded to the LLM call. */
  modelFlag?: string;
}
28
+
/** Aggregate outcome of comparing the no-skill baseline with the with-skill run. */
export interface BaselineMeasurement {
  /** Name of the skill that was measured. */
  skill_name: string;
  /** Fraction of eval entries that passed with an empty description. */
  baseline_pass_rate: number;
  /** Fraction of eval entries that passed with the skill description. */
  with_skill_pass_rate: number;
  /** `with_skill_pass_rate` minus `baseline_pass_rate`. */
  lift: number;
  /** True when `lift` reaches the lift threshold. */
  adds_value: boolean;
  /** One record per individual check (baseline and with-skill) for each eval entry. */
  per_entry: BaselineResult[];
  /** ISO-8601 timestamp taken when the measurement was assembled. */
  measured_at: string;
}
38
+
/**
 * Optional dependency overrides for measureBaseline().
 *
 * Any member left out falls back to the real module import. Tests can supply
 * a stub here so that no real LLM call is made.
 */
export interface BaselineDeps {
  /** Replacement for the imported `callLlm`. */
  callLlm?: typeof callLlm;
}
47
+ // ---------------------------------------------------------------------------
48
+ // Constants
49
+ // ---------------------------------------------------------------------------
50
+
/** Minimum lift (with-skill pass rate minus baseline pass rate) for a skill to count as adding value. */
const LIFT_THRESHOLD = 0.05;

/** System prompt sent with every trigger-check LLM call. */
const SYSTEM_PROMPT = "You are an evaluation assistant. Answer only YES or NO.";
53
+
54
+ // ---------------------------------------------------------------------------
55
+ // Core measurement
56
+ // ---------------------------------------------------------------------------
57
+
/**
 * Measure trigger accuracy without and with the skill description.
 *
 * Every eval entry is checked twice, in order: first against an empty
 * description (the no-skill baseline), then against `skillDescription`.
 * A check passes when the parsed trigger decision agrees with the entry's
 * `should_trigger`. Each check appends one record to `per_entry`, so the
 * result carries two records per eval entry. LLM calls are awaited one at a
 * time.
 *
 * @param options - Eval set, skill identity and agent settings.
 * @param _deps - Optional overrides; `callLlm` replaces the imported LLM call.
 * @returns Pass rates for both modes, their difference (`lift`), and whether
 *   the lift reaches `LIFT_THRESHOLD`. An empty eval set yields all-zero rates
 *   with `adds_value: false` and makes no LLM calls.
 */
export async function measureBaseline(
  options: BaselineOptions,
  _deps: BaselineDeps = {},
): Promise<BaselineMeasurement> {
  const { evalSet, skillDescription, skillName, agent, modelFlag } = options;
  const llm = _deps.callLlm ?? callLlm;
  const total = evalSet.length;

  // Nothing to measure: report zeros instead of dividing by zero below.
  if (total === 0) {
    return {
      skill_name: skillName,
      baseline_pass_rate: 0,
      with_skill_pass_rate: 0,
      lift: 0,
      adds_value: false,
      per_entry: [],
      measured_at: new Date().toISOString(),
    };
  }

  const records: BaselineResult[] = [];

  // Run one trigger check, record it, and report whether it passed.
  const runCheck = async (
    entry: EvalEntry,
    description: string,
    withSkill: boolean,
  ): Promise<boolean> => {
    const prompt = buildTriggerCheckPrompt(description, entry.query);
    const raw = await llm(SYSTEM_PROMPT, prompt, agent, modelFlag);
    const triggered = parseTriggerResponse(raw);
    const pass = (entry.should_trigger && triggered) || (!entry.should_trigger && !triggered);

    records.push({
      skill_name: skillName,
      query: entry.query,
      with_skill: withSkill,
      triggered,
      pass,
      measured_at: new Date().toISOString(),
    });
    return pass;
  };

  let baselineHits = 0;
  let withSkillHits = 0;

  for (const entry of evalSet) {
    // Baseline first (empty description), then the real description.
    if (await runCheck(entry, "", false)) baselineHits += 1;
    if (await runCheck(entry, skillDescription, true)) withSkillHits += 1;
  }

  const baselineRate = baselineHits / total;
  const withSkillRate = withSkillHits / total;
  const lift = withSkillRate - baselineRate;

  return {
    skill_name: skillName,
    baseline_pass_rate: baselineRate,
    with_skill_pass_rate: withSkillRate,
    lift,
    adds_value: lift >= LIFT_THRESHOLD,
    per_entry: records,
    measured_at: new Date().toISOString(),
  };
}
136
+
137
+ // ---------------------------------------------------------------------------
138
+ // CLI entry point
139
+ // ---------------------------------------------------------------------------
140
+
/**
 * CLI entry point for the baseline measurement.
 *
 * Parses `--skill`, `--skill-path`, `--eval-set`, `--agent` and `--help`,
 * loads the eval set (from the given JSON file, or built from logs when
 * `--eval-set` is omitted), resolves the agent CLI, runs measureBaseline()
 * and prints the result as JSON.
 *
 * Exit codes: 0 when the skill adds value (or after `--help`), 1 when it does
 * not or when any input is missing or invalid.
 */
export async function cliMain(): Promise<void> {
  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      "eval-set": { type: "string" },
      agent: { type: "string" },
      help: { type: "boolean", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(`selftune grade baseline — Measure skill value vs. no-skill baseline

Usage:
  selftune grade baseline --skill <name> --skill-path <path> [options]

Options:
  --skill       Skill name (required)
  --skill-path  Path to SKILL.md (required)
  --eval-set    Path to eval set JSON (optional, builds from logs if omitted)
  --agent       Agent CLI to use (claude, codex, opencode)
  --help        Show this help message`);
    process.exit(0);
  }

  if (!values.skill || !values["skill-path"]) {
    console.error("[ERROR] --skill and --skill-path are required");
    process.exit(1);
  }

  const { existsSync, readFileSync } = await import("node:fs");

  // Read skill description.
  // NOTE(review): the entire SKILL.md contents are used as the description — confirm this is intended.
  const skillPath = values["skill-path"];
  if (!existsSync(skillPath)) {
    console.error(`[ERROR] SKILL.md not found at ${skillPath}`);
    process.exit(1);
  }
  const skillDescription = readFileSync(skillPath, "utf-8");

  // Load eval set
  let evalSet: EvalEntry[];
  const evalSetPath = values["eval-set"];
  if (evalSetPath) {
    // An explicitly requested file must exist; silently falling back to logs
    // would measure a different eval set than the one the caller asked for.
    if (!existsSync(evalSetPath)) {
      console.error(`[ERROR] Eval set not found at ${evalSetPath}`);
      process.exit(1);
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(readFileSync(evalSetPath, "utf-8"));
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`[ERROR] Could not parse eval set at ${evalSetPath}: ${reason}`);
      process.exit(1);
    }

    // measureBaseline() iterates the eval set, so anything but an array is invalid.
    if (!Array.isArray(parsed)) {
      console.error(`[ERROR] Eval set at ${evalSetPath} must be a JSON array of eval entries`);
      process.exit(1);
    }
    evalSet = parsed;
  } else {
    // Build from logs
    const { QUERY_LOG } = await import("../constants.js");
    const { readJsonl } = await import("../utils/jsonl.js");
    const { readEffectiveSkillUsageRecords } = await import("../utils/skill-log.js");
    const { buildEvalSet } = await import("./hooks-to-evals.js");
    const skillRecords = readEffectiveSkillUsageRecords();
    const queryRecords = readJsonl(QUERY_LOG);
    evalSet = buildEvalSet(skillRecords, queryRecords, values.skill);
  }

  // Detect agent: an explicit --agent must be on PATH, otherwise auto-detect.
  const { detectAgent } = await import("../utils/llm-call.js");
  const requestedAgent = values.agent;
  if (requestedAgent && !Bun.which(requestedAgent)) {
    console.error(
      JSON.stringify({
        level: "error",
        code: "agent_not_in_path",
        message: `Agent CLI '${requestedAgent}' not found in PATH.`,
        action: "Install it or omit --agent to use auto-detection.",
      }),
    );
    process.exit(1);
  }
  const agent = requestedAgent ?? detectAgent();
  if (!agent) {
    console.error(
      JSON.stringify({
        level: "error",
        code: "agent_not_found",
        message: "No agent CLI (claude/codex/opencode) found in PATH.",
        action: "Install Claude Code, Codex, or OpenCode.",
      }),
    );
    process.exit(1);
  }

  const result = await measureBaseline({
    evalSet,
    skillDescription,
    skillName: values.skill,
    agent,
  });

  console.log(JSON.stringify(result, null, 2));
  // The exit code mirrors the verdict so callers can gate on it.
  process.exit(result.adds_value ? 0 : 1);
}
236
+
// Run the CLI only when this module is the entry script.
if (import.meta.main) {
  // Report any uncaught failure as one JSON line on stderr, then exit non-zero.
  const reportFatal = (err: unknown): void => {
    const isError = err instanceof Error;
    const payload = {
      level: "fatal",
      message: isError ? err.message : String(err),
      stack: isError ? err.stack : undefined,
    };
    console.error(JSON.stringify(payload));
    process.exit(1);
  };

  cliMain().catch(reportFatal);
}