selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
@@ -0,0 +1,177 @@
1
+ /**
2
+ * validate-routing.ts
3
+ *
4
+ * Validates a routing table evolution proposal by checking structural validity
5
+ * and running trigger accuracy checks against an eval set.
6
+ */
7
+
8
+ import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
9
+ import { callLlm } from "../utils/llm-call.js";
10
+ import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Structural validation
14
+ // ---------------------------------------------------------------------------
15
+
16
/**
 * Check that a routing table is a well-formed markdown table with
 * `| Trigger | Workflow |` columns.
 *
 * Blank lines are ignored. The remaining lines must be, in order:
 *   1. a header row wrapped in pipes that has a "Trigger" column and a
 *      separate "Workflow" column (case-insensitive substring match per cell),
 *   2. a separator row with at least two cells, each made only of dashes with
 *      optional alignment colons, and containing at least one `---` run,
 *   3. one or more data rows wrapped in pipes with at least two cells each.
 *
 * @param routing - Raw markdown text of the routing table.
 * @returns `valid` plus a human-readable `reason` describing the first failed
 *          check, or confirming success.
 */
export function validateRoutingStructure(routing: string): { valid: boolean; reason: string } {
  const invalid = (reason: string): { valid: boolean; reason: string } => ({
    valid: false,
    reason,
  });

  // Split a row into trimmed cells, ignoring the optional outer pipes.
  const cellsOf = (row: string): string[] =>
    row
      .replace(/^\|/, "")
      .replace(/\|$/, "")
      .split("|")
      .map((cell) => cell.trim());

  // The length guard rejects a lone "|", which would otherwise satisfy both
  // startsWith("|") and endsWith("|") with the same character.
  const isWrappedRow = (row: string): boolean =>
    row.length >= 2 && row.startsWith("|") && row.endsWith("|");

  const lines = routing
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  if (lines.length < 2) {
    return invalid("Routing table must have at least a header and one data row");
  }

  const [headerLine = "", separatorLine = "", ...dataRows] = lines;

  if (!isWrappedRow(headerLine)) {
    return invalid("Header row must be a markdown table row starting and ending with |");
  }

  // Trigger and Workflow must be two distinct columns, not words that merely
  // appear somewhere in the header line.
  const headerCells = cellsOf(headerLine.toLowerCase());
  const hasBothColumns = headerCells.some(
    (cell, i) =>
      cell.includes("trigger") &&
      headerCells.some((other, j) => j !== i && other.includes("workflow")),
  );
  if (!hasBothColumns) {
    return invalid("Header must contain 'Trigger' and 'Workflow' columns");
  }

  // A bare substring test for "---" would let a data row such as
  // `| a --- b | wf |` pass as the separator, so every cell is checked too.
  const separatorCells = cellsOf(separatorLine);
  const isSeparator =
    separatorLine.includes("---") &&
    separatorCells.length >= 2 &&
    separatorCells.every((cell) => /^:?-+:?$/.test(cell));
  if (!isSeparator) {
    return invalid("Second row must be a markdown table separator (contains ---)");
  }

  if (dataRows.length === 0) {
    return invalid("Routing table must have at least one data row");
  }

  for (const [index, row] of dataRows.entries()) {
    // Each data row needs at least a trigger cell and a workflow cell.
    if (!isWrappedRow(row) || cellsOf(row).length < 2) {
      return invalid(`Data row ${index + 1} is not a valid markdown table row`);
    }
  }

  return { valid: true, reason: "Valid markdown routing table" };
}
65
+
66
+ // ---------------------------------------------------------------------------
67
+ // Trigger accuracy validation
68
+ // ---------------------------------------------------------------------------
69
+
70
/**
 * Compare trigger accuracy of the original and the proposed routing content
 * over an eval set.
 *
 * For every eval entry the original routing is checked first, then the
 * proposed routing; each check builds a trigger-check prompt, sends it through
 * `callLlm`, and parses the reply with `parseTriggerResponse`. An entry passes
 * when the parsed result agrees with `entry.should_trigger`. Calls are awaited
 * one at a time, in eval-set order.
 *
 * @param originalRouting - Routing content currently in use.
 * @param proposedRouting - Candidate routing content.
 * @param evalSet - Entries providing `query` and `should_trigger`.
 * @param agent - Agent identifier forwarded to `callLlm`.
 * @param modelFlag - Optional model flag forwarded to `callLlm`.
 * @returns Pass rates (0..1) before and after, and whether the proposed rate is
 *          strictly higher. An empty eval set yields zeros and `improved: false`.
 */
export async function validateRoutingTriggerAccuracy(
  originalRouting: string,
  proposedRouting: string,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<{ before_pass_rate: number; after_pass_rate: number; improved: boolean }> {
  const total = evalSet.length;
  if (total === 0) {
    return { before_pass_rate: 0, after_pass_rate: 0, improved: false };
  }

  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";

  // True when the trigger decision for `routing` matches the entry's expectation.
  const agreesWithExpectation = async (routing: string, entry: EvalEntry): Promise<boolean> => {
    const prompt = buildTriggerCheckPrompt(routing, entry.query);
    const reply = await callLlm(systemPrompt, prompt, agent, modelFlag);
    const triggered = parseTriggerResponse(reply);
    return Boolean(entry.should_trigger) === Boolean(triggered);
  };

  const passes = { before: 0, after: 0 };
  for (const entry of evalSet) {
    const beforeOk = await agreesWithExpectation(originalRouting, entry);
    const afterOk = await agreesWithExpectation(proposedRouting, entry);
    passes.before += beforeOk ? 1 : 0;
    passes.after += afterOk ? 1 : 0;
  }

  const before_pass_rate = passes.before / total;
  const after_pass_rate = passes.after / total;
  return { before_pass_rate, after_pass_rate, improved: after_pass_rate > before_pass_rate };
}
118
+
119
+ // ---------------------------------------------------------------------------
120
+ // Full routing validation
121
+ // ---------------------------------------------------------------------------
122
+
123
/**
 * Validate a routing table proposal through two gates.
 *
 * Gate 1 ("structural") runs `validateRoutingStructure` on the proposed body.
 * If it fails, the result is returned immediately with zero gates passed and
 * the accuracy gate is never run. Gate 2 ("trigger_accuracy") runs
 * `validateRoutingTriggerAccuracy` and passes only when the proposed routing
 * scores strictly higher than the original.
 *
 * @param proposal - Proposal supplying `proposal_id`, `original_body` and `proposed_body`.
 * @param evalSet - Eval entries used by the accuracy gate.
 * @param agent - Agent identifier forwarded to the accuracy gate.
 * @param modelFlag - Optional model flag forwarded to the accuracy gate.
 * @returns Gate-by-gate results; `improved` is true only when both gates pass.
 *          `regressions` is always returned empty by this function.
 */
export async function validateRoutingProposal(
  proposal: BodyEvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<BodyValidationResult> {
  const gatesTotal = 2;

  // Gate 1: structural validation
  const { valid: structureOk, reason: structureReason } = validateRoutingStructure(
    proposal.proposed_body,
  );
  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [
    { gate: "structural", passed: structureOk, reason: structureReason },
  ];

  if (!structureOk) {
    return {
      proposal_id: proposal.proposal_id,
      gates_passed: 0,
      gates_total: gatesTotal,
      gate_results: gateResults,
      improved: false,
      regressions: [],
    };
  }

  // Gate 2: trigger accuracy
  const { before_pass_rate, after_pass_rate, improved } = await validateRoutingTriggerAccuracy(
    proposal.original_body,
    proposal.proposed_body,
    evalSet,
    agent,
    modelFlag,
  );
  const asPercent = (rate: number): string => `${(rate * 100).toFixed(1)}%`;
  const verdict = improved ? "Improved" : "Not improved";
  gateResults.push({
    gate: "trigger_accuracy",
    passed: improved,
    reason: `${verdict}: ${asPercent(before_pass_rate)} -> ${asPercent(after_pass_rate)}`,
  });

  const gatesPassed = gateResults.reduce((count, gate) => (gate.passed ? count + 1 : count), 0);

  return {
    proposal_id: proposal.proposal_id,
    gates_passed: gatesPassed,
    gates_total: gatesTotal,
    gate_results: gateResults,
    improved: gatesPassed === gatesTotal,
    regressions: [],
  };
}
@@ -0,0 +1,200 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * auto-grade.ts
4
+ *
5
+ * Frictionless grading command that auto-finds the most recent real session
6
+ * for a skill, auto-derives expectations from SKILL.md, grades, and outputs results.
7
+ *
8
+ * Usage:
9
+ * selftune auto-grade --skill <name> [--skill-path <path>] [--session-id <id>] [--telemetry-log <path>] [--output <path>] [--agent <agent>] [--show-transcript]
10
+ */
11
+
12
+ import { mkdirSync, writeFileSync } from "node:fs";
13
+ import { dirname } from "node:path";
14
+ import { parseArgs } from "node:util";
15
+
16
+ import { AGENT_CANDIDATES, TELEMETRY_LOG } from "../constants.js";
17
+ import type { GradingResult, SessionTelemetryRecord } from "../types.js";
18
+ import { readJsonl } from "../utils/jsonl.js";
19
+ import { detectAgent as _detectAgent } from "../utils/llm-call.js";
20
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
21
+ import { readExcerpt } from "../utils/transcript.js";
22
+ import {
23
+ buildDefaultGradingOutputPath,
24
+ deriveExpectationsFromSkill,
25
+ gradeSession,
26
+ resolveLatestSessionForSkill,
27
+ resolveSessionById,
28
+ } from "./grade-session.js";
29
+
30
/**
 * CLI entry point for `selftune auto-grade`.
 *
 * Steps, in order: parse flags; print help and exit 0 if requested; require
 * `--skill`; pick the agent (validated `--agent`, otherwise `_detectAgent()`);
 * load telemetry and skill-usage records; resolve the session (explicit
 * `--session-id`, otherwise the latest session for the skill); read the
 * transcript excerpt; derive expectations; grade; write the result JSON; print
 * a summary. Informational, warning and error lines go to stderr, results to
 * stdout. Any unrecoverable condition prints an `[ERROR]` line and exits with
 * status 1.
 */
export async function cliMain(): Promise<void> {
  /** Print an error line to stderr and terminate with exit status 1. */
  function die(message: string): never {
    console.error(message);
    process.exit(1);
  }

  const { values } = parseArgs({
    options: {
      skill: { type: "string" },
      "skill-path": { type: "string" },
      "session-id": { type: "string" },
      "telemetry-log": { type: "string", default: TELEMETRY_LOG },
      output: { type: "string" },
      agent: { type: "string" },
      "show-transcript": { type: "boolean", default: false },
      help: { type: "boolean", short: "h", default: false },
    },
    strict: true,
  });

  if (values.help) {
    console.log(`selftune auto-grade — Frictionless skill session grading

Usage:
selftune auto-grade --skill <name> [options]

Options:
--skill Skill name (required)
--skill-path Path to SKILL.md (auto-detected if omitted)
--session-id Grade a specific session (auto-detects most recent if omitted)
--telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
--output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
--agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
--show-transcript Print transcript excerpt before grading
-h, --help Show this help message`);
    process.exit(0);
  }

  const skill = values.skill;
  if (!skill) {
    die("[ERROR] --skill is required");
  }

  // --- Determine agent ---
  const supportedAgents = [...AGENT_CANDIDATES];
  const requestedAgent = values.agent;
  if (requestedAgent && !supportedAgents.includes(requestedAgent)) {
    die(
      `[ERROR] Invalid --agent '${requestedAgent}'. Expected one of: ${supportedAgents.join(", ")}`,
    );
  }
  // Auto-detection only runs when no agent was requested explicitly.
  const agent: string | null = requestedAgent ? requestedAgent : _detectAgent();
  if (!agent) {
    die(
      `[ERROR] No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH.\n` +
        "Install one of the supported agent CLIs.",
    );
  }

  console.error(`[INFO] Auto-grade via agent: ${agent}`);

  // --- Auto-find session ---
  const telemetryLog = values["telemetry-log"] ?? TELEMETRY_LOG;
  const telRecords = readJsonl<SessionTelemetryRecord>(telemetryLog);
  const skillUsageRecords = readEffectiveSkillUsageRecords();

  let telemetry: SessionTelemetryRecord;
  let sessionId: string;
  let transcriptPath: string;

  const explicitSessionId = values["session-id"];
  if (explicitSessionId) {
    sessionId = explicitSessionId;
    const match = resolveSessionById(telRecords, sessionId);
    if (!match) {
      die(
        `[ERROR] Session '${sessionId}' not found in telemetry or recoverable transcript data. ` +
          "Check the session ID or omit --session-id to auto-select the latest matching session.",
      );
    }
    telemetry = match.telemetry;
    transcriptPath = match.transcriptPath;
  } else {
    const latest = resolveLatestSessionForSkill(telRecords, skillUsageRecords, skill);
    if (!latest) {
      die(
        `[ERROR] No session found for skill '${skill}'. Run the skill first, or pass --session-id.`,
      );
    }
    telemetry = latest.telemetry;
    sessionId = latest.sessionId ?? "unknown";
    transcriptPath = latest.transcriptPath ?? "";
    // Mention where the session came from unless it is plain telemetry.
    let note = "";
    if (latest.source !== "telemetry") {
      note = ` (${latest.source.replaceAll("_", " ")})`;
    }
    console.error(`[INFO] Found most recent '${skill}' session: ${sessionId}${note}`);
  }

  let transcriptExcerpt = "(no transcript)";
  if (transcriptPath) {
    transcriptExcerpt = readExcerpt(transcriptPath);
  }

  if (values["show-transcript"]) {
    console.log("=== TRANSCRIPT EXCERPT ===");
    console.log(transcriptExcerpt);
    console.log("==========================\n");
  }

  // --- Auto-derive expectations ---
  const derived = deriveExpectationsFromSkill(skill, values["skill-path"]);
  console.error(
    derived.derived
      ? `[INFO] Auto-derived ${derived.expectations.length} expectations from ${derived.source}`
      : `[WARN] Using generic expectations (${derived.source})`,
  );
  const expectations = derived.expectations;

  // --- Grade ---
  let result: GradingResult;
  try {
    result = await gradeSession({
      expectations,
      telemetry,
      sessionId,
      skillName: skill,
      transcriptExcerpt,
      transcriptPath,
      agent,
    });
  } catch (err) {
    die(`[ERROR] ${err instanceof Error ? err.message : String(err)}`);
  }

  // --- Persist result ---
  const outputPath = values.output ?? buildDefaultGradingOutputPath(sessionId);
  const outputDir = dirname(outputPath);
  // dirname() yields "." for a bare filename; nothing to create in that case.
  if (outputDir !== ".") {
    mkdirSync(outputDir, { recursive: true });
  }
  writeFileSync(outputPath, JSON.stringify(result, null, 2), "utf-8");

  // --- Print summary ---
  const { summary } = result;
  const passRate = summary.pass_rate ?? 0;
  let meanStr = "";
  if (summary.mean_score != null) {
    meanStr = ` | mean score: ${summary.mean_score.toFixed(2)}`;
  }
  console.log(
    `\nResults: ${summary.passed}/${summary.total} passed (${Math.round(passRate * 100)}%)${meanStr}`,
  );

  for (const exp of result.expectations ?? []) {
    const icon = exp.passed ? "\u2713" : "\u2717";
    const scoreStr = exp.score != null ? ` [${exp.score.toFixed(1)}]` : "";
    const sourceStr = exp.source ? ` (${exp.source})` : "";
    const headline = String(exp.text ?? "").slice(0, 70);
    console.log(` ${icon}${scoreStr}${sourceStr} ${headline}`);
    if (!exp.passed) {
      // Show a truncated evidence line for failed expectations only.
      const evidence = String(exp.evidence ?? "").slice(0, 100);
      console.log(` -> ${evidence}`);
    }
  }

  console.log(`\nWrote ${outputPath}`);
}
193
+
194
// Run the CLI only when this module is the program entry point
// (`import.meta.main`), never when it is imported by another module.
// A rejection from cliMain() is reported as fatal and exits with status 1.
if (import.meta.main) {
  const reportFatal = (err: unknown): never => {
    console.error(`[FATAL] ${err}`);
    process.exit(1);
  };
  cliMain().then(undefined, reportFatal);
}
+ }