selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
@@ -5,19 +5,26 @@
5
5
  * Rubric-based grader for Claude Code skill sessions.
6
6
  * Migrated from grade_session.py.
7
7
  *
8
- * Grades via installed agent CLI (claude/codex/opencode).
8
+ * Grades via an installed agent CLI selected from AGENT_CANDIDATES.
9
9
  */
10
10
 
11
- import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
12
- import { dirname } from "node:path";
11
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
12
+ import { basename, dirname, join } from "node:path";
13
13
  import { parseArgs } from "node:util";
14
14
 
15
- import { TELEMETRY_LOG } from "../constants.js";
15
+ import {
16
+ AGENT_CANDIDATES,
17
+ CLAUDE_CODE_PROJECTS_DIR,
18
+ SELFTUNE_CONFIG_DIR,
19
+ TELEMETRY_LOG,
20
+ } from "../constants.js";
16
21
  import type {
17
22
  ExecutionMetrics,
18
23
  GraderOutput,
24
+ GradingExpectation,
19
25
  GradingResult,
20
26
  SessionTelemetryRecord,
27
+ SkillUsageRecord,
21
28
  } from "../types.js";
22
29
  import { readJsonl } from "../utils/jsonl.js";
23
30
  import {
@@ -25,7 +32,13 @@ import {
25
32
  stripMarkdownFences as _stripMarkdownFences,
26
33
  callViaAgent,
27
34
  } from "../utils/llm-call.js";
28
- import { readExcerpt } from "../utils/transcript.js";
35
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
36
+ import {
37
+ buildTelemetryFromTranscript,
38
+ findTranscriptPathForSession,
39
+ readExcerpt,
40
+ } from "../utils/transcript.js";
41
+ import { type PreGateContext, runPreGates } from "./pre-gates.js";
29
42
 
30
43
  // Re-export for backward compatibility
31
44
  export { detectAgent, stripMarkdownFences } from "../utils/llm-call.js";
@@ -48,24 +61,36 @@ export const GRADER_SYSTEM = `You are a rigorous skill session evaluator. You re
48
61
  Grade each expectation and output ONLY valid JSON matching this schema:
49
62
  {
50
63
  "expectations": [
51
- {"text": "...", "passed": true/false, "evidence": "specific quote or metric"}
64
+ {"text": "...", "passed": true/false, "evidence": "specific quote or metric", "score": 0.0-1.0}
52
65
  ],
53
- "summary": {"passed": N, "failed": N, "total": N, "pass_rate": 0.0},
66
+ "summary": {"passed": N, "failed": N, "total": N, "pass_rate": 0.0, "mean_score": 0.0},
54
67
  "claims": [
55
68
  {"claim": "...", "type": "factual|process|quality", "verified": true/false, "evidence": "..."}
56
69
  ],
57
70
  "eval_feedback": {
58
71
  "suggestions": [{"assertion": "...", "reason": "..."}],
59
72
  "overall": "one sentence"
60
- }
73
+ },
74
+ "failure_feedback": [
75
+ {"query": "the user query that failed", "failure_reason": "why it failed", "improvement_hint": "how to fix", "invocation_type": "explicit|implicit|contextual|negative"}
76
+ ]
61
77
  }
62
78
 
79
+ Score guide:
80
+ - 1.0: Clear, specific evidence of full completion
81
+ - 0.7-0.9: Strong evidence with minor gaps
82
+ - 0.4-0.6: Partial evidence or partial completion
83
+ - 0.1-0.3: Weak evidence, mostly not met
84
+ - 0.0: No evidence or clearly not met
85
+
63
86
  Rules:
64
87
  - PASS only when there is clear, specific evidence — not assumptions
65
88
  - FAIL when evidence is absent or contradictory
66
89
  - Cite exact quotes or specific metric values
67
90
  - Extract 2-4 implicit claims from the transcript and verify them
68
- - Suggest eval improvements only for clear gaps`;
91
+ - Suggest eval improvements only for clear gaps
92
+ - Set score to reflect confidence level (0.0-1.0)
93
+ - For each FAILED expectation, provide a failure_feedback entry with the relevant query, specific reason for failure, and actionable improvement hint`;
69
94
 
70
95
  // ---------------------------------------------------------------------------
71
96
  // Data lookup helpers
@@ -85,12 +110,148 @@ export function latestSessionForSkill(
85
110
  telemetry: SessionTelemetryRecord[],
86
111
  skillName: string,
87
112
  ): SessionTelemetryRecord | null {
113
+ // First pass: prefer sessions with actual Skill tool invocations (skills_invoked)
114
+ for (let i = telemetry.length - 1; i >= 0; i--) {
115
+ if (telemetry[i].skills_invoked?.includes(skillName)) return telemetry[i];
116
+ }
117
+ // Fallback: sessions where SKILL.md was read (skills_triggered)
88
118
  for (let i = telemetry.length - 1; i >= 0; i--) {
89
119
  if (telemetry[i].skills_triggered?.includes(skillName)) return telemetry[i];
90
120
  }
91
121
  return null;
92
122
  }
93
123
 
124
+ export function latestSkillUsageForSkill(
125
+ skillUsage: SkillUsageRecord[],
126
+ skillName: string,
127
+ ): SkillUsageRecord | null {
128
+ for (let i = skillUsage.length - 1; i >= 0; i--) {
129
+ const record = skillUsage[i];
130
+ if (record.skill_name === skillName && record.triggered) return record;
131
+ }
132
+ return null;
133
+ }
134
+
135
+ export interface ResolvedSessionContext {
136
+ telemetry: SessionTelemetryRecord;
137
+ sessionId: string;
138
+ transcriptPath: string;
139
+ source: "telemetry" | "transcript_fallback" | "skill_usage_fallback";
140
+ }
141
+
142
+ function buildSkillUsageFallbackTelemetry(record: SkillUsageRecord): SessionTelemetryRecord {
143
+ return {
144
+ timestamp: record.timestamp,
145
+ session_id: record.session_id,
146
+ cwd: "",
147
+ transcript_path: "",
148
+ tool_calls: {},
149
+ total_tool_calls: 0,
150
+ bash_commands: [],
151
+ skills_triggered: [record.skill_name],
152
+ skills_invoked: [record.skill_name],
153
+ assistant_turns: 0,
154
+ errors_encountered: 0,
155
+ transcript_chars: 0,
156
+ last_user_query: record.query,
157
+ source: record.source ?? "skill_usage_fallback",
158
+ };
159
+ }
160
+
161
+ export function resolveSessionById(
162
+ telemetry: SessionTelemetryRecord[],
163
+ sessionId: string,
164
+ projectsDir: string = CLAUDE_CODE_PROJECTS_DIR,
165
+ ): ResolvedSessionContext | null {
166
+ const direct = findSession(telemetry, sessionId);
167
+ if (direct) {
168
+ return {
169
+ telemetry: direct,
170
+ sessionId: direct.session_id,
171
+ transcriptPath: direct.transcript_path ?? "",
172
+ source: "telemetry",
173
+ };
174
+ }
175
+
176
+ const transcriptPath = findTranscriptPathForSession(sessionId, projectsDir);
177
+ if (!transcriptPath) return null;
178
+
179
+ const rebuilt = buildTelemetryFromTranscript(sessionId, transcriptPath);
180
+ if (!rebuilt) return null;
181
+
182
+ return {
183
+ telemetry: rebuilt,
184
+ sessionId,
185
+ transcriptPath,
186
+ source: "transcript_fallback",
187
+ };
188
+ }
189
+
190
+ export function resolveLatestSessionForSkill(
191
+ telemetry: SessionTelemetryRecord[],
192
+ skillUsage: SkillUsageRecord[],
193
+ skillName: string,
194
+ projectsDir: string = CLAUDE_CODE_PROJECTS_DIR,
195
+ ): ResolvedSessionContext | null {
196
+ const direct = latestSessionForSkill(telemetry, skillName);
197
+ if (direct) {
198
+ return {
199
+ telemetry: direct,
200
+ sessionId: direct.session_id,
201
+ transcriptPath: direct.transcript_path ?? "",
202
+ source: "telemetry",
203
+ };
204
+ }
205
+
206
+ const usage = latestSkillUsageForSkill(skillUsage, skillName);
207
+ if (!usage) return null;
208
+
209
+ const transcriptPath = findTranscriptPathForSession(usage.session_id, projectsDir);
210
+ if (!transcriptPath) {
211
+ const fallback = buildSkillUsageFallbackTelemetry(usage);
212
+ return {
213
+ telemetry: fallback,
214
+ sessionId: fallback.session_id,
215
+ transcriptPath: fallback.transcript_path,
216
+ source: "skill_usage_fallback",
217
+ };
218
+ }
219
+
220
+ const rebuilt = buildTelemetryFromTranscript(usage.session_id, transcriptPath);
221
+ if (!rebuilt) {
222
+ const fallback = buildSkillUsageFallbackTelemetry(usage);
223
+ fallback.transcript_path = transcriptPath;
224
+ return {
225
+ telemetry: fallback,
226
+ sessionId: fallback.session_id,
227
+ transcriptPath,
228
+ source: "skill_usage_fallback",
229
+ };
230
+ }
231
+
232
+ if (!rebuilt.skills_triggered.includes(skillName)) {
233
+ rebuilt.skills_triggered = [...rebuilt.skills_triggered, skillName];
234
+ }
235
+ if (rebuilt.skills_invoked && !rebuilt.skills_invoked.includes(skillName)) {
236
+ rebuilt.skills_invoked = [...rebuilt.skills_invoked, skillName];
237
+ }
238
+ if (!rebuilt.last_user_query) {
239
+ rebuilt.last_user_query = usage.query;
240
+ }
241
+
242
+ return {
243
+ telemetry: rebuilt,
244
+ sessionId: rebuilt.session_id,
245
+ transcriptPath,
246
+ source: "transcript_fallback",
247
+ };
248
+ }
249
+
250
+ export function buildDefaultGradingOutputPath(sessionId: string): string {
251
+ const safeSessionId = sessionId.replace(/[^a-zA-Z0-9_-]/g, "_");
252
+ return join(SELFTUNE_CONFIG_DIR, "grading", `result-${safeSessionId}.json`);
253
+ }
254
+
94
255
  export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: number): string[] {
95
256
  let data: unknown;
96
257
  try {
@@ -143,6 +304,107 @@ export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: num
143
304
  throw new Error(`Eval ID ${evalId} not found in ${evalsJsonPath}`);
144
305
  }
145
306
 
307
+ // ---------------------------------------------------------------------------
308
+ // Auto-derive expectations from SKILL.md
309
+ // ---------------------------------------------------------------------------
310
+
311
+ export interface DerivedExpectations {
312
+ expectations: string[];
313
+ derived: boolean;
314
+ source: string;
315
+ }
316
+
317
+ const GENERIC_EXPECTATIONS: string[] = [
318
+ "The skill was triggered during the session",
319
+ "The task was completed successfully without critical errors",
320
+ "No unhandled errors were encountered",
321
+ ];
322
+
323
+ /**
324
+ * Derive grading expectations from a skill's SKILL.md file.
325
+ *
326
+ * Resolution order for SKILL.md path:
327
+ * 1. Explicit `skillPath` argument
328
+ * 2. Lookup from skill_usage_log.jsonl records
329
+ * 3. Falls back to generic expectations if not found
330
+ */
331
+ export function deriveExpectationsFromSkill(
332
+ skillName: string,
333
+ skillPath?: string,
334
+ ): DerivedExpectations {
335
+ // Resolve the SKILL.md path
336
+ let resolvedPath = skillPath;
337
+
338
+ if (!resolvedPath) {
339
+ // Try to find from skill_usage_log
340
+ try {
341
+ const usageRecords = readEffectiveSkillUsageRecords();
342
+ for (let i = usageRecords.length - 1; i >= 0; i--) {
343
+ if (usageRecords[i].skill_name === skillName && usageRecords[i].skill_path) {
344
+ resolvedPath = usageRecords[i].skill_path;
345
+ break;
346
+ }
347
+ }
348
+ } catch {
349
+ // skill_usage_log not available
350
+ }
351
+ }
352
+
353
+ if (!resolvedPath || !existsSync(resolvedPath)) {
354
+ return {
355
+ expectations: GENERIC_EXPECTATIONS,
356
+ derived: false,
357
+ source: resolvedPath ? `SKILL.md not found at ${resolvedPath}` : "no SKILL.md path found",
358
+ };
359
+ }
360
+
361
+ // Read and parse SKILL.md
362
+ let content: string;
363
+ try {
364
+ content = readFileSync(resolvedPath, "utf-8");
365
+ } catch {
366
+ return {
367
+ expectations: GENERIC_EXPECTATIONS,
368
+ derived: false,
369
+ source: `failed to read ${resolvedPath}`,
370
+ };
371
+ }
372
+
373
+ const expectations: string[] = [`The "${skillName}" skill was triggered during the session`];
374
+
375
+ // Extract description from first paragraph after title
376
+ const descMatch = content.match(/^#\s+.+\n+([^\n#][^\n]*)/m);
377
+ if (descMatch) {
378
+ const desc = descMatch[1].trim();
379
+ if (desc.length > 10) {
380
+ expectations.push(`The skill fulfilled its purpose: ${desc.slice(0, 120)}`);
381
+ }
382
+ }
383
+
384
+ // Extract "When to Use" section content
385
+ const whenMatch = content.match(/##\s*When\s+to\s+Use\b[^\n]*\n([\s\S]*?)(?=\n##\s|\n---|$)/i);
386
+ if (whenMatch) {
387
+ const lines = whenMatch[1]
388
+ .split("\n")
389
+ .map((l) => l.replace(/^[-*]\s*/, "").trim())
390
+ .filter((l) => l.length > 5);
391
+ if (lines.length > 0) {
392
+ expectations.push(`The session context matched a "When to Use" trigger for ${skillName}`);
393
+ }
394
+ }
395
+
396
+ // Add standard quality expectations
397
+ expectations.push("The task was completed successfully without critical errors");
398
+ expectations.push("No unhandled errors were encountered");
399
+
400
+ // Cap at 5 expectations
401
+ return {
402
+ expectations: expectations.slice(0, 5),
403
+ derived: true,
404
+ source: resolvedPath,
405
+ };
406
+ }
407
+
146
408
  // ---------------------------------------------------------------------------
147
409
  // Execution metrics
148
410
  // ---------------------------------------------------------------------------
@@ -159,6 +421,39 @@ export function buildExecutionMetrics(telemetry: SessionTelemetryRecord): Execut
159
421
  };
160
422
  }
161
423
 
424
+ // ---------------------------------------------------------------------------
425
+ // Graduated scoring
426
+ // ---------------------------------------------------------------------------
427
+
428
+ /**
429
+ * Compute graduated scoring summary from expectations.
430
+ * Uses score field if present, defaults to 1.0 for pass, 0.0 for fail.
431
+ */
432
+ export function buildGraduatedSummary(expectations: GradingExpectation[]): {
433
+ mean_score: number;
434
+ score_std_dev: number;
435
+ } {
436
+ if (expectations.length === 0) {
437
+ return { mean_score: 0, score_std_dev: 0 };
438
+ }
439
+
440
+ const scores = expectations.map((e) => {
441
+ const fallback = e.passed ? 1.0 : 0.0;
442
+ const raw = e.score ?? fallback;
443
+ if (!Number.isFinite(raw)) return fallback;
444
+ return Math.min(1, Math.max(0, raw));
445
+ });
446
+ const mean = scores.reduce((sum, s) => sum + s, 0) / scores.length;
447
+
448
+ const variance = scores.reduce((sum, s) => sum + (s - mean) ** 2, 0) / scores.length;
449
+ const stdDev = Math.sqrt(variance);
450
+
451
+ return {
452
+ mean_score: Math.round(mean * 1000) / 1000,
453
+ score_std_dev: Math.round(stdDev * 1000) / 1000,
454
+ };
455
+ }
456
+
162
457
  // ---------------------------------------------------------------------------
163
458
  // Prompt building
164
459
  // ---------------------------------------------------------------------------
@@ -224,26 +519,148 @@ export async function gradeViaAgent(prompt: string, agent: string): Promise<Grad
224
519
  }
225
520
 
226
521
  // ---------------------------------------------------------------------------
227
- // Result assembly
522
+ // Shared grading flow
228
523
  // ---------------------------------------------------------------------------
229
524
 
230
- export function assembleResult(
231
- graderOutput: GraderOutput,
525
+ function normalizeExpectations(expectations: GradingExpectation[]): GradingExpectation[] {
526
+ return expectations.map((e) => ({
527
+ ...e,
528
+ score: e.score ?? (e.passed ? 1.0 : 0.0),
529
+ source: e.source ?? ("llm" as const),
530
+ }));
531
+ }
532
+
533
+ function assembleResultFromExpectations(
534
+ expectations: GradingExpectation[],
232
535
  telemetry: SessionTelemetryRecord,
233
536
  sessionId: string,
234
537
  skillName: string,
235
538
  transcriptPath: string,
236
539
  ): GradingResult {
540
+ const passedCount = expectations.filter((e) => e.passed).length;
541
+ const totalCount = expectations.length;
542
+ const graduated = buildGraduatedSummary(expectations);
543
+
237
544
  return {
238
545
  session_id: sessionId ?? "unknown",
239
546
  skill_name: skillName ?? "unknown",
240
547
  transcript_path: transcriptPath ?? "",
241
548
  graded_at: new Date().toISOString(),
242
- expectations: graderOutput?.expectations ?? [],
243
- summary: graderOutput?.summary ?? { passed: 0, failed: 0, total: 0, pass_rate: 0 },
549
+ expectations,
550
+ summary: {
551
+ passed: passedCount,
552
+ failed: totalCount - passedCount,
553
+ total: totalCount,
554
+ pass_rate: totalCount > 0 ? passedCount / totalCount : 0,
555
+ mean_score: graduated.mean_score,
556
+ score_std_dev: graduated.score_std_dev,
557
+ },
244
558
  execution_metrics: buildExecutionMetrics(telemetry ?? ({} as SessionTelemetryRecord)),
559
+ claims: [],
560
+ eval_feedback: { suggestions: [], overall: "" },
561
+ };
562
+ }
563
+
564
+ export interface GradeSessionParams {
565
+ expectations: string[];
566
+ telemetry: SessionTelemetryRecord;
567
+ sessionId: string;
568
+ skillName: string;
569
+ transcriptExcerpt: string;
570
+ transcriptPath: string;
571
+ agent: string;
572
+ gradeViaAgentFn?: (prompt: string, agent: string) => Promise<GraderOutput>;
573
+ }
574
+
575
+ export async function gradeSession({
576
+ expectations,
577
+ telemetry,
578
+ sessionId,
579
+ skillName,
580
+ transcriptExcerpt,
581
+ transcriptPath,
582
+ agent,
583
+ gradeViaAgentFn = gradeViaAgent,
584
+ }: GradeSessionParams): Promise<GradingResult> {
585
+ const preGateCtx: PreGateContext = {
586
+ telemetry,
587
+ skillName,
588
+ transcriptExcerpt,
589
+ };
590
+ const preGateResult = runPreGates(expectations, preGateCtx);
591
+
592
+ let allExpectations: GradingExpectation[];
593
+
594
+ if (preGateResult.remaining.length === 0) {
595
+ console.error(
596
+ `[INFO] All ${expectations.length} expectations resolved by pre-gates, skipping LLM`,
597
+ );
598
+ allExpectations = preGateResult.resolved;
599
+ } else {
600
+ console.error(
601
+ `[INFO] Pre-gates resolved ${preGateResult.resolved.length}/${expectations.length} expectations`,
602
+ );
603
+ const prompt = buildGradingPrompt(
604
+ preGateResult.remaining,
605
+ telemetry,
606
+ transcriptExcerpt,
607
+ skillName,
608
+ );
609
+ console.error(
610
+ `Grading ${preGateResult.remaining.length} expectations for skill '${skillName}'...`,
611
+ );
612
+
613
+ let graderOutput: GraderOutput;
614
+ try {
615
+ graderOutput = await gradeViaAgentFn(prompt, agent);
616
+ } catch (err) {
617
+ throw new Error(`Grading failed: ${err instanceof Error ? err.message : String(err)}`, {
618
+ cause: err,
619
+ });
620
+ }
621
+
622
+ const llmExpectations = normalizeExpectations(graderOutput.expectations ?? []);
623
+ if (llmExpectations.length !== preGateResult.remaining.length) {
624
+ throw new Error(
625
+ `Grader returned ${llmExpectations.length} expectations for ${preGateResult.remaining.length} unresolved expectations`,
626
+ );
627
+ }
628
+
629
+ allExpectations = [...preGateResult.resolved, ...llmExpectations];
630
+ }
631
+
632
+ return assembleResultFromExpectations(
633
+ allExpectations,
634
+ telemetry,
635
+ sessionId,
636
+ skillName,
637
+ transcriptPath,
638
+ );
639
+ }
640
+
641
+ // ---------------------------------------------------------------------------
642
+ // Result assembly
643
+ // ---------------------------------------------------------------------------
644
+
645
+ export function assembleResult(
646
+ graderOutput: GraderOutput,
647
+ telemetry: SessionTelemetryRecord,
648
+ sessionId: string,
649
+ skillName: string,
650
+ transcriptPath: string,
651
+ ): GradingResult {
652
+ const result = assembleResultFromExpectations(
653
+ normalizeExpectations(graderOutput?.expectations ?? []),
654
+ telemetry,
655
+ sessionId,
656
+ skillName,
657
+ transcriptPath,
658
+ );
659
+ return {
660
+ ...result,
245
661
  claims: graderOutput?.claims ?? [],
246
662
  eval_feedback: graderOutput?.eval_feedback ?? { suggestions: [], overall: "" },
663
+ failure_feedback: graderOutput?.failure_feedback,
247
664
  };
248
665
  }
249
666
 
@@ -254,10 +671,16 @@ export function assembleResult(
254
671
  function printSummary(result: GradingResult): void {
255
672
  const { summary } = result;
256
673
  const rate = summary.pass_rate ?? 0;
257
- console.log(`\nResults: ${summary.passed}/${summary.total} passed (${Math.round(rate * 100)}%)`);
674
+ const meanStr =
675
+ summary.mean_score != null ? ` | mean score: ${summary.mean_score.toFixed(2)}` : "";
676
+ console.log(
677
+ `\nResults: ${summary.passed}/${summary.total} passed (${Math.round(rate * 100)}%)${meanStr}`,
678
+ );
258
679
  for (const exp of result.expectations ?? []) {
259
680
  const icon = exp.passed ? "\u2713" : "\u2717";
260
- console.log(` ${icon} ${String(exp.text ?? "").slice(0, 70)}`);
681
+ const scoreStr = exp.score != null ? ` [${exp.score.toFixed(1)}]` : "";
682
+ const sourceStr = exp.source ? ` (${exp.source})` : "";
683
+ console.log(` ${icon}${scoreStr}${sourceStr} ${String(exp.text ?? "").slice(0, 70)}`);
261
684
  if (!exp.passed) {
262
685
  console.log(` -> ${String(exp.evidence ?? "").slice(0, 100)}`);
263
686
  }
@@ -280,19 +703,43 @@ export async function cliMain(): Promise<void> {
280
703
  const { values } = parseArgs({
281
704
  options: {
282
705
  skill: { type: "string" },
706
+ "skill-path": { type: "string" },
283
707
  expectations: { type: "string", multiple: true },
284
708
  "evals-json": { type: "string" },
285
709
  "eval-id": { type: "string" },
286
710
  "session-id": { type: "string" },
287
711
  transcript: { type: "string" },
288
712
  "telemetry-log": { type: "string", default: TELEMETRY_LOG },
289
- output: { type: "string", default: "grading.json" },
713
+ output: { type: "string" },
290
714
  agent: { type: "string" },
291
715
  "show-transcript": { type: "boolean", default: false },
716
+ help: { type: "boolean", short: "h", default: false },
292
717
  },
293
718
  strict: true,
294
719
  });
295
720
 
721
+ if (values.help) {
722
+ console.log(`selftune grade — Grade a skill session
723
+
724
+ Usage:
725
+ selftune grade --skill <name> [options]
726
+
727
+ Options:
728
+ --skill Skill name (required)
729
+ --skill-path Path to SKILL.md (for auto-deriving expectations)
730
+ --expectations Expectation strings (repeatable)
731
+ --evals-json Path to evals JSON file
732
+ --eval-id Eval ID within evals JSON
733
+ --session-id Grade a specific session by ID
734
+ --transcript Path to transcript file
735
+ --telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
736
+ --output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
737
+ --agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
738
+ --show-transcript Print transcript excerpt before grading
739
+ -h, --help Show this help message`);
740
+ process.exit(0);
741
+ }
742
+
296
743
  const skill = values.skill;
297
744
  if (!skill) {
298
745
  console.error("[ERROR] --skill is required");
@@ -301,7 +748,7 @@ export async function cliMain(): Promise<void> {
301
748
 
302
749
  // --- Determine agent ---
303
750
  let agent: string | null = null;
304
- const validAgents = ["claude", "codex", "opencode"];
751
+ const validAgents = [...AGENT_CANDIDATES];
305
752
  if (values.agent) {
306
753
  if (!validAgents.includes(values.agent)) {
307
754
  console.error(
@@ -316,8 +763,8 @@ export async function cliMain(): Promise<void> {
316
763
 
317
764
  if (!agent) {
318
765
  console.error(
319
- "[ERROR] No agent CLI (claude/codex/opencode) found in PATH.\n" +
320
- "Install Claude Code, Codex, or OpenCode.",
766
+ `[ERROR] No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH.\n` +
767
+ "Install one of the supported agent CLIs.",
321
768
  );
322
769
  process.exit(1);
323
770
  }
@@ -336,8 +783,18 @@ export async function cliMain(): Promise<void> {
336
783
  } else if (values.expectations?.length) {
337
784
  expectations = values.expectations;
338
785
  } else {
339
- console.error("[ERROR] Provide --expectations or --evals-json + --eval-id");
340
- process.exit(1);
786
+ // Auto-derive expectations from SKILL.md
787
+ const derived = deriveExpectationsFromSkill(skill, values["skill-path"]);
788
+ expectations = derived.expectations;
789
+ if (derived.derived) {
790
+ console.error(
791
+ `[INFO] Auto-derived ${derived.expectations.length} expectations from ${derived.source}`,
792
+ );
793
+ } else {
794
+ console.error(
795
+ `[WARN] No --expectations or --evals-json provided. Using generic expectations (${derived.source})`,
796
+ );
797
+ }
341
798
  }
342
799
 
343
800
  // --- Resolve session ---
@@ -347,9 +804,15 @@ export async function cliMain(): Promise<void> {
347
804
 
348
805
  const telemetryLog = values["telemetry-log"] ?? TELEMETRY_LOG;
349
806
  const telRecords = readJsonl<SessionTelemetryRecord>(telemetryLog);
807
+ const skillUsageRecords = readEffectiveSkillUsageRecords();
350
808
 
351
809
  if (values.transcript) {
352
810
  transcriptPath = values.transcript;
811
+ telemetry =
812
+ buildTelemetryFromTranscript(
813
+ values["session-id"] ?? basename(transcriptPath, ".jsonl"),
814
+ transcriptPath,
815
+ ) ?? ({} as SessionTelemetryRecord);
353
816
  for (let i = telRecords.length - 1; i >= 0; i--) {
354
817
  if (telRecords[i].transcript_path === transcriptPath) {
355
818
  telemetry = telRecords[i];
@@ -357,18 +820,25 @@ export async function cliMain(): Promise<void> {
357
820
  break;
358
821
  }
359
822
  }
823
+ if (telemetry.session_id) sessionId = telemetry.session_id;
360
824
  } else if (values["session-id"]) {
361
825
  sessionId = values["session-id"];
362
- telemetry = findSession(telRecords, sessionId) ?? ({} as SessionTelemetryRecord);
363
- transcriptPath = telemetry.transcript_path ?? "";
826
+ const resolved = resolveSessionById(telRecords, sessionId);
827
+ telemetry = resolved?.telemetry ?? ({} as SessionTelemetryRecord);
828
+ transcriptPath = resolved?.transcriptPath ?? "";
364
829
  } else {
365
- telemetry = latestSessionForSkill(telRecords, skill) ?? ({} as SessionTelemetryRecord);
366
- if (telemetry.session_id) {
367
- sessionId = telemetry.session_id;
368
- transcriptPath = telemetry.transcript_path ?? "";
369
- console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}`);
830
+ const resolved = resolveLatestSessionForSkill(telRecords, skillUsageRecords, skill);
831
+ telemetry = resolved?.telemetry ?? ({} as SessionTelemetryRecord);
832
+ if (resolved) {
833
+ sessionId = resolved.sessionId;
834
+ transcriptPath = resolved.transcriptPath;
835
+ const note =
836
+ resolved.source === "telemetry" ? "" : ` (${resolved.source.replaceAll("_", " ")})`;
837
+ console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}${note}`);
370
838
  } else {
371
- console.error(`[WARN] No telemetry for skill '${skill}'. Is session_stop_hook installed?`);
839
+ console.error(
840
+ `[WARN] No session found for skill '${skill}' in telemetry or recovered usage data.`,
841
+ );
372
842
  }
373
843
  }
374
844
 
@@ -380,22 +850,23 @@ export async function cliMain(): Promise<void> {
380
850
  console.log("==========================\n");
381
851
  }
382
852
 
383
- // --- Build prompt and grade ---
384
- const prompt = buildGradingPrompt(expectations, telemetry, transcriptExcerpt, skill);
385
-
386
- console.error(`Grading ${expectations.length} expectations for skill '${skill}'...`);
387
-
388
- let graderOutput: GraderOutput;
853
+ let result: GradingResult;
389
854
  try {
390
- graderOutput = await gradeViaAgent(prompt, agent);
391
- } catch (e) {
392
- console.error(`[ERROR] Grading failed: ${e}`);
855
+ result = await gradeSession({
856
+ expectations,
857
+ telemetry,
858
+ sessionId,
859
+ skillName: skill,
860
+ transcriptExcerpt,
861
+ transcriptPath,
862
+ agent,
863
+ });
864
+ } catch (err) {
865
+ console.error(`[ERROR] ${err instanceof Error ? err.message : String(err)}`);
393
866
  process.exit(1);
394
867
  }
395
868
 
396
- const result = assembleResult(graderOutput, telemetry, sessionId, skill, transcriptPath);
397
-
398
- const outputPath = values.output ?? "grading.json";
869
+ const outputPath = values.output ?? buildDefaultGradingOutputPath(sessionId);
399
870
  const outputDir = dirname(outputPath);
400
871
  if (outputDir !== ".") {
401
872
  mkdirSync(outputDir, { recursive: true });