selftune 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/.claude/agents/diagnosis-analyst.md +20 -10
  2. package/.claude/agents/evolution-reviewer.md +14 -1
  3. package/.claude/agents/integration-guide.md +18 -6
  4. package/.claude/agents/pattern-analyst.md +18 -5
  5. package/CHANGELOG.md +12 -4
  6. package/README.md +43 -35
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/cli/selftune/badge/badge-data.ts +1 -1
  20. package/cli/selftune/badge/badge.ts +4 -8
  21. package/cli/selftune/canonical-export.ts +183 -0
  22. package/cli/selftune/constants.ts +28 -0
  23. package/cli/selftune/contribute/contribute.ts +1 -1
  24. package/cli/selftune/cron/setup.ts +17 -17
  25. package/cli/selftune/dashboard-contract.ts +202 -0
  26. package/cli/selftune/dashboard-server.ts +653 -186
  27. package/cli/selftune/dashboard.ts +41 -176
  28. package/cli/selftune/eval/baseline.ts +5 -4
  29. package/cli/selftune/eval/composability-v2.ts +273 -0
  30. package/cli/selftune/eval/hooks-to-evals.ts +34 -15
  31. package/cli/selftune/eval/unit-test-cli.ts +1 -1
  32. package/cli/selftune/evolution/evidence.ts +26 -0
  33. package/cli/selftune/evolution/evolve-body.ts +105 -11
  34. package/cli/selftune/evolution/evolve.ts +371 -25
  35. package/cli/selftune/evolution/extract-patterns.ts +87 -29
  36. package/cli/selftune/evolution/rollback.ts +2 -2
  37. package/cli/selftune/grading/auto-grade.ts +200 -0
  38. package/cli/selftune/grading/grade-session.ts +448 -97
  39. package/cli/selftune/grading/results.ts +42 -0
  40. package/cli/selftune/hooks/prompt-log.ts +172 -2
  41. package/cli/selftune/hooks/session-stop.ts +123 -3
  42. package/cli/selftune/hooks/skill-eval.ts +119 -3
  43. package/cli/selftune/index.ts +395 -116
  44. package/cli/selftune/ingestors/claude-replay.ts +140 -114
  45. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  46. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  47. package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
  48. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  49. package/cli/selftune/init.ts +227 -14
  50. package/cli/selftune/last.ts +14 -5
  51. package/cli/selftune/localdb/db.ts +63 -0
  52. package/cli/selftune/localdb/materialize.ts +428 -0
  53. package/cli/selftune/localdb/queries.ts +376 -0
  54. package/cli/selftune/localdb/schema.ts +204 -0
  55. package/cli/selftune/monitoring/watch.ts +66 -15
  56. package/cli/selftune/normalization.ts +682 -0
  57. package/cli/selftune/observability.ts +19 -44
  58. package/cli/selftune/orchestrate.ts +1073 -0
  59. package/cli/selftune/quickstart.ts +203 -0
  60. package/cli/selftune/repair/skill-usage.ts +576 -0
  61. package/cli/selftune/schedule.ts +561 -0
  62. package/cli/selftune/status.ts +48 -26
  63. package/cli/selftune/sync.ts +627 -0
  64. package/cli/selftune/types.ts +148 -0
  65. package/cli/selftune/utils/canonical-log.ts +45 -0
  66. package/cli/selftune/utils/hooks.ts +41 -0
  67. package/cli/selftune/utils/html.ts +27 -0
  68. package/cli/selftune/utils/llm-call.ts +78 -20
  69. package/cli/selftune/utils/math.ts +10 -0
  70. package/cli/selftune/utils/query-filter.ts +139 -0
  71. package/cli/selftune/utils/skill-discovery.ts +340 -0
  72. package/cli/selftune/utils/skill-log.ts +68 -0
  73. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  74. package/cli/selftune/utils/transcript.ts +272 -26
  75. package/cli/selftune/workflows/discover.ts +254 -0
  76. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  77. package/cli/selftune/workflows/workflows.ts +188 -0
  78. package/package.json +21 -8
  79. package/packages/telemetry-contract/README.md +11 -0
  80. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  81. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  82. package/packages/telemetry-contract/index.ts +1 -0
  83. package/packages/telemetry-contract/package.json +19 -0
  84. package/packages/telemetry-contract/src/index.ts +2 -0
  85. package/packages/telemetry-contract/src/types.ts +163 -0
  86. package/packages/telemetry-contract/src/validators.ts +109 -0
  87. package/skill/SKILL.md +84 -53
  88. package/skill/Workflows/AutoActivation.md +17 -16
  89. package/skill/Workflows/Badge.md +6 -0
  90. package/skill/Workflows/Baseline.md +46 -23
  91. package/skill/Workflows/Composability.md +12 -5
  92. package/skill/Workflows/Contribute.md +17 -14
  93. package/skill/Workflows/Cron.md +56 -79
  94. package/skill/Workflows/Dashboard.md +45 -34
  95. package/skill/Workflows/Doctor.md +30 -17
  96. package/skill/Workflows/Evals.md +64 -40
  97. package/skill/Workflows/EvolutionMemory.md +2 -0
  98. package/skill/Workflows/Evolve.md +102 -47
  99. package/skill/Workflows/EvolveBody.md +6 -6
  100. package/skill/Workflows/Grade.md +36 -31
  101. package/skill/Workflows/ImportSkillsBench.md +11 -5
  102. package/skill/Workflows/Ingest.md +43 -36
  103. package/skill/Workflows/Initialize.md +44 -30
  104. package/skill/Workflows/Orchestrate.md +139 -0
  105. package/skill/Workflows/Replay.md +39 -18
  106. package/skill/Workflows/Rollback.md +3 -3
  107. package/skill/Workflows/Schedule.md +61 -0
  108. package/skill/Workflows/Sync.md +88 -0
  109. package/skill/Workflows/UnitTest.md +34 -22
  110. package/skill/Workflows/Watch.md +14 -4
  111. package/skill/Workflows/Workflows.md +129 -0
  112. package/skill/assets/activation-rules-default.json +26 -0
  113. package/skill/assets/multi-skill-settings.json +63 -0
  114. package/skill/assets/single-skill-settings.json +57 -0
  115. package/skill/references/invocation-taxonomy.md +2 -2
  116. package/skill/references/logs.md +164 -2
  117. package/skill/references/setup-patterns.md +65 -0
  118. package/skill/references/version-history.md +40 -0
  119. package/skill/settings_snippet.json +1 -1
  120. package/templates/multi-skill-settings.json +7 -7
  121. package/templates/single-skill-settings.json +6 -6
  122. package/dashboard/index.html +0 -1680
@@ -5,20 +5,26 @@
5
5
  * Rubric-based grader for Claude Code skill sessions.
6
6
  * Migrated from grade_session.py.
7
7
  *
8
- * Grades via installed agent CLI (claude/codex/opencode).
8
+ * Grades via an installed agent CLI selected from AGENT_CANDIDATES.
9
9
  */
10
10
 
11
- import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
12
- import { dirname } from "node:path";
11
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
12
+ import { basename, dirname, join } from "node:path";
13
13
  import { parseArgs } from "node:util";
14
14
 
15
- import { TELEMETRY_LOG } from "../constants.js";
15
+ import {
16
+ AGENT_CANDIDATES,
17
+ CLAUDE_CODE_PROJECTS_DIR,
18
+ SELFTUNE_CONFIG_DIR,
19
+ TELEMETRY_LOG,
20
+ } from "../constants.js";
16
21
  import type {
17
22
  ExecutionMetrics,
18
23
  GraderOutput,
19
24
  GradingExpectation,
20
25
  GradingResult,
21
26
  SessionTelemetryRecord,
27
+ SkillUsageRecord,
22
28
  } from "../types.js";
23
29
  import { readJsonl } from "../utils/jsonl.js";
24
30
  import {
@@ -26,7 +32,12 @@ import {
26
32
  stripMarkdownFences as _stripMarkdownFences,
27
33
  callViaAgent,
28
34
  } from "../utils/llm-call.js";
29
- import { readExcerpt } from "../utils/transcript.js";
35
+ import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
36
+ import {
37
+ buildTelemetryFromTranscript,
38
+ findTranscriptPathForSession,
39
+ readExcerpt,
40
+ } from "../utils/transcript.js";
30
41
  import { type PreGateContext, runPreGates } from "./pre-gates.js";
31
42
 
32
43
  // Re-export for backward compatibility
@@ -99,12 +110,148 @@ export function latestSessionForSkill(
99
110
  telemetry: SessionTelemetryRecord[],
100
111
  skillName: string,
101
112
  ): SessionTelemetryRecord | null {
113
+ // First pass: prefer sessions with actual Skill tool invocations (skills_invoked)
114
+ for (let i = telemetry.length - 1; i >= 0; i--) {
115
+ if (telemetry[i].skills_invoked?.includes(skillName)) return telemetry[i];
116
+ }
117
+ // Fallback: sessions where SKILL.md was read (skills_triggered)
102
118
  for (let i = telemetry.length - 1; i >= 0; i--) {
103
119
  if (telemetry[i].skills_triggered?.includes(skillName)) return telemetry[i];
104
120
  }
105
121
  return null;
106
122
  }
107
123
 
124
+ export function latestSkillUsageForSkill(
125
+ skillUsage: SkillUsageRecord[],
126
+ skillName: string,
127
+ ): SkillUsageRecord | null {
128
+ for (let i = skillUsage.length - 1; i >= 0; i--) {
129
+ const record = skillUsage[i];
130
+ if (record.skill_name === skillName && record.triggered) return record;
131
+ }
132
+ return null;
133
+ }
134
+
135
+ export interface ResolvedSessionContext {
136
+ telemetry: SessionTelemetryRecord;
137
+ sessionId: string;
138
+ transcriptPath: string;
139
+ source: "telemetry" | "transcript_fallback" | "skill_usage_fallback";
140
+ }
141
+
142
+ function buildSkillUsageFallbackTelemetry(record: SkillUsageRecord): SessionTelemetryRecord {
143
+ return {
144
+ timestamp: record.timestamp,
145
+ session_id: record.session_id,
146
+ cwd: "",
147
+ transcript_path: "",
148
+ tool_calls: {},
149
+ total_tool_calls: 0,
150
+ bash_commands: [],
151
+ skills_triggered: [record.skill_name],
152
+ skills_invoked: [record.skill_name],
153
+ assistant_turns: 0,
154
+ errors_encountered: 0,
155
+ transcript_chars: 0,
156
+ last_user_query: record.query,
157
+ source: record.source ?? "skill_usage_fallback",
158
+ };
159
+ }
160
+
161
+ export function resolveSessionById(
162
+ telemetry: SessionTelemetryRecord[],
163
+ sessionId: string,
164
+ projectsDir: string = CLAUDE_CODE_PROJECTS_DIR,
165
+ ): ResolvedSessionContext | null {
166
+ const direct = findSession(telemetry, sessionId);
167
+ if (direct) {
168
+ return {
169
+ telemetry: direct,
170
+ sessionId: direct.session_id,
171
+ transcriptPath: direct.transcript_path ?? "",
172
+ source: "telemetry",
173
+ };
174
+ }
175
+
176
+ const transcriptPath = findTranscriptPathForSession(sessionId, projectsDir);
177
+ if (!transcriptPath) return null;
178
+
179
+ const rebuilt = buildTelemetryFromTranscript(sessionId, transcriptPath);
180
+ if (!rebuilt) return null;
181
+
182
+ return {
183
+ telemetry: rebuilt,
184
+ sessionId,
185
+ transcriptPath,
186
+ source: "transcript_fallback",
187
+ };
188
+ }
189
+
190
+ export function resolveLatestSessionForSkill(
191
+ telemetry: SessionTelemetryRecord[],
192
+ skillUsage: SkillUsageRecord[],
193
+ skillName: string,
194
+ projectsDir: string = CLAUDE_CODE_PROJECTS_DIR,
195
+ ): ResolvedSessionContext | null {
196
+ const direct = latestSessionForSkill(telemetry, skillName);
197
+ if (direct) {
198
+ return {
199
+ telemetry: direct,
200
+ sessionId: direct.session_id,
201
+ transcriptPath: direct.transcript_path ?? "",
202
+ source: "telemetry",
203
+ };
204
+ }
205
+
206
+ const usage = latestSkillUsageForSkill(skillUsage, skillName);
207
+ if (!usage) return null;
208
+
209
+ const transcriptPath = findTranscriptPathForSession(usage.session_id, projectsDir);
210
+ if (!transcriptPath) {
211
+ const fallback = buildSkillUsageFallbackTelemetry(usage);
212
+ return {
213
+ telemetry: fallback,
214
+ sessionId: fallback.session_id,
215
+ transcriptPath: fallback.transcript_path,
216
+ source: "skill_usage_fallback",
217
+ };
218
+ }
219
+
220
+ const rebuilt = buildTelemetryFromTranscript(usage.session_id, transcriptPath);
221
+ if (!rebuilt) {
222
+ const fallback = buildSkillUsageFallbackTelemetry(usage);
223
+ fallback.transcript_path = transcriptPath;
224
+ return {
225
+ telemetry: fallback,
226
+ sessionId: fallback.session_id,
227
+ transcriptPath,
228
+ source: "skill_usage_fallback",
229
+ };
230
+ }
231
+
232
+ if (!rebuilt.skills_triggered.includes(skillName)) {
233
+ rebuilt.skills_triggered = [...rebuilt.skills_triggered, skillName];
234
+ }
235
+ if (rebuilt.skills_invoked && !rebuilt.skills_invoked.includes(skillName)) {
236
+ rebuilt.skills_invoked = [...rebuilt.skills_invoked, skillName];
237
+ }
238
+ if (!rebuilt.last_user_query) {
239
+ rebuilt.last_user_query = usage.query;
240
+ }
241
+
242
+ return {
243
+ telemetry: rebuilt,
244
+ sessionId: rebuilt.session_id,
245
+ transcriptPath,
246
+ source: "transcript_fallback",
247
+ };
248
+ }
249
+
250
+ export function buildDefaultGradingOutputPath(sessionId: string): string {
251
+ const safeSessionId = sessionId.replace(/[^a-zA-Z0-9_-]/g, "_");
252
+ return join(SELFTUNE_CONFIG_DIR, "grading", `result-${safeSessionId}.json`);
253
+ }
254
+
108
255
  export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: number): string[] {
109
256
  let data: unknown;
110
257
  try {
@@ -157,6 +304,107 @@ export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: num
157
304
  throw new Error(`Eval ID ${evalId} not found in ${evalsJsonPath}`);
158
305
  }
159
306
 
307
+ // ---------------------------------------------------------------------------
308
+ // Auto-derive expectations from SKILL.md
309
+ // ---------------------------------------------------------------------------
310
+
311
+ export interface DerivedExpectations {
312
+ expectations: string[];
313
+ derived: boolean;
314
+ source: string;
315
+ }
316
+
317
+ const GENERIC_EXPECTATIONS: string[] = [
318
+ "The skill was triggered during the session",
319
+ "The task was completed successfully without critical errors",
320
+ "No unhandled errors were encountered",
321
+ ];
322
+
323
+ /**
324
+ * Derive grading expectations from a skill's SKILL.md file.
325
+ *
326
+ * Resolution order for SKILL.md path:
327
+ * 1. Explicit `skillPath` argument
328
+ * 2. Lookup from skill_usage_log.jsonl records
329
+ * 3. Falls back to generic expectations if not found
330
+ */
331
+ export function deriveExpectationsFromSkill(
332
+ skillName: string,
333
+ skillPath?: string,
334
+ ): DerivedExpectations {
335
+ // Resolve the SKILL.md path
336
+ let resolvedPath = skillPath;
337
+
338
+ if (!resolvedPath) {
339
+ // Try to find from skill_usage_log
340
+ try {
341
+ const usageRecords = readEffectiveSkillUsageRecords();
342
+ for (let i = usageRecords.length - 1; i >= 0; i--) {
343
+ if (usageRecords[i].skill_name === skillName && usageRecords[i].skill_path) {
344
+ resolvedPath = usageRecords[i].skill_path;
345
+ break;
346
+ }
347
+ }
348
+ } catch {
349
+ // skill_usage_log not available
350
+ }
351
+ }
352
+
353
+ if (!resolvedPath || !existsSync(resolvedPath)) {
354
+ return {
355
+ expectations: GENERIC_EXPECTATIONS,
356
+ derived: false,
357
+ source: resolvedPath ? `SKILL.md not found at ${resolvedPath}` : "no SKILL.md path found",
358
+ };
359
+ }
360
+
361
+ // Read and parse SKILL.md
362
+ let content: string;
363
+ try {
364
+ content = readFileSync(resolvedPath, "utf-8");
365
+ } catch {
366
+ return {
367
+ expectations: GENERIC_EXPECTATIONS,
368
+ derived: false,
369
+ source: `failed to read ${resolvedPath}`,
370
+ };
371
+ }
372
+
373
+ const expectations: string[] = [`The "${skillName}" skill was triggered during the session`];
374
+
375
+ // Extract description from first paragraph after title
376
+ const descMatch = content.match(/^#\s+.+\n+([^\n#][^\n]*)/m);
377
+ if (descMatch) {
378
+ const desc = descMatch[1].trim();
379
+ if (desc.length > 10) {
380
+ expectations.push(`The skill fulfilled its purpose: ${desc.slice(0, 120)}`);
381
+ }
382
+ }
383
+
384
+ // Extract "When to Use" section content
385
+ const whenMatch = content.match(/##\s*When\s+to\s+Use\b[^\n]*\n([\s\S]*?)(?=\n##\s|\n---|$)/i);
386
+ if (whenMatch) {
387
+ const lines = whenMatch[1]
388
+ .split("\n")
389
+ .map((l) => l.replace(/^[-*]\s*/, "").trim())
390
+ .filter((l) => l.length > 5);
391
+ if (lines.length > 0) {
392
+ expectations.push(`The session context matched a "When to Use" trigger for ${skillName}`);
393
+ }
394
+ }
395
+
396
+ // Add standard quality expectations
397
+ expectations.push("The task was completed successfully without critical errors");
398
+ expectations.push("No unhandled errors were encountered");
399
+
400
+ // Cap at 5 expectations
401
+ return {
402
+ expectations: expectations.slice(0, 5),
403
+ derived: true,
404
+ source: resolvedPath,
405
+ };
406
+ }
407
+
160
408
  // ---------------------------------------------------------------------------
161
409
  // Execution metrics
162
410
  // ---------------------------------------------------------------------------
@@ -271,24 +519,26 @@ export async function gradeViaAgent(prompt: string, agent: string): Promise<Grad
271
519
  }
272
520
 
273
521
  // ---------------------------------------------------------------------------
274
- // Result assembly
522
+ // Shared grading flow
275
523
  // ---------------------------------------------------------------------------
276
524
 
277
- export function assembleResult(
278
- graderOutput: GraderOutput,
279
- telemetry: SessionTelemetryRecord,
280
- sessionId: string,
281
- skillName: string,
282
- transcriptPath: string,
283
- ): GradingResult {
284
- // Default missing scores on expectations
285
- const expectations = (graderOutput?.expectations ?? []).map((e) => ({
525
+ function normalizeExpectations(expectations: GradingExpectation[]): GradingExpectation[] {
526
+ return expectations.map((e) => ({
286
527
  ...e,
287
528
  score: e.score ?? (e.passed ? 1.0 : 0.0),
288
529
  source: e.source ?? ("llm" as const),
289
530
  }));
531
+ }
290
532
 
291
- const baseSummary = graderOutput?.summary ?? { passed: 0, failed: 0, total: 0, pass_rate: 0 };
533
+ function assembleResultFromExpectations(
534
+ expectations: GradingExpectation[],
535
+ telemetry: SessionTelemetryRecord,
536
+ sessionId: string,
537
+ skillName: string,
538
+ transcriptPath: string,
539
+ ): GradingResult {
540
+ const passedCount = expectations.filter((e) => e.passed).length;
541
+ const totalCount = expectations.length;
292
542
  const graduated = buildGraduatedSummary(expectations);
293
543
 
294
544
  return {
@@ -298,11 +548,116 @@ export function assembleResult(
298
548
  graded_at: new Date().toISOString(),
299
549
  expectations,
300
550
  summary: {
301
- ...baseSummary,
551
+ passed: passedCount,
552
+ failed: totalCount - passedCount,
553
+ total: totalCount,
554
+ pass_rate: totalCount > 0 ? passedCount / totalCount : 0,
302
555
  mean_score: graduated.mean_score,
303
556
  score_std_dev: graduated.score_std_dev,
304
557
  },
305
558
  execution_metrics: buildExecutionMetrics(telemetry ?? ({} as SessionTelemetryRecord)),
559
+ claims: [],
560
+ eval_feedback: { suggestions: [], overall: "" },
561
+ };
562
+ }
563
+
564
+ export interface GradeSessionParams {
565
+ expectations: string[];
566
+ telemetry: SessionTelemetryRecord;
567
+ sessionId: string;
568
+ skillName: string;
569
+ transcriptExcerpt: string;
570
+ transcriptPath: string;
571
+ agent: string;
572
+ gradeViaAgentFn?: (prompt: string, agent: string) => Promise<GraderOutput>;
573
+ }
574
+
575
+ export async function gradeSession({
576
+ expectations,
577
+ telemetry,
578
+ sessionId,
579
+ skillName,
580
+ transcriptExcerpt,
581
+ transcriptPath,
582
+ agent,
583
+ gradeViaAgentFn = gradeViaAgent,
584
+ }: GradeSessionParams): Promise<GradingResult> {
585
+ const preGateCtx: PreGateContext = {
586
+ telemetry,
587
+ skillName,
588
+ transcriptExcerpt,
589
+ };
590
+ const preGateResult = runPreGates(expectations, preGateCtx);
591
+
592
+ let allExpectations: GradingExpectation[];
593
+
594
+ if (preGateResult.remaining.length === 0) {
595
+ console.error(
596
+ `[INFO] All ${expectations.length} expectations resolved by pre-gates, skipping LLM`,
597
+ );
598
+ allExpectations = preGateResult.resolved;
599
+ } else {
600
+ console.error(
601
+ `[INFO] Pre-gates resolved ${preGateResult.resolved.length}/${expectations.length} expectations`,
602
+ );
603
+ const prompt = buildGradingPrompt(
604
+ preGateResult.remaining,
605
+ telemetry,
606
+ transcriptExcerpt,
607
+ skillName,
608
+ );
609
+ console.error(
610
+ `Grading ${preGateResult.remaining.length} expectations for skill '${skillName}'...`,
611
+ );
612
+
613
+ let graderOutput: GraderOutput;
614
+ try {
615
+ graderOutput = await gradeViaAgentFn(prompt, agent);
616
+ } catch (err) {
617
+ throw new Error(`Grading failed: ${err instanceof Error ? err.message : String(err)}`, {
618
+ cause: err,
619
+ });
620
+ }
621
+
622
+ const llmExpectations = normalizeExpectations(graderOutput.expectations ?? []);
623
+ if (llmExpectations.length !== preGateResult.remaining.length) {
624
+ throw new Error(
625
+ `Grader returned ${llmExpectations.length} expectations for ${preGateResult.remaining.length} unresolved expectations`,
626
+ );
627
+ }
628
+
629
+ allExpectations = [...preGateResult.resolved, ...llmExpectations];
630
+ }
631
+
632
+ return assembleResultFromExpectations(
633
+ allExpectations,
634
+ telemetry,
635
+ sessionId,
636
+ skillName,
637
+ transcriptPath,
638
+ );
639
+ }
640
+
641
+ // ---------------------------------------------------------------------------
642
+ // Result assembly
643
+ // ---------------------------------------------------------------------------
644
+
645
+ export function assembleResult(
646
+ graderOutput: GraderOutput,
647
+ telemetry: SessionTelemetryRecord,
648
+ sessionId: string,
649
+ skillName: string,
650
+ transcriptPath: string,
651
+ ): GradingResult {
652
+ const result = assembleResultFromExpectations(
653
+ normalizeExpectations(graderOutput?.expectations ?? []),
654
+ telemetry,
655
+ sessionId,
656
+ skillName,
657
+ transcriptPath,
658
+ );
659
+ return {
660
+ ...result,
306
661
  claims: graderOutput?.claims ?? [],
307
662
  eval_feedback: graderOutput?.eval_feedback ?? { suggestions: [], overall: "" },
308
663
  failure_feedback: graderOutput?.failure_feedback,
@@ -348,19 +703,43 @@ export async function cliMain(): Promise<void> {
348
703
  const { values } = parseArgs({
349
704
  options: {
350
705
  skill: { type: "string" },
706
+ "skill-path": { type: "string" },
351
707
  expectations: { type: "string", multiple: true },
352
708
  "evals-json": { type: "string" },
353
709
  "eval-id": { type: "string" },
354
710
  "session-id": { type: "string" },
355
711
  transcript: { type: "string" },
356
712
  "telemetry-log": { type: "string", default: TELEMETRY_LOG },
357
- output: { type: "string", default: "grading.json" },
713
+ output: { type: "string" },
358
714
  agent: { type: "string" },
359
715
  "show-transcript": { type: "boolean", default: false },
716
+ help: { type: "boolean", short: "h", default: false },
360
717
  },
361
718
  strict: true,
362
719
  });
363
720
 
721
+ if (values.help) {
722
+ console.log(`selftune grade — Grade a skill session
723
+
724
+ Usage:
725
+ selftune grade --skill <name> [options]
726
+
727
+ Options:
728
+ --skill Skill name (required)
729
+ --skill-path Path to SKILL.md (for auto-deriving expectations)
730
+ --expectations Expectation strings (repeatable)
731
+ --evals-json Path to evals JSON file
732
+ --eval-id Eval ID within evals JSON
733
+ --session-id Grade a specific session by ID
734
+ --transcript Path to transcript file
735
+ --telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
736
+ --output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
737
+ --agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
738
+ --show-transcript Print transcript excerpt before grading
739
+ -h, --help Show this help message`);
740
+ process.exit(0);
741
+ }
742
+
364
743
  const skill = values.skill;
365
744
  if (!skill) {
366
745
  console.error("[ERROR] --skill is required");
@@ -369,7 +748,7 @@ export async function cliMain(): Promise<void> {
369
748
 
370
749
  // --- Determine agent ---
371
750
  let agent: string | null = null;
372
- const validAgents = ["claude", "codex", "opencode"];
751
+ const validAgents = [...AGENT_CANDIDATES];
373
752
  if (values.agent) {
374
753
  if (!validAgents.includes(values.agent)) {
375
754
  console.error(
@@ -384,8 +763,8 @@ export async function cliMain(): Promise<void> {
384
763
 
385
764
  if (!agent) {
386
765
  console.error(
387
- "[ERROR] No agent CLI (claude/codex/opencode) found in PATH.\n" +
388
- "Install Claude Code, Codex, or OpenCode.",
766
+ `[ERROR] No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH.\n` +
767
+ "Install one of the supported agent CLIs.",
389
768
  );
390
769
  process.exit(1);
391
770
  }
@@ -404,8 +783,18 @@ export async function cliMain(): Promise<void> {
404
783
  } else if (values.expectations?.length) {
405
784
  expectations = values.expectations;
406
785
  } else {
407
- console.error("[ERROR] Provide --expectations or --evals-json + --eval-id");
408
- process.exit(1);
786
+ // Auto-derive expectations from SKILL.md
787
+ const derived = deriveExpectationsFromSkill(skill, values["skill-path"]);
788
+ expectations = derived.expectations;
789
+ if (derived.derived) {
790
+ console.error(
791
+ `[INFO] Auto-derived ${derived.expectations.length} expectations from ${derived.source}`,
792
+ );
793
+ } else {
794
+ console.error(
795
+ `[WARN] No --expectations or --evals-json provided. Using generic expectations (${derived.source})`,
796
+ );
797
+ }
409
798
  }
410
799
 
411
800
  // --- Resolve session ---
@@ -415,9 +804,15 @@ export async function cliMain(): Promise<void> {
415
804
 
416
805
  const telemetryLog = values["telemetry-log"] ?? TELEMETRY_LOG;
417
806
  const telRecords = readJsonl<SessionTelemetryRecord>(telemetryLog);
807
+ const skillUsageRecords = readEffectiveSkillUsageRecords();
418
808
 
419
809
  if (values.transcript) {
420
810
  transcriptPath = values.transcript;
811
+ telemetry =
812
+ buildTelemetryFromTranscript(
813
+ values["session-id"] ?? basename(transcriptPath, ".jsonl"),
814
+ transcriptPath,
815
+ ) ?? ({} as SessionTelemetryRecord);
421
816
  for (let i = telRecords.length - 1; i >= 0; i--) {
422
817
  if (telRecords[i].transcript_path === transcriptPath) {
423
818
  telemetry = telRecords[i];
@@ -425,18 +820,25 @@ export async function cliMain(): Promise<void> {
425
820
  break;
426
821
  }
427
822
  }
823
+ if (telemetry.session_id) sessionId = telemetry.session_id;
428
824
  } else if (values["session-id"]) {
429
825
  sessionId = values["session-id"];
430
- telemetry = findSession(telRecords, sessionId) ?? ({} as SessionTelemetryRecord);
431
- transcriptPath = telemetry.transcript_path ?? "";
826
+ const resolved = resolveSessionById(telRecords, sessionId);
827
+ telemetry = resolved?.telemetry ?? ({} as SessionTelemetryRecord);
828
+ transcriptPath = resolved?.transcriptPath ?? "";
432
829
  } else {
433
- telemetry = latestSessionForSkill(telRecords, skill) ?? ({} as SessionTelemetryRecord);
434
- if (telemetry.session_id) {
435
- sessionId = telemetry.session_id;
436
- transcriptPath = telemetry.transcript_path ?? "";
437
- console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}`);
830
+ const resolved = resolveLatestSessionForSkill(telRecords, skillUsageRecords, skill);
831
+ telemetry = resolved?.telemetry ?? ({} as SessionTelemetryRecord);
832
+ if (resolved) {
833
+ sessionId = resolved.sessionId;
834
+ transcriptPath = resolved.transcriptPath;
835
+ const note =
836
+ resolved.source === "telemetry" ? "" : ` (${resolved.source.replaceAll("_", " ")})`;
837
+ console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}${note}`);
438
838
  } else {
439
- console.error(`[WARN] No telemetry for skill '${skill}'. Is session_stop_hook installed?`);
839
+ console.error(
840
+ `[WARN] No session found for skill '${skill}' in telemetry or recovered usage data.`,
841
+ );
440
842
  }
441
843
  }
442
844
 
@@ -448,74 +850,23 @@ export async function cliMain(): Promise<void> {
448
850
  console.log("==========================\n");
449
851
  }
450
852
 
451
- // --- Run pre-gates first ---
452
- const preGateCtx: PreGateContext = {
453
- telemetry,
454
- skillName: skill,
455
- transcriptExcerpt,
456
- };
457
- const preGateResult = runPreGates(expectations, preGateCtx);
458
-
459
- let allExpectations: GradingExpectation[];
460
-
461
- if (preGateResult.remaining.length === 0) {
462
- // All expectations resolved by pre-gates — skip LLM entirely
463
- console.error(
464
- `[INFO] All ${expectations.length} expectations resolved by pre-gates, skipping LLM`,
465
- );
466
- allExpectations = preGateResult.resolved;
467
- } else {
468
- // Build prompt and grade remaining via LLM
469
- console.error(
470
- `[INFO] Pre-gates resolved ${preGateResult.resolved.length}/${expectations.length} expectations`,
471
- );
472
- const prompt = buildGradingPrompt(preGateResult.remaining, telemetry, transcriptExcerpt, skill);
473
- console.error(`Grading ${preGateResult.remaining.length} expectations for skill '${skill}'...`);
474
-
475
- let graderOutput: GraderOutput;
476
- try {
477
- graderOutput = await gradeViaAgent(prompt, agent);
478
- } catch (e) {
479
- console.error(`[ERROR] Grading failed: ${e}`);
480
- process.exit(1);
481
- }
482
-
483
- // Default scores on LLM results
484
- const llmExpectations = (graderOutput.expectations ?? []).map((e) => ({
485
- ...e,
486
- score: e.score ?? (e.passed ? 1.0 : 0.0),
487
- source: e.source ?? ("llm" as const),
488
- }));
489
-
490
- // Merge pre-gate + LLM results
491
- allExpectations = [...preGateResult.resolved, ...llmExpectations];
853
+ let result: GradingResult;
854
+ try {
855
+ result = await gradeSession({
856
+ expectations,
857
+ telemetry,
858
+ sessionId,
859
+ skillName: skill,
860
+ transcriptExcerpt,
861
+ transcriptPath,
862
+ agent,
863
+ });
864
+ } catch (err) {
865
+ console.error(`[ERROR] ${err instanceof Error ? err.message : String(err)}`);
866
+ process.exit(1);
492
867
  }
493
868
 
494
- // Compute graduated summary
495
- const graduated = buildGraduatedSummary(allExpectations);
496
- const passedCount = allExpectations.filter((e) => e.passed).length;
497
- const totalCount = allExpectations.length;
498
-
499
- const result: GradingResult = {
500
- session_id: sessionId,
501
- skill_name: skill,
502
- transcript_path: transcriptPath,
503
- graded_at: new Date().toISOString(),
504
- expectations: allExpectations,
505
- summary: {
506
- passed: passedCount,
507
- failed: totalCount - passedCount,
508
- total: totalCount,
509
- pass_rate: totalCount > 0 ? passedCount / totalCount : 0,
510
- mean_score: graduated.mean_score,
511
- score_std_dev: graduated.score_std_dev,
512
- },
513
- execution_metrics: buildExecutionMetrics(telemetry),
514
- claims: [],
515
- eval_feedback: { suggestions: [], overall: "" },
516
- };
517
-
518
- const outputPath = values.output ?? "grading.json";
869
+ const outputPath = values.output ?? buildDefaultGradingOutputPath(sessionId);
519
870
  const outputDir = dirname(outputPath);
520
871
  if (outputDir !== ".") {
521
872
  mkdirSync(outputDir, { recursive: true });