npm - selftune - Versions diffs - 0.2.0 → 0.2.2 - Mend

selftune 0.2.0 → 0.2.2

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

package/.claude/agents/diagnosis-analyst.md +20 -10
package/.claude/agents/evolution-reviewer.md +14 -1
package/.claude/agents/integration-guide.md +18 -6
package/.claude/agents/pattern-analyst.md +18 -5
package/CHANGELOG.md +12 -4
package/README.md +43 -35
package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
package/apps/local-dashboard/dist/favicon.png +0 -0
package/apps/local-dashboard/dist/index.html +17 -0
package/apps/local-dashboard/dist/logo.png +0 -0
package/apps/local-dashboard/dist/logo.svg +9 -0
package/cli/selftune/badge/badge-data.ts +1 -1
package/cli/selftune/badge/badge.ts +4 -8
package/cli/selftune/canonical-export.ts +183 -0
package/cli/selftune/constants.ts +28 -0
package/cli/selftune/contribute/contribute.ts +1 -1
package/cli/selftune/cron/setup.ts +17 -17
package/cli/selftune/dashboard-contract.ts +202 -0
package/cli/selftune/dashboard-server.ts +653 -186
package/cli/selftune/dashboard.ts +41 -176
package/cli/selftune/eval/baseline.ts +5 -4
package/cli/selftune/eval/composability-v2.ts +273 -0
package/cli/selftune/eval/hooks-to-evals.ts +34 -15
package/cli/selftune/eval/unit-test-cli.ts +1 -1
package/cli/selftune/evolution/evidence.ts +26 -0
package/cli/selftune/evolution/evolve-body.ts +105 -11
package/cli/selftune/evolution/evolve.ts +371 -25
package/cli/selftune/evolution/extract-patterns.ts +87 -29
package/cli/selftune/evolution/rollback.ts +2 -2
package/cli/selftune/grading/auto-grade.ts +200 -0
package/cli/selftune/grading/grade-session.ts +448 -97
package/cli/selftune/grading/results.ts +42 -0
package/cli/selftune/hooks/prompt-log.ts +172 -2
package/cli/selftune/hooks/session-stop.ts +123 -3
package/cli/selftune/hooks/skill-eval.ts +119 -3
package/cli/selftune/index.ts +395 -116
package/cli/selftune/ingestors/claude-replay.ts +140 -114
package/cli/selftune/ingestors/codex-rollout.ts +345 -46
package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
package/cli/selftune/init.ts +227 -14
package/cli/selftune/last.ts +14 -5
package/cli/selftune/localdb/db.ts +63 -0
package/cli/selftune/localdb/materialize.ts +428 -0
package/cli/selftune/localdb/queries.ts +376 -0
package/cli/selftune/localdb/schema.ts +204 -0
package/cli/selftune/monitoring/watch.ts +66 -15
package/cli/selftune/normalization.ts +682 -0
package/cli/selftune/observability.ts +19 -44
package/cli/selftune/orchestrate.ts +1073 -0
package/cli/selftune/quickstart.ts +203 -0
package/cli/selftune/repair/skill-usage.ts +576 -0
package/cli/selftune/schedule.ts +561 -0
package/cli/selftune/status.ts +48 -26
package/cli/selftune/sync.ts +627 -0
package/cli/selftune/types.ts +148 -0
package/cli/selftune/utils/canonical-log.ts +45 -0
package/cli/selftune/utils/hooks.ts +41 -0
package/cli/selftune/utils/html.ts +27 -0
package/cli/selftune/utils/llm-call.ts +78 -20
package/cli/selftune/utils/math.ts +10 -0
package/cli/selftune/utils/query-filter.ts +139 -0
package/cli/selftune/utils/skill-discovery.ts +340 -0
package/cli/selftune/utils/skill-log.ts +68 -0
package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
package/cli/selftune/utils/transcript.ts +272 -26
package/cli/selftune/workflows/discover.ts +254 -0
package/cli/selftune/workflows/skill-md-writer.ts +288 -0
package/cli/selftune/workflows/workflows.ts +188 -0
package/package.json +21 -8
package/packages/telemetry-contract/README.md +11 -0
package/packages/telemetry-contract/fixtures/golden.json +87 -0
package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
package/packages/telemetry-contract/index.ts +1 -0
package/packages/telemetry-contract/package.json +19 -0
package/packages/telemetry-contract/src/index.ts +2 -0
package/packages/telemetry-contract/src/types.ts +163 -0
package/packages/telemetry-contract/src/validators.ts +109 -0
package/skill/SKILL.md +84 -53
package/skill/Workflows/AutoActivation.md +17 -16
package/skill/Workflows/Badge.md +6 -0
package/skill/Workflows/Baseline.md +46 -23
package/skill/Workflows/Composability.md +12 -5
package/skill/Workflows/Contribute.md +17 -14
package/skill/Workflows/Cron.md +56 -79
package/skill/Workflows/Dashboard.md +45 -34
package/skill/Workflows/Doctor.md +30 -17
package/skill/Workflows/Evals.md +64 -40
package/skill/Workflows/EvolutionMemory.md +2 -0
package/skill/Workflows/Evolve.md +102 -47
package/skill/Workflows/EvolveBody.md +6 -6
package/skill/Workflows/Grade.md +36 -31
package/skill/Workflows/ImportSkillsBench.md +11 -5
package/skill/Workflows/Ingest.md +43 -36
package/skill/Workflows/Initialize.md +44 -30
package/skill/Workflows/Orchestrate.md +139 -0
package/skill/Workflows/Replay.md +39 -18
package/skill/Workflows/Rollback.md +3 -3
package/skill/Workflows/Schedule.md +61 -0
package/skill/Workflows/Sync.md +88 -0
package/skill/Workflows/UnitTest.md +34 -22
package/skill/Workflows/Watch.md +14 -4
package/skill/Workflows/Workflows.md +129 -0
package/skill/assets/activation-rules-default.json +26 -0
package/skill/assets/multi-skill-settings.json +63 -0
package/skill/assets/single-skill-settings.json +57 -0
package/skill/references/invocation-taxonomy.md +2 -2
package/skill/references/logs.md +164 -2
package/skill/references/setup-patterns.md +65 -0
package/skill/references/version-history.md +40 -0
package/skill/settings_snippet.json +1 -1
package/templates/multi-skill-settings.json +7 -7
package/templates/single-skill-settings.json +6 -6
package/dashboard/index.html +0 -1680

package/cli/selftune/grading/grade-session.ts CHANGED Viewed

@@ -5,20 +5,26 @@
  * Rubric-based grader for Claude Code skill sessions.
  * Migrated from grade_session.py.
  *
- * Grades via installed agent CLI (claude/codex/opencode).
+ * Grades via an installed agent CLI selected from AGENT_CANDIDATES.
  */
-import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
-import { dirname } from "node:path";
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { basename, dirname, join } from "node:path";
 import { parseArgs } from "node:util";
-import { TELEMETRY_LOG } from "../constants.js";
+import {
+  AGENT_CANDIDATES,
+  CLAUDE_CODE_PROJECTS_DIR,
+  SELFTUNE_CONFIG_DIR,
+  TELEMETRY_LOG,
+} from "../constants.js";
 import type {
   ExecutionMetrics,
   GraderOutput,
   GradingExpectation,
   GradingResult,
   SessionTelemetryRecord,
+  SkillUsageRecord,
 } from "../types.js";
 import { readJsonl } from "../utils/jsonl.js";
 import {
@@ -26,7 +32,12 @@ import {
   stripMarkdownFences as _stripMarkdownFences,
   callViaAgent,
 } from "../utils/llm-call.js";
-import { readExcerpt } from "../utils/transcript.js";
+import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
+import {
+  buildTelemetryFromTranscript,
+  findTranscriptPathForSession,
+  readExcerpt,
+} from "../utils/transcript.js";
 import { type PreGateContext, runPreGates } from "./pre-gates.js";
 // Re-export for backward compatibility
@@ -99,12 +110,148 @@ export function latestSessionForSkill(
   telemetry: SessionTelemetryRecord[],
   skillName: string,
 ): SessionTelemetryRecord | null {
+  // First pass: prefer sessions with actual Skill tool invocations (skills_invoked)
+  for (let i = telemetry.length - 1; i >= 0; i--) {
+    if (telemetry[i].skills_invoked?.includes(skillName)) return telemetry[i];
+  }
+  // Fallback: sessions where SKILL.md was read (skills_triggered)
   for (let i = telemetry.length - 1; i >= 0; i--) {
     if (telemetry[i].skills_triggered?.includes(skillName)) return telemetry[i];
   }
   return null;
 }
+export function latestSkillUsageForSkill(
+  skillUsage: SkillUsageRecord[],
+  skillName: string,
+): SkillUsageRecord | null {
+  for (let i = skillUsage.length - 1; i >= 0; i--) {
+    const record = skillUsage[i];
+    if (record.skill_name === skillName && record.triggered) return record;
+  }
+  return null;
+}
+export interface ResolvedSessionContext {
+  telemetry: SessionTelemetryRecord;
+  sessionId: string;
+  transcriptPath: string;
+  source: "telemetry" | "transcript_fallback" | "skill_usage_fallback";
+}
+function buildSkillUsageFallbackTelemetry(record: SkillUsageRecord): SessionTelemetryRecord {
+  return {
+    timestamp: record.timestamp,
+    session_id: record.session_id,
+    cwd: "",
+    transcript_path: "",
+    tool_calls: {},
+    total_tool_calls: 0,
+    bash_commands: [],
+    skills_triggered: [record.skill_name],
+    skills_invoked: [record.skill_name],
+    assistant_turns: 0,
+    errors_encountered: 0,
+    transcript_chars: 0,
+    last_user_query: record.query,
+    source: record.source ?? "skill_usage_fallback",
+  };
+}
+export function resolveSessionById(
+  telemetry: SessionTelemetryRecord[],
+  sessionId: string,
+  projectsDir: string = CLAUDE_CODE_PROJECTS_DIR,
+): ResolvedSessionContext | null {
+  const direct = findSession(telemetry, sessionId);
+  if (direct) {
+    return {
+      telemetry: direct,
+      sessionId: direct.session_id,
+      transcriptPath: direct.transcript_path ?? "",
+      source: "telemetry",
+    };
+  }
+  const transcriptPath = findTranscriptPathForSession(sessionId, projectsDir);
+  if (!transcriptPath) return null;
+  const rebuilt = buildTelemetryFromTranscript(sessionId, transcriptPath);
+  if (!rebuilt) return null;
+  return {
+    telemetry: rebuilt,
+    sessionId,
+    transcriptPath,
+    source: "transcript_fallback",
+  };
+}
+export function resolveLatestSessionForSkill(
+  telemetry: SessionTelemetryRecord[],
+  skillUsage: SkillUsageRecord[],
+  skillName: string,
+  projectsDir: string = CLAUDE_CODE_PROJECTS_DIR,
+): ResolvedSessionContext | null {
+  const direct = latestSessionForSkill(telemetry, skillName);
+  if (direct) {
+    return {
+      telemetry: direct,
+      sessionId: direct.session_id,
+      transcriptPath: direct.transcript_path ?? "",
+      source: "telemetry",
+    };
+  }
+  const usage = latestSkillUsageForSkill(skillUsage, skillName);
+  if (!usage) return null;
+  const transcriptPath = findTranscriptPathForSession(usage.session_id, projectsDir);
+  if (!transcriptPath) {
+    const fallback = buildSkillUsageFallbackTelemetry(usage);
+    return {
+      telemetry: fallback,
+      sessionId: fallback.session_id,
+      transcriptPath: fallback.transcript_path,
+      source: "skill_usage_fallback",
+    };
+  }
+  const rebuilt = buildTelemetryFromTranscript(usage.session_id, transcriptPath);
+  if (!rebuilt) {
+    const fallback = buildSkillUsageFallbackTelemetry(usage);
+    fallback.transcript_path = transcriptPath;
+    return {
+      telemetry: fallback,
+      sessionId: fallback.session_id,
+      transcriptPath,
+      source: "skill_usage_fallback",
+    };
+  }
+  if (!rebuilt.skills_triggered.includes(skillName)) {
+    rebuilt.skills_triggered = [...rebuilt.skills_triggered, skillName];
+  }
+  if (rebuilt.skills_invoked && !rebuilt.skills_invoked.includes(skillName)) {
+    rebuilt.skills_invoked = [...rebuilt.skills_invoked, skillName];
+  }
+  if (!rebuilt.last_user_query) {
+    rebuilt.last_user_query = usage.query;
+  }
+  return {
+    telemetry: rebuilt,
+    sessionId: rebuilt.session_id,
+    transcriptPath,
+    source: "transcript_fallback",
+  };
+}
+export function buildDefaultGradingOutputPath(sessionId: string): string {
+  const safeSessionId = sessionId.replace(/[^a-zA-Z0-9_-]/g, "_");
+  return join(SELFTUNE_CONFIG_DIR, "grading", `result-${safeSessionId}.json`);
+}
 export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: number): string[] {
   let data: unknown;
   try {
@@ -157,6 +304,107 @@ export function loadExpectationsFromEvalsJson(evalsJsonPath: string, evalId: num
   throw new Error(`Eval ID ${evalId} not found in ${evalsJsonPath}`);
 }
+// ---------------------------------------------------------------------------
+// Auto-derive expectations from SKILL.md
+// ---------------------------------------------------------------------------
+export interface DerivedExpectations {
+  expectations: string[];
+  derived: boolean;
+  source: string;
+}
+const GENERIC_EXPECTATIONS: string[] = [
+  "The skill was triggered during the session",
+  "The task was completed successfully without critical errors",
+  "No unhandled errors were encountered",
+];
+/**
+ * Derive grading expectations from a skill's SKILL.md file.
+ *
+ * Resolution order for SKILL.md path:
+ * 1. Explicit `skillPath` argument
+ * 2. Lookup from skill_usage_log.jsonl records
+ * 3. Falls back to generic expectations if not found
+ */
+export function deriveExpectationsFromSkill(
+  skillName: string,
+  skillPath?: string,
+): DerivedExpectations {
+  // Resolve the SKILL.md path
+  let resolvedPath = skillPath;
+  if (!resolvedPath) {
+    // Try to find from skill_usage_log
+    try {
+      const usageRecords = readEffectiveSkillUsageRecords();
+      for (let i = usageRecords.length - 1; i >= 0; i--) {
+        if (usageRecords[i].skill_name === skillName && usageRecords[i].skill_path) {
+          resolvedPath = usageRecords[i].skill_path;
+          break;
+        }
+      }
+    } catch {
+      // skill_usage_log not available
+    }
+  }
+  if (!resolvedPath || !existsSync(resolvedPath)) {
+    return {
+      expectations: GENERIC_EXPECTATIONS,
+      derived: false,
+      source: resolvedPath ? `SKILL.md not found at ${resolvedPath}` : "no SKILL.md path found",
+    };
+  }
+  // Read and parse SKILL.md
+  let content: string;
+  try {
+    content = readFileSync(resolvedPath, "utf-8");
+  } catch {
+    return {
+      expectations: GENERIC_EXPECTATIONS,
+      derived: false,
+      source: `failed to read ${resolvedPath}`,
+    };
+  }
+  const expectations: string[] = [`The "${skillName}" skill was triggered during the session`];
+  // Extract description from first paragraph after title
+  const descMatch = content.match(/^#\s+.+\n+([^\n#][^\n]*)/m);
+  if (descMatch) {
+    const desc = descMatch[1].trim();
+    if (desc.length > 10) {
+      expectations.push(`The skill fulfilled its purpose: ${desc.slice(0, 120)}`);
+    }
+  }
+  // Extract "When to Use" section content
+  const whenMatch = content.match(/##\s*When\s+to\s+Use\b[^\n]*\n([\s\S]*?)(?=\n##\s|\n---|$)/i);
+  if (whenMatch) {
+    const lines = whenMatch[1]
+      .split("\n")
+      .map((l) => l.replace(/^[-*]\s*/, "").trim())
+      .filter((l) => l.length > 5);
+    if (lines.length > 0) {
+      expectations.push(`The session context matched a "When to Use" trigger for ${skillName}`);
+    }
+  }
+  // Add standard quality expectations
+  expectations.push("The task was completed successfully without critical errors");
+  expectations.push("No unhandled errors were encountered");
+  // Cap at 5 expectations
+  return {
+    expectations: expectations.slice(0, 5),
+    derived: true,
+    source: resolvedPath,
+  };
+}
 // ---------------------------------------------------------------------------
 // Execution metrics
 // ---------------------------------------------------------------------------
@@ -271,24 +519,26 @@ export async function gradeViaAgent(prompt: string, agent: string): Promise<Grad
 }
 // ---------------------------------------------------------------------------
-// Result assembly
+// Shared grading flow
 // ---------------------------------------------------------------------------
-export function assembleResult(
-  graderOutput: GraderOutput,
-  telemetry: SessionTelemetryRecord,
-  sessionId: string,
-  skillName: string,
-  transcriptPath: string,
-): GradingResult {
-  // Default missing scores on expectations
-  const expectations = (graderOutput?.expectations ?? []).map((e) => ({
+function normalizeExpectations(expectations: GradingExpectation[]): GradingExpectation[] {
+  return expectations.map((e) => ({
     ...e,
     score: e.score ?? (e.passed ? 1.0 : 0.0),
     source: e.source ?? ("llm" as const),
   }));
+}
-  const baseSummary = graderOutput?.summary ?? { passed: 0, failed: 0, total: 0, pass_rate: 0 };
+function assembleResultFromExpectations(
+  expectations: GradingExpectation[],
+  telemetry: SessionTelemetryRecord,
+  sessionId: string,
+  skillName: string,
+  transcriptPath: string,
+): GradingResult {
+  const passedCount = expectations.filter((e) => e.passed).length;
+  const totalCount = expectations.length;
   const graduated = buildGraduatedSummary(expectations);
   return {
@@ -298,11 +548,116 @@ export function assembleResult(
     graded_at: new Date().toISOString(),
     expectations,
     summary: {
-      ...baseSummary,
+      passed: passedCount,
+      failed: totalCount - passedCount,
+      total: totalCount,
+      pass_rate: totalCount > 0 ? passedCount / totalCount : 0,
       mean_score: graduated.mean_score,
       score_std_dev: graduated.score_std_dev,
     },
     execution_metrics: buildExecutionMetrics(telemetry ?? ({} as SessionTelemetryRecord)),
+    claims: [],
+    eval_feedback: { suggestions: [], overall: "" },
+  };
+}
+export interface GradeSessionParams {
+  expectations: string[];
+  telemetry: SessionTelemetryRecord;
+  sessionId: string;
+  skillName: string;
+  transcriptExcerpt: string;
+  transcriptPath: string;
+  agent: string;
+  gradeViaAgentFn?: (prompt: string, agent: string) => Promise<GraderOutput>;
+}
+export async function gradeSession({
+  expectations,
+  telemetry,
+  sessionId,
+  skillName,
+  transcriptExcerpt,
+  transcriptPath,
+  agent,
+  gradeViaAgentFn = gradeViaAgent,
+}: GradeSessionParams): Promise<GradingResult> {
+  const preGateCtx: PreGateContext = {
+    telemetry,
+    skillName,
+    transcriptExcerpt,
+  };
+  const preGateResult = runPreGates(expectations, preGateCtx);
+  let allExpectations: GradingExpectation[];
+  if (preGateResult.remaining.length === 0) {
+    console.error(
+      `[INFO] All ${expectations.length} expectations resolved by pre-gates, skipping LLM`,
+    );
+    allExpectations = preGateResult.resolved;
+  } else {
+    console.error(
+      `[INFO] Pre-gates resolved ${preGateResult.resolved.length}/${expectations.length} expectations`,
+    );
+    const prompt = buildGradingPrompt(
+      preGateResult.remaining,
+      telemetry,
+      transcriptExcerpt,
+      skillName,
+    );
+    console.error(
+      `Grading ${preGateResult.remaining.length} expectations for skill '${skillName}'...`,
+    );
+    let graderOutput: GraderOutput;
+    try {
+      graderOutput = await gradeViaAgentFn(prompt, agent);
+    } catch (err) {
+      throw new Error(`Grading failed: ${err instanceof Error ? err.message : String(err)}`, {
+        cause: err,
+      });
+    }
+    const llmExpectations = normalizeExpectations(graderOutput.expectations ?? []);
+    if (llmExpectations.length !== preGateResult.remaining.length) {
+      throw new Error(
+        `Grader returned ${llmExpectations.length} expectations for ${preGateResult.remaining.length} unresolved expectations`,
+      );
+    }
+    allExpectations = [...preGateResult.resolved, ...llmExpectations];
+  }
+  return assembleResultFromExpectations(
+    allExpectations,
+    telemetry,
+    sessionId,
+    skillName,
+    transcriptPath,
+  );
+}
+// ---------------------------------------------------------------------------
+// Result assembly
+// ---------------------------------------------------------------------------
+export function assembleResult(
+  graderOutput: GraderOutput,
+  telemetry: SessionTelemetryRecord,
+  sessionId: string,
+  skillName: string,
+  transcriptPath: string,
+): GradingResult {
+  const result = assembleResultFromExpectations(
+    normalizeExpectations(graderOutput?.expectations ?? []),
+    telemetry,
+    sessionId,
+    skillName,
+    transcriptPath,
+  );
+  return {
+    ...result,
     claims: graderOutput?.claims ?? [],
     eval_feedback: graderOutput?.eval_feedback ?? { suggestions: [], overall: "" },
     failure_feedback: graderOutput?.failure_feedback,
@@ -348,19 +703,43 @@ export async function cliMain(): Promise<void> {
   const { values } = parseArgs({
     options: {
       skill: { type: "string" },
+      "skill-path": { type: "string" },
       expectations: { type: "string", multiple: true },
       "evals-json": { type: "string" },
       "eval-id": { type: "string" },
       "session-id": { type: "string" },
       transcript: { type: "string" },
       "telemetry-log": { type: "string", default: TELEMETRY_LOG },
-      output: { type: "string", default: "grading.json" },
+      output: { type: "string" },
       agent: { type: "string" },
       "show-transcript": { type: "boolean", default: false },
+      help: { type: "boolean", short: "h", default: false },
     },
     strict: true,
   });
+  if (values.help) {
+    console.log(`selftune grade — Grade a skill session
+Usage:
+  selftune grade --skill <name> [options]
+Options:
+  --skill             Skill name (required)
+  --skill-path        Path to SKILL.md (for auto-deriving expectations)
+  --expectations      Expectation strings (repeatable)
+  --evals-json        Path to evals JSON file
+  --eval-id           Eval ID within evals JSON
+  --session-id        Grade a specific session by ID
+  --transcript        Path to transcript file
+  --telemetry-log     Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
+  --output            Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
+  --agent             Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
+  --show-transcript   Print transcript excerpt before grading
+  -h, --help          Show this help message`);
+    process.exit(0);
+  }
   const skill = values.skill;
   if (!skill) {
     console.error("[ERROR] --skill is required");
@@ -369,7 +748,7 @@ export async function cliMain(): Promise<void> {
   // --- Determine agent ---
   let agent: string | null = null;
-  const validAgents = ["claude", "codex", "opencode"];
+  const validAgents = [...AGENT_CANDIDATES];
   if (values.agent) {
     if (!validAgents.includes(values.agent)) {
       console.error(
@@ -384,8 +763,8 @@ export async function cliMain(): Promise<void> {
   if (!agent) {
     console.error(
-      "[ERROR] No agent CLI (claude/codex/opencode) found in PATH.\n" +
-        "Install Claude Code, Codex, or OpenCode.",
+      `[ERROR] No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH.\n` +
+        "Install one of the supported agent CLIs.",
     );
     process.exit(1);
   }
@@ -404,8 +783,18 @@ export async function cliMain(): Promise<void> {
   } else if (values.expectations?.length) {
     expectations = values.expectations;
   } else {
-    console.error("[ERROR] Provide --expectations or --evals-json + --eval-id");
-    process.exit(1);
+    // Auto-derive expectations from SKILL.md
+    const derived = deriveExpectationsFromSkill(skill, values["skill-path"]);
+    expectations = derived.expectations;
+    if (derived.derived) {
+      console.error(
+        `[INFO] Auto-derived ${derived.expectations.length} expectations from ${derived.source}`,
+      );
+    } else {
+      console.error(
+        `[WARN] No --expectations or --evals-json provided. Using generic expectations (${derived.source})`,
+      );
+    }
   }
   // --- Resolve session ---
@@ -415,9 +804,15 @@ export async function cliMain(): Promise<void> {
   const telemetryLog = values["telemetry-log"] ?? TELEMETRY_LOG;
   const telRecords = readJsonl<SessionTelemetryRecord>(telemetryLog);
+  const skillUsageRecords = readEffectiveSkillUsageRecords();
   if (values.transcript) {
     transcriptPath = values.transcript;
+    telemetry =
+      buildTelemetryFromTranscript(
+        values["session-id"] ?? basename(transcriptPath, ".jsonl"),
+        transcriptPath,
+      ) ?? ({} as SessionTelemetryRecord);
     for (let i = telRecords.length - 1; i >= 0; i--) {
       if (telRecords[i].transcript_path === transcriptPath) {
         telemetry = telRecords[i];
@@ -425,18 +820,25 @@ export async function cliMain(): Promise<void> {
         break;
       }
     }
+    if (telemetry.session_id) sessionId = telemetry.session_id;
   } else if (values["session-id"]) {
     sessionId = values["session-id"];
-    telemetry = findSession(telRecords, sessionId) ?? ({} as SessionTelemetryRecord);
-    transcriptPath = telemetry.transcript_path ?? "";
+    const resolved = resolveSessionById(telRecords, sessionId);
+    telemetry = resolved?.telemetry ?? ({} as SessionTelemetryRecord);
+    transcriptPath = resolved?.transcriptPath ?? "";
   } else {
-    telemetry = latestSessionForSkill(telRecords, skill) ?? ({} as SessionTelemetryRecord);
-    if (telemetry.session_id) {
-      sessionId = telemetry.session_id;
-      transcriptPath = telemetry.transcript_path ?? "";
-      console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}`);
+    const resolved = resolveLatestSessionForSkill(telRecords, skillUsageRecords, skill);
+    telemetry = resolved?.telemetry ?? ({} as SessionTelemetryRecord);
+    if (resolved) {
+      sessionId = resolved.sessionId;
+      transcriptPath = resolved.transcriptPath;
+      const note =
+        resolved.source === "telemetry" ? "" : ` (${resolved.source.replaceAll("_", " ")})`;
+      console.error(`[INFO] Grading most recent '${skill}' session: ${sessionId}${note}`);
     } else {
-      console.error(`[WARN] No telemetry for skill '${skill}'. Is session_stop_hook installed?`);
+      console.error(
+        `[WARN] No session found for skill '${skill}' in telemetry or recovered usage data.`,
+      );
     }
   }
@@ -448,74 +850,23 @@ export async function cliMain(): Promise<void> {
     console.log("==========================\n");
   }
-  // --- Run pre-gates first ---
-  const preGateCtx: PreGateContext = {
-    telemetry,
-    skillName: skill,
-    transcriptExcerpt,
-  };
-  const preGateResult = runPreGates(expectations, preGateCtx);
-  let allExpectations: GradingExpectation[];
-  if (preGateResult.remaining.length === 0) {
-    // All expectations resolved by pre-gates — skip LLM entirely
-    console.error(
-      `[INFO] All ${expectations.length} expectations resolved by pre-gates, skipping LLM`,
-    );
-    allExpectations = preGateResult.resolved;
-  } else {
-    // Build prompt and grade remaining via LLM
-    console.error(
-      `[INFO] Pre-gates resolved ${preGateResult.resolved.length}/${expectations.length} expectations`,
-    );
-    const prompt = buildGradingPrompt(preGateResult.remaining, telemetry, transcriptExcerpt, skill);
-    console.error(`Grading ${preGateResult.remaining.length} expectations for skill '${skill}'...`);
-    let graderOutput: GraderOutput;
-    try {
-      graderOutput = await gradeViaAgent(prompt, agent);
-    } catch (e) {
-      console.error(`[ERROR] Grading failed: ${e}`);
-      process.exit(1);
-    }
-    // Default scores on LLM results
-    const llmExpectations = (graderOutput.expectations ?? []).map((e) => ({
-      ...e,
-      score: e.score ?? (e.passed ? 1.0 : 0.0),
-      source: e.source ?? ("llm" as const),
-    }));
-    // Merge pre-gate + LLM results
-    allExpectations = [...preGateResult.resolved, ...llmExpectations];
+  let result: GradingResult;
+  try {
+    result = await gradeSession({
+      expectations,
+      telemetry,
+      sessionId,
+      skillName: skill,
+      transcriptExcerpt,
+      transcriptPath,
+      agent,
+    });
+  } catch (err) {
+    console.error(`[ERROR] ${err instanceof Error ? err.message : String(err)}`);
+    process.exit(1);
   }
-  // Compute graduated summary
-  const graduated = buildGraduatedSummary(allExpectations);
-  const passedCount = allExpectations.filter((e) => e.passed).length;
-  const totalCount = allExpectations.length;
-  const result: GradingResult = {
-    session_id: sessionId,
-    skill_name: skill,
-    transcript_path: transcriptPath,
-    graded_at: new Date().toISOString(),
-    expectations: allExpectations,
-    summary: {
-      passed: passedCount,
-      failed: totalCount - passedCount,
-      total: totalCount,
-      pass_rate: totalCount > 0 ? passedCount / totalCount : 0,
-      mean_score: graduated.mean_score,
-      score_std_dev: graduated.score_std_dev,
-    },
-    execution_metrics: buildExecutionMetrics(telemetry),
-    claims: [],
-    eval_feedback: { suggestions: [], overall: "" },
-  };
-  const outputPath = values.output ?? "grading.json";
+  const outputPath = values.output ?? buildDefaultGradingOutputPath(sessionId);
   const outputDir = dirname(outputPath);
   if (outputDir !== ".") {
     mkdirSync(outputDir, { recursive: true });