npm - selftune - Versions diffs - 0.2.20 → 0.2.21 - Mend

selftune 0.2.20 → 0.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/cli/selftune/evolution/evolve-body.ts +26 -2
package/cli/selftune/evolution/validate-host-replay.ts +390 -2
package/package.json +1 -1
package/skill/Workflows/Evolve.md +22 -6

package/cli/selftune/evolution/evolve-body.ts CHANGED Viewed

@@ -23,6 +23,7 @@ import type {
   FailurePattern,
   GradingResult,
   QueryLogRecord,
+  RoutingReplayFixture,
   SkillUsageRecord,
 } from "../types.js";
 import { CLIError, handleCLIError } from "../utils/cli-error.js";
@@ -37,7 +38,10 @@ import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
 import { generateRoutingProposal } from "./propose-routing.js";
 import { refineBodyProposal } from "./refine-body.js";
 import { validateBodyProposal } from "./validate-body.js";
-import { buildRoutingReplayFixture } from "./validate-host-replay.js";
+import {
+  buildRoutingReplayFixture,
+  runClaudeRuntimeReplayFixture,
+} from "./validate-host-replay.js";
 import { validateRoutingProposal } from "./validate-routing.js";
 // ---------------------------------------------------------------------------
@@ -465,12 +469,32 @@ export async function evolveBody(
           skillPath,
           platform: studentAgent === "codex" ? "codex" : "claude_code",
         });
+        const replayRunner =
+          replayFixture.platform === "claude_code" && studentAgent === "claude"
+            ? async ({
+                routing,
+                evalSet,
+                fixture,
+              }: {
+                routing: string;
+                evalSet: EvalEntry[];
+                fixture: RoutingReplayFixture;
+              }) =>
+                await runClaudeRuntimeReplayFixture({
+                  routing,
+                  evalSet,
+                  fixture,
+                })
+            : undefined;
         validation = await _validateRoutingProposal(
           proposal,
           evalSet,
           studentAgent,
           validationModelFlag,
-          { replayFixture },
+          {
+            replayFixture,
+            ...(replayRunner ? { replayRunner } : {}),
+          },
         );
       } else {
         validation = await _validateBodyProposal(

package/cli/selftune/evolution/validate-host-replay.ts CHANGED Viewed

@@ -1,5 +1,16 @@
-import { existsSync, readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
-import { basename, dirname, join } from "node:path";
+import {
+  existsSync,
+  mkdirSync,
+  mkdtempSync,
+  readFileSync,
+  readdirSync,
+  realpathSync,
+  rmSync,
+  statSync,
+  writeFileSync,
+} from "node:fs";
+import { tmpdir } from "node:os";
+import { basename, dirname, isAbsolute, join } from "node:path";
 import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
 import { parseFrontmatter } from "../utils/frontmatter.js";
@@ -10,6 +21,7 @@ import {
   jaccardSimilarity,
   tokenizeText,
 } from "../utils/text-similarity.js";
+import { replaceSection } from "./deploy-proposal.js";
 interface ReplaySkillSurface {
   skillName: string;
@@ -17,12 +29,41 @@ interface ReplaySkillSurface {
   whenToUseTokens: Set<string>;
 }
+/**
+ * Temporary on-disk workspace staged for one runtime replay run.
+ */
+interface ReplayWorkspace {
+  /** Root of the temporary directory; it is removed recursively during cleanup. */
+  rootDir: string;
+  /** Path of the staged target `SKILL.md` inside the workspace. */
+  targetSkillPath: string;
+  /** Paths of the staged competing `SKILL.md` files inside the workspace. */
+  competingSkillPaths: string[];
+}
+/** Arguments handed to a runtime replay invoker for a single eval query. */
+export interface ClaudeRuntimeReplayInvokerInput {
+  /** The eval entry's query text. */
+  query: string;
+  /** Root directory of the staged replay workspace. */
+  workspaceRoot: string;
+  /** Name of the skill under evaluation, taken from the replay fixture. */
+  targetSkillName: string;
+  /** Staged copy of the target skill file. */
+  targetSkillPath: string;
+  /** Staged copies of the competing skill files. */
+  competingSkillPaths: string[];
+}
+/** What a runtime replay invoker observed while handling one query. */
+export interface ClaudeRuntimeReplayObservation {
+  /** Names of the skills the runtime invoked. */
+  invokedSkillNames: string[];
+  /** File paths the runtime read; relative paths are later anchored at the workspace root. */
+  readSkillPaths: string[];
+  /** Unparsed output captured from the runtime. */
+  rawOutput: string;
+  /** Session identifier, when the runtime reported one. */
+  sessionId?: string;
+  /** Error text reported by the runtime, if any. */
+  runtimeError?: string;
+}
+/**
+ * Pluggable function that runs one query against a staged workspace and
+ * reports what the runtime did. It can be supplied to
+ * `runClaudeRuntimeReplayFixture` in place of the default Claude CLI invoker.
+ */
+export type ClaudeRuntimeReplayInvoker = (
+  input: ClaudeRuntimeReplayInvokerInput,
+) => Promise<ClaudeRuntimeReplayObservation>;
 /**
  * Minimum score needed before replay treats routing text or skill-surface overlap
  * as a real match. Tuned to suppress weak false positives without killing recall
  * for short routing phrases and sparse skill surfaces.
  */
 const HOST_REPLAY_MATCH_THRESHOLD = 0.18;
+/** Milliseconds a spawned Claude replay process may run before it is killed. */
+const CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS = 30_000;
+/** Text passed via `--append-system-prompt` on every Claude replay invocation. */
+const CLAUDE_RUNTIME_ROUTING_PROMPT =
+  "You are being evaluated only on skill routing. Do not solve the user's task. If a local project skill is relevant, invoke exactly one skill immediately. If no local project skill fits, respond with NO_SKILL and do not browse unrelated files.";
 function resolveReplayPath(path: string): string {
   try {
@@ -32,6 +73,10 @@ function resolveReplayPath(path: string): string {
   }
 }
+/**
+ * Normalizes a path observed during replay. Relative paths are anchored at the
+ * staged workspace root before being resolved; absolute paths are resolved as-is.
+ */
+function resolveObservedReplayPath(path: string, workspaceRoot: string): string {
+  if (isAbsolute(path)) {
+    return resolveReplayPath(path);
+  }
+  const anchoredPath = join(workspaceRoot, path);
+  return resolveReplayPath(anchoredPath);
+}
 function listCompetingSkillPaths(targetSkillPath: string): string[] {
   const normalizedTargetPath = resolveReplayPath(targetSkillPath);
   const targetSkillDir = dirname(normalizedTargetPath);
@@ -82,6 +127,304 @@ export function buildRoutingReplayFixture(options: {
   };
 }
+/**
+ * Reads the skill file at `skillPath` and returns its text with the
+ * "Workflow Routing" section replaced by the trimmed candidate routing.
+ */
+function buildRuntimeReplayTargetContent(skillPath: string, routing: string): string {
+  const originalSkillText = readFileSync(skillPath, "utf8");
+  const candidateRouting = routing.trim();
+  return replaceSection(originalSkillText, "Workflow Routing", candidateRouting);
+}
+/**
+ * Copies one skill file into the staged registry as
+ * `<registryDir>/<skill directory name>/SKILL.md`.
+ *
+ * @param registryDir - Directory that holds the staged skills.
+ * @param sourceSkillPath - Skill file to stage; its parent directory name is reused.
+ * @param overrideContent - When provided, written instead of the source file's content.
+ * @returns Path of the staged `SKILL.md`.
+ */
+function stageReplaySkill(
+  registryDir: string,
+  sourceSkillPath: string,
+  overrideContent?: string,
+): string {
+  const sourceDirName = basename(dirname(sourceSkillPath));
+  const stagedDir = join(registryDir, sourceDirName || "unknown-skill");
+  mkdirSync(stagedDir, { recursive: true });
+  const stagedPath = join(stagedDir, "SKILL.md");
+  writeFileSync(stagedPath, overrideContent ?? readFileSync(sourceSkillPath, "utf8"), "utf8");
+  return stagedPath;
+}
+/**
+ * Creates a temporary workspace containing a `.claude/skills` registry with
+ * the target skill (routing section swapped for the candidate) and every
+ * competing skill from the fixture.
+ *
+ * If staging fails, the temporary directory is removed before the error is
+ * rethrown, so the caller only owns cleanup for a successfully built workspace.
+ */
+function buildRuntimeReplayWorkspace(
+  fixture: RoutingReplayFixture,
+  routing: string,
+): ReplayWorkspace {
+  const rootDir = mkdtempSync(join(tmpdir(), "selftune-runtime-replay-"));
+  const registryDir = join(rootDir, ".claude", "skills");
+  try {
+    // NOTE(review): the empty `.git` directory presumably makes the runtime
+    // treat the workspace as a project root — confirm.
+    mkdirSync(join(rootDir, ".git"), { recursive: true });
+    mkdirSync(registryDir, { recursive: true });
+    const candidateContent = buildRuntimeReplayTargetContent(fixture.target_skill_path, routing);
+    const targetSkillPath = stageReplaySkill(
+      registryDir,
+      fixture.target_skill_path,
+      candidateContent,
+    );
+    const competingSkillPaths: string[] = [];
+    for (const competingSource of fixture.competing_skill_paths) {
+      competingSkillPaths.push(stageReplaySkill(registryDir, competingSource));
+    }
+    return { rootDir, targetSkillPath, competingSkillPaths };
+  } catch (error) {
+    rmSync(rootDir, { recursive: true, force: true });
+    throw error;
+  }
+}
+/** Recursively deletes the workspace's root directory; a missing directory is not an error. */
+function cleanupRuntimeReplayWorkspace(workspace: ReplayWorkspace): void {
+  const { rootDir } = workspace;
+  rmSync(rootDir, { force: true, recursive: true });
+}
+/**
+ * Parses newline-delimited JSON output from a Claude replay run into an
+ * observation.
+ *
+ * Each non-empty line is parsed independently. Lines that are not valid JSON,
+ * or that decode to something other than a plain object (`null`, numbers,
+ * strings, arrays), are skipped instead of aborting the parse. From the
+ * remaining events the function collects:
+ * - the last non-empty string `session_id`,
+ * - the last non-empty string `error`,
+ * - `Skill` tool uses (`input.skill`) and `Read` tool uses (`input.file_path`)
+ *   found in the content blocks of `assistant` messages.
+ *
+ * @param rawOutput - Captured stdout of the runtime.
+ * @returns The de-duplicated skill names and read paths plus the raw output;
+ *   `sessionId` and `runtimeError` are present only when observed.
+ */
+function parseClaudeRuntimeReplayOutput(rawOutput: string): ClaudeRuntimeReplayObservation {
+  const invokedSkillNames = new Set<string>();
+  const readSkillPaths = new Set<string>();
+  let sessionId: string | undefined;
+  let runtimeError: string | undefined;
+  for (const line of rawOutput.split("\n")) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    let decoded: unknown;
+    try {
+      decoded = JSON.parse(trimmed);
+    } catch {
+      continue;
+    }
+    // JSON.parse accepts `null` and primitives; reading a property of `null`
+    // would throw and discard the whole replay, so ignore non-object lines.
+    if (typeof decoded !== "object" || decoded === null || Array.isArray(decoded)) continue;
+    const parsed = decoded as Record<string, unknown>;
+    const maybeSessionId = parsed.session_id;
+    if (typeof maybeSessionId === "string" && maybeSessionId) {
+      sessionId = maybeSessionId;
+    }
+    if (typeof parsed.error === "string" && parsed.error) {
+      runtimeError = parsed.error;
+    }
+    const assistantMessage =
+      parsed.type === "assistant" && typeof parsed.message === "object" && parsed.message !== null
+        ? (parsed.message as Record<string, unknown>)
+        : undefined;
+    const content = assistantMessage?.content;
+    if (!Array.isArray(content)) continue;
+    for (const block of content) {
+      if (typeof block !== "object" || block === null) continue;
+      const typedBlock = block as Record<string, unknown>;
+      if (typedBlock.type !== "tool_use") continue;
+      const toolName = typedBlock.name;
+      const input =
+        typeof typedBlock.input === "object" && typedBlock.input !== null
+          ? (typedBlock.input as Record<string, unknown>)
+          : {};
+      if (toolName === "Skill") {
+        const skillName = input.skill;
+        if (typeof skillName === "string" && skillName.trim()) {
+          invokedSkillNames.add(skillName.trim());
+        }
+      }
+      if (toolName === "Read") {
+        const filePath = input.file_path;
+        if (typeof filePath === "string" && filePath.trim()) {
+          readSkillPaths.add(resolveReplayPath(filePath.trim()));
+        }
+      }
+    }
+  }
+  return {
+    invokedSkillNames: [...invokedSkillNames],
+    readSkillPaths: [...readSkillPaths],
+    rawOutput,
+    ...(sessionId ? { sessionId } : {}),
+    ...(runtimeError ? { runtimeError } : {}),
+  };
+}
+/**
+ * Default runtime invoker: runs the `claude` CLI once in print mode inside the
+ * staged workspace and parses its stream-JSON stdout.
+ *
+ * The process is killed after `CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS`. The kill
+ * timer is always cleared, including when reading the process output fails.
+ *
+ * @param input - Only `query` and `workspaceRoot` are used by this invoker.
+ * @returns The parsed observation; `runtimeError` combines any error found in
+ *   the stream, a timeout note, and trimmed stderr, joined with " | ".
+ * @throws Error when the process exits non-zero without any Skill or Read
+ *   activity, so that the caller can fall back to fixture simulation.
+ */
+async function invokeClaudeRuntimeReplay(
+  input: ClaudeRuntimeReplayInvokerInput,
+): Promise<ClaudeRuntimeReplayObservation> {
+  const command = [
+    "claude",
+    "-p",
+    "--verbose",
+    "--output-format",
+    "stream-json",
+    "--dangerously-skip-permissions",
+    "--no-session-persistence",
+    "--setting-sources",
+    "project,local",
+    "--tools",
+    "Skill,Read",
+    "--max-turns",
+    "1",
+    "--append-system-prompt",
+    CLAUDE_RUNTIME_ROUTING_PROMPT,
+    input.query,
+  ];
+  const proc = Bun.spawn(command, {
+    cwd: input.workspaceRoot,
+    stdout: "pipe",
+    stderr: "pipe",
+    // NOTE(review): CLAUDECODE is blanked presumably so the child does not
+    // detect that it runs inside another Claude Code session — confirm.
+    env: { ...process.env, CLAUDECODE: "" },
+  });
+  let timedOut = false;
+  const timeout = setTimeout(() => {
+    timedOut = true;
+    proc.kill();
+  }, CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS);
+  try {
+    const [stdoutText, stderrText, exitCode] = await Promise.all([
+      new Response(proc.stdout).text(),
+      new Response(proc.stderr).text(),
+      proc.exited,
+    ]);
+    const observation = parseClaudeRuntimeReplayOutput(stdoutText);
+    const combinedError = [
+      observation.runtimeError,
+      // Name the timeout explicitly; otherwise a killed process only shows up
+      // as an unexplained non-zero exit code.
+      timedOut
+        ? `claude runtime replay timed out after ${CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS}ms`
+        : undefined,
+      stderrText.trim(),
+    ]
+      .filter(Boolean)
+      .join(" | ");
+    const hasRoutingSignal =
+      observation.invokedSkillNames.length > 0 || observation.readSkillPaths.length > 0;
+    if (exitCode !== 0 && !hasRoutingSignal) {
+      throw new Error(combinedError || `claude runtime replay exited with code ${exitCode}`);
+    }
+    return {
+      ...observation,
+      ...(combinedError ? { runtimeError: combinedError } : {}),
+    };
+  } finally {
+    // Runs on success and on failure, so a pending kill timer never outlives
+    // the call.
+    clearTimeout(timeout);
+  }
+}
+/**
+ * Returns copies of the replay results whose evidence starts with `prefix`.
+ * Existing evidence is kept after a "; " separator; results with missing or
+ * empty evidence get the prefix alone. The input array is not modified.
+ */
+function prefixReplayEvidence(
+  results: RoutingReplayEntryResult[],
+  prefix: string,
+): RoutingReplayEntryResult[] {
+  const prefixed: RoutingReplayEntryResult[] = [];
+  for (const result of results) {
+    let evidence = prefix;
+    if (result.evidence) {
+      evidence = `${prefix}; ${result.evidence}`;
+    }
+    prefixed.push({ ...result, evidence });
+  }
+  return prefixed;
+}
+/**
+ * Turns one runtime observation into a pass/fail replay result for an eval
+ * entry.
+ *
+ * Observed skill names are trimmed once up front so that every comparison
+ * (target, competing, unrelated) uses the same normalized value. Competing
+ * skill names are derived from the parent directory names of the fixture's
+ * competing skill paths. Decisions are taken in this order:
+ * 1. more than one skill invoked → failure;
+ * 2. target skill invoked → triggered, passes when a trigger was expected;
+ * 3. competing skill invoked → not triggered, passes when no trigger was expected;
+ * 4. any other skill invoked → failure;
+ * 5. a file outside the staged skill set was read → failure;
+ * 6. target or competing skill only read → not triggered, passes when no
+ *    trigger was expected;
+ * 7. no routing activity and a runtime error present → throws;
+ * 8. otherwise → not triggered, passes when no trigger was expected.
+ *
+ * @throws Error when the runtime reported an error without reaching any skill
+ *   decision.
+ */
+function evaluateRuntimeReplayObservation(
+  entry: EvalEntry,
+  fixture: RoutingReplayFixture,
+  observation: ClaudeRuntimeReplayObservation,
+  workspace: ReplayWorkspace,
+): RoutingReplayEntryResult {
+  // Every outcome shares the entry's query and expectation; only these three vary.
+  const verdict = (
+    triggered: boolean,
+    passed: boolean,
+    evidence: string,
+  ): RoutingReplayEntryResult => ({
+    query: entry.query,
+    should_trigger: entry.should_trigger,
+    triggered,
+    passed,
+    evidence,
+  });
+  const expectsNoTrigger = !entry.should_trigger;
+  const invokedSkillNames = observation.invokedSkillNames.map((skillName) => skillName.trim());
+  const targetSkillName = fixture.target_skill_name.trim();
+  const competingSkillNames = new Set(
+    fixture.competing_skill_paths.map((skillPath) => basename(dirname(skillPath)).trim()),
+  );
+  const targetInvoked = invokedSkillNames.includes(targetSkillName);
+  const competingInvoked = invokedSkillNames.find((skillName) =>
+    competingSkillNames.has(skillName),
+  );
+  const unrelatedInvoked = invokedSkillNames.find(
+    (skillName) => skillName !== targetSkillName && !competingSkillNames.has(skillName),
+  );
+  const targetStagedPath = resolveReplayPath(workspace.targetSkillPath);
+  const competingStagedPaths = workspace.competingSkillPaths.map((skillPath) =>
+    resolveReplayPath(skillPath),
+  );
+  const stagedPaths = new Set([targetStagedPath, ...competingStagedPaths]);
+  const observedReadPaths = new Set(
+    observation.readSkillPaths.map((path) => resolveObservedReplayPath(path, workspace.rootDir)),
+  );
+  const unrelatedReadPaths = [...observedReadPaths].filter((path) => !stagedPaths.has(path));
+  const targetRead = observedReadPaths.has(targetStagedPath);
+  const competingRead = competingStagedPaths.some((path) => observedReadPaths.has(path));
+  const sessionPrefix = observation.sessionId
+    ? `runtime replay session ${observation.sessionId}`
+    : "runtime replay";
+  if (invokedSkillNames.length > 1) {
+    return verdict(
+      false,
+      false,
+      `${sessionPrefix} invoked multiple skills: ${invokedSkillNames.join(", ")}`,
+    );
+  }
+  if (targetInvoked) {
+    return verdict(
+      true,
+      entry.should_trigger,
+      `${sessionPrefix} invoked target skill: ${targetSkillName}`,
+    );
+  }
+  if (competingInvoked) {
+    return verdict(
+      false,
+      expectsNoTrigger,
+      `${sessionPrefix} invoked competing skill: ${competingInvoked}`,
+    );
+  }
+  if (unrelatedInvoked) {
+    return verdict(false, false, `${sessionPrefix} invoked unrelated skill: ${unrelatedInvoked}`);
+  }
+  if (unrelatedReadPaths.length > 0) {
+    return verdict(
+      false,
+      false,
+      `${sessionPrefix} read files outside staged skill set: ${unrelatedReadPaths.join(", ")}`,
+    );
+  }
+  if (targetRead) {
+    return verdict(
+      false,
+      expectsNoTrigger,
+      `${sessionPrefix} only read the target skill without invoking it`,
+    );
+  }
+  if (competingRead) {
+    return verdict(
+      false,
+      expectsNoTrigger,
+      `${sessionPrefix} only read a competing skill without invoking it`,
+    );
+  }
+  if (observation.runtimeError) {
+    // No routing signal at all plus an error: this is not a real negative, so
+    // raise instead of recording a result.
+    throw new Error(`${sessionPrefix} did not reach a skill decision: ${observation.runtimeError}`);
+  }
+  return verdict(false, expectsNoTrigger, `${sessionPrefix} did not invoke any local project skill`);
+}
 function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
   const fallbackName = basename(dirname(skillPath)) || "unknown-skill";
   try {
@@ -234,3 +577,48 @@ export function runHostReplayFixture(options: {
     };
   });
 }
+/**
+ * Replays every eval entry against a real runtime inside a freshly staged
+ * workspace and returns one result per entry.
+ *
+ * Falls back to the fixture simulation (`runHostReplayFixture`), with the
+ * reason prefixed to each result's evidence, when the fixture platform is not
+ * `claude_code` or when staging, invoking, or evaluating throws. The staged
+ * workspace is always removed before returning.
+ *
+ * @param options.routing - Candidate routing text to evaluate.
+ * @param options.evalSet - Entries to replay, processed one at a time in order.
+ * @param options.fixture - Replay fixture describing target and competing skills.
+ * @param options.runtimeInvoker - Optional replacement for the default Claude CLI invoker.
+ */
+export async function runClaudeRuntimeReplayFixture(options: {
+  routing: string;
+  evalSet: EvalEntry[];
+  fixture: RoutingReplayFixture;
+  runtimeInvoker?: ClaudeRuntimeReplayInvoker;
+}): Promise<RoutingReplayEntryResult[]> {
+  const { evalSet, fixture, routing } = options;
+  const simulateWithReason = (reason: string): RoutingReplayEntryResult[] =>
+    prefixReplayEvidence(
+      runHostReplayFixture(options),
+      `runtime replay unavailable; fell back to fixture simulation (${reason})`,
+    );
+  if (fixture.platform !== "claude_code") {
+    return simulateWithReason(`unsupported platform ${fixture.platform}`);
+  }
+  const invokeRuntime = options.runtimeInvoker ?? invokeClaudeRuntimeReplay;
+  let workspace: ReplayWorkspace | undefined;
+  try {
+    const staged = buildRuntimeReplayWorkspace(fixture, routing);
+    workspace = staged;
+    const results: RoutingReplayEntryResult[] = [];
+    for (const entry of evalSet) {
+      const observation = await invokeRuntime({
+        query: entry.query,
+        workspaceRoot: staged.rootDir,
+        targetSkillName: fixture.target_skill_name,
+        targetSkillPath: staged.targetSkillPath,
+        competingSkillPaths: staged.competingSkillPaths,
+      });
+      const result = evaluateRuntimeReplayObservation(entry, fixture, observation, staged);
+      results.push(result);
+    }
+    return results;
+  } catch (error) {
+    return simulateWithReason(error instanceof Error ? error.message : String(error));
+  } finally {
+    if (workspace !== undefined) {
+      cleanupRuntimeReplayWorkspace(workspace);
+    }
+  }
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "selftune",
-  "version": "0.2.20",
+  "version": "0.2.21",
   "description": "Self-improving skills CLI for AI agents",
   "keywords": [
     "agent",

package/skill/Workflows/Evolve.md CHANGED Viewed

@@ -89,15 +89,31 @@ skills in the same registry, so replay-backed validation is preferred whenever
 that local fixture can be constructed because it captures host-style routing
 behavior instead of model judgment.
-The current replay path is fixture-backed: it evaluates the target routing table
-against the installed target/competing skill surfaces in a controlled replay
-fixture and records per-entry evidence. That is still a stronger signal than a
-free-form judge prompt, but you should describe it as replay-backed validation,
-not as live operator telemetry.
+For Claude Code, the replay path now stages a temporary project-local
+`.claude/skills` registry, swaps in the candidate routing table, and runs a
+one-turn Claude print-mode session with project/local settings only. Validation
+records whether Claude actually invoked the target skill, invoked a competing
+skill, invoked an unrelated skill, or made no routing decision at all.
+Unrelated skill use is treated as a replay failure even on negative evals,
+because it still indicates the runtime routed somewhere unexpected. If that
+runtime path is unavailable or fails to reach a runtime decision, selftune
+falls back to the existing fixture-backed surface simulation and notes the
+fallback in the replay evidence instead of pretending it was a runtime result.
+For non-Claude platforms today, replay remains fixture-backed: it evaluates the
+target routing table against the installed target/competing skill surfaces in a
+controlled replay fixture and records per-entry evidence. That is still a
+stronger signal than a free-form judge prompt, but you should describe it as
+replay-backed validation, not as live operator telemetry.
 Replay parsing is intentionally conservative: unreadable skill files degrade to
 empty surfaces instead of throwing, and malformed routing rows with empty
-trigger cells are ignored rather than treated as valid triggers.
+trigger cells are ignored rather than treated as valid triggers. Claude replay
+also normalizes observed `Read` paths against the staged workspace, so relative
+skill reads still count as read-only evidence for the target or competing
+skill. Reads outside the staged skill set are treated as replay failures rather
+than benign negatives, because they indicate the runtime left the controlled
+evaluation surface.
 ## Parsing Instructions