npm - selftune - Versions diffs - 0.2.31 → 0.2.32 - Mend

selftune 0.2.31 → 0.2.32

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (95)

package/README.md +83 -56
package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
package/apps/local-dashboard/dist/index.html +3 -3
package/cli/selftune/command-surface.ts +613 -2
package/cli/selftune/create/baseline.ts +429 -0
package/cli/selftune/create/check.ts +35 -0
package/cli/selftune/create/init.ts +115 -0
package/cli/selftune/create/package-candidate-state.ts +771 -0
package/cli/selftune/create/package-evaluator.ts +710 -0
package/cli/selftune/create/package-fingerprint.ts +142 -0
package/cli/selftune/create/package-search.ts +377 -0
package/cli/selftune/create/publish.ts +431 -0
package/cli/selftune/create/readiness.ts +495 -0
package/cli/selftune/create/replay.ts +330 -0
package/cli/selftune/create/report.ts +74 -0
package/cli/selftune/create/scaffold.ts +121 -0
package/cli/selftune/create/skills-ref-adapter.ts +177 -0
package/cli/selftune/create/status.ts +33 -0
package/cli/selftune/create/templates.ts +249 -0
package/cli/selftune/cron/setup.ts +1 -1
package/cli/selftune/dashboard-action-events.ts +4 -1
package/cli/selftune/dashboard-action-result.ts +789 -24
package/cli/selftune/dashboard-action-stream.ts +80 -0
package/cli/selftune/dashboard-contract.ts +146 -3
package/cli/selftune/dashboard-server.ts +5 -4
package/cli/selftune/eval/hooks-to-evals.ts +58 -35
package/cli/selftune/eval/synthetic-evals.ts +145 -17
package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
package/cli/selftune/evolution/evolve-body.ts +9 -36
package/cli/selftune/evolution/evolve.ts +8 -72
package/cli/selftune/evolution/stopping-criteria.ts +5 -13
package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
package/cli/selftune/evolution/validate-host-replay.ts +115 -15
package/cli/selftune/improve.ts +206 -0
package/cli/selftune/index.ts +123 -6
package/cli/selftune/init.ts +1 -1
package/cli/selftune/localdb/queries/dashboard.ts +30 -0
package/cli/selftune/localdb/schema.ts +52 -0
package/cli/selftune/monitoring/watch.ts +257 -23
package/cli/selftune/orchestrate/execute.ts +300 -1
package/cli/selftune/orchestrate/finalize.ts +14 -0
package/cli/selftune/orchestrate/plan.ts +22 -5
package/cli/selftune/orchestrate/prepare.ts +59 -4
package/cli/selftune/orchestrate/report.ts +1 -1
package/cli/selftune/orchestrate.ts +34 -1
package/cli/selftune/publish.ts +35 -0
package/cli/selftune/routes/actions.ts +81 -15
package/cli/selftune/routes/overview.ts +1 -1
package/cli/selftune/routes/skill-report.ts +147 -2
package/cli/selftune/run.ts +18 -0
package/cli/selftune/schedule.ts +3 -3
package/cli/selftune/search-run.ts +703 -0
package/cli/selftune/status.ts +35 -11
package/cli/selftune/testing-readiness.ts +431 -40
package/cli/selftune/types.ts +316 -0
package/cli/selftune/utils/eval-readiness.ts +1 -0
package/cli/selftune/utils/json-output.ts +11 -0
package/cli/selftune/utils/lifecycle-surface.ts +48 -0
package/cli/selftune/utils/query-filter.ts +82 -1
package/cli/selftune/utils/tui.ts +85 -2
package/cli/selftune/verify.ts +205 -0
package/cli/selftune/workflows/proposals.ts +1 -1
package/cli/selftune/workflows/skill-scaffold.ts +141 -63
package/cli/selftune/workflows/workflows.ts +4 -4
package/package.json +1 -1
package/skill/SKILL.md +148 -85
package/skill/references/cli-quick-reference.md +16 -1
package/skill/references/creator-playbook.md +31 -10
package/skill/workflows/Baseline.md +8 -9
package/skill/workflows/Contributions.md +4 -4
package/skill/workflows/Create.md +173 -0
package/skill/workflows/CreateTestDeploy.md +34 -30
package/skill/workflows/Cron.md +2 -2
package/skill/workflows/Dashboard.md +3 -3
package/skill/workflows/Evals.md +13 -7
package/skill/workflows/Evolve.md +75 -32
package/skill/workflows/EvolveBody.md +22 -15
package/skill/workflows/Hook.md +1 -1
package/skill/workflows/Improve.md +168 -0
package/skill/workflows/Initialize.md +3 -3
package/skill/workflows/Orchestrate.md +49 -12
package/skill/workflows/Publish.md +100 -0
package/skill/workflows/Run.md +72 -0
package/skill/workflows/Schedule.md +2 -2
package/skill/workflows/SearchRun.md +89 -0
package/skill/workflows/SignalsDashboard.md +2 -2
package/skill/workflows/UnitTest.md +13 -4
package/skill/workflows/Verify.md +136 -0
package/skill/workflows/Watch.md +114 -47
package/skill/workflows/Workflows.md +13 -8
package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1

package/cli/selftune/evolution/evolve-body.ts CHANGED

@@ -433,37 +433,6 @@ export async function evolveBody(
         }
       }
-      // Check confidence threshold
-      if (proposal.confidence < confidenceThreshold) {
-        recordAudit(
-          proposal.proposal_id,
-          "rejected",
-          `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
-        );
-        recordEvidence({
-          timestamp: new Date().toISOString(),
-          proposal_id: proposal.proposal_id,
-          skill_name: skillName,
-          skill_path: skillPath,
-          target,
-          stage: "rejected",
-          rationale: proposal.rationale,
-          confidence: proposal.confidence,
-          details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
-        });
-        if (iteration === maxIterations - 1) {
-          return {
-            proposal: lastProposal,
-            validation: null,
-            deployed: false,
-            auditEntries,
-            reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
-          };
-        }
-        continue;
-      }
       // Validate (validationModel overrides studentModel for validation calls)
       const validationModelFlag = options.validationModel ?? studentModel;
       let validation: BodyValidationResult;
@@ -544,6 +513,10 @@ export async function evolveBody(
       }
       lastValidation = validation;
       const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
+      const confidenceReviewNote =
+        proposal.confidence < confidenceThreshold
+          ? ` (confidence ${proposal.confidence.toFixed(2)} below review threshold ${confidenceThreshold})`
+          : "";
       recordAudit(
         proposal.proposal_id,
@@ -552,7 +525,7 @@ export async function evolveBody(
           validation.validation_fallback_reason
             ? ` (replay fallback: ${validation.validation_fallback_reason})`
             : ""
-        }`,
+        }${confidenceReviewNote}`,
         {
           validation_mode: validation.validation_mode,
           validation_agent: validation.validation_agent,
@@ -573,7 +546,7 @@ export async function evolveBody(
           validation.validation_fallback_reason
             ? ` (replay fallback: ${validation.validation_fallback_reason})`
             : ""
-        }`,
+        }${confidenceReviewNote}`,
         validation: {
           improved: validation.improved,
           gates_passed: validation.gates_passed,
@@ -641,7 +614,7 @@ export async function evolveBody(
           validation.validation_fallback_reason
             ? ` (replay fallback: ${validation.validation_fallback_reason})`
             : ""
-        }`,
+        }${confidenceReviewNote}`,
         {
           validation_mode: validation.validation_mode,
           validation_agent: validation.validation_agent,
@@ -662,7 +635,7 @@ export async function evolveBody(
           validation.validation_fallback_reason
             ? ` (replay fallback: ${validation.validation_fallback_reason})`
             : ""
-        }`,
+        }${confidenceReviewNote}`,
         validation: {
           improved: validation.improved,
           gates_passed: validation.gates_passed,
@@ -886,7 +859,7 @@ Options:
   --eval-set          Path to eval set JSON
   --dry-run           Validate without deploying
   --max-iterations    Max refinement iterations (default: 3)
-  --confidence        Confidence threshold 0.0-1.0 (default: 0.6)
+  --confidence        Low-confidence review threshold 0.0-1.0 (default: 0.6)
   --task-description  Optional task description context
   --few-shot          Comma-separated paths to example skill files
   --validation-model  Model for trigger-check validation calls (overrides --student-model for validation)

package/cli/selftune/evolution/evolve.ts CHANGED

@@ -79,7 +79,7 @@ export interface EvolveOptions {
   evalSetPath?: string;
   agent: string;
   dryRun: boolean;
-  confidenceThreshold: number; // default 0.6
+  confidenceThreshold: number; // warning/review threshold, default 0.6
   maxIterations: number; // default 3
   gradingResults?: GradingResult[];
   paretoEnabled?: boolean;
@@ -713,23 +713,9 @@ export async function evolve(
       );
       llmCallCount += candidateCount;
-      // Filter by confidence threshold
-      const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
-      if (viableCandidates.length === 0) {
-        finishTui();
-        return withStats({
-          proposal: candidates[0] ?? null,
-          validation: null,
-          deployed: false,
-          auditEntries,
-          reason: `No candidates met confidence threshold ${confidenceThreshold}`,
-        });
-      }
       // Validate each candidate
       const paretoCandidates: ParetoCandidate[] = [];
-      for (const proposal of viableCandidates) {
+      for (const proposal of candidates) {
         recordAudit(
           proposal.proposal_id,
           "created",
@@ -855,7 +841,7 @@ export async function evolve(
       if (paretoCandidates.length === 0) {
         finishTui();
         return withStats({
-          proposal: viableCandidates[0],
+          proposal: candidates[0] ?? null,
           validation: null,
           deployed: false,
           auditEntries,
@@ -932,15 +918,12 @@ export async function evolve(
           // Re-evaluate stopping after a constitutional rejection by treating the
           // last entry in previousPassRates as the currentPassRate (or 0 on the
           // first iteration) and slicing it out of history before calling
-          // evaluateStoppingCriteria() with the current iteration/maxIterations,
-          // confidenceThreshold, and proposal.confidence.
+          // evaluateStoppingCriteria() with the current iteration/maxIterations.
           const constitutionStop = evaluateStoppingCriteria(
             previousPassRates.at(-1) ?? 0,
             previousPassRates.slice(0, -1),
             iteration + 1,
             maxIterations,
-            confidenceThreshold,
-            proposal.confidence,
           );
           recordAudit(
             proposal.proposal_id,
@@ -971,52 +954,7 @@ export async function evolve(
           continue;
         }
-        // Step 9: Check confidence threshold via stopping criteria
-        {
-          const preValidationStop = evaluateStoppingCriteria(
-            previousPassRates.at(-1) ?? 0,
-            previousPassRates.slice(0, -1),
-            iteration + 1,
-            maxIterations,
-            confidenceThreshold,
-            proposal.confidence,
-          );
-          if (proposal.confidence < confidenceThreshold) {
-            feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
-            recordAudit(
-              proposal.proposal_id,
-              "rejected",
-              `${feedbackReason} (stopping: ${preValidationStop.reason})`,
-            );
-            recordEvidence({
-              timestamp: new Date().toISOString(),
-              proposal_id: proposal.proposal_id,
-              skill_name: skillName,
-              skill_path: skillPath,
-              target: "description",
-              stage: "rejected",
-              rationale: proposal.rationale,
-              confidence: proposal.confidence,
-              details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
-            });
-            // Use stopping criteria to decide whether to return or retry
-            if (preValidationStop.shouldStop) {
-              finishTui();
-              return withStats({
-                proposal: lastProposal,
-                validation: null,
-                deployed: false,
-                auditEntries,
-                reason: `${feedbackReason} (${preValidationStop.reason})`,
-              });
-            }
-            continue;
-          }
-        }
-        // Step 10: Validate against eval set
+        // Step 9: Validate against eval set
         const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
         tui.step(
           `Validating ${evalSet.length} entries (mode=${effectiveValidationMode}, ${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
@@ -1038,7 +976,7 @@ export async function evolve(
           `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
         );
-        // Step 11: Audit "validated"
+        // Step 10: Audit "validated"
         const evalSnapshot: EvalPassRate = {
           total: evalSet.length,
           passed: Math.round(validation.after_pass_rate * evalSet.length),
@@ -1094,14 +1032,12 @@ export async function evolve(
           },
         });
-        // Step 12: Evaluate stopping criteria after validation
+        // Step 11: Evaluate stopping criteria after validation
         const stopping = evaluateStoppingCriteria(
           validation.after_pass_rate,
           previousPassRates,
           iteration + 1,
           maxIterations,
-          confidenceThreshold,
-          proposal.confidence,
         );
         previousPassRates.push(validation.after_pass_rate);
@@ -1710,7 +1646,7 @@ export async function cliMain(): Promise<void> {
       result.proposal.confidence < Number.parseFloat(values.confidence ?? "0.6")
     ) {
       console.error(
-        `  Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
+        `  Confidence ${result.proposal.confidence.toFixed(2)} below review threshold ${values.confidence ?? "0.6"} (validated anyway)`,
       );
     }
     // Targeted suggestions based on specific failure reason

package/cli/selftune/evolution/stopping-criteria.ts CHANGED

@@ -2,7 +2,7 @@
  * stopping-criteria.ts
  *
  * Evaluates whether the evolution loop should stop based on convergence,
- * iteration limits, confidence thresholds, and plateau detection.
+ * iteration limits, and plateau detection.
  * Pure function module with no external dependencies.
  */
@@ -25,17 +25,14 @@ export interface StoppingDecision {
  * Checks conditions in priority order:
  *   1. Converged (pass rate >= 95%)
  *   2. Max iterations reached
- *   3. Low confidence (below threshold)
- *   4. Plateau (< 1% variation over last 3 iterations)
- *   5. Continue (none of the above)
+ *   3. Plateau (< 1% variation over last 3 iterations)
+ *   4. Continue (none of the above)
  */
 export function evaluateStoppingCriteria(
   currentPassRate: number,
   previousPassRates: number[],
   iterationCount: number,
   maxIterations: number,
-  confidenceThreshold: number,
-  proposalConfidence: number,
 ): StoppingDecision {
   // 1. Converged
   if (currentPassRate >= 0.95) {
@@ -47,12 +44,7 @@ export function evaluateStoppingCriteria(
     return { shouldStop: true, reason: "Max iterations reached" };
   }
-  // 3. Low confidence
-  if (proposalConfidence < confidenceThreshold) {
-    return { shouldStop: true, reason: "Confidence below threshold" };
-  }
-  // 4. Plateau detection: need at least 2 previous rates to form 3 data points
+  // 3. Plateau detection: need at least 2 previous rates to form 3 data points
   if (previousPassRates.length >= 2) {
     const last2Previous = previousPassRates.slice(-2);
     const window = [...last2Previous, currentPassRate];
@@ -64,6 +56,6 @@ export function evaluateStoppingCriteria(
     }
   }
-  // 5. Continue
+  // 4. Continue
   return { shouldStop: false, reason: "Continuing: improvement possible" };
 }

package/cli/selftune/evolution/unblock-suggestions.ts CHANGED

@@ -83,22 +83,6 @@ export function buildUnblockSuggestions(result: EvolveResult, skillName: string)
     return suggestions;
   }
-  // --- Confidence failures (specific before general) ---
-  if (reason.includes("No candidates met confidence")) {
-    suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
-    suggestions.push(
-      `Or increase candidates: selftune evolve --skill ${skillName} --pareto --candidates 5`,
-    );
-    appendQualityHints(suggestions, descText, skillName);
-    return suggestions;
-  }
-  if (reason.toLowerCase().includes("confidence") && reason.includes("threshold")) {
-    suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
-    suggestions.push("Or add more eval entries so the LLM has more context for proposals");
-    appendQualityHints(suggestions, descText, skillName);
-    return suggestions;
-  }
   // --- Validation failures (proposals regressed) ---
   if (reason.includes("Validation failed after")) {
     suggestions.push(

package/cli/selftune/evolution/validate-host-replay.ts CHANGED

@@ -1,5 +1,6 @@
 import {
   existsSync,
+  copyFileSync,
   mkdirSync,
   mkdtempSync,
   readFileSync,
@@ -16,7 +17,13 @@ import {
   emitDashboardActionMetrics,
   emitDashboardActionProgress,
 } from "../dashboard-action-events.js";
-import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
+import type {
+  EvalEntry,
+  ReplayStagingMode,
+  RuntimeReplayEntryMetrics,
+  RoutingReplayEntryResult,
+  RoutingReplayFixture,
+} from "../types.js";
 import type { DashboardActionMetrics } from "../dashboard-contract.js";
 import { parseFrontmatter } from "../utils/frontmatter.js";
 import {
@@ -45,6 +52,7 @@ interface ReplayWorkspace {
   skillRegistryDir: string;
   targetSkillPath: string;
   competingSkillPaths: string[];
+  allowedReadRoots: string[];
 }
 export type RuntimeReplayContentTarget = "routing" | "description" | "body";
@@ -65,6 +73,7 @@ export interface RuntimeReplayObservation {
   rawOutput: string;
   sessionId?: string;
   runtimeError?: string;
+  metrics?: DashboardActionMetrics;
 }
 export type RuntimeReplayInvoker = (
@@ -162,6 +171,7 @@ export function buildRoutingReplayFixture(options: {
   platform?: RoutingReplayFixture["platform"];
   fixtureId?: string;
   workspaceRoot?: string;
+  stagingMode?: ReplayStagingMode;
 }): RoutingReplayFixture {
   const targetSkillPath = resolveReplayPath(options.skillPath);
   const workspaceRoot =
@@ -175,6 +185,7 @@ export function buildRoutingReplayFixture(options: {
     target_skill_path: targetSkillPath,
     competing_skill_paths: listCompetingSkillPaths(targetSkillPath),
     ...(workspaceRoot ? { workspace_root: workspaceRoot } : {}),
+    ...(options.stagingMode ? { skill_staging_mode: options.stagingMode } : {}),
   };
 }
@@ -193,14 +204,32 @@ function buildRuntimeReplayTargetContent(
   return replaceSection(currentContent, "Workflow Routing", content.trim());
 }
+function copyDirectoryRecursive(sourceDir: string, destinationDir: string): void {
+  mkdirSync(destinationDir, { recursive: true });
+  for (const entry of readdirSync(sourceDir, { withFileTypes: true })) {
+    const sourcePath = join(sourceDir, entry.name);
+    const destinationPath = join(destinationDir, entry.name);
+    if (entry.isDirectory()) {
+      copyDirectoryRecursive(sourcePath, destinationPath);
+      continue;
+    }
+    copyFileSync(sourcePath, destinationPath);
+  }
+}
 function stageReplaySkill(
   registryDir: string,
   sourceSkillPath: string,
+  stagingMode: ReplayStagingMode,
   overrideContent?: string,
 ): string {
   const skillDirName = basename(dirname(sourceSkillPath)) || "unknown-skill";
   const destinationDir = join(registryDir, skillDirName);
-  mkdirSync(destinationDir, { recursive: true });
+  if (stagingMode === "package") {
+    copyDirectoryRecursive(dirname(sourceSkillPath), destinationDir);
+  } else {
+    mkdirSync(destinationDir, { recursive: true });
+  }
   const destinationPath = join(destinationDir, "SKILL.md");
   const content = overrideContent ?? readFileSync(sourceSkillPath, "utf8");
   writeFileSync(destinationPath, content, "utf8");
@@ -211,27 +240,43 @@ function buildRuntimeReplayWorkspace(
   fixture: RoutingReplayFixture,
   content: string,
   contentTarget: RuntimeReplayContentTarget,
+  includeTargetSkill: boolean = true,
 ): ReplayWorkspace {
   const rootDir = mkdtempSync(join(tmpdir(), "selftune-runtime-replay-"));
   try {
     const registryDir = join(rootDir, getRuntimeReplayRegistryRelativeDir(fixture.platform));
     mkdirSync(join(rootDir, ".git"), { recursive: true });
     mkdirSync(registryDir, { recursive: true });
-    const targetSkillPath = stageReplaySkill(
+    const stagingMode = fixture.skill_staging_mode ?? "routing";
+    const allowedReadRoots: string[] = [];
+    const targetSkillDir = join(
       registryDir,
-      fixture.target_skill_path,
-      buildRuntimeReplayTargetContent(fixture.target_skill_path, content, contentTarget),
+      basename(dirname(fixture.target_skill_path)) || "unknown-skill",
     );
+    const targetSkillPath = join(targetSkillDir, "SKILL.md");
+    if (includeTargetSkill) {
+      const stagedTargetSkillPath = stageReplaySkill(
+        registryDir,
+        fixture.target_skill_path,
+        stagingMode,
+        buildRuntimeReplayTargetContent(fixture.target_skill_path, content, contentTarget),
+      );
+      allowedReadRoots.push(dirname(stagedTargetSkillPath));
+    }
     const competingSkillPaths = fixture.competing_skill_paths.map((skillPath) =>
-      stageReplaySkill(registryDir, skillPath),
+      stageReplaySkill(registryDir, skillPath, stagingMode),
     );
+    for (const skillPath of competingSkillPaths) {
+      allowedReadRoots.push(dirname(skillPath));
+    }
     return {
       rootDir,
       skillRegistryDir: registryDir,
       targetSkillPath,
       competingSkillPaths,
+      allowedReadRoots,
     };
   } catch (error) {
     rmSync(rootDir, { recursive: true, force: true });
@@ -433,6 +478,42 @@ export function extractClaudeRuntimeReplayMetrics(line: string): DashboardAction
   return null;
 }
+function mergeRuntimeReplayDashboardMetrics(
+  previous: DashboardActionMetrics | null,
+  next: DashboardActionMetrics,
+): DashboardActionMetrics {
+  if (!previous) return next;
+  return {
+    platform: next.platform ?? previous.platform,
+    model: next.model ?? previous.model,
+    session_id: next.session_id ?? previous.session_id,
+    input_tokens: next.input_tokens ?? previous.input_tokens,
+    output_tokens: next.output_tokens ?? previous.output_tokens,
+    cache_creation_input_tokens:
+      next.cache_creation_input_tokens ?? previous.cache_creation_input_tokens,
+    cache_read_input_tokens: next.cache_read_input_tokens ?? previous.cache_read_input_tokens,
+    total_cost_usd: next.total_cost_usd ?? previous.total_cost_usd,
+    duration_ms: next.duration_ms ?? previous.duration_ms,
+    num_turns: next.num_turns ?? previous.num_turns,
+  };
+}
+function buildRuntimeReplayEntryMetrics(
+  metrics: DashboardActionMetrics | undefined,
+  elapsedMs: number,
+): RuntimeReplayEntryMetrics {
+  return {
+    input_tokens: metrics?.input_tokens ?? null,
+    output_tokens: metrics?.output_tokens ?? null,
+    cache_creation_input_tokens: metrics?.cache_creation_input_tokens ?? null,
+    cache_read_input_tokens: metrics?.cache_read_input_tokens ?? null,
+    total_cost_usd: metrics?.total_cost_usd ?? null,
+    duration_ms: metrics?.duration_ms ?? elapsedMs,
+    num_turns: metrics?.num_turns ?? null,
+  };
+}
 async function readStreamText(
   stream: ReadableStream<Uint8Array> | null | undefined,
   onLine?: (line: string) => void,
@@ -725,10 +806,14 @@ async function invokeClaudeRuntimeReplay(
   });
   const timeout = setTimeout(() => proc.kill(), CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS);
+  let latestMetrics: DashboardActionMetrics | null = null;
   const [stdoutText, stderrText, exitCode] = await Promise.all([
     readStreamText(proc.stdout, (line) => {
       const metrics = extractClaudeRuntimeReplayMetrics(line);
-      if (metrics) emitDashboardActionMetrics(metrics);
+      if (metrics) {
+        latestMetrics = mergeRuntimeReplayDashboardMetrics(latestMetrics, metrics);
+        emitDashboardActionMetrics(latestMetrics);
+      }
     }),
     new Response(proc.stderr).text(),
     proc.exited,
@@ -746,6 +831,7 @@ async function invokeClaudeRuntimeReplay(
   return {
     ...observation,
+    ...(latestMetrics ? { metrics: latestMetrics } : {}),
     ...(combinedError ? { runtimeError: combinedError } : {}),
   };
 }
@@ -850,10 +936,9 @@ function evaluateRuntimeReplayObservation(
   const normalizedReadPaths = new Set(
     observation.readSkillPaths.map((path) => resolveObservedReplayPath(path, workspace.rootDir)),
   );
-  const allowedReadPaths = new Set([
-    resolveReplayPath(workspace.targetSkillPath),
-    ...workspace.competingSkillPaths.map(resolveReplayPath),
-  ]);
+  const allowedReadRoots = workspace.allowedReadRoots.map(resolveReplayPath);
+  const isAllowedReadPath = (path: string): boolean =>
+    allowedReadRoots.some((root) => path === root || path.startsWith(`${root}/`));
   const targetSkillName = fixture.target_skill_name.trim();
   const targetTriggered = observation.triggeredSkillNames.includes(targetSkillName);
   const competingTriggered = observation.triggeredSkillNames.find((skillName) =>
@@ -864,10 +949,16 @@ function evaluateRuntimeReplayObservation(
   const unrelatedTriggered = observation.triggeredSkillNames.find(
     (skillName) => skillName.trim() !== targetSkillName && skillName.trim() !== competingTriggered,
   );
-  const unrelatedReadPaths = [...normalizedReadPaths].filter((path) => !allowedReadPaths.has(path));
-  const targetRead = normalizedReadPaths.has(resolveReplayPath(workspace.targetSkillPath));
+  const unrelatedReadPaths = [...normalizedReadPaths].filter((path) => !isAllowedReadPath(path));
+  const targetReadRoot = resolveReplayPath(dirname(workspace.targetSkillPath));
+  const targetRead = [...normalizedReadPaths].some(
+    (path) => path === targetReadRoot || path.startsWith(`${targetReadRoot}/`),
+  );
   const competingRead = workspace.competingSkillPaths.find((skillPath) =>
-    normalizedReadPaths.has(resolveReplayPath(skillPath)),
+    [...normalizedReadPaths].some((path) => {
+      const root = resolveReplayPath(dirname(skillPath));
+      return path === root || path.startsWith(`${root}/`);
+    }),
   );
   const sessionPrefix = observation.sessionId
     ? `runtime replay session ${observation.sessionId}`
@@ -1126,6 +1217,7 @@ export function buildRuntimeReplayValidationOptions(options: {
   skillPath: string;
   agent: string | null | undefined;
   contentTarget?: RuntimeReplayContentTarget;
+  stagingMode?: ReplayStagingMode;
 }): ReplayValidationOptions | undefined {
   const platform = resolveRuntimeReplayPlatform(options.agent);
   if (!platform) return undefined;
@@ -1135,6 +1227,7 @@ export function buildRuntimeReplayValidationOptions(options: {
       skillName: options.skillName,
       skillPath: options.skillPath,
       platform,
+      stagingMode: options.stagingMode,
     });
     return {
@@ -1157,6 +1250,7 @@ export async function runHostRuntimeReplayFixture(options: {
   evalSet: EvalEntry[];
   fixture: RoutingReplayFixture;
   contentTarget?: RuntimeReplayContentTarget;
+  includeTargetSkill?: boolean;
   runtimeInvoker?: RuntimeReplayInvoker;
 }): Promise<RoutingReplayEntryResult[]> {
   const invokeRuntime =
@@ -1168,6 +1262,7 @@ export async function runHostRuntimeReplayFixture(options: {
       options.fixture,
       options.routing,
       options.contentTarget ?? "routing",
+      options.includeTargetSkill ?? true,
     );
     const results: RoutingReplayEntryResult[] = [];
     const total = options.evalSet.length;
@@ -1175,6 +1270,7 @@ export async function runHostRuntimeReplayFixture(options: {
     for (const [index, entry] of options.evalSet.entries()) {
       const current = index + 1;
       const querySnippet = truncateReplayText(entry.query, 120);
+      const startedAt = Date.now();
       emitDashboardActionProgress({
         current,
@@ -1201,6 +1297,10 @@ export async function runHostRuntimeReplayFixture(options: {
           observation,
           workspace,
         );
+        result.runtime_metrics = buildRuntimeReplayEntryMetrics(
+          observation.metrics,
+          Date.now() - startedAt,
+        );
         results.push(result);
         emitDashboardActionProgress({