selftune 0.2.31 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/routes/actions.ts +81 -15
  50. package/cli/selftune/routes/overview.ts +1 -1
  51. package/cli/selftune/routes/skill-report.ts +147 -2
  52. package/cli/selftune/run.ts +18 -0
  53. package/cli/selftune/schedule.ts +3 -3
  54. package/cli/selftune/search-run.ts +703 -0
  55. package/cli/selftune/status.ts +35 -11
  56. package/cli/selftune/testing-readiness.ts +431 -40
  57. package/cli/selftune/types.ts +316 -0
  58. package/cli/selftune/utils/eval-readiness.ts +1 -0
  59. package/cli/selftune/utils/json-output.ts +11 -0
  60. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  61. package/cli/selftune/utils/query-filter.ts +82 -1
  62. package/cli/selftune/utils/tui.ts +85 -2
  63. package/cli/selftune/verify.ts +205 -0
  64. package/cli/selftune/workflows/proposals.ts +1 -1
  65. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  66. package/cli/selftune/workflows/workflows.ts +4 -4
  67. package/package.json +1 -1
  68. package/skill/SKILL.md +148 -85
  69. package/skill/references/cli-quick-reference.md +16 -1
  70. package/skill/references/creator-playbook.md +31 -10
  71. package/skill/workflows/Baseline.md +8 -9
  72. package/skill/workflows/Contributions.md +4 -4
  73. package/skill/workflows/Create.md +173 -0
  74. package/skill/workflows/CreateTestDeploy.md +34 -30
  75. package/skill/workflows/Cron.md +2 -2
  76. package/skill/workflows/Dashboard.md +3 -3
  77. package/skill/workflows/Evals.md +13 -7
  78. package/skill/workflows/Evolve.md +75 -32
  79. package/skill/workflows/EvolveBody.md +22 -15
  80. package/skill/workflows/Hook.md +1 -1
  81. package/skill/workflows/Improve.md +168 -0
  82. package/skill/workflows/Initialize.md +3 -3
  83. package/skill/workflows/Orchestrate.md +49 -12
  84. package/skill/workflows/Publish.md +100 -0
  85. package/skill/workflows/Run.md +72 -0
  86. package/skill/workflows/Schedule.md +2 -2
  87. package/skill/workflows/SearchRun.md +89 -0
  88. package/skill/workflows/SignalsDashboard.md +2 -2
  89. package/skill/workflows/UnitTest.md +13 -4
  90. package/skill/workflows/Verify.md +136 -0
  91. package/skill/workflows/Watch.md +114 -47
  92. package/skill/workflows/Workflows.md +13 -8
  93. package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
  94. package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
  95. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -433,37 +433,6 @@ export async function evolveBody(
433
433
  }
434
434
  }
435
435
 
436
- // Check confidence threshold
437
- if (proposal.confidence < confidenceThreshold) {
438
- recordAudit(
439
- proposal.proposal_id,
440
- "rejected",
441
- `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
442
- );
443
- recordEvidence({
444
- timestamp: new Date().toISOString(),
445
- proposal_id: proposal.proposal_id,
446
- skill_name: skillName,
447
- skill_path: skillPath,
448
- target,
449
- stage: "rejected",
450
- rationale: proposal.rationale,
451
- confidence: proposal.confidence,
452
- details: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
453
- });
454
-
455
- if (iteration === maxIterations - 1) {
456
- return {
457
- proposal: lastProposal,
458
- validation: null,
459
- deployed: false,
460
- auditEntries,
461
- reason: `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`,
462
- };
463
- }
464
- continue;
465
- }
466
-
467
436
  // Validate (validationModel overrides studentModel for validation calls)
468
437
  const validationModelFlag = options.validationModel ?? studentModel;
469
438
  let validation: BodyValidationResult;
@@ -544,6 +513,10 @@ export async function evolveBody(
544
513
  }
545
514
  lastValidation = validation;
546
515
  const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
516
+ const confidenceReviewNote =
517
+ proposal.confidence < confidenceThreshold
518
+ ? ` (confidence ${proposal.confidence.toFixed(2)} below review threshold ${confidenceThreshold})`
519
+ : "";
547
520
 
548
521
  recordAudit(
549
522
  proposal.proposal_id,
@@ -552,7 +525,7 @@ export async function evolveBody(
552
525
  validation.validation_fallback_reason
553
526
  ? ` (replay fallback: ${validation.validation_fallback_reason})`
554
527
  : ""
555
- }`,
528
+ }${confidenceReviewNote}`,
556
529
  {
557
530
  validation_mode: validation.validation_mode,
558
531
  validation_agent: validation.validation_agent,
@@ -573,7 +546,7 @@ export async function evolveBody(
573
546
  validation.validation_fallback_reason
574
547
  ? ` (replay fallback: ${validation.validation_fallback_reason})`
575
548
  : ""
576
- }`,
549
+ }${confidenceReviewNote}`,
577
550
  validation: {
578
551
  improved: validation.improved,
579
552
  gates_passed: validation.gates_passed,
@@ -641,7 +614,7 @@ export async function evolveBody(
641
614
  validation.validation_fallback_reason
642
615
  ? ` (replay fallback: ${validation.validation_fallback_reason})`
643
616
  : ""
644
- }`,
617
+ }${confidenceReviewNote}`,
645
618
  {
646
619
  validation_mode: validation.validation_mode,
647
620
  validation_agent: validation.validation_agent,
@@ -662,7 +635,7 @@ export async function evolveBody(
662
635
  validation.validation_fallback_reason
663
636
  ? ` (replay fallback: ${validation.validation_fallback_reason})`
664
637
  : ""
665
- }`,
638
+ }${confidenceReviewNote}`,
666
639
  validation: {
667
640
  improved: validation.improved,
668
641
  gates_passed: validation.gates_passed,
@@ -886,7 +859,7 @@ Options:
886
859
  --eval-set Path to eval set JSON
887
860
  --dry-run Validate without deploying
888
861
  --max-iterations Max refinement iterations (default: 3)
889
- --confidence Confidence threshold 0.0-1.0 (default: 0.6)
862
+ --confidence Low-confidence review threshold 0.0-1.0 (default: 0.6)
890
863
  --task-description Optional task description context
891
864
  --few-shot Comma-separated paths to example skill files
892
865
  --validation-model Model for trigger-check validation calls (overrides --student-model for validation)
@@ -79,7 +79,7 @@ export interface EvolveOptions {
79
79
  evalSetPath?: string;
80
80
  agent: string;
81
81
  dryRun: boolean;
82
- confidenceThreshold: number; // default 0.6
82
+ confidenceThreshold: number; // warning/review threshold, default 0.6
83
83
  maxIterations: number; // default 3
84
84
  gradingResults?: GradingResult[];
85
85
  paretoEnabled?: boolean;
@@ -713,23 +713,9 @@ export async function evolve(
713
713
  );
714
714
  llmCallCount += candidateCount;
715
715
 
716
- // Filter by confidence threshold
717
- const viableCandidates = candidates.filter((c) => c.confidence >= confidenceThreshold);
718
-
719
- if (viableCandidates.length === 0) {
720
- finishTui();
721
- return withStats({
722
- proposal: candidates[0] ?? null,
723
- validation: null,
724
- deployed: false,
725
- auditEntries,
726
- reason: `No candidates met confidence threshold ${confidenceThreshold}`,
727
- });
728
- }
729
-
730
716
  // Validate each candidate
731
717
  const paretoCandidates: ParetoCandidate[] = [];
732
- for (const proposal of viableCandidates) {
718
+ for (const proposal of candidates) {
733
719
  recordAudit(
734
720
  proposal.proposal_id,
735
721
  "created",
@@ -855,7 +841,7 @@ export async function evolve(
855
841
  if (paretoCandidates.length === 0) {
856
842
  finishTui();
857
843
  return withStats({
858
- proposal: viableCandidates[0],
844
+ proposal: candidates[0] ?? null,
859
845
  validation: null,
860
846
  deployed: false,
861
847
  auditEntries,
@@ -932,15 +918,12 @@ export async function evolve(
932
918
  // Re-evaluate stopping after a constitutional rejection by treating the
933
919
  // last entry in previousPassRates as the currentPassRate (or 0 on the
934
920
  // first iteration) and slicing it out of history before calling
935
- // evaluateStoppingCriteria() with the current iteration/maxIterations,
936
- // confidenceThreshold, and proposal.confidence.
921
+ // evaluateStoppingCriteria() with the current iteration/maxIterations.
937
922
  const constitutionStop = evaluateStoppingCriteria(
938
923
  previousPassRates.at(-1) ?? 0,
939
924
  previousPassRates.slice(0, -1),
940
925
  iteration + 1,
941
926
  maxIterations,
942
- confidenceThreshold,
943
- proposal.confidence,
944
927
  );
945
928
  recordAudit(
946
929
  proposal.proposal_id,
@@ -971,52 +954,7 @@ export async function evolve(
971
954
  continue;
972
955
  }
973
956
 
974
- // Step 9: Check confidence threshold via stopping criteria
975
- {
976
- const preValidationStop = evaluateStoppingCriteria(
977
- previousPassRates.at(-1) ?? 0,
978
- previousPassRates.slice(0, -1),
979
- iteration + 1,
980
- maxIterations,
981
- confidenceThreshold,
982
- proposal.confidence,
983
- );
984
- if (proposal.confidence < confidenceThreshold) {
985
- feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`;
986
- recordAudit(
987
- proposal.proposal_id,
988
- "rejected",
989
- `${feedbackReason} (stopping: ${preValidationStop.reason})`,
990
- );
991
- recordEvidence({
992
- timestamp: new Date().toISOString(),
993
- proposal_id: proposal.proposal_id,
994
- skill_name: skillName,
995
- skill_path: skillPath,
996
- target: "description",
997
- stage: "rejected",
998
- rationale: proposal.rationale,
999
- confidence: proposal.confidence,
1000
- details: `${feedbackReason} (stopping: ${preValidationStop.reason})`,
1001
- });
1002
-
1003
- // Use stopping criteria to decide whether to return or retry
1004
- if (preValidationStop.shouldStop) {
1005
- finishTui();
1006
- return withStats({
1007
- proposal: lastProposal,
1008
- validation: null,
1009
- deployed: false,
1010
- auditEntries,
1011
- reason: `${feedbackReason} (${preValidationStop.reason})`,
1012
- });
1013
- }
1014
-
1015
- continue;
1016
- }
1017
- }
1018
-
1019
- // Step 10: Validate against eval set
957
+ // Step 9: Validate against eval set
1020
958
  const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
1021
959
  tui.step(
1022
960
  `Validating ${evalSet.length} entries (mode=${effectiveValidationMode}, ${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
@@ -1038,7 +976,7 @@ export async function evolve(
1038
976
  `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
1039
977
  );
1040
978
 
1041
- // Step 11: Audit "validated"
979
+ // Step 10: Audit "validated"
1042
980
  const evalSnapshot: EvalPassRate = {
1043
981
  total: evalSet.length,
1044
982
  passed: Math.round(validation.after_pass_rate * evalSet.length),
@@ -1094,14 +1032,12 @@ export async function evolve(
1094
1032
  },
1095
1033
  });
1096
1034
 
1097
- // Step 12: Evaluate stopping criteria after validation
1035
+ // Step 11: Evaluate stopping criteria after validation
1098
1036
  const stopping = evaluateStoppingCriteria(
1099
1037
  validation.after_pass_rate,
1100
1038
  previousPassRates,
1101
1039
  iteration + 1,
1102
1040
  maxIterations,
1103
- confidenceThreshold,
1104
- proposal.confidence,
1105
1041
  );
1106
1042
  previousPassRates.push(validation.after_pass_rate);
1107
1043
 
@@ -1710,7 +1646,7 @@ export async function cliMain(): Promise<void> {
1710
1646
  result.proposal.confidence < Number.parseFloat(values.confidence ?? "0.6")
1711
1647
  ) {
1712
1648
  console.error(
1713
- ` Confidence ${result.proposal.confidence.toFixed(2)} below threshold ${values.confidence ?? "0.6"}`,
1649
+ ` Confidence ${result.proposal.confidence.toFixed(2)} below review threshold ${values.confidence ?? "0.6"} (validated anyway)`,
1714
1650
  );
1715
1651
  }
1716
1652
  // Targeted suggestions based on specific failure reason
@@ -2,7 +2,7 @@
2
2
  * stopping-criteria.ts
3
3
  *
4
4
  * Evaluates whether the evolution loop should stop based on convergence,
5
- * iteration limits, confidence thresholds, and plateau detection.
5
+ * iteration limits, and plateau detection.
6
6
  * Pure function module with no external dependencies.
7
7
  */
8
8
 
@@ -25,17 +25,14 @@ export interface StoppingDecision {
25
25
  * Checks conditions in priority order:
26
26
  * 1. Converged (pass rate >= 95%)
27
27
  * 2. Max iterations reached
28
- * 3. Low confidence (below threshold)
29
- * 4. Plateau (< 1% variation over last 3 iterations)
30
- * 5. Continue (none of the above)
28
+ * 3. Plateau (< 1% variation over last 3 iterations)
29
+ * 4. Continue (none of the above)
31
30
  */
32
31
  export function evaluateStoppingCriteria(
33
32
  currentPassRate: number,
34
33
  previousPassRates: number[],
35
34
  iterationCount: number,
36
35
  maxIterations: number,
37
- confidenceThreshold: number,
38
- proposalConfidence: number,
39
36
  ): StoppingDecision {
40
37
  // 1. Converged
41
38
  if (currentPassRate >= 0.95) {
@@ -47,12 +44,7 @@ export function evaluateStoppingCriteria(
47
44
  return { shouldStop: true, reason: "Max iterations reached" };
48
45
  }
49
46
 
50
- // 3. Low confidence
51
- if (proposalConfidence < confidenceThreshold) {
52
- return { shouldStop: true, reason: "Confidence below threshold" };
53
- }
54
-
55
- // 4. Plateau detection: need at least 2 previous rates to form 3 data points
47
+ // 3. Plateau detection: need at least 2 previous rates to form 3 data points
56
48
  if (previousPassRates.length >= 2) {
57
49
  const last2Previous = previousPassRates.slice(-2);
58
50
  const window = [...last2Previous, currentPassRate];
@@ -64,6 +56,6 @@ export function evaluateStoppingCriteria(
64
56
  }
65
57
  }
66
58
 
67
- // 5. Continue
59
+ // 4. Continue
68
60
  return { shouldStop: false, reason: "Continuing: improvement possible" };
69
61
  }
@@ -83,22 +83,6 @@ export function buildUnblockSuggestions(result: EvolveResult, skillName: string)
83
83
  return suggestions;
84
84
  }
85
85
 
86
- // --- Confidence failures (specific before general) ---
87
- if (reason.includes("No candidates met confidence")) {
88
- suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
89
- suggestions.push(
90
- `Or increase candidates: selftune evolve --skill ${skillName} --pareto --candidates 5`,
91
- );
92
- appendQualityHints(suggestions, descText, skillName);
93
- return suggestions;
94
- }
95
- if (reason.toLowerCase().includes("confidence") && reason.includes("threshold")) {
96
- suggestions.push(`Lower the threshold: selftune evolve --skill ${skillName} --confidence 0.4`);
97
- suggestions.push("Or add more eval entries so the LLM has more context for proposals");
98
- appendQualityHints(suggestions, descText, skillName);
99
- return suggestions;
100
- }
101
-
102
86
  // --- Validation failures (proposals regressed) ---
103
87
  if (reason.includes("Validation failed after")) {
104
88
  suggestions.push(
@@ -1,5 +1,6 @@
1
1
  import {
2
2
  existsSync,
3
+ copyFileSync,
3
4
  mkdirSync,
4
5
  mkdtempSync,
5
6
  readFileSync,
@@ -16,7 +17,13 @@ import {
16
17
  emitDashboardActionMetrics,
17
18
  emitDashboardActionProgress,
18
19
  } from "../dashboard-action-events.js";
19
- import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
20
+ import type {
21
+ EvalEntry,
22
+ ReplayStagingMode,
23
+ RuntimeReplayEntryMetrics,
24
+ RoutingReplayEntryResult,
25
+ RoutingReplayFixture,
26
+ } from "../types.js";
20
27
  import type { DashboardActionMetrics } from "../dashboard-contract.js";
21
28
  import { parseFrontmatter } from "../utils/frontmatter.js";
22
29
  import {
@@ -45,6 +52,7 @@ interface ReplayWorkspace {
45
52
  skillRegistryDir: string;
46
53
  targetSkillPath: string;
47
54
  competingSkillPaths: string[];
55
+ allowedReadRoots: string[];
48
56
  }
49
57
 
50
58
  export type RuntimeReplayContentTarget = "routing" | "description" | "body";
@@ -65,6 +73,7 @@ export interface RuntimeReplayObservation {
65
73
  rawOutput: string;
66
74
  sessionId?: string;
67
75
  runtimeError?: string;
76
+ metrics?: DashboardActionMetrics;
68
77
  }
69
78
 
70
79
  export type RuntimeReplayInvoker = (
@@ -162,6 +171,7 @@ export function buildRoutingReplayFixture(options: {
162
171
  platform?: RoutingReplayFixture["platform"];
163
172
  fixtureId?: string;
164
173
  workspaceRoot?: string;
174
+ stagingMode?: ReplayStagingMode;
165
175
  }): RoutingReplayFixture {
166
176
  const targetSkillPath = resolveReplayPath(options.skillPath);
167
177
  const workspaceRoot =
@@ -175,6 +185,7 @@ export function buildRoutingReplayFixture(options: {
175
185
  target_skill_path: targetSkillPath,
176
186
  competing_skill_paths: listCompetingSkillPaths(targetSkillPath),
177
187
  ...(workspaceRoot ? { workspace_root: workspaceRoot } : {}),
188
+ ...(options.stagingMode ? { skill_staging_mode: options.stagingMode } : {}),
178
189
  };
179
190
  }
180
191
 
@@ -193,14 +204,32 @@ function buildRuntimeReplayTargetContent(
193
204
  return replaceSection(currentContent, "Workflow Routing", content.trim());
194
205
  }
195
206
 
207
+ function copyDirectoryRecursive(sourceDir: string, destinationDir: string): void {
208
+ mkdirSync(destinationDir, { recursive: true });
209
+ for (const entry of readdirSync(sourceDir, { withFileTypes: true })) {
210
+ const sourcePath = join(sourceDir, entry.name);
211
+ const destinationPath = join(destinationDir, entry.name);
212
+ if (entry.isDirectory()) {
213
+ copyDirectoryRecursive(sourcePath, destinationPath);
214
+ continue;
215
+ }
216
+ copyFileSync(sourcePath, destinationPath);
217
+ }
218
+ }
219
+
196
220
  function stageReplaySkill(
197
221
  registryDir: string,
198
222
  sourceSkillPath: string,
223
+ stagingMode: ReplayStagingMode,
199
224
  overrideContent?: string,
200
225
  ): string {
201
226
  const skillDirName = basename(dirname(sourceSkillPath)) || "unknown-skill";
202
227
  const destinationDir = join(registryDir, skillDirName);
203
- mkdirSync(destinationDir, { recursive: true });
228
+ if (stagingMode === "package") {
229
+ copyDirectoryRecursive(dirname(sourceSkillPath), destinationDir);
230
+ } else {
231
+ mkdirSync(destinationDir, { recursive: true });
232
+ }
204
233
  const destinationPath = join(destinationDir, "SKILL.md");
205
234
  const content = overrideContent ?? readFileSync(sourceSkillPath, "utf8");
206
235
  writeFileSync(destinationPath, content, "utf8");
@@ -211,27 +240,43 @@ function buildRuntimeReplayWorkspace(
211
240
  fixture: RoutingReplayFixture,
212
241
  content: string,
213
242
  contentTarget: RuntimeReplayContentTarget,
243
+ includeTargetSkill: boolean = true,
214
244
  ): ReplayWorkspace {
215
245
  const rootDir = mkdtempSync(join(tmpdir(), "selftune-runtime-replay-"));
216
246
  try {
217
247
  const registryDir = join(rootDir, getRuntimeReplayRegistryRelativeDir(fixture.platform));
218
248
  mkdirSync(join(rootDir, ".git"), { recursive: true });
219
249
  mkdirSync(registryDir, { recursive: true });
220
-
221
- const targetSkillPath = stageReplaySkill(
250
+ const stagingMode = fixture.skill_staging_mode ?? "routing";
251
+ const allowedReadRoots: string[] = [];
252
+ const targetSkillDir = join(
222
253
  registryDir,
223
- fixture.target_skill_path,
224
- buildRuntimeReplayTargetContent(fixture.target_skill_path, content, contentTarget),
254
+ basename(dirname(fixture.target_skill_path)) || "unknown-skill",
225
255
  );
256
+
257
+ const targetSkillPath = join(targetSkillDir, "SKILL.md");
258
+ if (includeTargetSkill) {
259
+ const stagedTargetSkillPath = stageReplaySkill(
260
+ registryDir,
261
+ fixture.target_skill_path,
262
+ stagingMode,
263
+ buildRuntimeReplayTargetContent(fixture.target_skill_path, content, contentTarget),
264
+ );
265
+ allowedReadRoots.push(dirname(stagedTargetSkillPath));
266
+ }
226
267
  const competingSkillPaths = fixture.competing_skill_paths.map((skillPath) =>
227
- stageReplaySkill(registryDir, skillPath),
268
+ stageReplaySkill(registryDir, skillPath, stagingMode),
228
269
  );
270
+ for (const skillPath of competingSkillPaths) {
271
+ allowedReadRoots.push(dirname(skillPath));
272
+ }
229
273
 
230
274
  return {
231
275
  rootDir,
232
276
  skillRegistryDir: registryDir,
233
277
  targetSkillPath,
234
278
  competingSkillPaths,
279
+ allowedReadRoots,
235
280
  };
236
281
  } catch (error) {
237
282
  rmSync(rootDir, { recursive: true, force: true });
@@ -433,6 +478,42 @@ export function extractClaudeRuntimeReplayMetrics(line: string): DashboardAction
433
478
  return null;
434
479
  }
435
480
 
481
+ function mergeRuntimeReplayDashboardMetrics(
482
+ previous: DashboardActionMetrics | null,
483
+ next: DashboardActionMetrics,
484
+ ): DashboardActionMetrics {
485
+ if (!previous) return next;
486
+
487
+ return {
488
+ platform: next.platform ?? previous.platform,
489
+ model: next.model ?? previous.model,
490
+ session_id: next.session_id ?? previous.session_id,
491
+ input_tokens: next.input_tokens ?? previous.input_tokens,
492
+ output_tokens: next.output_tokens ?? previous.output_tokens,
493
+ cache_creation_input_tokens:
494
+ next.cache_creation_input_tokens ?? previous.cache_creation_input_tokens,
495
+ cache_read_input_tokens: next.cache_read_input_tokens ?? previous.cache_read_input_tokens,
496
+ total_cost_usd: next.total_cost_usd ?? previous.total_cost_usd,
497
+ duration_ms: next.duration_ms ?? previous.duration_ms,
498
+ num_turns: next.num_turns ?? previous.num_turns,
499
+ };
500
+ }
501
+
502
+ function buildRuntimeReplayEntryMetrics(
503
+ metrics: DashboardActionMetrics | undefined,
504
+ elapsedMs: number,
505
+ ): RuntimeReplayEntryMetrics {
506
+ return {
507
+ input_tokens: metrics?.input_tokens ?? null,
508
+ output_tokens: metrics?.output_tokens ?? null,
509
+ cache_creation_input_tokens: metrics?.cache_creation_input_tokens ?? null,
510
+ cache_read_input_tokens: metrics?.cache_read_input_tokens ?? null,
511
+ total_cost_usd: metrics?.total_cost_usd ?? null,
512
+ duration_ms: metrics?.duration_ms ?? elapsedMs,
513
+ num_turns: metrics?.num_turns ?? null,
514
+ };
515
+ }
516
+
436
517
  async function readStreamText(
437
518
  stream: ReadableStream<Uint8Array> | null | undefined,
438
519
  onLine?: (line: string) => void,
@@ -725,10 +806,14 @@ async function invokeClaudeRuntimeReplay(
725
806
  });
726
807
  const timeout = setTimeout(() => proc.kill(), CLAUDE_RUNTIME_REPLAY_TIMEOUT_MS);
727
808
 
809
+ let latestMetrics: DashboardActionMetrics | null = null;
728
810
  const [stdoutText, stderrText, exitCode] = await Promise.all([
729
811
  readStreamText(proc.stdout, (line) => {
730
812
  const metrics = extractClaudeRuntimeReplayMetrics(line);
731
- if (metrics) emitDashboardActionMetrics(metrics);
813
+ if (metrics) {
814
+ latestMetrics = mergeRuntimeReplayDashboardMetrics(latestMetrics, metrics);
815
+ emitDashboardActionMetrics(latestMetrics);
816
+ }
732
817
  }),
733
818
  new Response(proc.stderr).text(),
734
819
  proc.exited,
@@ -746,6 +831,7 @@ async function invokeClaudeRuntimeReplay(
746
831
 
747
832
  return {
748
833
  ...observation,
834
+ ...(latestMetrics ? { metrics: latestMetrics } : {}),
749
835
  ...(combinedError ? { runtimeError: combinedError } : {}),
750
836
  };
751
837
  }
@@ -850,10 +936,9 @@ function evaluateRuntimeReplayObservation(
850
936
  const normalizedReadPaths = new Set(
851
937
  observation.readSkillPaths.map((path) => resolveObservedReplayPath(path, workspace.rootDir)),
852
938
  );
853
- const allowedReadPaths = new Set([
854
- resolveReplayPath(workspace.targetSkillPath),
855
- ...workspace.competingSkillPaths.map(resolveReplayPath),
856
- ]);
939
+ const allowedReadRoots = workspace.allowedReadRoots.map(resolveReplayPath);
940
+ const isAllowedReadPath = (path: string): boolean =>
941
+ allowedReadRoots.some((root) => path === root || path.startsWith(`${root}/`));
857
942
  const targetSkillName = fixture.target_skill_name.trim();
858
943
  const targetTriggered = observation.triggeredSkillNames.includes(targetSkillName);
859
944
  const competingTriggered = observation.triggeredSkillNames.find((skillName) =>
@@ -864,10 +949,16 @@ function evaluateRuntimeReplayObservation(
864
949
  const unrelatedTriggered = observation.triggeredSkillNames.find(
865
950
  (skillName) => skillName.trim() !== targetSkillName && skillName.trim() !== competingTriggered,
866
951
  );
867
- const unrelatedReadPaths = [...normalizedReadPaths].filter((path) => !allowedReadPaths.has(path));
868
- const targetRead = normalizedReadPaths.has(resolveReplayPath(workspace.targetSkillPath));
952
+ const unrelatedReadPaths = [...normalizedReadPaths].filter((path) => !isAllowedReadPath(path));
953
+ const targetReadRoot = resolveReplayPath(dirname(workspace.targetSkillPath));
954
+ const targetRead = [...normalizedReadPaths].some(
955
+ (path) => path === targetReadRoot || path.startsWith(`${targetReadRoot}/`),
956
+ );
869
957
  const competingRead = workspace.competingSkillPaths.find((skillPath) =>
870
- normalizedReadPaths.has(resolveReplayPath(skillPath)),
958
+ [...normalizedReadPaths].some((path) => {
959
+ const root = resolveReplayPath(dirname(skillPath));
960
+ return path === root || path.startsWith(`${root}/`);
961
+ }),
871
962
  );
872
963
  const sessionPrefix = observation.sessionId
873
964
  ? `runtime replay session ${observation.sessionId}`
@@ -1126,6 +1217,7 @@ export function buildRuntimeReplayValidationOptions(options: {
1126
1217
  skillPath: string;
1127
1218
  agent: string | null | undefined;
1128
1219
  contentTarget?: RuntimeReplayContentTarget;
1220
+ stagingMode?: ReplayStagingMode;
1129
1221
  }): ReplayValidationOptions | undefined {
1130
1222
  const platform = resolveRuntimeReplayPlatform(options.agent);
1131
1223
  if (!platform) return undefined;
@@ -1135,6 +1227,7 @@ export function buildRuntimeReplayValidationOptions(options: {
1135
1227
  skillName: options.skillName,
1136
1228
  skillPath: options.skillPath,
1137
1229
  platform,
1230
+ stagingMode: options.stagingMode,
1138
1231
  });
1139
1232
 
1140
1233
  return {
@@ -1157,6 +1250,7 @@ export async function runHostRuntimeReplayFixture(options: {
1157
1250
  evalSet: EvalEntry[];
1158
1251
  fixture: RoutingReplayFixture;
1159
1252
  contentTarget?: RuntimeReplayContentTarget;
1253
+ includeTargetSkill?: boolean;
1160
1254
  runtimeInvoker?: RuntimeReplayInvoker;
1161
1255
  }): Promise<RoutingReplayEntryResult[]> {
1162
1256
  const invokeRuntime =
@@ -1168,6 +1262,7 @@ export async function runHostRuntimeReplayFixture(options: {
1168
1262
  options.fixture,
1169
1263
  options.routing,
1170
1264
  options.contentTarget ?? "routing",
1265
+ options.includeTargetSkill ?? true,
1171
1266
  );
1172
1267
  const results: RoutingReplayEntryResult[] = [];
1173
1268
  const total = options.evalSet.length;
@@ -1175,6 +1270,7 @@ export async function runHostRuntimeReplayFixture(options: {
1175
1270
  for (const [index, entry] of options.evalSet.entries()) {
1176
1271
  const current = index + 1;
1177
1272
  const querySnippet = truncateReplayText(entry.query, 120);
1273
+ const startedAt = Date.now();
1178
1274
 
1179
1275
  emitDashboardActionProgress({
1180
1276
  current,
@@ -1201,6 +1297,10 @@ export async function runHostRuntimeReplayFixture(options: {
1201
1297
  observation,
1202
1298
  workspace,
1203
1299
  );
1300
+ result.runtime_metrics = buildRuntimeReplayEntryMetrics(
1301
+ observation.metrics,
1302
+ Date.now() - startedAt,
1303
+ );
1204
1304
  results.push(result);
1205
1305
 
1206
1306
  emitDashboardActionProgress({