npm - selftune - Versions diffs - 0.2.22 → 0.2.24 - Mend

selftune 0.2.22 → 0.2.24

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (270)

package/CHANGELOG.md +6 -0
package/README.md +95 -15
package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
package/apps/local-dashboard/dist/index.html +5 -5
package/cli/selftune/adapters/codex/install.ts +310 -78
package/cli/selftune/adapters/opencode/install.ts +3 -4
package/cli/selftune/adapters/pi/hook.ts +273 -0
package/cli/selftune/adapters/pi/install.ts +207 -0
package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
package/cli/selftune/auto-update.ts +200 -8
package/cli/selftune/canonical-export.ts +55 -25
package/cli/selftune/command-surface.ts +397 -0
package/cli/selftune/constants.ts +10 -1
package/cli/selftune/contribute/contribute.ts +64 -13
package/cli/selftune/contribution-config.ts +57 -3
package/cli/selftune/contribution-preferences.ts +117 -0
package/cli/selftune/contribution-signals.ts +8 -4
package/cli/selftune/contribution-staging.ts +13 -2
package/cli/selftune/contributions.ts +55 -121
package/cli/selftune/creator-contributions.ts +29 -10
package/cli/selftune/cron/setup.ts +7 -3
package/cli/selftune/dashboard-contract.ts +87 -0
package/cli/selftune/dashboard-server.ts +168 -17
package/cli/selftune/dashboard.ts +350 -17
package/cli/selftune/eval/baseline.ts +21 -5
package/cli/selftune/eval/execution-eval.ts +170 -0
package/cli/selftune/eval/family-overlap.ts +2 -2
package/cli/selftune/eval/hooks-to-evals.ts +228 -82
package/cli/selftune/eval/import-skillsbench.ts +2 -2
package/cli/selftune/eval/invocation-classifier.ts +56 -0
package/cli/selftune/eval/synthetic-evals.ts +5 -3
package/cli/selftune/eval/unit-test-cli.ts +7 -4
package/cli/selftune/evolution/apply-proposal.ts +295 -0
package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
package/cli/selftune/evolution/evidence.ts +2 -6
package/cli/selftune/evolution/evolve-body.ts +152 -38
package/cli/selftune/evolution/evolve.ts +244 -52
package/cli/selftune/evolution/rollback.ts +0 -1
package/cli/selftune/evolution/validate-body.ts +111 -49
package/cli/selftune/evolution/validate-host-replay.ts +510 -60
package/cli/selftune/evolution/validate-proposal.ts +11 -150
package/cli/selftune/evolution/validate-routing.ts +51 -108
package/cli/selftune/evolution/validation-contract.ts +91 -0
package/cli/selftune/grading/auto-grade.ts +11 -7
package/cli/selftune/grading/grade-session.ts +10 -16
package/cli/selftune/hooks/skill-eval.ts +2 -1
package/cli/selftune/hooks-shared/types.ts +1 -0
package/cli/selftune/index.ts +58 -15
package/cli/selftune/ingestors/claude-replay.ts +15 -10
package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
package/cli/selftune/ingestors/pi-ingest.ts +727 -0
package/cli/selftune/init.ts +38 -4
package/cli/selftune/localdb/direct-write.ts +120 -1
package/cli/selftune/localdb/materialize.ts +6 -7
package/cli/selftune/localdb/queries/cron.ts +34 -0
package/cli/selftune/localdb/queries/dashboard.ts +834 -0
package/cli/selftune/localdb/queries/evolution.ts +158 -0
package/cli/selftune/localdb/queries/execution.ts +133 -0
package/cli/selftune/localdb/queries/json.ts +18 -0
package/cli/selftune/localdb/queries/monitoring.ts +263 -0
package/cli/selftune/localdb/queries/raw.ts +95 -0
package/cli/selftune/localdb/queries/staging.ts +270 -0
package/cli/selftune/localdb/queries/trust.ts +392 -0
package/cli/selftune/localdb/queries.ts +60 -2162
package/cli/selftune/localdb/schema.ts +59 -0
package/cli/selftune/monitoring/watch.ts +96 -29
package/cli/selftune/normalization.ts +3 -0
package/cli/selftune/observability.ts +12 -3
package/cli/selftune/orchestrate/cli.ts +161 -0
package/cli/selftune/orchestrate/execute.ts +295 -0
package/cli/selftune/orchestrate/finalize.ts +157 -0
package/cli/selftune/orchestrate/locks.ts +40 -0
package/cli/selftune/orchestrate/plan.ts +131 -0
package/cli/selftune/orchestrate/post-run.ts +59 -0
package/cli/selftune/orchestrate/prepare.ts +334 -0
package/cli/selftune/orchestrate/report.ts +182 -0
package/cli/selftune/orchestrate/runtime.ts +120 -0
package/cli/selftune/orchestrate/signals.ts +48 -0
package/cli/selftune/orchestrate.ts +162 -1142
package/cli/selftune/registry/client.ts +74 -0
package/cli/selftune/registry/history.ts +54 -0
package/cli/selftune/registry/index.ts +90 -0
package/cli/selftune/registry/install.ts +141 -0
package/cli/selftune/registry/list.ts +44 -0
package/cli/selftune/registry/push.ts +171 -0
package/cli/selftune/registry/rollback.ts +49 -0
package/cli/selftune/registry/status.ts +62 -0
package/cli/selftune/registry/sync.ts +125 -0
package/cli/selftune/repair/skill-usage.ts +9 -3
package/cli/selftune/routes/overview.ts +5 -2
package/cli/selftune/routes/skill-report.ts +15 -2
package/cli/selftune/schedule.ts +5 -5
package/cli/selftune/status.ts +70 -2
package/cli/selftune/sync.ts +127 -23
package/cli/selftune/testing-readiness.ts +597 -0
package/cli/selftune/types.ts +46 -5
package/cli/selftune/uninstall.ts +2 -1
package/cli/selftune/utils/canonical-log.ts +1 -9
package/cli/selftune/utils/cli-error.ts +9 -0
package/cli/selftune/utils/jsonl.ts +1 -30
package/cli/selftune/utils/llm-call.ts +126 -6
package/cli/selftune/utils/skill-discovery.ts +24 -0
package/cli/selftune/workflows/proposals.ts +184 -0
package/cli/selftune/workflows/skill-scaffold.ts +241 -0
package/cli/selftune/workflows/workflows.ts +100 -26
package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
package/node_modules/@selftune/telemetry-contract/package.json +1 -1
package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
package/package.json +25 -9
package/packages/dashboard-core/AGENTS.md +18 -0
package/packages/dashboard-core/README.md +30 -0
package/packages/dashboard-core/index.ts +3 -0
package/packages/dashboard-core/package.json +39 -0
package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
package/packages/dashboard-core/src/chrome/index.ts +14 -0
package/packages/dashboard-core/src/chrome/types.ts +81 -0
package/packages/dashboard-core/src/chrome/utils.ts +23 -0
package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
package/packages/dashboard-core/src/gates/index.ts +3 -0
package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
package/packages/dashboard-core/src/host/adapter.ts +47 -0
package/packages/dashboard-core/src/host/capabilities.ts +55 -0
package/packages/dashboard-core/src/host/index.ts +3 -0
package/packages/dashboard-core/src/models/analytics.ts +39 -0
package/packages/dashboard-core/src/models/index.ts +4 -0
package/packages/dashboard-core/src/models/overview.ts +98 -0
package/packages/dashboard-core/src/models/runtime.ts +7 -0
package/packages/dashboard-core/src/models/skills.ts +34 -0
package/packages/dashboard-core/src/routes/index.ts +2 -0
package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
package/packages/dashboard-core/src/routes/manifest.ts +451 -0
package/packages/dashboard-core/src/routes/types.ts +39 -0
package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
package/packages/dashboard-core/src/screens/index.ts +37 -0
package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
package/packages/telemetry-contract/package.json +1 -1
package/packages/telemetry-contract/src/index.ts +1 -0
package/packages/telemetry-contract/src/schemas.ts +63 -5
package/packages/telemetry-contract/src/types.ts +97 -7
package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
package/packages/ui/AGENTS.md +16 -0
package/packages/ui/README.md +1 -1
package/packages/ui/package.json +1 -1
package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
package/packages/ui/src/components/InfoTip.tsx +1 -2
package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
package/packages/ui/src/components/OverviewPanels.tsx +693 -0
package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
package/packages/ui/src/components/index.ts +56 -1
package/packages/ui/src/components/section-cards.tsx +18 -35
package/packages/ui/src/components/skill-health-grid.tsx +47 -37
package/packages/ui/src/lib/constants.tsx +0 -1
package/packages/ui/src/primitives/card.tsx +1 -1
package/packages/ui/src/primitives/checkbox.tsx +1 -1
package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
package/packages/ui/src/primitives/select.tsx +2 -2
package/packages/ui/src/primitives/tabs.tsx +7 -6
package/packages/ui/src/types.ts +182 -4
package/skill/SKILL.md +130 -318
package/skill/agents/diagnosis-analyst.md +3 -3
package/skill/agents/evolution-reviewer.md +3 -3
package/skill/agents/integration-guide.md +3 -3
package/skill/agents/pattern-analyst.md +2 -2
package/skill/references/cli-quick-reference.md +89 -0
package/skill/references/creator-playbook.md +131 -0
package/skill/references/examples.md +48 -0
package/skill/references/troubleshooting.md +47 -0
package/skill/references/version-history.md +1 -1
package/skill/selftune.contribute.json +11 -0
package/skill/{Workflows → workflows}/Baseline.md +20 -1
package/skill/{Workflows → workflows}/Contribute.md +23 -10
package/skill/{Workflows → workflows}/Contributions.md +13 -5
package/skill/workflows/CreateTestDeploy.md +170 -0
package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
package/skill/{Workflows → workflows}/Cron.md +1 -1
package/skill/{Workflows → workflows}/Dashboard.md +20 -0
package/skill/{Workflows → workflows}/Doctor.md +1 -1
package/skill/{Workflows → workflows}/Evals.md +67 -2
package/skill/{Workflows → workflows}/Evolve.md +119 -30
package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
package/skill/{Workflows → workflows}/Grade.md +1 -1
package/skill/{Workflows → workflows}/Ingest.md +60 -2
package/skill/{Workflows → workflows}/Initialize.md +16 -9
package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
package/skill/workflows/Registry.md +99 -0
package/skill/{Workflows → workflows}/Schedule.md +3 -3
package/skill/workflows/SignalsDashboard.md +87 -0
package/skill/{Workflows → workflows}/Sync.md +3 -1
package/skill/{Workflows → workflows}/UnitTest.md +19 -0
package/skill/{Workflows → workflows}/Watch.md +42 -2
package/skill/{Workflows → workflows}/Workflows.md +39 -2
package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
package/cli/selftune/utils/html.ts +0 -27
package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
/package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
/package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
/package/skill/{Workflows → workflows}/Badge.md +0 -0
/package/skill/{Workflows → workflows}/Composability.md +0 -0
/package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
/package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
/package/skill/{Workflows → workflows}/Hook.md +0 -0
/package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
/package/skill/{Workflows → workflows}/Quickstart.md +0 -0
/package/skill/{Workflows → workflows}/Recover.md +0 -0
/package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
/package/skill/{Workflows → workflows}/Replay.md +0 -0
/package/skill/{Workflows → workflows}/Rollback.md +0 -0
/package/skill/{Workflows → workflows}/Telemetry.md +0 -0
/package/skill/{Workflows → workflows}/Uninstall.md +0 -0

package/cli/selftune/evolution/validate-proposal.ts CHANGED Viewed

@@ -6,17 +6,14 @@
  * to determine whether the proposal is an improvement.
  */
-import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
+import type {
+  EvalEntry,
+  EvolutionProposal,
+  InvocationTypeScores,
+  ValidationMode,
+} from "../types.js";
 import { callLlm, type EffortLevel } from "../utils/llm-call.js";
-import {
-  buildBatchTriggerCheckPrompt,
-  buildTriggerCheckPrompt,
-  parseBatchTriggerResponse,
-  parseTriggerResponse,
-} from "../utils/trigger-check.js";
-// Re-export so existing consumers don't break
-export { buildTriggerCheckPrompt, parseTriggerResponse };
+import { buildBatchTriggerCheckPrompt, parseBatchTriggerResponse } from "../utils/trigger-check.js";
 /** Number of eval queries to batch into a single LLM call.
  * Higher = fewer claude -p spawns = much faster (each spawn has ~30-60s overhead).
@@ -40,147 +37,11 @@ export interface ValidationResult {
   net_change: number; // after - before pass rate
   by_invocation_type?: InvocationTypeScores;
   per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
-  validation_mode?: "llm_judge";
+  validation_mode?: ValidationMode;
   validation_agent?: string;
-}
-// ---------------------------------------------------------------------------
-// Proposal validation
-// ---------------------------------------------------------------------------
-/** Validate a proposal sequentially (one LLM call per query). Kept for backward compat. */
-export async function validateProposalSequential(
-  proposal: EvolutionProposal,
-  evalSet: EvalEntry[],
-  agent: string,
-  modelFlag?: string,
-  effort?: EffortLevel,
-): Promise<ValidationResult> {
-  if (evalSet.length === 0) {
-    return {
-      proposal_id: proposal.proposal_id,
-      before_pass_rate: 0,
-      after_pass_rate: 0,
-      improved: false,
-      regressions: [],
-      new_passes: [],
-      net_change: 0,
-      validation_mode: "llm_judge",
-      validation_agent: agent,
-    };
-  }
-  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
-  const regressions: EvalEntry[] = [];
-  const newPasses: EvalEntry[] = [];
-  const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
-    [];
-  let beforePassed = 0;
-  let afterPassed = 0;
-  for (const entry of evalSet) {
-    // Check with original description
-    const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
-    const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag, effort);
-    const beforeTriggered = parseTriggerResponse(beforeRaw);
-    const beforePass =
-      (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
-    // Check with proposed description
-    const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
-    const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag, effort);
-    const afterTriggered = parseTriggerResponse(afterRaw);
-    const afterPass =
-      (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
-    if (beforePass) beforePassed++;
-    if (afterPass) afterPassed++;
-    // Regression: passed before, fails after
-    if (beforePass && !afterPass) {
-      regressions.push(entry);
-    }
-    // New pass: failed before, passes after
-    if (!beforePass && afterPass) {
-      newPasses.push(entry);
-    }
-    perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
-  }
-  const total = evalSet.length;
-  const beforePassRate = beforePassed / total;
-  const afterPassRate = afterPassed / total;
-  const netChange = afterPassRate - beforePassRate;
-  // A proposal is improved when ALL of:
-  //   - after_pass_rate > before_pass_rate
-  //   - regressions count < 5% of total eval entries
-  //   - Either net improvement >= 0.10 OR new_passes.length >= 2
-  const improved =
-    afterPassRate > beforePassRate &&
-    regressions.length < total * 0.05 &&
-    (netChange >= 0.1 || newPasses.length >= 2);
-  // Compute per-invocation-type scores (initialize all required keys)
-  const byInvocationType: Record<string, { passed: number; total: number }> = {
-    explicit: { passed: 0, total: 0 },
-    implicit: { passed: 0, total: 0 },
-    contextual: { passed: 0, total: 0 },
-    negative: { passed: 0, total: 0 },
-  };
-  for (const r of perEntryResults) {
-    const type = r.entry.invocation_type ?? "implicit";
-    if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
-    byInvocationType[type].total++;
-    if (r.after_pass) byInvocationType[type].passed++;
-  }
-  const invocationScores: InvocationTypeScores = {
-    explicit: {
-      ...byInvocationType.explicit,
-      pass_rate:
-        byInvocationType.explicit.total > 0
-          ? byInvocationType.explicit.passed / byInvocationType.explicit.total
-          : 0,
-    },
-    implicit: {
-      ...byInvocationType.implicit,
-      pass_rate:
-        byInvocationType.implicit.total > 0
-          ? byInvocationType.implicit.passed / byInvocationType.implicit.total
-          : 0,
-    },
-    contextual: {
-      ...byInvocationType.contextual,
-      pass_rate:
-        byInvocationType.contextual.total > 0
-          ? byInvocationType.contextual.passed / byInvocationType.contextual.total
-          : 0,
-    },
-    negative: {
-      ...byInvocationType.negative,
-      pass_rate:
-        byInvocationType.negative.total > 0
-          ? byInvocationType.negative.passed / byInvocationType.negative.total
-          : 0,
-    },
-  };
-  return {
-    proposal_id: proposal.proposal_id,
-    before_pass_rate: beforePassRate,
-    after_pass_rate: afterPassRate,
-    improved,
-    regressions,
-    new_passes: newPasses,
-    net_change: netChange,
-    by_invocation_type: invocationScores,
-    per_entry_results: perEntryResults,
-    validation_mode: "llm_judge",
-    validation_agent: agent,
-  };
+  validation_fixture_id?: string;
+  validation_fallback_reason?: string;
+  before_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
 }
 // ---------------------------------------------------------------------------

package/cli/selftune/evolution/validate-routing.ts CHANGED Viewed

@@ -3,6 +3,9 @@
  *
  * Validates a routing table evolution proposal by checking structural validity
  * and running trigger accuracy checks against an eval set.
+ *
+ * Delegates replay-based and judge-based validation to dedicated engines
+ * (engines/replay-engine.ts and engines/judge-engine.ts).
  */
 import type {
@@ -10,28 +13,12 @@ import type {
   BodyValidationResult,
   EvalEntry,
   RoutingReplayEntryResult,
-  RoutingReplayFixture,
+  ValidationGate,
   ValidationMode,
 } from "../types.js";
-import { callLlm } from "../utils/llm-call.js";
-import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
-import { runHostReplayFixture } from "./validate-host-replay.js";
-export interface RoutingReplayRunnerInput {
-  routing: string;
-  evalSet: EvalEntry[];
-  agent: string;
-  fixture: RoutingReplayFixture;
-}
-export type RoutingReplayRunner = (
-  input: RoutingReplayRunnerInput,
-) => Promise<RoutingReplayEntryResult[]>;
-export interface RoutingValidationOptions {
-  replayFixture?: RoutingReplayFixture;
-  replayRunner?: RoutingReplayRunner;
-}
+import { runJudgeValidation } from "./engines/judge-engine.js";
+import { type ReplayValidationOptions } from "./engines/replay-engine.js";
+import { runValidationContract, type ValidationStrategy } from "./validation-contract.js";
 export interface RoutingTriggerAccuracyResult {
   before_pass_rate: number;
@@ -40,7 +27,14 @@ export interface RoutingTriggerAccuracyResult {
   validation_mode: ValidationMode;
   validation_agent: string;
   validation_fixture_id?: string;
+  validation_fallback_reason?: string;
   per_entry_results?: RoutingReplayEntryResult[];
+  before_entry_results?: RoutingReplayEntryResult[];
+}
+export interface RoutingValidationOptions extends ReplayValidationOptions {
+  mode?: ValidationStrategy;
+  onReplayFallback?: (reason?: string) => void;
 }
 // ---------------------------------------------------------------------------
@@ -104,6 +98,9 @@ export function validateRoutingStructure(routing: string): { valid: boolean; rea
 /**
  * Run before/after trigger checks on the eval set using the routing content.
  * Returns pass rates for comparison.
+ *
+ * Prefers host/runtime replay when a runtime runner is available,
+ * falls back to LLM judge otherwise.
  */
 export async function validateRoutingTriggerAccuracy(
   originalRouting: string,
@@ -123,94 +120,38 @@ export async function validateRoutingTriggerAccuracy(
     };
   }
-  if (options.replayFixture && options.replayRunner) {
-    const beforeResults = await options.replayRunner({
-      routing: originalRouting,
-      evalSet,
-      agent,
-      fixture: options.replayFixture,
-    });
-    const afterResults = await options.replayRunner({
-      routing: proposedRouting,
-      evalSet,
-      agent,
-      fixture: options.replayFixture,
-    });
-    const beforePassed = beforeResults.filter((result) => result.passed).length;
-    const afterPassed = afterResults.filter((result) => result.passed).length;
-    const total = evalSet.length;
-    return {
-      before_pass_rate: beforePassed / total,
-      after_pass_rate: afterPassed / total,
-      improved: afterPassed > beforePassed,
-      validation_mode: "host_replay",
-      validation_agent: agent,
-      validation_fixture_id: options.replayFixture.fixture_id,
-      per_entry_results: afterResults,
-    };
-  }
-  if (options.replayFixture) {
-    const beforeResults = runHostReplayFixture({
-      routing: originalRouting,
-      evalSet,
-      fixture: options.replayFixture,
-    });
-    const afterResults = runHostReplayFixture({
-      routing: proposedRouting,
-      evalSet,
-      fixture: options.replayFixture,
-    });
-    const beforePassed = beforeResults.filter((result) => result.passed).length;
-    const afterPassed = afterResults.filter((result) => result.passed).length;
-    const total = evalSet.length;
-    return {
-      before_pass_rate: beforePassed / total,
-      after_pass_rate: afterPassed / total,
-      improved: afterPassed > beforePassed,
-      validation_mode: "host_replay",
-      validation_agent: agent,
-      validation_fixture_id: options.replayFixture.fixture_id,
-      per_entry_results: afterResults,
-    };
-  }
-  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
-  let beforePassed = 0;
-  let afterPassed = 0;
-  for (const entry of evalSet) {
-    // Check with original routing
-    const beforePrompt = buildTriggerCheckPrompt(originalRouting, entry.query);
-    const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
-    const beforeTriggered = parseTriggerResponse(beforeRaw);
-    const beforePass =
-      (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
-    // Check with proposed routing
-    const afterPrompt = buildTriggerCheckPrompt(proposedRouting, entry.query);
-    const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
-    const afterTriggered = parseTriggerResponse(afterRaw);
-    const afterPass =
-      (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
-    if (beforePass) beforePassed++;
-    if (afterPass) afterPassed++;
-  }
-  const total = evalSet.length;
-  const beforePassRate = beforePassed / total;
-  const afterPassRate = afterPassed / total;
+  const { result, fallbackReason } = await runValidationContract<RoutingTriggerAccuracyResult>({
+    mode: options.mode ?? "auto",
+    originalContent: originalRouting,
+    proposedContent: proposedRouting,
+    evalSet,
+    agent,
+    replayOptions: options,
+    runJudge: async () => {
+      const judgeResult = await runJudgeValidation(
+        originalRouting,
+        proposedRouting,
+        evalSet,
+        agent,
+        modelFlag,
+      );
+      return {
+        result: {
+          before_pass_rate: judgeResult.before_pass_rate,
+          after_pass_rate: judgeResult.after_pass_rate,
+          improved: judgeResult.improved,
+          validation_mode: judgeResult.validation_mode,
+          validation_agent: judgeResult.validation_agent,
+        },
+        modeUsed: judgeResult.validation_mode,
+      };
+    },
+    onReplayFallback: options.onReplayFallback,
+    adaptReplayResult: (replayResult) => replayResult,
+  });
-  return {
-    before_pass_rate: beforePassRate,
-    after_pass_rate: afterPassRate,
-    improved: afterPassRate > beforePassRate,
-    validation_mode: "llm_judge",
-    validation_agent: agent,
-  };
+  return fallbackReason ? { ...result, validation_fallback_reason: fallbackReason } : result;
 }
 // ---------------------------------------------------------------------------
@@ -225,7 +166,7 @@ export async function validateRoutingProposal(
   modelFlag?: string,
   options: RoutingValidationOptions = {},
 ): Promise<BodyValidationResult> {
-  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
+  const gateResults: Array<{ gate: ValidationGate; passed: boolean; reason: string }> = [];
   // Gate 1: Structural validation
   const structural = validateRoutingStructure(proposal.proposed_body);
@@ -277,8 +218,10 @@ export async function validateRoutingProposal(
     validation_mode: accuracy.validation_mode,
     validation_agent: accuracy.validation_agent,
     validation_fixture_id: accuracy.validation_fixture_id,
+    validation_fallback_reason: accuracy.validation_fallback_reason,
     before_pass_rate: accuracy.before_pass_rate,
     after_pass_rate: accuracy.after_pass_rate,
     per_entry_results: accuracy.per_entry_results,
+    before_entry_results: accuracy.before_entry_results,
   };
 }

package/cli/selftune/evolution/validation-contract.ts ADDED Viewed

@@ -0,0 +1,91 @@
+import type { EvalEntry, ValidationMode } from "../types.js";
+import { CLIError } from "../utils/cli-error.js";
+import {
+  runReplayValidation,
+  type ReplayValidationOptions,
+  type ReplayValidationResult,
+} from "./engines/replay-engine.js";
+export type ValidationStrategy = "auto" | "replay" | "judge";
+export const DEFAULT_VALIDATION_STRATEGY: ValidationStrategy = "auto";
+export interface ValidationExecutionResult<TResult> {
+  result: TResult;
+  modeUsed: ValidationMode;
+  fallbackReason?: string;
+}
+export interface ValidationContractOptions<TResult> {
+  mode?: ValidationStrategy;
+  originalContent: string;
+  proposedContent: string;
+  evalSet: EvalEntry[];
+  agent: string;
+  replayOptions?: ReplayValidationOptions;
+  runJudge: () => Promise<ValidationExecutionResult<TResult>>;
+  adaptReplayResult: (replayResult: ReplayValidationResult) => TResult;
+  onReplayFallback?: (reason?: string) => void;
+}
+export function hasReplayValidationPath(
+  replayOptions?: ReplayValidationOptions,
+): replayOptions is ReplayValidationOptions {
+  return Boolean(replayOptions?.replayFixture || replayOptions?.replayRunner);
+}
+export function createReplayUnavailableError(reason?: string): CLIError {
+  const message = reason
+    ? `Replay validation requested but real host/runtime replay is unavailable: ${reason}`
+    : "Replay validation requested but real host/runtime replay is unavailable.";
+  return new CLIError(
+    message,
+    "REPLAY_UNAVAILABLE",
+    "Use --validation-mode auto to allow LLM judge fallback, or run selftune on a host/agent with runtime replay support for this skill.",
+  );
+}
+export async function runValidationContract<TResult>(
+  options: ValidationContractOptions<TResult>,
+): Promise<ValidationExecutionResult<TResult>> {
+  const mode = options.mode ?? DEFAULT_VALIDATION_STRATEGY;
+  if (mode === "judge") {
+    return options.runJudge();
+  }
+  if (hasReplayValidationPath(options.replayOptions)) {
+    const replayAttempt = await runReplayValidation(
+      options.originalContent,
+      options.proposedContent,
+      options.evalSet,
+      options.agent,
+      options.replayOptions,
+    );
+    if (replayAttempt.result) {
+      return {
+        result: options.adaptReplayResult(replayAttempt.result),
+        modeUsed: replayAttempt.result.validation_mode,
+      };
+    }
+    if (mode === "replay") {
+      throw createReplayUnavailableError(replayAttempt.fallbackReason);
+    }
+    options.onReplayFallback?.(replayAttempt.fallbackReason);
+    const judgeResult = await options.runJudge();
+    return {
+      ...judgeResult,
+      fallbackReason: replayAttempt.fallbackReason,
+    };
+  }
+  if (mode === "replay") {
+    throw createReplayUnavailableError();
+  }
+  options.onReplayFallback?.();
+  return options.runJudge();
+}

package/cli/selftune/grading/auto-grade.ts CHANGED Viewed

@@ -13,12 +13,16 @@ import { mkdirSync, writeFileSync } from "node:fs";
 import { dirname } from "node:path";
 import { parseArgs } from "node:util";
-import { AGENT_CANDIDATES, TELEMETRY_LOG } from "../constants.js";
+import { TELEMETRY_LOG } from "../constants.js";
 import { getDb } from "../localdb/db.js";
 import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
 import type { GradingResult, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
 import { CLIError, handleCLIError } from "../utils/cli-error.js";
-import { detectAgent as _detectAgent } from "../utils/llm-call.js";
+import {
+  detectLlmAgent as _detectAgent,
+  isLlmBackedAgent,
+  LLM_BACKED_AGENT_CANDIDATES,
+} from "../utils/llm-call.js";
 import { readExcerpt } from "../utils/transcript.js";
 import {
   buildDefaultGradingOutputPath,
@@ -55,7 +59,7 @@ Options:
   --session-id        Grade a specific session (auto-detects most recent if omitted)
   --telemetry-log     Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
   --output            Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
-  --agent             Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
+  --agent             Agent CLI to use (${LLM_BACKED_AGENT_CANDIDATES.join(", ")})
   --show-transcript   Print transcript excerpt before grading
   -h, --help          Show this help message`);
     process.exit(0);
@@ -68,9 +72,9 @@ Options:
   // --- Determine agent ---
   let agent: string | null = null;
-  const validAgents = [...AGENT_CANDIDATES];
+  const validAgents = [...LLM_BACKED_AGENT_CANDIDATES];
   if (values.agent) {
-    if (!validAgents.includes(values.agent)) {
+    if (!isLlmBackedAgent(values.agent)) {
       throw new CLIError(
         `Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
         "INVALID_FLAG",
@@ -84,9 +88,9 @@ Options:
   if (!agent) {
     throw new CLIError(
-      `No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH`,
+      `No supported agent CLI (${LLM_BACKED_AGENT_CANDIDATES.join("/")}) found in PATH`,
       "AGENT_NOT_FOUND",
-      "Install one of the supported agent CLIs",
+      "Install Claude Code, Codex, OpenCode, or Pi",
     );
   }

package/cli/selftune/grading/grade-session.ts CHANGED Viewed

@@ -5,19 +5,14 @@
  * Rubric-based grader for Claude Code skill sessions.
  * Migrated from grade_session.py.
  *
- * Grades via an installed agent CLI selected from AGENT_CANDIDATES.
+ * Grades via an installed agent CLI selected from the LLM-backed agent set.
  */
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
 import { basename, dirname, join } from "node:path";
 import { parseArgs } from "node:util";
-import {
-  AGENT_CANDIDATES,
-  CLAUDE_CODE_PROJECTS_DIR,
-  SELFTUNE_CONFIG_DIR,
-  TELEMETRY_LOG,
-} from "../constants.js";
+import { CLAUDE_CODE_PROJECTS_DIR, SELFTUNE_CONFIG_DIR, TELEMETRY_LOG } from "../constants.js";
 import { getDb } from "../localdb/db.js";
 import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
 import type {
@@ -31,7 +26,9 @@ import type {
 } from "../types.js";
 import { CLIError, handleCLIError } from "../utils/cli-error.js";
 import {
-  detectAgent as _detectAgent,
+  detectLlmAgent as _detectAgent,
+  isLlmBackedAgent,
+  LLM_BACKED_AGENT_CANDIDATES,
   stripMarkdownFences as _stripMarkdownFences,
   callViaAgent,
 } from "../utils/llm-call.js";
@@ -42,9 +39,6 @@ import {
 } from "../utils/transcript.js";
 import { type PreGateContext, runPreGates } from "./pre-gates.js";
-// Re-export for backward compatibility
-export { detectAgent, stripMarkdownFences } from "../utils/llm-call.js";
 // ---------------------------------------------------------------------------
 // Constants
 // ---------------------------------------------------------------------------
@@ -756,7 +750,7 @@ Options:
   --transcript        Path to transcript file
   --telemetry-log     Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
   --output            Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
-  --agent             Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
+  --agent             Agent CLI to use (${LLM_BACKED_AGENT_CANDIDATES.join(", ")})
   --show-transcript   Print transcript excerpt before grading
   -h, --help          Show this help message`);
     process.exit(0);
@@ -769,9 +763,9 @@ Options:
   // --- Determine agent ---
   let agent: string | null = null;
-  const validAgents = [...AGENT_CANDIDATES];
+  const validAgents = [...LLM_BACKED_AGENT_CANDIDATES];
   if (values.agent) {
-    if (!validAgents.includes(values.agent)) {
+    if (!isLlmBackedAgent(values.agent)) {
       throw new CLIError(
         `Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
         "INVALID_FLAG",
@@ -785,9 +779,9 @@ Options:
   if (!agent) {
     throw new CLIError(
-      `No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH`,
+      `No supported agent CLI (${LLM_BACKED_AGENT_CANDIDATES.join("/")}) found in PATH`,
       "AGENT_NOT_FOUND",
-      "Install claude, codex, or opencode CLI, then retry",
+      "Install Claude Code, Codex, OpenCode, or Pi, then retry",
     );
   }

package/cli/selftune/hooks/skill-eval.ts CHANGED Viewed

@@ -25,7 +25,7 @@ import {
   getLatestPromptIdentity,
 } from "../normalization.js";
 import type { PostToolUsePayload, SkillUsageRecord } from "../types.js";
-import { classifySkillPath } from "../utils/skill-discovery.js";
+import { classifySkillPath, isTestFixturePath } from "../utils/skill-discovery.js";
 import { getLastUserMessage } from "../utils/transcript.js";
 /**
@@ -122,6 +122,7 @@ export async function processToolUse(
   const skillName = extractSkillName(filePath);
   if (skillName === null) return null;
+  if (isTestFixturePath(filePath)) return null;
   const transcriptPath = payload.transcript_path ?? "";
   const sessionId = payload.session_id ?? "unknown";

package/cli/selftune/hooks-shared/types.ts CHANGED Viewed

@@ -83,6 +83,7 @@ export const PLATFORM_EVENT_MAP: Record<HookPlatform, Partial<Record<HookEventTy
     session_end: "TaskComplete",
   },
   pi: {
+    prompt_submit: "message",
     pre_tool_use: "tool_call",
     post_tool_use: "tool_result",
     session_end: "session_shutdown",