selftune 0.2.23 → 0.2.25

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (219)
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +93 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dhgv5BQO.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  12. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  13. package/cli/selftune/auto-update.ts +200 -8
  14. package/cli/selftune/canonical-export.ts +55 -25
  15. package/cli/selftune/command-surface.ts +397 -0
  16. package/cli/selftune/contribute/contribute.ts +64 -13
  17. package/cli/selftune/contribution-config.ts +57 -3
  18. package/cli/selftune/contribution-preferences.ts +117 -0
  19. package/cli/selftune/contribution-signals.ts +8 -4
  20. package/cli/selftune/contribution-staging.ts +13 -2
  21. package/cli/selftune/contributions.ts +55 -121
  22. package/cli/selftune/creator-contributions.ts +29 -10
  23. package/cli/selftune/cron/setup.ts +7 -3
  24. package/cli/selftune/dashboard-contract.ts +73 -0
  25. package/cli/selftune/dashboard-server.ts +168 -17
  26. package/cli/selftune/dashboard.ts +350 -17
  27. package/cli/selftune/eval/baseline.ts +21 -5
  28. package/cli/selftune/eval/execution-eval.ts +170 -0
  29. package/cli/selftune/eval/family-overlap.ts +2 -2
  30. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  31. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  32. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  33. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  34. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  35. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  36. package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
  37. package/cli/selftune/evolution/evolve-body.ts +100 -39
  38. package/cli/selftune/evolution/evolve.ts +244 -52
  39. package/cli/selftune/evolution/rollback.ts +0 -1
  40. package/cli/selftune/evolution/validate-body.ts +68 -42
  41. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  42. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  43. package/cli/selftune/evolution/validate-routing.ts +43 -41
  44. package/cli/selftune/evolution/validation-contract.ts +91 -0
  45. package/cli/selftune/grading/auto-grade.ts +11 -7
  46. package/cli/selftune/grading/grade-session.ts +10 -16
  47. package/cli/selftune/index.ts +35 -10
  48. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  49. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  50. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  51. package/cli/selftune/ingestors/pi-ingest.ts +3 -2
  52. package/cli/selftune/init.ts +27 -3
  53. package/cli/selftune/localdb/direct-write.ts +35 -1
  54. package/cli/selftune/localdb/queries/cron.ts +34 -0
  55. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  56. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  57. package/cli/selftune/localdb/queries/execution.ts +133 -0
  58. package/cli/selftune/localdb/queries/json.ts +18 -0
  59. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  60. package/cli/selftune/localdb/queries/raw.ts +95 -0
  61. package/cli/selftune/localdb/queries/staging.ts +270 -0
  62. package/cli/selftune/localdb/queries/trust.ts +392 -0
  63. package/cli/selftune/localdb/queries.ts +60 -2288
  64. package/cli/selftune/localdb/schema.ts +21 -0
  65. package/cli/selftune/monitoring/watch.ts +96 -29
  66. package/cli/selftune/normalization.ts +3 -0
  67. package/cli/selftune/observability.ts +4 -2
  68. package/cli/selftune/orchestrate/cli.ts +161 -0
  69. package/cli/selftune/orchestrate/execute.ts +295 -0
  70. package/cli/selftune/orchestrate/finalize.ts +157 -0
  71. package/cli/selftune/orchestrate/locks.ts +40 -0
  72. package/cli/selftune/orchestrate/plan.ts +131 -0
  73. package/cli/selftune/orchestrate/post-run.ts +59 -0
  74. package/cli/selftune/orchestrate/prepare.ts +334 -0
  75. package/cli/selftune/orchestrate/report.ts +182 -0
  76. package/cli/selftune/orchestrate/runtime.ts +120 -0
  77. package/cli/selftune/orchestrate/signals.ts +48 -0
  78. package/cli/selftune/orchestrate.ts +150 -1173
  79. package/cli/selftune/repair/skill-usage.ts +5 -2
  80. package/cli/selftune/routes/overview.ts +5 -2
  81. package/cli/selftune/routes/skill-report.ts +15 -2
  82. package/cli/selftune/schedule.ts +5 -5
  83. package/cli/selftune/status.ts +39 -2
  84. package/cli/selftune/testing-readiness.ts +597 -0
  85. package/cli/selftune/types.ts +44 -4
  86. package/cli/selftune/uninstall.ts +2 -1
  87. package/cli/selftune/utils/canonical-log.ts +1 -9
  88. package/cli/selftune/utils/cli-error.ts +9 -0
  89. package/cli/selftune/utils/llm-call.ts +126 -6
  90. package/cli/selftune/utils/skill-discovery.ts +2 -0
  91. package/cli/selftune/workflows/proposals.ts +184 -0
  92. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  93. package/cli/selftune/workflows/workflows.ts +100 -26
  94. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  95. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  96. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  97. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  98. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
  99. package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
  100. package/package.json +25 -9
  101. package/packages/dashboard-core/AGENTS.md +18 -0
  102. package/packages/dashboard-core/README.md +30 -0
  103. package/packages/dashboard-core/index.ts +3 -0
  104. package/packages/dashboard-core/package.json +39 -0
  105. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  106. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  107. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  108. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  109. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  110. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  111. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  112. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  113. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  114. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  115. package/packages/dashboard-core/src/gates/index.ts +3 -0
  116. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  117. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  118. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  119. package/packages/dashboard-core/src/host/index.ts +3 -0
  120. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  121. package/packages/dashboard-core/src/models/index.ts +4 -0
  122. package/packages/dashboard-core/src/models/overview.ts +98 -0
  123. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  124. package/packages/dashboard-core/src/models/skills.ts +34 -0
  125. package/packages/dashboard-core/src/routes/index.ts +2 -0
  126. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  127. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  128. package/packages/dashboard-core/src/routes/types.ts +39 -0
  129. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  130. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  131. package/packages/dashboard-core/src/screens/index.ts +37 -0
  132. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  133. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  134. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  135. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  136. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  137. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  138. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  139. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  140. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  141. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  142. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  143. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  144. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  145. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  146. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  147. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  148. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  149. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  150. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  151. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  152. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  153. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  154. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  155. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  156. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  157. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  158. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  159. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  160. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  161. package/packages/telemetry-contract/src/schemas.ts +41 -1
  162. package/packages/telemetry-contract/src/types.ts +103 -2
  163. package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
  164. package/packages/ui/src/components/OverviewPanels.tsx +67 -26
  165. package/packages/ui/src/primitives/tabs.tsx +7 -6
  166. package/packages/ui/src/types.ts +10 -0
  167. package/skill/SKILL.md +130 -332
  168. package/skill/agents/diagnosis-analyst.md +3 -3
  169. package/skill/agents/evolution-reviewer.md +3 -3
  170. package/skill/agents/integration-guide.md +3 -3
  171. package/skill/agents/pattern-analyst.md +2 -2
  172. package/skill/references/cli-quick-reference.md +89 -0
  173. package/skill/references/creator-playbook.md +131 -0
  174. package/skill/references/examples.md +48 -0
  175. package/skill/references/troubleshooting.md +47 -0
  176. package/skill/references/version-history.md +1 -1
  177. package/skill/selftune.contribute.json +11 -0
  178. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  179. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  180. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  181. package/skill/workflows/CreateTestDeploy.md +170 -0
  182. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  183. package/skill/{Workflows → workflows}/Cron.md +1 -1
  184. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  185. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  186. package/skill/{Workflows → workflows}/Evals.md +67 -2
  187. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  188. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  189. package/skill/{Workflows → workflows}/Grade.md +1 -1
  190. package/skill/{Workflows → workflows}/Initialize.md +8 -4
  191. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  192. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  193. package/skill/workflows/SignalsDashboard.md +87 -0
  194. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  195. package/skill/{Workflows → workflows}/Watch.md +42 -2
  196. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  197. package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
  198. package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
  199. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  200. package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
  201. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  202. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  203. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  204. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  205. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  206. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  207. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  208. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  209. /package/skill/{Workflows → workflows}/Ingest.md +0 -0
  210. /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
  211. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  212. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  213. /package/skill/{Workflows → workflows}/Registry.md +0 -0
  214. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  215. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  216. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  217. /package/skill/{Workflows → workflows}/Sync.md +0 -0
  218. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  219. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -9,6 +9,7 @@
9
9
  import { copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs";
10
10
  import { parseArgs } from "node:util";
11
11
 
12
+ import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
12
13
  import { QUERY_LOG, SKILL_LOG } from "../constants.js";
13
14
  import type { BaselineMeasurement } from "../eval/baseline.js";
14
15
  import { measureBaseline } from "../eval/baseline.js";
@@ -43,6 +44,11 @@ import { createEvolveTUI } from "../utils/tui.js";
43
44
  import { appendAuditEntry } from "./audit.js";
44
45
  import { checkConstitution } from "./constitutional.js";
45
46
  import { scoreDescription } from "./description-quality.js";
47
+ import {
48
+ DEFAULT_VALIDATION_STRATEGY,
49
+ runValidationContract,
50
+ type ValidationStrategy,
51
+ } from "./validation-contract.js";
46
52
  import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
47
53
  import { extractFailurePatterns } from "./extract-patterns.js";
48
54
  import {
@@ -54,6 +60,8 @@ import {
54
60
  import { generateMultipleProposals, generateProposal } from "./propose-description.js";
55
61
  import { evaluateStoppingCriteria } from "./stopping-criteria.js";
56
62
  import { buildUnblockSuggestions } from "./unblock-suggestions.js";
63
+ import type { ReplayValidationOptions, ReplayValidationResult } from "./engines/replay-engine.js";
64
+ import { buildRuntimeReplayValidationOptions } from "./validate-host-replay.js";
57
65
  import type { ValidationResult } from "./validate-proposal.js";
58
66
  import {
59
67
  TRIGGER_CHECK_BATCH_SIZE,
@@ -87,6 +95,10 @@ export interface EvolveOptions {
87
95
  adaptiveGate?: boolean;
88
96
  syncFirst?: boolean;
89
97
  syncForce?: boolean;
98
+ /** Validation mode for description evolution: auto (default), replay, or judge. */
99
+ validationMode?: ValidationStrategy;
100
+ /** Replay engine options (fixture, runner) — passed through to replay validation. */
101
+ replayOptions?: ReplayValidationOptions;
90
102
  }
91
103
 
92
104
  export interface EvolveResult {
@@ -257,6 +269,122 @@ function resolveGateDecision(
257
269
  };
258
270
  }
259
271
 
272
+ // ---------------------------------------------------------------------------
273
+ // Validation mode router
274
+ // ---------------------------------------------------------------------------
275
+
276
+ /**
277
+ * Route description validation to the correct engine based on the
278
+ * --validation-mode flag.
279
+ *
280
+ * - "judge" → LLM judge only (legacy path via validateProposal)
281
+ * - "replay" → Replay engine only; throws if no fixture/runner available
282
+ * - "auto" → Try replay first, fall back to judge if unavailable
283
+ *
284
+ * Returns a ValidationResult and the actual mode used.
285
+ */
286
+ export async function validateWithMode(
287
+ mode: ValidationStrategy,
288
+ proposal: EvolutionProposal,
289
+ evalSet: EvalEntry[],
290
+ agent: string,
291
+ replayOptions: ReplayValidationOptions | undefined,
292
+ validateFn: typeof validateProposal,
293
+ modelFlag?: string,
294
+ ): Promise<{
295
+ result: ValidationResult;
296
+ modeUsed: ValidationResult["validation_mode"] extends infer T ? Exclude<T, undefined> : never;
297
+ }> {
298
+ return runValidationContract({
299
+ mode,
300
+ originalContent: proposal.original_description,
301
+ proposedContent: proposal.proposed_description,
302
+ evalSet,
303
+ agent,
304
+ replayOptions,
305
+ runJudge: async () => {
306
+ const result = await validateFn(proposal, evalSet, agent, modelFlag);
307
+ return { result, modeUsed: result.validation_mode ?? "llm_judge" };
308
+ },
309
+ adaptReplayResult: (replayResult) =>
310
+ adaptReplayResultToValidationResult(proposal, replayResult, evalSet),
311
+ onReplayFallback: (reason) => {
312
+ if (reason) {
313
+ console.error(
314
+ `[evolve] Replay not available (${reason}), falling back to LLM judge validation.`,
315
+ );
316
+ return;
317
+ }
318
+ console.error("[evolve] Replay not available, falling back to LLM judge validation.");
319
+ },
320
+ }).then(({ result, modeUsed, fallbackReason }) => ({
321
+ result: fallbackReason ? { ...result, validation_fallback_reason: fallbackReason } : result,
322
+ modeUsed,
323
+ }));
324
+ }
325
+
326
+ function adaptReplayResultToValidationResult(
327
+ proposal: EvolutionProposal,
328
+ replayResult: ReplayValidationResult,
329
+ evalSet: EvalEntry[],
330
+ ): ValidationResult {
331
+ const evalEntryByQuery = new Map<string, EvalEntry>();
332
+ for (const entry of evalSet) {
333
+ evalEntryByQuery.set(entry.query, entry);
334
+ }
335
+
336
+ // Build lookups from before/after replay results keyed by query.
337
+ const beforeByQuery = new Map<string, boolean>();
338
+ for (const r of replayResult.before_entry_results ?? []) {
339
+ beforeByQuery.set(r.query, r.passed);
340
+ }
341
+ const afterByQuery = new Map<string, boolean>();
342
+ for (const r of replayResult.per_entry_results ?? []) {
343
+ afterByQuery.set(r.query, r.passed);
344
+ }
345
+
346
+ const entryForReplayResult = (result: { query: string; should_trigger: boolean }): EvalEntry => ({
347
+ ...(evalEntryByQuery.get(result.query) ?? {
348
+ query: result.query,
349
+ should_trigger: result.should_trigger,
350
+ }),
351
+ });
352
+
353
+ // Merge before + after into unified per_entry_results with both fields populated
354
+ const regressions: EvalEntry[] = [];
355
+ const newPasses: EvalEntry[] = [];
356
+ const perEntryResults = replayResult.per_entry_results?.map((result) => {
357
+ const beforePass = beforeByQuery.get(result.query) ?? false;
358
+ const afterPass = result.passed;
359
+ const entry = entryForReplayResult(result);
360
+
361
+ if (beforePass && !afterPass) regressions.push(entry);
362
+ if (!beforePass && afterPass) newPasses.push(entry);
363
+
364
+ return { entry, before_pass: beforePass, after_pass: afterPass };
365
+ });
366
+ const beforeEntryResults = replayResult.before_entry_results?.map((result) => ({
367
+ entry: entryForReplayResult(result),
368
+ before_pass: result.passed,
369
+ after_pass: afterByQuery.get(result.query) ?? false,
370
+ }));
371
+
372
+ return {
373
+ proposal_id: proposal.proposal_id,
374
+ before_pass_rate: replayResult.before_pass_rate,
375
+ after_pass_rate: replayResult.after_pass_rate,
376
+ improved: replayResult.improved,
377
+ regressions,
378
+ new_passes: newPasses,
379
+ net_change: replayResult.after_pass_rate - replayResult.before_pass_rate,
380
+ validation_mode: replayResult.validation_mode,
381
+ validation_agent: replayResult.validation_agent,
382
+ validation_fixture_id: replayResult.validation_fixture_id,
383
+ per_entry_results: perEntryResults,
384
+ before_entry_results: beforeEntryResults,
385
+ };
386
+ }
387
+
260
388
  // ---------------------------------------------------------------------------
261
389
  // Main orchestrator
262
390
  // ---------------------------------------------------------------------------
@@ -267,6 +395,7 @@ export async function evolve(
267
395
  ): Promise<EvolveResult> {
268
396
  const { skillName, skillPath, evalSetPath, agent, dryRun, confidenceThreshold, maxIterations } =
269
397
  options;
398
+ const effectiveValidationMode = options.validationMode ?? DEFAULT_VALIDATION_STRATEGY;
270
399
 
271
400
  // Apply cheap-loop defaults: cheap models for proposal/validation, expensive for gate
272
401
  if (options.cheapLoop) {
@@ -647,23 +776,33 @@ export async function evolve(
647
776
  continue;
648
777
  }
649
778
 
650
- const validation = await _validateProposal(
779
+ const { result: validation, modeUsed: paretoModeUsed } = await validateWithMode(
780
+ effectiveValidationMode,
651
781
  proposal,
652
782
  evalSet,
653
783
  agent,
784
+ options.replayOptions,
785
+ _validateProposal,
654
786
  options.validationModel,
655
787
  );
656
- llmCallCount += countValidationLlmCalls(evalSet.length);
788
+ if (paretoModeUsed === "llm_judge") {
789
+ llmCallCount += countValidationLlmCalls(evalSet.length);
790
+ }
657
791
  const evidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
658
792
  recordAudit(
659
793
  proposal.proposal_id,
660
794
  "validated",
661
- `Pareto validation: improved=${validation.improved}`,
795
+ `Pareto validation (${paretoModeUsed}): improved=${validation.improved}${
796
+ validation.validation_fallback_reason
797
+ ? ` (replay fallback: ${validation.validation_fallback_reason})`
798
+ : ""
799
+ }`,
662
800
  undefined,
663
801
  undefined,
664
802
  {
665
- validation_mode: validation.validation_mode,
803
+ validation_mode: paretoModeUsed,
666
804
  validation_agent: validation.validation_agent,
805
+ validation_fixture_id: validation.validation_fixture_id,
667
806
  validation_evidence_ref: evidenceRef,
668
807
  },
669
808
  );
@@ -676,7 +815,11 @@ export async function evolve(
676
815
  stage: "validated",
677
816
  rationale: proposal.rationale,
678
817
  confidence: proposal.confidence,
679
- details: `Pareto validation: improved=${validation.improved}`,
818
+ details: `Pareto validation: improved=${validation.improved}${
819
+ validation.validation_fallback_reason
820
+ ? ` (replay fallback: ${validation.validation_fallback_reason})`
821
+ : ""
822
+ }`,
680
823
  validation: {
681
824
  improved: validation.improved,
682
825
  before_pass_rate: validation.before_pass_rate,
@@ -685,8 +828,11 @@ export async function evolve(
685
828
  regressions: validation.regressions,
686
829
  new_passes: validation.new_passes,
687
830
  per_entry_results: validation.per_entry_results,
831
+ before_entry_results: validation.before_entry_results,
688
832
  validation_mode: validation.validation_mode,
689
833
  validation_agent: validation.validation_agent,
834
+ validation_fixture_id: validation.validation_fixture_id,
835
+ validation_fallback_reason: validation.validation_fallback_reason,
690
836
  validation_evidence_ref: evidenceRef,
691
837
  },
692
838
  });
@@ -873,16 +1019,21 @@ export async function evolve(
873
1019
  // Step 10: Validate against eval set
874
1020
  const batchCount = Math.ceil(evalSet.length / TRIGGER_CHECK_BATCH_SIZE);
875
1021
  tui.step(
876
- `Validating ${evalSet.length} entries (${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
1022
+ `Validating ${evalSet.length} entries (mode=${effectiveValidationMode}, ${batchCount} batches, ${VALIDATION_RUNS}x majority-vote)...`,
877
1023
  );
878
- const validation = await _validateProposal(
1024
+ const { result: validation, modeUsed: retryModeUsed } = await validateWithMode(
1025
+ effectiveValidationMode,
879
1026
  proposal,
880
1027
  evalSet,
881
1028
  agent,
1029
+ options.replayOptions,
1030
+ _validateProposal,
882
1031
  options.validationModel,
883
1032
  );
884
1033
  lastValidation = validation;
885
- llmCallCount += countValidationLlmCalls(evalSet.length);
1034
+ if (retryModeUsed === "llm_judge") {
1035
+ llmCallCount += countValidationLlmCalls(evalSet.length);
1036
+ }
886
1037
  tui.done(
887
1038
  `Validation: ${(validation.before_pass_rate * 100).toFixed(1)}% \u2192 ${(validation.after_pass_rate * 100).toFixed(1)}% (improved: ${validation.improved})`,
888
1039
  );
@@ -898,12 +1049,17 @@ export async function evolve(
898
1049
  recordAudit(
899
1050
  proposal.proposal_id,
900
1051
  "validated",
901
- `Validation complete: improved=${validation.improved}`,
1052
+ `Validation complete (${retryModeUsed}): improved=${validation.improved}${
1053
+ validation.validation_fallback_reason
1054
+ ? ` (replay fallback: ${validation.validation_fallback_reason})`
1055
+ : ""
1056
+ }`,
902
1057
  evalSnapshot,
903
1058
  undefined,
904
1059
  {
905
- validation_mode: validation.validation_mode,
1060
+ validation_mode: retryModeUsed,
906
1061
  validation_agent: validation.validation_agent,
1062
+ validation_fixture_id: validation.validation_fixture_id,
907
1063
  validation_evidence_ref: validatedEvidenceRef,
908
1064
  },
909
1065
  );
@@ -916,7 +1072,11 @@ export async function evolve(
916
1072
  stage: "validated",
917
1073
  rationale: proposal.rationale,
918
1074
  confidence: proposal.confidence,
919
- details: `Validation complete: improved=${validation.improved}`,
1075
+ details: `Validation complete (${retryModeUsed}): improved=${validation.improved}${
1076
+ validation.validation_fallback_reason
1077
+ ? ` (replay fallback: ${validation.validation_fallback_reason})`
1078
+ : ""
1079
+ }`,
920
1080
  validation: {
921
1081
  improved: validation.improved,
922
1082
  before_pass_rate: validation.before_pass_rate,
@@ -925,8 +1085,11 @@ export async function evolve(
925
1085
  regressions: validation.regressions,
926
1086
  new_passes: validation.new_passes,
927
1087
  per_entry_results: validation.per_entry_results,
928
- validation_mode: validation.validation_mode,
1088
+ before_entry_results: validation.before_entry_results,
1089
+ validation_mode: retryModeUsed,
929
1090
  validation_agent: validation.validation_agent,
1091
+ validation_fixture_id: validation.validation_fixture_id,
1092
+ validation_fallback_reason: validation.validation_fallback_reason,
930
1093
  validation_evidence_ref: validatedEvidenceRef,
931
1094
  },
932
1095
  });
@@ -948,12 +1111,13 @@ export async function evolve(
948
1111
  recordAudit(
949
1112
  proposal.proposal_id,
950
1113
  "rejected",
951
- `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
1114
+ `Validation failed (${retryModeUsed}): net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
952
1115
  undefined,
953
1116
  undefined,
954
1117
  {
955
- validation_mode: validation.validation_mode,
1118
+ validation_mode: retryModeUsed,
956
1119
  validation_agent: validation.validation_agent,
1120
+ validation_fixture_id: validation.validation_fixture_id,
957
1121
  validation_evidence_ref: rejectedEvidenceRef,
958
1122
  },
959
1123
  );
@@ -966,7 +1130,7 @@ export async function evolve(
966
1130
  stage: "rejected",
967
1131
  rationale: proposal.rationale,
968
1132
  confidence: proposal.confidence,
969
- details: `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
1133
+ details: `Validation failed (${retryModeUsed}): net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
970
1134
  validation: {
971
1135
  improved: validation.improved,
972
1136
  before_pass_rate: validation.before_pass_rate,
@@ -975,8 +1139,10 @@ export async function evolve(
975
1139
  regressions: validation.regressions,
976
1140
  new_passes: validation.new_passes,
977
1141
  per_entry_results: validation.per_entry_results,
978
- validation_mode: validation.validation_mode,
1142
+ before_entry_results: validation.before_entry_results,
1143
+ validation_mode: retryModeUsed,
979
1144
  validation_agent: validation.validation_agent,
1145
+ validation_fixture_id: validation.validation_fixture_id,
980
1146
  validation_evidence_ref: rejectedEvidenceRef,
981
1147
  },
982
1148
  });
@@ -998,7 +1164,18 @@ export async function evolve(
998
1164
 
999
1165
  // Validation passed — check if converged or continue
1000
1166
  if (stopping.shouldStop && stopping.reason.includes("Converged")) {
1001
- recordAudit(proposal.proposal_id, "validated", `Stopping early: ${stopping.reason}`);
1167
+ recordAudit(
1168
+ proposal.proposal_id,
1169
+ "validated",
1170
+ `Stopping early: ${stopping.reason}`,
1171
+ undefined,
1172
+ undefined,
1173
+ {
1174
+ validation_mode: retryModeUsed,
1175
+ validation_agent: validation.validation_agent,
1176
+ validation_fixture_id: validation.validation_fixture_id,
1177
+ },
1178
+ );
1002
1179
  }
1003
1180
 
1004
1181
  // Validation passed - break out of retry loop
@@ -1133,6 +1310,11 @@ export async function evolve(
1133
1310
  regressions: gateValidation.regressions,
1134
1311
  new_passes: gateValidation.new_passes,
1135
1312
  per_entry_results: gateValidation.per_entry_results,
1313
+ before_entry_results: gateValidation.before_entry_results,
1314
+ validation_mode: gateValidation.validation_mode,
1315
+ validation_agent: gateValidation.validation_agent,
1316
+ validation_fixture_id: gateValidation.validation_fixture_id,
1317
+ validation_fallback_reason: gateValidation.validation_fallback_reason,
1136
1318
  },
1137
1319
  });
1138
1320
  finishTui();
@@ -1179,7 +1361,11 @@ export async function evolve(
1179
1361
  recordAudit(
1180
1362
  lastProposal.proposal_id,
1181
1363
  "deployed",
1182
- `Deployed proposal for ${skillName}`,
1364
+ `Deployed proposal for ${skillName}${
1365
+ lastValidation.validation_fallback_reason
1366
+ ? ` (replay fallback: ${lastValidation.validation_fallback_reason})`
1367
+ : ""
1368
+ }`,
1183
1369
  {
1184
1370
  total: evalSet.length,
1185
1371
  passed: Math.round(lastValidation.after_pass_rate * evalSet.length),
@@ -1190,6 +1376,7 @@ export async function evolve(
1190
1376
  {
1191
1377
  validation_mode: lastValidation.validation_mode,
1192
1378
  validation_agent: lastValidation.validation_agent,
1379
+ validation_fixture_id: lastValidation.validation_fixture_id,
1193
1380
  validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
1194
1381
  },
1195
1382
  );
@@ -1202,7 +1389,11 @@ export async function evolve(
1202
1389
  stage: "deployed",
1203
1390
  rationale: lastProposal.rationale,
1204
1391
  confidence: lastProposal.confidence,
1205
- details: `Deployed proposal for ${skillName}`,
1392
+ details: `Deployed proposal for ${skillName}${
1393
+ lastValidation.validation_fallback_reason
1394
+ ? ` (replay fallback: ${lastValidation.validation_fallback_reason})`
1395
+ : ""
1396
+ }`,
1206
1397
  validation: {
1207
1398
  improved: lastValidation.improved,
1208
1399
  before_pass_rate: lastValidation.before_pass_rate,
@@ -1211,8 +1402,11 @@ export async function evolve(
1211
1402
  regressions: lastValidation.regressions,
1212
1403
  new_passes: lastValidation.new_passes,
1213
1404
  per_entry_results: lastValidation.per_entry_results,
1405
+ before_entry_results: lastValidation.before_entry_results,
1214
1406
  validation_mode: lastValidation.validation_mode,
1215
1407
  validation_agent: lastValidation.validation_agent,
1408
+ validation_fixture_id: lastValidation.validation_fixture_id,
1409
+ validation_fallback_reason: lastValidation.validation_fallback_reason,
1216
1410
  validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
1217
1411
  },
1218
1412
  });
@@ -1221,7 +1415,7 @@ export async function evolve(
1221
1415
  // -----------------------------------------------------------------------
1222
1416
  // Step 15: Update evolution memory
1223
1417
  // -----------------------------------------------------------------------
1224
- const wasDeployed = lastProposal && lastValidation?.improved;
1418
+ const wasDeployed = Boolean(lastProposal && lastValidation?.improved);
1225
1419
  const evolveResult: EvolveResult = withStats({
1226
1420
  proposal: lastProposal,
1227
1421
  validation: lastValidation,
@@ -1287,6 +1481,7 @@ export async function cliMain(): Promise<void> {
1287
1481
  "gate-effort": { type: "string" },
1288
1482
  "proposal-model": { type: "string" },
1289
1483
  "adaptive-gate": { type: "boolean", default: false },
1484
+ "validation-mode": { type: "string", default: "auto" },
1290
1485
  "sync-first": { type: "boolean", default: false },
1291
1486
  "sync-force": { type: "boolean", default: false },
1292
1487
  verbose: { type: "boolean", default: false },
@@ -1296,34 +1491,7 @@ export async function cliMain(): Promise<void> {
1296
1491
  });
1297
1492
 
1298
1493
  if (values.help) {
1299
- console.log(`selftune evolve — Evolve a skill description via failure patterns
1300
-
1301
- Usage:
1302
- selftune evolve --skill <name> --skill-path <path> [options]
1303
-
1304
- Options:
1305
- --skill Skill name (required)
1306
- --skill-path Path to SKILL.md (required)
1307
- --eval-set Path to eval set JSON (optional, builds from logs if omitted)
1308
- --agent Agent CLI to use (claude, codex, opencode)
1309
- --dry-run Validate proposal without deploying
1310
- --confidence Confidence threshold 0.0-1.0 (default: 0.6)
1311
- --max-iterations Max retry iterations (default: 3)
1312
- --pareto Enable Pareto multi-candidate selection
1313
- --candidates Number of candidates to generate (default: 3, max: 5)
1314
- --token-efficiency Enable 5D Pareto with token efficiency scoring
1315
- --with-baseline Gate deployment on baseline lift > 0.05
1316
- --validation-model Model for trigger-check validation calls (default: haiku)
1317
- --cheap-loop Use cheap models for loop, expensive for gate (default: on)
1318
- --full-model Use same model for all stages (disables cheap-loop)
1319
- --gate-model Model for final gate validation (default: sonnet)
1320
- --gate-effort Thinking effort for final gate (low|medium|high|max)
1321
- --adaptive-gate Escalate risky gate checks to opus + high effort
1322
- --proposal-model Model for proposal generation LLM calls
1323
- --sync-first Refresh source-truth telemetry before building evals/failure patterns
1324
- --sync-force Force a full rescan during --sync-first
1325
- --verbose Output full EvolveResult JSON (default: compact summary)
1326
- --help Show this help message`);
1494
+ console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.evolve));
1327
1495
  process.exit(0);
1328
1496
  }
1329
1497
 
@@ -1334,6 +1502,16 @@ Options:
1334
1502
  "selftune evolve --skill <name> --skill-path <path>",
1335
1503
  );
1336
1504
  }
1505
+ if (
1506
+ values["validation-mode"] &&
1507
+ !["auto", "replay", "judge"].includes(values["validation-mode"])
1508
+ ) {
1509
+ throw new CLIError(
1510
+ `Invalid --validation-mode value: ${values["validation-mode"]}`,
1511
+ "INVALID_FLAG",
1512
+ "Use one of: auto, replay, judge",
1513
+ );
1514
+ }
1337
1515
  if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
1338
1516
  throw new CLIError(
1339
1517
  "--sync-force requires --sync-first",
@@ -1360,7 +1538,7 @@ Options:
1360
1538
  );
1361
1539
  }
1362
1540
 
1363
- const { detectAgent } = await import("../utils/llm-call.js");
1541
+ const { detectLlmAgent } = await import("../utils/llm-call.js");
1364
1542
  const requestedAgent = values.agent;
1365
1543
  if (requestedAgent && !Bun.which(requestedAgent)) {
1366
1544
  throw new CLIError(
@@ -1369,12 +1547,12 @@ Options:
1369
1547
  "Install it or omit --agent to use auto-detection.",
1370
1548
  );
1371
1549
  }
1372
- const agent = requestedAgent ?? detectAgent();
1550
+ const agent = requestedAgent ?? detectLlmAgent();
1373
1551
  if (!agent) {
1374
1552
  throw new CLIError(
1375
- "No agent CLI (claude/codex/opencode) found in PATH.",
1553
+ "No agent CLI (claude/codex/opencode/pi) found in PATH.",
1376
1554
  "AGENT_NOT_FOUND",
1377
- "Install Claude Code, Codex, or OpenCode.",
1555
+ "Install Claude Code, Codex, OpenCode, or Pi.",
1378
1556
  );
1379
1557
  }
1380
1558
 
@@ -1443,6 +1621,17 @@ Options:
1443
1621
  console.error(`[verbose] Gate effort: ${values["gate-effort"] ?? "(default)"}`);
1444
1622
  }
1445
1623
 
1624
+ // Build replay options automatically when a real runtime replay runner exists.
1625
+ let replayOptions: ReplayValidationOptions | undefined;
1626
+ if (values["validation-mode"] !== "judge") {
1627
+ replayOptions = buildRuntimeReplayValidationOptions({
1628
+ skillName: values.skill,
1629
+ skillPath: values["skill-path"],
1630
+ agent,
1631
+ contentTarget: "description",
1632
+ });
1633
+ }
1634
+
1446
1635
  const result = await evolve({
1447
1636
  skillName: values.skill,
1448
1637
  skillPath: values["skill-path"],
@@ -1465,6 +1654,9 @@ Options:
1465
1654
  gradingResults,
1466
1655
  syncFirst: values["sync-first"] ?? false,
1467
1656
  syncForce: values["sync-force"] ?? false,
1657
+ validationMode:
1658
+ (values["validation-mode"] as ValidationStrategy) ?? DEFAULT_VALIDATION_STRATEGY,
1659
+ replayOptions,
1468
1660
  });
1469
1661
 
1470
1662
  if (values.verbose) {
@@ -25,7 +25,6 @@ export interface RollbackOptions {
25
25
  skillName: string;
26
26
  skillPath: string;
27
27
  proposalId?: string; // rollback specific proposal, or last deployed
28
- logPath?: string; // deprecated — ignored, kept for backward compat
29
28
  }
30
29
 
31
30
  export interface RollbackResult {