selftune 0.2.23 → 0.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +93 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  12. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  13. package/cli/selftune/auto-update.ts +200 -8
  14. package/cli/selftune/canonical-export.ts +55 -25
  15. package/cli/selftune/command-surface.ts +397 -0
  16. package/cli/selftune/contribute/contribute.ts +64 -13
  17. package/cli/selftune/contribution-config.ts +57 -3
  18. package/cli/selftune/contribution-preferences.ts +117 -0
  19. package/cli/selftune/contribution-signals.ts +8 -4
  20. package/cli/selftune/contribution-staging.ts +13 -2
  21. package/cli/selftune/contributions.ts +55 -121
  22. package/cli/selftune/creator-contributions.ts +29 -10
  23. package/cli/selftune/cron/setup.ts +7 -3
  24. package/cli/selftune/dashboard-contract.ts +73 -0
  25. package/cli/selftune/dashboard-server.ts +168 -17
  26. package/cli/selftune/dashboard.ts +350 -17
  27. package/cli/selftune/eval/baseline.ts +21 -5
  28. package/cli/selftune/eval/execution-eval.ts +170 -0
  29. package/cli/selftune/eval/family-overlap.ts +2 -2
  30. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  31. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  32. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  33. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  34. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  35. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  36. package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
  37. package/cli/selftune/evolution/evolve-body.ts +100 -39
  38. package/cli/selftune/evolution/evolve.ts +244 -52
  39. package/cli/selftune/evolution/rollback.ts +0 -1
  40. package/cli/selftune/evolution/validate-body.ts +68 -42
  41. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  42. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  43. package/cli/selftune/evolution/validate-routing.ts +43 -41
  44. package/cli/selftune/evolution/validation-contract.ts +91 -0
  45. package/cli/selftune/grading/auto-grade.ts +11 -7
  46. package/cli/selftune/grading/grade-session.ts +10 -16
  47. package/cli/selftune/index.ts +35 -10
  48. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  49. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  50. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  51. package/cli/selftune/ingestors/pi-ingest.ts +3 -2
  52. package/cli/selftune/init.ts +27 -3
  53. package/cli/selftune/localdb/direct-write.ts +35 -1
  54. package/cli/selftune/localdb/queries/cron.ts +34 -0
  55. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  56. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  57. package/cli/selftune/localdb/queries/execution.ts +133 -0
  58. package/cli/selftune/localdb/queries/json.ts +18 -0
  59. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  60. package/cli/selftune/localdb/queries/raw.ts +95 -0
  61. package/cli/selftune/localdb/queries/staging.ts +270 -0
  62. package/cli/selftune/localdb/queries/trust.ts +392 -0
  63. package/cli/selftune/localdb/queries.ts +60 -2288
  64. package/cli/selftune/localdb/schema.ts +21 -0
  65. package/cli/selftune/monitoring/watch.ts +96 -29
  66. package/cli/selftune/normalization.ts +3 -0
  67. package/cli/selftune/observability.ts +4 -2
  68. package/cli/selftune/orchestrate/cli.ts +161 -0
  69. package/cli/selftune/orchestrate/execute.ts +295 -0
  70. package/cli/selftune/orchestrate/finalize.ts +157 -0
  71. package/cli/selftune/orchestrate/locks.ts +40 -0
  72. package/cli/selftune/orchestrate/plan.ts +131 -0
  73. package/cli/selftune/orchestrate/post-run.ts +59 -0
  74. package/cli/selftune/orchestrate/prepare.ts +334 -0
  75. package/cli/selftune/orchestrate/report.ts +182 -0
  76. package/cli/selftune/orchestrate/runtime.ts +120 -0
  77. package/cli/selftune/orchestrate/signals.ts +48 -0
  78. package/cli/selftune/orchestrate.ts +150 -1173
  79. package/cli/selftune/repair/skill-usage.ts +5 -2
  80. package/cli/selftune/routes/overview.ts +5 -2
  81. package/cli/selftune/routes/skill-report.ts +15 -2
  82. package/cli/selftune/schedule.ts +5 -5
  83. package/cli/selftune/status.ts +39 -2
  84. package/cli/selftune/testing-readiness.ts +597 -0
  85. package/cli/selftune/types.ts +44 -4
  86. package/cli/selftune/uninstall.ts +2 -1
  87. package/cli/selftune/utils/canonical-log.ts +1 -9
  88. package/cli/selftune/utils/cli-error.ts +9 -0
  89. package/cli/selftune/utils/llm-call.ts +126 -6
  90. package/cli/selftune/utils/skill-discovery.ts +2 -0
  91. package/cli/selftune/workflows/proposals.ts +184 -0
  92. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  93. package/cli/selftune/workflows/workflows.ts +100 -26
  94. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  95. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  96. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  97. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  98. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
  99. package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
  100. package/package.json +25 -9
  101. package/packages/dashboard-core/AGENTS.md +18 -0
  102. package/packages/dashboard-core/README.md +30 -0
  103. package/packages/dashboard-core/index.ts +3 -0
  104. package/packages/dashboard-core/package.json +39 -0
  105. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  106. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  107. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  108. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  109. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  110. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  111. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  112. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  113. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  114. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  115. package/packages/dashboard-core/src/gates/index.ts +3 -0
  116. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  117. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  118. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  119. package/packages/dashboard-core/src/host/index.ts +3 -0
  120. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  121. package/packages/dashboard-core/src/models/index.ts +4 -0
  122. package/packages/dashboard-core/src/models/overview.ts +98 -0
  123. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  124. package/packages/dashboard-core/src/models/skills.ts +34 -0
  125. package/packages/dashboard-core/src/routes/index.ts +2 -0
  126. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  127. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  128. package/packages/dashboard-core/src/routes/types.ts +39 -0
  129. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  130. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  131. package/packages/dashboard-core/src/screens/index.ts +37 -0
  132. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  133. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  134. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  135. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  136. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  137. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  138. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  139. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  140. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  141. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  142. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  143. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  144. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  145. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  146. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  147. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  148. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  149. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  150. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  151. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  152. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  153. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  154. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  155. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  156. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  157. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  158. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  159. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  160. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  161. package/packages/telemetry-contract/src/schemas.ts +41 -1
  162. package/packages/telemetry-contract/src/types.ts +103 -2
  163. package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
  164. package/packages/ui/src/components/OverviewPanels.tsx +67 -26
  165. package/packages/ui/src/primitives/tabs.tsx +7 -6
  166. package/packages/ui/src/types.ts +10 -0
  167. package/skill/SKILL.md +130 -332
  168. package/skill/agents/diagnosis-analyst.md +3 -3
  169. package/skill/agents/evolution-reviewer.md +3 -3
  170. package/skill/agents/integration-guide.md +3 -3
  171. package/skill/agents/pattern-analyst.md +2 -2
  172. package/skill/references/cli-quick-reference.md +89 -0
  173. package/skill/references/creator-playbook.md +131 -0
  174. package/skill/references/examples.md +48 -0
  175. package/skill/references/troubleshooting.md +47 -0
  176. package/skill/references/version-history.md +1 -1
  177. package/skill/selftune.contribute.json +11 -0
  178. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  179. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  180. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  181. package/skill/workflows/CreateTestDeploy.md +170 -0
  182. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  183. package/skill/{Workflows → workflows}/Cron.md +1 -1
  184. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  185. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  186. package/skill/{Workflows → workflows}/Evals.md +67 -2
  187. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  188. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  189. package/skill/{Workflows → workflows}/Grade.md +1 -1
  190. package/skill/{Workflows → workflows}/Initialize.md +8 -4
  191. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  192. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  193. package/skill/workflows/SignalsDashboard.md +87 -0
  194. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  195. package/skill/{Workflows → workflows}/Watch.md +42 -2
  196. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  197. package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
  198. package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
  199. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  200. package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
  201. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  202. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  203. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  204. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  205. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  206. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  207. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  208. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  209. /package/skill/{Workflows → workflows}/Ingest.md +0 -0
  210. /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
  211. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  212. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  213. /package/skill/{Workflows → workflows}/Registry.md +0 -0
  214. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  215. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  216. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  217. /package/skill/{Workflows → workflows}/Sync.md +0 -0
  218. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  219. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -231,6 +231,20 @@ CREATE TABLE IF NOT EXISTS grading_results (
231
231
  execution_metrics_json TEXT
232
232
  )`;
233
233
 
234
+ // -- Grading baselines table (pre/post deploy grade snapshots) ---------------
235
+
236
+ export const CREATE_GRADING_BASELINES = `
237
+ CREATE TABLE IF NOT EXISTS grading_baselines (
238
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
239
+ skill_name TEXT NOT NULL,
240
+ proposal_id TEXT,
241
+ measured_at TEXT NOT NULL,
242
+ pass_rate REAL NOT NULL,
243
+ mean_score REAL,
244
+ sample_size INTEGER NOT NULL,
245
+ grading_results_json TEXT
246
+ )`;
247
+
234
248
  // -- Improvement signal table (from signal_log.jsonl) ------------------------
235
249
 
236
250
  export const CREATE_IMPROVEMENT_SIGNALS = `
@@ -369,6 +383,11 @@ export const CREATE_INDEXES = [
369
383
  `CREATE INDEX IF NOT EXISTS idx_grading_skill ON grading_results(skill_name)`,
370
384
  `CREATE INDEX IF NOT EXISTS idx_grading_ts ON grading_results(graded_at)`,
371
385
  `CREATE UNIQUE INDEX IF NOT EXISTS idx_grading_dedup ON grading_results(session_id, skill_name, graded_at)`,
386
+ // -- Grading baseline indexes ------------------------------------------------
387
+ `CREATE INDEX IF NOT EXISTS idx_grading_bl_skill ON grading_baselines(skill_name)`,
388
+ `CREATE INDEX IF NOT EXISTS idx_grading_bl_proposal ON grading_baselines(proposal_id)`,
389
+ `CREATE INDEX IF NOT EXISTS idx_grading_bl_ts ON grading_baselines(measured_at)`,
390
+ `CREATE INDEX IF NOT EXISTS idx_grading_bl_skill_proposal ON grading_baselines(skill_name, proposal_id, measured_at)`,
372
391
  // -- Improvement signal indexes ---------------------------------------------
373
392
  `CREATE INDEX IF NOT EXISTS idx_signals_session ON improvement_signals(session_id)`,
374
393
  `CREATE INDEX IF NOT EXISTS idx_signals_consumed ON improvement_signals(consumed)`,
@@ -389,6 +408,7 @@ export const CREATE_INDEXES = [
389
408
  `CREATE INDEX IF NOT EXISTS idx_replay_entry_proposal ON replay_entry_results(proposal_id)`,
390
409
  `CREATE INDEX IF NOT EXISTS idx_replay_entry_skill ON replay_entry_results(skill_name)`,
391
410
  `CREATE INDEX IF NOT EXISTS idx_replay_entry_passed ON replay_entry_results(passed)`,
411
+ `CREATE INDEX IF NOT EXISTS idx_replay_entry_proposal_phase ON replay_entry_results(proposal_id, phase)`,
392
412
  // -- Commit tracking indexes ------------------------------------------------
393
413
  `CREATE INDEX IF NOT EXISTS idx_commit_sha ON commit_tracking(commit_sha)`,
394
414
  `CREATE INDEX IF NOT EXISTS idx_commit_session ON commit_tracking(session_id)`,
@@ -485,6 +505,7 @@ export const ALL_DDL = [
485
505
  CREATE_ORCHESTRATE_RUNS,
486
506
  CREATE_QUERIES,
487
507
  CREATE_GRADING_RESULTS,
508
+ CREATE_GRADING_BASELINES,
488
509
  CREATE_IMPROVEMENT_SIGNALS,
489
510
  CREATE_UPLOAD_QUEUE,
490
511
  CREATE_CREATOR_CONTRIBUTION_STAGING,
@@ -8,12 +8,15 @@
8
8
 
9
9
  import { parseArgs } from "node:util";
10
10
 
11
+ import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
11
12
  import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
12
13
  import { classifyInvocation } from "../eval/hooks-to-evals.js";
13
14
  import { getLastDeployedProposal } from "../evolution/audit.js";
14
15
  import { getDb } from "../localdb/db.js";
15
16
  import {
17
+ queryGradingBaseline,
16
18
  queryQueryLog,
19
+ queryRecentGradingResults,
17
20
  querySessionTelemetry,
18
21
  querySkillUsageRecords,
19
22
  } from "../localdb/queries.js";
@@ -42,6 +45,10 @@ export interface WatchOptions {
42
45
  windowSessions: number;
43
46
  regressionThreshold: number;
44
47
  autoRollback: boolean;
48
+ /** Grade regression threshold (default 0.15). */
49
+ gradeRegressionThreshold?: number;
50
+ /** Enable grade-based regression watch (default true). */
51
+ enableGradeWatch?: boolean;
45
52
  /** Injected log paths for testing (override defaults). */
46
53
  _telemetryLogPath?: string;
47
54
  _skillLogPath?: string;
@@ -65,6 +72,8 @@ export interface WatchResult {
65
72
  rolledBack: boolean;
66
73
  recommendation: string;
67
74
  sync_result?: SyncResult;
75
+ gradeAlert?: string | null;
76
+ gradeRegression?: { before: number; after: number; delta: number } | null;
68
77
  }
69
78
 
70
79
  // ---------------------------------------------------------------------------
@@ -73,6 +82,7 @@ export interface WatchResult {
73
82
 
74
83
  const DEFAULT_BASELINE_PASS_RATE = 0.5;
75
84
  const DEFAULT_REGRESSION_THRESHOLD = 0.1;
85
+ const DEFAULT_GRADE_REGRESSION_THRESHOLD = 0.15;
76
86
  export const MIN_MONITORING_SKILL_CHECKS = 3;
77
87
 
78
88
  // ---------------------------------------------------------------------------
@@ -190,6 +200,8 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
190
200
  skillPath,
191
201
  windowSessions = 20,
192
202
  regressionThreshold = DEFAULT_REGRESSION_THRESHOLD,
203
+ gradeRegressionThreshold = DEFAULT_GRADE_REGRESSION_THRESHOLD,
204
+ enableGradeWatch = true,
193
205
  autoRollback = false,
194
206
  _telemetryLogPath = TELEMETRY_LOG,
195
207
  _skillLogPath = SKILL_LOG,
@@ -235,26 +247,71 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
235
247
  regressionThreshold,
236
248
  );
237
249
 
238
- // 4. Build alert and recommendation
239
- let alert: string | null = null;
250
+ // 4. Build trigger alert. Grade alerts are added below before rollback
251
+ // decisions so either signal can drive automated rollback.
252
+ let triggerAlert: string | null = null;
240
253
  let rolledBack = false;
241
- let recommendation: string;
242
254
 
243
255
  if (snapshot.regression_detected) {
244
- alert = `regression detected for "${skillName}": pass_rate=${snapshot.pass_rate.toFixed(2)} below baseline=${baselinePassRate.toFixed(2)} minus threshold=${regressionThreshold.toFixed(2)}`;
245
-
246
- // 5. Auto-rollback if enabled
247
- if (autoRollback) {
248
- const rollbackFn = _rollbackFn ?? (await loadRollbackFn());
249
- const proposalId = lastDeployed?.proposal_id;
250
- const rollbackResult = await rollbackFn({
251
- skillName,
252
- skillPath,
253
- proposalId,
254
- });
255
- rolledBack = rollbackResult.rolledBack;
256
+ triggerAlert = `regression detected for "${skillName}": pass_rate=${snapshot.pass_rate.toFixed(2)} below baseline=${baselinePassRate.toFixed(2)} minus threshold=${regressionThreshold.toFixed(2)}`;
257
+ }
258
+
259
+ // 5. Grade regression detection (fail-open)
260
+ let gradeAlert: string | null = null;
261
+ let gradeRegression: { before: number; after: number; delta: number } | null = null;
262
+
263
+ if (enableGradeWatch) {
264
+ try {
265
+ const baseline = queryGradingBaseline(db, skillName, lastDeployed?.proposal_id);
266
+ const recentResults = queryRecentGradingResults(db, skillName, 10);
267
+
268
+ if (baseline && recentResults.length > 0) {
269
+ // Compute the average pass rate from recent grading results
270
+ const validResults = recentResults.filter((r) => r.pass_rate != null);
271
+ if (validResults.length > 0) {
272
+ const recentAvgPassRate =
273
+ validResults.reduce((sum, r) => sum + (r.pass_rate ?? 0), 0) / validResults.length;
274
+ const baselinePassRateGrade = baseline.pass_rate;
275
+ const delta = baselinePassRateGrade - recentAvgPassRate;
276
+
277
+ if (delta > gradeRegressionThreshold) {
278
+ gradeAlert = `grade regression detected for "${skillName}": baseline_grade_pass_rate=${baselinePassRateGrade.toFixed(2)}, recent_avg=${recentAvgPassRate.toFixed(2)}, delta=${delta.toFixed(2)} exceeds threshold=${gradeRegressionThreshold.toFixed(2)}`;
279
+ gradeRegression = {
280
+ before: baselinePassRateGrade,
281
+ after: recentAvgPassRate,
282
+ delta,
283
+ };
284
+ }
285
+ }
286
+ }
287
+ } catch (err) {
288
+ // Fail-open: grade watch should never block trigger monitoring
289
+ console.error(
290
+ JSON.stringify({
291
+ level: "debug",
292
+ code: "grade_watch_failed",
293
+ message: `Grade watch failed for "${skillName}": ${err instanceof Error ? err.message : String(err)}`,
294
+ }),
295
+ );
256
296
  }
297
+ }
257
298
 
299
+ const alerts = [triggerAlert, gradeAlert].filter((value): value is string => Boolean(value));
300
+ const alert = alerts.length > 0 ? alerts.join("\n") : null;
301
+
302
+ if (alert && autoRollback) {
303
+ const rollbackFn = _rollbackFn ?? (await loadRollbackFn());
304
+ const proposalId = lastDeployed?.proposal_id;
305
+ const rollbackResult = await rollbackFn({
306
+ skillName,
307
+ skillPath,
308
+ proposalId,
309
+ });
310
+ rolledBack = rollbackResult.rolledBack;
311
+ }
312
+
313
+ let recommendation: string;
314
+ if (alert) {
258
315
  recommendation = rolledBack
259
316
  ? `Rolled back "${skillName}" to previous version. Monitor to confirm recovery.`
260
317
  : `Consider running: selftune rollback --skill "${skillName}" --skill-path "${skillPath}"`;
@@ -285,6 +342,8 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
285
342
  alert,
286
343
  rolledBack,
287
344
  recommendation,
345
+ gradeAlert,
346
+ gradeRegression,
288
347
  ...(syncResult ? { sync_result: syncResult } : {}),
289
348
  };
290
349
  }
@@ -329,6 +388,8 @@ export async function cliMain(): Promise<void> {
329
388
  window: { type: "string", default: "20" },
330
389
  threshold: { type: "string", default: "0.1" },
331
390
  "auto-rollback": { type: "boolean", default: false },
391
+ "grade-threshold": { type: "string", default: "0.15" },
392
+ "no-grade-watch": { type: "boolean", default: false },
332
393
  "sync-first": { type: "boolean", default: false },
333
394
  "sync-force": { type: "boolean", default: false },
334
395
  help: { type: "boolean", default: false },
@@ -337,20 +398,7 @@ export async function cliMain(): Promise<void> {
337
398
  });
338
399
 
339
400
  if (values.help) {
340
- console.log(`selftune watch — Monitor post-deploy skill health
341
-
342
- Usage:
343
- selftune watch --skill <name> --skill-path <path> [options]
344
-
345
- Options:
346
- --skill Skill name (required)
347
- --skill-path Path to SKILL.md (required)
348
- --window Number of recent sessions to consider (default: 20)
349
- --threshold Regression threshold below baseline (default: 0.1)
350
- --auto-rollback Automatically rollback on regression detection
351
- --sync-first Refresh source-truth telemetry before reading watch inputs
352
- --sync-force Force a full rescan during --sync-first
353
- --help Show this help message`);
401
+ console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.watch));
354
402
  process.exit(0);
355
403
  }
356
404
 
@@ -403,11 +451,30 @@ Options:
403
451
  );
404
452
  }
405
453
 
454
+ const rawGradeThreshold = values["grade-threshold"] ?? "0.15";
455
+ if (!/^\d+(\.\d+)?$/.test(rawGradeThreshold)) {
456
+ throw new CLIError(
457
+ "--grade-threshold must be a finite number between 0 and 1.",
458
+ "INVALID_FLAG",
459
+ "selftune watch --grade-threshold 0.15",
460
+ );
461
+ }
462
+ const gradeRegressionThreshold = Number.parseFloat(rawGradeThreshold);
463
+ if (gradeRegressionThreshold < 0 || gradeRegressionThreshold > 1) {
464
+ throw new CLIError(
465
+ "--grade-threshold must be a finite number between 0 and 1.",
466
+ "INVALID_FLAG",
467
+ "selftune watch --grade-threshold 0.15",
468
+ );
469
+ }
470
+
406
471
  const result = await watch({
407
472
  skillName: values.skill,
408
473
  skillPath: values["skill-path"],
409
474
  windowSessions,
410
475
  regressionThreshold,
476
+ gradeRegressionThreshold,
477
+ enableGradeWatch: !(values["no-grade-watch"] ?? false),
411
478
  autoRollback: values["auto-rollback"] ?? false,
412
479
  syncFirst: values["sync-first"] ?? false,
413
480
  syncForce: values["sync-force"] ?? false,
@@ -694,6 +694,7 @@ export function buildCanonicalSkillInvocation(
694
694
  }
695
695
 
696
696
  export interface BuildExecutionFactInput extends CanonicalBaseInput {
697
+ execution_fact_id?: string;
697
698
  occurred_at: string;
698
699
  prompt_id?: string;
699
700
  tool_calls_json: Record<string, number>;
@@ -716,6 +717,8 @@ export function buildCanonicalExecutionFact(
716
717
  const record: CanonicalExecutionFactRecord = {
717
718
  ...base,
718
719
  record_kind: "execution_fact",
720
+ execution_fact_id:
721
+ input.execution_fact_id ?? `${input.session_id}:${input.occurred_at}:execution_fact`,
719
722
  occurred_at: input.occurred_at,
720
723
  tool_calls_json: input.tool_calls_json,
721
724
  total_tool_calls: input.total_tool_calls,
@@ -14,6 +14,7 @@ import { join } from "node:path";
14
14
 
15
15
  import { getAlphaGuidance } from "./agent-guidance.js";
16
16
  import { getAlphaLinkState, readAlphaIdentity } from "./alpha-identity.js";
17
+ import { getSelftuneUpdateHint } from "./auto-update.js";
17
18
  import { LOG_DIR, REQUIRED_FIELDS, SELFTUNE_CONFIG_PATH } from "./constants.js";
18
19
  import { DB_PATH, getDb } from "./localdb/db.js";
19
20
  import type {
@@ -318,12 +319,13 @@ export async function checkVersionHealth(): Promise<HealthCheck[]> {
318
319
  if (cmp >= 0) {
319
320
  check.message = `v${currentVersion} (latest)`;
320
321
  } else {
322
+ const updateCommand = getSelftuneUpdateHint("latest");
321
323
  check.status = "warn";
322
- check.message = `v${currentVersion} installed, v${latestVersion} available. Run: npx skills add selftune-dev/selftune`;
324
+ check.message = `v${currentVersion} installed, v${latestVersion} available. Run: ${updateCommand}`;
323
325
  check.guidance = {
324
326
  code: "version_update_available",
325
327
  message: "A newer selftune release is available.",
326
- next_command: "npx skills add selftune-dev/selftune",
328
+ next_command: updateCommand,
327
329
  suggested_commands: ["selftune doctor"],
328
330
  blocking: false,
329
331
  };
@@ -0,0 +1,161 @@
1
+ import { parseArgs } from "node:util";
2
+
3
+ import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
4
+ import type { OrchestrateOptions, OrchestrateResult } from "../orchestrate.js";
5
+ import { CLIError } from "../utils/cli-error.js";
6
+
7
+ export interface ParsedOrchestrateCliArgs {
8
+ showHelp: boolean;
9
+ warnings: string[];
10
+ loop: boolean;
11
+ loopIntervalSeconds: number;
12
+ runOptions: OrchestrateOptions;
13
+ }
14
+
15
+ function parsePositiveIntegerFlag(value: string, message: string, command: string): number {
16
+ if (!/^\d+$/.test(value) || Number(value) < 1) {
17
+ throw new CLIError(message, "INVALID_FLAG", command);
18
+ }
19
+ return Number(value);
20
+ }
21
+
22
+ function parseNonNegativeIntegerFlag(value: string, message: string, command: string): number {
23
+ if (!/^\d+$/.test(value)) {
24
+ throw new CLIError(message, "INVALID_FLAG", command);
25
+ }
26
+ return Number(value);
27
+ }
28
+
29
+ export function renderOrchestrateHelp(): string {
30
+ return renderCommandHelp(PUBLIC_COMMAND_SURFACES.orchestrate);
31
+ }
32
+
33
+ export function parseOrchestrateCliArgs(
34
+ argv: string[] = process.argv.slice(2),
35
+ ): ParsedOrchestrateCliArgs {
36
+ const { values } = parseArgs({
37
+ args: argv,
38
+ options: {
39
+ "dry-run": { type: "boolean", default: false },
40
+ "review-required": { type: "boolean", default: false },
41
+ "auto-approve": { type: "boolean", default: false },
42
+ skill: { type: "string" },
43
+ "max-skills": { type: "string", default: "5" },
44
+ "recent-window": { type: "string", default: "48" },
45
+ "sync-force": { type: "boolean", default: false },
46
+ "max-auto-grade": { type: "string", default: "5" },
47
+ loop: { type: "boolean", default: false },
48
+ "loop-interval": { type: "string", default: "3600" },
49
+ help: { type: "boolean", short: "h", default: false },
50
+ },
51
+ strict: true,
52
+ });
53
+
54
+ if (values.help) {
55
+ return {
56
+ showHelp: true,
57
+ warnings: [],
58
+ loop: false,
59
+ loopIntervalSeconds: 3600,
60
+ runOptions: {
61
+ dryRun: false,
62
+ approvalMode: "auto",
63
+ maxSkills: 5,
64
+ recentWindowHours: 48,
65
+ syncForce: false,
66
+ maxAutoGrade: 5,
67
+ },
68
+ };
69
+ }
70
+
71
+ const loop = values.loop ?? false;
72
+ const maxSkills = parsePositiveIntegerFlag(
73
+ values["max-skills"] ?? "5",
74
+ "--max-skills must be a positive integer",
75
+ "selftune orchestrate --max-skills 5",
76
+ );
77
+ const recentWindowHours = parsePositiveIntegerFlag(
78
+ values["recent-window"] ?? "48",
79
+ "--recent-window must be a positive integer",
80
+ "selftune orchestrate --recent-window 48",
81
+ );
82
+ const maxAutoGrade = parseNonNegativeIntegerFlag(
83
+ values["max-auto-grade"] ?? "5",
84
+ "--max-auto-grade must be a non-negative integer",
85
+ "selftune orchestrate --max-auto-grade 5",
86
+ );
87
+
88
+ const loopIntervalRaw = values["loop-interval"] ?? "3600";
89
+ if (!/^\d+$/.test(loopIntervalRaw) || (loop && Number(loopIntervalRaw) < 60)) {
90
+ throw new CLIError(
91
+ "--loop-interval must be an integer >= 60 (seconds)",
92
+ "INVALID_FLAG",
93
+ "selftune orchestrate --loop --loop-interval 3600",
94
+ );
95
+ }
96
+
97
+ const warnings: string[] = [];
98
+ if (values["auto-approve"]) {
99
+ warnings.push(
100
+ "[orchestrate] --auto-approve is deprecated; autonomous mode is now the default.",
101
+ );
102
+ }
103
+
104
+ return {
105
+ showHelp: false,
106
+ warnings,
107
+ loop,
108
+ loopIntervalSeconds: Number(loopIntervalRaw),
109
+ runOptions: {
110
+ dryRun: values["dry-run"] ?? false,
111
+ approvalMode: values["review-required"] ? "review" : "auto",
112
+ skillFilter: values.skill,
113
+ maxSkills,
114
+ recentWindowHours,
115
+ syncForce: values["sync-force"] ?? false,
116
+ maxAutoGrade,
117
+ },
118
+ };
119
+ }
120
+
121
+ export function buildOrchestrateJsonOutput(result: OrchestrateResult) {
122
+ return {
123
+ ...result.summary,
124
+ ...(result.uploadSummary ? { upload: result.uploadSummary } : {}),
125
+ workflow_proposals: result.workflowProposals.map((proposal) => ({
126
+ proposal_id: proposal.proposal_id,
127
+ source_skill_name: proposal.source_skill_name,
128
+ workflow_id: proposal.workflow.workflow_id,
129
+ generated_skill_name: proposal.draft.skill_name,
130
+ output_path: proposal.draft.skill_path,
131
+ confidence: proposal.confidence,
132
+ reason: proposal.rationale,
133
+ })),
134
+ decisions: result.candidates.map((candidate) => ({
135
+ skill: candidate.skill,
136
+ action: candidate.action,
137
+ reason: candidate.reason,
138
+ ...(candidate.evolveResult
139
+ ? {
140
+ deployed: candidate.evolveResult.deployed,
141
+ evolveReason: candidate.evolveResult.reason,
142
+ validation: candidate.evolveResult.validation
143
+ ? {
144
+ before: candidate.evolveResult.validation.before_pass_rate,
145
+ after: candidate.evolveResult.validation.after_pass_rate,
146
+ improved: candidate.evolveResult.validation.improved,
147
+ }
148
+ : null,
149
+ }
150
+ : {}),
151
+ ...(candidate.watchResult
152
+ ? {
153
+ alert: candidate.watchResult.alert,
154
+ rolledBack: candidate.watchResult.rolledBack,
155
+ passRate: candidate.watchResult.snapshot?.pass_rate ?? null,
156
+ recommendation: candidate.watchResult.recommendation,
157
+ }
158
+ : {}),
159
+ })),
160
+ };
161
+ }