selftune 0.2.23 → 0.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +93 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +73 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
- package/cli/selftune/evolution/evolve-body.ts +100 -39
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +68 -42
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +43 -41
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/index.ts +35 -10
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +3 -2
- package/cli/selftune/init.ts +27 -3
- package/cli/selftune/localdb/direct-write.ts +35 -1
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2288
- package/cli/selftune/localdb/schema.ts +21 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +4 -2
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +150 -1173
- package/cli/selftune/repair/skill-usage.ts +5 -2
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +39 -2
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +44 -4
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +2 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/packages/telemetry-contract/src/schemas.ts +41 -1
- package/packages/telemetry-contract/src/types.ts +103 -2
- package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
- package/packages/ui/src/components/OverviewPanels.tsx +67 -26
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +10 -0
- package/skill/SKILL.md +130 -332
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Initialize.md +8 -4
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
- package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Ingest.md +0 -0
- /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/Registry.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Sync.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -231,6 +231,20 @@ CREATE TABLE IF NOT EXISTS grading_results (
|
|
|
231
231
|
execution_metrics_json TEXT
|
|
232
232
|
)`;
|
|
233
233
|
|
|
234
|
+
// -- Grading baselines table (pre/post deploy grade snapshots) ---------------
|
|
235
|
+
|
|
236
|
+
export const CREATE_GRADING_BASELINES = `
|
|
237
|
+
CREATE TABLE IF NOT EXISTS grading_baselines (
|
|
238
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
239
|
+
skill_name TEXT NOT NULL,
|
|
240
|
+
proposal_id TEXT,
|
|
241
|
+
measured_at TEXT NOT NULL,
|
|
242
|
+
pass_rate REAL NOT NULL,
|
|
243
|
+
mean_score REAL,
|
|
244
|
+
sample_size INTEGER NOT NULL,
|
|
245
|
+
grading_results_json TEXT
|
|
246
|
+
)`;
|
|
247
|
+
|
|
234
248
|
// -- Improvement signal table (from signal_log.jsonl) ------------------------
|
|
235
249
|
|
|
236
250
|
export const CREATE_IMPROVEMENT_SIGNALS = `
|
|
@@ -369,6 +383,11 @@ export const CREATE_INDEXES = [
|
|
|
369
383
|
`CREATE INDEX IF NOT EXISTS idx_grading_skill ON grading_results(skill_name)`,
|
|
370
384
|
`CREATE INDEX IF NOT EXISTS idx_grading_ts ON grading_results(graded_at)`,
|
|
371
385
|
`CREATE UNIQUE INDEX IF NOT EXISTS idx_grading_dedup ON grading_results(session_id, skill_name, graded_at)`,
|
|
386
|
+
// -- Grading baseline indexes ------------------------------------------------
|
|
387
|
+
`CREATE INDEX IF NOT EXISTS idx_grading_bl_skill ON grading_baselines(skill_name)`,
|
|
388
|
+
`CREATE INDEX IF NOT EXISTS idx_grading_bl_proposal ON grading_baselines(proposal_id)`,
|
|
389
|
+
`CREATE INDEX IF NOT EXISTS idx_grading_bl_ts ON grading_baselines(measured_at)`,
|
|
390
|
+
`CREATE INDEX IF NOT EXISTS idx_grading_bl_skill_proposal ON grading_baselines(skill_name, proposal_id, measured_at)`,
|
|
372
391
|
// -- Improvement signal indexes ---------------------------------------------
|
|
373
392
|
`CREATE INDEX IF NOT EXISTS idx_signals_session ON improvement_signals(session_id)`,
|
|
374
393
|
`CREATE INDEX IF NOT EXISTS idx_signals_consumed ON improvement_signals(consumed)`,
|
|
@@ -389,6 +408,7 @@ export const CREATE_INDEXES = [
|
|
|
389
408
|
`CREATE INDEX IF NOT EXISTS idx_replay_entry_proposal ON replay_entry_results(proposal_id)`,
|
|
390
409
|
`CREATE INDEX IF NOT EXISTS idx_replay_entry_skill ON replay_entry_results(skill_name)`,
|
|
391
410
|
`CREATE INDEX IF NOT EXISTS idx_replay_entry_passed ON replay_entry_results(passed)`,
|
|
411
|
+
`CREATE INDEX IF NOT EXISTS idx_replay_entry_proposal_phase ON replay_entry_results(proposal_id, phase)`,
|
|
392
412
|
// -- Commit tracking indexes ------------------------------------------------
|
|
393
413
|
`CREATE INDEX IF NOT EXISTS idx_commit_sha ON commit_tracking(commit_sha)`,
|
|
394
414
|
`CREATE INDEX IF NOT EXISTS idx_commit_session ON commit_tracking(session_id)`,
|
|
@@ -485,6 +505,7 @@ export const ALL_DDL = [
|
|
|
485
505
|
CREATE_ORCHESTRATE_RUNS,
|
|
486
506
|
CREATE_QUERIES,
|
|
487
507
|
CREATE_GRADING_RESULTS,
|
|
508
|
+
CREATE_GRADING_BASELINES,
|
|
488
509
|
CREATE_IMPROVEMENT_SIGNALS,
|
|
489
510
|
CREATE_UPLOAD_QUEUE,
|
|
490
511
|
CREATE_CREATOR_CONTRIBUTION_STAGING,
|
|
@@ -8,12 +8,15 @@
|
|
|
8
8
|
|
|
9
9
|
import { parseArgs } from "node:util";
|
|
10
10
|
|
|
11
|
+
import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
|
|
11
12
|
import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
12
13
|
import { classifyInvocation } from "../eval/hooks-to-evals.js";
|
|
13
14
|
import { getLastDeployedProposal } from "../evolution/audit.js";
|
|
14
15
|
import { getDb } from "../localdb/db.js";
|
|
15
16
|
import {
|
|
17
|
+
queryGradingBaseline,
|
|
16
18
|
queryQueryLog,
|
|
19
|
+
queryRecentGradingResults,
|
|
17
20
|
querySessionTelemetry,
|
|
18
21
|
querySkillUsageRecords,
|
|
19
22
|
} from "../localdb/queries.js";
|
|
@@ -42,6 +45,10 @@ export interface WatchOptions {
|
|
|
42
45
|
windowSessions: number;
|
|
43
46
|
regressionThreshold: number;
|
|
44
47
|
autoRollback: boolean;
|
|
48
|
+
/** Grade regression threshold (default 0.15). */
|
|
49
|
+
gradeRegressionThreshold?: number;
|
|
50
|
+
/** Enable grade-based regression watch (default true). */
|
|
51
|
+
enableGradeWatch?: boolean;
|
|
45
52
|
/** Injected log paths for testing (override defaults). */
|
|
46
53
|
_telemetryLogPath?: string;
|
|
47
54
|
_skillLogPath?: string;
|
|
@@ -65,6 +72,8 @@ export interface WatchResult {
|
|
|
65
72
|
rolledBack: boolean;
|
|
66
73
|
recommendation: string;
|
|
67
74
|
sync_result?: SyncResult;
|
|
75
|
+
gradeAlert?: string | null;
|
|
76
|
+
gradeRegression?: { before: number; after: number; delta: number } | null;
|
|
68
77
|
}
|
|
69
78
|
|
|
70
79
|
// ---------------------------------------------------------------------------
|
|
@@ -73,6 +82,7 @@ export interface WatchResult {
|
|
|
73
82
|
|
|
74
83
|
const DEFAULT_BASELINE_PASS_RATE = 0.5;
|
|
75
84
|
const DEFAULT_REGRESSION_THRESHOLD = 0.1;
|
|
85
|
+
const DEFAULT_GRADE_REGRESSION_THRESHOLD = 0.15;
|
|
76
86
|
export const MIN_MONITORING_SKILL_CHECKS = 3;
|
|
77
87
|
|
|
78
88
|
// ---------------------------------------------------------------------------
|
|
@@ -190,6 +200,8 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
190
200
|
skillPath,
|
|
191
201
|
windowSessions = 20,
|
|
192
202
|
regressionThreshold = DEFAULT_REGRESSION_THRESHOLD,
|
|
203
|
+
gradeRegressionThreshold = DEFAULT_GRADE_REGRESSION_THRESHOLD,
|
|
204
|
+
enableGradeWatch = true,
|
|
193
205
|
autoRollback = false,
|
|
194
206
|
_telemetryLogPath = TELEMETRY_LOG,
|
|
195
207
|
_skillLogPath = SKILL_LOG,
|
|
@@ -235,26 +247,71 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
235
247
|
regressionThreshold,
|
|
236
248
|
);
|
|
237
249
|
|
|
238
|
-
// 4. Build alert
|
|
239
|
-
|
|
250
|
+
// 4. Build trigger alert. Grade alerts are added below before rollback
|
|
251
|
+
// decisions so either signal can drive automated rollback.
|
|
252
|
+
let triggerAlert: string | null = null;
|
|
240
253
|
let rolledBack = false;
|
|
241
|
-
let recommendation: string;
|
|
242
254
|
|
|
243
255
|
if (snapshot.regression_detected) {
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
+
triggerAlert = `regression detected for "${skillName}": pass_rate=${snapshot.pass_rate.toFixed(2)} below baseline=${baselinePassRate.toFixed(2)} minus threshold=${regressionThreshold.toFixed(2)}`;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
// 5. Grade regression detection (fail-open)
|
|
260
|
+
let gradeAlert: string | null = null;
|
|
261
|
+
let gradeRegression: { before: number; after: number; delta: number } | null = null;
|
|
262
|
+
|
|
263
|
+
if (enableGradeWatch) {
|
|
264
|
+
try {
|
|
265
|
+
const baseline = queryGradingBaseline(db, skillName, lastDeployed?.proposal_id);
|
|
266
|
+
const recentResults = queryRecentGradingResults(db, skillName, 10);
|
|
267
|
+
|
|
268
|
+
if (baseline && recentResults.length > 0) {
|
|
269
|
+
// Compute the average pass rate from recent grading results
|
|
270
|
+
const validResults = recentResults.filter((r) => r.pass_rate != null);
|
|
271
|
+
if (validResults.length > 0) {
|
|
272
|
+
const recentAvgPassRate =
|
|
273
|
+
validResults.reduce((sum, r) => sum + (r.pass_rate ?? 0), 0) / validResults.length;
|
|
274
|
+
const baselinePassRateGrade = baseline.pass_rate;
|
|
275
|
+
const delta = baselinePassRateGrade - recentAvgPassRate;
|
|
276
|
+
|
|
277
|
+
if (delta > gradeRegressionThreshold) {
|
|
278
|
+
gradeAlert = `grade regression detected for "${skillName}": baseline_grade_pass_rate=${baselinePassRateGrade.toFixed(2)}, recent_avg=${recentAvgPassRate.toFixed(2)}, delta=${delta.toFixed(2)} exceeds threshold=${gradeRegressionThreshold.toFixed(2)}`;
|
|
279
|
+
gradeRegression = {
|
|
280
|
+
before: baselinePassRateGrade,
|
|
281
|
+
after: recentAvgPassRate,
|
|
282
|
+
delta,
|
|
283
|
+
};
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
} catch (err) {
|
|
288
|
+
// Fail-open: grade watch should never block trigger monitoring
|
|
289
|
+
console.error(
|
|
290
|
+
JSON.stringify({
|
|
291
|
+
level: "debug",
|
|
292
|
+
code: "grade_watch_failed",
|
|
293
|
+
message: `Grade watch failed for "${skillName}": ${err instanceof Error ? err.message : String(err)}`,
|
|
294
|
+
}),
|
|
295
|
+
);
|
|
256
296
|
}
|
|
297
|
+
}
|
|
257
298
|
|
|
299
|
+
const alerts = [triggerAlert, gradeAlert].filter((value): value is string => Boolean(value));
|
|
300
|
+
const alert = alerts.length > 0 ? alerts.join("\n") : null;
|
|
301
|
+
|
|
302
|
+
if (alert && autoRollback) {
|
|
303
|
+
const rollbackFn = _rollbackFn ?? (await loadRollbackFn());
|
|
304
|
+
const proposalId = lastDeployed?.proposal_id;
|
|
305
|
+
const rollbackResult = await rollbackFn({
|
|
306
|
+
skillName,
|
|
307
|
+
skillPath,
|
|
308
|
+
proposalId,
|
|
309
|
+
});
|
|
310
|
+
rolledBack = rollbackResult.rolledBack;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
let recommendation: string;
|
|
314
|
+
if (alert) {
|
|
258
315
|
recommendation = rolledBack
|
|
259
316
|
? `Rolled back "${skillName}" to previous version. Monitor to confirm recovery.`
|
|
260
317
|
: `Consider running: selftune rollback --skill "${skillName}" --skill-path "${skillPath}"`;
|
|
@@ -285,6 +342,8 @@ export async function watch(options: WatchOptions): Promise<WatchResult> {
|
|
|
285
342
|
alert,
|
|
286
343
|
rolledBack,
|
|
287
344
|
recommendation,
|
|
345
|
+
gradeAlert,
|
|
346
|
+
gradeRegression,
|
|
288
347
|
...(syncResult ? { sync_result: syncResult } : {}),
|
|
289
348
|
};
|
|
290
349
|
}
|
|
@@ -329,6 +388,8 @@ export async function cliMain(): Promise<void> {
|
|
|
329
388
|
window: { type: "string", default: "20" },
|
|
330
389
|
threshold: { type: "string", default: "0.1" },
|
|
331
390
|
"auto-rollback": { type: "boolean", default: false },
|
|
391
|
+
"grade-threshold": { type: "string", default: "0.15" },
|
|
392
|
+
"no-grade-watch": { type: "boolean", default: false },
|
|
332
393
|
"sync-first": { type: "boolean", default: false },
|
|
333
394
|
"sync-force": { type: "boolean", default: false },
|
|
334
395
|
help: { type: "boolean", default: false },
|
|
@@ -337,20 +398,7 @@ export async function cliMain(): Promise<void> {
|
|
|
337
398
|
});
|
|
338
399
|
|
|
339
400
|
if (values.help) {
|
|
340
|
-
console.log(
|
|
341
|
-
|
|
342
|
-
Usage:
|
|
343
|
-
selftune watch --skill <name> --skill-path <path> [options]
|
|
344
|
-
|
|
345
|
-
Options:
|
|
346
|
-
--skill Skill name (required)
|
|
347
|
-
--skill-path Path to SKILL.md (required)
|
|
348
|
-
--window Number of recent sessions to consider (default: 20)
|
|
349
|
-
--threshold Regression threshold below baseline (default: 0.1)
|
|
350
|
-
--auto-rollback Automatically rollback on regression detection
|
|
351
|
-
--sync-first Refresh source-truth telemetry before reading watch inputs
|
|
352
|
-
--sync-force Force a full rescan during --sync-first
|
|
353
|
-
--help Show this help message`);
|
|
401
|
+
console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.watch));
|
|
354
402
|
process.exit(0);
|
|
355
403
|
}
|
|
356
404
|
|
|
@@ -403,11 +451,30 @@ Options:
|
|
|
403
451
|
);
|
|
404
452
|
}
|
|
405
453
|
|
|
454
|
+
const rawGradeThreshold = values["grade-threshold"] ?? "0.15";
|
|
455
|
+
if (!/^\d+(\.\d+)?$/.test(rawGradeThreshold)) {
|
|
456
|
+
throw new CLIError(
|
|
457
|
+
"--grade-threshold must be a finite number between 0 and 1.",
|
|
458
|
+
"INVALID_FLAG",
|
|
459
|
+
"selftune watch --grade-threshold 0.15",
|
|
460
|
+
);
|
|
461
|
+
}
|
|
462
|
+
const gradeRegressionThreshold = Number.parseFloat(rawGradeThreshold);
|
|
463
|
+
if (gradeRegressionThreshold < 0 || gradeRegressionThreshold > 1) {
|
|
464
|
+
throw new CLIError(
|
|
465
|
+
"--grade-threshold must be a finite number between 0 and 1.",
|
|
466
|
+
"INVALID_FLAG",
|
|
467
|
+
"selftune watch --grade-threshold 0.15",
|
|
468
|
+
);
|
|
469
|
+
}
|
|
470
|
+
|
|
406
471
|
const result = await watch({
|
|
407
472
|
skillName: values.skill,
|
|
408
473
|
skillPath: values["skill-path"],
|
|
409
474
|
windowSessions,
|
|
410
475
|
regressionThreshold,
|
|
476
|
+
gradeRegressionThreshold,
|
|
477
|
+
enableGradeWatch: !(values["no-grade-watch"] ?? false),
|
|
411
478
|
autoRollback: values["auto-rollback"] ?? false,
|
|
412
479
|
syncFirst: values["sync-first"] ?? false,
|
|
413
480
|
syncForce: values["sync-force"] ?? false,
|
|
@@ -694,6 +694,7 @@ export function buildCanonicalSkillInvocation(
|
|
|
694
694
|
}
|
|
695
695
|
|
|
696
696
|
export interface BuildExecutionFactInput extends CanonicalBaseInput {
|
|
697
|
+
execution_fact_id?: string;
|
|
697
698
|
occurred_at: string;
|
|
698
699
|
prompt_id?: string;
|
|
699
700
|
tool_calls_json: Record<string, number>;
|
|
@@ -716,6 +717,8 @@ export function buildCanonicalExecutionFact(
|
|
|
716
717
|
const record: CanonicalExecutionFactRecord = {
|
|
717
718
|
...base,
|
|
718
719
|
record_kind: "execution_fact",
|
|
720
|
+
execution_fact_id:
|
|
721
|
+
input.execution_fact_id ?? `${input.session_id}:${input.occurred_at}:execution_fact`,
|
|
719
722
|
occurred_at: input.occurred_at,
|
|
720
723
|
tool_calls_json: input.tool_calls_json,
|
|
721
724
|
total_tool_calls: input.total_tool_calls,
|
|
@@ -14,6 +14,7 @@ import { join } from "node:path";
|
|
|
14
14
|
|
|
15
15
|
import { getAlphaGuidance } from "./agent-guidance.js";
|
|
16
16
|
import { getAlphaLinkState, readAlphaIdentity } from "./alpha-identity.js";
|
|
17
|
+
import { getSelftuneUpdateHint } from "./auto-update.js";
|
|
17
18
|
import { LOG_DIR, REQUIRED_FIELDS, SELFTUNE_CONFIG_PATH } from "./constants.js";
|
|
18
19
|
import { DB_PATH, getDb } from "./localdb/db.js";
|
|
19
20
|
import type {
|
|
@@ -318,12 +319,13 @@ export async function checkVersionHealth(): Promise<HealthCheck[]> {
|
|
|
318
319
|
if (cmp >= 0) {
|
|
319
320
|
check.message = `v${currentVersion} (latest)`;
|
|
320
321
|
} else {
|
|
322
|
+
const updateCommand = getSelftuneUpdateHint("latest");
|
|
321
323
|
check.status = "warn";
|
|
322
|
-
check.message = `v${currentVersion} installed, v${latestVersion} available. Run:
|
|
324
|
+
check.message = `v${currentVersion} installed, v${latestVersion} available. Run: ${updateCommand}`;
|
|
323
325
|
check.guidance = {
|
|
324
326
|
code: "version_update_available",
|
|
325
327
|
message: "A newer selftune release is available.",
|
|
326
|
-
next_command:
|
|
328
|
+
next_command: updateCommand,
|
|
327
329
|
suggested_commands: ["selftune doctor"],
|
|
328
330
|
blocking: false,
|
|
329
331
|
};
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import { parseArgs } from "node:util";
|
|
2
|
+
|
|
3
|
+
import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
|
|
4
|
+
import type { OrchestrateOptions, OrchestrateResult } from "../orchestrate.js";
|
|
5
|
+
import { CLIError } from "../utils/cli-error.js";
|
|
6
|
+
|
|
7
|
+
/** Structured result of parsing `selftune orchestrate` command-line arguments. */
export interface ParsedOrchestrateCliArgs {
  /** True when `--help` / `-h` was passed; the other fields then carry default values. */
  showHelp: boolean;
  /** Non-fatal notices collected while parsing (e.g. the `--auto-approve` deprecation). */
  warnings: string[];
  /** Value of the `--loop` flag. */
  loop: boolean;
  /** Value of `--loop-interval`, in seconds (default 3600; must be >= 60 when `loop` is set). */
  loopIntervalSeconds: number;
  /** Options for a single orchestrate run, derived from the remaining flags. */
  runOptions: OrchestrateOptions;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Parse a CLI flag value that must be a positive (>= 1) decimal integer.
 *
 * Only plain digit strings are accepted (no sign, decimal point, exponent, or
 * surrounding whitespace). Digit strings that cannot be represented exactly as
 * a JavaScript number — including ones long enough to overflow to `Infinity` —
 * are rejected instead of being silently rounded.
 *
 * @param value - Raw flag value as received from the argument parser.
 * @param message - Error message used when validation fails.
 * @param command - Forwarded to `CLIError` as its third argument (call sites
 *   in this file pass an example invocation).
 * @returns The parsed integer, guaranteed to be a safe integer >= 1.
 * @throws CLIError with code `"INVALID_FLAG"` when the value is invalid.
 */
function parsePositiveIntegerFlag(value: string, message: string, command: string): number {
  // NaN for anything that is not a pure digit string, so one check below covers every failure.
  const parsed = /^\d+$/.test(value) ? Number(value) : Number.NaN;
  if (!Number.isSafeInteger(parsed) || parsed < 1) {
    throw new CLIError(message, "INVALID_FLAG", command);
  }
  return parsed;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Parse a CLI flag value that must be a non-negative (>= 0) decimal integer.
 *
 * Only plain digit strings are accepted (no sign, decimal point, exponent, or
 * surrounding whitespace). Digit strings that cannot be represented exactly as
 * a JavaScript number — including ones long enough to overflow to `Infinity` —
 * are rejected instead of being silently rounded.
 *
 * @param value - Raw flag value as received from the argument parser.
 * @param message - Error message used when validation fails.
 * @param command - Forwarded to `CLIError` as its third argument (call sites
 *   in this file pass an example invocation).
 * @returns The parsed integer, guaranteed to be a safe integer >= 0.
 * @throws CLIError with code `"INVALID_FLAG"` when the value is invalid.
 */
function parseNonNegativeIntegerFlag(value: string, message: string, command: string): number {
  // NaN for anything that is not a pure digit string; isSafeInteger rejects NaN and Infinity.
  const parsed = /^\d+$/.test(value) ? Number(value) : Number.NaN;
  if (!Number.isSafeInteger(parsed)) {
    throw new CLIError(message, "INVALID_FLAG", command);
  }
  return parsed;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Build the help text for `selftune orchestrate`.
 *
 * @returns The string produced by `renderCommandHelp` for the
 *   `PUBLIC_COMMAND_SURFACES.orchestrate` entry.
 */
export function renderOrchestrateHelp(): string {
  const { orchestrate: orchestrateSurface } = PUBLIC_COMMAND_SURFACES;
  return renderCommandHelp(orchestrateSurface);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Parse `selftune orchestrate` command-line arguments into a structured result.
 *
 * Parsing is strict: unknown flags make `parseArgs` throw. When `--help` / `-h`
 * is present, the remaining flags are not validated and a result with
 * `showHelp: true` and default option values is returned.
 *
 * Flag handling:
 * - `--max-skills`, `--recent-window`: positive integers (via `parsePositiveIntegerFlag`).
 * - `--max-auto-grade`: non-negative integer (via `parseNonNegativeIntegerFlag`).
 * - `--loop-interval`: digit-only integer number of seconds that must be
 *   exactly representable; additionally >= 60 when `--loop` is set.
 * - `--review-required`: selects approval mode `"review"`; otherwise `"auto"`.
 * - `--auto-approve`: accepted but only adds a deprecation warning.
 *
 * @param argv - Argument vector to parse; defaults to `process.argv.slice(2)`.
 * @returns Parsed flags, collected warnings, and the options for one run.
 * @throws CLIError with code `"INVALID_FLAG"` when a numeric flag is invalid.
 */
export function parseOrchestrateCliArgs(
  argv: string[] = process.argv.slice(2),
): ParsedOrchestrateCliArgs {
  // Numeric defaults are defined once so the parseArgs defaults, the `??`
  // fallbacks, and the --help result cannot drift apart.
  const defaultMaxSkills = 5;
  const defaultRecentWindowHours = 48;
  const defaultMaxAutoGrade = 5;
  const defaultLoopIntervalSeconds = 3600;
  const minLoopIntervalSeconds = 60;

  const { values } = parseArgs({
    args: argv,
    options: {
      "dry-run": { type: "boolean", default: false },
      "review-required": { type: "boolean", default: false },
      "auto-approve": { type: "boolean", default: false },
      skill: { type: "string" },
      "max-skills": { type: "string", default: String(defaultMaxSkills) },
      "recent-window": { type: "string", default: String(defaultRecentWindowHours) },
      "sync-force": { type: "boolean", default: false },
      "max-auto-grade": { type: "string", default: String(defaultMaxAutoGrade) },
      loop: { type: "boolean", default: false },
      "loop-interval": { type: "string", default: String(defaultLoopIntervalSeconds) },
      help: { type: "boolean", short: "h", default: false },
    },
    strict: true,
  });

  // Help short-circuits before any value validation so `--help` always works.
  if (values.help) {
    return {
      showHelp: true,
      warnings: [],
      loop: false,
      loopIntervalSeconds: defaultLoopIntervalSeconds,
      runOptions: {
        dryRun: false,
        approvalMode: "auto",
        maxSkills: defaultMaxSkills,
        recentWindowHours: defaultRecentWindowHours,
        syncForce: false,
        maxAutoGrade: defaultMaxAutoGrade,
      },
    };
  }

  const loop = values.loop ?? false;
  const maxSkills = parsePositiveIntegerFlag(
    values["max-skills"] ?? String(defaultMaxSkills),
    "--max-skills must be a positive integer",
    "selftune orchestrate --max-skills 5",
  );
  const recentWindowHours = parsePositiveIntegerFlag(
    values["recent-window"] ?? String(defaultRecentWindowHours),
    "--recent-window must be a positive integer",
    "selftune orchestrate --recent-window 48",
  );
  const maxAutoGrade = parseNonNegativeIntegerFlag(
    values["max-auto-grade"] ?? String(defaultMaxAutoGrade),
    "--max-auto-grade must be a non-negative integer",
    "selftune orchestrate --max-auto-grade 5",
  );

  const loopIntervalRaw = values["loop-interval"] ?? String(defaultLoopIntervalSeconds);
  // NaN for non-digit input; isSafeInteger also rejects digit strings that
  // overflow to Infinity or cannot be represented exactly.
  const loopIntervalSeconds = /^\d+$/.test(loopIntervalRaw)
    ? Number(loopIntervalRaw)
    : Number.NaN;
  // The minimum only applies when looping; a one-shot run ignores the interval.
  if (
    !Number.isSafeInteger(loopIntervalSeconds) ||
    (loop && loopIntervalSeconds < minLoopIntervalSeconds)
  ) {
    throw new CLIError(
      "--loop-interval must be an integer >= 60 (seconds)",
      "INVALID_FLAG",
      "selftune orchestrate --loop --loop-interval 3600",
    );
  }

  const warnings: string[] = [];
  if (values["auto-approve"]) {
    warnings.push(
      "[orchestrate] --auto-approve is deprecated; autonomous mode is now the default.",
    );
  }

  return {
    showHelp: false,
    warnings,
    loop,
    loopIntervalSeconds,
    runOptions: {
      dryRun: values["dry-run"] ?? false,
      approvalMode: values["review-required"] ? "review" : "auto",
      skillFilter: values.skill,
      maxSkills,
      recentWindowHours,
      syncForce: values["sync-force"] ?? false,
      maxAutoGrade,
    },
  };
}
|
|
120
|
+
|
|
121
|
+
/**
 * Shape an orchestrate result into the plain object emitted as JSON output.
 *
 * The output starts with every field of `result.summary`, followed by an
 * optional `upload` entry (only when `result.uploadSummary` is truthy), a
 * `workflow_proposals` list, and a per-candidate `decisions` list. Evolve and
 * watch details are merged into a decision only when the candidate carries the
 * corresponding result.
 *
 * @param result - Result of an orchestrate run.
 * @returns A new object; `result` itself is not modified.
 */
export function buildOrchestrateJsonOutput(result: OrchestrateResult) {
  const { summary, uploadSummary, workflowProposals, candidates } = result;

  // Flatten each workflow proposal to the identifiers and draft location.
  const workflow_proposals = workflowProposals.map(
    ({ proposal_id, source_skill_name, workflow, draft, confidence, rationale }) => ({
      proposal_id,
      source_skill_name,
      workflow_id: workflow.workflow_id,
      generated_skill_name: draft.skill_name,
      output_path: draft.skill_path,
      confidence,
      reason: rationale,
    }),
  );

  const decisions = candidates.map(({ skill, action, reason, evolveResult, watchResult }) => {
    // Evolve outcome; `validation` is null when no validation was recorded.
    const evolveFields = evolveResult
      ? {
          deployed: evolveResult.deployed,
          evolveReason: evolveResult.reason,
          validation: evolveResult.validation
            ? {
                before: evolveResult.validation.before_pass_rate,
                after: evolveResult.validation.after_pass_rate,
                improved: evolveResult.validation.improved,
              }
            : null,
        }
      : {};

    // Watch outcome; `passRate` is null when there is no snapshot pass rate.
    const watchFields = watchResult
      ? {
          alert: watchResult.alert,
          rolledBack: watchResult.rolledBack,
          passRate: watchResult.snapshot?.pass_rate ?? null,
          recommendation: watchResult.recommendation,
        }
      : {};

    return { skill, action, reason, ...evolveFields, ...watchFields };
  });

  const uploadFields = uploadSummary ? { upload: uploadSummary } : {};

  return {
    ...summary,
    ...uploadFields,
    workflow_proposals,
    decisions,
  };
}
|