selftune 0.2.22 → 0.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +95 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/adapters/pi/hook.ts +273 -0
- package/cli/selftune/adapters/pi/install.ts +207 -0
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/constants.ts +10 -1
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +87 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
- package/cli/selftune/evolution/evidence.ts +2 -6
- package/cli/selftune/evolution/evolve-body.ts +152 -38
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +111 -49
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +51 -108
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/hooks/skill-eval.ts +2 -1
- package/cli/selftune/hooks-shared/types.ts +1 -0
- package/cli/selftune/index.ts +58 -15
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +727 -0
- package/cli/selftune/init.ts +38 -4
- package/cli/selftune/localdb/direct-write.ts +120 -1
- package/cli/selftune/localdb/materialize.ts +6 -7
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2162
- package/cli/selftune/localdb/schema.ts +59 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +12 -3
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +162 -1142
- package/cli/selftune/registry/client.ts +74 -0
- package/cli/selftune/registry/history.ts +54 -0
- package/cli/selftune/registry/index.ts +90 -0
- package/cli/selftune/registry/install.ts +141 -0
- package/cli/selftune/registry/list.ts +44 -0
- package/cli/selftune/registry/push.ts +171 -0
- package/cli/selftune/registry/rollback.ts +49 -0
- package/cli/selftune/registry/status.ts +62 -0
- package/cli/selftune/registry/sync.ts +125 -0
- package/cli/selftune/repair/skill-usage.ts +9 -3
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +70 -2
- package/cli/selftune/sync.ts +127 -23
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +46 -5
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/jsonl.ts +1 -30
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +24 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
- package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
- package/node_modules/@selftune/telemetry-contract/package.json +1 -1
- package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
- package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
- package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
- package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
- package/packages/telemetry-contract/package.json +1 -1
- package/packages/telemetry-contract/src/index.ts +1 -0
- package/packages/telemetry-contract/src/schemas.ts +63 -5
- package/packages/telemetry-contract/src/types.ts +97 -7
- package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
- package/packages/ui/AGENTS.md +16 -0
- package/packages/ui/README.md +1 -1
- package/packages/ui/package.json +1 -1
- package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
- package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
- package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
- package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
- package/packages/ui/src/components/InfoTip.tsx +1 -2
- package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
- package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
- package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
- package/packages/ui/src/components/OverviewPanels.tsx +693 -0
- package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
- package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
- package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
- package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
- package/packages/ui/src/components/index.ts +56 -1
- package/packages/ui/src/components/section-cards.tsx +18 -35
- package/packages/ui/src/components/skill-health-grid.tsx +47 -37
- package/packages/ui/src/lib/constants.tsx +0 -1
- package/packages/ui/src/primitives/card.tsx +1 -1
- package/packages/ui/src/primitives/checkbox.tsx +1 -1
- package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
- package/packages/ui/src/primitives/select.tsx +2 -2
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +182 -4
- package/skill/SKILL.md +130 -318
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Ingest.md +60 -2
- package/skill/{Workflows → workflows}/Initialize.md +16 -9
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
- package/skill/workflows/Registry.md +99 -0
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/Sync.md +3 -1
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
- package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
- package/cli/selftune/utils/html.ts +0 -27
- package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
# selftune Creator-Contributions Workflow
|
|
2
2
|
|
|
3
|
-
Manage the creator
|
|
3
|
+
Manage the **creator sharing setup** — the `selftune.contribute.json` file
|
|
4
|
+
bundled with a skill package.
|
|
4
5
|
|
|
5
6
|
This is **not** the same as:
|
|
6
|
-
- `selftune contributions` — end-user opt-in / opt-out
|
|
7
|
-
- `selftune contribute` — community export bundle
|
|
7
|
+
- `selftune contributions` — end-user **sharing preferences** (opt-in / opt-out)
|
|
8
|
+
- `selftune contribute` — community **export bundle** (anonymized data export)
|
|
9
|
+
- The signals dashboard — viewing aggregated **contributor signal data** from all contributors
|
|
8
10
|
|
|
9
11
|
## When to Use
|
|
10
12
|
|
|
@@ -45,8 +47,17 @@ selftune creator-contributions disable --skill <name> [--skill-path <path>]
|
|
|
45
47
|
## Notes
|
|
46
48
|
|
|
47
49
|
- This is local packaging/setup only. It does **not** upload creator-directed signals yet.
|
|
48
|
-
- The creator
|
|
50
|
+
- The `creator_id` field must be the creator's cloud user UUID (the `cloud_user_id` from alpha enrollment). This is the canonical identifier used to route signals back to the correct creator account.
|
|
51
|
+
- The creator ID is sourced from `--creator-id` or the local alpha identity's `cloud_user_id`.
|
|
49
52
|
- Use this workflow when the user is preparing a skill package.
|
|
53
|
+
- For the full creator lifecycle, read `references/creator-playbook.md` before shipping.
|
|
54
|
+
|
|
55
|
+
## Selftune Dogfood Config
|
|
56
|
+
|
|
57
|
+
The selftune skill itself ships a bundled `selftune.contribute.json` at
|
|
58
|
+
`oss/selftune/skill/selftune.contribute.json`. This is the selftune project
|
|
59
|
+
dogfooding its own creator-directed relay flow. The `creator_id` field is
|
|
60
|
+
set to the production selftune creator's cloud user UUID.
|
|
50
61
|
|
|
51
62
|
## Common Patterns
|
|
52
63
|
|
|
@@ -60,13 +71,14 @@ selftune creator-contributions disable --skill <name> [--skill-path <path>]
|
|
|
60
71
|
> Run `selftune creator-contributions enable --skill <name>`.
|
|
61
72
|
> If auto-discovery fails, rerun with `--skill-path /path/to/SKILL.md`.
|
|
62
73
|
> If no creator identity is available locally, rerun with `--creator-id <id>`.
|
|
63
|
-
>
|
|
74
|
+
> The command rejects non-UUID creator IDs and unsupported signal names.
|
|
75
|
+
> Example: `selftune creator-contributions enable --skill sc-search --skill-path ./skills/sc-search/SKILL.md --creator-id 550e8400-e29b-41d4-a716-446655440000 --signals trigger,grade,miss_category --message "Share privacy-safe usage signals with the skill creator." --privacy-url https://statechange.ai/privacy`
|
|
64
76
|
|
|
65
77
|
**User wants to enable creator contributions for a whole installed skill suite**
|
|
66
78
|
|
|
67
79
|
> Run `selftune creator-contributions enable --all --prefix sc-`.
|
|
68
80
|
> This is the fastest path when preparing a whole family of skills like State Change skills.
|
|
69
|
-
> Example: `selftune creator-contributions enable --all --prefix sc- --creator-id
|
|
81
|
+
> Example: `selftune creator-contributions enable --all --prefix sc- --creator-id 550e8400-e29b-41d4-a716-446655440000`
|
|
70
82
|
|
|
71
83
|
**User wants to stop bundling creator contribution config**
|
|
72
84
|
|
|
@@ -130,4 +130,4 @@ interactive mode is for user-directed improvements.
|
|
|
130
130
|
- **User needs a specific timezone (OpenClaw)** -- Run `selftune cron setup --platform openclaw --tz America/New_York`.
|
|
131
131
|
- **User asks what jobs are registered** -- Run `selftune cron list`. Shows a table of all selftune cron jobs with their schedules and descriptions.
|
|
132
132
|
- **User wants to remove cron automation** -- Run `selftune cron remove`. Preview first with `selftune cron remove --dry-run`.
|
|
133
|
-
- **Skill regressed after cron evolution** -- The watch job should catch this automatically. If not, run `selftune evolve rollback --skill <name> --skill-path <path>` manually. See `
|
|
133
|
+
- **Skill regressed after cron evolution** -- The watch job should catch this automatically. If not, run `selftune evolve rollback --skill <name> --skill-path <path>` manually. See `workflows/Rollback.md`.
|
|
@@ -22,6 +22,7 @@ generate JSONL from SQLite for debugging or offline analysis.
|
|
|
22
22
|
| Flag | Description | Default |
|
|
23
23
|
| --------------- | ----------------------------------------- | ------- |
|
|
24
24
|
| `--port <port>` | Custom port for the server | 3141 |
|
|
25
|
+
| `--restart` | Force-restart an existing dashboard on the target port | Off |
|
|
25
26
|
| `--no-open` | Start server without opening browser | Off |
|
|
26
27
|
| `--serve` | _(Deprecated)_ Alias for default behavior | — |
|
|
27
28
|
|
|
@@ -35,6 +36,16 @@ suggesting `selftune dashboard` instead.
|
|
|
35
36
|
The live server binds to `localhost:3141` by default. Use `--port` to
|
|
36
37
|
override.
|
|
37
38
|
|
|
39
|
+
If a healthy selftune dashboard is already running on the requested port,
|
|
40
|
+
`selftune dashboard` reuses it instead of failing. If the running standalone
|
|
41
|
+
dashboard version is older than the installed CLI, the command restarts it
|
|
42
|
+
automatically to pick up the update. Use `--restart` to force a restart
|
|
43
|
+
even when the versions match.
|
|
44
|
+
|
|
45
|
+
The dashboard client also polls `/api/health` for `spa_build_id`. If the server
|
|
46
|
+
is newer than the loaded client, the UI shows a reload prompt instead of silently
|
|
47
|
+
staying stale.
|
|
48
|
+
|
|
38
49
|
### Endpoints
|
|
39
50
|
|
|
40
51
|
| Method | Path | Description |
|
|
@@ -162,6 +173,7 @@ checked file paths.
|
|
|
162
173
|
```bash
|
|
163
174
|
selftune dashboard
|
|
164
175
|
selftune dashboard --port 8080
|
|
176
|
+
selftune dashboard --restart
|
|
165
177
|
selftune dashboard --no-open
|
|
166
178
|
```
|
|
167
179
|
|
|
@@ -182,6 +194,14 @@ to trigger watch, evolve, or rollback directly from the dashboard.
|
|
|
182
194
|
> Run `selftune dashboard`. The server provides real-time updates via SSE
|
|
183
195
|
> (~1 second latency).
|
|
184
196
|
|
|
197
|
+
**User just updated selftune and wants the dashboard to pick up the new UI**
|
|
198
|
+
|
|
199
|
+
> Run `selftune dashboard`. It reuses a healthy instance when possible and
|
|
200
|
+
> automatically restarts an older standalone dashboard version on the same port.
|
|
201
|
+
> If the user explicitly wants a restart, run `selftune dashboard --restart`.
|
|
202
|
+
> If the browser still has an older client loaded, the dashboard shows a reload
|
|
203
|
+
> prompt based on `/api/health` build metadata.
|
|
204
|
+
|
|
185
205
|
**Dashboard shows no data**
|
|
186
206
|
|
|
187
207
|
> Run `selftune doctor` to verify hooks are installed. If hooks are missing,
|
|
@@ -163,7 +163,7 @@ For each failed check, take the appropriate action:
|
|
|
163
163
|
| `evolution_audit` | Remove corrupted entries. Future operations will append clean entries. |
|
|
164
164
|
| `dashboard_freshness_mode` | This is an operator warning, not a broken install. Expect possible freshness gaps for SQLite-only writes, and run an export before any destructive recovery. |
|
|
165
165
|
| `skill_version_sync` | Run `bun run sync-version` to stamp SKILL.md from package.json. |
|
|
166
|
-
| `version_up_to_date` |
|
|
166
|
+
| `version_up_to_date` | Follow `.checks[].guidance.next_command` for the active install source. Common fixes are `npm install -g selftune@latest`, `bun add -g selftune@latest`, or `npx skills add selftune-dev/selftune`. |
|
|
167
167
|
|
|
168
168
|
### 4. Re-run Doctor
|
|
169
169
|
|
|
@@ -20,6 +20,24 @@ Invoke this workflow when the user requests any of the following:
|
|
|
20
20
|
selftune eval generate --skill <name> [options]
|
|
21
21
|
```
|
|
22
22
|
|
|
23
|
+
## Recommended Creator Loop
|
|
24
|
+
|
|
25
|
+
Use eval generation as step 1 of the default creator loop:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
selftune eval generate --skill <name>
|
|
29
|
+
selftune eval unit-test --skill <name> --generate --skill-path <path>
|
|
30
|
+
selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
|
|
31
|
+
selftune grade baseline --skill <name> --skill-path <path>
|
|
32
|
+
selftune evolve --skill <name> --skill-path <path> --with-baseline
|
|
33
|
+
selftune watch --skill <name>
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The command still writes the requested output path, and it now also mirrors a canonical copy into
|
|
37
|
+
`~/.selftune/eval-sets/<skill>.json` so the dashboard and `selftune status` can track whether eval
|
|
38
|
+
coverage exists. Once the earlier steps are complete, the creator-loop surfaces flip from
|
|
39
|
+
"needs testing" to "ready to deploy" and then "watching" after ship.
|
|
40
|
+
|
|
23
41
|
## Options
|
|
24
42
|
|
|
25
43
|
| Flag | Description | Default |
|
|
@@ -39,6 +57,8 @@ selftune eval generate --skill <name> [options]
|
|
|
39
57
|
| `--auto-synthetic` | Fall back to SKILL.md-based cold-start evals when no trusted triggers exist | Off |
|
|
40
58
|
| `--skill-path <path>` | Path to SKILL.md (required with `--synthetic`) | — |
|
|
41
59
|
| `--model <model>` | LLM model to use for synthetic generation | Agent default |
|
|
60
|
+
| `--blend` | Blend log-based and synthetic evals into one set | Off |
|
|
61
|
+
| `--help` | Show command help | Off |
|
|
42
62
|
|
|
43
63
|
## Output Format
|
|
44
64
|
|
|
@@ -49,11 +69,14 @@ selftune eval generate --skill <name> [options]
|
|
|
49
69
|
{
|
|
50
70
|
"query": "Make me a slide deck for the Q3 board meeting",
|
|
51
71
|
"should_trigger": true,
|
|
52
|
-
"invocation_type": "contextual"
|
|
72
|
+
"invocation_type": "contextual",
|
|
73
|
+
"source": "log",
|
|
74
|
+
"created_at": "2026-04-01T12:00:00Z"
|
|
53
75
|
},
|
|
54
76
|
{
|
|
55
77
|
"query": "What format should I use for a presentation?",
|
|
56
|
-
"should_trigger": false
|
|
78
|
+
"should_trigger": false,
|
|
79
|
+
"source": "synthetic"
|
|
57
80
|
}
|
|
58
81
|
]
|
|
59
82
|
```
|
|
@@ -61,6 +84,24 @@ selftune eval generate --skill <name> [options]
|
|
|
61
84
|
Each entry has `query` (string, max 500 chars), `should_trigger` (boolean),
|
|
62
85
|
and optional `invocation_type` (omitted when `--no-taxonomy` is set).
|
|
63
86
|
|
|
87
|
+
Entries also carry optional provenance fields:
|
|
88
|
+
|
|
89
|
+
- `source` — `"log"` (from real usage logs), `"synthetic"` (LLM-generated from SKILL.md), or `"blended"` (synthetic entry that survived dedup in a blended set)
|
|
90
|
+
- `created_at` — ISO timestamp of when the entry was created
|
|
91
|
+
|
|
92
|
+
Use `computeEvalSourceStats(entries)` to get aggregate provenance statistics:
|
|
93
|
+
|
|
94
|
+
```json
|
|
95
|
+
{
|
|
96
|
+
"total": 80,
|
|
97
|
+
"synthetic": 10,
|
|
98
|
+
"log": 50,
|
|
99
|
+
"blended": 20,
|
|
100
|
+
"oldest": "2026-03-01T00:00:00Z",
|
|
101
|
+
"newest": "2026-04-01T12:00:00Z"
|
|
102
|
+
}
|
|
103
|
+
```
|
|
104
|
+
|
|
64
105
|
### List Skills
|
|
65
106
|
|
|
66
107
|
```json
|
|
@@ -181,6 +222,30 @@ Use `--model` to override the default LLM model:
|
|
|
181
222
|
selftune eval generate --skill pptx --synthetic --skill-path ./skills/pptx/SKILL.md --model claude-sonnet-4-5-20250514
|
|
182
223
|
```
|
|
183
224
|
|
|
225
|
+
### Generate Blended Evals
|
|
226
|
+
|
|
227
|
+
When a skill has real log data but you want to fill coverage gaps with synthetic
|
|
228
|
+
entries, use `--blend` to combine both sources into one eval set.
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
selftune eval generate --skill pptx --blend --skill-path /path/to/skills/pptx/SKILL.md
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
The blending policy:
|
|
235
|
+
|
|
236
|
+
1. Keep ALL log-based entries (marked `source: "log"`)
|
|
237
|
+
2. Generate synthetic entries from SKILL.md
|
|
238
|
+
3. Deduplicate: drop any synthetic entry whose normalized Levenshtein distance to any log entry is < 0.3 (i.e., a near-duplicate of a real logged query)
|
|
239
|
+
4. Mark surviving synthetic entries as `source: "blended"`
|
|
240
|
+
5. Cap total entries at 2x the log-based count
|
|
241
|
+
|
|
242
|
+
This preserves real-world boundary cases from logs while filling underrepresented
|
|
243
|
+
invocation types with synthetic entries. The 2x cap prevents synthetic entries from
|
|
244
|
+
overwhelming log signal.
|
|
245
|
+
|
|
246
|
+
`--blend` requires a resolvable SKILL.md path. Use `--skill-path` or install the
|
|
247
|
+
skill locally so selftune can find it.
|
|
248
|
+
|
|
184
249
|
### Generate Evals (Log-Based)
|
|
185
250
|
|
|
186
251
|
Cross-reference `skill_usage_log.jsonl` (positive triggers) against
|
|
@@ -19,6 +19,23 @@ Invoke this workflow when the user requests any of the following:
|
|
|
19
19
|
selftune evolve --skill <name> --skill-path <path> [options]
|
|
20
20
|
```
|
|
21
21
|
|
|
22
|
+
## Recommended Creator Loop
|
|
23
|
+
|
|
24
|
+
Do not treat `evolve` as the first step when a creator asks whether a skill is
|
|
25
|
+
ready. The default loop is:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
selftune eval generate --skill <name> --skill-path <path>
|
|
29
|
+
selftune eval unit-test --skill <name> --generate --skill-path <path>
|
|
30
|
+
selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
|
|
31
|
+
selftune grade baseline --skill <name> --skill-path <path>
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Then move to a live `selftune evolve ...` or `selftune watch ...` run.
|
|
35
|
+
|
|
36
|
+
If canonical evals or stored unit-test results already exist, reuse them rather
|
|
37
|
+
than regenerating everything.
|
|
38
|
+
|
|
22
39
|
## Options
|
|
23
40
|
|
|
24
41
|
| Flag | Description | Default |
|
|
@@ -26,7 +43,7 @@ selftune evolve --skill <name> --skill-path <path> [options]
|
|
|
26
43
|
| `--skill <name>` | Skill name | Required |
|
|
27
44
|
| `--skill-path <path>` | Path to the skill's SKILL.md | Required |
|
|
28
45
|
| `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
|
|
29
|
-
| `--agent <name>` | Agent CLI to use (claude, codex, opencode)
|
|
46
|
+
| `--agent <name>` | Agent CLI to use (claude, codex, opencode, pi) | Auto-detected |
|
|
30
47
|
| `--dry-run` | Propose and validate without deploying | Off |
|
|
31
48
|
| `--confidence <n>` | Minimum confidence threshold (0-1) | 0.6 |
|
|
32
49
|
| `--max-iterations <n>` | Maximum retry iterations | 3 |
|
|
@@ -42,8 +59,10 @@ selftune evolve --skill <name> --skill-path <path> [options]
|
|
|
42
59
|
| `--gate-effort <level>` | Thinking effort for the final gate (`low|medium|high|max`) | None |
|
|
43
60
|
| `--adaptive-gate` | Escalate risky gate checks to `opus` + `high` effort | Off |
|
|
44
61
|
| `--proposal-model <model>` | Model for proposal generation LLM calls | None |
|
|
62
|
+
| `--validation-mode <mode>` | Validation strategy: `auto`, `replay`, or `judge` | `auto` |
|
|
45
63
|
| `--sync-first` | Refresh source-truth telemetry before generating evals/failure patterns | Off |
|
|
46
64
|
| `--sync-force` | Force a full source rescan during `--sync-first` | Off |
|
|
65
|
+
| `--help` | Show command help | Off |
|
|
47
66
|
|
|
48
67
|
## Output Format
|
|
49
68
|
|
|
@@ -83,37 +102,42 @@ Routing/body validation may also carry provenance fields such as:
|
|
|
83
102
|
- `validation_fixture_id` — fixture identifier when replay-backed validation is used
|
|
84
103
|
- `before_pass_rate` / `after_pass_rate` — only present when trigger validation actually ran; structural-guard exits do not emit synthetic pass rates
|
|
85
104
|
|
|
86
|
-
Most evolve runs today still validate through `llm_judge`.
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
target
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
105
|
+
Most evolve runs today still validate through `llm_judge`. Replay-backed
|
|
106
|
+
validation is only considered available when selftune can run a real
|
|
107
|
+
host/runtime replay for the target host. Today that means the Claude Code,
|
|
108
|
+
Codex, and OpenCode paths can stage a temporary local registry, apply the
|
|
109
|
+
candidate skill content, and observe the runtime's actual routing decision;
|
|
110
|
+
when that runtime path is unavailable, `auto` falls back to `llm_judge` and
|
|
111
|
+
`replay` errors explicitly instead of silently downgrading to fixture
|
|
112
|
+
simulation.
|
|
113
|
+
|
|
114
|
+
Description, routing, and full-body evolution now share the same public
|
|
115
|
+
validation contract: `auto` prefers replay and falls back to judge, `replay`
|
|
116
|
+
requires a replay path, and `judge` bypasses replay entirely. Audit and
|
|
117
|
+
evidence records may also include `validation_fallback_reason` when `auto`
|
|
118
|
+
had to fall back from replay to judge.
|
|
119
|
+
|
|
120
|
+
Replay stages the candidate into the target host's project-local registry:
|
|
121
|
+
Claude Code uses `.claude/skills`, Codex uses `.agents/skills`, and OpenCode
|
|
122
|
+
uses `.opencode/skills`. Validation records whether the runtime selected the
|
|
123
|
+
target skill, selected a competing skill, selected an unrelated skill, or made
|
|
124
|
+
no routing decision at all. Reads outside the staged skill set are treated as
|
|
125
|
+
replay failures even on negative evals, because they indicate the runtime left
|
|
126
|
+
the controlled evaluation surface.
|
|
127
|
+
|
|
128
|
+
For hosts without runtime replay support today, replay is not available. In
|
|
129
|
+
`auto` mode selftune falls back to `llm_judge`; in `replay` mode it exits with
|
|
130
|
+
`REPLAY_UNAVAILABLE`. Do not describe fixture-only surface matching as replay
|
|
131
|
+
validation in user-facing summaries.
|
|
108
132
|
|
|
109
133
|
Replay parsing is intentionally conservative: unreadable skill files degrade to
|
|
110
134
|
empty surfaces instead of throwing, and malformed routing rows with empty
|
|
111
|
-
trigger cells are ignored rather than treated as valid triggers.
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
skill. Reads outside the staged skill set are treated as replay
|
|
115
|
-
than benign negatives, because they indicate the runtime left
|
|
116
|
-
evaluation surface.
|
|
135
|
+
trigger cells are ignored rather than treated as valid triggers. Replay also
|
|
136
|
+
normalizes observed skill reads against the staged workspace, so relative skill
|
|
137
|
+
paths from Claude, Codex, or OpenCode still count as evidence for the target or
|
|
138
|
+
competing skill. Reads outside the staged skill set are treated as replay
|
|
139
|
+
failures rather than benign negatives, because they indicate the runtime left
|
|
140
|
+
the controlled evaluation surface.
|
|
117
141
|
|
|
118
142
|
## Parsing Instructions
|
|
119
143
|
|
|
@@ -281,6 +305,40 @@ The candidate is tested against the full eval set:
|
|
|
281
305
|
If validation fails, the command retries up to `--max-iterations` times
|
|
282
306
|
with adjusted proposals.
|
|
283
307
|
|
|
308
|
+
### Validation Mode (`--validation-mode`)
|
|
309
|
+
|
|
310
|
+
The `--validation-mode` flag controls which validation engine is used for
|
|
311
|
+
description proposals. Three modes are available:
|
|
312
|
+
|
|
313
|
+
| Mode | Behavior |
|
|
314
|
+
| -------- | ------------------------------------------------------------------------ |
|
|
315
|
+
| `auto` | Try replay-backed validation first; fall back to LLM judge if unavailable |
|
|
316
|
+
| `replay` | Replay engine only; error if no replay fixture or runner is available |
|
|
317
|
+
| `judge` | LLM judge only (legacy path via `validateProposal`) |
|
|
318
|
+
|
|
319
|
+
The default is `auto`, which provides the strongest available signal without
|
|
320
|
+
requiring manual fixture configuration. When replay is available, it stages the
|
|
321
|
+
candidate skill content into a temporary local registry and records the
|
|
322
|
+
runtime's actual routing decision per eval entry. For description evolution,
|
|
323
|
+
that means the proposed description is applied to the target skill before
|
|
324
|
+
replay. When replay is not available, `auto` falls back to the LLM judge and
|
|
325
|
+
logs the fallback.
|
|
326
|
+
|
|
327
|
+
The actual mode used is recorded as `validation_mode` in audit entries
|
|
328
|
+
(`llm_judge`, `host_replay`, or `structural_guard`), along with
|
|
329
|
+
`validation_agent` and `validation_fixture_id` when applicable.
|
|
330
|
+
|
|
331
|
+
```bash
|
|
332
|
+
# Default: auto (replay-first, judge fallback)
|
|
333
|
+
selftune evolve --skill pptx --skill-path ./skills/pptx/SKILL.md
|
|
334
|
+
|
|
335
|
+
# Force replay only (error if unavailable)
|
|
336
|
+
selftune evolve --skill pptx --skill-path ./skills/pptx/SKILL.md --validation-mode replay
|
|
337
|
+
|
|
338
|
+
# Force judge only (legacy behavior)
|
|
339
|
+
selftune evolve --skill pptx --skill-path ./skills/pptx/SKILL.md --validation-mode judge
|
|
340
|
+
```
|
|
341
|
+
|
|
284
342
|
### Aggregate Metrics To Report
|
|
285
343
|
|
|
286
344
|
When summarizing an evolution run, include these aggregate metrics rather
|
|
@@ -378,6 +436,37 @@ selftune evolve --skill X --skill-path Y --cheap-loop --gate-model opus --gate-e
|
|
|
378
436
|
selftune evolve --skill X --skill-path Y --proposal-model haiku --validation-model sonnet
|
|
379
437
|
```
|
|
380
438
|
|
|
439
|
+
## Apply Contributor Proposal
|
|
440
|
+
|
|
441
|
+
The `apply-proposal` subcommand fetches an approved contributor aggregate
|
|
442
|
+
proposal from the cloud dashboard and applies it to the local SKILL.md.
|
|
443
|
+
|
|
444
|
+
```bash
|
|
445
|
+
selftune evolve apply-proposal --id <proposal-id> --skill-path <path> [--dry-run]
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
### Apply-Proposal Options
|
|
449
|
+
|
|
450
|
+
| Flag | Description | Default |
|
|
451
|
+
| ----------------- | ----------------------------------------------- | -------- |
|
|
452
|
+
| `--id <uuid>` | Proposal UUID from the dashboard | Required |
|
|
453
|
+
| `--skill-path <path>` | Path to the target SKILL.md | Required |
|
|
454
|
+
| `--dry-run` | Preview the proposal without writing to disk | Off |
|
|
455
|
+
|
|
456
|
+
### Apply-Proposal Flow
|
|
457
|
+
|
|
458
|
+
1. Fetch the proposal via `GET /api/v1/proposals/:id`
|
|
459
|
+
2. Verify `proposed_by` is `contributor_aggregate` and status is `approved`
|
|
460
|
+
3. Display a summary (type, reason, pass rate change, diff preview)
|
|
461
|
+
4. If not `--dry-run`: back up SKILL.md, apply the proposed value, and
|
|
462
|
+
`PATCH /api/v1/proposals/:id` with status `applied`
|
|
463
|
+
|
|
464
|
+
### When to Use
|
|
465
|
+
|
|
466
|
+
- After reviewing and approving a contributor proposal in the cloud dashboard
|
|
467
|
+
- When community signal suggests a description or body improvement
|
|
468
|
+
- As the final step in the contributor-driven evolution workflow
|
|
469
|
+
|
|
381
470
|
## Common Patterns
|
|
382
471
|
|
|
383
472
|
**User asks to evolve a specific skill (e.g., "evolve the pptx skill"):**
|
|
@@ -398,7 +487,7 @@ Also check if the eval set has contradictory expectations.
|
|
|
398
487
|
|
|
399
488
|
**Agent CLI override needed:**
|
|
400
489
|
The evolve command auto-detects the installed agent CLI.
|
|
401
|
-
Use `--agent <name>` to override (claude, codex, opencode).
|
|
490
|
+
Use `--agent <name>` to override (claude, codex, opencode, pi).
|
|
402
491
|
|
|
403
492
|
## Subagent Escalation
|
|
404
493
|
|
|
@@ -10,6 +10,22 @@ LLM validates them through a 3-gate pipeline.
|
|
|
10
10
|
selftune evolve body --skill <name> --skill-path <path> --target <target> [options]
|
|
11
11
|
```
|
|
12
12
|
|
|
13
|
+
## Recommended Creator Loop
|
|
14
|
+
|
|
15
|
+
Before mutating routing or the full body, make sure the creator trust loop is in
|
|
16
|
+
place:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
selftune eval generate --skill <name> --skill-path <path>
|
|
20
|
+
selftune eval unit-test --skill <name> --generate --skill-path <path>
|
|
21
|
+
selftune evolve body --skill <name> --skill-path <path> --target <target> --dry-run --validation-mode replay
|
|
22
|
+
selftune grade baseline --skill <name> --skill-path <path>
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
If replay validation or the baseline is still missing, prefer filling that gap
|
|
26
|
+
before live deployment. Body and routing evolution are much harder to trust than
|
|
27
|
+
description-only changes when the creator loop is incomplete.
|
|
28
|
+
|
|
13
29
|
## Options
|
|
14
30
|
|
|
15
31
|
| Flag | Description | Default |
|
|
@@ -26,6 +42,7 @@ selftune evolve body --skill <name> --skill-path <path> --target <target> [optio
|
|
|
26
42
|
| `--max-iterations <n>` | Maximum refinement iterations | 3 |
|
|
27
43
|
| `--task-description <text>` | Context for the evolution goal | None |
|
|
28
44
|
| `--validation-model <model>` | Model for trigger-check validation calls (overrides `--student-model` for validation) | None |
|
|
45
|
+
| `--validation-mode <mode>` | Validation strategy: `auto`, `replay`, or `judge` | `auto` |
|
|
29
46
|
| `--teacher-effort <level>` | Effort level for teacher LLM: `low`, `medium`, `high`, `max` | `high` |
|
|
30
47
|
| `--review` | Run `evolution-reviewer` subagent as Gate 4 before deployment | Off |
|
|
31
48
|
| `--few-shot <paths>` | Comma-separated paths to example SKILL.md files | None |
|
|
@@ -51,7 +68,7 @@ Every proposal passes through three sequential gates:
|
|
|
51
68
|
| Gate | Type | What it checks | Cost |
|
|
52
69
|
| ----------------------------- | ----------- | ----------------------------------------------------------------------------------------------- | -------- |
|
|
53
70
|
| **Gate 1: Structural** | Pure code | YAML frontmatter present, `# Title` exists, `## Workflow Routing` preserved if original had one | Free |
|
|
54
|
-
| **Gate 2: Trigger Accuracy** |
|
|
71
|
+
| **Gate 2: Trigger Accuracy** | Replay or student LLM | Runtime replay when available; otherwise YES/NO trigger check per eval entry | Cheap |
|
|
55
72
|
| **Gate 3: Quality** | Student LLM | Body clarity and completeness score (0.0-1.0) | Cheap |
|
|
56
73
|
| **Gate 4: Reviewer** (opt-in) | Subagent | `evolution-reviewer` multi-turn review — reads files, checks evidence, APPROVE/REJECT verdict | Moderate |
|
|
57
74
|
|
|
@@ -141,6 +158,25 @@ Few-shot examples from `--few-shot` paths provide structural guidance.
|
|
|
141
158
|
Each gate runs in sequence. If a gate fails, the teacher receives the
|
|
142
159
|
failure details and generates a refined proposal.
|
|
143
160
|
|
|
161
|
+
### Validation Mode (`--validation-mode`)
|
|
162
|
+
|
|
163
|
+
`evolve body` uses the same validation contract as `evolve`:
|
|
164
|
+
|
|
165
|
+
| Mode | Behavior |
|
|
166
|
+
| -------- | ------------------------------------------------------------------------ |
|
|
167
|
+
| `auto` | Try replay-backed validation first; fall back to LLM judge if unavailable |
|
|
168
|
+
| `replay` | Replay engine only; error if no replay fixture or runner is available |
|
|
169
|
+
| `judge` | LLM judge only |
|
|
170
|
+
|
|
171
|
+
When replay is available, selftune stages the candidate skill content into a
|
|
172
|
+
temporary local registry before running the real host/runtime replay. Claude
|
|
173
|
+
Code uses `.claude/skills`, Codex uses `.agents/skills`, and OpenCode uses
|
|
174
|
+
`.opencode/skills`. Routing targets stage the candidate `## Workflow Routing`
|
|
175
|
+
section; body targets stage the full candidate body while preserving
|
|
176
|
+
frontmatter and title. When replay is not available, `auto` falls back to the
|
|
177
|
+
LLM judge and records the `validation_fallback_reason` in audit/evidence
|
|
178
|
+
output.
|
|
179
|
+
|
|
144
180
|
### 6. Deploy or Preview
|
|
145
181
|
|
|
146
182
|
If `--dry-run`, prints the proposal without deploying. Otherwise:
|
|
@@ -164,6 +200,10 @@ If `--dry-run`, prints the proposal without deploying. Otherwise:
|
|
|
164
200
|
|
|
165
201
|
> `selftune evolve body --skill pptx --skill-path /path/SKILL.md --target body --teacher-model opus --student-model haiku`
|
|
166
202
|
|
|
203
|
+
**"Force replay-only validation for a routing change"**
|
|
204
|
+
|
|
205
|
+
> `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target routing --validation-mode replay`
|
|
206
|
+
|
|
167
207
|
**"Preview what would change"**
|
|
168
208
|
|
|
169
209
|
> Always start with `--dry-run` to review the proposal before deploying.
|
|
@@ -17,7 +17,7 @@ selftune grade --skill <name> [options]
|
|
|
17
17
|
| `--expectations "..."` | Explicit expectations (semicolon-separated) | Auto-derived |
|
|
18
18
|
| `--evals-json <path>` | Pre-built eval set JSON file | None |
|
|
19
19
|
| `--eval-id <n>` | Specific eval ID to grade from the eval set | None |
|
|
20
|
-
| `--agent <name>` | Agent CLI to use (claude, codex, opencode) | Auto-detected |
|
|
20
|
+
| `--agent <name>` | Agent CLI to use (claude, codex, opencode, pi) | Auto-detected |
|
|
21
21
|
|
|
22
22
|
## Output Format
|
|
23
23
|
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
> **Note:** Claude Code is the fully supported platform. Codex, OpenCode, and OpenClaw adapters are experimental and may have gaps.
|
|
4
4
|
|
|
5
5
|
Import sessions from agent platforms into the shared selftune log format.
|
|
6
|
-
Covers five sub-commands: `ingest claude`, `ingest codex`, `ingest opencode`,
|
|
7
|
-
`ingest openclaw`, and `ingest wrap-codex`.
|
|
6
|
+
Covers six sub-commands: `ingest claude`, `ingest codex`, `ingest opencode`,
|
|
7
|
+
`ingest openclaw`, `ingest pi`, and `ingest wrap-codex`.
|
|
8
8
|
|
|
9
9
|
## When to Use Each
|
|
10
10
|
|
|
@@ -14,6 +14,7 @@ Covers five sub-commands: `ingest claude`, `ingest codex`, `ingest opencode`,
|
|
|
14
14
|
| `ingest codex` | Codex | Batch | Import existing Codex rollout logs |
|
|
15
15
|
| `ingest opencode` | OpenCode | Batch | Import existing OpenCode sessions |
|
|
16
16
|
| `ingest openclaw` | OpenClaw | Batch | Import existing OpenClaw agent sessions |
|
|
17
|
+
| `ingest pi` | Pi | Batch | Import existing Pi agent sessions |
|
|
17
18
|
| `ingest wrap-codex` | Codex | Real-time | Wrap `codex exec` to capture telemetry live |
|
|
18
19
|
|
|
19
20
|
---
|
|
@@ -200,6 +201,55 @@ Writes to:
|
|
|
200
201
|
|
|
201
202
|
---
|
|
202
203
|
|
|
204
|
+
## ingest pi
|
|
205
|
+
|
|
206
|
+
Batch ingest Pi agent session histories into the shared JSONL schema.
|
|
207
|
+
|
|
208
|
+
### Default Command
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
selftune ingest pi
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Options
|
|
215
|
+
|
|
216
|
+
| Flag | Description |
|
|
217
|
+
| ----------------------- | ------------------------------------------------------------------ |
|
|
218
|
+
| `--sessions-dir <path>` | Override default `~/.pi/agent/sessions/` directory |
|
|
219
|
+
| `--since <date>` | Only ingest sessions modified after this date (e.g., `2026-01-01`) |
|
|
220
|
+
| `--dry-run` | Show what would be ingested without writing to logs |
|
|
221
|
+
| `--force` | Re-ingest all sessions, ignoring the marker file |
|
|
222
|
+
| `--verbose` / `-v` | Show per-session progress during ingestion |
|
|
223
|
+
|
|
224
|
+
### Source
|
|
225
|
+
|
|
226
|
+
Reads from `~/.pi/agent/sessions/`. Each session file contains Pi agent
|
|
227
|
+
conversation history in JSONL format.
|
|
228
|
+
|
|
229
|
+
### Output
|
|
230
|
+
|
|
231
|
+
Writes to:
|
|
232
|
+
|
|
233
|
+
- `~/.claude/all_queries_log.jsonl` -- extracted user queries
|
|
234
|
+
- `~/.claude/session_telemetry_log.jsonl` -- per-session metrics with `source: "pi"`
|
|
235
|
+
- `~/.claude/skill_usage_log.jsonl` -- skill triggers with `source: "pi"`
|
|
236
|
+
|
|
237
|
+
### Steps
|
|
238
|
+
|
|
239
|
+
1. Run `selftune ingest pi --dry-run` to preview what would be ingested
|
|
240
|
+
2. Run `selftune ingest pi` to ingest all sessions
|
|
241
|
+
3. Run `selftune doctor` to confirm logs are healthy
|
|
242
|
+
4. Run `selftune eval generate --list-skills` to see if the ingested sessions appear
|
|
243
|
+
|
|
244
|
+
### Notes
|
|
245
|
+
|
|
246
|
+
- Idempotent: uses a marker file to track which sessions have already been ingested.
|
|
247
|
+
Safe to run repeatedly. Use `--force` to re-ingest everything.
|
|
248
|
+
- Skill detection heuristic: identifies skills by checking for `SKILL.md` file reads in
|
|
249
|
+
tool calls and by matching known skill names in assistant text content.
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
203
253
|
## ingest wrap-codex
|
|
204
254
|
|
|
205
255
|
Wrap `codex exec` with real-time telemetry capture. Drop-in replacement
|
|
@@ -269,6 +319,14 @@ through hooks.
|
|
|
269
319
|
|
|
270
320
|
> Run `selftune ingest openclaw --since 2026-02-01` with an appropriate date.
|
|
271
321
|
|
|
322
|
+
**"Ingest Pi sessions"**
|
|
323
|
+
|
|
324
|
+
> Run `selftune ingest pi`. It reads from `~/.pi/agent/sessions/` automatically.
|
|
325
|
+
|
|
326
|
+
**"Import only recent Pi sessions"**
|
|
327
|
+
|
|
328
|
+
> Run `selftune ingest pi --since 2026-02-01` with an appropriate date.
|
|
329
|
+
|
|
272
330
|
**"Run codex through selftune"**
|
|
273
331
|
|
|
274
332
|
> Use `selftune ingest wrap-codex -- <codex args>` instead of `codex exec <args>` directly.
|