selftune 0.2.31 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/routes/actions.ts +81 -15
  50. package/cli/selftune/routes/overview.ts +1 -1
  51. package/cli/selftune/routes/skill-report.ts +147 -2
  52. package/cli/selftune/run.ts +18 -0
  53. package/cli/selftune/schedule.ts +3 -3
  54. package/cli/selftune/search-run.ts +703 -0
  55. package/cli/selftune/status.ts +35 -11
  56. package/cli/selftune/testing-readiness.ts +431 -40
  57. package/cli/selftune/types.ts +316 -0
  58. package/cli/selftune/utils/eval-readiness.ts +1 -0
  59. package/cli/selftune/utils/json-output.ts +11 -0
  60. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  61. package/cli/selftune/utils/query-filter.ts +82 -1
  62. package/cli/selftune/utils/tui.ts +85 -2
  63. package/cli/selftune/verify.ts +205 -0
  64. package/cli/selftune/workflows/proposals.ts +1 -1
  65. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  66. package/cli/selftune/workflows/workflows.ts +4 -4
  67. package/package.json +1 -1
  68. package/skill/SKILL.md +148 -85
  69. package/skill/references/cli-quick-reference.md +16 -1
  70. package/skill/references/creator-playbook.md +31 -10
  71. package/skill/workflows/Baseline.md +8 -9
  72. package/skill/workflows/Contributions.md +4 -4
  73. package/skill/workflows/Create.md +173 -0
  74. package/skill/workflows/CreateTestDeploy.md +34 -30
  75. package/skill/workflows/Cron.md +2 -2
  76. package/skill/workflows/Dashboard.md +3 -3
  77. package/skill/workflows/Evals.md +13 -7
  78. package/skill/workflows/Evolve.md +75 -32
  79. package/skill/workflows/EvolveBody.md +22 -15
  80. package/skill/workflows/Hook.md +1 -1
  81. package/skill/workflows/Improve.md +168 -0
  82. package/skill/workflows/Initialize.md +3 -3
  83. package/skill/workflows/Orchestrate.md +49 -12
  84. package/skill/workflows/Publish.md +100 -0
  85. package/skill/workflows/Run.md +72 -0
  86. package/skill/workflows/Schedule.md +2 -2
  87. package/skill/workflows/SearchRun.md +89 -0
  88. package/skill/workflows/SignalsDashboard.md +2 -2
  89. package/skill/workflows/UnitTest.md +13 -4
  90. package/skill/workflows/Verify.md +136 -0
  91. package/skill/workflows/Watch.md +114 -47
  92. package/skill/workflows/Workflows.md +13 -8
  93. package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
  94. package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
  95. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -10,21 +10,25 @@ LLM validates them through a 3-gate pipeline.
10
10
  selftune evolve body --skill <name> --skill-path <path> --target <target> [options]
11
11
  ```
12
12
 
13
- ## Recommended Creator Loop
13
+ ## Recommended Package Evaluation Pipeline
14
14
 
15
- Before mutating routing or the full body, make sure the creator trust loop is in
16
- place:
15
+ Before mutating routing or the full body, make sure the package evaluation
16
+ pipeline is in place:
17
17
 
18
18
  ```bash
19
+ selftune verify --skill-path <path>
19
20
  selftune eval generate --skill <name> --skill-path <path>
21
+ selftune verify --skill-path <path>
20
22
  selftune eval unit-test --skill <name> --generate --skill-path <path>
23
+ selftune create replay --skill-path <path> --mode package
24
+ selftune create baseline --skill-path <path> --mode package
25
+ selftune verify --skill-path <path>
21
26
  selftune evolve body --skill <name> --skill-path <path> --target <target> --dry-run --validation-mode replay
22
- selftune grade baseline --skill <name> --skill-path <path>
23
27
  ```
24
28
 
25
29
  If replay validation or the baseline is still missing, prefer filling that gap
26
30
  before live deployment. Body and routing evolution are much harder to trust than
27
- description-only changes when the creator loop is incomplete.
31
+ description-only changes when the package evaluation pipeline is incomplete.
28
32
 
29
33
  ## Options
30
34
 
@@ -40,6 +44,7 @@ description-only changes when the creator loop is incomplete.
40
44
  | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
41
45
  | `--dry-run` | Propose and validate without deploying | Off |
42
46
  | `--max-iterations <n>` | Maximum refinement iterations | 3 |
47
+ | `--confidence <n>` | Low-confidence review threshold (warning/escalation only) | 0.6 |
43
48
  | `--task-description <text>` | Context for the evolution goal | None |
44
49
  | `--validation-model <model>` | Model for trigger-check validation calls (overrides `--student-model` for validation) | None |
45
50
  | `--validation-mode <mode>` | Validation strategy: `auto`, `replay`, or `judge` | `auto` |
@@ -65,15 +70,17 @@ teacher generates a complete replacement, validated through 3 gates.
65
70
 
66
71
  Every proposal passes through three sequential gates:
67
72
 
68
- | Gate | Type | What it checks | Cost |
69
- | ----------------------------- | ----------- | ----------------------------------------------------------------------------------------------- | -------- |
70
- | **Gate 1: Structural** | Pure code | YAML frontmatter present, `# Title` exists, `## Workflow Routing` preserved if original had one | Free |
71
- | **Gate 2: Trigger Accuracy** | Replay or student LLM | Runtime replay when available; otherwise YES/NO trigger check per eval entry | Cheap |
72
- | **Gate 3: Quality** | Student LLM | Body clarity and completeness score (0.0-1.0) | Cheap |
73
- | **Gate 4: Reviewer** (opt-in) | Subagent | `evolution-reviewer` multi-turn review — reads files, checks evidence, APPROVE/REJECT verdict | Moderate |
73
+ | Gate | Type | What it checks | Cost |
74
+ | ----------------------------- | --------------------- | ----------------------------------------------------------------------------------------------- | -------- |
75
+ | **Gate 1: Structural** | Pure code | YAML frontmatter present, `# Title` exists, `## Workflow Routing` preserved if original had one | Free |
76
+ | **Gate 2: Trigger Accuracy** | Replay or student LLM | Runtime replay when available; otherwise YES/NO trigger check per eval entry | Cheap |
77
+ | **Gate 3: Quality** | Student LLM | Body clarity and completeness score (0.0-1.0) | Cheap |
78
+ | **Gate 4: Reviewer** (opt-in) | Subagent | `evolution-reviewer` multi-turn review — reads files, checks evidence, APPROVE/REJECT verdict | Moderate |
74
79
 
75
80
  If any gate fails, the teacher receives structured feedback and generates
76
81
  a refined proposal. This repeats up to `--max-iterations` times.
82
+ Low confidence does not reject a proposal by itself; it is treated as review
83
+ metadata and may justify extra scrutiny.
77
84
 
78
85
  ## Steps
79
86
 
@@ -162,11 +169,11 @@ failure details and generates a refined proposal.
162
169
 
163
170
  `evolve body` uses the same validation contract as `evolve`:
164
171
 
165
- | Mode | Behavior |
166
- | -------- | ------------------------------------------------------------------------ |
172
+ | Mode | Behavior |
173
+ | -------- | ------------------------------------------------------------------------- |
167
174
  | `auto` | Try replay-backed validation first; fall back to LLM judge if unavailable |
168
- | `replay` | Replay engine only; error if no replay fixture or runner is available |
169
- | `judge` | LLM judge only |
175
+ | `replay` | Replay engine only; error if no replay fixture or runner is available |
176
+ | `judge` | LLM judge only |
170
177
 
171
178
  When replay is available, selftune stages the candidate skill content into a
172
179
  temporary local registry before running the real host/runtime replay. Claude
@@ -44,7 +44,7 @@ canonical prompt record.
44
44
  Fires when a Claude Code session ends. Reads the session transcript JSONL and
45
45
  extracts process-level telemetry (tool calls, errors, skills triggered, token
46
46
  counts). Writes one record per session to SQLite with a JSONL backup. May
47
- trigger a reactive `selftune orchestrate` spawn if conditions are met.
47
+ trigger a reactive `selftune run` spawn if conditions are met.
48
48
 
49
49
  ### skill-eval
50
50
 
@@ -0,0 +1,168 @@
1
+ # selftune Improve Workflow
2
+
3
+ Use this when the user wants to make a skill better based on measured evidence.
4
+
5
+ ## What Improve Means
6
+
7
+ `Improve` is the primary lifecycle workflow for bounded skill evolution.
8
+
9
+ The lifecycle entrypoint is:
10
+
11
+ ```bash
12
+ selftune improve --skill <name> --skill-path <path> [--scope auto|description|routing|body|package] [--agent AGENT] [--eval-set PATH] [--dry-run] [--validation-mode auto|replay|judge]
13
+ ```
14
+
15
+ ## Options
16
+
17
+ | Flag | Description |
18
+ |------|-------------|
19
+ | `--scope` | Improvement scope: `auto`, `description`, `routing`, `body`, or `package` |
20
+ | `--skill` | Skill name |
21
+ | `--skill-path` | Path to `SKILL.md` |
22
+ | `--agent` | Agent CLI to use; for body/routing this sets both teacher and student agents |
23
+ | `--eval-set` | Override the eval set JSON path |
24
+ | `--dry-run` | Validate candidate changes without deploying |
25
+ | `--validation-mode` | Validation strategy: `auto`, `replay`, or `judge` |
26
+ | `--help` | Show command help |
27
+
28
+ `improve` chooses an underlying command surface:
29
+
30
+ - `selftune evolve ...` for description and trigger-surface changes
31
+ - `selftune evolve body ... --target routing` for workflow-routing changes
32
+ - `selftune evolve body ... --target body` for larger body changes
33
+ - `selftune search-run ...` for bounded package search across routing/body variants
34
+
35
+ Always prefer the smallest mutation surface that matches the measured problem.
36
+
37
+ ## Preconditions
38
+
39
+ Before improving a skill, make sure the skill is already verified enough to
40
+ trust the result:
41
+
42
+ - evals exist
43
+ - unit tests exist when needed
44
+ - replay evidence exists
45
+ - baseline or equivalent value evidence exists when the change is high stakes
46
+
47
+ If those are missing, route to `Verify` first.
48
+
49
+ ## Scope Selection
50
+
51
+ ### Description scope
52
+
53
+ Use:
54
+
55
+ ```bash
56
+ selftune improve --skill <name> --skill-path <path> --scope description --dry-run --validation-mode replay
57
+ ```
58
+
59
+ Choose this when:
60
+
61
+ - undertriggering is mostly a wording problem
62
+ - missed queries cluster around synonyms or phrasing
63
+ - the workflow itself is fine once the skill triggers
64
+
65
+ ### Routing scope
66
+
67
+ Use:
68
+
69
+ ```bash
70
+ selftune improve --skill <name> --skill-path <path> --scope routing --dry-run --validation-mode replay
71
+ ```
72
+
73
+ Choose this when:
74
+
75
+ - the skill triggers but chooses the wrong workflow
76
+ - workflow routing is incomplete or ambiguous
77
+ - the problem is structural routing, not top-level matching
78
+
79
+ ### Body scope
80
+
81
+ Use:
82
+
83
+ ```bash
84
+ selftune improve --skill <name> --skill-path <path> --scope body --dry-run --validation-mode replay
85
+ ```
86
+
87
+ Choose this when:
88
+
89
+ - the skill triggers and routes correctly but executes poorly
90
+ - instructions inside the body are incomplete, misleading, or stale
91
+
92
+ ### Package scope
93
+
94
+ Use:
95
+
96
+ ```bash
97
+ selftune improve --skill <name> --skill-path <path> --scope package --eval-set <path>
98
+ ```
99
+
100
+ Choose this when:
101
+
102
+ - the measured gap spans routing and body together
103
+ - you want frontier-backed candidate comparison instead of a single mutation
104
+ - you need a bounded review loop before deciding what to publish
105
+
106
+ Package scope always runs bounded search through the shared package evaluator.
107
+ Without `--dry-run`, the winning candidate is promoted back into the draft
108
+ package automatically. `--validation-mode judge` is not supported here.
109
+
110
+ ## Default Approach
111
+
112
+ 1. start with the smallest scope that matches the evidence
113
+ 2. let `--scope auto` choose bounded package search automatically when the
114
+ target already has package evidence or a draft package manifest; otherwise
115
+ it falls back to description-surface evolution
116
+ 3. use `--scope package` when the evidence points to a multi-surface package
117
+ problem and you want to force package search explicitly
118
+ 4. prefer `--dry-run` first
119
+ 5. use replay-backed validation when available
120
+ 6. only deploy once the proposal clears the trust bar
121
+ 7. after deployment, hand off to `Watch`
122
+
123
+ ## After Improve
124
+
125
+ If a change is deployed, continue with:
126
+
127
+ ```bash
128
+ selftune watch --skill <name> --skill-path <path>
129
+ ```
130
+
131
+ For draft packages, use `Publish` instead of treating improve as the first ship
132
+ path.
133
+
134
+ ## Which workflows to load next
135
+
136
+ - missing trust evidence -> `workflows/Verify.md`
137
+ - explicit description-only evolution details -> `workflows/Evolve.md`
138
+ - explicit routing/body mutation details -> `workflows/EvolveBody.md`
139
+ - monitoring and rollback -> `workflows/Watch.md`
140
+
141
+ ## Common Patterns
142
+
143
+ **"Make this skill better."**
144
+
145
+ > Use `Improve`. Pick the smallest scope that fits the measured gap, or start
146
+ > with `--scope auto`. For draft packages or skills with package-frontier
147
+ > evidence, `--scope auto` now routes into bounded package search by default.
148
+
149
+ **"It undertriggers."**
150
+
151
+ > Start with description scope unless evidence points to routing issues.
152
+
153
+ **"The wrong workflow fires."**
154
+
155
+ > Use routing scope, not description scope.
156
+
157
+ **"The skill fires but performs badly."**
158
+
159
+ > Use body scope after verify is in place.
160
+
161
+ **"I want to compare a few full package candidates first."**
162
+
163
+ > Use package scope so selftune runs bounded search instead of a single edit.
164
+
165
+ Bounded package search now prefers reflective proposals first, then
166
+ eval-informed targeted variants, then deterministic fallback. When routing and
167
+ body both produce accepted improvements, it also evaluates a merged candidate
168
+ before final winner selection.
@@ -255,7 +255,7 @@ selftune recover --full --force
255
255
  ### 9. Autonomy Scheduling
256
256
 
257
257
  Init automatically installs OS-level scheduling (launchd on macOS, cron/systemd
258
- on Linux) so `selftune orchestrate` runs in the background. This is equivalent
258
+ on Linux) so `selftune run` runs in the background. This is equivalent
259
259
  to running `selftune cron setup` manually.
260
260
 
261
261
  Skip with `--no-autonomy` if you prefer manual orchestration only.
@@ -377,9 +377,9 @@ prompt/query text in addition to skill/session/evolution metadata.
377
377
 
378
378
  ### Upload Behavior
379
379
 
380
- Once enrolled, `selftune orchestrate` automatically uploads new session,
380
+ Once enrolled, `selftune run` automatically uploads new session,
381
381
  invocation, and evolution data to the cloud API at the end of each run.
382
- This upload step is fail-open -- errors never block the orchestrate loop.
382
+ This upload step is fail-open -- errors never block the autonomous loop.
383
383
  Use `selftune alpha upload` for manual uploads or `selftune alpha upload --dry-run`
384
384
  to preview what would be sent.
385
385
 
@@ -2,11 +2,18 @@
2
2
 
3
3
  Run the autonomy-first selftune loop in one command.
4
4
 
5
- `selftune orchestrate` is the primary closed-loop entrypoint. It runs
5
+ This remains the underlying runtime, but the simplified product surface should
6
+ teach it as `Run`.
7
+
8
+ `selftune orchestrate` is the current closed-loop entrypoint. It runs
6
9
  source-truth sync, computes current skill health, selects candidates,
7
10
  deploys validated low-risk description changes autonomously, and watches
8
11
  recent changes with auto-rollback enabled.
9
12
 
13
+ If the user asks to "run selftune", "run the loop", or "operate autonomously",
14
+ route through `workflows/Run.md` first and use this file for the exact
15
+ underlying behavior and flags.
16
+
10
17
  ## When to Use
11
18
 
12
19
  - You want the full autonomous loop, not isolated subcommands
@@ -17,6 +24,7 @@ recent changes with auto-rollback enabled.
17
24
  ## Default Command
18
25
 
19
26
  ```bash
27
+ selftune run
20
28
  selftune orchestrate
21
29
  ```
22
30
 
@@ -57,6 +65,7 @@ proposalModel = haiku
57
65
  - Sync source-truth telemetry first
58
66
  - Auto-grade up to 5 ungraded skills that have session data (enables evolution on first run after ingest)
59
67
  - Prioritize critical/warning/ungraded skills with real missed-query signal
68
+ - Select package-level search for skills with package evaluation evidence; fall back to description-level evolve otherwise
60
69
  - Deploy validated low-risk description changes automatically
61
70
  - Auto-grade and write grading baselines for freshly deployed skills
62
71
  - Generate review-first new skill proposals from strong workflow patterns
@@ -71,27 +80,27 @@ Use `--review-required` only when you want a stricter policy for a specific run.
71
80
 
72
81
  **User asks to improve skills or run the full loop**
73
82
 
74
- > Run `selftune orchestrate`. Parse the JSON output from stdout and the
83
+ > Run `selftune run`. Parse the JSON output from stdout and the
75
84
  > phased report from stderr. Report the summary to the user.
76
85
 
77
86
  **User wants to preview changes before deploying**
78
87
 
79
- > Run `selftune orchestrate --dry-run`. Report the planned actions without
88
+ > Run `selftune run --dry-run`. Report the planned actions without
80
89
  > making any changes.
81
90
 
82
91
  **User wants to focus on a single skill**
83
92
 
84
- > Run `selftune orchestrate --skill <name>`. This limits the loop to the
93
+ > Run `selftune run --skill <name>`. This limits the loop to the
85
94
  > specified skill only.
86
95
 
87
96
  **User wants manual review before deployment**
88
97
 
89
- > Run `selftune orchestrate --review-required`. Validated changes stay in
98
+ > Run `selftune run --review-required`. Validated changes stay in
90
99
  > review mode instead of auto-deploying.
91
100
 
92
101
  **Agent needs fresh source data before orchestrating**
93
102
 
94
- > Run `selftune orchestrate --sync-force`. This forces a full source replay
103
+ > Run `selftune run --sync-force`. This forces a full source replay
95
104
  > before candidate selection.
96
105
 
97
106
  ## Output
@@ -168,12 +177,40 @@ In autonomous mode, orchestrate calls sub-workflows in this fixed order:
168
177
  1. **Sync** — refresh source-truth telemetry across all supported agents (`selftune sync`)
169
178
  2. **Status** — compute skill health using existing grade results (reads `grading.json` outputs from previous sessions)
170
179
  3. **Auto-grade** — grade up to `--max-auto-grade` (default 5) ungraded skills that have session data but no grades yet. Skipped during `--dry-run` (grading makes LLM calls). After grading, status is recomputed so candidate selection sees updated grades. Fail-open: individual grading errors are logged but never block the loop.
171
- 4. **Evolve** — run evolution on selected candidates (pre-flight is skipped; Pareto mode uses 3 candidates; cheap-loop uses `haiku` for proposal + validation and `sonnet` for the final gate; adaptive gate escalation promotes risky proposals to `opus` + `high` effort; baseline and token-efficiency stay off)
172
- 5. **Post-deploy grade + baseline** — for each freshly deployed skill, grade the most recent session and write a grading baseline to SQLite (`grading_baselines` table). The baseline records the measured pass rate and sample size, anchoring future grade regression detection. Fail-open: individual grading errors are logged but never block the loop.
173
- 6. **Watch** — monitor recently evolved skills (auto-rollback enabled by default, `--recent-window` hours lookback). Skills freshly deployed in this run are included in the watch set immediately, so they are monitored in the same orchestrate cycle rather than waiting for the next run. These appear in `freshlyWatchedSkills` in the output. Grade watch (`enableGradeWatch: true`) runs alongside trigger regression for all watched skills.
174
- 7. **Workflow proposals** — discover repeated multi-skill patterns and create review-first `new_skill` proposals when a workflow is strong enough to merit codification. These are never auto-deployed; they are surfaced as proposals for review.
175
- 8. **Alpha Upload** — if enrolled in the alpha program (`config.alpha.enrolled === true`) and an API key is configured, stage new canonical records (sessions, invocations, evolution evidence, orchestrate runs) into `canonical_upload_staging`, build V2 push payloads, and flush to the cloud API (`POST /api/v1/push`) with Bearer auth. Fail-open: upload errors never block the orchestrate loop. Respects `--dry-run`.
176
- 9. **Contribution relay flush** — if an API key is configured, flush any staged creator-directed contribution signals for opted-in skills. Fail-open: relay errors never block the orchestrate loop. Respects `--dry-run`.
180
+ 4. **Scope selection** — for each candidate, orchestrate decides whether to run description-level evolve or package-level search. The decision is evidence-driven: when an accepted package frontier exists or the canonical package evaluation shows room for improvement (weak replay, low baseline lift, or body/routing regressions), orchestrate prefers package search. Skills without package evaluation evidence fall back to description-level evolve. This selection happens automatically; there is no flag to force it.
181
+ 5. **Evolve or Package Search** — description-level candidates run the existing evolve path (pre-flight skipped; Pareto mode uses 3 candidates; cheap-loop uses `haiku` for proposal + validation and `sonnet` for the final gate; adaptive gate escalation promotes risky proposals to `opus` + `high` effort; baseline and token-efficiency stay off). Package-level candidates run bounded search: generate bounded mutations on routing and body surfaces, fingerprint each variant, evaluate through the package evaluator against the accepted frontier parent, and apply the winning candidate back into the draft package.
182
+ 6. **Post-deploy grade + baseline** for each freshly deployed skill, grade the most recent session and write a grading baseline to SQLite (`grading_baselines` table). The baseline records the measured pass rate and sample size, anchoring future grade regression detection. Fail-open: individual grading errors are logged but never block the loop.
183
+ 7. **Watch** — monitor recently evolved skills (auto-rollback enabled by default, `--recent-window` hours lookback). Skills freshly deployed in this run are included in the watch set immediately, so they are monitored in the same orchestrate cycle rather than waiting for the next run. These appear in `freshlyWatchedSkills` in the output. Grade watch (`enableGradeWatch: true`) runs alongside trigger regression for all watched skills. Watch trust scoring feeds back into scope selection: observed regressions can demote accepted frontier candidates, influencing whether future runs prefer package search or description evolve for that skill.
184
+ 8. **Workflow proposals** — discover repeated multi-skill patterns and create review-first `new_skill` proposals when a workflow is strong enough to merit codification. These are never auto-deployed; they are surfaced as proposals for review.
185
+ 9. **Alpha Upload** — if enrolled in the alpha program (`config.alpha.enrolled === true`) and an API key is configured, stage new canonical records (sessions, invocations, evolution evidence, orchestrate runs) into `canonical_upload_staging`, build V2 push payloads, and flush to the cloud API (`POST /api/v1/push`) with Bearer auth. Fail-open: upload errors never block the orchestrate loop. Respects `--dry-run`.
186
+ 10. **Contribution relay flush** — if an API key is configured, flush any staged creator-directed contribution signals for opted-in skills. Fail-open: relay errors never block the orchestrate loop. Respects `--dry-run`.
187
+
188
+ ### Package Search in Orchestrate
189
+
190
+ When orchestrate selects a candidate for package-level improvement, it
191
+ delegates to the same bounded search flow available through
192
+ `selftune improve --scope package` or `selftune search-run`. The flow is:
193
+
194
+ 1. **Generate bounded mutations** on routing and body surfaces using
195
+ eval-informed targeted variants first, then deterministic fallback
196
+ transforms, biasing the candidate budget toward the weaker measured surface
197
+ from the accepted frontier or canonical package evaluation.
198
+ 2. **Fingerprint** each variant to skip duplicates already in the candidate
199
+ registry.
200
+ 3. **Evaluate** each variant through the package evaluator against the
201
+ accepted frontier parent, comparing replay, baseline lift, routing
202
+ validation, body validation, and unit-test results.
203
+ 4. **Select winner** based on the highest positive measured delta across the
204
+ full evaluator contract (not replay-only).
205
+ 5. **Apply winner** back into the draft package and refresh the canonical
206
+ package-evaluation artifact from the accepted candidate cache.
207
+ 6. **Continue** to the watch phase, where post-deploy evidence can demote
208
+ the accepted frontier candidate if regressions are observed.
209
+
210
+ Package search does not replace description evolve. It is selected
211
+ automatically when the evidence points to package-level improvement
212
+ opportunity. Skills without package evaluation history continue through
213
+ description-level evolve as before.
177
214
 
178
215
  When orchestrate invokes evolve for a selected candidate, it always passes
179
216
  `confidenceThreshold: 0.6` and `maxIterations: 3`, plus the autonomous evolve
@@ -0,0 +1,100 @@
1
+ # selftune Publish Workflow
2
+
3
+ Use this when the user wants to ship a skill safely after verification.
4
+
5
+ ## What Publish Means
6
+
7
+ `Publish` is the lifecycle step that takes a verified skill into live use and
8
+ starts post-deploy monitoring. For draft packages, the lifecycle entrypoint is:
9
+
10
+ ```bash
11
+ selftune publish --skill-path <path> [--no-watch] [--ignore-watch-alerts] [--json]
12
+ ```
13
+
14
+ ## Options
15
+
16
+ | Flag | Description |
17
+ |------|-------------|
18
+ | `--skill-path` | Path to a skill directory or `SKILL.md` file |
19
+ | `--no-watch` | Skip the default watch handoff and return the next watch command instead |
20
+ | `--ignore-watch-alerts` | Bypass publish-time watch gate warnings after watch runs |
21
+ | `--json` | Emit the publish summary as JSON |
22
+ | `--help` | Show command help |
23
+
24
+ ## When to Use
25
+
26
+ - The user says "publish", "ship", "deploy", or "go live"
27
+ - The user already has verification evidence and wants the next step
28
+ - The user wants watch to begin immediately after shipping
29
+
30
+ ## Draft Package Path
31
+
32
+ For a draft package that has already cleared verify:
33
+
34
+ ```bash
35
+ selftune publish --skill-path <path>
36
+ ```
37
+
38
+ This command:
39
+
40
+ 1. re-runs package replay
41
+ 2. re-runs package baseline
42
+ 3. hands the package into watch by default
43
+ 4. applies a measured publish-time watch gate after watch completes
44
+
45
+ Use `--ignore-watch-alerts` only when you deliberately want to bypass that
46
+ watch-trust gate after reviewing the output.
47
+
48
+ If verification evidence is missing, do not force publish. Route back to
49
+ `Verify`.
50
+
51
+ ## Existing Live Skill Path
52
+
53
+ If the skill is already deployed and the user is shipping a new measured
54
+ improvement, the current surface is still split:
55
+
56
+ 1. run `Improve`
57
+ 2. if deployed successfully, run:
58
+
59
+ ```bash
60
+ selftune watch --skill <name> --skill-path <path>
61
+ ```
62
+
63
+ Treat this as the live-skill version of publish until a unified publish command
64
+ exists for both draft packages and iterative evolution.
65
+
66
+ ## What To Check Before Publish
67
+
68
+ - the package or skill is not still blocked on `Create`
69
+ - trust evidence is complete enough to be called `verified`
70
+ - the next lifecycle step from status/check is publish rather than more prep
71
+ - the user is not actually asking for a dry-run or improvement proposal review
72
+
73
+ ## Outcome
74
+
75
+ `Publish` should leave the skill in one of these states:
76
+
77
+ - `published`
78
+ - `watching`
79
+ - back to `Verify` if the draft is still blocked on `needs_spec_validation`, `needs_package_resources`, `needs_evals`, `needs_unit_tests`, `needs_routing_replay`, or `needs_baseline`
80
+
81
+ ## Which workflows to load next
82
+
83
+ - missing trust proof -> `workflows/Verify.md`
84
+ - post-deploy monitoring -> `workflows/Watch.md`
85
+ - further iteration after ship -> `workflows/Improve.md`
86
+
87
+ ## Common Patterns
88
+
89
+ **"Ship this draft skill."**
90
+
91
+ > Use `selftune publish --skill-path <path>` if verify is green.
92
+
93
+ **"Deploy and monitor it."**
94
+
95
+ > Publish includes watch by default. Add `--no-watch` only when you want a
96
+ > manual handoff.
97
+
98
+ **"Can I skip straight to publish?"**
99
+
100
+ > Only if the skill is already verified. Otherwise route back to `Verify`.
@@ -0,0 +1,72 @@
1
+ # selftune Run Workflow
2
+
3
+ Use this when the user wants the full autonomous selftune loop rather than a
4
+ single lifecycle step.
5
+
6
+ ## What Run Means
7
+
8
+ `Run` is the simplified lifecycle name for selftune's autonomy-first runtime.
9
+
10
+ The lifecycle entrypoint is:
11
+
12
+ ```bash
13
+ selftune run [--dry-run] [--review-required] [--auto-approve] [--skill NAME] [--max-skills N] [--recent-window HOURS] [--sync-force] [--max-auto-grade N] [--loop] [--loop-interval SECS]
14
+ ```
15
+
16
+ ## Options
17
+
18
+ | Flag | Description |
19
+ |------|-------------|
20
+ | `--dry-run` | Preview actions without mutations |
21
+ | `--review-required` | Validate candidates but require human review before deploy |
22
+ | `--auto-approve` | Deprecated alias; autonomous mode is already the default |
23
+ | `--skill` | Scope to a single skill |
24
+ | `--max-skills` | Cap skills processed per run |
25
+ | `--recent-window` | Hours to look back for watch targets |
26
+ | `--sync-force` | Force a full rescan during sync |
27
+ | `--max-auto-grade` | Max ungraded skills to auto-grade per run |
28
+ | `--loop` | Run continuously |
29
+ | `--loop-interval` | Seconds between loop iterations |
30
+ | `--help` | Show command help |
31
+
32
+ `run` delegates to the existing `selftune orchestrate` runtime while keeping
33
+ the simpler lifecycle name.
34
+
35
+ ## When to Use
36
+
37
+ - The user wants the full autonomous loop
38
+ - The user says "run selftune", "improve all skills", or "operate continuously"
39
+ - The user wants a dry-run of what selftune would do next
40
+
41
+ ## Default Commands
42
+
43
+ ```bash
44
+ selftune run
45
+ selftune run --dry-run
46
+ selftune run --review-required
47
+ selftune run --skill <name>
48
+ selftune run --loop
49
+ ```
50
+
51
+ ## How To Explain It
52
+
53
+ `Run` is not the same as a single improve action. It is the higher-level system
54
+ runtime that:
55
+
56
+ - syncs
57
+ - computes status
58
+ - grades when needed
59
+ - improves selected skills
60
+ - watches recent changes
61
+ - flushes contribution/upload side effects
62
+
63
+ For users who ask for one specific skill or one specific trust question, prefer
64
+ `Status`, `Verify`, `Publish`, or `Improve` first.
65
+
66
+ For users who ask for the whole closed loop, use `Run`.
67
+
68
+ ## Which workflows to load next
69
+
70
+ - exact autonomy details and flags -> `workflows/Orchestrate.md`
71
+ - trust questions for one skill -> `workflows/Verify.md`
72
+ - package publishing -> `workflows/Publish.md`
@@ -19,10 +19,10 @@ For OpenClaw-specific scheduling, see `workflows/Cron.md`.
19
19
  The core selftune automation loop is one command:
20
20
 
21
21
  ```bash
22
- selftune orchestrate
22
+ selftune run
23
23
  ```
24
24
 
25
- `selftune orchestrate` runs source-truth sync first, selects candidate skills,
25
+ `selftune run` runs source-truth sync first, selects candidate skills,
26
26
  deploys validated low-risk description changes autonomously, and watches recent
27
27
  deployments with auto-rollback enabled.
28
28