selftune 0.2.31 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/routes/actions.ts +81 -15
  50. package/cli/selftune/routes/overview.ts +1 -1
  51. package/cli/selftune/routes/skill-report.ts +147 -2
  52. package/cli/selftune/run.ts +18 -0
  53. package/cli/selftune/schedule.ts +3 -3
  54. package/cli/selftune/search-run.ts +703 -0
  55. package/cli/selftune/status.ts +35 -11
  56. package/cli/selftune/testing-readiness.ts +431 -40
  57. package/cli/selftune/types.ts +316 -0
  58. package/cli/selftune/utils/eval-readiness.ts +1 -0
  59. package/cli/selftune/utils/json-output.ts +11 -0
  60. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  61. package/cli/selftune/utils/query-filter.ts +82 -1
  62. package/cli/selftune/utils/tui.ts +85 -2
  63. package/cli/selftune/verify.ts +205 -0
  64. package/cli/selftune/workflows/proposals.ts +1 -1
  65. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  66. package/cli/selftune/workflows/workflows.ts +4 -4
  67. package/package.json +1 -1
  68. package/skill/SKILL.md +148 -85
  69. package/skill/references/cli-quick-reference.md +16 -1
  70. package/skill/references/creator-playbook.md +31 -10
  71. package/skill/workflows/Baseline.md +8 -9
  72. package/skill/workflows/Contributions.md +4 -4
  73. package/skill/workflows/Create.md +173 -0
  74. package/skill/workflows/CreateTestDeploy.md +34 -30
  75. package/skill/workflows/Cron.md +2 -2
  76. package/skill/workflows/Dashboard.md +3 -3
  77. package/skill/workflows/Evals.md +13 -7
  78. package/skill/workflows/Evolve.md +75 -32
  79. package/skill/workflows/EvolveBody.md +22 -15
  80. package/skill/workflows/Hook.md +1 -1
  81. package/skill/workflows/Improve.md +168 -0
  82. package/skill/workflows/Initialize.md +3 -3
  83. package/skill/workflows/Orchestrate.md +49 -12
  84. package/skill/workflows/Publish.md +100 -0
  85. package/skill/workflows/Run.md +72 -0
  86. package/skill/workflows/Schedule.md +2 -2
  87. package/skill/workflows/SearchRun.md +89 -0
  88. package/skill/workflows/SignalsDashboard.md +2 -2
  89. package/skill/workflows/UnitTest.md +13 -4
  90. package/skill/workflows/Verify.md +136 -0
  91. package/skill/workflows/Watch.md +114 -47
  92. package/skill/workflows/Workflows.md +13 -8
  93. package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
  94. package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
  95. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -0,0 +1,173 @@
1
+ # selftune Create Workflow
2
+
3
+ ## When to Use
4
+
5
+ When the user wants to author a brand-new skill package, bootstrap a clean draft
6
+ skill, or start from a package skeleton instead of mutating an existing skill.
7
+
8
+ ## Overview
9
+
10
+ `Create` is the beginning of the lifecycle for first-class package drafts.
11
+
12
+ Today the command surface is still split:
13
+
14
+ - `selftune create init` starts from a blank package
15
+ - `selftune create scaffold` starts from a discovered workflow
16
+ - `selftune create status` tells you where the draft is in the lifecycle
17
+
18
+ After authoring, move to `Verify` rather than staying in low-level `create`
19
+ subcommands longer than necessary.
20
+
21
+ ## Primary Commands
22
+
23
+ ```bash
24
+ selftune create init --name <name> --description <text> [--output-dir <path>] [--force] [--json]
25
+ selftune create scaffold --from-workflow <id|index> [--output-dir <path>] [--skill-name <name>] [--description <text>] [--write] [--force] [--json]
26
+ selftune create status --skill-path <path> [--json]
27
+ selftune verify --skill-path <path> [--json]
28
+ selftune create check --skill-path <path> [--json]
29
+ selftune create replay --skill-path <path> [--mode routing|package] [--agent AGENT] [--eval-set PATH] [--json]
30
+ selftune create baseline --skill-path <path> [--mode routing|package] [--agent AGENT] [--eval-set PATH] [--json]
31
+ selftune create report --skill-path <path> [--agent AGENT] [--eval-set PATH] [--json]
32
+ selftune publish --skill-path <path> [--json]
33
+ selftune create publish --skill-path <path> [--watch] [--ignore-watch-alerts] [--json]
34
+ ```
35
+
36
+ ## Options
37
+
38
+ - `--name <name>`: Display name for the new skill package. Required for `init`.
39
+ - `--description <text>`: Short routing description for the draft skill.
40
+ Required for `init`; optional for `scaffold`.
41
+ - `--output-dir <path>`: Parent directory for the new package. Default: the
42
+ repo-root `.agents/skills` directory.
43
+ - `--from-workflow <id|index>`: Workflow ID or 1-based index from
44
+ `selftune workflows`. Required for `scaffold`.
45
+ - `--skill-name <name>`: Override the generated scaffolded skill name.
46
+ - `--force`: Overwrite scaffold files if the package directory already exists.
47
+ - `--write`: Persist the workflow-derived scaffold to disk. Without this flag,
48
+ `scaffold` previews the package only.
49
+ - `--min-occurrences <n>`: Minimum workflow frequency to consider while
50
+ resolving `--from-workflow`.
51
+ - `--skill <name>`: Restrict workflow discovery to chains containing the named
52
+ skill during `scaffold`.
53
+ - `--json`: Emit the result as JSON (for `init`, the created package summary). Accepted by every command listed above.
54
+ - `--skill-path <path>`: Path to a skill directory or `SKILL.md`. Required for
55
+ `status`, `verify`, `check`, `replay`, `baseline`, `report`, and `publish`.
56
+ - `--mode routing|package`: Replay or baseline either the router alone
57
+ (`routing`) or the full package tree (`package`).
58
+ - `--agent AGENT`: Runtime agent for replay, baseline, or report execution.
59
+ - `--eval-set PATH`: Override the canonical eval-set path for replay,
60
+ baseline, or report.
61
+ - `--watch`: Start watch immediately after `create publish` succeeds.
62
+ - `--ignore-watch-alerts`: Bypass the publish-time watch gate after watch
63
+ runs.
64
+ - `-h, --help`: Show command help.
65
+
66
+ ## Generated Layout
67
+
68
+ ```text
69
+ <skill-name>/
70
+ ├── SKILL.md
71
+ ├── workflows/
72
+ │ └── default.md
73
+ ├── references/
74
+ │ └── overview.md
75
+ ├── scripts/
76
+ ├── assets/
77
+ └── selftune.create.json
78
+ ```
79
+
80
+ ## What Each File Is For
81
+
82
+ - `SKILL.md`: The trigger surface and top-level routing contract.
83
+ - `workflows/default.md`: The first execution path once the skill triggers.
84
+ - `references/overview.md`: Background context that should be loaded on demand.
85
+ - `scripts/`: Deterministic helpers you want the agent to reuse.
86
+ - `assets/`: Static templates or seed artifacts.
87
+ - `selftune.create.json`: selftune-specific package metadata for readiness and
88
+ future package replay.
89
+
90
+ ## Examples
91
+
92
+ ```bash
93
+ selftune create init --name "Research Assistant" --description "Use when the user needs structured research help."
94
+ selftune create status --skill-path .agents/skills/research-assistant
95
+ selftune verify --skill-path .agents/skills/research-assistant
96
+ selftune create scaffold --from-workflow 1
97
+ selftune create replay --skill-path .agents/skills/research-assistant --mode package
98
+ selftune create baseline --skill-path .agents/skills/research-assistant --mode package
99
+ selftune create report --skill-path .agents/skills/research-assistant
100
+ selftune publish --skill-path .agents/skills/research-assistant
101
+ selftune create scaffold --from-workflow "Copywriting→MarketingAutomation→SelfTuneBlog" --skill-name "blog publisher" --write
102
+ selftune create init --name "Release Note Writer" --description "Use when the user needs changelog-ready release notes." --output-dir .agents/skills
103
+ selftune create init --name "Internal Docs Helper" --description "Use when the user needs internal documentation updates." --json
104
+ ```
105
+
106
+ ## Common Patterns
107
+
108
+ - "Start a brand-new skill package"
109
+ `selftune create init --name "Research Assistant" --description "Use when the user needs structured research help."`
110
+ - "Write the scaffold into a different local registry"
111
+ `selftune create init --name "Research Assistant" --description "Use when the user needs structured research help." --output-dir ~/skills`
112
+ - "Replace an older draft with a fresh scaffold"
113
+ `selftune create init --name "Research Assistant" --description "Use when the user needs structured research help." --force`
114
+ - "Preview a package scaffold from telemetry"
115
+ `selftune create scaffold --from-workflow 1`
116
+ - "Write a workflow-derived package draft"
117
+ `selftune create scaffold --from-workflow 1 --output-dir .agents/skills --write`
118
+ - "See where the draft is in the lifecycle"
119
+ `selftune create status --skill-path .agents/skills/research-assistant`
120
+ - "Run the lifecycle-first draft verification step"
121
+ `selftune verify --skill-path .agents/skills/research-assistant`
122
+ - "Run the low-level draft readiness check"
123
+ `selftune create check --skill-path .agents/skills/research-assistant`
124
+ - "Replay-validate the whole draft package"
125
+ `selftune create replay --skill-path .agents/skills/research-assistant --mode package`
126
+ - "Measure draft-package lift versus no-skill"
127
+ `selftune create baseline --skill-path .agents/skills/research-assistant --mode package`
128
+ - "Render the benchmark-style package report"
129
+ `selftune create report --skill-path .agents/skills/research-assistant`
130
+ - "Ship the draft through the lifecycle-first surface"
131
+ `selftune publish --skill-path .agents/skills/research-assistant`
132
+ - "Ship the draft through the legacy create surface"
133
+ `selftune create publish --skill-path .agents/skills/research-assistant --watch`
134
+
135
+ ## Follow-on Workflows
136
+
137
+ After the draft exists:
138
+
139
+ - use `workflows/Verify.md` to build trust evidence
140
+ - use `workflows/Publish.md` to ship the draft safely
141
+
142
+ ## Notes
143
+
144
+ - The generated package is intentionally sparse. It is a draft, not a published
145
+ skill.
146
+ - Replace the placeholder routing and workflow text before distribution.
147
+ - `Create` only owns draft authoring and local draft state.
148
+ - `Verify` owns trust evidence.
149
+ - `Publish` owns shipping + watch handoff.
150
+ - Lower-level `create check`, `create replay`, `create baseline`, `create report`,
151
+ and `create publish` still exist, but they are no longer the primary teaching
152
+ path in the skill surface.
153
+ - `create publish --watch --json` now returns both the raw nested `watch_result`
154
+ payload and a normalized `package_evaluation.watch` block, so agents can read
155
+ post-deploy pass rates, invocation totals, rollback state, and grade-watch
156
+ deltas from the same measured package-evaluation contract they already use for
157
+ replay and baseline evidence.
158
+ - The publish payload now also surfaces `watch_gate_passed`,
159
+ `watch_gate_warnings`, and `watch_trust_score`, so agents can tell whether the
160
+ latest watch signal cleared the advisory trust gate without parsing prose.
161
+ - `create report` and `create publish --json` now also surface
162
+ `package_evaluation.grading` when grading baselines and recent grading runs
163
+ exist, so agents can compare draft-package replay/baseline results against
164
+ observed execution quality instead of treating grading as a separate watch-only
165
+ signal.
166
+ - selftune now stores the latest measured package-evaluation summary
167
+ canonically in SQLite and mirrors it to
168
+ `~/.selftune/package-evaluations/<skill>.json`, so later publish/report/watch
169
+ steps can reuse one measured artifact instead of treating package evaluation
170
+ as stdout-only output.
171
+ - `selftune workflows scaffold` now writes the same package shape for backward
172
+ compatibility, but `selftune create scaffold` is the primary authoring
173
+ surface.
@@ -1,37 +1,38 @@
1
1
  # selftune Create, Test, and Deploy Workflow
2
2
 
3
3
  Use this when the user wants one guided path from a new or shaky skill to a
4
- safe shipped skill.
4
+ safe shipped package.
5
5
 
6
6
  This is a composed workflow. It does not replace the atomic `Evals`,
7
7
  `UnitTest`, `Baseline`, `Evolve`, or `Watch` workflows. It decides which one
8
- comes next and keeps the creator trust loop in order.
8
+ comes next and keeps the package evaluation pipeline in order.
9
9
 
10
10
  ## When to Use
11
11
 
12
12
  - The user says "create, test, and deploy"
13
- - The user wants the full creator loop end to end
13
+ - The user wants the full package evaluation pipeline end to end
14
14
  - The user asks "how do I know this skill works?" before shipping
15
15
  - The user asks whether a skill is ready to deploy
16
16
  - The user wants one recommended path from cold start to live watch
17
17
 
18
18
  ## Default Path
19
19
 
20
- There is no single `selftune create-test-deploy` command yet. Run the loop
21
- step by step:
20
+ Prefer the newer lifecycle:
22
21
 
23
22
  ```bash
24
- selftune eval generate --skill <name> --skill-path <path>
25
- selftune eval unit-test --skill <name> --generate --skill-path <path>
26
- selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
27
- selftune grade baseline --skill <name> --skill-path <path>
28
- selftune evolve --skill <name> --skill-path <path> --with-baseline
29
- selftune watch --skill <name>
23
+ # author or inspect the draft
24
+ selftune create status --skill-path <path>
25
+
26
+ # build trust evidence
27
+ selftune verify --skill-path <path>
28
+
29
+ # ship safely
30
+ selftune publish --skill-path <path>
30
31
  ```
31
32
 
32
33
  ## How to Run It
33
34
 
34
- ### 1. Resolve the current loop position
35
+ ### 1. Resolve the current lifecycle position
35
36
 
36
37
  Start with one of these surfaces:
37
38
 
@@ -90,11 +91,10 @@ Then continue to replay dry-run validation.
90
91
  Run:
91
92
 
92
93
  ```bash
93
- selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
94
+ selftune create replay --skill-path <path> --mode package
94
95
  ```
95
96
 
96
- This is the pre-deploy proof step. It validates against runtime-style routing
97
- without mutating the skill.
97
+ This is the runtime proof step behind `verify`.
98
98
 
99
99
  Then continue to baseline.
100
100
 
@@ -103,21 +103,21 @@ Then continue to baseline.
103
103
  Run:
104
104
 
105
105
  ```bash
106
- selftune grade baseline --skill <name> --skill-path <path>
106
+ selftune create baseline --skill-path <path> --mode package
107
107
  ```
108
108
 
109
- Then continue to live deploy.
109
+ Then re-run `verify`.
110
110
 
111
111
  #### Ready to deploy
112
112
 
113
113
  Run:
114
114
 
115
115
  ```bash
116
- selftune evolve --skill <name> --skill-path <path> --with-baseline
116
+ selftune publish --skill-path <path>
117
117
  ```
118
118
 
119
- This is the recommended creator ship command because it deploys only after the
120
- candidate clears the earlier trust gates.
119
+ This is the recommended creator ship command because it re-runs the draft
120
+ package validation gates and starts watch automatically.
121
121
 
122
122
  Then continue to watch.
123
123
 
@@ -134,13 +134,19 @@ another iteration.
134
134
 
135
135
  ## Which workflow to read next
136
136
 
137
- Load the atomic workflow that matches the next missing step:
137
+ Prefer the newer primary workflows:
138
+
139
+ - authoring -> `workflows/Create.md`
140
+ - trust-building -> `workflows/Verify.md`
141
+ - shipping -> `workflows/Publish.md`
142
+
143
+ Load the lower-level workflows only when the user explicitly wants the details:
138
144
 
139
- - eval generation -> `workflows/Evals.md`
140
- - unit tests -> `workflows/UnitTest.md`
141
- - replay dry-run / deploy -> `workflows/Evolve.md`
142
- - baseline -> `workflows/Baseline.md`
143
- - live monitoring -> `workflows/Watch.md`
145
+ - `workflows/Evals.md`
146
+ - `workflows/UnitTest.md`
147
+ - `workflows/Replay.md`
148
+ - `workflows/Baseline.md`
149
+ - `workflows/Watch.md`
144
150
 
145
151
  Use `references/creator-playbook.md` when the user is publishing a skill other
146
152
  people will install and needs before-ship versus after-ship guidance.
@@ -150,13 +156,11 @@ people will install and needs before-ship versus after-ship guidance.
150
156
  **User asks for one end-to-end shipping path**
151
157
 
152
158
  > Use this workflow. Check the current readiness surface first, then run the
153
- > next missing creator-loop step instead of dumping every command at once.
159
+ > next missing pipeline step instead of dumping every command at once.
154
160
 
155
161
  **User asks whether a skill is safe to ship**
156
162
 
157
- > Use `selftune status` or the dashboard to confirm evals, unit tests, replay
158
- > validation, and baseline exist. If all four are complete, run `selftune
159
- > evolve --with-baseline`. Otherwise run the missing step first.
163
+ > Use `Verify` first. If the skill is already verified, move to `Publish`.
160
164
 
161
165
  **User already shipped the skill**
162
166
 
@@ -94,7 +94,7 @@ no token cost for routine runs.
94
94
  OS scheduler fires (cron/launchd/systemd)
95
95
  |
96
96
  v
97
- selftune orchestrate --max-skills 3 (CLI runs directly, no agent)
97
+ selftune run --max-skills 3 (CLI runs directly, no agent)
98
98
  |
99
99
  v
100
100
  sync → candidate selection → evolve → validate → deploy → watch
@@ -107,7 +107,7 @@ Next interactive agent session uses updated description
107
107
  ```
108
108
 
109
109
  This is distinct from interactive mode where the user says "improve my skills"
110
- and the agent runs orchestrate. Automated mode is for routine maintenance;
110
+ and the agent runs `selftune run`. Automated mode is for routine maintenance;
111
111
  interactive mode is for user-directed improvements.
112
112
 
113
113
  ## Safety Controls
@@ -72,7 +72,7 @@ staying stale.
72
72
  The dashboard connects to `/api/v2/events` via Server-Sent Events.
73
73
  The server watches the SQLite WAL file for changes and broadcasts an
74
74
  `update` event when new data is written. The dashboard also broadcasts
75
- `action` events while creator-loop commands are running so the UI can
75
+ `action` events while lifecycle commands are running so the UI can
76
76
  show live stdout/stderr and terminal success/failure. This works for
77
77
  both dashboard-triggered actions and supported `selftune` commands run
78
78
  directly in another terminal, because the CLI writes a shared action
@@ -81,7 +81,7 @@ invalidates cached queries on updates and terminal action events (~1s
81
81
  latency for DB-backed updates).
82
82
 
83
83
  For demo or operator workflows, the skill report can open a dedicated
84
- live-run screen. That screen follows one active creator-loop run at a
84
+ live-run screen. That screen follows one active lifecycle run at a
85
85
  time, keeps a larger terminal log visible, and shows parsed dry-run
86
86
  summary fields plus historical model/platform/token aggregates from the
87
87
  skill report. Replay dry-runs also attach live `metrics` events when the
@@ -105,7 +105,7 @@ See [docs/design-docs/live-dashboard-sse.md](../../docs/design-docs/live-dashboa
105
105
  Action buttons in the dashboard trigger selftune commands via POST
106
106
  requests. Each endpoint spawns a `bun run` subprocess.
107
107
 
108
- **Creator-loop and watch/deploy actions** request body:
108
+ **Lifecycle and watch/deploy actions** request body:
109
109
 
110
110
  ```json
111
111
  {
@@ -20,24 +20,27 @@ Invoke this workflow when the user requests any of the following:
20
20
  selftune eval generate --skill <name> [options]
21
21
  ```
22
22
 
23
- ## Recommended Creator Loop
23
+ ## Recommended Package Evaluation Pipeline
24
24
 
25
- Use eval generation as step 1 of the default creator loop:
25
+ Use eval generation as the first evidence-building step of the package evaluation pipeline, re-running `selftune verify` between steps:
26
26
 
27
27
  ```bash
28
+ selftune verify --skill-path <path>
28
29
  selftune eval generate --skill <name>
30
+ selftune verify --skill-path <path>
29
31
  selftune eval unit-test --skill <name> --generate --skill-path <path>
30
- selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
31
- selftune grade baseline --skill <name> --skill-path <path>
32
- selftune evolve --skill <name> --skill-path <path> --with-baseline
33
- selftune watch --skill <name>
32
+ selftune verify --skill-path <path>
34
33
  ```
35
34
 
36
35
  The command still writes the requested output path, and it now also mirrors a canonical copy into
37
36
  `~/.selftune/eval-sets/<skill>.json` so the dashboard and `selftune status` can track whether eval
38
- coverage exists. Once the earlier steps are complete, the creator loop surfaces now flip from
37
+ coverage exists. Once the earlier steps are complete, the pipeline surfaces now flip from
39
38
  "needs testing" to "ready to deploy" and then "watching" after ship.
40
39
 
40
+ For already-published skills, eval generation is still a common supporting step
41
+ before `selftune improve` / `selftune evolve` when you need fresher trigger
42
+ evidence.
43
+
41
44
  ## Options
42
45
 
43
46
  | Flag | Description | Default |
@@ -51,6 +54,7 @@ coverage exists. Once the earlier steps are complete, the creator loop surfaces
51
54
  | `--no-negatives` | Exclude negative examples from output | Off |
52
55
  | `--no-taxonomy` | Skip invocation_type classification | Off |
53
56
  | `--skill-log <path>` | Path to skill_usage_log.jsonl | Default log path |
57
+ | `--agent <name>` | Agent CLI for synthetic/blended eval generation (`claude`, `codex`, `opencode`, `pi`) | Auto-detected |
54
58
  | `--query-log <path>` | Path to all_queries_log.jsonl | Default log path |
55
59
  | `--telemetry-log <path>` | Path to session_telemetry_log.jsonl | Default log path |
56
60
  | `--synthetic` | Generate evals from SKILL.md via LLM (no logs needed) | Off |
@@ -184,6 +188,7 @@ queries directly from the SKILL.md content via an LLM.
184
188
 
185
189
  ```bash
186
190
  selftune eval generate --skill pptx --synthetic --skill-path /path/to/skills/pptx/SKILL.md
191
+ selftune eval generate --skill pptx --synthetic --skill-path /path/to/skills/pptx/SKILL.md --agent opencode
187
192
  ```
188
193
 
189
194
  If the skill is installed locally but has no trusted trigger history yet, use the faster creator
@@ -191,6 +196,7 @@ onboarding path:
191
196
 
192
197
  ```bash
193
198
  selftune eval generate --skill pptx --auto-synthetic --skill-path /path/to/skills/pptx/SKILL.md
199
+ selftune eval generate --skill pptx --auto-synthetic --skill-path /path/to/skills/pptx/SKILL.md --agent opencode
194
200
  ```
195
201
 
196
202
  `--auto-synthetic` keeps the normal log-based path when real trigger data exists, but falls back
@@ -1,8 +1,10 @@
1
1
  # selftune Evolve Workflow
2
2
 
3
- Improve a skill's description based on real usage signal. Analyzes failure
4
- patterns from eval sets and proposes description changes that catch more
5
- natural-language queries without breaking existing triggers.
3
+ Improve a skill's description as part of the package evaluation pipeline.
4
+ Analyzes failure patterns from eval sets and proposes description changes
5
+ that catch more natural-language queries without breaking existing triggers.
6
+ Each proposal is evaluated through replay, baseline, and grading before
7
+ acceptance into the measured frontier.
6
8
 
7
9
  ## When to Invoke
8
10
 
@@ -19,19 +21,31 @@ Invoke this workflow when the user requests any of the following:
19
21
  selftune evolve --skill <name> --skill-path <path> [options]
20
22
  ```
21
23
 
22
- ## Recommended Creator Loop
24
+ ## Recommended Package Evaluation Pipeline
23
25
 
24
26
  Do not treat `evolve` as the first step when a creator asks whether a skill is
25
- ready. The default loop is:
27
+ ready. The default package evaluation pipeline is:
26
28
 
27
29
  ```bash
30
+ selftune create status --skill-path <path>
31
+ selftune verify --skill-path <path>
28
32
  selftune eval generate --skill <name> --skill-path <path>
29
33
  selftune eval unit-test --skill <name> --generate --skill-path <path>
30
- selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
31
- selftune grade baseline --skill <name> --skill-path <path>
34
+ selftune create replay --skill-path <path> --mode package
35
+ selftune create baseline --skill-path <path> --mode package
36
+ selftune verify --skill-path <path>
37
+ selftune publish --skill-path <path>
32
38
  ```
33
39
 
34
- Then move to a live `selftune evolve ...` or `selftune watch ...` run.
40
+ For already-published skills, this workflow is the right mutation surface. The
41
+ lifecycle alias is `selftune improve`; use `selftune evolve` directly when you
42
+ need exact advanced flags:
43
+
44
+ ```bash
45
+ selftune improve --skill <name> --skill-path <path> --dry-run --validation-mode replay
46
+ selftune evolve --skill <name> --skill-path <path>
47
+ selftune watch --skill <name>
48
+ ```
35
49
 
36
50
  If canonical evals or stored unit-test results already exist, reuse them rather
37
51
  than regenerating everything.
@@ -45,7 +59,7 @@ than regenerating everything.
45
59
  | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
46
60
  | `--agent <name>` | Agent CLI to use (claude, codex, opencode, pi) | Auto-detected |
47
61
  | `--dry-run` | Propose and validate without deploying | Off |
48
- | `--confidence <n>` | Minimum confidence threshold (0-1) | 0.6 |
62
+ | `--confidence <n>` | Low-confidence review threshold (0-1) | 0.6 |
49
63
  | `--max-iterations <n>` | Maximum retry iterations | 3 |
50
64
  | `--validation-model <model>` | Model for trigger-check validation LLM calls | `haiku` |
51
65
  | `--pareto` | Generate multiple candidates per iteration | On |
@@ -56,7 +70,7 @@ than regenerating everything.
56
70
  | `--full-model` | Use full-cost model throughout (disables cheap-loop) | Off |
57
71
  | `--verbose` | Print detailed progress during evolution | Off |
58
72
  | `--gate-model <model>` | Model for final gate validation | `sonnet` (when `--cheap-loop`) |
59
- | `--gate-effort <level>` | Thinking effort for the final gate (`low|medium|high|max`) | None |
73
+ | `--gate-effort <level>` | Thinking effort for the final gate (`low\|medium\|high\|max`) | None |
60
74
  | `--adaptive-gate` | Escalate risky gate checks to `opus` + `high` effort | Off |
61
75
  | `--proposal-model <model>` | Model for proposal generation LLM calls | None |
62
76
  | `--validation-mode <mode>` | Validation strategy: `auto`, `replay`, or `judge` | `auto` |
@@ -300,7 +314,8 @@ The candidate is tested against the full eval set:
300
314
 
301
315
  - Must improve overall pass rate
302
316
  - Must not regress more than 5% on previously-passing entries
303
- - Must exceed the `--confidence` threshold
317
+ - May still deploy when confidence is low if measured validation is strong;
318
+ `--confidence` only controls warning/review sensitivity
304
319
 
305
320
  If validation fails, the command retries up to `--max-iterations` times
306
321
  with adjusted proposals.
@@ -385,18 +400,18 @@ Proposals are scored on heuristic quality criteria (no LLM required). The compos
385
400
 
386
401
  The evolution loop uses a modular stopping criteria evaluator
387
402
  (`evolution/stopping-criteria.ts`) that checks conditions in priority order
388
- after each validation pass. The evaluator receives the current pass rate,
389
- historical pass rates from previous iterations, and proposal confidence to
390
- make a unified stop/continue decision. The stopping reason is recorded in
391
- audit entries for traceability.
392
-
393
- | # | Condition | Meaning |
394
- | --- | ------------------ | -------------------------------------------------------------- |
395
- | 1 | **Converged** | Pass rate >= 0.95 |
396
- | 2 | **Max iterations** | Reached `--max-iterations` limit |
397
- | 3 | **Low confidence** | Proposal confidence below `--confidence` threshold |
398
- | 4 | **Plateau** | < 1% pass rate variation across 3 consecutive iterations |
399
- | 5 | **Continue** | None of the above -- keep iterating |
403
+ after each validation pass. The evaluator receives the current pass rate and
404
+ historical pass rates from previous iterations to make a unified
405
+ stop/continue decision. Confidence is still recorded as metadata and may
406
+ raise warnings or gate-review risk, but it is not a standalone stop reason.
407
+ The stopping reason is recorded in audit entries for traceability.
408
+
409
+ | # | Condition | Meaning |
410
+ | --- | ------------------ | -------------------------------------------------------- |
411
+ | 1 | **Converged** | Pass rate >= 0.95 |
412
+ | 2 | **Max iterations** | Reached `--max-iterations` limit |
413
+ | 3 | **Plateau** | < 1% pass rate variation across 3 consecutive iterations |
414
+ | 4 | **Continue** | None of the above -- keep iterating |
400
415
 
401
416
  ## Cheap Loop Mode
402
417
 
@@ -447,11 +462,11 @@ selftune evolve apply-proposal --id <proposal-id> --skill-path <path> [--dry-run
447
462
 
448
463
  ### Apply-Proposal Options
449
464
 
450
- | Flag | Description | Default |
451
- | ----------------- | ----------------------------------------------- | -------- |
452
- | `--id <uuid>` | Proposal UUID from the dashboard | Required |
453
- | `--skill-path` | Path to the target SKILL.md | Required |
454
- | `--dry-run` | Preview the proposal without writing to disk | Off |
465
+ | Flag | Description | Default |
466
+ | -------------- | -------------------------------------------- | -------- |
467
+ | `--id <uuid>` | Proposal UUID from the dashboard | Required |
468
+ | `--skill-path` | Path to the target SKILL.md | Required |
469
+ | `--dry-run` | Preview the proposal without writing to disk | Off |
455
470
 
456
471
  ### Apply-Proposal Flow
457
472
 
@@ -482,8 +497,8 @@ Check the eval set quality. Missing contextual examples limit
482
497
  what evolution can learn. Generate a richer eval set first using the Evals workflow.
483
498
 
484
499
  **Evolution keeps failing validation:**
485
- Lower `--confidence` slightly or increase `--max-iterations`.
486
- Also check if the eval set has contradictory expectations.
500
+ Increase `--max-iterations` or improve the eval set.
501
+ Lower `--confidence` only if you want fewer low-confidence review warnings.
487
502
 
488
503
  **Agent CLI override needed:**
489
504
  The evolve command auto-detects the installed agent CLI.
@@ -497,10 +512,35 @@ This is especially valuable when the skill has a history of regressions,
497
512
  the evolution touches many trigger phrases, or the confidence score is near
498
513
  the threshold.
499
514
 
515
+ ## Scope: Description vs Package
516
+
517
+ The `evolve` command operates on description-level triggers and phrasing. For
518
+ package-level improvement (routing tables, body content, and the full skill
519
+ package), use `selftune improve --scope package` or `selftune search-run`,
520
+ which delegates to the bounded package search flow. That search path now uses
521
+ reflective proposals first, then measured targeted routing/body variants, then
522
+ deterministic fallback. It also evaluates a merged routing/body candidate when
523
+ both surfaces produce accepted improvements.
524
+
525
+ When `selftune orchestrate` (or `selftune run`) selects candidates
526
+ automatically, it chooses between description-level evolve and package-level
527
+ search based on evidence:
528
+
529
+ - Plain `selftune improve` also auto-selects package search for package-shaped
530
+ skills with draft manifests or package-frontier evidence.
531
+
532
+ - Skills with an accepted package frontier or canonical package evaluation
533
+ showing room for improvement are routed to package search.
534
+ - Skills without package evaluation history continue through description-level
535
+ evolve.
536
+
537
+ This means `evolve` remains the right tool for description and trigger
538
+ coverage, while package-level mutations are handled by the search pipeline.
539
+
500
540
  ## Autonomous Mode
501
541
 
502
- When called by `selftune orchestrate` (via cron or --loop), evolution runs
503
- without user interaction:
542
+ When called by `selftune orchestrate` (via cron or `--loop`), description-level
543
+ evolution runs without user interaction:
504
544
 
505
545
  - Pre-flight is skipped entirely — defaults are used
506
546
  - The orchestrator selects candidate skills based on health scores
@@ -511,3 +551,6 @@ without user interaction:
511
551
 
512
552
  No user confirmation is needed. The safety controls (regression threshold,
513
553
  auto-rollback via watch, SKILL.md backup) provide the guardrails.
554
+
555
+ For package-level candidates, orchestrate delegates to bounded search instead
556
+ of evolve. See `workflows/Orchestrate.md` for the full scope-selection logic.