selftune 0.2.31 → 0.2.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +83 -56
  2. package/apps/local-dashboard/dist/assets/index-B-ut4w0B.js +15 -0
  3. package/apps/local-dashboard/dist/assets/index-BFGfCVrL.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-DfowE3Hu.js +1 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/command-surface.ts +613 -2
  7. package/cli/selftune/create/baseline.ts +429 -0
  8. package/cli/selftune/create/check.ts +35 -0
  9. package/cli/selftune/create/init.ts +115 -0
  10. package/cli/selftune/create/package-candidate-state.ts +771 -0
  11. package/cli/selftune/create/package-evaluator.ts +710 -0
  12. package/cli/selftune/create/package-fingerprint.ts +142 -0
  13. package/cli/selftune/create/package-search.ts +377 -0
  14. package/cli/selftune/create/publish.ts +431 -0
  15. package/cli/selftune/create/readiness.ts +495 -0
  16. package/cli/selftune/create/replay.ts +330 -0
  17. package/cli/selftune/create/report.ts +74 -0
  18. package/cli/selftune/create/scaffold.ts +121 -0
  19. package/cli/selftune/create/skills-ref-adapter.ts +177 -0
  20. package/cli/selftune/create/status.ts +33 -0
  21. package/cli/selftune/create/templates.ts +249 -0
  22. package/cli/selftune/cron/setup.ts +1 -1
  23. package/cli/selftune/dashboard-action-events.ts +4 -1
  24. package/cli/selftune/dashboard-action-result.ts +789 -24
  25. package/cli/selftune/dashboard-action-stream.ts +80 -0
  26. package/cli/selftune/dashboard-contract.ts +146 -3
  27. package/cli/selftune/dashboard-server.ts +5 -4
  28. package/cli/selftune/eval/hooks-to-evals.ts +58 -35
  29. package/cli/selftune/eval/synthetic-evals.ts +145 -17
  30. package/cli/selftune/evolution/bounded-mutations.ts +1045 -0
  31. package/cli/selftune/evolution/evolve-body.ts +9 -36
  32. package/cli/selftune/evolution/evolve.ts +8 -72
  33. package/cli/selftune/evolution/stopping-criteria.ts +5 -13
  34. package/cli/selftune/evolution/unblock-suggestions.ts +0 -16
  35. package/cli/selftune/evolution/validate-host-replay.ts +115 -15
  36. package/cli/selftune/improve.ts +206 -0
  37. package/cli/selftune/index.ts +123 -6
  38. package/cli/selftune/init.ts +1 -1
  39. package/cli/selftune/localdb/queries/dashboard.ts +30 -0
  40. package/cli/selftune/localdb/schema.ts +52 -0
  41. package/cli/selftune/monitoring/watch.ts +257 -23
  42. package/cli/selftune/orchestrate/execute.ts +300 -1
  43. package/cli/selftune/orchestrate/finalize.ts +14 -0
  44. package/cli/selftune/orchestrate/plan.ts +22 -5
  45. package/cli/selftune/orchestrate/prepare.ts +59 -4
  46. package/cli/selftune/orchestrate/report.ts +1 -1
  47. package/cli/selftune/orchestrate.ts +34 -1
  48. package/cli/selftune/publish.ts +35 -0
  49. package/cli/selftune/routes/actions.ts +81 -15
  50. package/cli/selftune/routes/overview.ts +1 -1
  51. package/cli/selftune/routes/skill-report.ts +147 -2
  52. package/cli/selftune/run.ts +18 -0
  53. package/cli/selftune/schedule.ts +3 -3
  54. package/cli/selftune/search-run.ts +703 -0
  55. package/cli/selftune/status.ts +35 -11
  56. package/cli/selftune/testing-readiness.ts +431 -40
  57. package/cli/selftune/types.ts +316 -0
  58. package/cli/selftune/utils/eval-readiness.ts +1 -0
  59. package/cli/selftune/utils/json-output.ts +11 -0
  60. package/cli/selftune/utils/lifecycle-surface.ts +48 -0
  61. package/cli/selftune/utils/query-filter.ts +82 -1
  62. package/cli/selftune/utils/tui.ts +85 -2
  63. package/cli/selftune/verify.ts +205 -0
  64. package/cli/selftune/workflows/proposals.ts +1 -1
  65. package/cli/selftune/workflows/skill-scaffold.ts +141 -63
  66. package/cli/selftune/workflows/workflows.ts +4 -4
  67. package/package.json +1 -1
  68. package/skill/SKILL.md +148 -85
  69. package/skill/references/cli-quick-reference.md +16 -1
  70. package/skill/references/creator-playbook.md +31 -10
  71. package/skill/workflows/Baseline.md +8 -9
  72. package/skill/workflows/Contributions.md +4 -4
  73. package/skill/workflows/Create.md +173 -0
  74. package/skill/workflows/CreateTestDeploy.md +34 -30
  75. package/skill/workflows/Cron.md +2 -2
  76. package/skill/workflows/Dashboard.md +3 -3
  77. package/skill/workflows/Evals.md +13 -7
  78. package/skill/workflows/Evolve.md +75 -32
  79. package/skill/workflows/EvolveBody.md +22 -15
  80. package/skill/workflows/Hook.md +1 -1
  81. package/skill/workflows/Improve.md +168 -0
  82. package/skill/workflows/Initialize.md +3 -3
  83. package/skill/workflows/Orchestrate.md +49 -12
  84. package/skill/workflows/Publish.md +100 -0
  85. package/skill/workflows/Run.md +72 -0
  86. package/skill/workflows/Schedule.md +2 -2
  87. package/skill/workflows/SearchRun.md +89 -0
  88. package/skill/workflows/SignalsDashboard.md +2 -2
  89. package/skill/workflows/UnitTest.md +13 -4
  90. package/skill/workflows/Verify.md +136 -0
  91. package/skill/workflows/Watch.md +114 -47
  92. package/skill/workflows/Workflows.md +13 -8
  93. package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +0 -15
  94. package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +0 -1
  95. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +0 -1
@@ -0,0 +1,89 @@
1
+ # selftune Search-Run Workflow
2
+
3
+ Use this when the user wants bounded package evolution instead of a single
4
+ description, routing, or body mutation. `search-run` explores a minibatch of
5
+ package variants, evaluates them through the shared package evaluator, and
6
+ records the winner plus provenance in the package frontier.
7
+
8
+ ## Command
9
+
10
+ ```bash
11
+ selftune search-run --skill-path <path> [--skill NAME] [--surface routing|body|both] [--max-candidates N] [--agent AGENT] [--eval-set PATH] [--apply-winner] [--json]
12
+ ```
13
+
14
+ ## Options
15
+
16
+ | Flag | Description |
17
+ | ------------------ | ---------------------------------------------------------- |
18
+ | `--skill-path` | Path to a skill directory or `SKILL.md` file |
19
+ | `--skill` | Override the inferred skill name for lineage and reporting |
20
+ | `--surface` | Mutation surface to explore: `routing`, `body`, or `both` |
21
+ | `--max-candidates` | Cap the number of variants evaluated in this run |
22
+ | `--agent` | Runtime agent to use for shared package evaluation |
23
+ | `--eval-set` | Override the eval set used during package evaluation |
24
+ | `--apply-winner` | Promote the winning candidate back into the draft package |
25
+ | `--json` | Emit the full search result as JSON |
26
+ | `--help` | Show command help |
27
+
28
+ ## What It Does
29
+
30
+ 1. Resolves the draft package from `--skill-path`
31
+ 2. Generates reflective variants first when measured failures and an agent are
32
+ available, then eval-informed targeted variants, then deterministic fallback
33
+ variants to fill the minibatch
34
+ - when `--surface both` is set, selftune biases the routing/body minibatch toward
35
+ the weaker measured surface from the accepted frontier or canonical package
36
+ evaluation
37
+ 3. Evaluates each candidate through the shared package evaluator
38
+ 4. If routing and body both produce accepted improvements, evaluates a merged
39
+ candidate built from the complementary surfaces
40
+ 5. Compares accepted candidates against the measured frontier
41
+ 6. Persists the search run, selected parent, winner, and provenance
42
+
43
+ ## When To Use
44
+
45
+ - The package is already verified and you want to explore multiple candidate edits
46
+ - You want a measured parent-selection loop instead of a one-shot mutation
47
+ - The skill report already shows frontier lineage and you want to expand it
48
+
49
+ ## Recommended Path
50
+
51
+ Start after the draft passes package verification:
52
+
53
+ ```bash
54
+ selftune verify --skill-path <path>
55
+ selftune search-run --skill-path <path> --surface both
56
+ ```
57
+
58
+ If you want the main lifecycle alias instead of the low-level command, use:
59
+
60
+ ```bash
61
+ selftune improve --skill <name> --skill-path <path> --scope package
62
+ ```
63
+
64
+ `selftune run` (via the `selftune orchestrate` runtime) auto-selects package
65
+ search for eligible skills. During the plan phase, `shouldSelectPackageSearch()`
66
+ routes a skill to the package-search action instead of description evolution
67
+ when any one of the following eligibility criteria is met:
68
+
69
+ - The skill has an **accepted frontier candidate** from a prior search run, OR
70
+ - The skill has a **canonical package evaluation artifact**, OR
71
+ - The skill has a **draft package plus enough grading evidence** to be treated
72
+ as package-search eligible during orchestrate
73
+
74
+ Manual invocation via `selftune improve --scope package` or
75
+ `selftune search-run` remains available for on-demand use outside the
76
+ orchestrate loop.
77
+
78
+ Plain `selftune improve --skill <name> --skill-path <path>` also auto-selects
79
+ this path when the skill already has package evidence or a draft package
80
+ manifest.
81
+
82
+ When `search-run` runs with `--apply-winner`, or when `improve --scope package`
83
+ runs without `--dry-run`, the winning candidate is copied back into the draft
84
+ package automatically and the canonical package-evaluation artifact is
85
+ refreshed. The next step is usually:
86
+
87
+ ```bash
88
+ selftune publish --skill-path <path>
89
+ ```
@@ -51,9 +51,9 @@ These same thresholds gate proposal generation on the API side.
51
51
  4. If the user wants to help a skill reach threshold, route to the **Contribute** workflow
52
52
  5. If the user is the skill creator, use the Community page as the handoff into proposals and watch
53
53
 
54
- ## Creator Loop
54
+ ## After-Ship Pipeline
55
55
 
56
- For a creator, the after-ship loop is:
56
+ For a creator, the after-ship pipeline is:
57
57
 
58
58
  1. check whether the skill is low-signal or actionable
59
59
  2. inspect missed categories and grade distribution
@@ -146,16 +146,25 @@ selftune eval unit-test --skill Research
146
146
  Compare the new `pass_rate` against the previous run. Report whether
147
147
  the evolution improved trigger accuracy.
148
148
 
149
- ### 5. Continue the creator loop
149
+ ### 5. Continue the pipeline
150
150
 
151
- After unit tests exist, the next creator step is usually:
151
+ After unit tests exist, the next pipeline step is usually:
152
152
 
153
153
  ```bash
154
- selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
154
+ selftune verify --skill-path <path>
155
+ ```
156
+
157
+ If `verify` still reports missing runtime proof, the next explicit supporting
158
+ steps are usually:
159
+
160
+ ```bash
161
+ selftune create replay --skill-path <path> --mode package
162
+ selftune create baseline --skill-path <path> --mode package
163
+ selftune verify --skill-path <path>
155
164
  ```
156
165
 
157
166
  That keeps the sequence aligned with the dashboard readiness surface:
158
- evals -> unit tests -> replay dry-run -> baseline -> deploy -> watch.
167
+ evals -> unit tests -> replay/baseline proof -> publish -> watch.
159
168
 
160
169
  ## Common Patterns
161
170
 
@@ -0,0 +1,136 @@
1
+ # selftune Verify Workflow
2
+
3
+ Use this when the user wants to know whether a skill is trustworthy enough to
4
+ ship or when they ask for the full draft lifecycle without wanting every low-level
5
+ command explained upfront.
6
+
7
+ ## What Verify Means
8
+
9
+ `Verify` is the primary trust-building workflow. The lifecycle entrypoint is:
10
+
11
+ ```bash
12
+ selftune verify --skill-path <path> [--agent AGENT] [--eval-set PATH] [--no-auto-fix] [--json]
13
+ ```
14
+
15
+ ## Options
16
+
17
+ | Flag | Description |
18
+ |------|-------------|
19
+ | `--skill-path` | Path to a skill directory or `SKILL.md` file |
20
+ | `--agent` | Runtime agent to use for package evaluation once readiness passes |
21
+ | `--eval-set` | Override the canonical eval-set path for package evaluation |
22
+ | `--no-auto-fix` | Skip automatic evidence generation when readiness checks fail |
23
+ | `--json` | Emit readiness plus report data as JSON |
24
+ | `--help` | Show command help |
25
+
26
+ `verify` first runs the same readiness contract as `selftune create check`. If
27
+ the draft is missing evidence (evals, unit tests, replay, or baseline), it
28
+ automatically runs the corresponding sub-command to generate it, up to 4
29
+ iterations. Use `--no-auto-fix` to disable this and get the old behavior of
30
+ returning the missing state immediately. If the draft is ready, it then runs
31
+ the benchmark-style package report.
32
+
33
+ ## When to Use
34
+
35
+ - The user asks "can I trust this skill?"
36
+ - The user asks "is this ready to ship?"
37
+ - The user asks for the full draft lifecycle or uses older "creator loop" wording
38
+ - A draft package exists and you need trust evidence before publish
39
+
40
+ ## Default Path
41
+
42
+ ### 1. Read the current state
43
+
44
+ For draft packages:
45
+
46
+ ```bash
47
+ selftune create status --skill-path <path>
48
+ ```
49
+
50
+ For a higher-level health summary:
51
+
52
+ ```bash
53
+ selftune status
54
+ ```
55
+
56
+ ### 2. Run verify
57
+
58
+ ```bash
59
+ selftune verify --skill-path <path>
60
+ ```
61
+
62
+ This is the default trust command because it tells you whether the package
63
+ itself is incomplete before you spend time generating more evidence.
64
+
65
+ ### 3. Fill missing trust evidence
66
+
67
+ When `verify` reports missing readiness or evidence, run only the supporting
67
+ step for whatever is missing.
69
+
70
+ #### Missing evals or tests
71
+
72
+ ```bash
73
+ selftune eval generate --skill <name> --skill-path <path>
74
+ selftune eval unit-test --skill <name> --generate --skill-path <path>
75
+ ```
76
+
77
+ If the skill is cold-start and has no trusted triggers yet, prefer:
78
+
79
+ ```bash
80
+ selftune eval generate --skill <name> --auto-synthetic --skill-path <path>
81
+ ```
82
+
83
+ #### Missing runtime proof
84
+
85
+ ```bash
86
+ selftune create replay --skill-path <path> --mode package
87
+ ```
88
+
89
+ #### Missing no-skill value proof
90
+
91
+ ```bash
92
+ selftune create baseline --skill-path <path> --mode package
93
+ ```
94
+
95
+ ### 4. Re-run verify
96
+
97
+ ```bash
98
+ selftune verify --skill-path <path>
99
+ ```
100
+
101
+ The skill is verified when it no longer presents missing package/evidence
102
+ blockers and the next action becomes publish.
103
+
104
+ ## Outcome
105
+
106
+ `Verify` should leave the skill in one of these states:
107
+
108
+ - still `draft`
109
+ - one of `needs_spec_validation`, `needs_package_resources`, `needs_evals`, `needs_unit_tests`, `needs_routing_replay`, or `needs_baseline`
110
+ - `verified` (`ready_to_publish` in CLI output)
111
+
112
+ If the result is `verified`, hand off to `Publish`.
113
+ If the result is one of the concrete missing-evidence states, run only the missing step.
114
+ If the package itself is broken, route back to `Create`.
115
+
116
+ ## Which workflows to load next
117
+
118
+ - package authoring gaps -> `workflows/Create.md`
119
+ - publishing -> `workflows/Publish.md`
120
+ - specific low-level evidence work -> `workflows/Evals.md`, `workflows/UnitTest.md`, `workflows/Replay.md`, `workflows/Baseline.md`
121
+
122
+ ## Common Patterns
123
+
124
+ **"How do I know this skill works?"**
125
+
126
+ > Use `Verify`. Start with `create status` or `status`, then fill only the
127
+ > missing evidence step that `verify` reports.
128
+
129
+ **"Is this ready to ship?"**
130
+
131
+ > Use `Verify`. If the package is already verified, move directly to `Publish`.
132
+
133
+ **"Give me the creator loop."**
134
+
135
+ > Use `Verify` as the primary trust workflow instead of dumping every low-level
136
+ > command at once.
@@ -1,7 +1,10 @@
1
1
  # selftune Watch Workflow
2
2
 
3
- Monitor post-deploy skill performance for regressions. Compares current
3
+ Monitor post-deploy package performance for regressions. Compares current
4
4
  pass rates against a baseline within a sliding window of recent sessions.
5
+ Watch is the final stage of the package evaluation pipeline: after a
6
+ package is published, watch feeds measured evidence back into the accepted
7
+ frontier to confirm the package holds under real traffic.
5
8
 
6
9
  ## Default Command
7
10
 
@@ -11,33 +14,43 @@ selftune watch --skill <name> --skill-path <path> [options]
11
14
 
12
15
  ## Options
13
16
 
14
- | Flag | Description | Default |
15
- | --------------------- | ------------------------------------------------ | -------- |
16
- | `--skill <name>` | Skill name | Required |
17
- | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
18
- | `--window <n>` | Sliding window size (number of sessions) | 20 |
19
- | `--threshold <n>` | Regression threshold (drop from baseline) | 0.1 |
20
- | `--auto-rollback` | Automatically rollback on detected regression | Off |
21
- | `--sync-first` | Refresh source-truth telemetry before evaluating | Off |
22
- | `--sync-force` | Force a full source rescan during `--sync-first` | Off |
23
- | `--grade-threshold <n>` | Grade regression threshold (drop from baseline)| 0.15 |
17
+ | Flag | Description | Default |
18
+ | ----------------------- | ------------------------------------------------ | -------- |
19
+ | `--skill <name>` | Skill name | Required |
20
+ | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
21
+ | `--window <n>` | Sliding window size (number of sessions) | 20 |
22
+ | `--threshold <n>` | Regression threshold (drop from baseline) | 0.1 |
23
+ | `--auto-rollback` | Automatically rollback on detected regression | Off |
24
+ | `--sync-first` | Refresh source-truth telemetry before evaluating | Off |
25
+ | `--sync-force` | Force a full source rescan during `--sync-first` | Off |
26
+ | `--grade-threshold <n>` | Grade regression threshold (drop from baseline) | 0.15 |
24
27
  | `--no-grade-watch` | Disable grade-based regression monitoring | Enabled |
25
- | `--help` | Show command help | Off |
28
+ | `--help` | Show command help | Off |
26
29
 
27
30
  ## Output Format
28
31
 
29
32
  ```json
30
33
  {
31
- "skill_name": "pptx",
32
- "window_size": 20,
33
- "sessions_evaluated": 18,
34
- "current_pass_rate": 0.89,
35
- "baseline_pass_rate": 0.92,
36
- "threshold": 0.1,
37
- "regression_detected": false,
38
- "delta": -0.03,
39
- "status": "healthy",
40
- "evaluated_at": "2026-02-28T14:00:00Z",
34
+ "snapshot": {
35
+ "timestamp": "2026-04-14T14:00:00Z",
36
+ "skill_name": "pptx",
37
+ "window_sessions": 20,
38
+ "skill_checks": 18,
39
+ "pass_rate": 0.89,
40
+ "false_negative_rate": 0.11,
41
+ "by_invocation_type": {
42
+ "explicit": { "passed": 4, "total": 4 },
43
+ "implicit": { "passed": 8, "total": 9 },
44
+ "contextual": { "passed": 4, "total": 4 },
45
+ "negative": { "passed": 0, "total": 1 }
46
+ },
47
+ "regression_detected": false,
48
+ "baseline_pass_rate": 0.92
49
+ },
50
+ "alert": null,
51
+ "rolledBack": false,
52
+ "recommendation": "Skill \"pptx\" is stable. Pass rate 0.89 is within acceptable range of baseline 0.92.",
53
+ "recommended_command": null,
41
54
  "gradeAlert": null,
42
55
  "gradeRegression": null
43
56
  }
@@ -51,19 +64,21 @@ When grade regression is detected, the additional fields are populated:
51
64
  "gradeRegression": {
52
65
  "before": 0.85,
53
66
  "after": 0.65,
54
- "delta": 0.20
67
+ "delta": 0.2
55
68
  }
56
69
  }
57
70
  ```
58
71
 
59
72
  ### Status Values
60
73
 
61
- | Status | Meaning |
62
- | ------------------- | ------------------------------------------------- |
63
- | `healthy` | Current pass rate is within threshold of baseline |
64
- | `warning` | Pass rate dropped but within threshold |
65
- | `regression` | Pass rate dropped below baseline minus threshold |
66
- | `insufficient_data` | Not enough sessions in the window to evaluate |
74
+ Watch does not emit a separate `status` enum anymore. Instead, read:
75
+
76
+ | Field | Meaning |
77
+ | ------------------------------ | -------------------------------------------------------------------- |
78
+ | `snapshot.regression_detected` | Trigger pass rate dropped below baseline minus threshold |
79
+ | `alert` | Non-null when one or more trigger/grade regressions were detected |
80
+ | `rolledBack` | `true` when watch auto-rollback restored the previous version |
81
+ | `recommended_command` | Machine-readable follow-up command when watch wants an explicit step |
67
82
 
68
83
  ## Grade Regression Monitoring
69
84
 
@@ -76,10 +91,10 @@ exceeds `gradeRegressionThreshold` (default 0.15), a `gradeAlert` is raised.
76
91
 
77
92
  This runs alongside trigger regression:
78
93
 
79
- | Check | Source | Threshold | Field |
80
- | ------------------ | --------------------------- | --------- | ------------------- |
81
- | Trigger regression | Eval set pass rates | 0.10 | `regression_detected` |
82
- | Grade regression | Grading baseline vs recent | 0.15 | `gradeRegression` |
94
+ | Check | Source | Threshold | Field |
95
+ | ------------------ | -------------------------- | --------- | --------------------- |
96
+ | Trigger regression | Eval set pass rates | 0.10 | `snapshot.regression_detected` |
97
+ | Grade regression | Grading baseline vs recent | 0.15 | `gradeRegression` |
83
98
 
84
99
  Both checks contribute to the overall `alert` field. A grade regression alert
85
100
  is appended to the watch alert string alongside any trigger regression alert.
@@ -92,16 +107,17 @@ if you only want trigger-based monitoring.
92
107
  ### Check Regression Status
93
108
 
94
109
  ```bash
95
- # Parse: .regression_detected (boolean)
96
- # Parse: .status (string)
97
- # Parse: .delta (float, negative = regression)
110
+ # Parse: .snapshot.regression_detected (boolean)
111
+ # Parse: .alert (string|null)
112
+ # Parse: .rolledBack (boolean)
98
113
  ```
99
114
 
100
115
  ### Get Key Metrics
101
116
 
102
117
  ```bash
103
- # Parse: .current_pass_rate vs .baseline_pass_rate
104
- # Parse: .sessions_evaluated (should be close to .window_size)
118
+ # Parse: .snapshot.pass_rate vs .snapshot.baseline_pass_rate
119
+ # Parse: .snapshot.skill_checks (should be close to .snapshot.window_sessions)
120
+ # Parse: .recommended_command for the next machine-readable follow-up
105
121
  ```
106
122
 
107
123
  ## Steps
@@ -132,12 +148,12 @@ selftune watch --skill pptx --skill-path /path/to/SKILL.md
132
148
 
133
149
  Parse the JSON output. Key decision points:
134
150
 
135
- | Status | Action |
136
- | ------------------- | --------------------------------------------------------- |
137
- | `healthy` | No action needed. Skill is performing well. |
138
- | `warning` | Monitor closely. Consider re-running after more sessions. |
139
- | `regression` | Investigate. Consider rollback. |
140
- | `insufficient_data` | Wait for more sessions before evaluating. |
151
+ | Signal | Action |
152
+ | ----------------------------------- | ------------------------------------------------- |
153
+ | `alert == null` | No rollback signal. Continue monitoring. |
154
+ | `alert != null` and `rolledBack` | Auto-rollback already happened. Confirm recovery. |
155
+ | `alert != null` and not rolled back | Investigate and consider `recommended_command`. |
156
+ | low `snapshot.skill_checks` | Wait for more sessions before calling it stable. |
141
157
 
142
158
  ### 3. Decide Action
143
159
 
@@ -174,7 +190,8 @@ context window resets before the user acts on the results.
174
190
 
175
191
  **"Check for regressions"**
176
192
 
177
- > Same as above. Focus on the `regression_detected` and `delta` fields.
193
+ > Same as above. Focus on `snapshot.regression_detected`, `alert`, and
194
+ > `recommended_command`.
178
195
 
179
196
  **"How is the skill doing?"**
180
197
 
@@ -186,12 +203,62 @@ context window resets before the user acts on the results.
186
203
  > Use `--auto-rollback`. The command will restore the previous description
187
204
  > automatically if pass rate drops below baseline minus threshold.
188
205
 
206
+ ## Trust Scoring and Frontier Feedback
207
+
208
+ Watch results now feed back into the package search pipeline and orchestrate's
209
+ scope selection:
210
+
211
+ - **Frontier demotion:** When watch detects a regression for a skill that has
212
+ an accepted package frontier candidate, the observed watch evidence is
213
+ written back into that candidate's artifact and SQLite row. This can demote
214
+ the candidate during future frontier parent selection, so the next search
215
+ run compares against a stronger baseline instead of repeating a regressed
216
+ one.
217
+ - **Publish blocking:** Watch alerts can block publish when regressions are
218
+ detected. The `create publish --watch` flow attaches structured watch
219
+ evidence directly to the package-evaluation summary. Regressions surface as
220
+ explicit blockers rather than silent degradation.
221
+ - **Scope influence:** Orchestrate uses watch evidence when deciding whether a
222
+ skill should go through description-level evolve or package-level search on
223
+ the next run. Skills with observed package regressions may be re-routed to
224
+ package search to address the underlying package-level issue rather than
225
+ only adjusting the description.
226
+
227
+ ### How Watch Evidence Feeds Back to the Frontier
228
+
229
+ When `selftune publish` runs with watch enabled and the watch result contains
230
+ alerts, the publish flow calls `refreshPackageCandidateEvaluationObservation()`
231
+ (in `create/package-candidate-state.ts`). This function writes the structured
232
+ watch evidence — alert text, regression deltas, and grade regression data —
233
+ back into the candidate's SQLite row.
234
+
235
+ The frontier ranking comparator in `package-candidate-state.ts` uses **watch
236
+ rank as the highest-priority signal** in its 15-level sort order:
237
+
238
+ | Watch rank | Meaning | Effect on frontier position |
239
+ | ---------- | ------------------------------------------ | --------------------------- |
240
+ | 2 | No issues detected | Best — ranked first |
241
+ | 1 | Unknown or insufficient watch data | Neutral — middle tier |
242
+ | 0 | Alert, rollback, or regression detected | Worst — ranked last |
243
+
244
+ Demoted candidates appear with `watch_demoted: true` in the dashboard frontier
245
+ state, making it visible to the operator which candidates were deprioritized by
246
+ watch evidence.
247
+
248
+ On subsequent search runs, the parent selection step picks the highest-ranked
249
+ frontier member as the baseline for the next generation of mutations. A
250
+ watch-demoted candidate (rank 0) will be deprioritized in favor of candidates
251
+ without alerts, so the search continues from a stronger measured baseline
252
+ rather than repeating a regressed one.
253
+
189
254
  ## Autonomous Mode
190
255
 
191
- When called by `selftune orchestrate`, watch runs automatically on recently
192
- evolved skills:
256
+ When called by `selftune run` (backed by the `selftune orchestrate` runtime),
257
+ watch runs automatically on recently evolved skills:
193
258
 
194
259
  - Checks all skills evolved in the last `--recent-window` hours (default 24)
195
260
  - Auto-rollback is enabled by default
196
261
  - Results are included in the orchestrate run report
197
262
  - No user notification — regressions are handled silently via rollback
263
+ - Watch evidence feeds back into the accepted package frontier for skills
264
+ with package evaluation history, influencing future scope selection
@@ -63,14 +63,15 @@ discovered-source metadata with occurrence count and synergy score.
63
63
  `scaffold` turns an observed workflow into a draft local skill.
64
64
 
65
65
  - Default behavior is preview-first: the command prints the proposed skill name,
66
- output path, provenance, and full `SKILL.md` content.
67
- - Add `--write` to create `<output-dir>/<skill-name>/SKILL.md`.
68
- - The generated skill is intentionally conservative: it includes provenance,
69
- a description derived from the workflow trigger, an execution plan, and the
70
- discovered workflow section. It does not silently publish or distribute the
71
- new skill.
72
-
73
- When `selftune orchestrate` sees a strong workflow pattern, it now creates a
66
+ output path, provenance, and the package preview.
67
+ - Add `--write` to create a full package under `<output-dir>/<skill-name>/`.
68
+ - The generated skill is intentionally conservative: it includes a trigger
69
+ summary in `SKILL.md`, ordered execution steps in `workflows/default.md`,
70
+ provenance in `references/overview.md`, empty `scripts/`, empty `assets/`,
71
+ and a `selftune.create.json` manifest. It does not silently publish or
72
+ distribute the new skill.
73
+
74
+ When `selftune run` sees a strong workflow pattern, it now creates a
74
75
  review-first `new_skill` proposal automatically. The manual `scaffold` command
75
76
  still exists for explicit previewing and local draft writes.
76
77
 
@@ -89,6 +90,10 @@ selftune workflows scaffold 1 --output-dir .agents/skills --write
89
90
  The number prefix (for example, `1.`) is the 1-based index you can pass to
90
91
  `selftune workflows save <index>`.
91
92
 
93
+ When you preview a scaffold, selftune prints the package metadata followed by
94
+ the generated file contents for `SKILL.md`, `workflows/default.md`, and
95
+ `references/overview.md`.
96
+
92
97
  ```text
93
98
  Discovered Workflows (from 450 sessions):
94
99