selftune 0.2.16 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/README.md +32 -22
  2. package/apps/local-dashboard/dist/assets/index-DnhnXQm6.js +60 -0
  3. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
  5. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
  6. package/apps/local-dashboard/dist/index.html +5 -5
  7. package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
  8. package/cli/selftune/alpha-upload/client.ts +51 -1
  9. package/cli/selftune/alpha-upload/flush.ts +46 -5
  10. package/cli/selftune/alpha-upload/stage-canonical.ts +32 -10
  11. package/cli/selftune/alpha-upload-contract.ts +9 -0
  12. package/cli/selftune/constants.ts +92 -5
  13. package/cli/selftune/contribute/contribute.ts +30 -2
  14. package/cli/selftune/contribute/sanitize.ts +52 -5
  15. package/cli/selftune/contribution-config.ts +249 -0
  16. package/cli/selftune/contribution-relay.ts +177 -0
  17. package/cli/selftune/contribution-signals.ts +219 -0
  18. package/cli/selftune/contribution-staging.ts +147 -0
  19. package/cli/selftune/contributions.ts +532 -0
  20. package/cli/selftune/creator-contributions.ts +333 -0
  21. package/cli/selftune/dashboard-contract.ts +305 -1
  22. package/cli/selftune/dashboard-server.ts +47 -13
  23. package/cli/selftune/eval/family-overlap.ts +395 -0
  24. package/cli/selftune/eval/hooks-to-evals.ts +182 -28
  25. package/cli/selftune/eval/synthetic-evals.ts +298 -11
  26. package/cli/selftune/evolution/description-quality.ts +12 -11
  27. package/cli/selftune/evolution/evolve.ts +214 -51
  28. package/cli/selftune/evolution/validate-proposal.ts +9 -6
  29. package/cli/selftune/export.ts +2 -2
  30. package/cli/selftune/grading/grade-session.ts +20 -0
  31. package/cli/selftune/hooks/commit-track.ts +188 -0
  32. package/cli/selftune/hooks/prompt-log.ts +10 -1
  33. package/cli/selftune/hooks/session-stop.ts +2 -2
  34. package/cli/selftune/hooks/skill-eval.ts +15 -1
  35. package/cli/selftune/hooks/stdin-preview.ts +32 -0
  36. package/cli/selftune/index.ts +41 -5
  37. package/cli/selftune/ingestors/codex-rollout.ts +31 -35
  38. package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
  39. package/cli/selftune/localdb/db.ts +2 -2
  40. package/cli/selftune/localdb/direct-write.ts +69 -6
  41. package/cli/selftune/localdb/queries.ts +1253 -37
  42. package/cli/selftune/localdb/schema.ts +66 -0
  43. package/cli/selftune/orchestrate.ts +32 -4
  44. package/cli/selftune/recover.ts +153 -0
  45. package/cli/selftune/repair/skill-usage.ts +363 -4
  46. package/cli/selftune/routes/actions.ts +35 -1
  47. package/cli/selftune/routes/analytics.ts +14 -0
  48. package/cli/selftune/routes/index.ts +1 -0
  49. package/cli/selftune/routes/overview.ts +150 -4
  50. package/cli/selftune/routes/skill-report.ts +648 -18
  51. package/cli/selftune/status.ts +81 -2
  52. package/cli/selftune/sync.ts +56 -2
  53. package/cli/selftune/trust-model.ts +66 -0
  54. package/cli/selftune/types.ts +80 -0
  55. package/cli/selftune/utils/skill-detection.ts +43 -0
  56. package/cli/selftune/utils/transcript.ts +210 -1
  57. package/cli/selftune/watchlist.ts +65 -0
  58. package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
  59. package/package.json +1 -1
  60. package/packages/telemetry-contract/src/types.ts +11 -0
  61. package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
  62. package/packages/ui/src/components/EvidenceViewer.tsx +335 -144
  63. package/packages/ui/src/components/EvolutionTimeline.tsx +58 -28
  64. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
  65. package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
  66. package/packages/ui/src/components/section-cards.tsx +12 -9
  67. package/packages/ui/src/primitives/card.tsx +1 -1
  68. package/skill/SKILL.md +40 -2
  69. package/skill/Workflows/AlphaUpload.md +4 -0
  70. package/skill/Workflows/Composability.md +64 -0
  71. package/skill/Workflows/Contribute.md +6 -3
  72. package/skill/Workflows/Contributions.md +97 -0
  73. package/skill/Workflows/CreatorContributions.md +74 -0
  74. package/skill/Workflows/Dashboard.md +31 -0
  75. package/skill/Workflows/Evals.md +57 -8
  76. package/skill/Workflows/Evolve.md +31 -13
  77. package/skill/Workflows/ExportCanonical.md +121 -0
  78. package/skill/Workflows/Hook.md +131 -0
  79. package/skill/Workflows/Ingest.md +7 -0
  80. package/skill/Workflows/Initialize.md +29 -9
  81. package/skill/Workflows/Orchestrate.md +27 -5
  82. package/skill/Workflows/Quickstart.md +94 -0
  83. package/skill/Workflows/Recover.md +84 -0
  84. package/skill/Workflows/RepairSkillUsage.md +95 -0
  85. package/skill/Workflows/Sync.md +18 -12
  86. package/skill/Workflows/Uninstall.md +82 -0
  87. package/skill/settings_snippet.json +11 -0
  88. package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
  89. package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
  90. package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
  91. package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
@@ -4,12 +4,27 @@ Analyze how skills interact when triggered together in the same session.
4
4
  Detects conflict candidates — skill pairs that produce more errors when
5
5
  co-occurring than when used alone.
6
6
 
7
+ Use the same workflow when the user is asking whether a sibling skill family
8
+ should stay split apart or be consolidated under one parent skill.
9
+
7
10
  ## Default Command
8
11
 
9
12
  ```bash
10
13
  selftune eval composability --skill <name> [options]
11
14
  ```
12
15
 
16
+ ## Family Overlap Command
17
+
18
+ ```bash
19
+ selftune eval family-overlap --prefix <family-> [options]
20
+ ```
21
+
22
+ Or analyze an explicit set of siblings:
23
+
24
+ ```bash
25
+ selftune eval family-overlap --skills <skill-a,skill-b,skill-c> [options]
26
+ ```
27
+
13
28
  ## Options
14
29
 
15
30
  | Flag | Description | Default |
@@ -18,6 +33,16 @@ selftune eval composability --skill <name> [options]
18
33
  | `--window <n>` | Only analyze sessions from last N days | All sessions |
19
34
  | `--telemetry-log <path>` | Path to telemetry log | `~/.claude/session_telemetry_log.jsonl` |
20
35
 
36
+ ### Family Overlap Options
37
+
38
+ | Flag | Description | Default |
39
+ | ----------------------- | ------------------------------------------------------------------ | ------- |
40
+ | `--prefix <family->` | Analyze all installed/observed sibling skills with this prefix | Required unless `--skills` |
41
+ | `--skills <a,b,c>` | Analyze a specific skill family | Required unless `--prefix` |
42
+ | `--parent-skill <name>` | Override the suggested consolidated parent skill name | Derived from prefix |
43
+ | `--min-overlap <pct>` | Minimum positive-query overlap, expressed as a fraction (e.g. `0.3` = 30%), to flag consolidation pressure | `0.3` |
44
+ | `--min-shared <n>` | Minimum shared positive queries to flag a sibling pair | `2` |
45
+
21
46
  ## Output Format
22
47
 
23
48
  ```json
@@ -60,6 +85,25 @@ The analyzer is a pure function that computes conflict scores from telemetry:
60
85
  3. Pairs with `conflict_score > 0.3` are flagged as conflict candidates
61
86
  4. Results sorted by co-occurrence count (most common first)
62
87
 
88
+ ## How Family Overlap Works
89
+
90
+ The family-overlap analyzer answers a different question:
91
+
92
+ 1. Build a trusted positive query set for each sibling skill
93
+ 2. Compare every pair of siblings using exact-query overlap
94
+ 3. Flag pairs whose overlap crosses the configured threshold
95
+ 4. If overlap is persistent across the family, emit:
96
+ - consolidation recommendation
97
+ - draft parent skill name
98
+ - internal workflow mapping
99
+ - compatibility alias / migration notes
100
+
101
+ This is for packaging questions like:
102
+
103
+ - "Should `sc-search`, `sc-model`, and `sc-compare` really be one parent skill?"
104
+ - "Are my sibling skills competing for the same user intent?"
105
+ - "Should I stop evolving these independently and redesign the family?"
106
+
63
107
  ## Steps
64
108
 
65
109
  ### 1. Run Analysis
@@ -86,6 +130,18 @@ When conflict candidates are identified, present them to the user with recommend
86
130
  - Consider evolving descriptions to reduce false triggers
87
131
  - Use the `pattern-analyst` agent for deeper cross-skill analysis
88
132
 
133
+ ### 4. Investigate Family Consolidation
134
+
135
+ ```bash
136
+ selftune eval family-overlap --prefix sc-
137
+ ```
138
+
139
+ Interpretation:
140
+
141
+ - `consolidation_candidate: false` means keep improving the sibling descriptions/workflows separately
142
+ - `consolidation_candidate: true` means the problem is likely packaging, not just wording
143
+ - `refactor_proposal` is a draft for human review only; do not auto-deploy a family rewrite
144
+
89
145
  ## Subagent Escalation
90
146
 
91
147
  For deep cross-skill analysis beyond what the composability command provides,
@@ -110,3 +166,11 @@ resolution plan with trigger ownership recommendations.
110
166
  **"Why are sessions with multiple skills failing?"**
111
167
 
112
168
  > Run composability for each skill involved, look for high conflict scores.
169
+
170
+ **"Are my State Change skills too fragmented?"**
171
+
172
+ > `selftune eval family-overlap --prefix sc-`
173
+
174
+ **"Should I consolidate this sibling skill family?"**
175
+
176
+ > Run `selftune eval family-overlap` and look for `consolidation_candidate` plus the `refactor_proposal`.
@@ -1,8 +1,11 @@
1
1
  # selftune Contribute Workflow
2
2
 
3
- Export anonymized skill observability data as a JSON bundle for community
4
- contribution. Helps improve selftune's skill routing without exposing
5
- private data.
3
+ Export anonymized skill observability data as a JSON bundle for **community**
4
+ contribution. Helps improve selftune's skill routing without exposing private data.
5
+
6
+ This is **not** the same as `selftune contributions`, which manages per-skill
7
+ creator-directed sharing preferences, or `selftune creator-contributions`,
8
+ which manages the creator-side bundled config file.
6
9
 
7
10
  ## When to Use
8
11
 
@@ -0,0 +1,97 @@
1
+ # selftune Contributions Workflow
2
+
3
+ Manage local preferences for future creator-directed contribution flows.
4
+
5
+ This is **not** the same as `selftune contribute`:
6
+ - `selftune contributions` manages per-skill opt-in choices for creator-directed sharing
7
+ - `selftune contribute` exports a community contribution bundle
8
+ - `selftune creator-contributions` manages the creator-side `selftune.contribute.json` file
9
+
10
+ ## When to Use
11
+
12
+ - The user asks to approve or revoke sharing signals with a specific skill creator
13
+ - The user wants to see which creator-directed contribution preferences are stored locally
14
+ - The user wants to set a default behavior for future creator-directed contribution prompts
15
+
16
+ ## Default Commands
17
+
18
+ ```bash
19
+ selftune contributions
20
+ selftune contributions preview <skill>
21
+ selftune contributions approve <skill>
22
+ selftune contributions revoke <skill>
23
+ selftune contributions default <ask|always|never>
24
+ selftune contributions upload [--dry-run] [--retry-failed] [--limit <n>]
25
+ ```
26
+
27
+ ## What It Does Today
28
+
29
+ - Discovers installed skills that ship a `selftune.contribute.json` config
30
+ - Stores local opt-in / opt-out state in `~/.selftune/contribution-preferences.json`
31
+ - Stages privacy-safe creator-directed relay signals locally during `selftune sync` once a skill is approved
32
+ - Keeps creator-directed sharing preferences separate from:
33
+ - `selftune contribute` community export bundles
34
+ - `selftune alpha upload` personal cloud uploads
35
+
36
+ ## Commands
37
+
38
+ | Command | Description |
39
+ | --- | --- |
40
+ | `selftune contributions` | Show current creator-directed contribution preferences |
41
+ | `selftune contributions status` | Same as above |
42
+ | `selftune contributions preview <skill>` | Show the privacy-safe relay payload shape for one skill |
43
+ | `selftune contributions approve <skill>` | Approve creator-directed sharing for one skill |
44
+ | `selftune contributions revoke <skill>` | Revoke creator-directed sharing for one skill |
45
+ | `selftune contributions default <ask\|always\|never>` | Set the default behavior for future creator-directed prompts |
46
+ | `selftune contributions upload [--dry-run] [--retry-failed] [--limit <n>]` | Flush locally staged creator-directed relay signals |
47
+ | `selftune contributions reset` | Reset all creator-directed sharing preferences to defaults |
48
+
49
+ ## Upload Flags
50
+
51
+ | Flag | Type | Description |
52
+ | --- | --- | --- |
53
+ | `--dry-run` | Boolean | Show pending staged rows without uploading |
54
+ | `--retry-failed` | Boolean | Requeue failed rows before attempting upload |
55
+ | `--limit <n>` | Integer | Maximum number of staged rows to upload in one run |
56
+
57
+ ## Notes
58
+
59
+ - This workflow now shows which installed skills are requesting creator-directed sharing via `selftune.contribute.json`.
60
+ - Once approved, creator-directed contribution signals are staged locally during `selftune sync` / `selftune orchestrate`.
61
+ - Use `selftune contributions upload` to flush staged rows to the creator-directed relay endpoint.
62
+ - Relay upload is separate from `selftune alpha upload` and currently reuses the local cloud API key when available.
63
+ - Use `selftune contribute` when the user explicitly wants to export/share an anonymized community bundle.
64
+ - Use `selftune alpha upload` when the user wants to push their own cloud telemetry.
65
+
66
+ ## Common Patterns
67
+
68
+ **User asks what creator-directed sharing is configured**
69
+
70
+ > Run `selftune contributions` and summarize the global default plus any per-skill choices.
71
+
72
+ **User wants to allow contribution signals for one skill**
73
+
74
+ > Run `selftune contributions approve <skill>`.
75
+
76
+ **User wants to see what would actually be shared**
77
+
78
+ > Run `selftune contributions preview <skill>` and summarize the requested signals plus the “never shared” guarantees.
79
+
80
+ **User wants to turn off creator-directed sharing for one skill**
81
+
82
+ > Run `selftune contributions revoke <skill>`.
83
+
84
+ **User wants future creator-directed prompts to default one way**
85
+
86
+ > Run `selftune contributions default <ask|always|never>` using the user's preference.
87
+
88
+ **User wants to send staged creator-directed signals now**
89
+
90
+ > Run `selftune contributions upload`.
91
+ > Use `--dry-run` first if they want to confirm how many staged rows are pending.
92
+ > Use `--retry-failed` if earlier relay attempts failed and need to be retried.
93
+ > Use `--limit 25` when they want a smaller controlled batch.
94
+
95
+ **User wants to clear all stored creator-directed contribution preferences**
96
+
97
+ > Run `selftune contributions reset`.
@@ -0,0 +1,74 @@
1
+ # selftune Creator-Contributions Workflow
2
+
3
+ Manage the creator-side `selftune.contribute.json` file bundled with a skill.
4
+
5
+ This is **not** the same as:
6
+ - `selftune contributions` — end-user opt-in / opt-out preferences
7
+ - `selftune contribute` — community export bundle
8
+
9
+ ## When to Use
10
+
11
+ - The user is a skill creator and wants to enable creator-directed contribution for one skill
12
+ - The user wants to inspect or remove a bundled `selftune.contribute.json`
13
+ - The user wants to prepare a skill package for the future creator ← user relay pipeline
14
+
15
+ ## Default Commands
16
+
17
+ ```bash
18
+ selftune creator-contributions
19
+ selftune creator-contributions status --skill <name>
20
+ selftune creator-contributions enable --skill <name> [--skill-path <path>] [--creator-id <id>]
21
+ selftune creator-contributions enable --all [--prefix sc-] [--creator-id <id>]
22
+ selftune creator-contributions disable --skill <name> [--skill-path <path>]
23
+ ```
24
+
25
+ ## Options
26
+
27
+ | Flag | Description |
28
+ | --- | --- |
29
+ | `--skill <name>` | Skill name to inspect or configure |
30
+ | `--skill-path <path>` | Explicit path to the skill's `SKILL.md` when auto-discovery is ambiguous |
31
+ | `--creator-id <id>` | Explicit creator ID. If omitted, selftune uses `alpha.cloud_user_id` from local config when available |
32
+ | `--signals <csv>` | Comma-separated signal list for the generated config |
33
+ | `--message <text>` | Custom opt-in note stored in the config |
34
+ | `--privacy-url <url>` | Optional creator privacy URL stored in the config |
35
+ | `--all` | Enable configs for every installed skill selftune can resolve |
36
+ | `--prefix <prefix>` | Limit `--all` to installed skills whose names start with this prefix |
37
+
38
+ ## What It Does Today
39
+
40
+ - Discovers installed skills that already ship `selftune.contribute.json`
41
+ - Creates or removes that config file locally for a creator-owned skill
42
+ - Can bulk-enable configs for multiple installed skills (useful for a skill suite like `sc-*`)
43
+ - Uses a static JSON config only — no executable creator code
44
+
45
+ ## Notes
46
+
47
+ - This is local packaging/setup only. It does **not** upload creator-directed signals yet.
48
+ - The creator ID is currently sourced from `--creator-id` or the local alpha identity's `cloud_user_id`.
49
+ - Use this workflow when the user is preparing a skill package.
50
+
51
+ ## Common Patterns
52
+
53
+ **User wants to see which of their skills already request creator contributions**
54
+
55
+ > Run `selftune creator-contributions` and summarize the discovered configs.
56
+ > Example: `selftune creator-contributions status --skill sc-search`
57
+
58
+ **User wants to enable creator contributions for one skill**
59
+
60
+ > Run `selftune creator-contributions enable --skill <name>`.
61
+ > If auto-discovery fails, rerun with `--skill-path /path/to/SKILL.md`.
62
+ > If no creator identity is available locally, rerun with `--creator-id <id>`.
63
+ > Example: `selftune creator-contributions enable --skill sc-search --skill-path ./skills/sc-search/SKILL.md --creator-id cr_state_change --signals trigger,grade,miss_category --message "Share privacy-safe usage signals with the skill creator." --privacy-url https://statechange.ai/privacy`
64
+
65
+ **User wants to enable creator contributions for a whole installed skill suite**
66
+
67
+ > Run `selftune creator-contributions enable --all --prefix sc-`.
68
+ > This is the fastest path when preparing a whole family of skills, such as the State Change (`sc-*`) skills.
69
+ > Example: `selftune creator-contributions enable --all --prefix sc- --creator-id cr_state_change`
70
+
71
+ **User wants to stop bundling creator contribution config**
72
+
73
+ > Run `selftune creator-contributions disable --skill <name>`.
74
+ > Example: `selftune creator-contributions disable --skill sc-search --skill-path ./skills/sc-search/SKILL.md`
@@ -49,6 +49,7 @@ override.
49
49
  | `POST` | `/api/actions/watch` | Trigger `selftune watch` for a skill |
50
50
  | `POST` | `/api/actions/evolve` | Trigger `selftune evolve` for a skill |
51
51
  | `POST` | `/api/actions/rollback` | Trigger `selftune evolve rollback` for a skill |
52
+ | `POST` | `/api/actions/watchlist` | Persist creator watchlist preferences |
52
53
 
53
54
  ### Live Updates (SSE)
54
55
 
@@ -98,6 +99,36 @@ All action endpoints return:
98
99
 
99
100
  On failure, `success` is `false` and `error` contains the error message.
100
101
 
102
+ **Watchlist** request body:
103
+
104
+ ```json
105
+ {
106
+ "skills": ["pptx", "sc-search"]
107
+ }
108
+ ```
109
+
110
+ `skills` must be an array of skill names. The action replaces the full persisted
111
+ watchlist for the local dashboard.
112
+
113
+ Watchlist success response:
114
+
115
+ ```json
116
+ {
117
+ "success": true,
118
+ "watched_skills": ["pptx", "sc-search"],
119
+ "error": null
120
+ }
121
+ ```
122
+
123
+ Watchlist failure response:
124
+
125
+ ```json
126
+ {
127
+ "success": false,
128
+ "error": "Missing required field: skills[]"
129
+ }
130
+ ```
131
+
101
132
  ### Browser and Shutdown
102
133
 
103
134
  The live server auto-opens the dashboard URL in the default browser on
@@ -25,7 +25,7 @@ selftune eval generate --skill <name> [options]
25
25
  | Flag | Description | Default |
26
26
  | ---------------------------------- | ----------------------------------------------------- | --------------------------------- |
27
27
  | `--skill <name>` | Skill to generate evals for | Required (unless `--list-skills`) |
28
- | `--list-skills` | List all logged skills with query counts | Off |
28
+ | `--list-skills` | List skills with trusted-vs-raw readiness counts | Off |
29
29
  | `--stats` | Show aggregate telemetry stats for the skill | Off |
30
30
  | `--max <n>` | Maximum eval entries per side | 50 |
31
31
  | `--seed <n>` | Seed for deterministic shuffling | 42 |
@@ -36,6 +36,7 @@ selftune eval generate --skill <name> [options]
36
36
  | `--query-log <path>` | Path to all_queries_log.jsonl | Default log path |
37
37
  | `--telemetry-log <path>` | Path to session_telemetry_log.jsonl | Default log path |
38
38
  | `--synthetic` | Generate evals from SKILL.md via LLM (no logs needed) | Off |
39
+ | `--auto-synthetic` | Fall back to SKILL.md-based cold-start evals when no trusted triggers exist | Off |
39
40
  | `--skill-path <path>` | Path to SKILL.md (required with `--synthetic`) | — |
40
41
  | `--model <model>` | LLM model to use for synthetic generation | Agent default |
41
42
 
@@ -65,8 +66,22 @@ and optional `invocation_type` (omitted when `--no-taxonomy` is set).
65
66
  ```json
66
67
  {
67
68
  "skills": [
68
- { "name": "pptx", "query_count": 42, "session_count": 15 },
69
- { "name": "selftune", "query_count": 28, "session_count": 10 }
69
+ {
70
+ "name": "pptx",
71
+ "trusted_trigger_count": 42,
72
+ "raw_trigger_count": 42,
73
+ "trusted_session_count": 15,
74
+ "raw_session_count": 15,
75
+ "readiness": "log-ready"
76
+ },
77
+ {
78
+ "name": "sc-search",
79
+ "trusted_trigger_count": 0,
80
+ "raw_trigger_count": 1,
81
+ "trusted_session_count": 0,
82
+ "raw_session_count": 1,
83
+ "readiness": "cold-start"
84
+ }
70
85
  ]
71
86
  }
72
87
  ```
@@ -115,7 +130,11 @@ Discover which skills have telemetry data and how many queries each has.
115
130
  selftune eval generate --list-skills
116
131
  ```
117
132
 
118
- Run this first to identify which skills have enough data for eval generation.
133
+ Run this first to identify which skills have enough trusted data for eval generation.
134
+ Installed skills with no trusted trigger history now appear as `cold-start`, which means the
135
+ skill is installed locally and ready for `--auto-synthetic` / `--synthetic` eval generation.
136
+ If raw trigger history exists but trusted positives do not, the list now shows both counts so the
137
+ creator can see that telemetry exists without being misled into thinking the skill is fully ready.
119
138
 
120
139
  ### Generate Synthetic Evals (Cold Start)
121
140
 
@@ -126,20 +145,36 @@ queries directly from the SKILL.md content via an LLM.
126
145
  selftune eval generate --skill pptx --synthetic --skill-path /path/to/skills/pptx/SKILL.md
127
146
  ```
128
147
 
148
+ If the skill is installed locally but has no trusted trigger history yet, use the faster creator
149
+ onboarding path:
150
+
151
+ ```bash
152
+ selftune eval generate --skill pptx --auto-synthetic --skill-path /path/to/skills/pptx/SKILL.md
153
+ ```
154
+
155
+ `--auto-synthetic` keeps the normal log-based path when real trigger data exists, but falls back
156
+ to synthetic cold-start generation when it does not.
157
+
129
158
  The command:
130
159
 
131
160
  1. Reads the SKILL.md file content
132
161
  2. Loads real user queries from the database (if available) as few-shot style examples so synthetic queries match real phrasing patterns
133
- 3. Sends skill content and real examples to an LLM with a prompt requesting realistic test queries
134
- 4. Parses the response into eval entries with invocation type annotations
135
- 5. Classifies each positive query using the deterministic `classifyInvocation()` heuristic
136
- 6. Writes the eval set to the output file
162
+ 3. Detects nearby installed sibling skills to generate harder negative controls
163
+ 4. Over-generates a candidate pool with a balanced prompt family mix (explicit / implicit / contextual positives plus sibling-confusion / adjacent / unrelated negatives)
164
+ 5. Runs a second critique/prune pass to remove weak paraphrases, overlaps, and blurry boundary cases
165
+ 6. Parses the response into eval entries with invocation type annotations
166
+ 7. Classifies each positive query using the deterministic `classifyInvocation()` heuristic
167
+ 8. Writes the eval set to the output file
137
168
 
138
169
  **Note:** When real query data exists in the database, synthetic generation
139
170
  automatically includes high-confidence positive triggers and general queries as
140
171
  phrasing references. This produces more natural-sounding eval queries. If no
141
172
  database is available, generation proceeds without real examples (fail-open).
142
173
 
174
+ The synthetic cold-start path is intentionally small and targeted. It is meant to bootstrap a
175
+ creator skill into its first supervised evolution cycle, not serve as the long-term source of
176
+ truth once real telemetry exists.
177
+
143
178
  Use `--model` to override the default LLM model:
144
179
 
145
180
  ```bash
@@ -165,6 +200,20 @@ The command:
165
200
  5. Annotates each entry with invocation type
166
201
  6. Writes the eval set to the output file
167
202
 
203
+ After generation, the current validation path is:
204
+
205
+ ```bash
206
+ selftune evolve --skill <name> --skill-path /path/to/SKILL.md --eval-set <generated-file> --dry-run
207
+ ```
208
+
209
+ That dry run validates a proposal against the generated eval set without deploying.
210
+
211
+ If the selected skill has no trusted positives yet but selftune can resolve a local `SKILL.md`,
212
+ the command now prints the exact `--auto-synthetic` rerun hint instead of leaving the creator to
213
+ guess the cold-start path.
214
+
215
+ After reviewing a dry-run proposal, deploy by rerunning without `--dry-run`.
216
+
168
217
  ### Show Stats
169
218
 
170
219
  View aggregate telemetry for a skill: average turns, tool call breakdown,
@@ -31,14 +31,16 @@ selftune evolve --skill <name> --skill-path <path> [options]
31
31
  | `--confidence <n>` | Minimum confidence threshold (0-1) | 0.6 |
32
32
  | `--max-iterations <n>` | Maximum retry iterations | 3 |
33
33
  | `--validation-model <model>` | Model for trigger-check validation LLM calls | `haiku` |
34
- | `--pareto` | Generate multiple candidates per iteration | Off |
35
- | `--candidates <n>` | Number of candidates per iteration (with `--pareto`) | 3 |
34
+ | `--pareto` | Generate multiple candidates per iteration | On |
35
+ | `--candidates <n>` | Number of candidates per iteration when Pareto mode is enabled | `3` |
36
36
  | `--token-efficiency` | Optimize for token efficiency in proposals | Off |
37
37
  | `--with-baseline` | Include a no-skill baseline comparison | Off |
38
38
  | `--cheap-loop` | Use cheap models for loop, expensive for final gate | On |
39
39
  | `--full-model` | Use full-cost model throughout (disables cheap-loop) | Off |
40
40
  | `--verbose` | Print detailed progress during evolution | Off |
41
41
  | `--gate-model <model>` | Model for final gate validation | `sonnet` (when `--cheap-loop`) |
42
+ | `--gate-effort <level>` | Thinking effort for the final gate (`low`, `medium`, `high`, or `max`) | None |
43
+ | `--adaptive-gate` | Escalate risky gate checks to `opus` + `high` effort | Off |
42
44
  | `--proposal-model <model>` | Model for proposal generation LLM calls | None |
43
45
  | `--sync-first` | Refresh source-truth telemetry before generating evals/failure patterns | Off |
44
46
  | `--sync-force` | Force a full source rescan during `--sync-first` | Off |
@@ -115,7 +117,7 @@ Ask one `AskUserQuestion` at a time in this order:
115
117
  - `Single model — use one model throughout`
116
118
  4. `Advanced Options`
117
119
  Options:
118
- - `Defaults (0.6 confidence, 3 iterations, single candidate) (recommended)`
120
+ - `Defaults (0.6 confidence, 3 iterations, 3 Pareto candidates) (recommended)`
119
121
  - `Stricter (0.7 confidence, 5 iterations)`
120
122
  - `Pareto mode (multiple candidates per iteration)`
121
123
 
@@ -146,7 +148,7 @@ Configuration Summary:
146
148
  Model: haiku (cheap-loop: sonnet gate)
147
149
  Confidence: 0.6
148
150
  Iterations: 3
149
- Pareto: off
151
+ Pareto: on (3 candidates)
150
152
 
151
153
  Proceeding...
152
154
  ```
@@ -284,15 +286,20 @@ Proposals are scored on heuristic quality criteria (no LLM required). The compos
284
286
 
285
287
  ### Stopping Criteria
286
288
 
287
- The evolution loop stops when any of these conditions is met (priority order):
288
-
289
- | # | Condition | Meaning |
290
- | --- | ------------------ | --------------------------------------------------- |
291
- | 1 | **Converged** | Pass rate >= 0.95 |
292
- | 2 | **Max iterations** | Reached `--max-iterations` limit |
293
- | 3 | **Low confidence** | Proposal confidence below `--confidence` threshold |
294
- | 4 | **Plateau** | Pass rate unchanged across 3 consecutive iterations |
295
- | 5 | **Continue** | None of the above -- keep iterating |
289
+ The evolution loop uses a modular stopping criteria evaluator
290
+ (`evolution/stopping-criteria.ts`) that checks conditions in priority order
291
+ after each validation pass. The evaluator receives the current pass rate,
292
+ historical pass rates from previous iterations, and proposal confidence to
293
+ make a unified stop/continue decision. The stopping reason is recorded in
294
+ audit entries for traceability.
295
+
296
+ | # | Condition | Meaning |
297
+ | --- | ------------------ | -------------------------------------------------------------- |
298
+ | 1 | **Converged** | Pass rate >= 0.95 |
299
+ | 2 | **Max iterations** | Reached `--max-iterations` limit |
300
+ | 3 | **Low confidence** | Proposal confidence below `--confidence` threshold |
301
+ | 4 | **Plateau** | < 1% pass rate variation across 3 consecutive iterations |
302
+ | 5 | **Continue** | None of the above -- keep iterating |
296
303
 
297
304
  ## Cheap Loop Mode
298
305
 
@@ -310,6 +317,11 @@ The gate validation is a new step between validation and deploy. It re-runs
310
317
  `validateProposal` using the gate model. If the gate fails, the proposal is
311
318
  not deployed.
312
319
 
320
+ When `--adaptive-gate` is enabled, selftune keeps the normal gate for low-risk
321
+ proposals and escalates only risky ones to `opus` with `high` effort. Risk
322
+ signals include small net lift, regressions, low proposal confidence, and
323
+ large description broadening.
324
+
313
325
  ```bash
314
326
  # Cheap loop with default models
315
327
  selftune evolve --skill X --skill-path Y --cheap-loop
@@ -317,6 +329,12 @@ selftune evolve --skill X --skill-path Y --cheap-loop
317
329
  # Cheap loop with opus gate
318
330
  selftune evolve --skill X --skill-path Y --cheap-loop --gate-model opus
319
331
 
332
+ # Cheap loop with adaptive escalation for risky proposals
333
+ selftune evolve --skill X --skill-path Y --cheap-loop --adaptive-gate
334
+
335
+ # Explicit high-effort opus gate
336
+ selftune evolve --skill X --skill-path Y --cheap-loop --gate-model opus --gate-effort high
337
+
320
338
  # Manual model control without cheap-loop
321
339
  selftune evolve --skill X --skill-path Y --proposal-model haiku --validation-model sonnet
322
340
  ```
@@ -0,0 +1,121 @@
1
+ # selftune Export Canonical Workflow
2
+
3
+ Export canonical telemetry records as JSONL or as a V2 push payload for cloud
4
+ upload. Canonical records are the normalized, platform-agnostic representation
5
+ of sessions, prompts, skill invocations, execution facts, and normalization runs.
6
+
7
+ ## When to Use
8
+
9
+ - The user wants to export telemetry data for external analysis
10
+ - The user says "export canonical", "canonical export", or "canonical telemetry"
11
+ - The agent needs to produce a push payload for manual inspection before upload
12
+ - Debugging what data would be sent to the cloud API
13
+
14
+ ## Default Command
15
+
16
+ ```bash
17
+ selftune export-canonical
18
+ ```
19
+
20
+ ## Options
21
+
22
+ | Flag | Description |
23
+ | ----------------------- | ------------------------------------------------------------------- |
24
+ | `--out <path>` | Write output to a file instead of stdout |
25
+ | `--platform <name>` | Filter by platform (`claude_code`, `codex`, `opencode`, `openclaw`) |
26
+ | `--record-kind <kind>` | Filter by record kind (`session`, `prompt`, `skill_invocation`, `execution_fact`, `normalization_run`) |
27
+ | `--pretty` | Pretty-print JSON output with 2-space indentation |
28
+ | `--log <path>` | Path to canonical log file (default: `~/.claude/canonical_log.jsonl`) |
29
+ | `--projects-dir <path>` | Claude transcript directory for fallback synthesis (default: `~/.claude/projects`) |
30
+ | `--push-payload` | Output as a V2 push payload envelope instead of raw JSONL |
31
+
32
+ ## Output Formats
33
+
34
+ ### Default (JSONL)
35
+
36
+ One canonical record per line:
37
+
38
+ ```jsonl
39
+ {"record_kind":"session","session_id":"abc123","platform":"claude_code",...}
40
+ {"record_kind":"prompt","prompt_id":"p1","session_id":"abc123",...}
41
+ {"record_kind":"skill_invocation","invocation_id":"inv1","skill_name":"selftune",...}
42
+ ```
43
+
44
+ ### Push Payload (`--push-payload`)
45
+
46
+ A single JSON envelope matching the V2 cloud upload schema:
47
+
48
+ ```json
49
+ {
50
+ "schema_version": "2.0",
51
+ "client_version": "0.1.0",
52
+ "push_id": "uuid",
53
+ "normalizer_version": "1.0.0",
54
+ "canonical": {
55
+ "sessions": [...],
56
+ "prompts": [...],
57
+ "skill_invocations": [...],
58
+ "execution_facts": [...],
59
+ "normalization_runs": [...],
60
+ "evolution_evidence": [...],
61
+ "orchestrate_runs": [],
62
+ "grading_results": [],
63
+ "improvement_signals": []
64
+ }
65
+ }
66
+ ```
67
+
68
+ ### File output (`--out`)
69
+
70
+ When `--out` is specified, the data is written to the file and a JSON summary
71
+ is printed to stdout:
72
+
73
+ ```json
74
+ {
75
+ "ok": true,
76
+ "out": "/path/to/output.jsonl",
77
+ "count": 42,
78
+ "format": "jsonl",
79
+ "pretty": false,
80
+ "platform": null,
81
+ "record_kind": null
82
+ }
83
+ ```
84
+
85
+ ## Fallback Behavior
86
+
87
+ If the canonical log file is empty or does not exist, the command falls back to
88
+ synthesizing canonical records directly from Claude Code transcripts in
89
+ `--projects-dir`. This supports existing installs that have rich transcript
90
+ data but have not yet generated a canonical log.
91
+
92
+ ## Common Patterns
93
+
94
+ **Export all canonical data**
95
+
96
+ > Run `selftune export-canonical > export.jsonl` to dump everything.
97
+
98
+ **Export only skill invocations**
99
+
100
+ > Run `selftune export-canonical --record-kind skill_invocation` to filter.
101
+
102
+ **Inspect push payload before upload**
103
+
104
+ > Run `selftune export-canonical --push-payload --pretty` to see exactly what would be sent to the cloud API.
105
+
106
+ **Export to file with summary**
107
+
108
+ > Run `selftune export-canonical --out /tmp/export.jsonl --pretty` to write the data to a file and print a JSON summary (including the record count) to stdout.
109
+
110
+ **Filter by platform**
111
+
112
+ > Run `selftune export-canonical --platform claude_code` to export only Claude Code records.
113
+
114
+ ## Troubleshooting
115
+
116
+ | Symptom | Cause | Fix |
117
+ | --- | --- | --- |
118
+ | Empty output | No canonical log and no transcripts | Run `selftune sync` or `selftune quickstart` to ingest data first |
119
+ | "Unknown platform" error | Invalid `--platform` value | Use one of: `claude_code`, `codex`, `opencode`, `openclaw` |
120
+ | "Unknown record kind" error | Invalid `--record-kind` value | Use one of: `session`, `prompt`, `skill_invocation`, `execution_fact`, `normalization_run` |
121
+ | Push payload missing evolution evidence | No evolution runs recorded | Run `selftune evolve` to generate evidence, then re-export |