selftune 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. package/README.md +35 -35
  2. package/apps/local-dashboard/dist/assets/index-BZVLv70T.js +16 -0
  3. package/apps/local-dashboard/dist/assets/{vendor-react-BQH_6WrG.js → vendor-react-BXP54cYo.js} +4 -4
  4. package/apps/local-dashboard/dist/assets/{vendor-table-dK1QMLq9.js → vendor-table-DTF_SXoy.js} +1 -1
  5. package/apps/local-dashboard/dist/assets/{vendor-ui-CO2mrx6e.js → vendor-ui-CWU0d1wd.js} +66 -66
  6. package/apps/local-dashboard/dist/index.html +15 -15
  7. package/bin/selftune.cjs +1 -1
  8. package/cli/selftune/activation-rules.ts +1 -0
  9. package/cli/selftune/alpha-upload/build-payloads.ts +18 -2
  10. package/cli/selftune/alpha-upload/stage-canonical.ts +94 -0
  11. package/cli/selftune/auth/device-code.ts +32 -0
  12. package/cli/selftune/auto-update.ts +12 -0
  13. package/cli/selftune/badge/badge.ts +1 -0
  14. package/cli/selftune/canonical-export.ts +5 -0
  15. package/cli/selftune/claude-agents.ts +154 -0
  16. package/cli/selftune/contribute/bundle.ts +1 -0
  17. package/cli/selftune/contribute/contribute.ts +1 -0
  18. package/cli/selftune/cron/setup.ts +2 -2
  19. package/cli/selftune/dashboard-server.ts +1 -0
  20. package/cli/selftune/eval/hooks-to-evals.ts +1 -0
  21. package/cli/selftune/eval/import-skillsbench.ts +1 -0
  22. package/cli/selftune/eval/synthetic-evals.ts +2 -3
  23. package/cli/selftune/eval/unit-test.ts +1 -0
  24. package/cli/selftune/evolution/deploy-proposal.ts +1 -0
  25. package/cli/selftune/evolution/evolve-body.ts +93 -6
  26. package/cli/selftune/evolution/evolve.ts +0 -1
  27. package/cli/selftune/evolution/propose-body.ts +3 -2
  28. package/cli/selftune/evolution/propose-routing.ts +3 -2
  29. package/cli/selftune/evolution/refine-body.ts +3 -2
  30. package/cli/selftune/export.ts +1 -0
  31. package/cli/selftune/grading/grade-session.ts +8 -0
  32. package/cli/selftune/hooks/auto-activate.ts +1 -0
  33. package/cli/selftune/hooks/evolution-guard.ts +1 -1
  34. package/cli/selftune/hooks/prompt-log.ts +1 -0
  35. package/cli/selftune/hooks/session-stop.ts +34 -40
  36. package/cli/selftune/hooks/skill-change-guard.ts +1 -0
  37. package/cli/selftune/hooks/skill-eval.ts +1 -1
  38. package/cli/selftune/index.ts +23 -14
  39. package/cli/selftune/ingestors/claude-replay.ts +1 -0
  40. package/cli/selftune/ingestors/codex-rollout.ts +1 -0
  41. package/cli/selftune/ingestors/codex-wrapper.ts +1 -0
  42. package/cli/selftune/ingestors/openclaw-ingest.ts +1 -0
  43. package/cli/selftune/ingestors/opencode-ingest.ts +1 -0
  44. package/cli/selftune/init.ts +121 -29
  45. package/cli/selftune/localdb/db.ts +1 -0
  46. package/cli/selftune/localdb/direct-write.ts +39 -0
  47. package/cli/selftune/localdb/materialize.ts +2 -0
  48. package/cli/selftune/localdb/queries.ts +53 -0
  49. package/cli/selftune/localdb/schema.ts +28 -0
  50. package/cli/selftune/normalization.ts +1 -0
  51. package/cli/selftune/observability.ts +1 -0
  52. package/cli/selftune/repair/skill-usage.ts +1 -0
  53. package/cli/selftune/routes/orchestrate-runs.ts +1 -0
  54. package/cli/selftune/routes/overview.ts +1 -0
  55. package/cli/selftune/routes/skill-report.ts +1 -0
  56. package/cli/selftune/sync.ts +30 -1
  57. package/cli/selftune/uninstall.ts +412 -0
  58. package/cli/selftune/utils/canonical-log.ts +2 -0
  59. package/cli/selftune/utils/jsonl.ts +1 -0
  60. package/cli/selftune/utils/llm-call.ts +131 -3
  61. package/cli/selftune/utils/skill-log.ts +1 -0
  62. package/cli/selftune/utils/transcript.ts +1 -0
  63. package/cli/selftune/utils/trigger-check.ts +1 -1
  64. package/cli/selftune/workflows/skill-md-writer.ts +5 -5
  65. package/cli/selftune/workflows/workflows.ts +1 -0
  66. package/package.json +37 -33
  67. package/packages/telemetry-contract/fixtures/golden.test.ts +1 -0
  68. package/packages/telemetry-contract/package.json +1 -1
  69. package/packages/telemetry-contract/src/schemas.ts +1 -0
  70. package/packages/telemetry-contract/tests/compatibility.test.ts +1 -0
  71. package/packages/ui/README.md +35 -34
  72. package/packages/ui/package.json +3 -3
  73. package/packages/ui/src/components/ActivityTimeline.tsx +49 -42
  74. package/packages/ui/src/components/EvidenceViewer.tsx +306 -182
  75. package/packages/ui/src/components/EvolutionTimeline.tsx +83 -72
  76. package/packages/ui/src/components/InfoTip.tsx +4 -3
  77. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +60 -53
  78. package/packages/ui/src/components/section-cards.tsx +19 -24
  79. package/packages/ui/src/components/skill-health-grid.tsx +213 -193
  80. package/packages/ui/src/lib/constants.tsx +1 -0
  81. package/packages/ui/src/primitives/badge.tsx +12 -15
  82. package/packages/ui/src/primitives/button.tsx +7 -7
  83. package/packages/ui/src/primitives/card.tsx +15 -26
  84. package/packages/ui/src/primitives/checkbox.tsx +7 -8
  85. package/packages/ui/src/primitives/collapsible.tsx +5 -5
  86. package/packages/ui/src/primitives/dropdown-menu.tsx +45 -55
  87. package/packages/ui/src/primitives/label.tsx +6 -6
  88. package/packages/ui/src/primitives/select.tsx +28 -37
  89. package/packages/ui/src/primitives/table.tsx +17 -44
  90. package/packages/ui/src/primitives/tabs.tsx +14 -21
  91. package/packages/ui/src/primitives/tooltip.tsx +10 -22
  92. package/skill/SKILL.md +70 -57
  93. package/skill/Workflows/AlphaUpload.md +4 -4
  94. package/skill/Workflows/AutoActivation.md +11 -6
  95. package/skill/Workflows/Badge.md +22 -16
  96. package/skill/Workflows/Baseline.md +34 -36
  97. package/skill/Workflows/Composability.md +16 -11
  98. package/skill/Workflows/Contribute.md +26 -21
  99. package/skill/Workflows/Cron.md +23 -22
  100. package/skill/Workflows/Dashboard.md +32 -27
  101. package/skill/Workflows/Doctor.md +33 -27
  102. package/skill/Workflows/Evals.md +48 -47
  103. package/skill/Workflows/EvolutionMemory.md +31 -21
  104. package/skill/Workflows/Evolve.md +84 -82
  105. package/skill/Workflows/EvolveBody.md +58 -47
  106. package/skill/Workflows/Grade.md +16 -13
  107. package/skill/Workflows/ImportSkillsBench.md +9 -6
  108. package/skill/Workflows/Ingest.md +36 -21
  109. package/skill/Workflows/Initialize.md +108 -40
  110. package/skill/Workflows/Orchestrate.md +22 -16
  111. package/skill/Workflows/Replay.md +12 -7
  112. package/skill/Workflows/Rollback.md +13 -6
  113. package/skill/Workflows/Schedule.md +6 -6
  114. package/skill/Workflows/Sync.md +18 -11
  115. package/skill/Workflows/UnitTest.md +28 -17
  116. package/skill/Workflows/Watch.md +28 -21
  117. package/skill/agents/diagnosis-analyst.md +11 -0
  118. package/skill/agents/evolution-reviewer.md +15 -1
  119. package/skill/agents/integration-guide.md +10 -0
  120. package/skill/agents/pattern-analyst.md +12 -1
  121. package/skill/references/grading-methodology.md +23 -24
  122. package/skill/references/interactive-config.md +7 -7
  123. package/skill/references/invocation-taxonomy.md +22 -20
  124. package/skill/references/logs.md +14 -6
  125. package/skill/references/setup-patterns.md +4 -2
  126. package/.claude/agents/diagnosis-analyst.md +0 -156
  127. package/.claude/agents/evolution-reviewer.md +0 -180
  128. package/.claude/agents/integration-guide.md +0 -212
  129. package/.claude/agents/pattern-analyst.md +0 -160
  130. package/apps/local-dashboard/dist/assets/index-C4UYGWKr.js +0 -15
@@ -19,21 +19,22 @@ selftune sync
19
19
 
20
20
  ## Options
21
21
 
22
- | Flag | Description |
23
- |------|-------------|
24
- | `--since <date>` | Only sync sessions modified on/after this date |
25
- | `--dry-run` | Show summary without writing files |
26
- | `--force` | Ignore per-source markers and rescan everything |
27
- | `--no-claude` | Skip Claude transcript replay |
28
- | `--no-codex` | Skip Codex rollout ingest |
29
- | `--no-opencode` | Skip OpenCode ingest |
30
- | `--no-openclaw` | Skip OpenClaw ingest |
31
- | `--no-repair` | Skip rebuilding `skill_usage_repaired.jsonl` |
32
- | `--json` | Output results as JSON |
22
+ | Flag | Description |
23
+ | ---------------- | ----------------------------------------------- |
24
+ | `--since <date>` | Only sync sessions modified on/after this date |
25
+ | `--dry-run` | Show summary without writing files |
26
+ | `--force` | Ignore per-source markers and rescan everything |
27
+ | `--no-claude` | Skip Claude transcript replay |
28
+ | `--no-codex` | Skip Codex rollout ingest |
29
+ | `--no-opencode` | Skip OpenCode ingest |
30
+ | `--no-openclaw` | Skip OpenClaw ingest |
31
+ | `--no-repair` | Skip rebuilding `skill_usage_repaired.jsonl` |
32
+ | `--json` | Output results as JSON |
33
33
 
34
34
  ## Output
35
35
 
36
36
  Written/refreshed data:
37
+
37
38
  - `~/.claude/session_telemetry_log.jsonl`
38
39
  - `~/.claude/all_queries_log.jsonl`
39
40
  - `~/.claude/skill_usage_log.jsonl`
@@ -50,6 +51,7 @@ counts. Report the preview summary to the user.
50
51
  ### 2. Run Sync
51
52
 
52
53
  Run `selftune sync`. The output includes:
54
+
53
55
  - Per-source `scanned`, `synced`, and `skipped` counts
54
56
  - Repaired overlay totals
55
57
  - Any errors or warnings
@@ -92,20 +94,25 @@ Use `--json` when the agent needs to parse sync results programmatically
92
94
  ## Common Patterns
93
95
 
94
96
  **User wants to refresh telemetry data**
97
+
95
98
  > Run `selftune sync`. Report per-source `scanned`, `synced`, and `skipped` counts.
96
99
 
97
100
  **User wants to sync only recent sessions**
101
+
98
102
  > Run `selftune sync --since <date>` with the user's specified date.
99
103
 
100
104
  **User wants a full rescan from scratch**
105
+
101
106
  > Run `selftune sync --force`. This ignores per-source markers and rescans
102
107
  > all sessions.
103
108
 
104
109
  **Agent needs to verify sync worked**
110
+
105
111
  > Check per-source `scanned`, `synced`, and `skipped` counts. `synced=0`
106
112
  > is normal when data is already up-to-date. Verify `scanned > 0` for
107
113
  > expected sources to confirm sync ran successfully.
108
114
 
109
115
  **Agent is chaining into monitoring or evolution**
116
+
110
117
  > Use `selftune watch --sync-first` or `selftune evolve --sync-first` to
111
118
  > refresh source truth automatically before making decisions.
@@ -11,15 +11,15 @@ selftune eval unit-test --skill <name> --tests <path> [options]
11
11
 
12
12
  ## Options
13
13
 
14
- | Flag | Description | Default |
15
- |------|-------------|---------|
16
- | `--skill <name>` | Skill name | Required |
17
- | `--tests <path>` | Path to unit test JSON file | `~/.selftune/unit-tests/<skill>.json` |
18
- | `--run-agent` | Run agent-based assertions (not just trigger checks) | Off |
19
- | `--generate` | Generate tests from skill content instead of running | Off |
20
- | `--skill-path <path>` | Path to SKILL.md (required for `--generate`) | None |
21
- | `--eval-set <path>` | Eval set for failure context (used with `--generate`) | None |
22
- | `--model <flag>` | Model flag for LLM calls | Agent default |
14
+ | Flag | Description | Default |
15
+ | --------------------- | ----------------------------------------------------- | ------------------------------------- |
16
+ | `--skill <name>` | Skill name | Required |
17
+ | `--tests <path>` | Path to unit test JSON file | `~/.selftune/unit-tests/<skill>.json` |
18
+ | `--run-agent` | Run agent-based assertions (not just trigger checks) | Off |
19
+ | `--generate` | Generate tests from skill content instead of running | Off |
20
+ | `--skill-path <path>` | Path to SKILL.md (required for `--generate`) | None |
21
+ | `--eval-set <path>` | Eval set for failure context (used with `--generate`) | None |
22
+ | `--model <flag>` | Model flag for LLM calls | Agent default |
23
23
 
24
24
  ## Test Format
25
25
 
@@ -48,12 +48,12 @@ Tests are stored as JSON arrays in `~/.selftune/unit-tests/<skill>.json`:
48
48
 
49
49
  ## Assertion Types
50
50
 
51
- | Type | What it checks | Requires agent? |
52
- |------|---------------|-----------------|
53
- | `trigger_check` | Query triggers the skill description | No (LLM only) |
54
- | `output_contains` | Agent output contains expected text | Yes |
55
- | `output_matches_regex` | Agent output matches regex pattern | Yes |
56
- | `tool_called` | Agent used a specific tool | Yes |
51
+ | Type | What it checks | Requires agent? |
52
+ | ---------------------- | ------------------------------------ | --------------- |
53
+ | `trigger_check` | Query triggers the skill description | No (LLM only) |
54
+ | `output_contains` | Agent output contains expected text | Yes |
55
+ | `output_matches_regex` | Agent output matches regex pattern | Yes |
56
+ | `tool_called` | Agent used a specific tool | Yes |
57
57
 
58
58
  Trigger check assertions are cheap (single LLM call). Agent-based assertions
59
59
  require `--run-agent` and run the query through the full agent.
@@ -66,14 +66,19 @@ require `--run-agent` and run the query through the full agent.
66
66
  "total": 10,
67
67
  "passed": 8,
68
68
  "failed": 2,
69
- "pass_rate": 0.80,
69
+ "pass_rate": 0.8,
70
70
  "results": [
71
71
  {
72
72
  "test_id": "research-trigger-1",
73
73
  "overall_passed": true,
74
74
  "trigger_passed": true,
75
75
  "assertion_results": [
76
- { "type": "trigger_check", "value": "true", "passed": true, "evidence": "LLM responded YES" }
76
+ {
77
+ "type": "trigger_check",
78
+ "value": "true",
79
+ "passed": true,
80
+ "evidence": "LLM responded YES"
81
+ }
77
82
  ],
78
83
  "duration_ms": 450
79
84
  }
@@ -93,6 +98,7 @@ selftune eval unit-test --skill Research --generate --skill-path ~/.claude/skill
93
98
  ```
94
99
 
95
100
  Parse the output. The LLM creates test cases covering:
101
+
96
102
  - Explicit trigger queries
97
103
  - Implicit trigger queries
98
104
  - Contextual trigger queries
@@ -114,6 +120,7 @@ Add `--run-agent` for full agent-based assertions.
114
120
  ### 3. Parse Results
115
121
 
116
122
  Parse the JSON output. Check `pass_rate` and investigate failures:
123
+
117
124
  - Failed trigger checks -- description needs improvement (route to Evolve)
118
125
  - Failed output assertions -- skill workflow needs fixes
119
126
  - Failed tool assertions -- skill routing is broken
@@ -134,17 +141,21 @@ the evolution improved trigger accuracy.
134
141
  ## Common Patterns
135
142
 
136
143
  **User asks to generate tests for a skill**
144
+
137
145
  > Run `selftune eval unit-test --skill <name> --generate --skill-path <path>`.
138
146
  > Parse the output and report how many tests were generated.
139
147
 
140
148
  **User asks to run existing tests**
149
+
141
150
  > Run `selftune eval unit-test --skill <name>`. Parse the JSON output and
142
151
  > report pass rate and any failures.
143
152
 
144
153
  **User asks for full agent-based testing**
154
+
145
155
  > Run `selftune eval unit-test --skill <name> --run-agent`. This runs queries
146
156
  > through the full agent, so inform the user it will take longer.
147
157
 
148
158
  **After an evolution completes**
159
+
149
160
  > Run unit tests to verify the evolution improved trigger accuracy. Compare
150
161
  > the new pass rate against the pre-evolution baseline.
@@ -11,15 +11,15 @@ selftune watch --skill <name> --skill-path <path> [options]
11
11
 
12
12
  ## Options
13
13
 
14
- | Flag | Description | Default |
15
- |------|-------------|---------|
16
- | `--skill <name>` | Skill name | Required |
17
- | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
18
- | `--window <n>` | Sliding window size (number of sessions) | 20 |
19
- | `--threshold <n>` | Regression threshold (drop from baseline) | 0.1 |
20
- | `--auto-rollback` | Automatically rollback on detected regression | Off |
21
- | `--sync-first` | Refresh source-truth telemetry before evaluating | Off |
22
- | `--sync-force` | Force a full source rescan during `--sync-first` | Off |
14
+ | Flag | Description | Default |
15
+ | --------------------- | ------------------------------------------------ | -------- |
16
+ | `--skill <name>` | Skill name | Required |
17
+ | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
18
+ | `--window <n>` | Sliding window size (number of sessions) | 20 |
19
+ | `--threshold <n>` | Regression threshold (drop from baseline) | 0.1 |
20
+ | `--auto-rollback` | Automatically roll back on detected regression | Off |
21
+ | `--sync-first` | Refresh source-truth telemetry before evaluating | Off |
22
+ | `--sync-force` | Force a full source rescan during `--sync-first` | Off |
23
23
 
24
24
  ## Output Format
25
25
 
@@ -40,12 +40,12 @@ selftune watch --skill <name> --skill-path <path> [options]
40
40
 
41
41
  ### Status Values
42
42
 
43
- | Status | Meaning |
44
- |--------|---------|
45
- | `healthy` | Current pass rate is within threshold of baseline |
46
- | `warning` | Pass rate dropped but within threshold |
47
- | `regression` | Pass rate dropped below baseline minus threshold |
48
- | `insufficient_data` | Not enough sessions in the window to evaluate |
43
+ | Status | Meaning |
44
+ | ------------------- | ------------------------------------------------- |
45
+ | `healthy` | Current pass rate is within threshold of baseline |
46
+ | `warning` | Pass rate dropped but within threshold |
47
+ | `regression` | Pass rate dropped below baseline minus threshold |
48
+ | `insufficient_data` | Not enough sessions in the window to evaluate |
49
49
 
50
50
  ## Parsing Instructions
51
51
 
@@ -69,6 +69,7 @@ selftune watch --skill <name> --skill-path <path> [options]
69
69
  ### 0. Read Evolution Context
70
70
 
71
71
  Read `~/.selftune/memory/context.md` for session context:
72
+
72
73
  - Active evolutions and their current status
73
74
  - Known issues and regression history
74
75
  - Last update timestamp
@@ -91,16 +92,17 @@ selftune watch --skill pptx --skill-path /path/to/SKILL.md
91
92
 
92
93
  Parse the JSON output. Key decision points:
93
94
 
94
- | Status | Action |
95
- |--------|--------|
96
- | `healthy` | No action needed. Skill is performing well. |
97
- | `warning` | Monitor closely. Consider re-running after more sessions. |
98
- | `regression` | Investigate. Consider rollback. |
99
- | `insufficient_data` | Wait for more sessions before evaluating. |
95
+ | Status | Action |
96
+ | ------------------- | --------------------------------------------------------- |
97
+ | `healthy` | No action needed. Skill is performing well. |
98
+ | `warning` | Monitor closely. Consider re-running after more sessions. |
99
+ | `regression` | Investigate. Consider rollback. |
100
+ | `insufficient_data` | Wait for more sessions before evaluating. |
100
101
 
101
102
  ### 3. Decide Action
102
103
 
103
104
  If regression is detected:
105
+
104
106
  - Review recent session transcripts to understand what changed
105
107
  - Check if the eval set is still representative
106
108
  - Run `selftune evolve rollback` if the regression is confirmed (see `Workflows/Rollback.md`)
@@ -111,6 +113,7 @@ previous description and logs a `rolled_back` entry.
111
113
  ### 4. Report
112
114
 
113
115
  Summarize the snapshot for the user:
116
+
114
117
  - Current pass rate vs baseline
115
118
  - Number of sessions evaluated
116
119
  - Whether regression was detected
@@ -126,16 +129,20 @@ context window resets before the user acts on the results.
126
129
  ## Common Patterns
127
130
 
128
131
  **"Is the skill performing well after the change?"**
132
+
129
133
  > Run watch with the skill name and path. Report the snapshot.
130
134
 
131
135
  **"Check for regressions"**
136
+
132
137
  > Same as above. Focus on the `regression_detected` and `delta` fields.
133
138
 
134
139
  **"How is the skill doing?"**
140
+
135
141
  > Run watch. If `insufficient_data`, tell the user to wait for more
136
142
  > sessions before drawing conclusions.
137
143
 
138
144
  **"Auto-rollback if it regresses"**
145
+
139
146
  > Use `--auto-rollback`. The command will restore the previous description
140
147
  > automatically if pass rate drops below baseline minus threshold.
141
148
 
@@ -88,6 +88,7 @@ selftune eval generate --skill <name> --max 50
88
88
 
89
89
  Treat these outputs as exploratory summaries. Verify important claims against
90
90
  the underlying logs:
91
+
91
92
  - `~/.claude/skill_usage_log.jsonl`
92
93
  - `~/.claude/all_queries_log.jsonl`
93
94
  - `~/.claude/session_telemetry_log.jsonl`
@@ -96,6 +97,7 @@ the underlying logs:
96
97
 
97
98
  Read `~/.claude/evolution_audit_log.jsonl` for entries affecting the target
98
99
  skill. Look for:
100
+
99
101
  - recent deploys followed by regressions
100
102
  - repeated dry-runs or validated proposals with no deploy
101
103
  - rollbacks
@@ -107,6 +109,7 @@ Prefer the specific sessions passed by the parent. Otherwise, select recent
107
109
  sessions that show errors, unmatched queries, or clear misses.
108
110
 
109
111
  Look for:
112
+
110
113
  - the skill never being read or invoked
111
114
  - the wrong workflow being chosen
112
115
  - steps performed out of order
@@ -121,6 +124,7 @@ smallest credible next action.
121
124
  ## Stop Conditions
122
125
 
123
126
  Stop and return to the parent if:
127
+
124
128
  - the target skill is ambiguous
125
129
  - the required logs or transcripts are unavailable
126
130
  - the evidence is limited to one isolated session
@@ -134,30 +138,37 @@ Return a compact report with these sections:
134
138
  ## Diagnosis Report: <skill-name>
135
139
 
136
140
  ### Summary
141
+
137
142
  [2-4 sentence explanation of what is going wrong]
138
143
 
139
144
  ### Root Cause
145
+
140
146
  [TRIGGER / PROCESS / QUALITY / INFRASTRUCTURE]
141
147
 
142
148
  ### Findings
149
+
143
150
  - [Finding 1]
144
151
  - [Finding 2]
145
152
  - [Finding 3]
146
153
 
147
154
  ### Evidence
155
+
148
156
  - [path or command result]
149
157
  - [session ID / query / timestamp]
150
158
  - [audit or transcript evidence]
151
159
 
152
160
  ### Recommended Next Actions
161
+
153
162
  1. [Highest-leverage next step]
154
163
  2. [Second step]
155
164
  3. [Optional follow-up]
156
165
 
157
166
  ### Suggested Commands
167
+
158
168
  - `...`
159
169
  - `...`
160
170
 
161
171
  ### Confidence
172
+
162
173
  [high / medium / low]
163
174
  ```
@@ -3,7 +3,7 @@ name: evolution-reviewer
3
3
  description: Use when reviewing a dry-run or pending evolution proposal before deployment, especially for high-stakes skills, marginal improvements, or recent regressions. Compares old vs new content, checks evidence quality, and returns an approve or reject verdict with conditions.
4
4
  tools: Read, Grep, Glob, Bash
5
5
  disallowedTools: Write, Edit
6
- model: sonnet
6
+ model: opus
7
7
  maxTurns: 8
8
8
  ---
9
9
 
@@ -69,12 +69,14 @@ selftune evolve --skill <name> --skill-path <path> --dry-run
69
69
  ### 2. Compare original vs proposed content
70
70
 
71
71
  For description proposals, compare:
72
+
72
73
  - preserved working anchors
73
74
  - added language for missed queries
74
75
  - scope creep or vague broadening
75
76
  - tone and style continuity
76
77
 
77
78
  For routing/body proposals, compare:
79
+
78
80
  - workflow routing ownership changes
79
81
  - added or removed operational steps
80
82
  - whether the body still matches current CLI behavior
@@ -83,6 +85,7 @@ For routing/body proposals, compare:
83
85
  ### 3. Assess eval and evidence quality
84
86
 
85
87
  Check:
88
+
86
89
  - eval size is meaningful for the change being proposed
87
90
  - negatives exist for overtriggering protection
88
91
  - explicit queries are protected
@@ -91,6 +94,7 @@ Check:
91
94
  ### 4. Check metrics and history
92
95
 
93
96
  Review proposal metrics and recent history:
97
+
94
98
  - pass-rate delta
95
99
  - regression count or obvious explicit regressions
96
100
  - confidence
@@ -99,6 +103,7 @@ Review proposal metrics and recent history:
99
103
  ### 5. Render a safety verdict
100
104
 
101
105
  Issue one of:
106
+
102
107
  - `APPROVE`
103
108
  - `APPROVE WITH CONDITIONS`
104
109
  - `REJECT`
@@ -106,6 +111,7 @@ Issue one of:
106
111
  ## Stop Conditions
107
112
 
108
113
  Stop and return to the parent if:
114
+
109
115
  - there is no concrete proposal or diff to review
110
116
  - the target skill or proposal is ambiguous
111
117
  - the eval source is missing
@@ -119,31 +125,39 @@ Return a compact verdict with these sections:
119
125
  ## Evolution Review: <skill-name>
120
126
 
121
127
  ### Proposal ID
128
+
122
129
  [proposal ID or "not provided"]
123
130
 
124
131
  ### Verdict
132
+
125
133
  [APPROVE / APPROVE WITH CONDITIONS / REJECT]
126
134
 
127
135
  ### Summary
136
+
128
137
  [2-4 sentence explanation]
129
138
 
130
139
  ### Findings
140
+
131
141
  - [Finding 1]
132
142
  - [Finding 2]
133
143
  - [Finding 3]
134
144
 
135
145
  ### Evidence
146
+
136
147
  - [audit entry / eval fact / diff observation]
137
148
  - [audit entry / eval fact / diff observation]
138
149
 
139
150
  ### Required Changes
151
+
140
152
  1. [Only if not approved]
141
153
  2. [Only if not approved]
142
154
 
143
155
  ### Post-Deploy Conditions
156
+
144
157
  - [watch requirement or monitoring threshold]
145
158
  - [follow-up check]
146
159
 
147
160
  ### Confidence
161
+
148
162
  [high / medium / low]
149
163
  ```
@@ -46,6 +46,7 @@ parent. Do not ask the user directly unless the parent explicitly told you to.
46
46
  ### 1. Detect project structure
47
47
 
48
48
  Inspect the workspace and classify it as one of:
49
+
49
50
  - single-skill project
50
51
  - multi-skill repo
51
52
  - monorepo with shared tooling
@@ -64,6 +65,7 @@ selftune doctor
64
65
  ```
65
66
 
66
67
  Check:
68
+
67
69
  - whether the CLI exists
68
70
  - whether `config.json` exists and looks current (resolve via `SELFTUNE_CONFIG_DIR` or `SELFTUNE_HOME` env vars first, falling back to `~/.selftune/`; run `selftune doctor` to confirm the resolved path)
69
71
  - whether hooks or ingest paths are healthy
@@ -80,6 +82,7 @@ selftune init [--agent claude_code] [--cli-path <path>] [--force]
80
82
  For other platforms, route to the appropriate ingest workflow after init.
81
83
 
82
84
  If the repo layout is complex, decide whether the user needs:
85
+
83
86
  - one shared setup at the repo root
84
87
  - per-package setup guidance
85
88
  - absolute paths to avoid cwd-dependent failures
@@ -89,6 +92,7 @@ If the repo layout is complex, decide whether the user needs:
89
92
  If `requestedMode` is `plan-only`, stop at a verified setup plan.
90
93
 
91
94
  If `requestedMode` is `hands-on`, you may:
95
+
92
96
  - run `selftune init`
93
97
  - create or refresh local activation-rules files
94
98
  - repair obvious path or config issues
@@ -116,6 +120,7 @@ run evals, improve a skill, or set up autonomous orchestration.
116
120
  ## Stop Conditions
117
121
 
118
122
  Stop and return to the parent if:
123
+
119
124
  - the project root is ambiguous
120
125
  - the CLI is missing and installation is not allowed
121
126
  - the repo has no skills and the task is really skill creation, not setup
@@ -130,25 +135,30 @@ Return a setup report with these sections:
130
135
  ## selftune Setup Complete
131
136
 
132
137
  ### Environment
138
+
133
139
  - Agent platform: <claude_code / codex / opencode / openclaw / unknown>
134
140
  - Project type: <single-skill / multi-skill / monorepo / no-skills>
135
141
  - Skills detected: <list>
136
142
 
137
143
  ### Configuration
144
+
138
145
  - Config: [created / verified / missing]
139
146
  - Init path: [command used or recommended]
140
147
  - Hooks or ingest: [healthy / needs work / not applicable]
141
148
  - Doctor: [healthy / unhealthy with blockers]
142
149
 
143
150
  ### Verification
151
+
144
152
  - Telemetry capture: [working / not verified]
145
153
  - Skill tracking: [working / not verified]
146
154
 
147
155
  ### Next Steps
156
+
148
157
  1. [Primary recommended action]
149
158
  2. [Secondary action]
150
159
  3. [Optional action]
151
160
 
152
161
  ### Confidence
162
+
153
163
  [high / medium / low]
154
164
  ```
@@ -66,6 +66,7 @@ Then read the actual `SKILL.md` files for the skills in scope.
66
66
  ### 2. Extract each skill's ownership contract
67
67
 
68
68
  For each skill, capture:
69
+
69
70
  - frontmatter description
70
71
  - workflow-routing triggers
71
72
  - explicit exclusions or negative examples
@@ -74,6 +75,7 @@ For each skill, capture:
74
75
  ### 3. Detect conflicts and gaps
75
76
 
76
77
  Compare trigger keywords and description phrases across all skills. Flag:
78
+
77
79
  - direct conflicts
78
80
  - semantic overlaps
79
81
  - negative-example gaps
@@ -83,6 +85,7 @@ Compare trigger keywords and description phrases across all skills. Flag:
83
85
  ### 4. Analyze real query behavior
84
86
 
85
87
  Read the logs and look for:
88
+
86
89
  - queries that triggered multiple skills
87
90
  - queries that triggered no skills despite matching one or more descriptions
88
91
  - queries that appear to have been routed to the wrong skill
@@ -103,6 +106,7 @@ shifted ownership or introduced churn.
103
106
  ### 6. Recommend ownership changes
104
107
 
105
108
  For each important conflict, state:
109
+
106
110
  - which skill should own the query family
107
111
  - which skill should back off
108
112
  - whether the fix is a description change, routing-table change, negative
@@ -111,6 +115,7 @@ For each important conflict, state:
111
115
  ## Stop Conditions
112
116
 
113
117
  Stop and return to the parent if:
118
+
114
119
  - the skills in scope are not identifiable
115
120
  - there is not enough log data to say anything useful
116
121
  - the question is really about one underperforming skill rather than
@@ -124,26 +129,32 @@ Return a compact report with these sections:
124
129
  ## Cross-Skill Pattern Analysis
125
130
 
126
131
  ### Summary
132
+
127
133
  [2-4 sentence overview]
128
134
 
129
135
  ### Findings
136
+
130
137
  - [Finding 1]
131
138
  - [Finding 2]
132
139
  - [Finding 3]
133
140
 
134
141
  ### Conflict Matrix
142
+
135
143
  | Skill A | Skill B | Problem | Evidence | Recommended Owner |
136
- |---------|---------|---------|----------|-------------------|
144
+ | ------- | ------- | ------- | -------- | ----------------- |
137
145
  | ... | ... | ... | ... | ... |
138
146
 
139
147
  ### Coverage Gaps
148
+
140
149
  - [query family or sample]
141
150
 
142
151
  ### Recommended Changes
152
+
143
153
  1. [Highest-priority change]
144
154
  2. [Second change]
145
155
  3. [Optional follow-up]
146
156
 
147
157
  ### Confidence
158
+
148
159
  [high / medium / low]
149
160
  ```
@@ -9,11 +9,11 @@ referenced by evolution workflows to understand quality signals.
9
9
 
10
10
  Every session is graded across three tiers, each answering a different question:
11
11
 
12
- | Tier | Question | Example expectation |
13
- |------|----------|---------------------|
14
- | **Trigger** | Did the skill fire at all? | `skills_triggered` contains the skill name |
15
- | **Process** | Did the agent follow the right steps? | SKILL.md was read before main work started |
16
- | **Quality** | Was the output actually good? | Output file has correct content and structure |
12
+ | Tier | Question | Example expectation |
13
+ | ----------- | ------------------------------------- | --------------------------------------------- |
14
+ | **Trigger** | Did the skill fire at all? | `skills_triggered` contains the skill name |
15
+ | **Process** | Did the agent follow the right steps? | SKILL.md was read before main work started |
16
+ | **Quality** | Was the output actually good? | Output file has correct content and structure |
17
17
 
18
18
  A session can pass Trigger but fail Process (skill fired, but steps were wrong),
19
19
  or pass Process but fail Quality (steps were right, but output was bad).
@@ -71,13 +71,14 @@ Always include at least one Process and one Quality expectation.
71
71
  After grading explicit expectations, extract 2-4 implicit claims from the transcript.
72
72
  Each claim falls into one of three types:
73
73
 
74
- | Type | What it captures | Example |
75
- |------|------------------|---------|
76
- | **Factual** | A verifiable statement the agent made | "The agent said 12 slides were created" |
77
- | **Process** | An observed behavior pattern | "The agent read SKILL.md before making any file changes" |
78
- | **Quality** | An output characteristic | "The output file was named correctly" |
74
+ | Type | What it captures | Example |
75
+ | ----------- | ------------------------------------- | -------------------------------------------------------- |
76
+ | **Factual** | A verifiable statement the agent made | "The agent said 12 slides were created" |
77
+ | **Process** | An observed behavior pattern | "The agent read SKILL.md before making any file changes" |
78
+ | **Quality** | An output characteristic | "The output file was named correctly" |
79
79
 
80
80
  For each claim:
81
+
81
82
  1. State the claim clearly
82
83
  2. Classify its type
83
84
  3. Mark `verified: true` or `verified: false`
@@ -153,9 +154,7 @@ Only raise things worth improving. The goal is actionable feedback, not exhausti
153
154
  }
154
155
  ],
155
156
  "eval_feedback": {
156
- "suggestions": [
157
- { "reason": "No expectation checks slide content" }
158
- ],
157
+ "suggestions": [{ "reason": "No expectation checks slide content" }],
159
158
  "overall": "Process coverage good; add output quality assertions."
160
159
  }
161
160
  }
@@ -163,14 +162,14 @@ Only raise things worth improving. The goal is actionable feedback, not exhausti
163
162
 
164
163
  ### Field descriptions
165
164
 
166
- | Field | Type | Description |
167
- |-------|------|-------------|
168
- | `session_id` | string | From session telemetry |
169
- | `skill_name` | string | The skill being graded |
170
- | `transcript_path` | string | Path to the session transcript JSONL |
171
- | `graded_at` | string | ISO 8601 timestamp of grading |
172
- | `expectations[]` | array | Each expectation with verdict and evidence |
173
- | `summary` | object | Aggregate pass/fail counts and rate |
174
- | `execution_metrics` | object | Raw metrics from session telemetry |
175
- | `claims[]` | array | Implicit claims extracted from transcript |
176
- | `eval_feedback` | object | Suggestions for improving the eval set |
165
+ | Field | Type | Description |
166
+ | ------------------- | ------ | ------------------------------------------ |
167
+ | `session_id` | string | From session telemetry |
168
+ | `skill_name` | string | The skill being graded |
169
+ | `transcript_path` | string | Path to the session transcript JSONL |
170
+ | `graded_at` | string | ISO 8601 timestamp of grading |
171
+ | `expectations[]` | array | Each expectation with verdict and evidence |
172
+ | `summary` | object | Aggregate pass/fail counts and rate |
173
+ | `execution_metrics` | object | Raw metrics from session telemetry |
174
+ | `claims[]` | array | Implicit claims extracted from transcript |
175
+ | `eval_feedback` | object | Suggestions for improving the eval set |