selftune 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +156 -0
- package/.claude/agents/evolution-reviewer.md +180 -0
- package/.claude/agents/integration-guide.md +212 -0
- package/.claude/agents/pattern-analyst.md +160 -0
- package/CHANGELOG.md +46 -1
- package/README.md +105 -257
- package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
- package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
- package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
- package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
- package/apps/local-dashboard/dist/favicon.png +0 -0
- package/apps/local-dashboard/dist/index.html +17 -0
- package/apps/local-dashboard/dist/logo.png +0 -0
- package/apps/local-dashboard/dist/logo.svg +9 -0
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +99 -0
- package/cli/selftune/canonical-export.ts +183 -0
- package/cli/selftune/constants.ts +103 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-contract.ts +202 -0
- package/cli/selftune/dashboard-server.ts +1049 -0
- package/cli/selftune/dashboard.ts +43 -156
- package/cli/selftune/eval/baseline.ts +248 -0
- package/cli/selftune/eval/composability-v2.ts +273 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +101 -16
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evidence.ts +26 -0
- package/cli/selftune/evolution/evolve-body.ts +586 -0
- package/cli/selftune/evolution/evolve.ts +825 -116
- package/cli/selftune/evolution/extract-patterns.ts +105 -16
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +21 -4
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/auto-grade.ts +200 -0
- package/cli/selftune/grading/grade-session.ts +513 -42
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/grading/results.ts +42 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/prompt-log.ts +172 -2
- package/cli/selftune/hooks/session-stop.ts +123 -3
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/hooks/skill-eval.ts +119 -3
- package/cli/selftune/index.ts +415 -48
- package/cli/selftune/ingestors/claude-replay.ts +377 -0
- package/cli/selftune/ingestors/codex-rollout.ts +345 -46
- package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
- package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
- package/cli/selftune/init.ts +376 -16
- package/cli/selftune/last.ts +14 -5
- package/cli/selftune/localdb/db.ts +63 -0
- package/cli/selftune/localdb/materialize.ts +428 -0
- package/cli/selftune/localdb/queries.ts +376 -0
- package/cli/selftune/localdb/schema.ts +204 -0
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +90 -16
- package/cli/selftune/normalization.ts +682 -0
- package/cli/selftune/observability.ts +19 -44
- package/cli/selftune/orchestrate.ts +1073 -0
- package/cli/selftune/quickstart.ts +203 -0
- package/cli/selftune/repair/skill-usage.ts +576 -0
- package/cli/selftune/schedule.ts +561 -0
- package/cli/selftune/status.ts +59 -33
- package/cli/selftune/sync.ts +627 -0
- package/cli/selftune/types.ts +525 -5
- package/cli/selftune/utils/canonical-log.ts +45 -0
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/hooks.ts +41 -0
- package/cli/selftune/utils/html.ts +27 -0
- package/cli/selftune/utils/llm-call.ts +103 -19
- package/cli/selftune/utils/math.ts +10 -0
- package/cli/selftune/utils/query-filter.ts +139 -0
- package/cli/selftune/utils/skill-discovery.ts +340 -0
- package/cli/selftune/utils/skill-log.ts +68 -0
- package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
- package/cli/selftune/utils/transcript.ts +307 -26
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/cli/selftune/workflows/discover.ts +254 -0
- package/cli/selftune/workflows/skill-md-writer.ts +288 -0
- package/cli/selftune/workflows/workflows.ts +188 -0
- package/package.json +28 -11
- package/packages/telemetry-contract/README.md +11 -0
- package/packages/telemetry-contract/fixtures/golden.json +87 -0
- package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
- package/packages/telemetry-contract/index.ts +1 -0
- package/packages/telemetry-contract/package.json +19 -0
- package/packages/telemetry-contract/src/index.ts +2 -0
- package/packages/telemetry-contract/src/types.ts +163 -0
- package/packages/telemetry-contract/src/validators.ts +109 -0
- package/skill/SKILL.md +180 -33
- package/skill/Workflows/AutoActivation.md +145 -0
- package/skill/Workflows/Badge.md +124 -0
- package/skill/Workflows/Baseline.md +144 -0
- package/skill/Workflows/Composability.md +107 -0
- package/skill/Workflows/Contribute.md +94 -0
- package/skill/Workflows/Cron.md +132 -0
- package/skill/Workflows/Dashboard.md +214 -0
- package/skill/Workflows/Doctor.md +63 -14
- package/skill/Workflows/Evals.md +110 -18
- package/skill/Workflows/EvolutionMemory.md +154 -0
- package/skill/Workflows/Evolve.md +181 -21
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/Grade.md +36 -31
- package/skill/Workflows/ImportSkillsBench.md +117 -0
- package/skill/Workflows/Ingest.md +142 -21
- package/skill/Workflows/Initialize.md +91 -23
- package/skill/Workflows/Orchestrate.md +139 -0
- package/skill/Workflows/Replay.md +91 -0
- package/skill/Workflows/Rollback.md +23 -4
- package/skill/Workflows/Schedule.md +61 -0
- package/skill/Workflows/Sync.md +88 -0
- package/skill/Workflows/UnitTest.md +150 -0
- package/skill/Workflows/Watch.md +33 -1
- package/skill/Workflows/Workflows.md +129 -0
- package/skill/assets/activation-rules-default.json +26 -0
- package/skill/assets/multi-skill-settings.json +63 -0
- package/skill/assets/single-skill-settings.json +57 -0
- package/skill/references/invocation-taxonomy.md +2 -2
- package/skill/references/logs.md +164 -2
- package/skill/references/setup-patterns.md +65 -0
- package/skill/references/version-history.md +40 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
- package/dashboard/index.html +0 -1119
|
@@ -6,7 +6,7 @@ Records the rollback in the evolution audit log for traceability.
|
|
|
6
6
|
## Default Command
|
|
7
7
|
|
|
8
8
|
```bash
|
|
9
|
-
selftune rollback --skill <name> --skill-path <path> [options]
|
|
9
|
+
selftune evolve rollback --skill <name> --skill-path <path> [options]
|
|
10
10
|
```
|
|
11
11
|
|
|
12
12
|
## Options
|
|
@@ -75,6 +75,16 @@ Manual restoration from version control is required.
|
|
|
75
75
|
|
|
76
76
|
## Steps
|
|
77
77
|
|
|
78
|
+
### 0. Read Evolution Context
|
|
79
|
+
|
|
80
|
+
Before starting, read `~/.selftune/memory/context.md` for session context:
|
|
81
|
+
- Active evolutions and their current status
|
|
82
|
+
- Previous rollback history
|
|
83
|
+
- Last update timestamp
|
|
84
|
+
|
|
85
|
+
This provides continuity across context resets. If the file doesn't exist,
|
|
86
|
+
proceed normally — it will be created after the first rollback.
|
|
87
|
+
|
|
78
88
|
### 1. Find the Last Evolution
|
|
79
89
|
|
|
80
90
|
Read `~/.claude/evolution_audit_log.jsonl` and find the most recent
|
|
@@ -85,13 +95,13 @@ If `--proposal-id` is specified, use that instead.
|
|
|
85
95
|
### 2. Run Rollback
|
|
86
96
|
|
|
87
97
|
```bash
|
|
88
|
-
selftune rollback --skill pptx --skill-path /path/to/SKILL.md
|
|
98
|
+
selftune evolve rollback --skill pptx --skill-path /path/to/SKILL.md
|
|
89
99
|
```
|
|
90
100
|
|
|
91
101
|
Or to rollback a specific proposal:
|
|
92
102
|
|
|
93
103
|
```bash
|
|
94
|
-
selftune rollback --skill pptx --skill-path /path/to/SKILL.md --proposal-id evolve-pptx-1709125200000
|
|
104
|
+
selftune evolve rollback --skill pptx --skill-path /path/to/SKILL.md --proposal-id evolve-pptx-1709125200000
|
|
95
105
|
```
|
|
96
106
|
|
|
97
107
|
### 3. Verify Restoration
|
|
@@ -101,7 +111,16 @@ After rollback, verify the SKILL.md content is restored:
|
|
|
101
111
|
- Check the audit log for the `rolled_back` entry
|
|
102
112
|
- Optionally re-run evals to confirm the original pass rate
|
|
103
113
|
|
|
104
|
-
### 4.
|
|
114
|
+
### 4. Update Memory
|
|
115
|
+
|
|
116
|
+
After rollback completes, the memory writer updates:
|
|
117
|
+
- `~/.selftune/memory/decisions.md` -- records the rollback decision and reason
|
|
118
|
+
- `~/.selftune/memory/context.md` -- clears the active evolution state and notes the rollback
|
|
119
|
+
|
|
120
|
+
This ensures future evolve and watch workflows have context about why the
|
|
121
|
+
rollback occurred, even across context window resets.
|
|
122
|
+
|
|
123
|
+
### 5. Post-Rollback Audit
|
|
105
124
|
|
|
106
125
|
The rollback is logged. Future `evolve` runs will see the rollback in the
|
|
107
126
|
audit trail and can use it to avoid repeating failed evolution patterns.
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# selftune Schedule Workflow
|
|
2
|
+
|
|
3
|
+
Generate ready-to-use scheduling examples for automating selftune with
|
|
4
|
+
standard system tools. This is the **primary automation path** — it works
|
|
5
|
+
on any machine without requiring a specific agent runtime.
|
|
6
|
+
|
|
7
|
+
For OpenClaw-specific scheduling, see `Workflows/Cron.md`.
|
|
8
|
+
|
|
9
|
+
## When to Use
|
|
10
|
+
|
|
11
|
+
- Setting up selftune automation for the first time
|
|
12
|
+
- Generating crontab entries for a Linux/macOS server
|
|
13
|
+
- Creating a launchd plist for a macOS machine
|
|
14
|
+
- Creating a systemd timer for a Linux server
|
|
15
|
+
- Understanding the selftune automation loop
|
|
16
|
+
|
|
17
|
+
## The Automation Loop
|
|
18
|
+
|
|
19
|
+
The core selftune automation loop is one command:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
selftune orchestrate
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
`selftune orchestrate` runs source-truth sync first, selects candidate skills,
|
|
26
|
+
deploys validated low-risk description changes autonomously, and watches recent
|
|
27
|
+
deployments with auto-rollback enabled.
|
|
28
|
+
|
|
29
|
+
## Default Command
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
selftune schedule
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Outputs examples for all three scheduling systems (cron, launchd, systemd).
|
|
36
|
+
|
|
37
|
+
## Flags
|
|
38
|
+
|
|
39
|
+
| Flag | Description | Default |
|
|
40
|
+
|------|-------------|---------|
|
|
41
|
+
| `--format <type>` | Output only one format: `cron`, `launchd`, or `systemd` | All formats |
|
|
42
|
+
| `--install` | Write and activate scheduler artifacts for the selected/default platform | Off |
|
|
43
|
+
| `--dry-run` | Preview the files and activation commands that `--install` would produce, without writing anything | Off |
|
|
44
|
+
| `--help` | Show help message | — |
|
|
45
|
+
|
|
46
|
+
## Steps
|
|
47
|
+
|
|
48
|
+
1. Run `selftune schedule` to see all examples
|
|
49
|
+
2. Pick the scheduling system for your platform
|
|
50
|
+
3. Install the scheduler artifacts directly with `selftune schedule --install`, or inspect and customize the raw snippets first
|
|
51
|
+
|
|
52
|
+
## Alias
|
|
53
|
+
|
|
54
|
+
`selftune schedule` is now an alias for `selftune cron`. Both commands are interchangeable. See `Workflows/Cron.md` for the full cron workflow reference.
|
|
55
|
+
|
|
56
|
+
## Common Patterns
|
|
57
|
+
|
|
58
|
+
- **User wants quick setup on a Linux server** -- Run `selftune schedule --install --format cron`.
|
|
59
|
+
- **User wants setup on macOS** -- Run `selftune schedule --install --format launchd`.
|
|
60
|
+
- **User wants setup on a systemd-based server** -- Run `selftune schedule --install --format systemd`.
|
|
61
|
+
- **User mentions OpenClaw** -- Use `selftune cron setup --platform openclaw` for the OpenClaw scheduler adapter. The default product path is still `selftune schedule --install`. See `Workflows/Cron.md`.
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# selftune Sync Workflow
|
|
2
|
+
|
|
3
|
+
Refresh source-truth telemetry across supported agent CLIs, then rebuild the
|
|
4
|
+
repaired skill-usage overlay so status, dashboard, grading, and evolution work
|
|
5
|
+
from real transcripts/rollouts instead of stale hook data.
|
|
6
|
+
|
|
7
|
+
## When to Use
|
|
8
|
+
|
|
9
|
+
- Before running `status`, `dashboard`, `watch`, or `evolve` when data may be stale
|
|
10
|
+
- The user has run many Claude Code, Codex, OpenCode, or OpenClaw sessions since last sync
|
|
11
|
+
- The agent detects host logs may be polluted and needs the repaired/source-first view
|
|
12
|
+
- Before exporting data to cloud ingest
|
|
13
|
+
|
|
14
|
+
## Default Command
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
selftune sync
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Options
|
|
21
|
+
|
|
22
|
+
| Flag | Description |
|
|
23
|
+
|------|-------------|
|
|
24
|
+
| `--since <date>` | Only sync sessions modified on/after this date |
|
|
25
|
+
| `--dry-run` | Show summary without writing files |
|
|
26
|
+
| `--force` | Ignore per-source markers and rescan everything |
|
|
27
|
+
| `--no-claude` | Skip Claude transcript replay |
|
|
28
|
+
| `--no-codex` | Skip Codex rollout ingest |
|
|
29
|
+
| `--no-opencode` | Skip OpenCode ingest |
|
|
30
|
+
| `--no-openclaw` | Skip OpenClaw ingest |
|
|
31
|
+
| `--no-repair` | Skip rebuilding `skill_usage_repaired.jsonl` |
|
|
32
|
+
|
|
33
|
+
## Output
|
|
34
|
+
|
|
35
|
+
Writes or refreshes the following data:
|
|
36
|
+
- `~/.claude/session_telemetry_log.jsonl`
|
|
37
|
+
- `~/.claude/all_queries_log.jsonl`
|
|
38
|
+
- `~/.claude/skill_usage_log.jsonl`
|
|
39
|
+
- `~/.claude/skill_usage_repaired.jsonl`
|
|
40
|
+
- per-source marker files
|
|
41
|
+
|
|
42
|
+
## Steps
|
|
43
|
+
|
|
44
|
+
### 1. Preview Sync
|
|
45
|
+
|
|
46
|
+
Run `selftune sync --dry-run`. The output includes per-source `scanned`
|
|
47
|
+
counts. Report the preview summary to the user.
|
|
48
|
+
|
|
49
|
+
### 2. Run Sync
|
|
50
|
+
|
|
51
|
+
Run `selftune sync`. The output includes:
|
|
52
|
+
- Per-source `scanned`, `synced`, and `skipped` counts
|
|
53
|
+
- Repaired overlay totals
|
|
54
|
+
- Any errors or warnings
|
|
55
|
+
|
|
56
|
+
### 3. Verify Results
|
|
57
|
+
|
|
58
|
+
Verify there are no sync errors and that per-source counters are internally
|
|
59
|
+
consistent (`scanned`, `synced`, `skipped`). `synced=0` is valid when no
|
|
60
|
+
new sessions exist since the last sync. Run `selftune doctor` only when
|
|
61
|
+
sync reports source/hook failures or expected active sources are missing.
|
|
62
|
+
|
|
63
|
+
### 4. Continue to Next Workflow
|
|
64
|
+
|
|
65
|
+
After sync completes, proceed with the user's intended workflow:
|
|
66
|
+
`selftune status`, `selftune dashboard`, `selftune watch --sync-first`,
|
|
67
|
+
or `selftune evolve --sync-first`.
|
|
68
|
+
|
|
69
|
+
## Common Patterns
|
|
70
|
+
|
|
71
|
+
**User wants to refresh telemetry data**
|
|
72
|
+
> Run `selftune sync`. Report per-source `scanned`, `synced`, and `skipped` counts.
|
|
73
|
+
|
|
74
|
+
**User wants to sync only recent sessions**
|
|
75
|
+
> Run `selftune sync --since <date>` with the user's specified date.
|
|
76
|
+
|
|
77
|
+
**User wants a full rescan from scratch**
|
|
78
|
+
> Run `selftune sync --force`. This ignores per-source markers and rescans
|
|
79
|
+
> all sessions.
|
|
80
|
+
|
|
81
|
+
**Agent needs to verify sync worked**
|
|
82
|
+
> Check per-source `scanned`, `synced`, and `skipped` counts. `synced=0`
|
|
83
|
+
> is normal when data is already up-to-date. Verify `scanned > 0` for
|
|
84
|
+
> expected sources to confirm sync ran successfully.
|
|
85
|
+
|
|
86
|
+
**Agent is chaining into monitoring or evolution**
|
|
87
|
+
> Use `selftune watch --sync-first` or `selftune evolve --sync-first` to
|
|
88
|
+
> refresh source truth automatically before making decisions.
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# selftune Unit Test Workflow
|
|
2
|
+
|
|
3
|
+
Run or generate unit tests for individual skills. Tests verify trigger
|
|
4
|
+
accuracy, output content, and tool usage with deterministic assertions.
|
|
5
|
+
|
|
6
|
+
## Default Command
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
selftune eval unit-test --skill <name> --tests <path> [options]
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Options
|
|
13
|
+
|
|
14
|
+
| Flag | Description | Default |
|
|
15
|
+
|------|-------------|---------|
|
|
16
|
+
| `--skill <name>` | Skill name | Required |
|
|
17
|
+
| `--tests <path>` | Path to unit test JSON file | `~/.selftune/unit-tests/<skill>.json` |
|
|
18
|
+
| `--run-agent` | Run agent-based assertions (not just trigger checks) | Off |
|
|
19
|
+
| `--generate` | Generate tests from skill content instead of running | Off |
|
|
20
|
+
| `--skill-path <path>` | Path to SKILL.md (required for `--generate`) | None |
|
|
21
|
+
| `--eval-set <path>` | Eval set for failure context (used with `--generate`) | None |
|
|
22
|
+
| `--model <flag>` | Model flag for LLM calls | Agent default |
|
|
23
|
+
|
|
24
|
+
## Test Format
|
|
25
|
+
|
|
26
|
+
Tests are stored as JSON arrays in `~/.selftune/unit-tests/<skill>.json`:
|
|
27
|
+
|
|
28
|
+
```json
|
|
29
|
+
[
|
|
30
|
+
{
|
|
31
|
+
"test_id": "research-trigger-1",
|
|
32
|
+
"skill_name": "Research",
|
|
33
|
+
"description": "Should trigger on explicit research request",
|
|
34
|
+
"query": "Research the latest trends in AI safety",
|
|
35
|
+
"expected_trigger": true,
|
|
36
|
+
"assertions": [
|
|
37
|
+
{
|
|
38
|
+
"type": "trigger_check",
|
|
39
|
+
"value": "true",
|
|
40
|
+
"description": "Skill should trigger for this query"
|
|
41
|
+
}
|
|
42
|
+
],
|
|
43
|
+
"tags": ["explicit", "core"],
|
|
44
|
+
"source": "manual"
|
|
45
|
+
}
|
|
46
|
+
]
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Assertion Types
|
|
50
|
+
|
|
51
|
+
| Type | What it checks | Requires agent? |
|
|
52
|
+
|------|---------------|-----------------|
|
|
53
|
+
| `trigger_check` | Whether the query triggers the skill, judged against its description | No (LLM only) |
|
|
54
|
+
| `output_contains` | Agent output contains expected text | Yes |
|
|
55
|
+
| `output_matches_regex` | Agent output matches regex pattern | Yes |
|
|
56
|
+
| `tool_called` | Agent used a specific tool | Yes |
|
|
57
|
+
|
|
58
|
+
Trigger check assertions are cheap (single LLM call). Agent-based assertions
|
|
59
|
+
require `--run-agent` and run the query through the full agent.
|
|
60
|
+
|
|
61
|
+
## Output Format
|
|
62
|
+
|
|
63
|
+
```json
|
|
64
|
+
{
|
|
65
|
+
"skill_name": "Research",
|
|
66
|
+
"total": 10,
|
|
67
|
+
"passed": 8,
|
|
68
|
+
"failed": 2,
|
|
69
|
+
"pass_rate": 0.80,
|
|
70
|
+
"results": [
|
|
71
|
+
{
|
|
72
|
+
"test_id": "research-trigger-1",
|
|
73
|
+
"overall_passed": true,
|
|
74
|
+
"trigger_passed": true,
|
|
75
|
+
"assertion_results": [
|
|
76
|
+
{ "type": "trigger_check", "value": "true", "passed": true, "evidence": "LLM responded YES" }
|
|
77
|
+
],
|
|
78
|
+
"duration_ms": 450
|
|
79
|
+
}
|
|
80
|
+
],
|
|
81
|
+
"ran_at": "2026-03-04T12:00:00.000Z"
|
|
82
|
+
}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Steps
|
|
86
|
+
|
|
87
|
+
### 1. Generate Tests (First Time)
|
|
88
|
+
|
|
89
|
+
If no test file exists for the skill, generate initial tests:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
selftune eval unit-test --skill Research --generate --skill-path ~/.claude/skills/Research/SKILL.md
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Parse the output. The LLM creates test cases covering:
|
|
96
|
+
- Explicit trigger queries
|
|
97
|
+
- Implicit trigger queries
|
|
98
|
+
- Contextual trigger queries
|
|
99
|
+
- Negative examples (should NOT trigger)
|
|
100
|
+
|
|
101
|
+
Tests are saved to `~/.selftune/unit-tests/Research.json`.
|
|
102
|
+
|
|
103
|
+
### 2. Run Tests
|
|
104
|
+
|
|
105
|
+
Run the test suite:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
selftune eval unit-test --skill Research --tests ~/.selftune/unit-tests/Research.json
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
By default, only `trigger_check` assertions run (fast, no agent needed).
|
|
112
|
+
Add `--run-agent` for full agent-based assertions.
|
|
113
|
+
|
|
114
|
+
### 3. Parse Results
|
|
115
|
+
|
|
116
|
+
Parse the JSON output. Check `pass_rate` and investigate failures:
|
|
117
|
+
- Failed trigger checks -- description needs improvement (route to Evolve)
|
|
118
|
+
- Failed output assertions -- skill workflow needs fixes
|
|
119
|
+
- Failed tool assertions -- skill routing is broken
|
|
120
|
+
|
|
121
|
+
Report the pass rate and any failures to the user.
|
|
122
|
+
|
|
123
|
+
### 4. Post-Evolution Verification
|
|
124
|
+
|
|
125
|
+
After evolving a skill, re-run unit tests to verify improvements:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
selftune eval unit-test --skill Research
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Compare the new `pass_rate` against the previous run. Report whether
|
|
132
|
+
the evolution improved trigger accuracy.
|
|
133
|
+
|
|
134
|
+
## Common Patterns
|
|
135
|
+
|
|
136
|
+
**User asks to generate tests for a skill**
|
|
137
|
+
> Run `selftune eval unit-test --skill <name> --generate --skill-path <path>`.
|
|
138
|
+
> Parse the output and report how many tests were generated.
|
|
139
|
+
|
|
140
|
+
**User asks to run existing tests**
|
|
141
|
+
> Run `selftune eval unit-test --skill <name>`. Parse the JSON output and
|
|
142
|
+
> report pass rate and any failures.
|
|
143
|
+
|
|
144
|
+
**User asks for full agent-based testing**
|
|
145
|
+
> Run `selftune eval unit-test --skill <name> --run-agent`. This runs queries
|
|
146
|
+
> through the full agent, so inform the user it will take longer.
|
|
147
|
+
|
|
148
|
+
**After an evolution completes**
|
|
149
|
+
> Run unit tests to verify the evolution improved trigger accuracy. Compare
|
|
150
|
+
> the new pass rate against the pre-evolution baseline.
|
package/skill/Workflows/Watch.md
CHANGED
|
@@ -65,6 +65,21 @@ selftune watch --skill <name> --skill-path <path> [options]
|
|
|
65
65
|
|
|
66
66
|
## Steps
|
|
67
67
|
|
|
68
|
+
### 0. Read Evolution Context
|
|
69
|
+
|
|
70
|
+
Read `~/.selftune/memory/context.md` for session context:
|
|
71
|
+
- Active evolutions and their current status
|
|
72
|
+
- Known issues and regression history
|
|
73
|
+
- Last update timestamp
|
|
74
|
+
|
|
75
|
+
If the file does not exist, proceed normally -- it will be created after
|
|
76
|
+
the first watch.
|
|
77
|
+
|
|
78
|
+
The evolution-guard hook prevents conflicting SKILL.md edits while watch is
|
|
79
|
+
evaluating the skill. The auto-activation system uses watch results to
|
|
80
|
+
adjust suggestion confidence -- skills showing regressions get flagged for
|
|
81
|
+
attention in subsequent prompts.
|
|
82
|
+
|
|
68
83
|
### 1. Run Watch
|
|
69
84
|
|
|
70
85
|
```bash
|
|
@@ -87,7 +102,7 @@ Parse the JSON output. Key decision points:
|
|
|
87
102
|
If regression is detected:
|
|
88
103
|
- Review recent session transcripts to understand what changed
|
|
89
104
|
- Check if the eval set is still representative
|
|
90
|
-
- Run `rollback` if the regression is confirmed (see `Workflows/Rollback.md`)
|
|
105
|
+
- Run `evolve rollback` if the regression is confirmed (see `Workflows/Rollback.md`)
|
|
91
106
|
|
|
92
107
|
If `--auto-rollback` was set, the command automatically restores the
|
|
93
108
|
previous description and logs a `rolled_back` entry.
|
|
@@ -100,6 +115,13 @@ Summarize the snapshot for the user:
|
|
|
100
115
|
- Whether regression was detected
|
|
101
116
|
- Recommended action
|
|
102
117
|
|
|
118
|
+
### 5. Update Memory
|
|
119
|
+
|
|
120
|
+
After watch completes, the memory writer updates
|
|
121
|
+
`~/.selftune/memory/context.md` with the current regression status,
|
|
122
|
+
pass rates, and recommended next action. This ensures continuity if the
|
|
123
|
+
context window resets before the user acts on the results.
|
|
124
|
+
|
|
103
125
|
## Common Patterns
|
|
104
126
|
|
|
105
127
|
**"Is the skill performing well after the change?"**
|
|
@@ -119,3 +141,13 @@ Summarize the snapshot for the user:
|
|
|
119
141
|
**"Set a custom baseline"**
|
|
120
142
|
> Use `--baseline 0.85` to override auto-detection. Useful when the
|
|
121
143
|
> auto-detected baseline is from an older evolution.
|
|
144
|
+
|
|
145
|
+
## Autonomous Mode
|
|
146
|
+
|
|
147
|
+
When called by `selftune orchestrate`, watch runs automatically on recently
|
|
148
|
+
evolved skills:
|
|
149
|
+
|
|
150
|
+
- Checks all skills evolved in the last `--recent-window` hours (default: 24)
|
|
151
|
+
- Auto-rollback is enabled by default
|
|
152
|
+
- Results are included in the orchestrate run report
|
|
153
|
+
- No user notification — regressions are handled silently via rollback
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# selftune Workflows Workflow
|
|
2
|
+
|
|
3
|
+
## When to Use
|
|
4
|
+
|
|
5
|
+
When the user asks about multi-skill workflows, workflow discovery, or skill composition.
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
Discover repeated multi-skill sequences from telemetry and optionally save a
|
|
10
|
+
discovered workflow into a skill's `## Workflows` section.
|
|
11
|
+
|
|
12
|
+
## Default Commands
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
selftune workflows [options]
|
|
16
|
+
selftune workflows save <workflow-id|index> [--skill-path <path>]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Options
|
|
20
|
+
|
|
21
|
+
- `--min-occurrences <n>`: Minimum times a workflow must appear before it is
|
|
22
|
+
shown. Default: `3`.
|
|
23
|
+
- `--window <n>`: Only analyze the last `n` sessions. Default: all sessions.
|
|
24
|
+
- `--skill <name>`: Only show workflows containing this skill. Default: all
|
|
25
|
+
skills.
|
|
26
|
+
- `--json`: Emit machine-readable `WorkflowDiscoveryReport` JSON. Default:
|
|
27
|
+
human-readable text.
|
|
28
|
+
- `--skill-path <path>`: Target SKILL.md when using `save`. Default:
|
|
29
|
+
auto-detect the first skill's SKILL.md path across contributing sessions. If
|
|
30
|
+
that skill maps to multiple SKILL.md files in those sessions, the command
|
|
31
|
+
errors and you must pass `--skill-path` explicitly.
|
|
32
|
+
|
|
33
|
+
## Save Semantics
|
|
34
|
+
|
|
35
|
+
`save` accepts either:
|
|
36
|
+
|
|
37
|
+
- A workflow ID, which is the ordered skill chain joined with `→`
|
|
38
|
+
- A 1-based index from the `selftune workflows` output
|
|
39
|
+
|
|
40
|
+
Examples:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
selftune workflows save "Copywriting→MarketingAutomation→SelfTuneBlog"
|
|
44
|
+
selftune workflows save 1
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
When saved, selftune appends a subsection to `## Workflows` in the target
|
|
48
|
+
SKILL.md. The subsection name is derived from the skill chain
|
|
49
|
+
(`Copywriting-MarketingAutomation-SelfTuneBlog`) and includes
|
|
50
|
+
discovered-source metadata with occurrence count and synergy score.
|
|
51
|
+
|
|
52
|
+
## Output Format
|
|
53
|
+
|
|
54
|
+
### Human-readable output
|
|
55
|
+
|
|
56
|
+
The number prefix (for example, `1.`) is the 1-based index you can pass to
|
|
57
|
+
`selftune workflows save <index>`.
|
|
58
|
+
|
|
59
|
+
```text
|
|
60
|
+
Discovered Workflows (from 450 sessions):
|
|
61
|
+
|
|
62
|
+
1. Copywriting → MarketingAutomation → SelfTuneBlog
|
|
63
|
+
Occurrences: 12 | Synergy: 0.72 | Consistency: 92% | Completion: 83%
|
|
64
|
+
Common trigger: "write and publish a blog post"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### JSON output
|
|
68
|
+
|
|
69
|
+
```json
|
|
70
|
+
{
|
|
71
|
+
"workflows": [
|
|
72
|
+
{
|
|
73
|
+
"workflow_id": "Copywriting→MarketingAutomation→SelfTuneBlog",
|
|
74
|
+
"skills": ["Copywriting", "MarketingAutomation", "SelfTuneBlog"],
|
|
75
|
+
"occurrence_count": 12,
|
|
76
|
+
"avg_errors": 0.5,
|
|
77
|
+
"avg_errors_individual": 1.8,
|
|
78
|
+
"synergy_score": 0.72,
|
|
79
|
+
"representative_query": "write and publish a blog post",
|
|
80
|
+
"sequence_consistency": 0.92,
|
|
81
|
+
"completion_rate": 0.83,
|
|
82
|
+
"first_seen": "2026-03-01T10:00:00Z",
|
|
83
|
+
"last_seen": "2026-03-08T16:30:00Z",
|
|
84
|
+
"session_ids": ["s1", "s2"]
|
|
85
|
+
}
|
|
86
|
+
],
|
|
87
|
+
"total_sessions_analyzed": 450,
|
|
88
|
+
"generated_at": "2026-03-09T12:00:00.000Z"
|
|
89
|
+
}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## How It Works
|
|
93
|
+
|
|
94
|
+
1. Reads `session_telemetry_log.jsonl` and `skill_usage_log.jsonl`
|
|
95
|
+
2. Orders skill usage inside each session by timestamp
|
|
96
|
+
3. Deduplicates consecutive same-skill entries
|
|
97
|
+
4. Keeps only sequences with 2+ skills
|
|
98
|
+
5. Counts repeated ordered sequences across sessions
|
|
99
|
+
6. Computes workflow metrics:
|
|
100
|
+
- `synergy_score` — whether the sequence performs better together than solo
|
|
101
|
+
baselines, where each skill's solo baseline is its average error rate from
|
|
102
|
+
single-skill sessions and the workflow uses the max of those solo rates
|
|
103
|
+
- `sequence_consistency` — how stable the ordering is for the same skill
|
|
104
|
+
set
|
|
105
|
+
- `completion_rate` — how often all skills in the sequence fire
|
|
106
|
+
7. Filters by `--min-occurrences` and optional `--skill`
|
|
107
|
+
8. Optionally appends the chosen workflow to SKILL.md via `save`
|
|
108
|
+
|
|
109
|
+
## Interpreting Results
|
|
110
|
+
|
|
111
|
+
- `synergy_score > 0.3`: Strong candidate for codifying as a workflow.
|
|
112
|
+
- `synergy_score < -0.3`: The sequence adds friction or conflicts.
|
|
113
|
+
- Low `sequence_consistency`: Same skills appear in multiple orders; the
|
|
114
|
+
pattern may still be unstable.
|
|
115
|
+
- Low `completion_rate`: One or more skills in the sequence often are not
|
|
116
|
+
invoked, so the full workflow does not complete.
|
|
117
|
+
|
|
118
|
+
## Common Patterns
|
|
119
|
+
|
|
120
|
+
- "Which skills always get used together?"
|
|
121
|
+
`selftune workflows`
|
|
122
|
+
- "Only show workflows involving Deploy"
|
|
123
|
+
`selftune workflows --skill Deploy`
|
|
124
|
+
- "Focus on recent behavior"
|
|
125
|
+
`selftune workflows --window 20`
|
|
126
|
+
- "Save the top workflow into SKILL.md"
|
|
127
|
+
`selftune workflows save 1 --skill-path /path/to/SKILL.md`
|
|
128
|
+
- "Save a specific discovered workflow by ID"
|
|
129
|
+
`selftune workflows save "Copywriting→MarketingAutomation→SelfTuneBlog"`
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_readme": "Default activation rules for selftune auto-activation. Copy to ~/.selftune/activation-rules.json to customize.",
|
|
3
|
+
"_note": "These defaults are bundled inside the installed skill so setup does not depend on repository-level templates.",
|
|
4
|
+
"rules": [
|
|
5
|
+
{
|
|
6
|
+
"id": "post-session-diagnostic",
|
|
7
|
+
"enabled": true,
|
|
8
|
+
"description": "Suggest `selftune last` when session has >2 unmatched queries"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "grading-threshold-breach",
|
|
12
|
+
"enabled": true,
|
|
13
|
+
"description": "Suggest `selftune evolve` when session pass rate < 60%"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"id": "stale-evolution",
|
|
17
|
+
"enabled": true,
|
|
18
|
+
"description": "Suggest `selftune evolve` when no evolution in >7 days and pending false negatives exist"
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "regression-detected",
|
|
22
|
+
"enabled": true,
|
|
23
|
+
"description": "Suggest `selftune rollback` when monitoring detects a regression"
|
|
24
|
+
}
|
|
25
|
+
]
|
|
26
|
+
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_readme": "Claude settings template for multi-skill selftune projects. Merge into ~/.claude/settings.json.",
|
|
3
|
+
"_usage": "These hooks use npx selftune, which works regardless of installation path.",
|
|
4
|
+
"_note": "Multi-skill projects use activation rules to route queries to the correct skill. See assets/activation-rules-default.json.",
|
|
5
|
+
"hooks": {
|
|
6
|
+
"UserPromptSubmit": [
|
|
7
|
+
{
|
|
8
|
+
"hooks": [
|
|
9
|
+
{
|
|
10
|
+
"type": "command",
|
|
11
|
+
"command": "npx selftune hook prompt-log",
|
|
12
|
+
"timeout": 5
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"type": "command",
|
|
16
|
+
"command": "npx selftune hook auto-activate",
|
|
17
|
+
"timeout": 5
|
|
18
|
+
}
|
|
19
|
+
]
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"PreToolUse": [
|
|
23
|
+
{
|
|
24
|
+
"matcher": "Write|Edit",
|
|
25
|
+
"hooks": [
|
|
26
|
+
{
|
|
27
|
+
"type": "command",
|
|
28
|
+
"command": "npx selftune hook skill-change-guard",
|
|
29
|
+
"timeout": 5
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"type": "command",
|
|
33
|
+
"command": "npx selftune hook evolution-guard",
|
|
34
|
+
"timeout": 5
|
|
35
|
+
}
|
|
36
|
+
]
|
|
37
|
+
}
|
|
38
|
+
],
|
|
39
|
+
"PostToolUse": [
|
|
40
|
+
{
|
|
41
|
+
"matcher": "Read",
|
|
42
|
+
"hooks": [
|
|
43
|
+
{
|
|
44
|
+
"type": "command",
|
|
45
|
+
"command": "npx selftune hook skill-eval",
|
|
46
|
+
"timeout": 5
|
|
47
|
+
}
|
|
48
|
+
]
|
|
49
|
+
}
|
|
50
|
+
],
|
|
51
|
+
"Stop": [
|
|
52
|
+
{
|
|
53
|
+
"hooks": [
|
|
54
|
+
{
|
|
55
|
+
"type": "command",
|
|
56
|
+
"command": "npx selftune hook session-stop",
|
|
57
|
+
"timeout": 15
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
}
|
|
61
|
+
]
|
|
62
|
+
}
|
|
63
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_readme": "Claude settings template for single-skill selftune projects. Merge into ~/.claude/settings.json.",
|
|
3
|
+
"_usage": "These hooks use npx selftune, which works regardless of installation path.",
|
|
4
|
+
"hooks": {
|
|
5
|
+
"UserPromptSubmit": [
|
|
6
|
+
{
|
|
7
|
+
"hooks": [
|
|
8
|
+
{
|
|
9
|
+
"type": "command",
|
|
10
|
+
"command": "npx selftune hook prompt-log",
|
|
11
|
+
"timeout": 5
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"type": "command",
|
|
15
|
+
"command": "npx selftune hook auto-activate",
|
|
16
|
+
"timeout": 5
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
}
|
|
20
|
+
],
|
|
21
|
+
"PreToolUse": [
|
|
22
|
+
{
|
|
23
|
+
"matcher": "Write|Edit",
|
|
24
|
+
"hooks": [
|
|
25
|
+
{
|
|
26
|
+
"type": "command",
|
|
27
|
+
"command": "npx selftune hook skill-change-guard",
|
|
28
|
+
"timeout": 5
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"PostToolUse": [
|
|
34
|
+
{
|
|
35
|
+
"matcher": "Read",
|
|
36
|
+
"hooks": [
|
|
37
|
+
{
|
|
38
|
+
"type": "command",
|
|
39
|
+
"command": "npx selftune hook skill-eval",
|
|
40
|
+
"timeout": 5
|
|
41
|
+
}
|
|
42
|
+
]
|
|
43
|
+
}
|
|
44
|
+
],
|
|
45
|
+
"Stop": [
|
|
46
|
+
{
|
|
47
|
+
"hooks": [
|
|
48
|
+
{
|
|
49
|
+
"type": "command",
|
|
50
|
+
"command": "npx selftune hook session-stop",
|
|
51
|
+
"timeout": 15
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
}
|
|
55
|
+
]
|
|
56
|
+
}
|
|
57
|
+
}
|