selftune 0.2.15 → 0.2.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -19
- package/bin/run-hook.cjs +36 -0
- package/cli/selftune/alpha-upload/build-payloads.ts +14 -1
- package/cli/selftune/alpha-upload/client.ts +51 -1
- package/cli/selftune/alpha-upload/flush.ts +46 -5
- package/cli/selftune/alpha-upload/stage-canonical.ts +25 -4
- package/cli/selftune/alpha-upload-contract.ts +9 -0
- package/cli/selftune/constants.ts +82 -5
- package/cli/selftune/contribute/sanitize.ts +52 -5
- package/cli/selftune/dashboard-contract.ts +100 -0
- package/cli/selftune/dashboard-server.ts +2 -2
- package/cli/selftune/evolution/description-quality.ts +12 -11
- package/cli/selftune/evolution/evolve.ts +238 -53
- package/cli/selftune/evolution/unblock-suggestions.ts +159 -0
- package/cli/selftune/evolution/validate-proposal.ts +9 -6
- package/cli/selftune/grading/grade-session.ts +20 -0
- package/cli/selftune/hooks/commit-track.ts +188 -0
- package/cli/selftune/hooks/prompt-log.ts +10 -1
- package/cli/selftune/hooks/session-stop.ts +2 -2
- package/cli/selftune/hooks/skill-eval.ts +15 -1
- package/cli/selftune/hooks/stdin-preview.ts +32 -0
- package/cli/selftune/init.ts +198 -27
- package/cli/selftune/localdb/direct-write.ts +69 -6
- package/cli/selftune/localdb/queries.ts +552 -7
- package/cli/selftune/localdb/schema.ts +46 -0
- package/cli/selftune/orchestrate.ts +32 -4
- package/cli/selftune/routes/overview.ts +41 -3
- package/cli/selftune/routes/skill-report.ts +88 -17
- package/cli/selftune/types.ts +32 -0
- package/cli/selftune/utils/hooks.ts +12 -2
- package/cli/selftune/utils/transcript.ts +210 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +11 -0
- package/package.json +1 -1
- package/packages/telemetry-contract/src/types.ts +11 -0
- package/skill/SKILL.md +29 -1
- package/skill/Workflows/AutoActivation.md +1 -1
- package/skill/Workflows/Evolve.md +31 -13
- package/skill/Workflows/ExportCanonical.md +121 -0
- package/skill/Workflows/Hook.md +131 -0
- package/skill/Workflows/Initialize.md +9 -8
- package/skill/Workflows/Orchestrate.md +27 -5
- package/skill/Workflows/Quickstart.md +94 -0
- package/skill/Workflows/RepairSkillUsage.md +87 -0
- package/skill/Workflows/Uninstall.md +82 -0
- package/skill/settings_snippet.json +19 -8
package/package.json
CHANGED
|
@@ -143,7 +143,18 @@ export interface CanonicalExecutionFactRecord extends CanonicalSessionRecordBase
|
|
|
143
143
|
errors_encountered: number;
|
|
144
144
|
input_tokens?: number;
|
|
145
145
|
output_tokens?: number;
|
|
146
|
+
cached_input_tokens?: number;
|
|
147
|
+
reasoning_output_tokens?: number;
|
|
148
|
+
cost_usd?: number;
|
|
146
149
|
duration_ms?: number;
|
|
150
|
+
files_changed?: number;
|
|
151
|
+
lines_added?: number;
|
|
152
|
+
lines_removed?: number;
|
|
153
|
+
lines_modified?: number;
|
|
154
|
+
/** Count of output-producing tool calls (Write, Edit, WebFetch, WebSearch, Skill, Agent). */
|
|
155
|
+
artifact_count?: number;
|
|
156
|
+
/** Inferred session type based on tool distribution. */
|
|
157
|
+
session_type?: "dev" | "research" | "content" | "mixed";
|
|
147
158
|
completion_status?: CanonicalCompletionStatus;
|
|
148
159
|
end_reason?: string;
|
|
149
160
|
}
|
package/skill/SKILL.md
CHANGED
|
@@ -12,7 +12,7 @@ description: >
|
|
|
12
12
|
even if they don't say "selftune" explicitly.
|
|
13
13
|
metadata:
|
|
14
14
|
author: selftune-dev
|
|
15
|
-
version: 0.2.
|
|
15
|
+
version: 0.2.10
|
|
16
16
|
category: developer-tools
|
|
17
17
|
---
|
|
18
18
|
|
|
@@ -104,9 +104,27 @@ selftune cron remove [--dry-run]
|
|
|
104
104
|
selftune telemetry [status|enable|disable]
|
|
105
105
|
selftune export [TABLE...] [--output/-o DIR] [--since DATE]
|
|
106
106
|
|
|
107
|
+
# Autonomous loop
|
|
108
|
+
selftune orchestrate [--dry-run] [--review-required] [--auto-approve] [--skill NAME] [--max-skills N] [--recent-window HOURS] [--sync-force] [--max-auto-grade N] [--loop] [--loop-interval SECS]
|
|
109
|
+
selftune sync [--since DATE] [--dry-run] [--force] [--no-claude] [--no-codex] [--no-opencode] [--no-openclaw] [--no-repair] [--json]
|
|
110
|
+
|
|
111
|
+
# Discovery + badges
|
|
112
|
+
selftune workflows [--skill NAME] [--skill-path PATH] [--min-occurrences N] [--window N] [--json] [save --skill NAME --skill-path PATH]
|
|
113
|
+
selftune badge --skill <name> [--format svg|markdown|url] [--output PATH]
|
|
114
|
+
|
|
115
|
+
# Maintenance
|
|
116
|
+
selftune quickstart
|
|
117
|
+
selftune repair-skill-usage [--since DATE] [--dry-run]
|
|
118
|
+
selftune export-canonical [--out FILE] [--platform NAME] [--record-kind KIND] [--pretty] [--push-payload] [--log FILE] [--projects-dir DIR]
|
|
119
|
+
selftune uninstall [--dry-run] [--keep-logs] [--npm-uninstall]
|
|
120
|
+
|
|
121
|
+
# Hook dispatch (for debugging/manual invocation)
|
|
122
|
+
selftune hook <name> # prompt-log | session-stop | skill-eval | auto-activate | skill-change-guard | evolution-guard
|
|
123
|
+
|
|
107
124
|
# Alpha enrollment (device-code flow — browser opens automatically)
|
|
108
125
|
selftune init --alpha --alpha-email <email>
|
|
109
126
|
selftune alpha upload [--dry-run]
|
|
127
|
+
selftune alpha relink
|
|
110
128
|
selftune status # shows cloud link state + upload readiness
|
|
111
129
|
```
|
|
112
130
|
|
|
@@ -139,6 +157,11 @@ selftune status # shows c
|
|
|
139
157
|
| badge, readme badge, skill badge, health badge | Badge | Workflows/Badge.md |
|
|
140
158
|
| workflows, discover workflows, list workflows, multi-skill workflows | Workflows | Workflows/Workflows.md |
|
|
141
159
|
| alpha upload, upload data, send alpha data, manual upload, dry run upload | AlphaUpload | Workflows/AlphaUpload.md |
|
|
160
|
+
| quickstart, getting started, onboard, first time setup, new user | Quickstart | Workflows/Quickstart.md |
|
|
161
|
+
| uninstall, remove selftune, clean up, teardown | Uninstall | Workflows/Uninstall.md |
|
|
162
|
+
| repair, rebuild usage, fix skill usage, trustworthy usage, repair-skill-usage | RepairSkillUsage | Workflows/RepairSkillUsage.md |
|
|
163
|
+
| export canonical, canonical export, canonical telemetry, push payload | ExportCanonical | Workflows/ExportCanonical.md |
|
|
164
|
+
| hook, run hook, invoke hook, manual hook, debug hook | Hook | Workflows/Hook.md |
|
|
142
165
|
| export, dump, jsonl, export sqlite, debug export | Export | _(direct command — no workflow file)_ |
|
|
143
166
|
| status, health summary, skill health, how are skills, skills doing, run selftune | Status | _(direct command — no workflow file)_ |
|
|
144
167
|
| last, last session, recent session, what happened, what changed | Last | _(direct command — no workflow file)_ |
|
|
@@ -319,6 +342,11 @@ accomplish a task _using_ a skill, route to that skill instead.
|
|
|
319
342
|
| `agents/pattern-analyst.md` | Cross-skill conflict detection | Spawn when composability flags conflicts |
|
|
320
343
|
| `agents/evolution-reviewer.md` | Safety gate for evolution proposals | Spawn before deploying high-stakes evolutions |
|
|
321
344
|
| `agents/integration-guide.md` | Guided setup for complex projects | Spawn for monorepos, multi-skill setups |
|
|
345
|
+
| `Workflows/Quickstart.md` | Guided onboarding: init, ingest, status | First-time setup for new users |
|
|
346
|
+
| `Workflows/Uninstall.md` | Clean removal of selftune data and config | When removing selftune completely |
|
|
347
|
+
| `Workflows/RepairSkillUsage.md` | Rebuild skill usage from source transcripts | When skill usage data seems inaccurate |
|
|
348
|
+
| `Workflows/ExportCanonical.md` | Export canonical telemetry for downstream use | When exporting data for external consumption |
|
|
349
|
+
| `Workflows/Hook.md` | Manual hook invocation for debugging | When debugging or testing hooks manually |
|
|
322
350
|
| `references/logs.md` | Log file formats (telemetry, usage, queries, audit) | When parsing or debugging log files |
|
|
323
351
|
| `references/grading-methodology.md` | 3-tier grading model, evidence standards | When grading sessions or interpreting grades |
|
|
324
352
|
| `references/invocation-taxonomy.md` | 4 invocation types, coverage analysis | When analyzing trigger coverage |
|
|
@@ -110,7 +110,7 @@ The hook is registered under `UserPromptSubmit`:
|
|
|
110
110
|
"hooks": {
|
|
111
111
|
"UserPromptSubmit": [
|
|
112
112
|
{
|
|
113
|
-
"command": "
|
|
113
|
+
"command": "node /path/to/bin/run-hook.cjs /path/to/cli/selftune/hooks/auto-activate.ts"
|
|
114
114
|
}
|
|
115
115
|
]
|
|
116
116
|
}
|
|
@@ -31,14 +31,16 @@ selftune evolve --skill <name> --skill-path <path> [options]
|
|
|
31
31
|
| `--confidence <n>` | Minimum confidence threshold (0-1) | 0.6 |
|
|
32
32
|
| `--max-iterations <n>` | Maximum retry iterations | 3 |
|
|
33
33
|
| `--validation-model <model>` | Model for trigger-check validation LLM calls | `haiku` |
|
|
34
|
-
| `--pareto` | Generate multiple candidates per iteration |
|
|
35
|
-
| `--candidates <n>` | Number of candidates per iteration
|
|
34
|
+
| `--pareto` | Generate multiple candidates per iteration | On |
|
|
35
|
+
| `--candidates <n>` | Number of candidates per iteration when Pareto mode is enabled | `3` |
|
|
36
36
|
| `--token-efficiency` | Optimize for token efficiency in proposals | Off |
|
|
37
37
|
| `--with-baseline` | Include a no-skill baseline comparison | Off |
|
|
38
38
|
| `--cheap-loop` | Use cheap models for loop, expensive for final gate | On |
|
|
39
39
|
| `--full-model` | Use full-cost model throughout (disables cheap-loop) | Off |
|
|
40
40
|
| `--verbose` | Print detailed progress during evolution | Off |
|
|
41
41
|
| `--gate-model <model>` | Model for final gate validation | `sonnet` (when `--cheap-loop`) |
|
|
42
|
+
| `--gate-effort <level>` | Thinking effort for the final gate (`low`, `medium`, `high`, or `max`) | None |
|
|
43
|
+
| `--adaptive-gate` | Escalate risky gate checks to `opus` + `high` effort | Off |
|
|
42
44
|
| `--proposal-model <model>` | Model for proposal generation LLM calls | None |
|
|
43
45
|
| `--sync-first` | Refresh source-truth telemetry before generating evals/failure patterns | Off |
|
|
44
46
|
| `--sync-force` | Force a full source rescan during `--sync-first` | Off |
|
|
@@ -115,7 +117,7 @@ Ask one `AskUserQuestion` at a time in this order:
|
|
|
115
117
|
- `Single model — use one model throughout`
|
|
116
118
|
4. `Advanced Options`
|
|
117
119
|
Options:
|
|
118
|
-
- `Defaults (0.6 confidence, 3 iterations,
|
|
120
|
+
- `Defaults (0.6 confidence, 3 iterations, 3 Pareto candidates) (recommended)`
|
|
119
121
|
- `Stricter (0.7 confidence, 5 iterations)`
|
|
120
122
|
- `Pareto mode (multiple candidates per iteration)`
|
|
121
123
|
|
|
@@ -146,7 +148,7 @@ Configuration Summary:
|
|
|
146
148
|
Model: haiku (cheap-loop: sonnet gate)
|
|
147
149
|
Confidence: 0.6
|
|
148
150
|
Iterations: 3
|
|
149
|
-
Pareto:
|
|
151
|
+
Pareto: on (3 candidates)
|
|
150
152
|
|
|
151
153
|
Proceeding...
|
|
152
154
|
```
|
|
@@ -284,15 +286,20 @@ Proposals are scored on heuristic quality criteria (no LLM required). The compos
|
|
|
284
286
|
|
|
285
287
|
### Stopping Criteria
|
|
286
288
|
|
|
287
|
-
The evolution loop
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
|
295
|
-
|
|
|
289
|
+
The evolution loop uses a modular stopping criteria evaluator
|
|
290
|
+
(`evolution/stopping-criteria.ts`) that checks conditions in priority order
|
|
291
|
+
after each validation pass. The evaluator receives the current pass rate,
|
|
292
|
+
historical pass rates from previous iterations, and proposal confidence to
|
|
293
|
+
make a unified stop/continue decision. The stopping reason is recorded in
|
|
294
|
+
audit entries for traceability.
|
|
295
|
+
|
|
296
|
+
| # | Condition | Meaning |
|
|
297
|
+
| --- | ------------------ | -------------------------------------------------------------- |
|
|
298
|
+
| 1 | **Converged** | Pass rate >= 0.95 |
|
|
299
|
+
| 2 | **Max iterations** | Reached `--max-iterations` limit |
|
|
300
|
+
| 3 | **Low confidence** | Proposal confidence below `--confidence` threshold |
|
|
301
|
+
| 4 | **Plateau** | < 1% pass rate variation across 3 consecutive iterations |
|
|
302
|
+
| 5 | **Continue** | None of the above -- keep iterating |
|
|
296
303
|
|
|
297
304
|
## Cheap Loop Mode
|
|
298
305
|
|
|
@@ -310,6 +317,11 @@ The gate validation is a new step between validation and deploy. It re-runs
|
|
|
310
317
|
`validateProposal` using the gate model. If the gate fails, the proposal is
|
|
311
318
|
not deployed.
|
|
312
319
|
|
|
320
|
+
When `--adaptive-gate` is enabled, selftune keeps the normal gate for low-risk
|
|
321
|
+
proposals and escalates only risky ones to `opus` with `high` effort. Risk
|
|
322
|
+
signals include small net lift, regressions, low proposal confidence, and
|
|
323
|
+
large description broadening.
|
|
324
|
+
|
|
313
325
|
```bash
|
|
314
326
|
# Cheap loop with default models
|
|
315
327
|
selftune evolve --skill X --skill-path Y --cheap-loop
|
|
@@ -317,6 +329,12 @@ selftune evolve --skill X --skill-path Y --cheap-loop
|
|
|
317
329
|
# Cheap loop with opus gate
|
|
318
330
|
selftune evolve --skill X --skill-path Y --cheap-loop --gate-model opus
|
|
319
331
|
|
|
332
|
+
# Cheap loop with adaptive escalation for risky proposals
|
|
333
|
+
selftune evolve --skill X --skill-path Y --cheap-loop --adaptive-gate
|
|
334
|
+
|
|
335
|
+
# Explicit high-effort opus gate
|
|
336
|
+
selftune evolve --skill X --skill-path Y --cheap-loop --gate-model opus --gate-effort high
|
|
337
|
+
|
|
320
338
|
# Manual model control without cheap-loop
|
|
321
339
|
selftune evolve --skill X --skill-path Y --proposal-model haiku --validation-model sonnet
|
|
322
340
|
```
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# selftune Export Canonical Workflow
|
|
2
|
+
|
|
3
|
+
Export canonical telemetry records as JSONL or as a V2 push payload for cloud
|
|
4
|
+
upload. Canonical records are the normalized, platform-agnostic representation
|
|
5
|
+
of sessions, prompts, skill invocations, execution facts, and normalization runs.
|
|
6
|
+
|
|
7
|
+
## When to Use
|
|
8
|
+
|
|
9
|
+
- The user wants to export telemetry data for external analysis
|
|
10
|
+
- The user says "export canonical", "canonical export", or "canonical telemetry"
|
|
11
|
+
- The agent needs to produce a push payload for manual upload inspection
|
|
12
|
+
- Debugging what data would be sent to the cloud API
|
|
13
|
+
|
|
14
|
+
## Default Command
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
selftune export-canonical
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Options
|
|
21
|
+
|
|
22
|
+
| Flag | Description |
|
|
23
|
+
| ----------------------- | ------------------------------------------------------------------- |
|
|
24
|
+
| `--out <path>` | Write output to a file instead of stdout |
|
|
25
|
+
| `--platform <name>` | Filter by platform (`claude_code`, `codex`, `opencode`, `openclaw`) |
|
|
26
|
+
| `--record-kind <kind>` | Filter by record kind (`session`, `prompt`, `skill_invocation`, `execution_fact`, `normalization_run`) |
|
|
27
|
+
| `--pretty` | Pretty-print JSON output with 2-space indentation |
|
|
28
|
+
| `--log <path>` | Path to canonical log file (default: `~/.claude/canonical_log.jsonl`) |
|
|
29
|
+
| `--projects-dir <path>` | Claude transcript directory for fallback synthesis (default: `~/.claude/projects`) |
|
|
30
|
+
| `--push-payload` | Output as a V2 push payload envelope instead of raw JSONL |
|
|
31
|
+
|
|
32
|
+
## Output Formats
|
|
33
|
+
|
|
34
|
+
### Default (JSONL)
|
|
35
|
+
|
|
36
|
+
One canonical record per line:
|
|
37
|
+
|
|
38
|
+
```jsonl
|
|
39
|
+
{"record_kind":"session","session_id":"abc123","platform":"claude_code",...}
|
|
40
|
+
{"record_kind":"prompt","prompt_id":"p1","session_id":"abc123",...}
|
|
41
|
+
{"record_kind":"skill_invocation","invocation_id":"inv1","skill_name":"selftune",...}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Push Payload (`--push-payload`)
|
|
45
|
+
|
|
46
|
+
A single JSON envelope matching the V2 cloud upload schema:
|
|
47
|
+
|
|
48
|
+
```json
|
|
49
|
+
{
|
|
50
|
+
"schema_version": "2.0",
|
|
51
|
+
"client_version": "0.1.0",
|
|
52
|
+
"push_id": "uuid",
|
|
53
|
+
"normalizer_version": "1.0.0",
|
|
54
|
+
"canonical": {
|
|
55
|
+
"sessions": [...],
|
|
56
|
+
"prompts": [...],
|
|
57
|
+
"skill_invocations": [...],
|
|
58
|
+
"execution_facts": [...],
|
|
59
|
+
"normalization_runs": [...],
|
|
60
|
+
"evolution_evidence": [...],
|
|
61
|
+
"orchestrate_runs": [],
|
|
62
|
+
"grading_results": [],
|
|
63
|
+
"improvement_signals": []
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### File output (`--out`)
|
|
69
|
+
|
|
70
|
+
When `--out` is specified, the data is written to the file and a JSON summary
|
|
71
|
+
is printed to stdout:
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{
|
|
75
|
+
"ok": true,
|
|
76
|
+
"out": "/path/to/output.jsonl",
|
|
77
|
+
"count": 42,
|
|
78
|
+
"format": "jsonl",
|
|
79
|
+
"pretty": false,
|
|
80
|
+
"platform": null,
|
|
81
|
+
"record_kind": null
|
|
82
|
+
}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Fallback Behavior
|
|
86
|
+
|
|
87
|
+
If the canonical log file is empty or does not exist, the command falls back to
|
|
88
|
+
synthesizing canonical records directly from Claude Code transcripts in
|
|
89
|
+
`--projects-dir`. This supports existing installs that have rich transcript
|
|
90
|
+
data but have not yet generated a canonical log.
|
|
91
|
+
|
|
92
|
+
## Common Patterns
|
|
93
|
+
|
|
94
|
+
**Export all canonical data**
|
|
95
|
+
|
|
96
|
+
> Run `selftune export-canonical > export.jsonl` to dump everything.
|
|
97
|
+
|
|
98
|
+
**Export only skill invocations**
|
|
99
|
+
|
|
100
|
+
> Run `selftune export-canonical --record-kind skill_invocation` to filter.
|
|
101
|
+
|
|
102
|
+
**Inspect push payload before upload**
|
|
103
|
+
|
|
104
|
+
> Run `selftune export-canonical --push-payload --pretty` to see exactly what would be sent to the cloud API.
|
|
105
|
+
|
|
106
|
+
**Export to file with summary**
|
|
107
|
+
|
|
108
|
+
> Run `selftune export-canonical --out /tmp/export.jsonl --pretty` to write data and see a count summary.
|
|
109
|
+
|
|
110
|
+
**Filter by platform**
|
|
111
|
+
|
|
112
|
+
> Run `selftune export-canonical --platform claude_code` to export only Claude Code records.
|
|
113
|
+
|
|
114
|
+
## Troubleshooting
|
|
115
|
+
|
|
116
|
+
| Symptom | Cause | Fix |
|
|
117
|
+
| --- | --- | --- |
|
|
118
|
+
| Empty output | No canonical log and no transcripts | Run `selftune sync` or `selftune quickstart` to ingest data first |
|
|
119
|
+
| "Unknown platform" error | Invalid `--platform` value | Use one of: `claude_code`, `codex`, `opencode`, `openclaw` |
|
|
120
|
+
| "Unknown record kind" error | Invalid `--record-kind` value | Use one of: `session`, `prompt`, `skill_invocation`, `execution_fact`, `normalization_run` |
|
|
121
|
+
| Push payload missing evolution evidence | No evolution runs recorded | Run `selftune evolve` to generate evidence, then re-export |
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# selftune Hook Workflow
|
|
2
|
+
|
|
3
|
+
Manually invoke individual Claude Code hooks for debugging and testing.
|
|
4
|
+
Each hook reads its payload from stdin and behaves exactly as it would when
|
|
5
|
+
triggered by the Claude Code host agent.
|
|
6
|
+
|
|
7
|
+
## When to Use
|
|
8
|
+
|
|
9
|
+
- Debugging a specific hook's behavior with a known payload
|
|
10
|
+
- The user says "hook", "run hook", "invoke hook", "manual hook", or "debug hook"
|
|
11
|
+
- Testing hook installation by simulating a hook event
|
|
12
|
+
- Verifying hook output before or after configuration changes
|
|
13
|
+
|
|
14
|
+
## Default Command
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
echo '{"payload":"..."}' | selftune hook <name>
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Where `<name>` is one of the 6 available hooks.
|
|
21
|
+
|
|
22
|
+
## Available Hooks
|
|
23
|
+
|
|
24
|
+
| Hook Name | Claude Code Event | Purpose |
|
|
25
|
+
| ---------------------- | ---------------------- | ----------------------------------------------------------------------- |
|
|
26
|
+
| `prompt-log` | UserPromptSubmit | Logs every user query to SQLite for false-negative eval detection |
|
|
27
|
+
| `session-stop` | Stop | Extracts session-level telemetry from transcript when a session ends |
|
|
28
|
+
| `skill-eval` | PostToolUse | Records skill usage when a SKILL.md is read or a Skill tool is invoked |
|
|
29
|
+
| `auto-activate` | UserPromptSubmit | Evaluates activation rules and suggests selftune actions via stderr |
|
|
30
|
+
| `skill-change-guard` | PreToolUse | Warns (advisory) when an agent is about to write to a SKILL.md file |
|
|
31
|
+
| `evolution-guard` | PreToolUse | Blocks writes to monitored SKILL.md files until `selftune watch` runs |
|
|
32
|
+
|
|
33
|
+
## Hook Details
|
|
34
|
+
|
|
35
|
+
### prompt-log
|
|
36
|
+
|
|
37
|
+
Fires on every user message before Claude processes it. Writes the query to
|
|
38
|
+
SQLite so that `hooks-to-evals` can identify prompts that did NOT trigger a
|
|
39
|
+
skill — the raw material for false-negative eval entries. Also writes a
|
|
40
|
+
canonical prompt record.
|
|
41
|
+
|
|
42
|
+
### session-stop
|
|
43
|
+
|
|
44
|
+
Fires when a Claude Code session ends. Reads the session transcript JSONL and
|
|
45
|
+
extracts process-level telemetry (tool calls, errors, skills triggered, token
|
|
46
|
+
counts). Writes one record per session to SQLite with a JSONL backup. May
|
|
47
|
+
trigger a reactive `selftune orchestrate` spawn if conditions are met.
|
|
48
|
+
|
|
49
|
+
### skill-eval
|
|
50
|
+
|
|
51
|
+
Fires after Read or Skill tool calls. If the target is a SKILL.md file or a
|
|
52
|
+
Skill invocation, finds the triggering user query from the transcript and
|
|
53
|
+
writes a usage record. Builds the real-usage eval dataset over time.
|
|
54
|
+
|
|
55
|
+
### auto-activate
|
|
56
|
+
|
|
57
|
+
Fires on every user message. Evaluates activation rules against the session
|
|
58
|
+
context and outputs suggestions to stderr (shown to Claude as system messages).
|
|
59
|
+
Suggestions are advisory only — exit code is always 0. Tracks session state to
|
|
60
|
+
avoid repeated suggestions.
|
|
61
|
+
|
|
62
|
+
### skill-change-guard
|
|
63
|
+
|
|
64
|
+
Fires before Write/Edit tool calls. If the target is a SKILL.md file, outputs
|
|
65
|
+
a suggestion to run `selftune watch --skill <name>` to monitor impact. Advisory
|
|
66
|
+
only — exit code is always 0, never blocking. Uses session state to avoid
|
|
67
|
+
repeating suggestions for the same skill.
|
|
68
|
+
|
|
69
|
+
### evolution-guard
|
|
70
|
+
|
|
71
|
+
Fires before Write/Edit tool calls. If the target is a SKILL.md file that has
|
|
72
|
+
a deployed evolution under active monitoring, and no recent `selftune watch`
|
|
73
|
+
snapshot exists, this hook BLOCKS the write with exit code 2. This prevents
|
|
74
|
+
unmonitored changes to skills that are being tracked.
|
|
75
|
+
|
|
76
|
+
Exit codes:
|
|
77
|
+
|
|
78
|
+
- `0` — Allow (not a SKILL.md, not monitored, or watch is recent)
|
|
79
|
+
- `2` — Block with message (Claude Code convention for PreToolUse hooks)
|
|
80
|
+
|
|
81
|
+
Fail-open: any internal error results in exit 0 (never blocks accidentally).
|
|
82
|
+
|
|
83
|
+
## Output Format
|
|
84
|
+
|
|
85
|
+
Hook output varies by hook type:
|
|
86
|
+
|
|
87
|
+
- **prompt-log, session-stop, skill-eval**: Write to SQLite and JSONL logs silently. Exit 0 on success.
|
|
88
|
+
- **auto-activate**: Writes suggestions to stderr. Exit 0 always.
|
|
89
|
+
- **skill-change-guard**: Writes advisory message to stderr. Exit 0 always.
|
|
90
|
+
- **evolution-guard**: Writes block message to stderr on exit 2. Exit 0 when allowing.
|
|
91
|
+
|
|
92
|
+
## Common Patterns
|
|
93
|
+
|
|
94
|
+
**Debug a prompt-log hook**
|
|
95
|
+
|
|
96
|
+
> Pipe a UserPromptSubmit payload to test prompt logging:
|
|
97
|
+
>
|
|
98
|
+
> ```bash
|
|
99
|
+
> echo '{"session_id":"test","prompt":"improve my skills"}' | selftune hook prompt-log
|
|
100
|
+
> ```
|
|
101
|
+
|
|
102
|
+
**Test skill-eval with a PostToolUse payload**
|
|
103
|
+
|
|
104
|
+
> ```bash
|
|
105
|
+
> echo '{"tool_name":"Read","file_path":"/path/to/SKILL.md","session_id":"test"}' | selftune hook skill-eval
|
|
106
|
+
> ```
|
|
107
|
+
|
|
108
|
+
**Verify evolution-guard blocks correctly**
|
|
109
|
+
|
|
110
|
+
> ```bash
|
|
111
|
+
> echo '{"tool_name":"Write","file_path":"/path/to/monitored/SKILL.md"}' | selftune hook evolution-guard
|
|
112
|
+
> echo $? # Should be 2 if skill is monitored without recent watch
|
|
113
|
+
> ```
|
|
114
|
+
|
|
115
|
+
## Error Handling
|
|
116
|
+
|
|
117
|
+
If no hook name is provided or the name is unrecognized, the command exits with
|
|
118
|
+
an `UNKNOWN_COMMAND` error listing available hooks:
|
|
119
|
+
|
|
120
|
+
```
|
|
121
|
+
Unknown hook: (none). Available: prompt-log, session-stop, skill-eval, auto-activate, skill-change-guard, evolution-guard
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Troubleshooting
|
|
125
|
+
|
|
126
|
+
| Symptom | Cause | Fix |
|
|
127
|
+
| --- | --- | --- |
|
|
128
|
+
| "Unknown hook" error | Typo in hook name | Use one of: `prompt-log`, `session-stop`, `skill-eval`, `auto-activate`, `skill-change-guard`, `evolution-guard` |
|
|
129
|
+
| Hook exits 0 but no data written | Payload missing required fields | Check the hook's expected payload schema in `cli/selftune/types.ts` |
|
|
130
|
+
| evolution-guard always exits 0 | No deployed evolution for the target skill | Run `selftune evolve` first to deploy an evolution, then test the guard |
|
|
131
|
+
| auto-activate produces no suggestions | Activation rules not configured or already suggested in session | Check `~/.selftune/` for activation rules and session state files |
|
|
@@ -126,14 +126,15 @@ Code subagent calls stay up to date.
|
|
|
126
126
|
|
|
127
127
|
**Hook reference** (for troubleshooting):
|
|
128
128
|
|
|
129
|
-
| Hook | Script | Purpose | Notes
|
|
130
|
-
| -------------------------- | ----------------------------- | ----------------------------------------------- |
|
|
131
|
-
| `UserPromptSubmit` | `hooks/prompt-log.ts` | Log every user query | Accepts both `prompt` and legacy `user_prompt`
|
|
132
|
-
| `UserPromptSubmit` | `hooks/auto-activate.ts` | Suggest skills before prompt processing | Uses `additionalContext` JSON for suggestions
|
|
133
|
-
| `PreToolUse` (Write/Edit) | `hooks/skill-change-guard.ts` | Detect uncontrolled skill edits | `if` filter: only fires on `*SKILL.md` paths
|
|
134
|
-
| `PreToolUse` (Write/Edit) | `hooks/evolution-guard.ts` | Block SKILL.md edits on monitored skills | `if` filter: only fires on `*SKILL.md` paths
|
|
135
|
-
| `PostToolUse` (Read/Skill) | `hooks/skill-eval.ts` | Track skill triggers and Skill tool invocations |
|
|
136
|
-
| `
|
|
129
|
+
| Hook | Script | Purpose | Notes |
|
|
130
|
+
| -------------------------- | ----------------------------- | ----------------------------------------------- | ----------------------------------------------- |
|
|
131
|
+
| `UserPromptSubmit` | `hooks/prompt-log.ts` | Log every user query | Accepts both `prompt` and legacy `user_prompt` |
|
|
132
|
+
| `UserPromptSubmit` | `hooks/auto-activate.ts` | Suggest skills before prompt processing | Uses `additionalContext` JSON for suggestions |
|
|
133
|
+
| `PreToolUse` (Write/Edit) | `hooks/skill-change-guard.ts` | Detect uncontrolled skill edits | `if` filter: only fires on `*SKILL.md` paths |
|
|
134
|
+
| `PreToolUse` (Write/Edit) | `hooks/evolution-guard.ts` | Block SKILL.md edits on monitored skills | `if` filter: only fires on `*SKILL.md` paths |
|
|
135
|
+
| `PostToolUse` (Read/Skill) | `hooks/skill-eval.ts` | Track skill triggers and Skill tool invocations | Fast-path: skips non-PostToolUse/non-Read/Skill |
|
|
136
|
+
| `PostToolUse` (Bash) | `hooks/commit-track.ts` | Track git commits for session traceability | Fast-path: skips non-git Bash commands |
|
|
137
|
+
| `Stop` | `hooks/session-stop.ts` | Capture session telemetry | Runs async (non-blocking), 60s timeout |
|
|
137
138
|
|
|
138
139
|
**Codex agents:**
|
|
139
140
|
|
|
@@ -20,6 +20,22 @@ recent changes with auto-rollback enabled.
|
|
|
20
20
|
selftune orchestrate
|
|
21
21
|
```
|
|
22
22
|
|
|
23
|
+
Autonomous evolve settings used by orchestrate:
|
|
24
|
+
|
|
25
|
+
```text
|
|
26
|
+
confidenceThreshold = 0.6
|
|
27
|
+
maxIterations = 3
|
|
28
|
+
paretoEnabled = true
|
|
29
|
+
candidateCount = 3
|
|
30
|
+
tokenEfficiencyEnabled = false
|
|
31
|
+
withBaseline = false
|
|
32
|
+
validationModel = haiku
|
|
33
|
+
cheapLoop = true
|
|
34
|
+
gateModel = sonnet
|
|
35
|
+
adaptiveGate = true
|
|
36
|
+
proposalModel = haiku
|
|
37
|
+
```
|
|
38
|
+
|
|
23
39
|
## Flags
|
|
24
40
|
|
|
25
41
|
| Flag | Description | Default |
|
|
@@ -109,10 +125,11 @@ This is the recommended runtime for recurring autonomous scheduling.
|
|
|
109
125
|
| **Automated (loop)** | `selftune orchestrate --loop` | No agent session; LLM cost only if auto-grading or evolution triggers | Configurable interval |
|
|
110
126
|
|
|
111
127
|
In automated mode, the OS calls the CLI binary directly. No agent session
|
|
112
|
-
is created.
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
128
|
+
is created. Outside of the regular sync/status/candidate-selection logic,
|
|
129
|
+
LLM calls can come from auto-grading ungraded skills and from the evolution
|
|
130
|
+
step itself. By default, orchestrate runs proposal generation and validation
|
|
131
|
+
on `haiku`, then re-runs the final gate on `sonnet` before deploy. Risky
|
|
132
|
+
candidates are escalated to `opus` with `high` effort for the gate only.
|
|
116
133
|
|
|
117
134
|
**Cron mode:** Install OS-level scheduling with `selftune cron setup`.
|
|
118
135
|
Runs as separate invocations on a schedule (default: every 6 hours).
|
|
@@ -144,10 +161,15 @@ In autonomous mode, orchestrate calls sub-workflows in this fixed order:
|
|
|
144
161
|
1. **Sync** — refresh source-truth telemetry across all supported agents (`selftune sync`)
|
|
145
162
|
2. **Status** — compute skill health using existing grade results (reads `grading.json` outputs from previous sessions)
|
|
146
163
|
3. **Auto-grade** — grade up to `--max-auto-grade` (default 5) ungraded skills that have session data but no grades yet. Skipped during `--dry-run` (grading makes LLM calls). After grading, status is recomputed so candidate selection sees updated grades. Fail-open: individual grading errors are logged but never block the loop.
|
|
147
|
-
4. **Evolve** — run evolution on selected candidates (pre-flight is skipped
|
|
164
|
+
4. **Evolve** — run evolution on selected candidates (pre-flight is skipped; Pareto mode uses 3 candidates; cheap-loop uses `haiku` for proposal + validation and `sonnet` for the final gate; adaptive gate escalation promotes risky proposals to `opus` + `high` effort; baseline and token-efficiency stay off)
|
|
148
165
|
5. **Watch** — monitor recently evolved skills (auto-rollback enabled by default, `--recent-window` hours lookback)
|
|
149
166
|
6. **Alpha Upload** — if enrolled in the alpha program (`config.alpha.enrolled === true`) and an API key is configured, stage new canonical records (sessions, invocations, evolution evidence, orchestrate runs) into `canonical_upload_staging`, build V2 push payloads, and flush to the cloud API (`POST /api/v1/push`) with Bearer auth. Fail-open: upload errors never block the orchestrate loop. Respects `--dry-run`.
|
|
150
167
|
|
|
168
|
+
When orchestrate invokes evolve for a selected candidate, it always passes
|
|
169
|
+
`confidenceThreshold: 0.6` and `maxIterations: 3`, plus the autonomous evolve
|
|
170
|
+
defaults listed above. Those defaults are the recurring-run policy for the
|
|
171
|
+
autonomy-first loop; there are no orchestrate flags to override them per run.
|
|
172
|
+
|
|
151
173
|
Between candidate selection and evolution, orchestrate checks for
|
|
152
174
|
**cross-skill eval set overlap**. When two or more evolution candidates
|
|
153
175
|
share >30% of their positive eval queries, a warning is logged to stderr.
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# selftune Quickstart Workflow
|
|
2
|
+
|
|
3
|
+
Guided onboarding that runs init, ingest, and status in a single command.
|
|
4
|
+
Designed for first-time users who want to get selftune working immediately.
|
|
5
|
+
|
|
6
|
+
## When to Use
|
|
7
|
+
|
|
8
|
+
- The user is setting up selftune for the first time
|
|
9
|
+
- The user says "getting started", "quickstart", "onboard", or "first time"
|
|
10
|
+
- The agent needs to bootstrap selftune in one step without running init, ingest, and status separately
|
|
11
|
+
|
|
12
|
+
## Default Command
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
selftune quickstart
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Help:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
selftune quickstart --help
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Options
|
|
25
|
+
|
|
26
|
+
| Flag | Description |
|
|
27
|
+
| -------- | ---------------------- |
|
|
28
|
+
| `--help` | Show usage information |
|
|
29
|
+
|
|
30
|
+
## Steps Performed
|
|
31
|
+
|
|
32
|
+
Quickstart runs three steps automatically:
|
|
33
|
+
|
|
34
|
+
1. **Init** — Creates `~/.selftune/config.json` if it does not exist. Skips if config is already present.
|
|
35
|
+
2. **Ingest** — Runs Claude Code transcript replay if the ingest marker file does not exist. Discovers transcripts from `~/.claude/projects/` and writes session telemetry to SQLite.
|
|
36
|
+
3. **Status** — Displays current skill health using `computeStatus`. Shows pass rates, trends, and health indicators for all detected skills. For per-skill check volume, read `snapshot.skill_checks` rather than looking for a "session count" field.
|
|
37
|
+
|
|
38
|
+
After status, quickstart suggests the top 3 skills that most need attention, prioritized in this order:
|
|
39
|
+
|
|
40
|
+
- **UNGRADED/UNKNOWN** skills (highest priority) — suggests running `selftune grade`
|
|
41
|
+
- **CRITICAL** skills (pass rate below threshold) — suggests evolution
|
|
42
|
+
- **WARNING** skills — suggests improvement
|
|
43
|
+
|
|
44
|
+
## Output Format
|
|
45
|
+
|
|
46
|
+
```text
|
|
47
|
+
selftune quickstart
|
|
48
|
+
====================
|
|
49
|
+
|
|
50
|
+
[1/3] Config exists, skipping init.
|
|
51
|
+
[2/3] Running ingest claude...
|
|
52
|
+
Ingested 12 sessions.
|
|
53
|
+
[3/3] Current status:
|
|
54
|
+
|
|
55
|
+
Skill Health Summary
|
|
56
|
+
...
|
|
57
|
+
|
|
58
|
+
Suggested next steps:
|
|
59
|
+
- my-skill: pass rate 45% — needs evolution
|
|
60
|
+
- other-skill: needs grading — run `selftune grade --skill other-skill`
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
If all skills are healthy, the output ends with:
|
|
64
|
+
|
|
65
|
+
```text
|
|
66
|
+
All skills are healthy. No immediate actions needed.
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Common Patterns
|
|
70
|
+
|
|
71
|
+
**First-time setup**
|
|
72
|
+
|
|
73
|
+
> Run `selftune quickstart`. It handles init, ingest, and status automatically.
|
|
74
|
+
|
|
75
|
+
**Already initialized**
|
|
76
|
+
|
|
77
|
+
> Quickstart skips steps that are already complete (config exists, ingest marker exists). It is safe to run multiple times.
|
|
78
|
+
|
|
79
|
+
**No transcripts found**
|
|
80
|
+
|
|
81
|
+
> If no Claude Code transcripts exist in `~/.claude/projects/`, quickstart reports "No Claude Code transcripts found" and continues to the status step. The user should run some agent sessions first, then re-run quickstart.
|
|
82
|
+
|
|
83
|
+
**Init, ingest, or status fails**
|
|
84
|
+
|
|
85
|
+
> Quickstart catches errors in each step and suggests the manual command for troubleshooting (e.g., `selftune init`, `selftune ingest claude`, or `selftune status`).
|
|
86
|
+
|
|
87
|
+
## Troubleshooting
|
|
88
|
+
|
|
89
|
+
| Symptom | Cause | Fix |
|
|
90
|
+
| --- | --- | --- |
|
|
91
|
+
| "Init failed" at step 1 | Config directory permissions or corrupted config | Run `selftune init --force` manually |
|
|
92
|
+
| "Ingest failed" at step 2 | Transcript directory missing or unreadable | Verify `~/.claude/projects/` exists and contains session directories |
|
|
93
|
+
| "No sessions found" after ingest | No actionable transcripts or no skill usage detected | Run agent sessions that use skills, then re-run quickstart |
|
|
94
|
+
| "Status failed" at step 3 | SQLite database issue | Run `selftune doctor` to diagnose |
|