selftune 0.1.0 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/LICENSE +21 -0
- package/README.md +62 -5
- package/bin/selftune.cjs +3 -1
- package/cli/selftune/constants.ts +0 -6
- package/cli/selftune/dashboard.ts +176 -0
- package/cli/selftune/evolution/evolve.ts +31 -20
- package/cli/selftune/evolution/propose-description.ts +2 -3
- package/cli/selftune/evolution/rollback.ts +1 -1
- package/cli/selftune/evolution/validate-proposal.ts +3 -4
- package/cli/selftune/grading/grade-session.ts +18 -62
- package/cli/selftune/index.ts +21 -0
- package/cli/selftune/ingestors/codex-rollout.ts +1 -1
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/init.ts +25 -36
- package/cli/selftune/last.ts +138 -0
- package/cli/selftune/observability.ts +1 -1
- package/cli/selftune/status.ts +318 -0
- package/cli/selftune/types.ts +1 -1
- package/cli/selftune/utils/llm-call.ts +8 -57
- package/dashboard/index.html +1119 -0
- package/package.json +32 -3
- package/skill/SKILL.md +19 -14
- package/skill/Workflows/Doctor.md +2 -9
- package/skill/Workflows/Evals.md +8 -17
- package/skill/Workflows/Evolve.md +5 -11
- package/skill/Workflows/Grade.md +5 -10
- package/skill/Workflows/Ingest.md +16 -34
- package/skill/Workflows/Initialize.md +32 -34
- package/skill/Workflows/Rollback.md +3 -10
- package/skill/Workflows/Watch.md +2 -9
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/).
|
|
7
7
|
|
|
8
|
+
## [0.1.4] - 2026-03-01
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- `selftune status` — CLI skill health summary with pass rates, trends, and system health
|
|
13
|
+
- `selftune last` — Quick insight from the most recent session
|
|
14
|
+
- `selftune dashboard` — Skill-health-centric HTML dashboard with grid view and drill-down
|
|
15
|
+
- CI/CD workflows: publish, auto-bump, CodeQL, scorecard
|
|
16
|
+
- FOSS governance: LICENSE (MIT), CODE_OF_CONDUCT, CONTRIBUTING, SECURITY
|
|
17
|
+
- npm package configuration with CJS bin entry point
|
|
18
|
+
|
|
8
19
|
## [0.1.0] - 2026-02-28
|
|
9
20
|
|
|
10
21
|
### Added
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 WellDunDun
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -1,9 +1,18 @@
|
|
|
1
|
+
[CI](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml)
|
|
2
|
+
[CodeQL](https://github.com/WellDunDun/selftune/actions/workflows/codeql.yml)
|
|
3
|
+
[OpenSSF Scorecard](https://securityscorecards.dev/viewer/?uri=github.com/WellDunDun/selftune)
|
|
4
|
+
[npm](https://www.npmjs.com/package/selftune)
|
|
5
|
+
[License](LICENSE)
|
|
6
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
7
|
+
[Dependencies](https://www.npmjs.com/package/selftune?activeTab=dependencies)
|
|
8
|
+
[Bun](https://bun.sh)
|
|
9
|
+
|
|
1
10
|
# selftune — Skill Observability & Continuous Improvement CLI
|
|
2
11
|
|
|
3
12
|
[npm](https://www.npmjs.com/package/selftune)
|
|
4
|
-
[](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml)
|
|
5
14
|
[License](LICENSE)
|
|
6
|
-
[]()
|
|
15
|
+
[Dependencies](https://www.npmjs.com/package/selftune?activeTab=dependencies)
|
|
7
16
|
[Bun](https://bun.sh)
|
|
8
17
|
|
|
9
18
|
Observe real sessions, detect missed triggers, grade execution quality, and automatically evolve skill descriptions toward the language real users actually use.
|
|
@@ -54,7 +63,25 @@ selftune closes this feedback loop.
|
|
|
54
63
|
|
|
55
64
|
---
|
|
56
65
|
|
|
57
|
-
##
|
|
66
|
+
## Setup
|
|
67
|
+
|
|
68
|
+
### 1. Add the skill
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
npx skills add WellDunDun/selftune
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### 2. Initialize
|
|
75
|
+
|
|
76
|
+
Tell your agent: **"initialize selftune"**
|
|
77
|
+
|
|
78
|
+
The agent will install the CLI (`npm install -g selftune`) if needed, run `selftune init` to bootstrap config, install hooks, and verify with `selftune doctor`.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Development
|
|
83
|
+
|
|
84
|
+
For contributors running from source.
|
|
58
85
|
|
|
59
86
|
### 1. Initialize
|
|
60
87
|
|
|
@@ -68,7 +95,7 @@ Use `--agent claude_code|codex|opencode` to override detection, `--llm-mode agen
|
|
|
68
95
|
|
|
69
96
|
### 4. Install hooks (Claude Code)
|
|
70
97
|
|
|
71
|
-
If `init` reports hooks are not installed, merge the entries from `skill/settings_snippet.json` into `~/.claude/settings.json`.
|
|
98
|
+
If `init` reports hooks are not installed, merge the entries from `skill/settings_snippet.json` into `~/.claude/settings.json`. Derive hook script paths from the `cli_path` field in `~/.selftune/config.json` — the hooks directory is at `dirname(cli_path)/hooks/`.
|
|
72
99
|
|
|
73
100
|
### 5. Verify setup
|
|
74
101
|
|
|
@@ -112,12 +139,15 @@ selftune <command> [options]
|
|
|
112
139
|
| `evolve --skill <name> --skill-path <path>` | Analyze failures, propose and deploy improved description |
|
|
113
140
|
| `rollback --skill <name> --skill-path <path>` | Restore pre-evolution description |
|
|
114
141
|
| `watch --skill <name> --skill-path <path>` | Monitor post-deploy pass rates, detect regressions |
|
|
142
|
+
| `status` | Show skill health summary (pass rates, trends, missed queries) |
|
|
143
|
+
| `last` | Show quick insight from the most recent session |
|
|
115
144
|
| `doctor` | Health checks on logs, hooks, config, and schema |
|
|
145
|
+
| `dashboard` | Open skill-health-centric HTML dashboard in browser |
|
|
116
146
|
| `ingest-codex` | Batch ingest Codex rollout logs |
|
|
117
147
|
| `ingest-opencode` | Backfill historical OpenCode sessions from SQLite |
|
|
118
148
|
| `wrap-codex -- <args>` | Real-time Codex wrapper with telemetry |
|
|
119
149
|
|
|
120
|
-
No separate API key required — grading and evolution use whatever agent CLI you already have installed
|
|
150
|
+
No separate API key required — grading and evolution use whatever agent CLI you already have installed (Claude Code, Codex, or OpenCode).
|
|
121
151
|
|
|
122
152
|
See `skill/Workflows/` for detailed step-by-step guides for each command.
|
|
123
153
|
|
|
@@ -185,6 +215,9 @@ cli/selftune/
|
|
|
185
215
|
├── init.ts Agent detection, config bootstrap
|
|
186
216
|
├── types.ts, constants.ts Shared interfaces and constants
|
|
187
217
|
├── observability.ts Health checks (doctor command)
|
|
218
|
+
├── status.ts Skill health summary (status command)
|
|
219
|
+
├── last.ts Last session insight (last command)
|
|
220
|
+
├── dashboard.ts HTML dashboard builder (dashboard command)
|
|
188
221
|
├── utils/ JSONL, transcript parsing, LLM calls, schema validation
|
|
189
222
|
├── hooks/ Claude Code + OpenCode telemetry capture
|
|
190
223
|
├── ingestors/ Codex adapters + OpenCode backfill
|
|
@@ -193,6 +226,9 @@ cli/selftune/
|
|
|
193
226
|
├── evolution/ Failure extraction, proposal, validation, deploy, rollback
|
|
194
227
|
└── monitoring/ Post-deploy regression detection
|
|
195
228
|
|
|
229
|
+
dashboard/
|
|
230
|
+
└── index.html Skill-health-centric HTML dashboard template
|
|
231
|
+
|
|
196
232
|
skill/
|
|
197
233
|
├── SKILL.md Routing table (~120 lines)
|
|
198
234
|
├── settings_snippet.json Claude Code hook config template
|
|
@@ -248,6 +284,26 @@ Zero runtime dependencies. Uses Bun built-ins only.
|
|
|
248
284
|
|
|
249
285
|
---
|
|
250
286
|
|
|
287
|
+
## Contributing
|
|
288
|
+
|
|
289
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, architecture rules, and PR guidelines.
|
|
290
|
+
|
|
291
|
+
Please follow our [Code of Conduct](CODE_OF_CONDUCT.md).
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## Security
|
|
296
|
+
|
|
297
|
+
To report a vulnerability, see [SECURITY.md](SECURITY.md).
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## Sponsor
|
|
302
|
+
|
|
303
|
+
If selftune saves you time, consider [sponsoring the project](https://github.com/sponsors/WellDunDun).
|
|
304
|
+
|
|
305
|
+
---
|
|
306
|
+
|
|
251
307
|
## Milestones
|
|
252
308
|
|
|
253
309
|
| Version | Scope | Status |
|
|
@@ -257,3 +313,4 @@ Zero runtime dependencies. Uses Bun built-ins only.
|
|
|
257
313
|
| v0.3 | Evolution loop (propose, validate, deploy, rollback) | Done |
|
|
258
314
|
| v0.4 | Post-deploy monitoring, regression detection | Done |
|
|
259
315
|
| v0.5 | Agent-first skill restructure, `init` command, config bootstrap | Done |
|
|
316
|
+
| v0.6 | Three-layer observability: `status`, `last`, redesigned dashboard | Done |
|
package/bin/selftune.cjs
CHANGED
|
@@ -15,7 +15,9 @@ for (const [cmd, args] of runners) {
|
|
|
15
15
|
execFileSync(cmd, args, { stdio: "inherit" });
|
|
16
16
|
process.exit(0);
|
|
17
17
|
} catch (e) {
|
|
18
|
-
|
|
18
|
+
// If the runner exits non-zero, propagate that status.
|
|
19
|
+
// If the runner is not found (ENOENT), e.status is null — continue to next runner.
|
|
20
|
+
if (e.status != null) {
|
|
19
21
|
process.exit(e.status);
|
|
20
22
|
}
|
|
21
23
|
}
|
|
@@ -63,9 +63,3 @@ export const REQUIRED_FIELDS: Record<string, Set<string>> = {
|
|
|
63
63
|
|
|
64
64
|
/** Agent CLI candidates in detection order. */
|
|
65
65
|
export const AGENT_CANDIDATES = ["claude", "codex", "opencode"] as const;
|
|
66
|
-
|
|
67
|
-
/** Anthropic API URL for direct grading. */
|
|
68
|
-
export const API_URL = "https://api.anthropic.com/v1/messages";
|
|
69
|
-
|
|
70
|
-
/** Default model for direct API grading. */
|
|
71
|
-
export const MODEL = "claude-sonnet-4-20250514";
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* selftune dashboard — Exports JSONL data into a standalone HTML viewer.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* selftune dashboard — Open dashboard in default browser
|
|
6
|
+
* selftune dashboard --export — Export data-embedded HTML to stdout
|
|
7
|
+
* selftune dashboard --out FILE — Write data-embedded HTML to FILE
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
11
|
+
import { homedir } from "node:os";
|
|
12
|
+
import { dirname, join, resolve } from "node:path";
|
|
13
|
+
import { EVOLUTION_AUDIT_LOG, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "./constants.js";
|
|
14
|
+
import { getLastDeployedProposal, readAuditTrail } from "./evolution/audit.js";
|
|
15
|
+
import { computeMonitoringSnapshot } from "./monitoring/watch.js";
|
|
16
|
+
import type {
|
|
17
|
+
EvolutionAuditEntry,
|
|
18
|
+
QueryLogRecord,
|
|
19
|
+
SessionTelemetryRecord,
|
|
20
|
+
SkillUsageRecord,
|
|
21
|
+
} from "./types.js";
|
|
22
|
+
import { readJsonl } from "./utils/jsonl.js";
|
|
23
|
+
|
|
24
|
+
/**
 * Locate the dashboard HTML template on disk.
 *
 * Three locations are probed in order: two derived from the parent of this
 * module's directory (`import.meta.dir`), then `dashboard/index.html`
 * resolved against the current working directory. The first one that
 * exists is used.
 *
 * @returns Path of the first candidate that exists.
 * @throws Error when none of the candidate paths exist.
 */
function findViewerHTML(): string {
  const moduleParent = dirname(import.meta.dir);
  const located = [
    join(moduleParent, "..", "dashboard", "index.html"),
    join(moduleParent, "dashboard", "index.html"),
    resolve("dashboard", "index.html"),
  ].find((candidate) => existsSync(candidate));

  if (located === undefined) {
    throw new Error("Could not find dashboard/index.html. Ensure it exists in the selftune repo.");
  }
  return located;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Build the standalone dashboard page: the HTML template with all log data
 * and a few derived views embedded as a JSON `<script>` element.
 *
 * Reads the telemetry, skill-usage, query and evolution-audit JSONL logs.
 * When all four are empty, the checked paths are printed to stderr and the
 * process exits with status 1.
 *
 * Derived data placed under `computed`:
 * - `snapshots`: one `computeMonitoringSnapshot` result per distinct skill
 *   name, with the last deployed proposal's eval pass rate as baseline
 *   (0.5 when there is none).
 * - `unmatched`: logged queries whose normalized text (lower-cased, trimmed)
 *   never appears on a triggered skill-usage record.
 * - `pendingProposals`: the first "created"/"validated" audit entry of every
 *   proposal that has no "deployed", "rejected" or "rolled_back" entry.
 *
 * @returns The template HTML with the data script inserted before the last
 *   closing body tag, or appended when the template has no such tag.
 * @throws Error (from `findViewerHTML`) when the template cannot be located.
 */
function buildEmbeddedHTML(): string {
  const template = readFileSync(findViewerHTML(), "utf-8");

  const telemetry = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
  const skills = readJsonl<SkillUsageRecord>(SKILL_LOG);
  const queries = readJsonl<QueryLogRecord>(QUERY_LOG);
  const evolution = readJsonl<EvolutionAuditEntry>(EVOLUTION_AUDIT_LOG);

  const totalRecords = telemetry.length + skills.length + queries.length + evolution.length;

  if (totalRecords === 0) {
    console.error("No log data found. Run some sessions first.");
    console.error(` Checked: ${TELEMETRY_LOG}`);
    console.error(` ${SKILL_LOG}`);
    console.error(` ${QUERY_LOG}`);
    console.error(` ${EVOLUTION_AUDIT_LOG}`);
    process.exit(1);
  }

  // Compute per-skill monitoring snapshots
  const skillNames = [...new Set(skills.map((r) => r.skill_name))];
  const snapshots: Record<string, ReturnType<typeof computeMonitoringSnapshot>> = {};
  for (const name of skillNames) {
    const lastDeployed = getLastDeployedProposal(name);
    // Fall back to a neutral 0.5 baseline when nothing has been deployed yet.
    const baselinePassRate = lastDeployed?.eval_snapshot?.pass_rate ?? 0.5;
    snapshots[name] = computeMonitoringSnapshot(
      name,
      telemetry,
      skills,
      queries,
      telemetry.length,
      baselinePassRate,
    );
  }

  // Compute unmatched queries (case- and whitespace-insensitive comparison)
  const triggeredQueries = new Set(
    skills.filter((r) => r.triggered).map((r) => r.query.toLowerCase().trim()),
  );
  const unmatched = queries
    .filter((q) => !triggeredQueries.has(q.query.toLowerCase().trim()))
    .map((q) => ({
      timestamp: q.timestamp,
      session_id: q.session_id,
      query: q.query,
    }));

  // Compute pending proposals. A Set of "closed" proposal ids replaces the
  // former plain-object lookup, so ids such as "constructor" or "__proto__"
  // cannot collide with Object.prototype members.
  const auditTrail = readAuditTrail();
  const terminalActions = new Set<string>(["deployed", "rejected", "rolled_back"]);
  const closedProposals = new Set<string>();
  for (const e of auditTrail) {
    if (terminalActions.has(e.action)) closedProposals.add(e.proposal_id);
  }
  // Deduplicate by proposal_id: one entry per pending proposal
  const seenProposals = new Set<string>();
  const pendingProposals = auditTrail.filter((e) => {
    if (e.action !== "created" && e.action !== "validated") return false;
    if (closedProposals.has(e.proposal_id) || seenProposals.has(e.proposal_id)) return false;
    seenProposals.add(e.proposal_id);
    return true;
  });

  const data = {
    telemetry,
    skills,
    queries,
    evolution,
    computed: {
      snapshots,
      unmatched,
      pendingProposals,
    },
  };

  // Escape every "<" as \u003c. In JSON output "<" can only occur inside
  // string values, and \u003c parses back to "<", so the data is unchanged
  // while no "</script" variant (e.g. "</script >") or "<!--" sequence can
  // reach the HTML parser and terminate or re-interpret the script element.
  const safeJson = JSON.stringify(data).replace(/</g, "\\u003c");
  const dataScript = `<script id="embedded-data" type="application/json">${safeJson}</script>`;

  // Splice by index instead of String.prototype.replace with a string
  // replacement: "$$", "$&", "$`" and "$'" inside logged queries would
  // otherwise be expanded as replacement patterns and corrupt the JSON.
  const bodyClose = template.lastIndexOf("</body>");
  if (bodyClose === -1) {
    // Template without a closing body tag: append so the data is not dropped.
    return `${template}\n${dataScript}\n`;
  }
  return `${template.slice(0, bodyClose)}${dataScript}\n${template.slice(bodyClose)}`;
}
|
|
121
|
+
|
|
122
|
+
/**
 * CLI entry point for `selftune dashboard`.
 *
 * Flags are handled in this order:
 * - `--help` / `-h`: print usage and exit with status 0.
 * - `--export`: write the data-embedded HTML to stdout and return.
 * - `--out FILE`: write the data-embedded HTML to FILE and return; exits
 *   with status 1 when no file argument follows the flag.
 * - otherwise: write the page to `~/.selftune/dashboard.html`, try to open
 *   it with `open` (macOS) or `xdg-open` (Linux), print a manual `file://`
 *   hint when that is not possible, then exit with status 0.
 */
export async function cliMain(): Promise<void> {
  const argv = process.argv.slice(2);

  const wantsHelp = argv.some((flag) => flag === "--help" || flag === "-h");
  if (wantsHelp) {
    const usage = `selftune dashboard — Visual data dashboard

Usage:
selftune dashboard Open dashboard in default browser
selftune dashboard --export Export data-embedded HTML to stdout
selftune dashboard --out FILE Write data-embedded HTML to FILE`;
    console.log(usage);
    process.exit(0);
  }

  if (argv.includes("--export")) {
    process.stdout.write(buildEmbeddedHTML());
    return;
  }

  const outFlagAt = argv.indexOf("--out");
  if (outFlagAt !== -1) {
    const destination = argv[outFlagAt + 1];
    if (!destination) {
      console.error("--out requires a file path argument");
      process.exit(1);
    }
    writeFileSync(destination, buildEmbeddedHTML(), "utf-8");
    console.log(`Dashboard written to ${destination}`);
    return;
  }

  // Default: write to a fixed file under ~/.selftune and open it in a browser
  const selftuneDir = join(homedir(), ".selftune");
  if (!existsSync(selftuneDir)) {
    mkdirSync(selftuneDir, { recursive: true });
  }
  const pagePath = join(selftuneDir, "dashboard.html");
  writeFileSync(pagePath, buildEmbeddedHTML(), "utf-8");

  console.log(`Dashboard saved to ${pagePath}`);
  console.log("Opening in browser...");

  // Only macOS and Linux have a known opener; any other platform, a missing
  // opener binary, or a non-zero exit falls through to the manual hint.
  const openers = new Map<string, string>([
    ["darwin", "open"],
    ["linux", "xdg-open"],
  ]);
  const opener = openers.get(process.platform);
  let opened = false;
  if (opener !== undefined) {
    try {
      const child = Bun.spawn([opener, pagePath], { stdio: ["ignore", "ignore", "ignore"] });
      await child.exited;
      opened = child.exitCode === 0;
    } catch {
      opened = false;
    }
  }
  if (!opened) {
    console.log(`Open manually: file://${pagePath}`);
  }
  process.exit(0);
}
|
|
@@ -23,8 +23,8 @@ import { readJsonl } from "../utils/jsonl.js";
|
|
|
23
23
|
import { appendAuditEntry } from "./audit.js";
|
|
24
24
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
25
25
|
import { generateProposal } from "./propose-description.js";
|
|
26
|
-
import { validateProposal } from "./validate-proposal.js";
|
|
27
26
|
import type { ValidationResult } from "./validate-proposal.js";
|
|
27
|
+
import { validateProposal } from "./validate-proposal.js";
|
|
28
28
|
|
|
29
29
|
// ---------------------------------------------------------------------------
|
|
30
30
|
// Types
|
|
@@ -34,8 +34,7 @@ export interface EvolveOptions {
|
|
|
34
34
|
skillName: string;
|
|
35
35
|
skillPath: string;
|
|
36
36
|
evalSetPath?: string;
|
|
37
|
-
|
|
38
|
-
agent?: string;
|
|
37
|
+
agent: string;
|
|
39
38
|
dryRun: boolean;
|
|
40
39
|
confidenceThreshold: number; // default 0.6
|
|
41
40
|
maxIterations: number; // default 3
|
|
@@ -88,16 +87,8 @@ export async function evolve(
|
|
|
88
87
|
options: EvolveOptions,
|
|
89
88
|
_deps: EvolveDeps = {},
|
|
90
89
|
): Promise<EvolveResult> {
|
|
91
|
-
const {
|
|
92
|
-
|
|
93
|
-
skillPath,
|
|
94
|
-
evalSetPath,
|
|
95
|
-
mode,
|
|
96
|
-
agent,
|
|
97
|
-
dryRun,
|
|
98
|
-
confidenceThreshold,
|
|
99
|
-
maxIterations,
|
|
100
|
-
} = options;
|
|
90
|
+
const { skillName, skillPath, evalSetPath, agent, dryRun, confidenceThreshold, maxIterations } =
|
|
91
|
+
options;
|
|
101
92
|
|
|
102
93
|
// Resolve injectable dependencies with real-import fallbacks
|
|
103
94
|
const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
|
|
@@ -201,7 +192,6 @@ export async function evolve(
|
|
|
201
192
|
effectiveMissedQueries,
|
|
202
193
|
skillName,
|
|
203
194
|
skillPath,
|
|
204
|
-
mode,
|
|
205
195
|
agent,
|
|
206
196
|
);
|
|
207
197
|
|
|
@@ -238,7 +228,7 @@ export async function evolve(
|
|
|
238
228
|
}
|
|
239
229
|
|
|
240
230
|
// Step 10: Validate against eval set
|
|
241
|
-
const validation = await _validateProposal(proposal, evalSet,
|
|
231
|
+
const validation = await _validateProposal(proposal, evalSet, agent);
|
|
242
232
|
lastValidation = validation;
|
|
243
233
|
|
|
244
234
|
// Step 11: Audit "validated"
|
|
@@ -347,7 +337,6 @@ export async function cliMain(): Promise<void> {
|
|
|
347
337
|
skill: { type: "string" },
|
|
348
338
|
"skill-path": { type: "string" },
|
|
349
339
|
"eval-set": { type: "string" },
|
|
350
|
-
mode: { type: "string", default: "agent" },
|
|
351
340
|
agent: { type: "string" },
|
|
352
341
|
"dry-run": { type: "boolean", default: false },
|
|
353
342
|
confidence: { type: "string", default: "0.6" },
|
|
@@ -367,7 +356,6 @@ Options:
|
|
|
367
356
|
--skill Skill name (required)
|
|
368
357
|
--skill-path Path to SKILL.md (required)
|
|
369
358
|
--eval-set Path to eval set JSON (optional, builds from logs if omitted)
|
|
370
|
-
--mode Execution mode: "agent" or "api" (default: "agent")
|
|
371
359
|
--agent Agent CLI to use (claude, codex, opencode)
|
|
372
360
|
--dry-run Validate proposal without deploying
|
|
373
361
|
--confidence Confidence threshold 0.0-1.0 (default: 0.6)
|
|
@@ -381,14 +369,37 @@ Options:
|
|
|
381
369
|
process.exit(1);
|
|
382
370
|
}
|
|
383
371
|
|
|
384
|
-
const
|
|
372
|
+
const { detectAgent } = await import("../utils/llm-call.js");
|
|
373
|
+
const requestedAgent = values.agent;
|
|
374
|
+
if (requestedAgent && !Bun.which(requestedAgent)) {
|
|
375
|
+
console.error(
|
|
376
|
+
JSON.stringify({
|
|
377
|
+
level: "error",
|
|
378
|
+
code: "agent_not_in_path",
|
|
379
|
+
message: `Agent CLI '${requestedAgent}' not found in PATH.`,
|
|
380
|
+
action: "Install it or omit --agent to use auto-detection.",
|
|
381
|
+
}),
|
|
382
|
+
);
|
|
383
|
+
process.exit(1);
|
|
384
|
+
}
|
|
385
|
+
const agent = requestedAgent ?? detectAgent();
|
|
386
|
+
if (!agent) {
|
|
387
|
+
console.error(
|
|
388
|
+
JSON.stringify({
|
|
389
|
+
level: "error",
|
|
390
|
+
code: "agent_not_found",
|
|
391
|
+
message: "No agent CLI (claude/codex/opencode) found in PATH.",
|
|
392
|
+
action: "Install Claude Code, Codex, or OpenCode.",
|
|
393
|
+
}),
|
|
394
|
+
);
|
|
395
|
+
process.exit(1);
|
|
396
|
+
}
|
|
385
397
|
|
|
386
398
|
const result = await evolve({
|
|
387
399
|
skillName: values.skill,
|
|
388
400
|
skillPath: values["skill-path"],
|
|
389
401
|
evalSetPath: values["eval-set"],
|
|
390
|
-
|
|
391
|
-
agent: values.agent,
|
|
402
|
+
agent,
|
|
392
403
|
dryRun: values["dry-run"] ?? false,
|
|
393
404
|
confidenceThreshold: Number.parseFloat(values.confidence ?? "0.6"),
|
|
394
405
|
maxIterations: Number.parseInt(values["max-iterations"] ?? "3", 10),
|
|
@@ -120,11 +120,10 @@ export async function generateProposal(
|
|
|
120
120
|
missedQueries: string[],
|
|
121
121
|
skillName: string,
|
|
122
122
|
skillPath: string,
|
|
123
|
-
|
|
124
|
-
agent?: string,
|
|
123
|
+
agent: string,
|
|
125
124
|
): Promise<EvolutionProposal> {
|
|
126
125
|
const prompt = buildProposalPrompt(currentDescription, failurePatterns, missedQueries, skillName);
|
|
127
|
-
const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt,
|
|
126
|
+
const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt, agent);
|
|
128
127
|
const { proposed_description, rationale, confidence } = parseProposalResponse(rawResponse);
|
|
129
128
|
|
|
130
129
|
return {
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* 3. Recording a "rolled_back" entry in the audit trail
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
import { existsSync,
|
|
10
|
+
import { existsSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
11
11
|
import { basename, dirname, join } from "node:path";
|
|
12
12
|
import { parseArgs } from "node:util";
|
|
13
13
|
|
|
@@ -61,8 +61,7 @@ export function parseTriggerResponse(response: string): boolean {
|
|
|
61
61
|
export async function validateProposal(
|
|
62
62
|
proposal: EvolutionProposal,
|
|
63
63
|
evalSet: EvalEntry[],
|
|
64
|
-
|
|
65
|
-
agent?: string,
|
|
64
|
+
agent: string,
|
|
66
65
|
): Promise<ValidationResult> {
|
|
67
66
|
if (evalSet.length === 0) {
|
|
68
67
|
return {
|
|
@@ -85,14 +84,14 @@ export async function validateProposal(
|
|
|
85
84
|
for (const entry of evalSet) {
|
|
86
85
|
// Check with original description
|
|
87
86
|
const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
|
|
88
|
-
const beforeRaw = await callLlm(systemPrompt, beforePrompt,
|
|
87
|
+
const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent);
|
|
89
88
|
const beforeTriggered = parseTriggerResponse(beforeRaw);
|
|
90
89
|
const beforePass =
|
|
91
90
|
(entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
|
|
92
91
|
|
|
93
92
|
// Check with proposed description
|
|
94
93
|
const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
|
|
95
|
-
const afterRaw = await callLlm(systemPrompt, afterPrompt,
|
|
94
|
+
const afterRaw = await callLlm(systemPrompt, afterPrompt, agent);
|
|
96
95
|
const afterTriggered = parseTriggerResponse(afterRaw);
|
|
97
96
|
const afterPass =
|
|
98
97
|
(entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
|
|
@@ -5,9 +5,7 @@
|
|
|
5
5
|
* Rubric-based grader for Claude Code skill sessions.
|
|
6
6
|
* Migrated from grade_session.py.
|
|
7
7
|
*
|
|
8
|
-
*
|
|
9
|
-
* 1. --use-agent (default when no ANTHROPIC_API_KEY) — invokes installed agent CLI
|
|
10
|
-
* 2. --use-api (default when ANTHROPIC_API_KEY set) — calls Anthropic API directly
|
|
8
|
+
* Grades via installed agent CLI (claude/codex/opencode).
|
|
11
9
|
*/
|
|
12
10
|
|
|
13
11
|
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
@@ -26,7 +24,6 @@ import {
|
|
|
26
24
|
detectAgent as _detectAgent,
|
|
27
25
|
stripMarkdownFences as _stripMarkdownFences,
|
|
28
26
|
callViaAgent,
|
|
29
|
-
callViaApi,
|
|
30
27
|
} from "../utils/llm-call.js";
|
|
31
28
|
import { readExcerpt } from "../utils/transcript.js";
|
|
32
29
|
|
|
@@ -226,22 +223,6 @@ export async function gradeViaAgent(prompt: string, agent: string): Promise<Grad
|
|
|
226
223
|
}
|
|
227
224
|
}
|
|
228
225
|
|
|
229
|
-
// ---------------------------------------------------------------------------
|
|
230
|
-
// Grading via direct Anthropic API
|
|
231
|
-
// ---------------------------------------------------------------------------
|
|
232
|
-
|
|
233
|
-
export async function gradeViaApi(prompt: string): Promise<GraderOutput> {
|
|
234
|
-
const raw = await callViaApi(GRADER_SYSTEM, prompt);
|
|
235
|
-
try {
|
|
236
|
-
return JSON.parse(_stripMarkdownFences(raw)) as GraderOutput;
|
|
237
|
-
} catch (err) {
|
|
238
|
-
throw new Error(
|
|
239
|
-
`gradeViaApi: failed to parse LLM output as JSON. Raw (truncated): ${raw.slice(0, 200)}`,
|
|
240
|
-
{ cause: err },
|
|
241
|
-
);
|
|
242
|
-
}
|
|
243
|
-
}
|
|
244
|
-
|
|
245
226
|
// ---------------------------------------------------------------------------
|
|
246
227
|
// Result assembly
|
|
247
228
|
// ---------------------------------------------------------------------------
|
|
@@ -306,8 +287,6 @@ export async function cliMain(): Promise<void> {
|
|
|
306
287
|
transcript: { type: "string" },
|
|
307
288
|
"telemetry-log": { type: "string", default: TELEMETRY_LOG },
|
|
308
289
|
output: { type: "string", default: "grading.json" },
|
|
309
|
-
"use-agent": { type: "boolean", default: false },
|
|
310
|
-
"use-api": { type: "boolean", default: false },
|
|
311
290
|
agent: { type: "string" },
|
|
312
291
|
"show-transcript": { type: "boolean", default: false },
|
|
313
292
|
},
|
|
@@ -320,50 +299,31 @@ export async function cliMain(): Promise<void> {
|
|
|
320
299
|
process.exit(1);
|
|
321
300
|
}
|
|
322
301
|
|
|
323
|
-
// --- Determine
|
|
324
|
-
const hasApiKey = Boolean(process.env.ANTHROPIC_API_KEY);
|
|
325
|
-
let mode: "agent" | "api";
|
|
302
|
+
// --- Determine agent ---
|
|
326
303
|
let agent: string | null = null;
|
|
327
|
-
|
|
328
|
-
if (values
|
|
329
|
-
|
|
330
|
-
} else if (values["use-agent"]) {
|
|
331
|
-
mode = "agent";
|
|
332
|
-
} else {
|
|
333
|
-
const availableAgent = _detectAgent();
|
|
334
|
-
if (availableAgent) {
|
|
335
|
-
mode = "agent";
|
|
336
|
-
} else if (hasApiKey) {
|
|
337
|
-
mode = "api";
|
|
338
|
-
} else {
|
|
304
|
+
const validAgents = ["claude", "codex", "opencode"];
|
|
305
|
+
if (values.agent) {
|
|
306
|
+
if (!validAgents.includes(values.agent)) {
|
|
339
307
|
console.error(
|
|
340
|
-
|
|
341
|
-
"and ANTHROPIC_API_KEY not set.\n" +
|
|
342
|
-
"Install Claude Code, Codex, or OpenCode, or set ANTHROPIC_API_KEY.",
|
|
308
|
+
`[ERROR] Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
|
|
343
309
|
);
|
|
344
310
|
process.exit(1);
|
|
345
311
|
}
|
|
312
|
+
agent = values.agent;
|
|
313
|
+
} else {
|
|
314
|
+
agent = _detectAgent();
|
|
346
315
|
}
|
|
347
316
|
|
|
348
|
-
if (
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
}
|
|
355
|
-
if (!agent) {
|
|
356
|
-
console.error(
|
|
357
|
-
"[ERROR] --use-agent specified but no agent found in PATH.\n" +
|
|
358
|
-
"Install claude, codex, or opencode, or use --use-api instead.",
|
|
359
|
-
);
|
|
360
|
-
process.exit(1);
|
|
361
|
-
}
|
|
362
|
-
console.error(`[INFO] Grading via agent: ${agent}`);
|
|
363
|
-
} else {
|
|
364
|
-
console.error("[INFO] Grading via direct Anthropic API");
|
|
317
|
+
if (!agent) {
|
|
318
|
+
console.error(
|
|
319
|
+
"[ERROR] No agent CLI (claude/codex/opencode) found in PATH.\n" +
|
|
320
|
+
"Install Claude Code, Codex, or OpenCode.",
|
|
321
|
+
);
|
|
322
|
+
process.exit(1);
|
|
365
323
|
}
|
|
366
324
|
|
|
325
|
+
console.error(`[INFO] Grading via agent: ${agent}`);
|
|
326
|
+
|
|
367
327
|
// --- Resolve expectations ---
|
|
368
328
|
let expectations: string[] = [];
|
|
369
329
|
if (values["evals-json"] && values["eval-id"] != null) {
|
|
@@ -427,11 +387,7 @@ export async function cliMain(): Promise<void> {
|
|
|
427
387
|
|
|
428
388
|
let graderOutput: GraderOutput;
|
|
429
389
|
try {
|
|
430
|
-
|
|
431
|
-
graderOutput = await gradeViaAgent(prompt, agent as string);
|
|
432
|
-
} else {
|
|
433
|
-
graderOutput = await gradeViaApi(prompt);
|
|
434
|
-
}
|
|
390
|
+
graderOutput = await gradeViaAgent(prompt, agent);
|
|
435
391
|
} catch (e) {
|
|
436
392
|
console.error(`[ERROR] Grading failed: ${e}`);
|
|
437
393
|
process.exit(1);
|