skilltest 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +9 -2
- package/README.md +51 -5
- package/dist/index.js +408 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/CLAUDE.md
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
- `lint`: static/offline quality checks
|
|
8
8
|
- `trigger`: model-based triggerability testing
|
|
9
9
|
- `eval`: end-to-end execution + grader-based scoring
|
|
10
|
+
- `check`: lint + trigger + eval quality gates in one run
|
|
10
11
|
|
|
11
12
|
The CLI is published as `skilltest` and built for `npx skilltest` usage.
|
|
12
13
|
|
|
@@ -18,6 +19,7 @@ The CLI is published as `skilltest` and built for `npx skilltest` usage.
|
|
|
18
19
|
- `src/core/linter/`: lint check modules and orchestrator
|
|
19
20
|
- `src/core/trigger-tester.ts`: query generation + trigger simulation + metrics
|
|
20
21
|
- `src/core/eval-runner.ts`: prompt generation/loading + skill execution + grading loop
|
|
22
|
+
- `src/core/check-runner.ts`: orchestrates lint + trigger + eval with threshold gates
|
|
21
23
|
- `src/core/grader.ts`: structured grader prompt + JSON parse
|
|
22
24
|
- `src/providers/`: LLM provider abstraction (`sendMessage`) and provider implementations
|
|
23
25
|
- `src/reporters/`: terminal rendering and JSON output helper
|
|
@@ -68,6 +70,9 @@ ANTHROPIC_API_KEY=your-key node dist/index.js trigger test-fixtures/sample-skill
|
|
|
68
70
|
- `sendMessage(systemPrompt, userMessage, { model }) => Promise<string>`
|
|
69
71
|
- Lint is fully offline and first-class.
|
|
70
72
|
- Trigger/eval rely on the same provider abstraction.
|
|
73
|
+
- `check` wraps lint + trigger + eval and enforces minimum thresholds:
|
|
74
|
+
- trigger F1
|
|
75
|
+
- eval assertion pass rate
|
|
71
76
|
- JSON mode is strict:
|
|
72
77
|
- no spinners
|
|
73
78
|
- no colored output
|
|
@@ -79,7 +84,9 @@ ANTHROPIC_API_KEY=your-key node dist/index.js trigger test-fixtures/sample-skill
|
|
|
79
84
|
## Gotchas
|
|
80
85
|
|
|
81
86
|
- `trigger --num-queries` must be even for balanced positive/negative cases.
|
|
82
|
-
-
|
|
87
|
+
- `check` also requires even `--num-queries`.
|
|
88
|
+
- `check` stops after lint failures unless `--continue-on-lint-fail` is set.
|
|
89
|
+
- OpenAI provider is implemented via dynamic import so Anthropic-only installs do not crash if optional deps are skipped.
|
|
83
90
|
- Frontmatter is validated with both `gray-matter` and `js-yaml`; malformed YAML should fail fast.
|
|
84
91
|
- Keep file references relative to skill root; out-of-root refs are lint failures.
|
|
85
92
|
- If you modify reporter formatting, ensure JSON mode remains machine-safe.
|
|
@@ -94,10 +101,10 @@ ANTHROPIC_API_KEY=your-key node dist/index.js trigger test-fixtures/sample-skill
|
|
|
94
101
|
- Compatibility hints: `src/core/linter/compat.ts`
|
|
95
102
|
- Trigger fake skill pool + scoring: `src/core/trigger-tester.ts`
|
|
96
103
|
- Eval grading schema: `src/core/grader.ts`
|
|
104
|
+
- Combined quality gate orchestration: `src/core/check-runner.ts`
|
|
97
105
|
|
|
98
106
|
## Future Work (Not Implemented Yet)
|
|
99
107
|
|
|
100
|
-
- Real OpenAI provider implementation
|
|
101
108
|
- Config file support (`.skilltestrc`)
|
|
102
109
|
- Parallel execution
|
|
103
110
|
- HTML reporting
|
package/README.md
CHANGED
|
@@ -23,7 +23,7 @@ Agent Skills are quick to write but hard to validate before deployment:
|
|
|
23
23
|
- You cannot easily measure trigger precision/recall.
|
|
24
24
|
- You do not know whether outputs are good until users exercise the skill.
|
|
25
25
|
|
|
26
|
-
`skilltest` closes this gap with one CLI and
|
|
26
|
+
`skilltest` closes this gap with one CLI and four modes.
|
|
27
27
|
|
|
28
28
|
## Install
|
|
29
29
|
|
|
@@ -61,12 +61,18 @@ End-to-end eval:
|
|
|
61
61
|
skilltest eval ./path/to/skill --provider anthropic --model claude-sonnet-4-5-20250929
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
+
Run full quality gate:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
skilltest check ./path/to/skill --provider anthropic --min-f1 0.8 --min-assert-pass-rate 0.9
|
|
68
|
+
```
|
|
69
|
+
|
|
64
70
|
Example lint summary:
|
|
65
71
|
|
|
66
72
|
```text
|
|
67
73
|
skilltest lint
|
|
68
74
|
target: ./test-fixtures/sample-skill
|
|
69
|
-
summary:
|
|
75
|
+
summary: 29/29 checks passed, 0 warnings, 0 failures
|
|
70
76
|
```
|
|
71
77
|
|
|
72
78
|
## Commands
|
|
@@ -153,6 +159,32 @@ Flags:
|
|
|
153
159
|
- `--api-key <key>` explicit key override
|
|
154
160
|
- `--verbose` show full model responses
|
|
155
161
|
|
|
162
|
+
### `skilltest check <path-to-skill>`
|
|
163
|
+
|
|
164
|
+
Runs `lint + trigger + eval` in one command and applies quality thresholds.
|
|
165
|
+
|
|
166
|
+
Default behavior:
|
|
167
|
+
|
|
168
|
+
1. Run lint.
|
|
169
|
+
2. Stop before model calls if lint has failures.
|
|
170
|
+
3. Run trigger and eval only when lint passes.
|
|
171
|
+
4. Fail the quality gate when trigger F1 or the eval assertion pass rate falls below its minimum threshold.
|
|
172
|
+
|
|
173
|
+
Flags:
|
|
174
|
+
|
|
175
|
+
- `--provider <anthropic|openai>` default: `anthropic`
|
|
176
|
+
- `--model <model>` default: `claude-sonnet-4-5-20250929` (auto-switches to `gpt-4.1-mini` for `--provider openai` when unchanged)
|
|
177
|
+
- `--grader-model <model>` default: same as resolved `--model`
|
|
178
|
+
- `--api-key <key>` explicit key override
|
|
179
|
+
- `--queries <path>` custom trigger queries JSON
|
|
180
|
+
- `--num-queries <n>` default: `20` (must be even)
|
|
181
|
+
- `--prompts <path>` custom eval prompts JSON
|
|
182
|
+
- `--min-f1 <n>` default: `0.8`
|
|
183
|
+
- `--min-assert-pass-rate <n>` default: `0.9`
|
|
184
|
+
- `--save-results <path>` save combined check result JSON
|
|
185
|
+
- `--continue-on-lint-fail` continue trigger/eval even if lint fails
|
|
186
|
+
- `--verbose` include detailed trigger/eval sections
|
|
187
|
+
|
|
156
188
|
## Global Flags
|
|
157
189
|
|
|
158
190
|
- `--help` show help
|
|
@@ -195,8 +227,8 @@ Eval prompts (`--prompts`):
|
|
|
195
227
|
|
|
196
228
|
Exit codes:
|
|
197
229
|
|
|
198
|
-
- `0`: success
|
|
199
|
-
- `1`: lint
|
|
230
|
+
- `0`: success
|
|
231
|
+
- `1`: quality gate failed (`lint`, `check` thresholds, or command-specific failure conditions)
|
|
200
232
|
- `2`: runtime/config/API/parse error
|
|
201
233
|
|
|
202
234
|
JSON mode examples:
|
|
@@ -205,6 +237,7 @@ JSON mode examples:
|
|
|
205
237
|
skilltest lint ./skill --json
|
|
206
238
|
skilltest trigger ./skill --json
|
|
207
239
|
skilltest eval ./skill --json
|
|
240
|
+
skilltest check ./skill --json
|
|
208
241
|
```
|
|
209
242
|
|
|
210
243
|
## API Keys
|
|
@@ -230,7 +263,18 @@ skilltest trigger ./skill --api-key your-key
|
|
|
230
263
|
Current provider status:
|
|
231
264
|
|
|
232
265
|
- `anthropic`: implemented
|
|
233
|
-
- `openai`:
|
|
266
|
+
- `openai`: implemented
|
|
267
|
+
|
|
268
|
+
OpenAI quick example:
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
skilltest trigger ./path/to/skill --provider openai --model gpt-4.1-mini
|
|
272
|
+
skilltest eval ./path/to/skill --provider openai --model gpt-4.1-mini
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
Note:
|
|
276
|
+
|
|
277
|
+
- If you pass `--provider openai` and keep the Anthropic default model value, `skilltest` automatically switches to `gpt-4.1-mini`.
|
|
234
278
|
|
|
235
279
|
## CICD Integration
|
|
236
280
|
|
|
@@ -283,6 +327,7 @@ jobs:
|
|
|
283
327
|
- run: npm run build
|
|
284
328
|
- run: npx skilltest trigger path/to/skill --num-queries 20 --json
|
|
285
329
|
- run: npx skilltest eval path/to/skill --prompts path/to/prompts.json --json
|
|
330
|
+
- run: npx skilltest check path/to/skill --min-f1 0.8 --min-assert-pass-rate 0.9 --json
|
|
286
331
|
```
|
|
287
332
|
|
|
288
333
|
## Local Development
|
|
@@ -300,6 +345,7 @@ Smoke tests:
|
|
|
300
345
|
node dist/index.js lint test-fixtures/sample-skill/
|
|
301
346
|
node dist/index.js trigger test-fixtures/sample-skill/ --num-queries 2
|
|
302
347
|
node dist/index.js eval test-fixtures/sample-skill/ --prompts test-fixtures/eval-prompts.json
|
|
348
|
+
node dist/index.js check test-fixtures/sample-skill/ --num-queries 2 --prompts test-fixtures/eval-prompts.json
|
|
303
349
|
```
|
|
304
350
|
|
|
305
351
|
## Release Checklist
|
package/dist/index.js
CHANGED
|
@@ -1076,6 +1076,88 @@ function renderEvalReport(result, enableColor, verbose) {
|
|
|
1076
1076
|
}
|
|
1077
1077
|
return lines.join("\n");
|
|
1078
1078
|
}
|
|
1079
|
+
/**
 * Turns a tri-state gate value into a colored status word.
 *
 * @param {boolean|null} value - `null` means the stage did not run.
 * @param {object} c - Color helper exposing `yellow`, `green` and `red`.
 * @returns {string} "SKIP" for `null`, "PASS" for truthy values, "FAIL" otherwise.
 */
function gateStatusLabel(value, c) {
  if (value !== null) {
    if (value) {
      return c.green("PASS");
    }
    return c.red("FAIL");
  }
  return c.yellow("SKIP");
}
|
|
1085
|
+
/**
 * Renders the terminal report for a `check` run.
 *
 * Sections are emitted in a fixed order: header, Lint, Trigger, Eval,
 * Quality Gate. Without `verbose`, only non-passing lint issues, unmatched
 * trigger cases and failed eval assertions are listed; with `verbose`, every
 * item is listed and each prompt's full response is appended.
 *
 * @param {object} result - Combined check result (target, provider, model,
 *   graderModel, thresholds, lint, trigger, eval, skip reasons, gates).
 * @param {boolean} enableColor - Forwarded to getChalkInstance.
 * @param {boolean} verbose - Whether to include passing items and full responses.
 * @returns {string} The report lines joined with "\n".
 */
function renderCheckReport(result, enableColor, verbose) {
  const c = getChalkInstance(enableColor);
  const passOrFail = (ok) => (ok ? c.green("PASS") : c.red("FAIL"));

  const { gates } = result;
  const lintGate = gateStatusLabel(gates.lintPassed, c);
  const triggerGate = gateStatusLabel(gates.triggerPassed, c);
  const evalGate = gateStatusLabel(gates.evalPassed, c);
  const overallGate = passOrFail(gates.overallPassed);

  // Header + lint summary.
  const lintSummary = result.lint.summary;
  const lines = [
    "skilltest check",
    `target: ${result.target}`,
    `provider/model: ${result.provider}/${result.model}`,
    `grader model: ${result.graderModel}`,
    `thresholds: min-f1=${result.thresholds.minF1.toFixed(2)} min-assert-pass-rate=${result.thresholds.minAssertPassRate.toFixed(2)}`,
    "",
    "Lint",
    `- ${lintGate} ${lintSummary.passed}/${lintSummary.total} checks passed (${lintSummary.warnings} warnings, ${lintSummary.failures} failures)`
  ];
  const shownIssues = verbose ? result.lint.issues : result.lint.issues.filter((issue) => issue.status !== "pass");
  shownIssues.forEach((issue) => {
    lines.push(renderIssueLine(issue, c));
  });

  // Trigger section: metrics when the stage ran, otherwise the skip reason.
  lines.push("", "Trigger");
  if (result.trigger) {
    const { metrics, cases } = result.trigger;
    lines.push(
      `- ${triggerGate} f1=${formatPercent(metrics.f1)} (precision=${formatPercent(metrics.precision)} recall=${formatPercent(metrics.recall)})`,
      ` TP ${metrics.truePositives} TN ${metrics.trueNegatives} FP ${metrics.falsePositives} FN ${metrics.falseNegatives}`
    );
    const shownCases = verbose ? cases : cases.filter((testCase) => !testCase.matched);
    shownCases.forEach((testCase) => {
      lines.push(
        ` - ${passOrFail(testCase.matched)} ${testCase.query}`,
        ` expected=${testCase.expected} actual=${testCase.actual}`
      );
    });
  } else {
    lines.push(`- ${triggerGate} ${result.triggerSkippedReason ?? "Skipped."}`);
  }

  // Eval section: pass rate plus per-prompt details.
  lines.push("", "Eval");
  if (result.eval) {
    const passRate = gates.evalAssertPassRate ?? 0;
    const evalSummary = result.eval.summary;
    lines.push(
      `- ${evalGate} assertion pass rate=${formatPercent(passRate)} (${evalSummary.passedAssertions}/${evalSummary.totalAssertions})`
    );
    result.eval.results.forEach((promptResult) => {
      const failedAssertions = promptResult.assertions.filter((assertion) => !assertion.passed);
      const hideEntry = !verbose && failedAssertions.length === 0;
      if (hideEntry) {
        return;
      }
      lines.push(
        ` - prompt: ${promptResult.prompt}`,
        ` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`
      );
      const shownAssertions = verbose ? promptResult.assertions : failedAssertions;
      shownAssertions.forEach((assertion) => {
        lines.push(
          ` ${passOrFail(assertion.passed)} ${assertion.assertion}`,
          ` evidence: ${assertion.evidence}`
        );
      });
      if (verbose) {
        lines.push(` full response: ${promptResult.response}`);
      }
    });
  } else {
    lines.push(`- ${evalGate} ${result.evalSkippedReason ?? "Skipped."}`);
  }

  // Final gate summary.
  lines.push(
    "",
    "Quality Gate",
    `- lint gate: ${lintGate}`,
    `- trigger gate: ${triggerGate}`,
    `- eval gate: ${evalGate}`,
    `- overall: ${overallGate}`
  );
  return lines.join("\n");
}
|
|
1079
1161
|
|
|
1080
1162
|
// src/reporters/json.ts
|
|
1081
1163
|
function renderJson(value) {
|
|
@@ -1414,15 +1496,104 @@ var AnthropicProvider = class {
|
|
|
1414
1496
|
};
|
|
1415
1497
|
|
|
1416
1498
|
// src/providers/openai.ts
|
|
1499
|
+
/**
 * Returns a promise that settles (with no value) once a `setTimeout` of `ms`
 * milliseconds fires. Used as the back-off pause between OpenAI retries.
 */
function wait2(ms) {
  return new Promise((done) => {
    setTimeout(() => done(), ms);
  });
}
|
|
1504
|
+
/**
 * Decides whether a failed OpenAI call is worth retrying.
 *
 * An error is treated as transient when any of the following holds:
 * - `status` is 429 or a number >= 500;
 * - `code` is a string naming a timeout / reset / DNS failure;
 * - `message` mentions rate limiting, a timeout, temporary unavailability
 *   or a connection problem.
 *
 * @param {unknown} error - Whatever was thrown by the API call.
 * @returns {boolean} `true` when the call should be retried.
 */
function isRetriableError(error) {
  if (!error || typeof error !== "object") {
    return false;
  }
  const maybeStatus = error.status;
  if (maybeStatus === 429 || (typeof maybeStatus === "number" && maybeStatus >= 500)) {
    return true;
  }
  const maybeCode = error.code;
  // `timedout` is required in addition to `timeout`: Node reports socket
  // timeouts as ETIMEDOUT / ESOCKETTIMEDOUT, which do not contain "timeout".
  if (typeof maybeCode === "string" && /timeout|timedout|econnreset|enotfound|eai_again/i.test(maybeCode)) {
    return true;
  }
  const maybeMessage = error.message;
  // "timed out" covers messages such as "Request timed out." that the plain
  // "timeout" alternative misses.
  if (
    typeof maybeMessage === "string" &&
    /(rate limit|timeout|timed out|temporarily unavailable|connection)/i.test(maybeMessage)
  ) {
    return true;
  }
  return false;
}
|
|
1522
|
+
/**
 * Flattens a chat message `content` value into trimmed plain text.
 *
 * - falsy content yields "";
 * - a string is returned trimmed;
 * - an array contributes one line per part: the part's `text` when its
 *   `type` is "text" or absent, otherwise an empty line;
 * - any other shape (non-array object, number, ...) yields "" instead of
 *   throwing, and null / non-object array entries are treated as empty parts.
 *
 * @param {unknown} content - `message.content` from a chat completion choice.
 * @returns {string} The extracted text, possibly empty.
 */
function extractTextContent(content) {
  if (!content) {
    return "";
  }
  if (typeof content === "string") {
    return content.trim();
  }
  if (!Array.isArray(content)) {
    // Unknown shape: report "no text" so the caller's empty-response check
    // fires, rather than crashing on `.map`.
    return "";
  }
  return content
    .map((item) => {
      if (!item || typeof item !== "object") {
        return "";
      }
      const isTextPart = item.type === "text" || !item.type;
      return isTextPart ? item.text ?? "" : "";
    })
    .join("\n")
    .trim();
}
|
|
1417
1532
|
/**
 * LLM provider backed by the OpenAI chat-completions API.
 *
 * The SDK is loaded with a dynamic import on first use, so constructing the
 * provider never requires the `openai` package to be installed.
 */
var OpenAIProvider = class {
  name = "openai";
  apiKey;
  client;
  /**
   * @param {string} apiKey - Key handed to the SDK client when it is created.
   */
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.client = null;
  }
  /**
   * Lazily imports the OpenAI SDK and builds (then caches) a client.
   *
   * @returns {Promise<object>} The cached or newly created SDK client.
   * @throws {Error} When the SDK cannot be imported (the import failure is
   *   attached as `cause`) or exposes no default export.
   */
  async ensureClient() {
    if (this.client) {
      return this.client;
    }
    let openAiModule;
    try {
      // The specifier is held in a variable so the import stays dynamic.
      const moduleName = "openai";
      openAiModule = await import(moduleName);
    } catch (importError) {
      // Keep the underlying failure reachable: it may be something other
      // than "module not found".
      throw new Error(
        "OpenAI SDK is not installed. Install optional dependency 'openai' or run 'npm install' with optional dependencies enabled.",
        { cause: importError }
      );
    }
    const OpenAIConstructor = openAiModule.default;
    if (!OpenAIConstructor) {
      throw new Error("OpenAI SDK loaded but no default client export was found.");
    }
    this.client = new OpenAIConstructor({ apiKey: this.apiKey });
    return this.client;
  }
  /**
   * Sends one system + user message pair and returns the model's text.
   *
   * Makes up to three attempts; an attempt is retried only when
   * isRetriableError() accepts the failure, with exponential back-off
   * (500ms, 1000ms, capped at 4000ms) plus up to 250ms of jitter.
   *
   * @param {string} systemPrompt - Content of the system message.
   * @param {string} userMessage - Content of the user message.
   * @param {{ model: string }} options - `model` is passed to the API as-is.
   * @returns {Promise<string>} Non-empty, trimmed response text.
   * @throws {Error} "OpenAI API call failed: ..." with the last failure
   *   attached as `cause`.
   */
  async sendMessage(systemPrompt, userMessage, options) {
    const client = await this.ensureClient();
    const maxAttempts = 3;
    let lastError;
    for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
      try {
        const response = await client.chat.completions.create({
          model: options.model,
          max_tokens: 2048,
          messages: [
            {
              role: "system",
              content: systemPrompt
            },
            {
              role: "user",
              content: userMessage
            }
          ]
        });
        const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
        if (text.length === 0) {
          throw new Error("Model returned an empty response.");
        }
        return text;
      } catch (error) {
        lastError = error;
        const isLastAttempt = attempt === maxAttempts - 1;
        if (!isRetriableError(error) || isLastAttempt) {
          break;
        }
        const delay = Math.min(4e3, 500 * 2 ** attempt) + Math.floor(Math.random() * 250);
        await wait2(delay);
      }
    }
    if (lastError instanceof Error) {
      throw new Error(`OpenAI API call failed: ${lastError.message}`, { cause: lastError });
    }
    throw new Error("OpenAI API call failed with an unknown error.", { cause: lastError });
  }
};
|
|
1428
1599
|
|
|
@@ -1445,8 +1616,16 @@ var triggerOptionsSchema = z3.object({
|
|
|
1445
1616
|
verbose: z3.boolean().optional(),
|
|
1446
1617
|
apiKey: z3.string().optional()
|
|
1447
1618
|
});
|
|
1619
|
+
// Default `--model` value for the trigger command, and the model substituted
// for it when the provider is OpenAI.
var DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL = "gpt-4.1-mini";
/**
 * Picks the model to run against.
 *
 * When the provider is "openai" and the model is still the Anthropic default,
 * the OpenAI default is returned; in every other case `model` is returned
 * unchanged.
 */
function resolveModel(provider, model) {
  const stillAnthropicDefault = model === DEFAULT_ANTHROPIC_MODEL;
  return provider === "openai" && stillAnthropicDefault ? DEFAULT_OPENAI_MODEL : model;
}
|
|
1448
1627
|
function registerTriggerCommand(program) {
|
|
1449
|
-
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use",
|
|
1628
|
+
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use", DEFAULT_ANTHROPIC_MODEL).option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10), 20).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, commandOptions, command) => {
|
|
1450
1629
|
const globalOptions = getGlobalCliOptions(command);
|
|
1451
1630
|
const parsedOptions = triggerOptionsSchema.safeParse(commandOptions);
|
|
1452
1631
|
if (!parsedOptions.success) {
|
|
@@ -1483,8 +1662,9 @@ function registerTriggerCommand(program) {
|
|
|
1483
1662
|
if (spinner) {
|
|
1484
1663
|
spinner.text = "Running trigger simulations...";
|
|
1485
1664
|
}
|
|
1665
|
+
const model = resolveModel(options.provider, options.model);
|
|
1486
1666
|
const result = await runTriggerTest(skill, {
|
|
1487
|
-
model
|
|
1667
|
+
model,
|
|
1488
1668
|
provider,
|
|
1489
1669
|
queries,
|
|
1490
1670
|
numQueries: options.numQueries,
|
|
@@ -1669,8 +1849,16 @@ var evalOptionsSchema = z6.object({
|
|
|
1669
1849
|
verbose: z6.boolean().optional(),
|
|
1670
1850
|
apiKey: z6.string().optional()
|
|
1671
1851
|
});
|
|
1852
|
+
// Default `--model` value for the eval command, and the model substituted
// for it when the provider is OpenAI.
var DEFAULT_ANTHROPIC_MODEL2 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL2 = "gpt-4.1-mini";
/**
 * Picks the model to run against.
 *
 * Returns the OpenAI default only when the provider is "openai" and the model
 * is still the Anthropic default; otherwise returns `model` untouched.
 */
function resolveModel2(provider, model) {
  if (provider !== "openai") {
    return model;
  }
  return model === DEFAULT_ANTHROPIC_MODEL2 ? DEFAULT_OPENAI_MODEL2 : model;
}
|
|
1672
1860
|
function registerEvalCommand(program) {
|
|
1673
|
-
program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts",
|
|
1861
|
+
program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts", DEFAULT_ANTHROPIC_MODEL2).option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, commandOptions, command) => {
|
|
1674
1862
|
const globalOptions = getGlobalCliOptions(command);
|
|
1675
1863
|
const parsedOptions = evalOptionsSchema.safeParse(commandOptions);
|
|
1676
1864
|
if (!parsedOptions.success) {
|
|
@@ -1704,10 +1892,12 @@ function registerEvalCommand(program) {
|
|
|
1704
1892
|
if (spinner) {
|
|
1705
1893
|
spinner.text = "Running eval prompts and grading responses...";
|
|
1706
1894
|
}
|
|
1895
|
+
const model = resolveModel2(options.provider, options.model);
|
|
1896
|
+
const graderModel = options.graderModel ?? model;
|
|
1707
1897
|
const result = await runEval(skill, {
|
|
1708
1898
|
provider,
|
|
1709
|
-
model
|
|
1710
|
-
graderModel
|
|
1899
|
+
model,
|
|
1900
|
+
graderModel,
|
|
1711
1901
|
prompts
|
|
1712
1902
|
});
|
|
1713
1903
|
if (options.saveResults) {
|
|
@@ -1727,6 +1917,213 @@ function registerEvalCommand(program) {
|
|
|
1727
1917
|
});
|
|
1728
1918
|
}
|
|
1729
1919
|
|
|
1920
|
+
// src/commands/check.ts
|
|
1921
|
+
import ora3 from "ora";
|
|
1922
|
+
import { z as z7 } from "zod";
|
|
1923
|
+
|
|
1924
|
+
// src/core/check-runner.ts
|
|
1925
|
+
/**
 * Fraction of eval assertions that passed.
 *
 * @param {{ summary: { passedAssertions: number, totalAssertions: number } }} result
 * @returns {number} passedAssertions / totalAssertions, or 0 when there are
 *   no assertions at all (avoids dividing by zero).
 */
function calculateEvalAssertPassRate(result) {
  const { passedAssertions, totalAssertions } = result.summary;
  return totalAssertions === 0 ? 0 : passedAssertions / totalAssertions;
}
|
|
1931
|
+
/**
 * Runs lint, then (when allowed) trigger and eval, and evaluates the gates.
 *
 * Flow:
 * 1. Lint always runs; it passes when it reports zero failures.
 * 2. If lint failed and `continueOnLintFail` is not set, trigger and eval are
 *    skipped with an explanatory reason.
 * 3. Otherwise the skill is parsed strictly; a parse error also skips both
 *    model stages and records the error message as the reason.
 * 4. Trigger passes when its F1 >= `minF1`; eval passes when its assertion
 *    pass rate >= `minAssertPassRate`. A skipped stage has a `null` gate.
 * 5. The overall gate passes only when lint passed and both model gates are
 *    exactly `true`.
 *
 * `options.onStage`, when provided, is called with "lint", "parse",
 * "trigger" and "eval" as each stage begins.
 *
 * @param {string} inputPath - Path to the skill, reported back as `target`.
 * @param {object} options - provider, model, graderModel, queries, numQueries,
 *   prompts, minF1, minAssertPassRate, continueOnLintFail, verbose, onStage.
 * @returns {Promise<object>} Combined result including a `gates` summary.
 */
async function runCheck(inputPath, options) {
  const announce = (stage) => options.onStage?.(stage);

  announce("lint");
  const lint = await runLinter(inputPath);
  const lintPassed = lint.summary.failures === 0;

  let trigger = null;
  let evalResult = null;
  let triggerSkippedReason;
  let evalSkippedReason;

  const modelStagesAllowed = lintPassed || options.continueOnLintFail;
  if (modelStagesAllowed) {
    announce("parse");
    let parsedSkill = null;
    try {
      parsedSkill = await parseSkillStrict(inputPath);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      const reason = `Skipped: skill could not be parsed strictly (${message}).`;
      triggerSkippedReason = reason;
      evalSkippedReason = reason;
    }
    if (parsedSkill) {
      const { provider, model } = options;
      announce("trigger");
      trigger = await runTriggerTest(parsedSkill, {
        provider,
        model,
        queries: options.queries,
        numQueries: options.numQueries,
        verbose: options.verbose
      });
      announce("eval");
      evalResult = await runEval(parsedSkill, {
        provider,
        model,
        graderModel: options.graderModel,
        prompts: options.prompts
      });
    }
  } else {
    const reason = "Skipped because lint has failures (use --continue-on-lint-fail to override).";
    triggerSkippedReason = reason;
    evalSkippedReason = reason;
  }

  // A stage that did not run keeps a null metric and a null gate.
  const triggerF1 = trigger ? trigger.metrics.f1 : null;
  const evalAssertPassRate = evalResult ? calculateEvalAssertPassRate(evalResult) : null;
  const triggerPassed = triggerF1 === null ? null : triggerF1 >= options.minF1;
  const evalPassed = evalAssertPassRate === null ? null : evalAssertPassRate >= options.minAssertPassRate;
  const overallPassed = lintPassed && triggerPassed === true && evalPassed === true;

  const thresholds = {
    minF1: options.minF1,
    minAssertPassRate: options.minAssertPassRate
  };
  const gates = {
    lintPassed,
    triggerPassed,
    evalPassed,
    triggerF1,
    evalAssertPassRate,
    overallPassed
  };
  return {
    target: inputPath,
    provider: options.provider.name,
    model: options.model,
    graderModel: options.graderModel,
    thresholds,
    continueOnLintFail: options.continueOnLintFail,
    lint,
    trigger,
    eval: evalResult,
    triggerSkippedReason,
    evalSkippedReason,
    gates
  };
}
|
|
2000
|
+
|
|
2001
|
+
// src/commands/check.ts
|
|
2002
|
+
// Validation schema for the options object the `check` command receives.
// Key order is kept as-is because it determines which issue is reported first.
var checkOptionsSchema = (() => {
  const optionalText = () => z7.string().optional();
  const optionalFlag = () => z7.boolean().optional();
  // Thresholds are ratios in the closed range [0, 1].
  const ratio = () => z7.number().min(0).max(1);
  return z7.object({
    provider: z7.enum(["anthropic", "openai"]),
    model: z7.string(),
    graderModel: optionalText(),
    apiKey: optionalText(),
    queries: optionalText(),
    numQueries: z7.number().int().min(2),
    prompts: optionalText(),
    minF1: ratio(),
    minAssertPassRate: ratio(),
    saveResults: optionalText(),
    continueOnLintFail: optionalFlag(),
    verbose: optionalFlag()
  });
})();
|
|
2016
|
+
// Default `--model` value for the check command, and the model substituted
// for it when the provider is OpenAI.
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
/**
 * Picks the model to run against.
 *
 * Swaps in the OpenAI default when the provider is "openai" and the model is
 * still the Anthropic default; any other combination returns `model` as given.
 */
function resolveModel3(provider, model) {
  const needsOpenAiDefault = provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL3;
  if (!needsOpenAiDefault) {
    return model;
  }
  return DEFAULT_OPENAI_MODEL3;
}
|
|
2024
|
+
/**
 * Registers the `check` subcommand on the CLI program.
 *
 * The action validates options, optionally loads custom trigger queries and
 * eval prompts from JSON, runs runCheck(), optionally saves the result, and
 * prints either JSON or the rendered report.
 *
 * Exit codes set on `process.exitCode`:
 * - 0 when the overall gate passed;
 * - 1 when the overall gate failed;
 * - 2 for invalid options or any thrown error.
 *
 * A spinner is shown only when JSON mode is off and stdout is a TTY.
 */
function registerCheckCommand(program) {
  // Spinner caption for each stage reported by runCheck.
  const stageCaptions = new Map([
    ["lint", "Running lint checks..."],
    ["parse", "Parsing skill for model evaluations..."],
    ["trigger", "Running trigger test suite..."],
    ["eval", "Running end-to-end eval suite..."]
  ]);
  const toInt = (value) => Number.parseInt(value, 10);
  const toFloat = (value) => Number.parseFloat(value);

  const checkCommand = program.command("check");
  checkCommand.description("Run lint + trigger + eval with threshold-based quality gates.");
  checkCommand.argument("<path-to-skill>", "Path to SKILL.md or skill directory");
  checkCommand.option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic");
  checkCommand.option("--model <model>", "Model for trigger/eval runs", DEFAULT_ANTHROPIC_MODEL3);
  checkCommand.option("--grader-model <model>", "Model used for grading (defaults to --model)");
  checkCommand.option("--api-key <key>", "API key override");
  checkCommand.option("--queries <path>", "Path to custom trigger queries JSON");
  checkCommand.option("--num-queries <n>", "Number of auto-generated trigger queries", toInt, 20);
  checkCommand.option("--prompts <path>", "Path to eval prompts JSON");
  checkCommand.option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", toFloat, 0.8);
  checkCommand.option(
    "--min-assert-pass-rate <n>",
    "Minimum required eval assertion pass rate (0-1)",
    toFloat,
    0.9
  );
  checkCommand.option("--save-results <path>", "Save combined check results to JSON");
  checkCommand.option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures");
  checkCommand.option("--verbose", "Show detailed trigger/eval output sections");

  checkCommand.action(async (targetPath, commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const rejectOptions = (error) => {
      writeError(error, globalOptions.json);
      process.exitCode = 2;
    };

    const parsedOptions = checkOptionsSchema.safeParse(commandOptions);
    if (!parsedOptions.success) {
      rejectOptions(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid check options."));
      return;
    }
    const options = parsedOptions.data;
    const isOdd = options.numQueries % 2 !== 0;
    if (isOdd) {
      rejectOptions(
        new Error("--num-queries must be an even number so the suite can split should/should-not trigger cases.")
      );
      return;
    }

    const spinnerDisabled = globalOptions.json || !process.stdout.isTTY;
    const spinner = spinnerDisabled ? null : ora3("Preparing check run...").start();
    const caption = (text) => {
      if (spinner) {
        spinner.text = text;
      }
    };
    // Reads a JSON file and validates it, naming the offending flag on failure.
    const loadValidatedJson = async (filePath, schema, flagName, loadingCaption) => {
      caption(loadingCaption);
      const loaded = await readJsonFile(filePath);
      const parsed = schema.safeParse(loaded);
      if (!parsed.success) {
        throw new Error(
          `Invalid ${flagName} JSON: ${parsed.error.issues[0]?.message ?? "unknown format issue"}`
        );
      }
      return parsed.data;
    };

    try {
      caption("Initializing model provider...");
      const provider = createProvider(options.provider, options.apiKey);

      let queries = void 0;
      if (options.queries) {
        queries = await loadValidatedJson(
          options.queries,
          triggerQueryArraySchema,
          "--queries",
          "Loading custom trigger queries..."
        );
      }
      let prompts = void 0;
      if (options.prompts) {
        prompts = await loadValidatedJson(
          options.prompts,
          evalPromptArraySchema,
          "--prompts",
          "Loading eval prompts..."
        );
      }

      const model = resolveModel3(options.provider, options.model);
      const graderModel = options.graderModel ?? model;
      const verbose = Boolean(options.verbose);
      const result = await runCheck(targetPath, {
        provider,
        model,
        graderModel,
        queries,
        numQueries: options.numQueries,
        prompts,
        minF1: options.minF1,
        minAssertPassRate: options.minAssertPassRate,
        continueOnLintFail: Boolean(options.continueOnLintFail),
        verbose,
        onStage: (stage) => {
          const text = stageCaptions.get(stage);
          if (spinner && text !== undefined) {
            spinner.text = text;
          }
        }
      });

      if (options.saveResults) {
        await writeJsonFile(options.saveResults, result);
      }
      // Stop the spinner before printing so it cannot interleave with output.
      spinner?.stop();
      if (globalOptions.json) {
        writeResult(result, true);
      } else {
        writeResult(renderCheckReport(result, globalOptions.color, verbose), false);
      }
      process.exitCode = result.gates.overallPassed ? 0 : 1;
    } catch (error) {
      spinner?.stop();
      writeError(error, globalOptions.json);
      process.exitCode = 2;
    }
  });
}
|
|
2126
|
+
|
|
1730
2127
|
// src/index.ts
|
|
1731
2128
|
function resolveVersion() {
|
|
1732
2129
|
try {
|
|
@@ -1745,6 +2142,7 @@ async function run(argv) {
|
|
|
1745
2142
|
registerLintCommand(program);
|
|
1746
2143
|
registerTriggerCommand(program);
|
|
1747
2144
|
registerEvalCommand(program);
|
|
2145
|
+
registerCheckCommand(program);
|
|
1748
2146
|
await program.parseAsync(argv);
|
|
1749
2147
|
}
|
|
1750
2148
|
run(process.argv).catch((error) => {
|