skilltest 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CLAUDE.md CHANGED
@@ -7,6 +7,7 @@
7
7
  - `lint`: static/offline quality checks
8
8
  - `trigger`: model-based triggerability testing
9
9
  - `eval`: end-to-end execution + grader-based scoring
10
+ - `check`: lint + trigger + eval quality gates in one run
10
11
 
11
12
  The CLI is published as `skilltest` and built for `npx skilltest` usage.
12
13
 
@@ -18,6 +19,7 @@ The CLI is published as `skilltest` and built for `npx skilltest` usage.
18
19
  - `src/core/linter/`: lint check modules and orchestrator
19
20
  - `src/core/trigger-tester.ts`: query generation + trigger simulation + metrics
20
21
  - `src/core/eval-runner.ts`: prompt generation/loading + skill execution + grading loop
22
+ - `src/core/check-runner.ts`: orchestrates lint + trigger + eval with threshold gates
21
23
  - `src/core/grader.ts`: structured grader prompt + JSON parse
22
24
  - `src/providers/`: LLM provider abstraction (`sendMessage`) and provider implementations
23
25
  - `src/reporters/`: terminal rendering and JSON output helper
@@ -68,6 +70,9 @@ ANTHROPIC_API_KEY=your-key node dist/index.js trigger test-fixtures/sample-skill
68
70
  - `sendMessage(systemPrompt, userMessage, { model }) => Promise<string>`
69
71
  - Lint is fully offline and first-class.
70
72
  - Trigger/eval rely on the same provider abstraction.
73
+ - `check` wraps lint + trigger + eval and enforces minimum thresholds:
74
+ - trigger F1
75
+ - eval assertion pass rate
71
76
  - JSON mode is strict:
72
77
  - no spinners
73
78
  - no colored output
@@ -79,7 +84,9 @@ ANTHROPIC_API_KEY=your-key node dist/index.js trigger test-fixtures/sample-skill
79
84
  ## Gotchas
80
85
 
81
86
  - `trigger --num-queries` must be even for balanced positive/negative cases.
82
- - OpenAI provider is intentionally a stub in v1 and throws `"OpenAI provider coming soon."`.
87
+ - `check` also requires even `--num-queries`.
88
+ - `check` stops after lint failures unless `--continue-on-lint-fail` is set.
89
+ - OpenAI provider is implemented via dynamic import so Anthropic-only installs do not crash if optional deps are skipped.
83
90
  - Frontmatter is validated with both `gray-matter` and `js-yaml`; malformed YAML should fail fast.
84
91
  - Keep file references relative to skill root; out-of-root refs are lint failures.
85
92
  - If you modify reporter formatting, ensure JSON mode remains machine-safe.
@@ -94,10 +101,10 @@ ANTHROPIC_API_KEY=your-key node dist/index.js trigger test-fixtures/sample-skill
94
101
  - Compatibility hints: `src/core/linter/compat.ts`
95
102
  - Trigger fake skill pool + scoring: `src/core/trigger-tester.ts`
96
103
  - Eval grading schema: `src/core/grader.ts`
104
+ - Combined quality gate orchestration: `src/core/check-runner.ts`
97
105
 
98
106
  ## Future Work (Not Implemented Yet)
99
107
 
100
- - Real OpenAI provider implementation
101
108
  - Config file support (`.skilltestrc`)
102
109
  - Parallel execution
103
110
  - HTML reporting
package/README.md CHANGED
@@ -23,7 +23,7 @@ Agent Skills are quick to write but hard to validate before deployment:
23
23
  - You cannot easily measure trigger precision/recall.
24
24
  - You do not know whether outputs are good until users exercise the skill.
25
25
 
26
- `skilltest` closes this gap with one CLI and three modes.
26
+ `skilltest` closes this gap with one CLI and four modes.
27
27
 
28
28
  ## Install
29
29
 
@@ -61,12 +61,18 @@ End-to-end eval:
61
61
  skilltest eval ./path/to/skill --provider anthropic --model claude-sonnet-4-5-20250929
62
62
  ```
63
63
 
64
+ Run full quality gate:
65
+
66
+ ```bash
67
+ skilltest check ./path/to/skill --provider anthropic --min-f1 0.8 --min-assert-pass-rate 0.9
68
+ ```
69
+
64
70
  Example lint summary:
65
71
 
66
72
  ```text
67
73
  skilltest lint
68
74
  target: ./test-fixtures/sample-skill
69
- summary: 25/25 checks passed, 0 warnings, 0 failures
75
+ summary: 29/29 checks passed, 0 warnings, 0 failures
70
76
  ```
71
77
 
72
78
  ## Commands
@@ -153,6 +159,32 @@ Flags:
153
159
  - `--api-key <key>` explicit key override
154
160
  - `--verbose` show full model responses
155
161
 
162
+ ### `skilltest check <path-to-skill>`
163
+
164
+ Runs `lint + trigger + eval` in one command and applies quality thresholds.
165
+
166
+ Default behavior:
167
+
168
+ 1. Run lint.
169
+ 2. Stop before model calls if lint has failures.
170
+ 3. Run trigger and eval only when lint passes.
171
+ 4. Fail quality gate when either threshold is below target.
172
+
173
+ Flags:
174
+
175
+ - `--provider <anthropic|openai>` default: `anthropic`
176
+ - `--model <model>` default: `claude-sonnet-4-5-20250929` (auto-switches to `gpt-4.1-mini` for `--provider openai` when unchanged)
177
+ - `--grader-model <model>` default: same as resolved `--model`
178
+ - `--api-key <key>` explicit key override
179
+ - `--queries <path>` custom trigger queries JSON
180
+ - `--num-queries <n>` default: `20` (must be even)
181
+ - `--prompts <path>` custom eval prompts JSON
182
+ - `--min-f1 <n>` default: `0.8`
183
+ - `--min-assert-pass-rate <n>` default: `0.9`
184
+ - `--save-results <path>` save combined check result JSON
185
+ - `--continue-on-lint-fail` continue trigger/eval even if lint fails
186
+ - `--verbose` include detailed trigger/eval sections
187
+
156
188
  ## Global Flags
157
189
 
158
190
  - `--help` show help
@@ -195,8 +227,8 @@ Eval prompts (`--prompts`):
195
227
 
196
228
  Exit codes:
197
229
 
198
- - `0`: success with no lint failures
199
- - `1`: lint failures present
230
+ - `0`: success
231
+ - `1`: quality gate failed (`lint`, `check` thresholds, or command-specific failure conditions)
200
232
  - `2`: runtime/config/API/parse error
201
233
 
202
234
  JSON mode examples:
@@ -205,6 +237,7 @@ JSON mode examples:
205
237
  skilltest lint ./skill --json
206
238
  skilltest trigger ./skill --json
207
239
  skilltest eval ./skill --json
240
+ skilltest check ./skill --json
208
241
  ```
209
242
 
210
243
  ## API Keys
@@ -230,7 +263,18 @@ skilltest trigger ./skill --api-key your-key
230
263
  Current provider status:
231
264
 
232
265
  - `anthropic`: implemented
233
- - `openai`: interface wired, command currently returns "OpenAI provider coming soon."
266
+ - `openai`: implemented
267
+
268
+ OpenAI quick example:
269
+
270
+ ```bash
271
+ skilltest trigger ./path/to/skill --provider openai --model gpt-4.1-mini
272
+ skilltest eval ./path/to/skill --provider openai --model gpt-4.1-mini
273
+ ```
274
+
275
+ Note:
276
+
277
+ - If you pass `--provider openai` and keep the Anthropic default model value, `skilltest` automatically switches to `gpt-4.1-mini`.
234
278
 
235
279
  ## CICD Integration
236
280
 
@@ -283,6 +327,7 @@ jobs:
283
327
  - run: npm run build
284
328
  - run: npx skilltest trigger path/to/skill --num-queries 20 --json
285
329
  - run: npx skilltest eval path/to/skill --prompts path/to/prompts.json --json
330
+ - run: npx skilltest check path/to/skill --min-f1 0.8 --min-assert-pass-rate 0.9 --json
286
331
  ```
287
332
 
288
333
  ## Local Development
@@ -300,6 +345,7 @@ Smoke tests:
300
345
  node dist/index.js lint test-fixtures/sample-skill/
301
346
  node dist/index.js trigger test-fixtures/sample-skill/ --num-queries 2
302
347
  node dist/index.js eval test-fixtures/sample-skill/ --prompts test-fixtures/eval-prompts.json
348
+ node dist/index.js check test-fixtures/sample-skill/ --num-queries 2 --prompts test-fixtures/eval-prompts.json
303
349
  ```
304
350
 
305
351
  ## Release Checklist
package/dist/index.js CHANGED
@@ -1076,6 +1076,88 @@ function renderEvalReport(result, enableColor, verbose) {
1076
1076
  }
1077
1077
  return lines.join("\n");
1078
1078
  }
/**
 * Turn a tri-state gate value into a colored status label.
 *
 * @param {boolean|null} value - `null` yields SKIP; otherwise truthy yields PASS and falsy yields FAIL.
 * @param {object} c - Color helper exposing `yellow`, `green` and `red` string formatters.
 * @returns {string} The formatted "SKIP", "PASS" or "FAIL" label.
 */
function gateStatusLabel(value, c) {
  if (value !== null) {
    if (value) {
      return c.green("PASS");
    }
    return c.red("FAIL");
  }
  return c.yellow("SKIP");
}
/**
 * Render the combined `check` result as a plain-text report.
 *
 * Sections are emitted in a fixed order: header, Lint, Trigger, Eval, Quality Gate.
 * Without `verbose`, only non-passing lint issues, unmatched trigger cases and
 * failed eval assertions are listed.
 *
 * @param {object} result - Check result (the shape returned by `runCheck`).
 * @param {boolean} enableColor - Forwarded to `getChalkInstance`.
 * @param {boolean} verbose - When truthy, list every item and include full eval responses.
 * @returns {string} Report lines joined with "\n".
 */
function renderCheckReport(result, enableColor, verbose) {
  const c = getChalkInstance(enableColor);
  const { gates, lint, thresholds } = result;

  const lintGate = gateStatusLabel(gates.lintPassed, c);
  const triggerGate = gateStatusLabel(gates.triggerPassed, c);
  const evalGate = gateStatusLabel(gates.evalPassed, c);
  const overallGate = gates.overallPassed ? c.green("PASS") : c.red("FAIL");

  // Header and lint summary.
  const out = [
    "skilltest check",
    `target: ${result.target}`,
    `provider/model: ${result.provider}/${result.model}`,
    `grader model: ${result.graderModel}`,
    `thresholds: min-f1=${thresholds.minF1.toFixed(2)} min-assert-pass-rate=${thresholds.minAssertPassRate.toFixed(2)}`,
    "",
    "Lint",
    `- ${lintGate} ${lint.summary.passed}/${lint.summary.total} checks passed (${lint.summary.warnings} warnings, ${lint.summary.failures} failures)`
  ];
  const shownIssues = verbose ? lint.issues : lint.issues.filter((issue) => issue.status !== "pass");
  shownIssues.forEach((issue) => {
    out.push(renderIssueLine(issue, c));
  });

  // Trigger section: metrics when the stage ran, otherwise the skip reason.
  out.push("", "Trigger");
  const triggerResult = result.trigger;
  if (!triggerResult) {
    out.push(`- ${triggerGate} ${result.triggerSkippedReason ?? "Skipped."}`);
  } else {
    const metrics = triggerResult.metrics;
    out.push(
      `- ${triggerGate} f1=${formatPercent(metrics.f1)} (precision=${formatPercent(metrics.precision)} recall=${formatPercent(metrics.recall)})`
    );
    out.push(
      ` TP ${metrics.truePositives} TN ${metrics.trueNegatives} FP ${metrics.falsePositives} FN ${metrics.falseNegatives}`
    );
    triggerResult.cases
      .filter((testCase) => verbose || !testCase.matched)
      .forEach((testCase) => {
        const caseStatus = testCase.matched ? c.green("PASS") : c.red("FAIL");
        out.push(` - ${caseStatus} ${testCase.query}`);
        out.push(` expected=${testCase.expected} actual=${testCase.actual}`);
      });
  }

  // Eval section: pass rate plus per-prompt details.
  out.push("", "Eval");
  const evalResult = result.eval;
  if (!evalResult) {
    out.push(`- ${evalGate} ${result.evalSkippedReason ?? "Skipped."}`);
  } else {
    const passRate = gates.evalAssertPassRate ?? 0;
    out.push(
      `- ${evalGate} assertion pass rate=${formatPercent(passRate)} (${evalResult.summary.passedAssertions}/${evalResult.summary.totalAssertions})`
    );
    for (const promptResult of evalResult.results) {
      const failed = promptResult.assertions.filter((assertion) => !assertion.passed);
      const showPrompt = verbose || failed.length > 0;
      if (showPrompt) {
        out.push(` - prompt: ${promptResult.prompt}`);
        // Collapse whitespace runs so the summary stays on one line.
        out.push(` response summary: ${promptResult.responseSummary.replace(/\s+/g, " ").trim()}`);
        const shownAssertions = verbose ? promptResult.assertions : failed;
        shownAssertions.forEach((assertion) => {
          const assertionStatus = assertion.passed ? c.green("PASS") : c.red("FAIL");
          out.push(` ${assertionStatus} ${assertion.assertion}`);
          out.push(` evidence: ${assertion.evidence}`);
        });
        if (verbose) {
          out.push(` full response: ${promptResult.response}`);
        }
      }
    }
  }

  // Final gate summary.
  out.push(
    "",
    "Quality Gate",
    `- lint gate: ${lintGate}`,
    `- trigger gate: ${triggerGate}`,
    `- eval gate: ${evalGate}`,
    `- overall: ${overallGate}`
  );
  return out.join("\n");
}
1079
1161
 
1080
1162
  // src/reporters/json.ts
1081
1163
  function renderJson(value) {
@@ -1414,15 +1496,104 @@ var AnthropicProvider = class {
1414
1496
  };
1415
1497
 
1416
1498
  // src/providers/openai.ts
/**
 * Return a promise that resolves after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function wait2(ms) {
  return new Promise((done) => {
    setTimeout(() => done(), ms);
  });
}
/**
 * Decide whether a failed API call is worth retrying.
 *
 * An error is considered transient when any of these holds:
 * - `status` is 429 or a number >= 500,
 * - `code` names a network-level failure (timeout, reset, DNS),
 * - `message` mentions rate limiting, a timeout, or a connection problem.
 *
 * Timeouts are matched in both spellings: "timeout" and "timed out" /
 * "ETIMEDOUT". The previous pattern only matched "timeout", so the Node
 * error code `ETIMEDOUT` and messages such as "Request timed out." were
 * treated as permanent failures.
 *
 * @param {unknown} error - Value caught from the failed call.
 * @returns {boolean} `true` when the call should be retried.
 */
function isRetriableError(error) {
  if (!error || typeof error !== "object") {
    return false;
  }
  const maybeStatus = error.status;
  if (maybeStatus === 429 || typeof maybeStatus === "number" && maybeStatus >= 500) {
    return true;
  }
  const maybeCode = error.code;
  // "timedout" covers ETIMEDOUT, which does not contain the substring "timeout".
  if (typeof maybeCode === "string" && /timeout|timedout|econnreset|enotfound|eai_again/i.test(maybeCode)) {
    return true;
  }
  const maybeMessage = error.message;
  if (typeof maybeMessage === "string" && /(rate limit|timeout|timed out|etimedout|temporarily unavailable|connection)/i.test(maybeMessage)) {
    return true;
  }
  return false;
}
/**
 * Normalize a chat message `content` value to trimmed plain text.
 *
 * - Falsy content yields "".
 * - A string is returned trimmed.
 * - An array is treated as a list of parts: parts whose `type` is "text" or
 *   missing contribute their string `text`; all other parts contribute "".
 *   Parts are joined with "\n" and the result is trimmed.
 * - Any other value (for example a non-array object) yields "" instead of throwing.
 *
 * @param {unknown} content - The `message.content` value from a completion choice.
 * @returns {string} Extracted text, possibly empty.
 */
function extractTextContent(content) {
  if (!content) {
    return "";
  }
  if (typeof content === "string") {
    return content.trim();
  }
  // Guard: only arrays have parts to map over.
  if (!Array.isArray(content)) {
    return "";
  }
  return content.map((part) => {
    // Skip null or primitive entries rather than dereferencing them.
    if (!part || typeof part !== "object") {
      return "";
    }
    const isTextPart = part.type === "text" || !part.type;
    return isTextPart && typeof part.text === "string" ? part.text : "";
  }).join("\n").trim();
}
/**
 * LLM provider backed by the OpenAI chat completions API.
 *
 * The SDK is loaded lazily through a dynamic import on first use, so
 * constructing the provider never requires the `openai` package to be present.
 */
var OpenAIProvider = class {
  name = "openai";
  apiKey;
  client;
  /**
   * @param {string} apiKey - Key handed to the SDK client once it is created.
   */
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.client = null;
  }
  /**
   * Load the OpenAI SDK on demand and cache the client instance.
   *
   * @returns {Promise<object>} The cached SDK client.
   * @throws {Error} When the SDK is not installed, fails to load, or exposes no client constructor.
   */
  async ensureClient() {
    if (this.client) {
      return this.client;
    }
    let openAiModule;
    try {
      // NOTE(review): the specifier is held in a variable, presumably so bundlers
      // do not resolve the optional dependency statically - confirm.
      const moduleName = "openai";
      openAiModule = await import(moduleName);
    } catch (error) {
      const code = error && typeof error === "object" ? error.code : void 0;
      const detail = error instanceof Error ? error.message : String(error);
      const notInstalled = code === "ERR_MODULE_NOT_FOUND" || code === "MODULE_NOT_FOUND" || /cannot find (module|package)/i.test(detail);
      if (notInstalled) {
        throw new Error(
          "OpenAI SDK is not installed. Install optional dependency 'openai' or run 'npm install' with optional dependencies enabled."
        );
      }
      // Any other load failure is reported as-is instead of being mislabelled as "not installed".
      throw new Error(`OpenAI SDK failed to load: ${detail}`);
    }
    // Accept the default export first, then the named `OpenAI` export, including
    // the case where a CommonJS namespace object is nested under `default`.
    const candidates = [
      openAiModule?.default,
      openAiModule?.OpenAI,
      openAiModule?.default?.default,
      openAiModule?.default?.OpenAI
    ];
    const OpenAIConstructor = candidates.find((candidate) => typeof candidate === "function");
    if (!OpenAIConstructor) {
      throw new Error("OpenAI SDK loaded but no default client export was found.");
    }
    this.client = new OpenAIConstructor({ apiKey: this.apiKey });
    return this.client;
  }
  /**
   * Send one system + user exchange and return the model's text reply.
   *
   * Makes up to three attempts. A retry happens only when `isRetriableError`
   * accepts the failure, after an exponential backoff (500 ms doubling, capped
   * at 4000 ms) plus up to 249 ms of random jitter.
   *
   * @param {string} systemPrompt - Content of the system message.
   * @param {string} userMessage - Content of the user message.
   * @param {{ model: string }} options - Model identifier for the request.
   * @returns {Promise<string>} Trimmed text of all choices joined with "\n".
   * @throws {Error} When every attempt fails or the response contains no text.
   */
  async sendMessage(systemPrompt, userMessage, options) {
    const client = await this.ensureClient();
    let lastError;
    for (let attempt = 0; attempt < 3; attempt += 1) {
      try {
        const response = await client.chat.completions.create({
          model: options.model,
          max_tokens: 2048,
          messages: [
            {
              role: "system",
              content: systemPrompt
            },
            {
              role: "user",
              content: userMessage
            }
          ]
        });
        const text = (response.choices ?? []).map((choice) => extractTextContent(choice.message?.content)).join("\n").trim();
        if (text.length === 0) {
          // Thrown inside the try so it is reported through the same failure path below.
          throw new Error("Model returned an empty response.");
        }
        return text;
      } catch (error) {
        lastError = error;
        if (!isRetriableError(error) || attempt === 2) {
          break;
        }
        const delay = Math.min(4e3, 500 * 2 ** attempt) + Math.floor(Math.random() * 250);
        await wait2(delay);
      }
    }
    if (lastError instanceof Error) {
      throw new Error(`OpenAI API call failed: ${lastError.message}`);
    }
    throw new Error("OpenAI API call failed with an unknown error.");
  }
};
1428
1599
 
@@ -1445,8 +1616,16 @@ var triggerOptionsSchema = z3.object({
1445
1616
  verbose: z3.boolean().optional(),
1446
1617
  apiKey: z3.string().optional()
1447
1618
  });
// Default model identifiers for each provider.
var DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL = "gpt-4.1-mini";
/**
 * Pick the model to use for a provider.
 *
 * When the provider is "openai" and the model is still the Anthropic default,
 * the OpenAI default is substituted; every other combination returns `model` unchanged.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Requested model identifier.
 * @returns {string} Model identifier to send to the provider.
 */
function resolveModel(provider, model) {
  const keptAnthropicDefault = model === DEFAULT_ANTHROPIC_MODEL;
  return provider === "openai" && keptAnthropicDefault ? DEFAULT_OPENAI_MODEL : model;
}
1448
1627
  function registerTriggerCommand(program) {
1449
- program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use", "claude-sonnet-4-5-20250929").option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10), 20).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, commandOptions, command) => {
1628
+ program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use", DEFAULT_ANTHROPIC_MODEL).option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10), 20).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, commandOptions, command) => {
1450
1629
  const globalOptions = getGlobalCliOptions(command);
1451
1630
  const parsedOptions = triggerOptionsSchema.safeParse(commandOptions);
1452
1631
  if (!parsedOptions.success) {
@@ -1483,8 +1662,9 @@ function registerTriggerCommand(program) {
1483
1662
  if (spinner) {
1484
1663
  spinner.text = "Running trigger simulations...";
1485
1664
  }
1665
+ const model = resolveModel(options.provider, options.model);
1486
1666
  const result = await runTriggerTest(skill, {
1487
- model: options.model,
1667
+ model,
1488
1668
  provider,
1489
1669
  queries,
1490
1670
  numQueries: options.numQueries,
@@ -1669,8 +1849,16 @@ var evalOptionsSchema = z6.object({
1669
1849
  verbose: z6.boolean().optional(),
1670
1850
  apiKey: z6.string().optional()
1671
1851
  });
// Default model identifiers for each provider (eval command copy).
var DEFAULT_ANTHROPIC_MODEL2 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL2 = "gpt-4.1-mini";
/**
 * Pick the model to use for a provider.
 *
 * The OpenAI default is substituted only when the provider is "openai" and the
 * model is still the Anthropic default; otherwise `model` is returned as given.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Requested model identifier.
 * @returns {string} Model identifier to send to the provider.
 */
function resolveModel2(provider, model) {
  if (provider !== "openai") {
    return model;
  }
  return model === DEFAULT_ANTHROPIC_MODEL2 ? DEFAULT_OPENAI_MODEL2 : model;
}
1672
1860
  function registerEvalCommand(program) {
1673
- program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts", "claude-sonnet-4-5-20250929").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, commandOptions, command) => {
1861
+ program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts", DEFAULT_ANTHROPIC_MODEL2).option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, commandOptions, command) => {
1674
1862
  const globalOptions = getGlobalCliOptions(command);
1675
1863
  const parsedOptions = evalOptionsSchema.safeParse(commandOptions);
1676
1864
  if (!parsedOptions.success) {
@@ -1704,10 +1892,12 @@ function registerEvalCommand(program) {
1704
1892
  if (spinner) {
1705
1893
  spinner.text = "Running eval prompts and grading responses...";
1706
1894
  }
1895
+ const model = resolveModel2(options.provider, options.model);
1896
+ const graderModel = options.graderModel ?? model;
1707
1897
  const result = await runEval(skill, {
1708
1898
  provider,
1709
- model: options.model,
1710
- graderModel: options.graderModel ?? options.model,
1899
+ model,
1900
+ graderModel,
1711
1901
  prompts
1712
1902
  });
1713
1903
  if (options.saveResults) {
@@ -1727,6 +1917,213 @@ function registerEvalCommand(program) {
1727
1917
  });
1728
1918
  }
1729
1919
 
1920
+ // src/commands/check.ts
1921
+ import ora3 from "ora";
1922
+ import { z as z7 } from "zod";
1923
+
1924
+ // src/core/check-runner.ts
/**
 * Compute the fraction of eval assertions that passed.
 *
 * @param {{ summary: { passedAssertions: number, totalAssertions: number } }} result - Eval result.
 * @returns {number} `passedAssertions / totalAssertions`, or 0 when there are no assertions.
 */
function calculateEvalAssertPassRate(result) {
  const { passedAssertions, totalAssertions } = result.summary;
  return totalAssertions === 0 ? 0 : passedAssertions / totalAssertions;
}
/**
 * Run lint, then (conditionally) trigger and eval, and derive pass/fail gates.
 *
 * Flow:
 * 1. Lint always runs. It passes when `lint.summary.failures` is 0.
 * 2. If lint failed and `options.continueOnLintFail` is falsy, trigger and eval
 *    are skipped and a skip reason is recorded for each.
 * 3. Otherwise the skill is parsed strictly. A parse error also skips both
 *    stages, with the error message embedded in the reason.
 * 4. Trigger runs before eval; each gate compares its metric to its threshold.
 *
 * A gate is `null` when its stage did not run. `overallPassed` requires lint to
 * pass and both model gates to be exactly `true`.
 *
 * @param {string} inputPath - Path handed to the linter and the strict parser.
 * @param {object} options - Provider, models, thresholds, optional queries/prompts,
 *   `continueOnLintFail`, `verbose`, and an optional `onStage(stage)` callback
 *   invoked with "lint", "parse", "trigger" and "eval" as each stage starts.
 * @returns {Promise<object>} Combined result with `lint`, `trigger`, `eval`, skip reasons and `gates`.
 */
async function runCheck(inputPath, options) {
  const announce = (stage) => options.onStage?.(stage);

  announce("lint");
  const lint = await runLinter(inputPath);
  const lintPassed = lint.summary.failures === 0;

  let trigger = null;
  let evalResult = null;
  let triggerSkippedReason;
  let evalSkippedReason;

  const modelStagesAllowed = lintPassed || options.continueOnLintFail;
  if (modelStagesAllowed) {
    announce("parse");
    let parsedSkill = null;
    try {
      parsedSkill = await parseSkillStrict(inputPath);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      triggerSkippedReason = `Skipped: skill could not be parsed strictly (${message}).`;
      evalSkippedReason = `Skipped: skill could not be parsed strictly (${message}).`;
    }
    if (parsedSkill) {
      const { provider, model } = options;
      announce("trigger");
      trigger = await runTriggerTest(parsedSkill, {
        provider,
        model,
        queries: options.queries,
        numQueries: options.numQueries,
        verbose: options.verbose
      });
      announce("eval");
      evalResult = await runEval(parsedSkill, {
        provider,
        model,
        graderModel: options.graderModel,
        prompts: options.prompts
      });
    }
  } else {
    triggerSkippedReason = "Skipped because lint has failures (use --continue-on-lint-fail to override).";
    evalSkippedReason = "Skipped because lint has failures (use --continue-on-lint-fail to override).";
  }

  // A null metric means the stage did not run, so its gate stays null as well.
  const triggerF1 = trigger ? trigger.metrics.f1 : null;
  const evalAssertPassRate = evalResult ? calculateEvalAssertPassRate(evalResult) : null;
  const triggerPassed = triggerF1 === null ? null : triggerF1 >= options.minF1;
  const evalPassed = evalAssertPassRate === null ? null : evalAssertPassRate >= options.minAssertPassRate;
  const overallPassed = lintPassed && triggerPassed === true && evalPassed === true;

  const thresholds = {
    minF1: options.minF1,
    minAssertPassRate: options.minAssertPassRate
  };
  const gates = {
    lintPassed,
    triggerPassed,
    evalPassed,
    triggerF1,
    evalAssertPassRate,
    overallPassed
  };
  return {
    target: inputPath,
    provider: options.provider.name,
    model: options.model,
    graderModel: options.graderModel,
    thresholds,
    continueOnLintFail: options.continueOnLintFail,
    lint,
    trigger,
    eval: evalResult,
    triggerSkippedReason,
    evalSkippedReason,
    gates
  };
}
2000
+
2001
+ // src/commands/check.ts
// Validation schema for the parsed `check` command options.
// The field builders live inside an IIFE so no extra top-level names are
// introduced into the bundle. Key order matches the original declaration.
var checkOptionsSchema = (() => {
  const optionalText = () => z7.string().optional();
  const optionalFlag = () => z7.boolean().optional();
  // Thresholds are fractions in the closed range [0, 1].
  const unitFraction = () => z7.number().min(0).max(1);
  return z7.object({
    provider: z7.enum(["anthropic", "openai"]),
    model: z7.string(),
    graderModel: optionalText(),
    apiKey: optionalText(),
    queries: optionalText(),
    numQueries: z7.number().int().min(2),
    prompts: optionalText(),
    minF1: unitFraction(),
    minAssertPassRate: unitFraction(),
    saveResults: optionalText(),
    continueOnLintFail: optionalFlag(),
    verbose: optionalFlag()
  });
})();
// Default model identifiers for each provider (check command copy).
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
/**
 * Pick the model to use for a provider.
 *
 * Returns the OpenAI default when the provider is "openai" and the model is
 * still the Anthropic default; in all other cases `model` is returned unchanged.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Requested model identifier.
 * @returns {string} Model identifier to send to the provider.
 */
function resolveModel3(provider, model) {
  const shouldSwap = provider === "openai" && model === DEFAULT_ANTHROPIC_MODEL3;
  if (shouldSwap) {
    return DEFAULT_OPENAI_MODEL3;
  }
  return model;
}
/**
 * Register the `check` subcommand on the CLI program.
 *
 * The action handler validates options, optionally loads custom trigger queries
 * and eval prompts, runs `runCheck`, optionally saves the result, prints it, and
 * sets `process.exitCode`:
 * - 0 when the overall gate passed,
 * - 1 when the overall gate failed,
 * - 2 for invalid options or any thrown error.
 *
 * @param {object} program - Commander program to attach the command to.
 */
function registerCheckCommand(program) {
  const toInteger = (value) => Number.parseInt(value, 10);
  const toFloat = (value) => Number.parseFloat(value);

  // Spinner caption for each stage reported by runCheck.
  const stageCaptions = new Map([
    ["lint", "Running lint checks..."],
    ["parse", "Parsing skill for model evaluations..."],
    ["trigger", "Running trigger test suite..."],
    ["eval", "Running end-to-end eval suite..."]
  ]);

  const checkCommand = program
    .command("check")
    .description("Run lint + trigger + eval with threshold-based quality gates.")
    .argument("<path-to-skill>", "Path to SKILL.md or skill directory")
    .option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic")
    .option("--model <model>", "Model for trigger/eval runs", DEFAULT_ANTHROPIC_MODEL3)
    .option("--grader-model <model>", "Model used for grading (defaults to --model)")
    .option("--api-key <key>", "API key override")
    .option("--queries <path>", "Path to custom trigger queries JSON")
    .option("--num-queries <n>", "Number of auto-generated trigger queries", toInteger, 20)
    .option("--prompts <path>", "Path to eval prompts JSON")
    .option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", toFloat, 0.8)
    .option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", toFloat, 0.9)
    .option("--save-results <path>", "Save combined check results to JSON")
    .option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures")
    .option("--verbose", "Show detailed trigger/eval output sections");

  checkCommand.action(async (targetPath, commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);

    // Option validation failures exit with code 2 before any work starts.
    const rejectOptions = (message) => {
      writeError(new Error(message), globalOptions.json);
      process.exitCode = 2;
    };
    const parsedOptions = checkOptionsSchema.safeParse(commandOptions);
    if (!parsedOptions.success) {
      rejectOptions(parsedOptions.error.issues[0]?.message ?? "Invalid check options.");
      return;
    }
    const options = parsedOptions.data;
    if (options.numQueries % 2 !== 0) {
      rejectOptions("--num-queries must be an even number so the suite can split should/should-not trigger cases.");
      return;
    }

    // No spinner in JSON mode or when stdout is not a TTY.
    const interactive = !globalOptions.json && process.stdout.isTTY;
    const spinner = interactive ? ora3("Preparing check run...").start() : null;
    const setStatus = (text) => {
      if (spinner) {
        spinner.text = text;
      }
    };
    // Read a JSON file and validate it, naming the offending flag on failure.
    const loadValidated = async (filePath, schema, flag) => {
      const loaded = await readJsonFile(filePath);
      const parsed = schema.safeParse(loaded);
      if (!parsed.success) {
        throw new Error(
          `Invalid ${flag} JSON: ${parsed.error.issues[0]?.message ?? "unknown format issue"}`
        );
      }
      return parsed.data;
    };

    try {
      setStatus("Initializing model provider...");
      const provider = createProvider(options.provider, options.apiKey);

      let queries = void 0;
      if (options.queries) {
        setStatus("Loading custom trigger queries...");
        queries = await loadValidated(options.queries, triggerQueryArraySchema, "--queries");
      }
      let prompts = void 0;
      if (options.prompts) {
        setStatus("Loading eval prompts...");
        prompts = await loadValidated(options.prompts, evalPromptArraySchema, "--prompts");
      }

      const model = resolveModel3(options.provider, options.model);
      const graderModel = options.graderModel ?? model;
      const verbose = Boolean(options.verbose);
      const result = await runCheck(targetPath, {
        provider,
        model,
        graderModel,
        queries,
        numQueries: options.numQueries,
        prompts,
        minF1: options.minF1,
        minAssertPassRate: options.minAssertPassRate,
        continueOnLintFail: Boolean(options.continueOnLintFail),
        verbose,
        onStage: (stage) => {
          if (spinner && stageCaptions.has(stage)) {
            spinner.text = stageCaptions.get(stage);
          }
        }
      });

      if (options.saveResults) {
        await writeJsonFile(options.saveResults, result);
      }
      spinner?.stop();
      if (globalOptions.json) {
        writeResult(result, true);
      } else {
        writeResult(renderCheckReport(result, globalOptions.color, verbose), false);
      }
      process.exitCode = result.gates.overallPassed ? 0 : 1;
    } catch (error) {
      spinner?.stop();
      writeError(error, globalOptions.json);
      process.exitCode = 2;
    }
  });
}
2126
+
1730
2127
  // src/index.ts
1731
2128
  function resolveVersion() {
1732
2129
  try {
@@ -1745,6 +2142,7 @@ async function run(argv) {
1745
2142
  registerLintCommand(program);
1746
2143
  registerTriggerCommand(program);
1747
2144
  registerEvalCommand(program);
2145
+ registerCheckCommand(program);
1748
2146
  await program.parseAsync(argv);
1749
2147
  }
1750
2148
  run(process.argv).catch((error) => {