skilltest 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/index.ts
4
- import fs5 from "node:fs";
5
- import path5 from "node:path";
4
+ import fs7 from "node:fs";
5
+ import path6 from "node:path";
6
6
  import { fileURLToPath } from "node:url";
7
7
  import { Command } from "commander";
8
8
 
@@ -194,6 +194,7 @@ function runCompatibilityChecks(context) {
194
194
  if (hasAllowedTools) {
195
195
  issues.push({
196
196
  id: "compat.allowed-tools",
197
+ checkId: "compat:frontmatter",
197
198
  title: "Platform-Specific Frontmatter",
198
199
  status: "warn",
199
200
  message: "Frontmatter includes allowed-tools, which is typically Claude-specific.",
@@ -202,6 +203,7 @@ function runCompatibilityChecks(context) {
202
203
  } else {
203
204
  issues.push({
204
205
  id: "compat.allowed-tools",
206
+ checkId: "compat:frontmatter",
205
207
  title: "Platform-Specific Frontmatter",
206
208
  status: "pass",
207
209
  message: "No known provider-specific frontmatter keys detected."
@@ -211,6 +213,7 @@ function runCompatibilityChecks(context) {
211
213
  const platform = mentionsClaudeOnly ? "Claude" : "Codex";
212
214
  issues.push({
213
215
  id: "compat.provider-phrasing",
216
+ checkId: "compat:provider-language",
214
217
  title: "Provider-Specific Language",
215
218
  status: "warn",
216
219
  message: `Skill body appears tuned to ${platform}-specific behavior.`,
@@ -219,6 +222,7 @@ function runCompatibilityChecks(context) {
219
222
  } else {
220
223
  issues.push({
221
224
  id: "compat.provider-phrasing",
225
+ checkId: "compat:provider-language",
222
226
  title: "Provider-Specific Language",
223
227
  status: "pass",
224
228
  message: "Skill body appears provider-neutral."
@@ -227,6 +231,7 @@ function runCompatibilityChecks(context) {
227
231
  const likelyCompatibility = hasAllowedTools || mentionsClaudeOnly || mentionsCodexOnly ? "Likely compatible with some agents, but includes platform-specific assumptions." : "Likely broadly compatible across Anthropic, OpenAI/Codex-style, and other markdown skill runners.";
228
232
  issues.push({
229
233
  id: "compat.summary",
234
+ checkId: "compat:summary",
230
235
  title: "Compatibility Hint",
231
236
  status: hasAllowedTools || mentionsClaudeOnly || mentionsCodexOnly ? "warn" : "pass",
232
237
  message: likelyCompatibility
@@ -234,6 +239,171 @@ function runCompatibilityChecks(context) {
234
239
  return issues;
235
240
  }
236
241
 
242
+ // src/core/linter/markdown-zones.ts
243
/**
 * Break raw text into individual lines.
 *
 * Both LF and CRLF terminators are recognised, and the terminators are not
 * included in the returned strings.
 *
 * @param {string} raw - Text to split.
 * @returns {string[]} One entry per line.
 */
function splitLines(raw) {
  const lineBreak = /\r?\n/;
  const pieces = raw.split(lineBreak);
  return pieces;
}
246
/**
 * Drop a leading `---` ... `---` frontmatter block from a document.
 *
 * The block is only recognised when the very first line is exactly `---` and
 * a later line is exactly `---`; otherwise the document is returned whole.
 *
 * @param {string} raw - Full document text.
 * @returns {{ bodyLines: string[], bodyStartLine: number }} The remaining
 *   lines and the 1-based line number (in `raw`) of the first remaining line.
 */
function stripTopFrontmatter(raw) {
  const allLines = splitLines(raw);
  const untouched = { bodyLines: allLines, bodyStartLine: 1 };
  if (allLines[0] !== "---") {
    return untouched;
  }
  // Search from index 1 so the opening delimiter is never taken as the closer.
  const closingIndex = allLines.indexOf("---", 1);
  if (closingIndex === -1) {
    return untouched;
  }
  return {
    bodyLines: allLines.slice(closingIndex + 1),
    // +1 to step past the closer, +1 to convert the index to a 1-based line.
    bodyStartLine: closingIndex + 2
  };
}
267
/**
 * Detect whether a line opens a fenced code block.
 *
 * A fence is a run of three or more backticks or tildes, optionally preceded
 * by whitespace and followed by an info string. For backtick fences the info
 * string must not itself contain a backtick: a line such as "```inline```" is
 * an inline code span, not a fence. Treating it as an opener would create a
 * fence that never closes and would hide the rest of the document from any
 * check that only inspects prose.
 *
 * @param {string} line - A single line of markdown.
 * @returns {string | null} The fence delimiter run (e.g. "```" or "~~~~"),
 *   or null when the line does not open a fence.
 */
function matchCodeFenceOpener(line) {
  const match = line.match(/^\s*(`{3,}|~{3,})(.*)$/);
  if (!match) {
    return null;
  }
  const [, delimiter, infoString] = match;
  // Tilde fences may carry backticks in their info string; backtick fences may not.
  if (delimiter.startsWith("`") && infoString.includes("`")) {
    return null;
  }
  return delimiter;
}
271
/**
 * Decide whether a line closes the code fence opened by `delimiter`.
 *
 * After trimming surrounding whitespace the line must consist solely of the
 * fence character and be at least as long as the opening run. A closer that
 * is longer than the opener is valid markdown; rejecting it would leave the
 * fence open to the end of the document.
 *
 * @param {string} line - Candidate closing line.
 * @param {string} delimiter - The opening fence run (a homogeneous run of
 *   backticks or tildes).
 * @returns {boolean} True when the line closes the fence.
 */
function isExactCodeFenceCloser(line, delimiter) {
  const candidate = line.trim();
  if (candidate === delimiter) {
    return true;
  }
  if (delimiter === "" || candidate.length < delimiter.length) {
    return false;
  }
  const fenceChar = delimiter[0];
  for (const character of candidate) {
    if (character !== fenceChar) {
      return false;
    }
  }
  return true;
}
274
/**
 * Record a piece of content as a zone, merging it into the last zone when
 * that zone has the same type and ends on the same or the preceding line.
 *
 * Empty content is ignored. When merging across a line boundary a newline is
 * inserted between the existing and the new content.
 *
 * @param {Array<{type: string, content: string, startLine: number, endLine: number}>} zones
 *   Zone list, mutated in place.
 * @param {string} type - Zone type of the new content.
 * @param {string} content - Text to record.
 * @param {number} startLine - First line covered by the content.
 * @param {number} endLine - Last line covered by the content.
 * @returns {void}
 */
function appendZone(zones, type, content, startLine, endLine) {
  if (content === "") {
    return;
  }
  const last = zones[zones.length - 1];
  const mergeable = Boolean(last) && last.type === type && startLine <= last.endLine + 1;
  if (!mergeable) {
    zones.push({ type, content, startLine, endLine });
    return;
  }
  let joiner = "";
  if (startLine > last.endLine) {
    joiner = "\n";
  }
  last.content += `${joiner}${content}`;
  last.endLine = endLine;
}
292
/**
 * Extend a zone that is still open (an unterminated fence or comment).
 *
 * Content arriving on a later line than the zone currently ends on is joined
 * with a newline. Empty content on a line the zone already covers is a no-op;
 * empty content on a new line still records the blank line.
 *
 * @param {{content: string, endLine: number}} zone - Zone to extend, mutated in place.
 * @param {string} content - Text to append (may be empty).
 * @param {number} lineNumber - Line the text came from.
 * @returns {void}
 */
function appendToOpenZone(zone, content, lineNumber) {
  const startsNewLine = lineNumber > zone.endLine;
  if (content === "" && !startsNewLine) {
    return;
  }
  const joiner = startsNewLine ? "\n" : "";
  zone.content += `${joiner}${content}`;
  zone.endLine = lineNumber;
}
304
/**
 * Record a run of single-line text, separating backtick-delimited inline code
 * from the surrounding text.
 *
 * Each pair of backticks becomes an "inline-code" zone (delimiters included);
 * everything else is recorded with `baseType`. A backtick with no partner on
 * the same run is kept as ordinary text.
 *
 * @param {Array<object>} zones - Zone list, mutated via appendZone.
 * @param {string} text - Text taken from one line.
 * @param {number} lineNumber - Line the text came from.
 * @param {string} baseType - Zone type for non-code text.
 * @returns {void}
 */
function addInlineAwareText(zones, text, lineNumber, baseType) {
  const emit = (type, chunk) => appendZone(zones, type, chunk, lineNumber, lineNumber);
  let position = 0;
  while (position < text.length) {
    const opening = text.indexOf("`", position);
    if (opening === -1) {
      emit(baseType, text.slice(position));
      return;
    }
    if (opening > position) {
      emit(baseType, text.slice(position, opening));
    }
    const closing = text.indexOf("`", opening + 1);
    if (closing === -1) {
      // Unpaired backtick: the remainder stays ordinary text.
      emit(baseType, text.slice(opening));
      return;
    }
    emit("inline-code", text.slice(opening, closing + 1));
    position = closing + 1;
  }
}
327
/**
 * Partition a markdown document into typed zones with line ranges.
 *
 * Leading frontmatter is removed first. Each remaining line is then assigned
 * to a "code-fence", "html-comment", "inline-code", "blockquote" or "prose"
 * zone. Fences and HTML comments may span several lines; a fence or comment
 * still open at the end of the input is emitted as-is.
 *
 * @param {string} raw - Full document text.
 * @returns {Array<{type: string, content: string, startLine: number, endLine: number}>}
 *   Zones in document order; line numbers refer to lines of `raw`.
 */
function parseZones(raw) {
  const { bodyLines, bodyStartLine } = stripTopFrontmatter(raw);
  const zones = [];
  let activeFence = null;
  let activeComment = null;

  for (let offset = 0; offset < bodyLines.length; offset += 1) {
    const line = bodyLines[offset];
    const lineNumber = bodyStartLine + offset;

    // Inside a fence every line belongs to it, including the closing line.
    if (activeFence) {
      appendToOpenZone(activeFence.zone, line, lineNumber);
      if (isExactCodeFenceCloser(line, activeFence.delimiter)) {
        zones.push(activeFence.zone);
        activeFence = null;
      }
      continue;
    }

    // A fence cannot start while a multi-line HTML comment is still open.
    const delimiter = activeComment ? null : matchCodeFenceOpener(line);
    if (delimiter) {
      activeFence = {
        delimiter,
        zone: {
          type: "code-fence",
          content: line,
          startLine: lineNumber,
          endLine: lineNumber
        }
      };
      continue;
    }

    const baseType = /^\s*>/.test(line) ? "blockquote" : "prose";
    let cursor = 0;
    // An open comment must see every line, even an empty one.
    while (activeComment || cursor < line.length) {
      if (activeComment) {
        const closeAt = line.indexOf("-->", cursor);
        if (closeAt === -1) {
          appendToOpenZone(activeComment, line.slice(cursor), lineNumber);
          break;
        }
        appendToOpenZone(activeComment, line.slice(cursor, closeAt + 3), lineNumber);
        zones.push(activeComment);
        activeComment = null;
        cursor = closeAt + 3;
        continue;
      }

      const openAt = line.indexOf("<!--", cursor);
      if (openAt === -1) {
        addInlineAwareText(zones, line.slice(cursor), lineNumber, baseType);
        break;
      }
      if (openAt > cursor) {
        addInlineAwareText(zones, line.slice(cursor, openAt), lineNumber, baseType);
      }

      const closeAt = line.indexOf("-->", openAt + 4);
      if (closeAt === -1) {
        // Comment continues onto following lines.
        activeComment = {
          type: "html-comment",
          content: line.slice(openAt),
          startLine: lineNumber,
          endLine: lineNumber
        };
        break;
      }
      appendZone(zones, "html-comment", line.slice(openAt, closeAt + 3), lineNumber, lineNumber);
      cursor = closeAt + 3;
    }
  }

  if (activeComment) {
    zones.push(activeComment);
  }
  if (activeFence) {
    zones.push(activeFence.zone);
  }
  return zones;
}
406
+
237
407
  // src/core/linter/content.ts
238
408
  var VAGUE_PATTERNS = [
239
409
  /\bdo something appropriate\b/i,
@@ -250,6 +420,102 @@ var SECRET_PATTERNS = [
250
420
  { label: "Slack token", regex: /\bxox[baprs]-[A-Za-z0-9-]{20,}\b/ },
251
421
  { label: "Generic private key header", regex: /-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----/ }
252
422
  ];
423
/**
 * Compute the overall line span covered by a list of matches.
 *
 * The bounds are folded in a single pass rather than spread into
 * Math.min/Math.max, so very large match lists cannot exceed the engine's
 * argument-count limit and no temporary arrays are built.
 *
 * @param {Array<{startLine: number, endLine: number}>} matches
 * @returns {{startLine?: number, endLine?: number}} Smallest startLine and
 *   largest endLine, or an empty object when there are no matches.
 */
function summarizeLineRange(matches) {
  if (matches.length === 0) {
    return {};
  }
  let startLine = Infinity;
  let endLine = -Infinity;
  for (const match of matches) {
    startLine = Math.min(startLine, match.startLine);
    endLine = Math.max(endLine, match.endLine);
  }
  return { startLine, endLine };
}
432
/**
 * List the distinct labels found in a set of matches.
 *
 * @param {Array<{label: string}>} matches
 * @returns {string[]} Each label once, in order of first appearance.
 */
function uniqueLabels(matches) {
  // A Set iterates in insertion order, so first-seen ordering is preserved.
  return [...new Set(matches.map((match) => match.label))];
}
444
/**
 * Test every zone against every entry of SECRET_PATTERNS and sort the hits
 * by whether they occurred in a "prose" zone or in any other zone type.
 *
 * @param {Array<{type: string, content: string, startLine: number, endLine: number}>} zones
 * @returns {{prose: object[], nonProse: object[]}} Occurrence records
 *   ({label, zoneType, startLine, endLine}) in zone order, then pattern order.
 */
function collectSecretMatches(zones) {
  const buckets = { prose: [], nonProse: [] };
  for (const zone of zones) {
    const bucket = zone.type === "prose" ? buckets.prose : buckets.nonProse;
    const hits = SECRET_PATTERNS.filter((pattern) => pattern.regex.test(zone.content));
    for (const hit of hits) {
      bucket.push({
        label: hit.label,
        zoneType: zone.type,
        startLine: zone.startLine,
        endLine: zone.endLine
      });
    }
  }
  return buckets;
}
467
/**
 * Produce the `skippedPatterns` payload for an issue.
 *
 * @param {Array<{label: string, zoneType: string, startLine: number, endLine: number}>} matches
 * @returns {Array<{label: string, zoneType: string, startLine: number, endLine: number}> | undefined}
 *   Fresh copies of the four reported fields, or undefined when there is
 *   nothing to report.
 */
function buildSkippedPatterns(matches) {
  if (matches.length === 0) {
    return undefined;
  }
  const skipped = [];
  for (const { label, zoneType, startLine, endLine } of matches) {
    skipped.push({ label, zoneType, startLine, endLine });
  }
  return skipped;
}
478
/**
 * Build the "content:secrets" lint result for a skill.
 *
 * Secrets found in prose zones produce a "fail"; secrets found only in other
 * zones produce a "warn" that asks the author to confirm they are
 * placeholders; no hits produce a "pass".
 *
 * @param {object} context - Lint context; reads `suppressedCheckIds` (a Set)
 *   and `skill.raw` (document text).
 * @returns {object | null} The issue record, or null when the check is suppressed.
 */
function buildSecretsIssue(context) {
  if (context.suppressedCheckIds.has("content:secrets")) {
    return null;
  }
  const zones = parseZones(context.skill.raw);
  const { prose, nonProse } = collectSecretMatches(zones);
  const proseLabels = uniqueLabels(prose);
  const nonProseLabels = uniqueLabels(nonProse);
  const skippedPatterns = buildSkippedPatterns(nonProse);
  const header = {
    id: "content.secrets",
    checkId: "content:secrets",
    title: "Hardcoded Secrets"
  };

  if (proseLabels.length > 0) {
    return {
      ...header,
      status: "fail",
      message: `Potential secrets detected (${proseLabels.join(", ")}).`,
      suggestion: "Remove secrets from skill files and use environment variables or secret managers.",
      ...summarizeLineRange(prose),
      skippedPatterns
    };
  }

  if (nonProseLabels.length === 0) {
    return {
      ...header,
      status: "pass",
      message: "No obvious API keys or secrets patterns were detected."
    };
  }

  // Hits exist only outside prose; word the warning by where they were found.
  const joinedLabels = nonProseLabels.join(", ");
  const codeFenceOnly = nonProse.every((match) => match.zoneType === "code-fence");
  let message;
  if (codeFenceOnly) {
    message = `Possible secret in code example \u2014 verify this is a placeholder, not a real key (${joinedLabels}).`;
  } else {
    message = `Possible secrets found outside prose instructions (${joinedLabels}). Verify these are placeholders, not real credentials.`;
  }
  return {
    ...header,
    status: "warn",
    message,
    suggestion: "Replace real-looking credentials in examples with explicit placeholders such as YOUR_API_KEY.",
    ...summarizeLineRange(nonProse),
    skippedPatterns
  };
}
253
519
  function runContentChecks(context) {
254
520
  const issues = [];
255
521
  const body = context.frontmatter.content;
@@ -258,6 +524,7 @@ function runContentChecks(context) {
258
524
  if (!/^#{1,6}\s+\S+/m.test(body)) {
259
525
  issues.push({
260
526
  id: "content.headers",
527
+ checkId: "content:headers",
261
528
  title: "Section Headers",
262
529
  status: "warn",
263
530
  message: "No markdown headers found in SKILL.md body.",
@@ -266,6 +533,7 @@ function runContentChecks(context) {
266
533
  } else {
267
534
  issues.push({
268
535
  id: "content.headers",
536
+ checkId: "content:headers",
269
537
  title: "Section Headers",
270
538
  status: "pass",
271
539
  message: "SKILL.md contains markdown section headers."
@@ -275,6 +543,7 @@ function runContentChecks(context) {
275
543
  if (!hasExamples) {
276
544
  issues.push({
277
545
  id: "content.examples",
546
+ checkId: "content:examples",
278
547
  title: "Examples",
279
548
  status: "warn",
280
549
  message: "No examples detected in SKILL.md body.",
@@ -283,6 +552,7 @@ function runContentChecks(context) {
283
552
  } else {
284
553
  issues.push({
285
554
  id: "content.examples",
555
+ checkId: "content:examples",
286
556
  title: "Examples",
287
557
  status: "pass",
288
558
  message: "Examples were detected in SKILL.md."
@@ -292,6 +562,7 @@ function runContentChecks(context) {
292
562
  if (vagueMatches.length > 0) {
293
563
  issues.push({
294
564
  id: "content.vagueness",
565
+ checkId: "content:vagueness",
295
566
  title: "Instruction Specificity",
296
567
  status: "warn",
297
568
  message: "Potentially vague instruction phrases detected.",
@@ -300,6 +571,7 @@ function runContentChecks(context) {
300
571
  } else {
301
572
  issues.push({
302
573
  id: "content.vagueness",
574
+ checkId: "content:vagueness",
303
575
  title: "Instruction Specificity",
304
576
  status: "pass",
305
577
  message: "No obvious vague placeholder phrasing found."
@@ -308,6 +580,7 @@ function runContentChecks(context) {
308
580
  if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
309
581
  issues.push({
310
582
  id: "content.frontmatter-angle-brackets",
583
+ checkId: "content:angle-brackets",
311
584
  title: "Frontmatter Angle Brackets",
312
585
  status: "warn",
313
586
  message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
@@ -316,36 +589,20 @@ function runContentChecks(context) {
316
589
  } else {
317
590
  issues.push({
318
591
  id: "content.frontmatter-angle-brackets",
592
+ checkId: "content:angle-brackets",
319
593
  title: "Frontmatter Angle Brackets",
320
594
  status: "pass",
321
595
  message: "No angle bracket tokens detected in frontmatter."
322
596
  });
323
597
  }
324
- const secretHits = /* @__PURE__ */ new Set();
325
- for (const pattern of SECRET_PATTERNS) {
326
- if (pattern.regex.test(context.skill.raw)) {
327
- secretHits.add(pattern.label);
328
- }
329
- }
330
- if (secretHits.size > 0) {
331
- issues.push({
332
- id: "content.secrets",
333
- title: "Hardcoded Secrets",
334
- status: "fail",
335
- message: `Potential secrets detected (${Array.from(secretHits).join(", ")}).`,
336
- suggestion: "Remove secrets from skill files and use environment variables or secret managers."
337
- });
338
- } else {
339
- issues.push({
340
- id: "content.secrets",
341
- title: "Hardcoded Secrets",
342
- status: "pass",
343
- message: "No obvious API keys or secrets patterns were detected."
344
- });
598
+ const secretsIssue = buildSecretsIssue(context);
599
+ if (secretsIssue) {
600
+ issues.push(secretsIssue);
345
601
  }
346
602
  if (bodyLines.length < 10) {
347
603
  issues.push({
348
604
  id: "content.body-length",
605
+ checkId: "content:body-length",
349
606
  title: "Body Completeness",
350
607
  status: "warn",
351
608
  message: `SKILL.md body has only ${bodyLines.length} non-empty lines.`,
@@ -354,6 +611,7 @@ function runContentChecks(context) {
354
611
  } else {
355
612
  issues.push({
356
613
  id: "content.body-length",
614
+ checkId: "content:body-length",
357
615
  title: "Body Completeness",
358
616
  status: "pass",
359
617
  message: `SKILL.md body has ${bodyLines.length} non-empty lines.`
@@ -362,6 +620,7 @@ function runContentChecks(context) {
362
620
  if (description && description.length < 50) {
363
621
  issues.push({
364
622
  id: "content.description-length",
623
+ checkId: "content:description-length",
365
624
  title: "Description Specificity",
366
625
  status: "warn",
367
626
  message: `Description length is ${description.length} characters, which may be too vague for reliable triggering.`,
@@ -370,6 +629,7 @@ function runContentChecks(context) {
370
629
  } else if (description) {
371
630
  issues.push({
372
631
  id: "content.description-length",
632
+ checkId: "content:description-length",
373
633
  title: "Description Specificity",
374
634
  status: "pass",
375
635
  message: "Description length is sufficient for triggerability heuristics."
@@ -461,6 +721,7 @@ async function runDisclosureChecks(context) {
461
721
  if (context.skill.lineCount > 200 && !await pathExists(referencesDir)) {
462
722
  issues.push({
463
723
  id: "disclosure.skill-split",
724
+ checkId: "disclosure:progressive-disclosure",
464
725
  title: "Progressive Disclosure",
465
726
  status: "warn",
466
727
  message: "SKILL.md exceeds 200 lines and no references/ directory is present.",
@@ -469,6 +730,7 @@ async function runDisclosureChecks(context) {
469
730
  } else {
470
731
  issues.push({
471
732
  id: "disclosure.skill-split",
733
+ checkId: "disclosure:progressive-disclosure",
472
734
  title: "Progressive Disclosure",
473
735
  status: "pass",
474
736
  message: "Top-level file length and references/ usage look reasonable."
@@ -496,6 +758,7 @@ async function runDisclosureChecks(context) {
496
758
  if (nonRelativeOrEscaping.length > 0) {
497
759
  issues.push({
498
760
  id: "disclosure.relative-path-root",
761
+ checkId: "disclosure:path-scope",
499
762
  title: "Reference Path Scope",
500
763
  status: "fail",
501
764
  message: `Found non-relative or out-of-root references: ${nonRelativeOrEscaping.join(", ")}`,
@@ -504,6 +767,7 @@ async function runDisclosureChecks(context) {
504
767
  } else {
505
768
  issues.push({
506
769
  id: "disclosure.relative-path-root",
770
+ checkId: "disclosure:path-scope",
507
771
  title: "Reference Path Scope",
508
772
  status: "pass",
509
773
  message: "All detected file references are relative and scoped to skill root."
@@ -513,6 +777,7 @@ async function runDisclosureChecks(context) {
513
777
  if (chainDepth > 1) {
514
778
  issues.push({
515
779
  id: "disclosure.reference-depth",
780
+ checkId: "disclosure:reference-depth",
516
781
  title: "Reference Chain Depth",
517
782
  status: "warn",
518
783
  message: "Deep reference chains detected (>1 level).",
@@ -521,6 +786,7 @@ async function runDisclosureChecks(context) {
521
786
  } else {
522
787
  issues.push({
523
788
  id: "disclosure.reference-depth",
789
+ checkId: "disclosure:reference-depth",
524
790
  title: "Reference Chain Depth",
525
791
  status: "pass",
526
792
  message: "Reference depth is shallow and easy to navigate."
@@ -530,6 +796,7 @@ async function runDisclosureChecks(context) {
530
796
  if (normalizedReferences.some((item) => item.includes("../"))) {
531
797
  issues.push({
532
798
  id: "disclosure.parent-traversal",
799
+ checkId: "disclosure:parent-traversal",
533
800
  title: "Parent Traversal",
534
801
  status: "warn",
535
802
  message: "References include parent-directory traversal (../).",
@@ -538,6 +805,7 @@ async function runDisclosureChecks(context) {
538
805
  } else {
539
806
  issues.push({
540
807
  id: "disclosure.parent-traversal",
808
+ checkId: "disclosure:parent-traversal",
541
809
  title: "Parent Traversal",
542
810
  status: "pass",
543
811
  message: "No parent-directory traversal references detected."
@@ -568,6 +836,7 @@ function runFrontmatterChecks(context) {
568
836
  if (!context.frontmatter.hasFrontmatter) {
569
837
  issues.push({
570
838
  id: "frontmatter.exists",
839
+ checkId: "frontmatter:exists",
571
840
  title: "Frontmatter Presence",
572
841
  status: "fail",
573
842
  message: "SKILL.md is missing YAML frontmatter delimited by --- blocks.",
@@ -578,6 +847,7 @@ function runFrontmatterChecks(context) {
578
847
  if (context.frontmatter.error) {
579
848
  issues.push({
580
849
  id: "frontmatter.valid-yaml",
850
+ checkId: "frontmatter:yaml",
581
851
  title: "Frontmatter YAML",
582
852
  status: "fail",
583
853
  message: `Frontmatter is not valid YAML: ${context.frontmatter.error}`,
@@ -587,6 +857,7 @@ function runFrontmatterChecks(context) {
587
857
  }
588
858
  issues.push({
589
859
  id: "frontmatter.valid-yaml",
860
+ checkId: "frontmatter:yaml",
590
861
  title: "Frontmatter YAML",
591
862
  status: "pass",
592
863
  message: "Frontmatter exists and parses correctly."
@@ -596,6 +867,7 @@ function runFrontmatterChecks(context) {
596
867
  if (!name) {
597
868
  issues.push({
598
869
  id: "frontmatter.name.required",
870
+ checkId: "frontmatter:name",
599
871
  title: "Frontmatter Name",
600
872
  status: "fail",
601
873
  message: "Missing required frontmatter field: name.",
@@ -604,6 +876,7 @@ function runFrontmatterChecks(context) {
604
876
  } else if (name.length > 64) {
605
877
  issues.push({
606
878
  id: "frontmatter.name.length",
879
+ checkId: "frontmatter:name",
607
880
  title: "Frontmatter Name Length",
608
881
  status: "fail",
609
882
  message: `name is too long (${name.length} chars, max 64).`,
@@ -612,6 +885,7 @@ function runFrontmatterChecks(context) {
612
885
  } else if (!SKILL_NAME_REGEX.test(name)) {
613
886
  issues.push({
614
887
  id: "frontmatter.name.format",
888
+ checkId: "frontmatter:name",
615
889
  title: "Frontmatter Name Format",
616
890
  status: "fail",
617
891
  message: "name must be lowercase alphanumeric with single hyphen separators only.",
@@ -620,6 +894,7 @@ function runFrontmatterChecks(context) {
620
894
  } else {
621
895
  issues.push({
622
896
  id: "frontmatter.name.valid",
897
+ checkId: "frontmatter:name",
623
898
  title: "Frontmatter Name",
624
899
  status: "pass",
625
900
  message: "name is present and follows naming conventions."
@@ -629,6 +904,7 @@ function runFrontmatterChecks(context) {
629
904
  if (!description || description.trim() === "") {
630
905
  issues.push({
631
906
  id: "frontmatter.description.required",
907
+ checkId: "frontmatter:description",
632
908
  title: "Frontmatter Description",
633
909
  status: "fail",
634
910
  message: "Missing required frontmatter field: description.",
@@ -637,6 +913,7 @@ function runFrontmatterChecks(context) {
637
913
  } else if (description.length > 1024) {
638
914
  issues.push({
639
915
  id: "frontmatter.description.length",
916
+ checkId: "frontmatter:description",
640
917
  title: "Frontmatter Description Length",
641
918
  status: "fail",
642
919
  message: `description is too long (${description.length} chars, max 1024).`,
@@ -645,6 +922,7 @@ function runFrontmatterChecks(context) {
645
922
  } else {
646
923
  issues.push({
647
924
  id: "frontmatter.description.valid",
925
+ checkId: "frontmatter:description",
648
926
  title: "Frontmatter Description",
649
927
  status: "pass",
650
928
  message: "description is present and within allowed length."
@@ -654,6 +932,7 @@ function runFrontmatterChecks(context) {
654
932
  if (!license || license.trim() === "") {
655
933
  issues.push({
656
934
  id: "frontmatter.license.recommended",
935
+ checkId: "frontmatter:license",
657
936
  title: "Frontmatter License",
658
937
  status: "warn",
659
938
  message: "No license field found in frontmatter.",
@@ -662,6 +941,7 @@ function runFrontmatterChecks(context) {
662
941
  } else {
663
942
  issues.push({
664
943
  id: "frontmatter.license.present",
944
+ checkId: "frontmatter:license",
665
945
  title: "Frontmatter License",
666
946
  status: "pass",
667
947
  message: "license field is present."
@@ -670,6 +950,7 @@ function runFrontmatterChecks(context) {
670
950
  if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
671
951
  issues.push({
672
952
  id: "frontmatter.description.triggerability",
953
+ checkId: "frontmatter:triggerability",
673
954
  title: "Description Trigger Clarity",
674
955
  status: "warn",
675
956
  message: "Description should explain both what the skill does and when it should be used.",
@@ -678,6 +959,7 @@ function runFrontmatterChecks(context) {
678
959
  } else if (description) {
679
960
  issues.push({
680
961
  id: "frontmatter.description.triggerability",
962
+ checkId: "frontmatter:triggerability",
681
963
  title: "Description Trigger Clarity",
682
964
  status: "pass",
683
965
  message: "Description appears to cover both capability and usage context."
@@ -735,85 +1017,159 @@ var SHELL_ACTIVITY_PATTERNS = [
735
1017
  /\b(?:npm|pnpm|yarn|pip|git|docker|kubectl)\s+[A-Za-z0-9-]/i
736
1018
  ];
737
1019
  var SAFETY_GUARDRAIL_PATTERN = /\b(?:ask before|confirm|approval|dry[- ]run|sandbox|least privilege|redact|never expose|do not reveal)\b/i;
738
- function collectMatches(content, patterns) {
739
- const matches = [];
740
- for (const pattern of patterns) {
741
- if (pattern.regex.test(content)) {
742
- matches.push(pattern.label);
1020
/**
 * Describe one pattern hit inside one zone.
 *
 * @param {{type: string, startLine: number, endLine: number}} zone - Zone that matched.
 * @param {{label: string}} pattern - Pattern that matched.
 * @returns {{label: string, zoneType: string, startLine: number, endLine: number}}
 */
function buildOccurrence(zone, pattern) {
  const { label } = pattern;
  const { type: zoneType, startLine, endLine } = zone;
  return { label, zoneType, startLine, endLine };
}
1028
/**
 * Test every zone against every pattern and split the hits into those found
 * in "prose" zones (flagged) and those found in any other zone type (skipped).
 *
 * `lastIndex` is reset before each test so that a pattern declared with the
 * `g` or `y` flag is always matched from the start of the zone; without the
 * reset, a hit in one zone could cause a miss in the next.
 *
 * @param {Array<{type: string, content: string, startLine: number, endLine: number}>} zones
 * @param {Array<{label: string, regex: RegExp}>} patterns
 * @returns {{flagged: object[], skipped: object[]}} Occurrence records in
 *   zone order, then pattern order.
 */
function collectZoneAwareMatches(zones, patterns) {
  const flagged = [];
  const skipped = [];
  for (const zone of zones) {
    const target = zone.type === "prose" ? flagged : skipped;
    for (const pattern of patterns) {
      // Harmless for non-sticky, non-global regexes; required for /g and /y.
      pattern.regex.lastIndex = 0;
      if (pattern.regex.test(zone.content)) {
        target.push(buildOccurrence(zone, pattern));
      }
    }
  }
  return { flagged, skipped };
}
1046
/**
 * List the distinct labels found in a set of matches.
 *
 * @param {Array<{label: string}>} matches
 * @returns {string[]} Each label once, in order of first appearance.
 */
function uniqueLabels2(matches) {
  // A Set iterates in insertion order, so first-seen ordering is preserved.
  return [...new Set(matches.map((match) => match.label))];
}
1058
/**
 * Compute the overall line span covered by a list of matches.
 *
 * The bounds are folded in a single pass rather than spread into
 * Math.min/Math.max, so very large match lists cannot exceed the engine's
 * argument-count limit and no temporary arrays are built.
 *
 * @param {Array<{startLine: number, endLine: number}>} matches
 * @returns {{startLine?: number, endLine?: number}} Smallest startLine and
 *   largest endLine, or an empty object when there are no matches.
 */
function summarizeLineRange2(matches) {
  if (matches.length === 0) {
    return {};
  }
  let startLine = Infinity;
  let endLine = -Infinity;
  for (const match of matches) {
    startLine = Math.min(startLine, match.startLine);
    endLine = Math.max(endLine, match.endLine);
  }
  return { startLine, endLine };
}
1067
/**
 * Produce the `skippedPatterns` payload for an issue.
 *
 * @param {Array<{label: string, zoneType: string, startLine: number, endLine: number}>} matches
 * @returns {Array<{label: string, zoneType: string, startLine: number, endLine: number}> | undefined}
 *   Fresh copies of the four reported fields, or undefined when there is
 *   nothing to report.
 */
function buildSkippedPatterns2(matches) {
  if (matches.length === 0) {
    return undefined;
  }
  const skipped = [];
  for (const { label, zoneType, startLine, endLine } of matches) {
    skipped.push({ label, zoneType, startLine, endLine });
  }
  return skipped;
}
1078
/**
 * Report whether a check has been suppressed for this lint run.
 *
 * @param {{suppressedCheckIds: {has(id: string): boolean}}} context - Lint context.
 * @param {string} checkId - Identifier of the check, e.g. "security:exfiltration".
 * @returns {boolean} True when `checkId` is in the suppressed set.
 */
function isSuppressed(context, checkId) {
  const { suppressedCheckIds } = context;
  return suppressedCheckIds.has(checkId);
}
1081
/**
 * Run one pattern-based security check over pre-parsed zones.
 *
 * Hits in prose zones decide the outcome; hits in other zones are attached
 * as `skippedPatterns` on both the matching and the passing result.
 *
 * @param {object} context - Lint context (consulted for suppression).
 * @param {Array<object>} zones - Zones to scan.
 * @param {object} options - Check definition: `id`, `checkId`, `title`,
 *   `statusOnMatch`, `patterns`, `matchMessagePrefix`, `passMessage`, `suggestion`.
 * @returns {object | null} The issue record, or null when the check is suppressed.
 */
function runZoneAwareSecurityCheck(context, zones, options) {
  const { id, checkId, title } = options;
  if (isSuppressed(context, checkId)) {
    return null;
  }
  const { flagged, skipped } = collectZoneAwareMatches(zones, options.patterns);
  const flaggedLabels = uniqueLabels2(flagged);
  const skippedPatterns = buildSkippedPatterns2(skipped);

  if (flaggedLabels.length === 0) {
    return {
      id,
      checkId,
      title,
      status: "pass",
      message: options.passMessage,
      skippedPatterns
    };
  }

  return {
    id,
    checkId,
    title,
    status: options.statusOnMatch,
    message: `${options.matchMessagePrefix}: ${flaggedLabels.join(", ")}.`,
    suggestion: options.suggestion,
    ...summarizeLineRange2(flagged),
    skippedPatterns
  };
}
747
1109
  function runSecurityChecks(context) {
748
1110
  const issues = [];
749
1111
  const skillText = context.skill.raw;
750
- const dangerousCommandHits = collectMatches(skillText, DANGEROUS_COMMAND_PATTERNS);
751
- if (dangerousCommandHits.length > 0) {
752
- issues.push({
753
- id: "security.dangerous-command-patterns",
754
- title: "Dangerous Command Patterns",
755
- status: "fail",
756
- message: `Potentially dangerous command instruction patterns found: ${dangerousCommandHits.join(", ")}.`,
757
- suggestion: "Remove destructive/pipe-exec command examples or wrap them with explicit safety constraints."
758
- });
759
- } else {
760
- issues.push({
761
- id: "security.dangerous-command-patterns",
762
- title: "Dangerous Command Patterns",
763
- status: "pass",
764
- message: "No high-risk destructive or direct pipe-to-shell patterns detected."
765
- });
766
- }
767
- const exfiltrationHits = collectMatches(skillText, EXFILTRATION_PATTERNS);
768
- if (exfiltrationHits.length > 0) {
769
- issues.push({
770
- id: "security.exfiltration-patterns",
771
- title: "Sensitive Data Exfiltration",
772
- status: "fail",
773
- message: `Possible sensitive data exfiltration patterns found: ${exfiltrationHits.join(", ")}.`,
774
- suggestion: "Remove instructions that access or transmit secrets/credential files."
775
- });
776
- } else {
777
- issues.push({
778
- id: "security.exfiltration-patterns",
779
- title: "Sensitive Data Exfiltration",
780
- status: "pass",
781
- message: "No obvious credential access/exfiltration instructions detected."
782
- });
783
- }
784
- const escalationHits = collectMatches(skillText, PRIVILEGE_ESCALATION_PATTERNS);
785
- if (escalationHits.length > 0) {
786
- issues.push({
787
- id: "security.privilege-escalation",
788
- title: "Privilege Escalation Language",
789
- status: "warn",
790
- message: `Potentially risky privilege/execution language detected: ${escalationHits.join(", ")}.`,
791
- suggestion: "Prefer least-privilege execution and explicit approval steps for elevated commands."
792
- });
793
- } else {
794
- issues.push({
795
- id: "security.privilege-escalation",
796
- title: "Privilege Escalation Language",
797
- status: "pass",
798
- message: "No obvious privilege-escalation language detected."
799
- });
1112
+ const needsZoneParsing = !isSuppressed(context, "security:dangerous-commands") || !isSuppressed(context, "security:exfiltration") || !isSuppressed(context, "security:privilege-escalation");
1113
+ const zones = needsZoneParsing ? parseZones(skillText) : [];
1114
+ const dangerousCommandsIssue = runZoneAwareSecurityCheck(context, zones, {
1115
+ id: "security.dangerous-command-patterns",
1116
+ checkId: "security:dangerous-commands",
1117
+ title: "Dangerous Command Patterns",
1118
+ statusOnMatch: "fail",
1119
+ patterns: DANGEROUS_COMMAND_PATTERNS,
1120
+ matchMessagePrefix: "Potentially dangerous command instruction patterns found",
1121
+ passMessage: "No high-risk destructive or direct pipe-to-shell patterns detected.",
1122
+ suggestion: "Remove destructive/pipe-exec command examples or wrap them with explicit safety constraints."
1123
+ });
1124
+ if (dangerousCommandsIssue) {
1125
+ issues.push(dangerousCommandsIssue);
1126
+ }
1127
+ const exfiltrationIssue = runZoneAwareSecurityCheck(context, zones, {
1128
+ id: "security.exfiltration-patterns",
1129
+ checkId: "security:exfiltration",
1130
+ title: "Sensitive Data Exfiltration",
1131
+ statusOnMatch: "fail",
1132
+ patterns: EXFILTRATION_PATTERNS,
1133
+ matchMessagePrefix: "Possible sensitive data exfiltration patterns found",
1134
+ passMessage: "No obvious credential access/exfiltration instructions detected.",
1135
+ suggestion: "Remove instructions that access or transmit secrets/credential files."
1136
+ });
1137
+ if (exfiltrationIssue) {
1138
+ issues.push(exfiltrationIssue);
1139
+ }
1140
+ const privilegeEscalationIssue = runZoneAwareSecurityCheck(context, zones, {
1141
+ id: "security.privilege-escalation",
1142
+ checkId: "security:privilege-escalation",
1143
+ title: "Privilege Escalation Language",
1144
+ statusOnMatch: "warn",
1145
+ patterns: PRIVILEGE_ESCALATION_PATTERNS,
1146
+ matchMessagePrefix: "Potentially risky privilege/execution language detected",
1147
+ passMessage: "No obvious privilege-escalation language detected.",
1148
+ suggestion: "Prefer least-privilege execution and explicit approval steps for elevated commands."
1149
+ });
1150
+ if (privilegeEscalationIssue) {
1151
+ issues.push(privilegeEscalationIssue);
800
1152
  }
801
- const hasShellActivity = SHELL_ACTIVITY_PATTERNS.some((pattern) => pattern.test(skillText));
802
- if (hasShellActivity && !SAFETY_GUARDRAIL_PATTERN.test(skillText)) {
803
- issues.push({
804
- id: "security.safety-guardrails",
805
- title: "Execution Safety Guardrails",
806
- status: "warn",
807
- message: "Shell/tool execution is present, but no explicit safety guardrails were detected.",
808
- suggestion: "Add guidance such as approval requirements, dry-run mode, scope checks, and redaction rules."
809
- });
810
- } else {
811
- issues.push({
812
- id: "security.safety-guardrails",
813
- title: "Execution Safety Guardrails",
814
- status: "pass",
815
- message: hasShellActivity ? "Shell/tool execution instructions include at least one safety guardrail." : "No shell/tool execution instructions detected."
816
- });
1153
+ if (!isSuppressed(context, "security:missing-guardrails")) {
1154
+ const hasShellActivity = SHELL_ACTIVITY_PATTERNS.some((pattern) => pattern.test(skillText));
1155
+ if (hasShellActivity && !SAFETY_GUARDRAIL_PATTERN.test(skillText)) {
1156
+ issues.push({
1157
+ id: "security.safety-guardrails",
1158
+ checkId: "security:missing-guardrails",
1159
+ title: "Execution Safety Guardrails",
1160
+ status: "warn",
1161
+ message: "Shell/tool execution is present, but no explicit safety guardrails were detected.",
1162
+ suggestion: "Add guidance such as approval requirements, dry-run mode, scope checks, and redaction rules."
1163
+ });
1164
+ } else {
1165
+ issues.push({
1166
+ id: "security.safety-guardrails",
1167
+ checkId: "security:missing-guardrails",
1168
+ title: "Execution Safety Guardrails",
1169
+ status: "pass",
1170
+ message: hasShellActivity ? "Shell/tool execution instructions include at least one safety guardrail." : "No shell/tool execution instructions detected."
1171
+ });
1172
+ }
817
1173
  }
818
1174
  return issues;
819
1175
  }
@@ -846,6 +1202,7 @@ async function runStructureChecks(context) {
846
1202
  if (context.skill.lineCount > 500) {
847
1203
  issues.push({
848
1204
  id: "structure.skill-size",
1205
+ checkId: "structure:file-size",
849
1206
  title: "SKILL.md Size",
850
1207
  status: "warn",
851
1208
  message: `SKILL.md is ${context.skill.lineCount} lines (recommended max is 500).`,
@@ -854,6 +1211,7 @@ async function runStructureChecks(context) {
854
1211
  } else {
855
1212
  issues.push({
856
1213
  id: "structure.skill-size",
1214
+ checkId: "structure:file-size",
857
1215
  title: "SKILL.md Size",
858
1216
  status: "pass",
859
1217
  message: `SKILL.md length is ${context.skill.lineCount} lines.`
@@ -870,6 +1228,7 @@ async function runStructureChecks(context) {
870
1228
  oversizedWithoutToc += 1;
871
1229
  issues.push({
872
1230
  id: `structure.references.toc.${toPosixPath(path4.relative(context.skill.skillRoot, file))}`,
1231
+ checkId: "structure:toc",
873
1232
  title: "Reference File Navigation",
874
1233
  status: "warn",
875
1234
  message: `${toPosixPath(path4.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
@@ -880,6 +1239,7 @@ async function runStructureChecks(context) {
880
1239
  if (oversizedWithoutToc === 0) {
881
1240
  issues.push({
882
1241
  id: "structure.references.toc",
1242
+ checkId: "structure:toc",
883
1243
  title: "Reference File Navigation",
884
1244
  status: "pass",
885
1245
  message: "No oversized reference files missing a table of contents."
@@ -888,6 +1248,7 @@ async function runStructureChecks(context) {
888
1248
  } else {
889
1249
  issues.push({
890
1250
  id: "structure.references.toc",
1251
+ checkId: "structure:toc",
891
1252
  title: "Reference File Navigation",
892
1253
  status: "pass",
893
1254
  message: "No references/ directory found, so no long reference files to validate."
@@ -916,6 +1277,7 @@ async function runStructureChecks(context) {
916
1277
  if (missing.length > 0) {
917
1278
  issues.push({
918
1279
  id: `structure.${category.key}.exists`,
1280
+ checkId: "structure:references",
919
1281
  title: category.title,
920
1282
  status: "fail",
921
1283
  message: `Missing referenced ${category.key} file(s): ${missing.join(", ")}`,
@@ -924,6 +1286,7 @@ async function runStructureChecks(context) {
924
1286
  } else {
925
1287
  issues.push({
926
1288
  id: `structure.${category.key}.exists`,
1289
+ checkId: "structure:references",
927
1290
  title: category.title,
928
1291
  status: "pass",
929
1292
  message: `All referenced ${category.key} files exist.`
@@ -934,6 +1297,7 @@ async function runStructureChecks(context) {
934
1297
  if (missingGeneric.length > 0) {
935
1298
  issues.push({
936
1299
  id: "structure.relative-links.broken",
1300
+ checkId: "structure:references",
937
1301
  title: "Relative Links",
938
1302
  status: "fail",
939
1303
  message: `Broken relative path reference(s): ${missingGeneric.join(", ")}`,
@@ -942,6 +1306,7 @@ async function runStructureChecks(context) {
942
1306
  } else {
943
1307
  issues.push({
944
1308
  id: "structure.relative-links.broken",
1309
+ checkId: "structure:references",
945
1310
  title: "Relative Links",
946
1311
  status: "pass",
947
1312
  message: "No broken generic relative file references were found."
@@ -971,12 +1336,20 @@ function summarizeIssues(issues) {
971
1336
  }
972
1337
  return summary;
973
1338
  }
974
- async function runLinter(inputPath) {
// Decide whether a lint report counts as a failing run.
// - report: object whose summary.failures and summary.warnings counts are read.
// - failOn: when exactly "warn", warnings also make the run fail; with any other
//   value only failures do.
// Returns true when the report fails under that policy, false otherwise.
1339
+ function lintFails(report, failOn) {
1340
+ if (report.summary.failures > 0) {
1341
+ return true;
1342
+ }
1343
+ return failOn === "warn" && report.summary.warnings > 0;
1344
+ }
1345
+ async function runLinter(inputPath, options = {}) {
975
1346
  const skill = await loadSkillFile(inputPath);
976
1347
  const frontmatter = parseFrontmatter(skill.raw);
1348
+ const suppressedCheckIds = new Set(options.suppress ?? []);
977
1349
  const context = {
978
1350
  skill,
979
- frontmatter
1351
+ frontmatter,
1352
+ suppressedCheckIds
980
1353
  };
981
1354
  const issues = [];
982
1355
  issues.push(...runFrontmatterChecks(context));
@@ -985,10 +1358,11 @@ async function runLinter(inputPath) {
985
1358
  issues.push(...runSecurityChecks(context));
986
1359
  issues.push(...await runDisclosureChecks(context));
987
1360
  issues.push(...runCompatibilityChecks(context));
1361
+ const filteredIssues = issues.filter((issue) => !suppressedCheckIds.has(issue.checkId));
988
1362
  return {
989
1363
  target: inputPath,
990
- issues,
991
- summary: summarizeIssues(issues)
1364
+ issues: filteredIssues,
1365
+ summary: summarizeIssues(filteredIssues)
992
1366
  };
993
1367
  }
994
1368
 
@@ -1004,6 +1378,14 @@ function renderIssueLine(issue, c) {
1004
1378
  return ` ${label} ${issue.title}
1005
1379
  ${issue.message}${detail}`;
1006
1380
  }
// Sum the lengths of the optional skippedPatterns arrays over every issue whose
// checkId starts with "security:". Issues without skippedPatterns contribute 0.
// Returns the total as a number (0 for an empty issue list).
// NOTE(review): issue.checkId is dereferenced without a guard, so an issue that
// lacks checkId would throw here — confirm every check populates it.
1381
+ function countSkippedSecurityPatterns(issues) {
1382
+ return issues.reduce((total, issue) => {
1383
+ if (!issue.checkId.startsWith("security:")) {
1384
+ return total;
1385
+ }
1386
+ return total + (issue.skippedPatterns?.length ?? 0);
1387
+ }, 0);
1388
+ }
1007
1389
  function renderLintReport(report, enableColor) {
1008
1390
  const c = getChalkInstance(enableColor);
1009
1391
  const { passed, warnings, failures, total } = report.summary;
@@ -1016,8 +1398,11 @@ function renderLintReport(report, enableColor) {
1016
1398
  `\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518`
1017
1399
  ];
1018
1400
  const renderedIssues = report.issues.map((issue) => renderIssueLine(issue, c)).join("\n");
1401
+ const skippedSecurityPatterns = countSkippedSecurityPatterns(report.issues);
1402
+ const infoLine = skippedSecurityPatterns > 0 ? `
1403
+ ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
1019
1404
  return `${headerLines.join("\n")}
1020
- ${renderedIssues}`;
1405
+ ${renderedIssues}${infoLine}`;
1021
1406
  }
1022
1407
  function formatPercent(value) {
1023
1408
  return `${(value * 100).toFixed(1)}%`;
@@ -1105,6 +1490,10 @@ function renderCheckReport(result, enableColor, verbose) {
1105
1490
  for (const issue of lintIssues) {
1106
1491
  lines.push(renderIssueLine(issue, c));
1107
1492
  }
1493
+ const skippedSecurityPatterns = countSkippedSecurityPatterns(result.lint.issues);
1494
+ if (skippedSecurityPatterns > 0) {
1495
+ lines.push(` ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
1496
+ }
1108
1497
  lines.push("");
1109
1498
  lines.push("Trigger");
1110
1499
  if (result.trigger) {
@@ -1159,107 +1548,213 @@ function renderCheckReport(result, enableColor, verbose) {
1159
1548
  return lines.join("\n");
1160
1549
  }
1161
1550
 
1162
- // src/reporters/json.ts
1163
- function renderJson(value) {
1164
- return JSON.stringify(value, null, 2);
1165
- }
1166
-
1167
1551
  // src/commands/common.ts
1168
- function getGlobalCliOptions(command) {
1169
- const options = command.optsWithGlobals();
1170
- return {
1171
- json: Boolean(options.json),
1172
- color: options.color !== false
1173
- };
1174
- }
1175
- function writeResult(value, asJson) {
1176
- if (asJson) {
1177
- process.stdout.write(`${renderJson(value)}
1178
- `);
1179
- return;
1180
- }
1181
- process.stdout.write(`${String(value)}
1182
- `);
1183
- }
1184
- function writeError(error, asJson) {
1185
- const message = error instanceof Error ? error.message : String(error);
1186
- if (asJson) {
1187
- process.stdout.write(`${renderJson({ error: message })}
1188
- `);
1189
- return;
1190
- }
1191
- process.stderr.write(`Error: ${message}
1192
- `);
1193
- }
1194
-
1195
- // src/commands/lint.ts
1196
- function registerLintCommand(program) {
1197
- program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").action(async (targetPath, _commandOptions, command) => {
1198
- const globalOptions = getGlobalCliOptions(command);
1199
- try {
1200
- const report = await runLinter(targetPath);
1201
- if (globalOptions.json) {
1202
- writeResult(report, true);
1203
- } else {
1204
- writeResult(renderLintReport(report, globalOptions.color), false);
1205
- }
1206
- if (report.summary.failures > 0) {
1207
- process.exitCode = 1;
1208
- }
1209
- } catch (error) {
1210
- writeError(error, globalOptions.json);
1211
- process.exitCode = 2;
1212
- }
1213
- });
1214
- }
1552
+ import fs5 from "node:fs/promises";
1553
+ import { z as z5 } from "zod";
1215
1554
 
1216
- // src/commands/trigger.ts
1217
- import ora from "ora";
1555
+ // src/core/eval-runner.ts
1218
1556
  import { z as z3 } from "zod";
1219
1557
 
1220
- // src/core/trigger-tester.ts
1558
+ // src/core/grader.ts
1221
1559
  import { z as z2 } from "zod";
1222
- var triggerQuerySchema = z2.object({
1223
- query: z2.string().min(1),
1224
- should_trigger: z2.boolean()
// Zod schema for one graded assertion: the assertion text, a boolean verdict
// (passed), and a string of supporting evidence.
1560
+ var gradedAssertionSchema = z2.object({
1561
+ assertion: z2.string(),
1562
+ passed: z2.boolean(),
1563
+ evidence: z2.string()
1225
1564
  });
1226
- var triggerQueryArraySchema = z2.array(triggerQuerySchema);
1227
- var FAKE_SKILLS = [
1228
- { name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
1229
- { name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
1230
- { name: "db-migrator", description: "Plans and generates safe database migration scripts with rollback guidance." },
1231
- { name: "bug-repro", description: "Reproduces reported bugs by building deterministic minimal test cases." },
1232
- { name: "release-notes", description: "Drafts release notes from commits and PR metadata for stakeholders." },
1233
- { name: "log-analyzer", description: "Analyzes service logs to identify error clusters and likely root causes." },
1234
- { name: "performance-audit", description: "Finds hotspots in runtime and suggests profiling-driven optimizations." },
1235
- { name: "security-audit", description: "Checks code and config for common security vulnerabilities and risky defaults." },
1236
- { name: "refactor-planner", description: "Breaks large refactors into safe incremental steps with validation plans." },
1237
- { name: "schema-designer", description: "Designs JSON schemas and validates data contracts for integrations." },
1238
- { name: "docs-writer", description: "Writes developer documentation, tutorials, and API usage examples." },
1239
- { name: "cli-scaffolder", description: "Creates CLI project skeletons with argument parsing and help text." },
1240
- { name: "incident-triage", description: "Triage production incidents with severity tagging and next-action checklists." },
1241
- { name: "test-generator", description: "Generates unit and integration test cases from feature requirements." },
1242
- { name: "prompt-tuner", description: "Improves prompts for reliability, formatting, and failure handling." }
1243
- ];
1244
- function shuffle(values) {
1245
- const copy = [...values];
1246
- for (let index = copy.length - 1; index > 0; index -= 1) {
1247
- const swapIndex = Math.floor(Math.random() * (index + 1));
1248
- [copy[index], copy[swapIndex]] = [copy[swapIndex], copy[index]];
1249
- }
1250
- return copy;
1251
- }
1252
- function sample(values, count) {
1253
- return shuffle(values).slice(0, Math.max(0, Math.min(count, values.length)));
1254
- }
1255
- function parseJsonArrayFromModelOutput(raw) {
// Zod schema for the whole grader reply: an object with an `assertions` array
// whose elements match gradedAssertionSchema.
1565
+ var graderOutputSchema = z2.object({
1566
+ assertions: z2.array(gradedAssertionSchema)
1567
+ });
// Parse a JSON value out of raw model output.
// If the trimmed text starts with "{" and ends with "}", it is parsed as-is.
// Otherwise the span from the first "{" to the last "}" is parsed, so any text
// before or after that span is dropped.
// Throws Error("Grader did not return a JSON object.") when no such span exists.
// Errors raised by JSON.parse are not caught here.
// (Lines prefixed with "-" in this span are the previous version's array variant
// and are not part of this function.)
1568
+ function extractJsonObject(raw) {
1256
1569
  const trimmed = raw.trim();
1257
- if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
1570
+ if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
1258
1571
  return JSON.parse(trimmed);
1259
1572
  }
1260
- const start = trimmed.indexOf("[");
1261
- const end = trimmed.lastIndexOf("]");
1262
- if (start >= 0 && end > start) {
1573
+ const start = trimmed.indexOf("{");
1574
+ const end = trimmed.lastIndexOf("}");
1575
+ if (start >= 0 && end > start) {
1576
+ return JSON.parse(trimmed.slice(start, end + 1));
1577
+ }
1578
+ throw new Error("Grader did not return a JSON object.");
1579
+ }
// Ask the provider to grade a model response against a list of assertions.
// Fields read from options: provider (its sendMessage is awaited), model,
// skillName, skillBody, userPrompt, modelResponse and the optional assertions.
// When options.assertions is missing or empty, three generic default assertions
// are graded instead.
// Returns the `assertions` array of the validated grader output.
// Throws when the reply contains no JSON object (via extractJsonObject) or when
// it does not match graderOutputSchema.
1580
+ async function gradeResponse(options) {
1581
+ const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
1582
+ "The response follows the skill instructions faithfully.",
1583
+ "The response is well-structured and actionable.",
1584
+ "The response addresses the user prompt directly."
1585
+ ];
1586
+ const systemPrompt = [
1587
+ "You are a strict evaluator for agent skill outputs.",
1588
+ "Assess each assertion and return JSON only.",
1589
+ 'Required output format: {"assertions":[{"assertion":"...","passed":true|false,"evidence":"..."}]}'
1590
+ ].join(" ");
1591
+ const userPrompt = [
1592
+ `Skill: ${options.skillName}`,
1593
+ "Skill instructions:",
1594
+ options.skillBody,
1595
+ "",
1596
+ `User prompt: ${options.userPrompt}`,
1597
+ "",
1598
+ "Model response:",
1599
+ options.modelResponse,
1600
+ "",
1601
+ "Assertions to evaluate:",
1602
+ assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
1603
+ ].join("\n");
1604
+ const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
1605
+ const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
1606
+ if (!parsed.success) {
1607
+ throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
1608
+ }
1609
+ return parsed.data.assertions;
1610
+ }
1611
+
1612
+ // src/core/eval-runner.ts
// Zod schema for one eval prompt: a non-empty prompt string plus an optional
// list of non-empty assertion strings. evalPromptArraySchema is an array of them.
1613
+ var evalPromptSchema = z3.object({
1614
+ prompt: z3.string().min(1),
1615
+ assertions: z3.array(z3.string().min(1)).optional()
1616
+ });
1617
+ var evalPromptArraySchema = z3.array(evalPromptSchema);
// Parse a JSON value out of raw model output, array variant.
// If the trimmed text starts with "[" and ends with "]", it is parsed as-is.
// Otherwise the span from the first "[" to the last "]" is parsed, so any text
// before or after that span is dropped.
// Throws Error("Model did not return a JSON array.") when no such span exists.
// Errors raised by JSON.parse are not caught here.
1618
+ function extractJsonArray(raw) {
1619
+ const trimmed = raw.trim();
1620
+ if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
1621
+ return JSON.parse(trimmed);
1622
+ }
1623
+ const start = trimmed.indexOf("[");
1624
+ const end = trimmed.lastIndexOf("]");
1625
+ if (start >= 0 && end > start) {
1626
+ return JSON.parse(trimmed.slice(start, end + 1));
1627
+ }
1628
+ throw new Error("Model did not return a JSON array.");
1629
+ }
// Ask the provider to generate `count` eval prompts for a skill.
// The request is built from skill.frontmatter.name, skill.frontmatter.description
// and skill.content; `model` is forwarded to provider.sendMessage.
// Returns the validated array of prompt objects.
// Throws when the reply has no JSON array (via extractJsonArray), when it fails
// evalPromptArraySchema, or when it does not contain exactly `count` prompts.
1630
+ async function generatePrompts(skill, provider, model, count) {
1631
+ const systemPrompt = [
1632
+ "You generate realistic evaluation prompts for an agent skill.",
1633
+ "Return JSON only.",
1634
+ 'Format: [{"prompt":"...","assertions":["...", "..."]}]',
1635
+ "Assertions should be concrete and checkable."
1636
+ ].join(" ");
1637
+ const userPrompt = [
1638
+ `Skill name: ${skill.frontmatter.name}`,
1639
+ `Skill description: ${skill.frontmatter.description}`,
1640
+ "Skill instructions:",
1641
+ skill.content,
1642
+ "",
1643
+ `Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
1644
+ "Each prompt should include 2-4 assertions."
1645
+ ].join("\n");
1646
+ const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
1647
+ const parsed = evalPromptArraySchema.safeParse(extractJsonArray(raw));
1648
+ if (!parsed.success) {
1649
+ throw new Error(`Failed to parse generated eval prompts: ${parsed.error.issues[0]?.message ?? "invalid prompt JSON"}`);
1650
+ }
1651
+ if (parsed.data.length !== count) {
1652
+ throw new Error(`Expected ${count} prompts, got ${parsed.data.length}.`);
1653
+ }
1654
+ return parsed.data;
1655
+ }
// Run an evaluation of a skill: obtain a model response for each prompt and
// grade it with gradeResponse.
// Prompts come from options.prompts when it is non-empty (validated with
// evalPromptArraySchema.parse, which throws on invalid input); otherwise they are
// generated with generatePrompts using options.numRuns as the count.
// Prompts are handled one at a time: each iteration awaits the response and then
// the grading before moving on.
// Returns an object with skillName, model, graderModel, provider name, the
// prompts, per-prompt results and a summary of prompt/assertion totals.
1656
+ async function runEval(skill, options) {
1657
+ const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, options.numRuns);
1658
+ const results = [];
1659
+ for (const evalPrompt of prompts) {
// The system prompt embeds the raw SKILL.md text (skill.raw).
1660
+ const systemPrompt = [
1661
+ "You are an AI assistant with an activated skill.",
1662
+ "Follow this SKILL.md content exactly where applicable.",
1663
+ "",
1664
+ skill.raw
1665
+ ].join("\n");
1666
+ const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
// Grading uses options.graderModel, which may differ from the model under test.
1667
+ const gradedAssertions = await gradeResponse({
1668
+ provider: options.provider,
1669
+ model: options.graderModel,
1670
+ skillName: skill.frontmatter.name,
1671
+ skillBody: skill.content,
1672
+ userPrompt: evalPrompt.prompt,
1673
+ modelResponse: response,
1674
+ assertions: evalPrompt.assertions
1675
+ });
1676
+ const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
// responseSummary is the first 200 characters of the response; the full text is kept too.
1677
+ results.push({
1678
+ prompt: evalPrompt.prompt,
1679
+ assertions: gradedAssertions,
1680
+ responseSummary: response.slice(0, 200),
1681
+ response,
1682
+ passedAssertions: passedAssertions2,
1683
+ totalAssertions: gradedAssertions.length
1684
+ });
1685
+ }
1686
+ const totalAssertions = results.reduce((total, result) => total + result.totalAssertions, 0);
1687
+ const passedAssertions = results.reduce((total, result) => total + result.passedAssertions, 0);
1688
+ return {
1689
+ skillName: skill.frontmatter.name,
1690
+ model: options.model,
1691
+ graderModel: options.graderModel,
1692
+ provider: options.provider.name,
1693
+ prompts,
1694
+ results,
1695
+ summary: {
1696
+ totalPrompts: results.length,
1697
+ totalAssertions,
1698
+ passedAssertions
1699
+ }
1700
+ };
1701
+ }
1702
+
1703
+ // src/core/trigger-tester.ts
1704
+ import { z as z4 } from "zod";
// Zod schema for one trigger-test query: non-empty query text and a boolean
// should_trigger flag. triggerQueryArraySchema is an array of them.
1705
+ var triggerQuerySchema = z4.object({
1706
+ query: z4.string().min(1),
1707
+ should_trigger: z4.boolean()
1708
+ });
1709
+ var triggerQueryArraySchema = z4.array(triggerQuerySchema);
// Fixed catalogue of fifteen decoy skill entries, each with a name and a
// one-sentence description. runTriggerTest samples from this list and mixes the
// picks with the real skill before asking the model to choose one.
1710
+ var FAKE_SKILLS = [
1711
+ { name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
1712
+ { name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
1713
+ { name: "db-migrator", description: "Plans and generates safe database migration scripts with rollback guidance." },
1714
+ { name: "bug-repro", description: "Reproduces reported bugs by building deterministic minimal test cases." },
1715
+ { name: "release-notes", description: "Drafts release notes from commits and PR metadata for stakeholders." },
1716
+ { name: "log-analyzer", description: "Analyzes service logs to identify error clusters and likely root causes." },
1717
+ { name: "performance-audit", description: "Finds hotspots in runtime and suggests profiling-driven optimizations." },
1718
+ { name: "security-audit", description: "Checks code and config for common security vulnerabilities and risky defaults." },
1719
+ { name: "refactor-planner", description: "Breaks large refactors into safe incremental steps with validation plans." },
1720
+ { name: "schema-designer", description: "Designs JSON schemas and validates data contracts for integrations." },
1721
+ { name: "docs-writer", description: "Writes developer documentation, tutorials, and API usage examples." },
1722
+ { name: "cli-scaffolder", description: "Creates CLI project skeletons with argument parsing and help text." },
1723
+ { name: "incident-triage", description: "Triage production incidents with severity tagging and next-action checklists." },
1724
+ { name: "test-generator", description: "Generates unit and integration test cases from feature requirements." },
1725
+ { name: "prompt-tuner", description: "Improves prompts for reliability, formatting, and failure handling." }
1726
+ ];
// Build a seeded pseudo-random number generator.
// The returned function advances the closed-over 32-bit integer state on every
// call and returns an unsigned 32-bit value divided by 2^32, i.e. a number in
// [0, 1). The sequence depends only on the initial seed and the call count.
1727
+ function mulberry32(seed) {
1728
+ return () => {
1729
+ seed |= 0;
1730
+ seed = seed + 1831565813 | 0;
1731
+ let t = Math.imul(seed ^ seed >>> 15, 1 | seed);
1732
+ t = t + Math.imul(t ^ t >>> 7, 61 | t) ^ t;
1733
+ return ((t ^ t >>> 14) >>> 0) / 4294967296;
1734
+ };
1735
+ }
// Pick the random source: a seeded mulberry32 generator when seed is not
// undefined (so a seed of 0 still counts), otherwise Math.random itself.
1736
+ function createRng(seed) {
1737
+ return seed !== void 0 ? mulberry32(seed) : Math.random;
1738
+ }
// Return a shuffled copy of values; the input array itself is not modified.
// Fisher-Yates: walk from the last index down to 1, swapping each slot with a
// randomly chosen slot at or below it.
// rng is called once per step and is expected to return a number in [0, 1).
1739
+ function shuffle(values, rng) {
1740
+ const copy = [...values];
1741
+ for (let index = copy.length - 1; index > 0; index -= 1) {
1742
+ const swapIndex = Math.floor(rng() * (index + 1));
1743
+ [copy[index], copy[swapIndex]] = [copy[swapIndex], copy[index]];
1744
+ }
1745
+ return copy;
1746
+ }
// Return randomly chosen elements of values: shuffle a copy, then keep the first
// `count` entries, with count clamped to the range 0..values.length.
1747
+ function sample(values, count, rng) {
1748
+ return shuffle(values, rng).slice(0, Math.max(0, Math.min(count, values.length)));
1749
+ }
1750
+ function parseJsonArrayFromModelOutput(raw) {
1751
+ const trimmed = raw.trim();
1752
+ if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
1753
+ return JSON.parse(trimmed);
1754
+ }
1755
+ const start = trimmed.indexOf("[");
1756
+ const end = trimmed.lastIndexOf("]");
1757
+ if (start >= 0 && end > start) {
1263
1758
  const possibleJson = trimmed.slice(start, end + 1);
1264
1759
  return JSON.parse(possibleJson);
1265
1760
  }
@@ -1362,19 +1857,20 @@ function buildSuggestions(metrics) {
1362
1857
  return suggestions;
1363
1858
  }
1364
1859
  async function runTriggerTest(skill, options) {
1860
+ const rng = createRng(options.seed);
1365
1861
  const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
1366
1862
  const results = [];
1367
1863
  const skillName = skill.frontmatter.name;
1368
1864
  for (const testQuery of queries) {
1369
- const fakeCount = 5 + Math.floor(Math.random() * 4);
1370
- const fakeSkills = sample(FAKE_SKILLS, fakeCount);
1865
+ const fakeCount = 5 + Math.floor(rng() * 5);
1866
+ const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
1371
1867
  const allSkills = shuffle([
1372
1868
  ...fakeSkills,
1373
1869
  {
1374
1870
  name: skill.frontmatter.name,
1375
1871
  description: skill.frontmatter.description
1376
1872
  }
1377
- ]);
1873
+ ], rng);
1378
1874
  const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
1379
1875
  const systemPrompt = [
1380
1876
  "You are selecting one skill to activate for a user query.",
@@ -1403,6 +1899,7 @@ async function runTriggerTest(skill, options) {
1403
1899
  skillName,
1404
1900
  model: options.model,
1405
1901
  provider: options.provider.name,
1902
+ seed: options.seed,
1406
1903
  queries,
1407
1904
  cases: results,
1408
1905
  metrics,
@@ -1410,59 +1907,502 @@ async function runTriggerTest(skill, options) {
1410
1907
  };
1411
1908
  }
1412
1909
 
1413
- // src/utils/config.ts
1414
- function resolveApiKey(provider, override) {
1415
- if (override && override.trim() !== "") {
1416
- return override.trim();
1910
+ // src/reporters/json.ts
// Serialize a value as JSON text with two-space indentation.
1911
+ function renderJson(value) {
1912
+ return JSON.stringify(value, null, 2);
1913
+ }
1914
+
1915
+ // src/commands/common.ts
// Module-level state and schemas for the command helpers.
// executionContextByCommand: WeakMap from a command object to its execution
//   context; written by setCommandExecutionContext, read by
//   getCommandExecutionContext and getGlobalCliOptions.
// singleEvalPromptSchema: one prompt object (non-empty prompt, optional list of
//   non-empty assertions).
// promptStringArraySchema: array of non-empty prompt strings.
// assertionsObjectSchema: object carrying an `assertions` array of non-empty strings.
1916
+ var executionContextByCommand = /* @__PURE__ */ new WeakMap();
1917
+ var singleEvalPromptSchema = z5.object({
1918
+ prompt: z5.string().min(1),
1919
+ assertions: z5.array(z5.string().min(1)).optional()
1920
+ });
1921
+ var promptStringArraySchema = z5.array(z5.string().min(1));
1922
+ var assertionsObjectSchema = z5.object({
1923
+ assertions: z5.array(z5.string().min(1))
1924
+ });
// Try to parse raw as JSON. If JSON.parse throws, return raw unchanged, so
// callers receive either the parsed value or the original text.
// (Lines prefixed with "-" in this span belong to the removed resolveApiKey
// helper and are not part of this function.)
1925
+ function parseJsonIfPossible(raw) {
1926
+ try {
1927
+ return JSON.parse(raw);
1928
+ } catch {
1929
+ return raw;
1417
1930
  }
1418
- if (provider === "anthropic") {
1419
- const envValue2 = process.env.ANTHROPIC_API_KEY?.trim();
1420
- if (envValue2) {
1421
- return envValue2;
1931
+ }
// Coerce loosely shaped prompt input into an array of prompt objects.
// Shapes are tried in this order:
//   1. an array matching evalPromptArraySchema (returned as validated);
//   2. a single prompt object (wrapped in a one-element array);
//   3. an array of non-empty strings (each becomes { prompt });
//   4. a non-blank string (trimmed and wrapped as a single prompt).
// Throws an Error naming sourceLabel when none of these match.
1932
+ function normalizeEvalPrompts(value, sourceLabel) {
1933
+ const promptArray = evalPromptArraySchema.safeParse(value);
1934
+ if (promptArray.success) {
1935
+ return promptArray.data;
1936
+ }
1937
+ const singlePrompt = singleEvalPromptSchema.safeParse(value);
1938
+ if (singlePrompt.success) {
1939
+ return [singlePrompt.data];
1940
+ }
1941
+ const promptStrings = promptStringArraySchema.safeParse(value);
1942
+ if (promptStrings.success) {
1943
+ return promptStrings.data.map((prompt) => ({ prompt }));
1944
+ }
1945
+ if (typeof value === "string" && value.trim() !== "") {
1946
+ return [{ prompt: value.trim() }];
1947
+ }
1948
+ throw new Error(
1949
+ `Invalid eval prompt source at ${sourceLabel}. Expected plain text, a JSON prompt object, or a JSON array of prompts.`
1950
+ );
1951
+ }
// Turn newline-delimited text into a list of assertion strings: split on LF or
// CRLF, trim each line, strip a leading "-" or "*" bullet and then a leading
// "N." number (each only when followed by whitespace), and drop empty lines.
1952
+ function parseAssertionsFromText(raw) {
1953
+ return raw.split(/\r?\n/).map((line) => line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "")).filter((line) => line.length > 0);
1954
+ }
// Coerce loosely shaped assertion input into an array of assertion strings.
// Shapes are tried in this order:
//   1. an array of non-empty strings;
//   2. an object with an `assertions` array of non-empty strings;
//   3. a string that parseAssertionsFromText turns into at least one assertion.
// Throws an Error naming sourceLabel when none of these match, including a
// string that yields no assertions.
// (Lines prefixed with "-" in this span belong to the removed resolveApiKey
// helper and are not part of this function.)
1955
+ function normalizeAssertions(value, sourceLabel) {
1956
+ const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
1957
+ if (assertionArray.success) {
1958
+ return assertionArray.data;
1959
+ }
1960
+ const assertionObject = assertionsObjectSchema.safeParse(value);
1961
+ if (assertionObject.success) {
1962
+ return assertionObject.data.assertions;
1963
+ }
1964
+ if (typeof value === "string") {
1965
+ const assertions = parseAssertionsFromText(value);
1966
+ if (assertions.length > 0) {
1967
+ return assertions;
1422
1968
  }
1423
- throw new Error(
1424
- "No Anthropic API key found. Set ANTHROPIC_API_KEY environment variable or pass --api-key flag."
1425
- );
1426
1969
  }
1427
- const envValue = process.env.OPENAI_API_KEY?.trim();
1428
- if (envValue) {
1429
- return envValue;
1970
+ throw new Error(
1971
+ `Invalid eval assertions source at ${sourceLabel}. Expected JSON string[], { assertions: string[] }, or newline-delimited text.`
1972
+ );
1973
+ }
// Associate an execution context with a command object in the module-level
// WeakMap, replacing any context stored for that command before.
1974
+ function setCommandExecutionContext(command, context) {
1975
+ executionContextByCommand.set(command, context);
1976
+ }
// Return the execution context stored for a command.
// Throws an Error that names the command (via command.name()) when no context
// was stored for it.
// (The line prefixed with "-" in this span belongs to the removed resolveApiKey
// helper and is not part of this function.)
1977
+ function getCommandExecutionContext(command) {
1978
+ const context = executionContextByCommand.get(command);
1979
+ if (!context) {
1980
+ throw new Error(`Missing resolved config for command '${command.name()}'.`);
1430
1981
  }
1431
- throw new Error("No OpenAI API key found. Set OPENAI_API_KEY environment variable or pass --api-key flag.");
1982
+ return context;
1432
1983
  }
1433
-
1434
- // src/providers/anthropic.ts
1435
- import Anthropic from "@anthropic-ai/sdk";
1436
- function wait(ms) {
1437
- return new Promise((resolve) => {
1438
- setTimeout(resolve, ms);
1439
- });
// Return the `config` property of the command's stored execution context.
// Throws, via getCommandExecutionContext, when no context was stored.
1984
+ function getResolvedConfig(command) {
1985
+ return getCommandExecutionContext(command).config;
1440
1986
  }
1441
- function isRateLimitError(error) {
1442
- if (!error || typeof error !== "object") {
1443
- return false;
// Collect the global CLI flags for a command.
// json: taken from the stored execution context's config.json when that is
//   available; otherwise falls back to Boolean(options.json) from the CLI.
// color: true unless the parsed CLI options set color to exactly false.
// Reads the WeakMap directly, so a command with no stored context does not throw.
1987
+ function getGlobalCliOptions(command) {
1988
+ const options = command.optsWithGlobals();
1989
+ const context = executionContextByCommand.get(command);
1990
+ return {
1991
+ json: context?.config.json ?? Boolean(options.json),
1992
+ color: options.color !== false
1993
+ };
1994
+ }
// Load trigger queries from a file: read it with readJsonFile (defined elsewhere
// in this bundle) and validate the result against triggerQueryArraySchema.
// Returns the validated array. Throws an Error starting with
// "Invalid --queries JSON:" when validation fails.
// (Lines prefixed with "-" in this span belong to the removed isRateLimitError
// helper and are not part of this function.)
1995
+ async function loadTriggerQueriesFile(filePath) {
1996
+ const loaded = await readJsonFile(filePath);
1997
+ const parsed = triggerQueryArraySchema.safeParse(loaded);
1998
+ if (!parsed.success) {
1999
+ throw new Error(`Invalid --queries JSON: ${parsed.error.issues[0]?.message ?? "unknown format issue"}`);
1444
2000
  }
1445
- const maybeStatus = error.status;
1446
- if (maybeStatus === 429) {
1447
- return true;
2001
+ return parsed.data;
2002
+ }
// Load eval prompts from a file: read it with readJsonFile (defined elsewhere in
// this bundle) and validate the result against evalPromptArraySchema.
// Returns the validated array. Throws an Error starting with
// "Invalid --prompts JSON:" when validation fails.
// (Lines prefixed with "-" in this span belong to the removed isRateLimitError
// helper and are not part of this function.)
2003
+ async function loadEvalPromptsJson(filePath) {
2004
+ const loaded = await readJsonFile(filePath);
2005
+ const parsed = evalPromptArraySchema.safeParse(loaded);
2006
+ if (!parsed.success) {
2007
+ throw new Error(`Invalid --prompts JSON: ${parsed.error.issues[0]?.message ?? "unknown format issue"}`);
1448
2008
  }
1449
- const maybeMessage = error.message;
1450
- if (typeof maybeMessage === "string" && /rate limit/i.test(maybeMessage)) {
1451
- return true;
2009
+ return parsed.data;
2010
+ }
// Build the eval prompt list from the files named in the resolved config
// (config.eval.promptFile and config.eval.assertionsFile).
// Returns undefined when neither file is configured.
// Throws when assertionsFile is set without promptFile, when the command has no
// stored execution context, when a file cannot be read, or when its content
// cannot be normalized.
// Both files are read as UTF-8 text and parsed as JSON when possible; otherwise
// they are treated as plain text.
2011
+ async function loadConfiguredEvalPrompts(command) {
2012
+ const context = getCommandExecutionContext(command);
2013
+ const promptFile = context.config.eval.promptFile;
2014
+ const assertionsFile = context.config.eval.assertionsFile;
2015
+ if (!promptFile && !assertionsFile) {
2016
+ return void 0;
2017
+ }
2018
+ if (!promptFile && assertionsFile) {
2019
+ throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
2020
+ }
2021
+ const promptRaw = await fs5.readFile(promptFile, "utf8");
2022
+ let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptRaw), promptFile);
// With an assertions file, every prompt gets its own copy of that list, which
// replaces any assertions the prompt file supplied.
2023
+ if (assertionsFile) {
2024
+ const assertionsRaw = await fs5.readFile(assertionsFile, "utf8");
2025
+ const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
2026
+ prompts = prompts.map((prompt) => ({
2027
+ prompt: prompt.prompt,
2028
+ assertions: [...assertions]
2029
+ }));
2030
+ }
// A single prompt is repeated numRuns times only when eval.numRuns was set in the
// config file itself (context.configFile), not merely present in the resolved config.
2031
+ const numRunsWasExplicit = context.configFile?.eval?.numRuns !== void 0;
2032
+ if (numRunsWasExplicit && prompts.length === 1 && context.config.eval.numRuns > 1) {
2033
+ const promptTemplate = prompts[0];
2034
+ prompts = Array.from({ length: context.config.eval.numRuns }, () => ({
2035
+ prompt: promptTemplate.prompt,
2036
+ assertions: promptTemplate.assertions ? [...promptTemplate.assertions] : void 0
2037
+ }));
2038
+ }
2039
+ return prompts;
2040
+ }
// Write a command result to stdout followed by a newline.
// asJson true: the value is serialized with renderJson.
// asJson false: the value is converted with String().
// (The line prefixed with "-" in this span belongs to the removed
// isRateLimitError helper and is not part of this function.)
2041
+ function writeResult(value, asJson) {
2042
+ if (asJson) {
2043
+ process.stdout.write(`${renderJson(value)}
2044
+ `);
2045
+ return;
1452
2046
  }
1453
- return false;
2047
+ process.stdout.write(`${String(value)}
2048
+ `);
1454
2049
  }
1455
- var AnthropicProvider = class {
1456
- name = "anthropic";
1457
- client;
1458
- constructor(apiKey) {
1459
- this.client = new Anthropic({ apiKey });
// Report an error to the user.
// The message is error.message for Error instances and String(error) otherwise.
// asJson true: writes the JSON object { error: message } to stdout (not stderr).
// asJson false: writes "Error: <message>" to stderr.
// Each output is followed by a newline.
// (Lines prefixed with "-" in this span belong to the removed AnthropicProvider
// class and are not part of this function.)
2050
+ function writeError(error, asJson) {
2051
+ const message = error instanceof Error ? error.message : String(error);
2052
+ if (asJson) {
2053
+ process.stdout.write(`${renderJson({ error: message })}
2054
+ `);
2055
+ return;
1460
2056
  }
1461
- async sendMessage(systemPrompt, userMessage, options) {
1462
- let lastError;
1463
- for (let attempt = 0; attempt < 3; attempt += 1) {
1464
- try {
1465
- const response = await this.client.messages.create({
2057
+ process.stderr.write(`Error: ${message}
2058
+ `);
2059
+ }
2060
+
2061
+ // src/commands/lint.ts
// Run the linter for targetPath and report the outcome.
// Fields read from options: suppress (forwarded to runLinter), json, color, failOn.
// Output: the raw report as JSON when options.json is set, otherwise the rendered
// text report.
// Exit code: set to 1 when lintFails(report, options.failOn) is true; set to 2
// when anything in the try block throws, after the error is written with writeError.
// Errors are not rethrown to the caller.
2062
+ async function handleLintCommand(targetPath, options) {
2063
+ try {
2064
+ const report = await runLinter(targetPath, { suppress: options.suppress });
2065
+ if (options.json) {
2066
+ writeResult(report, true);
2067
+ } else {
2068
+ writeResult(renderLintReport(report, options.color), false);
2069
+ }
2070
+ if (lintFails(report, options.failOn)) {
2071
+ process.exitCode = 1;
2072
+ }
2073
+ } catch (error) {
2074
+ writeError(error, options.json);
2075
+ process.exitCode = 2;
2076
+ }
2077
+ }
// Register the "lint" subcommand on the given program.
// The action reads the global CLI options and the resolved config for the
// command, then delegates to handleLintCommand, passing failOn and suppress from
// config.lint alongside the global options.
// getResolvedConfig runs outside handleLintCommand's try/catch, so a missing
// execution context rejects the action itself.
2078
+ function registerLintCommand(program) {
2079
+ program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").action(async (targetPath, _commandOptions, command) => {
2080
+ const globalOptions = getGlobalCliOptions(command);
2081
+ const config = getResolvedConfig(command);
2082
+ await handleLintCommand(targetPath, {
2083
+ ...globalOptions,
2084
+ failOn: config.lint.failOn,
2085
+ suppress: config.lint.suppress
2086
+ });
2087
+ });
2088
+ }
2089
+
2090
+ // src/commands/trigger.ts
2091
+ import ora from "ora";
2092
+ import { z as z7 } from "zod";
2093
+
2094
+ // src/utils/config.ts
2095
+ import fs6 from "node:fs/promises";
2096
+ import path5 from "node:path";
2097
+ import { z as z6 } from "zod";
// Zod schemas for the config as written by the user; every field is optional.
// providerNameSchema: "anthropic" or "openai".
// lintFailOnSchema: "error" or "warn".
// lintConfigSchema: failOn plus a suppress list of non-empty strings.
// triggerConfigSchema: numQueries (even integer, at least 2), threshold (0..1),
//   seed (integer).
// evalConfigSchema: numRuns (integer, at least 1), threshold (0..1), promptFile
//   and assertionsFile (non-empty strings).
// skilltestConfigSchema: top-level provider, model, json plus the three sections.
// All object schemas are declared with .strict() (presumably to reject unknown
// keys — confirm against the zod version in use); the trigger and eval sections
// additionally call .partial().
2098
+ var providerNameSchema = z6.enum(["anthropic", "openai"]);
2099
+ var lintFailOnSchema = z6.enum(["error", "warn"]);
2100
+ var lintConfigSchema = z6.object({
2101
+ failOn: lintFailOnSchema.optional(),
2102
+ suppress: z6.array(z6.string().min(1)).optional()
2103
+ }).strict();
2104
+ var triggerConfigSchema = z6.object({
2105
+ numQueries: z6.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
2106
+ threshold: z6.number().min(0).max(1).optional(),
2107
+ seed: z6.number().int().optional()
2108
+ }).strict().partial();
2109
+ var evalConfigSchema = z6.object({
2110
+ numRuns: z6.number().int().min(1).optional(),
2111
+ threshold: z6.number().min(0).max(1).optional(),
2112
+ promptFile: z6.string().min(1).optional(),
2113
+ assertionsFile: z6.string().min(1).optional()
2114
+ }).strict().partial();
2115
+ var skilltestConfigSchema = z6.object({
2116
+ provider: providerNameSchema.optional(),
2117
+ model: z6.string().min(1).optional(),
2118
+ json: z6.boolean().optional(),
2119
+ lint: lintConfigSchema.optional(),
2120
+ trigger: triggerConfigSchema.optional(),
2121
+ eval: evalConfigSchema.optional()
2122
+ }).strict();
2123
+ var resolvedSkilltestConfigSchema = z6.object({
2124
+ provider: providerNameSchema,
2125
+ model: z6.string().min(1),
2126
+ json: z6.boolean(),
2127
+ lint: z6.object({
2128
+ failOn: lintFailOnSchema,
2129
+ suppress: z6.array(z6.string().min(1))
2130
+ }),
2131
+ trigger: z6.object({
2132
+ numQueries: z6.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
2133
+ threshold: z6.number().min(0).max(1),
2134
+ seed: z6.number().int().optional()
2135
+ }),
2136
+ eval: z6.object({
2137
+ numRuns: z6.number().int().min(1),
2138
+ threshold: z6.number().min(0).max(1),
2139
+ promptFile: z6.string().min(1).optional(),
2140
+ assertionsFile: z6.string().min(1).optional()
2141
+ })
2142
+ });
2143
// Built-in fallback values used by mergeConfigLayers when neither the CLI nor
// a config file supplies a setting. `trigger.seed`, `eval.promptFile` and
// `eval.assertionsFile` intentionally have no default.
var DEFAULT_SKILLTEST_CONFIG = {
  provider: "anthropic",
  model: "claude-sonnet-4-5-20250929",
  json: false,
  lint: { failOn: "error", suppress: [] },
  trigger: { numQueries: 20, threshold: 0.8 },
  eval: { numRuns: 5, threshold: 0.9 }
};
2160
/**
 * Formats a validation issue path for display.
 *
 * @param {Array<string|number>} issuePath - Path segments of the issue.
 * @returns {string} Segments joined with ".", or "root" for an empty path.
 */
function formatIssuePath(issuePath) {
  return issuePath.length === 0 ? "root" : issuePath.map(String).join(".");
}
2166
/**
 * Builds a readable Error from a schema validation failure.
 *
 * Only the first entry of `error.issues` is reported. A missing path is shown
 * as "root" and a missing message as "Invalid config value.".
 *
 * @param {{issues: Array<{path?: Array<string|number>, message?: string}>}} error
 * @param {string} sourceLabel - Where the config came from (file path, "CLI flags", ...).
 * @returns {Error}
 */
function buildConfigValidationError(error, sourceLabel) {
  const firstIssue = error.issues[0];
  const location = formatIssuePath(firstIssue?.path ?? []);
  const detail = firstIssue?.message ?? "Invalid config value.";
  return new Error(`Invalid skilltest config in ${sourceLabel} at ${location}: ${detail}`);
}
2172
/**
 * Reads a file and parses its contents as JSON.
 *
 * The parsed value is returned as-is; callers are expected to validate its
 * shape. A leading UTF-8 byte-order mark is stripped before parsing, because
 * some editors add one and `JSON.parse` rejects it.
 *
 * @param {string} filePath - File to read.
 * @param {string} label - Name used for the file in error messages.
 * @returns {Promise<unknown>} The parsed JSON value.
 * @throws {Error} "Failed to read <label>: ..." when the file cannot be read,
 *   or "Invalid JSON in <label>: ..." when it cannot be parsed. In both cases
 *   the underlying error is attached as `cause`.
 */
async function readJsonObject(filePath, label) {
  let raw;
  try {
    raw = await fs6.readFile(filePath, "utf8");
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read ${label}: ${message}`, { cause: error });
  }
  // Drop a leading BOM so files saved as "UTF-8 with BOM" still parse.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  try {
    return JSON.parse(text);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid JSON in ${label}: ${message}`, { cause: error });
  }
}
2187
/**
 * Loads and validates a skilltest config stored in a standalone JSON file.
 *
 * @param {string} filePath - Candidate config file.
 * @returns {Promise<null | {configFile: object, sourcePath: string, sourceDirectory: string}>}
 *   `null` when `pathExists` reports the file is absent; otherwise the
 *   validated config, its path and its containing directory.
 * @throws {Error} When the file cannot be read or parsed, or fails
 *   `skilltestConfigSchema` validation.
 */
async function loadConfigFromJsonFile(filePath) {
  const exists = await pathExists(filePath);
  if (!exists) {
    return null;
  }
  const contents = await readJsonObject(filePath, filePath);
  const validation = skilltestConfigSchema.safeParse(contents);
  if (validation.success) {
    return {
      configFile: validation.data,
      sourcePath: filePath,
      sourceDirectory: path5.dirname(filePath)
    };
  }
  throw buildConfigValidationError(validation.error, filePath);
}
2202
/**
 * Walks upward from `startDirectory` to the first directory that contains a
 * package.json and reads its `skilltestrc` key.
 *
 * The search stops at the first package.json found, even when it has no
 * `skilltestrc` key; ancestors above it are not consulted.
 *
 * @param {string} startDirectory - Directory where the upward search begins.
 * @returns {Promise<null | {configFile: object, sourcePath: string, sourceDirectory: string}>}
 *   `null` when no package.json exists up to the filesystem root, or when the
 *   nearest one has no (truthy) `skilltestrc` value.
 * @throws {Error} When the package.json cannot be read or parsed, or its
 *   `skilltestrc` value fails validation.
 */
async function loadConfigFromNearestPackageJson(startDirectory) {
  let directory = path5.resolve(startDirectory);
  for (;;) {
    const manifestPath = path5.join(directory, "package.json");
    if (await pathExists(manifestPath)) {
      const manifest = await readJsonObject(manifestPath, manifestPath);
      // Only `skilltestrc` is validated; other package.json keys pass through.
      const manifestSchema = z6
        .object({ skilltestrc: skilltestConfigSchema.optional() })
        .passthrough();
      const validation = manifestSchema.safeParse(manifest);
      if (!validation.success) {
        throw buildConfigValidationError(validation.error, `${manifestPath}#skilltestrc`);
      }
      const embedded = validation.data.skilltestrc;
      return embedded
        ? { configFile: embedded, sourcePath: manifestPath, sourceDirectory: directory }
        : null;
    }
    const parent = path5.dirname(directory);
    if (parent === directory) {
      // dirname() of the filesystem root is the root itself: nothing left to search.
      return null;
    }
    directory = parent;
  }
}
2231
/**
 * Looks for a `.skilltestrc` file in the root of the skill at `targetPath`.
 *
 * Failures while locating the skill root are deliberately ignored and yield
 * `null`. Failures while reading or validating the config file itself are not
 * swallowed: they reject the returned promise.
 *
 * @param {string | undefined} targetPath - Skill path given on the command line.
 * @returns {Promise<null | {configFile: object, sourcePath: string, sourceDirectory: string}>}
 */
async function resolveSkillDirectoryConfig(targetPath) {
  if (!targetPath) {
    return null;
  }
  let rcPath;
  try {
    const { skillRoot } = await resolveSkillPath(targetPath);
    rcPath = path5.join(skillRoot, ".skilltestrc");
  } catch {
    // No resolvable skill root means there is no skill-level config.
    return null;
  }
  // Loaded outside the try block so invalid config files surface as errors.
  return loadConfigFromJsonFile(rcPath);
}
2242
/**
 * Resolves a path taken from configuration against a base directory.
 *
 * @param {string} baseDirectory - Directory that relative values are resolved from.
 * @param {string | undefined} value - Configured path; falsy values mean "not set".
 * @returns {string | undefined} Absolute path, or `undefined` when `value` is falsy.
 */
function resolveConfigRelativePath(baseDirectory, value) {
  return value ? path5.resolve(baseDirectory, value) : undefined;
}
2248
/**
 * Merges the configuration layers into one validated config object.
 *
 * For every setting the precedence is: CLI flag, then config file, then
 * `DEFAULT_SKILLTEST_CONFIG`. `eval.promptFile` and `eval.assertionsFile` are
 * resolved against `baseDirectory`; `trigger.seed` has no default.
 *
 * @param {object} [configFile={}] - Values read from a config file.
 * @param {object} [cliFlags={}] - Values given explicitly on the command line.
 * @param {string} [baseDirectory=process.cwd()] - Base for relative file paths.
 * @returns {object} Result of `resolvedSkilltestConfigSchema.parse`.
 * @throws When the merged object fails `resolvedSkilltestConfigSchema`.
 */
function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
  const defaults = DEFAULT_SKILLTEST_CONFIG;

  const lint = {
    failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? defaults.lint.failOn,
    suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? defaults.lint.suppress
  };

  const trigger = {
    numQueries: cliFlags.trigger?.numQueries ?? configFile.trigger?.numQueries ?? defaults.trigger.numQueries,
    threshold: cliFlags.trigger?.threshold ?? configFile.trigger?.threshold ?? defaults.trigger.threshold,
    seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
  };

  const promptFile = cliFlags.eval?.promptFile ?? configFile.eval?.promptFile ?? defaults.eval.promptFile;
  const assertionsFile =
    cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? defaults.eval.assertionsFile;
  const evaluation = {
    numRuns: cliFlags.eval?.numRuns ?? configFile.eval?.numRuns ?? defaults.eval.numRuns,
    threshold: cliFlags.eval?.threshold ?? configFile.eval?.threshold ?? defaults.eval.threshold,
    promptFile: resolveConfigRelativePath(baseDirectory, promptFile),
    assertionsFile: resolveConfigRelativePath(baseDirectory, assertionsFile)
  };

  return resolvedSkilltestConfigSchema.parse({
    provider: cliFlags.provider ?? configFile.provider ?? defaults.provider,
    model: cliFlags.model ?? configFile.model ?? defaults.model,
    json: cliFlags.json ?? configFile.json ?? defaults.json,
    lint,
    trigger,
    eval: evaluation
  });
}
2277
/**
 * Reads one option value from a command, including options inherited from
 * parent commands (`optsWithGlobals`).
 *
 * @param {object} command - Command exposing `optsWithGlobals()`.
 * @param {string} key - Option name as stored by the command (camelCase).
 * @returns {*} The option value, or `undefined` when it is not set.
 */
function getTypedOptionValue(command, key) {
  return command.optsWithGlobals()[key];
}
2285
/**
 * Collects the config values that were given explicitly on the command line.
 *
 * Only options whose value source is "cli" are included, so commander
 * defaults never shadow values from a config file. `numQueries` is honoured
 * for the `trigger` and `check` commands; `minF1` (mapped to
 * `trigger.threshold`) and `minAssertPassRate` (mapped to `eval.threshold`)
 * only for `check`.
 *
 * @param {object} command - Commander command being executed.
 * @returns {object} Overrides validated against `skilltestConfigSchema`.
 * @throws {Error} When an override fails validation (source label "CLI flags").
 */
function extractCliConfigOverrides(command) {
  const givenOnCli = (key) => command.getOptionValueSource(key) === "cli";
  const overrides = {};

  // `json` may be declared on a parent command, hence the WithGlobals lookup.
  if (command.getOptionValueSourceWithGlobals("json") === "cli") {
    overrides.json = Boolean(getTypedOptionValue(command, "json"));
  }
  for (const key of ["provider", "model"]) {
    if (givenOnCli(key)) {
      overrides[key] = getTypedOptionValue(command, key);
    }
  }

  const commandName = command.name();
  const isCheck = commandName === "check";
  if ((commandName === "trigger" || isCheck) && givenOnCli("numQueries")) {
    overrides.trigger = { numQueries: getTypedOptionValue(command, "numQueries") };
  }
  if (isCheck && givenOnCli("minF1")) {
    overrides.trigger = {
      ...overrides.trigger,
      threshold: getTypedOptionValue(command, "minF1")
    };
  }
  if (isCheck && givenOnCli("minAssertPassRate")) {
    overrides.eval = { threshold: getTypedOptionValue(command, "minAssertPassRate") };
  }

  const validation = skilltestConfigSchema.safeParse(overrides);
  if (validation.success) {
    return validation.data;
  }
  throw buildConfigValidationError(validation.error, "CLI flags");
}
2320
/**
 * Finds the applicable config source and merges it with the CLI flags.
 *
 * Sources are tried in this order and the first one that yields a config wins:
 *   1. `.skilltestrc` in the skill directory of `targetPath`
 *   2. `.skilltestrc` in the current working directory
 *   3. the `skilltestrc` key of the nearest package.json (searching upward from cwd)
 * When none is found, defaults and CLI flags are merged relative to cwd.
 *
 * @param {string | undefined} targetPath - Skill path from the command line.
 * @param {object} cliFlags - Explicit CLI overrides.
 * @returns {Promise<{configFile: object | null, sourcePath: string | null, sourceDirectory: string, config: object}>}
 */
async function resolveConfigContext(targetPath, cliFlags) {
  const cwd = process.cwd();
  // Thunks keep the lookups lazy: later sources are only read when earlier ones miss.
  const sources = [
    () => resolveSkillDirectoryConfig(targetPath),
    () => loadConfigFromJsonFile(path5.join(cwd, ".skilltestrc")),
    () => loadConfigFromNearestPackageJson(cwd)
  ];

  for (const loadSource of sources) {
    const found = await loadSource();
    if (found) {
      return {
        ...found,
        config: mergeConfigLayers(found.configFile, cliFlags, found.sourceDirectory)
      };
    }
  }

  return {
    configFile: null,
    config: mergeConfigLayers({}, cliFlags, cwd),
    sourcePath: null,
    sourceDirectory: cwd
  };
}
2354
/**
 * Determines the API key to use for a provider.
 *
 * A non-blank `override` wins. Otherwise the key comes from the environment:
 * `ANTHROPIC_API_KEY` for "anthropic", `OPENAI_API_KEY` for any other
 * provider. Values are trimmed.
 *
 * @param {string} provider - Provider name.
 * @param {string} [override] - Key passed explicitly (e.g. via --api-key).
 * @returns {string} The trimmed API key.
 * @throws {Error} When no non-empty key is available.
 */
function resolveApiKey(provider, override) {
  const explicitKey = override ? override.trim() : "";
  if (explicitKey !== "") {
    return explicitKey;
  }

  const isAnthropic = provider === "anthropic";
  const fromEnvironment = (isAnthropic ? process.env.ANTHROPIC_API_KEY : process.env.OPENAI_API_KEY)?.trim();
  if (fromEnvironment) {
    return fromEnvironment;
  }

  throw new Error(
    isAnthropic
      ? "No Anthropic API key found. Set ANTHROPIC_API_KEY environment variable or pass --api-key flag."
      : "No OpenAI API key found. Set OPENAI_API_KEY environment variable or pass --api-key flag."
  );
}
2373
+
2374
+ // src/providers/anthropic.ts
2375
+ import Anthropic from "@anthropic-ai/sdk";
2376
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>}
 */
function wait(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
2381
/**
 * Tells whether a thrown value looks like a rate-limit failure.
 *
 * @param {unknown} error - Value caught from a provider call.
 * @returns {boolean} `true` for objects whose `status` is 429 or whose string
 *   `message` contains "rate limit" (case-insensitive); `false` otherwise.
 */
function isRateLimitError(error) {
  if (error === null || typeof error !== "object") {
    return false;
  }
  if (error.status === 429) {
    return true;
  }
  const text = error.message;
  return typeof text === "string" && /rate limit/i.test(text);
}
2395
+ var AnthropicProvider = class {
2396
+ name = "anthropic";
2397
+ client;
2398
+ constructor(apiKey) {
2399
+ this.client = new Anthropic({ apiKey });
2400
+ }
2401
+ async sendMessage(systemPrompt, userMessage, options) {
2402
+ let lastError;
2403
+ for (let attempt = 0; attempt < 3; attempt += 1) {
2404
+ try {
2405
+ const response = await this.client.messages.create({
1466
2406
  model: options.model,
1467
2407
  max_tokens: 2048,
1468
2408
  system: systemPrompt,
@@ -1607,14 +2547,12 @@ function createProvider(providerName, apiKeyOverride) {
1607
2547
  }
1608
2548
 
1609
2549
  // src/commands/trigger.ts
1610
- var triggerOptionsSchema = z3.object({
1611
- model: z3.string(),
1612
- provider: z3.enum(["anthropic", "openai"]),
1613
- queries: z3.string().optional(),
1614
- numQueries: z3.number().int().min(2),
1615
- saveQueries: z3.string().optional(),
1616
- verbose: z3.boolean().optional(),
1617
- apiKey: z3.string().optional()
2550
+ var triggerCliSchema = z7.object({
2551
+ queries: z7.string().optional(),
2552
+ saveQueries: z7.string().optional(),
2553
+ seed: z7.number().int().optional(),
2554
+ verbose: z7.boolean().optional(),
2555
+ apiKey: z7.string().optional()
1618
2556
  });
1619
2557
  var DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929";
1620
2558
  var DEFAULT_OPENAI_MODEL = "gpt-4.1-mini";
@@ -1624,230 +2562,91 @@ function resolveModel(provider, model) {
1624
2562
  }
1625
2563
  return model;
1626
2564
  }
1627
- function registerTriggerCommand(program) {
1628
- program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use", DEFAULT_ANTHROPIC_MODEL).option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10), 20).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, commandOptions, command) => {
1629
- const globalOptions = getGlobalCliOptions(command);
1630
- const parsedOptions = triggerOptionsSchema.safeParse(commandOptions);
1631
- if (!parsedOptions.success) {
1632
- writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid trigger options."), globalOptions.json);
1633
- process.exitCode = 2;
1634
- return;
2565
/**
 * Appends a "Seed: <seed>" line to a rendered trigger report.
 *
 * @param {string} output - Already rendered report text.
 * @param {number | undefined} seed - Seed to show; `undefined` leaves `output` unchanged.
 * @returns {string}
 */
function renderTriggerOutputWithSeed(output, seed) {
  return seed === undefined ? output : `${output}\nSeed: ${seed}`;
}
2572
+ async function handleTriggerCommand(targetPath, options) {
2573
+ const spinner = options.json || !process.stdout.isTTY ? null : ora("Preparing trigger evaluation...").start();
2574
+ try {
2575
+ if (spinner) {
2576
+ spinner.text = "Parsing skill...";
1635
2577
  }
1636
- const options = parsedOptions.data;
1637
- const spinner = globalOptions.json || !process.stdout.isTTY ? null : ora("Preparing trigger evaluation...").start();
1638
- try {
1639
- if (options.numQueries % 2 !== 0) {
1640
- throw new Error("--num-queries must be an even number so the suite can split should/should-not trigger cases.");
1641
- }
1642
- if (spinner) {
1643
- spinner.text = "Parsing skill...";
1644
- }
1645
- const skill = await parseSkillStrict(targetPath);
1646
- if (spinner) {
1647
- spinner.text = "Initializing model provider...";
1648
- }
1649
- const provider = createProvider(options.provider, options.apiKey);
1650
- let queries = void 0;
1651
- if (options.queries) {
1652
- if (spinner) {
1653
- spinner.text = "Loading custom trigger queries...";
1654
- }
1655
- const loaded = await readJsonFile(options.queries);
1656
- const parsedQueries = triggerQueryArraySchema.safeParse(loaded);
1657
- if (!parsedQueries.success) {
1658
- throw new Error(`Invalid --queries JSON: ${parsedQueries.error.issues[0]?.message ?? "unknown format issue"}`);
1659
- }
1660
- queries = parsedQueries.data;
1661
- }
2578
+ const skill = await parseSkillStrict(targetPath);
2579
+ if (spinner) {
2580
+ spinner.text = "Initializing model provider...";
2581
+ }
2582
+ const provider = createProvider(options.provider, options.apiKey);
2583
+ let queries = void 0;
2584
+ if (options.queries) {
1662
2585
  if (spinner) {
1663
- spinner.text = "Running trigger simulations...";
1664
- }
1665
- const model = resolveModel(options.provider, options.model);
1666
- const result = await runTriggerTest(skill, {
1667
- model,
1668
- provider,
1669
- queries,
1670
- numQueries: options.numQueries,
1671
- verbose: Boolean(options.verbose)
1672
- });
1673
- if (options.saveQueries) {
1674
- await writeJsonFile(options.saveQueries, result.queries);
1675
- }
1676
- spinner?.stop();
1677
- if (globalOptions.json) {
1678
- writeResult(result, true);
1679
- } else {
1680
- writeResult(renderTriggerReport(result, globalOptions.color, Boolean(options.verbose)), false);
2586
+ spinner.text = "Loading custom trigger queries...";
1681
2587
  }
1682
- } catch (error) {
1683
- spinner?.stop();
1684
- writeError(error, globalOptions.json);
1685
- process.exitCode = 2;
2588
+ queries = await loadTriggerQueriesFile(options.queries);
1686
2589
  }
1687
- });
1688
- }
1689
-
1690
- // src/commands/eval.ts
1691
- import ora2 from "ora";
1692
- import { z as z6 } from "zod";
1693
-
1694
- // src/core/eval-runner.ts
1695
- import { z as z5 } from "zod";
1696
-
1697
- // src/core/grader.ts
1698
- import { z as z4 } from "zod";
1699
- var gradedAssertionSchema = z4.object({
1700
- assertion: z4.string(),
1701
- passed: z4.boolean(),
1702
- evidence: z4.string()
1703
- });
1704
- var graderOutputSchema = z4.object({
1705
- assertions: z4.array(gradedAssertionSchema)
1706
- });
1707
- function extractJsonObject(raw) {
1708
- const trimmed = raw.trim();
1709
- if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
1710
- return JSON.parse(trimmed);
1711
- }
1712
- const start = trimmed.indexOf("{");
1713
- const end = trimmed.lastIndexOf("}");
1714
- if (start >= 0 && end > start) {
1715
- return JSON.parse(trimmed.slice(start, end + 1));
1716
- }
1717
- throw new Error("Grader did not return a JSON object.");
1718
- }
1719
- async function gradeResponse(options) {
1720
- const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
1721
- "The response follows the skill instructions faithfully.",
1722
- "The response is well-structured and actionable.",
1723
- "The response addresses the user prompt directly."
1724
- ];
1725
- const systemPrompt = [
1726
- "You are a strict evaluator for agent skill outputs.",
1727
- "Assess each assertion and return JSON only.",
1728
- 'Required output format: {"assertions":[{"assertion":"...","passed":true|false,"evidence":"..."}]}'
1729
- ].join(" ");
1730
- const userPrompt = [
1731
- `Skill: ${options.skillName}`,
1732
- "Skill instructions:",
1733
- options.skillBody,
1734
- "",
1735
- `User prompt: ${options.userPrompt}`,
1736
- "",
1737
- "Model response:",
1738
- options.modelResponse,
1739
- "",
1740
- "Assertions to evaluate:",
1741
- assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
1742
- ].join("\n");
1743
- const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
1744
- const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
1745
- if (!parsed.success) {
1746
- throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
1747
- }
1748
- return parsed.data.assertions;
1749
- }
1750
-
1751
- // src/core/eval-runner.ts
1752
- var evalPromptSchema = z5.object({
1753
- prompt: z5.string().min(1),
1754
- assertions: z5.array(z5.string().min(1)).optional()
1755
- });
1756
- var evalPromptArraySchema = z5.array(evalPromptSchema);
1757
- function extractJsonArray(raw) {
1758
- const trimmed = raw.trim();
1759
- if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
1760
- return JSON.parse(trimmed);
1761
- }
1762
- const start = trimmed.indexOf("[");
1763
- const end = trimmed.lastIndexOf("]");
1764
- if (start >= 0 && end > start) {
1765
- return JSON.parse(trimmed.slice(start, end + 1));
1766
- }
1767
- throw new Error("Model did not return a JSON array.");
1768
- }
1769
- async function generatePrompts(skill, provider, model, count) {
1770
- const systemPrompt = [
1771
- "You generate realistic evaluation prompts for an agent skill.",
1772
- "Return JSON only.",
1773
- 'Format: [{"prompt":"...","assertions":["...", "..."]}]',
1774
- "Assertions should be concrete and checkable."
1775
- ].join(" ");
1776
- const userPrompt = [
1777
- `Skill name: ${skill.frontmatter.name}`,
1778
- `Skill description: ${skill.frontmatter.description}`,
1779
- "Skill instructions:",
1780
- skill.content,
1781
- "",
1782
- `Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
1783
- "Each prompt should include 2-4 assertions."
1784
- ].join("\n");
1785
- const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
1786
- const parsed = evalPromptArraySchema.safeParse(extractJsonArray(raw));
1787
- if (!parsed.success) {
1788
- throw new Error(`Failed to parse generated eval prompts: ${parsed.error.issues[0]?.message ?? "invalid prompt JSON"}`);
1789
- }
1790
- if (parsed.data.length !== count) {
1791
- throw new Error(`Expected ${count} prompts, got ${parsed.data.length}.`);
1792
- }
1793
- return parsed.data;
1794
- }
1795
- async function runEval(skill, options) {
1796
- const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, 5);
1797
- const results = [];
1798
- for (const evalPrompt of prompts) {
1799
- const systemPrompt = [
1800
- "You are an AI assistant with an activated skill.",
1801
- "Follow this SKILL.md content exactly where applicable.",
1802
- "",
1803
- skill.raw
1804
- ].join("\n");
1805
- const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
1806
- const gradedAssertions = await gradeResponse({
1807
- provider: options.provider,
1808
- model: options.graderModel,
1809
- skillName: skill.frontmatter.name,
1810
- skillBody: skill.content,
1811
- userPrompt: evalPrompt.prompt,
1812
- modelResponse: response,
1813
- assertions: evalPrompt.assertions
1814
- });
1815
- const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
1816
- results.push({
1817
- prompt: evalPrompt.prompt,
1818
- assertions: gradedAssertions,
1819
- responseSummary: response.slice(0, 200),
1820
- response,
1821
- passedAssertions: passedAssertions2,
1822
- totalAssertions: gradedAssertions.length
2590
+ if (spinner) {
2591
+ spinner.text = "Running trigger simulations...";
2592
+ }
2593
+ const model = resolveModel(options.provider, options.model);
2594
+ const result = await runTriggerTest(skill, {
2595
+ model,
2596
+ provider,
2597
+ queries,
2598
+ numQueries: options.numQueries,
2599
+ seed: options.seed,
2600
+ verbose: options.verbose
1823
2601
  });
2602
+ if (options.saveQueries) {
2603
+ await writeJsonFile(options.saveQueries, result.queries);
2604
+ }
2605
+ spinner?.stop();
2606
+ if (options.json) {
2607
+ writeResult(result, true);
2608
+ } else {
2609
+ writeResult(renderTriggerOutputWithSeed(renderTriggerReport(result, options.color, options.verbose), result.seed), false);
2610
+ }
2611
+ } catch (error) {
2612
+ spinner?.stop();
2613
+ writeError(error, options.json);
2614
+ process.exitCode = 2;
1824
2615
  }
1825
- const totalAssertions = results.reduce((total, result) => total + result.totalAssertions, 0);
1826
- const passedAssertions = results.reduce((total, result) => total + result.passedAssertions, 0);
1827
- return {
1828
- skillName: skill.frontmatter.name,
1829
- model: options.model,
1830
- graderModel: options.graderModel,
1831
- provider: options.provider.name,
1832
- prompts,
1833
- results,
1834
- summary: {
1835
- totalPrompts: results.length,
1836
- totalAssertions,
1837
- passedAssertions
2616
+ }
2617
+ function registerTriggerCommand(program) {
2618
+ program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
2619
+ const globalOptions = getGlobalCliOptions(command);
2620
+ const config = getResolvedConfig(command);
2621
+ const parsedCli = triggerCliSchema.safeParse(command.opts());
2622
+ if (!parsedCli.success) {
2623
+ writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid trigger options."), globalOptions.json);
2624
+ process.exitCode = 2;
2625
+ return;
1838
2626
  }
1839
- };
2627
+ await handleTriggerCommand(targetPath, {
2628
+ ...globalOptions,
2629
+ model: config.model,
2630
+ provider: config.provider,
2631
+ queries: parsedCli.data.queries,
2632
+ numQueries: config.trigger.numQueries,
2633
+ saveQueries: parsedCli.data.saveQueries,
2634
+ seed: parsedCli.data.seed ?? config.trigger.seed,
2635
+ verbose: Boolean(parsedCli.data.verbose),
2636
+ apiKey: parsedCli.data.apiKey
2637
+ });
2638
+ });
1840
2639
  }
1841
2640
 
1842
2641
  // src/commands/eval.ts
1843
- var evalOptionsSchema = z6.object({
1844
- prompts: z6.string().optional(),
1845
- model: z6.string(),
1846
- graderModel: z6.string().optional(),
1847
- provider: z6.enum(["anthropic", "openai"]),
1848
- saveResults: z6.string().optional(),
1849
- verbose: z6.boolean().optional(),
1850
- apiKey: z6.string().optional()
2642
+ import ora2 from "ora";
2643
+ import { z as z8 } from "zod";
2644
+ var evalCliSchema = z8.object({
2645
+ prompts: z8.string().optional(),
2646
+ graderModel: z8.string().optional(),
2647
+ saveResults: z8.string().optional(),
2648
+ verbose: z8.boolean().optional(),
2649
+ apiKey: z8.string().optional()
1851
2650
  });
1852
2651
  var DEFAULT_ANTHROPIC_MODEL2 = "claude-sonnet-4-5-20250929";
1853
2652
  var DEFAULT_OPENAI_MODEL2 = "gpt-4.1-mini";
@@ -1857,69 +2656,84 @@ function resolveModel2(provider, model) {
1857
2656
  }
1858
2657
  return model;
1859
2658
  }
1860
- function registerEvalCommand(program) {
1861
- program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts", DEFAULT_ANTHROPIC_MODEL2).option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, commandOptions, command) => {
1862
- const globalOptions = getGlobalCliOptions(command);
1863
- const parsedOptions = evalOptionsSchema.safeParse(commandOptions);
1864
- if (!parsedOptions.success) {
1865
- writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid eval options."), globalOptions.json);
1866
- process.exitCode = 2;
1867
- return;
2659
+ async function handleEvalCommand(targetPath, options, command) {
2660
+ const spinner = options.json || !process.stdout.isTTY ? null : ora2("Preparing evaluation...").start();
2661
+ try {
2662
+ if (spinner) {
2663
+ spinner.text = "Parsing skill...";
1868
2664
  }
1869
- const options = parsedOptions.data;
1870
- const spinner = globalOptions.json || !process.stdout.isTTY ? null : ora2("Preparing evaluation...").start();
1871
- try {
1872
- if (spinner) {
1873
- spinner.text = "Parsing skill...";
1874
- }
1875
- const skill = await parseSkillStrict(targetPath);
1876
- if (spinner) {
1877
- spinner.text = "Initializing model provider...";
1878
- }
1879
- const provider = createProvider(options.provider, options.apiKey);
1880
- let prompts = void 0;
1881
- if (options.prompts) {
1882
- if (spinner) {
1883
- spinner.text = "Loading test prompts...";
1884
- }
1885
- const loaded = await readJsonFile(options.prompts);
1886
- const parsedPrompts = evalPromptArraySchema.safeParse(loaded);
1887
- if (!parsedPrompts.success) {
1888
- throw new Error(`Invalid --prompts JSON: ${parsedPrompts.error.issues[0]?.message ?? "unknown format issue"}`);
1889
- }
1890
- prompts = parsedPrompts.data;
1891
- }
2665
+ const skill = await parseSkillStrict(targetPath);
2666
+ if (spinner) {
2667
+ spinner.text = "Initializing model provider...";
2668
+ }
2669
+ const provider = createProvider(options.provider, options.apiKey);
2670
+ let prompts = void 0;
2671
+ if (options.prompts) {
1892
2672
  if (spinner) {
1893
- spinner.text = "Running eval prompts and grading responses...";
1894
- }
1895
- const model = resolveModel2(options.provider, options.model);
1896
- const graderModel = options.graderModel ?? model;
1897
- const result = await runEval(skill, {
1898
- provider,
1899
- model,
1900
- graderModel,
1901
- prompts
1902
- });
1903
- if (options.saveResults) {
1904
- await writeJsonFile(options.saveResults, result);
2673
+ spinner.text = "Loading test prompts...";
1905
2674
  }
1906
- spinner?.stop();
1907
- if (globalOptions.json) {
1908
- writeResult(result, true);
1909
- } else {
1910
- writeResult(renderEvalReport(result, globalOptions.color, Boolean(options.verbose)), false);
1911
- }
1912
- } catch (error) {
1913
- spinner?.stop();
1914
- writeError(error, globalOptions.json);
2675
+ prompts = await loadEvalPromptsJson(options.prompts);
2676
+ } else {
2677
+ prompts = await loadConfiguredEvalPrompts(command);
2678
+ }
2679
+ if (spinner) {
2680
+ spinner.text = "Running eval prompts and grading responses...";
2681
+ }
2682
+ const model = resolveModel2(options.provider, options.model);
2683
+ const graderModel = options.graderModel ?? model;
2684
+ const result = await runEval(skill, {
2685
+ provider,
2686
+ model,
2687
+ graderModel,
2688
+ numRuns: options.numRuns,
2689
+ prompts
2690
+ });
2691
+ if (options.saveResults) {
2692
+ await writeJsonFile(options.saveResults, result);
2693
+ }
2694
+ spinner?.stop();
2695
+ if (options.json) {
2696
+ writeResult(result, true);
2697
+ } else {
2698
+ writeResult(renderEvalReport(result, options.color, options.verbose), false);
2699
+ }
2700
+ } catch (error) {
2701
+ spinner?.stop();
2702
+ writeError(error, options.json);
2703
+ process.exitCode = 2;
2704
+ }
2705
+ }
2706
+ function registerEvalCommand(program) {
2707
+ program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, _commandOptions, command) => {
2708
+ const globalOptions = getGlobalCliOptions(command);
2709
+ const config = getResolvedConfig(command);
2710
+ const parsedCli = evalCliSchema.safeParse(command.opts());
2711
+ if (!parsedCli.success) {
2712
+ writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid eval options."), globalOptions.json);
1915
2713
  process.exitCode = 2;
2714
+ return;
1916
2715
  }
2716
+ await handleEvalCommand(
2717
+ targetPath,
2718
+ {
2719
+ ...globalOptions,
2720
+ prompts: parsedCli.data.prompts,
2721
+ model: config.model,
2722
+ graderModel: parsedCli.data.graderModel,
2723
+ provider: config.provider,
2724
+ saveResults: parsedCli.data.saveResults,
2725
+ verbose: Boolean(parsedCli.data.verbose),
2726
+ apiKey: parsedCli.data.apiKey,
2727
+ numRuns: config.eval.numRuns
2728
+ },
2729
+ command
2730
+ );
1917
2731
  });
1918
2732
  }
1919
2733
 
1920
2734
  // src/commands/check.ts
1921
2735
  import ora3 from "ora";
1922
- import { z as z7 } from "zod";
2736
+ import { z as z9 } from "zod";
1923
2737
 
1924
2738
  // src/core/check-runner.ts
1925
2739
  function calculateEvalAssertPassRate(result) {
@@ -1930,8 +2744,8 @@ function calculateEvalAssertPassRate(result) {
1930
2744
  }
1931
2745
  async function runCheck(inputPath, options) {
1932
2746
  options.onStage?.("lint");
1933
- const lint = await runLinter(inputPath);
1934
- const lintPassed = lint.summary.failures === 0;
2747
+ const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
2748
+ const lintPassed = !lintFails(lint, options.lintFailOn);
1935
2749
  let trigger = null;
1936
2750
  let evalResult = null;
1937
2751
  let triggerSkippedReason;
@@ -1956,6 +2770,7 @@ async function runCheck(inputPath, options) {
1956
2770
  model: options.model,
1957
2771
  queries: options.queries,
1958
2772
  numQueries: options.numQueries,
2773
+ seed: options.triggerSeed,
1959
2774
  verbose: options.verbose
1960
2775
  });
1961
2776
  options.onStage?.("eval");
@@ -1963,6 +2778,7 @@ async function runCheck(inputPath, options) {
1963
2778
  provider: options.provider,
1964
2779
  model: options.model,
1965
2780
  graderModel: options.graderModel,
2781
+ numRuns: options.evalNumRuns,
1966
2782
  prompts: options.prompts
1967
2783
  });
1968
2784
  }
@@ -1999,19 +2815,15 @@ async function runCheck(inputPath, options) {
1999
2815
  }
2000
2816
 
2001
2817
  // src/commands/check.ts
2002
- var checkOptionsSchema = z7.object({
2003
- provider: z7.enum(["anthropic", "openai"]),
2004
- model: z7.string(),
2005
- graderModel: z7.string().optional(),
2006
- apiKey: z7.string().optional(),
2007
- queries: z7.string().optional(),
2008
- numQueries: z7.number().int().min(2),
2009
- prompts: z7.string().optional(),
2010
- minF1: z7.number().min(0).max(1),
2011
- minAssertPassRate: z7.number().min(0).max(1),
2012
- saveResults: z7.string().optional(),
2013
- continueOnLintFail: z7.boolean().optional(),
2014
- verbose: z7.boolean().optional()
2818
+ var checkCliSchema = z9.object({
2819
+ graderModel: z9.string().optional(),
2820
+ apiKey: z9.string().optional(),
2821
+ queries: z9.string().optional(),
2822
+ seed: z9.number().int().optional(),
2823
+ prompts: z9.string().optional(),
2824
+ saveResults: z9.string().optional(),
2825
+ continueOnLintFail: z9.boolean().optional(),
2826
+ verbose: z9.boolean().optional()
2015
2827
  });
2016
2828
  var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
2017
2829
  var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
@@ -2021,106 +2833,126 @@ function resolveModel3(provider, model) {
2021
2833
  }
2022
2834
  return model;
2023
2835
  }
2024
- function registerCheckCommand(program) {
2025
- program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--model <model>", "Model for trigger/eval runs", DEFAULT_ANTHROPIC_MODEL3).option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10), 20).option("--prompts <path>", "Path to eval prompts JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value), 0.8).option(
2026
- "--min-assert-pass-rate <n>",
2027
- "Minimum required eval assertion pass rate (0-1)",
2028
- (value) => Number.parseFloat(value),
2029
- 0.9
2030
- ).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, commandOptions, command) => {
2031
- const globalOptions = getGlobalCliOptions(command);
2032
- const parsedOptions = checkOptionsSchema.safeParse(commandOptions);
2033
- if (!parsedOptions.success) {
2034
- writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid check options."), globalOptions.json);
2035
- process.exitCode = 2;
2036
- return;
2037
- }
2038
- const options = parsedOptions.data;
2039
- if (options.numQueries % 2 !== 0) {
2040
- writeError(
2041
- new Error("--num-queries must be an even number so the suite can split should/should-not trigger cases."),
2042
- globalOptions.json
2043
- );
2044
- process.exitCode = 2;
2045
- return;
2836
+ function renderCheckOutputWithSeed(output, seed) {
2837
+ if (seed === void 0) {
2838
+ return output;
2839
+ }
2840
+ const lines = output.split("\n");
2841
+ const triggerIndex = lines.indexOf("Trigger");
2842
+ if (triggerIndex === -1) {
2843
+ return `${output}
2844
+ Seed: ${seed}`;
2845
+ }
2846
+ lines.splice(triggerIndex + 1, 0, `Seed: ${seed}`);
2847
+ return lines.join("\n");
2848
+ }
2849
+ async function handleCheckCommand(targetPath, options, command) {
2850
+ const spinner = options.json || !process.stdout.isTTY ? null : ora3("Preparing check run...").start();
2851
+ try {
2852
+ if (spinner) {
2853
+ spinner.text = "Initializing model provider...";
2046
2854
  }
2047
- const spinner = globalOptions.json || !process.stdout.isTTY ? null : ora3("Preparing check run...").start();
2048
- try {
2855
+ const provider = createProvider(options.provider, options.apiKey);
2856
+ let queries = void 0;
2857
+ if (options.queries) {
2049
2858
  if (spinner) {
2050
- spinner.text = "Initializing model provider...";
2859
+ spinner.text = "Loading custom trigger queries...";
2051
2860
  }
2052
- const provider = createProvider(options.provider, options.apiKey);
2053
- let queries = void 0;
2054
- if (options.queries) {
2055
- if (spinner) {
2056
- spinner.text = "Loading custom trigger queries...";
2057
- }
2058
- const loadedQueries = await readJsonFile(options.queries);
2059
- const parsedQueries = triggerQueryArraySchema.safeParse(loadedQueries);
2060
- if (!parsedQueries.success) {
2061
- throw new Error(
2062
- `Invalid --queries JSON: ${parsedQueries.error.issues[0]?.message ?? "unknown format issue"}`
2063
- );
2064
- }
2065
- queries = parsedQueries.data;
2861
+ queries = await loadTriggerQueriesFile(options.queries);
2862
+ }
2863
+ let prompts = void 0;
2864
+ if (options.prompts) {
2865
+ if (spinner) {
2866
+ spinner.text = "Loading eval prompts...";
2066
2867
  }
2067
- let prompts = void 0;
2068
- if (options.prompts) {
2069
- if (spinner) {
2070
- spinner.text = "Loading eval prompts...";
2071
- }
2072
- const loadedPrompts = await readJsonFile(options.prompts);
2073
- const parsedPrompts = evalPromptArraySchema.safeParse(loadedPrompts);
2074
- if (!parsedPrompts.success) {
2075
- throw new Error(
2076
- `Invalid --prompts JSON: ${parsedPrompts.error.issues[0]?.message ?? "unknown format issue"}`
2077
- );
2868
+ prompts = await loadEvalPromptsJson(options.prompts);
2869
+ } else {
2870
+ prompts = await loadConfiguredEvalPrompts(command);
2871
+ }
2872
+ const model = resolveModel3(options.provider, options.model);
2873
+ const graderModel = options.graderModel ?? model;
2874
+ const result = await runCheck(targetPath, {
2875
+ provider,
2876
+ model,
2877
+ graderModel,
2878
+ lintFailOn: options.lintFailOn,
2879
+ lintSuppress: options.lintSuppress,
2880
+ queries,
2881
+ numQueries: options.numQueries,
2882
+ triggerSeed: options.triggerSeed,
2883
+ prompts,
2884
+ evalNumRuns: options.numRuns,
2885
+ minF1: options.minF1,
2886
+ minAssertPassRate: options.minAssertPassRate,
2887
+ continueOnLintFail: options.continueOnLintFail,
2888
+ verbose: options.verbose,
2889
+ onStage: (stage) => {
2890
+ if (!spinner) {
2891
+ return;
2078
2892
  }
2079
- prompts = parsedPrompts.data;
2080
- }
2081
- const model = resolveModel3(options.provider, options.model);
2082
- const graderModel = options.graderModel ?? model;
2083
- const result = await runCheck(targetPath, {
2084
- provider,
2085
- model,
2086
- graderModel,
2087
- queries,
2088
- numQueries: options.numQueries,
2089
- prompts,
2090
- minF1: options.minF1,
2091
- minAssertPassRate: options.minAssertPassRate,
2092
- continueOnLintFail: Boolean(options.continueOnLintFail),
2093
- verbose: Boolean(options.verbose),
2094
- onStage: (stage) => {
2095
- if (!spinner) {
2096
- return;
2097
- }
2098
- if (stage === "lint") {
2099
- spinner.text = "Running lint checks...";
2100
- } else if (stage === "parse") {
2101
- spinner.text = "Parsing skill for model evaluations...";
2102
- } else if (stage === "trigger") {
2103
- spinner.text = "Running trigger test suite...";
2104
- } else if (stage === "eval") {
2105
- spinner.text = "Running end-to-end eval suite...";
2106
- }
2893
+ if (stage === "lint") {
2894
+ spinner.text = "Running lint checks...";
2895
+ } else if (stage === "parse") {
2896
+ spinner.text = "Parsing skill for model evaluations...";
2897
+ } else if (stage === "trigger") {
2898
+ spinner.text = "Running trigger test suite...";
2899
+ } else if (stage === "eval") {
2900
+ spinner.text = "Running end-to-end eval suite...";
2107
2901
  }
2108
- });
2109
- if (options.saveResults) {
2110
- await writeJsonFile(options.saveResults, result);
2111
2902
  }
2112
- spinner?.stop();
2113
- if (globalOptions.json) {
2114
- writeResult(result, true);
2115
- } else {
2116
- writeResult(renderCheckReport(result, globalOptions.color, Boolean(options.verbose)), false);
2117
- }
2118
- process.exitCode = result.gates.overallPassed ? 0 : 1;
2119
- } catch (error) {
2120
- spinner?.stop();
2121
- writeError(error, globalOptions.json);
2903
+ });
2904
+ if (options.saveResults) {
2905
+ await writeJsonFile(options.saveResults, result);
2906
+ }
2907
+ spinner?.stop();
2908
+ if (options.json) {
2909
+ writeResult(result, true);
2910
+ } else {
2911
+ writeResult(
2912
+ renderCheckOutputWithSeed(renderCheckReport(result, options.color, options.verbose), result.trigger?.seed),
2913
+ false
2914
+ );
2915
+ }
2916
+ process.exitCode = result.gates.overallPassed ? 0 : 1;
2917
+ } catch (error) {
2918
+ spinner?.stop();
2919
+ writeError(error, options.json);
2920
+ process.exitCode = 2;
2921
+ }
2922
+ }
2923
+ function registerCheckCommand(program) {
2924
+ program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
2925
+ const globalOptions = getGlobalCliOptions(command);
2926
+ const config = getResolvedConfig(command);
2927
+ const parsedCli = checkCliSchema.safeParse(command.opts());
2928
+ if (!parsedCli.success) {
2929
+ writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid check options."), globalOptions.json);
2122
2930
  process.exitCode = 2;
2931
+ return;
2123
2932
  }
2933
+ await handleCheckCommand(
2934
+ targetPath,
2935
+ {
2936
+ ...globalOptions,
2937
+ provider: config.provider,
2938
+ model: config.model,
2939
+ graderModel: parsedCli.data.graderModel,
2940
+ apiKey: parsedCli.data.apiKey,
2941
+ queries: parsedCli.data.queries,
2942
+ numQueries: config.trigger.numQueries,
2943
+ prompts: parsedCli.data.prompts,
2944
+ minF1: config.trigger.threshold,
2945
+ minAssertPassRate: config.eval.threshold,
2946
+ numRuns: config.eval.numRuns,
2947
+ lintFailOn: config.lint.failOn,
2948
+ lintSuppress: config.lint.suppress,
2949
+ triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
2950
+ saveResults: parsedCli.data.saveResults,
2951
+ continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
2952
+ verbose: Boolean(parsedCli.data.verbose)
2953
+ },
2954
+ command
2955
+ );
2124
2956
  });
2125
2957
  }
2126
2958
 
@@ -2128,27 +2960,49 @@ function registerCheckCommand(program) {
2128
2960
  function resolveVersion() {
2129
2961
  try {
2130
2962
  const currentFilePath = fileURLToPath(import.meta.url);
2131
- const packageJsonPath = path5.resolve(path5.dirname(currentFilePath), "..", "package.json");
2132
- const raw = fs5.readFileSync(packageJsonPath, "utf8");
2963
+ const packageJsonPath = path6.resolve(path6.dirname(currentFilePath), "..", "package.json");
2964
+ const raw = fs7.readFileSync(packageJsonPath, "utf8");
2133
2965
  const parsed = JSON.parse(raw);
2134
2966
  return parsed.version ?? "0.0.0";
2135
2967
  } catch {
2136
2968
  return "0.0.0";
2137
2969
  }
2138
2970
  }
2971
+ function shouldRenderJson(argv) {
2972
+ return argv.includes("--json");
2973
+ }
2974
+ function renderTopLevelError(error, asJson) {
2975
+ const message = error instanceof Error ? error.message : String(error);
2976
+ if (asJson) {
2977
+ process.stdout.write(`${JSON.stringify({ error: message }, null, 2)}
2978
+ `);
2979
+ return;
2980
+ }
2981
+ process.stderr.write(`Error: ${message}
2982
+ `);
2983
+ }
2139
2984
  async function run(argv) {
2140
2985
  const program = new Command();
2141
2986
  program.name("skilltest").description("The testing framework for Agent Skills.").version(resolveVersion()).option("--json", "Output results as JSON").option("--no-color", "Disable colored output").showHelpAfterError();
2987
+ program.hook("preAction", async (_program, actionCommand) => {
2988
+ const targetPath = typeof actionCommand.processedArgs[0] === "string" ? actionCommand.processedArgs[0] : void 0;
2989
+ const cliOverrides = extractCliConfigOverrides(actionCommand);
2990
+ const context = await resolveConfigContext(targetPath, cliOverrides);
2991
+ setCommandExecutionContext(actionCommand, context);
2992
+ });
2142
2993
  registerLintCommand(program);
2143
2994
  registerTriggerCommand(program);
2144
2995
  registerEvalCommand(program);
2145
2996
  registerCheckCommand(program);
2146
- await program.parseAsync(argv);
2997
+ try {
2998
+ await program.parseAsync(argv);
2999
+ } catch (error) {
3000
+ renderTopLevelError(error, shouldRenderJson(argv));
3001
+ process.exitCode = 2;
3002
+ }
2147
3003
  }
2148
3004
  run(process.argv).catch((error) => {
2149
- const message = error instanceof Error ? error.message : String(error);
2150
- process.stderr.write(`Error: ${message}
2151
- `);
3005
+ renderTopLevelError(error, shouldRenderJson(process.argv));
2152
3006
  process.exitCode = 2;
2153
3007
  });
2154
3008
  export {