skilltest 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/index.ts
4
- import fs5 from "node:fs";
5
- import path5 from "node:path";
4
+ import fs7 from "node:fs";
5
+ import path6 from "node:path";
6
6
  import { fileURLToPath } from "node:url";
7
7
  import { Command } from "commander";
8
8
 
@@ -194,6 +194,7 @@ function runCompatibilityChecks(context) {
194
194
  if (hasAllowedTools) {
195
195
  issues.push({
196
196
  id: "compat.allowed-tools",
197
+ checkId: "compat:frontmatter",
197
198
  title: "Platform-Specific Frontmatter",
198
199
  status: "warn",
199
200
  message: "Frontmatter includes allowed-tools, which is typically Claude-specific.",
@@ -202,6 +203,7 @@ function runCompatibilityChecks(context) {
202
203
  } else {
203
204
  issues.push({
204
205
  id: "compat.allowed-tools",
206
+ checkId: "compat:frontmatter",
205
207
  title: "Platform-Specific Frontmatter",
206
208
  status: "pass",
207
209
  message: "No known provider-specific frontmatter keys detected."
@@ -211,6 +213,7 @@ function runCompatibilityChecks(context) {
211
213
  const platform = mentionsClaudeOnly ? "Claude" : "Codex";
212
214
  issues.push({
213
215
  id: "compat.provider-phrasing",
216
+ checkId: "compat:provider-language",
214
217
  title: "Provider-Specific Language",
215
218
  status: "warn",
216
219
  message: `Skill body appears tuned to ${platform}-specific behavior.`,
@@ -219,6 +222,7 @@ function runCompatibilityChecks(context) {
219
222
  } else {
220
223
  issues.push({
221
224
  id: "compat.provider-phrasing",
225
+ checkId: "compat:provider-language",
222
226
  title: "Provider-Specific Language",
223
227
  status: "pass",
224
228
  message: "Skill body appears provider-neutral."
@@ -227,6 +231,7 @@ function runCompatibilityChecks(context) {
227
231
  const likelyCompatibility = hasAllowedTools || mentionsClaudeOnly || mentionsCodexOnly ? "Likely compatible with some agents, but includes platform-specific assumptions." : "Likely broadly compatible across Anthropic, OpenAI/Codex-style, and other markdown skill runners.";
228
232
  issues.push({
229
233
  id: "compat.summary",
234
+ checkId: "compat:summary",
230
235
  title: "Compatibility Hint",
231
236
  status: hasAllowedTools || mentionsClaudeOnly || mentionsCodexOnly ? "warn" : "pass",
232
237
  message: likelyCompatibility
@@ -258,6 +263,7 @@ function runContentChecks(context) {
258
263
  if (!/^#{1,6}\s+\S+/m.test(body)) {
259
264
  issues.push({
260
265
  id: "content.headers",
266
+ checkId: "content:headers",
261
267
  title: "Section Headers",
262
268
  status: "warn",
263
269
  message: "No markdown headers found in SKILL.md body.",
@@ -266,6 +272,7 @@ function runContentChecks(context) {
266
272
  } else {
267
273
  issues.push({
268
274
  id: "content.headers",
275
+ checkId: "content:headers",
269
276
  title: "Section Headers",
270
277
  status: "pass",
271
278
  message: "SKILL.md contains markdown section headers."
@@ -275,6 +282,7 @@ function runContentChecks(context) {
275
282
  if (!hasExamples) {
276
283
  issues.push({
277
284
  id: "content.examples",
285
+ checkId: "content:examples",
278
286
  title: "Examples",
279
287
  status: "warn",
280
288
  message: "No examples detected in SKILL.md body.",
@@ -283,6 +291,7 @@ function runContentChecks(context) {
283
291
  } else {
284
292
  issues.push({
285
293
  id: "content.examples",
294
+ checkId: "content:examples",
286
295
  title: "Examples",
287
296
  status: "pass",
288
297
  message: "Examples were detected in SKILL.md."
@@ -292,6 +301,7 @@ function runContentChecks(context) {
292
301
  if (vagueMatches.length > 0) {
293
302
  issues.push({
294
303
  id: "content.vagueness",
304
+ checkId: "content:vagueness",
295
305
  title: "Instruction Specificity",
296
306
  status: "warn",
297
307
  message: "Potentially vague instruction phrases detected.",
@@ -300,6 +310,7 @@ function runContentChecks(context) {
300
310
  } else {
301
311
  issues.push({
302
312
  id: "content.vagueness",
313
+ checkId: "content:vagueness",
303
314
  title: "Instruction Specificity",
304
315
  status: "pass",
305
316
  message: "No obvious vague placeholder phrasing found."
@@ -308,6 +319,7 @@ function runContentChecks(context) {
308
319
  if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
309
320
  issues.push({
310
321
  id: "content.frontmatter-angle-brackets",
322
+ checkId: "content:angle-brackets",
311
323
  title: "Frontmatter Angle Brackets",
312
324
  status: "warn",
313
325
  message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
@@ -316,6 +328,7 @@ function runContentChecks(context) {
316
328
  } else {
317
329
  issues.push({
318
330
  id: "content.frontmatter-angle-brackets",
331
+ checkId: "content:angle-brackets",
319
332
  title: "Frontmatter Angle Brackets",
320
333
  status: "pass",
321
334
  message: "No angle bracket tokens detected in frontmatter."
@@ -330,6 +343,7 @@ function runContentChecks(context) {
330
343
  if (secretHits.size > 0) {
331
344
  issues.push({
332
345
  id: "content.secrets",
346
+ checkId: "content:secrets",
333
347
  title: "Hardcoded Secrets",
334
348
  status: "fail",
335
349
  message: `Potential secrets detected (${Array.from(secretHits).join(", ")}).`,
@@ -338,6 +352,7 @@ function runContentChecks(context) {
338
352
  } else {
339
353
  issues.push({
340
354
  id: "content.secrets",
355
+ checkId: "content:secrets",
341
356
  title: "Hardcoded Secrets",
342
357
  status: "pass",
343
358
  message: "No obvious API keys or secrets patterns were detected."
@@ -346,6 +361,7 @@ function runContentChecks(context) {
346
361
  if (bodyLines.length < 10) {
347
362
  issues.push({
348
363
  id: "content.body-length",
364
+ checkId: "content:body-length",
349
365
  title: "Body Completeness",
350
366
  status: "warn",
351
367
  message: `SKILL.md body has only ${bodyLines.length} non-empty lines.`,
@@ -354,6 +370,7 @@ function runContentChecks(context) {
354
370
  } else {
355
371
  issues.push({
356
372
  id: "content.body-length",
373
+ checkId: "content:body-length",
357
374
  title: "Body Completeness",
358
375
  status: "pass",
359
376
  message: `SKILL.md body has ${bodyLines.length} non-empty lines.`
@@ -362,6 +379,7 @@ function runContentChecks(context) {
362
379
  if (description && description.length < 50) {
363
380
  issues.push({
364
381
  id: "content.description-length",
382
+ checkId: "content:description-length",
365
383
  title: "Description Specificity",
366
384
  status: "warn",
367
385
  message: `Description length is ${description.length} characters, which may be too vague for reliable triggering.`,
@@ -370,6 +388,7 @@ function runContentChecks(context) {
370
388
  } else if (description) {
371
389
  issues.push({
372
390
  id: "content.description-length",
391
+ checkId: "content:description-length",
373
392
  title: "Description Specificity",
374
393
  status: "pass",
375
394
  message: "Description length is sufficient for triggerability heuristics."
@@ -461,6 +480,7 @@ async function runDisclosureChecks(context) {
461
480
  if (context.skill.lineCount > 200 && !await pathExists(referencesDir)) {
462
481
  issues.push({
463
482
  id: "disclosure.skill-split",
483
+ checkId: "disclosure:progressive-disclosure",
464
484
  title: "Progressive Disclosure",
465
485
  status: "warn",
466
486
  message: "SKILL.md exceeds 200 lines and no references/ directory is present.",
@@ -469,6 +489,7 @@ async function runDisclosureChecks(context) {
469
489
  } else {
470
490
  issues.push({
471
491
  id: "disclosure.skill-split",
492
+ checkId: "disclosure:progressive-disclosure",
472
493
  title: "Progressive Disclosure",
473
494
  status: "pass",
474
495
  message: "Top-level file length and references/ usage look reasonable."
@@ -496,6 +517,7 @@ async function runDisclosureChecks(context) {
496
517
  if (nonRelativeOrEscaping.length > 0) {
497
518
  issues.push({
498
519
  id: "disclosure.relative-path-root",
520
+ checkId: "disclosure:path-scope",
499
521
  title: "Reference Path Scope",
500
522
  status: "fail",
501
523
  message: `Found non-relative or out-of-root references: ${nonRelativeOrEscaping.join(", ")}`,
@@ -504,6 +526,7 @@ async function runDisclosureChecks(context) {
504
526
  } else {
505
527
  issues.push({
506
528
  id: "disclosure.relative-path-root",
529
+ checkId: "disclosure:path-scope",
507
530
  title: "Reference Path Scope",
508
531
  status: "pass",
509
532
  message: "All detected file references are relative and scoped to skill root."
@@ -513,6 +536,7 @@ async function runDisclosureChecks(context) {
513
536
  if (chainDepth > 1) {
514
537
  issues.push({
515
538
  id: "disclosure.reference-depth",
539
+ checkId: "disclosure:reference-depth",
516
540
  title: "Reference Chain Depth",
517
541
  status: "warn",
518
542
  message: "Deep reference chains detected (>1 level).",
@@ -521,6 +545,7 @@ async function runDisclosureChecks(context) {
521
545
  } else {
522
546
  issues.push({
523
547
  id: "disclosure.reference-depth",
548
+ checkId: "disclosure:reference-depth",
524
549
  title: "Reference Chain Depth",
525
550
  status: "pass",
526
551
  message: "Reference depth is shallow and easy to navigate."
@@ -530,6 +555,7 @@ async function runDisclosureChecks(context) {
530
555
  if (normalizedReferences.some((item) => item.includes("../"))) {
531
556
  issues.push({
532
557
  id: "disclosure.parent-traversal",
558
+ checkId: "disclosure:parent-traversal",
533
559
  title: "Parent Traversal",
534
560
  status: "warn",
535
561
  message: "References include parent-directory traversal (../).",
@@ -538,6 +564,7 @@ async function runDisclosureChecks(context) {
538
564
  } else {
539
565
  issues.push({
540
566
  id: "disclosure.parent-traversal",
567
+ checkId: "disclosure:parent-traversal",
541
568
  title: "Parent Traversal",
542
569
  status: "pass",
543
570
  message: "No parent-directory traversal references detected."
@@ -568,6 +595,7 @@ function runFrontmatterChecks(context) {
568
595
  if (!context.frontmatter.hasFrontmatter) {
569
596
  issues.push({
570
597
  id: "frontmatter.exists",
598
+ checkId: "frontmatter:exists",
571
599
  title: "Frontmatter Presence",
572
600
  status: "fail",
573
601
  message: "SKILL.md is missing YAML frontmatter delimited by --- blocks.",
@@ -578,6 +606,7 @@ function runFrontmatterChecks(context) {
578
606
  if (context.frontmatter.error) {
579
607
  issues.push({
580
608
  id: "frontmatter.valid-yaml",
609
+ checkId: "frontmatter:yaml",
581
610
  title: "Frontmatter YAML",
582
611
  status: "fail",
583
612
  message: `Frontmatter is not valid YAML: ${context.frontmatter.error}`,
@@ -587,6 +616,7 @@ function runFrontmatterChecks(context) {
587
616
  }
588
617
  issues.push({
589
618
  id: "frontmatter.valid-yaml",
619
+ checkId: "frontmatter:yaml",
590
620
  title: "Frontmatter YAML",
591
621
  status: "pass",
592
622
  message: "Frontmatter exists and parses correctly."
@@ -596,6 +626,7 @@ function runFrontmatterChecks(context) {
596
626
  if (!name) {
597
627
  issues.push({
598
628
  id: "frontmatter.name.required",
629
+ checkId: "frontmatter:name",
599
630
  title: "Frontmatter Name",
600
631
  status: "fail",
601
632
  message: "Missing required frontmatter field: name.",
@@ -604,6 +635,7 @@ function runFrontmatterChecks(context) {
604
635
  } else if (name.length > 64) {
605
636
  issues.push({
606
637
  id: "frontmatter.name.length",
638
+ checkId: "frontmatter:name",
607
639
  title: "Frontmatter Name Length",
608
640
  status: "fail",
609
641
  message: `name is too long (${name.length} chars, max 64).`,
@@ -612,6 +644,7 @@ function runFrontmatterChecks(context) {
612
644
  } else if (!SKILL_NAME_REGEX.test(name)) {
613
645
  issues.push({
614
646
  id: "frontmatter.name.format",
647
+ checkId: "frontmatter:name",
615
648
  title: "Frontmatter Name Format",
616
649
  status: "fail",
617
650
  message: "name must be lowercase alphanumeric with single hyphen separators only.",
@@ -620,6 +653,7 @@ function runFrontmatterChecks(context) {
620
653
  } else {
621
654
  issues.push({
622
655
  id: "frontmatter.name.valid",
656
+ checkId: "frontmatter:name",
623
657
  title: "Frontmatter Name",
624
658
  status: "pass",
625
659
  message: "name is present and follows naming conventions."
@@ -629,6 +663,7 @@ function runFrontmatterChecks(context) {
629
663
  if (!description || description.trim() === "") {
630
664
  issues.push({
631
665
  id: "frontmatter.description.required",
666
+ checkId: "frontmatter:description",
632
667
  title: "Frontmatter Description",
633
668
  status: "fail",
634
669
  message: "Missing required frontmatter field: description.",
@@ -637,6 +672,7 @@ function runFrontmatterChecks(context) {
637
672
  } else if (description.length > 1024) {
638
673
  issues.push({
639
674
  id: "frontmatter.description.length",
675
+ checkId: "frontmatter:description",
640
676
  title: "Frontmatter Description Length",
641
677
  status: "fail",
642
678
  message: `description is too long (${description.length} chars, max 1024).`,
@@ -645,6 +681,7 @@ function runFrontmatterChecks(context) {
645
681
  } else {
646
682
  issues.push({
647
683
  id: "frontmatter.description.valid",
684
+ checkId: "frontmatter:description",
648
685
  title: "Frontmatter Description",
649
686
  status: "pass",
650
687
  message: "description is present and within allowed length."
@@ -654,6 +691,7 @@ function runFrontmatterChecks(context) {
654
691
  if (!license || license.trim() === "") {
655
692
  issues.push({
656
693
  id: "frontmatter.license.recommended",
694
+ checkId: "frontmatter:license",
657
695
  title: "Frontmatter License",
658
696
  status: "warn",
659
697
  message: "No license field found in frontmatter.",
@@ -662,6 +700,7 @@ function runFrontmatterChecks(context) {
662
700
  } else {
663
701
  issues.push({
664
702
  id: "frontmatter.license.present",
703
+ checkId: "frontmatter:license",
665
704
  title: "Frontmatter License",
666
705
  status: "pass",
667
706
  message: "license field is present."
@@ -670,6 +709,7 @@ function runFrontmatterChecks(context) {
670
709
  if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
671
710
  issues.push({
672
711
  id: "frontmatter.description.triggerability",
712
+ checkId: "frontmatter:triggerability",
673
713
  title: "Description Trigger Clarity",
674
714
  status: "warn",
675
715
  message: "Description should explain both what the skill does and when it should be used.",
@@ -678,6 +718,7 @@ function runFrontmatterChecks(context) {
678
718
  } else if (description) {
679
719
  issues.push({
680
720
  id: "frontmatter.description.triggerability",
721
+ checkId: "frontmatter:triggerability",
681
722
  title: "Description Trigger Clarity",
682
723
  status: "pass",
683
724
  message: "Description appears to cover both capability and usage context."
@@ -751,6 +792,7 @@ function runSecurityChecks(context) {
751
792
  if (dangerousCommandHits.length > 0) {
752
793
  issues.push({
753
794
  id: "security.dangerous-command-patterns",
795
+ checkId: "security:dangerous-commands",
754
796
  title: "Dangerous Command Patterns",
755
797
  status: "fail",
756
798
  message: `Potentially dangerous command instruction patterns found: ${dangerousCommandHits.join(", ")}.`,
@@ -759,6 +801,7 @@ function runSecurityChecks(context) {
759
801
  } else {
760
802
  issues.push({
761
803
  id: "security.dangerous-command-patterns",
804
+ checkId: "security:dangerous-commands",
762
805
  title: "Dangerous Command Patterns",
763
806
  status: "pass",
764
807
  message: "No high-risk destructive or direct pipe-to-shell patterns detected."
@@ -768,6 +811,7 @@ function runSecurityChecks(context) {
768
811
  if (exfiltrationHits.length > 0) {
769
812
  issues.push({
770
813
  id: "security.exfiltration-patterns",
814
+ checkId: "security:exfiltration",
771
815
  title: "Sensitive Data Exfiltration",
772
816
  status: "fail",
773
817
  message: `Possible sensitive data exfiltration patterns found: ${exfiltrationHits.join(", ")}.`,
@@ -776,6 +820,7 @@ function runSecurityChecks(context) {
776
820
  } else {
777
821
  issues.push({
778
822
  id: "security.exfiltration-patterns",
823
+ checkId: "security:exfiltration",
779
824
  title: "Sensitive Data Exfiltration",
780
825
  status: "pass",
781
826
  message: "No obvious credential access/exfiltration instructions detected."
@@ -785,6 +830,7 @@ function runSecurityChecks(context) {
785
830
  if (escalationHits.length > 0) {
786
831
  issues.push({
787
832
  id: "security.privilege-escalation",
833
+ checkId: "security:privilege-escalation",
788
834
  title: "Privilege Escalation Language",
789
835
  status: "warn",
790
836
  message: `Potentially risky privilege/execution language detected: ${escalationHits.join(", ")}.`,
@@ -793,6 +839,7 @@ function runSecurityChecks(context) {
793
839
  } else {
794
840
  issues.push({
795
841
  id: "security.privilege-escalation",
842
+ checkId: "security:privilege-escalation",
796
843
  title: "Privilege Escalation Language",
797
844
  status: "pass",
798
845
  message: "No obvious privilege-escalation language detected."
@@ -802,6 +849,7 @@ function runSecurityChecks(context) {
802
849
  if (hasShellActivity && !SAFETY_GUARDRAIL_PATTERN.test(skillText)) {
803
850
  issues.push({
804
851
  id: "security.safety-guardrails",
852
+ checkId: "security:missing-guardrails",
805
853
  title: "Execution Safety Guardrails",
806
854
  status: "warn",
807
855
  message: "Shell/tool execution is present, but no explicit safety guardrails were detected.",
@@ -810,6 +858,7 @@ function runSecurityChecks(context) {
810
858
  } else {
811
859
  issues.push({
812
860
  id: "security.safety-guardrails",
861
+ checkId: "security:missing-guardrails",
813
862
  title: "Execution Safety Guardrails",
814
863
  status: "pass",
815
864
  message: hasShellActivity ? "Shell/tool execution instructions include at least one safety guardrail." : "No shell/tool execution instructions detected."
@@ -846,6 +895,7 @@ async function runStructureChecks(context) {
846
895
  if (context.skill.lineCount > 500) {
847
896
  issues.push({
848
897
  id: "structure.skill-size",
898
+ checkId: "structure:file-size",
849
899
  title: "SKILL.md Size",
850
900
  status: "warn",
851
901
  message: `SKILL.md is ${context.skill.lineCount} lines (recommended max is 500).`,
@@ -854,6 +904,7 @@ async function runStructureChecks(context) {
854
904
  } else {
855
905
  issues.push({
856
906
  id: "structure.skill-size",
907
+ checkId: "structure:file-size",
857
908
  title: "SKILL.md Size",
858
909
  status: "pass",
859
910
  message: `SKILL.md length is ${context.skill.lineCount} lines.`
@@ -870,6 +921,7 @@ async function runStructureChecks(context) {
870
921
  oversizedWithoutToc += 1;
871
922
  issues.push({
872
923
  id: `structure.references.toc.${toPosixPath(path4.relative(context.skill.skillRoot, file))}`,
924
+ checkId: "structure:toc",
873
925
  title: "Reference File Navigation",
874
926
  status: "warn",
875
927
  message: `${toPosixPath(path4.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
@@ -880,6 +932,7 @@ async function runStructureChecks(context) {
880
932
  if (oversizedWithoutToc === 0) {
881
933
  issues.push({
882
934
  id: "structure.references.toc",
935
+ checkId: "structure:toc",
883
936
  title: "Reference File Navigation",
884
937
  status: "pass",
885
938
  message: "No oversized reference files missing a table of contents."
@@ -888,6 +941,7 @@ async function runStructureChecks(context) {
888
941
  } else {
889
942
  issues.push({
890
943
  id: "structure.references.toc",
944
+ checkId: "structure:toc",
891
945
  title: "Reference File Navigation",
892
946
  status: "pass",
893
947
  message: "No references/ directory found, so no long reference files to validate."
@@ -916,6 +970,7 @@ async function runStructureChecks(context) {
916
970
  if (missing.length > 0) {
917
971
  issues.push({
918
972
  id: `structure.${category.key}.exists`,
973
+ checkId: "structure:references",
919
974
  title: category.title,
920
975
  status: "fail",
921
976
  message: `Missing referenced ${category.key} file(s): ${missing.join(", ")}`,
@@ -924,6 +979,7 @@ async function runStructureChecks(context) {
924
979
  } else {
925
980
  issues.push({
926
981
  id: `structure.${category.key}.exists`,
982
+ checkId: "structure:references",
927
983
  title: category.title,
928
984
  status: "pass",
929
985
  message: `All referenced ${category.key} files exist.`
@@ -934,6 +990,7 @@ async function runStructureChecks(context) {
934
990
  if (missingGeneric.length > 0) {
935
991
  issues.push({
936
992
  id: "structure.relative-links.broken",
993
+ checkId: "structure:references",
937
994
  title: "Relative Links",
938
995
  status: "fail",
939
996
  message: `Broken relative path reference(s): ${missingGeneric.join(", ")}`,
@@ -942,6 +999,7 @@ async function runStructureChecks(context) {
942
999
  } else {
943
1000
  issues.push({
944
1001
  id: "structure.relative-links.broken",
1002
+ checkId: "structure:references",
945
1003
  title: "Relative Links",
946
1004
  status: "pass",
947
1005
  message: "No broken generic relative file references were found."
@@ -971,7 +1029,13 @@ function summarizeIssues(issues) {
971
1029
  }
972
1030
  return summary;
973
1031
  }
974
- async function runLinter(inputPath) {
1032
+ function lintFails(report, failOn) {
1033
+ if (report.summary.failures > 0) {
1034
+ return true;
1035
+ }
1036
+ return failOn === "warn" && report.summary.warnings > 0;
1037
+ }
1038
+ async function runLinter(inputPath, options = {}) {
975
1039
  const skill = await loadSkillFile(inputPath);
976
1040
  const frontmatter = parseFrontmatter(skill.raw);
977
1041
  const context = {
@@ -985,10 +1049,12 @@ async function runLinter(inputPath) {
985
1049
  issues.push(...runSecurityChecks(context));
986
1050
  issues.push(...await runDisclosureChecks(context));
987
1051
  issues.push(...runCompatibilityChecks(context));
1052
+ const suppress = new Set(options.suppress ?? []);
1053
+ const filteredIssues = issues.filter((issue) => !suppress.has(issue.checkId));
988
1054
  return {
989
1055
  target: inputPath,
990
- issues,
991
- summary: summarizeIssues(issues)
1056
+ issues: filteredIssues,
1057
+ summary: summarizeIssues(filteredIssues)
992
1058
  };
993
1059
  }
994
1060
 
@@ -1159,71 +1225,165 @@ function renderCheckReport(result, enableColor, verbose) {
1159
1225
  return lines.join("\n");
1160
1226
  }
1161
1227
 
1162
- // src/reporters/json.ts
1163
- function renderJson(value) {
1164
- return JSON.stringify(value, null, 2);
1165
- }
1166
-
1167
1228
  // src/commands/common.ts
1168
- function getGlobalCliOptions(command) {
1169
- const options = command.optsWithGlobals();
1170
- return {
1171
- json: Boolean(options.json),
1172
- color: options.color !== false
1173
- };
1174
- }
1175
- function writeResult(value, asJson) {
1176
- if (asJson) {
1177
- process.stdout.write(`${renderJson(value)}
1178
- `);
1179
- return;
1229
+ import fs5 from "node:fs/promises";
1230
+ import { z as z5 } from "zod";
1231
+
1232
+ // src/core/eval-runner.ts
1233
+ import { z as z3 } from "zod";
1234
+
1235
+ // src/core/grader.ts
1236
+ import { z as z2 } from "zod";
1237
+ var gradedAssertionSchema = z2.object({
1238
+ assertion: z2.string(),
1239
+ passed: z2.boolean(),
1240
+ evidence: z2.string()
1241
+ });
1242
+ var graderOutputSchema = z2.object({
1243
+ assertions: z2.array(gradedAssertionSchema)
1244
+ });
1245
+ function extractJsonObject(raw) {
1246
+ const trimmed = raw.trim();
1247
+ if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
1248
+ return JSON.parse(trimmed);
1180
1249
  }
1181
- process.stdout.write(`${String(value)}
1182
- `);
1250
+ const start = trimmed.indexOf("{");
1251
+ const end = trimmed.lastIndexOf("}");
1252
+ if (start >= 0 && end > start) {
1253
+ return JSON.parse(trimmed.slice(start, end + 1));
1254
+ }
1255
+ throw new Error("Grader did not return a JSON object.");
1183
1256
  }
1184
- function writeError(error, asJson) {
1185
- const message = error instanceof Error ? error.message : String(error);
1186
- if (asJson) {
1187
- process.stdout.write(`${renderJson({ error: message })}
1188
- `);
1189
- return;
1257
+ async function gradeResponse(options) {
1258
+ const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
1259
+ "The response follows the skill instructions faithfully.",
1260
+ "The response is well-structured and actionable.",
1261
+ "The response addresses the user prompt directly."
1262
+ ];
1263
+ const systemPrompt = [
1264
+ "You are a strict evaluator for agent skill outputs.",
1265
+ "Assess each assertion and return JSON only.",
1266
+ 'Required output format: {"assertions":[{"assertion":"...","passed":true|false,"evidence":"..."}]}'
1267
+ ].join(" ");
1268
+ const userPrompt = [
1269
+ `Skill: ${options.skillName}`,
1270
+ "Skill instructions:",
1271
+ options.skillBody,
1272
+ "",
1273
+ `User prompt: ${options.userPrompt}`,
1274
+ "",
1275
+ "Model response:",
1276
+ options.modelResponse,
1277
+ "",
1278
+ "Assertions to evaluate:",
1279
+ assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
1280
+ ].join("\n");
1281
+ const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
1282
+ const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
1283
+ if (!parsed.success) {
1284
+ throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
1190
1285
  }
1191
- process.stderr.write(`Error: ${message}
1192
- `);
1286
+ return parsed.data.assertions;
1193
1287
  }
1194
1288
 
1195
- // src/commands/lint.ts
1196
- function registerLintCommand(program) {
1197
- program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").action(async (targetPath, _commandOptions, command) => {
1198
- const globalOptions = getGlobalCliOptions(command);
1199
- try {
1200
- const report = await runLinter(targetPath);
1201
- if (globalOptions.json) {
1202
- writeResult(report, true);
1203
- } else {
1204
- writeResult(renderLintReport(report, globalOptions.color), false);
1205
- }
1206
- if (report.summary.failures > 0) {
1207
- process.exitCode = 1;
1208
- }
1209
- } catch (error) {
1210
- writeError(error, globalOptions.json);
1211
- process.exitCode = 2;
1289
+ // src/core/eval-runner.ts
1290
+ var evalPromptSchema = z3.object({
1291
+ prompt: z3.string().min(1),
1292
+ assertions: z3.array(z3.string().min(1)).optional()
1293
+ });
1294
+ var evalPromptArraySchema = z3.array(evalPromptSchema);
1295
+ function extractJsonArray(raw) {
1296
+ const trimmed = raw.trim();
1297
+ if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
1298
+ return JSON.parse(trimmed);
1299
+ }
1300
+ const start = trimmed.indexOf("[");
1301
+ const end = trimmed.lastIndexOf("]");
1302
+ if (start >= 0 && end > start) {
1303
+ return JSON.parse(trimmed.slice(start, end + 1));
1304
+ }
1305
+ throw new Error("Model did not return a JSON array.");
1306
+ }
1307
+ async function generatePrompts(skill, provider, model, count) {
1308
+ const systemPrompt = [
1309
+ "You generate realistic evaluation prompts for an agent skill.",
1310
+ "Return JSON only.",
1311
+ 'Format: [{"prompt":"...","assertions":["...", "..."]}]',
1312
+ "Assertions should be concrete and checkable."
1313
+ ].join(" ");
1314
+ const userPrompt = [
1315
+ `Skill name: ${skill.frontmatter.name}`,
1316
+ `Skill description: ${skill.frontmatter.description}`,
1317
+ "Skill instructions:",
1318
+ skill.content,
1319
+ "",
1320
+ `Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
1321
+ "Each prompt should include 2-4 assertions."
1322
+ ].join("\n");
1323
+ const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
1324
+ const parsed = evalPromptArraySchema.safeParse(extractJsonArray(raw));
1325
+ if (!parsed.success) {
1326
+ throw new Error(`Failed to parse generated eval prompts: ${parsed.error.issues[0]?.message ?? "invalid prompt JSON"}`);
1327
+ }
1328
+ if (parsed.data.length !== count) {
1329
+ throw new Error(`Expected ${count} prompts, got ${parsed.data.length}.`);
1330
+ }
1331
+ return parsed.data;
1332
+ }
1333
+ async function runEval(skill, options) {
1334
+ const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, options.numRuns);
1335
+ const results = [];
1336
+ for (const evalPrompt of prompts) {
1337
+ const systemPrompt = [
1338
+ "You are an AI assistant with an activated skill.",
1339
+ "Follow this SKILL.md content exactly where applicable.",
1340
+ "",
1341
+ skill.raw
1342
+ ].join("\n");
1343
+ const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
1344
+ const gradedAssertions = await gradeResponse({
1345
+ provider: options.provider,
1346
+ model: options.graderModel,
1347
+ skillName: skill.frontmatter.name,
1348
+ skillBody: skill.content,
1349
+ userPrompt: evalPrompt.prompt,
1350
+ modelResponse: response,
1351
+ assertions: evalPrompt.assertions
1352
+ });
1353
+ const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
1354
+ results.push({
1355
+ prompt: evalPrompt.prompt,
1356
+ assertions: gradedAssertions,
1357
+ responseSummary: response.slice(0, 200),
1358
+ response,
1359
+ passedAssertions: passedAssertions2,
1360
+ totalAssertions: gradedAssertions.length
1361
+ });
1362
+ }
1363
+ const totalAssertions = results.reduce((total, result) => total + result.totalAssertions, 0);
1364
+ const passedAssertions = results.reduce((total, result) => total + result.passedAssertions, 0);
1365
+ return {
1366
+ skillName: skill.frontmatter.name,
1367
+ model: options.model,
1368
+ graderModel: options.graderModel,
1369
+ provider: options.provider.name,
1370
+ prompts,
1371
+ results,
1372
+ summary: {
1373
+ totalPrompts: results.length,
1374
+ totalAssertions,
1375
+ passedAssertions
1212
1376
  }
1213
- });
1377
+ };
1214
1378
  }
1215
1379
 
1216
- // src/commands/trigger.ts
1217
- import ora from "ora";
1218
- import { z as z3 } from "zod";
1219
-
1220
1380
  // src/core/trigger-tester.ts
1221
- import { z as z2 } from "zod";
1222
- var triggerQuerySchema = z2.object({
1223
- query: z2.string().min(1),
1224
- should_trigger: z2.boolean()
1381
+ import { z as z4 } from "zod";
1382
+ var triggerQuerySchema = z4.object({
1383
+ query: z4.string().min(1),
1384
+ should_trigger: z4.boolean()
1225
1385
  });
1226
- var triggerQueryArraySchema = z2.array(triggerQuerySchema);
1386
+ var triggerQueryArraySchema = z4.array(triggerQuerySchema);
1227
1387
  var FAKE_SKILLS = [
1228
1388
  { name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
1229
1389
  { name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
@@ -1241,16 +1401,23 @@ var FAKE_SKILLS = [
1241
1401
  { name: "test-generator", description: "Generates unit and integration test cases from feature requirements." },
1242
1402
  { name: "prompt-tuner", description: "Improves prompts for reliability, formatting, and failure handling." }
1243
1403
  ];
1244
- function shuffle(values) {
1404
+ function createSeededRandom(seed) {
1405
+ let state = seed >>> 0;
1406
+ return () => {
1407
+ state = state * 1664525 + 1013904223 >>> 0;
1408
+ return state / 4294967296;
1409
+ };
1410
+ }
1411
+ function shuffle(values, random = Math.random) {
1245
1412
  const copy = [...values];
1246
1413
  for (let index = copy.length - 1; index > 0; index -= 1) {
1247
- const swapIndex = Math.floor(Math.random() * (index + 1));
1414
+ const swapIndex = Math.floor(random() * (index + 1));
1248
1415
  [copy[index], copy[swapIndex]] = [copy[swapIndex], copy[index]];
1249
1416
  }
1250
1417
  return copy;
1251
1418
  }
1252
- function sample(values, count) {
1253
- return shuffle(values).slice(0, Math.max(0, Math.min(count, values.length)));
1419
+ function sample(values, count, random = Math.random) {
1420
+ return shuffle(values, random).slice(0, Math.max(0, Math.min(count, values.length)));
1254
1421
  }
1255
1422
  function parseJsonArrayFromModelOutput(raw) {
1256
1423
  const trimmed = raw.trim();
@@ -1362,19 +1529,20 @@ function buildSuggestions(metrics) {
1362
1529
  return suggestions;
1363
1530
  }
1364
1531
  async function runTriggerTest(skill, options) {
1532
+ const random = options.seed === void 0 ? Math.random : createSeededRandom(options.seed);
1365
1533
  const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
1366
1534
  const results = [];
1367
1535
  const skillName = skill.frontmatter.name;
1368
1536
  for (const testQuery of queries) {
1369
- const fakeCount = 5 + Math.floor(Math.random() * 4);
1370
- const fakeSkills = sample(FAKE_SKILLS, fakeCount);
1537
+ const fakeCount = 5 + Math.floor(random() * 4);
1538
+ const fakeSkills = sample(FAKE_SKILLS, fakeCount, random);
1371
1539
  const allSkills = shuffle([
1372
1540
  ...fakeSkills,
1373
1541
  {
1374
1542
  name: skill.frontmatter.name,
1375
1543
  description: skill.frontmatter.description
1376
1544
  }
1377
- ]);
1545
+ ], random);
1378
1546
  const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
1379
1547
  const systemPrompt = [
1380
1548
  "You are selecting one skill to activate for a user query.",
@@ -1410,74 +1578,517 @@ async function runTriggerTest(skill, options) {
1410
1578
  };
1411
1579
  }
1412
1580
 
1413
- // src/utils/config.ts
1414
- function resolveApiKey(provider, override) {
1415
- if (override && override.trim() !== "") {
1416
- return override.trim();
1581
+ // src/reporters/json.ts
1582
+ function renderJson(value) {
1583
+ return JSON.stringify(value, null, 2);
1584
+ }
1585
+
1586
+ // src/commands/common.ts
1587
+ var executionContextByCommand = /* @__PURE__ */ new WeakMap();
1588
+ var singleEvalPromptSchema = z5.object({
1589
+ prompt: z5.string().min(1),
1590
+ assertions: z5.array(z5.string().min(1)).optional()
1591
+ });
1592
+ var promptStringArraySchema = z5.array(z5.string().min(1));
1593
+ var assertionsObjectSchema = z5.object({
1594
+ assertions: z5.array(z5.string().min(1))
1595
+ });
1596
+ function parseJsonIfPossible(raw) {
1597
+ try {
1598
+ return JSON.parse(raw);
1599
+ } catch {
1600
+ return raw;
1417
1601
  }
1418
- if (provider === "anthropic") {
1419
- const envValue2 = process.env.ANTHROPIC_API_KEY?.trim();
1420
- if (envValue2) {
1421
- return envValue2;
1422
- }
1423
- throw new Error(
1424
- "No Anthropic API key found. Set ANTHROPIC_API_KEY environment variable or pass --api-key flag."
1425
- );
1602
+ }
1603
+ function normalizeEvalPrompts(value, sourceLabel) {
1604
+ const promptArray = evalPromptArraySchema.safeParse(value);
1605
+ if (promptArray.success) {
1606
+ return promptArray.data;
1426
1607
  }
1427
- const envValue = process.env.OPENAI_API_KEY?.trim();
1428
- if (envValue) {
1429
- return envValue;
1608
+ const singlePrompt = singleEvalPromptSchema.safeParse(value);
1609
+ if (singlePrompt.success) {
1610
+ return [singlePrompt.data];
1430
1611
  }
1431
- throw new Error("No OpenAI API key found. Set OPENAI_API_KEY environment variable or pass --api-key flag.");
1612
+ const promptStrings = promptStringArraySchema.safeParse(value);
1613
+ if (promptStrings.success) {
1614
+ return promptStrings.data.map((prompt) => ({ prompt }));
1615
+ }
1616
+ if (typeof value === "string" && value.trim() !== "") {
1617
+ return [{ prompt: value.trim() }];
1618
+ }
1619
+ throw new Error(
1620
+ `Invalid eval prompt source at ${sourceLabel}. Expected plain text, a JSON prompt object, or a JSON array of prompts.`
1621
+ );
1432
1622
  }
1433
-
1434
- // src/providers/anthropic.ts
1435
- import Anthropic from "@anthropic-ai/sdk";
1436
- function wait(ms) {
1437
- return new Promise((resolve) => {
1438
- setTimeout(resolve, ms);
1439
- });
1623
+ function parseAssertionsFromText(raw) {
1624
+ return raw.split(/\r?\n/).map((line) => line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "")).filter((line) => line.length > 0);
1440
1625
  }
1441
- function isRateLimitError(error) {
1442
- if (!error || typeof error !== "object") {
1443
- return false;
1626
+ function normalizeAssertions(value, sourceLabel) {
1627
+ const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
1628
+ if (assertionArray.success) {
1629
+ return assertionArray.data;
1444
1630
  }
1445
- const maybeStatus = error.status;
1446
- if (maybeStatus === 429) {
1447
- return true;
1631
+ const assertionObject = assertionsObjectSchema.safeParse(value);
1632
+ if (assertionObject.success) {
1633
+ return assertionObject.data.assertions;
1448
1634
  }
1449
- const maybeMessage = error.message;
1450
- if (typeof maybeMessage === "string" && /rate limit/i.test(maybeMessage)) {
1451
- return true;
1635
+ if (typeof value === "string") {
1636
+ const assertions = parseAssertionsFromText(value);
1637
+ if (assertions.length > 0) {
1638
+ return assertions;
1639
+ }
1452
1640
  }
1453
- return false;
1641
+ throw new Error(
1642
+ `Invalid eval assertions source at ${sourceLabel}. Expected JSON string[], { assertions: string[] }, or newline-delimited text.`
1643
+ );
1454
1644
  }
1455
- var AnthropicProvider = class {
1456
- name = "anthropic";
1457
- client;
1458
- constructor(apiKey) {
1459
- this.client = new Anthropic({ apiKey });
1645
+ function setCommandExecutionContext(command, context) {
1646
+ executionContextByCommand.set(command, context);
1647
+ }
1648
+ function getCommandExecutionContext(command) {
1649
+ const context = executionContextByCommand.get(command);
1650
+ if (!context) {
1651
+ throw new Error(`Missing resolved config for command '${command.name()}'.`);
1460
1652
  }
1461
- async sendMessage(systemPrompt, userMessage, options) {
1462
- let lastError;
1463
- for (let attempt = 0; attempt < 3; attempt += 1) {
1464
- try {
1465
- const response = await this.client.messages.create({
1466
- model: options.model,
1467
- max_tokens: 2048,
1468
- system: systemPrompt,
1469
- messages: [
1470
- {
1471
- role: "user",
1472
- content: userMessage
1473
- }
1474
- ]
1475
- });
1476
- const textBlocks = response.content.filter((block) => block.type === "text");
1477
- const text = textBlocks.map((block) => block.text).join("\n").trim();
1478
- if (text.length === 0) {
1479
- throw new Error("Model returned an empty response.");
1480
- }
1653
+ return context;
1654
+ }
1655
+ function getResolvedConfig(command) {
1656
+ return getCommandExecutionContext(command).config;
1657
+ }
1658
+ function getGlobalCliOptions(command) {
1659
+ const options = command.optsWithGlobals();
1660
+ const context = executionContextByCommand.get(command);
1661
+ return {
1662
+ json: context?.config.json ?? Boolean(options.json),
1663
+ color: options.color !== false
1664
+ };
1665
+ }
1666
+ async function loadTriggerQueriesFile(filePath) {
1667
+ const loaded = await readJsonFile(filePath);
1668
+ const parsed = triggerQueryArraySchema.safeParse(loaded);
1669
+ if (!parsed.success) {
1670
+ throw new Error(`Invalid --queries JSON: ${parsed.error.issues[0]?.message ?? "unknown format issue"}`);
1671
+ }
1672
+ return parsed.data;
1673
+ }
1674
+ async function loadEvalPromptsJson(filePath) {
1675
+ const loaded = await readJsonFile(filePath);
1676
+ const parsed = evalPromptArraySchema.safeParse(loaded);
1677
+ if (!parsed.success) {
1678
+ throw new Error(`Invalid --prompts JSON: ${parsed.error.issues[0]?.message ?? "unknown format issue"}`);
1679
+ }
1680
+ return parsed.data;
1681
+ }
1682
+ async function loadConfiguredEvalPrompts(command) {
1683
+ const context = getCommandExecutionContext(command);
1684
+ const promptFile = context.config.eval.promptFile;
1685
+ const assertionsFile = context.config.eval.assertionsFile;
1686
+ if (!promptFile && !assertionsFile) {
1687
+ return void 0;
1688
+ }
1689
+ if (!promptFile && assertionsFile) {
1690
+ throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
1691
+ }
1692
+ const promptRaw = await fs5.readFile(promptFile, "utf8");
1693
+ let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptRaw), promptFile);
1694
+ if (assertionsFile) {
1695
+ const assertionsRaw = await fs5.readFile(assertionsFile, "utf8");
1696
+ const assertions = normalizeAssertions(parseJsonIfPossible(assertionsRaw), assertionsFile);
1697
+ prompts = prompts.map((prompt) => ({
1698
+ prompt: prompt.prompt,
1699
+ assertions: [...assertions]
1700
+ }));
1701
+ }
1702
+ const numRunsWasExplicit = context.configFile?.eval?.numRuns !== void 0;
1703
+ if (numRunsWasExplicit && prompts.length === 1 && context.config.eval.numRuns > 1) {
1704
+ const promptTemplate = prompts[0];
1705
+ prompts = Array.from({ length: context.config.eval.numRuns }, () => ({
1706
+ prompt: promptTemplate.prompt,
1707
+ assertions: promptTemplate.assertions ? [...promptTemplate.assertions] : void 0
1708
+ }));
1709
+ }
1710
+ return prompts;
1711
+ }
1712
+ function writeResult(value, asJson) {
1713
+ if (asJson) {
1714
+ process.stdout.write(`${renderJson(value)}
1715
+ `);
1716
+ return;
1717
+ }
1718
+ process.stdout.write(`${String(value)}
1719
+ `);
1720
+ }
1721
+ function writeError(error, asJson) {
1722
+ const message = error instanceof Error ? error.message : String(error);
1723
+ if (asJson) {
1724
+ process.stdout.write(`${renderJson({ error: message })}
1725
+ `);
1726
+ return;
1727
+ }
1728
+ process.stderr.write(`Error: ${message}
1729
+ `);
1730
+ }
1731
+
1732
+ // src/commands/lint.ts
1733
+ async function handleLintCommand(targetPath, options) {
1734
+ try {
1735
+ const report = await runLinter(targetPath, { suppress: options.suppress });
1736
+ if (options.json) {
1737
+ writeResult(report, true);
1738
+ } else {
1739
+ writeResult(renderLintReport(report, options.color), false);
1740
+ }
1741
+ if (lintFails(report, options.failOn)) {
1742
+ process.exitCode = 1;
1743
+ }
1744
+ } catch (error) {
1745
+ writeError(error, options.json);
1746
+ process.exitCode = 2;
1747
+ }
1748
+ }
1749
+ function registerLintCommand(program) {
1750
+ program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").action(async (targetPath, _commandOptions, command) => {
1751
+ const globalOptions = getGlobalCliOptions(command);
1752
+ const config = getResolvedConfig(command);
1753
+ await handleLintCommand(targetPath, {
1754
+ ...globalOptions,
1755
+ failOn: config.lint.failOn,
1756
+ suppress: config.lint.suppress
1757
+ });
1758
+ });
1759
+ }
1760
+
1761
+ // src/commands/trigger.ts
1762
+ import ora from "ora";
1763
+ import { z as z7 } from "zod";
1764
+
1765
+ // src/utils/config.ts
1766
+ import fs6 from "node:fs/promises";
1767
+ import path5 from "node:path";
1768
+ import { z as z6 } from "zod";
1769
+ var providerNameSchema = z6.enum(["anthropic", "openai"]);
1770
+ var lintFailOnSchema = z6.enum(["error", "warn"]);
1771
+ var lintConfigSchema = z6.object({
1772
+ failOn: lintFailOnSchema.optional(),
1773
+ suppress: z6.array(z6.string().min(1)).optional()
1774
+ }).strict();
1775
+ var triggerConfigSchema = z6.object({
1776
+ numQueries: z6.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
1777
+ threshold: z6.number().min(0).max(1).optional(),
1778
+ seed: z6.number().int().optional()
1779
+ }).strict().partial();
1780
+ var evalConfigSchema = z6.object({
1781
+ numRuns: z6.number().int().min(1).optional(),
1782
+ threshold: z6.number().min(0).max(1).optional(),
1783
+ promptFile: z6.string().min(1).optional(),
1784
+ assertionsFile: z6.string().min(1).optional()
1785
+ }).strict().partial();
1786
+ var skilltestConfigSchema = z6.object({
1787
+ provider: providerNameSchema.optional(),
1788
+ model: z6.string().min(1).optional(),
1789
+ json: z6.boolean().optional(),
1790
+ lint: lintConfigSchema.optional(),
1791
+ trigger: triggerConfigSchema.optional(),
1792
+ eval: evalConfigSchema.optional()
1793
+ }).strict();
1794
+ var resolvedSkilltestConfigSchema = z6.object({
1795
+ provider: providerNameSchema,
1796
+ model: z6.string().min(1),
1797
+ json: z6.boolean(),
1798
+ lint: z6.object({
1799
+ failOn: lintFailOnSchema,
1800
+ suppress: z6.array(z6.string().min(1))
1801
+ }),
1802
+ trigger: z6.object({
1803
+ numQueries: z6.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
1804
+ threshold: z6.number().min(0).max(1),
1805
+ seed: z6.number().int().optional()
1806
+ }),
1807
+ eval: z6.object({
1808
+ numRuns: z6.number().int().min(1),
1809
+ threshold: z6.number().min(0).max(1),
1810
+ promptFile: z6.string().min(1).optional(),
1811
+ assertionsFile: z6.string().min(1).optional()
1812
+ })
1813
+ });
1814
+ var DEFAULT_SKILLTEST_CONFIG = {
1815
+ provider: "anthropic",
1816
+ model: "claude-sonnet-4-5-20250929",
1817
+ json: false,
1818
+ lint: {
1819
+ failOn: "error",
1820
+ suppress: []
1821
+ },
1822
+ trigger: {
1823
+ numQueries: 20,
1824
+ threshold: 0.8
1825
+ },
1826
+ eval: {
1827
+ numRuns: 5,
1828
+ threshold: 0.9
1829
+ }
1830
+ };
1831
+ function formatIssuePath(issuePath) {
1832
+ if (issuePath.length === 0) {
1833
+ return "root";
1834
+ }
1835
+ return issuePath.map(String).join(".");
1836
+ }
1837
+ function buildConfigValidationError(error, sourceLabel) {
1838
+ const issue = error.issues[0];
1839
+ const issuePath = formatIssuePath(issue?.path ?? []);
1840
+ const issueMessage = issue?.message ?? "Invalid config value.";
1841
+ return new Error(`Invalid skilltest config in ${sourceLabel} at ${issuePath}: ${issueMessage}`);
1842
+ }
1843
+ async function readJsonObject(filePath, label) {
1844
+ let raw;
1845
+ try {
1846
+ raw = await fs6.readFile(filePath, "utf8");
1847
+ } catch (error) {
1848
+ const message = error instanceof Error ? error.message : String(error);
1849
+ throw new Error(`Failed to read ${label}: ${message}`);
1850
+ }
1851
+ try {
1852
+ return JSON.parse(raw);
1853
+ } catch (error) {
1854
+ const message = error instanceof Error ? error.message : String(error);
1855
+ throw new Error(`Invalid JSON in ${label}: ${message}`);
1856
+ }
1857
+ }
1858
+ async function loadConfigFromJsonFile(filePath) {
1859
+ if (!await pathExists(filePath)) {
1860
+ return null;
1861
+ }
1862
+ const raw = await readJsonObject(filePath, filePath);
1863
+ const parsed = skilltestConfigSchema.safeParse(raw);
1864
+ if (!parsed.success) {
1865
+ throw buildConfigValidationError(parsed.error, filePath);
1866
+ }
1867
+ return {
1868
+ configFile: parsed.data,
1869
+ sourcePath: filePath,
1870
+ sourceDirectory: path5.dirname(filePath)
1871
+ };
1872
+ }
1873
+ async function loadConfigFromNearestPackageJson(startDirectory) {
1874
+ let currentDirectory = path5.resolve(startDirectory);
1875
+ while (true) {
1876
+ const packageJsonPath = path5.join(currentDirectory, "package.json");
1877
+ if (await pathExists(packageJsonPath)) {
1878
+ const raw = await readJsonObject(packageJsonPath, packageJsonPath);
1879
+ const packageJsonSchema = z6.object({
1880
+ skilltestrc: skilltestConfigSchema.optional()
1881
+ }).passthrough();
1882
+ const parsed = packageJsonSchema.safeParse(raw);
1883
+ if (!parsed.success) {
1884
+ throw buildConfigValidationError(parsed.error, `${packageJsonPath}#skilltestrc`);
1885
+ }
1886
+ if (!parsed.data.skilltestrc) {
1887
+ return null;
1888
+ }
1889
+ return {
1890
+ configFile: parsed.data.skilltestrc,
1891
+ sourcePath: packageJsonPath,
1892
+ sourceDirectory: currentDirectory
1893
+ };
1894
+ }
1895
+ const parentDirectory = path5.dirname(currentDirectory);
1896
+ if (parentDirectory === currentDirectory) {
1897
+ return null;
1898
+ }
1899
+ currentDirectory = parentDirectory;
1900
+ }
1901
+ }
1902
+ async function resolveSkillDirectoryConfig(targetPath) {
1903
+ if (!targetPath) {
1904
+ return null;
1905
+ }
1906
+ try {
1907
+ const { skillRoot } = await resolveSkillPath(targetPath);
1908
+ return loadConfigFromJsonFile(path5.join(skillRoot, ".skilltestrc"));
1909
+ } catch {
1910
+ return null;
1911
+ }
1912
+ }
1913
+ function resolveConfigRelativePath(baseDirectory, value) {
1914
+ if (!value) {
1915
+ return void 0;
1916
+ }
1917
+ return path5.resolve(baseDirectory, value);
1918
+ }
1919
+ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
1920
+ const merged = {
1921
+ provider: cliFlags.provider ?? configFile.provider ?? DEFAULT_SKILLTEST_CONFIG.provider,
1922
+ model: cliFlags.model ?? configFile.model ?? DEFAULT_SKILLTEST_CONFIG.model,
1923
+ json: cliFlags.json ?? configFile.json ?? DEFAULT_SKILLTEST_CONFIG.json,
1924
+ lint: {
1925
+ failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? DEFAULT_SKILLTEST_CONFIG.lint.failOn,
1926
+ suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress
1927
+ },
1928
+ trigger: {
1929
+ numQueries: cliFlags.trigger?.numQueries ?? configFile.trigger?.numQueries ?? DEFAULT_SKILLTEST_CONFIG.trigger.numQueries,
1930
+ threshold: cliFlags.trigger?.threshold ?? configFile.trigger?.threshold ?? DEFAULT_SKILLTEST_CONFIG.trigger.threshold,
1931
+ seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
1932
+ },
1933
+ eval: {
1934
+ numRuns: cliFlags.eval?.numRuns ?? configFile.eval?.numRuns ?? DEFAULT_SKILLTEST_CONFIG.eval.numRuns,
1935
+ threshold: cliFlags.eval?.threshold ?? configFile.eval?.threshold ?? DEFAULT_SKILLTEST_CONFIG.eval.threshold,
1936
+ promptFile: resolveConfigRelativePath(
1937
+ baseDirectory,
1938
+ cliFlags.eval?.promptFile ?? configFile.eval?.promptFile ?? DEFAULT_SKILLTEST_CONFIG.eval.promptFile
1939
+ ),
1940
+ assertionsFile: resolveConfigRelativePath(
1941
+ baseDirectory,
1942
+ cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? DEFAULT_SKILLTEST_CONFIG.eval.assertionsFile
1943
+ )
1944
+ }
1945
+ };
1946
+ return resolvedSkilltestConfigSchema.parse(merged);
1947
+ }
1948
+ function getTypedOptionValue(command, key) {
1949
+ const options = command.optsWithGlobals();
1950
+ const value = options[key];
1951
+ if (value === void 0) {
1952
+ return void 0;
1953
+ }
1954
+ return value;
1955
+ }
1956
+ function extractCliConfigOverrides(command) {
1957
+ const overrides = {};
1958
+ if (command.getOptionValueSourceWithGlobals("json") === "cli") {
1959
+ overrides.json = Boolean(getTypedOptionValue(command, "json"));
1960
+ }
1961
+ if (command.getOptionValueSource("provider") === "cli") {
1962
+ overrides.provider = getTypedOptionValue(command, "provider");
1963
+ }
1964
+ if (command.getOptionValueSource("model") === "cli") {
1965
+ overrides.model = getTypedOptionValue(command, "model");
1966
+ }
1967
+ if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
1968
+ overrides.trigger = {
1969
+ ...overrides.trigger,
1970
+ numQueries: getTypedOptionValue(command, "numQueries")
1971
+ };
1972
+ }
1973
+ if (command.name() === "check" && command.getOptionValueSource("minF1") === "cli") {
1974
+ overrides.trigger = {
1975
+ ...overrides.trigger,
1976
+ threshold: getTypedOptionValue(command, "minF1")
1977
+ };
1978
+ }
1979
+ if (command.name() === "check" && command.getOptionValueSource("minAssertPassRate") === "cli") {
1980
+ overrides.eval = {
1981
+ ...overrides.eval,
1982
+ threshold: getTypedOptionValue(command, "minAssertPassRate")
1983
+ };
1984
+ }
1985
+ const parsed = skilltestConfigSchema.safeParse(overrides);
1986
+ if (!parsed.success) {
1987
+ throw buildConfigValidationError(parsed.error, "CLI flags");
1988
+ }
1989
+ return parsed.data;
1990
+ }
1991
+ async function resolveConfigContext(targetPath, cliFlags) {
1992
+ const cwd = process.cwd();
1993
+ const skillDirectoryConfig = await resolveSkillDirectoryConfig(targetPath);
1994
+ if (skillDirectoryConfig) {
1995
+ return {
1996
+ configFile: skillDirectoryConfig.configFile,
1997
+ ...skillDirectoryConfig,
1998
+ config: mergeConfigLayers(skillDirectoryConfig.configFile, cliFlags, skillDirectoryConfig.sourceDirectory)
1999
+ };
2000
+ }
2001
+ const cwdConfigPath = path5.join(cwd, ".skilltestrc");
2002
+ const cwdConfig = await loadConfigFromJsonFile(cwdConfigPath);
2003
+ if (cwdConfig) {
2004
+ return {
2005
+ configFile: cwdConfig.configFile,
2006
+ ...cwdConfig,
2007
+ config: mergeConfigLayers(cwdConfig.configFile, cliFlags, cwdConfig.sourceDirectory)
2008
+ };
2009
+ }
2010
+ const packageJsonConfig = await loadConfigFromNearestPackageJson(cwd);
2011
+ if (packageJsonConfig) {
2012
+ return {
2013
+ configFile: packageJsonConfig.configFile,
2014
+ ...packageJsonConfig,
2015
+ config: mergeConfigLayers(packageJsonConfig.configFile, cliFlags, packageJsonConfig.sourceDirectory)
2016
+ };
2017
+ }
2018
+ return {
2019
+ configFile: null,
2020
+ config: mergeConfigLayers({}, cliFlags, cwd),
2021
+ sourcePath: null,
2022
+ sourceDirectory: cwd
2023
+ };
2024
+ }
2025
+ function resolveApiKey(provider, override) {
2026
+ if (override && override.trim() !== "") {
2027
+ return override.trim();
2028
+ }
2029
+ if (provider === "anthropic") {
2030
+ const envValue2 = process.env.ANTHROPIC_API_KEY?.trim();
2031
+ if (envValue2) {
2032
+ return envValue2;
2033
+ }
2034
+ throw new Error(
2035
+ "No Anthropic API key found. Set ANTHROPIC_API_KEY environment variable or pass --api-key flag."
2036
+ );
2037
+ }
2038
+ const envValue = process.env.OPENAI_API_KEY?.trim();
2039
+ if (envValue) {
2040
+ return envValue;
2041
+ }
2042
+ throw new Error("No OpenAI API key found. Set OPENAI_API_KEY environment variable or pass --api-key flag.");
2043
+ }
2044
+
2045
+ // src/providers/anthropic.ts
2046
+ import Anthropic from "@anthropic-ai/sdk";
2047
+ function wait(ms) {
2048
+ return new Promise((resolve) => {
2049
+ setTimeout(resolve, ms);
2050
+ });
2051
+ }
2052
+ function isRateLimitError(error) {
2053
+ if (!error || typeof error !== "object") {
2054
+ return false;
2055
+ }
2056
+ const maybeStatus = error.status;
2057
+ if (maybeStatus === 429) {
2058
+ return true;
2059
+ }
2060
+ const maybeMessage = error.message;
2061
+ if (typeof maybeMessage === "string" && /rate limit/i.test(maybeMessage)) {
2062
+ return true;
2063
+ }
2064
+ return false;
2065
+ }
2066
+ var AnthropicProvider = class {
2067
+ name = "anthropic";
2068
+ client;
2069
+ constructor(apiKey) {
2070
+ this.client = new Anthropic({ apiKey });
2071
+ }
2072
+ async sendMessage(systemPrompt, userMessage, options) {
2073
+ let lastError;
2074
+ for (let attempt = 0; attempt < 3; attempt += 1) {
2075
+ try {
2076
+ const response = await this.client.messages.create({
2077
+ model: options.model,
2078
+ max_tokens: 2048,
2079
+ system: systemPrompt,
2080
+ messages: [
2081
+ {
2082
+ role: "user",
2083
+ content: userMessage
2084
+ }
2085
+ ]
2086
+ });
2087
+ const textBlocks = response.content.filter((block) => block.type === "text");
2088
+ const text = textBlocks.map((block) => block.text).join("\n").trim();
2089
+ if (text.length === 0) {
2090
+ throw new Error("Model returned an empty response.");
2091
+ }
1481
2092
  return text;
1482
2093
  } catch (error) {
1483
2094
  lastError = error;
@@ -1607,14 +2218,11 @@ function createProvider(providerName, apiKeyOverride) {
1607
2218
  }
1608
2219
 
1609
2220
  // src/commands/trigger.ts
1610
- var triggerOptionsSchema = z3.object({
1611
- model: z3.string(),
1612
- provider: z3.enum(["anthropic", "openai"]),
1613
- queries: z3.string().optional(),
1614
- numQueries: z3.number().int().min(2),
1615
- saveQueries: z3.string().optional(),
1616
- verbose: z3.boolean().optional(),
1617
- apiKey: z3.string().optional()
2221
+ var triggerCliSchema = z7.object({
2222
+ queries: z7.string().optional(),
2223
+ saveQueries: z7.string().optional(),
2224
+ verbose: z7.boolean().optional(),
2225
+ apiKey: z7.string().optional()
1618
2226
  });
1619
2227
  var DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929";
1620
2228
  var DEFAULT_OPENAI_MODEL = "gpt-4.1-mini";
@@ -1624,230 +2232,84 @@ function resolveModel(provider, model) {
1624
2232
  }
1625
2233
  return model;
1626
2234
  }
1627
- function registerTriggerCommand(program) {
1628
- program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use", DEFAULT_ANTHROPIC_MODEL).option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10), 20).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, commandOptions, command) => {
1629
- const globalOptions = getGlobalCliOptions(command);
1630
- const parsedOptions = triggerOptionsSchema.safeParse(commandOptions);
1631
- if (!parsedOptions.success) {
1632
- writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid trigger options."), globalOptions.json);
1633
- process.exitCode = 2;
1634
- return;
2235
+ async function handleTriggerCommand(targetPath, options) {
2236
+ const spinner = options.json || !process.stdout.isTTY ? null : ora("Preparing trigger evaluation...").start();
2237
+ try {
2238
+ if (spinner) {
2239
+ spinner.text = "Parsing skill...";
1635
2240
  }
1636
- const options = parsedOptions.data;
1637
- const spinner = globalOptions.json || !process.stdout.isTTY ? null : ora("Preparing trigger evaluation...").start();
1638
- try {
1639
- if (options.numQueries % 2 !== 0) {
1640
- throw new Error("--num-queries must be an even number so the suite can split should/should-not trigger cases.");
1641
- }
1642
- if (spinner) {
1643
- spinner.text = "Parsing skill...";
1644
- }
1645
- const skill = await parseSkillStrict(targetPath);
1646
- if (spinner) {
1647
- spinner.text = "Initializing model provider...";
1648
- }
1649
- const provider = createProvider(options.provider, options.apiKey);
1650
- let queries = void 0;
1651
- if (options.queries) {
1652
- if (spinner) {
1653
- spinner.text = "Loading custom trigger queries...";
1654
- }
1655
- const loaded = await readJsonFile(options.queries);
1656
- const parsedQueries = triggerQueryArraySchema.safeParse(loaded);
1657
- if (!parsedQueries.success) {
1658
- throw new Error(`Invalid --queries JSON: ${parsedQueries.error.issues[0]?.message ?? "unknown format issue"}`);
1659
- }
1660
- queries = parsedQueries.data;
1661
- }
2241
+ const skill = await parseSkillStrict(targetPath);
2242
+ if (spinner) {
2243
+ spinner.text = "Initializing model provider...";
2244
+ }
2245
+ const provider = createProvider(options.provider, options.apiKey);
2246
+ let queries = void 0;
2247
+ if (options.queries) {
1662
2248
  if (spinner) {
1663
- spinner.text = "Running trigger simulations...";
1664
- }
1665
- const model = resolveModel(options.provider, options.model);
1666
- const result = await runTriggerTest(skill, {
1667
- model,
1668
- provider,
1669
- queries,
1670
- numQueries: options.numQueries,
1671
- verbose: Boolean(options.verbose)
1672
- });
1673
- if (options.saveQueries) {
1674
- await writeJsonFile(options.saveQueries, result.queries);
1675
- }
1676
- spinner?.stop();
1677
- if (globalOptions.json) {
1678
- writeResult(result, true);
1679
- } else {
1680
- writeResult(renderTriggerReport(result, globalOptions.color, Boolean(options.verbose)), false);
2249
+ spinner.text = "Loading custom trigger queries...";
1681
2250
  }
1682
- } catch (error) {
1683
- spinner?.stop();
1684
- writeError(error, globalOptions.json);
1685
- process.exitCode = 2;
2251
+ queries = await loadTriggerQueriesFile(options.queries);
1686
2252
  }
1687
- });
1688
- }
1689
-
1690
- // src/commands/eval.ts
1691
- import ora2 from "ora";
1692
- import { z as z6 } from "zod";
1693
-
1694
- // src/core/eval-runner.ts
1695
- import { z as z5 } from "zod";
1696
-
1697
- // src/core/grader.ts
1698
- import { z as z4 } from "zod";
1699
- var gradedAssertionSchema = z4.object({
1700
- assertion: z4.string(),
1701
- passed: z4.boolean(),
1702
- evidence: z4.string()
1703
- });
1704
- var graderOutputSchema = z4.object({
1705
- assertions: z4.array(gradedAssertionSchema)
1706
- });
1707
- function extractJsonObject(raw) {
1708
- const trimmed = raw.trim();
1709
- if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
1710
- return JSON.parse(trimmed);
1711
- }
1712
- const start = trimmed.indexOf("{");
1713
- const end = trimmed.lastIndexOf("}");
1714
- if (start >= 0 && end > start) {
1715
- return JSON.parse(trimmed.slice(start, end + 1));
1716
- }
1717
- throw new Error("Grader did not return a JSON object.");
1718
- }
1719
- async function gradeResponse(options) {
1720
- const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
1721
- "The response follows the skill instructions faithfully.",
1722
- "The response is well-structured and actionable.",
1723
- "The response addresses the user prompt directly."
1724
- ];
1725
- const systemPrompt = [
1726
- "You are a strict evaluator for agent skill outputs.",
1727
- "Assess each assertion and return JSON only.",
1728
- 'Required output format: {"assertions":[{"assertion":"...","passed":true|false,"evidence":"..."}]}'
1729
- ].join(" ");
1730
- const userPrompt = [
1731
- `Skill: ${options.skillName}`,
1732
- "Skill instructions:",
1733
- options.skillBody,
1734
- "",
1735
- `User prompt: ${options.userPrompt}`,
1736
- "",
1737
- "Model response:",
1738
- options.modelResponse,
1739
- "",
1740
- "Assertions to evaluate:",
1741
- assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
1742
- ].join("\n");
1743
- const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
1744
- const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
1745
- if (!parsed.success) {
1746
- throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
1747
- }
1748
- return parsed.data.assertions;
1749
- }
1750
-
1751
- // src/core/eval-runner.ts
1752
- var evalPromptSchema = z5.object({
1753
- prompt: z5.string().min(1),
1754
- assertions: z5.array(z5.string().min(1)).optional()
1755
- });
1756
- var evalPromptArraySchema = z5.array(evalPromptSchema);
1757
/**
 * Pull a JSON array out of raw model text.
 *
 * If the trimmed text is already bracketed it is parsed as-is; otherwise the
 * substring from the first "[" to the last "]" is parsed, which tolerates
 * surrounding prose or code fences.
 *
 * @param {string} raw - Text returned by the model.
 * @returns {*} Whatever `JSON.parse` yields for the selected text.
 * @throws {Error} When no "[" ... "]" pair is present.
 * @throws {SyntaxError} When the selected text is not valid JSON.
 */
function extractJsonArray(raw) {
  const text = raw.trim();
  const isBareArray = text.startsWith("[") && text.endsWith("]");
  if (isBareArray) {
    return JSON.parse(text);
  }
  const openIndex = text.indexOf("[");
  const closeIndex = text.lastIndexOf("]");
  if (openIndex < 0 || closeIndex <= openIndex) {
    throw new Error("Model did not return a JSON array.");
  }
  return JSON.parse(text.slice(openIndex, closeIndex + 1));
}
1769
/**
 * Ask the model to invent evaluation prompts for a skill.
 *
 * @param {object} skill - Parsed skill; `frontmatter.name`, `frontmatter.description` and `content` are read.
 * @param {object} provider - Object whose `sendMessage(system, user, { model })` is awaited.
 * @param {string} model - Model identifier forwarded to the provider.
 * @param {number} count - Exact number of prompts required.
 * @returns {Promise<*>} The prompt list validated by `evalPromptArraySchema`.
 * @throws {Error} When the reply fails schema validation or does not contain exactly `count` prompts.
 */
async function generatePrompts(skill, provider, model, count) {
  const { name, description } = skill.frontmatter;

  const systemPrompt = [
    "You generate realistic evaluation prompts for an agent skill.",
    "Return JSON only.",
    'Format: [{"prompt":"...","assertions":["...", "..."]}]',
    "Assertions should be concrete and checkable."
  ].join(" ");

  const userPrompt = [
    `Skill name: ${name}`,
    `Skill description: ${description}`,
    "Skill instructions:",
    skill.content,
    "",
    `Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
    "Each prompt should include 2-4 assertions."
  ].join("\n");

  const reply = await provider.sendMessage(systemPrompt, userPrompt, { model });
  const validation = evalPromptArraySchema.safeParse(extractJsonArray(reply));
  if (!validation.success) {
    const reason = validation.error.issues[0]?.message ?? "invalid prompt JSON";
    throw new Error(`Failed to parse generated eval prompts: ${reason}`);
  }

  const generated = validation.data;
  // The model is asked for an exact count; anything else is treated as a failure.
  if (generated.length !== count) {
    throw new Error(`Expected ${count} prompts, got ${generated.length}.`);
  }
  return generated;
}
1795
- async function runEval(skill, options) {
1796
- const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, 5);
1797
- const results = [];
1798
- for (const evalPrompt of prompts) {
1799
- const systemPrompt = [
1800
- "You are an AI assistant with an activated skill.",
1801
- "Follow this SKILL.md content exactly where applicable.",
1802
- "",
1803
- skill.raw
1804
- ].join("\n");
1805
- const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
1806
- const gradedAssertions = await gradeResponse({
1807
- provider: options.provider,
1808
- model: options.graderModel,
1809
- skillName: skill.frontmatter.name,
1810
- skillBody: skill.content,
1811
- userPrompt: evalPrompt.prompt,
1812
- modelResponse: response,
1813
- assertions: evalPrompt.assertions
1814
- });
1815
- const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
1816
- results.push({
1817
- prompt: evalPrompt.prompt,
1818
- assertions: gradedAssertions,
1819
- responseSummary: response.slice(0, 200),
1820
- response,
1821
- passedAssertions: passedAssertions2,
1822
- totalAssertions: gradedAssertions.length
2253
+ if (spinner) {
2254
+ spinner.text = "Running trigger simulations...";
2255
+ }
2256
+ const model = resolveModel(options.provider, options.model);
2257
+ const result = await runTriggerTest(skill, {
2258
+ model,
2259
+ provider,
2260
+ queries,
2261
+ numQueries: options.numQueries,
2262
+ seed: options.seed,
2263
+ verbose: options.verbose
1823
2264
  });
2265
+ if (options.saveQueries) {
2266
+ await writeJsonFile(options.saveQueries, result.queries);
2267
+ }
2268
+ spinner?.stop();
2269
+ if (options.json) {
2270
+ writeResult(result, true);
2271
+ } else {
2272
+ writeResult(renderTriggerReport(result, options.color, options.verbose), false);
2273
+ }
2274
+ } catch (error) {
2275
+ spinner?.stop();
2276
+ writeError(error, options.json);
2277
+ process.exitCode = 2;
1824
2278
  }
1825
- const totalAssertions = results.reduce((total, result) => total + result.totalAssertions, 0);
1826
- const passedAssertions = results.reduce((total, result) => total + result.passedAssertions, 0);
1827
- return {
1828
- skillName: skill.frontmatter.name,
1829
- model: options.model,
1830
- graderModel: options.graderModel,
1831
- provider: options.provider.name,
1832
- prompts,
1833
- results,
1834
- summary: {
1835
- totalPrompts: results.length,
1836
- totalAssertions,
1837
- passedAssertions
2279
+ }
2280
+ function registerTriggerCommand(program) {
2281
+ program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
2282
+ const globalOptions = getGlobalCliOptions(command);
2283
+ const config = getResolvedConfig(command);
2284
+ const parsedCli = triggerCliSchema.safeParse(command.opts());
2285
+ if (!parsedCli.success) {
2286
+ writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid trigger options."), globalOptions.json);
2287
+ process.exitCode = 2;
2288
+ return;
1838
2289
  }
1839
- };
2290
+ await handleTriggerCommand(targetPath, {
2291
+ ...globalOptions,
2292
+ model: config.model,
2293
+ provider: config.provider,
2294
+ queries: parsedCli.data.queries,
2295
+ numQueries: config.trigger.numQueries,
2296
+ saveQueries: parsedCli.data.saveQueries,
2297
+ seed: config.trigger.seed,
2298
+ verbose: Boolean(parsedCli.data.verbose),
2299
+ apiKey: parsedCli.data.apiKey
2300
+ });
2301
+ });
1840
2302
  }
1841
2303
 
1842
2304
  // src/commands/eval.ts
1843
- var evalOptionsSchema = z6.object({
1844
- prompts: z6.string().optional(),
1845
- model: z6.string(),
1846
- graderModel: z6.string().optional(),
1847
- provider: z6.enum(["anthropic", "openai"]),
1848
- saveResults: z6.string().optional(),
1849
- verbose: z6.boolean().optional(),
1850
- apiKey: z6.string().optional()
2305
+ import ora2 from "ora";
2306
+ import { z as z8 } from "zod";
2307
+ var evalCliSchema = z8.object({
2308
+ prompts: z8.string().optional(),
2309
+ graderModel: z8.string().optional(),
2310
+ saveResults: z8.string().optional(),
2311
+ verbose: z8.boolean().optional(),
2312
+ apiKey: z8.string().optional()
1851
2313
  });
1852
2314
  var DEFAULT_ANTHROPIC_MODEL2 = "claude-sonnet-4-5-20250929";
1853
2315
  var DEFAULT_OPENAI_MODEL2 = "gpt-4.1-mini";
@@ -1857,69 +2319,84 @@ function resolveModel2(provider, model) {
1857
2319
  }
1858
2320
  return model;
1859
2321
  }
1860
- function registerEvalCommand(program) {
1861
- program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts", DEFAULT_ANTHROPIC_MODEL2).option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, commandOptions, command) => {
1862
- const globalOptions = getGlobalCliOptions(command);
1863
- const parsedOptions = evalOptionsSchema.safeParse(commandOptions);
1864
- if (!parsedOptions.success) {
1865
- writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid eval options."), globalOptions.json);
1866
- process.exitCode = 2;
1867
- return;
2322
+ async function handleEvalCommand(targetPath, options, command) {
2323
+ const spinner = options.json || !process.stdout.isTTY ? null : ora2("Preparing evaluation...").start();
2324
+ try {
2325
+ if (spinner) {
2326
+ spinner.text = "Parsing skill...";
1868
2327
  }
1869
- const options = parsedOptions.data;
1870
- const spinner = globalOptions.json || !process.stdout.isTTY ? null : ora2("Preparing evaluation...").start();
1871
- try {
1872
- if (spinner) {
1873
- spinner.text = "Parsing skill...";
1874
- }
1875
- const skill = await parseSkillStrict(targetPath);
1876
- if (spinner) {
1877
- spinner.text = "Initializing model provider...";
1878
- }
1879
- const provider = createProvider(options.provider, options.apiKey);
1880
- let prompts = void 0;
1881
- if (options.prompts) {
1882
- if (spinner) {
1883
- spinner.text = "Loading test prompts...";
1884
- }
1885
- const loaded = await readJsonFile(options.prompts);
1886
- const parsedPrompts = evalPromptArraySchema.safeParse(loaded);
1887
- if (!parsedPrompts.success) {
1888
- throw new Error(`Invalid --prompts JSON: ${parsedPrompts.error.issues[0]?.message ?? "unknown format issue"}`);
1889
- }
1890
- prompts = parsedPrompts.data;
1891
- }
2328
+ const skill = await parseSkillStrict(targetPath);
2329
+ if (spinner) {
2330
+ spinner.text = "Initializing model provider...";
2331
+ }
2332
+ const provider = createProvider(options.provider, options.apiKey);
2333
+ let prompts = void 0;
2334
+ if (options.prompts) {
1892
2335
  if (spinner) {
1893
- spinner.text = "Running eval prompts and grading responses...";
2336
+ spinner.text = "Loading test prompts...";
1894
2337
  }
1895
- const model = resolveModel2(options.provider, options.model);
1896
- const graderModel = options.graderModel ?? model;
1897
- const result = await runEval(skill, {
1898
- provider,
1899
- model,
1900
- graderModel,
1901
- prompts
1902
- });
1903
- if (options.saveResults) {
1904
- await writeJsonFile(options.saveResults, result);
1905
- }
1906
- spinner?.stop();
1907
- if (globalOptions.json) {
1908
- writeResult(result, true);
1909
- } else {
1910
- writeResult(renderEvalReport(result, globalOptions.color, Boolean(options.verbose)), false);
1911
- }
1912
- } catch (error) {
1913
- spinner?.stop();
1914
- writeError(error, globalOptions.json);
2338
+ prompts = await loadEvalPromptsJson(options.prompts);
2339
+ } else {
2340
+ prompts = await loadConfiguredEvalPrompts(command);
2341
+ }
2342
+ if (spinner) {
2343
+ spinner.text = "Running eval prompts and grading responses...";
2344
+ }
2345
+ const model = resolveModel2(options.provider, options.model);
2346
+ const graderModel = options.graderModel ?? model;
2347
+ const result = await runEval(skill, {
2348
+ provider,
2349
+ model,
2350
+ graderModel,
2351
+ numRuns: options.numRuns,
2352
+ prompts
2353
+ });
2354
+ if (options.saveResults) {
2355
+ await writeJsonFile(options.saveResults, result);
2356
+ }
2357
+ spinner?.stop();
2358
+ if (options.json) {
2359
+ writeResult(result, true);
2360
+ } else {
2361
+ writeResult(renderEvalReport(result, options.color, options.verbose), false);
2362
+ }
2363
+ } catch (error) {
2364
+ spinner?.stop();
2365
+ writeError(error, options.json);
2366
+ process.exitCode = 2;
2367
+ }
2368
+ }
2369
/**
 * Attach the `eval` subcommand to the CLI program.
 *
 * The action validates the command's options against `evalCliSchema`; on
 * failure it reports the first issue through `writeError` and sets exit code 2.
 * Otherwise it hands off to `handleEvalCommand`, taking `model`, `provider`
 * and `numRuns` from the resolved config and the remaining values from the
 * validated CLI options.
 *
 * @param {object} program - Commander program (or compatible) to register on.
 */
function registerEvalCommand(program) {
  const evalCommand = program
    .command("eval")
    .description("Run end-to-end skill execution and quality evaluation.")
    .argument("<path-to-skill>", "Path to SKILL.md or skill directory")
    .option("--prompts <path>", "Path to eval prompts JSON")
    .option("--model <model>", "Model to execute prompts")
    .option("--grader-model <model>", "Model used for grading (defaults to --model)")
    .option("--provider <provider>", "LLM provider: anthropic|openai")
    .option("--save-results <path>", "Save full evaluation results to JSON")
    .option("--api-key <key>", "API key override")
    .option("--verbose", "Show full model responses");

  evalCommand.action(async (targetPath, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const config = getResolvedConfig(command);
    const cliResult = evalCliSchema.safeParse(command.opts());
    if (!cliResult.success) {
      const reason = cliResult.error.issues[0]?.message ?? "Invalid eval options.";
      writeError(new Error(reason), globalOptions.json);
      process.exitCode = 2;
      return;
    }

    const cli = cliResult.data;
    const handlerOptions = {
      ...globalOptions,
      prompts: cli.prompts,
      model: config.model,
      graderModel: cli.graderModel,
      provider: config.provider,
      saveResults: cli.saveResults,
      verbose: Boolean(cli.verbose),
      apiKey: cli.apiKey,
      numRuns: config.eval.numRuns
    };
    await handleEvalCommand(targetPath, handlerOptions, command);
  });
}
1919
2396
 
1920
2397
  // src/commands/check.ts
1921
2398
  import ora3 from "ora";
1922
- import { z as z7 } from "zod";
2399
+ import { z as z9 } from "zod";
1923
2400
 
1924
2401
  // src/core/check-runner.ts
1925
2402
  function calculateEvalAssertPassRate(result) {
@@ -1930,8 +2407,8 @@ function calculateEvalAssertPassRate(result) {
1930
2407
  }
1931
2408
  async function runCheck(inputPath, options) {
1932
2409
  options.onStage?.("lint");
1933
- const lint = await runLinter(inputPath);
1934
- const lintPassed = lint.summary.failures === 0;
2410
+ const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
2411
+ const lintPassed = !lintFails(lint, options.lintFailOn);
1935
2412
  let trigger = null;
1936
2413
  let evalResult = null;
1937
2414
  let triggerSkippedReason;
@@ -1956,6 +2433,7 @@ async function runCheck(inputPath, options) {
1956
2433
  model: options.model,
1957
2434
  queries: options.queries,
1958
2435
  numQueries: options.numQueries,
2436
+ seed: options.triggerSeed,
1959
2437
  verbose: options.verbose
1960
2438
  });
1961
2439
  options.onStage?.("eval");
@@ -1963,6 +2441,7 @@ async function runCheck(inputPath, options) {
1963
2441
  provider: options.provider,
1964
2442
  model: options.model,
1965
2443
  graderModel: options.graderModel,
2444
+ numRuns: options.evalNumRuns,
1966
2445
  prompts: options.prompts
1967
2446
  });
1968
2447
  }
@@ -1999,19 +2478,14 @@ async function runCheck(inputPath, options) {
1999
2478
  }
2000
2479
 
2001
2480
  // src/commands/check.ts
2002
- var checkOptionsSchema = z7.object({
2003
- provider: z7.enum(["anthropic", "openai"]),
2004
- model: z7.string(),
2005
- graderModel: z7.string().optional(),
2006
- apiKey: z7.string().optional(),
2007
- queries: z7.string().optional(),
2008
- numQueries: z7.number().int().min(2),
2009
- prompts: z7.string().optional(),
2010
- minF1: z7.number().min(0).max(1),
2011
- minAssertPassRate: z7.number().min(0).max(1),
2012
- saveResults: z7.string().optional(),
2013
- continueOnLintFail: z7.boolean().optional(),
2014
- verbose: z7.boolean().optional()
2481
+ var checkCliSchema = z9.object({
2482
+ graderModel: z9.string().optional(),
2483
+ apiKey: z9.string().optional(),
2484
+ queries: z9.string().optional(),
2485
+ prompts: z9.string().optional(),
2486
+ saveResults: z9.string().optional(),
2487
+ continueOnLintFail: z9.boolean().optional(),
2488
+ verbose: z9.boolean().optional()
2015
2489
  });
2016
2490
  var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
2017
2491
  var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
@@ -2021,106 +2495,110 @@ function resolveModel3(provider, model) {
2021
2495
  }
2022
2496
  return model;
2023
2497
  }
2024
- function registerCheckCommand(program) {
2025
- program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai", "anthropic").option("--model <model>", "Model for trigger/eval runs", DEFAULT_ANTHROPIC_MODEL3).option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10), 20).option("--prompts <path>", "Path to eval prompts JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value), 0.8).option(
2026
- "--min-assert-pass-rate <n>",
2027
- "Minimum required eval assertion pass rate (0-1)",
2028
- (value) => Number.parseFloat(value),
2029
- 0.9
2030
- ).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, commandOptions, command) => {
2031
- const globalOptions = getGlobalCliOptions(command);
2032
- const parsedOptions = checkOptionsSchema.safeParse(commandOptions);
2033
- if (!parsedOptions.success) {
2034
- writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid check options."), globalOptions.json);
2035
- process.exitCode = 2;
2036
- return;
2037
- }
2038
- const options = parsedOptions.data;
2039
- if (options.numQueries % 2 !== 0) {
2040
- writeError(
2041
- new Error("--num-queries must be an even number so the suite can split should/should-not trigger cases."),
2042
- globalOptions.json
2043
- );
2044
- process.exitCode = 2;
2045
- return;
2498
+ async function handleCheckCommand(targetPath, options, command) {
2499
+ const spinner = options.json || !process.stdout.isTTY ? null : ora3("Preparing check run...").start();
2500
+ try {
2501
+ if (spinner) {
2502
+ spinner.text = "Initializing model provider...";
2046
2503
  }
2047
- const spinner = globalOptions.json || !process.stdout.isTTY ? null : ora3("Preparing check run...").start();
2048
- try {
2504
+ const provider = createProvider(options.provider, options.apiKey);
2505
+ let queries = void 0;
2506
+ if (options.queries) {
2049
2507
  if (spinner) {
2050
- spinner.text = "Initializing model provider...";
2508
+ spinner.text = "Loading custom trigger queries...";
2051
2509
  }
2052
- const provider = createProvider(options.provider, options.apiKey);
2053
- let queries = void 0;
2054
- if (options.queries) {
2055
- if (spinner) {
2056
- spinner.text = "Loading custom trigger queries...";
2057
- }
2058
- const loadedQueries = await readJsonFile(options.queries);
2059
- const parsedQueries = triggerQueryArraySchema.safeParse(loadedQueries);
2060
- if (!parsedQueries.success) {
2061
- throw new Error(
2062
- `Invalid --queries JSON: ${parsedQueries.error.issues[0]?.message ?? "unknown format issue"}`
2063
- );
2064
- }
2065
- queries = parsedQueries.data;
2510
+ queries = await loadTriggerQueriesFile(options.queries);
2511
+ }
2512
+ let prompts = void 0;
2513
+ if (options.prompts) {
2514
+ if (spinner) {
2515
+ spinner.text = "Loading eval prompts...";
2066
2516
  }
2067
- let prompts = void 0;
2068
- if (options.prompts) {
2069
- if (spinner) {
2070
- spinner.text = "Loading eval prompts...";
2071
- }
2072
- const loadedPrompts = await readJsonFile(options.prompts);
2073
- const parsedPrompts = evalPromptArraySchema.safeParse(loadedPrompts);
2074
- if (!parsedPrompts.success) {
2075
- throw new Error(
2076
- `Invalid --prompts JSON: ${parsedPrompts.error.issues[0]?.message ?? "unknown format issue"}`
2077
- );
2517
+ prompts = await loadEvalPromptsJson(options.prompts);
2518
+ } else {
2519
+ prompts = await loadConfiguredEvalPrompts(command);
2520
+ }
2521
+ const model = resolveModel3(options.provider, options.model);
2522
+ const graderModel = options.graderModel ?? model;
2523
+ const result = await runCheck(targetPath, {
2524
+ provider,
2525
+ model,
2526
+ graderModel,
2527
+ lintFailOn: options.lintFailOn,
2528
+ lintSuppress: options.lintSuppress,
2529
+ queries,
2530
+ numQueries: options.numQueries,
2531
+ triggerSeed: options.triggerSeed,
2532
+ prompts,
2533
+ evalNumRuns: options.numRuns,
2534
+ minF1: options.minF1,
2535
+ minAssertPassRate: options.minAssertPassRate,
2536
+ continueOnLintFail: options.continueOnLintFail,
2537
+ verbose: options.verbose,
2538
+ onStage: (stage) => {
2539
+ if (!spinner) {
2540
+ return;
2078
2541
  }
2079
- prompts = parsedPrompts.data;
2080
- }
2081
- const model = resolveModel3(options.provider, options.model);
2082
- const graderModel = options.graderModel ?? model;
2083
- const result = await runCheck(targetPath, {
2084
- provider,
2085
- model,
2086
- graderModel,
2087
- queries,
2088
- numQueries: options.numQueries,
2089
- prompts,
2090
- minF1: options.minF1,
2091
- minAssertPassRate: options.minAssertPassRate,
2092
- continueOnLintFail: Boolean(options.continueOnLintFail),
2093
- verbose: Boolean(options.verbose),
2094
- onStage: (stage) => {
2095
- if (!spinner) {
2096
- return;
2097
- }
2098
- if (stage === "lint") {
2099
- spinner.text = "Running lint checks...";
2100
- } else if (stage === "parse") {
2101
- spinner.text = "Parsing skill for model evaluations...";
2102
- } else if (stage === "trigger") {
2103
- spinner.text = "Running trigger test suite...";
2104
- } else if (stage === "eval") {
2105
- spinner.text = "Running end-to-end eval suite...";
2106
- }
2542
+ if (stage === "lint") {
2543
+ spinner.text = "Running lint checks...";
2544
+ } else if (stage === "parse") {
2545
+ spinner.text = "Parsing skill for model evaluations...";
2546
+ } else if (stage === "trigger") {
2547
+ spinner.text = "Running trigger test suite...";
2548
+ } else if (stage === "eval") {
2549
+ spinner.text = "Running end-to-end eval suite...";
2107
2550
  }
2108
- });
2109
- if (options.saveResults) {
2110
- await writeJsonFile(options.saveResults, result);
2111
- }
2112
- spinner?.stop();
2113
- if (globalOptions.json) {
2114
- writeResult(result, true);
2115
- } else {
2116
- writeResult(renderCheckReport(result, globalOptions.color, Boolean(options.verbose)), false);
2117
2551
  }
2118
- process.exitCode = result.gates.overallPassed ? 0 : 1;
2119
- } catch (error) {
2120
- spinner?.stop();
2121
- writeError(error, globalOptions.json);
2552
+ });
2553
+ if (options.saveResults) {
2554
+ await writeJsonFile(options.saveResults, result);
2555
+ }
2556
+ spinner?.stop();
2557
+ if (options.json) {
2558
+ writeResult(result, true);
2559
+ } else {
2560
+ writeResult(renderCheckReport(result, options.color, options.verbose), false);
2561
+ }
2562
+ process.exitCode = result.gates.overallPassed ? 0 : 1;
2563
+ } catch (error) {
2564
+ spinner?.stop();
2565
+ writeError(error, options.json);
2566
+ process.exitCode = 2;
2567
+ }
2568
+ }
2569
+ function registerCheckCommand(program) {
2570
+ program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
2571
+ const globalOptions = getGlobalCliOptions(command);
2572
+ const config = getResolvedConfig(command);
2573
+ const parsedCli = checkCliSchema.safeParse(command.opts());
2574
+ if (!parsedCli.success) {
2575
+ writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid check options."), globalOptions.json);
2122
2576
  process.exitCode = 2;
2577
+ return;
2123
2578
  }
2579
+ await handleCheckCommand(
2580
+ targetPath,
2581
+ {
2582
+ ...globalOptions,
2583
+ provider: config.provider,
2584
+ model: config.model,
2585
+ graderModel: parsedCli.data.graderModel,
2586
+ apiKey: parsedCli.data.apiKey,
2587
+ queries: parsedCli.data.queries,
2588
+ numQueries: config.trigger.numQueries,
2589
+ prompts: parsedCli.data.prompts,
2590
+ minF1: config.trigger.threshold,
2591
+ minAssertPassRate: config.eval.threshold,
2592
+ numRuns: config.eval.numRuns,
2593
+ lintFailOn: config.lint.failOn,
2594
+ lintSuppress: config.lint.suppress,
2595
+ triggerSeed: config.trigger.seed,
2596
+ saveResults: parsedCli.data.saveResults,
2597
+ continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
2598
+ verbose: Boolean(parsedCli.data.verbose)
2599
+ },
2600
+ command
2601
+ );
2124
2602
  });
2125
2603
  }
2126
2604
 
@@ -2128,27 +2606,49 @@ function registerCheckCommand(program) {
2128
2606
  function resolveVersion() {
2129
2607
  try {
2130
2608
  const currentFilePath = fileURLToPath(import.meta.url);
2131
- const packageJsonPath = path5.resolve(path5.dirname(currentFilePath), "..", "package.json");
2132
- const raw = fs5.readFileSync(packageJsonPath, "utf8");
2609
+ const packageJsonPath = path6.resolve(path6.dirname(currentFilePath), "..", "package.json");
2610
+ const raw = fs7.readFileSync(packageJsonPath, "utf8");
2133
2611
  const parsed = JSON.parse(raw);
2134
2612
  return parsed.version ?? "0.0.0";
2135
2613
  } catch {
2136
2614
  return "0.0.0";
2137
2615
  }
2138
2616
  }
2617
/**
 * Decide from the raw argument vector whether top-level errors should be
 * rendered as JSON.
 *
 * Only tokens before the first bare `--` are inspected: everything after the
 * option terminator is a positional operand, so a literal `--json` there is
 * not the flag.
 *
 * @param {string[]} argv - Raw process arguments.
 * @returns {boolean} True when `--json` appears among the option tokens.
 */
function shouldRenderJson(argv) {
  const terminatorIndex = argv.indexOf("--");
  const optionTokens = terminatorIndex === -1 ? argv : argv.slice(0, terminatorIndex);
  return optionTokens.includes("--json");
}
2620
/**
 * Print an error that escaped command handling.
 *
 * In JSON mode a pretty-printed `{ "error": message }` object is written to
 * stdout; otherwise `Error: <message>` is written to stderr. Both end with a
 * newline.
 *
 * @param {unknown} error - Thrown value; non-Error values are stringified.
 * @param {boolean} asJson - Whether to emit the JSON form.
 */
function renderTopLevelError(error, asJson) {
  let message;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }

  if (!asJson) {
    process.stderr.write(`Error: ${message}\n`);
    return;
  }
  const payload = JSON.stringify({ error: message }, null, 2);
  process.stdout.write(`${payload}\n`);
}
2139
2630
  async function run(argv) {
2140
2631
  const program = new Command();
2141
2632
  program.name("skilltest").description("The testing framework for Agent Skills.").version(resolveVersion()).option("--json", "Output results as JSON").option("--no-color", "Disable colored output").showHelpAfterError();
2633
+ program.hook("preAction", async (_program, actionCommand) => {
2634
+ const targetPath = typeof actionCommand.processedArgs[0] === "string" ? actionCommand.processedArgs[0] : void 0;
2635
+ const cliOverrides = extractCliConfigOverrides(actionCommand);
2636
+ const context = await resolveConfigContext(targetPath, cliOverrides);
2637
+ setCommandExecutionContext(actionCommand, context);
2638
+ });
2142
2639
  registerLintCommand(program);
2143
2640
  registerTriggerCommand(program);
2144
2641
  registerEvalCommand(program);
2145
2642
  registerCheckCommand(program);
2146
- await program.parseAsync(argv);
2643
+ try {
2644
+ await program.parseAsync(argv);
2645
+ } catch (error) {
2646
+ renderTopLevelError(error, shouldRenderJson(argv));
2647
+ process.exitCode = 2;
2648
+ }
2147
2649
  }
2148
2650
  run(process.argv).catch((error) => {
2149
- const message = error instanceof Error ? error.message : String(error);
2150
- process.stderr.write(`Error: ${message}
2151
- `);
2651
+ renderTopLevelError(error, shouldRenderJson(process.argv));
2152
2652
  process.exitCode = 2;
2153
2653
  });
2154
2654
  export {