skilltest 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1011 -511
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
5
|
-
import
|
|
4
|
+
import fs7 from "node:fs";
|
|
5
|
+
import path6 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
@@ -194,6 +194,7 @@ function runCompatibilityChecks(context) {
|
|
|
194
194
|
if (hasAllowedTools) {
|
|
195
195
|
issues.push({
|
|
196
196
|
id: "compat.allowed-tools",
|
|
197
|
+
checkId: "compat:frontmatter",
|
|
197
198
|
title: "Platform-Specific Frontmatter",
|
|
198
199
|
status: "warn",
|
|
199
200
|
message: "Frontmatter includes allowed-tools, which is typically Claude-specific.",
|
|
@@ -202,6 +203,7 @@ function runCompatibilityChecks(context) {
|
|
|
202
203
|
} else {
|
|
203
204
|
issues.push({
|
|
204
205
|
id: "compat.allowed-tools",
|
|
206
|
+
checkId: "compat:frontmatter",
|
|
205
207
|
title: "Platform-Specific Frontmatter",
|
|
206
208
|
status: "pass",
|
|
207
209
|
message: "No known provider-specific frontmatter keys detected."
|
|
@@ -211,6 +213,7 @@ function runCompatibilityChecks(context) {
|
|
|
211
213
|
const platform = mentionsClaudeOnly ? "Claude" : "Codex";
|
|
212
214
|
issues.push({
|
|
213
215
|
id: "compat.provider-phrasing",
|
|
216
|
+
checkId: "compat:provider-language",
|
|
214
217
|
title: "Provider-Specific Language",
|
|
215
218
|
status: "warn",
|
|
216
219
|
message: `Skill body appears tuned to ${platform}-specific behavior.`,
|
|
@@ -219,6 +222,7 @@ function runCompatibilityChecks(context) {
|
|
|
219
222
|
} else {
|
|
220
223
|
issues.push({
|
|
221
224
|
id: "compat.provider-phrasing",
|
|
225
|
+
checkId: "compat:provider-language",
|
|
222
226
|
title: "Provider-Specific Language",
|
|
223
227
|
status: "pass",
|
|
224
228
|
message: "Skill body appears provider-neutral."
|
|
@@ -227,6 +231,7 @@ function runCompatibilityChecks(context) {
|
|
|
227
231
|
const likelyCompatibility = hasAllowedTools || mentionsClaudeOnly || mentionsCodexOnly ? "Likely compatible with some agents, but includes platform-specific assumptions." : "Likely broadly compatible across Anthropic, OpenAI/Codex-style, and other markdown skill runners.";
|
|
228
232
|
issues.push({
|
|
229
233
|
id: "compat.summary",
|
|
234
|
+
checkId: "compat:summary",
|
|
230
235
|
title: "Compatibility Hint",
|
|
231
236
|
status: hasAllowedTools || mentionsClaudeOnly || mentionsCodexOnly ? "warn" : "pass",
|
|
232
237
|
message: likelyCompatibility
|
|
@@ -258,6 +263,7 @@ function runContentChecks(context) {
|
|
|
258
263
|
if (!/^#{1,6}\s+\S+/m.test(body)) {
|
|
259
264
|
issues.push({
|
|
260
265
|
id: "content.headers",
|
|
266
|
+
checkId: "content:headers",
|
|
261
267
|
title: "Section Headers",
|
|
262
268
|
status: "warn",
|
|
263
269
|
message: "No markdown headers found in SKILL.md body.",
|
|
@@ -266,6 +272,7 @@ function runContentChecks(context) {
|
|
|
266
272
|
} else {
|
|
267
273
|
issues.push({
|
|
268
274
|
id: "content.headers",
|
|
275
|
+
checkId: "content:headers",
|
|
269
276
|
title: "Section Headers",
|
|
270
277
|
status: "pass",
|
|
271
278
|
message: "SKILL.md contains markdown section headers."
|
|
@@ -275,6 +282,7 @@ function runContentChecks(context) {
|
|
|
275
282
|
if (!hasExamples) {
|
|
276
283
|
issues.push({
|
|
277
284
|
id: "content.examples",
|
|
285
|
+
checkId: "content:examples",
|
|
278
286
|
title: "Examples",
|
|
279
287
|
status: "warn",
|
|
280
288
|
message: "No examples detected in SKILL.md body.",
|
|
@@ -283,6 +291,7 @@ function runContentChecks(context) {
|
|
|
283
291
|
} else {
|
|
284
292
|
issues.push({
|
|
285
293
|
id: "content.examples",
|
|
294
|
+
checkId: "content:examples",
|
|
286
295
|
title: "Examples",
|
|
287
296
|
status: "pass",
|
|
288
297
|
message: "Examples were detected in SKILL.md."
|
|
@@ -292,6 +301,7 @@ function runContentChecks(context) {
|
|
|
292
301
|
if (vagueMatches.length > 0) {
|
|
293
302
|
issues.push({
|
|
294
303
|
id: "content.vagueness",
|
|
304
|
+
checkId: "content:vagueness",
|
|
295
305
|
title: "Instruction Specificity",
|
|
296
306
|
status: "warn",
|
|
297
307
|
message: "Potentially vague instruction phrases detected.",
|
|
@@ -300,6 +310,7 @@ function runContentChecks(context) {
|
|
|
300
310
|
} else {
|
|
301
311
|
issues.push({
|
|
302
312
|
id: "content.vagueness",
|
|
313
|
+
checkId: "content:vagueness",
|
|
303
314
|
title: "Instruction Specificity",
|
|
304
315
|
status: "pass",
|
|
305
316
|
message: "No obvious vague placeholder phrasing found."
|
|
@@ -308,6 +319,7 @@ function runContentChecks(context) {
|
|
|
308
319
|
if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
|
|
309
320
|
issues.push({
|
|
310
321
|
id: "content.frontmatter-angle-brackets",
|
|
322
|
+
checkId: "content:angle-brackets",
|
|
311
323
|
title: "Frontmatter Angle Brackets",
|
|
312
324
|
status: "warn",
|
|
313
325
|
message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
|
|
@@ -316,6 +328,7 @@ function runContentChecks(context) {
|
|
|
316
328
|
} else {
|
|
317
329
|
issues.push({
|
|
318
330
|
id: "content.frontmatter-angle-brackets",
|
|
331
|
+
checkId: "content:angle-brackets",
|
|
319
332
|
title: "Frontmatter Angle Brackets",
|
|
320
333
|
status: "pass",
|
|
321
334
|
message: "No angle bracket tokens detected in frontmatter."
|
|
@@ -330,6 +343,7 @@ function runContentChecks(context) {
|
|
|
330
343
|
if (secretHits.size > 0) {
|
|
331
344
|
issues.push({
|
|
332
345
|
id: "content.secrets",
|
|
346
|
+
checkId: "content:secrets",
|
|
333
347
|
title: "Hardcoded Secrets",
|
|
334
348
|
status: "fail",
|
|
335
349
|
message: `Potential secrets detected (${Array.from(secretHits).join(", ")}).`,
|
|
@@ -338,6 +352,7 @@ function runContentChecks(context) {
|
|
|
338
352
|
} else {
|
|
339
353
|
issues.push({
|
|
340
354
|
id: "content.secrets",
|
|
355
|
+
checkId: "content:secrets",
|
|
341
356
|
title: "Hardcoded Secrets",
|
|
342
357
|
status: "pass",
|
|
343
358
|
message: "No obvious API keys or secrets patterns were detected."
|
|
@@ -346,6 +361,7 @@ function runContentChecks(context) {
|
|
|
346
361
|
if (bodyLines.length < 10) {
|
|
347
362
|
issues.push({
|
|
348
363
|
id: "content.body-length",
|
|
364
|
+
checkId: "content:body-length",
|
|
349
365
|
title: "Body Completeness",
|
|
350
366
|
status: "warn",
|
|
351
367
|
message: `SKILL.md body has only ${bodyLines.length} non-empty lines.`,
|
|
@@ -354,6 +370,7 @@ function runContentChecks(context) {
|
|
|
354
370
|
} else {
|
|
355
371
|
issues.push({
|
|
356
372
|
id: "content.body-length",
|
|
373
|
+
checkId: "content:body-length",
|
|
357
374
|
title: "Body Completeness",
|
|
358
375
|
status: "pass",
|
|
359
376
|
message: `SKILL.md body has ${bodyLines.length} non-empty lines.`
|
|
@@ -362,6 +379,7 @@ function runContentChecks(context) {
|
|
|
362
379
|
if (description && description.length < 50) {
|
|
363
380
|
issues.push({
|
|
364
381
|
id: "content.description-length",
|
|
382
|
+
checkId: "content:description-length",
|
|
365
383
|
title: "Description Specificity",
|
|
366
384
|
status: "warn",
|
|
367
385
|
message: `Description length is ${description.length} characters, which may be too vague for reliable triggering.`,
|
|
@@ -370,6 +388,7 @@ function runContentChecks(context) {
|
|
|
370
388
|
} else if (description) {
|
|
371
389
|
issues.push({
|
|
372
390
|
id: "content.description-length",
|
|
391
|
+
checkId: "content:description-length",
|
|
373
392
|
title: "Description Specificity",
|
|
374
393
|
status: "pass",
|
|
375
394
|
message: "Description length is sufficient for triggerability heuristics."
|
|
@@ -461,6 +480,7 @@ async function runDisclosureChecks(context) {
|
|
|
461
480
|
if (context.skill.lineCount > 200 && !await pathExists(referencesDir)) {
|
|
462
481
|
issues.push({
|
|
463
482
|
id: "disclosure.skill-split",
|
|
483
|
+
checkId: "disclosure:progressive-disclosure",
|
|
464
484
|
title: "Progressive Disclosure",
|
|
465
485
|
status: "warn",
|
|
466
486
|
message: "SKILL.md exceeds 200 lines and no references/ directory is present.",
|
|
@@ -469,6 +489,7 @@ async function runDisclosureChecks(context) {
|
|
|
469
489
|
} else {
|
|
470
490
|
issues.push({
|
|
471
491
|
id: "disclosure.skill-split",
|
|
492
|
+
checkId: "disclosure:progressive-disclosure",
|
|
472
493
|
title: "Progressive Disclosure",
|
|
473
494
|
status: "pass",
|
|
474
495
|
message: "Top-level file length and references/ usage look reasonable."
|
|
@@ -496,6 +517,7 @@ async function runDisclosureChecks(context) {
|
|
|
496
517
|
if (nonRelativeOrEscaping.length > 0) {
|
|
497
518
|
issues.push({
|
|
498
519
|
id: "disclosure.relative-path-root",
|
|
520
|
+
checkId: "disclosure:path-scope",
|
|
499
521
|
title: "Reference Path Scope",
|
|
500
522
|
status: "fail",
|
|
501
523
|
message: `Found non-relative or out-of-root references: ${nonRelativeOrEscaping.join(", ")}`,
|
|
@@ -504,6 +526,7 @@ async function runDisclosureChecks(context) {
|
|
|
504
526
|
} else {
|
|
505
527
|
issues.push({
|
|
506
528
|
id: "disclosure.relative-path-root",
|
|
529
|
+
checkId: "disclosure:path-scope",
|
|
507
530
|
title: "Reference Path Scope",
|
|
508
531
|
status: "pass",
|
|
509
532
|
message: "All detected file references are relative and scoped to skill root."
|
|
@@ -513,6 +536,7 @@ async function runDisclosureChecks(context) {
|
|
|
513
536
|
if (chainDepth > 1) {
|
|
514
537
|
issues.push({
|
|
515
538
|
id: "disclosure.reference-depth",
|
|
539
|
+
checkId: "disclosure:reference-depth",
|
|
516
540
|
title: "Reference Chain Depth",
|
|
517
541
|
status: "warn",
|
|
518
542
|
message: "Deep reference chains detected (>1 level).",
|
|
@@ -521,6 +545,7 @@ async function runDisclosureChecks(context) {
|
|
|
521
545
|
} else {
|
|
522
546
|
issues.push({
|
|
523
547
|
id: "disclosure.reference-depth",
|
|
548
|
+
checkId: "disclosure:reference-depth",
|
|
524
549
|
title: "Reference Chain Depth",
|
|
525
550
|
status: "pass",
|
|
526
551
|
message: "Reference depth is shallow and easy to navigate."
|
|
@@ -530,6 +555,7 @@ async function runDisclosureChecks(context) {
|
|
|
530
555
|
if (normalizedReferences.some((item) => item.includes("../"))) {
|
|
531
556
|
issues.push({
|
|
532
557
|
id: "disclosure.parent-traversal",
|
|
558
|
+
checkId: "disclosure:parent-traversal",
|
|
533
559
|
title: "Parent Traversal",
|
|
534
560
|
status: "warn",
|
|
535
561
|
message: "References include parent-directory traversal (../).",
|
|
@@ -538,6 +564,7 @@ async function runDisclosureChecks(context) {
|
|
|
538
564
|
} else {
|
|
539
565
|
issues.push({
|
|
540
566
|
id: "disclosure.parent-traversal",
|
|
567
|
+
checkId: "disclosure:parent-traversal",
|
|
541
568
|
title: "Parent Traversal",
|
|
542
569
|
status: "pass",
|
|
543
570
|
message: "No parent-directory traversal references detected."
|
|
@@ -568,6 +595,7 @@ function runFrontmatterChecks(context) {
|
|
|
568
595
|
if (!context.frontmatter.hasFrontmatter) {
|
|
569
596
|
issues.push({
|
|
570
597
|
id: "frontmatter.exists",
|
|
598
|
+
checkId: "frontmatter:exists",
|
|
571
599
|
title: "Frontmatter Presence",
|
|
572
600
|
status: "fail",
|
|
573
601
|
message: "SKILL.md is missing YAML frontmatter delimited by --- blocks.",
|
|
@@ -578,6 +606,7 @@ function runFrontmatterChecks(context) {
|
|
|
578
606
|
if (context.frontmatter.error) {
|
|
579
607
|
issues.push({
|
|
580
608
|
id: "frontmatter.valid-yaml",
|
|
609
|
+
checkId: "frontmatter:yaml",
|
|
581
610
|
title: "Frontmatter YAML",
|
|
582
611
|
status: "fail",
|
|
583
612
|
message: `Frontmatter is not valid YAML: ${context.frontmatter.error}`,
|
|
@@ -587,6 +616,7 @@ function runFrontmatterChecks(context) {
|
|
|
587
616
|
}
|
|
588
617
|
issues.push({
|
|
589
618
|
id: "frontmatter.valid-yaml",
|
|
619
|
+
checkId: "frontmatter:yaml",
|
|
590
620
|
title: "Frontmatter YAML",
|
|
591
621
|
status: "pass",
|
|
592
622
|
message: "Frontmatter exists and parses correctly."
|
|
@@ -596,6 +626,7 @@ function runFrontmatterChecks(context) {
|
|
|
596
626
|
if (!name) {
|
|
597
627
|
issues.push({
|
|
598
628
|
id: "frontmatter.name.required",
|
|
629
|
+
checkId: "frontmatter:name",
|
|
599
630
|
title: "Frontmatter Name",
|
|
600
631
|
status: "fail",
|
|
601
632
|
message: "Missing required frontmatter field: name.",
|
|
@@ -604,6 +635,7 @@ function runFrontmatterChecks(context) {
|
|
|
604
635
|
} else if (name.length > 64) {
|
|
605
636
|
issues.push({
|
|
606
637
|
id: "frontmatter.name.length",
|
|
638
|
+
checkId: "frontmatter:name",
|
|
607
639
|
title: "Frontmatter Name Length",
|
|
608
640
|
status: "fail",
|
|
609
641
|
message: `name is too long (${name.length} chars, max 64).`,
|
|
@@ -612,6 +644,7 @@ function runFrontmatterChecks(context) {
|
|
|
612
644
|
} else if (!SKILL_NAME_REGEX.test(name)) {
|
|
613
645
|
issues.push({
|
|
614
646
|
id: "frontmatter.name.format",
|
|
647
|
+
checkId: "frontmatter:name",
|
|
615
648
|
title: "Frontmatter Name Format",
|
|
616
649
|
status: "fail",
|
|
617
650
|
message: "name must be lowercase alphanumeric with single hyphen separators only.",
|
|
@@ -620,6 +653,7 @@ function runFrontmatterChecks(context) {
|
|
|
620
653
|
} else {
|
|
621
654
|
issues.push({
|
|
622
655
|
id: "frontmatter.name.valid",
|
|
656
|
+
checkId: "frontmatter:name",
|
|
623
657
|
title: "Frontmatter Name",
|
|
624
658
|
status: "pass",
|
|
625
659
|
message: "name is present and follows naming conventions."
|
|
@@ -629,6 +663,7 @@ function runFrontmatterChecks(context) {
|
|
|
629
663
|
if (!description || description.trim() === "") {
|
|
630
664
|
issues.push({
|
|
631
665
|
id: "frontmatter.description.required",
|
|
666
|
+
checkId: "frontmatter:description",
|
|
632
667
|
title: "Frontmatter Description",
|
|
633
668
|
status: "fail",
|
|
634
669
|
message: "Missing required frontmatter field: description.",
|
|
@@ -637,6 +672,7 @@ function runFrontmatterChecks(context) {
|
|
|
637
672
|
} else if (description.length > 1024) {
|
|
638
673
|
issues.push({
|
|
639
674
|
id: "frontmatter.description.length",
|
|
675
|
+
checkId: "frontmatter:description",
|
|
640
676
|
title: "Frontmatter Description Length",
|
|
641
677
|
status: "fail",
|
|
642
678
|
message: `description is too long (${description.length} chars, max 1024).`,
|
|
@@ -645,6 +681,7 @@ function runFrontmatterChecks(context) {
|
|
|
645
681
|
} else {
|
|
646
682
|
issues.push({
|
|
647
683
|
id: "frontmatter.description.valid",
|
|
684
|
+
checkId: "frontmatter:description",
|
|
648
685
|
title: "Frontmatter Description",
|
|
649
686
|
status: "pass",
|
|
650
687
|
message: "description is present and within allowed length."
|
|
@@ -654,6 +691,7 @@ function runFrontmatterChecks(context) {
|
|
|
654
691
|
if (!license || license.trim() === "") {
|
|
655
692
|
issues.push({
|
|
656
693
|
id: "frontmatter.license.recommended",
|
|
694
|
+
checkId: "frontmatter:license",
|
|
657
695
|
title: "Frontmatter License",
|
|
658
696
|
status: "warn",
|
|
659
697
|
message: "No license field found in frontmatter.",
|
|
@@ -662,6 +700,7 @@ function runFrontmatterChecks(context) {
|
|
|
662
700
|
} else {
|
|
663
701
|
issues.push({
|
|
664
702
|
id: "frontmatter.license.present",
|
|
703
|
+
checkId: "frontmatter:license",
|
|
665
704
|
title: "Frontmatter License",
|
|
666
705
|
status: "pass",
|
|
667
706
|
message: "license field is present."
|
|
@@ -670,6 +709,7 @@ function runFrontmatterChecks(context) {
|
|
|
670
709
|
if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
|
|
671
710
|
issues.push({
|
|
672
711
|
id: "frontmatter.description.triggerability",
|
|
712
|
+
checkId: "frontmatter:triggerability",
|
|
673
713
|
title: "Description Trigger Clarity",
|
|
674
714
|
status: "warn",
|
|
675
715
|
message: "Description should explain both what the skill does and when it should be used.",
|
|
@@ -678,6 +718,7 @@ function runFrontmatterChecks(context) {
|
|
|
678
718
|
} else if (description) {
|
|
679
719
|
issues.push({
|
|
680
720
|
id: "frontmatter.description.triggerability",
|
|
721
|
+
checkId: "frontmatter:triggerability",
|
|
681
722
|
title: "Description Trigger Clarity",
|
|
682
723
|
status: "pass",
|
|
683
724
|
message: "Description appears to cover both capability and usage context."
|
|
@@ -751,6 +792,7 @@ function runSecurityChecks(context) {
|
|
|
751
792
|
if (dangerousCommandHits.length > 0) {
|
|
752
793
|
issues.push({
|
|
753
794
|
id: "security.dangerous-command-patterns",
|
|
795
|
+
checkId: "security:dangerous-commands",
|
|
754
796
|
title: "Dangerous Command Patterns",
|
|
755
797
|
status: "fail",
|
|
756
798
|
message: `Potentially dangerous command instruction patterns found: ${dangerousCommandHits.join(", ")}.`,
|
|
@@ -759,6 +801,7 @@ function runSecurityChecks(context) {
|
|
|
759
801
|
} else {
|
|
760
802
|
issues.push({
|
|
761
803
|
id: "security.dangerous-command-patterns",
|
|
804
|
+
checkId: "security:dangerous-commands",
|
|
762
805
|
title: "Dangerous Command Patterns",
|
|
763
806
|
status: "pass",
|
|
764
807
|
message: "No high-risk destructive or direct pipe-to-shell patterns detected."
|
|
@@ -768,6 +811,7 @@ function runSecurityChecks(context) {
|
|
|
768
811
|
if (exfiltrationHits.length > 0) {
|
|
769
812
|
issues.push({
|
|
770
813
|
id: "security.exfiltration-patterns",
|
|
814
|
+
checkId: "security:exfiltration",
|
|
771
815
|
title: "Sensitive Data Exfiltration",
|
|
772
816
|
status: "fail",
|
|
773
817
|
message: `Possible sensitive data exfiltration patterns found: ${exfiltrationHits.join(", ")}.`,
|
|
@@ -776,6 +820,7 @@ function runSecurityChecks(context) {
|
|
|
776
820
|
} else {
|
|
777
821
|
issues.push({
|
|
778
822
|
id: "security.exfiltration-patterns",
|
|
823
|
+
checkId: "security:exfiltration",
|
|
779
824
|
title: "Sensitive Data Exfiltration",
|
|
780
825
|
status: "pass",
|
|
781
826
|
message: "No obvious credential access/exfiltration instructions detected."
|
|
@@ -785,6 +830,7 @@ function runSecurityChecks(context) {
|
|
|
785
830
|
if (escalationHits.length > 0) {
|
|
786
831
|
issues.push({
|
|
787
832
|
id: "security.privilege-escalation",
|
|
833
|
+
checkId: "security:privilege-escalation",
|
|
788
834
|
title: "Privilege Escalation Language",
|
|
789
835
|
status: "warn",
|
|
790
836
|
message: `Potentially risky privilege/execution language detected: ${escalationHits.join(", ")}.`,
|
|
@@ -793,6 +839,7 @@ function runSecurityChecks(context) {
|
|
|
793
839
|
} else {
|
|
794
840
|
issues.push({
|
|
795
841
|
id: "security.privilege-escalation",
|
|
842
|
+
checkId: "security:privilege-escalation",
|
|
796
843
|
title: "Privilege Escalation Language",
|
|
797
844
|
status: "pass",
|
|
798
845
|
message: "No obvious privilege-escalation language detected."
|
|
@@ -802,6 +849,7 @@ function runSecurityChecks(context) {
|
|
|
802
849
|
if (hasShellActivity && !SAFETY_GUARDRAIL_PATTERN.test(skillText)) {
|
|
803
850
|
issues.push({
|
|
804
851
|
id: "security.safety-guardrails",
|
|
852
|
+
checkId: "security:missing-guardrails",
|
|
805
853
|
title: "Execution Safety Guardrails",
|
|
806
854
|
status: "warn",
|
|
807
855
|
message: "Shell/tool execution is present, but no explicit safety guardrails were detected.",
|
|
@@ -810,6 +858,7 @@ function runSecurityChecks(context) {
|
|
|
810
858
|
} else {
|
|
811
859
|
issues.push({
|
|
812
860
|
id: "security.safety-guardrails",
|
|
861
|
+
checkId: "security:missing-guardrails",
|
|
813
862
|
title: "Execution Safety Guardrails",
|
|
814
863
|
status: "pass",
|
|
815
864
|
message: hasShellActivity ? "Shell/tool execution instructions include at least one safety guardrail." : "No shell/tool execution instructions detected."
|
|
@@ -846,6 +895,7 @@ async function runStructureChecks(context) {
|
|
|
846
895
|
if (context.skill.lineCount > 500) {
|
|
847
896
|
issues.push({
|
|
848
897
|
id: "structure.skill-size",
|
|
898
|
+
checkId: "structure:file-size",
|
|
849
899
|
title: "SKILL.md Size",
|
|
850
900
|
status: "warn",
|
|
851
901
|
message: `SKILL.md is ${context.skill.lineCount} lines (recommended max is 500).`,
|
|
@@ -854,6 +904,7 @@ async function runStructureChecks(context) {
|
|
|
854
904
|
} else {
|
|
855
905
|
issues.push({
|
|
856
906
|
id: "structure.skill-size",
|
|
907
|
+
checkId: "structure:file-size",
|
|
857
908
|
title: "SKILL.md Size",
|
|
858
909
|
status: "pass",
|
|
859
910
|
message: `SKILL.md length is ${context.skill.lineCount} lines.`
|
|
@@ -870,6 +921,7 @@ async function runStructureChecks(context) {
|
|
|
870
921
|
oversizedWithoutToc += 1;
|
|
871
922
|
issues.push({
|
|
872
923
|
id: `structure.references.toc.${toPosixPath(path4.relative(context.skill.skillRoot, file))}`,
|
|
924
|
+
checkId: "structure:toc",
|
|
873
925
|
title: "Reference File Navigation",
|
|
874
926
|
status: "warn",
|
|
875
927
|
message: `${toPosixPath(path4.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
|
|
@@ -880,6 +932,7 @@ async function runStructureChecks(context) {
|
|
|
880
932
|
if (oversizedWithoutToc === 0) {
|
|
881
933
|
issues.push({
|
|
882
934
|
id: "structure.references.toc",
|
|
935
|
+
checkId: "structure:toc",
|
|
883
936
|
title: "Reference File Navigation",
|
|
884
937
|
status: "pass",
|
|
885
938
|
message: "No oversized reference files missing a table of contents."
|
|
@@ -888,6 +941,7 @@ async function runStructureChecks(context) {
|
|
|
888
941
|
} else {
|
|
889
942
|
issues.push({
|
|
890
943
|
id: "structure.references.toc",
|
|
944
|
+
checkId: "structure:toc",
|
|
891
945
|
title: "Reference File Navigation",
|
|
892
946
|
status: "pass",
|
|
893
947
|
message: "No references/ directory found, so no long reference files to validate."
|
|
@@ -916,6 +970,7 @@ async function runStructureChecks(context) {
|
|
|
916
970
|
if (missing.length > 0) {
|
|
917
971
|
issues.push({
|
|
918
972
|
id: `structure.${category.key}.exists`,
|
|
973
|
+
checkId: "structure:references",
|
|
919
974
|
title: category.title,
|
|
920
975
|
status: "fail",
|
|
921
976
|
message: `Missing referenced ${category.key} file(s): ${missing.join(", ")}`,
|
|
@@ -924,6 +979,7 @@ async function runStructureChecks(context) {
|
|
|
924
979
|
} else {
|
|
925
980
|
issues.push({
|
|
926
981
|
id: `structure.${category.key}.exists`,
|
|
982
|
+
checkId: "structure:references",
|
|
927
983
|
title: category.title,
|
|
928
984
|
status: "pass",
|
|
929
985
|
message: `All referenced ${category.key} files exist.`
|
|
@@ -934,6 +990,7 @@ async function runStructureChecks(context) {
|
|
|
934
990
|
if (missingGeneric.length > 0) {
|
|
935
991
|
issues.push({
|
|
936
992
|
id: "structure.relative-links.broken",
|
|
993
|
+
checkId: "structure:references",
|
|
937
994
|
title: "Relative Links",
|
|
938
995
|
status: "fail",
|
|
939
996
|
message: `Broken relative path reference(s): ${missingGeneric.join(", ")}`,
|
|
@@ -942,6 +999,7 @@ async function runStructureChecks(context) {
|
|
|
942
999
|
} else {
|
|
943
1000
|
issues.push({
|
|
944
1001
|
id: "structure.relative-links.broken",
|
|
1002
|
+
checkId: "structure:references",
|
|
945
1003
|
title: "Relative Links",
|
|
946
1004
|
status: "pass",
|
|
947
1005
|
message: "No broken generic relative file references were found."
|
|
@@ -971,7 +1029,13 @@ function summarizeIssues(issues) {
|
|
|
971
1029
|
}
|
|
972
1030
|
return summary;
|
|
973
1031
|
}
|
|
974
|
-
|
|
1032
|
+
function lintFails(report, failOn) {
|
|
1033
|
+
if (report.summary.failures > 0) {
|
|
1034
|
+
return true;
|
|
1035
|
+
}
|
|
1036
|
+
return failOn === "warn" && report.summary.warnings > 0;
|
|
1037
|
+
}
|
|
1038
|
+
async function runLinter(inputPath, options = {}) {
|
|
975
1039
|
const skill = await loadSkillFile(inputPath);
|
|
976
1040
|
const frontmatter = parseFrontmatter(skill.raw);
|
|
977
1041
|
const context = {
|
|
@@ -985,10 +1049,12 @@ async function runLinter(inputPath) {
|
|
|
985
1049
|
issues.push(...runSecurityChecks(context));
|
|
986
1050
|
issues.push(...await runDisclosureChecks(context));
|
|
987
1051
|
issues.push(...runCompatibilityChecks(context));
|
|
1052
|
+
const suppress = new Set(options.suppress ?? []);
|
|
1053
|
+
const filteredIssues = issues.filter((issue) => !suppress.has(issue.checkId));
|
|
988
1054
|
return {
|
|
989
1055
|
target: inputPath,
|
|
990
|
-
issues,
|
|
991
|
-
summary: summarizeIssues(
|
|
1056
|
+
issues: filteredIssues,
|
|
1057
|
+
summary: summarizeIssues(filteredIssues)
|
|
992
1058
|
};
|
|
993
1059
|
}
|
|
994
1060
|
|
|
@@ -1159,71 +1225,165 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1159
1225
|
return lines.join("\n");
|
|
1160
1226
|
}
|
|
1161
1227
|
|
|
1162
|
-
// src/reporters/json.ts
|
|
1163
|
-
function renderJson(value) {
|
|
1164
|
-
return JSON.stringify(value, null, 2);
|
|
1165
|
-
}
|
|
1166
|
-
|
|
1167
1228
|
// src/commands/common.ts
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1229
|
+
import fs5 from "node:fs/promises";
|
|
1230
|
+
import { z as z5 } from "zod";
|
|
1231
|
+
|
|
1232
|
+
// src/core/eval-runner.ts
|
|
1233
|
+
import { z as z3 } from "zod";
|
|
1234
|
+
|
|
1235
|
+
// src/core/grader.ts
|
|
1236
|
+
import { z as z2 } from "zod";
|
|
1237
|
+
var gradedAssertionSchema = z2.object({
|
|
1238
|
+
assertion: z2.string(),
|
|
1239
|
+
passed: z2.boolean(),
|
|
1240
|
+
evidence: z2.string()
|
|
1241
|
+
});
|
|
1242
|
+
var graderOutputSchema = z2.object({
|
|
1243
|
+
assertions: z2.array(gradedAssertionSchema)
|
|
1244
|
+
});
|
|
1245
|
+
function extractJsonObject(raw) {
|
|
1246
|
+
const trimmed = raw.trim();
|
|
1247
|
+
if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
|
|
1248
|
+
return JSON.parse(trimmed);
|
|
1180
1249
|
}
|
|
1181
|
-
|
|
1182
|
-
|
|
1250
|
+
const start = trimmed.indexOf("{");
|
|
1251
|
+
const end = trimmed.lastIndexOf("}");
|
|
1252
|
+
if (start >= 0 && end > start) {
|
|
1253
|
+
return JSON.parse(trimmed.slice(start, end + 1));
|
|
1254
|
+
}
|
|
1255
|
+
throw new Error("Grader did not return a JSON object.");
|
|
1183
1256
|
}
|
|
1184
|
-
function
|
|
1185
|
-
const
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1257
|
+
async function gradeResponse(options) {
|
|
1258
|
+
const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
|
|
1259
|
+
"The response follows the skill instructions faithfully.",
|
|
1260
|
+
"The response is well-structured and actionable.",
|
|
1261
|
+
"The response addresses the user prompt directly."
|
|
1262
|
+
];
|
|
1263
|
+
const systemPrompt = [
|
|
1264
|
+
"You are a strict evaluator for agent skill outputs.",
|
|
1265
|
+
"Assess each assertion and return JSON only.",
|
|
1266
|
+
'Required output format: {"assertions":[{"assertion":"...","passed":true|false,"evidence":"..."}]}'
|
|
1267
|
+
].join(" ");
|
|
1268
|
+
const userPrompt = [
|
|
1269
|
+
`Skill: ${options.skillName}`,
|
|
1270
|
+
"Skill instructions:",
|
|
1271
|
+
options.skillBody,
|
|
1272
|
+
"",
|
|
1273
|
+
`User prompt: ${options.userPrompt}`,
|
|
1274
|
+
"",
|
|
1275
|
+
"Model response:",
|
|
1276
|
+
options.modelResponse,
|
|
1277
|
+
"",
|
|
1278
|
+
"Assertions to evaluate:",
|
|
1279
|
+
assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
|
|
1280
|
+
].join("\n");
|
|
1281
|
+
const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
1282
|
+
const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
|
|
1283
|
+
if (!parsed.success) {
|
|
1284
|
+
throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
|
|
1190
1285
|
}
|
|
1191
|
-
|
|
1192
|
-
`);
|
|
1286
|
+
return parsed.data.assertions;
|
|
1193
1287
|
}
|
|
1194
1288
|
|
|
1195
|
-
// src/
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1289
|
+
// src/core/eval-runner.ts
|
|
1290
|
+
var evalPromptSchema = z3.object({
|
|
1291
|
+
prompt: z3.string().min(1),
|
|
1292
|
+
assertions: z3.array(z3.string().min(1)).optional()
|
|
1293
|
+
});
|
|
1294
|
+
var evalPromptArraySchema = z3.array(evalPromptSchema);
|
|
1295
|
+
function extractJsonArray(raw) {
|
|
1296
|
+
const trimmed = raw.trim();
|
|
1297
|
+
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
1298
|
+
return JSON.parse(trimmed);
|
|
1299
|
+
}
|
|
1300
|
+
const start = trimmed.indexOf("[");
|
|
1301
|
+
const end = trimmed.lastIndexOf("]");
|
|
1302
|
+
if (start >= 0 && end > start) {
|
|
1303
|
+
return JSON.parse(trimmed.slice(start, end + 1));
|
|
1304
|
+
}
|
|
1305
|
+
throw new Error("Model did not return a JSON array.");
|
|
1306
|
+
}
|
|
1307
|
+
async function generatePrompts(skill, provider, model, count) {
|
|
1308
|
+
const systemPrompt = [
|
|
1309
|
+
"You generate realistic evaluation prompts for an agent skill.",
|
|
1310
|
+
"Return JSON only.",
|
|
1311
|
+
'Format: [{"prompt":"...","assertions":["...", "..."]}]',
|
|
1312
|
+
"Assertions should be concrete and checkable."
|
|
1313
|
+
].join(" ");
|
|
1314
|
+
const userPrompt = [
|
|
1315
|
+
`Skill name: ${skill.frontmatter.name}`,
|
|
1316
|
+
`Skill description: ${skill.frontmatter.description}`,
|
|
1317
|
+
"Skill instructions:",
|
|
1318
|
+
skill.content,
|
|
1319
|
+
"",
|
|
1320
|
+
`Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
|
|
1321
|
+
"Each prompt should include 2-4 assertions."
|
|
1322
|
+
].join("\n");
|
|
1323
|
+
const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
|
|
1324
|
+
const parsed = evalPromptArraySchema.safeParse(extractJsonArray(raw));
|
|
1325
|
+
if (!parsed.success) {
|
|
1326
|
+
throw new Error(`Failed to parse generated eval prompts: ${parsed.error.issues[0]?.message ?? "invalid prompt JSON"}`);
|
|
1327
|
+
}
|
|
1328
|
+
if (parsed.data.length !== count) {
|
|
1329
|
+
throw new Error(`Expected ${count} prompts, got ${parsed.data.length}.`);
|
|
1330
|
+
}
|
|
1331
|
+
return parsed.data;
|
|
1332
|
+
}
|
|
1333
|
+
/**
 * Sends every eval prompt to the model with the skill activated, grades each
 * response, and aggregates the assertion counts.
 *
 * When `options.prompts` is non-empty it is validated with
 * `evalPromptArraySchema`; otherwise prompts are generated with
 * `generatePrompts`, passing `options.numRuns`.
 *
 * @param skill - Parsed skill; `frontmatter.name`, `content` and `raw` are read.
 * @param options - Reads `provider`, `model`, `graderModel`, `numRuns`, `prompts`.
 * @returns Run metadata, the prompts used, per-prompt results and a summary.
 */
async function runEval(skill, options) {
  let prompts;
  if (options.prompts && options.prompts.length > 0) {
    prompts = evalPromptArraySchema.parse(options.prompts);
  } else {
    prompts = await generatePrompts(skill, options.provider, options.model, options.numRuns);
  }

  const results = [];
  let totalAssertions = 0;
  let passedAssertions = 0;

  // Prompts are processed one at a time, in order.
  for (const evalPrompt of prompts) {
    const systemPrompt = [
      "You are an AI assistant with an activated skill.",
      "Follow this SKILL.md content exactly where applicable.",
      "",
      skill.raw
    ].join("\n");
    const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, {
      model: options.model
    });
    const graded = await gradeResponse({
      provider: options.provider,
      model: options.graderModel,
      skillName: skill.frontmatter.name,
      skillBody: skill.content,
      userPrompt: evalPrompt.prompt,
      modelResponse: response,
      assertions: evalPrompt.assertions
    });
    const passedCount = graded.filter((entry) => entry.passed).length;

    results.push({
      prompt: evalPrompt.prompt,
      assertions: graded,
      responseSummary: response.slice(0, 200),
      response,
      passedAssertions: passedCount,
      totalAssertions: graded.length
    });
    totalAssertions += graded.length;
    passedAssertions += passedCount;
  }

  return {
    skillName: skill.frontmatter.name,
    model: options.model,
    graderModel: options.graderModel,
    provider: options.provider.name,
    prompts,
    results,
    summary: {
      totalPrompts: results.length,
      totalAssertions,
      passedAssertions
    }
  };
}
|
|
1215
1379
|
|
|
1216
|
-
// src/commands/trigger.ts
|
|
1217
|
-
import ora from "ora";
|
|
1218
|
-
import { z as z3 } from "zod";
|
|
1219
|
-
|
|
1220
1380
|
// src/core/trigger-tester.ts
|
|
1221
|
-
import { z as
|
|
1222
|
-
var triggerQuerySchema =
|
|
1223
|
-
query:
|
|
1224
|
-
should_trigger:
|
|
1381
|
+
import { z as z4 } from "zod";
|
|
1382
|
+
var triggerQuerySchema = z4.object({
|
|
1383
|
+
query: z4.string().min(1),
|
|
1384
|
+
should_trigger: z4.boolean()
|
|
1225
1385
|
});
|
|
1226
|
-
var triggerQueryArraySchema =
|
|
1386
|
+
var triggerQueryArraySchema = z4.array(triggerQuerySchema);
|
|
1227
1387
|
var FAKE_SKILLS = [
|
|
1228
1388
|
{ name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
|
|
1229
1389
|
{ name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
|
|
@@ -1241,16 +1401,23 @@ var FAKE_SKILLS = [
|
|
|
1241
1401
|
{ name: "test-generator", description: "Generates unit and integration test cases from feature requirements." },
|
|
1242
1402
|
{ name: "prompt-tuner", description: "Improves prompts for reliability, formatting, and failure handling." }
|
|
1243
1403
|
];
|
|
1244
|
-
function
|
|
1404
|
+
/**
 * Builds a deterministic pseudo-random generator from an integer seed.
 *
 * The seed is coerced to an unsigned 32-bit integer. Each call advances the
 * state with `state * 1664525 + 1013904223` (wrapped to 32 bits) and returns
 * the new state divided by 2^32, i.e. a value in [0, 1).
 *
 * @param seed - Seed value; coerced with `>>> 0`.
 * @returns A zero-argument function producing the next number.
 */
function createSeededRandom(seed) {
  const MULTIPLIER = 1664525;
  const INCREMENT = 1013904223;
  const RANGE = 4294967296;
  let current = seed >>> 0;
  return function nextRandom() {
    current = (current * MULTIPLIER + INCREMENT) >>> 0;
    return current / RANGE;
  };
}
|
|
1411
|
+
/**
 * Returns a shuffled copy of `values`; the input is not modified.
 *
 * Walks from the last position down to the second, swapping each position
 * with an index drawn from `random()` among the positions not yet fixed.
 *
 * @param values - Iterable of items to shuffle.
 * @param random - Source of numbers in [0, 1); defaults to `Math.random`.
 * @returns A new array with the same items in shuffled order.
 */
function shuffle(values, random = Math.random) {
  const result = [...values];
  let remaining = result.length;
  while (remaining > 1) {
    const pick = Math.floor(random() * remaining);
    remaining -= 1;
    const held = result[remaining];
    result[remaining] = result[pick];
    result[pick] = held;
  }
  return result;
}
|
|
1252
|
-
function sample(values, count) {
|
|
1253
|
-
return shuffle(values).slice(0, Math.max(0, Math.min(count, values.length)));
|
|
1419
|
+
/**
 * Picks up to `count` items from `values` by shuffling and taking a prefix.
 *
 * @param values - Items to draw from.
 * @param count - Requested number of items; clamped to [0, values.length].
 * @param random - Source of numbers in [0, 1); defaults to `Math.random`.
 * @returns A new array holding the selected items.
 */
function sample(values, count, random = Math.random) {
  const shuffled = shuffle(values, random);
  const limit = Math.max(0, Math.min(count, values.length));
  return shuffled.slice(0, limit);
}
|
|
1255
1422
|
function parseJsonArrayFromModelOutput(raw) {
|
|
1256
1423
|
const trimmed = raw.trim();
|
|
@@ -1362,19 +1529,20 @@ function buildSuggestions(metrics) {
|
|
|
1362
1529
|
return suggestions;
|
|
1363
1530
|
}
|
|
1364
1531
|
async function runTriggerTest(skill, options) {
|
|
1532
|
+
const random = options.seed === void 0 ? Math.random : createSeededRandom(options.seed);
|
|
1365
1533
|
const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
|
|
1366
1534
|
const results = [];
|
|
1367
1535
|
const skillName = skill.frontmatter.name;
|
|
1368
1536
|
for (const testQuery of queries) {
|
|
1369
|
-
const fakeCount = 5 + Math.floor(
|
|
1370
|
-
const fakeSkills = sample(FAKE_SKILLS, fakeCount);
|
|
1537
|
+
const fakeCount = 5 + Math.floor(random() * 4);
|
|
1538
|
+
const fakeSkills = sample(FAKE_SKILLS, fakeCount, random);
|
|
1371
1539
|
const allSkills = shuffle([
|
|
1372
1540
|
...fakeSkills,
|
|
1373
1541
|
{
|
|
1374
1542
|
name: skill.frontmatter.name,
|
|
1375
1543
|
description: skill.frontmatter.description
|
|
1376
1544
|
}
|
|
1377
|
-
]);
|
|
1545
|
+
], random);
|
|
1378
1546
|
const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
|
|
1379
1547
|
const systemPrompt = [
|
|
1380
1548
|
"You are selecting one skill to activate for a user query.",
|
|
@@ -1410,74 +1578,517 @@ async function runTriggerTest(skill, options) {
|
|
|
1410
1578
|
};
|
|
1411
1579
|
}
|
|
1412
1580
|
|
|
1413
|
-
// src/
|
|
1414
|
-
function
|
|
1415
|
-
|
|
1416
|
-
|
|
1581
|
+
// src/reporters/json.ts
|
|
1582
|
+
/**
 * Serializes a value as JSON indented with two spaces.
 *
 * @param value - Any JSON-serializable value.
 * @returns The formatted JSON text.
 */
function renderJson(value) {
  const indentWidth = 2;
  return JSON.stringify(value, null, indentWidth);
}
|
|
1585
|
+
|
|
1586
|
+
// src/commands/common.ts
|
|
1587
|
+
var executionContextByCommand = /* @__PURE__ */ new WeakMap();
|
|
1588
|
+
var singleEvalPromptSchema = z5.object({
|
|
1589
|
+
prompt: z5.string().min(1),
|
|
1590
|
+
assertions: z5.array(z5.string().min(1)).optional()
|
|
1591
|
+
});
|
|
1592
|
+
var promptStringArraySchema = z5.array(z5.string().min(1));
|
|
1593
|
+
var assertionsObjectSchema = z5.object({
|
|
1594
|
+
assertions: z5.array(z5.string().min(1))
|
|
1595
|
+
});
|
|
1596
|
+
/**
 * Interprets file content as JSON when it is structured JSON, and otherwise
 * hands the text back unchanged.
 *
 * Only objects, arrays and JSON strings are treated as JSON. Content that
 * merely happens to be a JSON scalar (for example `42`, `true` or `null`) is
 * returned as the original text, so a plain-text prompt or assertion file
 * with such content is still handled as text by callers that check
 * `typeof value === "string"`.
 *
 * @param raw - Text to interpret.
 * @returns The parsed object/array/string, or `raw` when the text is not
 *   valid JSON or parses to a number, boolean or null.
 */
function parseJsonIfPossible(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Not JSON at all: treat the content as plain text.
    return raw;
  }
  const isStructured = typeof parsed === "string" || (typeof parsed === "object" && parsed !== null);
  return isStructured ? parsed : raw;
}
|
|
1603
|
+
/**
 * Converts a loosely-typed prompt source into an array of eval prompts.
 *
 * Interpretations are tried in order: an eval-prompt array, a single prompt
 * object, an array of prompt strings, and finally a non-blank plain string
 * (which is trimmed).
 *
 * @param value - Parsed JSON or raw text.
 * @param sourceLabel - Label used in the error message.
 * @returns Array of prompt objects.
 * @throws Error when no interpretation matches.
 */
function normalizeEvalPrompts(value, sourceLabel) {
  const interpretations = [
    [evalPromptArraySchema, (prompts) => prompts],
    [singleEvalPromptSchema, (single) => [single]],
    [promptStringArraySchema, (texts) => texts.map((prompt) => ({ prompt }))]
  ];
  for (const [schema, toPrompts] of interpretations) {
    const attempt = schema.safeParse(value);
    if (attempt.success) {
      return toPrompts(attempt.data);
    }
  }
  if (typeof value === "string") {
    const text = value.trim();
    if (text !== "") {
      return [{ prompt: text }];
    }
  }
  throw new Error(
    `Invalid eval prompt source at ${sourceLabel}. Expected plain text, a JSON prompt object, or a JSON array of prompts.`
  );
}
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
import Anthropic from "@anthropic-ai/sdk";
|
|
1436
|
-
function wait(ms) {
|
|
1437
|
-
return new Promise((resolve) => {
|
|
1438
|
-
setTimeout(resolve, ms);
|
|
1439
|
-
});
|
|
1623
|
+
/**
 * Splits newline-delimited text into assertion strings.
 *
 * Each line is trimmed, then a leading `-`/`*` bullet followed by whitespace
 * is removed, then a leading `N.` numbering followed by whitespace is
 * removed. Lines that end up empty are dropped.
 *
 * @param raw - Text with one assertion per line (LF or CRLF).
 * @returns The extracted assertions in their original order.
 */
function parseAssertionsFromText(raw) {
  const assertions = [];
  for (const line of raw.split(/\r?\n/)) {
    const withoutBullet = line.trim().replace(/^[-*]\s+/, "");
    const text = withoutBullet.replace(/^\d+\.\s+/, "");
    if (text.length > 0) {
      assertions.push(text);
    }
  }
  return assertions;
}
|
|
1441
|
-
function
|
|
1442
|
-
|
|
1443
|
-
|
|
1626
|
+
/**
 * Converts a loosely-typed assertions source into an array of strings.
 *
 * Accepts a JSON array of non-empty strings, an object with an `assertions`
 * array, or newline-delimited text that yields at least one assertion.
 *
 * @param value - Parsed JSON or raw text.
 * @param sourceLabel - Label used in the error message.
 * @returns Array of assertion strings.
 * @throws Error when the value matches none of the accepted forms.
 */
function normalizeAssertions(value, sourceLabel) {
  const asList = z5.array(z5.string().min(1)).safeParse(value);
  if (asList.success) {
    return asList.data;
  }

  const asWrapper = assertionsObjectSchema.safeParse(value);
  if (asWrapper.success) {
    return asWrapper.data.assertions;
  }

  const fromText = typeof value === "string" ? parseAssertionsFromText(value) : [];
  if (fromText.length > 0) {
    return fromText;
  }

  throw new Error(
    `Invalid eval assertions source at ${sourceLabel}. Expected JSON string[], { assertions: string[] }, or newline-delimited text.`
  );
}
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1645
|
+
/**
 * Associates an execution context with a command object in the module-level
 * `executionContextByCommand` WeakMap.
 *
 * @param command - Command object used as the key.
 * @param context - Context to store for that command.
 */
function setCommandExecutionContext(command, context) {
  const registry = executionContextByCommand;
  registry.set(command, context);
}
|
|
1648
|
+
/**
 * Looks up the execution context previously stored for a command.
 *
 * @param command - Command object used as the key; `name()` is called for the
 *   error message.
 * @returns The stored context.
 * @throws Error when nothing (or a falsy value) is stored for the command.
 */
function getCommandExecutionContext(command) {
  const stored = executionContextByCommand.get(command);
  if (stored) {
    return stored;
  }
  throw new Error(`Missing resolved config for command '${command.name()}'.`);
}
|
|
1655
|
+
/**
 * Returns the `config` property of the command's stored execution context.
 *
 * @param command - Command whose context is looked up.
 * @returns The resolved config.
 * @throws Error (from `getCommandExecutionContext`) when no context exists.
 */
function getResolvedConfig(command) {
  const { config } = getCommandExecutionContext(command);
  return config;
}
|
|
1658
|
+
/**
 * Derives the global output options for a command.
 *
 * `json` comes from the stored context's `config.json` when one is set,
 * otherwise from the truthiness of the CLI `json` option. `color` is true
 * unless the CLI `color` option is exactly `false`.
 *
 * @param command - Command exposing `optsWithGlobals()`.
 * @returns `{ json, color }`.
 */
function getGlobalCliOptions(command) {
  const cliOptions = command.optsWithGlobals();
  const stored = executionContextByCommand.get(command);
  const configuredJson = stored?.config.json;
  const json = configuredJson ?? Boolean(cliOptions.json);
  const color = cliOptions.color !== false;
  return { json, color };
}
|
|
1666
|
+
/**
 * Reads a JSON file and validates it against `triggerQueryArraySchema`.
 *
 * @param filePath - Path passed to `readJsonFile`.
 * @returns The validated trigger queries.
 * @throws Error carrying the first validation issue when the content is invalid.
 */
async function loadTriggerQueriesFile(filePath) {
  const contents = await readJsonFile(filePath);
  const result = triggerQueryArraySchema.safeParse(contents);
  if (result.success) {
    return result.data;
  }
  const reason = result.error.issues[0]?.message ?? "unknown format issue";
  throw new Error(`Invalid --queries JSON: ${reason}`);
}
|
|
1674
|
+
/**
 * Reads a JSON file and validates it against `evalPromptArraySchema`.
 *
 * @param filePath - Path passed to `readJsonFile`.
 * @returns The validated eval prompts.
 * @throws Error carrying the first validation issue when the content is invalid.
 */
async function loadEvalPromptsJson(filePath) {
  const contents = await readJsonFile(filePath);
  const result = evalPromptArraySchema.safeParse(contents);
  if (result.success) {
    return result.data;
  }
  const reason = result.error.issues[0]?.message ?? "unknown format issue";
  throw new Error(`Invalid --prompts JSON: ${reason}`);
}
|
|
1682
|
+
/**
 * Loads eval prompts from the files named in the command's resolved config.
 *
 * - Returns `undefined` when neither `eval.promptFile` nor
 *   `eval.assertionsFile` is configured.
 * - When an assertions file is configured, its assertions replace the
 *   assertions of every loaded prompt (each prompt gets its own copy).
 * - When `eval.numRuns` was set in the config file, exactly one prompt was
 *   loaded and `numRuns > 1`, that prompt is repeated `numRuns` times.
 *
 * @param command - Command whose execution context is read.
 * @returns Array of prompts, or `undefined` when nothing is configured.
 * @throws Error when `eval.assertionsFile` is set without `eval.promptFile`.
 */
async function loadConfiguredEvalPrompts(command) {
  const { config, configFile } = getCommandExecutionContext(command);
  const { promptFile, assertionsFile } = config.eval;

  if (!promptFile) {
    if (assertionsFile) {
      throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
    }
    return void 0;
  }

  const promptText = await fs5.readFile(promptFile, "utf8");
  let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptText), promptFile);

  if (assertionsFile) {
    const assertionsText = await fs5.readFile(assertionsFile, "utf8");
    const sharedAssertions = normalizeAssertions(parseJsonIfPossible(assertionsText), assertionsFile);
    prompts = prompts.map(({ prompt }) => ({
      prompt,
      assertions: [...sharedAssertions]
    }));
  }

  const requestedRuns = config.eval.numRuns;
  const runsSetInConfigFile = configFile?.eval?.numRuns !== void 0;
  if (runsSetInConfigFile && prompts.length === 1 && requestedRuns > 1) {
    const [template] = prompts;
    prompts = Array.from({ length: requestedRuns }, () => ({
      prompt: template.prompt,
      assertions: template.assertions ? [...template.assertions] : void 0
    }));
  }
  return prompts;
}
|
|
1712
|
+
/**
 * Writes a result to stdout followed by a newline.
 *
 * @param value - Value to print.
 * @param asJson - When truthy the value is rendered with `renderJson`;
 *   otherwise it is converted with `String(value)`.
 */
function writeResult(value, asJson) {
  const text = asJson ? renderJson(value) : String(value);
  process.stdout.write(`${text}\n`);
}
|
|
1721
|
+
/**
 * Reports an error to the user.
 *
 * In JSON mode an `{ error: message }` object is rendered to stdout; otherwise
 * `Error: <message>` is written to stderr. Both outputs end with a newline.
 *
 * @param error - Error instance (its `message` is used) or any other value
 *   (converted with `String`).
 * @param asJson - Selects JSON output when truthy.
 */
function writeError(error, asJson) {
  const message = error instanceof Error ? error.message : String(error);
  if (!asJson) {
    process.stderr.write(`Error: ${message}\n`);
    return;
  }
  process.stdout.write(`${renderJson({ error: message })}\n`);
}
|
|
1731
|
+
|
|
1732
|
+
// src/commands/lint.ts
|
|
1733
|
+
/**
 * Runs the linter for a target and prints the report.
 *
 * Sets `process.exitCode` to 1 when `lintFails(report, options.failOn)` is
 * truthy, and to 2 when anything throws (the error is printed via
 * `writeError`).
 *
 * @param targetPath - Path handed to `runLinter`.
 * @param options - Reads `json`, `color`, `failOn` and `suppress`.
 */
async function handleLintCommand(targetPath, options) {
  try {
    const report = await runLinter(targetPath, { suppress: options.suppress });
    const wantsJson = Boolean(options.json);
    const output = wantsJson ? report : renderLintReport(report, options.color);
    writeResult(output, wantsJson);

    const failed = lintFails(report, options.failOn);
    if (failed) {
      process.exitCode = 1;
    }
  } catch (error) {
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
|
|
1749
|
+
/**
 * Registers the `lint` subcommand on the given program.
 *
 * The action combines the global CLI options with `lint.failOn` and
 * `lint.suppress` from the resolved config and delegates to
 * `handleLintCommand`.
 *
 * @param program - Program object on which `command("lint")` is called.
 */
function registerLintCommand(program) {
  const runLint = async (targetPath, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const { lint } = getResolvedConfig(command);
    await handleLintCommand(targetPath, {
      ...globalOptions,
      failOn: lint.failOn,
      suppress: lint.suppress
    });
  };

  program
    .command("lint")
    .description("Run static lint checks against a SKILL.md file or skill directory.")
    .argument("<path-to-skill>", "Path to SKILL.md or skill directory")
    .action(runLint);
}
|
|
1760
|
+
|
|
1761
|
+
// src/commands/trigger.ts
|
|
1762
|
+
import ora from "ora";
|
|
1763
|
+
import { z as z7 } from "zod";
|
|
1764
|
+
|
|
1765
|
+
// src/utils/config.ts
|
|
1766
|
+
import fs6 from "node:fs/promises";
|
|
1767
|
+
import path5 from "node:path";
|
|
1768
|
+
import { z as z6 } from "zod";
|
|
1769
|
+
var providerNameSchema = z6.enum(["anthropic", "openai"]);
|
|
1770
|
+
var lintFailOnSchema = z6.enum(["error", "warn"]);
|
|
1771
|
+
var lintConfigSchema = z6.object({
|
|
1772
|
+
failOn: lintFailOnSchema.optional(),
|
|
1773
|
+
suppress: z6.array(z6.string().min(1)).optional()
|
|
1774
|
+
}).strict();
|
|
1775
|
+
var triggerConfigSchema = z6.object({
|
|
1776
|
+
numQueries: z6.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
1777
|
+
threshold: z6.number().min(0).max(1).optional(),
|
|
1778
|
+
seed: z6.number().int().optional()
|
|
1779
|
+
}).strict().partial();
|
|
1780
|
+
var evalConfigSchema = z6.object({
|
|
1781
|
+
numRuns: z6.number().int().min(1).optional(),
|
|
1782
|
+
threshold: z6.number().min(0).max(1).optional(),
|
|
1783
|
+
promptFile: z6.string().min(1).optional(),
|
|
1784
|
+
assertionsFile: z6.string().min(1).optional()
|
|
1785
|
+
}).strict().partial();
|
|
1786
|
+
var skilltestConfigSchema = z6.object({
|
|
1787
|
+
provider: providerNameSchema.optional(),
|
|
1788
|
+
model: z6.string().min(1).optional(),
|
|
1789
|
+
json: z6.boolean().optional(),
|
|
1790
|
+
lint: lintConfigSchema.optional(),
|
|
1791
|
+
trigger: triggerConfigSchema.optional(),
|
|
1792
|
+
eval: evalConfigSchema.optional()
|
|
1793
|
+
}).strict();
|
|
1794
|
+
var resolvedSkilltestConfigSchema = z6.object({
|
|
1795
|
+
provider: providerNameSchema,
|
|
1796
|
+
model: z6.string().min(1),
|
|
1797
|
+
json: z6.boolean(),
|
|
1798
|
+
lint: z6.object({
|
|
1799
|
+
failOn: lintFailOnSchema,
|
|
1800
|
+
suppress: z6.array(z6.string().min(1))
|
|
1801
|
+
}),
|
|
1802
|
+
trigger: z6.object({
|
|
1803
|
+
numQueries: z6.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
1804
|
+
threshold: z6.number().min(0).max(1),
|
|
1805
|
+
seed: z6.number().int().optional()
|
|
1806
|
+
}),
|
|
1807
|
+
eval: z6.object({
|
|
1808
|
+
numRuns: z6.number().int().min(1),
|
|
1809
|
+
threshold: z6.number().min(0).max(1),
|
|
1810
|
+
promptFile: z6.string().min(1).optional(),
|
|
1811
|
+
assertionsFile: z6.string().min(1).optional()
|
|
1812
|
+
})
|
|
1813
|
+
});
|
|
1814
|
+
var DEFAULT_SKILLTEST_CONFIG = {
|
|
1815
|
+
provider: "anthropic",
|
|
1816
|
+
model: "claude-sonnet-4-5-20250929",
|
|
1817
|
+
json: false,
|
|
1818
|
+
lint: {
|
|
1819
|
+
failOn: "error",
|
|
1820
|
+
suppress: []
|
|
1821
|
+
},
|
|
1822
|
+
trigger: {
|
|
1823
|
+
numQueries: 20,
|
|
1824
|
+
threshold: 0.8
|
|
1825
|
+
},
|
|
1826
|
+
eval: {
|
|
1827
|
+
numRuns: 5,
|
|
1828
|
+
threshold: 0.9
|
|
1829
|
+
}
|
|
1830
|
+
};
|
|
1831
|
+
/**
 * Formats a validation issue path for display.
 *
 * @param issuePath - Array of path segments.
 * @returns `"root"` for an empty path, otherwise the segments converted to
 *   strings and joined with dots.
 */
function formatIssuePath(issuePath) {
  return issuePath.length === 0
    ? "root"
    : issuePath.map((segment) => String(segment)).join(".");
}
|
|
1837
|
+
/**
 * Builds an `Error` describing the first issue of a config validation failure.
 *
 * @param error - Validation error exposing an `issues` array.
 * @param sourceLabel - Where the config came from (used in the message).
 * @returns A new `Error`; the caller is responsible for throwing it.
 */
function buildConfigValidationError(error, sourceLabel) {
  const firstIssue = error.issues[0];
  const location = formatIssuePath(firstIssue?.path ?? []);
  const detail = firstIssue?.message ?? "Invalid config value.";
  return new Error(`Invalid skilltest config in ${sourceLabel} at ${location}: ${detail}`);
}
|
|
1843
|
+
/**
 * Reads a UTF-8 file and parses it as JSON.
 *
 * Failures are rethrown with a message that names `label`, and the original
 * error is attached as `cause` so callers can still inspect details such as
 * `cause.code` (e.g. `ENOENT`). A leading UTF-8 byte-order mark is removed
 * before parsing, because `JSON.parse` rejects it.
 *
 * @param filePath - File to read.
 * @param label - Human-readable name of the file for error messages.
 * @returns The parsed JSON value.
 * @throws Error `Failed to read <label>: …` when the file cannot be read.
 * @throws Error `Invalid JSON in <label>: …` when the content is not valid JSON.
 */
async function readJsonObject(filePath, label) {
  const describe = (error) => (error instanceof Error ? error.message : String(error));

  let raw;
  try {
    raw = await fs6.readFile(filePath, "utf8");
  } catch (error) {
    throw new Error(`Failed to read ${label}: ${describe(error)}`, { cause: error });
  }

  try {
    // Editors on some platforms save JSON with a BOM; strip it before parsing.
    return JSON.parse(raw.replace(/^\uFEFF/, ""));
  } catch (error) {
    throw new Error(`Invalid JSON in ${label}: ${describe(error)}`, { cause: error });
  }
}
|
|
1858
|
+
/**
 * Loads and validates a skilltest config from a JSON file.
 *
 * @param filePath - Path of the config file.
 * @returns `null` when `pathExists(filePath)` is falsy; otherwise an object
 *   with the validated `configFile`, its `sourcePath` and the containing
 *   `sourceDirectory`.
 * @throws Error built by `buildConfigValidationError` when validation fails.
 */
async function loadConfigFromJsonFile(filePath) {
  const exists = await pathExists(filePath);
  if (!exists) {
    return null;
  }

  const contents = await readJsonObject(filePath, filePath);
  const result = skilltestConfigSchema.safeParse(contents);
  if (result.success) {
    return {
      configFile: result.data,
      sourcePath: filePath,
      sourceDirectory: path5.dirname(filePath)
    };
  }
  throw buildConfigValidationError(result.error, filePath);
}
|
|
1873
|
+
/**
 * Walks upward from `startDirectory` to the first directory containing a
 * `package.json` and reads its `skilltestrc` field.
 *
 * The search stops at the first `package.json` found, even when it has no
 * `skilltestrc` field (the result is then `null`).
 *
 * @param startDirectory - Directory where the search begins.
 * @returns `{ configFile, sourcePath, sourceDirectory }`, or `null` when no
 *   `package.json` exists up to the filesystem root or the nearest one has no
 *   `skilltestrc`.
 * @throws Error built by `buildConfigValidationError` when the field is invalid.
 */
async function loadConfigFromNearestPackageJson(startDirectory) {
  let directory = path5.resolve(startDirectory);
  let previous = null;

  // `path.dirname` returns its input at the root, which ends the walk.
  while (directory !== previous) {
    const manifestPath = path5.join(directory, "package.json");
    if (await pathExists(manifestPath)) {
      const manifest = await readJsonObject(manifestPath, manifestPath);
      const manifestSchema = z6.object({
        skilltestrc: skilltestConfigSchema.optional()
      }).passthrough();
      const result = manifestSchema.safeParse(manifest);
      if (!result.success) {
        throw buildConfigValidationError(result.error, `${manifestPath}#skilltestrc`);
      }
      const embedded = result.data.skilltestrc;
      if (!embedded) {
        return null;
      }
      return {
        configFile: embedded,
        sourcePath: manifestPath,
        sourceDirectory: directory
      };
    }
    previous = directory;
    directory = path5.dirname(directory);
  }
  return null;
}
|
|
1902
|
+
/**
 * Loads the `.skilltestrc` that sits in the root of the targeted skill.
 *
 * Only failures while resolving the skill path are swallowed (yielding
 * `null`). Errors raised while loading the config file itself are not caught
 * here and reach the caller.
 *
 * @param targetPath - Skill path given by the user; falsy values yield `null`.
 * @returns The result of `loadConfigFromJsonFile`, or `null`.
 */
async function resolveSkillDirectoryConfig(targetPath) {
  if (!targetPath) {
    return null;
  }

  let configPath;
  try {
    const { skillRoot } = await resolveSkillPath(targetPath);
    configPath = path5.join(skillRoot, ".skilltestrc");
  } catch {
    // The target could not be resolved to a skill: no skill-level config.
    return null;
  }
  return loadConfigFromJsonFile(configPath);
}
|
|
1913
|
+
/**
 * Resolves a configured path against a base directory.
 *
 * @param baseDirectory - Directory that relative values are resolved against.
 * @param value - Configured path; falsy values mean "not configured".
 * @returns The resolved absolute path, or `undefined` for a falsy `value`.
 */
function resolveConfigRelativePath(baseDirectory, value) {
  return value ? path5.resolve(baseDirectory, value) : void 0;
}
|
|
1919
|
+
/**
 * Merges CLI flags, config-file values and built-in defaults into one
 * resolved config. For every setting the CLI flag wins over the config file,
 * which wins over `DEFAULT_SKILLTEST_CONFIG`.
 *
 * `trigger.seed` is taken from the CLI flags or the config file only.
 * `eval.promptFile` and `eval.assertionsFile` are resolved against
 * `baseDirectory`.
 *
 * @param configFile - Values read from a config file (default `{}`).
 * @param cliFlags - Overrides from the command line (default `{}`).
 * @param baseDirectory - Base for relative file paths (default `process.cwd()`).
 * @returns The merged config as parsed by `resolvedSkilltestConfigSchema`.
 */
function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
  const defaults = DEFAULT_SKILLTEST_CONFIG;
  // Applies the same accessor to each layer in priority order.
  const layered = (read) => read(cliFlags) ?? read(configFile) ?? read(defaults);
  const fromBase = (value) => resolveConfigRelativePath(baseDirectory, value);

  const candidate = {
    provider: layered((layer) => layer.provider),
    model: layered((layer) => layer.model),
    json: layered((layer) => layer.json),
    lint: {
      failOn: layered((layer) => layer.lint?.failOn),
      suppress: layered((layer) => layer.lint?.suppress)
    },
    trigger: {
      numQueries: layered((layer) => layer.trigger?.numQueries),
      threshold: layered((layer) => layer.trigger?.threshold),
      seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
    },
    eval: {
      numRuns: layered((layer) => layer.eval?.numRuns),
      threshold: layered((layer) => layer.eval?.threshold),
      promptFile: fromBase(layered((layer) => layer.eval?.promptFile)),
      assertionsFile: fromBase(layered((layer) => layer.eval?.assertionsFile))
    }
  };
  return resolvedSkilltestConfigSchema.parse(candidate);
}
|
|
1948
|
+
/**
 * Reads one option value from a command, including inherited global options.
 *
 * @param command - Command exposing `optsWithGlobals()`.
 * @param key - Option name.
 * @returns The option's value, or `undefined` when it is not set.
 */
function getTypedOptionValue(command, key) {
  const allOptions = command.optsWithGlobals();
  return allOptions[key];
}
|
|
1956
|
+
/**
 * Collects config overrides that were given explicitly on the command line.
 *
 * An option counts only when its value source is `"cli"`. `numQueries`
 * applies to the `trigger` and `check` commands. `minF1` (stored as
 * `trigger.threshold`) and `minAssertPassRate` (stored as `eval.threshold`)
 * apply to `check` only.
 *
 * @param command - Command whose options are inspected.
 * @returns The overrides as validated by `skilltestConfigSchema`.
 * @throws Error built by `buildConfigValidationError` when a value is invalid.
 */
function extractCliConfigOverrides(command) {
  const commandName = command.name();
  const isCheck = commandName === "check";
  const acceptsNumQueries = commandName === "trigger" || isCheck;
  const fromCli = (option) => command.getOptionValueSource(option) === "cli";
  const valueOf = (option) => getTypedOptionValue(command, option);

  const overrides = {};
  if (command.getOptionValueSourceWithGlobals("json") === "cli") {
    overrides.json = Boolean(valueOf("json"));
  }
  if (fromCli("provider")) {
    overrides.provider = valueOf("provider");
  }
  if (fromCli("model")) {
    overrides.model = valueOf("model");
  }
  if (acceptsNumQueries && fromCli("numQueries")) {
    overrides.trigger = { ...overrides.trigger, numQueries: valueOf("numQueries") };
  }
  if (isCheck && fromCli("minF1")) {
    overrides.trigger = { ...overrides.trigger, threshold: valueOf("minF1") };
  }
  if (isCheck && fromCli("minAssertPassRate")) {
    overrides.eval = { ...overrides.eval, threshold: valueOf("minAssertPassRate") };
  }

  const result = skilltestConfigSchema.safeParse(overrides);
  if (result.success) {
    return result.data;
  }
  throw buildConfigValidationError(result.error, "CLI flags");
}
|
|
1991
|
+
/**
 * Finds the config source to use and merges it with the CLI flags.
 *
 * Sources are tried in order and the first one that yields a config wins:
 * 1. `.skilltestrc` in the targeted skill's root,
 * 2. `.skilltestrc` in the current working directory,
 * 3. the `skilltestrc` field of the nearest `package.json`.
 * When none is found, only defaults and CLI flags are used, with the current
 * working directory as the base directory.
 *
 * @param targetPath - Skill path given by the user (may be empty).
 * @param cliFlags - Overrides extracted from the command line.
 * @returns `{ configFile, sourcePath, sourceDirectory, config }`.
 */
async function resolveConfigContext(targetPath, cliFlags) {
  const cwd = process.cwd();
  // Each loader is only invoked if the earlier ones found nothing.
  const loaders = [
    () => resolveSkillDirectoryConfig(targetPath),
    () => loadConfigFromJsonFile(path5.join(cwd, ".skilltestrc")),
    () => loadConfigFromNearestPackageJson(cwd)
  ];

  for (const load of loaders) {
    const found = await load();
    if (found) {
      return {
        ...found,
        config: mergeConfigLayers(found.configFile, cliFlags, found.sourceDirectory)
      };
    }
  }

  return {
    configFile: null,
    config: mergeConfigLayers({}, cliFlags, cwd),
    sourcePath: null,
    sourceDirectory: cwd
  };
}
|
|
2025
|
+
/**
 * Determines the API key to use for a provider.
 *
 * A non-blank `override` wins (trimmed). Otherwise the key is read from
 * `ANTHROPIC_API_KEY` when `provider` is `"anthropic"`, and from
 * `OPENAI_API_KEY` for any other provider value.
 *
 * @param provider - Provider name.
 * @param override - Optional explicit key.
 * @returns The trimmed API key.
 * @throws Error when no key is available for the provider.
 */
function resolveApiKey(provider, override) {
  if (override && override.trim() !== "") {
    return override.trim();
  }

  const isAnthropic = provider === "anthropic";
  const variableName = isAnthropic ? "ANTHROPIC_API_KEY" : "OPENAI_API_KEY";
  const fromEnvironment = process.env[variableName]?.trim();
  if (fromEnvironment) {
    return fromEnvironment;
  }

  throw new Error(
    isAnthropic
      ? "No Anthropic API key found. Set ANTHROPIC_API_KEY environment variable or pass --api-key flag."
      : "No OpenAI API key found. Set OPENAI_API_KEY environment variable or pass --api-key flag."
  );
}
|
|
2044
|
+
|
|
2045
|
+
// src/providers/anthropic.ts
|
|
2046
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
2047
|
+
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param ms - Delay passed to `setTimeout`.
 */
function wait(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
2052
|
+
/**
 * Tells whether an error looks like a rate-limit response.
 *
 * @param error - Any thrown value.
 * @returns `true` when `error` is an object whose `status` is 429 or whose
 *   string `message` contains "rate limit" (case-insensitive); `false`
 *   otherwise, including for non-object values.
 */
function isRateLimitError(error) {
  const isObject = Boolean(error) && typeof error === "object";
  if (!isObject) {
    return false;
  }
  if (error.status === 429) {
    return true;
  }
  const text = error.message;
  return typeof text === "string" && /rate limit/i.test(text);
}
|
|
2066
|
+
var AnthropicProvider = class {
|
|
2067
|
+
name = "anthropic";
|
|
2068
|
+
client;
|
|
2069
|
+
constructor(apiKey) {
|
|
2070
|
+
this.client = new Anthropic({ apiKey });
|
|
2071
|
+
}
|
|
2072
|
+
async sendMessage(systemPrompt, userMessage, options) {
|
|
2073
|
+
let lastError;
|
|
2074
|
+
for (let attempt = 0; attempt < 3; attempt += 1) {
|
|
2075
|
+
try {
|
|
2076
|
+
const response = await this.client.messages.create({
|
|
2077
|
+
model: options.model,
|
|
2078
|
+
max_tokens: 2048,
|
|
2079
|
+
system: systemPrompt,
|
|
2080
|
+
messages: [
|
|
2081
|
+
{
|
|
2082
|
+
role: "user",
|
|
2083
|
+
content: userMessage
|
|
2084
|
+
}
|
|
2085
|
+
]
|
|
2086
|
+
});
|
|
2087
|
+
const textBlocks = response.content.filter((block) => block.type === "text");
|
|
2088
|
+
const text = textBlocks.map((block) => block.text).join("\n").trim();
|
|
2089
|
+
if (text.length === 0) {
|
|
2090
|
+
throw new Error("Model returned an empty response.");
|
|
2091
|
+
}
|
|
1481
2092
|
return text;
|
|
1482
2093
|
} catch (error) {
|
|
1483
2094
|
lastError = error;
|
|
@@ -1607,14 +2218,11 @@ function createProvider(providerName, apiKeyOverride) {
|
|
|
1607
2218
|
}
|
|
1608
2219
|
|
|
1609
2220
|
// src/commands/trigger.ts
|
|
1610
|
-
var
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
saveQueries: z3.string().optional(),
|
|
1616
|
-
verbose: z3.boolean().optional(),
|
|
1617
|
-
apiKey: z3.string().optional()
|
|
2221
|
+
var triggerCliSchema = z7.object({
|
|
2222
|
+
queries: z7.string().optional(),
|
|
2223
|
+
saveQueries: z7.string().optional(),
|
|
2224
|
+
verbose: z7.boolean().optional(),
|
|
2225
|
+
apiKey: z7.string().optional()
|
|
1618
2226
|
});
|
|
1619
2227
|
var DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929";
|
|
1620
2228
|
var DEFAULT_OPENAI_MODEL = "gpt-4.1-mini";
|
|
@@ -1624,230 +2232,84 @@ function resolveModel(provider, model) {
|
|
|
1624
2232
|
}
|
|
1625
2233
|
return model;
|
|
1626
2234
|
}
|
|
1627
|
-
function
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid trigger options."), globalOptions.json);
|
|
1633
|
-
process.exitCode = 2;
|
|
1634
|
-
return;
|
|
2235
|
+
async function handleTriggerCommand(targetPath, options) {
|
|
2236
|
+
const spinner = options.json || !process.stdout.isTTY ? null : ora("Preparing trigger evaluation...").start();
|
|
2237
|
+
try {
|
|
2238
|
+
if (spinner) {
|
|
2239
|
+
spinner.text = "Parsing skill...";
|
|
1635
2240
|
}
|
|
1636
|
-
const
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
spinner.text = "Parsing skill...";
|
|
1644
|
-
}
|
|
1645
|
-
const skill = await parseSkillStrict(targetPath);
|
|
1646
|
-
if (spinner) {
|
|
1647
|
-
spinner.text = "Initializing model provider...";
|
|
1648
|
-
}
|
|
1649
|
-
const provider = createProvider(options.provider, options.apiKey);
|
|
1650
|
-
let queries = void 0;
|
|
1651
|
-
if (options.queries) {
|
|
1652
|
-
if (spinner) {
|
|
1653
|
-
spinner.text = "Loading custom trigger queries...";
|
|
1654
|
-
}
|
|
1655
|
-
const loaded = await readJsonFile(options.queries);
|
|
1656
|
-
const parsedQueries = triggerQueryArraySchema.safeParse(loaded);
|
|
1657
|
-
if (!parsedQueries.success) {
|
|
1658
|
-
throw new Error(`Invalid --queries JSON: ${parsedQueries.error.issues[0]?.message ?? "unknown format issue"}`);
|
|
1659
|
-
}
|
|
1660
|
-
queries = parsedQueries.data;
|
|
1661
|
-
}
|
|
2241
|
+
const skill = await parseSkillStrict(targetPath);
|
|
2242
|
+
if (spinner) {
|
|
2243
|
+
spinner.text = "Initializing model provider...";
|
|
2244
|
+
}
|
|
2245
|
+
const provider = createProvider(options.provider, options.apiKey);
|
|
2246
|
+
let queries = void 0;
|
|
2247
|
+
if (options.queries) {
|
|
1662
2248
|
if (spinner) {
|
|
1663
|
-
spinner.text = "
|
|
1664
|
-
}
|
|
1665
|
-
const model = resolveModel(options.provider, options.model);
|
|
1666
|
-
const result = await runTriggerTest(skill, {
|
|
1667
|
-
model,
|
|
1668
|
-
provider,
|
|
1669
|
-
queries,
|
|
1670
|
-
numQueries: options.numQueries,
|
|
1671
|
-
verbose: Boolean(options.verbose)
|
|
1672
|
-
});
|
|
1673
|
-
if (options.saveQueries) {
|
|
1674
|
-
await writeJsonFile(options.saveQueries, result.queries);
|
|
1675
|
-
}
|
|
1676
|
-
spinner?.stop();
|
|
1677
|
-
if (globalOptions.json) {
|
|
1678
|
-
writeResult(result, true);
|
|
1679
|
-
} else {
|
|
1680
|
-
writeResult(renderTriggerReport(result, globalOptions.color, Boolean(options.verbose)), false);
|
|
2249
|
+
spinner.text = "Loading custom trigger queries...";
|
|
1681
2250
|
}
|
|
1682
|
-
|
|
1683
|
-
spinner?.stop();
|
|
1684
|
-
writeError(error, globalOptions.json);
|
|
1685
|
-
process.exitCode = 2;
|
|
2251
|
+
queries = await loadTriggerQueriesFile(options.queries);
|
|
1686
2252
|
}
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
import { z as z4 } from "zod";
|
|
1699
|
-
var gradedAssertionSchema = z4.object({
|
|
1700
|
-
assertion: z4.string(),
|
|
1701
|
-
passed: z4.boolean(),
|
|
1702
|
-
evidence: z4.string()
|
|
1703
|
-
});
|
|
1704
|
-
var graderOutputSchema = z4.object({
|
|
1705
|
-
assertions: z4.array(gradedAssertionSchema)
|
|
1706
|
-
});
|
|
1707
|
-
function extractJsonObject(raw) {
|
|
1708
|
-
const trimmed = raw.trim();
|
|
1709
|
-
if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
|
|
1710
|
-
return JSON.parse(trimmed);
|
|
1711
|
-
}
|
|
1712
|
-
const start = trimmed.indexOf("{");
|
|
1713
|
-
const end = trimmed.lastIndexOf("}");
|
|
1714
|
-
if (start >= 0 && end > start) {
|
|
1715
|
-
return JSON.parse(trimmed.slice(start, end + 1));
|
|
1716
|
-
}
|
|
1717
|
-
throw new Error("Grader did not return a JSON object.");
|
|
1718
|
-
}
|
|
1719
|
-
async function gradeResponse(options) {
|
|
1720
|
-
const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
|
|
1721
|
-
"The response follows the skill instructions faithfully.",
|
|
1722
|
-
"The response is well-structured and actionable.",
|
|
1723
|
-
"The response addresses the user prompt directly."
|
|
1724
|
-
];
|
|
1725
|
-
const systemPrompt = [
|
|
1726
|
-
"You are a strict evaluator for agent skill outputs.",
|
|
1727
|
-
"Assess each assertion and return JSON only.",
|
|
1728
|
-
'Required output format: {"assertions":[{"assertion":"...","passed":true|false,"evidence":"..."}]}'
|
|
1729
|
-
].join(" ");
|
|
1730
|
-
const userPrompt = [
|
|
1731
|
-
`Skill: ${options.skillName}`,
|
|
1732
|
-
"Skill instructions:",
|
|
1733
|
-
options.skillBody,
|
|
1734
|
-
"",
|
|
1735
|
-
`User prompt: ${options.userPrompt}`,
|
|
1736
|
-
"",
|
|
1737
|
-
"Model response:",
|
|
1738
|
-
options.modelResponse,
|
|
1739
|
-
"",
|
|
1740
|
-
"Assertions to evaluate:",
|
|
1741
|
-
assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
|
|
1742
|
-
].join("\n");
|
|
1743
|
-
const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
1744
|
-
const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
|
|
1745
|
-
if (!parsed.success) {
|
|
1746
|
-
throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
|
|
1747
|
-
}
|
|
1748
|
-
return parsed.data.assertions;
|
|
1749
|
-
}
|
|
1750
|
-
|
|
1751
|
-
// src/core/eval-runner.ts
|
|
1752
|
-
var evalPromptSchema = z5.object({
|
|
1753
|
-
prompt: z5.string().min(1),
|
|
1754
|
-
assertions: z5.array(z5.string().min(1)).optional()
|
|
1755
|
-
});
|
|
1756
|
-
var evalPromptArraySchema = z5.array(evalPromptSchema);
|
|
1757
|
-
function extractJsonArray(raw) {
|
|
1758
|
-
const trimmed = raw.trim();
|
|
1759
|
-
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
1760
|
-
return JSON.parse(trimmed);
|
|
1761
|
-
}
|
|
1762
|
-
const start = trimmed.indexOf("[");
|
|
1763
|
-
const end = trimmed.lastIndexOf("]");
|
|
1764
|
-
if (start >= 0 && end > start) {
|
|
1765
|
-
return JSON.parse(trimmed.slice(start, end + 1));
|
|
1766
|
-
}
|
|
1767
|
-
throw new Error("Model did not return a JSON array.");
|
|
1768
|
-
}
|
|
1769
|
-
async function generatePrompts(skill, provider, model, count) {
|
|
1770
|
-
const systemPrompt = [
|
|
1771
|
-
"You generate realistic evaluation prompts for an agent skill.",
|
|
1772
|
-
"Return JSON only.",
|
|
1773
|
-
'Format: [{"prompt":"...","assertions":["...", "..."]}]',
|
|
1774
|
-
"Assertions should be concrete and checkable."
|
|
1775
|
-
].join(" ");
|
|
1776
|
-
const userPrompt = [
|
|
1777
|
-
`Skill name: ${skill.frontmatter.name}`,
|
|
1778
|
-
`Skill description: ${skill.frontmatter.description}`,
|
|
1779
|
-
"Skill instructions:",
|
|
1780
|
-
skill.content,
|
|
1781
|
-
"",
|
|
1782
|
-
`Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
|
|
1783
|
-
"Each prompt should include 2-4 assertions."
|
|
1784
|
-
].join("\n");
|
|
1785
|
-
const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
|
|
1786
|
-
const parsed = evalPromptArraySchema.safeParse(extractJsonArray(raw));
|
|
1787
|
-
if (!parsed.success) {
|
|
1788
|
-
throw new Error(`Failed to parse generated eval prompts: ${parsed.error.issues[0]?.message ?? "invalid prompt JSON"}`);
|
|
1789
|
-
}
|
|
1790
|
-
if (parsed.data.length !== count) {
|
|
1791
|
-
throw new Error(`Expected ${count} prompts, got ${parsed.data.length}.`);
|
|
1792
|
-
}
|
|
1793
|
-
return parsed.data;
|
|
1794
|
-
}
|
|
1795
|
-
async function runEval(skill, options) {
|
|
1796
|
-
const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, 5);
|
|
1797
|
-
const results = [];
|
|
1798
|
-
for (const evalPrompt of prompts) {
|
|
1799
|
-
const systemPrompt = [
|
|
1800
|
-
"You are an AI assistant with an activated skill.",
|
|
1801
|
-
"Follow this SKILL.md content exactly where applicable.",
|
|
1802
|
-
"",
|
|
1803
|
-
skill.raw
|
|
1804
|
-
].join("\n");
|
|
1805
|
-
const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
|
|
1806
|
-
const gradedAssertions = await gradeResponse({
|
|
1807
|
-
provider: options.provider,
|
|
1808
|
-
model: options.graderModel,
|
|
1809
|
-
skillName: skill.frontmatter.name,
|
|
1810
|
-
skillBody: skill.content,
|
|
1811
|
-
userPrompt: evalPrompt.prompt,
|
|
1812
|
-
modelResponse: response,
|
|
1813
|
-
assertions: evalPrompt.assertions
|
|
1814
|
-
});
|
|
1815
|
-
const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
|
|
1816
|
-
results.push({
|
|
1817
|
-
prompt: evalPrompt.prompt,
|
|
1818
|
-
assertions: gradedAssertions,
|
|
1819
|
-
responseSummary: response.slice(0, 200),
|
|
1820
|
-
response,
|
|
1821
|
-
passedAssertions: passedAssertions2,
|
|
1822
|
-
totalAssertions: gradedAssertions.length
|
|
2253
|
+
if (spinner) {
|
|
2254
|
+
spinner.text = "Running trigger simulations...";
|
|
2255
|
+
}
|
|
2256
|
+
const model = resolveModel(options.provider, options.model);
|
|
2257
|
+
const result = await runTriggerTest(skill, {
|
|
2258
|
+
model,
|
|
2259
|
+
provider,
|
|
2260
|
+
queries,
|
|
2261
|
+
numQueries: options.numQueries,
|
|
2262
|
+
seed: options.seed,
|
|
2263
|
+
verbose: options.verbose
|
|
1823
2264
|
});
|
|
2265
|
+
if (options.saveQueries) {
|
|
2266
|
+
await writeJsonFile(options.saveQueries, result.queries);
|
|
2267
|
+
}
|
|
2268
|
+
spinner?.stop();
|
|
2269
|
+
if (options.json) {
|
|
2270
|
+
writeResult(result, true);
|
|
2271
|
+
} else {
|
|
2272
|
+
writeResult(renderTriggerReport(result, options.color, options.verbose), false);
|
|
2273
|
+
}
|
|
2274
|
+
} catch (error) {
|
|
2275
|
+
spinner?.stop();
|
|
2276
|
+
writeError(error, options.json);
|
|
2277
|
+
process.exitCode = 2;
|
|
1824
2278
|
}
|
|
1825
|
-
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
totalPrompts: results.length,
|
|
1836
|
-
totalAssertions,
|
|
1837
|
-
passedAssertions
|
|
2279
|
+
}
|
|
2280
|
+
function registerTriggerCommand(program) {
|
|
2281
|
+
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
2282
|
+
const globalOptions = getGlobalCliOptions(command);
|
|
2283
|
+
const config = getResolvedConfig(command);
|
|
2284
|
+
const parsedCli = triggerCliSchema.safeParse(command.opts());
|
|
2285
|
+
if (!parsedCli.success) {
|
|
2286
|
+
writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid trigger options."), globalOptions.json);
|
|
2287
|
+
process.exitCode = 2;
|
|
2288
|
+
return;
|
|
1838
2289
|
}
|
|
1839
|
-
|
|
2290
|
+
await handleTriggerCommand(targetPath, {
|
|
2291
|
+
...globalOptions,
|
|
2292
|
+
model: config.model,
|
|
2293
|
+
provider: config.provider,
|
|
2294
|
+
queries: parsedCli.data.queries,
|
|
2295
|
+
numQueries: config.trigger.numQueries,
|
|
2296
|
+
saveQueries: parsedCli.data.saveQueries,
|
|
2297
|
+
seed: config.trigger.seed,
|
|
2298
|
+
verbose: Boolean(parsedCli.data.verbose),
|
|
2299
|
+
apiKey: parsedCli.data.apiKey
|
|
2300
|
+
});
|
|
2301
|
+
});
|
|
1840
2302
|
}
|
|
1841
2303
|
|
|
1842
2304
|
// src/commands/eval.ts
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
saveResults:
|
|
1849
|
-
verbose:
|
|
1850
|
-
apiKey:
|
|
2305
|
+
import ora2 from "ora";
|
|
2306
|
+
import { z as z8 } from "zod";
|
|
2307
|
+
var evalCliSchema = z8.object({
|
|
2308
|
+
prompts: z8.string().optional(),
|
|
2309
|
+
graderModel: z8.string().optional(),
|
|
2310
|
+
saveResults: z8.string().optional(),
|
|
2311
|
+
verbose: z8.boolean().optional(),
|
|
2312
|
+
apiKey: z8.string().optional()
|
|
1851
2313
|
});
|
|
1852
2314
|
var DEFAULT_ANTHROPIC_MODEL2 = "claude-sonnet-4-5-20250929";
|
|
1853
2315
|
var DEFAULT_OPENAI_MODEL2 = "gpt-4.1-mini";
|
|
@@ -1857,69 +2319,84 @@ function resolveModel2(provider, model) {
|
|
|
1857
2319
|
}
|
|
1858
2320
|
return model;
|
|
1859
2321
|
}
|
|
1860
|
-
function
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid eval options."), globalOptions.json);
|
|
1866
|
-
process.exitCode = 2;
|
|
1867
|
-
return;
|
|
2322
|
+
async function handleEvalCommand(targetPath, options, command) {
|
|
2323
|
+
const spinner = options.json || !process.stdout.isTTY ? null : ora2("Preparing evaluation...").start();
|
|
2324
|
+
try {
|
|
2325
|
+
if (spinner) {
|
|
2326
|
+
spinner.text = "Parsing skill...";
|
|
1868
2327
|
}
|
|
1869
|
-
const
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
if (spinner) {
|
|
1877
|
-
spinner.text = "Initializing model provider...";
|
|
1878
|
-
}
|
|
1879
|
-
const provider = createProvider(options.provider, options.apiKey);
|
|
1880
|
-
let prompts = void 0;
|
|
1881
|
-
if (options.prompts) {
|
|
1882
|
-
if (spinner) {
|
|
1883
|
-
spinner.text = "Loading test prompts...";
|
|
1884
|
-
}
|
|
1885
|
-
const loaded = await readJsonFile(options.prompts);
|
|
1886
|
-
const parsedPrompts = evalPromptArraySchema.safeParse(loaded);
|
|
1887
|
-
if (!parsedPrompts.success) {
|
|
1888
|
-
throw new Error(`Invalid --prompts JSON: ${parsedPrompts.error.issues[0]?.message ?? "unknown format issue"}`);
|
|
1889
|
-
}
|
|
1890
|
-
prompts = parsedPrompts.data;
|
|
1891
|
-
}
|
|
2328
|
+
const skill = await parseSkillStrict(targetPath);
|
|
2329
|
+
if (spinner) {
|
|
2330
|
+
spinner.text = "Initializing model provider...";
|
|
2331
|
+
}
|
|
2332
|
+
const provider = createProvider(options.provider, options.apiKey);
|
|
2333
|
+
let prompts = void 0;
|
|
2334
|
+
if (options.prompts) {
|
|
1892
2335
|
if (spinner) {
|
|
1893
|
-
spinner.text = "
|
|
2336
|
+
spinner.text = "Loading test prompts...";
|
|
1894
2337
|
}
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
2338
|
+
prompts = await loadEvalPromptsJson(options.prompts);
|
|
2339
|
+
} else {
|
|
2340
|
+
prompts = await loadConfiguredEvalPrompts(command);
|
|
2341
|
+
}
|
|
2342
|
+
if (spinner) {
|
|
2343
|
+
spinner.text = "Running eval prompts and grading responses...";
|
|
2344
|
+
}
|
|
2345
|
+
const model = resolveModel2(options.provider, options.model);
|
|
2346
|
+
const graderModel = options.graderModel ?? model;
|
|
2347
|
+
const result = await runEval(skill, {
|
|
2348
|
+
provider,
|
|
2349
|
+
model,
|
|
2350
|
+
graderModel,
|
|
2351
|
+
numRuns: options.numRuns,
|
|
2352
|
+
prompts
|
|
2353
|
+
});
|
|
2354
|
+
if (options.saveResults) {
|
|
2355
|
+
await writeJsonFile(options.saveResults, result);
|
|
2356
|
+
}
|
|
2357
|
+
spinner?.stop();
|
|
2358
|
+
if (options.json) {
|
|
2359
|
+
writeResult(result, true);
|
|
2360
|
+
} else {
|
|
2361
|
+
writeResult(renderEvalReport(result, options.color, options.verbose), false);
|
|
2362
|
+
}
|
|
2363
|
+
} catch (error) {
|
|
2364
|
+
spinner?.stop();
|
|
2365
|
+
writeError(error, options.json);
|
|
2366
|
+
process.exitCode = 2;
|
|
2367
|
+
}
|
|
2368
|
+
}
|
|
2369
|
+
function registerEvalCommand(program) {
|
|
2370
|
+
program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, _commandOptions, command) => {
|
|
2371
|
+
const globalOptions = getGlobalCliOptions(command);
|
|
2372
|
+
const config = getResolvedConfig(command);
|
|
2373
|
+
const parsedCli = evalCliSchema.safeParse(command.opts());
|
|
2374
|
+
if (!parsedCli.success) {
|
|
2375
|
+
writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid eval options."), globalOptions.json);
|
|
1915
2376
|
process.exitCode = 2;
|
|
2377
|
+
return;
|
|
1916
2378
|
}
|
|
2379
|
+
await handleEvalCommand(
|
|
2380
|
+
targetPath,
|
|
2381
|
+
{
|
|
2382
|
+
...globalOptions,
|
|
2383
|
+
prompts: parsedCli.data.prompts,
|
|
2384
|
+
model: config.model,
|
|
2385
|
+
graderModel: parsedCli.data.graderModel,
|
|
2386
|
+
provider: config.provider,
|
|
2387
|
+
saveResults: parsedCli.data.saveResults,
|
|
2388
|
+
verbose: Boolean(parsedCli.data.verbose),
|
|
2389
|
+
apiKey: parsedCli.data.apiKey,
|
|
2390
|
+
numRuns: config.eval.numRuns
|
|
2391
|
+
},
|
|
2392
|
+
command
|
|
2393
|
+
);
|
|
1917
2394
|
});
|
|
1918
2395
|
}
|
|
1919
2396
|
|
|
1920
2397
|
// src/commands/check.ts
|
|
1921
2398
|
import ora3 from "ora";
|
|
1922
|
-
import { z as
|
|
2399
|
+
import { z as z9 } from "zod";
|
|
1923
2400
|
|
|
1924
2401
|
// src/core/check-runner.ts
|
|
1925
2402
|
function calculateEvalAssertPassRate(result) {
|
|
@@ -1930,8 +2407,8 @@ function calculateEvalAssertPassRate(result) {
|
|
|
1930
2407
|
}
|
|
1931
2408
|
async function runCheck(inputPath, options) {
|
|
1932
2409
|
options.onStage?.("lint");
|
|
1933
|
-
const lint = await runLinter(inputPath);
|
|
1934
|
-
const lintPassed = lint.
|
|
2410
|
+
const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
|
|
2411
|
+
const lintPassed = !lintFails(lint, options.lintFailOn);
|
|
1935
2412
|
let trigger = null;
|
|
1936
2413
|
let evalResult = null;
|
|
1937
2414
|
let triggerSkippedReason;
|
|
@@ -1956,6 +2433,7 @@ async function runCheck(inputPath, options) {
|
|
|
1956
2433
|
model: options.model,
|
|
1957
2434
|
queries: options.queries,
|
|
1958
2435
|
numQueries: options.numQueries,
|
|
2436
|
+
seed: options.triggerSeed,
|
|
1959
2437
|
verbose: options.verbose
|
|
1960
2438
|
});
|
|
1961
2439
|
options.onStage?.("eval");
|
|
@@ -1963,6 +2441,7 @@ async function runCheck(inputPath, options) {
|
|
|
1963
2441
|
provider: options.provider,
|
|
1964
2442
|
model: options.model,
|
|
1965
2443
|
graderModel: options.graderModel,
|
|
2444
|
+
numRuns: options.evalNumRuns,
|
|
1966
2445
|
prompts: options.prompts
|
|
1967
2446
|
});
|
|
1968
2447
|
}
|
|
@@ -1999,19 +2478,14 @@ async function runCheck(inputPath, options) {
|
|
|
1999
2478
|
}
|
|
2000
2479
|
|
|
2001
2480
|
// src/commands/check.ts
|
|
2002
|
-
var
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
minF1: z7.number().min(0).max(1),
|
|
2011
|
-
minAssertPassRate: z7.number().min(0).max(1),
|
|
2012
|
-
saveResults: z7.string().optional(),
|
|
2013
|
-
continueOnLintFail: z7.boolean().optional(),
|
|
2014
|
-
verbose: z7.boolean().optional()
|
|
2481
|
+
var checkCliSchema = z9.object({
|
|
2482
|
+
graderModel: z9.string().optional(),
|
|
2483
|
+
apiKey: z9.string().optional(),
|
|
2484
|
+
queries: z9.string().optional(),
|
|
2485
|
+
prompts: z9.string().optional(),
|
|
2486
|
+
saveResults: z9.string().optional(),
|
|
2487
|
+
continueOnLintFail: z9.boolean().optional(),
|
|
2488
|
+
verbose: z9.boolean().optional()
|
|
2015
2489
|
});
|
|
2016
2490
|
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
|
|
2017
2491
|
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
|
|
@@ -2021,106 +2495,110 @@ function resolveModel3(provider, model) {
|
|
|
2021
2495
|
}
|
|
2022
2496
|
return model;
|
|
2023
2497
|
}
|
|
2024
|
-
function
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
0.9
|
|
2030
|
-
).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, commandOptions, command) => {
|
|
2031
|
-
const globalOptions = getGlobalCliOptions(command);
|
|
2032
|
-
const parsedOptions = checkOptionsSchema.safeParse(commandOptions);
|
|
2033
|
-
if (!parsedOptions.success) {
|
|
2034
|
-
writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid check options."), globalOptions.json);
|
|
2035
|
-
process.exitCode = 2;
|
|
2036
|
-
return;
|
|
2037
|
-
}
|
|
2038
|
-
const options = parsedOptions.data;
|
|
2039
|
-
if (options.numQueries % 2 !== 0) {
|
|
2040
|
-
writeError(
|
|
2041
|
-
new Error("--num-queries must be an even number so the suite can split should/should-not trigger cases."),
|
|
2042
|
-
globalOptions.json
|
|
2043
|
-
);
|
|
2044
|
-
process.exitCode = 2;
|
|
2045
|
-
return;
|
|
2498
|
+
async function handleCheckCommand(targetPath, options, command) {
|
|
2499
|
+
const spinner = options.json || !process.stdout.isTTY ? null : ora3("Preparing check run...").start();
|
|
2500
|
+
try {
|
|
2501
|
+
if (spinner) {
|
|
2502
|
+
spinner.text = "Initializing model provider...";
|
|
2046
2503
|
}
|
|
2047
|
-
const
|
|
2048
|
-
|
|
2504
|
+
const provider = createProvider(options.provider, options.apiKey);
|
|
2505
|
+
let queries = void 0;
|
|
2506
|
+
if (options.queries) {
|
|
2049
2507
|
if (spinner) {
|
|
2050
|
-
spinner.text = "
|
|
2508
|
+
spinner.text = "Loading custom trigger queries...";
|
|
2051
2509
|
}
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
const loadedQueries = await readJsonFile(options.queries);
|
|
2059
|
-
const parsedQueries = triggerQueryArraySchema.safeParse(loadedQueries);
|
|
2060
|
-
if (!parsedQueries.success) {
|
|
2061
|
-
throw new Error(
|
|
2062
|
-
`Invalid --queries JSON: ${parsedQueries.error.issues[0]?.message ?? "unknown format issue"}`
|
|
2063
|
-
);
|
|
2064
|
-
}
|
|
2065
|
-
queries = parsedQueries.data;
|
|
2510
|
+
queries = await loadTriggerQueriesFile(options.queries);
|
|
2511
|
+
}
|
|
2512
|
+
let prompts = void 0;
|
|
2513
|
+
if (options.prompts) {
|
|
2514
|
+
if (spinner) {
|
|
2515
|
+
spinner.text = "Loading eval prompts...";
|
|
2066
2516
|
}
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2072
|
-
|
|
2073
|
-
|
|
2074
|
-
|
|
2075
|
-
|
|
2076
|
-
|
|
2077
|
-
|
|
2517
|
+
prompts = await loadEvalPromptsJson(options.prompts);
|
|
2518
|
+
} else {
|
|
2519
|
+
prompts = await loadConfiguredEvalPrompts(command);
|
|
2520
|
+
}
|
|
2521
|
+
const model = resolveModel3(options.provider, options.model);
|
|
2522
|
+
const graderModel = options.graderModel ?? model;
|
|
2523
|
+
const result = await runCheck(targetPath, {
|
|
2524
|
+
provider,
|
|
2525
|
+
model,
|
|
2526
|
+
graderModel,
|
|
2527
|
+
lintFailOn: options.lintFailOn,
|
|
2528
|
+
lintSuppress: options.lintSuppress,
|
|
2529
|
+
queries,
|
|
2530
|
+
numQueries: options.numQueries,
|
|
2531
|
+
triggerSeed: options.triggerSeed,
|
|
2532
|
+
prompts,
|
|
2533
|
+
evalNumRuns: options.numRuns,
|
|
2534
|
+
minF1: options.minF1,
|
|
2535
|
+
minAssertPassRate: options.minAssertPassRate,
|
|
2536
|
+
continueOnLintFail: options.continueOnLintFail,
|
|
2537
|
+
verbose: options.verbose,
|
|
2538
|
+
onStage: (stage) => {
|
|
2539
|
+
if (!spinner) {
|
|
2540
|
+
return;
|
|
2078
2541
|
}
|
|
2079
|
-
|
|
2080
|
-
|
|
2081
|
-
|
|
2082
|
-
|
|
2083
|
-
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
queries,
|
|
2088
|
-
numQueries: options.numQueries,
|
|
2089
|
-
prompts,
|
|
2090
|
-
minF1: options.minF1,
|
|
2091
|
-
minAssertPassRate: options.minAssertPassRate,
|
|
2092
|
-
continueOnLintFail: Boolean(options.continueOnLintFail),
|
|
2093
|
-
verbose: Boolean(options.verbose),
|
|
2094
|
-
onStage: (stage) => {
|
|
2095
|
-
if (!spinner) {
|
|
2096
|
-
return;
|
|
2097
|
-
}
|
|
2098
|
-
if (stage === "lint") {
|
|
2099
|
-
spinner.text = "Running lint checks...";
|
|
2100
|
-
} else if (stage === "parse") {
|
|
2101
|
-
spinner.text = "Parsing skill for model evaluations...";
|
|
2102
|
-
} else if (stage === "trigger") {
|
|
2103
|
-
spinner.text = "Running trigger test suite...";
|
|
2104
|
-
} else if (stage === "eval") {
|
|
2105
|
-
spinner.text = "Running end-to-end eval suite...";
|
|
2106
|
-
}
|
|
2542
|
+
if (stage === "lint") {
|
|
2543
|
+
spinner.text = "Running lint checks...";
|
|
2544
|
+
} else if (stage === "parse") {
|
|
2545
|
+
spinner.text = "Parsing skill for model evaluations...";
|
|
2546
|
+
} else if (stage === "trigger") {
|
|
2547
|
+
spinner.text = "Running trigger test suite...";
|
|
2548
|
+
} else if (stage === "eval") {
|
|
2549
|
+
spinner.text = "Running end-to-end eval suite...";
|
|
2107
2550
|
}
|
|
2108
|
-
});
|
|
2109
|
-
if (options.saveResults) {
|
|
2110
|
-
await writeJsonFile(options.saveResults, result);
|
|
2111
|
-
}
|
|
2112
|
-
spinner?.stop();
|
|
2113
|
-
if (globalOptions.json) {
|
|
2114
|
-
writeResult(result, true);
|
|
2115
|
-
} else {
|
|
2116
|
-
writeResult(renderCheckReport(result, globalOptions.color, Boolean(options.verbose)), false);
|
|
2117
2551
|
}
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2552
|
+
});
|
|
2553
|
+
if (options.saveResults) {
|
|
2554
|
+
await writeJsonFile(options.saveResults, result);
|
|
2555
|
+
}
|
|
2556
|
+
spinner?.stop();
|
|
2557
|
+
if (options.json) {
|
|
2558
|
+
writeResult(result, true);
|
|
2559
|
+
} else {
|
|
2560
|
+
writeResult(renderCheckReport(result, options.color, options.verbose), false);
|
|
2561
|
+
}
|
|
2562
|
+
process.exitCode = result.gates.overallPassed ? 0 : 1;
|
|
2563
|
+
} catch (error) {
|
|
2564
|
+
spinner?.stop();
|
|
2565
|
+
writeError(error, options.json);
|
|
2566
|
+
process.exitCode = 2;
|
|
2567
|
+
}
|
|
2568
|
+
}
|
|
2569
|
+
function registerCheckCommand(program) {
|
|
2570
|
+
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
2571
|
+
const globalOptions = getGlobalCliOptions(command);
|
|
2572
|
+
const config = getResolvedConfig(command);
|
|
2573
|
+
const parsedCli = checkCliSchema.safeParse(command.opts());
|
|
2574
|
+
if (!parsedCli.success) {
|
|
2575
|
+
writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid check options."), globalOptions.json);
|
|
2122
2576
|
process.exitCode = 2;
|
|
2577
|
+
return;
|
|
2123
2578
|
}
|
|
2579
|
+
await handleCheckCommand(
|
|
2580
|
+
targetPath,
|
|
2581
|
+
{
|
|
2582
|
+
...globalOptions,
|
|
2583
|
+
provider: config.provider,
|
|
2584
|
+
model: config.model,
|
|
2585
|
+
graderModel: parsedCli.data.graderModel,
|
|
2586
|
+
apiKey: parsedCli.data.apiKey,
|
|
2587
|
+
queries: parsedCli.data.queries,
|
|
2588
|
+
numQueries: config.trigger.numQueries,
|
|
2589
|
+
prompts: parsedCli.data.prompts,
|
|
2590
|
+
minF1: config.trigger.threshold,
|
|
2591
|
+
minAssertPassRate: config.eval.threshold,
|
|
2592
|
+
numRuns: config.eval.numRuns,
|
|
2593
|
+
lintFailOn: config.lint.failOn,
|
|
2594
|
+
lintSuppress: config.lint.suppress,
|
|
2595
|
+
triggerSeed: config.trigger.seed,
|
|
2596
|
+
saveResults: parsedCli.data.saveResults,
|
|
2597
|
+
continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
|
|
2598
|
+
verbose: Boolean(parsedCli.data.verbose)
|
|
2599
|
+
},
|
|
2600
|
+
command
|
|
2601
|
+
);
|
|
2124
2602
|
});
|
|
2125
2603
|
}
|
|
2126
2604
|
|
|
@@ -2128,27 +2606,49 @@ function registerCheckCommand(program) {
|
|
|
2128
2606
|
function resolveVersion() {
|
|
2129
2607
|
try {
|
|
2130
2608
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
2131
|
-
const packageJsonPath =
|
|
2132
|
-
const raw =
|
|
2609
|
+
const packageJsonPath = path6.resolve(path6.dirname(currentFilePath), "..", "package.json");
|
|
2610
|
+
const raw = fs7.readFileSync(packageJsonPath, "utf8");
|
|
2133
2611
|
const parsed = JSON.parse(raw);
|
|
2134
2612
|
return parsed.version ?? "0.0.0";
|
|
2135
2613
|
} catch {
|
|
2136
2614
|
return "0.0.0";
|
|
2137
2615
|
}
|
|
2138
2616
|
}
|
|
2617
|
+
function shouldRenderJson(argv) {
|
|
2618
|
+
return argv.includes("--json");
|
|
2619
|
+
}
|
|
2620
|
+
function renderTopLevelError(error, asJson) {
|
|
2621
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2622
|
+
if (asJson) {
|
|
2623
|
+
process.stdout.write(`${JSON.stringify({ error: message }, null, 2)}
|
|
2624
|
+
`);
|
|
2625
|
+
return;
|
|
2626
|
+
}
|
|
2627
|
+
process.stderr.write(`Error: ${message}
|
|
2628
|
+
`);
|
|
2629
|
+
}
|
|
2139
2630
|
async function run(argv) {
|
|
2140
2631
|
const program = new Command();
|
|
2141
2632
|
program.name("skilltest").description("The testing framework for Agent Skills.").version(resolveVersion()).option("--json", "Output results as JSON").option("--no-color", "Disable colored output").showHelpAfterError();
|
|
2633
|
+
program.hook("preAction", async (_program, actionCommand) => {
|
|
2634
|
+
const targetPath = typeof actionCommand.processedArgs[0] === "string" ? actionCommand.processedArgs[0] : void 0;
|
|
2635
|
+
const cliOverrides = extractCliConfigOverrides(actionCommand);
|
|
2636
|
+
const context = await resolveConfigContext(targetPath, cliOverrides);
|
|
2637
|
+
setCommandExecutionContext(actionCommand, context);
|
|
2638
|
+
});
|
|
2142
2639
|
registerLintCommand(program);
|
|
2143
2640
|
registerTriggerCommand(program);
|
|
2144
2641
|
registerEvalCommand(program);
|
|
2145
2642
|
registerCheckCommand(program);
|
|
2146
|
-
|
|
2643
|
+
try {
|
|
2644
|
+
await program.parseAsync(argv);
|
|
2645
|
+
} catch (error) {
|
|
2646
|
+
renderTopLevelError(error, shouldRenderJson(argv));
|
|
2647
|
+
process.exitCode = 2;
|
|
2648
|
+
}
|
|
2147
2649
|
}
|
|
2148
2650
|
run(process.argv).catch((error) => {
|
|
2149
|
-
|
|
2150
|
-
process.stderr.write(`Error: ${message}
|
|
2151
|
-
`);
|
|
2651
|
+
renderTopLevelError(error, shouldRenderJson(process.argv));
|
|
2152
2652
|
process.exitCode = 2;
|
|
2153
2653
|
});
|
|
2154
2654
|
export {
|