skilltest 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -0
- package/dist/index.js +1478 -624
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
5
|
-
import
|
|
4
|
+
import fs7 from "node:fs";
|
|
5
|
+
import path6 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
@@ -194,6 +194,7 @@ function runCompatibilityChecks(context) {
|
|
|
194
194
|
if (hasAllowedTools) {
|
|
195
195
|
issues.push({
|
|
196
196
|
id: "compat.allowed-tools",
|
|
197
|
+
checkId: "compat:frontmatter",
|
|
197
198
|
title: "Platform-Specific Frontmatter",
|
|
198
199
|
status: "warn",
|
|
199
200
|
message: "Frontmatter includes allowed-tools, which is typically Claude-specific.",
|
|
@@ -202,6 +203,7 @@ function runCompatibilityChecks(context) {
|
|
|
202
203
|
} else {
|
|
203
204
|
issues.push({
|
|
204
205
|
id: "compat.allowed-tools",
|
|
206
|
+
checkId: "compat:frontmatter",
|
|
205
207
|
title: "Platform-Specific Frontmatter",
|
|
206
208
|
status: "pass",
|
|
207
209
|
message: "No known provider-specific frontmatter keys detected."
|
|
@@ -211,6 +213,7 @@ function runCompatibilityChecks(context) {
|
|
|
211
213
|
const platform = mentionsClaudeOnly ? "Claude" : "Codex";
|
|
212
214
|
issues.push({
|
|
213
215
|
id: "compat.provider-phrasing",
|
|
216
|
+
checkId: "compat:provider-language",
|
|
214
217
|
title: "Provider-Specific Language",
|
|
215
218
|
status: "warn",
|
|
216
219
|
message: `Skill body appears tuned to ${platform}-specific behavior.`,
|
|
@@ -219,6 +222,7 @@ function runCompatibilityChecks(context) {
|
|
|
219
222
|
} else {
|
|
220
223
|
issues.push({
|
|
221
224
|
id: "compat.provider-phrasing",
|
|
225
|
+
checkId: "compat:provider-language",
|
|
222
226
|
title: "Provider-Specific Language",
|
|
223
227
|
status: "pass",
|
|
224
228
|
message: "Skill body appears provider-neutral."
|
|
@@ -227,6 +231,7 @@ function runCompatibilityChecks(context) {
|
|
|
227
231
|
const likelyCompatibility = hasAllowedTools || mentionsClaudeOnly || mentionsCodexOnly ? "Likely compatible with some agents, but includes platform-specific assumptions." : "Likely broadly compatible across Anthropic, OpenAI/Codex-style, and other markdown skill runners.";
|
|
228
232
|
issues.push({
|
|
229
233
|
id: "compat.summary",
|
|
234
|
+
checkId: "compat:summary",
|
|
230
235
|
title: "Compatibility Hint",
|
|
231
236
|
status: hasAllowedTools || mentionsClaudeOnly || mentionsCodexOnly ? "warn" : "pass",
|
|
232
237
|
message: likelyCompatibility
|
|
@@ -234,6 +239,171 @@ function runCompatibilityChecks(context) {
|
|
|
234
239
|
return issues;
|
|
235
240
|
}
|
|
236
241
|
|
|
242
|
+
// src/core/linter/markdown-zones.ts
|
|
243
|
+
/**
 * Break a text blob into its individual lines.
 *
 * Both LF and CRLF line endings are recognised as separators.
 *
 * @param {string} raw - Text to split.
 * @returns {string[]} The lines, without their line terminators.
 */
function splitLines(raw) {
  const lineBreak = /\r?\n/;
  const lines = raw.split(lineBreak);
  return lines;
}
|
|
246
|
+
/**
 * Separate a leading `---` ... `---` frontmatter block from the document body.
 *
 * The block is only recognised when the very first line is exactly `---` and
 * a later line is exactly `---`. Otherwise the whole document is the body.
 *
 * @param {string} raw - Full document text.
 * @returns {{ bodyLines: string[], bodyStartLine: number }} The body lines and
 *   the 1-based line number (in `raw`) of the first body line.
 */
function stripTopFrontmatter(raw) {
  const allLines = splitLines(raw);
  const wholeDocument = { bodyLines: allLines, bodyStartLine: 1 };
  if (allLines[0] !== "---") {
    return wholeDocument;
  }
  // Start at index 1 so the opening delimiter is never taken as the closer.
  const closingIndex = allLines.indexOf("---", 1);
  if (closingIndex === -1) {
    return wholeDocument;
  }
  return {
    bodyLines: allLines.slice(closingIndex + 1),
    // +1 to step past the closing delimiter, +1 to convert to 1-based numbering.
    bodyStartLine: closingIndex + 2
  };
}
|
|
267
|
+
/**
 * Detect whether a line opens a fenced code block.
 *
 * A fence opener is optional leading whitespace followed by three or more
 * backticks or three or more tildes, optionally followed by an info string.
 *
 * @param {string} line - A single line of markdown (no line terminator).
 * @returns {string | null} The fence delimiter run (e.g. "```" or "~~~~"),
 *   or null when the line does not open a fence.
 */
function matchCodeFenceOpener(line) {
  const match = line.match(/^\s*(`{3,}|~{3,})(.*)$/);
  if (!match) {
    return null;
  }
  const [, delimiter, info] = match;
  // CommonMark: the info string of a backtick fence may not contain backticks.
  // Without this, an inline span such as ```cmd``` at the start of a line is
  // taken for an opener and the rest of the document is swallowed as code.
  if (delimiter[0] === "`" && info.includes("`")) {
    return null;
  }
  return delimiter;
}
|
|
271
|
+
/**
 * Decide whether a line closes the fenced code block opened by `delimiter`.
 *
 * Following CommonMark, a closing fence consists solely of the opener's fence
 * character (surrounding whitespace ignored) and is at least as long as the
 * opener. Accepting longer closers keeps a block such as ``` ... ```` from
 * staying open and hiding all following prose inside a code zone.
 *
 * @param {string} line - Candidate closing line.
 * @param {string} delimiter - The fence run returned for the opening line.
 * @returns {boolean} True when the line closes the fence.
 */
function isExactCodeFenceCloser(line, delimiter) {
  const candidate = line.trim();
  if (candidate.length < delimiter.length) {
    return false;
  }
  const fenceChar = delimiter[0];
  // No info string is allowed on a closer: every character must be the fence char.
  return [...candidate].every((char) => char === fenceChar);
}
|
|
274
|
+
/**
 * Add a piece of content to the zone list, merging it into the last zone when
 * that zone has the same type and the new piece starts on the same line or
 * the line directly after it.
 *
 * Empty content is ignored. When merging onto a later line a newline is
 * inserted between the existing and the new content.
 *
 * @param {Array<{type: string, content: string, startLine: number, endLine: number}>} zones
 *   Zone list, mutated in place.
 * @param {string} type - Zone type of the new content.
 * @param {string} content - Text to add.
 * @param {number} startLine - First line covered by the content.
 * @param {number} endLine - Last line covered by the content.
 * @returns {void}
 */
function appendZone(zones, type, content, startLine, endLine) {
  if (content === "") {
    return;
  }
  const last = zones[zones.length - 1];
  const canMerge = Boolean(last) && last.type === type && startLine <= last.endLine + 1;
  if (!canMerge) {
    zones.push({ type, content, startLine, endLine });
    return;
  }
  if (startLine > last.endLine) {
    // The new piece begins on the following line: keep the line break.
    last.content += "\n";
  }
  last.content += content;
  last.endLine = endLine;
}
|
|
292
|
+
/**
 * Extend a zone that is still open (a code fence or multi-line comment) with
 * the text of another line.
 *
 * A newline is inserted whenever `lineNumber` lies beyond the zone's current
 * end line. Empty content on a line the zone already covers changes nothing.
 *
 * @param {{content: string, endLine: number}} zone - Zone to extend, mutated in place.
 * @param {string} content - Text to append (may be empty).
 * @param {number} lineNumber - Line the text comes from.
 * @returns {void}
 */
function appendToOpenZone(zone, content, lineNumber) {
  const startsNewLine = lineNumber > zone.endLine;
  if (content === "" && !startsNewLine) {
    return;
  }
  if (startsNewLine) {
    zone.content += "\n";
  }
  zone.content += content;
  zone.endLine = lineNumber;
}
|
|
304
|
+
/**
 * Add one line's worth of text to the zone list, splitting out inline code
 * spans delimited by a pair of single backticks.
 *
 * Text outside backtick pairs is recorded as `baseType`; each backtick pair
 * (backticks included) is recorded as "inline-code". A backtick with no
 * partner later in the text is kept as part of the base text.
 *
 * @param {Array<object>} zones - Zone list, mutated in place via appendZone.
 * @param {string} text - Text fragment from a single line.
 * @param {number} lineNumber - Line number the fragment belongs to.
 * @param {string} baseType - Zone type for text outside inline code.
 * @returns {void}
 */
function addInlineAwareText(zones, text, lineNumber, baseType) {
  const emit = (type, from, to) => {
    appendZone(zones, type, text.slice(from, to), lineNumber, lineNumber);
  };
  let position = 0;
  while (position < text.length) {
    const openTick = text.indexOf("`", position);
    if (openTick === -1) {
      // No more backticks: everything left is base text.
      emit(baseType, position);
      break;
    }
    if (openTick > position) {
      emit(baseType, position, openTick);
    }
    const closeTick = text.indexOf("`", openTick + 1);
    if (closeTick === -1) {
      // Unpaired backtick: treat it and the remainder as base text.
      emit(baseType, openTick);
      break;
    }
    emit("inline-code", openTick, closeTick + 1);
    position = closeTick + 1;
  }
}
|
|
327
|
+
/**
 * Partition a markdown document (minus any top frontmatter) into typed zones.
 *
 * Zone types produced here are "code-fence", "html-comment", "blockquote" and
 * "prose"; inline code inside text is split out by addInlineAwareText. Line
 * numbers on zones refer to lines of the original `raw` text.
 *
 * @param {string} raw - Full document text.
 * @returns {Array<{type: string, content: string, startLine: number, endLine: number}>}
 *   Zones in document order.
 */
function parseZones(raw) {
  const { bodyLines, bodyStartLine } = stripTopFrontmatter(raw);
  const zones = [];
  // While inside a fenced block: { delimiter, zone }. The zone is pushed on close.
  let fence = null;
  // While inside an unterminated <!-- comment: the zone being built.
  let pendingComment = null;

  // Scan a non-fence line left to right, alternating between comment and text.
  const scanLine = (line, lineNumber) => {
    const baseType = /^\s*>/.test(line) ? "blockquote" : "prose";
    let offset = 0;
    // An open comment must see every line, including empty ones.
    while (pendingComment || offset < line.length) {
      if (pendingComment) {
        const closeAt = line.indexOf("-->", offset);
        if (closeAt === -1) {
          appendToOpenZone(pendingComment, line.slice(offset), lineNumber);
          return;
        }
        const afterClose = closeAt + 3;
        appendToOpenZone(pendingComment, line.slice(offset, afterClose), lineNumber);
        zones.push(pendingComment);
        pendingComment = null;
        offset = afterClose;
        continue;
      }
      const openAt = line.indexOf("<!--", offset);
      if (openAt === -1) {
        // No further comment on this line: the remainder is ordinary text.
        addInlineAwareText(zones, line.slice(offset), lineNumber, baseType);
        return;
      }
      if (openAt > offset) {
        addInlineAwareText(zones, line.slice(offset, openAt), lineNumber, baseType);
      }
      const commentEnd = line.indexOf("-->", openAt + 4);
      if (commentEnd === -1) {
        // Comment continues onto following lines.
        pendingComment = {
          type: "html-comment",
          content: line.slice(openAt),
          startLine: lineNumber,
          endLine: lineNumber
        };
        return;
      }
      appendZone(zones, "html-comment", line.slice(openAt, commentEnd + 3), lineNumber, lineNumber);
      offset = commentEnd + 3;
    }
  };

  bodyLines.forEach((line, index) => {
    const lineNumber = bodyStartLine + index;
    if (fence) {
      // The closing fence line itself is part of the code-fence zone.
      appendToOpenZone(fence.zone, line, lineNumber);
      if (isExactCodeFenceCloser(line, fence.delimiter)) {
        zones.push(fence.zone);
        fence = null;
      }
      return;
    }
    // Fence openers are not recognised inside an open HTML comment.
    const fenceDelimiter = pendingComment ? null : matchCodeFenceOpener(line);
    if (fenceDelimiter) {
      fence = {
        delimiter: fenceDelimiter,
        zone: {
          type: "code-fence",
          content: line,
          startLine: lineNumber,
          endLine: lineNumber
        }
      };
      return;
    }
    scanLine(line, lineNumber);
  });

  // Unterminated constructs at end of input are still reported as zones.
  if (pendingComment) {
    zones.push(pendingComment);
  }
  if (fence) {
    zones.push(fence.zone);
  }
  return zones;
}
|
|
406
|
+
|
|
237
407
|
// src/core/linter/content.ts
|
|
238
408
|
var VAGUE_PATTERNS = [
|
|
239
409
|
/\bdo something appropriate\b/i,
|
|
@@ -250,6 +420,102 @@ var SECRET_PATTERNS = [
|
|
|
250
420
|
{ label: "Slack token", regex: /\bxox[baprs]-[A-Za-z0-9-]{20,}\b/ },
|
|
251
421
|
{ label: "Generic private key header", regex: /-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----/ }
|
|
252
422
|
];
|
|
423
|
+
/**
 * Compute the overall line span covered by a list of matches.
 *
 * @param {Array<{startLine: number, endLine: number}>} matches
 * @returns {{startLine: number, endLine: number} | {}} The smallest start line
 *   and largest end line, or an empty object when there are no matches.
 */
function summarizeLineRange(matches) {
  if (matches.length === 0) {
    return {};
  }
  let startLine = Infinity;
  let endLine = -Infinity;
  for (const { startLine: from, endLine: to } of matches) {
    startLine = Math.min(startLine, from);
    endLine = Math.max(endLine, to);
  }
  return { startLine, endLine };
}
|
|
432
|
+
/**
 * List the distinct labels found in a set of matches, in first-seen order.
 *
 * @param {Array<{label: string}>} matches
 * @returns {string[]} Labels with duplicates removed.
 */
function uniqueLabels(matches) {
  // A Set iterates in insertion order, so first occurrences keep their position.
  const distinct = new Set(matches.map((match) => match.label));
  return [...distinct];
}
|
|
444
|
+
/**
 * Test every zone against every entry of SECRET_PATTERNS and sort the hits by
 * whether they occurred in a "prose" zone or in any other zone type.
 *
 * Each hit records the pattern label together with the zone's type and line
 * span (not the position of the match inside the zone).
 *
 * @param {Array<{type: string, content: string, startLine: number, endLine: number}>} zones
 * @returns {{prose: object[], nonProse: object[]}} Hits grouped by zone kind.
 */
function collectSecretMatches(zones) {
  const hits = { prose: [], nonProse: [] };
  for (const zone of zones) {
    const bucket = zone.type === "prose" ? hits.prose : hits.nonProse;
    for (const pattern of SECRET_PATTERNS) {
      if (pattern.regex.test(zone.content)) {
        bucket.push({
          label: pattern.label,
          zoneType: zone.type,
          startLine: zone.startLine,
          endLine: zone.endLine
        });
      }
    }
  }
  return hits;
}
|
|
467
|
+
/**
 * Produce the `skippedPatterns` payload for an issue from a list of matches.
 *
 * @param {Array<{label: string, zoneType: string, startLine: number, endLine: number}>} matches
 * @returns {Array<{label: string, zoneType: string, startLine: number, endLine: number}> | undefined}
 *   Fresh objects carrying only the four reported fields, or undefined when
 *   there is nothing to report.
 */
function buildSkippedPatterns(matches) {
  const toEntry = ({ label, zoneType, startLine, endLine }) => ({
    label,
    zoneType,
    startLine,
    endLine
  });
  return matches.length === 0 ? void 0 : matches.map(toEntry);
}
|
|
478
|
+
/**
 * Build the "content:secrets" lint result for a skill.
 *
 * Outcome by where secret-like patterns were found:
 *  - in prose zones: "fail";
 *  - only in non-prose zones (code fences, inline code, comments, blockquotes): "warn";
 *  - nowhere: "pass".
 *
 * @param {{suppressedCheckIds: Set<string>, skill: {raw: string}}} context
 * @returns {object | null} The issue object, or null when the check is suppressed.
 */
function buildSecretsIssue(context) {
  if (context.suppressedCheckIds.has("content:secrets")) {
    return null;
  }
  const zones = parseZones(context.skill.raw);
  const { prose, nonProse } = collectSecretMatches(zones);
  const proseLabels = uniqueLabels(prose);
  const nonProseLabels = uniqueLabels(nonProse);
  const skippedPatterns = buildSkippedPatterns(nonProse);
  // Fields shared by every outcome of this check.
  const identity = {
    id: "content.secrets",
    checkId: "content:secrets",
    title: "Hardcoded Secrets"
  };

  if (proseLabels.length > 0) {
    return {
      ...identity,
      status: "fail",
      message: `Potential secrets detected (${proseLabels.join(", ")}).`,
      suggestion: "Remove secrets from skill files and use environment variables or secret managers.",
      ...summarizeLineRange(prose),
      skippedPatterns
    };
  }

  if (nonProseLabels.length === 0) {
    return {
      ...identity,
      status: "pass",
      message: "No obvious API keys or secrets patterns were detected."
    };
  }

  // Hits exist only outside prose; use the code-example wording when all are in fences.
  const joinedLabels = nonProseLabels.join(", ");
  const codeFenceOnly = nonProse.every((match) => match.zoneType === "code-fence");
  let message;
  if (codeFenceOnly) {
    message = `Possible secret in code example \u2014 verify this is a placeholder, not a real key (${joinedLabels}).`;
  } else {
    message = `Possible secrets found outside prose instructions (${joinedLabels}). Verify these are placeholders, not real credentials.`;
  }
  return {
    ...identity,
    status: "warn",
    message,
    suggestion: "Replace real-looking credentials in examples with explicit placeholders such as YOUR_API_KEY.",
    ...summarizeLineRange(nonProse),
    skippedPatterns
  };
}
|
|
253
519
|
function runContentChecks(context) {
|
|
254
520
|
const issues = [];
|
|
255
521
|
const body = context.frontmatter.content;
|
|
@@ -258,6 +524,7 @@ function runContentChecks(context) {
|
|
|
258
524
|
if (!/^#{1,6}\s+\S+/m.test(body)) {
|
|
259
525
|
issues.push({
|
|
260
526
|
id: "content.headers",
|
|
527
|
+
checkId: "content:headers",
|
|
261
528
|
title: "Section Headers",
|
|
262
529
|
status: "warn",
|
|
263
530
|
message: "No markdown headers found in SKILL.md body.",
|
|
@@ -266,6 +533,7 @@ function runContentChecks(context) {
|
|
|
266
533
|
} else {
|
|
267
534
|
issues.push({
|
|
268
535
|
id: "content.headers",
|
|
536
|
+
checkId: "content:headers",
|
|
269
537
|
title: "Section Headers",
|
|
270
538
|
status: "pass",
|
|
271
539
|
message: "SKILL.md contains markdown section headers."
|
|
@@ -275,6 +543,7 @@ function runContentChecks(context) {
|
|
|
275
543
|
if (!hasExamples) {
|
|
276
544
|
issues.push({
|
|
277
545
|
id: "content.examples",
|
|
546
|
+
checkId: "content:examples",
|
|
278
547
|
title: "Examples",
|
|
279
548
|
status: "warn",
|
|
280
549
|
message: "No examples detected in SKILL.md body.",
|
|
@@ -283,6 +552,7 @@ function runContentChecks(context) {
|
|
|
283
552
|
} else {
|
|
284
553
|
issues.push({
|
|
285
554
|
id: "content.examples",
|
|
555
|
+
checkId: "content:examples",
|
|
286
556
|
title: "Examples",
|
|
287
557
|
status: "pass",
|
|
288
558
|
message: "Examples were detected in SKILL.md."
|
|
@@ -292,6 +562,7 @@ function runContentChecks(context) {
|
|
|
292
562
|
if (vagueMatches.length > 0) {
|
|
293
563
|
issues.push({
|
|
294
564
|
id: "content.vagueness",
|
|
565
|
+
checkId: "content:vagueness",
|
|
295
566
|
title: "Instruction Specificity",
|
|
296
567
|
status: "warn",
|
|
297
568
|
message: "Potentially vague instruction phrases detected.",
|
|
@@ -300,6 +571,7 @@ function runContentChecks(context) {
|
|
|
300
571
|
} else {
|
|
301
572
|
issues.push({
|
|
302
573
|
id: "content.vagueness",
|
|
574
|
+
checkId: "content:vagueness",
|
|
303
575
|
title: "Instruction Specificity",
|
|
304
576
|
status: "pass",
|
|
305
577
|
message: "No obvious vague placeholder phrasing found."
|
|
@@ -308,6 +580,7 @@ function runContentChecks(context) {
|
|
|
308
580
|
if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
|
|
309
581
|
issues.push({
|
|
310
582
|
id: "content.frontmatter-angle-brackets",
|
|
583
|
+
checkId: "content:angle-brackets",
|
|
311
584
|
title: "Frontmatter Angle Brackets",
|
|
312
585
|
status: "warn",
|
|
313
586
|
message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
|
|
@@ -316,36 +589,20 @@ function runContentChecks(context) {
|
|
|
316
589
|
} else {
|
|
317
590
|
issues.push({
|
|
318
591
|
id: "content.frontmatter-angle-brackets",
|
|
592
|
+
checkId: "content:angle-brackets",
|
|
319
593
|
title: "Frontmatter Angle Brackets",
|
|
320
594
|
status: "pass",
|
|
321
595
|
message: "No angle bracket tokens detected in frontmatter."
|
|
322
596
|
});
|
|
323
597
|
}
|
|
324
|
-
const
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
secretHits.add(pattern.label);
|
|
328
|
-
}
|
|
329
|
-
}
|
|
330
|
-
if (secretHits.size > 0) {
|
|
331
|
-
issues.push({
|
|
332
|
-
id: "content.secrets",
|
|
333
|
-
title: "Hardcoded Secrets",
|
|
334
|
-
status: "fail",
|
|
335
|
-
message: `Potential secrets detected (${Array.from(secretHits).join(", ")}).`,
|
|
336
|
-
suggestion: "Remove secrets from skill files and use environment variables or secret managers."
|
|
337
|
-
});
|
|
338
|
-
} else {
|
|
339
|
-
issues.push({
|
|
340
|
-
id: "content.secrets",
|
|
341
|
-
title: "Hardcoded Secrets",
|
|
342
|
-
status: "pass",
|
|
343
|
-
message: "No obvious API keys or secrets patterns were detected."
|
|
344
|
-
});
|
|
598
|
+
const secretsIssue = buildSecretsIssue(context);
|
|
599
|
+
if (secretsIssue) {
|
|
600
|
+
issues.push(secretsIssue);
|
|
345
601
|
}
|
|
346
602
|
if (bodyLines.length < 10) {
|
|
347
603
|
issues.push({
|
|
348
604
|
id: "content.body-length",
|
|
605
|
+
checkId: "content:body-length",
|
|
349
606
|
title: "Body Completeness",
|
|
350
607
|
status: "warn",
|
|
351
608
|
message: `SKILL.md body has only ${bodyLines.length} non-empty lines.`,
|
|
@@ -354,6 +611,7 @@ function runContentChecks(context) {
|
|
|
354
611
|
} else {
|
|
355
612
|
issues.push({
|
|
356
613
|
id: "content.body-length",
|
|
614
|
+
checkId: "content:body-length",
|
|
357
615
|
title: "Body Completeness",
|
|
358
616
|
status: "pass",
|
|
359
617
|
message: `SKILL.md body has ${bodyLines.length} non-empty lines.`
|
|
@@ -362,6 +620,7 @@ function runContentChecks(context) {
|
|
|
362
620
|
if (description && description.length < 50) {
|
|
363
621
|
issues.push({
|
|
364
622
|
id: "content.description-length",
|
|
623
|
+
checkId: "content:description-length",
|
|
365
624
|
title: "Description Specificity",
|
|
366
625
|
status: "warn",
|
|
367
626
|
message: `Description length is ${description.length} characters, which may be too vague for reliable triggering.`,
|
|
@@ -370,6 +629,7 @@ function runContentChecks(context) {
|
|
|
370
629
|
} else if (description) {
|
|
371
630
|
issues.push({
|
|
372
631
|
id: "content.description-length",
|
|
632
|
+
checkId: "content:description-length",
|
|
373
633
|
title: "Description Specificity",
|
|
374
634
|
status: "pass",
|
|
375
635
|
message: "Description length is sufficient for triggerability heuristics."
|
|
@@ -461,6 +721,7 @@ async function runDisclosureChecks(context) {
|
|
|
461
721
|
if (context.skill.lineCount > 200 && !await pathExists(referencesDir)) {
|
|
462
722
|
issues.push({
|
|
463
723
|
id: "disclosure.skill-split",
|
|
724
|
+
checkId: "disclosure:progressive-disclosure",
|
|
464
725
|
title: "Progressive Disclosure",
|
|
465
726
|
status: "warn",
|
|
466
727
|
message: "SKILL.md exceeds 200 lines and no references/ directory is present.",
|
|
@@ -469,6 +730,7 @@ async function runDisclosureChecks(context) {
|
|
|
469
730
|
} else {
|
|
470
731
|
issues.push({
|
|
471
732
|
id: "disclosure.skill-split",
|
|
733
|
+
checkId: "disclosure:progressive-disclosure",
|
|
472
734
|
title: "Progressive Disclosure",
|
|
473
735
|
status: "pass",
|
|
474
736
|
message: "Top-level file length and references/ usage look reasonable."
|
|
@@ -496,6 +758,7 @@ async function runDisclosureChecks(context) {
|
|
|
496
758
|
if (nonRelativeOrEscaping.length > 0) {
|
|
497
759
|
issues.push({
|
|
498
760
|
id: "disclosure.relative-path-root",
|
|
761
|
+
checkId: "disclosure:path-scope",
|
|
499
762
|
title: "Reference Path Scope",
|
|
500
763
|
status: "fail",
|
|
501
764
|
message: `Found non-relative or out-of-root references: ${nonRelativeOrEscaping.join(", ")}`,
|
|
@@ -504,6 +767,7 @@ async function runDisclosureChecks(context) {
|
|
|
504
767
|
} else {
|
|
505
768
|
issues.push({
|
|
506
769
|
id: "disclosure.relative-path-root",
|
|
770
|
+
checkId: "disclosure:path-scope",
|
|
507
771
|
title: "Reference Path Scope",
|
|
508
772
|
status: "pass",
|
|
509
773
|
message: "All detected file references are relative and scoped to skill root."
|
|
@@ -513,6 +777,7 @@ async function runDisclosureChecks(context) {
|
|
|
513
777
|
if (chainDepth > 1) {
|
|
514
778
|
issues.push({
|
|
515
779
|
id: "disclosure.reference-depth",
|
|
780
|
+
checkId: "disclosure:reference-depth",
|
|
516
781
|
title: "Reference Chain Depth",
|
|
517
782
|
status: "warn",
|
|
518
783
|
message: "Deep reference chains detected (>1 level).",
|
|
@@ -521,6 +786,7 @@ async function runDisclosureChecks(context) {
|
|
|
521
786
|
} else {
|
|
522
787
|
issues.push({
|
|
523
788
|
id: "disclosure.reference-depth",
|
|
789
|
+
checkId: "disclosure:reference-depth",
|
|
524
790
|
title: "Reference Chain Depth",
|
|
525
791
|
status: "pass",
|
|
526
792
|
message: "Reference depth is shallow and easy to navigate."
|
|
@@ -530,6 +796,7 @@ async function runDisclosureChecks(context) {
|
|
|
530
796
|
if (normalizedReferences.some((item) => item.includes("../"))) {
|
|
531
797
|
issues.push({
|
|
532
798
|
id: "disclosure.parent-traversal",
|
|
799
|
+
checkId: "disclosure:parent-traversal",
|
|
533
800
|
title: "Parent Traversal",
|
|
534
801
|
status: "warn",
|
|
535
802
|
message: "References include parent-directory traversal (../).",
|
|
@@ -538,6 +805,7 @@ async function runDisclosureChecks(context) {
|
|
|
538
805
|
} else {
|
|
539
806
|
issues.push({
|
|
540
807
|
id: "disclosure.parent-traversal",
|
|
808
|
+
checkId: "disclosure:parent-traversal",
|
|
541
809
|
title: "Parent Traversal",
|
|
542
810
|
status: "pass",
|
|
543
811
|
message: "No parent-directory traversal references detected."
|
|
@@ -568,6 +836,7 @@ function runFrontmatterChecks(context) {
|
|
|
568
836
|
if (!context.frontmatter.hasFrontmatter) {
|
|
569
837
|
issues.push({
|
|
570
838
|
id: "frontmatter.exists",
|
|
839
|
+
checkId: "frontmatter:exists",
|
|
571
840
|
title: "Frontmatter Presence",
|
|
572
841
|
status: "fail",
|
|
573
842
|
message: "SKILL.md is missing YAML frontmatter delimited by --- blocks.",
|
|
@@ -578,6 +847,7 @@ function runFrontmatterChecks(context) {
|
|
|
578
847
|
if (context.frontmatter.error) {
|
|
579
848
|
issues.push({
|
|
580
849
|
id: "frontmatter.valid-yaml",
|
|
850
|
+
checkId: "frontmatter:yaml",
|
|
581
851
|
title: "Frontmatter YAML",
|
|
582
852
|
status: "fail",
|
|
583
853
|
message: `Frontmatter is not valid YAML: ${context.frontmatter.error}`,
|
|
@@ -587,6 +857,7 @@ function runFrontmatterChecks(context) {
|
|
|
587
857
|
}
|
|
588
858
|
issues.push({
|
|
589
859
|
id: "frontmatter.valid-yaml",
|
|
860
|
+
checkId: "frontmatter:yaml",
|
|
590
861
|
title: "Frontmatter YAML",
|
|
591
862
|
status: "pass",
|
|
592
863
|
message: "Frontmatter exists and parses correctly."
|
|
@@ -596,6 +867,7 @@ function runFrontmatterChecks(context) {
|
|
|
596
867
|
if (!name) {
|
|
597
868
|
issues.push({
|
|
598
869
|
id: "frontmatter.name.required",
|
|
870
|
+
checkId: "frontmatter:name",
|
|
599
871
|
title: "Frontmatter Name",
|
|
600
872
|
status: "fail",
|
|
601
873
|
message: "Missing required frontmatter field: name.",
|
|
@@ -604,6 +876,7 @@ function runFrontmatterChecks(context) {
|
|
|
604
876
|
} else if (name.length > 64) {
|
|
605
877
|
issues.push({
|
|
606
878
|
id: "frontmatter.name.length",
|
|
879
|
+
checkId: "frontmatter:name",
|
|
607
880
|
title: "Frontmatter Name Length",
|
|
608
881
|
status: "fail",
|
|
609
882
|
message: `name is too long (${name.length} chars, max 64).`,
|
|
@@ -612,6 +885,7 @@ function runFrontmatterChecks(context) {
|
|
|
612
885
|
} else if (!SKILL_NAME_REGEX.test(name)) {
|
|
613
886
|
issues.push({
|
|
614
887
|
id: "frontmatter.name.format",
|
|
888
|
+
checkId: "frontmatter:name",
|
|
615
889
|
title: "Frontmatter Name Format",
|
|
616
890
|
status: "fail",
|
|
617
891
|
message: "name must be lowercase alphanumeric with single hyphen separators only.",
|
|
@@ -620,6 +894,7 @@ function runFrontmatterChecks(context) {
|
|
|
620
894
|
} else {
|
|
621
895
|
issues.push({
|
|
622
896
|
id: "frontmatter.name.valid",
|
|
897
|
+
checkId: "frontmatter:name",
|
|
623
898
|
title: "Frontmatter Name",
|
|
624
899
|
status: "pass",
|
|
625
900
|
message: "name is present and follows naming conventions."
|
|
@@ -629,6 +904,7 @@ function runFrontmatterChecks(context) {
|
|
|
629
904
|
if (!description || description.trim() === "") {
|
|
630
905
|
issues.push({
|
|
631
906
|
id: "frontmatter.description.required",
|
|
907
|
+
checkId: "frontmatter:description",
|
|
632
908
|
title: "Frontmatter Description",
|
|
633
909
|
status: "fail",
|
|
634
910
|
message: "Missing required frontmatter field: description.",
|
|
@@ -637,6 +913,7 @@ function runFrontmatterChecks(context) {
|
|
|
637
913
|
} else if (description.length > 1024) {
|
|
638
914
|
issues.push({
|
|
639
915
|
id: "frontmatter.description.length",
|
|
916
|
+
checkId: "frontmatter:description",
|
|
640
917
|
title: "Frontmatter Description Length",
|
|
641
918
|
status: "fail",
|
|
642
919
|
message: `description is too long (${description.length} chars, max 1024).`,
|
|
@@ -645,6 +922,7 @@ function runFrontmatterChecks(context) {
|
|
|
645
922
|
} else {
|
|
646
923
|
issues.push({
|
|
647
924
|
id: "frontmatter.description.valid",
|
|
925
|
+
checkId: "frontmatter:description",
|
|
648
926
|
title: "Frontmatter Description",
|
|
649
927
|
status: "pass",
|
|
650
928
|
message: "description is present and within allowed length."
|
|
@@ -654,6 +932,7 @@ function runFrontmatterChecks(context) {
|
|
|
654
932
|
if (!license || license.trim() === "") {
|
|
655
933
|
issues.push({
|
|
656
934
|
id: "frontmatter.license.recommended",
|
|
935
|
+
checkId: "frontmatter:license",
|
|
657
936
|
title: "Frontmatter License",
|
|
658
937
|
status: "warn",
|
|
659
938
|
message: "No license field found in frontmatter.",
|
|
@@ -662,6 +941,7 @@ function runFrontmatterChecks(context) {
|
|
|
662
941
|
} else {
|
|
663
942
|
issues.push({
|
|
664
943
|
id: "frontmatter.license.present",
|
|
944
|
+
checkId: "frontmatter:license",
|
|
665
945
|
title: "Frontmatter License",
|
|
666
946
|
status: "pass",
|
|
667
947
|
message: "license field is present."
|
|
@@ -670,6 +950,7 @@ function runFrontmatterChecks(context) {
|
|
|
670
950
|
if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
|
|
671
951
|
issues.push({
|
|
672
952
|
id: "frontmatter.description.triggerability",
|
|
953
|
+
checkId: "frontmatter:triggerability",
|
|
673
954
|
title: "Description Trigger Clarity",
|
|
674
955
|
status: "warn",
|
|
675
956
|
message: "Description should explain both what the skill does and when it should be used.",
|
|
@@ -678,6 +959,7 @@ function runFrontmatterChecks(context) {
|
|
|
678
959
|
} else if (description) {
|
|
679
960
|
issues.push({
|
|
680
961
|
id: "frontmatter.description.triggerability",
|
|
962
|
+
checkId: "frontmatter:triggerability",
|
|
681
963
|
title: "Description Trigger Clarity",
|
|
682
964
|
status: "pass",
|
|
683
965
|
message: "Description appears to cover both capability and usage context."
|
|
@@ -735,85 +1017,159 @@ var SHELL_ACTIVITY_PATTERNS = [
|
|
|
735
1017
|
/\b(?:npm|pnpm|yarn|pip|git|docker|kubectl)\s+[A-Za-z0-9-]/i
|
|
736
1018
|
];
|
|
737
1019
|
var SAFETY_GUARDRAIL_PATTERN = /\b(?:ask before|confirm|approval|dry[- ]run|sandbox|least privilege|redact|never expose|do not reveal)\b/i;
|
|
738
|
-
function
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
1020
|
+
/**
 * Describe one pattern hit inside one zone.
 *
 * @param {{type: string, startLine: number, endLine: number}} zone - Zone the pattern matched in.
 * @param {{label: string}} pattern - Pattern that matched.
 * @returns {{label: string, zoneType: string, startLine: number, endLine: number}}
 *   The pattern label plus the zone's type and line span.
 */
function buildOccurrence(zone, pattern) {
  const { type: zoneType, startLine, endLine } = zone;
  const { label } = pattern;
  return { label, zoneType, startLine, endLine };
}
|
|
1028
|
+
/**
 * Test every zone against every supplied pattern and split the hits into
 * those found in "prose" zones (flagged) and those found in any other zone
 * type (skipped).
 *
 * @param {Array<{type: string, content: string, startLine: number, endLine: number}>} zones
 * @param {Array<{label: string, regex: RegExp}>} patterns
 * @returns {{flagged: object[], skipped: object[]}} Occurrences grouped by zone kind.
 */
function collectZoneAwareMatches(zones, patterns) {
  const flagged = [];
  const skipped = [];
  for (const zone of zones) {
    for (const pattern of patterns) {
      const { regex } = pattern;
      // `test` on a global/sticky regex resumes from `lastIndex`, so reusing the
      // same regex across zones could silently miss matches. Always start at 0.
      const stateful = regex.global || regex.sticky;
      if (stateful) {
        regex.lastIndex = 0;
      }
      const matched = regex.test(zone.content);
      if (stateful) {
        regex.lastIndex = 0;
      }
      if (!matched) {
        continue;
      }
      const occurrence = buildOccurrence(zone, pattern);
      if (zone.type === "prose") {
        flagged.push(occurrence);
      } else {
        skipped.push(occurrence);
      }
    }
  }
  return { flagged, skipped };
}
|
|
1046
|
+
/**
 * Collect the distinct `label` values of the given matches, keeping the
 * order in which each label first appears.
 *
 * @param {Array<{label: string}>} matches
 * @returns {string[]} De-duplicated labels.
 */
function uniqueLabels2(matches) {
  const alreadyListed = /* @__PURE__ */ new Set();
  return matches
    .map((match) => match.label)
    .filter((label) => {
      if (alreadyListed.has(label)) {
        return false;
      }
      alreadyListed.add(label);
      return true;
    });
}
|
|
1058
|
+
/**
 * Reduce a list of matches to the line span that encloses all of them.
 *
 * @param {Array<{startLine: number, endLine: number}>} matches
 * @returns {{startLine: number, endLine: number} | {}} Minimum start line and
 *   maximum end line, or an empty object for an empty list.
 */
function summarizeLineRange2(matches) {
  if (matches.length === 0) {
    return {};
  }
  return matches.reduce(
    (range, match) => ({
      startLine: Math.min(range.startLine, match.startLine),
      endLine: Math.max(range.endLine, match.endLine)
    }),
    { startLine: Infinity, endLine: -Infinity }
  );
}
|
|
1067
|
+
/**
 * Convert skipped matches into the `skippedPatterns` array attached to an issue.
 *
 * @param {Array<{label: string, zoneType: string, startLine: number, endLine: number}>} matches
 * @returns {Array<{label: string, zoneType: string, startLine: number, endLine: number}> | undefined}
 *   New objects holding only the reported fields, or undefined when `matches` is empty.
 */
function buildSkippedPatterns2(matches) {
  if (matches.length === 0) {
    return void 0;
  }
  const entries = [];
  for (const { label, zoneType, startLine, endLine } of matches) {
    entries.push({ label, zoneType, startLine, endLine });
  }
  return entries;
}
|
|
1078
|
+
/**
 * Report whether a check id is present in the context's suppressed set.
 *
 * @param {{suppressedCheckIds: {has(id: string): boolean}}} context
 * @param {string} checkId - Identifier such as "security:exfiltration".
 * @returns {boolean} Result of `suppressedCheckIds.has(checkId)`.
 */
function isSuppressed(context, checkId) {
  const { suppressedCheckIds } = context;
  return suppressedCheckIds.has(checkId);
}
|
|
1081
|
+
/**
 * Run one pattern-based security check over pre-parsed zones.
 *
 * Hits in prose zones produce an issue with `options.statusOnMatch`; hits in
 * other zones are only reported through `skippedPatterns`. With no prose hits
 * the result is a "pass".
 *
 * @param {object} context - Lint context; consulted for suppression via isSuppressed.
 * @param {Array<object>} zones - Zones to scan.
 * @param {object} options - Check definition: `id`, `checkId`, `title`,
 *   `statusOnMatch`, `patterns`, `matchMessagePrefix`, `passMessage`, `suggestion`.
 * @returns {object | null} The issue object, or null when the check is suppressed.
 */
function runZoneAwareSecurityCheck(context, zones, options) {
  if (isSuppressed(context, options.checkId)) {
    return null;
  }
  const { flagged, skipped } = collectZoneAwareMatches(zones, options.patterns);
  const labels = uniqueLabels2(flagged);
  const skippedPatterns = buildSkippedPatterns2(skipped);
  const { id, checkId, title } = options;

  if (labels.length === 0) {
    return {
      id,
      checkId,
      title,
      status: "pass",
      message: options.passMessage,
      skippedPatterns
    };
  }

  return {
    id,
    checkId,
    title,
    status: options.statusOnMatch,
    message: `${options.matchMessagePrefix}: ${labels.join(", ")}.`,
    suggestion: options.suggestion,
    ...summarizeLineRange2(flagged),
    skippedPatterns
  };
}
|
|
747
1109
|
function runSecurityChecks(context) {
|
|
748
1110
|
const issues = [];
|
|
749
1111
|
const skillText = context.skill.raw;
|
|
750
|
-
const
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
message: `Potentially risky privilege/execution language detected: ${escalationHits.join(", ")}.`,
|
|
791
|
-
suggestion: "Prefer least-privilege execution and explicit approval steps for elevated commands."
|
|
792
|
-
});
|
|
793
|
-
} else {
|
|
794
|
-
issues.push({
|
|
795
|
-
id: "security.privilege-escalation",
|
|
796
|
-
title: "Privilege Escalation Language",
|
|
797
|
-
status: "pass",
|
|
798
|
-
message: "No obvious privilege-escalation language detected."
|
|
799
|
-
});
|
|
1112
|
+
const needsZoneParsing = !isSuppressed(context, "security:dangerous-commands") || !isSuppressed(context, "security:exfiltration") || !isSuppressed(context, "security:privilege-escalation");
|
|
1113
|
+
const zones = needsZoneParsing ? parseZones(skillText) : [];
|
|
1114
|
+
const dangerousCommandsIssue = runZoneAwareSecurityCheck(context, zones, {
|
|
1115
|
+
id: "security.dangerous-command-patterns",
|
|
1116
|
+
checkId: "security:dangerous-commands",
|
|
1117
|
+
title: "Dangerous Command Patterns",
|
|
1118
|
+
statusOnMatch: "fail",
|
|
1119
|
+
patterns: DANGEROUS_COMMAND_PATTERNS,
|
|
1120
|
+
matchMessagePrefix: "Potentially dangerous command instruction patterns found",
|
|
1121
|
+
passMessage: "No high-risk destructive or direct pipe-to-shell patterns detected.",
|
|
1122
|
+
suggestion: "Remove destructive/pipe-exec command examples or wrap them with explicit safety constraints."
|
|
1123
|
+
});
|
|
1124
|
+
if (dangerousCommandsIssue) {
|
|
1125
|
+
issues.push(dangerousCommandsIssue);
|
|
1126
|
+
}
|
|
1127
|
+
const exfiltrationIssue = runZoneAwareSecurityCheck(context, zones, {
|
|
1128
|
+
id: "security.exfiltration-patterns",
|
|
1129
|
+
checkId: "security:exfiltration",
|
|
1130
|
+
title: "Sensitive Data Exfiltration",
|
|
1131
|
+
statusOnMatch: "fail",
|
|
1132
|
+
patterns: EXFILTRATION_PATTERNS,
|
|
1133
|
+
matchMessagePrefix: "Possible sensitive data exfiltration patterns found",
|
|
1134
|
+
passMessage: "No obvious credential access/exfiltration instructions detected.",
|
|
1135
|
+
suggestion: "Remove instructions that access or transmit secrets/credential files."
|
|
1136
|
+
});
|
|
1137
|
+
if (exfiltrationIssue) {
|
|
1138
|
+
issues.push(exfiltrationIssue);
|
|
1139
|
+
}
|
|
1140
|
+
const privilegeEscalationIssue = runZoneAwareSecurityCheck(context, zones, {
|
|
1141
|
+
id: "security.privilege-escalation",
|
|
1142
|
+
checkId: "security:privilege-escalation",
|
|
1143
|
+
title: "Privilege Escalation Language",
|
|
1144
|
+
statusOnMatch: "warn",
|
|
1145
|
+
patterns: PRIVILEGE_ESCALATION_PATTERNS,
|
|
1146
|
+
matchMessagePrefix: "Potentially risky privilege/execution language detected",
|
|
1147
|
+
passMessage: "No obvious privilege-escalation language detected.",
|
|
1148
|
+
suggestion: "Prefer least-privilege execution and explicit approval steps for elevated commands."
|
|
1149
|
+
});
|
|
1150
|
+
if (privilegeEscalationIssue) {
|
|
1151
|
+
issues.push(privilegeEscalationIssue);
|
|
800
1152
|
}
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
1153
|
+
if (!isSuppressed(context, "security:missing-guardrails")) {
|
|
1154
|
+
const hasShellActivity = SHELL_ACTIVITY_PATTERNS.some((pattern) => pattern.test(skillText));
|
|
1155
|
+
if (hasShellActivity && !SAFETY_GUARDRAIL_PATTERN.test(skillText)) {
|
|
1156
|
+
issues.push({
|
|
1157
|
+
id: "security.safety-guardrails",
|
|
1158
|
+
checkId: "security:missing-guardrails",
|
|
1159
|
+
title: "Execution Safety Guardrails",
|
|
1160
|
+
status: "warn",
|
|
1161
|
+
message: "Shell/tool execution is present, but no explicit safety guardrails were detected.",
|
|
1162
|
+
suggestion: "Add guidance such as approval requirements, dry-run mode, scope checks, and redaction rules."
|
|
1163
|
+
});
|
|
1164
|
+
} else {
|
|
1165
|
+
issues.push({
|
|
1166
|
+
id: "security.safety-guardrails",
|
|
1167
|
+
checkId: "security:missing-guardrails",
|
|
1168
|
+
title: "Execution Safety Guardrails",
|
|
1169
|
+
status: "pass",
|
|
1170
|
+
message: hasShellActivity ? "Shell/tool execution instructions include at least one safety guardrail." : "No shell/tool execution instructions detected."
|
|
1171
|
+
});
|
|
1172
|
+
}
|
|
817
1173
|
}
|
|
818
1174
|
return issues;
|
|
819
1175
|
}
|
|
@@ -846,6 +1202,7 @@ async function runStructureChecks(context) {
|
|
|
846
1202
|
if (context.skill.lineCount > 500) {
|
|
847
1203
|
issues.push({
|
|
848
1204
|
id: "structure.skill-size",
|
|
1205
|
+
checkId: "structure:file-size",
|
|
849
1206
|
title: "SKILL.md Size",
|
|
850
1207
|
status: "warn",
|
|
851
1208
|
message: `SKILL.md is ${context.skill.lineCount} lines (recommended max is 500).`,
|
|
@@ -854,6 +1211,7 @@ async function runStructureChecks(context) {
|
|
|
854
1211
|
} else {
|
|
855
1212
|
issues.push({
|
|
856
1213
|
id: "structure.skill-size",
|
|
1214
|
+
checkId: "structure:file-size",
|
|
857
1215
|
title: "SKILL.md Size",
|
|
858
1216
|
status: "pass",
|
|
859
1217
|
message: `SKILL.md length is ${context.skill.lineCount} lines.`
|
|
@@ -870,6 +1228,7 @@ async function runStructureChecks(context) {
|
|
|
870
1228
|
oversizedWithoutToc += 1;
|
|
871
1229
|
issues.push({
|
|
872
1230
|
id: `structure.references.toc.${toPosixPath(path4.relative(context.skill.skillRoot, file))}`,
|
|
1231
|
+
checkId: "structure:toc",
|
|
873
1232
|
title: "Reference File Navigation",
|
|
874
1233
|
status: "warn",
|
|
875
1234
|
message: `${toPosixPath(path4.relative(context.skill.skillRoot, file))} is ${lineCount} lines and has no table of contents.`,
|
|
@@ -880,6 +1239,7 @@ async function runStructureChecks(context) {
|
|
|
880
1239
|
if (oversizedWithoutToc === 0) {
|
|
881
1240
|
issues.push({
|
|
882
1241
|
id: "structure.references.toc",
|
|
1242
|
+
checkId: "structure:toc",
|
|
883
1243
|
title: "Reference File Navigation",
|
|
884
1244
|
status: "pass",
|
|
885
1245
|
message: "No oversized reference files missing a table of contents."
|
|
@@ -888,6 +1248,7 @@ async function runStructureChecks(context) {
|
|
|
888
1248
|
} else {
|
|
889
1249
|
issues.push({
|
|
890
1250
|
id: "structure.references.toc",
|
|
1251
|
+
checkId: "structure:toc",
|
|
891
1252
|
title: "Reference File Navigation",
|
|
892
1253
|
status: "pass",
|
|
893
1254
|
message: "No references/ directory found, so no long reference files to validate."
|
|
@@ -916,6 +1277,7 @@ async function runStructureChecks(context) {
|
|
|
916
1277
|
if (missing.length > 0) {
|
|
917
1278
|
issues.push({
|
|
918
1279
|
id: `structure.${category.key}.exists`,
|
|
1280
|
+
checkId: "structure:references",
|
|
919
1281
|
title: category.title,
|
|
920
1282
|
status: "fail",
|
|
921
1283
|
message: `Missing referenced ${category.key} file(s): ${missing.join(", ")}`,
|
|
@@ -924,6 +1286,7 @@ async function runStructureChecks(context) {
|
|
|
924
1286
|
} else {
|
|
925
1287
|
issues.push({
|
|
926
1288
|
id: `structure.${category.key}.exists`,
|
|
1289
|
+
checkId: "structure:references",
|
|
927
1290
|
title: category.title,
|
|
928
1291
|
status: "pass",
|
|
929
1292
|
message: `All referenced ${category.key} files exist.`
|
|
@@ -934,6 +1297,7 @@ async function runStructureChecks(context) {
|
|
|
934
1297
|
if (missingGeneric.length > 0) {
|
|
935
1298
|
issues.push({
|
|
936
1299
|
id: "structure.relative-links.broken",
|
|
1300
|
+
checkId: "structure:references",
|
|
937
1301
|
title: "Relative Links",
|
|
938
1302
|
status: "fail",
|
|
939
1303
|
message: `Broken relative path reference(s): ${missingGeneric.join(", ")}`,
|
|
@@ -942,6 +1306,7 @@ async function runStructureChecks(context) {
|
|
|
942
1306
|
} else {
|
|
943
1307
|
issues.push({
|
|
944
1308
|
id: "structure.relative-links.broken",
|
|
1309
|
+
checkId: "structure:references",
|
|
945
1310
|
title: "Relative Links",
|
|
946
1311
|
status: "pass",
|
|
947
1312
|
message: "No broken generic relative file references were found."
|
|
@@ -971,12 +1336,20 @@ function summarizeIssues(issues) {
|
|
|
971
1336
|
}
|
|
972
1337
|
return summary;
|
|
973
1338
|
}
|
|
974
|
-
|
|
1339
|
+
function lintFails(report, failOn) {
|
|
1340
|
+
if (report.summary.failures > 0) {
|
|
1341
|
+
return true;
|
|
1342
|
+
}
|
|
1343
|
+
return failOn === "warn" && report.summary.warnings > 0;
|
|
1344
|
+
}
|
|
1345
|
+
async function runLinter(inputPath, options = {}) {
|
|
975
1346
|
const skill = await loadSkillFile(inputPath);
|
|
976
1347
|
const frontmatter = parseFrontmatter(skill.raw);
|
|
1348
|
+
const suppressedCheckIds = new Set(options.suppress ?? []);
|
|
977
1349
|
const context = {
|
|
978
1350
|
skill,
|
|
979
|
-
frontmatter
|
|
1351
|
+
frontmatter,
|
|
1352
|
+
suppressedCheckIds
|
|
980
1353
|
};
|
|
981
1354
|
const issues = [];
|
|
982
1355
|
issues.push(...runFrontmatterChecks(context));
|
|
@@ -985,10 +1358,11 @@ async function runLinter(inputPath) {
|
|
|
985
1358
|
issues.push(...runSecurityChecks(context));
|
|
986
1359
|
issues.push(...await runDisclosureChecks(context));
|
|
987
1360
|
issues.push(...runCompatibilityChecks(context));
|
|
1361
|
+
const filteredIssues = issues.filter((issue) => !suppressedCheckIds.has(issue.checkId));
|
|
988
1362
|
return {
|
|
989
1363
|
target: inputPath,
|
|
990
|
-
issues,
|
|
991
|
-
summary: summarizeIssues(
|
|
1364
|
+
issues: filteredIssues,
|
|
1365
|
+
summary: summarizeIssues(filteredIssues)
|
|
992
1366
|
};
|
|
993
1367
|
}
|
|
994
1368
|
|
|
@@ -1004,6 +1378,14 @@ function renderIssueLine(issue, c) {
|
|
|
1004
1378
|
return ` ${label} ${issue.title}
|
|
1005
1379
|
${issue.message}${detail}`;
|
|
1006
1380
|
}
|
|
1381
|
+
function countSkippedSecurityPatterns(issues) {
|
|
1382
|
+
return issues.reduce((total, issue) => {
|
|
1383
|
+
if (!issue.checkId.startsWith("security:")) {
|
|
1384
|
+
return total;
|
|
1385
|
+
}
|
|
1386
|
+
return total + (issue.skippedPatterns?.length ?? 0);
|
|
1387
|
+
}, 0);
|
|
1388
|
+
}
|
|
1007
1389
|
function renderLintReport(report, enableColor) {
|
|
1008
1390
|
const c = getChalkInstance(enableColor);
|
|
1009
1391
|
const { passed, warnings, failures, total } = report.summary;
|
|
@@ -1016,8 +1398,11 @@ function renderLintReport(report, enableColor) {
|
|
|
1016
1398
|
`\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518`
|
|
1017
1399
|
];
|
|
1018
1400
|
const renderedIssues = report.issues.map((issue) => renderIssueLine(issue, c)).join("\n");
|
|
1401
|
+
const skippedSecurityPatterns = countSkippedSecurityPatterns(report.issues);
|
|
1402
|
+
const infoLine = skippedSecurityPatterns > 0 ? `
|
|
1403
|
+
${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
|
|
1019
1404
|
return `${headerLines.join("\n")}
|
|
1020
|
-
${renderedIssues}`;
|
|
1405
|
+
${renderedIssues}${infoLine}`;
|
|
1021
1406
|
}
|
|
1022
1407
|
function formatPercent(value) {
|
|
1023
1408
|
return `${(value * 100).toFixed(1)}%`;
|
|
@@ -1105,6 +1490,10 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1105
1490
|
for (const issue of lintIssues) {
|
|
1106
1491
|
lines.push(renderIssueLine(issue, c));
|
|
1107
1492
|
}
|
|
1493
|
+
const skippedSecurityPatterns = countSkippedSecurityPatterns(result.lint.issues);
|
|
1494
|
+
if (skippedSecurityPatterns > 0) {
|
|
1495
|
+
lines.push(` ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
|
|
1496
|
+
}
|
|
1108
1497
|
lines.push("");
|
|
1109
1498
|
lines.push("Trigger");
|
|
1110
1499
|
if (result.trigger) {
|
|
@@ -1159,107 +1548,213 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1159
1548
|
return lines.join("\n");
|
|
1160
1549
|
}
|
|
1161
1550
|
|
|
1162
|
-
// src/reporters/json.ts
|
|
1163
|
-
function renderJson(value) {
|
|
1164
|
-
return JSON.stringify(value, null, 2);
|
|
1165
|
-
}
|
|
1166
|
-
|
|
1167
1551
|
// src/commands/common.ts
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
return {
|
|
1171
|
-
json: Boolean(options.json),
|
|
1172
|
-
color: options.color !== false
|
|
1173
|
-
};
|
|
1174
|
-
}
|
|
1175
|
-
function writeResult(value, asJson) {
|
|
1176
|
-
if (asJson) {
|
|
1177
|
-
process.stdout.write(`${renderJson(value)}
|
|
1178
|
-
`);
|
|
1179
|
-
return;
|
|
1180
|
-
}
|
|
1181
|
-
process.stdout.write(`${String(value)}
|
|
1182
|
-
`);
|
|
1183
|
-
}
|
|
1184
|
-
function writeError(error, asJson) {
|
|
1185
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1186
|
-
if (asJson) {
|
|
1187
|
-
process.stdout.write(`${renderJson({ error: message })}
|
|
1188
|
-
`);
|
|
1189
|
-
return;
|
|
1190
|
-
}
|
|
1191
|
-
process.stderr.write(`Error: ${message}
|
|
1192
|
-
`);
|
|
1193
|
-
}
|
|
1194
|
-
|
|
1195
|
-
// src/commands/lint.ts
|
|
1196
|
-
function registerLintCommand(program) {
|
|
1197
|
-
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").action(async (targetPath, _commandOptions, command) => {
|
|
1198
|
-
const globalOptions = getGlobalCliOptions(command);
|
|
1199
|
-
try {
|
|
1200
|
-
const report = await runLinter(targetPath);
|
|
1201
|
-
if (globalOptions.json) {
|
|
1202
|
-
writeResult(report, true);
|
|
1203
|
-
} else {
|
|
1204
|
-
writeResult(renderLintReport(report, globalOptions.color), false);
|
|
1205
|
-
}
|
|
1206
|
-
if (report.summary.failures > 0) {
|
|
1207
|
-
process.exitCode = 1;
|
|
1208
|
-
}
|
|
1209
|
-
} catch (error) {
|
|
1210
|
-
writeError(error, globalOptions.json);
|
|
1211
|
-
process.exitCode = 2;
|
|
1212
|
-
}
|
|
1213
|
-
});
|
|
1214
|
-
}
|
|
1552
|
+
import fs5 from "node:fs/promises";
|
|
1553
|
+
import { z as z5 } from "zod";
|
|
1215
1554
|
|
|
1216
|
-
// src/
|
|
1217
|
-
import ora from "ora";
|
|
1555
|
+
// src/core/eval-runner.ts
|
|
1218
1556
|
import { z as z3 } from "zod";
|
|
1219
1557
|
|
|
1220
|
-
// src/core/
|
|
1558
|
+
// src/core/grader.ts
|
|
1221
1559
|
import { z as z2 } from "zod";
|
|
1222
|
-
var
|
|
1223
|
-
|
|
1224
|
-
|
|
1560
|
+
var gradedAssertionSchema = z2.object({
|
|
1561
|
+
assertion: z2.string(),
|
|
1562
|
+
passed: z2.boolean(),
|
|
1563
|
+
evidence: z2.string()
|
|
1225
1564
|
});
|
|
1226
|
-
var
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
{ name: "db-migrator", description: "Plans and generates safe database migration scripts with rollback guidance." },
|
|
1231
|
-
{ name: "bug-repro", description: "Reproduces reported bugs by building deterministic minimal test cases." },
|
|
1232
|
-
{ name: "release-notes", description: "Drafts release notes from commits and PR metadata for stakeholders." },
|
|
1233
|
-
{ name: "log-analyzer", description: "Analyzes service logs to identify error clusters and likely root causes." },
|
|
1234
|
-
{ name: "performance-audit", description: "Finds hotspots in runtime and suggests profiling-driven optimizations." },
|
|
1235
|
-
{ name: "security-audit", description: "Checks code and config for common security vulnerabilities and risky defaults." },
|
|
1236
|
-
{ name: "refactor-planner", description: "Breaks large refactors into safe incremental steps with validation plans." },
|
|
1237
|
-
{ name: "schema-designer", description: "Designs JSON schemas and validates data contracts for integrations." },
|
|
1238
|
-
{ name: "docs-writer", description: "Writes developer documentation, tutorials, and API usage examples." },
|
|
1239
|
-
{ name: "cli-scaffolder", description: "Creates CLI project skeletons with argument parsing and help text." },
|
|
1240
|
-
{ name: "incident-triage", description: "Triage production incidents with severity tagging and next-action checklists." },
|
|
1241
|
-
{ name: "test-generator", description: "Generates unit and integration test cases from feature requirements." },
|
|
1242
|
-
{ name: "prompt-tuner", description: "Improves prompts for reliability, formatting, and failure handling." }
|
|
1243
|
-
];
|
|
1244
|
-
function shuffle(values) {
|
|
1245
|
-
const copy = [...values];
|
|
1246
|
-
for (let index = copy.length - 1; index > 0; index -= 1) {
|
|
1247
|
-
const swapIndex = Math.floor(Math.random() * (index + 1));
|
|
1248
|
-
[copy[index], copy[swapIndex]] = [copy[swapIndex], copy[index]];
|
|
1249
|
-
}
|
|
1250
|
-
return copy;
|
|
1251
|
-
}
|
|
1252
|
-
function sample(values, count) {
|
|
1253
|
-
return shuffle(values).slice(0, Math.max(0, Math.min(count, values.length)));
|
|
1254
|
-
}
|
|
1255
|
-
function parseJsonArrayFromModelOutput(raw) {
|
|
1565
|
+
var graderOutputSchema = z2.object({
|
|
1566
|
+
assertions: z2.array(gradedAssertionSchema)
|
|
1567
|
+
});
|
|
1568
|
+
function extractJsonObject(raw) {
|
|
1256
1569
|
const trimmed = raw.trim();
|
|
1257
|
-
if (trimmed.startsWith("
|
|
1570
|
+
if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
|
|
1258
1571
|
return JSON.parse(trimmed);
|
|
1259
1572
|
}
|
|
1260
|
-
const start = trimmed.indexOf("
|
|
1261
|
-
const end = trimmed.lastIndexOf("
|
|
1262
|
-
if (start >= 0 && end > start) {
|
|
1573
|
+
const start = trimmed.indexOf("{");
|
|
1574
|
+
const end = trimmed.lastIndexOf("}");
|
|
1575
|
+
if (start >= 0 && end > start) {
|
|
1576
|
+
return JSON.parse(trimmed.slice(start, end + 1));
|
|
1577
|
+
}
|
|
1578
|
+
throw new Error("Grader did not return a JSON object.");
|
|
1579
|
+
}
|
|
1580
|
+
async function gradeResponse(options) {
|
|
1581
|
+
const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
|
|
1582
|
+
"The response follows the skill instructions faithfully.",
|
|
1583
|
+
"The response is well-structured and actionable.",
|
|
1584
|
+
"The response addresses the user prompt directly."
|
|
1585
|
+
];
|
|
1586
|
+
const systemPrompt = [
|
|
1587
|
+
"You are a strict evaluator for agent skill outputs.",
|
|
1588
|
+
"Assess each assertion and return JSON only.",
|
|
1589
|
+
'Required output format: {"assertions":[{"assertion":"...","passed":true|false,"evidence":"..."}]}'
|
|
1590
|
+
].join(" ");
|
|
1591
|
+
const userPrompt = [
|
|
1592
|
+
`Skill: ${options.skillName}`,
|
|
1593
|
+
"Skill instructions:",
|
|
1594
|
+
options.skillBody,
|
|
1595
|
+
"",
|
|
1596
|
+
`User prompt: ${options.userPrompt}`,
|
|
1597
|
+
"",
|
|
1598
|
+
"Model response:",
|
|
1599
|
+
options.modelResponse,
|
|
1600
|
+
"",
|
|
1601
|
+
"Assertions to evaluate:",
|
|
1602
|
+
assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
|
|
1603
|
+
].join("\n");
|
|
1604
|
+
const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
1605
|
+
const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
|
|
1606
|
+
if (!parsed.success) {
|
|
1607
|
+
throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
|
|
1608
|
+
}
|
|
1609
|
+
return parsed.data.assertions;
|
|
1610
|
+
}
|
|
1611
|
+
|
|
1612
|
+
// src/core/eval-runner.ts
|
|
1613
|
+
var evalPromptSchema = z3.object({
|
|
1614
|
+
prompt: z3.string().min(1),
|
|
1615
|
+
assertions: z3.array(z3.string().min(1)).optional()
|
|
1616
|
+
});
|
|
1617
|
+
var evalPromptArraySchema = z3.array(evalPromptSchema);
|
|
1618
|
+
function extractJsonArray(raw) {
|
|
1619
|
+
const trimmed = raw.trim();
|
|
1620
|
+
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
1621
|
+
return JSON.parse(trimmed);
|
|
1622
|
+
}
|
|
1623
|
+
const start = trimmed.indexOf("[");
|
|
1624
|
+
const end = trimmed.lastIndexOf("]");
|
|
1625
|
+
if (start >= 0 && end > start) {
|
|
1626
|
+
return JSON.parse(trimmed.slice(start, end + 1));
|
|
1627
|
+
}
|
|
1628
|
+
throw new Error("Model did not return a JSON array.");
|
|
1629
|
+
}
|
|
1630
|
+
async function generatePrompts(skill, provider, model, count) {
|
|
1631
|
+
const systemPrompt = [
|
|
1632
|
+
"You generate realistic evaluation prompts for an agent skill.",
|
|
1633
|
+
"Return JSON only.",
|
|
1634
|
+
'Format: [{"prompt":"...","assertions":["...", "..."]}]',
|
|
1635
|
+
"Assertions should be concrete and checkable."
|
|
1636
|
+
].join(" ");
|
|
1637
|
+
const userPrompt = [
|
|
1638
|
+
`Skill name: ${skill.frontmatter.name}`,
|
|
1639
|
+
`Skill description: ${skill.frontmatter.description}`,
|
|
1640
|
+
"Skill instructions:",
|
|
1641
|
+
skill.content,
|
|
1642
|
+
"",
|
|
1643
|
+
`Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
|
|
1644
|
+
"Each prompt should include 2-4 assertions."
|
|
1645
|
+
].join("\n");
|
|
1646
|
+
const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
|
|
1647
|
+
const parsed = evalPromptArraySchema.safeParse(extractJsonArray(raw));
|
|
1648
|
+
if (!parsed.success) {
|
|
1649
|
+
throw new Error(`Failed to parse generated eval prompts: ${parsed.error.issues[0]?.message ?? "invalid prompt JSON"}`);
|
|
1650
|
+
}
|
|
1651
|
+
if (parsed.data.length !== count) {
|
|
1652
|
+
throw new Error(`Expected ${count} prompts, got ${parsed.data.length}.`);
|
|
1653
|
+
}
|
|
1654
|
+
return parsed.data;
|
|
1655
|
+
}
|
|
1656
|
+
async function runEval(skill, options) {
|
|
1657
|
+
const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, options.numRuns);
|
|
1658
|
+
const results = [];
|
|
1659
|
+
for (const evalPrompt of prompts) {
|
|
1660
|
+
const systemPrompt = [
|
|
1661
|
+
"You are an AI assistant with an activated skill.",
|
|
1662
|
+
"Follow this SKILL.md content exactly where applicable.",
|
|
1663
|
+
"",
|
|
1664
|
+
skill.raw
|
|
1665
|
+
].join("\n");
|
|
1666
|
+
const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
|
|
1667
|
+
const gradedAssertions = await gradeResponse({
|
|
1668
|
+
provider: options.provider,
|
|
1669
|
+
model: options.graderModel,
|
|
1670
|
+
skillName: skill.frontmatter.name,
|
|
1671
|
+
skillBody: skill.content,
|
|
1672
|
+
userPrompt: evalPrompt.prompt,
|
|
1673
|
+
modelResponse: response,
|
|
1674
|
+
assertions: evalPrompt.assertions
|
|
1675
|
+
});
|
|
1676
|
+
const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
|
|
1677
|
+
results.push({
|
|
1678
|
+
prompt: evalPrompt.prompt,
|
|
1679
|
+
assertions: gradedAssertions,
|
|
1680
|
+
responseSummary: response.slice(0, 200),
|
|
1681
|
+
response,
|
|
1682
|
+
passedAssertions: passedAssertions2,
|
|
1683
|
+
totalAssertions: gradedAssertions.length
|
|
1684
|
+
});
|
|
1685
|
+
}
|
|
1686
|
+
const totalAssertions = results.reduce((total, result) => total + result.totalAssertions, 0);
|
|
1687
|
+
const passedAssertions = results.reduce((total, result) => total + result.passedAssertions, 0);
|
|
1688
|
+
return {
|
|
1689
|
+
skillName: skill.frontmatter.name,
|
|
1690
|
+
model: options.model,
|
|
1691
|
+
graderModel: options.graderModel,
|
|
1692
|
+
provider: options.provider.name,
|
|
1693
|
+
prompts,
|
|
1694
|
+
results,
|
|
1695
|
+
summary: {
|
|
1696
|
+
totalPrompts: results.length,
|
|
1697
|
+
totalAssertions,
|
|
1698
|
+
passedAssertions
|
|
1699
|
+
}
|
|
1700
|
+
};
|
|
1701
|
+
}
|
|
1702
|
+
|
|
1703
|
+
// src/core/trigger-tester.ts
|
|
1704
|
+
import { z as z4 } from "zod";
|
|
1705
|
+
var triggerQuerySchema = z4.object({
|
|
1706
|
+
query: z4.string().min(1),
|
|
1707
|
+
should_trigger: z4.boolean()
|
|
1708
|
+
});
|
|
1709
|
+
var triggerQueryArraySchema = z4.array(triggerQuerySchema);
|
|
1710
|
+
var FAKE_SKILLS = [
|
|
1711
|
+
{ name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
|
|
1712
|
+
{ name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
|
|
1713
|
+
{ name: "db-migrator", description: "Plans and generates safe database migration scripts with rollback guidance." },
|
|
1714
|
+
{ name: "bug-repro", description: "Reproduces reported bugs by building deterministic minimal test cases." },
|
|
1715
|
+
{ name: "release-notes", description: "Drafts release notes from commits and PR metadata for stakeholders." },
|
|
1716
|
+
{ name: "log-analyzer", description: "Analyzes service logs to identify error clusters and likely root causes." },
|
|
1717
|
+
{ name: "performance-audit", description: "Finds hotspots in runtime and suggests profiling-driven optimizations." },
|
|
1718
|
+
{ name: "security-audit", description: "Checks code and config for common security vulnerabilities and risky defaults." },
|
|
1719
|
+
{ name: "refactor-planner", description: "Breaks large refactors into safe incremental steps with validation plans." },
|
|
1720
|
+
{ name: "schema-designer", description: "Designs JSON schemas and validates data contracts for integrations." },
|
|
1721
|
+
{ name: "docs-writer", description: "Writes developer documentation, tutorials, and API usage examples." },
|
|
1722
|
+
{ name: "cli-scaffolder", description: "Creates CLI project skeletons with argument parsing and help text." },
|
|
1723
|
+
{ name: "incident-triage", description: "Triage production incidents with severity tagging and next-action checklists." },
|
|
1724
|
+
{ name: "test-generator", description: "Generates unit and integration test cases from feature requirements." },
|
|
1725
|
+
{ name: "prompt-tuner", description: "Improves prompts for reliability, formatting, and failure handling." }
|
|
1726
|
+
];
|
|
1727
|
+
function mulberry32(seed) {
|
|
1728
|
+
return () => {
|
|
1729
|
+
seed |= 0;
|
|
1730
|
+
seed = seed + 1831565813 | 0;
|
|
1731
|
+
let t = Math.imul(seed ^ seed >>> 15, 1 | seed);
|
|
1732
|
+
t = t + Math.imul(t ^ t >>> 7, 61 | t) ^ t;
|
|
1733
|
+
return ((t ^ t >>> 14) >>> 0) / 4294967296;
|
|
1734
|
+
};
|
|
1735
|
+
}
|
|
1736
|
+
function createRng(seed) {
|
|
1737
|
+
return seed !== void 0 ? mulberry32(seed) : Math.random;
|
|
1738
|
+
}
|
|
1739
|
+
function shuffle(values, rng) {
|
|
1740
|
+
const copy = [...values];
|
|
1741
|
+
for (let index = copy.length - 1; index > 0; index -= 1) {
|
|
1742
|
+
const swapIndex = Math.floor(rng() * (index + 1));
|
|
1743
|
+
[copy[index], copy[swapIndex]] = [copy[swapIndex], copy[index]];
|
|
1744
|
+
}
|
|
1745
|
+
return copy;
|
|
1746
|
+
}
|
|
1747
|
+
function sample(values, count, rng) {
|
|
1748
|
+
return shuffle(values, rng).slice(0, Math.max(0, Math.min(count, values.length)));
|
|
1749
|
+
}
|
|
1750
|
+
function parseJsonArrayFromModelOutput(raw) {
|
|
1751
|
+
const trimmed = raw.trim();
|
|
1752
|
+
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
1753
|
+
return JSON.parse(trimmed);
|
|
1754
|
+
}
|
|
1755
|
+
const start = trimmed.indexOf("[");
|
|
1756
|
+
const end = trimmed.lastIndexOf("]");
|
|
1757
|
+
if (start >= 0 && end > start) {
|
|
1263
1758
|
const possibleJson = trimmed.slice(start, end + 1);
|
|
1264
1759
|
return JSON.parse(possibleJson);
|
|
1265
1760
|
}
|
|
@@ -1362,19 +1857,20 @@ function buildSuggestions(metrics) {
|
|
|
1362
1857
|
return suggestions;
|
|
1363
1858
|
}
|
|
1364
1859
|
async function runTriggerTest(skill, options) {
|
|
1860
|
+
const rng = createRng(options.seed);
|
|
1365
1861
|
const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
|
|
1366
1862
|
const results = [];
|
|
1367
1863
|
const skillName = skill.frontmatter.name;
|
|
1368
1864
|
for (const testQuery of queries) {
|
|
1369
|
-
const fakeCount = 5 + Math.floor(
|
|
1370
|
-
const fakeSkills = sample(FAKE_SKILLS, fakeCount);
|
|
1865
|
+
const fakeCount = 5 + Math.floor(rng() * 5);
|
|
1866
|
+
const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
|
|
1371
1867
|
const allSkills = shuffle([
|
|
1372
1868
|
...fakeSkills,
|
|
1373
1869
|
{
|
|
1374
1870
|
name: skill.frontmatter.name,
|
|
1375
1871
|
description: skill.frontmatter.description
|
|
1376
1872
|
}
|
|
1377
|
-
]);
|
|
1873
|
+
], rng);
|
|
1378
1874
|
const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
|
|
1379
1875
|
const systemPrompt = [
|
|
1380
1876
|
"You are selecting one skill to activate for a user query.",
|
|
@@ -1403,6 +1899,7 @@ async function runTriggerTest(skill, options) {
|
|
|
1403
1899
|
skillName,
|
|
1404
1900
|
model: options.model,
|
|
1405
1901
|
provider: options.provider.name,
|
|
1902
|
+
seed: options.seed,
|
|
1406
1903
|
queries,
|
|
1407
1904
|
cases: results,
|
|
1408
1905
|
metrics,
|
|
@@ -1410,59 +1907,502 @@ async function runTriggerTest(skill, options) {
|
|
|
1410
1907
|
};
|
|
1411
1908
|
}
|
|
1412
1909
|
|
|
1413
|
-
// src/
|
|
1414
|
-
function
|
|
1415
|
-
|
|
1416
|
-
|
|
1910
|
+
// src/reporters/json.ts
|
|
1911
|
+
function renderJson(value) {
|
|
1912
|
+
return JSON.stringify(value, null, 2);
|
|
1913
|
+
}
|
|
1914
|
+
|
|
1915
|
+
// src/commands/common.ts
|
|
1916
|
+
var executionContextByCommand = /* @__PURE__ */ new WeakMap();
|
|
1917
|
+
var singleEvalPromptSchema = z5.object({
|
|
1918
|
+
prompt: z5.string().min(1),
|
|
1919
|
+
assertions: z5.array(z5.string().min(1)).optional()
|
|
1920
|
+
});
|
|
1921
|
+
var promptStringArraySchema = z5.array(z5.string().min(1));
|
|
1922
|
+
var assertionsObjectSchema = z5.object({
|
|
1923
|
+
assertions: z5.array(z5.string().min(1))
|
|
1924
|
+
});
|
|
1925
|
+
function parseJsonIfPossible(raw) {
|
|
1926
|
+
try {
|
|
1927
|
+
return JSON.parse(raw);
|
|
1928
|
+
} catch {
|
|
1929
|
+
return raw;
|
|
1417
1930
|
}
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1931
|
+
}
|
|
1932
|
+
function normalizeEvalPrompts(value, sourceLabel) {
|
|
1933
|
+
const promptArray = evalPromptArraySchema.safeParse(value);
|
|
1934
|
+
if (promptArray.success) {
|
|
1935
|
+
return promptArray.data;
|
|
1936
|
+
}
|
|
1937
|
+
const singlePrompt = singleEvalPromptSchema.safeParse(value);
|
|
1938
|
+
if (singlePrompt.success) {
|
|
1939
|
+
return [singlePrompt.data];
|
|
1940
|
+
}
|
|
1941
|
+
const promptStrings = promptStringArraySchema.safeParse(value);
|
|
1942
|
+
if (promptStrings.success) {
|
|
1943
|
+
return promptStrings.data.map((prompt) => ({ prompt }));
|
|
1944
|
+
}
|
|
1945
|
+
if (typeof value === "string" && value.trim() !== "") {
|
|
1946
|
+
return [{ prompt: value.trim() }];
|
|
1947
|
+
}
|
|
1948
|
+
throw new Error(
|
|
1949
|
+
`Invalid eval prompt source at ${sourceLabel}. Expected plain text, a JSON prompt object, or a JSON array of prompts.`
|
|
1950
|
+
);
|
|
1951
|
+
}
|
|
1952
|
+
function parseAssertionsFromText(raw) {
|
|
1953
|
+
return raw.split(/\r?\n/).map((line) => line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "")).filter((line) => line.length > 0);
|
|
1954
|
+
}
|
|
1955
|
+
function normalizeAssertions(value, sourceLabel) {
|
|
1956
|
+
const assertionArray = z5.array(z5.string().min(1)).safeParse(value);
|
|
1957
|
+
if (assertionArray.success) {
|
|
1958
|
+
return assertionArray.data;
|
|
1959
|
+
}
|
|
1960
|
+
const assertionObject = assertionsObjectSchema.safeParse(value);
|
|
1961
|
+
if (assertionObject.success) {
|
|
1962
|
+
return assertionObject.data.assertions;
|
|
1963
|
+
}
|
|
1964
|
+
if (typeof value === "string") {
|
|
1965
|
+
const assertions = parseAssertionsFromText(value);
|
|
1966
|
+
if (assertions.length > 0) {
|
|
1967
|
+
return assertions;
|
|
1422
1968
|
}
|
|
1423
|
-
throw new Error(
|
|
1424
|
-
"No Anthropic API key found. Set ANTHROPIC_API_KEY environment variable or pass --api-key flag."
|
|
1425
|
-
);
|
|
1426
1969
|
}
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1970
|
+
throw new Error(
|
|
1971
|
+
`Invalid eval assertions source at ${sourceLabel}. Expected JSON string[], { assertions: string[] }, or newline-delimited text.`
|
|
1972
|
+
);
|
|
1973
|
+
}
|
|
1974
|
+
function setCommandExecutionContext(command, context) {
|
|
1975
|
+
executionContextByCommand.set(command, context);
|
|
1976
|
+
}
|
|
1977
|
+
function getCommandExecutionContext(command) {
|
|
1978
|
+
const context = executionContextByCommand.get(command);
|
|
1979
|
+
if (!context) {
|
|
1980
|
+
throw new Error(`Missing resolved config for command '${command.name()}'.`);
|
|
1430
1981
|
}
|
|
1431
|
-
|
|
1982
|
+
return context;
|
|
1432
1983
|
}
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
import Anthropic from "@anthropic-ai/sdk";
|
|
1436
|
-
function wait(ms) {
|
|
1437
|
-
return new Promise((resolve) => {
|
|
1438
|
-
setTimeout(resolve, ms);
|
|
1439
|
-
});
|
|
1984
|
+
/**
 * Return the merged configuration stored in the execution context of
 * `command`.
 *
 * @param {object} command - Command whose context is looked up.
 * @returns {object} The `config` property of that context.
 */
function getResolvedConfig(command) {
  const { config } = getCommandExecutionContext(command);
  return config;
}
|
|
1441
|
-
function
|
|
1442
|
-
|
|
1443
|
-
|
|
1987
|
+
/**
 * Collect the global output options (`json`, `color`) for a command.
 *
 * `json` prefers the value from the command's stored execution context and
 * falls back to the raw CLI flag when no context (or no value) is present.
 * `color` is on unless the CLI option is exactly `false`.
 *
 * @param {object} command - Command exposing `optsWithGlobals()`.
 * @returns {{ json: boolean, color: boolean }}
 */
function getGlobalCliOptions(command) {
  const cliOptions = command.optsWithGlobals();
  const resolvedJson = executionContextByCommand.get(command)?.config.json;
  const json = resolvedJson ?? Boolean(cliOptions.json);
  const color = cliOptions.color !== false;
  return { json, color };
}
|
|
1995
|
+
async function loadTriggerQueriesFile(filePath) {
|
|
1996
|
+
const loaded = await readJsonFile(filePath);
|
|
1997
|
+
const parsed = triggerQueryArraySchema.safeParse(loaded);
|
|
1998
|
+
if (!parsed.success) {
|
|
1999
|
+
throw new Error(`Invalid --queries JSON: ${parsed.error.issues[0]?.message ?? "unknown format issue"}`);
|
|
1444
2000
|
}
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
2001
|
+
return parsed.data;
|
|
2002
|
+
}
|
|
2003
|
+
async function loadEvalPromptsJson(filePath) {
|
|
2004
|
+
const loaded = await readJsonFile(filePath);
|
|
2005
|
+
const parsed = evalPromptArraySchema.safeParse(loaded);
|
|
2006
|
+
if (!parsed.success) {
|
|
2007
|
+
throw new Error(`Invalid --prompts JSON: ${parsed.error.issues[0]?.message ?? "unknown format issue"}`);
|
|
1448
2008
|
}
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
2009
|
+
return parsed.data;
|
|
2010
|
+
}
|
|
2011
|
+
/**
 * Load eval prompts from the files named in the resolved config
 * (`eval.promptFile` and, optionally, `eval.assertionsFile`).
 *
 * - Neither file configured: resolves to `undefined`.
 * - Only `assertionsFile` configured: throws.
 * - `assertionsFile` present: every prompt gets its own copy of those
 *   assertions, replacing any assertions from the prompt file.
 * - A single prompt with `eval.numRuns > 1` set in the config file itself:
 *   the prompt is repeated `numRuns` times.
 *
 * @param {object} command - Command whose execution context holds the config.
 * @returns {Promise<Array<{prompt: string, assertions?: string[]}>|undefined>}
 */
async function loadConfiguredEvalPrompts(command) {
  const context = getCommandExecutionContext(command);
  const { promptFile, assertionsFile, numRuns } = context.config.eval;

  if (!promptFile) {
    if (assertionsFile) {
      throw new Error("Config field eval.assertionsFile requires eval.promptFile.");
    }
    return void 0;
  }

  const promptText = await fs5.readFile(promptFile, "utf8");
  let prompts = normalizeEvalPrompts(parseJsonIfPossible(promptText), promptFile);

  if (assertionsFile) {
    const assertionsText = await fs5.readFile(assertionsFile, "utf8");
    const sharedAssertions = normalizeAssertions(parseJsonIfPossible(assertionsText), assertionsFile);
    // Copy the list per prompt so the prompts never share one array.
    prompts = prompts.map(({ prompt }) => ({ prompt, assertions: [...sharedAssertions] }));
  }

  // Only a numRuns written in the config file triggers repetition; the
  // built-in default does not.
  const numRunsFromConfigFile = context.configFile?.eval?.numRuns !== void 0;
  if (numRunsFromConfigFile && prompts.length === 1 && numRuns > 1) {
    const template = prompts[0];
    prompts = Array.from({ length: numRuns }, () => ({
      prompt: template.prompt,
      assertions: template.assertions ? [...template.assertions] : void 0
    }));
  }
  return prompts;
}
|
|
2041
|
+
function writeResult(value, asJson) {
|
|
2042
|
+
if (asJson) {
|
|
2043
|
+
process.stdout.write(`${renderJson(value)}
|
|
2044
|
+
`);
|
|
2045
|
+
return;
|
|
1452
2046
|
}
|
|
1453
|
-
|
|
2047
|
+
process.stdout.write(`${String(value)}
|
|
2048
|
+
`);
|
|
1454
2049
|
}
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
2050
|
+
function writeError(error, asJson) {
|
|
2051
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2052
|
+
if (asJson) {
|
|
2053
|
+
process.stdout.write(`${renderJson({ error: message })}
|
|
2054
|
+
`);
|
|
2055
|
+
return;
|
|
1460
2056
|
}
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
2057
|
+
process.stderr.write(`Error: ${message}
|
|
2058
|
+
`);
|
|
2059
|
+
}
|
|
2060
|
+
|
|
2061
|
+
// src/commands/lint.ts
|
|
2062
|
+
/**
 * Run the linter on `targetPath` and print the report.
 *
 * Sets `process.exitCode` to 1 when `lintFails` reports a failure for the
 * configured `failOn` level, and to 2 when anything throws.
 *
 * @param {string} targetPath - Path to SKILL.md or a skill directory.
 * @param {{ json?: boolean, color?: boolean, failOn?: string, suppress?: string[] }} options
 * @returns {Promise<void>}
 */
async function handleLintCommand(targetPath, options) {
  try {
    const report = await runLinter(targetPath, { suppress: options.suppress });
    // JSON mode emits the raw report; otherwise the rendered text form.
    const output = options.json ? report : renderLintReport(report, options.color);
    writeResult(output, Boolean(options.json));
    if (lintFails(report, options.failOn)) {
      process.exitCode = 1;
    }
  } catch (error) {
    writeError(error, options.json);
    process.exitCode = 2;
  }
}
|
|
2078
|
+
/**
 * Register the `lint` subcommand on the given program.
 *
 * The action combines the global CLI options with the `lint` section of the
 * resolved config and delegates to `handleLintCommand`.
 *
 * @param {object} program - Commander-style program object.
 * @returns {void}
 */
function registerLintCommand(program) {
  const lintCommand = program
    .command("lint")
    .description("Run static lint checks against a SKILL.md file or skill directory.")
    .argument("<path-to-skill>", "Path to SKILL.md or skill directory");

  lintCommand.action(async (targetPath, _commandOptions, command) => {
    const globalOptions = getGlobalCliOptions(command);
    const { lint } = getResolvedConfig(command);
    await handleLintCommand(targetPath, {
      ...globalOptions,
      failOn: lint.failOn,
      suppress: lint.suppress
    });
  });
}
|
|
2089
|
+
|
|
2090
|
+
// src/commands/trigger.ts
|
|
2091
|
+
import ora from "ora";
|
|
2092
|
+
import { z as z7 } from "zod";
|
|
2093
|
+
|
|
2094
|
+
// src/utils/config.ts
|
|
2095
|
+
import fs6 from "node:fs/promises";
|
|
2096
|
+
import path5 from "node:path";
|
|
2097
|
+
import { z as z6 } from "zod";
|
|
2098
|
+
var providerNameSchema = z6.enum(["anthropic", "openai"]);
|
|
2099
|
+
var lintFailOnSchema = z6.enum(["error", "warn"]);
|
|
2100
|
+
var lintConfigSchema = z6.object({
|
|
2101
|
+
failOn: lintFailOnSchema.optional(),
|
|
2102
|
+
suppress: z6.array(z6.string().min(1)).optional()
|
|
2103
|
+
}).strict();
|
|
2104
|
+
var triggerConfigSchema = z6.object({
|
|
2105
|
+
numQueries: z6.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2106
|
+
threshold: z6.number().min(0).max(1).optional(),
|
|
2107
|
+
seed: z6.number().int().optional()
|
|
2108
|
+
}).strict().partial();
|
|
2109
|
+
var evalConfigSchema = z6.object({
|
|
2110
|
+
numRuns: z6.number().int().min(1).optional(),
|
|
2111
|
+
threshold: z6.number().min(0).max(1).optional(),
|
|
2112
|
+
promptFile: z6.string().min(1).optional(),
|
|
2113
|
+
assertionsFile: z6.string().min(1).optional()
|
|
2114
|
+
}).strict().partial();
|
|
2115
|
+
var skilltestConfigSchema = z6.object({
|
|
2116
|
+
provider: providerNameSchema.optional(),
|
|
2117
|
+
model: z6.string().min(1).optional(),
|
|
2118
|
+
json: z6.boolean().optional(),
|
|
2119
|
+
lint: lintConfigSchema.optional(),
|
|
2120
|
+
trigger: triggerConfigSchema.optional(),
|
|
2121
|
+
eval: evalConfigSchema.optional()
|
|
2122
|
+
}).strict();
|
|
2123
|
+
var resolvedSkilltestConfigSchema = z6.object({
|
|
2124
|
+
provider: providerNameSchema,
|
|
2125
|
+
model: z6.string().min(1),
|
|
2126
|
+
json: z6.boolean(),
|
|
2127
|
+
lint: z6.object({
|
|
2128
|
+
failOn: lintFailOnSchema,
|
|
2129
|
+
suppress: z6.array(z6.string().min(1))
|
|
2130
|
+
}),
|
|
2131
|
+
trigger: z6.object({
|
|
2132
|
+
numQueries: z6.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2133
|
+
threshold: z6.number().min(0).max(1),
|
|
2134
|
+
seed: z6.number().int().optional()
|
|
2135
|
+
}),
|
|
2136
|
+
eval: z6.object({
|
|
2137
|
+
numRuns: z6.number().int().min(1),
|
|
2138
|
+
threshold: z6.number().min(0).max(1),
|
|
2139
|
+
promptFile: z6.string().min(1).optional(),
|
|
2140
|
+
assertionsFile: z6.string().min(1).optional()
|
|
2141
|
+
})
|
|
2142
|
+
});
|
|
2143
|
+
var DEFAULT_SKILLTEST_CONFIG = {
|
|
2144
|
+
provider: "anthropic",
|
|
2145
|
+
model: "claude-sonnet-4-5-20250929",
|
|
2146
|
+
json: false,
|
|
2147
|
+
lint: {
|
|
2148
|
+
failOn: "error",
|
|
2149
|
+
suppress: []
|
|
2150
|
+
},
|
|
2151
|
+
trigger: {
|
|
2152
|
+
numQueries: 20,
|
|
2153
|
+
threshold: 0.8
|
|
2154
|
+
},
|
|
2155
|
+
eval: {
|
|
2156
|
+
numRuns: 5,
|
|
2157
|
+
threshold: 0.9
|
|
2158
|
+
}
|
|
2159
|
+
};
|
|
2160
|
+
/**
 * Render a validation issue path as a dotted string.
 *
 * @param {Array<string|number>} issuePath - Path segments.
 * @returns {string} "root" for an empty path, otherwise segments joined by ".".
 */
function formatIssuePath(issuePath) {
  return issuePath.length === 0
    ? "root"
    : issuePath.map((segment) => String(segment)).join(".");
}
|
|
2166
|
+
/**
 * Build an Error describing the first issue of a failed schema validation.
 *
 * @param {{ issues: Array<{ path?: Array<string|number>, message?: string }> }} error
 *   Validation error whose first issue is reported.
 * @param {string} sourceLabel - Where the config came from (file path, "CLI flags", ...).
 * @returns {Error} Error naming the source, the issue path and the issue message.
 */
function buildConfigValidationError(error, sourceLabel) {
  const firstIssue = error.issues[0];
  const location = formatIssuePath(firstIssue?.path ?? []);
  const detail = firstIssue?.message ?? "Invalid config value.";
  return new Error(`Invalid skilltest config in ${sourceLabel} at ${location}: ${detail}`);
}
|
|
2172
|
+
/**
 * Read a file and parse its contents as JSON.
 *
 * Read failures and parse failures are reported with distinct messages that
 * include `label`. The underlying error is attached as `cause`, so callers
 * keep details such as the filesystem error code and the original stack.
 *
 * @param {string} filePath - File to read (UTF-8).
 * @param {string} label - Human-readable name used in error messages.
 * @returns {Promise<unknown>} The parsed JSON value.
 * @throws {Error} "Failed to read <label>: ..." or "Invalid JSON in <label>: ...".
 */
async function readJsonObject(filePath, label) {
  const describe = (error) => (error instanceof Error ? error.message : String(error));

  let raw;
  try {
    raw = await fs6.readFile(filePath, "utf8");
  } catch (error) {
    throw new Error(`Failed to read ${label}: ${describe(error)}`, { cause: error });
  }

  try {
    return JSON.parse(raw);
  } catch (error) {
    throw new Error(`Invalid JSON in ${label}: ${describe(error)}`, { cause: error });
  }
}
|
|
2187
|
+
/**
 * Load and validate a skilltest config from a JSON file.
 *
 * @param {string} filePath - Candidate config file.
 * @returns {Promise<{configFile: object, sourcePath: string, sourceDirectory: string}|null>}
 *   `null` when the file does not exist; otherwise the validated config plus
 *   the path it came from and that path's directory.
 * @throws {Error} When the file cannot be read or parsed, or fails schema validation.
 */
async function loadConfigFromJsonFile(filePath) {
  const exists = await pathExists(filePath);
  if (!exists) {
    return null;
  }

  const contents = await readJsonObject(filePath, filePath);
  const validation = skilltestConfigSchema.safeParse(contents);
  if (validation.success) {
    return {
      configFile: validation.data,
      sourcePath: filePath,
      sourceDirectory: path5.dirname(filePath)
    };
  }
  throw buildConfigValidationError(validation.error, filePath);
}
|
|
2202
|
+
/**
 * Walk upward from `startDirectory` to the first directory containing a
 * `package.json` and read its optional `skilltestrc` field.
 *
 * The search stops at the first `package.json` found, whether or not it has
 * a `skilltestrc` field.
 *
 * @param {string} startDirectory - Directory where the upward search begins.
 * @returns {Promise<{configFile: object, sourcePath: string, sourceDirectory: string}|null>}
 *   `null` when no `package.json` exists up to the filesystem root, or when
 *   the nearest one has no `skilltestrc` field.
 * @throws {Error} When the nearest `package.json` is unreadable or fails validation.
 */
async function loadConfigFromNearestPackageJson(startDirectory) {
  let directory = path5.resolve(startDirectory);
  let packageJsonPath = path5.join(directory, "package.json");

  while (!await pathExists(packageJsonPath)) {
    const parent = path5.dirname(directory);
    if (parent === directory) {
      // dirname() returned its own input, so there is no parent left to try.
      return null;
    }
    directory = parent;
    packageJsonPath = path5.join(directory, "package.json");
  }

  const manifest = await readJsonObject(packageJsonPath, packageJsonPath);
  const manifestSchema = z6.object({
    skilltestrc: skilltestConfigSchema.optional()
  }).passthrough();
  const validation = manifestSchema.safeParse(manifest);
  if (!validation.success) {
    throw buildConfigValidationError(validation.error, `${packageJsonPath}#skilltestrc`);
  }

  const { skilltestrc } = validation.data;
  if (!skilltestrc) {
    return null;
  }
  return {
    configFile: skilltestrc,
    sourcePath: packageJsonPath,
    sourceDirectory: directory
  };
}
|
|
2231
|
+
/**
 * Load `.skilltestrc` from the root of the skill that `targetPath` points to.
 *
 * @param {string|undefined} targetPath - Skill path given on the command line.
 * @returns {Promise<object|null>} The loaded config descriptor, or `null`
 *   when no target path was given, the skill path cannot be resolved, or the
 *   skill root has no `.skilltestrc`.
 * @throws {Error} When a `.skilltestrc` exists but cannot be read or validated.
 */
async function resolveSkillDirectoryConfig(targetPath) {
  if (!targetPath) {
    return null;
  }

  let configPath;
  try {
    const { skillRoot } = await resolveSkillPath(targetPath);
    configPath = path5.join(skillRoot, ".skilltestrc");
  } catch {
    // An unresolvable skill path only means "no skill-level config".
    return null;
  }

  // Deliberately outside the try: errors from an existing but invalid config
  // file must reach the caller instead of being swallowed.
  return loadConfigFromJsonFile(configPath);
}
|
|
2242
|
+
/**
 * Resolve a config-supplied path against the directory the config came from.
 *
 * @param {string} baseDirectory - Directory that relative values are resolved against.
 * @param {string|undefined} value - Path from the config; may be absent.
 * @returns {string|undefined} Absolute path, or `undefined` for a falsy value.
 */
function resolveConfigRelativePath(baseDirectory, value) {
  return value ? path5.resolve(baseDirectory, value) : void 0;
}
|
|
2248
|
+
/**
 * Merge the three config layers into one validated, fully resolved config.
 *
 * Precedence for every field is CLI flags, then config file, then
 * `DEFAULT_SKILLTEST_CONFIG`. `eval.promptFile` and `eval.assertionsFile`
 * are resolved against `baseDirectory`.
 *
 * @param {object} [configFile={}] - Values read from a config file.
 * @param {object} [cliFlags={}] - Overrides taken from the command line.
 * @param {string} [baseDirectory=process.cwd()] - Base for relative file paths.
 * @returns {object} Result of parsing the merged object with `resolvedSkilltestConfigSchema`.
 * @throws When the merged object does not satisfy that schema.
 */
function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = process.cwd()) {
  const defaults = DEFAULT_SKILLTEST_CONFIG;
  const fromBase = (value) => resolveConfigRelativePath(baseDirectory, value);

  const lint = {
    failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? defaults.lint.failOn,
    suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? defaults.lint.suppress
  };

  const trigger = {
    numQueries: cliFlags.trigger?.numQueries ?? configFile.trigger?.numQueries ?? defaults.trigger.numQueries,
    threshold: cliFlags.trigger?.threshold ?? configFile.trigger?.threshold ?? defaults.trigger.threshold,
    // The seed has no default: it stays unset unless a layer provides it.
    seed: cliFlags.trigger?.seed ?? configFile.trigger?.seed
  };

  const evalSection = {
    numRuns: cliFlags.eval?.numRuns ?? configFile.eval?.numRuns ?? defaults.eval.numRuns,
    threshold: cliFlags.eval?.threshold ?? configFile.eval?.threshold ?? defaults.eval.threshold,
    promptFile: fromBase(
      cliFlags.eval?.promptFile ?? configFile.eval?.promptFile ?? defaults.eval.promptFile
    ),
    assertionsFile: fromBase(
      cliFlags.eval?.assertionsFile ?? configFile.eval?.assertionsFile ?? defaults.eval.assertionsFile
    )
  };

  return resolvedSkilltestConfigSchema.parse({
    provider: cliFlags.provider ?? configFile.provider ?? defaults.provider,
    model: cliFlags.model ?? configFile.model ?? defaults.model,
    json: cliFlags.json ?? configFile.json ?? defaults.json,
    lint,
    trigger,
    eval: evalSection
  });
}
|
|
2277
|
+
/**
 * Look up one option on a command, including options inherited from parents.
 *
 * @param {object} command - Command exposing `optsWithGlobals()`.
 * @param {string} key - Option name (camelCase).
 * @returns {*} The option's value, or `undefined` when it is not set.
 */
function getTypedOptionValue(command, key) {
  return command.optsWithGlobals()[key];
}
|
|
2285
|
+
/**
 * Build the config overrides for options the user actually typed on the
 * command line (option value source "cli"), leaving defaults out so they
 * cannot mask config-file values.
 *
 * Mapping:
 * - `--json` (global), `--provider`, `--model` map to same-named fields.
 * - `numQueries` maps to `trigger.numQueries` (commands `trigger` and `check`).
 * - `minF1` maps to `trigger.threshold` (command `check` only).
 * - `minAssertPassRate` maps to `eval.threshold` (command `check` only).
 *
 * @param {object} command - Commander-style command being executed.
 * @returns {object} Overrides validated against `skilltestConfigSchema`.
 * @throws {Error} When an override fails validation (reported as "CLI flags").
 */
function extractCliConfigOverrides(command) {
  const setOnCli = (key) => command.getOptionValueSource(key) === "cli";
  const valueOf = (key) => getTypedOptionValue(command, key);
  const overrides = {};

  if (command.getOptionValueSourceWithGlobals("json") === "cli") {
    overrides.json = Boolean(valueOf("json"));
  }
  for (const key of ["provider", "model"]) {
    if (setOnCli(key)) {
      overrides[key] = valueOf(key);
    }
  }

  const commandName = command.name();
  const isCheck = commandName === "check";

  if ((commandName === "trigger" || isCheck) && setOnCli("numQueries")) {
    overrides.trigger = { ...overrides.trigger, numQueries: valueOf("numQueries") };
  }
  if (isCheck && setOnCli("minF1")) {
    overrides.trigger = { ...overrides.trigger, threshold: valueOf("minF1") };
  }
  if (isCheck && setOnCli("minAssertPassRate")) {
    overrides.eval = { ...overrides.eval, threshold: valueOf("minAssertPassRate") };
  }

  const validation = skilltestConfigSchema.safeParse(overrides);
  if (!validation.success) {
    throw buildConfigValidationError(validation.error, "CLI flags");
  }
  return validation.data;
}
|
|
2320
|
+
/**
 * Locate the config that applies to this run and merge it with CLI flags.
 *
 * Sources are tried in order and the first hit wins:
 *   1. `.skilltestrc` in the target skill's root directory,
 *   2. `.skilltestrc` in the current working directory,
 *   3. the `skilltestrc` field of the nearest `package.json`,
 *   4. built-in defaults only (`configFile` and `sourcePath` are `null`).
 *
 * @param {string|undefined} targetPath - Skill path from the command line.
 * @param {object} cliFlags - Overrides taken from the command line.
 * @returns {Promise<{configFile: object|null, sourcePath: string|null, sourceDirectory: string, config: object}>}
 */
async function resolveConfigContext(targetPath, cliFlags) {
  const cwd = process.cwd();
  // Relative paths in a config are resolved against the directory the config
  // was found in, so each hit is merged with its own sourceDirectory.
  const attachMergedConfig = (loaded) => ({
    ...loaded,
    config: mergeConfigLayers(loaded.configFile, cliFlags, loaded.sourceDirectory)
  });

  const fromSkillDirectory = await resolveSkillDirectoryConfig(targetPath);
  if (fromSkillDirectory) {
    return attachMergedConfig(fromSkillDirectory);
  }

  const fromCwd = await loadConfigFromJsonFile(path5.join(cwd, ".skilltestrc"));
  if (fromCwd) {
    return attachMergedConfig(fromCwd);
  }

  const fromPackageJson = await loadConfigFromNearestPackageJson(cwd);
  if (fromPackageJson) {
    return attachMergedConfig(fromPackageJson);
  }

  return {
    configFile: null,
    config: mergeConfigLayers({}, cliFlags, cwd),
    sourcePath: null,
    sourceDirectory: cwd
  };
}
|
|
2354
|
+
/**
 * Pick the API key to use for a provider.
 *
 * A non-blank `override` wins. Otherwise the key is read from
 * `ANTHROPIC_API_KEY` when `provider` is "anthropic", and from
 * `OPENAI_API_KEY` for any other provider value. Keys are trimmed.
 *
 * @param {string} provider - Provider name.
 * @param {string} [override] - Explicit key, e.g. from `--api-key`.
 * @returns {string} The trimmed API key.
 * @throws {Error} When no usable key is found.
 */
function resolveApiKey(provider, override) {
  if (override) {
    const explicitKey = override.trim();
    if (explicitKey !== "") {
      return explicitKey;
    }
  }

  const isAnthropic = provider === "anthropic";
  const envKey = (isAnthropic ? process.env.ANTHROPIC_API_KEY : process.env.OPENAI_API_KEY)?.trim();
  if (envKey) {
    return envKey;
  }

  throw new Error(
    isAnthropic
      ? "No Anthropic API key found. Set ANTHROPIC_API_KEY environment variable or pass --api-key flag."
      : "No OpenAI API key found. Set OPENAI_API_KEY environment variable or pass --api-key flag."
  );
}
|
|
2373
|
+
|
|
2374
|
+
// src/providers/anthropic.ts
|
|
2375
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
2376
|
+
/**
 * Return a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function wait(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
2381
|
+
/**
 * Decide whether a thrown value looks like a rate-limit failure.
 *
 * @param {unknown} error - Value caught from a request.
 * @returns {boolean} `true` for an object whose `status` is 429 or whose
 *   string `message` contains "rate limit" (case-insensitive).
 */
function isRateLimitError(error) {
  const isObject = Boolean(error) && typeof error === "object";
  if (!isObject) {
    return false;
  }
  if (error.status === 429) {
    return true;
  }
  const text = error.message;
  return typeof text === "string" && /rate limit/i.test(text);
}
|
|
2395
|
+
var AnthropicProvider = class {
|
|
2396
|
+
name = "anthropic";
|
|
2397
|
+
client;
|
|
2398
|
+
constructor(apiKey) {
|
|
2399
|
+
this.client = new Anthropic({ apiKey });
|
|
2400
|
+
}
|
|
2401
|
+
async sendMessage(systemPrompt, userMessage, options) {
|
|
2402
|
+
let lastError;
|
|
2403
|
+
for (let attempt = 0; attempt < 3; attempt += 1) {
|
|
2404
|
+
try {
|
|
2405
|
+
const response = await this.client.messages.create({
|
|
1466
2406
|
model: options.model,
|
|
1467
2407
|
max_tokens: 2048,
|
|
1468
2408
|
system: systemPrompt,
|
|
@@ -1607,14 +2547,12 @@ function createProvider(providerName, apiKeyOverride) {
|
|
|
1607
2547
|
}
|
|
1608
2548
|
|
|
1609
2549
|
// src/commands/trigger.ts
|
|
1610
|
-
var
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
verbose: z3.boolean().optional(),
|
|
1617
|
-
apiKey: z3.string().optional()
|
|
2550
|
+
var triggerCliSchema = z7.object({
|
|
2551
|
+
queries: z7.string().optional(),
|
|
2552
|
+
saveQueries: z7.string().optional(),
|
|
2553
|
+
seed: z7.number().int().optional(),
|
|
2554
|
+
verbose: z7.boolean().optional(),
|
|
2555
|
+
apiKey: z7.string().optional()
|
|
1618
2556
|
});
|
|
1619
2557
|
var DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929";
|
|
1620
2558
|
var DEFAULT_OPENAI_MODEL = "gpt-4.1-mini";
|
|
@@ -1624,230 +2562,91 @@ function resolveModel(provider, model) {
|
|
|
1624
2562
|
}
|
|
1625
2563
|
return model;
|
|
1626
2564
|
}
|
|
1627
|
-
function
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
2565
|
+
/**
 * Append a "Seed: <n>" line to rendered trigger output.
 *
 * @param {string} output - Already rendered report text.
 * @param {number|undefined} seed - Seed to report; `undefined` leaves the output unchanged.
 * @returns {string}
 */
function renderTriggerOutputWithSeed(output, seed) {
  return seed === void 0 ? output : `${output}\nSeed: ${seed}`;
}
|
|
2572
|
+
async function handleTriggerCommand(targetPath, options) {
|
|
2573
|
+
const spinner = options.json || !process.stdout.isTTY ? null : ora("Preparing trigger evaluation...").start();
|
|
2574
|
+
try {
|
|
2575
|
+
if (spinner) {
|
|
2576
|
+
spinner.text = "Parsing skill...";
|
|
1635
2577
|
}
|
|
1636
|
-
const
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
spinner.text = "Parsing skill...";
|
|
1644
|
-
}
|
|
1645
|
-
const skill = await parseSkillStrict(targetPath);
|
|
1646
|
-
if (spinner) {
|
|
1647
|
-
spinner.text = "Initializing model provider...";
|
|
1648
|
-
}
|
|
1649
|
-
const provider = createProvider(options.provider, options.apiKey);
|
|
1650
|
-
let queries = void 0;
|
|
1651
|
-
if (options.queries) {
|
|
1652
|
-
if (spinner) {
|
|
1653
|
-
spinner.text = "Loading custom trigger queries...";
|
|
1654
|
-
}
|
|
1655
|
-
const loaded = await readJsonFile(options.queries);
|
|
1656
|
-
const parsedQueries = triggerQueryArraySchema.safeParse(loaded);
|
|
1657
|
-
if (!parsedQueries.success) {
|
|
1658
|
-
throw new Error(`Invalid --queries JSON: ${parsedQueries.error.issues[0]?.message ?? "unknown format issue"}`);
|
|
1659
|
-
}
|
|
1660
|
-
queries = parsedQueries.data;
|
|
1661
|
-
}
|
|
2578
|
+
const skill = await parseSkillStrict(targetPath);
|
|
2579
|
+
if (spinner) {
|
|
2580
|
+
spinner.text = "Initializing model provider...";
|
|
2581
|
+
}
|
|
2582
|
+
const provider = createProvider(options.provider, options.apiKey);
|
|
2583
|
+
let queries = void 0;
|
|
2584
|
+
if (options.queries) {
|
|
1662
2585
|
if (spinner) {
|
|
1663
|
-
spinner.text = "
|
|
1664
|
-
}
|
|
1665
|
-
const model = resolveModel(options.provider, options.model);
|
|
1666
|
-
const result = await runTriggerTest(skill, {
|
|
1667
|
-
model,
|
|
1668
|
-
provider,
|
|
1669
|
-
queries,
|
|
1670
|
-
numQueries: options.numQueries,
|
|
1671
|
-
verbose: Boolean(options.verbose)
|
|
1672
|
-
});
|
|
1673
|
-
if (options.saveQueries) {
|
|
1674
|
-
await writeJsonFile(options.saveQueries, result.queries);
|
|
1675
|
-
}
|
|
1676
|
-
spinner?.stop();
|
|
1677
|
-
if (globalOptions.json) {
|
|
1678
|
-
writeResult(result, true);
|
|
1679
|
-
} else {
|
|
1680
|
-
writeResult(renderTriggerReport(result, globalOptions.color, Boolean(options.verbose)), false);
|
|
2586
|
+
spinner.text = "Loading custom trigger queries...";
|
|
1681
2587
|
}
|
|
1682
|
-
|
|
1683
|
-
spinner?.stop();
|
|
1684
|
-
writeError(error, globalOptions.json);
|
|
1685
|
-
process.exitCode = 2;
|
|
2588
|
+
queries = await loadTriggerQueriesFile(options.queries);
|
|
1686
2589
|
}
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
import { z as z4 } from "zod";
|
|
1699
|
-
var gradedAssertionSchema = z4.object({
|
|
1700
|
-
assertion: z4.string(),
|
|
1701
|
-
passed: z4.boolean(),
|
|
1702
|
-
evidence: z4.string()
|
|
1703
|
-
});
|
|
1704
|
-
var graderOutputSchema = z4.object({
|
|
1705
|
-
assertions: z4.array(gradedAssertionSchema)
|
|
1706
|
-
});
|
|
1707
|
-
function extractJsonObject(raw) {
|
|
1708
|
-
const trimmed = raw.trim();
|
|
1709
|
-
if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
|
|
1710
|
-
return JSON.parse(trimmed);
|
|
1711
|
-
}
|
|
1712
|
-
const start = trimmed.indexOf("{");
|
|
1713
|
-
const end = trimmed.lastIndexOf("}");
|
|
1714
|
-
if (start >= 0 && end > start) {
|
|
1715
|
-
return JSON.parse(trimmed.slice(start, end + 1));
|
|
1716
|
-
}
|
|
1717
|
-
throw new Error("Grader did not return a JSON object.");
|
|
1718
|
-
}
|
|
1719
|
-
async function gradeResponse(options) {
|
|
1720
|
-
const assertionList = options.assertions && options.assertions.length > 0 ? options.assertions : [
|
|
1721
|
-
"The response follows the skill instructions faithfully.",
|
|
1722
|
-
"The response is well-structured and actionable.",
|
|
1723
|
-
"The response addresses the user prompt directly."
|
|
1724
|
-
];
|
|
1725
|
-
const systemPrompt = [
|
|
1726
|
-
"You are a strict evaluator for agent skill outputs.",
|
|
1727
|
-
"Assess each assertion and return JSON only.",
|
|
1728
|
-
'Required output format: {"assertions":[{"assertion":"...","passed":true|false,"evidence":"..."}]}'
|
|
1729
|
-
].join(" ");
|
|
1730
|
-
const userPrompt = [
|
|
1731
|
-
`Skill: ${options.skillName}`,
|
|
1732
|
-
"Skill instructions:",
|
|
1733
|
-
options.skillBody,
|
|
1734
|
-
"",
|
|
1735
|
-
`User prompt: ${options.userPrompt}`,
|
|
1736
|
-
"",
|
|
1737
|
-
"Model response:",
|
|
1738
|
-
options.modelResponse,
|
|
1739
|
-
"",
|
|
1740
|
-
"Assertions to evaluate:",
|
|
1741
|
-
assertionList.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
|
|
1742
|
-
].join("\n");
|
|
1743
|
-
const raw = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
1744
|
-
const parsed = graderOutputSchema.safeParse(extractJsonObject(raw));
|
|
1745
|
-
if (!parsed.success) {
|
|
1746
|
-
throw new Error(`Failed to parse grader output: ${parsed.error.issues[0]?.message ?? "invalid grader JSON"}`);
|
|
1747
|
-
}
|
|
1748
|
-
return parsed.data.assertions;
|
|
1749
|
-
}
|
|
1750
|
-
|
|
1751
|
-
// src/core/eval-runner.ts
|
|
1752
|
-
var evalPromptSchema = z5.object({
|
|
1753
|
-
prompt: z5.string().min(1),
|
|
1754
|
-
assertions: z5.array(z5.string().min(1)).optional()
|
|
1755
|
-
});
|
|
1756
|
-
var evalPromptArraySchema = z5.array(evalPromptSchema);
|
|
1757
|
-
function extractJsonArray(raw) {
|
|
1758
|
-
const trimmed = raw.trim();
|
|
1759
|
-
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
1760
|
-
return JSON.parse(trimmed);
|
|
1761
|
-
}
|
|
1762
|
-
const start = trimmed.indexOf("[");
|
|
1763
|
-
const end = trimmed.lastIndexOf("]");
|
|
1764
|
-
if (start >= 0 && end > start) {
|
|
1765
|
-
return JSON.parse(trimmed.slice(start, end + 1));
|
|
1766
|
-
}
|
|
1767
|
-
throw new Error("Model did not return a JSON array.");
|
|
1768
|
-
}
|
|
1769
|
-
async function generatePrompts(skill, provider, model, count) {
|
|
1770
|
-
const systemPrompt = [
|
|
1771
|
-
"You generate realistic evaluation prompts for an agent skill.",
|
|
1772
|
-
"Return JSON only.",
|
|
1773
|
-
'Format: [{"prompt":"...","assertions":["...", "..."]}]',
|
|
1774
|
-
"Assertions should be concrete and checkable."
|
|
1775
|
-
].join(" ");
|
|
1776
|
-
const userPrompt = [
|
|
1777
|
-
`Skill name: ${skill.frontmatter.name}`,
|
|
1778
|
-
`Skill description: ${skill.frontmatter.description}`,
|
|
1779
|
-
"Skill instructions:",
|
|
1780
|
-
skill.content,
|
|
1781
|
-
"",
|
|
1782
|
-
`Generate ${count} prompts that stress the main capabilities and likely edge cases.`,
|
|
1783
|
-
"Each prompt should include 2-4 assertions."
|
|
1784
|
-
].join("\n");
|
|
1785
|
-
const raw = await provider.sendMessage(systemPrompt, userPrompt, { model });
|
|
1786
|
-
const parsed = evalPromptArraySchema.safeParse(extractJsonArray(raw));
|
|
1787
|
-
if (!parsed.success) {
|
|
1788
|
-
throw new Error(`Failed to parse generated eval prompts: ${parsed.error.issues[0]?.message ?? "invalid prompt JSON"}`);
|
|
1789
|
-
}
|
|
1790
|
-
if (parsed.data.length !== count) {
|
|
1791
|
-
throw new Error(`Expected ${count} prompts, got ${parsed.data.length}.`);
|
|
1792
|
-
}
|
|
1793
|
-
return parsed.data;
|
|
1794
|
-
}
|
|
1795
|
-
async function runEval(skill, options) {
|
|
1796
|
-
const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, 5);
|
|
1797
|
-
const results = [];
|
|
1798
|
-
for (const evalPrompt of prompts) {
|
|
1799
|
-
const systemPrompt = [
|
|
1800
|
-
"You are an AI assistant with an activated skill.",
|
|
1801
|
-
"Follow this SKILL.md content exactly where applicable.",
|
|
1802
|
-
"",
|
|
1803
|
-
skill.raw
|
|
1804
|
-
].join("\n");
|
|
1805
|
-
const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
|
|
1806
|
-
const gradedAssertions = await gradeResponse({
|
|
1807
|
-
provider: options.provider,
|
|
1808
|
-
model: options.graderModel,
|
|
1809
|
-
skillName: skill.frontmatter.name,
|
|
1810
|
-
skillBody: skill.content,
|
|
1811
|
-
userPrompt: evalPrompt.prompt,
|
|
1812
|
-
modelResponse: response,
|
|
1813
|
-
assertions: evalPrompt.assertions
|
|
1814
|
-
});
|
|
1815
|
-
const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
|
|
1816
|
-
results.push({
|
|
1817
|
-
prompt: evalPrompt.prompt,
|
|
1818
|
-
assertions: gradedAssertions,
|
|
1819
|
-
responseSummary: response.slice(0, 200),
|
|
1820
|
-
response,
|
|
1821
|
-
passedAssertions: passedAssertions2,
|
|
1822
|
-
totalAssertions: gradedAssertions.length
|
|
2590
|
+
if (spinner) {
|
|
2591
|
+
spinner.text = "Running trigger simulations...";
|
|
2592
|
+
}
|
|
2593
|
+
const model = resolveModel(options.provider, options.model);
|
|
2594
|
+
const result = await runTriggerTest(skill, {
|
|
2595
|
+
model,
|
|
2596
|
+
provider,
|
|
2597
|
+
queries,
|
|
2598
|
+
numQueries: options.numQueries,
|
|
2599
|
+
seed: options.seed,
|
|
2600
|
+
verbose: options.verbose
|
|
1823
2601
|
});
|
|
2602
|
+
if (options.saveQueries) {
|
|
2603
|
+
await writeJsonFile(options.saveQueries, result.queries);
|
|
2604
|
+
}
|
|
2605
|
+
spinner?.stop();
|
|
2606
|
+
if (options.json) {
|
|
2607
|
+
writeResult(result, true);
|
|
2608
|
+
} else {
|
|
2609
|
+
writeResult(renderTriggerOutputWithSeed(renderTriggerReport(result, options.color, options.verbose), result.seed), false);
|
|
2610
|
+
}
|
|
2611
|
+
} catch (error) {
|
|
2612
|
+
spinner?.stop();
|
|
2613
|
+
writeError(error, options.json);
|
|
2614
|
+
process.exitCode = 2;
|
|
1824
2615
|
}
|
|
1825
|
-
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
totalPrompts: results.length,
|
|
1836
|
-
totalAssertions,
|
|
1837
|
-
passedAssertions
|
|
2616
|
+
}
|
|
2617
|
+
function registerTriggerCommand(program) {
|
|
2618
|
+
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
2619
|
+
const globalOptions = getGlobalCliOptions(command);
|
|
2620
|
+
const config = getResolvedConfig(command);
|
|
2621
|
+
const parsedCli = triggerCliSchema.safeParse(command.opts());
|
|
2622
|
+
if (!parsedCli.success) {
|
|
2623
|
+
writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid trigger options."), globalOptions.json);
|
|
2624
|
+
process.exitCode = 2;
|
|
2625
|
+
return;
|
|
1838
2626
|
}
|
|
1839
|
-
|
|
2627
|
+
await handleTriggerCommand(targetPath, {
|
|
2628
|
+
...globalOptions,
|
|
2629
|
+
model: config.model,
|
|
2630
|
+
provider: config.provider,
|
|
2631
|
+
queries: parsedCli.data.queries,
|
|
2632
|
+
numQueries: config.trigger.numQueries,
|
|
2633
|
+
saveQueries: parsedCli.data.saveQueries,
|
|
2634
|
+
seed: parsedCli.data.seed ?? config.trigger.seed,
|
|
2635
|
+
verbose: Boolean(parsedCli.data.verbose),
|
|
2636
|
+
apiKey: parsedCli.data.apiKey
|
|
2637
|
+
});
|
|
2638
|
+
});
|
|
1840
2639
|
}
|
|
1841
2640
|
|
|
1842
2641
|
// src/commands/eval.ts
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
saveResults:
|
|
1849
|
-
verbose:
|
|
1850
|
-
apiKey:
|
|
2642
|
+
import ora2 from "ora";
|
|
2643
|
+
import { z as z8 } from "zod";
|
|
2644
|
+
var evalCliSchema = z8.object({
|
|
2645
|
+
prompts: z8.string().optional(),
|
|
2646
|
+
graderModel: z8.string().optional(),
|
|
2647
|
+
saveResults: z8.string().optional(),
|
|
2648
|
+
verbose: z8.boolean().optional(),
|
|
2649
|
+
apiKey: z8.string().optional()
|
|
1851
2650
|
});
|
|
1852
2651
|
var DEFAULT_ANTHROPIC_MODEL2 = "claude-sonnet-4-5-20250929";
|
|
1853
2652
|
var DEFAULT_OPENAI_MODEL2 = "gpt-4.1-mini";
|
|
@@ -1857,69 +2656,84 @@ function resolveModel2(provider, model) {
|
|
|
1857
2656
|
}
|
|
1858
2657
|
return model;
|
|
1859
2658
|
}
|
|
1860
|
-
function
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
writeError(new Error(parsedOptions.error.issues[0]?.message ?? "Invalid eval options."), globalOptions.json);
|
|
1866
|
-
process.exitCode = 2;
|
|
1867
|
-
return;
|
|
2659
|
+
async function handleEvalCommand(targetPath, options, command) {
|
|
2660
|
+
const spinner = options.json || !process.stdout.isTTY ? null : ora2("Preparing evaluation...").start();
|
|
2661
|
+
try {
|
|
2662
|
+
if (spinner) {
|
|
2663
|
+
spinner.text = "Parsing skill...";
|
|
1868
2664
|
}
|
|
1869
|
-
const
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
if (spinner) {
|
|
1877
|
-
spinner.text = "Initializing model provider...";
|
|
1878
|
-
}
|
|
1879
|
-
const provider = createProvider(options.provider, options.apiKey);
|
|
1880
|
-
let prompts = void 0;
|
|
1881
|
-
if (options.prompts) {
|
|
1882
|
-
if (spinner) {
|
|
1883
|
-
spinner.text = "Loading test prompts...";
|
|
1884
|
-
}
|
|
1885
|
-
const loaded = await readJsonFile(options.prompts);
|
|
1886
|
-
const parsedPrompts = evalPromptArraySchema.safeParse(loaded);
|
|
1887
|
-
if (!parsedPrompts.success) {
|
|
1888
|
-
throw new Error(`Invalid --prompts JSON: ${parsedPrompts.error.issues[0]?.message ?? "unknown format issue"}`);
|
|
1889
|
-
}
|
|
1890
|
-
prompts = parsedPrompts.data;
|
|
1891
|
-
}
|
|
2665
|
+
const skill = await parseSkillStrict(targetPath);
|
|
2666
|
+
if (spinner) {
|
|
2667
|
+
spinner.text = "Initializing model provider...";
|
|
2668
|
+
}
|
|
2669
|
+
const provider = createProvider(options.provider, options.apiKey);
|
|
2670
|
+
let prompts = void 0;
|
|
2671
|
+
if (options.prompts) {
|
|
1892
2672
|
if (spinner) {
|
|
1893
|
-
spinner.text = "
|
|
1894
|
-
}
|
|
1895
|
-
const model = resolveModel2(options.provider, options.model);
|
|
1896
|
-
const graderModel = options.graderModel ?? model;
|
|
1897
|
-
const result = await runEval(skill, {
|
|
1898
|
-
provider,
|
|
1899
|
-
model,
|
|
1900
|
-
graderModel,
|
|
1901
|
-
prompts
|
|
1902
|
-
});
|
|
1903
|
-
if (options.saveResults) {
|
|
1904
|
-
await writeJsonFile(options.saveResults, result);
|
|
2673
|
+
spinner.text = "Loading test prompts...";
|
|
1905
2674
|
}
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
}
|
|
1913
|
-
|
|
1914
|
-
|
|
2675
|
+
prompts = await loadEvalPromptsJson(options.prompts);
|
|
2676
|
+
} else {
|
|
2677
|
+
prompts = await loadConfiguredEvalPrompts(command);
|
|
2678
|
+
}
|
|
2679
|
+
if (spinner) {
|
|
2680
|
+
spinner.text = "Running eval prompts and grading responses...";
|
|
2681
|
+
}
|
|
2682
|
+
const model = resolveModel2(options.provider, options.model);
|
|
2683
|
+
const graderModel = options.graderModel ?? model;
|
|
2684
|
+
const result = await runEval(skill, {
|
|
2685
|
+
provider,
|
|
2686
|
+
model,
|
|
2687
|
+
graderModel,
|
|
2688
|
+
numRuns: options.numRuns,
|
|
2689
|
+
prompts
|
|
2690
|
+
});
|
|
2691
|
+
if (options.saveResults) {
|
|
2692
|
+
await writeJsonFile(options.saveResults, result);
|
|
2693
|
+
}
|
|
2694
|
+
spinner?.stop();
|
|
2695
|
+
if (options.json) {
|
|
2696
|
+
writeResult(result, true);
|
|
2697
|
+
} else {
|
|
2698
|
+
writeResult(renderEvalReport(result, options.color, options.verbose), false);
|
|
2699
|
+
}
|
|
2700
|
+
} catch (error) {
|
|
2701
|
+
spinner?.stop();
|
|
2702
|
+
writeError(error, options.json);
|
|
2703
|
+
process.exitCode = 2;
|
|
2704
|
+
}
|
|
2705
|
+
}
|
|
2706
|
+
function registerEvalCommand(program) {
|
|
2707
|
+
program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, _commandOptions, command) => {
|
|
2708
|
+
const globalOptions = getGlobalCliOptions(command);
|
|
2709
|
+
const config = getResolvedConfig(command);
|
|
2710
|
+
const parsedCli = evalCliSchema.safeParse(command.opts());
|
|
2711
|
+
if (!parsedCli.success) {
|
|
2712
|
+
writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid eval options."), globalOptions.json);
|
|
1915
2713
|
process.exitCode = 2;
|
|
2714
|
+
return;
|
|
1916
2715
|
}
|
|
2716
|
+
await handleEvalCommand(
|
|
2717
|
+
targetPath,
|
|
2718
|
+
{
|
|
2719
|
+
...globalOptions,
|
|
2720
|
+
prompts: parsedCli.data.prompts,
|
|
2721
|
+
model: config.model,
|
|
2722
|
+
graderModel: parsedCli.data.graderModel,
|
|
2723
|
+
provider: config.provider,
|
|
2724
|
+
saveResults: parsedCli.data.saveResults,
|
|
2725
|
+
verbose: Boolean(parsedCli.data.verbose),
|
|
2726
|
+
apiKey: parsedCli.data.apiKey,
|
|
2727
|
+
numRuns: config.eval.numRuns
|
|
2728
|
+
},
|
|
2729
|
+
command
|
|
2730
|
+
);
|
|
1917
2731
|
});
|
|
1918
2732
|
}
|
|
1919
2733
|
|
|
1920
2734
|
// src/commands/check.ts
|
|
1921
2735
|
import ora3 from "ora";
|
|
1922
|
-
import { z as
|
|
2736
|
+
import { z as z9 } from "zod";
|
|
1923
2737
|
|
|
1924
2738
|
// src/core/check-runner.ts
|
|
1925
2739
|
function calculateEvalAssertPassRate(result) {
|
|
@@ -1930,8 +2744,8 @@ function calculateEvalAssertPassRate(result) {
|
|
|
1930
2744
|
}
|
|
1931
2745
|
async function runCheck(inputPath, options) {
|
|
1932
2746
|
options.onStage?.("lint");
|
|
1933
|
-
const lint = await runLinter(inputPath);
|
|
1934
|
-
const lintPassed = lint.
|
|
2747
|
+
const lint = await runLinter(inputPath, { suppress: options.lintSuppress });
|
|
2748
|
+
const lintPassed = !lintFails(lint, options.lintFailOn);
|
|
1935
2749
|
let trigger = null;
|
|
1936
2750
|
let evalResult = null;
|
|
1937
2751
|
let triggerSkippedReason;
|
|
@@ -1956,6 +2770,7 @@ async function runCheck(inputPath, options) {
|
|
|
1956
2770
|
model: options.model,
|
|
1957
2771
|
queries: options.queries,
|
|
1958
2772
|
numQueries: options.numQueries,
|
|
2773
|
+
seed: options.triggerSeed,
|
|
1959
2774
|
verbose: options.verbose
|
|
1960
2775
|
});
|
|
1961
2776
|
options.onStage?.("eval");
|
|
@@ -1963,6 +2778,7 @@ async function runCheck(inputPath, options) {
|
|
|
1963
2778
|
provider: options.provider,
|
|
1964
2779
|
model: options.model,
|
|
1965
2780
|
graderModel: options.graderModel,
|
|
2781
|
+
numRuns: options.evalNumRuns,
|
|
1966
2782
|
prompts: options.prompts
|
|
1967
2783
|
});
|
|
1968
2784
|
}
|
|
@@ -1999,19 +2815,15 @@ async function runCheck(inputPath, options) {
|
|
|
1999
2815
|
}
|
|
2000
2816
|
|
|
2001
2817
|
// src/commands/check.ts
|
|
2002
|
-
var
|
|
2003
|
-
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
minAssertPassRate: z7.number().min(0).max(1),
|
|
2012
|
-
saveResults: z7.string().optional(),
|
|
2013
|
-
continueOnLintFail: z7.boolean().optional(),
|
|
2014
|
-
verbose: z7.boolean().optional()
|
|
2818
|
+
var checkCliSchema = z9.object({
|
|
2819
|
+
graderModel: z9.string().optional(),
|
|
2820
|
+
apiKey: z9.string().optional(),
|
|
2821
|
+
queries: z9.string().optional(),
|
|
2822
|
+
seed: z9.number().int().optional(),
|
|
2823
|
+
prompts: z9.string().optional(),
|
|
2824
|
+
saveResults: z9.string().optional(),
|
|
2825
|
+
continueOnLintFail: z9.boolean().optional(),
|
|
2826
|
+
verbose: z9.boolean().optional()
|
|
2015
2827
|
});
|
|
2016
2828
|
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
|
|
2017
2829
|
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
|
|
@@ -2021,106 +2833,126 @@ function resolveModel3(provider, model) {
|
|
|
2021
2833
|
}
|
|
2022
2834
|
return model;
|
|
2023
2835
|
}
|
|
2024
|
-
function
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
|
|
2031
|
-
|
|
2032
|
-
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2042
|
-
globalOptions.json
|
|
2043
|
-
);
|
|
2044
|
-
process.exitCode = 2;
|
|
2045
|
-
return;
|
|
2836
|
+
function renderCheckOutputWithSeed(output, seed) {
|
|
2837
|
+
if (seed === void 0) {
|
|
2838
|
+
return output;
|
|
2839
|
+
}
|
|
2840
|
+
const lines = output.split("\n");
|
|
2841
|
+
const triggerIndex = lines.indexOf("Trigger");
|
|
2842
|
+
if (triggerIndex === -1) {
|
|
2843
|
+
return `${output}
|
|
2844
|
+
Seed: ${seed}`;
|
|
2845
|
+
}
|
|
2846
|
+
lines.splice(triggerIndex + 1, 0, `Seed: ${seed}`);
|
|
2847
|
+
return lines.join("\n");
|
|
2848
|
+
}
|
|
2849
|
+
async function handleCheckCommand(targetPath, options, command) {
|
|
2850
|
+
const spinner = options.json || !process.stdout.isTTY ? null : ora3("Preparing check run...").start();
|
|
2851
|
+
try {
|
|
2852
|
+
if (spinner) {
|
|
2853
|
+
spinner.text = "Initializing model provider...";
|
|
2046
2854
|
}
|
|
2047
|
-
const
|
|
2048
|
-
|
|
2855
|
+
const provider = createProvider(options.provider, options.apiKey);
|
|
2856
|
+
let queries = void 0;
|
|
2857
|
+
if (options.queries) {
|
|
2049
2858
|
if (spinner) {
|
|
2050
|
-
spinner.text = "
|
|
2859
|
+
spinner.text = "Loading custom trigger queries...";
|
|
2051
2860
|
}
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
const loadedQueries = await readJsonFile(options.queries);
|
|
2059
|
-
const parsedQueries = triggerQueryArraySchema.safeParse(loadedQueries);
|
|
2060
|
-
if (!parsedQueries.success) {
|
|
2061
|
-
throw new Error(
|
|
2062
|
-
`Invalid --queries JSON: ${parsedQueries.error.issues[0]?.message ?? "unknown format issue"}`
|
|
2063
|
-
);
|
|
2064
|
-
}
|
|
2065
|
-
queries = parsedQueries.data;
|
|
2861
|
+
queries = await loadTriggerQueriesFile(options.queries);
|
|
2862
|
+
}
|
|
2863
|
+
let prompts = void 0;
|
|
2864
|
+
if (options.prompts) {
|
|
2865
|
+
if (spinner) {
|
|
2866
|
+
spinner.text = "Loading eval prompts...";
|
|
2066
2867
|
}
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2072
|
-
|
|
2073
|
-
|
|
2074
|
-
|
|
2075
|
-
|
|
2076
|
-
|
|
2077
|
-
|
|
2868
|
+
prompts = await loadEvalPromptsJson(options.prompts);
|
|
2869
|
+
} else {
|
|
2870
|
+
prompts = await loadConfiguredEvalPrompts(command);
|
|
2871
|
+
}
|
|
2872
|
+
const model = resolveModel3(options.provider, options.model);
|
|
2873
|
+
const graderModel = options.graderModel ?? model;
|
|
2874
|
+
const result = await runCheck(targetPath, {
|
|
2875
|
+
provider,
|
|
2876
|
+
model,
|
|
2877
|
+
graderModel,
|
|
2878
|
+
lintFailOn: options.lintFailOn,
|
|
2879
|
+
lintSuppress: options.lintSuppress,
|
|
2880
|
+
queries,
|
|
2881
|
+
numQueries: options.numQueries,
|
|
2882
|
+
triggerSeed: options.triggerSeed,
|
|
2883
|
+
prompts,
|
|
2884
|
+
evalNumRuns: options.numRuns,
|
|
2885
|
+
minF1: options.minF1,
|
|
2886
|
+
minAssertPassRate: options.minAssertPassRate,
|
|
2887
|
+
continueOnLintFail: options.continueOnLintFail,
|
|
2888
|
+
verbose: options.verbose,
|
|
2889
|
+
onStage: (stage) => {
|
|
2890
|
+
if (!spinner) {
|
|
2891
|
+
return;
|
|
2078
2892
|
}
|
|
2079
|
-
|
|
2080
|
-
|
|
2081
|
-
|
|
2082
|
-
|
|
2083
|
-
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
queries,
|
|
2088
|
-
numQueries: options.numQueries,
|
|
2089
|
-
prompts,
|
|
2090
|
-
minF1: options.minF1,
|
|
2091
|
-
minAssertPassRate: options.minAssertPassRate,
|
|
2092
|
-
continueOnLintFail: Boolean(options.continueOnLintFail),
|
|
2093
|
-
verbose: Boolean(options.verbose),
|
|
2094
|
-
onStage: (stage) => {
|
|
2095
|
-
if (!spinner) {
|
|
2096
|
-
return;
|
|
2097
|
-
}
|
|
2098
|
-
if (stage === "lint") {
|
|
2099
|
-
spinner.text = "Running lint checks...";
|
|
2100
|
-
} else if (stage === "parse") {
|
|
2101
|
-
spinner.text = "Parsing skill for model evaluations...";
|
|
2102
|
-
} else if (stage === "trigger") {
|
|
2103
|
-
spinner.text = "Running trigger test suite...";
|
|
2104
|
-
} else if (stage === "eval") {
|
|
2105
|
-
spinner.text = "Running end-to-end eval suite...";
|
|
2106
|
-
}
|
|
2893
|
+
if (stage === "lint") {
|
|
2894
|
+
spinner.text = "Running lint checks...";
|
|
2895
|
+
} else if (stage === "parse") {
|
|
2896
|
+
spinner.text = "Parsing skill for model evaluations...";
|
|
2897
|
+
} else if (stage === "trigger") {
|
|
2898
|
+
spinner.text = "Running trigger test suite...";
|
|
2899
|
+
} else if (stage === "eval") {
|
|
2900
|
+
spinner.text = "Running end-to-end eval suite...";
|
|
2107
2901
|
}
|
|
2108
|
-
});
|
|
2109
|
-
if (options.saveResults) {
|
|
2110
|
-
await writeJsonFile(options.saveResults, result);
|
|
2111
2902
|
}
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
}
|
|
2120
|
-
|
|
2121
|
-
|
|
2903
|
+
});
|
|
2904
|
+
if (options.saveResults) {
|
|
2905
|
+
await writeJsonFile(options.saveResults, result);
|
|
2906
|
+
}
|
|
2907
|
+
spinner?.stop();
|
|
2908
|
+
if (options.json) {
|
|
2909
|
+
writeResult(result, true);
|
|
2910
|
+
} else {
|
|
2911
|
+
writeResult(
|
|
2912
|
+
renderCheckOutputWithSeed(renderCheckReport(result, options.color, options.verbose), result.trigger?.seed),
|
|
2913
|
+
false
|
|
2914
|
+
);
|
|
2915
|
+
}
|
|
2916
|
+
process.exitCode = result.gates.overallPassed ? 0 : 1;
|
|
2917
|
+
} catch (error) {
|
|
2918
|
+
spinner?.stop();
|
|
2919
|
+
writeError(error, options.json);
|
|
2920
|
+
process.exitCode = 2;
|
|
2921
|
+
}
|
|
2922
|
+
}
|
|
2923
|
+
function registerCheckCommand(program) {
|
|
2924
|
+
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
2925
|
+
const globalOptions = getGlobalCliOptions(command);
|
|
2926
|
+
const config = getResolvedConfig(command);
|
|
2927
|
+
const parsedCli = checkCliSchema.safeParse(command.opts());
|
|
2928
|
+
if (!parsedCli.success) {
|
|
2929
|
+
writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid check options."), globalOptions.json);
|
|
2122
2930
|
process.exitCode = 2;
|
|
2931
|
+
return;
|
|
2123
2932
|
}
|
|
2933
|
+
await handleCheckCommand(
|
|
2934
|
+
targetPath,
|
|
2935
|
+
{
|
|
2936
|
+
...globalOptions,
|
|
2937
|
+
provider: config.provider,
|
|
2938
|
+
model: config.model,
|
|
2939
|
+
graderModel: parsedCli.data.graderModel,
|
|
2940
|
+
apiKey: parsedCli.data.apiKey,
|
|
2941
|
+
queries: parsedCli.data.queries,
|
|
2942
|
+
numQueries: config.trigger.numQueries,
|
|
2943
|
+
prompts: parsedCli.data.prompts,
|
|
2944
|
+
minF1: config.trigger.threshold,
|
|
2945
|
+
minAssertPassRate: config.eval.threshold,
|
|
2946
|
+
numRuns: config.eval.numRuns,
|
|
2947
|
+
lintFailOn: config.lint.failOn,
|
|
2948
|
+
lintSuppress: config.lint.suppress,
|
|
2949
|
+
triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
|
|
2950
|
+
saveResults: parsedCli.data.saveResults,
|
|
2951
|
+
continueOnLintFail: Boolean(parsedCli.data.continueOnLintFail),
|
|
2952
|
+
verbose: Boolean(parsedCli.data.verbose)
|
|
2953
|
+
},
|
|
2954
|
+
command
|
|
2955
|
+
);
|
|
2124
2956
|
});
|
|
2125
2957
|
}
|
|
2126
2958
|
|
|
@@ -2128,27 +2960,49 @@ function registerCheckCommand(program) {
|
|
|
2128
2960
|
function resolveVersion() {
|
|
2129
2961
|
try {
|
|
2130
2962
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
2131
|
-
const packageJsonPath =
|
|
2132
|
-
const raw =
|
|
2963
|
+
const packageJsonPath = path6.resolve(path6.dirname(currentFilePath), "..", "package.json");
|
|
2964
|
+
const raw = fs7.readFileSync(packageJsonPath, "utf8");
|
|
2133
2965
|
const parsed = JSON.parse(raw);
|
|
2134
2966
|
return parsed.version ?? "0.0.0";
|
|
2135
2967
|
} catch {
|
|
2136
2968
|
return "0.0.0";
|
|
2137
2969
|
}
|
|
2138
2970
|
}
|
|
2971
|
+
function shouldRenderJson(argv) {
|
|
2972
|
+
return argv.includes("--json");
|
|
2973
|
+
}
|
|
2974
|
+
function renderTopLevelError(error, asJson) {
|
|
2975
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2976
|
+
if (asJson) {
|
|
2977
|
+
process.stdout.write(`${JSON.stringify({ error: message }, null, 2)}
|
|
2978
|
+
`);
|
|
2979
|
+
return;
|
|
2980
|
+
}
|
|
2981
|
+
process.stderr.write(`Error: ${message}
|
|
2982
|
+
`);
|
|
2983
|
+
}
|
|
2139
2984
|
async function run(argv) {
|
|
2140
2985
|
const program = new Command();
|
|
2141
2986
|
program.name("skilltest").description("The testing framework for Agent Skills.").version(resolveVersion()).option("--json", "Output results as JSON").option("--no-color", "Disable colored output").showHelpAfterError();
|
|
2987
|
+
program.hook("preAction", async (_program, actionCommand) => {
|
|
2988
|
+
const targetPath = typeof actionCommand.processedArgs[0] === "string" ? actionCommand.processedArgs[0] : void 0;
|
|
2989
|
+
const cliOverrides = extractCliConfigOverrides(actionCommand);
|
|
2990
|
+
const context = await resolveConfigContext(targetPath, cliOverrides);
|
|
2991
|
+
setCommandExecutionContext(actionCommand, context);
|
|
2992
|
+
});
|
|
2142
2993
|
registerLintCommand(program);
|
|
2143
2994
|
registerTriggerCommand(program);
|
|
2144
2995
|
registerEvalCommand(program);
|
|
2145
2996
|
registerCheckCommand(program);
|
|
2146
|
-
|
|
2997
|
+
try {
|
|
2998
|
+
await program.parseAsync(argv);
|
|
2999
|
+
} catch (error) {
|
|
3000
|
+
renderTopLevelError(error, shouldRenderJson(argv));
|
|
3001
|
+
process.exitCode = 2;
|
|
3002
|
+
}
|
|
2147
3003
|
}
|
|
2148
3004
|
run(process.argv).catch((error) => {
|
|
2149
|
-
|
|
2150
|
-
process.stderr.write(`Error: ${message}
|
|
2151
|
-
`);
|
|
3005
|
+
renderTopLevelError(error, shouldRenderJson(process.argv));
|
|
2152
3006
|
process.exitCode = 2;
|
|
2153
3007
|
});
|
|
2154
3008
|
export {
|