skilltest 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +5 -3
- package/README.md +77 -3
- package/dist/index.js +1061 -176
- package/dist/index.js.map +1 -1
- package/package.json +4 -3
package/dist/index.js
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
4
|
+
import fs11 from "node:fs";
|
|
5
5
|
import path6 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
9
|
+
// src/commands/lint.ts
|
|
10
|
+
import fs6 from "node:fs/promises";
|
|
11
|
+
import { z as z6 } from "zod";
|
|
12
|
+
|
|
9
13
|
// src/core/skill-parser.ts
|
|
10
14
|
import fs from "node:fs/promises";
|
|
11
15
|
import path from "node:path";
|
|
@@ -577,24 +581,6 @@ function runContentChecks(context) {
|
|
|
577
581
|
message: "No obvious vague placeholder phrasing found."
|
|
578
582
|
});
|
|
579
583
|
}
|
|
580
|
-
if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
|
|
581
|
-
issues.push({
|
|
582
|
-
id: "content.frontmatter-angle-brackets",
|
|
583
|
-
checkId: "content:angle-brackets",
|
|
584
|
-
title: "Frontmatter Angle Brackets",
|
|
585
|
-
status: "warn",
|
|
586
|
-
message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
|
|
587
|
-
suggestion: "Remove XML-like tags from frontmatter values when possible."
|
|
588
|
-
});
|
|
589
|
-
} else {
|
|
590
|
-
issues.push({
|
|
591
|
-
id: "content.frontmatter-angle-brackets",
|
|
592
|
-
checkId: "content:angle-brackets",
|
|
593
|
-
title: "Frontmatter Angle Brackets",
|
|
594
|
-
status: "pass",
|
|
595
|
-
message: "No angle bracket tokens detected in frontmatter."
|
|
596
|
-
});
|
|
597
|
-
}
|
|
598
584
|
const secretsIssue = buildSecretsIssue(context);
|
|
599
585
|
if (secretsIssue) {
|
|
600
586
|
issues.push(secretsIssue);
|
|
@@ -947,6 +933,24 @@ function runFrontmatterChecks(context) {
|
|
|
947
933
|
message: "license field is present."
|
|
948
934
|
});
|
|
949
935
|
}
|
|
936
|
+
if (context.frontmatter.rawFrontmatter && /[<>]/.test(context.frontmatter.rawFrontmatter)) {
|
|
937
|
+
issues.push({
|
|
938
|
+
id: "frontmatter.angle-brackets",
|
|
939
|
+
checkId: "frontmatter:angle-brackets",
|
|
940
|
+
title: "Frontmatter Angle Brackets",
|
|
941
|
+
status: "warn",
|
|
942
|
+
message: "Frontmatter contains angle bracket characters (< or >), which can be misinterpreted in some agents.",
|
|
943
|
+
suggestion: "Remove XML-like tags from frontmatter values when possible."
|
|
944
|
+
});
|
|
945
|
+
} else {
|
|
946
|
+
issues.push({
|
|
947
|
+
id: "frontmatter.angle-brackets",
|
|
948
|
+
checkId: "frontmatter:angle-brackets",
|
|
949
|
+
title: "Frontmatter Angle Brackets",
|
|
950
|
+
status: "pass",
|
|
951
|
+
message: "No angle bracket tokens detected in frontmatter."
|
|
952
|
+
});
|
|
953
|
+
}
|
|
950
954
|
if (description && description.trim() !== "" && !descriptionLooksActionable(description)) {
|
|
951
955
|
issues.push({
|
|
952
956
|
id: "frontmatter.description.triggerability",
|
|
@@ -1366,6 +1370,739 @@ async function runLinter(inputPath, options = {}) {
|
|
|
1366
1370
|
};
|
|
1367
1371
|
}
|
|
1368
1372
|
|
|
1373
|
+
// src/reporters/html.ts
|
|
1374
|
+
function escapeHtml(value) {
|
|
1375
|
+
return String(value ?? "").replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;");
|
|
1376
|
+
}
|
|
1377
|
+
function formatPercent(value) {
|
|
1378
|
+
return `${(value * 100).toFixed(1)}%`;
|
|
1379
|
+
}
|
|
1380
|
+
function formatLineRange(startLine, endLine) {
|
|
1381
|
+
if (startLine === void 0) {
|
|
1382
|
+
return null;
|
|
1383
|
+
}
|
|
1384
|
+
if (endLine === void 0 || endLine === startLine) {
|
|
1385
|
+
return `line ${startLine}`;
|
|
1386
|
+
}
|
|
1387
|
+
return `lines ${startLine}-${endLine}`;
|
|
1388
|
+
}
|
|
1389
|
+
function badgeLabel(status) {
|
|
1390
|
+
if (status === "pass") {
|
|
1391
|
+
return "PASS";
|
|
1392
|
+
}
|
|
1393
|
+
if (status === "warn") {
|
|
1394
|
+
return "WARN";
|
|
1395
|
+
}
|
|
1396
|
+
if (status === "fail") {
|
|
1397
|
+
return "FAIL";
|
|
1398
|
+
}
|
|
1399
|
+
return "SKIP";
|
|
1400
|
+
}
|
|
1401
|
+
function renderBadge(status) {
|
|
1402
|
+
return `<span class="badge ${status}">${badgeLabel(status)}</span>`;
|
|
1403
|
+
}
|
|
1404
|
+
function renderStatCards(stats) {
|
|
1405
|
+
return `<div class="stats-grid">${stats.map(
|
|
1406
|
+
(stat) => `
|
|
1407
|
+
<div class="stat-card${stat.status ? ` status-${stat.status}` : ""}">
|
|
1408
|
+
<div class="stat-label">${escapeHtml(stat.label)}</div>
|
|
1409
|
+
<div class="stat-value">${escapeHtml(stat.value)}</div>
|
|
1410
|
+
${stat.note ? `<div class="stat-note">${escapeHtml(stat.note)}</div>` : ""}
|
|
1411
|
+
</div>
|
|
1412
|
+
`
|
|
1413
|
+
).join("")}</div>`;
|
|
1414
|
+
}
|
|
1415
|
+
function renderMetaItems(items) {
|
|
1416
|
+
if (items.length === 0) {
|
|
1417
|
+
return "";
|
|
1418
|
+
}
|
|
1419
|
+
return `<div class="meta-grid">${items.map(
|
|
1420
|
+
(item) => `
|
|
1421
|
+
<div class="meta-item">
|
|
1422
|
+
<span class="meta-label">${escapeHtml(item.label)}</span>
|
|
1423
|
+
<span class="meta-value">${escapeHtml(item.value)}</span>
|
|
1424
|
+
</div>
|
|
1425
|
+
`
|
|
1426
|
+
).join("")}</div>`;
|
|
1427
|
+
}
|
|
1428
|
+
function renderHeaderCard(commandName, heading, target, stats, metaItems) {
|
|
1429
|
+
return `
|
|
1430
|
+
<section class="card header-card">
|
|
1431
|
+
<div class="eyebrow">skilltest ${escapeHtml(commandName)}</div>
|
|
1432
|
+
<h1>${escapeHtml(heading)}</h1>
|
|
1433
|
+
<div class="target-line">target: ${escapeHtml(target)}</div>
|
|
1434
|
+
${renderMetaItems(metaItems)}
|
|
1435
|
+
${renderStatCards(stats)}
|
|
1436
|
+
</section>
|
|
1437
|
+
`;
|
|
1438
|
+
}
|
|
1439
|
+
function renderSectionCard(title, body) {
|
|
1440
|
+
return `
|
|
1441
|
+
<section class="card">
|
|
1442
|
+
<h2>${escapeHtml(title)}</h2>
|
|
1443
|
+
${body}
|
|
1444
|
+
</section>
|
|
1445
|
+
`;
|
|
1446
|
+
}
|
|
1447
|
+
function renderMessageRow(status, title, message, details) {
|
|
1448
|
+
return `
|
|
1449
|
+
<div class="row">
|
|
1450
|
+
<div class="row-header">
|
|
1451
|
+
<div class="row-title">${escapeHtml(title)}</div>
|
|
1452
|
+
${renderBadge(status)}
|
|
1453
|
+
</div>
|
|
1454
|
+
<div class="row-body">${escapeHtml(message)}</div>
|
|
1455
|
+
${details ?? ""}
|
|
1456
|
+
</div>
|
|
1457
|
+
`;
|
|
1458
|
+
}
|
|
1459
|
+
function renderDetails(summary, content) {
|
|
1460
|
+
return `
|
|
1461
|
+
<details class="detail-block">
|
|
1462
|
+
<summary>${escapeHtml(summary)}</summary>
|
|
1463
|
+
<div class="detail-content">${content}</div>
|
|
1464
|
+
</details>
|
|
1465
|
+
`;
|
|
1466
|
+
}
|
|
1467
|
+
function renderPreBlock(content) {
|
|
1468
|
+
return `<pre>${escapeHtml(content)}</pre>`;
|
|
1469
|
+
}
|
|
1470
|
+
function renderDefinitionList(items) {
|
|
1471
|
+
return `<div class="definition-list">${items.map(
|
|
1472
|
+
(item) => `
|
|
1473
|
+
<div class="definition-item">
|
|
1474
|
+
<div class="definition-label">${escapeHtml(item.label)}</div>
|
|
1475
|
+
<div class="definition-value">${escapeHtml(item.value)}</div>
|
|
1476
|
+
</div>
|
|
1477
|
+
`
|
|
1478
|
+
).join("")}</div>`;
|
|
1479
|
+
}
|
|
1480
|
+
function countSkippedSecurityPatterns(issues) {
|
|
1481
|
+
return issues.reduce((total, issue) => total + (issue.skippedPatterns?.length ?? 0), 0);
|
|
1482
|
+
}
|
|
1483
|
+
function renderLintIssueRow(issue) {
|
|
1484
|
+
const lineRange = formatLineRange(issue.startLine, issue.endLine);
|
|
1485
|
+
const detailBlocks = [];
|
|
1486
|
+
if (issue.suggestion) {
|
|
1487
|
+
detailBlocks.push(renderDetails("Suggestion", `<p>${escapeHtml(issue.suggestion)}</p>`));
|
|
1488
|
+
}
|
|
1489
|
+
if (issue.skippedPatterns && issue.skippedPatterns.length > 0) {
|
|
1490
|
+
const patternItems = issue.skippedPatterns.map(
|
|
1491
|
+
(pattern) => `
|
|
1492
|
+
<div class="definition-item">
|
|
1493
|
+
<div class="definition-label">${escapeHtml(pattern.label)}</div>
|
|
1494
|
+
<div class="definition-value">${escapeHtml(
|
|
1495
|
+
`${pattern.zoneType} lines ${pattern.startLine}-${pattern.endLine}`
|
|
1496
|
+
)}</div>
|
|
1497
|
+
</div>
|
|
1498
|
+
`
|
|
1499
|
+
).join("");
|
|
1500
|
+
detailBlocks.push(renderDetails("Skipped security patterns", `<div class="definition-list">${patternItems}</div>`));
|
|
1501
|
+
}
|
|
1502
|
+
return `
|
|
1503
|
+
<div class="row">
|
|
1504
|
+
<div class="row-header">
|
|
1505
|
+
<div>
|
|
1506
|
+
<div class="row-title">${escapeHtml(issue.title)}</div>
|
|
1507
|
+
<div class="row-subtitle">${escapeHtml(issue.checkId)}</div>
|
|
1508
|
+
</div>
|
|
1509
|
+
${renderBadge(issue.status)}
|
|
1510
|
+
</div>
|
|
1511
|
+
<div class="row-body">${escapeHtml(issue.message)}</div>
|
|
1512
|
+
${renderDefinitionList(
|
|
1513
|
+
[
|
|
1514
|
+
lineRange ? { label: "Location", value: lineRange } : null,
|
|
1515
|
+
{ label: "Check ID", value: issue.checkId }
|
|
1516
|
+
].filter((item) => item !== null)
|
|
1517
|
+
)}
|
|
1518
|
+
${detailBlocks.join("")}
|
|
1519
|
+
</div>
|
|
1520
|
+
`;
|
|
1521
|
+
}
|
|
1522
|
+
function renderLintIssueList(report) {
|
|
1523
|
+
const skippedSecurityPatterns = countSkippedSecurityPatterns(report.issues);
|
|
1524
|
+
const rows = report.issues.map((issue) => renderLintIssueRow(issue)).join("");
|
|
1525
|
+
const info = skippedSecurityPatterns > 0 ? `<p class="info-line">Skipped security patterns in examples/comments: ${escapeHtml(skippedSecurityPatterns)}</p>` : "";
|
|
1526
|
+
return `<div class="row-list">${rows}</div>${info}`;
|
|
1527
|
+
}
|
|
1528
|
+
function renderTriggerCaseRow(testCase) {
|
|
1529
|
+
const details = testCase.rawModelResponse ? renderDetails("Model response", renderPreBlock(testCase.rawModelResponse)) : "";
|
|
1530
|
+
return `
|
|
1531
|
+
<div class="row">
|
|
1532
|
+
<div class="row-header">
|
|
1533
|
+
<div>
|
|
1534
|
+
<div class="row-title">${escapeHtml(testCase.query)}</div>
|
|
1535
|
+
<div class="row-subtitle">${escapeHtml(
|
|
1536
|
+
`expected=${testCase.expected} actual=${testCase.actual} should_trigger=${String(testCase.shouldTrigger)}`
|
|
1537
|
+
)}</div>
|
|
1538
|
+
</div>
|
|
1539
|
+
${renderBadge(testCase.matched ? "pass" : "fail")}
|
|
1540
|
+
</div>
|
|
1541
|
+
${renderDefinitionList([
|
|
1542
|
+
{ label: "Expected", value: testCase.expected },
|
|
1543
|
+
{ label: "Actual", value: testCase.actual }
|
|
1544
|
+
])}
|
|
1545
|
+
${details}
|
|
1546
|
+
</div>
|
|
1547
|
+
`;
|
|
1548
|
+
}
|
|
1549
|
+
function promptStatus(promptResult) {
|
|
1550
|
+
if (promptResult.totalAssertions === 0) {
|
|
1551
|
+
return "skip";
|
|
1552
|
+
}
|
|
1553
|
+
if (promptResult.passedAssertions === promptResult.totalAssertions) {
|
|
1554
|
+
return "pass";
|
|
1555
|
+
}
|
|
1556
|
+
if (promptResult.passedAssertions === 0) {
|
|
1557
|
+
return "fail";
|
|
1558
|
+
}
|
|
1559
|
+
return "warn";
|
|
1560
|
+
}
|
|
1561
|
+
function renderAssertionRow(assertion) {
|
|
1562
|
+
return renderDetails(
|
|
1563
|
+
`${badgeLabel(assertion.passed ? "pass" : "fail")} ${assertion.assertion}`,
|
|
1564
|
+
renderPreBlock(assertion.evidence)
|
|
1565
|
+
);
|
|
1566
|
+
}
|
|
1567
|
+
function renderEvalPromptRow(promptResult) {
|
|
1568
|
+
const assertionDetails = promptResult.assertions.map((assertion) => renderAssertionRow(assertion)).join("");
|
|
1569
|
+
const responseDetails = renderDetails("Full model response", renderPreBlock(promptResult.response));
|
|
1570
|
+
return `
|
|
1571
|
+
<div class="row">
|
|
1572
|
+
<div class="row-header">
|
|
1573
|
+
<div>
|
|
1574
|
+
<div class="row-title">${escapeHtml(promptResult.prompt)}</div>
|
|
1575
|
+
<div class="row-subtitle">${escapeHtml(
|
|
1576
|
+
`${promptResult.passedAssertions}/${promptResult.totalAssertions} assertions passed`
|
|
1577
|
+
)}</div>
|
|
1578
|
+
</div>
|
|
1579
|
+
${renderBadge(promptStatus(promptResult))}
|
|
1580
|
+
</div>
|
|
1581
|
+
<div class="row-body">${escapeHtml(promptResult.responseSummary)}</div>
|
|
1582
|
+
${renderDefinitionList([
|
|
1583
|
+
{ label: "Passed assertions", value: String(promptResult.passedAssertions) },
|
|
1584
|
+
{ label: "Total assertions", value: String(promptResult.totalAssertions) }
|
|
1585
|
+
])}
|
|
1586
|
+
${renderDetails("Assertion evidence", assertionDetails || `<p>No assertions.</p>`)}
|
|
1587
|
+
${responseDetails}
|
|
1588
|
+
</div>
|
|
1589
|
+
`;
|
|
1590
|
+
}
|
|
1591
|
+
function gateStatus(value) {
|
|
1592
|
+
if (value === null) {
|
|
1593
|
+
return "skip";
|
|
1594
|
+
}
|
|
1595
|
+
return value ? "pass" : "fail";
|
|
1596
|
+
}
|
|
1597
|
+
function renderGateCard(title, status, message) {
|
|
1598
|
+
return `
|
|
1599
|
+
<div class="gate-card">
|
|
1600
|
+
<div class="row-header">
|
|
1601
|
+
<div class="row-title">${escapeHtml(title)}</div>
|
|
1602
|
+
${renderBadge(status)}
|
|
1603
|
+
</div>
|
|
1604
|
+
<div class="row-body">${escapeHtml(message)}</div>
|
|
1605
|
+
</div>
|
|
1606
|
+
`;
|
|
1607
|
+
}
|
|
1608
|
+
function renderCollapsibleSection(title, summary, body, status) {
|
|
1609
|
+
return `
|
|
1610
|
+
<details class="section-card" open>
|
|
1611
|
+
<summary>
|
|
1612
|
+
<span class="section-title">${escapeHtml(title)}</span>
|
|
1613
|
+
<span class="section-summary">${renderBadge(status)} ${escapeHtml(summary)}</span>
|
|
1614
|
+
</summary>
|
|
1615
|
+
<div class="section-body">${body}</div>
|
|
1616
|
+
</details>
|
|
1617
|
+
`;
|
|
1618
|
+
}
|
|
1619
|
+
function resolveOptionalTarget(result, fallback) {
|
|
1620
|
+
return result.target ?? fallback;
|
|
1621
|
+
}
|
|
1622
|
+
function renderHtmlDocument(title, body) {
|
|
1623
|
+
return `<!DOCTYPE html>
|
|
1624
|
+
<html lang="en">
|
|
1625
|
+
<head>
|
|
1626
|
+
<meta charset="utf-8">
|
|
1627
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
1628
|
+
<title>${escapeHtml(title)}</title>
|
|
1629
|
+
<style>
|
|
1630
|
+
:root {
|
|
1631
|
+
color-scheme: light;
|
|
1632
|
+
--bg: #f5f5f5;
|
|
1633
|
+
--surface: #ffffff;
|
|
1634
|
+
--surface-muted: #fafafa;
|
|
1635
|
+
--border: #d4d4d8;
|
|
1636
|
+
--text: #111827;
|
|
1637
|
+
--muted: #6b7280;
|
|
1638
|
+
--pass: #22c55e;
|
|
1639
|
+
--warn: #eab308;
|
|
1640
|
+
--fail: #ef4444;
|
|
1641
|
+
--skip: #6b7280;
|
|
1642
|
+
--shadow: 0 10px 30px rgba(15, 23, 42, 0.08);
|
|
1643
|
+
}
|
|
1644
|
+
|
|
1645
|
+
* {
|
|
1646
|
+
box-sizing: border-box;
|
|
1647
|
+
}
|
|
1648
|
+
|
|
1649
|
+
body {
|
|
1650
|
+
margin: 0;
|
|
1651
|
+
background: linear-gradient(180deg, #fafafa 0%, #f4f4f5 100%);
|
|
1652
|
+
color: var(--text);
|
|
1653
|
+
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
|
|
1654
|
+
line-height: 1.5;
|
|
1655
|
+
}
|
|
1656
|
+
|
|
1657
|
+
.container {
|
|
1658
|
+
max-width: 1120px;
|
|
1659
|
+
margin: 0 auto;
|
|
1660
|
+
padding: 24px 16px 40px;
|
|
1661
|
+
}
|
|
1662
|
+
|
|
1663
|
+
.card,
|
|
1664
|
+
.section-card {
|
|
1665
|
+
background: var(--surface);
|
|
1666
|
+
border: 1px solid var(--border);
|
|
1667
|
+
border-radius: 16px;
|
|
1668
|
+
box-shadow: var(--shadow);
|
|
1669
|
+
margin-bottom: 16px;
|
|
1670
|
+
}
|
|
1671
|
+
|
|
1672
|
+
.card {
|
|
1673
|
+
padding: 20px;
|
|
1674
|
+
}
|
|
1675
|
+
|
|
1676
|
+
.header-card h1,
|
|
1677
|
+
.card h2 {
|
|
1678
|
+
margin: 0 0 10px;
|
|
1679
|
+
font-size: 1.25rem;
|
|
1680
|
+
}
|
|
1681
|
+
|
|
1682
|
+
.eyebrow {
|
|
1683
|
+
margin-bottom: 10px;
|
|
1684
|
+
color: var(--muted);
|
|
1685
|
+
font-size: 0.78rem;
|
|
1686
|
+
letter-spacing: 0.08em;
|
|
1687
|
+
text-transform: uppercase;
|
|
1688
|
+
}
|
|
1689
|
+
|
|
1690
|
+
.target-line,
|
|
1691
|
+
.info-line {
|
|
1692
|
+
color: var(--muted);
|
|
1693
|
+
overflow-wrap: anywhere;
|
|
1694
|
+
}
|
|
1695
|
+
|
|
1696
|
+
.meta-grid,
|
|
1697
|
+
.stats-grid,
|
|
1698
|
+
.gate-grid,
|
|
1699
|
+
.definition-list {
|
|
1700
|
+
display: grid;
|
|
1701
|
+
gap: 12px;
|
|
1702
|
+
}
|
|
1703
|
+
|
|
1704
|
+
.meta-grid,
|
|
1705
|
+
.gate-grid,
|
|
1706
|
+
.definition-list {
|
|
1707
|
+
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
|
|
1708
|
+
}
|
|
1709
|
+
|
|
1710
|
+
.stats-grid {
|
|
1711
|
+
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
|
1712
|
+
margin-top: 16px;
|
|
1713
|
+
}
|
|
1714
|
+
|
|
1715
|
+
.meta-grid {
|
|
1716
|
+
margin-top: 14px;
|
|
1717
|
+
}
|
|
1718
|
+
|
|
1719
|
+
.meta-item,
|
|
1720
|
+
.definition-item,
|
|
1721
|
+
.stat-card,
|
|
1722
|
+
.gate-card {
|
|
1723
|
+
background: var(--surface-muted);
|
|
1724
|
+
border: 1px solid var(--border);
|
|
1725
|
+
border-radius: 12px;
|
|
1726
|
+
padding: 12px;
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1729
|
+
.meta-item,
|
|
1730
|
+
.definition-item {
|
|
1731
|
+
display: flex;
|
|
1732
|
+
justify-content: space-between;
|
|
1733
|
+
gap: 12px;
|
|
1734
|
+
}
|
|
1735
|
+
|
|
1736
|
+
.meta-label,
|
|
1737
|
+
.definition-label,
|
|
1738
|
+
.stat-label {
|
|
1739
|
+
color: var(--muted);
|
|
1740
|
+
font-size: 0.82rem;
|
|
1741
|
+
}
|
|
1742
|
+
|
|
1743
|
+
.meta-value,
|
|
1744
|
+
.definition-value {
|
|
1745
|
+
text-align: right;
|
|
1746
|
+
overflow-wrap: anywhere;
|
|
1747
|
+
}
|
|
1748
|
+
|
|
1749
|
+
.stat-value {
|
|
1750
|
+
margin-top: 4px;
|
|
1751
|
+
font-size: 1.3rem;
|
|
1752
|
+
font-weight: 700;
|
|
1753
|
+
}
|
|
1754
|
+
|
|
1755
|
+
.stat-note {
|
|
1756
|
+
margin-top: 6px;
|
|
1757
|
+
color: var(--muted);
|
|
1758
|
+
font-size: 0.82rem;
|
|
1759
|
+
}
|
|
1760
|
+
|
|
1761
|
+
.status-pass {
|
|
1762
|
+
border-color: rgba(34, 197, 94, 0.35);
|
|
1763
|
+
}
|
|
1764
|
+
|
|
1765
|
+
.status-warn {
|
|
1766
|
+
border-color: rgba(234, 179, 8, 0.35);
|
|
1767
|
+
}
|
|
1768
|
+
|
|
1769
|
+
.status-fail {
|
|
1770
|
+
border-color: rgba(239, 68, 68, 0.35);
|
|
1771
|
+
}
|
|
1772
|
+
|
|
1773
|
+
.status-skip {
|
|
1774
|
+
border-color: rgba(107, 114, 128, 0.35);
|
|
1775
|
+
}
|
|
1776
|
+
|
|
1777
|
+
.row-list {
|
|
1778
|
+
display: grid;
|
|
1779
|
+
gap: 12px;
|
|
1780
|
+
}
|
|
1781
|
+
|
|
1782
|
+
.row {
|
|
1783
|
+
border: 1px solid var(--border);
|
|
1784
|
+
border-radius: 12px;
|
|
1785
|
+
padding: 14px;
|
|
1786
|
+
background: var(--surface-muted);
|
|
1787
|
+
}
|
|
1788
|
+
|
|
1789
|
+
.row-header {
|
|
1790
|
+
display: flex;
|
|
1791
|
+
justify-content: space-between;
|
|
1792
|
+
align-items: flex-start;
|
|
1793
|
+
gap: 12px;
|
|
1794
|
+
}
|
|
1795
|
+
|
|
1796
|
+
.row-title {
|
|
1797
|
+
font-weight: 700;
|
|
1798
|
+
overflow-wrap: anywhere;
|
|
1799
|
+
}
|
|
1800
|
+
|
|
1801
|
+
.row-subtitle {
|
|
1802
|
+
margin-top: 4px;
|
|
1803
|
+
color: var(--muted);
|
|
1804
|
+
font-size: 0.84rem;
|
|
1805
|
+
overflow-wrap: anywhere;
|
|
1806
|
+
}
|
|
1807
|
+
|
|
1808
|
+
.row-body {
|
|
1809
|
+
margin-top: 10px;
|
|
1810
|
+
overflow-wrap: anywhere;
|
|
1811
|
+
}
|
|
1812
|
+
|
|
1813
|
+
.badge {
|
|
1814
|
+
display: inline-flex;
|
|
1815
|
+
align-items: center;
|
|
1816
|
+
justify-content: center;
|
|
1817
|
+
min-width: 58px;
|
|
1818
|
+
padding: 3px 10px;
|
|
1819
|
+
border-radius: 999px;
|
|
1820
|
+
border: 1px solid currentColor;
|
|
1821
|
+
font-size: 0.76rem;
|
|
1822
|
+
font-weight: 700;
|
|
1823
|
+
letter-spacing: 0.04em;
|
|
1824
|
+
white-space: nowrap;
|
|
1825
|
+
}
|
|
1826
|
+
|
|
1827
|
+
.badge.pass {
|
|
1828
|
+
color: #15803d;
|
|
1829
|
+
background: rgba(34, 197, 94, 0.14);
|
|
1830
|
+
}
|
|
1831
|
+
|
|
1832
|
+
.badge.warn {
|
|
1833
|
+
color: #a16207;
|
|
1834
|
+
background: rgba(234, 179, 8, 0.18);
|
|
1835
|
+
}
|
|
1836
|
+
|
|
1837
|
+
.badge.fail {
|
|
1838
|
+
color: #b91c1c;
|
|
1839
|
+
background: rgba(239, 68, 68, 0.14);
|
|
1840
|
+
}
|
|
1841
|
+
|
|
1842
|
+
.badge.skip {
|
|
1843
|
+
color: #4b5563;
|
|
1844
|
+
background: rgba(107, 114, 128, 0.14);
|
|
1845
|
+
}
|
|
1846
|
+
|
|
1847
|
+
details {
|
|
1848
|
+
margin-top: 10px;
|
|
1849
|
+
}
|
|
1850
|
+
|
|
1851
|
+
details summary {
|
|
1852
|
+
cursor: pointer;
|
|
1853
|
+
color: var(--muted);
|
|
1854
|
+
}
|
|
1855
|
+
|
|
1856
|
+
.detail-block {
|
|
1857
|
+
border-top: 1px dashed var(--border);
|
|
1858
|
+
padding-top: 10px;
|
|
1859
|
+
}
|
|
1860
|
+
|
|
1861
|
+
.detail-content p {
|
|
1862
|
+
margin: 0;
|
|
1863
|
+
}
|
|
1864
|
+
|
|
1865
|
+
.section-card summary {
|
|
1866
|
+
display: flex;
|
|
1867
|
+
justify-content: space-between;
|
|
1868
|
+
align-items: center;
|
|
1869
|
+
gap: 12px;
|
|
1870
|
+
padding: 18px 20px;
|
|
1871
|
+
list-style: none;
|
|
1872
|
+
}
|
|
1873
|
+
|
|
1874
|
+
.section-card summary::-webkit-details-marker {
|
|
1875
|
+
display: none;
|
|
1876
|
+
}
|
|
1877
|
+
|
|
1878
|
+
.section-title {
|
|
1879
|
+
font-size: 1rem;
|
|
1880
|
+
font-weight: 700;
|
|
1881
|
+
color: var(--text);
|
|
1882
|
+
}
|
|
1883
|
+
|
|
1884
|
+
.section-summary {
|
|
1885
|
+
display: inline-flex;
|
|
1886
|
+
align-items: center;
|
|
1887
|
+
gap: 8px;
|
|
1888
|
+
color: var(--muted);
|
|
1889
|
+
text-align: right;
|
|
1890
|
+
}
|
|
1891
|
+
|
|
1892
|
+
.section-body {
|
|
1893
|
+
padding: 0 20px 20px;
|
|
1894
|
+
}
|
|
1895
|
+
|
|
1896
|
+
.gate-grid {
|
|
1897
|
+
margin-top: 12px;
|
|
1898
|
+
}
|
|
1899
|
+
|
|
1900
|
+
pre {
|
|
1901
|
+
margin: 0;
|
|
1902
|
+
padding: 12px;
|
|
1903
|
+
background: #f8fafc;
|
|
1904
|
+
border: 1px solid var(--border);
|
|
1905
|
+
border-radius: 10px;
|
|
1906
|
+
white-space: pre-wrap;
|
|
1907
|
+
word-break: break-word;
|
|
1908
|
+
overflow-wrap: anywhere;
|
|
1909
|
+
}
|
|
1910
|
+
|
|
1911
|
+
ul {
|
|
1912
|
+
margin: 0;
|
|
1913
|
+
padding-left: 20px;
|
|
1914
|
+
}
|
|
1915
|
+
|
|
1916
|
+
@media (max-width: 720px) {
|
|
1917
|
+
.container {
|
|
1918
|
+
padding: 16px 12px 28px;
|
|
1919
|
+
}
|
|
1920
|
+
|
|
1921
|
+
.row-header,
|
|
1922
|
+
.section-card summary,
|
|
1923
|
+
.meta-item,
|
|
1924
|
+
.definition-item {
|
|
1925
|
+
flex-direction: column;
|
|
1926
|
+
align-items: flex-start;
|
|
1927
|
+
}
|
|
1928
|
+
|
|
1929
|
+
.meta-value,
|
|
1930
|
+
.definition-value,
|
|
1931
|
+
.section-summary {
|
|
1932
|
+
text-align: left;
|
|
1933
|
+
}
|
|
1934
|
+
}
|
|
1935
|
+
</style>
|
|
1936
|
+
</head>
|
|
1937
|
+
<body>
|
|
1938
|
+
<main class="container">
|
|
1939
|
+
${body}
|
|
1940
|
+
</main>
|
|
1941
|
+
</body>
|
|
1942
|
+
</html>`;
|
|
1943
|
+
}
|
|
1944
|
+
function renderLintHtml(report) {
|
|
1945
|
+
const passRate = report.summary.total === 0 ? 0 : report.summary.passed / report.summary.total;
|
|
1946
|
+
const body = [
|
|
1947
|
+
renderHeaderCard(
|
|
1948
|
+
"lint",
|
|
1949
|
+
"Static Analysis Report",
|
|
1950
|
+
report.target,
|
|
1951
|
+
[
|
|
1952
|
+
{ label: "Pass rate", value: formatPercent(passRate), note: `${report.summary.passed}/${report.summary.total} passed` },
|
|
1953
|
+
{ label: "Warnings", value: String(report.summary.warnings), status: report.summary.warnings > 0 ? "warn" : "pass" },
|
|
1954
|
+
{ label: "Failures", value: String(report.summary.failures), status: report.summary.failures > 0 ? "fail" : "pass" },
|
|
1955
|
+
{ label: "Checks", value: String(report.summary.total) }
|
|
1956
|
+
],
|
|
1957
|
+
[{ label: "Target", value: report.target }]
|
|
1958
|
+
),
|
|
1959
|
+
renderSectionCard("Lint Issues", renderLintIssueList(report))
|
|
1960
|
+
].join("");
|
|
1961
|
+
return renderHtmlDocument(`skilltest lint - ${report.target}`, body);
|
|
1962
|
+
}
|
|
1963
|
+
function renderTriggerHtml(result) {
|
|
1964
|
+
const htmlResult = result;
|
|
1965
|
+
const target = resolveOptionalTarget(htmlResult, result.skillName);
|
|
1966
|
+
const matchedCount = result.cases.filter((testCase) => testCase.matched).length;
|
|
1967
|
+
const matchRate = result.cases.length === 0 ? 0 : matchedCount / result.cases.length;
|
|
1968
|
+
const body = [
|
|
1969
|
+
renderHeaderCard(
|
|
1970
|
+
"trigger",
|
|
1971
|
+
result.skillName,
|
|
1972
|
+
target,
|
|
1973
|
+
[
|
|
1974
|
+
{ label: "Match rate", value: formatPercent(matchRate), note: `${matchedCount}/${result.cases.length} matched` },
|
|
1975
|
+
{ label: "Precision", value: formatPercent(result.metrics.precision) },
|
|
1976
|
+
{ label: "Recall", value: formatPercent(result.metrics.recall) },
|
|
1977
|
+
{ label: "F1", value: formatPercent(result.metrics.f1), status: result.metrics.f1 >= 0.8 ? "pass" : "warn" }
|
|
1978
|
+
],
|
|
1979
|
+
[
|
|
1980
|
+
{ label: "Provider", value: result.provider },
|
|
1981
|
+
{ label: "Model", value: result.model },
|
|
1982
|
+
{ label: "Seed", value: result.seed !== void 0 ? String(result.seed) : "none" },
|
|
1983
|
+
{ label: "Queries", value: String(result.queries.length) }
|
|
1984
|
+
]
|
|
1985
|
+
),
|
|
1986
|
+
renderSectionCard("Trigger Cases", `<div class="row-list">${result.cases.map((testCase) => renderTriggerCaseRow(testCase)).join("")}</div>`),
|
|
1987
|
+
renderSectionCard(
|
|
1988
|
+
"Suggestions",
|
|
1989
|
+
`<ul>${result.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>`
|
|
1990
|
+
)
|
|
1991
|
+
].join("");
|
|
1992
|
+
return renderHtmlDocument(`skilltest trigger - ${result.skillName}`, body);
|
|
1993
|
+
}
|
|
1994
|
+
function renderEvalHtml(result) {
|
|
1995
|
+
const htmlResult = result;
|
|
1996
|
+
const target = resolveOptionalTarget(htmlResult, result.skillName);
|
|
1997
|
+
const passRate = result.summary.totalAssertions === 0 ? 0 : result.summary.passedAssertions / result.summary.totalAssertions;
|
|
1998
|
+
const body = [
|
|
1999
|
+
renderHeaderCard(
|
|
2000
|
+
"eval",
|
|
2001
|
+
result.skillName,
|
|
2002
|
+
target,
|
|
2003
|
+
[
|
|
2004
|
+
{
|
|
2005
|
+
label: "Assertion pass rate",
|
|
2006
|
+
value: formatPercent(passRate),
|
|
2007
|
+
note: `${result.summary.passedAssertions}/${result.summary.totalAssertions} passed`
|
|
2008
|
+
},
|
|
2009
|
+
{ label: "Prompts", value: String(result.summary.totalPrompts) },
|
|
2010
|
+
{ label: "Model", value: result.model },
|
|
2011
|
+
{ label: "Grader", value: result.graderModel }
|
|
2012
|
+
],
|
|
2013
|
+
[
|
|
2014
|
+
{ label: "Provider", value: result.provider },
|
|
2015
|
+
{ label: "Execution model", value: result.model },
|
|
2016
|
+
{ label: "Grader model", value: result.graderModel },
|
|
2017
|
+
{ label: "Prompts", value: String(result.prompts.length) }
|
|
2018
|
+
]
|
|
2019
|
+
),
|
|
2020
|
+
renderSectionCard("Eval Prompts", `<div class="row-list">${result.results.map((promptResult) => renderEvalPromptRow(promptResult)).join("")}</div>`)
|
|
2021
|
+
].join("");
|
|
2022
|
+
return renderHtmlDocument(`skilltest eval - ${result.skillName}`, body);
|
|
2023
|
+
}
|
|
2024
|
+
function renderCheckHtml(result) {
|
|
2025
|
+
const skillName = result.trigger?.skillName ?? result.eval?.skillName ?? result.target;
|
|
2026
|
+
const triggerBody = result.trigger ? `<div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase)).join("")}</div>
|
|
2027
|
+
<div class="card" style="margin-top: 16px;">
|
|
2028
|
+
<h2>Trigger Suggestions</h2>
|
|
2029
|
+
<ul>${result.trigger.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>
|
|
2030
|
+
</div>` : renderMessageRow("skip", "Trigger skipped", result.triggerSkippedReason ?? "Skipped.");
|
|
2031
|
+
const evalBody = result.eval ? `<div class="row-list">${result.eval.results.map((promptResult) => renderEvalPromptRow(promptResult)).join("")}</div>` : renderMessageRow("skip", "Eval skipped", result.evalSkippedReason ?? "Skipped.");
|
|
2032
|
+
const lintStatus = result.gates.lintPassed ? "pass" : "fail";
|
|
2033
|
+
const triggerStatus = gateStatus(result.gates.triggerPassed);
|
|
2034
|
+
const evalStatus = gateStatus(result.gates.evalPassed);
|
|
2035
|
+
const overallStatus = result.gates.overallPassed ? "pass" : "fail";
|
|
2036
|
+
const header = renderHeaderCard(
|
|
2037
|
+
"check",
|
|
2038
|
+
skillName,
|
|
2039
|
+
result.target,
|
|
2040
|
+
[
|
|
2041
|
+
{ label: "Overall gate", value: badgeLabel(overallStatus), status: overallStatus },
|
|
2042
|
+
{
|
|
2043
|
+
label: "Trigger F1",
|
|
2044
|
+
value: result.gates.triggerF1 !== null ? formatPercent(result.gates.triggerF1) : "skipped",
|
|
2045
|
+
status: triggerStatus
|
|
2046
|
+
},
|
|
2047
|
+
{
|
|
2048
|
+
label: "Eval pass rate",
|
|
2049
|
+
value: result.gates.evalAssertPassRate !== null ? formatPercent(result.gates.evalAssertPassRate) : "skipped",
|
|
2050
|
+
status: evalStatus
|
|
2051
|
+
},
|
|
2052
|
+
{
|
|
2053
|
+
label: "Lint result",
|
|
2054
|
+
value: `${result.lint.summary.failures} fail / ${result.lint.summary.warnings} warn`,
|
|
2055
|
+
status: lintStatus
|
|
2056
|
+
}
|
|
2057
|
+
],
|
|
2058
|
+
[
|
|
2059
|
+
{ label: "Provider", value: result.provider },
|
|
2060
|
+
{ label: "Model", value: result.model },
|
|
2061
|
+
{ label: "Grader model", value: result.graderModel },
|
|
2062
|
+
{
|
|
2063
|
+
label: "Thresholds",
|
|
2064
|
+
value: `min-f1=${result.thresholds.minF1.toFixed(2)} min-assert-pass-rate=${result.thresholds.minAssertPassRate.toFixed(2)}`
|
|
2065
|
+
}
|
|
2066
|
+
]
|
|
2067
|
+
);
|
|
2068
|
+
const lintSection = renderCollapsibleSection(
|
|
2069
|
+
"Lint",
|
|
2070
|
+
`${result.lint.summary.passed}/${result.lint.summary.total} passed, ${result.lint.summary.warnings} warnings, ${result.lint.summary.failures} failures`,
|
|
2071
|
+
renderLintIssueList(result.lint),
|
|
2072
|
+
lintStatus
|
|
2073
|
+
);
|
|
2074
|
+
const triggerSection = renderCollapsibleSection(
|
|
2075
|
+
"Trigger",
|
|
2076
|
+
result.trigger ? `f1=${formatPercent(result.trigger.metrics.f1)} precision=${formatPercent(result.trigger.metrics.precision)} recall=${formatPercent(result.trigger.metrics.recall)}` : result.triggerSkippedReason ?? "Skipped.",
|
|
2077
|
+
triggerBody,
|
|
2078
|
+
triggerStatus
|
|
2079
|
+
);
|
|
2080
|
+
const evalSection = renderCollapsibleSection(
|
|
2081
|
+
"Eval",
|
|
2082
|
+
result.eval ? `assertion pass rate=${formatPercent(result.gates.evalAssertPassRate ?? 0)} (${result.eval.summary.passedAssertions}/${result.eval.summary.totalAssertions})` : result.evalSkippedReason ?? "Skipped.",
|
|
2083
|
+
evalBody,
|
|
2084
|
+
evalStatus
|
|
2085
|
+
);
|
|
2086
|
+
const qualityGate = renderSectionCard(
|
|
2087
|
+
"Quality Gate",
|
|
2088
|
+
`<div class="gate-grid">
|
|
2089
|
+
${renderGateCard("Lint gate", lintStatus, result.gates.lintPassed ? "Lint passed." : "Lint failed.")}
|
|
2090
|
+
${renderGateCard(
|
|
2091
|
+
"Trigger gate",
|
|
2092
|
+
triggerStatus,
|
|
2093
|
+
result.gates.triggerPassed === null ? result.triggerSkippedReason ?? "Skipped." : `required ${result.thresholds.minF1.toFixed(2)}, actual ${result.gates.triggerF1?.toFixed(2) ?? "n/a"}`
|
|
2094
|
+
)}
|
|
2095
|
+
${renderGateCard(
|
|
2096
|
+
"Eval gate",
|
|
2097
|
+
evalStatus,
|
|
2098
|
+
result.gates.evalPassed === null ? result.evalSkippedReason ?? "Skipped." : `required ${result.thresholds.minAssertPassRate.toFixed(2)}, actual ${result.gates.evalAssertPassRate?.toFixed(2) ?? "n/a"}`
|
|
2099
|
+
)}
|
|
2100
|
+
${renderGateCard("Overall", overallStatus, result.gates.overallPassed ? "All quality gates passed." : "One or more gates failed.")}
|
|
2101
|
+
</div>`
|
|
2102
|
+
);
|
|
2103
|
+
return renderHtmlDocument(`skilltest check - ${skillName}`, [header, lintSection, triggerSection, evalSection, qualityGate].join(""));
|
|
2104
|
+
}
|
|
2105
|
+
|
|
1369
2106
|
// src/reporters/terminal.ts
|
|
1370
2107
|
import { Chalk } from "chalk";
|
|
1371
2108
|
function getChalkInstance(enableColor) {
|
|
@@ -1378,7 +2115,7 @@ function renderIssueLine(issue, c) {
|
|
|
1378
2115
|
return ` ${label} ${issue.title}
|
|
1379
2116
|
${issue.message}${detail}`;
|
|
1380
2117
|
}
|
|
1381
|
-
function
|
|
2118
|
+
function countSkippedSecurityPatterns2(issues) {
|
|
1382
2119
|
return issues.reduce((total, issue) => {
|
|
1383
2120
|
if (!issue.checkId.startsWith("security:")) {
|
|
1384
2121
|
return total;
|
|
@@ -1398,13 +2135,13 @@ function renderLintReport(report, enableColor) {
|
|
|
1398
2135
|
`\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518`
|
|
1399
2136
|
];
|
|
1400
2137
|
const renderedIssues = report.issues.map((issue) => renderIssueLine(issue, c)).join("\n");
|
|
1401
|
-
const skippedSecurityPatterns =
|
|
2138
|
+
const skippedSecurityPatterns = countSkippedSecurityPatterns2(report.issues);
|
|
1402
2139
|
const infoLine = skippedSecurityPatterns > 0 ? `
|
|
1403
2140
|
${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
|
|
1404
2141
|
return `${headerLines.join("\n")}
|
|
1405
2142
|
${renderedIssues}${infoLine}`;
|
|
1406
2143
|
}
|
|
1407
|
-
function
|
|
2144
|
+
/**
 * Renders a ratio as a percentage string with one decimal place.
 *
 * @param {number} value - Ratio to format (multiplied by 100 before rounding).
 * @returns {string} The scaled value fixed to one decimal, followed by "%".
 */
function formatPercent2(value) {
  const scaled = value * 100;
  const rounded = scaled.toFixed(1);
  return `${rounded}%`;
}
|
|
1410
2147
|
function renderTriggerReport(result, enableColor, verbose) {
|
|
@@ -1416,7 +2153,7 @@ function renderTriggerReport(result, enableColor, verbose) {
|
|
|
1416
2153
|
lines.push(`\u2502 skill: ${result.skillName}`);
|
|
1417
2154
|
lines.push(`\u2502 provider/model: ${result.provider}/${result.model}`);
|
|
1418
2155
|
lines.push(
|
|
1419
|
-
`\u2502 precision: ${
|
|
2156
|
+
`\u2502 precision: ${formatPercent2(result.metrics.precision)} recall: ${formatPercent2(result.metrics.recall)} f1: ${formatPercent2(result.metrics.f1)}`
|
|
1420
2157
|
);
|
|
1421
2158
|
lines.push(
|
|
1422
2159
|
`\u2502 TP ${result.metrics.truePositives} TN ${result.metrics.trueNegatives} FP ${result.metrics.falsePositives} FN ${result.metrics.falseNegatives}`
|
|
@@ -1490,7 +2227,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1490
2227
|
for (const issue of lintIssues) {
|
|
1491
2228
|
lines.push(renderIssueLine(issue, c));
|
|
1492
2229
|
}
|
|
1493
|
-
const skippedSecurityPatterns =
|
|
2230
|
+
const skippedSecurityPatterns = countSkippedSecurityPatterns2(result.lint.issues);
|
|
1494
2231
|
if (skippedSecurityPatterns > 0) {
|
|
1495
2232
|
lines.push(` ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
|
|
1496
2233
|
}
|
|
@@ -1498,7 +2235,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1498
2235
|
lines.push("Trigger");
|
|
1499
2236
|
if (result.trigger) {
|
|
1500
2237
|
lines.push(
|
|
1501
|
-
`- ${triggerGate} f1=${
|
|
2238
|
+
`- ${triggerGate} f1=${formatPercent2(result.trigger.metrics.f1)} (precision=${formatPercent2(result.trigger.metrics.precision)} recall=${formatPercent2(result.trigger.metrics.recall)})`
|
|
1502
2239
|
);
|
|
1503
2240
|
lines.push(
|
|
1504
2241
|
` TP ${result.trigger.metrics.truePositives} TN ${result.trigger.metrics.trueNegatives} FP ${result.trigger.metrics.falsePositives} FN ${result.trigger.metrics.falseNegatives}`
|
|
@@ -1517,7 +2254,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1517
2254
|
if (result.eval) {
|
|
1518
2255
|
const passRate = result.gates.evalAssertPassRate ?? 0;
|
|
1519
2256
|
lines.push(
|
|
1520
|
-
`- ${evalGate} assertion pass rate=${
|
|
2257
|
+
`- ${evalGate} assertion pass rate=${formatPercent2(passRate)} (${result.eval.summary.passedAssertions}/${result.eval.summary.totalAssertions})`
|
|
1521
2258
|
);
|
|
1522
2259
|
for (const promptResult of result.eval.results) {
|
|
1523
2260
|
const failedAssertions = promptResult.assertions.filter((assertion) => !assertion.passed);
|
|
@@ -1577,12 +2314,13 @@ function extractJsonObject(raw) {
|
|
|
1577
2314
|
}
|
|
1578
2315
|
throw new Error("Grader did not return a JSON object.");
|
|
1579
2316
|
}
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
2317
|
+
var DEFAULT_ASSERTIONS = [
|
|
2318
|
+
"The response follows the skill instructions faithfully.",
|
|
2319
|
+
"The response is well-structured and actionable.",
|
|
2320
|
+
"The response addresses the user prompt directly."
|
|
2321
|
+
];
|
|
2322
|
+
function buildGraderPrompts(options) {
|
|
2323
|
+
const assertions = options.assertions && options.assertions.length > 0 ? options.assertions : DEFAULT_ASSERTIONS;
|
|
1586
2324
|
const systemPrompt = [
|
|
1587
2325
|
"You are a strict evaluator for agent skill outputs.",
|
|
1588
2326
|
"Assess each assertion and return JSON only.",
|
|
@@ -1599,15 +2337,78 @@ async function gradeResponse(options) {
|
|
|
1599
2337
|
options.modelResponse,
|
|
1600
2338
|
"",
|
|
1601
2339
|
"Assertions to evaluate:",
|
|
1602
|
-
|
|
2340
|
+
assertions.map((assertion, index) => `${index + 1}. ${assertion}`).join("\n")
|
|
1603
2341
|
].join("\n");
|
|
1604
|
-
|
|
2342
|
+
return {
|
|
2343
|
+
assertions,
|
|
2344
|
+
systemPrompt,
|
|
2345
|
+
userPrompt
|
|
2346
|
+
};
|
|
2347
|
+
}
|
|
2348
|
+
/**
 * Extracts the JSON object from raw grader text and validates it against
 * `graderOutputSchema`.
 *
 * @param {string} raw - Raw text returned by the grader model.
 * @returns {Array} The `assertions` array from the validated payload.
 * @throws {Error} When validation fails; the message carries the first schema
 *   issue, or "invalid grader JSON" when no issue message is available.
 *   Errors thrown by `extractJsonObject` propagate unchanged.
 */
function parseGraderOutput(raw) {
  const candidate = extractJsonObject(raw);
  const validation = graderOutputSchema.safeParse(candidate);
  if (validation.success) {
    return validation.data.assertions;
  }
  const reason = validation.error.issues[0]?.message ?? "invalid grader JSON";
  throw new Error(`Failed to parse grader output: ${reason}`);
}
|
|
2355
|
+
/**
 * Grades one model response: builds the grader prompts from `options`, sends
 * them through `options.provider` using `options.model`, and parses the reply.
 *
 * @param {object} options - Grading inputs; must carry `provider` (with a
 *   `sendMessage` method) and `model`, plus whatever `buildGraderPrompts` reads.
 * @returns {Promise<Array>} The value returned by `parseGraderOutput` for the reply.
 */
async function gradeResponse(options) {
  const { systemPrompt, userPrompt } = buildGraderPrompts(options);
  const { provider, model } = options;
  const graderReply = await provider.sendMessage(systemPrompt, userPrompt, { model });
  return parseGraderOutput(graderReply);
}
|
|
2360
|
+
|
|
2361
|
+
// src/utils/concurrency.ts
|
|
2362
|
+
/**
 * Maps `items` through `fn` with at most `concurrency` calls in flight.
 *
 * Results come back in input order regardless of completion order. The
 * returned promise rejects with the first error raised by `fn`; once that
 * happens no further items are started (calls already in flight are left to
 * settle and their outcomes are ignored).
 *
 * @param {Array} items - Values to process.
 * @param {(item: any, index: number) => any} fn - Mapper; may return a value or a promise.
 * @param {number} concurrency - Maximum simultaneous calls; must be an integer >= 1.
 * @returns {Promise<Array>} Mapped values, index-aligned with `items`.
 * @throws {Error} (as a rejection) when `concurrency` is not an integer >= 1.
 */
async function pMap(items, fn, concurrency) {
  if (!Number.isInteger(concurrency) || concurrency < 1) {
    throw new Error("pMap concurrency must be an integer greater than or equal to 1.");
  }
  if (items.length === 0) {
    return [];
  }
  const results = new Array(items.length);
  let nextIndex = 0;
  let failed = false;
  // Each worker pulls the next unclaimed index until the list is exhausted or
  // any worker has failed; claiming happens synchronously, so no index is
  // handed out twice.
  const runWorker = async () => {
    while (!failed && nextIndex < items.length) {
      const currentIndex = nextIndex;
      nextIndex += 1;
      try {
        results[currentIndex] = await fn(items[currentIndex], currentIndex);
      } catch (error) {
        // Stop every worker from claiming more work, then fail fast.
        failed = true;
        throw error;
      }
    }
  };
  const workerCount = Math.min(concurrency, items.length);
  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  return results;
}
|
|
1611
2412
|
|
|
1612
2413
|
// src/core/eval-runner.ts
|
|
1613
2414
|
var evalPromptSchema = z3.object({
|
|
@@ -1655,34 +2456,37 @@ async function generatePrompts(skill, provider, model, count) {
|
|
|
1655
2456
|
}
|
|
1656
2457
|
async function runEval(skill, options) {
|
|
1657
2458
|
const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, options.numRuns);
|
|
1658
|
-
const
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
2459
|
+
const systemPrompt = [
|
|
2460
|
+
"You are an AI assistant with an activated skill.",
|
|
2461
|
+
"Follow this SKILL.md content exactly where applicable.",
|
|
2462
|
+
"",
|
|
2463
|
+
skill.raw
|
|
2464
|
+
].join("\n");
|
|
2465
|
+
const results = await pMap(
|
|
2466
|
+
prompts,
|
|
2467
|
+
async (evalPrompt) => {
|
|
2468
|
+
const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
|
|
2469
|
+
const gradedAssertions = await gradeResponse({
|
|
2470
|
+
provider: options.provider,
|
|
2471
|
+
model: options.graderModel,
|
|
2472
|
+
skillName: skill.frontmatter.name,
|
|
2473
|
+
skillBody: skill.content,
|
|
2474
|
+
userPrompt: evalPrompt.prompt,
|
|
2475
|
+
modelResponse: response,
|
|
2476
|
+
assertions: evalPrompt.assertions
|
|
2477
|
+
});
|
|
2478
|
+
const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
|
|
2479
|
+
return {
|
|
2480
|
+
prompt: evalPrompt.prompt,
|
|
2481
|
+
assertions: gradedAssertions,
|
|
2482
|
+
responseSummary: response.slice(0, 200),
|
|
2483
|
+
response,
|
|
2484
|
+
passedAssertions: passedAssertions2,
|
|
2485
|
+
totalAssertions: gradedAssertions.length
|
|
2486
|
+
};
|
|
2487
|
+
},
|
|
2488
|
+
options.concurrency ?? 5
|
|
2489
|
+
);
|
|
1686
2490
|
const totalAssertions = results.reduce((total, result) => total + result.totalAssertions, 0);
|
|
1687
2491
|
const passedAssertions = results.reduce((total, result) => total + result.passedAssertions, 0);
|
|
1688
2492
|
return {
|
|
@@ -1707,6 +2511,7 @@ var triggerQuerySchema = z4.object({
|
|
|
1707
2511
|
should_trigger: z4.boolean()
|
|
1708
2512
|
});
|
|
1709
2513
|
var triggerQueryArraySchema = z4.array(triggerQuerySchema);
|
|
2514
|
+
var triggerNumQueriesSchema = z4.number().int().min(2).refine((value) => value % 2 === 0, "numQueries must be an even number.");
|
|
1710
2515
|
var FAKE_SKILLS = [
|
|
1711
2516
|
{ name: "code-review", description: "Reviews code changes for bugs, regressions, and maintainability issues." },
|
|
1712
2517
|
{ name: "api-tester", description: "Designs and runs REST API tests, validating status codes and response shapes." },
|
|
@@ -1747,6 +2552,9 @@ function shuffle(values, rng) {
|
|
|
1747
2552
|
/**
 * Picks up to `count` entries from a shuffled view of `values`.
 *
 * @param {Array} values - Candidate entries.
 * @param {number} count - Requested number of entries; clamped to [0, values.length].
 * @param {Function} rng - Random source forwarded to `shuffle`.
 * @returns {Array} The leading entries of `shuffle(values, rng)`.
 */
function sample(values, count, rng) {
  const shuffled = shuffle(values, rng);
  const upperBound = Math.min(count, values.length);
  const take = Math.max(0, upperBound);
  return shuffled.slice(0, take);
}
|
|
2555
|
+
/**
 * Validates a trigger query count by parsing it with `triggerNumQueriesSchema`.
 *
 * @param {number} numQueries - Requested number of generated queries.
 * @returns {number} The value produced by the schema's `parse`.
 * @throws Whatever `triggerNumQueriesSchema.parse` throws for a rejected value.
 */
function validateNumQueries(numQueries) {
  const checked = triggerNumQueriesSchema.parse(numQueries);
  return checked;
}
|
|
1750
2558
|
function parseJsonArrayFromModelOutput(raw) {
|
|
1751
2559
|
const trimmed = raw.trim();
|
|
1752
2560
|
if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
@@ -1761,6 +2569,7 @@ function parseJsonArrayFromModelOutput(raw) {
|
|
|
1761
2569
|
throw new Error("Model did not return a JSON array.");
|
|
1762
2570
|
}
|
|
1763
2571
|
async function generateQueriesWithModel(skill, provider, model, numQueries) {
|
|
2572
|
+
validateNumQueries(numQueries);
|
|
1764
2573
|
const shouldTriggerCount = Math.floor(numQueries / 2);
|
|
1765
2574
|
const shouldNotTriggerCount = numQueries - shouldTriggerCount;
|
|
1766
2575
|
const systemPrompt = [
|
|
@@ -1805,6 +2614,29 @@ function parseDecision(rawResponse, skillNames) {
|
|
|
1805
2614
|
}
|
|
1806
2615
|
return "unrecognized";
|
|
1807
2616
|
}
|
|
2617
|
+
/**
 * Pairs every trigger test query with a randomized roster of skills: a sample
 * of decoys from `FAKE_SKILLS` plus the skill under test, shuffled together.
 *
 * All randomness comes from a single generator created by `createRng(seed)`
 * and is consumed in query order.
 *
 * @param {object} skill - Parsed skill; `frontmatter.name` and
 *   `frontmatter.description` are read.
 * @param {Array} queries - Test queries; each is passed through as `testQuery`.
 * @param {number} [seed] - Seed forwarded to `createRng`.
 * @returns {Array<{testQuery: object, fakeSkills: Array, allSkills: Array, skillListText: string}>}
 *   One entry per query; `skillListText` is `allSkills` rendered as
 *   "- name: description" lines.
 */
function prepareTriggerQueries(skill, queries, seed) {
  const rng = createRng(seed);
  const skillName = skill.frontmatter.name;
  // A decoy that shares the real skill's name would put that name in the
  // roster twice, so the model's answer could not be attributed to either
  // entry. Drop such decoys; without a collision the pool is unchanged.
  const decoyPool = FAKE_SKILLS.filter((entry) => entry.name !== skillName);
  return queries.map((testQuery) => {
    // 5 + floor(rng() * 5): five to nine decoys, assuming rng() yields [0, 1) — TODO confirm.
    const fakeCount = 5 + Math.floor(rng() * 5);
    const fakeSkills = sample(decoyPool, fakeCount, rng);
    const allSkills = shuffle(
      [
        ...fakeSkills,
        {
          name: skillName,
          description: skill.frontmatter.description
        }
      ],
      rng
    );
    return {
      testQuery,
      fakeSkills,
      allSkills,
      skillListText: allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n")
    };
  });
}
|
|
1808
2640
|
function calculateMetrics(skillName, cases) {
|
|
1809
2641
|
let truePositives = 0;
|
|
1810
2642
|
let trueNegatives = 0;
|
|
@@ -1857,43 +2689,36 @@ function buildSuggestions(metrics) {
|
|
|
1857
2689
|
return suggestions;
|
|
1858
2690
|
}
|
|
1859
2691
|
async function runTriggerTest(skill, options) {
|
|
1860
|
-
const rng = createRng(options.seed);
|
|
1861
2692
|
const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
|
|
1862
|
-
const results = [];
|
|
1863
2693
|
const skillName = skill.frontmatter.name;
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
actual: decision,
|
|
1893
|
-
matched,
|
|
1894
|
-
rawModelResponse: options.verbose ? rawResponse : void 0
|
|
1895
|
-
});
|
|
1896
|
-
}
|
|
2694
|
+
const preparedQueries = prepareTriggerQueries(skill, queries, options.seed);
|
|
2695
|
+
const systemPrompt = [
|
|
2696
|
+
"You are selecting one skill to activate for a user query.",
|
|
2697
|
+
"Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",
|
|
2698
|
+
"Respond with only the skill name or 'none'."
|
|
2699
|
+
].join(" ");
|
|
2700
|
+
const results = await pMap(
|
|
2701
|
+
preparedQueries,
|
|
2702
|
+
async ({ testQuery, allSkills, skillListText }) => {
|
|
2703
|
+
const userPrompt = [`Available skills:`, skillListText, "", `User query: ${testQuery.query}`].join("\n");
|
|
2704
|
+
const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
2705
|
+
const decision = parseDecision(
|
|
2706
|
+
rawResponse,
|
|
2707
|
+
allSkills.map((entry) => entry.name)
|
|
2708
|
+
);
|
|
2709
|
+
const expected = testQuery.should_trigger ? skillName : "none";
|
|
2710
|
+
const matched = testQuery.should_trigger ? decision === skillName : decision !== skillName;
|
|
2711
|
+
return {
|
|
2712
|
+
query: testQuery.query,
|
|
2713
|
+
shouldTrigger: testQuery.should_trigger,
|
|
2714
|
+
expected,
|
|
2715
|
+
actual: decision,
|
|
2716
|
+
matched,
|
|
2717
|
+
rawModelResponse: options.verbose ? rawResponse : void 0
|
|
2718
|
+
};
|
|
2719
|
+
},
|
|
2720
|
+
options.concurrency ?? 5
|
|
2721
|
+
);
|
|
1897
2722
|
const metrics = calculateMetrics(skillName, results);
|
|
1898
2723
|
return {
|
|
1899
2724
|
skillName,
|
|
@@ -2059,6 +2884,9 @@ function writeError(error, asJson) {
|
|
|
2059
2884
|
}
|
|
2060
2885
|
|
|
2061
2886
|
// src/commands/lint.ts
|
|
2887
|
+
var lintCliSchema = z6.object({
|
|
2888
|
+
html: z6.string().optional()
|
|
2889
|
+
});
|
|
2062
2890
|
async function handleLintCommand(targetPath, options) {
|
|
2063
2891
|
try {
|
|
2064
2892
|
const report = await runLinter(targetPath, { suppress: options.suppress });
|
|
@@ -2067,6 +2895,9 @@ async function handleLintCommand(targetPath, options) {
|
|
|
2067
2895
|
} else {
|
|
2068
2896
|
writeResult(renderLintReport(report, options.color), false);
|
|
2069
2897
|
}
|
|
2898
|
+
if (options.html) {
|
|
2899
|
+
await fs6.writeFile(options.html, renderLintHtml(report), "utf8");
|
|
2900
|
+
}
|
|
2070
2901
|
if (lintFails(report, options.failOn)) {
|
|
2071
2902
|
process.exitCode = 1;
|
|
2072
2903
|
}
|
|
@@ -2076,74 +2907,85 @@ async function handleLintCommand(targetPath, options) {
|
|
|
2076
2907
|
}
|
|
2077
2908
|
}
|
|
2078
2909
|
function registerLintCommand(program) {
|
|
2079
|
-
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").action(async (targetPath, _commandOptions, command) => {
|
|
2910
|
+
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").action(async (targetPath, _commandOptions, command) => {
|
|
2080
2911
|
const globalOptions = getGlobalCliOptions(command);
|
|
2081
2912
|
const config = getResolvedConfig(command);
|
|
2913
|
+
const parsedCli = lintCliSchema.safeParse(command.opts());
|
|
2914
|
+
if (!parsedCli.success) {
|
|
2915
|
+
writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid lint options."), globalOptions.json);
|
|
2916
|
+
process.exitCode = 2;
|
|
2917
|
+
return;
|
|
2918
|
+
}
|
|
2082
2919
|
await handleLintCommand(targetPath, {
|
|
2083
2920
|
...globalOptions,
|
|
2084
2921
|
failOn: config.lint.failOn,
|
|
2085
|
-
suppress: config.lint.suppress
|
|
2922
|
+
suppress: config.lint.suppress,
|
|
2923
|
+
html: parsedCli.data.html
|
|
2086
2924
|
});
|
|
2087
2925
|
});
|
|
2088
2926
|
}
|
|
2089
2927
|
|
|
2090
2928
|
// src/commands/trigger.ts
|
|
2929
|
+
import fs8 from "node:fs/promises";
|
|
2091
2930
|
import ora from "ora";
|
|
2092
|
-
import { z as
|
|
2931
|
+
import { z as z8 } from "zod";
|
|
2093
2932
|
|
|
2094
2933
|
// src/utils/config.ts
|
|
2095
|
-
import
|
|
2934
|
+
import fs7 from "node:fs/promises";
|
|
2096
2935
|
import path5 from "node:path";
|
|
2097
|
-
import { z as
|
|
2098
|
-
var providerNameSchema =
|
|
2099
|
-
var lintFailOnSchema =
|
|
2100
|
-
var lintConfigSchema =
|
|
2936
|
+
import { z as z7 } from "zod";
|
|
2937
|
+
var providerNameSchema = z7.enum(["anthropic", "openai"]);
|
|
2938
|
+
var lintFailOnSchema = z7.enum(["error", "warn"]);
|
|
2939
|
+
var lintConfigSchema = z7.object({
|
|
2101
2940
|
failOn: lintFailOnSchema.optional(),
|
|
2102
|
-
suppress:
|
|
2941
|
+
suppress: z7.array(z7.string().min(1)).optional()
|
|
2103
2942
|
}).strict();
|
|
2104
|
-
var triggerConfigSchema =
|
|
2105
|
-
numQueries:
|
|
2106
|
-
threshold:
|
|
2107
|
-
seed:
|
|
2943
|
+
var triggerConfigSchema = z7.object({
|
|
2944
|
+
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2945
|
+
threshold: z7.number().min(0).max(1).optional(),
|
|
2946
|
+
seed: z7.number().int().optional()
|
|
2108
2947
|
}).strict().partial();
|
|
2109
|
-
var evalConfigSchema =
|
|
2110
|
-
numRuns:
|
|
2111
|
-
threshold:
|
|
2112
|
-
promptFile:
|
|
2113
|
-
assertionsFile:
|
|
2948
|
+
var evalConfigSchema = z7.object({
|
|
2949
|
+
numRuns: z7.number().int().min(1).optional(),
|
|
2950
|
+
threshold: z7.number().min(0).max(1).optional(),
|
|
2951
|
+
promptFile: z7.string().min(1).optional(),
|
|
2952
|
+
assertionsFile: z7.string().min(1).optional()
|
|
2114
2953
|
}).strict().partial();
|
|
2115
|
-
var skilltestConfigSchema =
|
|
2954
|
+
var skilltestConfigSchema = z7.object({
|
|
2116
2955
|
provider: providerNameSchema.optional(),
|
|
2117
|
-
model:
|
|
2118
|
-
json:
|
|
2956
|
+
model: z7.string().min(1).optional(),
|
|
2957
|
+
json: z7.boolean().optional(),
|
|
2958
|
+
concurrency: z7.number().int().min(1).optional(),
|
|
2119
2959
|
lint: lintConfigSchema.optional(),
|
|
2120
2960
|
trigger: triggerConfigSchema.optional(),
|
|
2121
2961
|
eval: evalConfigSchema.optional()
|
|
2122
2962
|
}).strict();
|
|
2123
|
-
var resolvedSkilltestConfigSchema =
|
|
2963
|
+
var resolvedSkilltestConfigSchema = z7.object({
|
|
2124
2964
|
provider: providerNameSchema,
|
|
2125
|
-
model:
|
|
2126
|
-
json:
|
|
2127
|
-
|
|
2965
|
+
model: z7.string().min(1),
|
|
2966
|
+
json: z7.boolean(),
|
|
2967
|
+
concurrency: z7.number().int().min(1),
|
|
2968
|
+
lint: z7.object({
|
|
2128
2969
|
failOn: lintFailOnSchema,
|
|
2129
|
-
suppress:
|
|
2970
|
+
suppress: z7.array(z7.string().min(1))
|
|
2130
2971
|
}),
|
|
2131
|
-
trigger:
|
|
2132
|
-
numQueries:
|
|
2133
|
-
threshold:
|
|
2134
|
-
seed:
|
|
2972
|
+
trigger: z7.object({
|
|
2973
|
+
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2974
|
+
threshold: z7.number().min(0).max(1),
|
|
2975
|
+
seed: z7.number().int().optional()
|
|
2135
2976
|
}),
|
|
2136
|
-
eval:
|
|
2137
|
-
numRuns:
|
|
2138
|
-
threshold:
|
|
2139
|
-
promptFile:
|
|
2140
|
-
assertionsFile:
|
|
2977
|
+
eval: z7.object({
|
|
2978
|
+
numRuns: z7.number().int().min(1),
|
|
2979
|
+
threshold: z7.number().min(0).max(1),
|
|
2980
|
+
promptFile: z7.string().min(1).optional(),
|
|
2981
|
+
assertionsFile: z7.string().min(1).optional()
|
|
2141
2982
|
})
|
|
2142
2983
|
});
|
|
2143
2984
|
var DEFAULT_SKILLTEST_CONFIG = {
|
|
2144
2985
|
provider: "anthropic",
|
|
2145
2986
|
model: "claude-sonnet-4-5-20250929",
|
|
2146
2987
|
json: false,
|
|
2988
|
+
concurrency: 5,
|
|
2147
2989
|
lint: {
|
|
2148
2990
|
failOn: "error",
|
|
2149
2991
|
suppress: []
|
|
@@ -2172,7 +3014,7 @@ function buildConfigValidationError(error, sourceLabel) {
|
|
|
2172
3014
|
async function readJsonObject(filePath, label) {
|
|
2173
3015
|
let raw;
|
|
2174
3016
|
try {
|
|
2175
|
-
raw = await
|
|
3017
|
+
raw = await fs7.readFile(filePath, "utf8");
|
|
2176
3018
|
} catch (error) {
|
|
2177
3019
|
const message = error instanceof Error ? error.message : String(error);
|
|
2178
3020
|
throw new Error(`Failed to read ${label}: ${message}`);
|
|
@@ -2205,7 +3047,7 @@ async function loadConfigFromNearestPackageJson(startDirectory) {
|
|
|
2205
3047
|
const packageJsonPath = path5.join(currentDirectory, "package.json");
|
|
2206
3048
|
if (await pathExists(packageJsonPath)) {
|
|
2207
3049
|
const raw = await readJsonObject(packageJsonPath, packageJsonPath);
|
|
2208
|
-
const packageJsonSchema =
|
|
3050
|
+
const packageJsonSchema = z7.object({
|
|
2209
3051
|
skilltestrc: skilltestConfigSchema.optional()
|
|
2210
3052
|
}).passthrough();
|
|
2211
3053
|
const parsed = packageJsonSchema.safeParse(raw);
|
|
@@ -2250,6 +3092,7 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
|
|
|
2250
3092
|
provider: cliFlags.provider ?? configFile.provider ?? DEFAULT_SKILLTEST_CONFIG.provider,
|
|
2251
3093
|
model: cliFlags.model ?? configFile.model ?? DEFAULT_SKILLTEST_CONFIG.model,
|
|
2252
3094
|
json: cliFlags.json ?? configFile.json ?? DEFAULT_SKILLTEST_CONFIG.json,
|
|
3095
|
+
concurrency: cliFlags.concurrency ?? configFile.concurrency ?? DEFAULT_SKILLTEST_CONFIG.concurrency,
|
|
2253
3096
|
lint: {
|
|
2254
3097
|
failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? DEFAULT_SKILLTEST_CONFIG.lint.failOn,
|
|
2255
3098
|
suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress
|
|
@@ -2293,6 +3136,9 @@ function extractCliConfigOverrides(command) {
|
|
|
2293
3136
|
if (command.getOptionValueSource("model") === "cli") {
|
|
2294
3137
|
overrides.model = getTypedOptionValue(command, "model");
|
|
2295
3138
|
}
|
|
3139
|
+
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check") && command.getOptionValueSource("concurrency") === "cli") {
|
|
3140
|
+
overrides.concurrency = getTypedOptionValue(command, "concurrency");
|
|
3141
|
+
}
|
|
2296
3142
|
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
|
|
2297
3143
|
overrides.trigger = {
|
|
2298
3144
|
...overrides.trigger,
|
|
@@ -2322,7 +3168,6 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
2322
3168
|
const skillDirectoryConfig = await resolveSkillDirectoryConfig(targetPath);
|
|
2323
3169
|
if (skillDirectoryConfig) {
|
|
2324
3170
|
return {
|
|
2325
|
-
configFile: skillDirectoryConfig.configFile,
|
|
2326
3171
|
...skillDirectoryConfig,
|
|
2327
3172
|
config: mergeConfigLayers(skillDirectoryConfig.configFile, cliFlags, skillDirectoryConfig.sourceDirectory)
|
|
2328
3173
|
};
|
|
@@ -2331,7 +3176,6 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
2331
3176
|
const cwdConfig = await loadConfigFromJsonFile(cwdConfigPath);
|
|
2332
3177
|
if (cwdConfig) {
|
|
2333
3178
|
return {
|
|
2334
|
-
configFile: cwdConfig.configFile,
|
|
2335
3179
|
...cwdConfig,
|
|
2336
3180
|
config: mergeConfigLayers(cwdConfig.configFile, cliFlags, cwdConfig.sourceDirectory)
|
|
2337
3181
|
};
|
|
@@ -2339,7 +3183,6 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
2339
3183
|
const packageJsonConfig = await loadConfigFromNearestPackageJson(cwd);
|
|
2340
3184
|
if (packageJsonConfig) {
|
|
2341
3185
|
return {
|
|
2342
|
-
configFile: packageJsonConfig.configFile,
|
|
2343
3186
|
...packageJsonConfig,
|
|
2344
3187
|
config: mergeConfigLayers(packageJsonConfig.configFile, cliFlags, packageJsonConfig.sourceDirectory)
|
|
2345
3188
|
};
|
|
@@ -2547,12 +3390,14 @@ function createProvider(providerName, apiKeyOverride) {
|
|
|
2547
3390
|
}
|
|
2548
3391
|
|
|
2549
3392
|
// src/commands/trigger.ts
|
|
2550
|
-
var triggerCliSchema =
|
|
2551
|
-
queries:
|
|
2552
|
-
saveQueries:
|
|
2553
|
-
seed:
|
|
2554
|
-
|
|
2555
|
-
|
|
3393
|
+
var triggerCliSchema = z8.object({
|
|
3394
|
+
queries: z8.string().optional(),
|
|
3395
|
+
saveQueries: z8.string().optional(),
|
|
3396
|
+
seed: z8.number().int().optional(),
|
|
3397
|
+
concurrency: z8.number().int().min(1).optional(),
|
|
3398
|
+
html: z8.string().optional(),
|
|
3399
|
+
verbose: z8.boolean().optional(),
|
|
3400
|
+
apiKey: z8.string().optional()
|
|
2556
3401
|
});
|
|
2557
3402
|
var DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929";
|
|
2558
3403
|
var DEFAULT_OPENAI_MODEL = "gpt-4.1-mini";
|
|
@@ -2597,6 +3442,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
2597
3442
|
queries,
|
|
2598
3443
|
numQueries: options.numQueries,
|
|
2599
3444
|
seed: options.seed,
|
|
3445
|
+
concurrency: options.concurrency,
|
|
2600
3446
|
verbose: options.verbose
|
|
2601
3447
|
});
|
|
2602
3448
|
if (options.saveQueries) {
|
|
@@ -2608,6 +3454,13 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
2608
3454
|
} else {
|
|
2609
3455
|
writeResult(renderTriggerOutputWithSeed(renderTriggerReport(result, options.color, options.verbose), result.seed), false);
|
|
2610
3456
|
}
|
|
3457
|
+
if (options.html) {
|
|
3458
|
+
const htmlResult = {
|
|
3459
|
+
...result,
|
|
3460
|
+
target: targetPath
|
|
3461
|
+
};
|
|
3462
|
+
await fs8.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
|
|
3463
|
+
}
|
|
2611
3464
|
} catch (error) {
|
|
2612
3465
|
spinner?.stop();
|
|
2613
3466
|
writeError(error, options.json);
|
|
@@ -2615,7 +3468,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
2615
3468
|
}
|
|
2616
3469
|
}
|
|
2617
3470
|
function registerTriggerCommand(program) {
|
|
2618
|
-
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
3471
|
+
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
2619
3472
|
const globalOptions = getGlobalCliOptions(command);
|
|
2620
3473
|
const config = getResolvedConfig(command);
|
|
2621
3474
|
const parsedCli = triggerCliSchema.safeParse(command.opts());
|
|
@@ -2632,6 +3485,8 @@ function registerTriggerCommand(program) {
|
|
|
2632
3485
|
numQueries: config.trigger.numQueries,
|
|
2633
3486
|
saveQueries: parsedCli.data.saveQueries,
|
|
2634
3487
|
seed: parsedCli.data.seed ?? config.trigger.seed,
|
|
3488
|
+
concurrency: config.concurrency,
|
|
3489
|
+
html: parsedCli.data.html,
|
|
2635
3490
|
verbose: Boolean(parsedCli.data.verbose),
|
|
2636
3491
|
apiKey: parsedCli.data.apiKey
|
|
2637
3492
|
});
|
|
@@ -2639,14 +3494,17 @@ function registerTriggerCommand(program) {
|
|
|
2639
3494
|
}
|
|
2640
3495
|
|
|
2641
3496
|
// src/commands/eval.ts
|
|
3497
|
+
import fs9 from "node:fs/promises";
|
|
2642
3498
|
import ora2 from "ora";
|
|
2643
|
-
import { z as
|
|
2644
|
-
var evalCliSchema =
|
|
2645
|
-
prompts:
|
|
2646
|
-
graderModel:
|
|
2647
|
-
saveResults:
|
|
2648
|
-
|
|
2649
|
-
|
|
3499
|
+
import { z as z9 } from "zod";
|
|
3500
|
+
var evalCliSchema = z9.object({
|
|
3501
|
+
prompts: z9.string().optional(),
|
|
3502
|
+
graderModel: z9.string().optional(),
|
|
3503
|
+
saveResults: z9.string().optional(),
|
|
3504
|
+
concurrency: z9.number().int().min(1).optional(),
|
|
3505
|
+
html: z9.string().optional(),
|
|
3506
|
+
verbose: z9.boolean().optional(),
|
|
3507
|
+
apiKey: z9.string().optional()
|
|
2650
3508
|
});
|
|
2651
3509
|
var DEFAULT_ANTHROPIC_MODEL2 = "claude-sonnet-4-5-20250929";
|
|
2652
3510
|
var DEFAULT_OPENAI_MODEL2 = "gpt-4.1-mini";
|
|
@@ -2686,6 +3544,7 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
2686
3544
|
model,
|
|
2687
3545
|
graderModel,
|
|
2688
3546
|
numRuns: options.numRuns,
|
|
3547
|
+
concurrency: options.concurrency,
|
|
2689
3548
|
prompts
|
|
2690
3549
|
});
|
|
2691
3550
|
if (options.saveResults) {
|
|
@@ -2697,6 +3556,13 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
2697
3556
|
} else {
|
|
2698
3557
|
writeResult(renderEvalReport(result, options.color, options.verbose), false);
|
|
2699
3558
|
}
|
|
3559
|
+
if (options.html) {
|
|
3560
|
+
const htmlResult = {
|
|
3561
|
+
...result,
|
|
3562
|
+
target: targetPath
|
|
3563
|
+
};
|
|
3564
|
+
await fs9.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
|
|
3565
|
+
}
|
|
2700
3566
|
} catch (error) {
|
|
2701
3567
|
spinner?.stop();
|
|
2702
3568
|
writeError(error, options.json);
|
|
@@ -2704,7 +3570,7 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
2704
3570
|
}
|
|
2705
3571
|
}
|
|
2706
3572
|
function registerEvalCommand(program) {
|
|
2707
|
-
program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, _commandOptions, command) => {
|
|
3573
|
+
program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai").option("--concurrency <n>", "Maximum in-flight eval prompt runs", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, _commandOptions, command) => {
|
|
2708
3574
|
const globalOptions = getGlobalCliOptions(command);
|
|
2709
3575
|
const config = getResolvedConfig(command);
|
|
2710
3576
|
const parsedCli = evalCliSchema.safeParse(command.opts());
|
|
@@ -2722,9 +3588,11 @@ function registerEvalCommand(program) {
|
|
|
2722
3588
|
graderModel: parsedCli.data.graderModel,
|
|
2723
3589
|
provider: config.provider,
|
|
2724
3590
|
saveResults: parsedCli.data.saveResults,
|
|
3591
|
+
html: parsedCli.data.html,
|
|
2725
3592
|
verbose: Boolean(parsedCli.data.verbose),
|
|
2726
3593
|
apiKey: parsedCli.data.apiKey,
|
|
2727
|
-
numRuns: config.eval.numRuns
|
|
3594
|
+
numRuns: config.eval.numRuns,
|
|
3595
|
+
concurrency: config.concurrency
|
|
2728
3596
|
},
|
|
2729
3597
|
command
|
|
2730
3598
|
);
|
|
@@ -2732,8 +3600,9 @@ function registerEvalCommand(program) {
|
|
|
2732
3600
|
}
|
|
2733
3601
|
|
|
2734
3602
|
// src/commands/check.ts
|
|
3603
|
+
import fs10 from "node:fs/promises";
|
|
2735
3604
|
import ora3 from "ora";
|
|
2736
|
-
import { z as
|
|
3605
|
+
import { z as z10 } from "zod";
|
|
2737
3606
|
|
|
2738
3607
|
// src/core/check-runner.ts
|
|
2739
3608
|
function calculateEvalAssertPassRate(result) {
|
|
@@ -2764,23 +3633,33 @@ async function runCheck(inputPath, options) {
|
|
|
2764
3633
|
evalSkippedReason = `Skipped: skill could not be parsed strictly (${message}).`;
|
|
2765
3634
|
}
|
|
2766
3635
|
if (parsedSkill) {
|
|
2767
|
-
|
|
2768
|
-
trigger = await runTriggerTest(parsedSkill, {
|
|
3636
|
+
const triggerOptions = {
|
|
2769
3637
|
provider: options.provider,
|
|
2770
3638
|
model: options.model,
|
|
2771
3639
|
queries: options.queries,
|
|
2772
3640
|
numQueries: options.numQueries,
|
|
2773
3641
|
seed: options.triggerSeed,
|
|
3642
|
+
concurrency: options.concurrency,
|
|
2774
3643
|
verbose: options.verbose
|
|
2775
|
-
}
|
|
2776
|
-
|
|
2777
|
-
evalResult = await runEval(parsedSkill, {
|
|
3644
|
+
};
|
|
3645
|
+
const evalOptions = {
|
|
2778
3646
|
provider: options.provider,
|
|
2779
3647
|
model: options.model,
|
|
2780
3648
|
graderModel: options.graderModel,
|
|
2781
3649
|
numRuns: options.evalNumRuns,
|
|
2782
|
-
prompts: options.prompts
|
|
2783
|
-
|
|
3650
|
+
prompts: options.prompts,
|
|
3651
|
+
concurrency: options.concurrency
|
|
3652
|
+
};
|
|
3653
|
+
if ((options.concurrency ?? 5) === 1) {
|
|
3654
|
+
options.onStage?.("trigger");
|
|
3655
|
+
trigger = await runTriggerTest(parsedSkill, triggerOptions);
|
|
3656
|
+
options.onStage?.("eval");
|
|
3657
|
+
evalResult = await runEval(parsedSkill, evalOptions);
|
|
3658
|
+
} else {
|
|
3659
|
+
options.onStage?.("trigger");
|
|
3660
|
+
options.onStage?.("eval");
|
|
3661
|
+
[trigger, evalResult] = await Promise.all([runTriggerTest(parsedSkill, triggerOptions), runEval(parsedSkill, evalOptions)]);
|
|
3662
|
+
}
|
|
2784
3663
|
}
|
|
2785
3664
|
}
|
|
2786
3665
|
const triggerF1 = trigger ? trigger.metrics.f1 : null;
|
|
@@ -2815,15 +3694,17 @@ async function runCheck(inputPath, options) {
|
|
|
2815
3694
|
}
|
|
2816
3695
|
|
|
2817
3696
|
// src/commands/check.ts
|
|
2818
|
-
var checkCliSchema =
|
|
2819
|
-
graderModel:
|
|
2820
|
-
apiKey:
|
|
2821
|
-
queries:
|
|
2822
|
-
seed:
|
|
2823
|
-
prompts:
|
|
2824
|
-
|
|
2825
|
-
|
|
2826
|
-
|
|
3697
|
+
var checkCliSchema = z10.object({
|
|
3698
|
+
graderModel: z10.string().optional(),
|
|
3699
|
+
apiKey: z10.string().optional(),
|
|
3700
|
+
queries: z10.string().optional(),
|
|
3701
|
+
seed: z10.number().int().optional(),
|
|
3702
|
+
prompts: z10.string().optional(),
|
|
3703
|
+
concurrency: z10.number().int().min(1).optional(),
|
|
3704
|
+
html: z10.string().optional(),
|
|
3705
|
+
saveResults: z10.string().optional(),
|
|
3706
|
+
continueOnLintFail: z10.boolean().optional(),
|
|
3707
|
+
verbose: z10.boolean().optional()
|
|
2827
3708
|
});
|
|
2828
3709
|
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
|
|
2829
3710
|
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
|
|
@@ -2882,6 +3763,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
2882
3763
|
triggerSeed: options.triggerSeed,
|
|
2883
3764
|
prompts,
|
|
2884
3765
|
evalNumRuns: options.numRuns,
|
|
3766
|
+
concurrency: options.concurrency,
|
|
2885
3767
|
minF1: options.minF1,
|
|
2886
3768
|
minAssertPassRate: options.minAssertPassRate,
|
|
2887
3769
|
continueOnLintFail: options.continueOnLintFail,
|
|
@@ -2894,10 +3776,8 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
2894
3776
|
spinner.text = "Running lint checks...";
|
|
2895
3777
|
} else if (stage === "parse") {
|
|
2896
3778
|
spinner.text = "Parsing skill for model evaluations...";
|
|
2897
|
-
} else if (stage === "trigger") {
|
|
2898
|
-
spinner.text = "Running trigger
|
|
2899
|
-
} else if (stage === "eval") {
|
|
2900
|
-
spinner.text = "Running end-to-end eval suite...";
|
|
3779
|
+
} else if (stage === "trigger" || stage === "eval") {
|
|
3780
|
+
spinner.text = "Running trigger and eval suites...";
|
|
2901
3781
|
}
|
|
2902
3782
|
}
|
|
2903
3783
|
});
|
|
@@ -2913,6 +3793,9 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
2913
3793
|
false
|
|
2914
3794
|
);
|
|
2915
3795
|
}
|
|
3796
|
+
if (options.html) {
|
|
3797
|
+
await fs10.writeFile(options.html, renderCheckHtml(result), "utf8");
|
|
3798
|
+
}
|
|
2916
3799
|
process.exitCode = result.gates.overallPassed ? 0 : 1;
|
|
2917
3800
|
} catch (error) {
|
|
2918
3801
|
spinner?.stop();
|
|
@@ -2921,7 +3804,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
2921
3804
|
}
|
|
2922
3805
|
}
|
|
2923
3806
|
function registerCheckCommand(program) {
|
|
2924
|
-
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
3807
|
+
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
2925
3808
|
const globalOptions = getGlobalCliOptions(command);
|
|
2926
3809
|
const config = getResolvedConfig(command);
|
|
2927
3810
|
const parsedCli = checkCliSchema.safeParse(command.opts());
|
|
@@ -2944,6 +3827,8 @@ function registerCheckCommand(program) {
|
|
|
2944
3827
|
minF1: config.trigger.threshold,
|
|
2945
3828
|
minAssertPassRate: config.eval.threshold,
|
|
2946
3829
|
numRuns: config.eval.numRuns,
|
|
3830
|
+
concurrency: config.concurrency,
|
|
3831
|
+
html: parsedCli.data.html,
|
|
2947
3832
|
lintFailOn: config.lint.failOn,
|
|
2948
3833
|
lintSuppress: config.lint.suppress,
|
|
2949
3834
|
triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
|
|
@@ -2961,7 +3846,7 @@ function resolveVersion() {
|
|
|
2961
3846
|
try {
|
|
2962
3847
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
2963
3848
|
const packageJsonPath = path6.resolve(path6.dirname(currentFilePath), "..", "package.json");
|
|
2964
|
-
const raw =
|
|
3849
|
+
const raw = fs11.readFileSync(packageJsonPath, "utf8");
|
|
2965
3850
|
const parsed = JSON.parse(raw);
|
|
2966
3851
|
return parsed.version ?? "0.0.0";
|
|
2967
3852
|
} catch {
|