skilltest 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +5 -3
- package/README.md +65 -1
- package/dist/index.js +1003 -139
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import
|
|
4
|
+
import fs11 from "node:fs";
|
|
5
5
|
import path6 from "node:path";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { Command } from "commander";
|
|
8
8
|
|
|
9
|
+
// src/commands/lint.ts
|
|
10
|
+
import fs6 from "node:fs/promises";
|
|
11
|
+
import { z as z6 } from "zod";
|
|
12
|
+
|
|
9
13
|
// src/core/skill-parser.ts
|
|
10
14
|
import fs from "node:fs/promises";
|
|
11
15
|
import path from "node:path";
|
|
@@ -1366,6 +1370,739 @@ async function runLinter(inputPath, options = {}) {
|
|
|
1366
1370
|
};
|
|
1367
1371
|
}
|
|
1368
1372
|
|
|
1373
|
+
// src/reporters/html.ts
|
|
1374
|
+
/**
 * Escapes a value for safe inclusion in generated HTML text or attribute values.
 *
 * Nullish input is treated as the empty string; any other value is converted
 * with `String()` before escaping.
 *
 * @param {unknown} value - Value to embed in the report markup.
 * @returns {string} The stringified value with `&`, `<`, `>`, `"` and `'`
 *   replaced by their HTML entities.
 */
function escapeHtml(value) {
  // `&` is replaced first so the ampersands introduced by the later
  // entities are not escaped a second time.
  return String(value ?? "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
|
1377
|
+
/**
 * Formats a ratio as a percentage string with one decimal place.
 *
 * @param {number} value - Ratio to format (for example 0.5 for fifty percent).
 * @returns {string} The ratio multiplied by 100, fixed to one decimal, followed by `%`.
 */
function formatPercent(value) {
  const percentage = (value * 100).toFixed(1);
  return `${percentage}%`;
}
|
|
1380
|
+
/**
 * Builds a human-readable location label from a start/end line pair.
 *
 * @param {number | undefined} startLine - First line of the range.
 * @param {number | undefined} endLine - Last line of the range.
 * @returns {string | null} `null` when `startLine` is undefined, `line N` when
 *   the end is undefined or equal to the start, otherwise `lines N-M`.
 */
function formatLineRange(startLine, endLine) {
  if (startLine === undefined) {
    return null;
  }
  const coversSingleLine = endLine === undefined || endLine === startLine;
  return coversSingleLine ? `line ${startLine}` : `lines ${startLine}-${endLine}`;
}
|
|
1389
|
+
/**
 * Maps a status keyword to the upper-case label shown inside a badge.
 *
 * @param {string} status - Status keyword (`pass`, `warn`, `fail`, or anything else).
 * @returns {string} `PASS`, `WARN` or `FAIL` for the matching keyword; `SKIP` for every other value.
 */
function badgeLabel(status) {
  switch (status) {
    case "pass":
      return "PASS";
    case "warn":
      return "WARN";
    case "fail":
      return "FAIL";
    default:
      return "SKIP";
  }
}
|
|
1401
|
+
/**
 * Renders a status badge `<span>`.
 *
 * The status keyword is used verbatim as a CSS class next to `badge`, and the
 * visible text comes from `badgeLabel(status)`.
 *
 * @param {string} status - Status keyword for the badge.
 * @returns {string} The badge markup.
 */
function renderBadge(status) {
  const label = badgeLabel(status);
  return `<span class="badge ${status}">${label}</span>`;
}
|
|
1404
|
+
/**
 * Renders a list of stat objects as a `stats-grid` container of `stat-card` elements.
 *
 * For each stat, `label` and `value` are passed through `escapeHtml`. A truthy
 * `status` adds a `status-<status>` class to the card (inserted without
 * escaping), and a truthy `note` adds an escaped `stat-note` element.
 *
 * @param {Array<{label: unknown, value: unknown, status?: string, note?: unknown}>} stats - Stats to render.
 * @returns {string} The grid markup; an empty array still yields the wrapping `<div>`.
 */
function renderStatCards(stats) {
|
|
1405
|
+
return `<div class="stats-grid">${stats.map(
|
|
1406
|
+
(stat) => `
|
|
1407
|
+
<div class="stat-card${stat.status ? ` status-${stat.status}` : ""}">
|
|
1408
|
+
<div class="stat-label">${escapeHtml(stat.label)}</div>
|
|
1409
|
+
<div class="stat-value">${escapeHtml(stat.value)}</div>
|
|
1410
|
+
${stat.note ? `<div class="stat-note">${escapeHtml(stat.note)}</div>` : ""}
|
|
1411
|
+
</div>
|
|
1412
|
+
`
|
|
1413
|
+
).join("")}</div>`;
|
|
1414
|
+
}
|
|
1415
|
+
/**
 * Renders label/value pairs as a `meta-grid` container of `meta-item` elements.
 *
 * Both `label` and `value` of every item are passed through `escapeHtml`.
 *
 * @param {Array<{label: unknown, value: unknown}>} items - Metadata entries to render.
 * @returns {string} The grid markup, or the empty string when `items` is empty.
 */
function renderMetaItems(items) {
|
|
1416
|
+
if (items.length === 0) {
|
|
1417
|
+
return "";
|
|
1418
|
+
}
|
|
1419
|
+
return `<div class="meta-grid">${items.map(
|
|
1420
|
+
(item) => `
|
|
1421
|
+
<div class="meta-item">
|
|
1422
|
+
<span class="meta-label">${escapeHtml(item.label)}</span>
|
|
1423
|
+
<span class="meta-value">${escapeHtml(item.value)}</span>
|
|
1424
|
+
</div>
|
|
1425
|
+
`
|
|
1426
|
+
).join("")}</div>`;
|
|
1427
|
+
}
|
|
1428
|
+
/**
 * Renders the report header as a `<section class="card header-card">`.
 *
 * The section contains an eyebrow line (`skilltest <commandName>`), an `<h1>`
 * with the heading, a `target:` line, and then the output of
 * `renderMetaItems(metaItems)` followed by `renderStatCards(stats)`.
 * `commandName`, `heading` and `target` are passed through `escapeHtml`.
 *
 * @param {unknown} commandName - Command name shown in the eyebrow line.
 * @param {unknown} heading - Main heading text.
 * @param {unknown} target - Target shown on the `target:` line.
 * @param {Array<object>} stats - Stat entries forwarded to `renderStatCards`.
 * @param {Array<object>} metaItems - Metadata entries forwarded to `renderMetaItems`.
 * @returns {string} The header section markup.
 */
function renderHeaderCard(commandName, heading, target, stats, metaItems) {
|
|
1429
|
+
return `
|
|
1430
|
+
<section class="card header-card">
|
|
1431
|
+
<div class="eyebrow">skilltest ${escapeHtml(commandName)}</div>
|
|
1432
|
+
<h1>${escapeHtml(heading)}</h1>
|
|
1433
|
+
<div class="target-line">target: ${escapeHtml(target)}</div>
|
|
1434
|
+
${renderMetaItems(metaItems)}
|
|
1435
|
+
${renderStatCards(stats)}
|
|
1436
|
+
</section>
|
|
1437
|
+
`;
|
|
1438
|
+
}
|
|
1439
|
+
/**
 * Wraps a body fragment in a `<section class="card">` with an `<h2>` title.
 *
 * The title is passed through `escapeHtml`; `body` is inserted verbatim, so
 * callers must supply markup that is already escaped.
 *
 * @param {unknown} title - Section heading text.
 * @param {string} body - Pre-rendered HTML placed below the heading.
 * @returns {string} The section markup.
 */
function renderSectionCard(title, body) {
|
|
1440
|
+
return `
|
|
1441
|
+
<section class="card">
|
|
1442
|
+
<h2>${escapeHtml(title)}</h2>
|
|
1443
|
+
${body}
|
|
1444
|
+
</section>
|
|
1445
|
+
`;
|
|
1446
|
+
}
|
|
1447
|
+
/**
 * Renders a single `row` element with a title, a status badge and a message.
 *
 * `title` and `message` are passed through `escapeHtml`; the badge comes from
 * `renderBadge(status)`. `details` is inserted verbatim after the message and
 * falls back to the empty string when it is null or undefined.
 *
 * @param {string} status - Status keyword forwarded to `renderBadge`.
 * @param {unknown} title - Row title text.
 * @param {unknown} message - Row body text.
 * @param {string} [details] - Optional pre-rendered HTML appended to the row.
 * @returns {string} The row markup.
 */
function renderMessageRow(status, title, message, details) {
|
|
1448
|
+
return `
|
|
1449
|
+
<div class="row">
|
|
1450
|
+
<div class="row-header">
|
|
1451
|
+
<div class="row-title">${escapeHtml(title)}</div>
|
|
1452
|
+
${renderBadge(status)}
|
|
1453
|
+
</div>
|
|
1454
|
+
<div class="row-body">${escapeHtml(message)}</div>
|
|
1455
|
+
${details ?? ""}
|
|
1456
|
+
</div>
|
|
1457
|
+
`;
|
|
1458
|
+
}
|
|
1459
|
+
/**
 * Renders a collapsible `<details class="detail-block">` element.
 *
 * The summary text is passed through `escapeHtml`; `content` is inserted
 * verbatim inside the `detail-content` wrapper, so callers must supply markup
 * that is already escaped.
 *
 * @param {unknown} summary - Text shown in the `<summary>` element.
 * @param {string} content - Pre-rendered HTML for the body of the block.
 * @returns {string} The details markup.
 */
function renderDetails(summary, content) {
|
|
1460
|
+
return `
|
|
1461
|
+
<details class="detail-block">
|
|
1462
|
+
<summary>${escapeHtml(summary)}</summary>
|
|
1463
|
+
<div class="detail-content">${content}</div>
|
|
1464
|
+
</details>
|
|
1465
|
+
`;
|
|
1466
|
+
}
|
|
1467
|
+
/**
 * Wraps text in a `<pre>` element after passing it through `escapeHtml`.
 *
 * @param {unknown} content - Raw text to display pre-formatted.
 * @returns {string} The `<pre>` markup.
 */
function renderPreBlock(content) {
  const escapedContent = escapeHtml(content);
  return `<pre>${escapedContent}</pre>`;
}
|
|
1470
|
+
/**
 * Renders label/value pairs as a `definition-list` container of `definition-item` elements.
 *
 * Both `label` and `value` of every item are passed through `escapeHtml`.
 *
 * @param {Array<{label: unknown, value: unknown}>} items - Entries to render.
 * @returns {string} The list markup; an empty array still yields the wrapping `<div>`.
 */
function renderDefinitionList(items) {
|
|
1471
|
+
return `<div class="definition-list">${items.map(
|
|
1472
|
+
(item) => `
|
|
1473
|
+
<div class="definition-item">
|
|
1474
|
+
<div class="definition-label">${escapeHtml(item.label)}</div>
|
|
1475
|
+
<div class="definition-value">${escapeHtml(item.value)}</div>
|
|
1476
|
+
</div>
|
|
1477
|
+
`
|
|
1478
|
+
).join("")}</div>`;
|
|
1479
|
+
}
|
|
1480
|
+
/**
 * Sums the number of entries in `skippedPatterns` across all issues.
 *
 * Issues without a `skippedPatterns` array contribute zero.
 *
 * @param {Array<{skippedPatterns?: Array<unknown>}>} issues - Lint issues to inspect.
 * @returns {number} Total count of skipped patterns.
 */
function countSkippedSecurityPatterns(issues) {
  let skippedTotal = 0;
  issues.forEach((issue) => {
    skippedTotal += issue.skippedPatterns?.length ?? 0;
  });
  return skippedTotal;
}
|
|
1483
|
+
function renderLintIssueRow(issue) {
|
|
1484
|
+
const lineRange = formatLineRange(issue.startLine, issue.endLine);
|
|
1485
|
+
const detailBlocks = [];
|
|
1486
|
+
if (issue.suggestion) {
|
|
1487
|
+
detailBlocks.push(renderDetails("Suggestion", `<p>${escapeHtml(issue.suggestion)}</p>`));
|
|
1488
|
+
}
|
|
1489
|
+
if (issue.skippedPatterns && issue.skippedPatterns.length > 0) {
|
|
1490
|
+
const patternItems = issue.skippedPatterns.map(
|
|
1491
|
+
(pattern) => `
|
|
1492
|
+
<div class="definition-item">
|
|
1493
|
+
<div class="definition-label">${escapeHtml(pattern.label)}</div>
|
|
1494
|
+
<div class="definition-value">${escapeHtml(
|
|
1495
|
+
`${pattern.zoneType} lines ${pattern.startLine}-${pattern.endLine}`
|
|
1496
|
+
)}</div>
|
|
1497
|
+
</div>
|
|
1498
|
+
`
|
|
1499
|
+
).join("");
|
|
1500
|
+
detailBlocks.push(renderDetails("Skipped security patterns", `<div class="definition-list">${patternItems}</div>`));
|
|
1501
|
+
}
|
|
1502
|
+
return `
|
|
1503
|
+
<div class="row">
|
|
1504
|
+
<div class="row-header">
|
|
1505
|
+
<div>
|
|
1506
|
+
<div class="row-title">${escapeHtml(issue.title)}</div>
|
|
1507
|
+
<div class="row-subtitle">${escapeHtml(issue.checkId)}</div>
|
|
1508
|
+
</div>
|
|
1509
|
+
${renderBadge(issue.status)}
|
|
1510
|
+
</div>
|
|
1511
|
+
<div class="row-body">${escapeHtml(issue.message)}</div>
|
|
1512
|
+
${renderDefinitionList(
|
|
1513
|
+
[
|
|
1514
|
+
lineRange ? { label: "Location", value: lineRange } : null,
|
|
1515
|
+
{ label: "Check ID", value: issue.checkId }
|
|
1516
|
+
].filter((item) => item !== null)
|
|
1517
|
+
)}
|
|
1518
|
+
${detailBlocks.join("")}
|
|
1519
|
+
</div>
|
|
1520
|
+
`;
|
|
1521
|
+
}
|
|
1522
|
+
/**
 * Renders every lint issue of a report as a `row-list`, followed by an
 * optional info line about skipped security patterns.
 *
 * The info paragraph is emitted only when
 * `countSkippedSecurityPatterns(report.issues)` is greater than zero.
 *
 * @param {{issues: Array<object>}} report - Lint report whose issues are rendered.
 * @returns {string} The issue list markup plus the optional info line.
 */
function renderLintIssueList(report) {
  const skippedCount = countSkippedSecurityPatterns(report.issues);
  const issueRows = report.issues.map((issue) => renderLintIssueRow(issue)).join("");
  let infoLine = "";
  if (skippedCount > 0) {
    infoLine = `<p class="info-line">Skipped security patterns in examples/comments: ${escapeHtml(skippedCount)}</p>`;
  }
  return `<div class="row-list">${issueRows}</div>${infoLine}`;
}
|
|
1528
|
+
function renderTriggerCaseRow(testCase) {
|
|
1529
|
+
const details = testCase.rawModelResponse ? renderDetails("Model response", renderPreBlock(testCase.rawModelResponse)) : "";
|
|
1530
|
+
return `
|
|
1531
|
+
<div class="row">
|
|
1532
|
+
<div class="row-header">
|
|
1533
|
+
<div>
|
|
1534
|
+
<div class="row-title">${escapeHtml(testCase.query)}</div>
|
|
1535
|
+
<div class="row-subtitle">${escapeHtml(
|
|
1536
|
+
`expected=${testCase.expected} actual=${testCase.actual} should_trigger=${String(testCase.shouldTrigger)}`
|
|
1537
|
+
)}</div>
|
|
1538
|
+
</div>
|
|
1539
|
+
${renderBadge(testCase.matched ? "pass" : "fail")}
|
|
1540
|
+
</div>
|
|
1541
|
+
${renderDefinitionList([
|
|
1542
|
+
{ label: "Expected", value: testCase.expected },
|
|
1543
|
+
{ label: "Actual", value: testCase.actual }
|
|
1544
|
+
])}
|
|
1545
|
+
${details}
|
|
1546
|
+
</div>
|
|
1547
|
+
`;
|
|
1548
|
+
}
|
|
1549
|
+
/**
 * Derives a badge status from a prompt result's assertion counts.
 *
 * @param {{totalAssertions: number, passedAssertions: number}} promptResult - Result to classify.
 * @returns {"skip" | "pass" | "fail" | "warn"} `skip` when there are no
 *   assertions, `pass` when all passed, `fail` when none passed, otherwise `warn`.
 */
function promptStatus(promptResult) {
  const { totalAssertions, passedAssertions } = promptResult;
  if (totalAssertions === 0) {
    return "skip";
  }
  if (passedAssertions === totalAssertions) {
    return "pass";
  }
  return passedAssertions === 0 ? "fail" : "warn";
}
|
|
1561
|
+
/**
 * Renders one graded assertion as a collapsible details block.
 *
 * The summary is the PASS/FAIL label followed by the assertion text; the body
 * is the assertion's evidence inside a `<pre>` block.
 *
 * @param {{passed: boolean, assertion: string, evidence: string}} assertion - Graded assertion.
 * @returns {string} The details markup produced by `renderDetails`.
 */
function renderAssertionRow(assertion) {
  const verdict = badgeLabel(assertion.passed ? "pass" : "fail");
  const summaryText = `${verdict} ${assertion.assertion}`;
  const evidenceBlock = renderPreBlock(assertion.evidence);
  return renderDetails(summaryText, evidenceBlock);
}
|
|
1567
|
+
function renderEvalPromptRow(promptResult) {
|
|
1568
|
+
const assertionDetails = promptResult.assertions.map((assertion) => renderAssertionRow(assertion)).join("");
|
|
1569
|
+
const responseDetails = renderDetails("Full model response", renderPreBlock(promptResult.response));
|
|
1570
|
+
return `
|
|
1571
|
+
<div class="row">
|
|
1572
|
+
<div class="row-header">
|
|
1573
|
+
<div>
|
|
1574
|
+
<div class="row-title">${escapeHtml(promptResult.prompt)}</div>
|
|
1575
|
+
<div class="row-subtitle">${escapeHtml(
|
|
1576
|
+
`${promptResult.passedAssertions}/${promptResult.totalAssertions} assertions passed`
|
|
1577
|
+
)}</div>
|
|
1578
|
+
</div>
|
|
1579
|
+
${renderBadge(promptStatus(promptResult))}
|
|
1580
|
+
</div>
|
|
1581
|
+
<div class="row-body">${escapeHtml(promptResult.responseSummary)}</div>
|
|
1582
|
+
${renderDefinitionList([
|
|
1583
|
+
{ label: "Passed assertions", value: String(promptResult.passedAssertions) },
|
|
1584
|
+
{ label: "Total assertions", value: String(promptResult.totalAssertions) }
|
|
1585
|
+
])}
|
|
1586
|
+
${renderDetails("Assertion evidence", assertionDetails || `<p>No assertions.</p>`)}
|
|
1587
|
+
${responseDetails}
|
|
1588
|
+
</div>
|
|
1589
|
+
`;
|
|
1590
|
+
}
|
|
1591
|
+
/**
 * Converts a tri-state gate result into a badge status.
 *
 * @param {boolean | null} value - Gate outcome; `null` means the gate did not run.
 * @returns {"skip" | "pass" | "fail"} `skip` for `null`, `pass` for truthy values, `fail` otherwise.
 */
function gateStatus(value) {
  if (value === null) {
    return "skip";
  }
  if (value) {
    return "pass";
  }
  return "fail";
}
|
|
1597
|
+
/**
 * Renders a `gate-card` element with a title, a status badge and a message.
 *
 * `title` and `message` are passed through `escapeHtml`; the badge comes from
 * `renderBadge(status)`.
 *
 * @param {unknown} title - Gate name shown as the card title.
 * @param {string} status - Status keyword forwarded to `renderBadge`.
 * @param {unknown} message - Explanatory text shown in the card body.
 * @returns {string} The gate card markup.
 */
function renderGateCard(title, status, message) {
|
|
1598
|
+
return `
|
|
1599
|
+
<div class="gate-card">
|
|
1600
|
+
<div class="row-header">
|
|
1601
|
+
<div class="row-title">${escapeHtml(title)}</div>
|
|
1602
|
+
${renderBadge(status)}
|
|
1603
|
+
</div>
|
|
1604
|
+
<div class="row-body">${escapeHtml(message)}</div>
|
|
1605
|
+
</div>
|
|
1606
|
+
`;
|
|
1607
|
+
}
|
|
1608
|
+
/**
 * Renders a `<details class="section-card" open>` element, i.e. a section
 * that is expanded by default.
 *
 * The `<summary>` holds the escaped title plus a badge from
 * `renderBadge(status)` followed by the escaped summary text. `body` is
 * inserted verbatim inside the `section-body` wrapper, so callers must supply
 * markup that is already escaped.
 *
 * @param {unknown} title - Section title.
 * @param {unknown} summary - Short summary text shown next to the badge.
 * @param {string} body - Pre-rendered HTML for the section content.
 * @param {string} status - Status keyword forwarded to `renderBadge`.
 * @returns {string} The section markup.
 */
function renderCollapsibleSection(title, summary, body, status) {
|
|
1609
|
+
return `
|
|
1610
|
+
<details class="section-card" open>
|
|
1611
|
+
<summary>
|
|
1612
|
+
<span class="section-title">${escapeHtml(title)}</span>
|
|
1613
|
+
<span class="section-summary">${renderBadge(status)} ${escapeHtml(summary)}</span>
|
|
1614
|
+
</summary>
|
|
1615
|
+
<div class="section-body">${body}</div>
|
|
1616
|
+
</details>
|
|
1617
|
+
`;
|
|
1618
|
+
}
|
|
1619
|
+
/**
 * Returns the result's `target`, or the fallback when the target is null or undefined.
 *
 * @param {{target?: unknown}} result - Object that may carry a `target` property.
 * @param {unknown} fallback - Value used when `target` is nullish.
 * @returns {unknown} `result.target` if it is neither null nor undefined, otherwise `fallback`.
 */
function resolveOptionalTarget(result, fallback) {
  const { target } = result;
  if (target === null || target === undefined) {
    return fallback;
  }
  return target;
}
|
|
1622
|
+
function renderHtmlDocument(title, body) {
|
|
1623
|
+
return `<!DOCTYPE html>
|
|
1624
|
+
<html lang="en">
|
|
1625
|
+
<head>
|
|
1626
|
+
<meta charset="utf-8">
|
|
1627
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
1628
|
+
<title>${escapeHtml(title)}</title>
|
|
1629
|
+
<style>
|
|
1630
|
+
:root {
|
|
1631
|
+
color-scheme: light;
|
|
1632
|
+
--bg: #f5f5f5;
|
|
1633
|
+
--surface: #ffffff;
|
|
1634
|
+
--surface-muted: #fafafa;
|
|
1635
|
+
--border: #d4d4d8;
|
|
1636
|
+
--text: #111827;
|
|
1637
|
+
--muted: #6b7280;
|
|
1638
|
+
--pass: #22c55e;
|
|
1639
|
+
--warn: #eab308;
|
|
1640
|
+
--fail: #ef4444;
|
|
1641
|
+
--skip: #6b7280;
|
|
1642
|
+
--shadow: 0 10px 30px rgba(15, 23, 42, 0.08);
|
|
1643
|
+
}
|
|
1644
|
+
|
|
1645
|
+
* {
|
|
1646
|
+
box-sizing: border-box;
|
|
1647
|
+
}
|
|
1648
|
+
|
|
1649
|
+
body {
|
|
1650
|
+
margin: 0;
|
|
1651
|
+
background: linear-gradient(180deg, #fafafa 0%, #f4f4f5 100%);
|
|
1652
|
+
color: var(--text);
|
|
1653
|
+
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
|
|
1654
|
+
line-height: 1.5;
|
|
1655
|
+
}
|
|
1656
|
+
|
|
1657
|
+
.container {
|
|
1658
|
+
max-width: 1120px;
|
|
1659
|
+
margin: 0 auto;
|
|
1660
|
+
padding: 24px 16px 40px;
|
|
1661
|
+
}
|
|
1662
|
+
|
|
1663
|
+
.card,
|
|
1664
|
+
.section-card {
|
|
1665
|
+
background: var(--surface);
|
|
1666
|
+
border: 1px solid var(--border);
|
|
1667
|
+
border-radius: 16px;
|
|
1668
|
+
box-shadow: var(--shadow);
|
|
1669
|
+
margin-bottom: 16px;
|
|
1670
|
+
}
|
|
1671
|
+
|
|
1672
|
+
.card {
|
|
1673
|
+
padding: 20px;
|
|
1674
|
+
}
|
|
1675
|
+
|
|
1676
|
+
.header-card h1,
|
|
1677
|
+
.card h2 {
|
|
1678
|
+
margin: 0 0 10px;
|
|
1679
|
+
font-size: 1.25rem;
|
|
1680
|
+
}
|
|
1681
|
+
|
|
1682
|
+
.eyebrow {
|
|
1683
|
+
margin-bottom: 10px;
|
|
1684
|
+
color: var(--muted);
|
|
1685
|
+
font-size: 0.78rem;
|
|
1686
|
+
letter-spacing: 0.08em;
|
|
1687
|
+
text-transform: uppercase;
|
|
1688
|
+
}
|
|
1689
|
+
|
|
1690
|
+
.target-line,
|
|
1691
|
+
.info-line {
|
|
1692
|
+
color: var(--muted);
|
|
1693
|
+
overflow-wrap: anywhere;
|
|
1694
|
+
}
|
|
1695
|
+
|
|
1696
|
+
.meta-grid,
|
|
1697
|
+
.stats-grid,
|
|
1698
|
+
.gate-grid,
|
|
1699
|
+
.definition-list {
|
|
1700
|
+
display: grid;
|
|
1701
|
+
gap: 12px;
|
|
1702
|
+
}
|
|
1703
|
+
|
|
1704
|
+
.meta-grid,
|
|
1705
|
+
.gate-grid,
|
|
1706
|
+
.definition-list {
|
|
1707
|
+
grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
|
|
1708
|
+
}
|
|
1709
|
+
|
|
1710
|
+
.stats-grid {
|
|
1711
|
+
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
|
1712
|
+
margin-top: 16px;
|
|
1713
|
+
}
|
|
1714
|
+
|
|
1715
|
+
.meta-grid {
|
|
1716
|
+
margin-top: 14px;
|
|
1717
|
+
}
|
|
1718
|
+
|
|
1719
|
+
.meta-item,
|
|
1720
|
+
.definition-item,
|
|
1721
|
+
.stat-card,
|
|
1722
|
+
.gate-card {
|
|
1723
|
+
background: var(--surface-muted);
|
|
1724
|
+
border: 1px solid var(--border);
|
|
1725
|
+
border-radius: 12px;
|
|
1726
|
+
padding: 12px;
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1729
|
+
.meta-item,
|
|
1730
|
+
.definition-item {
|
|
1731
|
+
display: flex;
|
|
1732
|
+
justify-content: space-between;
|
|
1733
|
+
gap: 12px;
|
|
1734
|
+
}
|
|
1735
|
+
|
|
1736
|
+
.meta-label,
|
|
1737
|
+
.definition-label,
|
|
1738
|
+
.stat-label {
|
|
1739
|
+
color: var(--muted);
|
|
1740
|
+
font-size: 0.82rem;
|
|
1741
|
+
}
|
|
1742
|
+
|
|
1743
|
+
.meta-value,
|
|
1744
|
+
.definition-value {
|
|
1745
|
+
text-align: right;
|
|
1746
|
+
overflow-wrap: anywhere;
|
|
1747
|
+
}
|
|
1748
|
+
|
|
1749
|
+
.stat-value {
|
|
1750
|
+
margin-top: 4px;
|
|
1751
|
+
font-size: 1.3rem;
|
|
1752
|
+
font-weight: 700;
|
|
1753
|
+
}
|
|
1754
|
+
|
|
1755
|
+
.stat-note {
|
|
1756
|
+
margin-top: 6px;
|
|
1757
|
+
color: var(--muted);
|
|
1758
|
+
font-size: 0.82rem;
|
|
1759
|
+
}
|
|
1760
|
+
|
|
1761
|
+
.status-pass {
|
|
1762
|
+
border-color: rgba(34, 197, 94, 0.35);
|
|
1763
|
+
}
|
|
1764
|
+
|
|
1765
|
+
.status-warn {
|
|
1766
|
+
border-color: rgba(234, 179, 8, 0.35);
|
|
1767
|
+
}
|
|
1768
|
+
|
|
1769
|
+
.status-fail {
|
|
1770
|
+
border-color: rgba(239, 68, 68, 0.35);
|
|
1771
|
+
}
|
|
1772
|
+
|
|
1773
|
+
.status-skip {
|
|
1774
|
+
border-color: rgba(107, 114, 128, 0.35);
|
|
1775
|
+
}
|
|
1776
|
+
|
|
1777
|
+
.row-list {
|
|
1778
|
+
display: grid;
|
|
1779
|
+
gap: 12px;
|
|
1780
|
+
}
|
|
1781
|
+
|
|
1782
|
+
.row {
|
|
1783
|
+
border: 1px solid var(--border);
|
|
1784
|
+
border-radius: 12px;
|
|
1785
|
+
padding: 14px;
|
|
1786
|
+
background: var(--surface-muted);
|
|
1787
|
+
}
|
|
1788
|
+
|
|
1789
|
+
.row-header {
|
|
1790
|
+
display: flex;
|
|
1791
|
+
justify-content: space-between;
|
|
1792
|
+
align-items: flex-start;
|
|
1793
|
+
gap: 12px;
|
|
1794
|
+
}
|
|
1795
|
+
|
|
1796
|
+
.row-title {
|
|
1797
|
+
font-weight: 700;
|
|
1798
|
+
overflow-wrap: anywhere;
|
|
1799
|
+
}
|
|
1800
|
+
|
|
1801
|
+
.row-subtitle {
|
|
1802
|
+
margin-top: 4px;
|
|
1803
|
+
color: var(--muted);
|
|
1804
|
+
font-size: 0.84rem;
|
|
1805
|
+
overflow-wrap: anywhere;
|
|
1806
|
+
}
|
|
1807
|
+
|
|
1808
|
+
.row-body {
|
|
1809
|
+
margin-top: 10px;
|
|
1810
|
+
overflow-wrap: anywhere;
|
|
1811
|
+
}
|
|
1812
|
+
|
|
1813
|
+
.badge {
|
|
1814
|
+
display: inline-flex;
|
|
1815
|
+
align-items: center;
|
|
1816
|
+
justify-content: center;
|
|
1817
|
+
min-width: 58px;
|
|
1818
|
+
padding: 3px 10px;
|
|
1819
|
+
border-radius: 999px;
|
|
1820
|
+
border: 1px solid currentColor;
|
|
1821
|
+
font-size: 0.76rem;
|
|
1822
|
+
font-weight: 700;
|
|
1823
|
+
letter-spacing: 0.04em;
|
|
1824
|
+
white-space: nowrap;
|
|
1825
|
+
}
|
|
1826
|
+
|
|
1827
|
+
.badge.pass {
|
|
1828
|
+
color: #15803d;
|
|
1829
|
+
background: rgba(34, 197, 94, 0.14);
|
|
1830
|
+
}
|
|
1831
|
+
|
|
1832
|
+
.badge.warn {
|
|
1833
|
+
color: #a16207;
|
|
1834
|
+
background: rgba(234, 179, 8, 0.18);
|
|
1835
|
+
}
|
|
1836
|
+
|
|
1837
|
+
.badge.fail {
|
|
1838
|
+
color: #b91c1c;
|
|
1839
|
+
background: rgba(239, 68, 68, 0.14);
|
|
1840
|
+
}
|
|
1841
|
+
|
|
1842
|
+
.badge.skip {
|
|
1843
|
+
color: #4b5563;
|
|
1844
|
+
background: rgba(107, 114, 128, 0.14);
|
|
1845
|
+
}
|
|
1846
|
+
|
|
1847
|
+
details {
|
|
1848
|
+
margin-top: 10px;
|
|
1849
|
+
}
|
|
1850
|
+
|
|
1851
|
+
details summary {
|
|
1852
|
+
cursor: pointer;
|
|
1853
|
+
color: var(--muted);
|
|
1854
|
+
}
|
|
1855
|
+
|
|
1856
|
+
.detail-block {
|
|
1857
|
+
border-top: 1px dashed var(--border);
|
|
1858
|
+
padding-top: 10px;
|
|
1859
|
+
}
|
|
1860
|
+
|
|
1861
|
+
.detail-content p {
|
|
1862
|
+
margin: 0;
|
|
1863
|
+
}
|
|
1864
|
+
|
|
1865
|
+
.section-card summary {
|
|
1866
|
+
display: flex;
|
|
1867
|
+
justify-content: space-between;
|
|
1868
|
+
align-items: center;
|
|
1869
|
+
gap: 12px;
|
|
1870
|
+
padding: 18px 20px;
|
|
1871
|
+
list-style: none;
|
|
1872
|
+
}
|
|
1873
|
+
|
|
1874
|
+
.section-card summary::-webkit-details-marker {
|
|
1875
|
+
display: none;
|
|
1876
|
+
}
|
|
1877
|
+
|
|
1878
|
+
.section-title {
|
|
1879
|
+
font-size: 1rem;
|
|
1880
|
+
font-weight: 700;
|
|
1881
|
+
color: var(--text);
|
|
1882
|
+
}
|
|
1883
|
+
|
|
1884
|
+
.section-summary {
|
|
1885
|
+
display: inline-flex;
|
|
1886
|
+
align-items: center;
|
|
1887
|
+
gap: 8px;
|
|
1888
|
+
color: var(--muted);
|
|
1889
|
+
text-align: right;
|
|
1890
|
+
}
|
|
1891
|
+
|
|
1892
|
+
.section-body {
|
|
1893
|
+
padding: 0 20px 20px;
|
|
1894
|
+
}
|
|
1895
|
+
|
|
1896
|
+
.gate-grid {
|
|
1897
|
+
margin-top: 12px;
|
|
1898
|
+
}
|
|
1899
|
+
|
|
1900
|
+
pre {
|
|
1901
|
+
margin: 0;
|
|
1902
|
+
padding: 12px;
|
|
1903
|
+
background: #f8fafc;
|
|
1904
|
+
border: 1px solid var(--border);
|
|
1905
|
+
border-radius: 10px;
|
|
1906
|
+
white-space: pre-wrap;
|
|
1907
|
+
word-break: break-word;
|
|
1908
|
+
overflow-wrap: anywhere;
|
|
1909
|
+
}
|
|
1910
|
+
|
|
1911
|
+
ul {
|
|
1912
|
+
margin: 0;
|
|
1913
|
+
padding-left: 20px;
|
|
1914
|
+
}
|
|
1915
|
+
|
|
1916
|
+
@media (max-width: 720px) {
|
|
1917
|
+
.container {
|
|
1918
|
+
padding: 16px 12px 28px;
|
|
1919
|
+
}
|
|
1920
|
+
|
|
1921
|
+
.row-header,
|
|
1922
|
+
.section-card summary,
|
|
1923
|
+
.meta-item,
|
|
1924
|
+
.definition-item {
|
|
1925
|
+
flex-direction: column;
|
|
1926
|
+
align-items: flex-start;
|
|
1927
|
+
}
|
|
1928
|
+
|
|
1929
|
+
.meta-value,
|
|
1930
|
+
.definition-value,
|
|
1931
|
+
.section-summary {
|
|
1932
|
+
text-align: left;
|
|
1933
|
+
}
|
|
1934
|
+
}
|
|
1935
|
+
</style>
|
|
1936
|
+
</head>
|
|
1937
|
+
<body>
|
|
1938
|
+
<main class="container">
|
|
1939
|
+
${body}
|
|
1940
|
+
</main>
|
|
1941
|
+
</body>
|
|
1942
|
+
</html>`;
|
|
1943
|
+
}
|
|
1944
|
+
/**
 * Renders a lint report as a complete HTML document.
 *
 * The document consists of a header card (pass rate, warning, failure and
 * check counts, plus the target as metadata) followed by a "Lint Issues"
 * section. The pass rate is 0 when the report has no checks.
 *
 * @param {{target: string, summary: {total: number, passed: number, warnings: number, failures: number}, issues: Array<object>}} report - Lint report to render.
 * @returns {string} The HTML document produced by `renderHtmlDocument`.
 */
function renderLintHtml(report) {
  const { summary, target } = report;
  const passRate = summary.total === 0 ? 0 : summary.passed / summary.total;
  const statCards = [
    { label: "Pass rate", value: formatPercent(passRate), note: `${summary.passed}/${summary.total} passed` },
    { label: "Warnings", value: String(summary.warnings), status: summary.warnings > 0 ? "warn" : "pass" },
    { label: "Failures", value: String(summary.failures), status: summary.failures > 0 ? "fail" : "pass" },
    { label: "Checks", value: String(summary.total) }
  ];
  const metaItems = [{ label: "Target", value: target }];
  const headerCard = renderHeaderCard("lint", "Static Analysis Report", target, statCards, metaItems);
  const issuesCard = renderSectionCard("Lint Issues", renderLintIssueList(report));
  return renderHtmlDocument(`skilltest lint - ${target}`, [headerCard, issuesCard].join(""));
}
|
|
1963
|
+
/**
 * Renders a trigger-test result as a complete HTML document.
 *
 * The document consists of a header card (match rate, precision, recall, F1
 * and run metadata), a "Trigger Cases" section with one row per case, and a
 * "Suggestions" list. The match rate is 0 when there are no cases, and the F1
 * stat is marked `pass` at 0.8 or above and `warn` below that.
 *
 * @param {object} result - Trigger result; reads `skillName`, `target`, `cases`,
 *   `metrics`, `provider`, `model`, `seed`, `queries` and `suggestions`.
 * @returns {string} The HTML document produced by `renderHtmlDocument`.
 */
function renderTriggerHtml(result) {
  const { cases, metrics, skillName } = result;
  const target = resolveOptionalTarget(result, skillName);
  const matchedCount = cases.filter((testCase) => testCase.matched).length;
  const matchRate = cases.length === 0 ? 0 : matchedCount / cases.length;
  const statCards = [
    { label: "Match rate", value: formatPercent(matchRate), note: `${matchedCount}/${cases.length} matched` },
    { label: "Precision", value: formatPercent(metrics.precision) },
    { label: "Recall", value: formatPercent(metrics.recall) },
    { label: "F1", value: formatPercent(metrics.f1), status: metrics.f1 >= 0.8 ? "pass" : "warn" }
  ];
  const metaItems = [
    { label: "Provider", value: result.provider },
    { label: "Model", value: result.model },
    { label: "Seed", value: result.seed !== undefined ? String(result.seed) : "none" },
    { label: "Queries", value: String(result.queries.length) }
  ];
  const headerCard = renderHeaderCard("trigger", skillName, target, statCards, metaItems);
  const caseRows = cases.map((testCase) => renderTriggerCaseRow(testCase)).join("");
  const casesCard = renderSectionCard("Trigger Cases", `<div class="row-list">${caseRows}</div>`);
  const suggestionItems = result.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("");
  const suggestionsCard = renderSectionCard("Suggestions", `<ul>${suggestionItems}</ul>`);
  return renderHtmlDocument(`skilltest trigger - ${skillName}`, [headerCard, casesCard, suggestionsCard].join(""));
}
|
|
1994
|
+
/**
 * Renders an eval result as a complete HTML document.
 *
 * The document consists of a header card (assertion pass rate, prompt count,
 * model and grader, plus run metadata) and an "Eval Prompts" section with one
 * row per prompt result. The pass rate is 0 when there are no assertions.
 *
 * @param {object} result - Eval result; reads `skillName`, `target`, `summary`,
 *   `model`, `graderModel`, `provider`, `prompts` and `results`.
 * @returns {string} The HTML document produced by `renderHtmlDocument`.
 */
function renderEvalHtml(result) {
  const { skillName, summary } = result;
  const target = resolveOptionalTarget(result, skillName);
  const passRate = summary.totalAssertions === 0 ? 0 : summary.passedAssertions / summary.totalAssertions;
  const statCards = [
    {
      label: "Assertion pass rate",
      value: formatPercent(passRate),
      note: `${summary.passedAssertions}/${summary.totalAssertions} passed`
    },
    { label: "Prompts", value: String(summary.totalPrompts) },
    { label: "Model", value: result.model },
    { label: "Grader", value: result.graderModel }
  ];
  const metaItems = [
    { label: "Provider", value: result.provider },
    { label: "Execution model", value: result.model },
    { label: "Grader model", value: result.graderModel },
    { label: "Prompts", value: String(result.prompts.length) }
  ];
  const headerCard = renderHeaderCard("eval", skillName, target, statCards, metaItems);
  const promptRows = result.results.map((promptResult) => renderEvalPromptRow(promptResult)).join("");
  const promptsCard = renderSectionCard("Eval Prompts", `<div class="row-list">${promptRows}</div>`);
  return renderHtmlDocument(`skilltest eval - ${skillName}`, [headerCard, promptsCard].join(""));
}
|
|
2024
|
+
function renderCheckHtml(result) {
|
|
2025
|
+
const skillName = result.trigger?.skillName ?? result.eval?.skillName ?? result.target;
|
|
2026
|
+
const triggerBody = result.trigger ? `<div class="row-list">${result.trigger.cases.map((testCase) => renderTriggerCaseRow(testCase)).join("")}</div>
|
|
2027
|
+
<div class="card" style="margin-top: 16px;">
|
|
2028
|
+
<h2>Trigger Suggestions</h2>
|
|
2029
|
+
<ul>${result.trigger.suggestions.map((suggestion) => `<li>${escapeHtml(suggestion)}</li>`).join("")}</ul>
|
|
2030
|
+
</div>` : renderMessageRow("skip", "Trigger skipped", result.triggerSkippedReason ?? "Skipped.");
|
|
2031
|
+
const evalBody = result.eval ? `<div class="row-list">${result.eval.results.map((promptResult) => renderEvalPromptRow(promptResult)).join("")}</div>` : renderMessageRow("skip", "Eval skipped", result.evalSkippedReason ?? "Skipped.");
|
|
2032
|
+
const lintStatus = result.gates.lintPassed ? "pass" : "fail";
|
|
2033
|
+
const triggerStatus = gateStatus(result.gates.triggerPassed);
|
|
2034
|
+
const evalStatus = gateStatus(result.gates.evalPassed);
|
|
2035
|
+
const overallStatus = result.gates.overallPassed ? "pass" : "fail";
|
|
2036
|
+
const header = renderHeaderCard(
|
|
2037
|
+
"check",
|
|
2038
|
+
skillName,
|
|
2039
|
+
result.target,
|
|
2040
|
+
[
|
|
2041
|
+
{ label: "Overall gate", value: badgeLabel(overallStatus), status: overallStatus },
|
|
2042
|
+
{
|
|
2043
|
+
label: "Trigger F1",
|
|
2044
|
+
value: result.gates.triggerF1 !== null ? formatPercent(result.gates.triggerF1) : "skipped",
|
|
2045
|
+
status: triggerStatus
|
|
2046
|
+
},
|
|
2047
|
+
{
|
|
2048
|
+
label: "Eval pass rate",
|
|
2049
|
+
value: result.gates.evalAssertPassRate !== null ? formatPercent(result.gates.evalAssertPassRate) : "skipped",
|
|
2050
|
+
status: evalStatus
|
|
2051
|
+
},
|
|
2052
|
+
{
|
|
2053
|
+
label: "Lint result",
|
|
2054
|
+
value: `${result.lint.summary.failures} fail / ${result.lint.summary.warnings} warn`,
|
|
2055
|
+
status: lintStatus
|
|
2056
|
+
}
|
|
2057
|
+
],
|
|
2058
|
+
[
|
|
2059
|
+
{ label: "Provider", value: result.provider },
|
|
2060
|
+
{ label: "Model", value: result.model },
|
|
2061
|
+
{ label: "Grader model", value: result.graderModel },
|
|
2062
|
+
{
|
|
2063
|
+
label: "Thresholds",
|
|
2064
|
+
value: `min-f1=${result.thresholds.minF1.toFixed(2)} min-assert-pass-rate=${result.thresholds.minAssertPassRate.toFixed(2)}`
|
|
2065
|
+
}
|
|
2066
|
+
]
|
|
2067
|
+
);
|
|
2068
|
+
const lintSection = renderCollapsibleSection(
|
|
2069
|
+
"Lint",
|
|
2070
|
+
`${result.lint.summary.passed}/${result.lint.summary.total} passed, ${result.lint.summary.warnings} warnings, ${result.lint.summary.failures} failures`,
|
|
2071
|
+
renderLintIssueList(result.lint),
|
|
2072
|
+
lintStatus
|
|
2073
|
+
);
|
|
2074
|
+
const triggerSection = renderCollapsibleSection(
|
|
2075
|
+
"Trigger",
|
|
2076
|
+
result.trigger ? `f1=${formatPercent(result.trigger.metrics.f1)} precision=${formatPercent(result.trigger.metrics.precision)} recall=${formatPercent(result.trigger.metrics.recall)}` : result.triggerSkippedReason ?? "Skipped.",
|
|
2077
|
+
triggerBody,
|
|
2078
|
+
triggerStatus
|
|
2079
|
+
);
|
|
2080
|
+
const evalSection = renderCollapsibleSection(
|
|
2081
|
+
"Eval",
|
|
2082
|
+
result.eval ? `assertion pass rate=${formatPercent(result.gates.evalAssertPassRate ?? 0)} (${result.eval.summary.passedAssertions}/${result.eval.summary.totalAssertions})` : result.evalSkippedReason ?? "Skipped.",
|
|
2083
|
+
evalBody,
|
|
2084
|
+
evalStatus
|
|
2085
|
+
);
|
|
2086
|
+
const qualityGate = renderSectionCard(
|
|
2087
|
+
"Quality Gate",
|
|
2088
|
+
`<div class="gate-grid">
|
|
2089
|
+
${renderGateCard("Lint gate", lintStatus, result.gates.lintPassed ? "Lint passed." : "Lint failed.")}
|
|
2090
|
+
${renderGateCard(
|
|
2091
|
+
"Trigger gate",
|
|
2092
|
+
triggerStatus,
|
|
2093
|
+
result.gates.triggerPassed === null ? result.triggerSkippedReason ?? "Skipped." : `required ${result.thresholds.minF1.toFixed(2)}, actual ${result.gates.triggerF1?.toFixed(2) ?? "n/a"}`
|
|
2094
|
+
)}
|
|
2095
|
+
${renderGateCard(
|
|
2096
|
+
"Eval gate",
|
|
2097
|
+
evalStatus,
|
|
2098
|
+
result.gates.evalPassed === null ? result.evalSkippedReason ?? "Skipped." : `required ${result.thresholds.minAssertPassRate.toFixed(2)}, actual ${result.gates.evalAssertPassRate?.toFixed(2) ?? "n/a"}`
|
|
2099
|
+
)}
|
|
2100
|
+
${renderGateCard("Overall", overallStatus, result.gates.overallPassed ? "All quality gates passed." : "One or more gates failed.")}
|
|
2101
|
+
</div>`
|
|
2102
|
+
);
|
|
2103
|
+
return renderHtmlDocument(`skilltest check - ${skillName}`, [header, lintSection, triggerSection, evalSection, qualityGate].join(""));
|
|
2104
|
+
}
|
|
2105
|
+
|
|
1369
2106
|
// src/reporters/terminal.ts
|
|
1370
2107
|
import { Chalk } from "chalk";
|
|
1371
2108
|
function getChalkInstance(enableColor) {
|
|
@@ -1378,7 +2115,7 @@ function renderIssueLine(issue, c) {
|
|
|
1378
2115
|
return ` ${label} ${issue.title}
|
|
1379
2116
|
${issue.message}${detail}`;
|
|
1380
2117
|
}
|
|
1381
|
-
function
|
|
2118
|
+
function countSkippedSecurityPatterns2(issues) {
|
|
1382
2119
|
return issues.reduce((total, issue) => {
|
|
1383
2120
|
if (!issue.checkId.startsWith("security:")) {
|
|
1384
2121
|
return total;
|
|
@@ -1398,13 +2135,13 @@ function renderLintReport(report, enableColor) {
|
|
|
1398
2135
|
`\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518`
|
|
1399
2136
|
];
|
|
1400
2137
|
const renderedIssues = report.issues.map((issue) => renderIssueLine(issue, c)).join("\n");
|
|
1401
|
-
const skippedSecurityPatterns =
|
|
2138
|
+
const skippedSecurityPatterns = countSkippedSecurityPatterns2(report.issues);
|
|
1402
2139
|
const infoLine = skippedSecurityPatterns > 0 ? `
|
|
1403
2140
|
${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)` : "";
|
|
1404
2141
|
return `${headerLines.join("\n")}
|
|
1405
2142
|
${renderedIssues}${infoLine}`;
|
|
1406
2143
|
}
|
|
1407
|
-
function
|
|
2144
|
+
function formatPercent2(value) {
|
|
1408
2145
|
return `${(value * 100).toFixed(1)}%`;
|
|
1409
2146
|
}
|
|
1410
2147
|
function renderTriggerReport(result, enableColor, verbose) {
|
|
@@ -1416,7 +2153,7 @@ function renderTriggerReport(result, enableColor, verbose) {
|
|
|
1416
2153
|
lines.push(`\u2502 skill: ${result.skillName}`);
|
|
1417
2154
|
lines.push(`\u2502 provider/model: ${result.provider}/${result.model}`);
|
|
1418
2155
|
lines.push(
|
|
1419
|
-
`\u2502 precision: ${
|
|
2156
|
+
`\u2502 precision: ${formatPercent2(result.metrics.precision)} recall: ${formatPercent2(result.metrics.recall)} f1: ${formatPercent2(result.metrics.f1)}`
|
|
1420
2157
|
);
|
|
1421
2158
|
lines.push(
|
|
1422
2159
|
`\u2502 TP ${result.metrics.truePositives} TN ${result.metrics.trueNegatives} FP ${result.metrics.falsePositives} FN ${result.metrics.falseNegatives}`
|
|
@@ -1490,7 +2227,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1490
2227
|
for (const issue of lintIssues) {
|
|
1491
2228
|
lines.push(renderIssueLine(issue, c));
|
|
1492
2229
|
}
|
|
1493
|
-
const skippedSecurityPatterns =
|
|
2230
|
+
const skippedSecurityPatterns = countSkippedSecurityPatterns2(result.lint.issues);
|
|
1494
2231
|
if (skippedSecurityPatterns > 0) {
|
|
1495
2232
|
lines.push(` ${c.cyan("\u2139")} ${skippedSecurityPatterns} security pattern(s) found in code examples/comments (not flagged)`);
|
|
1496
2233
|
}
|
|
@@ -1498,7 +2235,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1498
2235
|
lines.push("Trigger");
|
|
1499
2236
|
if (result.trigger) {
|
|
1500
2237
|
lines.push(
|
|
1501
|
-
`- ${triggerGate} f1=${
|
|
2238
|
+
`- ${triggerGate} f1=${formatPercent2(result.trigger.metrics.f1)} (precision=${formatPercent2(result.trigger.metrics.precision)} recall=${formatPercent2(result.trigger.metrics.recall)})`
|
|
1502
2239
|
);
|
|
1503
2240
|
lines.push(
|
|
1504
2241
|
` TP ${result.trigger.metrics.truePositives} TN ${result.trigger.metrics.trueNegatives} FP ${result.trigger.metrics.falsePositives} FN ${result.trigger.metrics.falseNegatives}`
|
|
@@ -1517,7 +2254,7 @@ function renderCheckReport(result, enableColor, verbose) {
|
|
|
1517
2254
|
if (result.eval) {
|
|
1518
2255
|
const passRate = result.gates.evalAssertPassRate ?? 0;
|
|
1519
2256
|
lines.push(
|
|
1520
|
-
`- ${evalGate} assertion pass rate=${
|
|
2257
|
+
`- ${evalGate} assertion pass rate=${formatPercent2(passRate)} (${result.eval.summary.passedAssertions}/${result.eval.summary.totalAssertions})`
|
|
1521
2258
|
);
|
|
1522
2259
|
for (const promptResult of result.eval.results) {
|
|
1523
2260
|
const failedAssertions = promptResult.assertions.filter((assertion) => !assertion.passed);
|
|
@@ -1609,6 +2346,58 @@ async function gradeResponse(options) {
|
|
|
1609
2346
|
return parsed.data.assertions;
|
|
1610
2347
|
}
|
|
1611
2348
|
|
|
2349
|
+
// src/utils/concurrency.ts
|
|
2350
|
+
async function pMap(items, fn, concurrency) {
|
|
2351
|
+
if (!Number.isInteger(concurrency) || concurrency < 1) {
|
|
2352
|
+
throw new Error("pMap concurrency must be an integer greater than or equal to 1.");
|
|
2353
|
+
}
|
|
2354
|
+
if (items.length === 0) {
|
|
2355
|
+
return [];
|
|
2356
|
+
}
|
|
2357
|
+
const results = new Array(items.length);
|
|
2358
|
+
return new Promise((resolve, reject) => {
|
|
2359
|
+
let nextIndex = 0;
|
|
2360
|
+
let completed = 0;
|
|
2361
|
+
let rejected = false;
|
|
2362
|
+
const launchNext = () => {
|
|
2363
|
+
if (rejected) {
|
|
2364
|
+
return;
|
|
2365
|
+
}
|
|
2366
|
+
if (completed === items.length) {
|
|
2367
|
+
resolve(results);
|
|
2368
|
+
return;
|
|
2369
|
+
}
|
|
2370
|
+
if (nextIndex >= items.length) {
|
|
2371
|
+
return;
|
|
2372
|
+
}
|
|
2373
|
+
const currentIndex = nextIndex;
|
|
2374
|
+
nextIndex += 1;
|
|
2375
|
+
Promise.resolve().then(() => fn(items[currentIndex], currentIndex)).then((result) => {
|
|
2376
|
+
if (rejected) {
|
|
2377
|
+
return;
|
|
2378
|
+
}
|
|
2379
|
+
results[currentIndex] = result;
|
|
2380
|
+
completed += 1;
|
|
2381
|
+
if (completed === items.length) {
|
|
2382
|
+
resolve(results);
|
|
2383
|
+
return;
|
|
2384
|
+
}
|
|
2385
|
+
launchNext();
|
|
2386
|
+
}).catch((error) => {
|
|
2387
|
+
if (rejected) {
|
|
2388
|
+
return;
|
|
2389
|
+
}
|
|
2390
|
+
rejected = true;
|
|
2391
|
+
reject(error);
|
|
2392
|
+
});
|
|
2393
|
+
};
|
|
2394
|
+
const initialWorkers = Math.min(concurrency, items.length);
|
|
2395
|
+
for (let workerIndex = 0; workerIndex < initialWorkers; workerIndex += 1) {
|
|
2396
|
+
launchNext();
|
|
2397
|
+
}
|
|
2398
|
+
});
|
|
2399
|
+
}
|
|
2400
|
+
|
|
1612
2401
|
// src/core/eval-runner.ts
|
|
1613
2402
|
var evalPromptSchema = z3.object({
|
|
1614
2403
|
prompt: z3.string().min(1),
|
|
@@ -1655,34 +2444,37 @@ async function generatePrompts(skill, provider, model, count) {
|
|
|
1655
2444
|
}
|
|
1656
2445
|
async function runEval(skill, options) {
|
|
1657
2446
|
const prompts = options.prompts && options.prompts.length > 0 ? evalPromptArraySchema.parse(options.prompts) : await generatePrompts(skill, options.provider, options.model, options.numRuns);
|
|
1658
|
-
const
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
2447
|
+
const systemPrompt = [
|
|
2448
|
+
"You are an AI assistant with an activated skill.",
|
|
2449
|
+
"Follow this SKILL.md content exactly where applicable.",
|
|
2450
|
+
"",
|
|
2451
|
+
skill.raw
|
|
2452
|
+
].join("\n");
|
|
2453
|
+
const results = await pMap(
|
|
2454
|
+
prompts,
|
|
2455
|
+
async (evalPrompt) => {
|
|
2456
|
+
const response = await options.provider.sendMessage(systemPrompt, evalPrompt.prompt, { model: options.model });
|
|
2457
|
+
const gradedAssertions = await gradeResponse({
|
|
2458
|
+
provider: options.provider,
|
|
2459
|
+
model: options.graderModel,
|
|
2460
|
+
skillName: skill.frontmatter.name,
|
|
2461
|
+
skillBody: skill.content,
|
|
2462
|
+
userPrompt: evalPrompt.prompt,
|
|
2463
|
+
modelResponse: response,
|
|
2464
|
+
assertions: evalPrompt.assertions
|
|
2465
|
+
});
|
|
2466
|
+
const passedAssertions2 = gradedAssertions.filter((assertion) => assertion.passed).length;
|
|
2467
|
+
return {
|
|
2468
|
+
prompt: evalPrompt.prompt,
|
|
2469
|
+
assertions: gradedAssertions,
|
|
2470
|
+
responseSummary: response.slice(0, 200),
|
|
2471
|
+
response,
|
|
2472
|
+
passedAssertions: passedAssertions2,
|
|
2473
|
+
totalAssertions: gradedAssertions.length
|
|
2474
|
+
};
|
|
2475
|
+
},
|
|
2476
|
+
options.concurrency ?? 5
|
|
2477
|
+
);
|
|
1686
2478
|
const totalAssertions = results.reduce((total, result) => total + result.totalAssertions, 0);
|
|
1687
2479
|
const passedAssertions = results.reduce((total, result) => total + result.passedAssertions, 0);
|
|
1688
2480
|
return {
|
|
@@ -1859,9 +2651,8 @@ function buildSuggestions(metrics) {
|
|
|
1859
2651
|
async function runTriggerTest(skill, options) {
|
|
1860
2652
|
const rng = createRng(options.seed);
|
|
1861
2653
|
const queries = options.queries && options.queries.length > 0 ? triggerQueryArraySchema.parse(options.queries) : await generateQueriesWithModel(skill, options.provider, options.model, options.numQueries);
|
|
1862
|
-
const results = [];
|
|
1863
2654
|
const skillName = skill.frontmatter.name;
|
|
1864
|
-
|
|
2655
|
+
const preparedQueries = queries.map((testQuery) => {
|
|
1865
2656
|
const fakeCount = 5 + Math.floor(rng() * 5);
|
|
1866
2657
|
const fakeSkills = sample(FAKE_SKILLS, fakeCount, rng);
|
|
1867
2658
|
const allSkills = shuffle([
|
|
@@ -1872,28 +2663,41 @@ async function runTriggerTest(skill, options) {
|
|
|
1872
2663
|
}
|
|
1873
2664
|
], rng);
|
|
1874
2665
|
const skillListText = allSkills.map((entry) => `- ${entry.name}: ${entry.description}`).join("\n");
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
2666
|
+
return {
|
|
2667
|
+
testQuery,
|
|
2668
|
+
fakeCount,
|
|
2669
|
+
fakeSkills,
|
|
2670
|
+
allSkills,
|
|
2671
|
+
skillListText
|
|
2672
|
+
};
|
|
2673
|
+
});
|
|
2674
|
+
const systemPrompt = [
|
|
2675
|
+
"You are selecting one skill to activate for a user query.",
|
|
2676
|
+
"Choose the single best matching skill name from the provided list, or 'none' if no skill is a good fit.",
|
|
2677
|
+
"Respond with only the skill name or 'none'."
|
|
2678
|
+
].join(" ");
|
|
2679
|
+
const results = await pMap(
|
|
2680
|
+
preparedQueries,
|
|
2681
|
+
async ({ testQuery, allSkills, skillListText }) => {
|
|
2682
|
+
const userPrompt = [`Available skills:`, skillListText, "", `User query: ${testQuery.query}`].join("\n");
|
|
2683
|
+
const rawResponse = await options.provider.sendMessage(systemPrompt, userPrompt, { model: options.model });
|
|
2684
|
+
const decision = parseDecision(
|
|
2685
|
+
rawResponse,
|
|
2686
|
+
allSkills.map((entry) => entry.name)
|
|
2687
|
+
);
|
|
2688
|
+
const expected = testQuery.should_trigger ? skillName : "none";
|
|
2689
|
+
const matched = testQuery.should_trigger ? decision === skillName : decision !== skillName;
|
|
2690
|
+
return {
|
|
2691
|
+
query: testQuery.query,
|
|
2692
|
+
shouldTrigger: testQuery.should_trigger,
|
|
2693
|
+
expected,
|
|
2694
|
+
actual: decision,
|
|
2695
|
+
matched,
|
|
2696
|
+
rawModelResponse: options.verbose ? rawResponse : void 0
|
|
2697
|
+
};
|
|
2698
|
+
},
|
|
2699
|
+
options.concurrency ?? 5
|
|
2700
|
+
);
|
|
1897
2701
|
const metrics = calculateMetrics(skillName, results);
|
|
1898
2702
|
return {
|
|
1899
2703
|
skillName,
|
|
@@ -2059,6 +2863,9 @@ function writeError(error, asJson) {
|
|
|
2059
2863
|
}
|
|
2060
2864
|
|
|
2061
2865
|
// src/commands/lint.ts
|
|
2866
|
+
var lintCliSchema = z6.object({
|
|
2867
|
+
html: z6.string().optional()
|
|
2868
|
+
});
|
|
2062
2869
|
async function handleLintCommand(targetPath, options) {
|
|
2063
2870
|
try {
|
|
2064
2871
|
const report = await runLinter(targetPath, { suppress: options.suppress });
|
|
@@ -2067,6 +2874,9 @@ async function handleLintCommand(targetPath, options) {
|
|
|
2067
2874
|
} else {
|
|
2068
2875
|
writeResult(renderLintReport(report, options.color), false);
|
|
2069
2876
|
}
|
|
2877
|
+
if (options.html) {
|
|
2878
|
+
await fs6.writeFile(options.html, renderLintHtml(report), "utf8");
|
|
2879
|
+
}
|
|
2070
2880
|
if (lintFails(report, options.failOn)) {
|
|
2071
2881
|
process.exitCode = 1;
|
|
2072
2882
|
}
|
|
@@ -2076,74 +2886,85 @@ async function handleLintCommand(targetPath, options) {
|
|
|
2076
2886
|
}
|
|
2077
2887
|
}
|
|
2078
2888
|
function registerLintCommand(program) {
|
|
2079
|
-
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").action(async (targetPath, _commandOptions, command) => {
|
|
2889
|
+
program.command("lint").description("Run static lint checks against a SKILL.md file or skill directory.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--html <path>", "Write an HTML report to the given file path").action(async (targetPath, _commandOptions, command) => {
|
|
2080
2890
|
const globalOptions = getGlobalCliOptions(command);
|
|
2081
2891
|
const config = getResolvedConfig(command);
|
|
2892
|
+
const parsedCli = lintCliSchema.safeParse(command.opts());
|
|
2893
|
+
if (!parsedCli.success) {
|
|
2894
|
+
writeError(new Error(parsedCli.error.issues[0]?.message ?? "Invalid lint options."), globalOptions.json);
|
|
2895
|
+
process.exitCode = 2;
|
|
2896
|
+
return;
|
|
2897
|
+
}
|
|
2082
2898
|
await handleLintCommand(targetPath, {
|
|
2083
2899
|
...globalOptions,
|
|
2084
2900
|
failOn: config.lint.failOn,
|
|
2085
|
-
suppress: config.lint.suppress
|
|
2901
|
+
suppress: config.lint.suppress,
|
|
2902
|
+
html: parsedCli.data.html
|
|
2086
2903
|
});
|
|
2087
2904
|
});
|
|
2088
2905
|
}
|
|
2089
2906
|
|
|
2090
2907
|
// src/commands/trigger.ts
|
|
2908
|
+
import fs8 from "node:fs/promises";
|
|
2091
2909
|
import ora from "ora";
|
|
2092
|
-
import { z as
|
|
2910
|
+
import { z as z8 } from "zod";
|
|
2093
2911
|
|
|
2094
2912
|
// src/utils/config.ts
|
|
2095
|
-
import
|
|
2913
|
+
import fs7 from "node:fs/promises";
|
|
2096
2914
|
import path5 from "node:path";
|
|
2097
|
-
import { z as
|
|
2098
|
-
var providerNameSchema =
|
|
2099
|
-
var lintFailOnSchema =
|
|
2100
|
-
var lintConfigSchema =
|
|
2915
|
+
import { z as z7 } from "zod";
|
|
2916
|
+
var providerNameSchema = z7.enum(["anthropic", "openai"]);
|
|
2917
|
+
var lintFailOnSchema = z7.enum(["error", "warn"]);
|
|
2918
|
+
var lintConfigSchema = z7.object({
|
|
2101
2919
|
failOn: lintFailOnSchema.optional(),
|
|
2102
|
-
suppress:
|
|
2920
|
+
suppress: z7.array(z7.string().min(1)).optional()
|
|
2103
2921
|
}).strict();
|
|
2104
|
-
var triggerConfigSchema =
|
|
2105
|
-
numQueries:
|
|
2106
|
-
threshold:
|
|
2107
|
-
seed:
|
|
2922
|
+
var triggerConfigSchema = z7.object({
|
|
2923
|
+
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2924
|
+
threshold: z7.number().min(0).max(1).optional(),
|
|
2925
|
+
seed: z7.number().int().optional()
|
|
2108
2926
|
}).strict().partial();
|
|
2109
|
-
var evalConfigSchema =
|
|
2110
|
-
numRuns:
|
|
2111
|
-
threshold:
|
|
2112
|
-
promptFile:
|
|
2113
|
-
assertionsFile:
|
|
2927
|
+
var evalConfigSchema = z7.object({
|
|
2928
|
+
numRuns: z7.number().int().min(1).optional(),
|
|
2929
|
+
threshold: z7.number().min(0).max(1).optional(),
|
|
2930
|
+
promptFile: z7.string().min(1).optional(),
|
|
2931
|
+
assertionsFile: z7.string().min(1).optional()
|
|
2114
2932
|
}).strict().partial();
|
|
2115
|
-
var skilltestConfigSchema =
|
|
2933
|
+
var skilltestConfigSchema = z7.object({
|
|
2116
2934
|
provider: providerNameSchema.optional(),
|
|
2117
|
-
model:
|
|
2118
|
-
json:
|
|
2935
|
+
model: z7.string().min(1).optional(),
|
|
2936
|
+
json: z7.boolean().optional(),
|
|
2937
|
+
concurrency: z7.number().int().min(1).optional(),
|
|
2119
2938
|
lint: lintConfigSchema.optional(),
|
|
2120
2939
|
trigger: triggerConfigSchema.optional(),
|
|
2121
2940
|
eval: evalConfigSchema.optional()
|
|
2122
2941
|
}).strict();
|
|
2123
|
-
var resolvedSkilltestConfigSchema =
|
|
2942
|
+
var resolvedSkilltestConfigSchema = z7.object({
|
|
2124
2943
|
provider: providerNameSchema,
|
|
2125
|
-
model:
|
|
2126
|
-
json:
|
|
2127
|
-
|
|
2944
|
+
model: z7.string().min(1),
|
|
2945
|
+
json: z7.boolean(),
|
|
2946
|
+
concurrency: z7.number().int().min(1),
|
|
2947
|
+
lint: z7.object({
|
|
2128
2948
|
failOn: lintFailOnSchema,
|
|
2129
|
-
suppress:
|
|
2949
|
+
suppress: z7.array(z7.string().min(1))
|
|
2130
2950
|
}),
|
|
2131
|
-
trigger:
|
|
2132
|
-
numQueries:
|
|
2133
|
-
threshold:
|
|
2134
|
-
seed:
|
|
2951
|
+
trigger: z7.object({
|
|
2952
|
+
numQueries: z7.number().int().min(2).refine((value) => value % 2 === 0, "trigger.numQueries must be an even number."),
|
|
2953
|
+
threshold: z7.number().min(0).max(1),
|
|
2954
|
+
seed: z7.number().int().optional()
|
|
2135
2955
|
}),
|
|
2136
|
-
eval:
|
|
2137
|
-
numRuns:
|
|
2138
|
-
threshold:
|
|
2139
|
-
promptFile:
|
|
2140
|
-
assertionsFile:
|
|
2956
|
+
eval: z7.object({
|
|
2957
|
+
numRuns: z7.number().int().min(1),
|
|
2958
|
+
threshold: z7.number().min(0).max(1),
|
|
2959
|
+
promptFile: z7.string().min(1).optional(),
|
|
2960
|
+
assertionsFile: z7.string().min(1).optional()
|
|
2141
2961
|
})
|
|
2142
2962
|
});
|
|
2143
2963
|
var DEFAULT_SKILLTEST_CONFIG = {
|
|
2144
2964
|
provider: "anthropic",
|
|
2145
2965
|
model: "claude-sonnet-4-5-20250929",
|
|
2146
2966
|
json: false,
|
|
2967
|
+
concurrency: 5,
|
|
2147
2968
|
lint: {
|
|
2148
2969
|
failOn: "error",
|
|
2149
2970
|
suppress: []
|
|
@@ -2172,7 +2993,7 @@ function buildConfigValidationError(error, sourceLabel) {
|
|
|
2172
2993
|
async function readJsonObject(filePath, label) {
|
|
2173
2994
|
let raw;
|
|
2174
2995
|
try {
|
|
2175
|
-
raw = await
|
|
2996
|
+
raw = await fs7.readFile(filePath, "utf8");
|
|
2176
2997
|
} catch (error) {
|
|
2177
2998
|
const message = error instanceof Error ? error.message : String(error);
|
|
2178
2999
|
throw new Error(`Failed to read ${label}: ${message}`);
|
|
@@ -2205,7 +3026,7 @@ async function loadConfigFromNearestPackageJson(startDirectory) {
|
|
|
2205
3026
|
const packageJsonPath = path5.join(currentDirectory, "package.json");
|
|
2206
3027
|
if (await pathExists(packageJsonPath)) {
|
|
2207
3028
|
const raw = await readJsonObject(packageJsonPath, packageJsonPath);
|
|
2208
|
-
const packageJsonSchema =
|
|
3029
|
+
const packageJsonSchema = z7.object({
|
|
2209
3030
|
skilltestrc: skilltestConfigSchema.optional()
|
|
2210
3031
|
}).passthrough();
|
|
2211
3032
|
const parsed = packageJsonSchema.safeParse(raw);
|
|
@@ -2250,6 +3071,7 @@ function mergeConfigLayers(configFile = {}, cliFlags = {}, baseDirectory = proce
|
|
|
2250
3071
|
provider: cliFlags.provider ?? configFile.provider ?? DEFAULT_SKILLTEST_CONFIG.provider,
|
|
2251
3072
|
model: cliFlags.model ?? configFile.model ?? DEFAULT_SKILLTEST_CONFIG.model,
|
|
2252
3073
|
json: cliFlags.json ?? configFile.json ?? DEFAULT_SKILLTEST_CONFIG.json,
|
|
3074
|
+
concurrency: cliFlags.concurrency ?? configFile.concurrency ?? DEFAULT_SKILLTEST_CONFIG.concurrency,
|
|
2253
3075
|
lint: {
|
|
2254
3076
|
failOn: cliFlags.lint?.failOn ?? configFile.lint?.failOn ?? DEFAULT_SKILLTEST_CONFIG.lint.failOn,
|
|
2255
3077
|
suppress: cliFlags.lint?.suppress ?? configFile.lint?.suppress ?? DEFAULT_SKILLTEST_CONFIG.lint.suppress
|
|
@@ -2293,6 +3115,9 @@ function extractCliConfigOverrides(command) {
|
|
|
2293
3115
|
if (command.getOptionValueSource("model") === "cli") {
|
|
2294
3116
|
overrides.model = getTypedOptionValue(command, "model");
|
|
2295
3117
|
}
|
|
3118
|
+
if ((command.name() === "trigger" || command.name() === "eval" || command.name() === "check") && command.getOptionValueSource("concurrency") === "cli") {
|
|
3119
|
+
overrides.concurrency = getTypedOptionValue(command, "concurrency");
|
|
3120
|
+
}
|
|
2296
3121
|
if ((command.name() === "trigger" || command.name() === "check") && command.getOptionValueSource("numQueries") === "cli") {
|
|
2297
3122
|
overrides.trigger = {
|
|
2298
3123
|
...overrides.trigger,
|
|
@@ -2322,7 +3147,6 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
2322
3147
|
const skillDirectoryConfig = await resolveSkillDirectoryConfig(targetPath);
|
|
2323
3148
|
if (skillDirectoryConfig) {
|
|
2324
3149
|
return {
|
|
2325
|
-
configFile: skillDirectoryConfig.configFile,
|
|
2326
3150
|
...skillDirectoryConfig,
|
|
2327
3151
|
config: mergeConfigLayers(skillDirectoryConfig.configFile, cliFlags, skillDirectoryConfig.sourceDirectory)
|
|
2328
3152
|
};
|
|
@@ -2331,7 +3155,6 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
2331
3155
|
const cwdConfig = await loadConfigFromJsonFile(cwdConfigPath);
|
|
2332
3156
|
if (cwdConfig) {
|
|
2333
3157
|
return {
|
|
2334
|
-
configFile: cwdConfig.configFile,
|
|
2335
3158
|
...cwdConfig,
|
|
2336
3159
|
config: mergeConfigLayers(cwdConfig.configFile, cliFlags, cwdConfig.sourceDirectory)
|
|
2337
3160
|
};
|
|
@@ -2339,7 +3162,6 @@ async function resolveConfigContext(targetPath, cliFlags) {
|
|
|
2339
3162
|
const packageJsonConfig = await loadConfigFromNearestPackageJson(cwd);
|
|
2340
3163
|
if (packageJsonConfig) {
|
|
2341
3164
|
return {
|
|
2342
|
-
configFile: packageJsonConfig.configFile,
|
|
2343
3165
|
...packageJsonConfig,
|
|
2344
3166
|
config: mergeConfigLayers(packageJsonConfig.configFile, cliFlags, packageJsonConfig.sourceDirectory)
|
|
2345
3167
|
};
|
|
@@ -2547,12 +3369,14 @@ function createProvider(providerName, apiKeyOverride) {
|
|
|
2547
3369
|
}
|
|
2548
3370
|
|
|
2549
3371
|
// src/commands/trigger.ts
|
|
2550
|
-
var triggerCliSchema =
|
|
2551
|
-
queries:
|
|
2552
|
-
saveQueries:
|
|
2553
|
-
seed:
|
|
2554
|
-
|
|
2555
|
-
|
|
3372
|
+
var triggerCliSchema = z8.object({
|
|
3373
|
+
queries: z8.string().optional(),
|
|
3374
|
+
saveQueries: z8.string().optional(),
|
|
3375
|
+
seed: z8.number().int().optional(),
|
|
3376
|
+
concurrency: z8.number().int().min(1).optional(),
|
|
3377
|
+
html: z8.string().optional(),
|
|
3378
|
+
verbose: z8.boolean().optional(),
|
|
3379
|
+
apiKey: z8.string().optional()
|
|
2556
3380
|
});
|
|
2557
3381
|
var DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5-20250929";
|
|
2558
3382
|
var DEFAULT_OPENAI_MODEL = "gpt-4.1-mini";
|
|
@@ -2597,6 +3421,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
2597
3421
|
queries,
|
|
2598
3422
|
numQueries: options.numQueries,
|
|
2599
3423
|
seed: options.seed,
|
|
3424
|
+
concurrency: options.concurrency,
|
|
2600
3425
|
verbose: options.verbose
|
|
2601
3426
|
});
|
|
2602
3427
|
if (options.saveQueries) {
|
|
@@ -2608,6 +3433,13 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
2608
3433
|
} else {
|
|
2609
3434
|
writeResult(renderTriggerOutputWithSeed(renderTriggerReport(result, options.color, options.verbose), result.seed), false);
|
|
2610
3435
|
}
|
|
3436
|
+
if (options.html) {
|
|
3437
|
+
const htmlResult = {
|
|
3438
|
+
...result,
|
|
3439
|
+
target: targetPath
|
|
3440
|
+
};
|
|
3441
|
+
await fs8.writeFile(options.html, renderTriggerHtml(htmlResult), "utf8");
|
|
3442
|
+
}
|
|
2611
3443
|
} catch (error) {
|
|
2612
3444
|
spinner?.stop();
|
|
2613
3445
|
writeError(error, options.json);
|
|
@@ -2615,7 +3447,7 @@ async function handleTriggerCommand(targetPath, options) {
|
|
|
2615
3447
|
}
|
|
2616
3448
|
}
|
|
2617
3449
|
function registerTriggerCommand(program) {
|
|
2618
|
-
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
3450
|
+
program.command("trigger").description("Evaluate whether a skill description triggers correctly.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--model <model>", "Model to use").option("--provider <provider>", "LLM provider: anthropic|openai").option("--queries <path>", "Path to custom test queries JSON").option("--num-queries <n>", "Number of auto-generated queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--concurrency <n>", "Maximum in-flight trigger requests", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-queries <path>", "Save generated queries to a JSON file").option("--api-key <key>", "API key override").option("--verbose", "Show full model decisions").action(async (targetPath, _commandOptions, command) => {
|
|
2619
3451
|
const globalOptions = getGlobalCliOptions(command);
|
|
2620
3452
|
const config = getResolvedConfig(command);
|
|
2621
3453
|
const parsedCli = triggerCliSchema.safeParse(command.opts());
|
|
@@ -2632,6 +3464,8 @@ function registerTriggerCommand(program) {
|
|
|
2632
3464
|
numQueries: config.trigger.numQueries,
|
|
2633
3465
|
saveQueries: parsedCli.data.saveQueries,
|
|
2634
3466
|
seed: parsedCli.data.seed ?? config.trigger.seed,
|
|
3467
|
+
concurrency: config.concurrency,
|
|
3468
|
+
html: parsedCli.data.html,
|
|
2635
3469
|
verbose: Boolean(parsedCli.data.verbose),
|
|
2636
3470
|
apiKey: parsedCli.data.apiKey
|
|
2637
3471
|
});
|
|
@@ -2639,14 +3473,17 @@ function registerTriggerCommand(program) {
|
|
|
2639
3473
|
}
|
|
2640
3474
|
|
|
2641
3475
|
// src/commands/eval.ts
|
|
3476
|
+
import fs9 from "node:fs/promises";
|
|
2642
3477
|
import ora2 from "ora";
|
|
2643
|
-
import { z as
|
|
2644
|
-
var evalCliSchema =
|
|
2645
|
-
prompts:
|
|
2646
|
-
graderModel:
|
|
2647
|
-
saveResults:
|
|
2648
|
-
|
|
2649
|
-
|
|
3478
|
+
import { z as z9 } from "zod";
|
|
3479
|
+
var evalCliSchema = z9.object({
|
|
3480
|
+
prompts: z9.string().optional(),
|
|
3481
|
+
graderModel: z9.string().optional(),
|
|
3482
|
+
saveResults: z9.string().optional(),
|
|
3483
|
+
concurrency: z9.number().int().min(1).optional(),
|
|
3484
|
+
html: z9.string().optional(),
|
|
3485
|
+
verbose: z9.boolean().optional(),
|
|
3486
|
+
apiKey: z9.string().optional()
|
|
2650
3487
|
});
|
|
2651
3488
|
var DEFAULT_ANTHROPIC_MODEL2 = "claude-sonnet-4-5-20250929";
|
|
2652
3489
|
var DEFAULT_OPENAI_MODEL2 = "gpt-4.1-mini";
|
|
@@ -2686,6 +3523,7 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
2686
3523
|
model,
|
|
2687
3524
|
graderModel,
|
|
2688
3525
|
numRuns: options.numRuns,
|
|
3526
|
+
concurrency: options.concurrency,
|
|
2689
3527
|
prompts
|
|
2690
3528
|
});
|
|
2691
3529
|
if (options.saveResults) {
|
|
@@ -2697,6 +3535,13 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
2697
3535
|
} else {
|
|
2698
3536
|
writeResult(renderEvalReport(result, options.color, options.verbose), false);
|
|
2699
3537
|
}
|
|
3538
|
+
if (options.html) {
|
|
3539
|
+
const htmlResult = {
|
|
3540
|
+
...result,
|
|
3541
|
+
target: targetPath
|
|
3542
|
+
};
|
|
3543
|
+
await fs9.writeFile(options.html, renderEvalHtml(htmlResult), "utf8");
|
|
3544
|
+
}
|
|
2700
3545
|
} catch (error) {
|
|
2701
3546
|
spinner?.stop();
|
|
2702
3547
|
writeError(error, options.json);
|
|
@@ -2704,7 +3549,7 @@ async function handleEvalCommand(targetPath, options, command) {
|
|
|
2704
3549
|
}
|
|
2705
3550
|
}
|
|
2706
3551
|
function registerEvalCommand(program) {
|
|
2707
|
-
program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, _commandOptions, command) => {
|
|
3552
|
+
program.command("eval").description("Run end-to-end skill execution and quality evaluation.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--prompts <path>", "Path to eval prompts JSON").option("--model <model>", "Model to execute prompts").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--provider <provider>", "LLM provider: anthropic|openai").option("--concurrency <n>", "Maximum in-flight eval prompt runs", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--save-results <path>", "Save full evaluation results to JSON").option("--api-key <key>", "API key override").option("--verbose", "Show full model responses").action(async (targetPath, _commandOptions, command) => {
|
|
2708
3553
|
const globalOptions = getGlobalCliOptions(command);
|
|
2709
3554
|
const config = getResolvedConfig(command);
|
|
2710
3555
|
const parsedCli = evalCliSchema.safeParse(command.opts());
|
|
@@ -2722,9 +3567,11 @@ function registerEvalCommand(program) {
|
|
|
2722
3567
|
graderModel: parsedCli.data.graderModel,
|
|
2723
3568
|
provider: config.provider,
|
|
2724
3569
|
saveResults: parsedCli.data.saveResults,
|
|
3570
|
+
html: parsedCli.data.html,
|
|
2725
3571
|
verbose: Boolean(parsedCli.data.verbose),
|
|
2726
3572
|
apiKey: parsedCli.data.apiKey,
|
|
2727
|
-
numRuns: config.eval.numRuns
|
|
3573
|
+
numRuns: config.eval.numRuns,
|
|
3574
|
+
concurrency: config.concurrency
|
|
2728
3575
|
},
|
|
2729
3576
|
command
|
|
2730
3577
|
);
|
|
@@ -2732,8 +3579,9 @@ function registerEvalCommand(program) {
|
|
|
2732
3579
|
}
|
|
2733
3580
|
|
|
2734
3581
|
// src/commands/check.ts
|
|
3582
|
+
import fs10 from "node:fs/promises";
|
|
2735
3583
|
import ora3 from "ora";
|
|
2736
|
-
import { z as
|
|
3584
|
+
import { z as z10 } from "zod";
|
|
2737
3585
|
|
|
2738
3586
|
// src/core/check-runner.ts
|
|
2739
3587
|
function calculateEvalAssertPassRate(result) {
|
|
@@ -2764,23 +3612,33 @@ async function runCheck(inputPath, options) {
|
|
|
2764
3612
|
evalSkippedReason = `Skipped: skill could not be parsed strictly (${message}).`;
|
|
2765
3613
|
}
|
|
2766
3614
|
if (parsedSkill) {
|
|
2767
|
-
|
|
2768
|
-
trigger = await runTriggerTest(parsedSkill, {
|
|
3615
|
+
const triggerOptions = {
|
|
2769
3616
|
provider: options.provider,
|
|
2770
3617
|
model: options.model,
|
|
2771
3618
|
queries: options.queries,
|
|
2772
3619
|
numQueries: options.numQueries,
|
|
2773
3620
|
seed: options.triggerSeed,
|
|
3621
|
+
concurrency: options.concurrency,
|
|
2774
3622
|
verbose: options.verbose
|
|
2775
|
-
}
|
|
2776
|
-
|
|
2777
|
-
evalResult = await runEval(parsedSkill, {
|
|
3623
|
+
};
|
|
3624
|
+
const evalOptions = {
|
|
2778
3625
|
provider: options.provider,
|
|
2779
3626
|
model: options.model,
|
|
2780
3627
|
graderModel: options.graderModel,
|
|
2781
3628
|
numRuns: options.evalNumRuns,
|
|
2782
|
-
prompts: options.prompts
|
|
2783
|
-
|
|
3629
|
+
prompts: options.prompts,
|
|
3630
|
+
concurrency: options.concurrency
|
|
3631
|
+
};
|
|
3632
|
+
if ((options.concurrency ?? 5) === 1) {
|
|
3633
|
+
options.onStage?.("trigger");
|
|
3634
|
+
trigger = await runTriggerTest(parsedSkill, triggerOptions);
|
|
3635
|
+
options.onStage?.("eval");
|
|
3636
|
+
evalResult = await runEval(parsedSkill, evalOptions);
|
|
3637
|
+
} else {
|
|
3638
|
+
options.onStage?.("trigger");
|
|
3639
|
+
options.onStage?.("eval");
|
|
3640
|
+
[trigger, evalResult] = await Promise.all([runTriggerTest(parsedSkill, triggerOptions), runEval(parsedSkill, evalOptions)]);
|
|
3641
|
+
}
|
|
2784
3642
|
}
|
|
2785
3643
|
}
|
|
2786
3644
|
const triggerF1 = trigger ? trigger.metrics.f1 : null;
|
|
@@ -2815,15 +3673,17 @@ async function runCheck(inputPath, options) {
|
|
|
2815
3673
|
}
|
|
2816
3674
|
|
|
2817
3675
|
// src/commands/check.ts
|
|
2818
|
-
var checkCliSchema =
|
|
2819
|
-
graderModel:
|
|
2820
|
-
apiKey:
|
|
2821
|
-
queries:
|
|
2822
|
-
seed:
|
|
2823
|
-
prompts:
|
|
2824
|
-
|
|
2825
|
-
|
|
2826
|
-
|
|
3676
|
+
var checkCliSchema = z10.object({
|
|
3677
|
+
graderModel: z10.string().optional(),
|
|
3678
|
+
apiKey: z10.string().optional(),
|
|
3679
|
+
queries: z10.string().optional(),
|
|
3680
|
+
seed: z10.number().int().optional(),
|
|
3681
|
+
prompts: z10.string().optional(),
|
|
3682
|
+
concurrency: z10.number().int().min(1).optional(),
|
|
3683
|
+
html: z10.string().optional(),
|
|
3684
|
+
saveResults: z10.string().optional(),
|
|
3685
|
+
continueOnLintFail: z10.boolean().optional(),
|
|
3686
|
+
verbose: z10.boolean().optional()
|
|
2827
3687
|
});
|
|
2828
3688
|
var DEFAULT_ANTHROPIC_MODEL3 = "claude-sonnet-4-5-20250929";
|
|
2829
3689
|
var DEFAULT_OPENAI_MODEL3 = "gpt-4.1-mini";
|
|
@@ -2882,6 +3742,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
2882
3742
|
triggerSeed: options.triggerSeed,
|
|
2883
3743
|
prompts,
|
|
2884
3744
|
evalNumRuns: options.numRuns,
|
|
3745
|
+
concurrency: options.concurrency,
|
|
2885
3746
|
minF1: options.minF1,
|
|
2886
3747
|
minAssertPassRate: options.minAssertPassRate,
|
|
2887
3748
|
continueOnLintFail: options.continueOnLintFail,
|
|
@@ -2894,10 +3755,8 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
2894
3755
|
spinner.text = "Running lint checks...";
|
|
2895
3756
|
} else if (stage === "parse") {
|
|
2896
3757
|
spinner.text = "Parsing skill for model evaluations...";
|
|
2897
|
-
} else if (stage === "trigger") {
|
|
2898
|
-
spinner.text = "Running trigger
|
|
2899
|
-
} else if (stage === "eval") {
|
|
2900
|
-
spinner.text = "Running end-to-end eval suite...";
|
|
3758
|
+
} else if (stage === "trigger" || stage === "eval") {
|
|
3759
|
+
spinner.text = "Running trigger and eval suites...";
|
|
2901
3760
|
}
|
|
2902
3761
|
}
|
|
2903
3762
|
});
|
|
@@ -2913,6 +3772,9 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
2913
3772
|
false
|
|
2914
3773
|
);
|
|
2915
3774
|
}
|
|
3775
|
+
if (options.html) {
|
|
3776
|
+
await fs10.writeFile(options.html, renderCheckHtml(result), "utf8");
|
|
3777
|
+
}
|
|
2916
3778
|
process.exitCode = result.gates.overallPassed ? 0 : 1;
|
|
2917
3779
|
} catch (error) {
|
|
2918
3780
|
spinner?.stop();
|
|
@@ -2921,7 +3783,7 @@ async function handleCheckCommand(targetPath, options, command) {
|
|
|
2921
3783
|
}
|
|
2922
3784
|
}
|
|
2923
3785
|
function registerCheckCommand(program) {
|
|
2924
|
-
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
3786
|
+
program.command("check").description("Run lint + trigger + eval with threshold-based quality gates.").argument("<path-to-skill>", "Path to SKILL.md or skill directory").option("--provider <provider>", "LLM provider: anthropic|openai").option("--model <model>", "Model for trigger/eval runs").option("--grader-model <model>", "Model used for grading (defaults to --model)").option("--api-key <key>", "API key override").option("--queries <path>", "Path to custom trigger queries JSON").option("--num-queries <n>", "Number of auto-generated trigger queries", (value) => Number.parseInt(value, 10)).option("--seed <number>", "RNG seed for reproducible results", (value) => Number.parseInt(value, 10)).option("--prompts <path>", "Path to eval prompts JSON").option("--concurrency <n>", "Maximum in-flight trigger/eval tasks", (value) => Number.parseInt(value, 10)).option("--html <path>", "Write an HTML report to the given file path").option("--min-f1 <n>", "Minimum required trigger F1 score (0-1)", (value) => Number.parseFloat(value)).option("--min-assert-pass-rate <n>", "Minimum required eval assertion pass rate (0-1)", (value) => Number.parseFloat(value)).option("--save-results <path>", "Save combined check results to JSON").option("--continue-on-lint-fail", "Continue trigger/eval stages even when lint has failures").option("--verbose", "Show detailed trigger/eval output sections").action(async (targetPath, _commandOptions, command) => {
|
|
2925
3787
|
const globalOptions = getGlobalCliOptions(command);
|
|
2926
3788
|
const config = getResolvedConfig(command);
|
|
2927
3789
|
const parsedCli = checkCliSchema.safeParse(command.opts());
|
|
@@ -2944,6 +3806,8 @@ function registerCheckCommand(program) {
|
|
|
2944
3806
|
minF1: config.trigger.threshold,
|
|
2945
3807
|
minAssertPassRate: config.eval.threshold,
|
|
2946
3808
|
numRuns: config.eval.numRuns,
|
|
3809
|
+
concurrency: config.concurrency,
|
|
3810
|
+
html: parsedCli.data.html,
|
|
2947
3811
|
lintFailOn: config.lint.failOn,
|
|
2948
3812
|
lintSuppress: config.lint.suppress,
|
|
2949
3813
|
triggerSeed: parsedCli.data.seed ?? config.trigger.seed,
|
|
@@ -2961,7 +3825,7 @@ function resolveVersion() {
|
|
|
2961
3825
|
try {
|
|
2962
3826
|
const currentFilePath = fileURLToPath(import.meta.url);
|
|
2963
3827
|
const packageJsonPath = path6.resolve(path6.dirname(currentFilePath), "..", "package.json");
|
|
2964
|
-
const raw =
|
|
3828
|
+
const raw = fs11.readFileSync(packageJsonPath, "utf8");
|
|
2965
3829
|
const parsed = JSON.parse(raw);
|
|
2966
3830
|
return parsed.version ?? "0.0.0";
|
|
2967
3831
|
} catch {
|