@rudderjs/ai 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +399 -0
- package/boost/guidelines.md +60 -0
- package/dist/agent.d.ts +35 -1
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +118 -16
- package/dist/agent.js.map +1 -1
- package/dist/budget/pricing.d.ts +124 -0
- package/dist/budget/pricing.d.ts.map +1 -0
- package/dist/budget/pricing.js +175 -0
- package/dist/budget/pricing.js.map +1 -0
- package/dist/budget/storage.d.ts +104 -0
- package/dist/budget/storage.d.ts.map +1 -0
- package/dist/budget/storage.js +0 -0
- package/dist/budget/storage.js.map +1 -0
- package/dist/budget/with-budget.d.ts +119 -0
- package/dist/budget/with-budget.d.ts.map +1 -0
- package/dist/budget/with-budget.js +175 -0
- package/dist/budget/with-budget.js.map +1 -0
- package/dist/budget-orm/index.d.ts +96 -0
- package/dist/budget-orm/index.d.ts.map +1 -0
- package/dist/budget-orm/index.js +177 -0
- package/dist/budget-orm/index.js.map +1 -0
- package/dist/commands/ai-eval.d.ts +93 -0
- package/dist/commands/ai-eval.d.ts.map +1 -0
- package/dist/commands/ai-eval.js +378 -0
- package/dist/commands/ai-eval.js.map +1 -0
- package/dist/computer-use/actions.d.ts +214 -0
- package/dist/computer-use/actions.d.ts.map +1 -0
- package/dist/computer-use/actions.js +48 -0
- package/dist/computer-use/actions.js.map +1 -0
- package/dist/computer-use/errors.d.ts +57 -0
- package/dist/computer-use/errors.d.ts.map +1 -0
- package/dist/computer-use/errors.js +76 -0
- package/dist/computer-use/errors.js.map +1 -0
- package/dist/computer-use/index.d.ts +53 -0
- package/dist/computer-use/index.d.ts.map +1 -0
- package/dist/computer-use/index.js +51 -0
- package/dist/computer-use/index.js.map +1 -0
- package/dist/computer-use/playwright.d.ts +76 -0
- package/dist/computer-use/playwright.d.ts.map +1 -0
- package/dist/computer-use/playwright.js +270 -0
- package/dist/computer-use/playwright.js.map +1 -0
- package/dist/computer-use/tool.d.ts +154 -0
- package/dist/computer-use/tool.d.ts.map +1 -0
- package/dist/computer-use/tool.js +210 -0
- package/dist/computer-use/tool.js.map +1 -0
- package/dist/eval/fixtures.d.ts +65 -0
- package/dist/eval/fixtures.d.ts.map +1 -0
- package/dist/eval/fixtures.js +110 -0
- package/dist/eval/fixtures.js.map +1 -0
- package/dist/eval/html-reporter.d.ts +25 -0
- package/dist/eval/html-reporter.d.ts.map +1 -0
- package/dist/eval/html-reporter.js +209 -0
- package/dist/eval/html-reporter.js.map +1 -0
- package/dist/eval/index.d.ts +271 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +510 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/json-reporter.d.ts +43 -0
- package/dist/eval/json-reporter.d.ts.map +1 -0
- package/dist/eval/json-reporter.js +40 -0
- package/dist/eval/json-reporter.js.map +1 -0
- package/dist/fake.d.ts +36 -1
- package/dist/fake.d.ts.map +1 -1
- package/dist/fake.js +49 -2
- package/dist/fake.js.map +1 -1
- package/dist/file-search.d.ts +168 -0
- package/dist/file-search.d.ts.map +1 -0
- package/dist/file-search.js +158 -0
- package/dist/file-search.js.map +1 -0
- package/dist/index.d.ts +22 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +17 -1
- package/dist/index.js.map +1 -1
- package/dist/mcp/client-tools.d.ts +39 -0
- package/dist/mcp/client-tools.d.ts.map +1 -0
- package/dist/mcp/client-tools.js +147 -0
- package/dist/mcp/client-tools.js.map +1 -0
- package/dist/mcp/index.d.ts +16 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +15 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/mcp/server-from-agent.d.ts +24 -0
- package/dist/mcp/server-from-agent.d.ts.map +1 -0
- package/dist/mcp/server-from-agent.js +113 -0
- package/dist/mcp/server-from-agent.js.map +1 -0
- package/dist/mcp/types.d.ts +64 -0
- package/dist/mcp/types.d.ts.map +1 -0
- package/dist/mcp/types.js +6 -0
- package/dist/mcp/types.js.map +1 -0
- package/dist/memory-embedding/index.d.ts +121 -0
- package/dist/memory-embedding/index.d.ts.map +1 -0
- package/dist/memory-embedding/index.js +229 -0
- package/dist/memory-embedding/index.js.map +1 -0
- package/dist/memory-extract.d.ts +60 -0
- package/dist/memory-extract.d.ts.map +1 -0
- package/dist/memory-extract.js +163 -0
- package/dist/memory-extract.js.map +1 -0
- package/dist/memory-inject.d.ts +39 -0
- package/dist/memory-inject.d.ts.map +1 -0
- package/dist/memory-inject.js +135 -0
- package/dist/memory-inject.js.map +1 -0
- package/dist/memory-orm/index.d.ts +118 -0
- package/dist/memory-orm/index.d.ts.map +1 -0
- package/dist/memory-orm/index.js +187 -0
- package/dist/memory-orm/index.js.map +1 -0
- package/dist/memory.d.ts +55 -0
- package/dist/memory.d.ts.map +1 -0
- package/dist/memory.js +132 -0
- package/dist/memory.js.map +1 -0
- package/dist/observers.d.ts +22 -0
- package/dist/observers.d.ts.map +1 -1
- package/dist/observers.js.map +1 -1
- package/dist/provider-tools.d.ts +15 -1
- package/dist/provider-tools.d.ts.map +1 -1
- package/dist/provider-tools.js +21 -1
- package/dist/provider-tools.js.map +1 -1
- package/dist/providers/anthropic.d.ts.map +1 -1
- package/dist/providers/anthropic.js +61 -6
- package/dist/providers/anthropic.js.map +1 -1
- package/dist/providers/elevenlabs.d.ts +98 -0
- package/dist/providers/elevenlabs.d.ts.map +1 -0
- package/dist/providers/elevenlabs.js +229 -0
- package/dist/providers/elevenlabs.js.map +1 -0
- package/dist/providers/google.d.ts +83 -1
- package/dist/providers/google.d.ts.map +1 -1
- package/dist/providers/google.js +491 -8
- package/dist/providers/google.js.map +1 -1
- package/dist/providers/openai.d.ts +3 -1
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +209 -5
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers/voyage.d.ts +91 -0
- package/dist/providers/voyage.d.ts.map +1 -0
- package/dist/providers/voyage.js +166 -0
- package/dist/providers/voyage.js.map +1 -0
- package/dist/queue-job.d.ts +69 -4
- package/dist/queue-job.d.ts.map +1 -1
- package/dist/queue-job.js +114 -11
- package/dist/queue-job.js.map +1 -1
- package/dist/registry.d.ts +3 -1
- package/dist/registry.d.ts.map +1 -1
- package/dist/registry.js +10 -0
- package/dist/registry.js.map +1 -1
- package/dist/server/provider.d.ts.map +1 -1
- package/dist/server/provider.js +23 -1
- package/dist/server/provider.js.map +1 -1
- package/dist/similarity-search.d.ts +163 -0
- package/dist/similarity-search.d.ts.map +1 -0
- package/dist/similarity-search.js +147 -0
- package/dist/similarity-search.js.map +1 -0
- package/dist/tool.d.ts.map +1 -1
- package/dist/tool.js +13 -4
- package/dist/tool.js.map +1 -1
- package/dist/types.d.ts +246 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/vector-stores/index.d.ts +96 -0
- package/dist/vector-stores/index.d.ts.map +1 -0
- package/dist/vector-stores/index.js +153 -0
- package/dist/vector-stores/index.js.map +1 -0
- package/package.json +41 -3
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Self-contained HTML reporter for `pnpm rudder ai:eval --html`
|
|
3
|
+
* (#A5 Phase 5). Renders one or more {@link SuiteReport}s into a
|
|
4
|
+
* single HTML string with inline styles and minimal vanilla JS for
|
|
5
|
+
* row expand/collapse — no framework deps, no external assets, safe
|
|
6
|
+
* to paste into a PR comment or open offline.
|
|
7
|
+
*
|
|
8
|
+
* Defensive HTML-escape on every piece of user content (suite name,
|
|
9
|
+
* case name, input, response, metadata). Long responses get a
|
|
10
|
+
* `<pre>` block with `white-space: pre-wrap` so output stays
|
|
11
|
+
* scannable without a horizontal scroll.
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Render an array of {@link SuiteReport}s as a single self-contained
 * HTML document.
 *
 * The page header shows totals summed across every report (cases,
 * passed, failed, skipped, cost, tokens, duration) followed by one
 * `<section>` per suite. Styles and the expand/collapse script are
 * inlined from the module-level `STYLE` / `SCRIPT` constants.
 *
 * The headline pass rate is rounded to a whole percent, but "100%" is
 * only ever shown when every case passed — a 199/200 run reports 99%,
 * not a rounded-up 100% with the green `ok` class.
 *
 * @param reports Suite reports to render.
 * @param opts    Optional `title` (default `'Eval Report'`) and
 *                `generatedAt` (default: the current time as an ISO string).
 * @returns The complete HTML document as a string.
 */
export function reportHtml(reports, opts = {}) {
    const title = opts.title ?? 'Eval Report';
    const generatedAt = opts.generatedAt ?? new Date().toISOString();
    const totals = reports.reduce((a, r) => ({
        cases: a.cases + r.cases.length,
        passed: a.passed + r.passed,
        failed: a.failed + r.failed,
        skipped: a.skipped + r.skipped,
        cost: a.cost + r.cost,
        tokens: a.tokens + r.tokens,
        duration: a.duration + r.duration,
    }), { cases: 0, passed: 0, failed: 0, skipped: 0, cost: 0, tokens: 0, duration: 0 });
    // Clamp the rounded value to 99 unless literally every case passed, so
    // rounding can never turn a run with failures into "100% pass".
    let passRate = 0;
    if (totals.cases > 0) {
        passRate = totals.passed === totals.cases
            ? 100
            : Math.min(99, Math.round((totals.passed / totals.cases) * 100));
    }
    const passClass = passRate === 100 ? 'ok' : passRate >= 80 ? 'warn' : 'bad';
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>${escapeHtml(title)}</title>
<style>${STYLE}</style>
</head>
<body>
<header class="page-header">
  <h1>${escapeHtml(title)}</h1>
  <div class="meta">
    Generated ${escapeHtml(generatedAt)} ·
    ${reports.length} suite${plural(reports.length)} ·
    ${totals.cases} case${plural(totals.cases)} ·
    <strong class="${passClass}">${passRate}% pass</strong> ·
    ${formatCost(totals.cost)} ·
    ${totals.tokens.toLocaleString()} tokens ·
    ${formatMs(totals.duration)}
  </div>
</header>
${reports.map(renderSuite).join('\n')}
<script>${SCRIPT}</script>
</body>
</html>
`;
}
|
|
57
|
+
/**
 * Render one suite report as a `<section>`: a header with the suite
 * name, pass/skip counts, cost, tokens, duration and optional
 * metadata, followed by a table with one row pair per case.
 *
 * The pass-count colour is `ok` only when every case passed, `warn`
 * from a rounded 80% upward, otherwise `bad` (including an empty
 * suite). Rounding alone can no longer promote a suite with failures
 * to `ok`.
 *
 * @param r A single suite report.
 * @returns HTML markup for the suite section.
 */
function renderSuite(r) {
    const total = r.cases.length;
    const roundedRate = total > 0 ? Math.round((r.passed / total) * 100) : 0;
    // `ok` requires a genuinely clean run, not a rate that merely rounds to 100.
    const allPassed = total > 0 && r.passed === total;
    const passClass = allPassed ? 'ok' : roundedRate >= 80 ? 'warn' : 'bad';
    return `<section class="suite">
  <header class="suite-header">
    <h2>${escapeHtml(r.suite)}</h2>
    <div class="suite-stats">
      <span class="${passClass}">${r.passed}/${total} passed</span>
      ${r.skipped > 0 ? `· <span class="muted">${r.skipped} skipped</span>` : ''}
      · ${formatCost(r.cost)}
      · ${r.tokens.toLocaleString()} tokens
      · ${formatMs(r.duration)}
    </div>
    ${renderMetadata(r.metadata)}
  </header>
  <table class="cases">
    <thead>
      <tr>
        <th>Case</th>
        <th>Status</th>
        <th class="num">Tokens</th>
        <th class="num">Cost</th>
        <th class="num">Duration</th>
      </tr>
    </thead>
    <tbody>
${r.cases.map(renderCase).join('\n')}
    </tbody>
  </table>
</section>`;
}
|
|
87
|
+
/**
 * Render one case as two table rows: a focusable summary row
 * (`tr.case`) and an initially hidden detail row (`tr.case-detail`)
 * holding the input, response, score and reason.
 *
 * All case-supplied text goes through `escapeHtml`. The "no response"
 * placeholder is written with `&lt;` / `&gt;` entities so the browser
 * shows it as text instead of parsing it as an unknown element.
 *
 * @param c A single case result.
 * @returns HTML markup for the two rows.
 */
function renderCase(c) {
    const glyph = c.status === 'passed' ? '✓' : c.status === 'failed' ? '✗' : '○';
    const responseBlock = c.responseText !== undefined
        ? `<h4>Response</h4><pre>${escapeHtml(c.responseText)}</pre>`
        : '<h4>Response</h4><pre class="muted">&lt;no response — agent threw or skipped&gt;</pre>';
    // The metric's reason takes precedence over the case-level (skip) reason.
    const reason = c.metric?.reason ?? c.reason;
    const reasonBlock = reason
        ? `<h4>Reason</h4><pre>${escapeHtml(reason)}</pre>`
        : '';
    const scoreBlock = c.metric?.score !== undefined
        ? `<h4>Score</h4><pre>${c.metric.score.toFixed(3)}</pre>`
        : '';
    return `      <tr class="case ${c.status}" tabindex="0" aria-expanded="false">
        <td><span class="glyph">${glyph}</span> ${escapeHtml(c.name)}</td>
        <td><span class="badge ${c.status}">${c.status}</span></td>
        <td class="num">${c.tokens.toLocaleString()}</td>
        <td class="num">${formatCost(c.cost)}</td>
        <td class="num">${formatMs(c.duration)}</td>
      </tr>
      <tr class="case-detail" hidden>
        <td colspan="5">
          <h4>Input</h4>
          <pre>${escapeHtml(c.input)}</pre>
          ${responseBlock}
          ${scoreBlock}
          ${reasonBlock}
        </td>
      </tr>`;
}
|
|
115
|
+
/**
 * Render suite metadata as a `<dl class="metadata">` definition list.
 *
 * Entries whose value is `undefined`, `null` or the empty string are
 * dropped. Remaining values are coerced with `String()` before
 * escaping, so a non-string value supplied from untyped JavaScript
 * (for example a numeric ticket id) renders instead of throwing.
 *
 * @param meta Metadata object, or a falsy value when the suite has none.
 * @returns The `<dl>` markup, or `''` when there is nothing to show.
 */
function renderMetadata(meta) {
    if (!meta) {
        return '';
    }
    // `== null` deliberately matches both null and undefined.
    const rows = Object.entries(meta).filter(([, v]) => v != null && v !== '');
    if (rows.length === 0) {
        return '';
    }
    const items = rows
        .map(([k, v]) => `<dt>${escapeHtml(formatLabel(k))}</dt><dd>${escapeHtml(String(v))}</dd>`)
        .join('');
    return `<dl class="metadata">${items}</dl>`;
}
|
|
123
|
+
/**
 * Turn a metadata key into a display label.
 *
 * `lastReviewed` has a hand-written label; every other key just has
 * its first character upper-cased and the rest left untouched.
 *
 * @param key Metadata key.
 * @returns The label to show in the `<dt>`.
 */
function formatLabel(key) {
    return key === 'lastReviewed'
        ? 'Last reviewed'
        : key.charAt(0).toUpperCase() + key.slice(1);
}
|
|
129
|
+
// ─── HTML escape (no external dep) ───────────────────────
|
|
130
|
+
// Characters that are significant in HTML text and attribute values,
// mapped to the entity that represents them literally.
const ESCAPE_MAP = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
};
/**
 * Escape a value for safe interpolation into HTML text or a quoted
 * attribute value.
 *
 * The input is coerced to a string first (`null` / `undefined` become
 * the empty string), so a stray non-string value cannot abort the
 * whole report with a `TypeError`.
 *
 * @param s Value to escape.
 * @returns The escaped string.
 */
function escapeHtml(s) {
    return String(s ?? '').replace(/[&<>"']/g, (ch) => ESCAPE_MAP[ch]);
}
|
|
140
|
+
/**
 * English plural suffix for a count.
 *
 * @param n The count.
 * @returns `''` when `n` is exactly 1, otherwise `'s'`.
 */
function plural(n) {
    if (n === 1) {
        return '';
    }
    return 's';
}
|
|
143
|
+
/**
 * Format a duration in milliseconds for display.
 *
 * Durations that round to less than 1000 ms are shown as whole
 * milliseconds (`"250ms"`); everything else is shown in seconds with
 * one decimal (`"1.5s"`). The threshold is applied to the rounded
 * value so that e.g. 999.6 renders as `"1.0s"` rather than `"1000ms"`.
 *
 * @param ms Duration in milliseconds.
 * @returns Human-readable duration string.
 */
function formatMs(ms) {
    const wholeMs = Math.round(ms);
    if (wholeMs < 1000) {
        return `${wholeMs}ms`;
    }
    return `${(ms / 1000).toFixed(1)}s`;
}
|
|
148
|
+
/**
 * Format a USD amount as an HTML-safe fragment for the report.
 *
 * - exactly zero  → `$0.000`
 * - below 0.001   → `&lt;$0.001` (renders as "<$0.001")
 * - otherwise     → `$` followed by the amount to three decimals
 *
 * The result is interpolated into markup without further escaping, so
 * the "less than" sign is emitted as an entity rather than a raw `<`.
 *
 * @param usd Amount in US dollars.
 * @returns HTML-safe cost string.
 */
function formatCost(usd) {
    if (usd === 0) {
        return '$0.000';
    }
    if (usd < 0.001) {
        return '&lt;$0.001';
    }
    return `$${usd.toFixed(3)}`;
}
|
|
155
|
+
// ─── Inline assets ────────────────────────────────────────
|
|
156
|
+
const STYLE = `
|
|
157
|
+
:root { color-scheme: light dark; --fg: #1a1a1a; --bg: #fff; --muted: #6a6a6a; --border: #e2e2e2; --row-hover: #f7f7f7; --ok: #1a7f37; --warn: #b08800; --bad: #b91c1c; --pre-bg: #f6f8fa; }
|
|
158
|
+
@media (prefers-color-scheme: dark) {
|
|
159
|
+
:root { --fg: #e6e6e6; --bg: #0d1117; --muted: #8a8a8a; --border: #30363d; --row-hover: #161b22; --ok: #3fb950; --warn: #d29922; --bad: #f85149; --pre-bg: #161b22; }
|
|
160
|
+
}
|
|
161
|
+
* { box-sizing: border-box }
|
|
162
|
+
body { font: 14px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif; color: var(--fg); background: var(--bg); margin: 0; padding: 24px; max-width: 1100px; margin-inline: auto }
|
|
163
|
+
h1 { margin: 0 0 4px; font-size: 24px }
|
|
164
|
+
h2 { margin: 0 0 4px; font-size: 18px }
|
|
165
|
+
h4 { margin: 12px 0 4px; font-size: 12px; text-transform: uppercase; letter-spacing: .04em; color: var(--muted) }
|
|
166
|
+
pre { background: var(--pre-bg); border: 1px solid var(--border); border-radius: 6px; padding: 8px 12px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word; font: 12px/1.45 ui-monospace,SFMono-Regular,Menlo,monospace; margin: 0 }
|
|
167
|
+
.page-header { border-bottom: 1px solid var(--border); padding-bottom: 12px; margin-bottom: 20px }
|
|
168
|
+
.page-header .meta { color: var(--muted); font-size: 13px }
|
|
169
|
+
.suite { margin-bottom: 28px }
|
|
170
|
+
.suite-header h2 { display: inline }
|
|
171
|
+
.suite-stats { color: var(--muted); font-size: 13px; margin-top: 4px }
|
|
172
|
+
.metadata { display: grid; grid-template-columns: max-content 1fr; gap: 4px 12px; margin: 8px 0 0; font-size: 13px }
|
|
173
|
+
.metadata dt { color: var(--muted); font-weight: normal }
|
|
174
|
+
.metadata dd { margin: 0 }
|
|
175
|
+
.cases { width: 100%; border-collapse: collapse; margin-top: 12px; border: 1px solid var(--border); border-radius: 6px; overflow: hidden }
|
|
176
|
+
.cases th, .cases td { padding: 8px 12px; text-align: left; border-bottom: 1px solid var(--border) }
|
|
177
|
+
.cases th { background: var(--pre-bg); font-weight: 600; font-size: 12px; text-transform: uppercase; letter-spacing: .04em; color: var(--muted) }
|
|
178
|
+
.cases tr:last-child td { border-bottom: none }
|
|
179
|
+
.cases tr.case { cursor: pointer; user-select: none }
|
|
180
|
+
.cases tr.case:hover { background: var(--row-hover) }
|
|
181
|
+
.cases tr.case:focus { outline: 2px solid var(--warn); outline-offset: -2px }
|
|
182
|
+
.cases tr.case-detail td { background: var(--pre-bg) }
|
|
183
|
+
.cases td.num { text-align: right; font-variant-numeric: tabular-nums }
|
|
184
|
+
.glyph { display: inline-block; width: 14px; font-weight: bold }
|
|
185
|
+
.cases tr.case.passed .glyph { color: var(--ok) }
|
|
186
|
+
.cases tr.case.failed .glyph { color: var(--bad) }
|
|
187
|
+
.cases tr.case.skipped .glyph { color: var(--muted) }
|
|
188
|
+
.badge { display: inline-block; padding: 1px 8px; border-radius: 999px; font-size: 11px; font-weight: 600; text-transform: uppercase; letter-spacing: .04em }
|
|
189
|
+
.badge.passed { background: rgba(63,185,80,.15); color: var(--ok) }
|
|
190
|
+
.badge.failed { background: rgba(248,81,73,.15); color: var(--bad) }
|
|
191
|
+
.badge.skipped { background: rgba(138,138,138,.15); color: var(--muted) }
|
|
192
|
+
.ok { color: var(--ok) } .warn { color: var(--warn) } .bad { color: var(--bad) } .muted { color: var(--muted) }
|
|
193
|
+
`.trim();
|
|
194
|
+
const SCRIPT = `
|
|
195
|
+
document.querySelectorAll('tr.case').forEach(function(row) {
|
|
196
|
+
function toggle() {
|
|
197
|
+
var detail = row.nextElementSibling;
|
|
198
|
+
if (!detail || !detail.classList.contains('case-detail')) return;
|
|
199
|
+
var open = !detail.hidden;
|
|
200
|
+
detail.hidden = open;
|
|
201
|
+
row.setAttribute('aria-expanded', String(!open));
|
|
202
|
+
}
|
|
203
|
+
row.addEventListener('click', toggle);
|
|
204
|
+
row.addEventListener('keydown', function(e) {
|
|
205
|
+
if (e.key === 'Enter' || e.key === ' ') { e.preventDefault(); toggle(); }
|
|
206
|
+
});
|
|
207
|
+
});
|
|
208
|
+
`.trim();
|
|
209
|
+
//# sourceMappingURL=html-reporter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-reporter.js","sourceRoot":"","sources":["../../src/eval/html-reporter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAWH;;;GAGG;AACH,MAAM,UAAU,UAAU,CAAC,OAAsB,EAAE,OAA0B,EAAE;IAC7E,MAAM,KAAK,GAAS,IAAI,CAAC,KAAK,IAAI,aAAa,CAAA;IAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IAEhE,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAC3B,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACT,KAAK,EAAK,CAAC,CAAC,KAAK,GAAM,CAAC,CAAC,KAAK,CAAC,MAAM;QACrC,MAAM,EAAI,CAAC,CAAC,MAAM,GAAK,CAAC,CAAC,MAAM;QAC/B,MAAM,EAAI,CAAC,CAAC,MAAM,GAAK,CAAC,CAAC,MAAM;QAC/B,OAAO,EAAG,CAAC,CAAC,OAAO,GAAI,CAAC,CAAC,OAAO;QAChC,IAAI,EAAM,CAAC,CAAC,IAAI,GAAO,CAAC,CAAC,IAAI;QAC7B,MAAM,EAAI,CAAC,CAAC,MAAM,GAAK,CAAC,CAAC,MAAM;QAC/B,QAAQ,EAAE,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ;KAClC,CAAC,EACF,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,CAChF,CAAA;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAExF,OAAO;;;;;SAKA,UAAU,CAAC,KAAK,CAAC;SACjB,KAAK;;;;QAIN,UAAU,CAAC,KAAK,CAAC;;gBAET,UAAU,CAAC,WAAW,CAAC;MACjC,OAAO,CAAC,MAAM,SAAS,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC;MAC7C,MAAM,CAAC,KAAK,QAAQ,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;qBACzB,QAAQ,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,KAAK,QAAQ;MACrF,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC;MACvB,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE;MAC9B,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC;;;EAG7B,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;UAC3B,MAAM;;;CAGf,CAAA;AACD,CAAC;AAED,SAAS,WAAW,CAAC,CAAc;IACjC,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IACvF,OAAO;;UAEC,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC;;qBAER,QAAQ,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,C
AAC,MAAM,CAAC,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,KAAK,CAAC,MAAM;QACrG,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,yBAAyB,CAAC,CAAC,OAAO,iBAAiB,CAAC,CAAC,CAAC,EAAE;UACtE,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;UAClB,CAAC,CAAC,MAAM,CAAC,cAAc,EAAE;UACzB,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC;;MAExB,cAAc,CAAC,CAAC,CAAC,QAAQ,CAAC;;;;;;;;;;;;;EAa9B,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;WAGzB,CAAA;AACX,CAAC;AAED,SAAS,UAAU,CAAC,CAA+B;IACjD,MAAM,KAAK,GAAG,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAA;IAC7E,MAAM,aAAa,GAAG,CAAC,CAAC,YAAY,KAAK,SAAS;QAChD,CAAC,CAAC,yBAAyB,UAAU,CAAC,CAAC,CAAC,YAAY,CAAC,QAAQ;QAC7D,CAAC,CAAC,wFAAwF,CAAA;IAC5F,MAAM,WAAW,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC;QAChD,CAAC,CAAC,uBAAuB,UAAU,CAAC,CAAC,CAAC,MAAM,EAAE,MAAM,IAAI,CAAC,CAAC,MAAO,CAAC,QAAQ;QAC1E,CAAC,CAAC,EAAE,CAAA;IACN,MAAM,UAAU,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,KAAK,SAAS;QAC9C,CAAC,CAAC,sBAAsB,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ;QACzD,CAAC,CAAC,EAAE,CAAA;IACN,OAAO,yBAAyB,CAAC,CAAC,MAAM;kCACR,KAAK,WAAW,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;iCACnC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM;0BAC5B,CAAC,CAAC,MAAM,CAAC,cAAc,EAAE;0BACzB,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;0BAClB,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC;;;;;iBAK7B,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC;YACxB,aAAa;YACb,UAAU;YACV,WAAW;;YAEX,CAAA;AACZ,CAAC;AAED,SAAS,cAAc,CAAC,IAA8B;IACpD,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAA;IACpB,MAAM,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,SAAS,IAAI,CAAC,KAAK,EAAE,CAAC,CAAA;IAChF,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAChC,OAAO,wBACL,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,YAAY,UAAU,CAAC,CAAE,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAClG,OAAO,CAAA;AACT,CAAC;AAED,SAAS,WAAW,CAAC,GAAW;IAC9B,uEAAuE;IACvE,IAAI,GAAG,KAAK,cAAc;QAAE,OAAO,eAAe,CAAA;IAClD,OAAO,GAAG,CAAC,MAAM,CAAC,C
AAC,CAAC,CAAC,WAAW,EAAE,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;AACnD,CAAC;AAED,4DAA4D;AAE5D,MAAM,UAAU,GAA2B;IACzC,GAAG,EAAG,OAAO;IACb,GAAG,EAAG,MAAM;IACZ,GAAG,EAAG,MAAM;IACZ,GAAG,EAAG,QAAQ;IACd,GAAG,EAAG,OAAO;CACd,CAAA;AACD,SAAS,UAAU,CAAC,CAAS;IAC3B,OAAO,CAAC,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,EAAE,CAAC,UAAU,CAAC,EAAE,CAAE,CAAC,CAAA;AACrD,CAAC;AAED,SAAS,MAAM,CAAC,CAAS;IACvB,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAA;AAC3B,CAAC;AAED,SAAS,QAAQ,CAAC,EAAU;IAC1B,IAAI,EAAE,GAAG,IAAI;QAAE,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,IAAI,CAAA;IAC3C,OAAO,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAA;AACrC,CAAC;AAED,SAAS,UAAU,CAAC,GAAW;IAC7B,IAAI,GAAG,KAAK,CAAC;QAAM,OAAO,QAAQ,CAAA;IAClC,IAAI,GAAG,GAAG,KAAK;QAAI,OAAO,SAAS,CAAA;IACnC,OAAO,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAA;AAC7B,CAAC;AAED,6DAA6D;AAE7D,MAAM,KAAK,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAqCb,CAAC,IAAI,EAAE,CAAA;AAER,MAAM,MAAM,GAAG;;;;;;;;;;;;;;CAcd,CAAC,IAAI,EAAE,CAAA"}
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@rudderjs/ai/eval` — built-in eval framework for #A5 Phase 1.
|
|
3
|
+
*
|
|
4
|
+
* Define a suite of input cases + assertions, run them against any
|
|
5
|
+
* `Agent`, get a console report with pass/fail + cost + tokens. Same
|
|
6
|
+
* `Agent` instances as your app code — one source of truth.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* ```ts
|
|
10
|
+
* // evals/support-agent.eval.ts
|
|
11
|
+
* import { evalSuite, llmJudge, exactMatch, regex } from '@rudderjs/ai/eval'
|
|
12
|
+
* import { SupportAgent } from '../app/Agents/SupportAgent.js'
|
|
13
|
+
*
|
|
14
|
+
* export default evalSuite('SupportAgent', {
|
|
15
|
+
* agent: () => new SupportAgent(),
|
|
16
|
+
* cases: [
|
|
17
|
+
* { name: 'password reset', input: 'How do I reset my password?',
|
|
18
|
+
* assert: llmJudge('mentions a password reset link') },
|
|
19
|
+
* { name: 'price', input: 'How much?', assert: exactMatch('$99/month') },
|
|
20
|
+
* { name: 'support email', input: 'Contact?', assert: regex(/support@/) },
|
|
21
|
+
* ],
|
|
22
|
+
* })
|
|
23
|
+
* ```
|
|
24
|
+
*
|
|
25
|
+
* Run programmatically via `runSuite(suite)` from this entry, or via
|
|
26
|
+
* `pnpm rudder ai:eval` (the Phase 2 CLI).
|
|
27
|
+
*
|
|
28
|
+
* Built-in metrics: `exactMatch`, `regex`, `llmJudge`, `jsonShape`,
|
|
29
|
+
* `semanticMatch`, `tokenCost`. Compose multiple via `compose(...)`.
|
|
30
|
+
* User-defined metrics work today — any `(response, ctx) =>
|
|
31
|
+
* MetricResult` qualifies.
|
|
32
|
+
*/
|
|
33
|
+
import type { Agent } from '../agent.js';
|
|
34
|
+
import type { AgentResponse } from '../types.js';
|
|
35
|
+
import { z } from 'zod';
|
|
36
|
+
export { reportJson } from './json-reporter.js';
|
|
37
|
+
export type { SuiteJson, SuiteJsonCase } from './json-reporter.js';
|
|
38
|
+
export { stepsFromResponse } from './fixtures.js';
|
|
39
|
+
export type { EvalFixture } from './fixtures.js';
|
|
40
|
+
export { reportHtml } from './html-reporter.js';
|
|
41
|
+
export type { HtmlReportOptions } from './html-reporter.js';
|
|
42
|
+
/**
|
|
43
|
+
* Result of a single assertion. `pass` is the only required field;
|
|
44
|
+
* `score` (0..1) and `reason` are surfaced in reports.
|
|
45
|
+
*/
|
|
46
|
+
export interface MetricResult {
|
|
47
|
+
pass: boolean;
|
|
48
|
+
score?: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Assertion signature. Sync or async; the runner awaits both.
|
|
53
|
+
*
|
|
54
|
+
* `ctx` carries the case context so user metrics can opt into the
|
|
55
|
+
* input/case-name (e.g. for logging). The built-ins ignore it.
|
|
56
|
+
*/
|
|
57
|
+
export type Metric = (response: AgentResponse, ctx: MetricContext) => MetricResult | Promise<MetricResult>;
|
|
58
|
+
export interface MetricContext {
|
|
59
|
+
/** The case's input string (the same passed to `agent.prompt`). */
|
|
60
|
+
input: string;
|
|
61
|
+
/** Name of the case being asserted (always a string here, unlike the optional `EvalCase.name`). */
|
|
62
|
+
caseName: string;
|
|
63
|
+
}
|
|
64
|
+
/** A single eval case. */
|
|
65
|
+
export interface EvalCase {
|
|
66
|
+
/** Stable identifier used in reports. Defaults to `case-<index>`. */
|
|
67
|
+
name?: string;
|
|
68
|
+
/** Input passed to `agent.prompt(input)`. */
|
|
69
|
+
input: string;
|
|
70
|
+
/** The assertion. Pass-fail + optional score/reason. */
|
|
71
|
+
assert: Metric;
|
|
72
|
+
/**
|
|
73
|
+
* Per-case agent override. When set, replaces the suite-level
|
|
74
|
+
* `agent` factory for this case (e.g. swap models for a stress
|
|
75
|
+
* test).
|
|
76
|
+
*/
|
|
77
|
+
agent?: () => Agent;
|
|
78
|
+
/**
|
|
79
|
+
* Per-case timeout in ms. Defaults to the suite-level timeout
|
|
80
|
+
* (or no timeout if neither is set).
|
|
81
|
+
*/
|
|
82
|
+
timeout?: number;
|
|
83
|
+
/**
|
|
84
|
+
* Skip this case. Pass `true` to silently skip, or a string for
|
|
85
|
+
* a reason that surfaces in the report.
|
|
86
|
+
*/
|
|
87
|
+
skip?: boolean | string;
|
|
88
|
+
}
|
|
89
|
+
export interface EvalSuiteSpec {
|
|
90
|
+
/** Factory for the agent under test. Called once per case. */
|
|
91
|
+
agent: () => Agent;
|
|
92
|
+
/** The cases to run. */
|
|
93
|
+
cases: EvalCase[];
|
|
94
|
+
/**
|
|
95
|
+
* Suite-wide timeout in ms applied to every case unless the case
|
|
96
|
+
* overrides. A timeout makes the case fail (`pass: false`) with the timeout message.
|
|
97
|
+
*/
|
|
98
|
+
timeout?: number;
|
|
99
|
+
/**
|
|
100
|
+
* Optional ownership / context surfaced in the HTML report (#A5
|
|
101
|
+
* Phase 5). Well-known keys (`owner`, `lastReviewed`, `ticket`)
|
|
102
|
+
* get formatted headings; any extra string keys render as a
|
|
103
|
+
* generic key/value row so teams can attach their own metadata.
|
|
104
|
+
*/
|
|
105
|
+
metadata?: EvalMetadata;
|
|
106
|
+
}
|
|
107
|
+
export interface EvalMetadata {
|
|
108
|
+
owner?: string;
|
|
109
|
+
lastReviewed?: string;
|
|
110
|
+
ticket?: string;
|
|
111
|
+
[key: string]: string | undefined;
|
|
112
|
+
}
|
|
113
|
+
export interface EvalSuite {
|
|
114
|
+
name: string;
|
|
115
|
+
spec: EvalSuiteSpec;
|
|
116
|
+
}
|
|
117
|
+
/** Per-case run record collected by {@link runSuite}. */
|
|
118
|
+
export interface CaseResult {
|
|
119
|
+
name: string;
|
|
120
|
+
/** Final result; `'skipped'` skips assertion + cost. */
|
|
121
|
+
status: 'passed' | 'failed' | 'skipped';
|
|
122
|
+
metric?: MetricResult;
|
|
123
|
+
/** Skip reason (when `status === 'skipped'`). */
|
|
124
|
+
reason?: string;
|
|
125
|
+
/** Wall-clock ms for the agent call + assertion. */
|
|
126
|
+
duration: number;
|
|
127
|
+
/**
|
|
128
|
+
* Token usage from the agent's `prompt()` (zero on skip / failure
|
|
129
|
+
* before the call). Includes BOTH the agent under test AND any
|
|
130
|
+
* judge-model calls the assertion made.
|
|
131
|
+
*/
|
|
132
|
+
tokens: number;
|
|
133
|
+
/** USD estimate (see {@link estimateCost}; zero on skip). */
|
|
134
|
+
cost: number;
|
|
135
|
+
/**
|
|
136
|
+
* The case's input string, copied through from `EvalCase.input`
|
|
137
|
+
* for reporters that want to render the prompt alongside the
|
|
138
|
+
* response (#A5 Phase 5 HTML report). Always present — runners
|
|
139
|
+
* always know the input.
|
|
140
|
+
*/
|
|
141
|
+
input: string;
|
|
142
|
+
/**
|
|
143
|
+
* The agent's final assistant text. Absent when the case skipped
|
|
144
|
+
* or the agent threw before producing a response. The HTML
|
|
145
|
+
* reporter renders `<no response>` in that case.
|
|
146
|
+
*/
|
|
147
|
+
responseText?: string;
|
|
148
|
+
}
|
|
149
|
+
/** Full report returned by {@link runSuite}. */
|
|
150
|
+
export interface SuiteReport {
|
|
151
|
+
suite: string;
|
|
152
|
+
cases: CaseResult[];
|
|
153
|
+
passed: number;
|
|
154
|
+
failed: number;
|
|
155
|
+
skipped: number;
|
|
156
|
+
duration: number;
|
|
157
|
+
cost: number;
|
|
158
|
+
tokens: number;
|
|
159
|
+
/** Suite-level metadata (#A5 Phase 5), copied through from the spec. */
|
|
160
|
+
metadata?: EvalMetadata;
|
|
161
|
+
}
|
|
162
|
+
/**
|
|
163
|
+
* Define an eval suite. Returns a frozen `EvalSuite` ready to pass
|
|
164
|
+
* into {@link runSuite} or to default-export from an `evals/*.eval.ts`
|
|
165
|
+
* file (Phase 2's CLI auto-discovers those).
|
|
166
|
+
*
|
|
167
|
+
* The shape is deliberately a function rather than a class — keeps the
|
|
168
|
+
* file's default export trivially serializable (Phase 2 needs to load
|
|
169
|
+
* suites via dynamic import) and avoids the "did you forget `new`?"
|
|
170
|
+
* footgun.
|
|
171
|
+
*/
|
|
172
|
+
export declare function evalSuite(name: string, spec: EvalSuiteSpec): EvalSuite;
|
|
173
|
+
/** Exact string equality against `response.text`. */
|
|
174
|
+
export declare function exactMatch(expected: string): Metric;
|
|
175
|
+
/** Pattern match against `response.text`. */
|
|
176
|
+
export declare function regex(pattern: RegExp): Metric;
|
|
177
|
+
/**
|
|
178
|
+
* LLM-as-judge: ask a small model whether the response satisfies a
|
|
179
|
+
* natural-language criterion. Returns the judge's reasoning in
|
|
180
|
+
* `reason` so failures are debuggable.
|
|
181
|
+
*
|
|
182
|
+
* Design: the judge runs as a one-shot anonymous agent (no recursion
|
|
183
|
+
* concern — default `remembers()` is `false`). Output is shaped via
|
|
184
|
+
* `Output.object({ schema })` for deterministic parsing. Failures
|
|
185
|
+
* (network, parse, unhandled judge error) bubble as `pass: false`
|
|
186
|
+
* with the error in `reason` — a broken judge is not a passing case.
|
|
187
|
+
*
|
|
188
|
+
* Pitfall: the judge model has the same biases as any LLM. Use it
|
|
189
|
+
* for fuzzy "did the answer mention X?" assertions; for exact
|
|
190
|
+
* structural checks prefer `jsonShape` (Phase 3) or `regex`.
|
|
191
|
+
*/
|
|
192
|
+
export declare function llmJudge(criterion: string, opts?: {
|
|
193
|
+
model?: string;
|
|
194
|
+
}): Metric;
|
|
195
|
+
/**
|
|
196
|
+
* Strict structural assertion: parse `response.text` as JSON
|
|
197
|
+
* (stripping ```json fences) and run it through a zod schema.
|
|
198
|
+
*
|
|
199
|
+
* Pairs naturally with `Output.object({ schema })` on the agent —
|
|
200
|
+
* if the agent declares the same schema, this metric verifies the
|
|
201
|
+
* output actually conforms. Failures surface the zod issue path
|
|
202
|
+
* (e.g. `customer.email`) so debugging doesn't require a separate
|
|
203
|
+
* console log.
|
|
204
|
+
*/
|
|
205
|
+
export declare function jsonShape<T>(schema: z.ZodType<T>): Metric;
|
|
206
|
+
/**
|
|
207
|
+
* Embedding-based fuzzy match. Embeds both `reference` and
|
|
208
|
+
* `response.text` via `AI.embed()`, computes cosine similarity,
|
|
209
|
+
* passes when >= `threshold` (default `0.85` — tighter than
|
|
210
|
+
* `EmbeddingUserMemory`'s 0.5 retrieval-rank floor since this is
|
|
211
|
+
* an assertion, not a ranking).
|
|
212
|
+
*
|
|
213
|
+
* Uses ≤ 2 embedding calls per case; embed tokens roll into the
|
|
214
|
+
* case's cost rollup via the same side-channel `llmJudge` uses.
|
|
215
|
+
*
|
|
216
|
+
* Pitfall: requires a provider that implements `createEmbedding()`
|
|
217
|
+
* (openai / google / mistral / cohere / jina). Failures (no
|
|
218
|
+
* provider, network, etc.) surface as `pass: false` with the
|
|
219
|
+
* error in `reason` — a broken embed is not a passing case.
|
|
220
|
+
*/
|
|
221
|
+
export declare function semanticMatch(reference: string, opts?: {
|
|
222
|
+
threshold?: number;
|
|
223
|
+
model?: string;
|
|
224
|
+
}): Metric;
|
|
225
|
+
/**
|
|
226
|
+
* Token budget guard. Passes when `response.usage.totalTokens
|
|
227
|
+
* <= threshold`. Pair with cost-conscious agents to detect prompt-
|
|
228
|
+
* size regressions before they show up as a billing surprise.
|
|
229
|
+
*
|
|
230
|
+
* `response.usage` is the multi-step rollup, so it's meaningful
|
|
231
|
+
* even when the agent runs tools across several provider calls.
|
|
232
|
+
*/
|
|
233
|
+
export declare function tokenCost(threshold: number): Metric;
|
|
234
|
+
/**
|
|
235
|
+
* Compose multiple metrics into one assertion. Runs them in order
|
|
236
|
+
* and short-circuits on the first failure — failure `reason` is
|
|
237
|
+
* surfaced; success returns `{ pass: true, score: 1 }`.
|
|
238
|
+
*
|
|
239
|
+
* @example
|
|
240
|
+
* { input: '…',
|
|
241
|
+
* assert: compose(
|
|
242
|
+
* jsonShape(SummarySchema),
|
|
243
|
+
* tokenCost(800),
|
|
244
|
+
* ),
|
|
245
|
+
* }
|
|
246
|
+
*/
|
|
247
|
+
export declare function compose(...metrics: Metric[]): Metric;
|
|
248
|
+
/**
|
|
249
|
+
* Run every case in the suite, in declaration order. Returns the
|
|
250
|
+
* full report; never throws (assertion errors become `failed` cases,
|
|
251
|
+
* not exceptions).
|
|
252
|
+
*
|
|
253
|
+
* Phase 1 runs serially. Parallel execution lands in a follow-up
|
|
254
|
+
* once we understand the rate-limit shape of real-world judge
|
|
255
|
+
* models — sequential is correct under any rate limit.
|
|
256
|
+
*/
|
|
257
|
+
export declare function runSuite(suite: EvalSuite): Promise<SuiteReport>;
|
|
258
|
+
export { estimateCost, ModelPricing } from '../budget/pricing.js';
|
|
259
|
+
export type { ModelPriceEntry } from '../budget/pricing.js';
|
|
260
|
+
/**
|
|
261
|
+
* Default reporter — prints a colorless ANSI-aware table to a
|
|
262
|
+
* caller-supplied `console`-like sink. Uses Unicode pass/fail glyphs
|
|
263
|
+
* for visual scanning. JSON / HTML reporters land in Phase 2 / 5.
|
|
264
|
+
*
|
|
265
|
+
* Returns the report unchanged so chains compose: `await
|
|
266
|
+
* reportConsole(await runSuite(suite))`.
|
|
267
|
+
*/
|
|
268
|
+
export declare function reportConsole(report: SuiteReport, sink?: {
|
|
269
|
+
log: (s: string) => void;
|
|
270
|
+
}): SuiteReport;
|
|
271
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AAGH,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAA;AAKhD,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAA;AAEvB,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAC/C,YAAY,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AAClE,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAA;AACjD,YAAY,EAAE,WAAW,EAAE,MAAM,eAAe,CAAA;AAChD,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAC/C,YAAY,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAA;AAI3D;;;GAGG;AACH,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAK,OAAO,CAAA;IAChB,KAAK,CAAC,EAAG,MAAM,CAAA;IACf,MAAM,CAAC,EAAE,MAAM,CAAA;CAChB;AAED;;;;;GAKG;AACH,MAAM,MAAM,MAAM,GAAG,CAAC,QAAQ,EAAE,aAAa,EAAE,GAAG,EAAE,aAAa,KAAK,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAAA;AAE1G,MAAM,WAAW,aAAa;IAC5B,mEAAmE;IACnE,KAAK,EAAK,MAAM,CAAA;IAChB,+CAA+C;IAC/C,QAAQ,EAAE,MAAM,CAAA;CACjB;AAED,0BAA0B;AAC1B,MAAM,WAAW,QAAQ;IACvB,qEAAqE;IACrE,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,6CAA6C;IAC7C,KAAK,EAAE,MAAM,CAAA;IACb,wDAAwD;IACxD,MAAM,EAAE,MAAM,CAAA;IACd;;;;OAIG;IACH,KAAK,CAAC,EAAE,MAAM,KAAK,CAAA;IACnB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB;;;OAGG;IACH,IAAI,CAAC,EAAE,OAAO,GAAG,MAAM,CAAA;CACxB;AAED,MAAM,WAAW,aAAa;IAC5B,8DAA8D;IAC9D,KAAK,EAAE,MAAM,KAAK,CAAA;IAClB,wBAAwB;IACxB,KAAK,EAAE,QAAQ,EAAE,CAAA;IACjB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB;;;;;OAKG;IACH,QAAQ,CAAC,EAAE,YAAY,CAAA;CACxB;AAED,MAAM,WAAW,YAAY;IAC3B,KAAK,CAAC,EAAS,MAAM,CAAA;IACrB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,MAAM,CAAC,EAAQ,MAAM,CAAA;IACrB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAAA;CAClC;AAED,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,MAAM,CAAA;IACZ,IAAI,EAAE,aAAa,CAAA;CACpB;AAED,yDAAyD;AACzD,MAAM,WAAW,UAAU;IACzB,IAAI,EAAK,MAAM,CAAA;IACf,wDAAwD;IACxD,MAAM,EAAG,QAAQ,GAAG,QAAQ,GAAG,SAAS,CAAA;IACxC,MAAM,CAAC,EAAE,YAAY,CAAA;IACrB,iDAAiD;IACjD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,oDAAoD;IACpD,QAAQ,EAAE,MAAM,CAAA;IAChB;;;;OAIG;IACH,MAAM,EAAG,MAAM,CAAA;IACf,6DAA6D;IAC7D,IAAI,EAAK,MAAM,CAAA;IACf;;;;;OAKG;IACH,KAAK,EAAI,MAAM,CAAA;I
ACf;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,gDAAgD;AAChD,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAK,MAAM,CAAA;IAChB,KAAK,EAAK,UAAU,EAAE,CAAA;IACtB,MAAM,EAAI,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,OAAO,EAAG,MAAM,CAAA;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,IAAI,EAAM,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,wEAAwE;IACxE,QAAQ,CAAC,EAAE,YAAY,CAAA;CACxB;AAID;;;;;;;;;GASG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,aAAa,GAAG,SAAS,CAStE;AAID,qDAAqD;AACrD,wBAAgB,UAAU,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAUnD;AAED,6CAA6C;AAC7C,wBAAgB,KAAK,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAS7C;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,QAAQ,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,GAAE;IAAE,KAAK,CAAC,EAAE,MAAM,CAAA;CAAO,GAAG,MAAM,CA6CjF;AAQD;;;;;;;;;GASG;AACH,wBAAgB,SAAS,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,CAuBzD;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,aAAa,CAC3B,SAAS,EAAE,MAAM,EACjB,IAAI,GAAE;IAAE,SAAS,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAA;CAAO,GAChD,MAAM,CA+BR;AAED;;;;;;;GAOG;AACH,wBAAgB,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAYnD;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,OAAO,CAAC,GAAG,OAAO,EAAE,MAAM,EAAE,GAAG,MAAM,CAQpD;AA4BD;;;;;;;;GAQG;AACH,wBAAsB,QAAQ,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,CAgDrE;AA0FD,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AACjE,YAAY,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAA;AAI3D;;;;;;;GAOG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,WAAW,EAAE,IAAI,GAAE;IAAE,GAAG,EAAE,CAAC,CAAC,EAAE,MAAM,KAAK,IAAI,CAAA;CAAY,GAAG,WAAW,CAyB5G"}
|