@rudderjs/ai 1.5.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +482 -4
- package/boost/guidelines.md +60 -0
- package/boost/skills/ai-agents/SKILL.md +7 -0
- package/boost/skills/ai-tools/SKILL.md +7 -0
- package/dist/agent.d.ts +35 -1
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +118 -16
- package/dist/agent.js.map +1 -1
- package/dist/budget/pricing.d.ts +124 -0
- package/dist/budget/pricing.d.ts.map +1 -0
- package/dist/budget/pricing.js +175 -0
- package/dist/budget/pricing.js.map +1 -0
- package/dist/budget/storage.d.ts +104 -0
- package/dist/budget/storage.d.ts.map +1 -0
- package/dist/budget/storage.js +0 -0
- package/dist/budget/storage.js.map +1 -0
- package/dist/budget/with-budget.d.ts +119 -0
- package/dist/budget/with-budget.d.ts.map +1 -0
- package/dist/budget/with-budget.js +175 -0
- package/dist/budget/with-budget.js.map +1 -0
- package/dist/budget-orm/index.d.ts +96 -0
- package/dist/budget-orm/index.d.ts.map +1 -0
- package/dist/budget-orm/index.js +177 -0
- package/dist/budget-orm/index.js.map +1 -0
- package/dist/commands/ai-eval.d.ts +93 -0
- package/dist/commands/ai-eval.d.ts.map +1 -0
- package/dist/commands/ai-eval.js +378 -0
- package/dist/commands/ai-eval.js.map +1 -0
- package/dist/computer-use/actions.d.ts +214 -0
- package/dist/computer-use/actions.d.ts.map +1 -0
- package/dist/computer-use/actions.js +48 -0
- package/dist/computer-use/actions.js.map +1 -0
- package/dist/computer-use/errors.d.ts +57 -0
- package/dist/computer-use/errors.d.ts.map +1 -0
- package/dist/computer-use/errors.js +76 -0
- package/dist/computer-use/errors.js.map +1 -0
- package/dist/computer-use/index.d.ts +53 -0
- package/dist/computer-use/index.d.ts.map +1 -0
- package/dist/computer-use/index.js +51 -0
- package/dist/computer-use/index.js.map +1 -0
- package/dist/computer-use/playwright.d.ts +76 -0
- package/dist/computer-use/playwright.d.ts.map +1 -0
- package/dist/computer-use/playwright.js +270 -0
- package/dist/computer-use/playwright.js.map +1 -0
- package/dist/computer-use/tool.d.ts +154 -0
- package/dist/computer-use/tool.d.ts.map +1 -0
- package/dist/computer-use/tool.js +210 -0
- package/dist/computer-use/tool.js.map +1 -0
- package/dist/eval/fixtures.d.ts +65 -0
- package/dist/eval/fixtures.d.ts.map +1 -0
- package/dist/eval/fixtures.js +110 -0
- package/dist/eval/fixtures.js.map +1 -0
- package/dist/eval/html-reporter.d.ts +25 -0
- package/dist/eval/html-reporter.d.ts.map +1 -0
- package/dist/eval/html-reporter.js +209 -0
- package/dist/eval/html-reporter.js.map +1 -0
- package/dist/eval/index.d.ts +271 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +510 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/json-reporter.d.ts +43 -0
- package/dist/eval/json-reporter.d.ts.map +1 -0
- package/dist/eval/json-reporter.js +40 -0
- package/dist/eval/json-reporter.js.map +1 -0
- package/dist/fake.d.ts +36 -1
- package/dist/fake.d.ts.map +1 -1
- package/dist/fake.js +49 -2
- package/dist/fake.js.map +1 -1
- package/dist/file-search.d.ts +168 -0
- package/dist/file-search.d.ts.map +1 -0
- package/dist/file-search.js +158 -0
- package/dist/file-search.js.map +1 -0
- package/dist/index.d.ts +22 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +17 -1
- package/dist/index.js.map +1 -1
- package/dist/mcp/client-tools.d.ts +39 -0
- package/dist/mcp/client-tools.d.ts.map +1 -0
- package/dist/mcp/client-tools.js +147 -0
- package/dist/mcp/client-tools.js.map +1 -0
- package/dist/mcp/index.d.ts +16 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +15 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/mcp/server-from-agent.d.ts +24 -0
- package/dist/mcp/server-from-agent.d.ts.map +1 -0
- package/dist/mcp/server-from-agent.js +113 -0
- package/dist/mcp/server-from-agent.js.map +1 -0
- package/dist/mcp/types.d.ts +64 -0
- package/dist/mcp/types.d.ts.map +1 -0
- package/dist/mcp/types.js +6 -0
- package/dist/mcp/types.js.map +1 -0
- package/dist/memory-embedding/index.d.ts +121 -0
- package/dist/memory-embedding/index.d.ts.map +1 -0
- package/dist/memory-embedding/index.js +229 -0
- package/dist/memory-embedding/index.js.map +1 -0
- package/dist/memory-extract.d.ts +60 -0
- package/dist/memory-extract.d.ts.map +1 -0
- package/dist/memory-extract.js +163 -0
- package/dist/memory-extract.js.map +1 -0
- package/dist/memory-inject.d.ts +39 -0
- package/dist/memory-inject.d.ts.map +1 -0
- package/dist/memory-inject.js +135 -0
- package/dist/memory-inject.js.map +1 -0
- package/dist/memory-orm/index.d.ts +118 -0
- package/dist/memory-orm/index.d.ts.map +1 -0
- package/dist/memory-orm/index.js +187 -0
- package/dist/memory-orm/index.js.map +1 -0
- package/dist/memory.d.ts +55 -0
- package/dist/memory.d.ts.map +1 -0
- package/dist/memory.js +132 -0
- package/dist/memory.js.map +1 -0
- package/dist/observers.d.ts +22 -0
- package/dist/observers.d.ts.map +1 -1
- package/dist/observers.js.map +1 -1
- package/dist/provider-tools.d.ts +15 -1
- package/dist/provider-tools.d.ts.map +1 -1
- package/dist/provider-tools.js +21 -1
- package/dist/provider-tools.js.map +1 -1
- package/dist/providers/anthropic.d.ts.map +1 -1
- package/dist/providers/anthropic.js +61 -6
- package/dist/providers/anthropic.js.map +1 -1
- package/dist/providers/elevenlabs.d.ts +98 -0
- package/dist/providers/elevenlabs.d.ts.map +1 -0
- package/dist/providers/elevenlabs.js +229 -0
- package/dist/providers/elevenlabs.js.map +1 -0
- package/dist/providers/google.d.ts +83 -1
- package/dist/providers/google.d.ts.map +1 -1
- package/dist/providers/google.js +491 -8
- package/dist/providers/google.js.map +1 -1
- package/dist/providers/openai.d.ts +3 -1
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +209 -5
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers/voyage.d.ts +91 -0
- package/dist/providers/voyage.d.ts.map +1 -0
- package/dist/providers/voyage.js +166 -0
- package/dist/providers/voyage.js.map +1 -0
- package/dist/queue-job.d.ts +69 -4
- package/dist/queue-job.d.ts.map +1 -1
- package/dist/queue-job.js +114 -11
- package/dist/queue-job.js.map +1 -1
- package/dist/registry.d.ts +3 -1
- package/dist/registry.d.ts.map +1 -1
- package/dist/registry.js +10 -0
- package/dist/registry.js.map +1 -1
- package/dist/server/provider.d.ts.map +1 -1
- package/dist/server/provider.js +23 -1
- package/dist/server/provider.js.map +1 -1
- package/dist/similarity-search.d.ts +163 -0
- package/dist/similarity-search.d.ts.map +1 -0
- package/dist/similarity-search.js +147 -0
- package/dist/similarity-search.js.map +1 -0
- package/dist/tool.d.ts.map +1 -1
- package/dist/tool.js +13 -4
- package/dist/tool.js.map +1 -1
- package/dist/types.d.ts +246 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/vector-stores/index.d.ts +96 -0
- package/dist/vector-stores/index.d.ts.map +1 -0
- package/dist/vector-stores/index.js +153 -0
- package/dist/vector-stores/index.js.map +1 -0
- package/package.json +41 -3
|
@@ -0,0 +1,510 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@rudderjs/ai/eval` — built-in eval framework for #A5 Phase 1.
|
|
3
|
+
*
|
|
4
|
+
* Define a suite of input cases + assertions, run them against any
|
|
5
|
+
* `Agent`, get a console report with pass/fail + cost + tokens. Same
|
|
6
|
+
* `Agent` instances as your app code — one source of truth.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* ```ts
|
|
10
|
+
* // evals/support-agent.eval.ts
|
|
11
|
+
* import { evalSuite, llmJudge, exactMatch, regex } from '@rudderjs/ai/eval'
|
|
12
|
+
* import { SupportAgent } from '../app/Agents/SupportAgent.js'
|
|
13
|
+
*
|
|
14
|
+
* export default evalSuite('SupportAgent', {
|
|
15
|
+
* agent: () => new SupportAgent(),
|
|
16
|
+
* cases: [
|
|
17
|
+
* { name: 'password reset', input: 'How do I reset my password?',
|
|
18
|
+
* assert: llmJudge('mentions a password reset link') },
|
|
19
|
+
* { name: 'price', input: 'How much?', assert: exactMatch('$99/month') },
|
|
20
|
+
* { name: 'support email', input: 'Contact?', assert: regex(/support@/) },
|
|
21
|
+
* ],
|
|
22
|
+
* })
|
|
23
|
+
* ```
|
|
24
|
+
*
|
|
25
|
+
* Run programmatically via `runSuite(suite)` from this entry, or via
|
|
26
|
+
* `pnpm rudder ai:eval` once Phase 2 lands.
|
|
27
|
+
*
|
|
28
|
+
* Built-in metrics: `exactMatch`, `regex`, `llmJudge`, `jsonShape`,
|
|
29
|
+
* `semanticMatch`, `tokenCost`. Compose multiple via `compose(...)`.
|
|
30
|
+
* User-defined metrics work today — any `(response, ctx) =>
|
|
31
|
+
* MetricResult` qualifies.
|
|
32
|
+
*/
|
|
33
|
+
import { agent } from '../agent.js';
|
|
34
|
+
import { Output } from '../output.js';
|
|
35
|
+
import { AI } from '../facade.js';
|
|
36
|
+
import { aiObservers } from '../observers.js';
|
|
37
|
+
import { estimateCost } from '../budget/pricing.js';
|
|
38
|
+
import { z } from 'zod';
|
|
39
|
+
export { reportJson } from './json-reporter.js';
|
|
40
|
+
export { stepsFromResponse } from './fixtures.js';
|
|
41
|
+
export { reportHtml } from './html-reporter.js';
|
|
42
|
+
// ─── Suite definition ─────────────────────────────────────
|
|
43
|
+
/**
 * Build an eval suite descriptor from a name and a spec.
 *
 * Checks, in order, that `name` is truthy, that `spec.agent` is a
 * function, and that `spec.cases` is a non-empty array. On success it
 * returns a frozen `{ name, spec }` wrapper suitable for `runSuite` or
 * for default-exporting from an `evals/*.eval.ts` file. The freeze is
 * shallow: `spec` itself is not frozen.
 *
 * This is a plain function rather than a class, so callers never have
 * to remember `new`.
 *
 * @param name Suite name.
 * @param spec Suite spec carrying an `agent` factory and a `cases` array.
 * @returns Frozen `{ name, spec }` object.
 * @throws {Error} When the name, the agent factory, or the cases are missing.
 */
export function evalSuite(name, spec) {
    if (!name) {
        throw new Error('[RudderJS AI] evalSuite() requires a name.');
    }
    const hasAgentFactory = Boolean(spec) && typeof spec.agent === 'function';
    if (!hasAgentFactory) {
        throw new Error('[RudderJS AI] evalSuite() requires { agent: () => Agent, cases: [...] }.');
    }
    const caseList = spec.cases;
    const hasCases = Array.isArray(caseList) && caseList.length > 0;
    if (!hasCases) {
        throw new Error('[RudderJS AI] evalSuite() requires at least one case.');
    }
    return Object.freeze({ name, spec });
}
|
|
64
|
+
// ─── Built-in metrics ─────────────────────────────────────
|
|
65
|
+
/**
 * Metric factory: the returned metric passes only when `response.text`
 * is strictly equal (`===`) to `expected`. On mismatch the `reason`
 * shows both values JSON-encoded.
 */
export function exactMatch(expected) {
    return (response) => {
        const got = response.text;
        return got === expected
            ? { pass: true, score: 1 }
            : {
                pass: false,
                score: 0,
                reason: `expected ${JSON.stringify(expected)}, got ${JSON.stringify(got)}`,
            };
    };
}
|
|
78
|
+
/**
 * Metric factory: the returned metric passes when `pattern` matches
 * `response.text`.
 *
 * `RegExp.prototype.test` is stateful for patterns carrying the `g` or
 * `y` flag (it resumes from `pattern.lastIndex`). Because one metric
 * instance is typically reused across cases, `lastIndex` is reset to 0
 * before every evaluation so the result depends only on the text.
 *
 * @param pattern Regular expression to test against the response text.
 * @returns Metric yielding `{ pass, score }` plus a `reason` on failure
 *   (the first 120 characters of the text, with an ellipsis if truncated).
 */
export function regex(pattern) {
    return (response) => {
        // Neutralise leftover state from a previous call on a global/sticky pattern.
        pattern.lastIndex = 0;
        if (pattern.test(response.text))
            return { pass: true, score: 1 };
        return {
            pass: false,
            score: 0,
            reason: `pattern ${pattern} did not match ${JSON.stringify(response.text.slice(0, 120))}${response.text.length > 120 ? '…' : ''}`,
        };
    };
}
|
|
90
|
+
/**
 * LLM-as-judge: ask a model whether the response satisfies a
 * natural-language criterion. The judge's explanation is returned in
 * `reason` so failures are debuggable.
 *
 * Design: each evaluation builds a one-shot anonymous agent via
 * `agent({...})`, using `opts.model` when given. The expected output
 * shape (`{ pass: boolean, reason: string }`) is described to the judge
 * through `wrapper.toSystemPrompt()` and enforced by `wrapper.parse()`.
 * Any error inside the evaluation (judge call, parse, ...) is caught and
 * reported as `pass: false` with `judge failed: <message>` — a broken
 * judge is not a passing case.
 *
 * Token accounting: the judge's `usage.totalTokens` is recorded on the
 * response *before* the output is parsed, so tokens spent on a judge
 * reply that fails to parse still appear in the cost rollup. A judge
 * response without `usage` contributes 0.
 *
 * Pitfall: the judge model has the same biases as any LLM. Use it for
 * fuzzy "did the answer mention X?" assertions; for exact structural
 * checks prefer `jsonShape` or `regex`.
 *
 * @param criterion Natural-language statement the response must satisfy.
 * @param opts Optional settings; `opts.model` selects the judge model.
 * @returns Async metric `(response, ctx) => MetricResult`.
 */
export function llmJudge(criterion, opts = {}) {
    const wrapper = Output.object({
        schema: z.object({
            pass: z.boolean(),
            reason: z.string(),
        }),
    });
    return async (response, ctx) => {
        try {
            const judge = agent({
                instructions: `${JUDGE_INSTRUCTIONS}\n\n${wrapper.toSystemPrompt()}`,
                ...(opts.model ? { model: opts.model } : {}),
            });
            const prompt = [
                `Criterion: ${criterion}`,
                '',
                `User input: ${JSON.stringify(ctx.input)}`,
                `Agent response: ${JSON.stringify(response.text)}`,
                '',
                'Does the response satisfy the criterion? Return strictly valid JSON.',
            ].join('\n');
            const judgeResponse = await judge.prompt(prompt);
            // Tag the judge's token usage onto the response so the runner
            // can include it in the cost rollup. Done before parsing: the
            // tokens were spent even if the judge's output is malformed.
            attachExtraUsage(response, judgeResponse.usage?.totalTokens ?? 0);
            const parsed = wrapper.parse(judgeResponse.text);
            return {
                pass: parsed.pass,
                score: parsed.pass ? 1 : 0,
                reason: parsed.reason,
            };
        }
        catch (err) {
            return {
                pass: false,
                score: 0,
                reason: `judge failed: ${err instanceof Error ? err.message : String(err)}`,
            };
        }
    };
}
/** System-prompt preamble for the judge agent; the output-format prompt is appended to it in `llmJudge`. */
const JUDGE_INSTRUCTIONS = [
    'You are an evaluator judging whether an agent response satisfies a natural-language criterion.',
    'Be precise: only return pass=true if the criterion is plainly met.',
    'Provide a short reason for your decision (1-2 sentences) so the developer can debug failures.',
].join(' ');
|
|
152
|
+
/**
 * Strict structural assertion: strip Markdown code fences from
 * `response.text`, parse the remainder as JSON, and validate it with
 * `schema.safeParse`.
 *
 * Outcomes:
 *  - text is not valid JSON     → fail, `reason` = `not JSON: <parser message>`
 *  - JSON violates the schema   → fail, `reason` names the path of the
 *    first reported issue (segments joined with `.`, or `<root>` when
 *    the path is empty) and its message
 *  - otherwise                  → `{ pass: true, score: 1 }`
 *
 * @param schema Object exposing zod-style `safeParse(value)`.
 * @returns Synchronous metric `(response) => MetricResult`.
 */
export function jsonShape(schema) {
    const failure = (reason) => ({ pass: false, score: 0, reason });
    return (response) => {
        const candidateText = stripCodeFences(response.text);
        let candidate;
        try {
            candidate = JSON.parse(candidateText);
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : String(err);
            return failure(`not JSON: ${detail}`);
        }
        const outcome = schema.safeParse(candidate);
        if (!outcome.success) {
            // Only the first issue is reported to keep the reason short.
            const issue = outcome.error.issues[0];
            const location = issue?.path.join('.') || '<root>';
            return failure(`schema mismatch at ${location}: ${issue?.message ?? 'unknown error'}`);
        }
        return { pass: true, score: 1 };
    };
}
|
|
188
|
+
/**
 * Embedding-based fuzzy match. Sends `reference` and `response.text`
 * to `AI.embed()` in a single call, computes the cosine similarity of
 * the two returned vectors, and passes when it is >= `threshold`
 * (`opts.threshold`, default `0.85`).
 *
 * The embed call's `usage.totalTokens` is recorded on the response via
 * the same side-channel `llmJudge` uses, so it can be added to the
 * case's token rollup.
 *
 * Failure modes all produce `pass: false`:
 *  - a vector is missing from the result → `embed returned no vectors`
 *  - anything throws (no provider, network, ...) → `embed failed: <message>`
 *
 * @param reference Text the response should be semantically close to.
 * @param opts Optional `{ threshold, model }`.
 * @returns Async metric `(response) => MetricResult`; `score` is the cosine.
 */
export function semanticMatch(reference, opts = {}) {
    const threshold = opts.threshold ?? 0.85;
    return async (response) => {
        try {
            const texts = [reference, response.text];
            const embedOpts = {};
            if (opts.model) {
                embedOpts.model = opts.model;
            }
            const result = await AI.embed(texts, embedOpts);
            const [refVec, respVec] = result.embeddings;
            if (!(refVec && respVec)) {
                return { pass: false, score: 0, reason: 'embed returned no vectors' };
            }
            attachExtraUsage(response, result.usage.totalTokens);
            const score = cosineSimilarity(refVec, respVec);
            const shown = score.toFixed(3);
            if (score >= threshold) {
                return { pass: true, score, reason: `cosine ${shown} >= ${threshold}` };
            }
            // Include a short preview of the reference so the failing case is recognisable.
            const preview = JSON.stringify(reference.slice(0, 80));
            const ellipsis = reference.length > 80 ? '…' : '';
            return {
                pass: false,
                score,
                reason: `cosine ${shown} < ${threshold} (reference: ${preview}${ellipsis})`,
            };
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : String(err);
            return { pass: false, score: 0, reason: `embed failed: ${detail}` };
        }
    };
}
|
|
236
|
+
/**
 * Token budget guard. The returned metric reads
 * `response.usage.totalTokens` and passes when that value is
 * `<= threshold`. Useful for catching prompt-size regressions before
 * they show up on the bill.
 *
 * @param threshold Maximum allowed total tokens (inclusive).
 * @returns Synchronous metric; `reason` states the comparison either way.
 */
export function tokenCost(threshold) {
    return (response) => {
        const spent = response.usage.totalTokens;
        if (spent <= threshold) {
            return { pass: true, score: 1, reason: `${spent} tokens <= ${threshold}` };
        }
        return { pass: false, score: 0, reason: `${spent} tokens > ${threshold}` };
    };
}
|
|
257
|
+
/**
 * Combine several metrics into a single assertion. Metrics are awaited
 * one at a time in argument order; the first result whose `pass` is
 * falsy is returned as-is (later metrics are not run). When every
 * metric passes — or none were supplied — the result is
 * `{ pass: true, score: 1 }`.
 *
 * @example
 * { input: '…',
 *   assert: compose(
 *     jsonShape(SummarySchema),
 *     tokenCost(800),
 *   ),
 * }
 */
export function compose(...metrics) {
    return async (response, ctx) => {
        for (let index = 0; index < metrics.length; index += 1) {
            const metric = metrics[index];
            const verdict = await metric(response, ctx);
            if (!verdict.pass) {
                return verdict;
            }
        }
        return { pass: true, score: 1 };
    };
}
|
|
280
|
+
/**
 * Cosine similarity of two numeric vectors, implemented locally so
 * `eval/` doesn't need to import `memory-embedding`.
 *
 * Returns 0 when the vectors differ in length or when either has zero
 * magnitude; otherwise `dot(a, b) / (|a| * |b|)`.
 */
function cosineSimilarity(a, b) {
    if (a.length !== b.length) {
        return 0;
    }
    let dot = 0;
    let sumSqA = 0;
    let sumSqB = 0;
    for (const [i, x] of a.entries()) {
        const y = b[i];
        dot += x * y;
        sumSqA += x * x;
        sumSqB += y * y;
    }
    if (sumSqA === 0 || sumSqB === 0) {
        return 0;
    }
    return dot / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB));
}
|
|
298
|
+
/**
 * Remove one opening Markdown fence (``` or ```json) and one closing
 * fence from `text`, then trim surrounding whitespace.
 *
 * Both patterns use the `m` flag, so `^` / `$` match at any line
 * boundary, not only at the very start / end of the string. Neither
 * pattern is global, so at most one match of each is removed.
 */
function stripCodeFences(text) {
    const withoutOpeningFence = text.replace(/^```(?:json)?\s*\n?/m, '');
    const withoutClosingFence = withoutOpeningFence.replace(/\n?```\s*$/m, '');
    return withoutClosingFence.trim();
}
|
|
304
|
+
// ─── Runner ───────────────────────────────────────────────
|
|
305
|
+
/**
 * Run every case of a suite serially, in declaration order, and
 * return the aggregated report.
 *
 * For each case:
 *  - a truthy `skip` yields a zero-cost `skipped` result (a string
 *    `skip` value becomes the reason) without running the agent;
 *  - otherwise the case is executed through `runCase`.
 * Every result is appended to the report and announced through
 * `emitEvalCompleted`. Unnamed cases are labelled `case-<index>`.
 *
 * This function has no try/catch of its own; turning agent / assertion
 * errors into `failed` results is delegated to `runCase`.
 *
 * Cases run one at a time, which is safe under any provider rate limit.
 *
 * @param suite Suite produced by `evalSuite`.
 * @returns Report with per-case results, pass/fail/skip counts, wall-clock
 *   duration, summed cost and tokens, plus `metadata` when the spec has it.
 */
export async function runSuite(suite) {
    const startedAt = performance.now();
    const results = [];
    let passed = 0;
    let failed = 0;
    let skipped = 0;
    let totalCost = 0;
    let totalTokens = 0;
    for (const [index, testCase] of suite.spec.cases.entries()) {
        const name = testCase.name ?? `case-${index}`;
        let outcome;
        if (testCase.skip) {
            outcome = {
                name,
                status: 'skipped',
                reason: typeof testCase.skip === 'string' ? testCase.skip : 'skipped',
                duration: 0,
                tokens: 0,
                cost: 0,
                input: testCase.input,
            };
            skipped += 1;
        }
        else {
            outcome = await runCase(suite, testCase, name);
            if (outcome.status === 'passed') {
                passed += 1;
            }
            else if (outcome.status === 'failed') {
                failed += 1;
            }
        }
        results.push(outcome);
        emitEvalCompleted(suite.name, outcome);
        totalCost += outcome.cost;
        totalTokens += outcome.tokens;
    }
    const report = {
        suite: suite.name,
        cases: results,
        passed,
        failed,
        skipped,
        duration: performance.now() - startedAt,
        cost: totalCost,
        tokens: totalTokens,
    };
    if (suite.spec.metadata) {
        report.metadata = suite.spec.metadata;
    }
    return report;
}
|
|
361
|
+
/**
 * Publish an `agent.eval.completed` event for one case result via
 * `aiObservers.emit`.
 *
 * `score` is included only when the metric defines one. `reason` is
 * included only when non-empty, and comes from the result itself for
 * skipped cases or from the metric otherwise.
 */
function emitEvalCompleted(suiteName, result) {
    const { name, status, tokens, cost, duration } = result;
    const event = {
        kind: 'agent.eval.completed',
        suite: suiteName,
        case: name,
        status,
        pass: status === 'passed',
        tokens,
        cost,
        duration,
    };
    if (result.metric?.score !== undefined) {
        event.score = result.metric.score;
    }
    const reason = status === 'skipped' ? result.reason : result.metric?.reason;
    if (reason) {
        event.reason = reason;
    }
    aiObservers.emit(event);
}
|
|
379
|
+
/**
 * Execute a single eval case and turn the outcome into a case result.
 *
 * Steps:
 *  1. Build the agent from the case-level factory (`c.agent`) or the
 *     suite-level one, then prompt it with `c.input`, bounded by the
 *     case/suite `timeout` when one is set.
 *  2. Run `c.assert(response, { input, caseName })`.
 *  3. Add auxiliary tokens recorded by metrics (judge / embeddings) to
 *     the agent's own usage and estimate the cost.
 *
 * Error handling — nothing here is allowed to abort the whole suite:
 *  - a throwing agent factory, a rejected prompt, or a timeout yields a
 *    `failed` result with the error message as the reason (tokens and
 *    cost reported as 0);
 *  - a throwing assertion yields `failed` with `assert threw: <message>`;
 *  - an assertion returning something that is not an object yields
 *    `failed` with an explanatory reason instead of crashing on `.pass`.
 *
 * `duration` is measured from just before the agent is constructed.
 * Missing fields on `response.usage` are treated as 0.
 *
 * @param suite Suite the case belongs to.
 * @param c Case definition (`input`, `assert`, optional `agent` / `timeout`).
 * @param name Resolved display name of the case.
 * @returns Case result (`name`, `status`, `metric`, `duration`, `tokens`,
 *   `cost`, `input`, and `responseText` when the agent answered).
 */
async function runCase(suite, c, name) {
    const timeout = c.timeout ?? suite.spec.timeout;
    const start = performance.now();
    let ag;
    let response;
    try {
        // Factory invocation sits inside the try so a broken factory
        // fails this case only rather than rejecting runSuite().
        const factory = c.agent ?? suite.spec.agent;
        ag = factory();
        response = await runWithTimeout(() => ag.prompt(c.input), timeout);
    }
    catch (err) {
        return {
            name,
            status: 'failed',
            metric: { pass: false, reason: err instanceof Error ? err.message : String(err) },
            duration: performance.now() - start,
            tokens: 0,
            cost: 0,
            input: c.input,
        };
    }
    let metric;
    try {
        metric = await c.assert(response, { input: c.input, caseName: name });
    }
    catch (err) {
        metric = { pass: false, reason: `assert threw: ${err instanceof Error ? err.message : String(err)}` };
    }
    if (metric === null || typeof metric !== 'object') {
        // A metric that forgot to return a result must not pass — or crash the run.
        const got = metric === null ? 'null' : typeof metric;
        metric = { pass: false, reason: `assert returned ${got} instead of a MetricResult` };
    }
    const extraTokens = consumeExtraUsage(response);
    const { promptTokens = 0, completionTokens = 0, totalTokens = 0 } = response.usage ?? {};
    const model = modelStringFor(ag);
    return {
        name,
        status: metric.pass ? 'passed' : 'failed',
        metric,
        duration: performance.now() - start,
        tokens: totalTokens + extraTokens,
        cost: estimateCost(model, promptTokens, completionTokens)
            + estimateCost(model, 0, extraTokens), // judge/embed cost approximated as completion-side, priced at the agent's model
        input: c.input,
        responseText: response.text,
    };
}
|
|
420
|
+
/**
 * Model identifier used for cost estimation: whatever `ag.model()`
 * returns, or the placeholder `'unknown/unknown'` when that is
 * null/undefined.
 *
 * The registry default is deliberately not resolved here (that would
 * require importing the registry). Agents that don't declare a model
 * are therefore priced under the placeholder.
 * NOTE(review): the original comment says the unknown-model rate is
 * zero in Phase 1 — confirm against `estimateCost`.
 */
function modelStringFor(ag) {
    const declaredModel = ag.model();
    return declaredModel ?? 'unknown/unknown';
}
|
|
429
|
+
/**
 * Run `fn` and reject with `timeout after <ms>ms` if it has not
 * settled within `ms` milliseconds.
 *
 * - A falsy or non-positive `ms` disables the timeout; `fn()` is
 *   returned directly.
 * - `fn` is invoked synchronously. A synchronous throw becomes a
 *   rejection and a non-promise return value is resolved as-is.
 * - The timer is always cleared in `finally`, whatever the outcome, so
 *   no pending timer is left keeping the event loop alive.
 * - The timeout only stops waiting; it does not cancel the work started
 *   by `fn`.
 *
 * @param fn Zero-argument function producing the value (usually a promise).
 * @param ms Timeout in milliseconds; falsy / <= 0 means "no timeout".
 * @returns Promise settling with `fn`'s outcome or the timeout error.
 */
async function runWithTimeout(fn, ms) {
    if (!ms || ms <= 0)
        return fn();
    let timer;
    const deadline = new Promise((_resolve, reject) => {
        timer = setTimeout(() => reject(new Error(`timeout after ${ms}ms`)), ms);
    });
    // The async wrapper calls fn() immediately and converts a sync throw into a rejection.
    const work = (async () => fn())();
    try {
        return await Promise.race([work, deadline]);
    }
    finally {
        clearTimeout(timer);
    }
}
|
|
437
|
+
// ─── Pricing ──────────────────────────────────────────────
|
|
438
|
+
// Pricing catalog + estimator live in `../budget/pricing.ts` and are
|
|
439
|
+
// re-exported here so `import { estimateCost } from '@rudderjs/ai/eval'`
|
|
440
|
+
// continues to work. (Note: estimateCost is also imported at the top
|
|
441
|
+
// of this file for use by `runSuite`.)
|
|
442
|
+
export { estimateCost, ModelPricing } from '../budget/pricing.js';
|
|
443
|
+
// ─── Console reporter ─────────────────────────────────────
|
|
444
|
+
/**
 * Default reporter: renders a report as plain text lines and writes
 * them, one `log` call per line, to a `console`-like sink.
 *
 * Layout: a summary header, one row per case with a ✓ / ✗ / ○ glyph
 * (passed / failed / anything else), the padded case name and either
 * the skip reason or duration, cost and tokens. Failed cases are
 * followed by their metric reason, one output line per reason line.
 * A blank line and two totals lines close the report.
 *
 * All lines are built before anything is written to the sink.
 *
 * Returns the report unchanged so calls compose:
 * `reportConsole(await runSuite(suite))`.
 *
 * @param report Report produced by `runSuite`.
 * @param sink Object with a `log(line)` method; defaults to `console`.
 * @returns The same `report` object.
 */
export function reportConsole(report, sink = console) {
    const output = [
        `${report.suite} (${report.cases.length} cases, ${formatMs(report.duration)}, ${formatCost(report.cost)})`,
    ];
    for (const entry of report.cases) {
        let glyph = '○';
        if (entry.status === 'passed') {
            glyph = '✓';
        }
        else if (entry.status === 'failed') {
            glyph = '✗';
        }
        let meta;
        if (entry.status === 'skipped') {
            meta = `skip: ${entry.reason ?? 'skipped'}`;
        }
        else {
            meta = `${formatMs(entry.duration)} ${formatCost(entry.cost)} tokens: ${entry.tokens}`;
        }
        output.push(` ${glyph} ${padName(entry.name)} ${meta}`);
        if (entry.status === 'failed' && entry.metric?.reason) {
            // Reason goes on its own line(s) so long messages don't break row alignment.
            output.push(...entry.metric.reason.split('\n').map((reasonLine) => ` ${reasonLine}`));
        }
    }
    const skippedSuffix = report.skipped > 0 ? `, ${report.skipped} skipped` : '';
    output.push('');
    output.push(` ${report.passed} passed, ${report.failed} failed${skippedSuffix}`);
    output.push(` total: ${formatCost(report.cost)} • cumulative tokens: ${report.tokens}`);
    output.forEach((line) => {
        sink.log(line);
    });
    return report;
}
|
|
476
|
+
/** Right-pad a case name with spaces to the 28-character name column; longer names are returned unchanged. */
function padName(s) {
    const columnWidth = 28;
    return s.padEnd(columnWidth, ' ');
}
|
|
479
|
+
/**
 * Format a duration in milliseconds for display: whole milliseconds
 * (`12ms`) below one second, seconds with one decimal (`1.5s`) from
 * there on.
 *
 * The unit is chosen from the *rounded* millisecond value, so inputs in
 * [999.5, 1000) render as `1.0s` rather than the out-of-range `1000ms`.
 *
 * @param ms Duration in milliseconds.
 * @returns Human-readable duration string.
 */
function formatMs(ms) {
    const wholeMs = Math.round(ms);
    if (wholeMs < 1000)
        return `${wholeMs}ms`;
    return `${(ms / 1000).toFixed(1)}s`;
}
|
|
484
|
+
/**
 * Format a cost figure with three decimals and a `$` prefix.
 * Exactly zero prints `$0.000`; any other value below 0.001
 * (negative values included) prints `<$0.001`.
 *
 * NOTE(review): the parameter is named `cents` but is rendered with a
 * dollar sign and no conversion — confirm the unit `estimateCost` returns.
 */
function formatCost(cents) {
    if (cents === 0) {
        return '$0.000';
    }
    return cents < 0.001 ? '<$0.001' : `$${cents.toFixed(3)}`;
}
|
|
491
|
+
// ─── Internal: extra-usage side-channel ───────────────────
|
|
492
|
+
//
|
|
493
|
+
// `Metric` doesn't surface token usage in its return type — the
|
|
494
|
+
// signature is `(response, ctx) => MetricResult`. To roll auxiliary
|
|
495
|
+
// token costs (llmJudge's judge model, semanticMatch's embeddings)
|
|
496
|
+
// into the case's cost report, the metrics stamp usage onto a
|
|
497
|
+
// Symbol-keyed slot on the response object and the runner consumes
|
|
498
|
+
// it. Internal-only; never exported.
|
|
499
|
+
/** Symbol-keyed slot on a response object where metrics accumulate auxiliary token usage. */
const EXTRA_USAGE_KEY = Symbol.for('rudderjs.ai.eval.extraUsage');
/**
 * Add `tokens` to the auxiliary-usage counter stored on `response`.
 *
 * Values that are not finite numbers (`undefined`, `NaN`, ...) or are
 * not positive are ignored. A provider that omits usage therefore
 * cannot turn the case's — and the whole suite's — token total into `NaN`.
 *
 * @param response Response object used as the carrier (mutated).
 * @param tokens Number of auxiliary tokens to add.
 */
function attachExtraUsage(response, tokens) {
    if (!Number.isFinite(tokens) || tokens <= 0)
        return;
    const carrier = response;
    carrier[EXTRA_USAGE_KEY] = (carrier[EXTRA_USAGE_KEY] ?? 0) + tokens;
}
/**
 * Read the auxiliary-usage counter from `response` and remove it, so a
 * second call returns 0.
 *
 * @param response Response object used as the carrier (mutated).
 * @returns Accumulated auxiliary tokens, or 0 when none were recorded.
 */
function consumeExtraUsage(response) {
    const carrier = response;
    const tokens = carrier[EXTRA_USAGE_KEY] ?? 0;
    delete carrier[EXTRA_USAGE_KEY];
    return tokens;
}
|
|
510
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,aAAa,CAAA;AAGnC,OAAO,EAAE,MAAM,EAAE,MAAM,cAAc,CAAA;AACrC,OAAO,EAAE,EAAE,EAAE,MAAM,cAAc,CAAA;AACjC,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAC7C,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AACnD,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAA;AAEvB,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAE/C,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAA;AAEjD,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAsI/C,6DAA6D;AAE7D;;;;;;;;;GASG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,IAAmB;IACzD,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,4CAA4C,CAAC,CAAA;IACxE,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,UAAU,EAAE,CAAC;QAC9C,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAA;IAC7F,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1D,MAAM,IAAI,KAAK,CAAC,uDAAuD,CAAC,CAAA;IAC1E,CAAC;IACD,OAAO,MAAM,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAA;AACtC,CAAC;AAED,6DAA6D;AAE7D,qDAAqD;AACrD,MAAM,UAAU,UAAU,CAAC,QAAgB;IACzC,OAAO,CAAC,QAAQ,EAAgB,EAAE;QAChC,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAA;QAC5B,IAAI,MAAM,KAAK,QAAQ;YAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;QACxD,OAAO;YACL,IAAI,EAAI,KAAK;YACb,KAAK,EAAG,CAAC;YACT,MAAM,EAAE,YAAY,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,SAAS,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,EAAE;SAC9E,CAAA;IACH,CAAC,CAAA;AACH,CAAC;AAED,6CAA6C;AAC7C,MAAM,UAAU,KAAK,CAAC,OAAe;IACnC,OAAO,CAAC,QAAQ,EAAgB,EAAE;QAChC,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;QAChE,OAAO;YACL,IAAI,EAAI,KAAK;YACb,KAAK,EAAG,CAAC;YACT,MAAM,EAAE,WAAW,OAAO,kBAAkB,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;SAClI,CAAA;IACH,CAAC,CAAA;AACH,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,UAAU,QAAQ,CAAC,SAAiB,EAAE,OAA2B,EAAE;IACvE,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;QAC5B,MAAM,E
AAE,CAAC,CAAC,MAAM,CAAC;YACf,IAAI,EAAI,CAAC,CAAC,OAAO,EAAE;YACnB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;SACnB,CAAC;KACH,CAAC,CAAA;IAEF,OAAO,KAAK,EAAE,QAAQ,EAAE,GAAG,EAAyB,EAAE;QACpD,IAAI,CAAC;YACH,MAAM,KAAK,GAAG,KAAK,CAAC;gBAClB,YAAY,EAAE,GAAG,kBAAkB,OAAO,OAAO,CAAC,cAAc,EAAE,EAAE;gBACpE,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC7C,CAAC,CAAA;YAEF,MAAM,MAAM,GAAG;gBACb,cAAc,SAAS,EAAE;gBACzB,EAAE;gBACF,eAAe,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE;gBAC1C,mBAAmB,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE;gBAClD,EAAE;gBACF,sEAAsE;aACvE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;YAEZ,MAAM,aAAa,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;YAChD,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,IAAI,CAAC,CAAA;YAEhD,8DAA8D;YAC9D,4DAA4D;YAC5D,6DAA6D;YAC7D,gBAAgB,CAAC,QAAQ,EAAE,aAAa,CAAC,KAAK,CAAC,WAAW,CAAC,CAAA;YAE3D,OAAO;gBACL,IAAI,EAAI,MAAM,CAAC,IAAI;gBACnB,KAAK,EAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC3B,MAAM,EAAE,MAAM,CAAC,MAAM;aACtB,CAAA;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO;gBACL,IAAI,EAAI,KAAK;gBACb,KAAK,EAAG,CAAC;gBACT,MAAM,EAAE,iBAAiB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;aAC5E,CAAA;QACH,CAAC;IACH,CAAC,CAAA;AACH,CAAC;AAED,MAAM,kBAAkB,GAAG;IACzB,gGAAgG;IAChG,oEAAoE;IACpE,+FAA+F;CAChG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;AAEX;;;;;;;;;GASG;AACH,MAAM,UAAU,SAAS,CAAI,MAAoB;IAC/C,OAAO,CAAC,QAAQ,EAAgB,EAAE;QAChC,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAA;QAC/C,IAAI,MAAe,CAAA;QACnB,IAAI,CAAC;YACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAA;QAC/B,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO;gBACL,IAAI,EAAI,KAAK;gBACb,KAAK,EAAG,CAAC;gBACT,MAAM,EAAE,aAAa,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;aACxE,CAAA;QACH,CAAC;QACD,MAAM,MAAM,GAAG,MAAM,CAAC,SAAS,CAAC,MAAM,CAAC,CAAA;QACvC,IAAI,MAAM,CAAC,OAAO;YAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;QACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA;QACpC,M
AAM,IAAI,GAAI,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,QAAQ,CAAA;QAC/C,OAAO;YACL,IAAI,EAAI,KAAK;YACb,KAAK,EAAG,CAAC;YACT,MAAM,EAAE,sBAAsB,IAAI,KAAK,KAAK,EAAE,OAAO,IAAI,eAAe,EAAE;SAC3E,CAAA;IACH,CAAC,CAAA;AACH,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,UAAU,aAAa,CAC3B,SAAiB,EACjB,OAA+C,EAAE;IAEjD,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAA;IACxC,OAAO,KAAK,EAAE,QAAQ,EAAyB,EAAE;QAC/C,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,CAAC,SAAS,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAA;YACzC,MAAM,SAAS,GAAuB,EAAE,CAAA;YACxC,IAAI,IAAI,CAAC,KAAK;gBAAE,SAAS,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAA;YAC5C,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,KAAK,CAAC,MAAM,EAAE,SAAS,CAAC,CAAA;YAChD,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,UAAU,CAAA;YAC3C,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;gBACxB,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,2BAA2B,EAAE,CAAA;YACvE,CAAC;YACD,gBAAgB,CAAC,QAAQ,EAAE,MAAM,CAAC,KAAK,CAAC,WAAW,CAAC,CAAA;YAEpD,MAAM,KAAK,GAAG,gBAAgB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;YAC/C,MAAM,IAAI,GAAI,KAAK,IAAI,SAAS,CAAA;YAChC,OAAO;gBACL,IAAI;gBACJ,KAAK;gBACL,MAAM,EAAE,IAAI;oBACV,CAAC,CAAC,UAAU,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,SAAS,EAAE;oBAC9C,CAAC,CAAC,UAAU,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,SAAS,gBAAgB,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG;aAC1I,CAAA;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO;gBACL,IAAI,EAAI,KAAK;gBACb,KAAK,EAAG,CAAC;gBACT,MAAM,EAAE,iBAAiB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;aAC5E,CAAA;QACH,CAAC;IACH,CAAC,CAAA;AACH,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,SAAS,CAAC,SAAiB;IACzC,OAAO,CAAC,QAAQ,EAAgB,EAAE;QAChC,MAAM,IAAI,GAAG,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAA;QACvC,MAAM,IAAI,GAAG,IAAI,IAAI,SAAS,CAAA;QAC9B,OAAO;YACL,IAAI;YACJ,KAAK,EAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACpB,MAAM,EAAE,IAAI;gBACV,CAAC,CAAC,GAAG,IAAI,cAAc,SAAS,EAAE;gBAClC,CAAC,CAAC,GAAG,IAAI,aAAa,SAAS,EAAE;SACpC,CAAA;IACH,CAAC,CAAA;AACH,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU
,OAAO,CAAC,GAAG,OAAiB;IAC1C,OAAO,KAAK,EAAE,QAAQ,EAAE,GAAG,EAAyB,EAAE;QACpD,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACxB,MAAM,MAAM,GAAG,MAAM,CAAC,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAA;YACrC,IAAI,CAAC,MAAM,CAAC,IAAI;gBAAE,OAAO,MAAM,CAAA;QACjC,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;IACjC,CAAC,CAAA;AACH,CAAC;AAED,mHAAmH;AACnH,SAAS,gBAAgB,CAAC,CAAW,EAAE,CAAW;IAChD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM;QAAE,OAAO,CAAC,CAAA;IACnC,IAAI,GAAG,GAAI,CAAC,CAAA;IACZ,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAA;QAChB,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAA;QAChB,GAAG,IAAK,EAAE,GAAG,EAAE,CAAA;QACf,IAAI,IAAI,EAAE,GAAG,EAAE,CAAA;QACf,IAAI,IAAI,EAAE,GAAG,EAAE,CAAA;IACjB,CAAC;IACD,IAAI,IAAI,KAAK,CAAC,IAAI,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IACtC,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAA;AAClD,CAAC;AAED,SAAS,eAAe,CAAC,IAAY;IACnC,OAAO,IAAI;SACR,OAAO,CAAC,sBAAsB,EAAE,EAAE,CAAC;SACnC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,IAAI,EAAE,CAAA;AACX,CAAC;AAED,6DAA6D;AAE7D;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,KAAgB;IAC7C,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAA;IAC/B,MAAM,KAAK,GAAiB,EAAE,CAAA;IAC9B,IAAI,MAAM,GAAI,CAAC,CAAA;IACf,IAAI,MAAM,GAAI,CAAC,CAAA;IACf,IAAI,OAAO,GAAG,CAAC,CAAA;IAEf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACjD,MAAM,CAAC,GAAM,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAA;QACjC,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,IAAI,QAAQ,CAAC,EAAE,CAAA;QAElC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;YACX,MAAM,UAAU,GAAe;gBAC7B,IAAI;gBACJ,MAAM,EAAI,SAAS;gBACnB,MAAM,EAAI,OAAO,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS;gBACzD,QAAQ,EAAE,CAAC;gBACX,MAAM,EAAI,CAAC;gBACX,IAAI,EAAM,CAAC;gBACX,KAAK,EAAK,CAAC,CAAC,KAAK;aAClB,CAAA;YACD,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;YACtB,iBAAiB,CAAC,KAAK,CAAC,IAAI,EAAE,UAAU,CAAC,CAAA;
YACzC,OAAO,EAAE,CAAA;YACT,SAAQ;QACV,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC,EAAE,IAAI,CAAC,CAAA;QAC5C,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAClB,iBAAiB,CAAC,KAAK,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;QACrC,IAAI,MAAM,CAAC,MAAM,KAAK,QAAQ;YAAE,MAAM,EAAE,CAAA;aACnC,IAAI,MAAM,CAAC,MAAM,KAAK,QAAQ;YAAE,MAAM,EAAE,CAAA;IAC/C,CAAC;IAED,MAAM,QAAQ,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAA;IAE1C,MAAM,MAAM,GAAgB;QAC1B,KAAK,EAAE,KAAK,CAAC,IAAI;QACjB,KAAK;QACL,MAAM;QACN,MAAM;QACN,OAAO;QACP,QAAQ;QACR,IAAI,EAAI,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,EAAI,CAAC,CAAC;QACnD,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;KACpD,CAAA;IACD,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ;QAAE,MAAM,CAAC,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAA;IAC9D,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,iBAAiB,CAAC,SAAiB,EAAE,MAAkB;IAC9D,MAAM,KAAK,GAA2C;QACpD,IAAI,EAAM,sBAAsB;QAChC,KAAK,EAAK,SAAS;QACnB,IAAI,EAAM,MAAM,CAAC,IAAI;QACrB,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,IAAI,EAAM,MAAM,CAAC,MAAM,KAAK,QAAQ;QACpC,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,IAAI,EAAM,MAAM,CAAC,IAAI;QACrB,QAAQ,EAAE,MAAM,CAAC,QAAQ;KAC1B,CAAA;IACD,IAAI,MAAM,CAAC,MAAM,EAAE,KAAK,KAAK,SAAS;QAAE,KAAK,CAAC,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,KAAK,CAAA;IACzE,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAA;IAClF,IAAI,MAAM;QAAE,KAAK,CAAC,MAAM,GAAG,MAAM,CAAA;IACjC,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;AACzB,CAAC;AAED,KAAK,UAAU,OAAO,CAAC,KAAgB,EAAE,CAAW,EAAE,IAAY;IAChE,MAAM,OAAO,GAAG,CAAC,CAAC,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAA;IAC3C,MAAM,EAAE,GAAQ,OAAO,EAAE,CAAA;IACzB,MAAM,OAAO,GAAG,CAAC,CAAC,OAAO,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAA;IAE/C,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAA;IAC/B,IAAI,QAAuB,CAAA;IAC3B,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,cAAc,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,CAAA;IACpE,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO;YACL,IAAI;YACJ,MAAM,EAAI,QAAQ;YAClB,MAAM,EAAI,EAAE,IAAI,EAAE
,KAAK,EAAE,MAAM,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;YACnF,QAAQ,EAAE,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK;YACnC,MAAM,EAAI,CAAC;YACX,IAAI,EAAM,CAAC;YACX,KAAK,EAAK,CAAC,CAAC,KAAK;SAClB,CAAA;IACH,CAAC;IAED,IAAI,MAAoB,CAAA;IACxB,IAAI,CAAC;QACH,MAAM,GAAG,MAAM,CAAC,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAA;IACvE,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,EAAE,iBAAiB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,EAAE,CAAA;IACvG,CAAC;IAED,MAAM,WAAW,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;IAC/C,MAAM,WAAW,GAAG,QAAQ,CAAC,KAAK,CAAC,WAAW,GAAG,WAAW,CAAA;IAE5D,OAAO;QACL,IAAI;QACJ,MAAM,EAAQ,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ;QAC/C,MAAM;QACN,QAAQ,EAAM,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK;QACvC,MAAM,EAAQ,WAAW;QACzB,IAAI,EAAU,YAAY,CAAC,cAAc,CAAC,EAAE,CAAC,EAAE,QAAQ,CAAC,KAAK,CAAC,YAAY,EAAE,QAAQ,CAAC,KAAK,CAAC,gBAAgB,CAAC;cAC9F,YAAY,CAAC,cAAc,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,WAAW,CAAC,EAAG,mDAAmD;QACpH,KAAK,EAAS,CAAC,CAAC,KAAK;QACrB,YAAY,EAAE,QAAQ,CAAC,IAAI;KAC5B,CAAA;AACH,CAAC;AAED,SAAS,cAAc,CAAC,EAAS;IAC/B,kEAAkE;IAClE,gEAAgE;IAChE,6DAA6D;IAC7D,kEAAkE;IAClE,kEAAkE;IAClE,8CAA8C;IAC9C,OAAO,EAAE,CAAC,KAAK,EAAE,IAAI,iBAAiB,CAAA;AACxC,CAAC;AAED,KAAK,UAAU,cAAc,CAAI,EAAoB,EAAE,EAAsB;IAC3E,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC;QAAE,OAAO,EAAE,EAAE,CAAA;IAC/B,OAAO,IAAI,OAAO,CAAI,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACxC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;QAC9E,EAAE,EAAE,CAAC,IAAI,CACP,CAAC,CAAC,EAAE,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA,CAAC,CAAC,EACxC,CAAC,CAAC,EAAE,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA,CAAC,CAAC,CACxC,CAAA;IACH,CAAC,CAAC,CAAA;AACJ,CAAC;AAED,6DAA6D;AAE7D,qEAAqE;AACrE,yEAAyE;AACzE,qEAAqE;AACrE,uCAAuC;AACvC,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AAGjE,6DAA6D;AAE7D;;;;;;;GAOG;AACH,MAAM,UAAU,aAAa,C
AAC,MAAmB,EAAE,OAAqC,OAAO;IAC7F,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,MAAM,OAAO,GAAG,GAAG,MAAM,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK,CAAC,MAAM,WAAW,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAA;IAC1H,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAEnB,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAG,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAA;QAC7E,MAAM,IAAI,GAAI,CAAC,CAAC,MAAM,KAAK,SAAS;YAClC,CAAC,CAAC,SAAS,CAAC,CAAC,MAAM,IAAI,SAAS,EAAE;YAClC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,MAAM,EAAE,CAAA;QAC3E,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,IAAI,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC,CAAA;QACnD,IAAI,CAAC,CAAC,MAAM,KAAK,QAAQ,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC;YAC9C,wEAAwE;YACxE,KAAK,MAAM,IAAI,IAAI,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/C,KAAK,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC,CAAA;YAC7B,CAAC;QACH,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACd,KAAK,CAAC,IAAI,CAAC,KAAK,MAAM,CAAC,MAAM,YAAY,MAAM,CAAC,MAAM,UAAU,MAAM,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;IAC1H,KAAK,CAAC,IAAI,CAAC,YAAY,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,2BAA2B,MAAM,CAAC,MAAM,EAAE,CAAC,CAAA;IAEzF,KAAK,MAAM,IAAI,IAAI,KAAK;QAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;IACxC,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,OAAO,CAAC,CAAS;IACxB,OAAO,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;AACrB,CAAC;AAED,SAAS,QAAQ,CAAC,EAAU;IAC1B,IAAI,EAAE,GAAG,IAAI;QAAE,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,IAAI,CAAA;IAC3C,OAAO,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAA;AACrC,CAAC;AAED,SAAS,UAAU,CAAC,KAAa;IAC/B,IAAI,KAAK,KAAK,CAAC;QAAE,OAAO,QAAQ,CAAA;IAChC,IAAI,KAAK,GAAG,KAAK;QAAE,OAAO,SAAS,CAAA;IACnC,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAA;AAC/B,CAAC;AAED,6DAA6D;AAC7D,EAAE;AACF,gEAAgE;AAChE,oEAAoE;AACpE,mEAAmE;AACnE,8DAA8D;AAC9D,mEAAmE;AACnE,qCAAqC;AAErC,MAAM,eA
Ae,GAAG,MAAM,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAA;AAMjE,SAAS,gBAAgB,CAAC,QAAuB,EAAE,MAAc;IAC/D,MAAM,OAAO,GAAG,QAA6C,CAAA;IAC7D,OAAO,CAAC,eAAe,CAAC,GAAG,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,GAAG,MAAM,CAAA;AACrE,CAAC;AAED,SAAS,iBAAiB,CAAC,QAAuB;IAChD,MAAM,OAAO,GAAG,QAA6C,CAAA;IAC7D,MAAM,MAAM,GAAI,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,CAAA;IAC7C,OAAO,OAAO,CAAC,eAAe,CAAC,CAAA;IAC/B,OAAO,MAAM,CAAA;AACf,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { SuiteReport } from './index.js';
|
|
2
|
+
/**
|
|
3
|
+
* Single case row in the JSON output. `metric.pass`/`metric.score`/
|
|
4
|
+
* `metric.reason` are flattened up so consumers don't have to reach
|
|
5
|
+
* through nested objects in CI scripts.
|
|
6
|
+
*/
|
|
7
|
+
export interface SuiteJsonCase {
|
|
8
|
+
name: string;
|
|
9
|
+
status: 'passed' | 'failed' | 'skipped';
|
|
10
|
+
pass: boolean;
|
|
11
|
+
score?: number;
|
|
12
|
+
reason?: string;
|
|
13
|
+
duration: number;
|
|
14
|
+
tokens: number;
|
|
15
|
+
cost: number;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Machine-readable suite output emitted by `pnpm rudder ai:eval --json`.
|
|
19
|
+
* Stable shape — bumping fields here is a minor (additive) bump for
|
|
20
|
+
* `@rudderjs/ai`. Removing or renaming fields is a major.
|
|
21
|
+
*/
|
|
22
|
+
export interface SuiteJson {
|
|
23
|
+
suite: string;
|
|
24
|
+
passed: number;
|
|
25
|
+
failed: number;
|
|
26
|
+
skipped: number;
|
|
27
|
+
duration: number;
|
|
28
|
+
cost: number;
|
|
29
|
+
tokens: number;
|
|
30
|
+
cases: SuiteJsonCase[];
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* JSON reporter — flattens a `SuiteReport` for CI consumption.
|
|
34
|
+
*
|
|
35
|
+
* Mirrors the `command_run` MCP tool envelope shape so the boost
|
|
36
|
+
* agent surface and the eval CLI feel like one family.
|
|
37
|
+
*
|
|
38
|
+
* @example
|
|
39
|
+
* const report = await runSuite(suite)
|
|
40
|
+
* process.stdout.write(JSON.stringify(reportJson(report)))
|
|
41
|
+
*/
|
|
42
|
+
export declare function reportJson(report: SuiteReport): SuiteJson;
|
|
43
|
+
//# sourceMappingURL=json-reporter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json-reporter.d.ts","sourceRoot":"","sources":["../../src/eval/json-reporter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAA;AAE7C;;;;GAIG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAO,MAAM,CAAA;IACjB,MAAM,EAAK,QAAQ,GAAG,QAAQ,GAAG,SAAS,CAAA;IAC1C,IAAI,EAAO,OAAO,CAAA;IAClB,KAAK,CAAC,EAAK,MAAM,CAAA;IACjB,MAAM,CAAC,EAAI,MAAM,CAAA;IACjB,QAAQ,EAAG,MAAM,CAAA;IACjB,MAAM,EAAK,MAAM,CAAA;IACjB,IAAI,EAAO,MAAM,CAAA;CAClB;AAED;;;;GAIG;AACH,MAAM,WAAW,SAAS;IACxB,KAAK,EAAK,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,OAAO,EAAG,MAAM,CAAA;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,IAAI,EAAM,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,KAAK,EAAK,aAAa,EAAE,CAAA;CAC1B;AAED;;;;;;;;;GASG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,WAAW,GAAG,SAAS,CAWzD"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON reporter — flattens a `SuiteReport` for CI consumption.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors the `command_run` MCP tool envelope shape so the boost
|
|
5
|
+
* agent surface and the eval CLI feel like one family.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* const report = await runSuite(suite)
|
|
9
|
+
* process.stdout.write(JSON.stringify(reportJson(report)))
|
|
10
|
+
*/
|
|
11
|
+
export function reportJson(report) {
|
|
12
|
+
return {
|
|
13
|
+
suite: report.suite,
|
|
14
|
+
passed: report.passed,
|
|
15
|
+
failed: report.failed,
|
|
16
|
+
skipped: report.skipped,
|
|
17
|
+
duration: report.duration,
|
|
18
|
+
cost: report.cost,
|
|
19
|
+
tokens: report.tokens,
|
|
20
|
+
cases: report.cases.map(toJsonCase),
|
|
21
|
+
};
|
|
22
|
+
}
|
|
23
|
+
function toJsonCase(c) {
|
|
24
|
+
const out = {
|
|
25
|
+
name: c.name,
|
|
26
|
+
status: c.status,
|
|
27
|
+
pass: c.status === 'passed',
|
|
28
|
+
duration: c.duration,
|
|
29
|
+
tokens: c.tokens,
|
|
30
|
+
cost: c.cost,
|
|
31
|
+
};
|
|
32
|
+
if (c.metric?.score !== undefined)
|
|
33
|
+
out.score = c.metric.score;
|
|
34
|
+
if (c.status === 'skipped' && c.reason)
|
|
35
|
+
out.reason = c.reason;
|
|
36
|
+
else if (c.metric?.reason)
|
|
37
|
+
out.reason = c.metric.reason;
|
|
38
|
+
return out;
|
|
39
|
+
}
|
|
40
|
+
//# sourceMappingURL=json-reporter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json-reporter.js","sourceRoot":"","sources":["../../src/eval/json-reporter.ts"],"names":[],"mappings":"AAkCA;;;;;;;;;GASG;AACH,MAAM,UAAU,UAAU,CAAC,MAAmB;IAC5C,OAAO;QACL,KAAK,EAAK,MAAM,CAAC,KAAK;QACtB,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,OAAO,EAAG,MAAM,CAAC,OAAO;QACxB,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,IAAI,EAAM,MAAM,CAAC,IAAI;QACrB,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,KAAK,EAAK,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,CAAC;KACvC,CAAA;AACH,CAAC;AAED,SAAS,UAAU,CAAC,CAA+B;IACjD,MAAM,GAAG,GAAkB;QACzB,IAAI,EAAM,CAAC,CAAC,IAAI;QAChB,MAAM,EAAI,CAAC,CAAC,MAAM;QAClB,IAAI,EAAM,CAAC,CAAC,MAAM,KAAK,QAAQ;QAC/B,QAAQ,EAAE,CAAC,CAAC,QAAQ;QACpB,MAAM,EAAI,CAAC,CAAC,MAAM;QAClB,IAAI,EAAM,CAAC,CAAC,IAAI;KACjB,CAAA;IACD,IAAI,CAAC,CAAC,MAAM,EAAE,KAAK,KAAK,SAAS;QAAE,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAA;IAC7D,IAAI,CAAC,CAAC,MAAM,KAAK,SAAS,IAAI,CAAC,CAAC,MAAM;QAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAA;SACxD,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM;QAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,MAAM,CAAA;IACvD,OAAO,GAAG,CAAA;AACZ,CAAC"}
|