@rudderjs/ai 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. package/README.md +399 -0
  2. package/boost/guidelines.md +60 -0
  3. package/dist/agent.d.ts +35 -1
  4. package/dist/agent.d.ts.map +1 -1
  5. package/dist/agent.js +118 -16
  6. package/dist/agent.js.map +1 -1
  7. package/dist/budget/pricing.d.ts +124 -0
  8. package/dist/budget/pricing.d.ts.map +1 -0
  9. package/dist/budget/pricing.js +175 -0
  10. package/dist/budget/pricing.js.map +1 -0
  11. package/dist/budget/storage.d.ts +104 -0
  12. package/dist/budget/storage.d.ts.map +1 -0
  13. package/dist/budget/storage.js +0 -0
  14. package/dist/budget/storage.js.map +1 -0
  15. package/dist/budget/with-budget.d.ts +119 -0
  16. package/dist/budget/with-budget.d.ts.map +1 -0
  17. package/dist/budget/with-budget.js +175 -0
  18. package/dist/budget/with-budget.js.map +1 -0
  19. package/dist/budget-orm/index.d.ts +96 -0
  20. package/dist/budget-orm/index.d.ts.map +1 -0
  21. package/dist/budget-orm/index.js +177 -0
  22. package/dist/budget-orm/index.js.map +1 -0
  23. package/dist/commands/ai-eval.d.ts +93 -0
  24. package/dist/commands/ai-eval.d.ts.map +1 -0
  25. package/dist/commands/ai-eval.js +378 -0
  26. package/dist/commands/ai-eval.js.map +1 -0
  27. package/dist/computer-use/actions.d.ts +214 -0
  28. package/dist/computer-use/actions.d.ts.map +1 -0
  29. package/dist/computer-use/actions.js +48 -0
  30. package/dist/computer-use/actions.js.map +1 -0
  31. package/dist/computer-use/errors.d.ts +57 -0
  32. package/dist/computer-use/errors.d.ts.map +1 -0
  33. package/dist/computer-use/errors.js +76 -0
  34. package/dist/computer-use/errors.js.map +1 -0
  35. package/dist/computer-use/index.d.ts +53 -0
  36. package/dist/computer-use/index.d.ts.map +1 -0
  37. package/dist/computer-use/index.js +51 -0
  38. package/dist/computer-use/index.js.map +1 -0
  39. package/dist/computer-use/playwright.d.ts +76 -0
  40. package/dist/computer-use/playwright.d.ts.map +1 -0
  41. package/dist/computer-use/playwright.js +270 -0
  42. package/dist/computer-use/playwright.js.map +1 -0
  43. package/dist/computer-use/tool.d.ts +154 -0
  44. package/dist/computer-use/tool.d.ts.map +1 -0
  45. package/dist/computer-use/tool.js +210 -0
  46. package/dist/computer-use/tool.js.map +1 -0
  47. package/dist/eval/fixtures.d.ts +65 -0
  48. package/dist/eval/fixtures.d.ts.map +1 -0
  49. package/dist/eval/fixtures.js +110 -0
  50. package/dist/eval/fixtures.js.map +1 -0
  51. package/dist/eval/html-reporter.d.ts +25 -0
  52. package/dist/eval/html-reporter.d.ts.map +1 -0
  53. package/dist/eval/html-reporter.js +209 -0
  54. package/dist/eval/html-reporter.js.map +1 -0
  55. package/dist/eval/index.d.ts +271 -0
  56. package/dist/eval/index.d.ts.map +1 -0
  57. package/dist/eval/index.js +510 -0
  58. package/dist/eval/index.js.map +1 -0
  59. package/dist/eval/json-reporter.d.ts +43 -0
  60. package/dist/eval/json-reporter.d.ts.map +1 -0
  61. package/dist/eval/json-reporter.js +40 -0
  62. package/dist/eval/json-reporter.js.map +1 -0
  63. package/dist/fake.d.ts +36 -1
  64. package/dist/fake.d.ts.map +1 -1
  65. package/dist/fake.js +49 -2
  66. package/dist/fake.js.map +1 -1
  67. package/dist/file-search.d.ts +168 -0
  68. package/dist/file-search.d.ts.map +1 -0
  69. package/dist/file-search.js +158 -0
  70. package/dist/file-search.js.map +1 -0
  71. package/dist/index.d.ts +22 -2
  72. package/dist/index.d.ts.map +1 -1
  73. package/dist/index.js +17 -1
  74. package/dist/index.js.map +1 -1
  75. package/dist/mcp/client-tools.d.ts +39 -0
  76. package/dist/mcp/client-tools.d.ts.map +1 -0
  77. package/dist/mcp/client-tools.js +147 -0
  78. package/dist/mcp/client-tools.js.map +1 -0
  79. package/dist/mcp/index.d.ts +16 -0
  80. package/dist/mcp/index.d.ts.map +1 -0
  81. package/dist/mcp/index.js +15 -0
  82. package/dist/mcp/index.js.map +1 -0
  83. package/dist/mcp/server-from-agent.d.ts +24 -0
  84. package/dist/mcp/server-from-agent.d.ts.map +1 -0
  85. package/dist/mcp/server-from-agent.js +113 -0
  86. package/dist/mcp/server-from-agent.js.map +1 -0
  87. package/dist/mcp/types.d.ts +64 -0
  88. package/dist/mcp/types.d.ts.map +1 -0
  89. package/dist/mcp/types.js +6 -0
  90. package/dist/mcp/types.js.map +1 -0
  91. package/dist/memory-embedding/index.d.ts +121 -0
  92. package/dist/memory-embedding/index.d.ts.map +1 -0
  93. package/dist/memory-embedding/index.js +229 -0
  94. package/dist/memory-embedding/index.js.map +1 -0
  95. package/dist/memory-extract.d.ts +60 -0
  96. package/dist/memory-extract.d.ts.map +1 -0
  97. package/dist/memory-extract.js +163 -0
  98. package/dist/memory-extract.js.map +1 -0
  99. package/dist/memory-inject.d.ts +39 -0
  100. package/dist/memory-inject.d.ts.map +1 -0
  101. package/dist/memory-inject.js +135 -0
  102. package/dist/memory-inject.js.map +1 -0
  103. package/dist/memory-orm/index.d.ts +118 -0
  104. package/dist/memory-orm/index.d.ts.map +1 -0
  105. package/dist/memory-orm/index.js +187 -0
  106. package/dist/memory-orm/index.js.map +1 -0
  107. package/dist/memory.d.ts +55 -0
  108. package/dist/memory.d.ts.map +1 -0
  109. package/dist/memory.js +132 -0
  110. package/dist/memory.js.map +1 -0
  111. package/dist/observers.d.ts +22 -0
  112. package/dist/observers.d.ts.map +1 -1
  113. package/dist/observers.js.map +1 -1
  114. package/dist/provider-tools.d.ts +15 -1
  115. package/dist/provider-tools.d.ts.map +1 -1
  116. package/dist/provider-tools.js +21 -1
  117. package/dist/provider-tools.js.map +1 -1
  118. package/dist/providers/anthropic.d.ts.map +1 -1
  119. package/dist/providers/anthropic.js +61 -6
  120. package/dist/providers/anthropic.js.map +1 -1
  121. package/dist/providers/elevenlabs.d.ts +98 -0
  122. package/dist/providers/elevenlabs.d.ts.map +1 -0
  123. package/dist/providers/elevenlabs.js +229 -0
  124. package/dist/providers/elevenlabs.js.map +1 -0
  125. package/dist/providers/google.d.ts +83 -1
  126. package/dist/providers/google.d.ts.map +1 -1
  127. package/dist/providers/google.js +491 -8
  128. package/dist/providers/google.js.map +1 -1
  129. package/dist/providers/openai.d.ts +3 -1
  130. package/dist/providers/openai.d.ts.map +1 -1
  131. package/dist/providers/openai.js +209 -5
  132. package/dist/providers/openai.js.map +1 -1
  133. package/dist/providers/voyage.d.ts +91 -0
  134. package/dist/providers/voyage.d.ts.map +1 -0
  135. package/dist/providers/voyage.js +166 -0
  136. package/dist/providers/voyage.js.map +1 -0
  137. package/dist/queue-job.d.ts +69 -4
  138. package/dist/queue-job.d.ts.map +1 -1
  139. package/dist/queue-job.js +114 -11
  140. package/dist/queue-job.js.map +1 -1
  141. package/dist/registry.d.ts +3 -1
  142. package/dist/registry.d.ts.map +1 -1
  143. package/dist/registry.js +10 -0
  144. package/dist/registry.js.map +1 -1
  145. package/dist/server/provider.d.ts.map +1 -1
  146. package/dist/server/provider.js +23 -1
  147. package/dist/server/provider.js.map +1 -1
  148. package/dist/similarity-search.d.ts +163 -0
  149. package/dist/similarity-search.d.ts.map +1 -0
  150. package/dist/similarity-search.js +147 -0
  151. package/dist/similarity-search.js.map +1 -0
  152. package/dist/tool.d.ts.map +1 -1
  153. package/dist/tool.js +13 -4
  154. package/dist/tool.js.map +1 -1
  155. package/dist/types.d.ts +246 -0
  156. package/dist/types.d.ts.map +1 -1
  157. package/dist/vector-stores/index.d.ts +96 -0
  158. package/dist/vector-stores/index.d.ts.map +1 -0
  159. package/dist/vector-stores/index.js +153 -0
  160. package/dist/vector-stores/index.js.map +1 -0
  161. package/package.json +41 -3
@@ -0,0 +1,510 @@
1
+ /**
2
+ * `@rudderjs/ai/eval` — built-in eval framework for #A5 Phase 1.
3
+ *
4
+ * Define a suite of input cases + assertions, run them against any
5
+ * `Agent`, get a console report with pass/fail + cost + tokens. Same
6
+ * `Agent` instances as your app code — one source of truth.
7
+ *
8
+ * @example
9
+ * ```ts
10
+ * // evals/support-agent.eval.ts
11
+ * import { evalSuite, llmJudge, exactMatch, regex } from '@rudderjs/ai/eval'
12
+ * import { SupportAgent } from '../app/Agents/SupportAgent.js'
13
+ *
14
+ * export default evalSuite('SupportAgent', {
15
+ * agent: () => new SupportAgent(),
16
+ * cases: [
17
+ * { name: 'password reset', input: 'How do I reset my password?',
18
+ * assert: llmJudge('mentions a password reset link') },
19
+ * { name: 'price', input: 'How much?', assert: exactMatch('$99/month') },
20
+ * { name: 'support email', input: 'Contact?', assert: regex(/support@/) },
21
+ * ],
22
+ * })
23
+ * ```
24
+ *
25
+ * Run programmatically via `runSuite(suite)` from this entry, or via
26
+ * `pnpm rudder ai:eval` once Phase 2 lands.
27
+ *
28
+ * Built-in metrics: `exactMatch`, `regex`, `llmJudge`, `jsonShape`,
29
+ * `semanticMatch`, `tokenCost`. Compose multiple via `compose(...)`.
30
+ * User-defined metrics work today — any `(response, ctx) =>
31
+ * MetricResult` qualifies.
32
+ */
33
+ import { agent } from '../agent.js';
34
+ import { Output } from '../output.js';
35
+ import { AI } from '../facade.js';
36
+ import { aiObservers } from '../observers.js';
37
+ import { estimateCost } from '../budget/pricing.js';
38
+ import { z } from 'zod';
39
+ export { reportJson } from './json-reporter.js';
40
+ export { stepsFromResponse } from './fixtures.js';
41
+ export { reportHtml } from './html-reporter.js';
42
+ // ─── Suite definition ─────────────────────────────────────
43
/**
 * Build an eval-suite descriptor.
 *
 * Validates the arguments, then returns `Object.freeze({ name, spec })`.
 * Only the wrapper object is frozen; `spec` is stored by reference and is
 * not copied or deep-frozen.
 *
 * @param {string} name - Suite name; must be truthy.
 * @param {object} spec - Must carry an `agent` function and a non-empty
 *   `cases` array.
 * @returns {Readonly<{ name: string, spec: object }>} The frozen descriptor.
 * @throws {Error} When `name` is falsy, `spec.agent` is not a function, or
 *   `spec.cases` is not a non-empty array (checked in that order).
 */
export function evalSuite(name, spec) {
    if (!name) {
        throw new Error('[RudderJS AI] evalSuite() requires a name.');
    }
    const agentFactory = spec?.agent;
    if (typeof agentFactory !== 'function') {
        throw new Error('[RudderJS AI] evalSuite() requires { agent: () => Agent, cases: [...] }.');
    }
    const { cases } = spec;
    const hasCases = Array.isArray(cases) && cases.length > 0;
    if (!hasCases) {
        throw new Error('[RudderJS AI] evalSuite() requires at least one case.');
    }
    const suite = { name, spec };
    return Object.freeze(suite);
}
64
+ // ─── Built-in metrics ─────────────────────────────────────
65
/**
 * Metric factory: strict (`===`) equality between `response.text` and
 * `expected`.
 *
 * @param {string} expected - The exact text the response must equal.
 * @returns {(response: { text: string }) => { pass: boolean, score: number, reason?: string }}
 *   A metric that scores 1 on a match; otherwise 0 with a `reason` quoting
 *   both strings via `JSON.stringify`.
 */
export function exactMatch(expected) {
    return (response) => {
        const actual = response.text;
        if (actual !== expected) {
            const reason = `expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`;
            return { pass: false, score: 0, reason };
        }
        return { pass: true, score: 1 };
    };
}
78
/**
 * Metric factory: passes when `pattern` matches `response.text`.
 *
 * `RegExp.prototype.test` is stateful for patterns carrying the `g` or `y`
 * flag: it resumes from `pattern.lastIndex`. Without a reset, a metric
 * reused across cases would alternate between pass and fail on identical
 * text. `lastIndex` is therefore reset before and after every test, so each
 * evaluation starts from the beginning of the string.
 *
 * @param {RegExp} pattern - Pattern to test against the response text.
 * @returns {(response: { text: string }) => { pass: boolean, score: number, reason?: string }}
 *   A metric that scores 1 on a match; otherwise 0 with a `reason` showing
 *   the pattern and the first 120 characters of the text.
 */
export function regex(pattern) {
    return (response) => {
        pattern.lastIndex = 0;
        const matched = pattern.test(response.text);
        // Leave the caller's regex in a clean state for any other user.
        pattern.lastIndex = 0;
        if (matched) {
            return { pass: true, score: 1 };
        }
        const preview = JSON.stringify(response.text.slice(0, 120));
        const ellipsis = response.text.length > 120 ? '…' : '';
        return {
            pass: false,
            score: 0,
            reason: `pattern ${pattern} did not match ${preview}${ellipsis}`,
        };
    };
}
90
/**
 * LLM-as-judge metric factory.
 *
 * The returned metric builds a fresh judge via `agent(...)` on every call,
 * sends it the criterion plus the JSON-quoted user input and agent response,
 * and parses the reply through an `Output.object` wrapper whose schema is
 * `{ pass: boolean, reason: string }`.
 *
 * The judge's `usage.totalTokens` is stamped onto `response` through
 * `attachExtraUsage` so the runner can add it to the case's token rollup.
 *
 * Any error thrown inside the metric (agent construction, prompting,
 * parsing, usage lookup) is caught and reported as a failing result whose
 * `reason` starts with `judge failed:` — a broken judge never passes a case.
 *
 * @param {string} criterion - Natural-language condition to evaluate.
 * @param {{ model?: string }} [opts] - When `opts.model` is truthy it is
 *   forwarded to `agent(...)` as `model`.
 * @returns {(response: object, ctx: object) => Promise<{ pass: boolean, score: number, reason: string }>}
 */
export function llmJudge(criterion, opts = {}) {
    const verdictSchema = z.object({
        pass: z.boolean(),
        reason: z.string(),
    });
    const wrapper = Output.object({ schema: verdictSchema });
    return async (response, ctx) => {
        try {
            const judgeSpec = {
                instructions: `${JUDGE_INSTRUCTIONS}\n\n${wrapper.toSystemPrompt()}`,
            };
            if (opts.model) {
                judgeSpec.model = opts.model;
            }
            const judge = agent(judgeSpec);
            const promptLines = [
                `Criterion: ${criterion}`,
                '',
                `User input: ${JSON.stringify(ctx.input)}`,
                `Agent response: ${JSON.stringify(response.text)}`,
                '',
                'Does the response satisfy the criterion? Return strictly valid JSON.',
            ];
            const judgeResponse = await judge.prompt(promptLines.join('\n'));
            const verdict = wrapper.parse(judgeResponse.text);
            // Side-channel: the metric return type has no usage field, so the
            // judge's tokens ride along on the response object instead.
            attachExtraUsage(response, judgeResponse.usage.totalTokens);
            const passed = verdict.pass;
            return {
                pass: passed,
                score: passed ? 1 : 0,
                reason: verdict.reason,
            };
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : String(err);
            return {
                pass: false,
                score: 0,
                reason: `judge failed: ${detail}`,
            };
        }
    };
}
147
// Instruction preamble for the judge agent created by `llmJudge`:
// three sentences separated by single spaces.
const JUDGE_INSTRUCTIONS = 'You are an evaluator judging whether an agent response satisfies a natural-language criterion.'
    + ' Be precise: only return pass=true if the criterion is plainly met.'
    + ' Provide a short reason for your decision (1-2 sentences) so the developer can debug failures.';
152
/**
 * Structural metric factory: the response text must be JSON that satisfies
 * `schema`.
 *
 * Steps performed by the returned metric:
 *   1. Pass `response.text` through `stripCodeFences`.
 *   2. `JSON.parse` the result; a parse error yields a failing result whose
 *      `reason` starts with `not JSON:`.
 *   3. Run `schema.safeParse`; on failure the first issue's dotted path
 *      (or `<root>` when the path is empty) and message go into `reason`.
 *
 * @param {{ safeParse: Function }} schema - A zod-style schema exposing
 *   `safeParse`.
 * @returns {(response: { text: string }) => { pass: boolean, score: number, reason?: string }}
 */
export function jsonShape(schema) {
    const failWith = (reason) => ({ pass: false, score: 0, reason });
    return (response) => {
        const stripped = stripCodeFences(response.text);
        let candidate;
        try {
            candidate = JSON.parse(stripped);
        }
        catch (err) {
            return failWith(`not JSON: ${err instanceof Error ? err.message : String(err)}`);
        }
        const outcome = schema.safeParse(candidate);
        if (!outcome.success) {
            const firstIssue = outcome.error.issues[0];
            const where = firstIssue?.path.join('.') || '<root>';
            return failWith(`schema mismatch at ${where}: ${firstIssue?.message ?? 'unknown error'}`);
        }
        return { pass: true, score: 1 };
    };
}
188
/**
 * Embedding-similarity metric factory.
 *
 * The returned metric embeds `reference` and `response.text` in a single
 * `AI.embed()` call, computes their cosine similarity, and passes when the
 * similarity is `>= threshold` (`opts.threshold`, default `0.85`). The
 * similarity itself is returned as `score`.
 *
 * The embed call's `usage.totalTokens` is stamped onto `response` through
 * `attachExtraUsage` once both vectors are known to be present.
 *
 * A missing vector yields a failing result (`embed returned no vectors`);
 * any thrown error is caught and reported as a failing result whose `reason`
 * starts with `embed failed:`.
 *
 * @param {string} reference - Text the response should be semantically close to.
 * @param {{ threshold?: number, model?: string }} [opts] - `model`, when
 *   truthy, is forwarded to `AI.embed` as an option.
 * @returns {(response: { text: string }) => Promise<{ pass: boolean, score: number, reason: string }>}
 */
export function semanticMatch(reference, opts = {}) {
    const threshold = opts.threshold ?? 0.85;
    return async (response) => {
        try {
            const embedOpts = opts.model ? { model: opts.model } : {};
            const result = await AI.embed([reference, response.text], embedOpts);
            const [refVec, respVec] = result.embeddings;
            const missingVector = !refVec || !respVec;
            if (missingVector) {
                return { pass: false, score: 0, reason: 'embed returned no vectors' };
            }
            attachExtraUsage(response, result.usage.totalTokens);
            const score = cosineSimilarity(refVec, respVec);
            if (score >= threshold) {
                return {
                    pass: true,
                    score,
                    reason: `cosine ${score.toFixed(3)} >= ${threshold}`,
                };
            }
            // Failure message previews the first 80 characters of the reference.
            const preview = JSON.stringify(reference.slice(0, 80));
            const ellipsis = reference.length > 80 ? '…' : '';
            return {
                pass: false,
                score,
                reason: `cosine ${score.toFixed(3)} < ${threshold} (reference: ${preview}${ellipsis})`,
            };
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : String(err);
            return {
                pass: false,
                score: 0,
                reason: `embed failed: ${detail}`,
            };
        }
    };
}
236
/**
 * Token-budget metric factory.
 *
 * The returned metric reads `response.usage.totalTokens` and passes when it
 * is `<= threshold`. The `reason` always states the observed count and the
 * threshold, for both passing and failing results.
 *
 * @param {number} threshold - Maximum allowed total tokens (inclusive).
 * @returns {(response: { usage: { totalTokens: number } }) => { pass: boolean, score: number, reason: string }}
 */
export function tokenCost(threshold) {
    return (response) => {
        const { totalTokens } = response.usage;
        if (totalTokens <= threshold) {
            return {
                pass: true,
                score: 1,
                reason: `${totalTokens} tokens <= ${threshold}`,
            };
        }
        return {
            pass: false,
            score: 0,
            reason: `${totalTokens} tokens > ${threshold}`,
        };
    };
}
257
/**
 * Combine several metrics into a single assertion.
 *
 * Metrics are awaited one at a time, in argument order. The first result
 * whose `pass` is falsy is returned as-is and later metrics are not run.
 * When every metric passes (including when no metrics are given), the
 * result is `{ pass: true, score: 1 }`.
 *
 * @example
 * { input: '…',
 *   assert: compose(
 *     jsonShape(SummarySchema),
 *     tokenCost(800),
 *   ),
 * }
 *
 * @param {...Function} metrics - Metrics of the form `(response, ctx) => result`,
 *   sync or async.
 * @returns {(response: object, ctx: object) => Promise<object>}
 */
export function compose(...metrics) {
    return async (response, ctx) => {
        for (let index = 0; index < metrics.length; index += 1) {
            const verdict = await metrics[index](response, ctx);
            if (!verdict.pass) {
                return verdict;
            }
        }
        return { pass: true, score: 1 };
    };
}
280
/**
 * Cosine similarity of two numeric vectors.
 *
 * Defined locally so this module does not need to import a similarity
 * helper from elsewhere.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} `dot(a, b) / (|a| * |b|)`, or `0` when the lengths
 *   differ or either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
    const dims = a.length;
    if (dims !== b.length) {
        return 0;
    }
    let dotProduct = 0;
    let sumSquaresA = 0;
    let sumSquaresB = 0;
    for (let k = 0; k < dims; k += 1) {
        const x = a[k];
        const y = b[k];
        dotProduct += x * y;
        sumSquaresA += x * x;
        sumSquaresB += y * y;
    }
    // A zero-magnitude vector has no direction; report no similarity
    // instead of dividing by zero.
    const degenerate = sumSquaresA === 0 || sumSquaresB === 0;
    return degenerate ? 0 : dotProduct / (Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB));
}
298
/**
 * Remove a wrapping Markdown code fence (three backticks, optionally
 * followed by `json`) from `text`.
 *
 * Only a fence at the very start and a fence at the very end are removed.
 * The text is trimmed first and the patterns are anchored to the whole
 * string (no multiline flag): with multiline anchors, a response carrying
 * leading whitespace had its closing fence consumed by the opening-fence
 * pattern, leaving the opening fence in place. The `json` language tag is
 * matched case-insensitively.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The text without the outer fence, trimmed.
 */
function stripCodeFences(text) {
    return text
        .trim()
        .replace(/^```(?:json)?\s*\n?/i, '')
        .replace(/\n?```\s*$/, '')
        .trim();
}
304
+ // ─── Runner ───────────────────────────────────────────────
305
/**
 * Execute every case of a suite, one after another, in declaration order.
 *
 * For each case:
 *   - a truthy `skip` records a `skipped` result (using the `skip` string as
 *     the reason when it is a string) without running the agent;
 *   - otherwise the case is run through `runCase`.
 * Every result — skipped or not — is also forwarded to `emitEvalCompleted`.
 *
 * Cases without a `name` are labelled `case-<index>`.
 *
 * @param {{ name: string, spec: object }} suite - Suite as built by `evalSuite`.
 * @returns {Promise<object>} Report with `suite`, `cases`, the
 *   `passed` / `failed` / `skipped` counters, wall-clock `duration` (ms), and
 *   `cost` / `tokens` summed over all cases. `metadata` is copied from
 *   `suite.spec.metadata` when that is truthy.
 */
export async function runSuite(suite) {
    const startedAt = performance.now();
    const results = [];
    const tally = { passed: 0, failed: 0, skipped: 0 };
    for (const [index, testCase] of suite.spec.cases.entries()) {
        const name = testCase.name ?? `case-${index}`;
        if (testCase.skip) {
            const skipResult = {
                name,
                status: 'skipped',
                reason: typeof testCase.skip === 'string' ? testCase.skip : 'skipped',
                duration: 0,
                tokens: 0,
                cost: 0,
                input: testCase.input,
            };
            results.push(skipResult);
            emitEvalCompleted(suite.name, skipResult);
            tally.skipped += 1;
            continue;
        }
        const result = await runCase(suite, testCase, name);
        results.push(result);
        emitEvalCompleted(suite.name, result);
        switch (result.status) {
            case 'passed':
                tally.passed += 1;
                break;
            case 'failed':
                tally.failed += 1;
                break;
            default:
                break;
        }
    }
    const report = {
        suite: suite.name,
        cases: results,
        passed: tally.passed,
        failed: tally.failed,
        skipped: tally.skipped,
        duration: performance.now() - startedAt,
        cost: results.reduce((total, entry) => total + entry.cost, 0),
        tokens: results.reduce((total, entry) => total + entry.tokens, 0),
    };
    if (suite.spec.metadata) {
        report.metadata = suite.spec.metadata;
    }
    return report;
}
361
/**
 * Publish one case result to `aiObservers` as an `agent.eval.completed`
 * event.
 *
 * `score` is included only when the result's metric defines one. `reason`
 * is taken from the result itself for skipped cases and from the metric
 * otherwise, and is included only when truthy.
 *
 * @param {string} suiteName - Name of the suite the case belongs to.
 * @param {object} result - Case result as produced by the runner.
 * @returns {void}
 */
function emitEvalCompleted(suiteName, result) {
    const { status, metric } = result;
    const event = {
        kind: 'agent.eval.completed',
        suite: suiteName,
        case: result.name,
        status,
        pass: status === 'passed',
        tokens: result.tokens,
        cost: result.cost,
        duration: result.duration,
    };
    const score = metric?.score;
    if (score !== undefined) {
        event.score = score;
    }
    const reason = status === 'skipped' ? result.reason : metric?.reason;
    if (reason) {
        event.reason = reason;
    }
    aiObservers.emit(event);
}
379
/**
 * Run a single eval case and turn the outcome into a case result.
 *
 * The agent comes from the case's own `agent` factory when present,
 * otherwise from the suite's. Building the agent happens inside the same
 * `try` as prompting it, so a factory that throws produces a `failed`
 * result rather than an exception escaping the runner.
 *
 * After a successful prompt, `c.assert` is awaited with the response and
 * `{ input, caseName }`; an assertion that throws becomes a failing metric
 * whose reason starts with `assert threw:`.
 *
 * Tokens stamped onto the response by metrics (see `consumeExtraUsage`) are
 * added to `response.usage.totalTokens`. Their cost is estimated by passing
 * them to `estimateCost` as completion tokens, using the agent's own model
 * string.
 *
 * @param {{ spec: object }} suite - Suite whose `spec` supplies the default
 *   agent factory and timeout.
 * @param {object} c - Case with `input`, `assert`, and optional `agent` /
 *   `timeout`.
 * @param {string} name - Display name for the case.
 * @returns {Promise<object>} Case result with `name`, `status`, `metric`,
 *   `duration` (ms), `tokens`, `cost`, `input`, and — when the agent
 *   responded — `responseText`.
 */
async function runCase(suite, c, name) {
    const start = performance.now();
    let ag;
    let response;
    try {
        const factory = c.agent ?? suite.spec.agent;
        ag = factory();
        const timeout = c.timeout ?? suite.spec.timeout;
        response = await runWithTimeout(() => ag.prompt(c.input), timeout);
    }
    catch (err) {
        return {
            name,
            status: 'failed',
            metric: { pass: false, reason: err instanceof Error ? err.message : String(err) },
            duration: performance.now() - start,
            tokens: 0,
            cost: 0,
            input: c.input,
        };
    }
    let metric;
    try {
        metric = await c.assert(response, { input: c.input, caseName: name });
    }
    catch (err) {
        metric = { pass: false, reason: `assert threw: ${err instanceof Error ? err.message : String(err)}` };
    }
    const extraTokens = consumeExtraUsage(response);
    const totalTokens = response.usage.totalTokens + extraTokens;
    // Resolve the model string once; it prices both the agent's own usage
    // and the metric-side (judge / embed) tokens.
    const model = modelStringFor(ag);
    return {
        name,
        status: metric.pass ? 'passed' : 'failed',
        metric,
        duration: performance.now() - start,
        tokens: totalTokens,
        cost: estimateCost(model, response.usage.promptTokens, response.usage.completionTokens)
            + estimateCost(model, 0, extraTokens), // judge/embed cost approximated as completion-side
        input: c.input,
        responseText: response.text,
    };
}
420
/**
 * Model identifier used for cost estimation.
 *
 * Returns whatever `ag.model()` yields, falling back to the placeholder
 * `'unknown/unknown'` when that is `null` or `undefined`. No registry
 * default is looked up here.
 *
 * NOTE(review): how `estimateCost` prices the placeholder is decided in the
 * pricing module, not here — presumably as zero; confirm there.
 *
 * @param {{ model: () => (string | undefined) }} ag - Agent instance.
 * @returns {string} The declared model string or `'unknown/unknown'`.
 */
function modelStringFor(ag) {
    const declared = ag.model();
    if (declared === null || declared === undefined) {
        return 'unknown/unknown';
    }
    return declared;
}
429
/**
 * Run `fn`, rejecting if it has not settled within `ms` milliseconds.
 *
 * When `ms` is falsy or not positive, `fn()` is returned directly with no
 * timer. Otherwise `fn()` is raced against a timer that rejects with
 * `timeout after <ms>ms`.
 *
 * The timer is cleared in `finally`, so it never outlives the call —
 * including when `fn` throws synchronously or returns a plain value, both
 * of which would otherwise leave an armed timer holding the event loop
 * open until it fires.
 *
 * Note that the underlying work is not cancelled on timeout; only the
 * returned promise rejects.
 *
 * @param {() => (Promise<*> | *)} fn - Work to run.
 * @param {number} [ms] - Timeout in milliseconds.
 * @returns {Promise<*>} Resolves or rejects with `fn`'s outcome, or rejects
 *   with the timeout error.
 */
async function runWithTimeout(fn, ms) {
    if (!ms || ms <= 0)
        return fn();
    let timer;
    const deadline = new Promise((_resolve, reject) => {
        timer = setTimeout(() => reject(new Error(`timeout after ${ms}ms`)), ms);
    });
    try {
        return await Promise.race([fn(), deadline]);
    }
    finally {
        clearTimeout(timer);
    }
}
437
+ // ─── Pricing ──────────────────────────────────────────────
438
+ // Pricing catalog + estimator live in `../budget/pricing.ts` and are
439
+ // re-exported here so `import { estimateCost } from '@rudderjs/ai/eval'`
440
+ // continues to work. (Note: estimateCost is also imported at the top
441
+ // of this file for use by `runSuite`.)
442
+ export { estimateCost, ModelPricing } from '../budget/pricing.js';
443
+ // ─── Console reporter ─────────────────────────────────────
444
/**
 * Plain-text reporter.
 *
 * Builds the whole report as a list of lines, then writes them one by one
 * through `sink.log`. Nothing is written until every line has been built.
 * The output contains:
 *   - a summary line (suite name, case count, duration, cost);
 *   - one line per case with a status glyph (`✓` passed, `✗` failed,
 *     `○` anything else), the padded case name, and either the skip reason
 *     or duration / cost / tokens;
 *   - for failed cases with a metric reason, that reason — one output line
 *     per line of the reason;
 *   - a blank line followed by pass / fail / skip counts and totals.
 *
 * @param {object} report - Report as returned by `runSuite`.
 * @param {{ log: Function }} [sink=console] - Destination exposing `log`.
 * @returns {object} The same `report`, so calls can be chained.
 */
export function reportConsole(report, sink = console) {
    const output = [];
    output.push(`${report.suite} (${report.cases.length} cases, ${formatMs(report.duration)}, ${formatCost(report.cost)})`);
    for (const entry of report.cases) {
        let glyph = '○';
        if (entry.status === 'passed') {
            glyph = '✓';
        }
        else if (entry.status === 'failed') {
            glyph = '✗';
        }
        let meta;
        if (entry.status === 'skipped') {
            meta = `skip: ${entry.reason ?? 'skipped'}`;
        }
        else {
            meta = `${formatMs(entry.duration)} ${formatCost(entry.cost)} tokens: ${entry.tokens}`;
        }
        output.push(` ${glyph} ${padName(entry.name)} ${meta}`);
        if (entry.status === 'failed' && entry.metric?.reason) {
            // Each line of a multi-line reason gets its own output line so
            // long messages do not disturb the case row.
            entry.metric.reason.split('\n').forEach((reasonLine) => {
                output.push(` ${reasonLine}`);
            });
        }
    }
    const skippedNote = report.skipped > 0 ? `, ${report.skipped} skipped` : '';
    output.push('');
    output.push(` ${report.passed} passed, ${report.failed} failed${skippedNote}`);
    output.push(` total: ${formatCost(report.cost)} • cumulative tokens: ${report.tokens}`);
    for (const text of output) {
        sink.log(text);
    }
    return report;
}
476
/**
 * Right-pad a case name with spaces to 28 characters so the columns of the
 * console report line up. Longer names are returned unchanged.
 *
 * @param {string} s - Case name.
 * @returns {string} The padded name.
 */
function padName(s) {
    const columnWidth = 28;
    return s.padEnd(columnWidth, ' ');
}
479
/**
 * Human-readable duration.
 *
 * Values below 1000 are shown as whole milliseconds (`123ms`); everything
 * else is shown as seconds with one decimal place (`1.5s`).
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} The formatted duration.
 */
function formatMs(ms) {
    const isSubSecond = ms < 1000;
    return isSubSecond
        ? `${Math.round(ms)}ms`
        : `${(ms / 1000).toFixed(1)}s`;
}
484
/**
 * Format a cost value for the console report.
 *
 * Exactly zero renders as `$0.000`; any other value below `0.001` renders
 * as `<$0.001`; everything else is shown with three decimals after a `$`.
 *
 * NOTE(review): the parameter is named `cents` but is printed behind a `$`
 * sign with no conversion — confirm the unit `estimateCost` returns.
 *
 * @param {number} cents - Cost value as accumulated by the runner.
 * @returns {string} The formatted cost.
 */
function formatCost(cents) {
    if (cents === 0) {
        return '$0.000';
    }
    return cents < 0.001 ? '<$0.001' : `$${cents.toFixed(3)}`;
}
491
+ // ─── Internal: extra-usage side-channel ───────────────────
492
+ //
493
+ // `Metric` doesn't surface token usage in its return type — the
494
+ // signature is `(response, ctx) => MetricResult`. To roll auxiliary
495
+ // token costs (llmJudge's judge model, semanticMatch's embeddings)
496
+ // into the case's cost report, the metrics stamp usage onto a
497
+ // Symbol-keyed slot on the response object and the runner consumes
498
+ // it. Internal-only; never exported.
499
// Symbol-keyed slot on the response object that carries metric-side token
// usage (judge model, embeddings) from a metric to the runner.
const EXTRA_USAGE_KEY = Symbol.for('rudderjs.ai.eval.extraUsage');
/**
 * Add `tokens` to the extra-usage counter stored on `response`.
 *
 * Non-finite counts (`undefined`, `NaN`, `Infinity` — e.g. from a provider
 * that does not report usage) are ignored: adding them would turn the
 * accumulated total, and every sum derived from it, into `NaN`.
 *
 * @param {object} response - Agent response object used as the carrier.
 * @param {number} tokens - Token count to add.
 * @returns {void}
 */
function attachExtraUsage(response, tokens) {
    if (!Number.isFinite(tokens)) {
        return;
    }
    const carrier = response;
    carrier[EXTRA_USAGE_KEY] = (carrier[EXTRA_USAGE_KEY] ?? 0) + tokens;
}
/**
 * Read the extra-usage counter from `response` and remove it, so a second
 * read yields `0`.
 *
 * @param {object} response - Agent response object used as the carrier.
 * @returns {number} Accumulated extra tokens, or `0` when none were attached.
 */
function consumeExtraUsage(response) {
    const carrier = response;
    const tokens = carrier[EXTRA_USAGE_KEY] ?? 0;
    delete carrier[EXTRA_USAGE_KEY];
    return tokens;
}
510
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,aAAa,CAAA;AAGnC,OAAO,EAAE,MAAM,EAAE,MAAM,cAAc,CAAA;AACrC,OAAO,EAAE,EAAE,EAAE,MAAM,cAAc,CAAA;AACjC,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAC7C,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AACnD,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAA;AAEvB,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAE/C,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAA;AAEjD,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAsI/C,6DAA6D;AAE7D;;;;;;;;;GASG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,IAAmB;IACzD,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,4CAA4C,CAAC,CAAA;IACxE,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,UAAU,EAAE,CAAC;QAC9C,MAAM,IAAI,KAAK,CAAC,0EAA0E,CAAC,CAAA;IAC7F,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1D,MAAM,IAAI,KAAK,CAAC,uDAAuD,CAAC,CAAA;IAC1E,CAAC;IACD,OAAO,MAAM,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAA;AACtC,CAAC;AAED,6DAA6D;AAE7D,qDAAqD;AACrD,MAAM,UAAU,UAAU,CAAC,QAAgB;IACzC,OAAO,CAAC,QAAQ,EAAgB,EAAE;QAChC,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAA;QAC5B,IAAI,MAAM,KAAK,QAAQ;YAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;QACxD,OAAO;YACL,IAAI,EAAI,KAAK;YACb,KAAK,EAAG,CAAC;YACT,MAAM,EAAE,YAAY,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,SAAS,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,EAAE;SAC9E,CAAA;IACH,CAAC,CAAA;AACH,CAAC;AAED,6CAA6C;AAC7C,MAAM,UAAU,KAAK,CAAC,OAAe;IACnC,OAAO,CAAC,QAAQ,EAAgB,EAAE;QAChC,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;QAChE,OAAO;YACL,IAAI,EAAI,KAAK;YACb,KAAK,EAAG,CAAC;YACT,MAAM,EAAE,WAAW,OAAO,kBAAkB,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;SAClI,CAAA;IACH,CAAC,CAAA;AACH,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,UAAU,QAAQ,CAAC,SAAiB,EAAE,OAA2B,EAAE;IACvE,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;QAC5B,MAAM
,EAAE,CAAC,CAAC,MAAM,CAAC;YACf,IAAI,EAAI,CAAC,CAAC,OAAO,EAAE;YACnB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;SACnB,CAAC;KACH,CAAC,CAAA;IAEF,OAAO,KAAK,EAAE,QAAQ,EAAE,GAAG,EAAyB,EAAE;QACpD,IAAI,CAAC;YACH,MAAM,KAAK,GAAG,KAAK,CAAC;gBAClB,YAAY,EAAE,GAAG,kBAAkB,OAAO,OAAO,CAAC,cAAc,EAAE,EAAE;gBACpE,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC7C,CAAC,CAAA;YAEF,MAAM,MAAM,GAAG;gBACb,cAAc,SAAS,EAAE;gBACzB,EAAE;gBACF,eAAe,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE;gBAC1C,mBAAmB,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE;gBAClD,EAAE;gBACF,sEAAsE;aACvE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;YAEZ,MAAM,aAAa,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;YAChD,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,aAAa,CAAC,IAAI,CAAC,CAAA;YAEhD,8DAA8D;YAC9D,4DAA4D;YAC5D,6DAA6D;YAC7D,gBAAgB,CAAC,QAAQ,EAAE,aAAa,CAAC,KAAK,CAAC,WAAW,CAAC,CAAA;YAE3D,OAAO;gBACL,IAAI,EAAI,MAAM,CAAC,IAAI;gBACnB,KAAK,EAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC3B,MAAM,EAAE,MAAM,CAAC,MAAM;aACtB,CAAA;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO;gBACL,IAAI,EAAI,KAAK;gBACb,KAAK,EAAG,CAAC;gBACT,MAAM,EAAE,iBAAiB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;aAC5E,CAAA;QACH,CAAC;IACH,CAAC,CAAA;AACH,CAAC;AAED,MAAM,kBAAkB,GAAG;IACzB,gGAAgG;IAChG,oEAAoE;IACpE,+FAA+F;CAChG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;AAEX;;;;;;;;;GASG;AACH,MAAM,UAAU,SAAS,CAAI,MAAoB;IAC/C,OAAO,CAAC,QAAQ,EAAgB,EAAE;QAChC,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAA;QAC/C,IAAI,MAAe,CAAA;QACnB,IAAI,CAAC;YACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAA;QAC/B,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO;gBACL,IAAI,EAAI,KAAK;gBACb,KAAK,EAAG,CAAC;gBACT,MAAM,EAAE,aAAa,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;aACxE,CAAA;QACH,CAAC;QACD,MAAM,MAAM,GAAG,MAAM,CAAC,SAAS,CAAC,MAAM,CAAC,CAAA;QACvC,IAAI,MAAM,CAAC,OAAO;YAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;QACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA;QACpC
,MAAM,IAAI,GAAI,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,QAAQ,CAAA;QAC/C,OAAO;YACL,IAAI,EAAI,KAAK;YACb,KAAK,EAAG,CAAC;YACT,MAAM,EAAE,sBAAsB,IAAI,KAAK,KAAK,EAAE,OAAO,IAAI,eAAe,EAAE;SAC3E,CAAA;IACH,CAAC,CAAA;AACH,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,UAAU,aAAa,CAC3B,SAAiB,EACjB,OAA+C,EAAE;IAEjD,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAA;IACxC,OAAO,KAAK,EAAE,QAAQ,EAAyB,EAAE;QAC/C,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,CAAC,SAAS,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAA;YACzC,MAAM,SAAS,GAAuB,EAAE,CAAA;YACxC,IAAI,IAAI,CAAC,KAAK;gBAAE,SAAS,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAA;YAC5C,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,KAAK,CAAC,MAAM,EAAE,SAAS,CAAC,CAAA;YAChD,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,UAAU,CAAA;YAC3C,IAAI,CAAC,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;gBACxB,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,2BAA2B,EAAE,CAAA;YACvE,CAAC;YACD,gBAAgB,CAAC,QAAQ,EAAE,MAAM,CAAC,KAAK,CAAC,WAAW,CAAC,CAAA;YAEpD,MAAM,KAAK,GAAG,gBAAgB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;YAC/C,MAAM,IAAI,GAAI,KAAK,IAAI,SAAS,CAAA;YAChC,OAAO;gBACL,IAAI;gBACJ,KAAK;gBACL,MAAM,EAAE,IAAI;oBACV,CAAC,CAAC,UAAU,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,SAAS,EAAE;oBAC9C,CAAC,CAAC,UAAU,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,SAAS,gBAAgB,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG;aAC1I,CAAA;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO;gBACL,IAAI,EAAI,KAAK;gBACb,KAAK,EAAG,CAAC;gBACT,MAAM,EAAE,iBAAiB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;aAC5E,CAAA;QACH,CAAC;IACH,CAAC,CAAA;AACH,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,SAAS,CAAC,SAAiB;IACzC,OAAO,CAAC,QAAQ,EAAgB,EAAE;QAChC,MAAM,IAAI,GAAG,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAA;QACvC,MAAM,IAAI,GAAG,IAAI,IAAI,SAAS,CAAA;QAC9B,OAAO;YACL,IAAI;YACJ,KAAK,EAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACpB,MAAM,EAAE,IAAI;gBACV,CAAC,CAAC,GAAG,IAAI,cAAc,SAAS,EAAE;gBAClC,CAAC,CAAC,GAAG,IAAI,aAAa,SAAS,EAAE;SACpC,CAAA;IACH,CAAC,CAAA;AACH,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UA
AU,OAAO,CAAC,GAAG,OAAiB;IAC1C,OAAO,KAAK,EAAE,QAAQ,EAAE,GAAG,EAAyB,EAAE;QACpD,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACxB,MAAM,MAAM,GAAG,MAAM,CAAC,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAA;YACrC,IAAI,CAAC,MAAM,CAAC,IAAI;gBAAE,OAAO,MAAM,CAAA;QACjC,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;IACjC,CAAC,CAAA;AACH,CAAC;AAED,mHAAmH;AACnH,SAAS,gBAAgB,CAAC,CAAW,EAAE,CAAW;IAChD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM;QAAE,OAAO,CAAC,CAAA;IACnC,IAAI,GAAG,GAAI,CAAC,CAAA;IACZ,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAA;QAChB,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAA;QAChB,GAAG,IAAK,EAAE,GAAG,EAAE,CAAA;QACf,IAAI,IAAI,EAAE,GAAG,EAAE,CAAA;QACf,IAAI,IAAI,EAAE,GAAG,EAAE,CAAA;IACjB,CAAC;IACD,IAAI,IAAI,KAAK,CAAC,IAAI,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IACtC,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAA;AAClD,CAAC;AAED,SAAS,eAAe,CAAC,IAAY;IACnC,OAAO,IAAI;SACR,OAAO,CAAC,sBAAsB,EAAE,EAAE,CAAC;SACnC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,IAAI,EAAE,CAAA;AACX,CAAC;AAED,6DAA6D;AAE7D;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,KAAgB;IAC7C,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAA;IAC/B,MAAM,KAAK,GAAiB,EAAE,CAAA;IAC9B,IAAI,MAAM,GAAI,CAAC,CAAA;IACf,IAAI,MAAM,GAAI,CAAC,CAAA;IACf,IAAI,OAAO,GAAG,CAAC,CAAA;IAEf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACjD,MAAM,CAAC,GAAM,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAA;QACjC,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,IAAI,QAAQ,CAAC,EAAE,CAAA;QAElC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;YACX,MAAM,UAAU,GAAe;gBAC7B,IAAI;gBACJ,MAAM,EAAI,SAAS;gBACnB,MAAM,EAAI,OAAO,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS;gBACzD,QAAQ,EAAE,CAAC;gBACX,MAAM,EAAI,CAAC;gBACX,IAAI,EAAM,CAAC;gBACX,KAAK,EAAK,CAAC,CAAC,KAAK;aAClB,CAAA;YACD,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;YACtB,iBAAiB,CAAC,KAAK,CAAC,IAAI,EAAE,UAAU,CAAC,CAA
A;YACzC,OAAO,EAAE,CAAA;YACT,SAAQ;QACV,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC,EAAE,IAAI,CAAC,CAAA;QAC5C,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAClB,iBAAiB,CAAC,KAAK,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;QACrC,IAAI,MAAM,CAAC,MAAM,KAAK,QAAQ;YAAE,MAAM,EAAE,CAAA;aACnC,IAAI,MAAM,CAAC,MAAM,KAAK,QAAQ;YAAE,MAAM,EAAE,CAAA;IAC/C,CAAC;IAED,MAAM,QAAQ,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAA;IAE1C,MAAM,MAAM,GAAgB;QAC1B,KAAK,EAAE,KAAK,CAAC,IAAI;QACjB,KAAK;QACL,MAAM;QACN,MAAM;QACN,OAAO;QACP,QAAQ;QACR,IAAI,EAAI,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,EAAI,CAAC,CAAC;QACnD,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;KACpD,CAAA;IACD,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ;QAAE,MAAM,CAAC,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAA;IAC9D,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,iBAAiB,CAAC,SAAiB,EAAE,MAAkB;IAC9D,MAAM,KAAK,GAA2C;QACpD,IAAI,EAAM,sBAAsB;QAChC,KAAK,EAAK,SAAS;QACnB,IAAI,EAAM,MAAM,CAAC,IAAI;QACrB,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,IAAI,EAAM,MAAM,CAAC,MAAM,KAAK,QAAQ;QACpC,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,IAAI,EAAM,MAAM,CAAC,IAAI;QACrB,QAAQ,EAAE,MAAM,CAAC,QAAQ;KAC1B,CAAA;IACD,IAAI,MAAM,CAAC,MAAM,EAAE,KAAK,KAAK,SAAS;QAAE,KAAK,CAAC,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,KAAK,CAAA;IACzE,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAA;IAClF,IAAI,MAAM;QAAE,KAAK,CAAC,MAAM,GAAG,MAAM,CAAA;IACjC,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;AACzB,CAAC;AAED,KAAK,UAAU,OAAO,CAAC,KAAgB,EAAE,CAAW,EAAE,IAAY;IAChE,MAAM,OAAO,GAAG,CAAC,CAAC,KAAK,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAA;IAC3C,MAAM,EAAE,GAAQ,OAAO,EAAE,CAAA;IACzB,MAAM,OAAO,GAAG,CAAC,CAAC,OAAO,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAA;IAE/C,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAA;IAC/B,IAAI,QAAuB,CAAA;IAC3B,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,cAAc,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,CAAA;IACpE,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO;YACL,IAAI;YACJ,MAAM,EAAI,QAAQ;YAClB,MAAM,EAAI,EAAE,IAAI,EA
AE,KAAK,EAAE,MAAM,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;YACnF,QAAQ,EAAE,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK;YACnC,MAAM,EAAI,CAAC;YACX,IAAI,EAAM,CAAC;YACX,KAAK,EAAK,CAAC,CAAC,KAAK;SAClB,CAAA;IACH,CAAC;IAED,IAAI,MAAoB,CAAA;IACxB,IAAI,CAAC;QACH,MAAM,GAAG,MAAM,CAAC,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAA;IACvE,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,EAAE,iBAAiB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,EAAE,CAAA;IACvG,CAAC;IAED,MAAM,WAAW,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;IAC/C,MAAM,WAAW,GAAG,QAAQ,CAAC,KAAK,CAAC,WAAW,GAAG,WAAW,CAAA;IAE5D,OAAO;QACL,IAAI;QACJ,MAAM,EAAQ,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ;QAC/C,MAAM;QACN,QAAQ,EAAM,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK;QACvC,MAAM,EAAQ,WAAW;QACzB,IAAI,EAAU,YAAY,CAAC,cAAc,CAAC,EAAE,CAAC,EAAE,QAAQ,CAAC,KAAK,CAAC,YAAY,EAAE,QAAQ,CAAC,KAAK,CAAC,gBAAgB,CAAC;cAC9F,YAAY,CAAC,cAAc,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,WAAW,CAAC,EAAG,mDAAmD;QACpH,KAAK,EAAS,CAAC,CAAC,KAAK;QACrB,YAAY,EAAE,QAAQ,CAAC,IAAI;KAC5B,CAAA;AACH,CAAC;AAED,SAAS,cAAc,CAAC,EAAS;IAC/B,kEAAkE;IAClE,gEAAgE;IAChE,6DAA6D;IAC7D,kEAAkE;IAClE,kEAAkE;IAClE,8CAA8C;IAC9C,OAAO,EAAE,CAAC,KAAK,EAAE,IAAI,iBAAiB,CAAA;AACxC,CAAC;AAED,KAAK,UAAU,cAAc,CAAI,EAAoB,EAAE,EAAsB;IAC3E,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC;QAAE,OAAO,EAAE,EAAE,CAAA;IAC/B,OAAO,IAAI,OAAO,CAAI,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACxC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;QAC9E,EAAE,EAAE,CAAC,IAAI,CACP,CAAC,CAAC,EAAE,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA,CAAC,CAAC,EACxC,CAAC,CAAC,EAAE,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA,CAAC,CAAC,CACxC,CAAA;IACH,CAAC,CAAC,CAAA;AACJ,CAAC;AAED,6DAA6D;AAE7D,qEAAqE;AACrE,yEAAyE;AACzE,qEAAqE;AACrE,uCAAuC;AACvC,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AAGjE,6DAA6D;AAE7D;;;;;;;GAOG;AACH,MAAM,UAAU,aAAa
,CAAC,MAAmB,EAAE,OAAqC,OAAO;IAC7F,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,MAAM,OAAO,GAAG,GAAG,MAAM,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK,CAAC,MAAM,WAAW,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAA;IAC1H,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAEnB,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAG,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAA;QAC7E,MAAM,IAAI,GAAI,CAAC,CAAC,MAAM,KAAK,SAAS;YAClC,CAAC,CAAC,SAAS,CAAC,CAAC,MAAM,IAAI,SAAS,EAAE;YAClC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,MAAM,EAAE,CAAA;QAC3E,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,IAAI,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC,CAAA;QACnD,IAAI,CAAC,CAAC,MAAM,KAAK,QAAQ,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC;YAC9C,wEAAwE;YACxE,KAAK,MAAM,IAAI,IAAI,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/C,KAAK,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC,CAAA;YAC7B,CAAC;QACH,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACd,KAAK,CAAC,IAAI,CAAC,KAAK,MAAM,CAAC,MAAM,YAAY,MAAM,CAAC,MAAM,UAAU,MAAM,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,MAAM,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;IAC1H,KAAK,CAAC,IAAI,CAAC,YAAY,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,2BAA2B,MAAM,CAAC,MAAM,EAAE,CAAC,CAAA;IAEzF,KAAK,MAAM,IAAI,IAAI,KAAK;QAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;IACxC,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,OAAO,CAAC,CAAS;IACxB,OAAO,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;AACrB,CAAC;AAED,SAAS,QAAQ,CAAC,EAAU;IAC1B,IAAI,EAAE,GAAG,IAAI;QAAE,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,IAAI,CAAA;IAC3C,OAAO,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAA;AACrC,CAAC;AAED,SAAS,UAAU,CAAC,KAAa;IAC/B,IAAI,KAAK,KAAK,CAAC;QAAE,OAAO,QAAQ,CAAA;IAChC,IAAI,KAAK,GAAG,KAAK;QAAE,OAAO,SAAS,CAAA;IACnC,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAA;AAC/B,CAAC;AAED,6DAA6D;AAC7D,EAAE;AACF,gEAAgE;AAChE,oEAAoE;AACpE,mEAAmE;AACnE,8DAA8D;AAC9D,mEAAmE;AACnE,qCAAqC;AAErC,MAAM,
eAAe,GAAG,MAAM,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAA;AAMjE,SAAS,gBAAgB,CAAC,QAAuB,EAAE,MAAc;IAC/D,MAAM,OAAO,GAAG,QAA6C,CAAA;IAC7D,OAAO,CAAC,eAAe,CAAC,GAAG,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,GAAG,MAAM,CAAA;AACrE,CAAC;AAED,SAAS,iBAAiB,CAAC,QAAuB;IAChD,MAAM,OAAO,GAAG,QAA6C,CAAA;IAC7D,MAAM,MAAM,GAAI,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,CAAA;IAC7C,OAAO,OAAO,CAAC,eAAe,CAAC,CAAA;IAC/B,OAAO,MAAM,CAAA;AACf,CAAC"}
@@ -0,0 +1,43 @@
1
+ import type { SuiteReport } from './index.js';
2
+ /**
3
+ * Single case row in the JSON output. The metric's `score` and `reason`
4
+ * are flattened up, and `pass` is derived from `status`, so consumers
5
+ * don't have to reach through nested objects in CI scripts.
6
+ */
7
+ export interface SuiteJsonCase {
8
+ name: string; // copied from the case's `name`
9
+ status: 'passed' | 'failed' | 'skipped'; // copied from the case's `status`
10
+ pass: boolean; // true only when `status` is 'passed'
11
+ score?: number; // present only when the case's metric has a defined `score`
12
+ reason?: string; // a skipped case's own `reason` when set, else the metric's `reason`; omitted when neither is set
13
+ duration: number; // copied from the case's `duration`; presumably milliseconds — TODO confirm
14
+ tokens: number; // copied from the case's `tokens`
15
+ cost: number; // copied from the case's `cost`; unit is not visible here — TODO confirm
16
+ }
17
+ /**
18
+ * Machine-readable suite output emitted by `pnpm rudder ai:eval --json`.
19
+ * Stable shape — adding fields here is a minor (additive) bump for
20
+ * `@rudderjs/ai`. Removing or renaming fields is a major bump.
21
+ */
22
+ export interface SuiteJson {
23
+ suite: string; // copied from the report's `suite`
24
+ passed: number; // copied from the report's `passed`
25
+ failed: number; // copied from the report's `failed`
26
+ skipped: number; // copied from the report's `skipped`
27
+ duration: number; // copied from the report's `duration`; presumably milliseconds — TODO confirm
28
+ cost: number; // copied from the report's `cost`; unit is not visible here — TODO confirm
29
+ tokens: number; // copied from the report's `tokens`
30
+ cases: SuiteJsonCase[]; // one flattened row per entry in the report's `cases`
31
+ }
32
+ /**
33
+ * JSON reporter — flattens a `SuiteReport` into a `SuiteJson` for CI consumption.
34
+ *
35
+ * Mirrors the `command_run` MCP tool envelope shape so the boost
36
+ * agent surface and the eval CLI feel like one family.
37
+ *
38
+ * @example
39
+ * const report = await runSuite(suite)
40
+ * process.stdout.write(JSON.stringify(reportJson(report)))
41
+ */
42
+ export declare function reportJson(report: SuiteReport): SuiteJson; // suite-level fields are copied through; each case becomes one SuiteJsonCase
43
+ //# sourceMappingURL=json-reporter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"json-reporter.d.ts","sourceRoot":"","sources":["../../src/eval/json-reporter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAA;AAE7C;;;;GAIG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAO,MAAM,CAAA;IACjB,MAAM,EAAK,QAAQ,GAAG,QAAQ,GAAG,SAAS,CAAA;IAC1C,IAAI,EAAO,OAAO,CAAA;IAClB,KAAK,CAAC,EAAK,MAAM,CAAA;IACjB,MAAM,CAAC,EAAI,MAAM,CAAA;IACjB,QAAQ,EAAG,MAAM,CAAA;IACjB,MAAM,EAAK,MAAM,CAAA;IACjB,IAAI,EAAO,MAAM,CAAA;CAClB;AAED;;;;GAIG;AACH,MAAM,WAAW,SAAS;IACxB,KAAK,EAAK,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,OAAO,EAAG,MAAM,CAAA;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,IAAI,EAAM,MAAM,CAAA;IAChB,MAAM,EAAI,MAAM,CAAA;IAChB,KAAK,EAAK,aAAa,EAAE,CAAA;CAC1B;AAED;;;;;;;;;GASG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,WAAW,GAAG,SAAS,CAWzD"}
@@ -0,0 +1,40 @@
1
/**
 * Build the machine-readable form of a `SuiteReport` for CI consumption.
 *
 * Suite-level fields are copied across unchanged, and every entry of
 * `report.cases` is flattened into a single-level row so scripts can read
 * results without walking nested objects. The shape is intended to mirror
 * the `command_run` MCP tool envelope so the boost agent surface and the
 * eval CLI feel like one family.
 *
 * @param {object} report - Suite report to convert.
 * @returns {object} Plain object ready for `JSON.stringify`.
 *
 * @example
 * const report = await runSuite(suite)
 * process.stdout.write(JSON.stringify(reportJson(report)))
 */
export function reportJson(report) {
    const { suite, passed, failed, skipped, duration, cost, tokens } = report;
    const cases = report.cases.map((entry) => toJsonCase(entry));
    return { suite, passed, failed, skipped, duration, cost, tokens, cases };
}

/**
 * Flatten one case report into a JSON row.
 *
 * `pass` is derived from `status`. `score` is included only when the
 * metric defines one. `reason` prefers a skipped case's own reason and
 * otherwise falls back to the metric's reason; it is left out entirely
 * when neither is set.
 *
 * @param {object} c - Case report.
 * @returns {object} Flattened row.
 */
function toJsonCase(c) {
    const { name, status, duration, tokens, cost, metric } = c;
    const row = {
        name,
        status,
        pass: status === 'passed',
        duration,
        tokens,
        cost,
    };
    if (metric?.score !== undefined) {
        row.score = metric.score;
    }
    const skipReason = status === 'skipped' ? c.reason : undefined;
    const why = skipReason ? skipReason : metric?.reason;
    if (why) {
        row.reason = why;
    }
    return row;
}
40
+ //# sourceMappingURL=json-reporter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"json-reporter.js","sourceRoot":"","sources":["../../src/eval/json-reporter.ts"],"names":[],"mappings":"AAkCA;;;;;;;;;GASG;AACH,MAAM,UAAU,UAAU,CAAC,MAAmB;IAC5C,OAAO;QACL,KAAK,EAAK,MAAM,CAAC,KAAK;QACtB,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,OAAO,EAAG,MAAM,CAAC,OAAO;QACxB,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,IAAI,EAAM,MAAM,CAAC,IAAI;QACrB,MAAM,EAAI,MAAM,CAAC,MAAM;QACvB,KAAK,EAAK,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,CAAC;KACvC,CAAA;AACH,CAAC;AAED,SAAS,UAAU,CAAC,CAA+B;IACjD,MAAM,GAAG,GAAkB;QACzB,IAAI,EAAM,CAAC,CAAC,IAAI;QAChB,MAAM,EAAI,CAAC,CAAC,MAAM;QAClB,IAAI,EAAM,CAAC,CAAC,MAAM,KAAK,QAAQ;QAC/B,QAAQ,EAAE,CAAC,CAAC,QAAQ;QACpB,MAAM,EAAI,CAAC,CAAC,MAAM;QAClB,IAAI,EAAM,CAAC,CAAC,IAAI;KACjB,CAAA;IACD,IAAI,CAAC,CAAC,MAAM,EAAE,KAAK,KAAK,SAAS;QAAE,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAA;IAC7D,IAAI,CAAC,CAAC,MAAM,KAAK,SAAS,IAAI,CAAC,CAAC,MAAM;QAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAA;SACxD,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM;QAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,MAAM,CAAA;IACvD,OAAO,GAAG,CAAA;AACZ,CAAC"}