vitest-evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/README.md +57 -10
  2. package/dist/harness.d.mts +56 -40
  3. package/dist/harness.d.ts +56 -40
  4. package/dist/harness.js +34 -104
  5. package/dist/harness.js.map +1 -1
  6. package/dist/harness.mjs +37 -104
  7. package/dist/harness.mjs.map +1 -1
  8. package/dist/index.d.mts +6 -6
  9. package/dist/index.d.ts +6 -6
  10. package/dist/index.js +56 -117
  11. package/dist/index.js.map +1 -1
  12. package/dist/index.mjs +59 -117
  13. package/dist/index.mjs.map +1 -1
  14. package/dist/internal/scoring.d.mts +2 -2
  15. package/dist/internal/scoring.d.ts +2 -2
  16. package/dist/internal/scoring.js.map +1 -1
  17. package/dist/internal/toolCallScorer.js.map +1 -1
  18. package/dist/internal/toolCallScorer.mjs +4 -1
  19. package/dist/internal/toolCallScorer.mjs.map +1 -1
  20. package/dist/judges/factualityJudge.js.map +1 -1
  21. package/dist/judges/factualityJudge.mjs +4 -1
  22. package/dist/judges/factualityJudge.mjs.map +1 -1
  23. package/dist/judges/index.js +47 -110
  24. package/dist/judges/index.js.map +1 -1
  25. package/dist/judges/index.mjs +51 -111
  26. package/dist/judges/index.mjs.map +1 -1
  27. package/dist/judges/judgeHarness.js +47 -110
  28. package/dist/judges/judgeHarness.js.map +1 -1
  29. package/dist/judges/judgeHarness.mjs +51 -111
  30. package/dist/judges/judgeHarness.mjs.map +1 -1
  31. package/dist/judges/toolCallJudge.js.map +1 -1
  32. package/dist/judges/toolCallJudge.mjs +4 -1
  33. package/dist/judges/toolCallJudge.mjs.map +1 -1
  34. package/dist/judges/types.d.mts +2 -2
  35. package/dist/judges/types.d.ts +2 -2
  36. package/dist/judges/types.js.map +1 -1
  37. package/dist/legacy/scorers/index.js.map +1 -1
  38. package/dist/legacy/scorers/index.mjs +4 -1
  39. package/dist/legacy/scorers/index.mjs.map +1 -1
  40. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  41. package/dist/legacy/scorers/toolCallScorer.mjs +4 -1
  42. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  43. package/dist/legacy/shared.d.mts +1 -8
  44. package/dist/legacy/shared.d.ts +1 -8
  45. package/dist/legacy/shared.js.map +1 -1
  46. package/dist/legacy.js +15 -1
  47. package/dist/legacy.js.map +1 -1
  48. package/dist/legacy.mjs +19 -2
  49. package/dist/legacy.mjs.map +1 -1
  50. package/dist/reporter.d.mts +0 -3
  51. package/dist/reporter.d.ts +0 -3
  52. package/dist/reporter.js +10 -40
  53. package/dist/reporter.js.map +1 -1
  54. package/dist/reporter.mjs +14 -41
  55. package/dist/reporter.mjs.map +1 -1
  56. package/package.json +3 -3
package/dist/index.mjs CHANGED
@@ -4,14 +4,17 @@ import "vitest";
4
4
 
5
5
  // src/harness.ts
6
6
  import {
7
+ messagesToTranscriptEvents,
8
+ NormalizedSessionSchema,
7
9
  spans,
8
- toolCalls
10
+ TranscriptEventSchema
9
11
  } from "@vitest-evals/core";
10
12
  import {
11
13
  assistantMessages as assistantMessages2,
12
14
  failedSpans as failedSpans2,
13
15
  latestAssistantMessageContent as latestAssistantMessageContent2,
14
16
  messagesByRole as messagesByRole2,
17
+ messagesToTranscriptEvents as messagesToTranscriptEvents2,
15
18
  spans as spans2,
16
19
  spansByKind as spansByKind2,
17
20
  systemMessages as systemMessages2,
@@ -140,14 +143,24 @@ function normalizeHarnessRun(input, result, context) {
140
143
  }
141
144
  return result;
142
145
  }
146
+ if ("toolCalls" in result) {
147
+ throw new TypeError(
148
+ 'createHarness results do not accept top-level toolCalls. Return ordered session events with type: "tool_call" and type: "tool_result" entries instead.'
149
+ );
150
+ }
143
151
  const output = result.output;
144
- const toolCalls3 = normalizeSimpleToolCalls(result.toolCalls);
145
152
  const usage = result.usage ?? {};
146
- const messages = result.messages ?? createDefaultSessionMessages({
147
- input,
148
- output,
149
- toolCalls: toolCalls3
150
- });
153
+ const events = normalizeTranscriptInput(result);
154
+ if (!events) {
155
+ throw new TypeError(
156
+ "createHarness results must include ordered events or messages. Return a full HarnessRun or a lightweight result with events/messages."
157
+ );
158
+ }
159
+ if (events.length === 0) {
160
+ throw new TypeError(
161
+ "createHarness results must include at least one transcript event. Return ordered events or message transport inputs that normalize into events."
162
+ );
163
+ }
151
164
  const metadata = result.metadata ? normalizeMetadata(result.metadata) : void 0;
152
165
  const artifacts = normalizeMergedArtifacts(
153
166
  context?.artifacts,
@@ -156,7 +169,7 @@ function normalizeHarnessRun(input, result, context) {
156
169
  const traces = normalizeSimpleTraces(result.traces);
157
170
  return {
158
171
  session: {
159
- messages,
172
+ events,
160
173
  ...usage.provider ? { provider: usage.provider } : {},
161
174
  ...usage.model ? { model: usage.model } : {},
162
175
  ...metadata ? { metadata } : {}
@@ -169,12 +182,24 @@ function normalizeHarnessRun(input, result, context) {
169
182
  errors: normalizeSimpleErrors(result.errors)
170
183
  };
171
184
  }
185
+ function normalizeTranscriptInput(result) {
186
+ if ("events" in result && Array.isArray(result.events)) {
187
+ return result.events.map((event) => TranscriptEventSchema.parse(event));
188
+ }
189
+ if ("messages" in result && Array.isArray(result.messages)) {
190
+ return messagesToTranscriptEvents(result.messages).map(
191
+ (event) => TranscriptEventSchema.parse(event)
192
+ );
193
+ }
194
+ return void 0;
195
+ }
172
196
  function createFailedHarnessRun(input, error, options = {}) {
173
197
  const artifacts = options.artifacts;
174
198
  return {
175
199
  session: {
176
- messages: [
200
+ events: [
177
201
  {
202
+ type: "message",
178
203
  role: "user",
179
204
  content: normalizeContent(input)
180
205
  }
@@ -185,67 +210,6 @@ function createFailedHarnessRun(input, error, options = {}) {
185
210
  errors: [serializeError(error)]
186
211
  };
187
212
  }
188
- function createDefaultSessionMessages({
189
- input,
190
- output,
191
- toolCalls: normalizedToolCalls
192
- }) {
193
- const messages = [
194
- {
195
- role: "user",
196
- content: normalizeContent(input)
197
- }
198
- ];
199
- if (output !== void 0 || normalizedToolCalls.length > 0) {
200
- messages.push({
201
- role: "assistant",
202
- ...output !== void 0 ? { content: normalizeContent(output) } : {},
203
- ...normalizedToolCalls.length > 0 ? { toolCalls: normalizedToolCalls } : {}
204
- });
205
- }
206
- return messages;
207
- }
208
- function normalizeSimpleToolCalls(calls) {
209
- return (calls ?? []).map((call) => {
210
- const {
211
- arguments: rawArguments,
212
- result: rawResult,
213
- error: rawError,
214
- metadata: rawMetadata,
215
- ...toolCall
216
- } = call;
217
- const args = normalizeToolCallArguments(rawArguments);
218
- const result = toJsonValue(rawResult);
219
- const error = normalizeToolCallError(rawError);
220
- const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : void 0;
221
- return {
222
- ...toolCall,
223
- ...args ? { arguments: args } : {},
224
- ...result !== void 0 ? { result } : {},
225
- ...error ? { error } : {},
226
- ...metadata ? { metadata } : {}
227
- };
228
- });
229
- }
230
- function normalizeToolCallArguments(value) {
231
- if (value === void 0) {
232
- return void 0;
233
- }
234
- const normalized = toJsonValue(value);
235
- return normalized && typeof normalized === "object" && !Array.isArray(normalized) ? normalized : void 0;
236
- }
237
- function normalizeToolCallError(value) {
238
- if (value === void 0) {
239
- return void 0;
240
- }
241
- const serialized = serializeError(value);
242
- const { message, type, ...details } = serialized;
243
- return {
244
- ...details,
245
- message: typeof message === "string" ? message : String(message),
246
- ...typeof type === "string" ? { type } : {}
247
- };
248
- }
249
213
  function normalizeMergedArtifacts(contextArtifacts, resultArtifacts) {
250
214
  const artifacts = {
251
215
  ...contextArtifacts ?? {},
@@ -371,32 +335,6 @@ function createGenAiUsageAttributes(usage, options = {}) {
371
335
  "gen_ai.usage.reasoning.output_tokens": usage?.reasoningTokens
372
336
  };
373
337
  }
374
- function createToolCallSpans(calls, options = {}) {
375
- return calls.map((call, index) => {
376
- const spanError = call.error ? normalizeSpanError(call.error) : void 0;
377
- const spanId = options.spanIdPrefix ? `${options.spanIdPrefix}:${index + 1}` : call.id;
378
- return {
379
- ...spanId ? { id: spanId } : {},
380
- ...options.traceId ? { traceId: options.traceId } : {},
381
- ...options.parentId ? { parentId: options.parentId } : {},
382
- name: call.name,
383
- kind: "tool",
384
- ...call.startedAt ? { startedAt: call.startedAt } : {},
385
- ...call.finishedAt ? { finishedAt: call.finishedAt } : {},
386
- ...call.durationMs !== void 0 ? { durationMs: call.durationMs } : {},
387
- status: spanError ? "error" : "ok",
388
- ...spanError ? { error: spanError } : {},
389
- attributes: normalizeSpanAttributes({
390
- "gen_ai.operation.name": "execute_tool",
391
- "gen_ai.tool.name": call.name,
392
- "gen_ai.tool.type": "function",
393
- ...call.id ? { "gen_ai.tool.call.id": call.id } : {},
394
- ...call.arguments !== void 0 ? { "gen_ai.tool.call.arguments": call.arguments } : {},
395
- ...call.result !== void 0 ? { "gen_ai.tool.call.result": call.result } : {}
396
- })
397
- };
398
- });
399
- }
400
338
  function ensureRunTrace(run, options) {
401
339
  if (spans(run).length > 0) {
402
340
  return void 0;
@@ -421,11 +359,6 @@ function ensureRunTrace(run, options) {
421
359
  ...createGenAiUsageAttributes(run.usage)
422
360
  })
423
361
  };
424
- const toolSpans = createToolCallSpans(toolCalls(run.session), {
425
- traceId,
426
- parentId: rootSpanId,
427
- spanIdPrefix: `${traceId}:tool`
428
- });
429
362
  const trace = {
430
363
  id: traceId,
431
364
  name: options.name,
@@ -433,7 +366,7 @@ function ensureRunTrace(run, options) {
433
366
  finishedAt: options.finishedAt.toISOString(),
434
367
  durationMs,
435
368
  ...options.source ? { metadata: { source: options.source } } : {},
436
- spans: [runSpan, ...toolSpans]
369
+ spans: [runSpan]
437
370
  };
438
371
  run.traces = [trace];
439
372
  return trace;
@@ -463,7 +396,7 @@ function isHarnessRun(value) {
463
396
  return isNormalizedSession(candidate.session) && Boolean(candidate.usage) && typeof candidate.usage === "object" && !Array.isArray(candidate.usage) && Array.isArray(candidate.errors);
464
397
  }
465
398
  function isNormalizedSession(value) {
466
- return Boolean(value) && typeof value === "object" && value !== null && "messages" in value && Array.isArray(value.messages);
399
+ return NormalizedSessionSchema.safeParse(value).success;
467
400
  }
468
401
  function serializeError(error) {
469
402
  if (error instanceof Error) {
@@ -483,7 +416,10 @@ function createJudgeHarness(options) {
483
416
  return createHarness({
484
417
  name: options.name ?? "judge-harness",
485
418
  run: async ({ input, signal }) => {
486
- return normalizeJudgeHarnessResult(await options.run(input, { signal }));
419
+ return normalizeJudgeHarnessResult(
420
+ input,
421
+ await options.run(input, { signal })
422
+ );
487
423
  }
488
424
  });
489
425
  }
@@ -506,17 +442,14 @@ function createRunJudge(judgeHarness, signal) {
506
442
  signal: options?.signal ?? signal
507
443
  });
508
444
  }
509
- function normalizeJudgeHarnessResult(result) {
445
+ function normalizeJudgeHarnessResult(input, result) {
510
446
  if (isHarnessRun(result)) {
511
447
  return result;
512
448
  }
513
- if (hasOutputField(result)) {
514
- return {
515
- output: normalizeJudgeHarnessOutput(result.output)
516
- };
517
- }
449
+ const output = hasOutputField(result) ? normalizeJudgeHarnessOutput(result.output) : normalizeJudgeHarnessOutput(result);
518
450
  return {
519
- output: normalizeJudgeHarnessOutput(result)
451
+ output,
452
+ messages: createJudgeHarnessMessages(input, output)
520
453
  };
521
454
  }
522
455
  function hasOutputField(value) {
@@ -528,6 +461,13 @@ function normalizeJudgeHarnessOutput(value) {
528
461
  }
529
462
  return normalizeContent(value);
530
463
  }
464
+ function createJudgeHarnessMessages(input, output) {
465
+ return [
466
+ ...input.system ? [{ role: "system", content: input.system }] : [],
467
+ { role: "user", content: input.prompt },
468
+ ...output !== void 0 ? [{ role: "assistant", content: output }] : []
469
+ ];
470
+ }
531
471
  function resolveJudgeHarnessAssistantOutput(run) {
532
472
  return latestAssistantMessageContent2(run.session) ?? "";
533
473
  }
@@ -1655,23 +1595,25 @@ function resolveJudgeAssertionOutput(received, run, explicitOutput) {
1655
1595
  return normalizeJudgeJsonValue(received);
1656
1596
  }
1657
1597
  function createSyntheticJudgeSession(received, options) {
1658
- const messages = [];
1598
+ const events = [];
1659
1599
  const userContent = normalizeJudgeJsonValue(options.input);
1660
1600
  if (userContent !== void 0) {
1661
- messages.push({
1601
+ events.push({
1602
+ type: "message",
1662
1603
  role: "user",
1663
1604
  content: userContent
1664
1605
  });
1665
1606
  }
1666
1607
  const assistantContent = normalizeJudgeJsonValue(received);
1667
1608
  if (assistantContent !== void 0) {
1668
- messages.push({
1609
+ events.push({
1610
+ type: "message",
1669
1611
  role: "assistant",
1670
1612
  content: assistantContent
1671
1613
  });
1672
1614
  }
1673
1615
  return {
1674
- messages
1616
+ events
1675
1617
  };
1676
1618
  }
1677
1619
  function inferJudgeOutputValue(received, session) {
@@ -1679,7 +1621,7 @@ function inferJudgeOutputValue(received, session) {
1679
1621
  return received.output;
1680
1622
  }
1681
1623
  if (isNormalizedSession(received)) {
1682
- return resolveAssistantOutput(session) ?? normalizeJudgeJsonValue(received.messages);
1624
+ return resolveAssistantOutput(session) ?? normalizeJudgeJsonValue(received.events);
1683
1625
  }
1684
1626
  return normalizeJudgeJsonValue(received);
1685
1627
  }
@@ -1752,7 +1694,6 @@ export {
1752
1694
  createHarness,
1753
1695
  createJudge,
1754
1696
  createJudgeHarness,
1755
- createToolCallSpans,
1756
1697
  describeEval,
1757
1698
  ensureRunTrace,
1758
1699
  failedSpans2 as failedSpans,
@@ -1760,6 +1701,7 @@ export {
1760
1701
  getHarnessRunFromError,
1761
1702
  latestAssistantMessageContent2 as latestAssistantMessageContent,
1762
1703
  messagesByRole2 as messagesByRole,
1704
+ messagesToTranscriptEvents2 as messagesToTranscriptEvents,
1763
1705
  normalizeHarnessRun,
1764
1706
  normalizeSpanAttributes,
1765
1707
  normalizeSpanError,