vitest-evals 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -10
- package/dist/harness.d.mts +56 -40
- package/dist/harness.d.ts +56 -40
- package/dist/harness.js +34 -104
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +37 -104
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +6 -6
- package/dist/index.d.ts +6 -6
- package/dist/index.js +56 -117
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +59 -117
- package/dist/index.mjs.map +1 -1
- package/dist/internal/scoring.d.mts +2 -2
- package/dist/internal/scoring.d.ts +2 -2
- package/dist/internal/scoring.js.map +1 -1
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs +4 -1
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.js.map +1 -1
- package/dist/judges/factualityJudge.mjs +4 -1
- package/dist/judges/factualityJudge.mjs.map +1 -1
- package/dist/judges/index.js +47 -110
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +51 -111
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.js +47 -110
- package/dist/judges/judgeHarness.js.map +1 -1
- package/dist/judges/judgeHarness.mjs +51 -111
- package/dist/judges/judgeHarness.mjs.map +1 -1
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +4 -1
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +2 -2
- package/dist/judges/types.d.ts +2 -2
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs +4 -1
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs +4 -1
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy/shared.d.mts +1 -8
- package/dist/legacy/shared.d.ts +1 -8
- package/dist/legacy/shared.js.map +1 -1
- package/dist/legacy.js +15 -1
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs +19 -2
- package/dist/legacy.mjs.map +1 -1
- package/dist/reporter.d.mts +0 -3
- package/dist/reporter.d.ts +0 -3
- package/dist/reporter.js +10 -40
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs +14 -41
- package/dist/reporter.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.mjs
CHANGED
|
@@ -4,14 +4,17 @@ import "vitest";
|
|
|
4
4
|
|
|
5
5
|
// src/harness.ts
|
|
6
6
|
import {
|
|
7
|
+
messagesToTranscriptEvents,
|
|
8
|
+
NormalizedSessionSchema,
|
|
7
9
|
spans,
|
|
8
|
-
|
|
10
|
+
TranscriptEventSchema
|
|
9
11
|
} from "@vitest-evals/core";
|
|
10
12
|
import {
|
|
11
13
|
assistantMessages as assistantMessages2,
|
|
12
14
|
failedSpans as failedSpans2,
|
|
13
15
|
latestAssistantMessageContent as latestAssistantMessageContent2,
|
|
14
16
|
messagesByRole as messagesByRole2,
|
|
17
|
+
messagesToTranscriptEvents as messagesToTranscriptEvents2,
|
|
15
18
|
spans as spans2,
|
|
16
19
|
spansByKind as spansByKind2,
|
|
17
20
|
systemMessages as systemMessages2,
|
|
@@ -140,14 +143,24 @@ function normalizeHarnessRun(input, result, context) {
|
|
|
140
143
|
}
|
|
141
144
|
return result;
|
|
142
145
|
}
|
|
146
|
+
if ("toolCalls" in result) {
|
|
147
|
+
throw new TypeError(
|
|
148
|
+
'createHarness results do not accept top-level toolCalls. Return ordered session events with type: "tool_call" and type: "tool_result" entries instead.'
|
|
149
|
+
);
|
|
150
|
+
}
|
|
143
151
|
const output = result.output;
|
|
144
|
-
const toolCalls3 = normalizeSimpleToolCalls(result.toolCalls);
|
|
145
152
|
const usage = result.usage ?? {};
|
|
146
|
-
const
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
153
|
+
const events = normalizeTranscriptInput(result);
|
|
154
|
+
if (!events) {
|
|
155
|
+
throw new TypeError(
|
|
156
|
+
"createHarness results must include ordered events or messages. Return a full HarnessRun or a lightweight result with events/messages."
|
|
157
|
+
);
|
|
158
|
+
}
|
|
159
|
+
if (events.length === 0) {
|
|
160
|
+
throw new TypeError(
|
|
161
|
+
"createHarness results must include at least one transcript event. Return ordered events or message transport inputs that normalize into events."
|
|
162
|
+
);
|
|
163
|
+
}
|
|
151
164
|
const metadata = result.metadata ? normalizeMetadata(result.metadata) : void 0;
|
|
152
165
|
const artifacts = normalizeMergedArtifacts(
|
|
153
166
|
context?.artifacts,
|
|
@@ -156,7 +169,7 @@ function normalizeHarnessRun(input, result, context) {
|
|
|
156
169
|
const traces = normalizeSimpleTraces(result.traces);
|
|
157
170
|
return {
|
|
158
171
|
session: {
|
|
159
|
-
|
|
172
|
+
events,
|
|
160
173
|
...usage.provider ? { provider: usage.provider } : {},
|
|
161
174
|
...usage.model ? { model: usage.model } : {},
|
|
162
175
|
...metadata ? { metadata } : {}
|
|
@@ -169,12 +182,24 @@ function normalizeHarnessRun(input, result, context) {
|
|
|
169
182
|
errors: normalizeSimpleErrors(result.errors)
|
|
170
183
|
};
|
|
171
184
|
}
|
|
185
|
+
function normalizeTranscriptInput(result) {
|
|
186
|
+
if ("events" in result && Array.isArray(result.events)) {
|
|
187
|
+
return result.events.map((event) => TranscriptEventSchema.parse(event));
|
|
188
|
+
}
|
|
189
|
+
if ("messages" in result && Array.isArray(result.messages)) {
|
|
190
|
+
return messagesToTranscriptEvents(result.messages).map(
|
|
191
|
+
(event) => TranscriptEventSchema.parse(event)
|
|
192
|
+
);
|
|
193
|
+
}
|
|
194
|
+
return void 0;
|
|
195
|
+
}
|
|
172
196
|
function createFailedHarnessRun(input, error, options = {}) {
|
|
173
197
|
const artifacts = options.artifacts;
|
|
174
198
|
return {
|
|
175
199
|
session: {
|
|
176
|
-
|
|
200
|
+
events: [
|
|
177
201
|
{
|
|
202
|
+
type: "message",
|
|
178
203
|
role: "user",
|
|
179
204
|
content: normalizeContent(input)
|
|
180
205
|
}
|
|
@@ -185,67 +210,6 @@ function createFailedHarnessRun(input, error, options = {}) {
|
|
|
185
210
|
errors: [serializeError(error)]
|
|
186
211
|
};
|
|
187
212
|
}
|
|
188
|
-
function createDefaultSessionMessages({
|
|
189
|
-
input,
|
|
190
|
-
output,
|
|
191
|
-
toolCalls: normalizedToolCalls
|
|
192
|
-
}) {
|
|
193
|
-
const messages = [
|
|
194
|
-
{
|
|
195
|
-
role: "user",
|
|
196
|
-
content: normalizeContent(input)
|
|
197
|
-
}
|
|
198
|
-
];
|
|
199
|
-
if (output !== void 0 || normalizedToolCalls.length > 0) {
|
|
200
|
-
messages.push({
|
|
201
|
-
role: "assistant",
|
|
202
|
-
...output !== void 0 ? { content: normalizeContent(output) } : {},
|
|
203
|
-
...normalizedToolCalls.length > 0 ? { toolCalls: normalizedToolCalls } : {}
|
|
204
|
-
});
|
|
205
|
-
}
|
|
206
|
-
return messages;
|
|
207
|
-
}
|
|
208
|
-
function normalizeSimpleToolCalls(calls) {
|
|
209
|
-
return (calls ?? []).map((call) => {
|
|
210
|
-
const {
|
|
211
|
-
arguments: rawArguments,
|
|
212
|
-
result: rawResult,
|
|
213
|
-
error: rawError,
|
|
214
|
-
metadata: rawMetadata,
|
|
215
|
-
...toolCall
|
|
216
|
-
} = call;
|
|
217
|
-
const args = normalizeToolCallArguments(rawArguments);
|
|
218
|
-
const result = toJsonValue(rawResult);
|
|
219
|
-
const error = normalizeToolCallError(rawError);
|
|
220
|
-
const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : void 0;
|
|
221
|
-
return {
|
|
222
|
-
...toolCall,
|
|
223
|
-
...args ? { arguments: args } : {},
|
|
224
|
-
...result !== void 0 ? { result } : {},
|
|
225
|
-
...error ? { error } : {},
|
|
226
|
-
...metadata ? { metadata } : {}
|
|
227
|
-
};
|
|
228
|
-
});
|
|
229
|
-
}
|
|
230
|
-
function normalizeToolCallArguments(value) {
|
|
231
|
-
if (value === void 0) {
|
|
232
|
-
return void 0;
|
|
233
|
-
}
|
|
234
|
-
const normalized = toJsonValue(value);
|
|
235
|
-
return normalized && typeof normalized === "object" && !Array.isArray(normalized) ? normalized : void 0;
|
|
236
|
-
}
|
|
237
|
-
function normalizeToolCallError(value) {
|
|
238
|
-
if (value === void 0) {
|
|
239
|
-
return void 0;
|
|
240
|
-
}
|
|
241
|
-
const serialized = serializeError(value);
|
|
242
|
-
const { message, type, ...details } = serialized;
|
|
243
|
-
return {
|
|
244
|
-
...details,
|
|
245
|
-
message: typeof message === "string" ? message : String(message),
|
|
246
|
-
...typeof type === "string" ? { type } : {}
|
|
247
|
-
};
|
|
248
|
-
}
|
|
249
213
|
function normalizeMergedArtifacts(contextArtifacts, resultArtifacts) {
|
|
250
214
|
const artifacts = {
|
|
251
215
|
...contextArtifacts ?? {},
|
|
@@ -371,32 +335,6 @@ function createGenAiUsageAttributes(usage, options = {}) {
|
|
|
371
335
|
"gen_ai.usage.reasoning.output_tokens": usage?.reasoningTokens
|
|
372
336
|
};
|
|
373
337
|
}
|
|
374
|
-
function createToolCallSpans(calls, options = {}) {
|
|
375
|
-
return calls.map((call, index) => {
|
|
376
|
-
const spanError = call.error ? normalizeSpanError(call.error) : void 0;
|
|
377
|
-
const spanId = options.spanIdPrefix ? `${options.spanIdPrefix}:${index + 1}` : call.id;
|
|
378
|
-
return {
|
|
379
|
-
...spanId ? { id: spanId } : {},
|
|
380
|
-
...options.traceId ? { traceId: options.traceId } : {},
|
|
381
|
-
...options.parentId ? { parentId: options.parentId } : {},
|
|
382
|
-
name: call.name,
|
|
383
|
-
kind: "tool",
|
|
384
|
-
...call.startedAt ? { startedAt: call.startedAt } : {},
|
|
385
|
-
...call.finishedAt ? { finishedAt: call.finishedAt } : {},
|
|
386
|
-
...call.durationMs !== void 0 ? { durationMs: call.durationMs } : {},
|
|
387
|
-
status: spanError ? "error" : "ok",
|
|
388
|
-
...spanError ? { error: spanError } : {},
|
|
389
|
-
attributes: normalizeSpanAttributes({
|
|
390
|
-
"gen_ai.operation.name": "execute_tool",
|
|
391
|
-
"gen_ai.tool.name": call.name,
|
|
392
|
-
"gen_ai.tool.type": "function",
|
|
393
|
-
...call.id ? { "gen_ai.tool.call.id": call.id } : {},
|
|
394
|
-
...call.arguments !== void 0 ? { "gen_ai.tool.call.arguments": call.arguments } : {},
|
|
395
|
-
...call.result !== void 0 ? { "gen_ai.tool.call.result": call.result } : {}
|
|
396
|
-
})
|
|
397
|
-
};
|
|
398
|
-
});
|
|
399
|
-
}
|
|
400
338
|
function ensureRunTrace(run, options) {
|
|
401
339
|
if (spans(run).length > 0) {
|
|
402
340
|
return void 0;
|
|
@@ -421,11 +359,6 @@ function ensureRunTrace(run, options) {
|
|
|
421
359
|
...createGenAiUsageAttributes(run.usage)
|
|
422
360
|
})
|
|
423
361
|
};
|
|
424
|
-
const toolSpans = createToolCallSpans(toolCalls(run.session), {
|
|
425
|
-
traceId,
|
|
426
|
-
parentId: rootSpanId,
|
|
427
|
-
spanIdPrefix: `${traceId}:tool`
|
|
428
|
-
});
|
|
429
362
|
const trace = {
|
|
430
363
|
id: traceId,
|
|
431
364
|
name: options.name,
|
|
@@ -433,7 +366,7 @@ function ensureRunTrace(run, options) {
|
|
|
433
366
|
finishedAt: options.finishedAt.toISOString(),
|
|
434
367
|
durationMs,
|
|
435
368
|
...options.source ? { metadata: { source: options.source } } : {},
|
|
436
|
-
spans: [runSpan
|
|
369
|
+
spans: [runSpan]
|
|
437
370
|
};
|
|
438
371
|
run.traces = [trace];
|
|
439
372
|
return trace;
|
|
@@ -463,7 +396,7 @@ function isHarnessRun(value) {
|
|
|
463
396
|
return isNormalizedSession(candidate.session) && Boolean(candidate.usage) && typeof candidate.usage === "object" && !Array.isArray(candidate.usage) && Array.isArray(candidate.errors);
|
|
464
397
|
}
|
|
465
398
|
function isNormalizedSession(value) {
|
|
466
|
-
return
|
|
399
|
+
return NormalizedSessionSchema.safeParse(value).success;
|
|
467
400
|
}
|
|
468
401
|
function serializeError(error) {
|
|
469
402
|
if (error instanceof Error) {
|
|
@@ -483,7 +416,10 @@ function createJudgeHarness(options) {
|
|
|
483
416
|
return createHarness({
|
|
484
417
|
name: options.name ?? "judge-harness",
|
|
485
418
|
run: async ({ input, signal }) => {
|
|
486
|
-
return normalizeJudgeHarnessResult(
|
|
419
|
+
return normalizeJudgeHarnessResult(
|
|
420
|
+
input,
|
|
421
|
+
await options.run(input, { signal })
|
|
422
|
+
);
|
|
487
423
|
}
|
|
488
424
|
});
|
|
489
425
|
}
|
|
@@ -506,17 +442,14 @@ function createRunJudge(judgeHarness, signal) {
|
|
|
506
442
|
signal: options?.signal ?? signal
|
|
507
443
|
});
|
|
508
444
|
}
|
|
509
|
-
function normalizeJudgeHarnessResult(result) {
|
|
445
|
+
function normalizeJudgeHarnessResult(input, result) {
|
|
510
446
|
if (isHarnessRun(result)) {
|
|
511
447
|
return result;
|
|
512
448
|
}
|
|
513
|
-
|
|
514
|
-
return {
|
|
515
|
-
output: normalizeJudgeHarnessOutput(result.output)
|
|
516
|
-
};
|
|
517
|
-
}
|
|
449
|
+
const output = hasOutputField(result) ? normalizeJudgeHarnessOutput(result.output) : normalizeJudgeHarnessOutput(result);
|
|
518
450
|
return {
|
|
519
|
-
output
|
|
451
|
+
output,
|
|
452
|
+
messages: createJudgeHarnessMessages(input, output)
|
|
520
453
|
};
|
|
521
454
|
}
|
|
522
455
|
function hasOutputField(value) {
|
|
@@ -528,6 +461,13 @@ function normalizeJudgeHarnessOutput(value) {
|
|
|
528
461
|
}
|
|
529
462
|
return normalizeContent(value);
|
|
530
463
|
}
|
|
464
|
+
function createJudgeHarnessMessages(input, output) {
|
|
465
|
+
return [
|
|
466
|
+
...input.system ? [{ role: "system", content: input.system }] : [],
|
|
467
|
+
{ role: "user", content: input.prompt },
|
|
468
|
+
...output !== void 0 ? [{ role: "assistant", content: output }] : []
|
|
469
|
+
];
|
|
470
|
+
}
|
|
531
471
|
function resolveJudgeHarnessAssistantOutput(run) {
|
|
532
472
|
return latestAssistantMessageContent2(run.session) ?? "";
|
|
533
473
|
}
|
|
@@ -1655,23 +1595,25 @@ function resolveJudgeAssertionOutput(received, run, explicitOutput) {
|
|
|
1655
1595
|
return normalizeJudgeJsonValue(received);
|
|
1656
1596
|
}
|
|
1657
1597
|
function createSyntheticJudgeSession(received, options) {
|
|
1658
|
-
const
|
|
1598
|
+
const events = [];
|
|
1659
1599
|
const userContent = normalizeJudgeJsonValue(options.input);
|
|
1660
1600
|
if (userContent !== void 0) {
|
|
1661
|
-
|
|
1601
|
+
events.push({
|
|
1602
|
+
type: "message",
|
|
1662
1603
|
role: "user",
|
|
1663
1604
|
content: userContent
|
|
1664
1605
|
});
|
|
1665
1606
|
}
|
|
1666
1607
|
const assistantContent = normalizeJudgeJsonValue(received);
|
|
1667
1608
|
if (assistantContent !== void 0) {
|
|
1668
|
-
|
|
1609
|
+
events.push({
|
|
1610
|
+
type: "message",
|
|
1669
1611
|
role: "assistant",
|
|
1670
1612
|
content: assistantContent
|
|
1671
1613
|
});
|
|
1672
1614
|
}
|
|
1673
1615
|
return {
|
|
1674
|
-
|
|
1616
|
+
events
|
|
1675
1617
|
};
|
|
1676
1618
|
}
|
|
1677
1619
|
function inferJudgeOutputValue(received, session) {
|
|
@@ -1679,7 +1621,7 @@ function inferJudgeOutputValue(received, session) {
|
|
|
1679
1621
|
return received.output;
|
|
1680
1622
|
}
|
|
1681
1623
|
if (isNormalizedSession(received)) {
|
|
1682
|
-
return resolveAssistantOutput(session) ?? normalizeJudgeJsonValue(received.
|
|
1624
|
+
return resolveAssistantOutput(session) ?? normalizeJudgeJsonValue(received.events);
|
|
1683
1625
|
}
|
|
1684
1626
|
return normalizeJudgeJsonValue(received);
|
|
1685
1627
|
}
|
|
@@ -1752,7 +1694,6 @@ export {
|
|
|
1752
1694
|
createHarness,
|
|
1753
1695
|
createJudge,
|
|
1754
1696
|
createJudgeHarness,
|
|
1755
|
-
createToolCallSpans,
|
|
1756
1697
|
describeEval,
|
|
1757
1698
|
ensureRunTrace,
|
|
1758
1699
|
failedSpans2 as failedSpans,
|
|
@@ -1760,6 +1701,7 @@ export {
|
|
|
1760
1701
|
getHarnessRunFromError,
|
|
1761
1702
|
latestAssistantMessageContent2 as latestAssistantMessageContent,
|
|
1762
1703
|
messagesByRole2 as messagesByRole,
|
|
1704
|
+
messagesToTranscriptEvents2 as messagesToTranscriptEvents,
|
|
1763
1705
|
normalizeHarnessRun,
|
|
1764
1706
|
normalizeSpanAttributes,
|
|
1765
1707
|
normalizeSpanError,
|