vitest-evals 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +112 -66
- package/dist/harness.d.mts +12 -1
- package/dist/harness.d.ts +12 -1
- package/dist/harness.js +8 -0
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +7 -0
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +48 -20
- package/dist/index.d.ts +48 -20
- package/dist/index.js +308 -21
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +304 -21
- package/dist/index.mjs.map +1 -1
- package/dist/internal/scoring.d.mts +3 -3
- package/dist/internal/scoring.d.ts +3 -3
- package/dist/internal/scoring.js.map +1 -1
- package/dist/internal/toolCallScorer.js +42 -2
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs +42 -2
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.d.mts +151 -0
- package/dist/judges/factualityJudge.d.ts +151 -0
- package/dist/judges/factualityJudge.js +235 -0
- package/dist/judges/factualityJudge.js.map +1 -0
- package/dist/judges/factualityJudge.mjs +208 -0
- package/dist/judges/factualityJudge.mjs.map +1 -0
- package/dist/judges/index.d.mts +3 -1
- package/dist/judges/index.d.ts +3 -1
- package/dist/judges/index.js +447 -7
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +443 -6
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.d.mts +122 -0
- package/dist/judges/judgeHarness.d.ts +122 -0
- package/dist/judges/judgeHarness.js +303 -0
- package/dist/judges/judgeHarness.js.map +1 -0
- package/dist/judges/judgeHarness.mjs +274 -0
- package/dist/judges/judgeHarness.mjs.map +1 -0
- package/dist/judges/structuredOutputJudge.d.mts +1 -0
- package/dist/judges/structuredOutputJudge.d.ts +1 -0
- package/dist/judges/toolCallJudge.d.mts +1 -0
- package/dist/judges/toolCallJudge.d.ts +1 -0
- package/dist/judges/toolCallJudge.js +42 -2
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +42 -2
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +33 -6
- package/dist/judges/types.d.ts +33 -6
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/scorers/index.js +42 -2
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs +42 -2
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js +42 -2
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs +42 -2
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy.js +56 -3
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs +56 -3
- package/dist/legacy.mjs.map +1 -1
- package/dist/replay.js +1 -1
- package/dist/replay.js.map +1 -1
- package/dist/replay.mjs +1 -1
- package/dist/replay.mjs.map +1 -1
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Harness-backed AI testing on top of Vitest.
|
|
4
4
|
|
|
5
|
+
Use this package README for the core authoring model. For a guided setup path,
|
|
6
|
+
runtime-specific harness examples, replay, and GitHub Actions reporting, start
|
|
7
|
+
with the docs site: `https://vitest-evals.sentry.dev/docs`.
|
|
8
|
+
|
|
5
9
|
## Install
|
|
6
10
|
|
|
7
11
|
```sh
|
|
@@ -40,55 +44,28 @@ workflow.
|
|
|
40
44
|
- every judge receives `JudgeContext` with typed `input`, typed `output`, the
|
|
41
45
|
normalized run/session, tool calls, and metadata; `output` is only optional
|
|
42
46
|
when the harness output type includes `undefined`
|
|
43
|
-
- judges own their prompt, rubric,
|
|
44
|
-
`
|
|
45
|
-
when multiple judges share setup
|
|
47
|
+
- judges own their prompt, rubric, and parsing; LLM-backed judges use
|
|
48
|
+
`ctx.runJudge(...)` from a configured `judgeHarness`
|
|
46
49
|
- explicit judge assertions use
|
|
47
50
|
`await expect(result).toSatisfyJudge(judge, context)`
|
|
48
51
|
|
|
49
52
|
## Explicit Run Example
|
|
50
53
|
|
|
51
54
|
```ts
|
|
55
|
+
import { getModel } from "@mariozechner/pi-ai";
|
|
52
56
|
import { expect } from "vitest";
|
|
53
|
-
import { piAiHarness } from "@vitest-evals/harness-pi-ai";
|
|
57
|
+
import { piAiHarness, piAiJudgeHarness } from "@vitest-evals/harness-pi-ai";
|
|
54
58
|
import {
|
|
55
|
-
createJudge,
|
|
56
59
|
describeEval,
|
|
60
|
+
FactualityJudge,
|
|
57
61
|
toolCalls,
|
|
58
|
-
type JudgeContext,
|
|
59
62
|
} from "vitest-evals";
|
|
60
63
|
import { createRefundAgent } from "../src/refundAgent";
|
|
61
64
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
};
|
|
66
|
-
|
|
67
|
-
type RefundOutput = {
|
|
68
|
-
status: "approved" | "denied";
|
|
69
|
-
};
|
|
70
|
-
|
|
71
|
-
const FactualityJudge = createJudge(
|
|
72
|
-
"FactualityJudge",
|
|
73
|
-
async ({
|
|
74
|
-
input,
|
|
75
|
-
output,
|
|
76
|
-
metadata,
|
|
77
|
-
}: JudgeContext<string, RefundOutput, RefundEvalMetadata>) => {
|
|
78
|
-
const verdict = await judgeFactuality({
|
|
79
|
-
question: input,
|
|
80
|
-
answer: output,
|
|
81
|
-
expectedStatus: metadata.expectedStatus,
|
|
82
|
-
});
|
|
83
|
-
|
|
84
|
-
return {
|
|
85
|
-
score: verdict.score,
|
|
86
|
-
metadata: {
|
|
87
|
-
rationale: verdict.rationale,
|
|
88
|
-
},
|
|
89
|
-
};
|
|
90
|
-
},
|
|
91
|
-
);
|
|
65
|
+
const judgeHarness = piAiJudgeHarness({
|
|
66
|
+
model: getModel("anthropic", "claude-sonnet-4-5"),
|
|
67
|
+
temperature: 0,
|
|
68
|
+
});
|
|
92
69
|
|
|
93
70
|
describeEval(
|
|
94
71
|
"refund agent",
|
|
@@ -96,12 +73,15 @@ describeEval(
|
|
|
96
73
|
harness: piAiHarness({
|
|
97
74
|
agent: () => createRefundAgent(),
|
|
98
75
|
}),
|
|
99
|
-
|
|
76
|
+
judgeHarness,
|
|
77
|
+
judges: [FactualityJudge()],
|
|
78
|
+
judgeThreshold: 0.6,
|
|
100
79
|
},
|
|
101
80
|
(it) => {
|
|
102
81
|
it("approves a refundable invoice", async ({ run }) => {
|
|
103
82
|
const result = await run("Refund invoice inv_123", {
|
|
104
83
|
metadata: {
|
|
84
|
+
expected: "The refund request is approved.",
|
|
105
85
|
expectedStatus: "approved",
|
|
106
86
|
expectedTools: ["lookupInvoice", "createRefund"],
|
|
107
87
|
},
|
|
@@ -210,6 +190,7 @@ When generics are needed, use `createHarness<Input, Output, Metadata>(...)`.
|
|
|
210
190
|
import {
|
|
211
191
|
createHarness,
|
|
212
192
|
createJudge,
|
|
193
|
+
createJudgeHarness,
|
|
213
194
|
describeEval,
|
|
214
195
|
type JudgeContext,
|
|
215
196
|
} from "vitest-evals";
|
|
@@ -255,14 +236,25 @@ const appHarness = createHarness<AppEvalInput, AppOutput, AppEvalMetadata>({
|
|
|
255
236
|
},
|
|
256
237
|
});
|
|
257
238
|
|
|
239
|
+
const judgeHarness = createJudgeHarness({
|
|
240
|
+
name: "app-rubric-judge-model",
|
|
241
|
+
run: async ({ prompt }, { signal }) =>
|
|
242
|
+
promptJudgeModel({ prompt, signal }),
|
|
243
|
+
});
|
|
244
|
+
|
|
258
245
|
const AppRubricJudge = createJudge(
|
|
259
246
|
"AppRubricJudge",
|
|
260
247
|
async (ctx: JudgeContext<AppEvalInput, AppOutput, AppEvalMetadata>) => {
|
|
261
|
-
|
|
248
|
+
if (!ctx.runJudge) {
|
|
249
|
+
throw new Error("AppRubricJudge requires a configured judgeHarness.");
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
const verdict = await ctx.runJudge({
|
|
262
253
|
prompt: formatRubricPrompt({
|
|
263
254
|
output: ctx.output,
|
|
264
255
|
criteria: ctx.input.criteria,
|
|
265
256
|
}),
|
|
257
|
+
responseFormat: { type: "json" },
|
|
266
258
|
});
|
|
267
259
|
|
|
268
260
|
return parseRubricVerdict(verdict);
|
|
@@ -273,6 +265,7 @@ describeEval(
|
|
|
273
265
|
"app behavior",
|
|
274
266
|
{
|
|
275
267
|
harness: appHarness,
|
|
268
|
+
judgeHarness,
|
|
276
269
|
judges: [AppRubricJudge],
|
|
277
270
|
judgeThreshold: 0.75,
|
|
278
271
|
},
|
|
@@ -328,11 +321,26 @@ In practice, this is usually most useful for factuality, rubric, or grounded
|
|
|
328
321
|
answer checks:
|
|
329
322
|
|
|
330
323
|
```ts
|
|
331
|
-
|
|
324
|
+
import { openai } from "@ai-sdk/openai";
|
|
325
|
+
import { aiSdkJudgeHarness } from "@vitest-evals/harness-ai-sdk";
|
|
326
|
+
import { expect } from "vitest";
|
|
327
|
+
import { FactualityJudge } from "vitest-evals";
|
|
328
|
+
|
|
329
|
+
const judgeHarness = aiSdkJudgeHarness({
|
|
330
|
+
model: openai("gpt-4.1-mini"),
|
|
331
|
+
temperature: 0,
|
|
332
|
+
});
|
|
333
|
+
const factualityJudge = FactualityJudge({ judgeHarness });
|
|
334
|
+
|
|
335
|
+
await expect(result).toSatisfyJudge(factualityJudge, {
|
|
336
|
+
expected: "Paris is the capital of France.",
|
|
337
|
+
threshold: 0.6,
|
|
338
|
+
});
|
|
332
339
|
```
|
|
333
340
|
|
|
334
341
|
For lower-level cases, the matcher also accepts raw values and synthetic judge
|
|
335
|
-
context
|
|
342
|
+
context. Pass every context field the judge needs when the value did not come
|
|
343
|
+
from eval fixture `run(...)`:
|
|
336
344
|
|
|
337
345
|
```ts
|
|
338
346
|
await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
|
|
@@ -340,35 +348,75 @@ await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
|
|
|
340
348
|
});
|
|
341
349
|
```
|
|
342
350
|
|
|
343
|
-
Use
|
|
344
|
-
|
|
351
|
+
Use the built-in factuality judge when you want a model-backed factuality grade
|
|
352
|
+
over the normalized run:
|
|
345
353
|
|
|
346
354
|
```ts
|
|
347
|
-
import {
|
|
355
|
+
import { openai } from "@ai-sdk/openai";
|
|
356
|
+
import { aiSdkJudgeHarness } from "@vitest-evals/harness-ai-sdk";
|
|
357
|
+
import { FactualityJudge } from "vitest-evals";
|
|
348
358
|
|
|
349
|
-
const
|
|
350
|
-
"
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
359
|
+
export const judgeHarness = aiSdkJudgeHarness({
|
|
360
|
+
model: openai("gpt-4.1-mini"),
|
|
361
|
+
temperature: 0,
|
|
362
|
+
});
|
|
363
|
+
export const factualityJudge = FactualityJudge({ judgeHarness });
|
|
364
|
+
```
|
|
354
365
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
366
|
+
For custom judge providers, create a dedicated judge harness with the same
|
|
367
|
+
prompt contract:
|
|
368
|
+
|
|
369
|
+
```ts
|
|
370
|
+
import {
|
|
371
|
+
createJudgeHarness,
|
|
372
|
+
FactualityJudge,
|
|
373
|
+
type JudgeHarness,
|
|
374
|
+
} from "vitest-evals";
|
|
375
|
+
import { callJudgeModel } from "./judgeModel";
|
|
376
|
+
|
|
377
|
+
export const judgeHarness: JudgeHarness = createJudgeHarness({
|
|
378
|
+
name: "factuality-judge-model",
|
|
379
|
+
run: async ({ system, prompt }, { signal }) =>
|
|
380
|
+
callJudgeModel({ system, prompt, signal }),
|
|
381
|
+
});
|
|
382
|
+
|
|
383
|
+
export const factualityJudge = FactualityJudge({ judgeHarness });
|
|
363
384
|
```
|
|
364
385
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
386
|
+
Configure that judge harness once and reuse the same judge with any app
|
|
387
|
+
harness:
|
|
388
|
+
|
|
389
|
+
```ts
|
|
390
|
+
import { describeEval } from "vitest-evals";
|
|
391
|
+
import { aiSdkRefundHarness } from "./aiSdkRefundHarness";
|
|
392
|
+
import { piRefundHarness } from "./piRefundHarness";
|
|
393
|
+
import { factualityJudge } from "./sharedJudges";
|
|
394
|
+
|
|
395
|
+
describeEval("ai sdk refund agent", {
|
|
396
|
+
harness: aiSdkRefundHarness,
|
|
397
|
+
judges: [factualityJudge],
|
|
398
|
+
});
|
|
399
|
+
|
|
400
|
+
describeEval("pi refund agent", {
|
|
401
|
+
harness: piRefundHarness,
|
|
402
|
+
judges: [factualityJudge],
|
|
403
|
+
});
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
Use `createJudge(...)` for custom judges so reporter output gets a stable
|
|
407
|
+
label. Custom LLM-backed judges should provide their own judge prompt, rubric
|
|
408
|
+
text, and parser, then call `ctx.runJudge(...)` for the provider-specific model
|
|
409
|
+
request. Bind a reusable default with `createJudge({ name, judgeHarness,
|
|
410
|
+
assess })` or pass `judgeHarness` on the matcher or suite. Core curries the
|
|
411
|
+
matcher, judge, or explicit suite `judgeHarness` into that function with the
|
|
412
|
+
current run's abort signal. Matcher options win over a judge default, and a
|
|
413
|
+
judge default wins over the suite default. Explicit matcher calls can also
|
|
414
|
+
reuse a single unambiguous judge-level harness from the suite's automatic
|
|
415
|
+
judges, but automatic judges do not inherit inferred harnesses from sibling
|
|
416
|
+
judges. That inference requires those judges to share the same judge harness
|
|
417
|
+
instance. Leave `judgeHarness` unset for suites that only use deterministic
|
|
418
|
+
judges. Calling `harness.run(...)` from a judge executes the application again,
|
|
419
|
+
so use that only when a second run is intentional.
|
|
372
420
|
|
|
373
421
|
For an `EvalHarnessRun` returned by fixture `run(...)`,
|
|
374
422
|
`toSatisfyJudge(...)` uses the run's typed `output` and reuses the registered
|
|
@@ -386,6 +434,4 @@ When a judge needs richer normalized context or the configured suite harness,
|
|
|
386
434
|
type it with `JudgeContext`.
|
|
387
435
|
|
|
388
436
|
When you only need deterministic contract checks, built-ins such as
|
|
389
|
-
`StructuredOutputJudge()` and `ToolCallJudge()` are still available.
|
|
390
|
-
documentation examples intentionally use factuality/rubric judges because those
|
|
391
|
-
match the product's LLM-as-a-judge direction.
|
|
437
|
+
`StructuredOutputJudge()` and `ToolCallJudge()` are still available.
|
package/dist/harness.d.mts
CHANGED
|
@@ -414,6 +414,17 @@ declare function userMessages(session: NormalizedSession): NormalizedMessage[];
|
|
|
414
414
|
* ```
|
|
415
415
|
*/
|
|
416
416
|
declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
|
|
417
|
+
/**
|
|
418
|
+
* Returns the latest assistant message content, ignoring empty text messages.
|
|
419
|
+
*
|
|
420
|
+
* @param session - Normalized session produced by a harness run.
|
|
421
|
+
*
|
|
422
|
+
* @example
|
|
423
|
+
* ```ts
|
|
424
|
+
* const finalAnswer = latestAssistantMessageContent(result.session);
|
|
425
|
+
* ```
|
|
426
|
+
*/
|
|
427
|
+
declare function latestAssistantMessageContent(session: NormalizedSession): JsonValue | undefined;
|
|
417
428
|
/**
|
|
418
429
|
* Returns every normalized tool message from a session.
|
|
419
430
|
*
|
|
@@ -465,4 +476,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
465
476
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
466
477
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
467
478
|
|
|
468
|
-
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
|
479
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, latestAssistantMessageContent, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
package/dist/harness.d.ts
CHANGED
|
@@ -414,6 +414,17 @@ declare function userMessages(session: NormalizedSession): NormalizedMessage[];
|
|
|
414
414
|
* ```
|
|
415
415
|
*/
|
|
416
416
|
declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
|
|
417
|
+
/**
|
|
418
|
+
* Returns the latest assistant message content, ignoring empty text messages.
|
|
419
|
+
*
|
|
420
|
+
* @param session - Normalized session produced by a harness run.
|
|
421
|
+
*
|
|
422
|
+
* @example
|
|
423
|
+
* ```ts
|
|
424
|
+
* const finalAnswer = latestAssistantMessageContent(result.session);
|
|
425
|
+
* ```
|
|
426
|
+
*/
|
|
427
|
+
declare function latestAssistantMessageContent(session: NormalizedSession): JsonValue | undefined;
|
|
417
428
|
/**
|
|
418
429
|
* Returns every normalized tool message from a session.
|
|
419
430
|
*
|
|
@@ -465,4 +476,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
465
476
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
466
477
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
467
478
|
|
|
468
|
-
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
|
479
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, latestAssistantMessageContent, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
package/dist/harness.js
CHANGED
|
@@ -27,6 +27,7 @@ __export(harness_exports, {
|
|
|
27
27
|
hasCallableMethod: () => hasCallableMethod,
|
|
28
28
|
isHarnessRun: () => isHarnessRun,
|
|
29
29
|
isNormalizedSession: () => isNormalizedSession,
|
|
30
|
+
latestAssistantMessageContent: () => latestAssistantMessageContent,
|
|
30
31
|
messagesByRole: () => messagesByRole,
|
|
31
32
|
normalizeContent: () => normalizeContent,
|
|
32
33
|
normalizeHarnessRun: () => normalizeHarnessRun,
|
|
@@ -225,6 +226,9 @@ function toolCalls(session) {
|
|
|
225
226
|
function messagesByRole(session, role) {
|
|
226
227
|
return session.messages.filter((message) => message.role === role);
|
|
227
228
|
}
|
|
229
|
+
function hasNonEmptyMessageContent(message) {
|
|
230
|
+
return message.content !== void 0 && (typeof message.content !== "string" || message.content.trim().length > 0);
|
|
231
|
+
}
|
|
228
232
|
function systemMessages(session) {
|
|
229
233
|
return messagesByRole(session, "system");
|
|
230
234
|
}
|
|
@@ -234,6 +238,9 @@ function userMessages(session) {
|
|
|
234
238
|
function assistantMessages(session) {
|
|
235
239
|
return messagesByRole(session, "assistant");
|
|
236
240
|
}
|
|
241
|
+
function latestAssistantMessageContent(session) {
|
|
242
|
+
return [...assistantMessages(session)].reverse().find(hasNonEmptyMessageContent)?.content;
|
|
243
|
+
}
|
|
237
244
|
function toolMessages(session) {
|
|
238
245
|
return messagesByRole(session, "tool");
|
|
239
246
|
}
|
|
@@ -286,6 +293,7 @@ function serializeError(error) {
|
|
|
286
293
|
hasCallableMethod,
|
|
287
294
|
isHarnessRun,
|
|
288
295
|
isNormalizedSession,
|
|
296
|
+
latestAssistantMessageContent,
|
|
289
297
|
messagesByRole,
|
|
290
298
|
normalizeContent,
|
|
291
299
|
normalizeHarnessRun,
|
package/dist/harness.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/harness.ts"],"sourcesContent":["/** Primitive scalar values allowed in normalized JSON-safe eval data. */\nexport type JsonPrimitive = string | number | boolean | null;\n\n/** JSON-safe value shape used by normalized sessions, artifacts, and errors. */\nexport type JsonValue =\n | JsonPrimitive\n | JsonValue[]\n | { [key: string]: JsonValue };\n\n/**\n * Normalized record for one tool call observed during a harness run.\n *\n * @example\n * ```ts\n * const call: ToolCallRecord = {\n * name: \"lookupInvoice\",\n * arguments: { invoiceId: \"inv_123\" },\n * result: { refundable: true },\n * };\n * ```\n */\nexport type ToolCallRecord = {\n /** Provider or runtime tool-call id when one is available. */\n id?: string;\n /** Tool name as exposed to the agent or application runtime. */\n name: string;\n /** JSON-safe tool arguments after provider/runtime normalization. */\n arguments?: Record<string, JsonValue>;\n /** JSON-safe tool result returned by the application tool. */\n result?: JsonValue;\n /** Normalized tool error when execution failed. */\n error?: {\n message: string;\n type?: string;\n [key: string]: JsonValue | undefined;\n };\n /** ISO timestamp for the start of tool execution. */\n startedAt?: string;\n /** ISO timestamp for the end of tool execution. */\n finishedAt?: string;\n /** Tool execution duration in milliseconds. */\n durationMs?: number;\n /** Extra JSON-safe tool metadata for reporters and custom judges. */\n metadata?: Record<string, JsonValue>;\n};\n\n/**\n * Normalized message recorded in a harness session transcript.\n *\n * @example\n * ```ts\n * const message: NormalizedMessage = {\n * role: \"assistant\",\n * content: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\" }],\n * };\n * ```\n */\nexport type NormalizedMessage = {\n /** Transcript role for the normalized message. */\n role: \"system\" | \"user\" | \"assistant\" | \"tool\";\n /** JSON-safe message content. */\n content?: JsonValue;\n /** Tool calls associated with this message. */\n toolCalls?: ToolCallRecord[];\n /** Extra JSON-safe message metadata. */\n metadata?: Record<string, JsonValue>;\n};\n\n/**\n * Provider usage summary attached to a normalized harness run.\n *\n * @example\n * ```ts\n * const usage: UsageSummary = {\n * provider: \"openai\",\n * model: \"gpt-4o-mini\",\n * inputTokens: 212,\n * outputTokens: 48,\n * totalTokens: 260,\n * };\n * ```\n */\nexport type UsageSummary = {\n /** Provider that served the application run. */\n provider?: string;\n /** Model used for the application run. */\n model?: string;\n /** Input, prompt, or request tokens consumed by the run. */\n inputTokens?: number;\n /** Output or completion tokens produced by the run. */\n outputTokens?: number;\n /** Reasoning tokens reported by providers that expose them. */\n reasoningTokens?: number;\n /** Total token count reported by the provider or adapter. */\n totalTokens?: number;\n /** Count of tool calls observed during the run. */\n toolCalls?: number;\n /** Retry count observed during the run. */\n retries?: number;\n /** Provider-specific JSON-safe usage details. Cost estimates belong here. */\n metadata?: Record<string, JsonValue>;\n};\n\n/** Timing summary attached to a normalized harness run. */\nexport type TimingSummary = {\n /** End-to-end run duration in milliseconds. */\n totalMs?: number;\n /** Extra JSON-safe timing metadata. */\n metadata?: Record<string, JsonValue>;\n};\n\n/**\n * JSON-serializable transcript produced by the system under test.\n *\n * @example\n * ```ts\n * const session: NormalizedSession = {\n * provider: \"openai\",\n * model: \"gpt-4o-mini\",\n * messages: [\n * { role: \"user\", content: \"Refund invoice inv_123\" },\n * { role: \"assistant\", content: { status: \"approved\" } },\n * ],\n * };\n * ```\n */\nexport type NormalizedSession = {\n /** Ordered normalized transcript messages. */\n messages: NormalizedMessage[];\n /** Provider that produced the session when known. */\n provider?: string;\n /** Model that produced the session when known. */\n model?: string;\n /** Extra JSON-safe session metadata. */\n metadata?: Record<string, JsonValue>;\n};\n\ntype OutputField<TOutput extends JsonValue | undefined> =\n undefined extends TOutput ? { output?: TOutput } : { output: TOutput };\n\n/**\n * Normalized result returned by every harness execution.\n *\n * @example\n * ```ts\n * const run: HarnessRun<{ status: \"approved\" }> = {\n * output: { status: \"approved\" },\n * session: {\n * messages: [\n * { role: \"user\", content: \"Refund invoice inv_123\" },\n * { role: \"assistant\", content: { status: \"approved\" } },\n * ],\n * },\n * usage: { totalTokens: 260 },\n * errors: [],\n * };\n * ```\n */\nexport type HarnessRun<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> & {\n /** Normalized transcript and provider/session metadata. */\n session: NormalizedSession;\n /** Stable provider usage units such as tokens, tools, and retries. */\n usage: UsageSummary;\n /** Optional timing summary for the run. */\n timings?: TimingSummary;\n /** JSON-safe run artifacts captured by the harness or test context. */\n artifacts?: Record<string, JsonValue>;\n /** Normalized errors captured during execution. */\n errors: Array<Record<string, JsonValue>>;\n};\n\n/** Error value with an attached partial or complete normalized harness run. */\nexport type HarnessRunError = Error & {\n /** Attached normalized harness run recovered by `getHarnessRunFromError(...)`. */\n vitestEvalsRun: HarnessRun;\n};\n\n/** Per-run metadata shape accepted by harnesses and eval tests. */\nexport type HarnessMetadata = Record<string, unknown>;\n\n/**\n * Runtime context passed from the eval fixture into a harness run.\n *\n * @example\n * ```ts\n * const harness: Harness<string> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * context.setArtifact(\"inputLength\", input.length);\n *\n * return {\n * output: undefined,\n * session: { messages: [{ role: \"user\", content: input }] },\n * usage: {},\n * errors: [],\n * };\n * },\n * };\n * ```\n */\nexport type HarnessContext<\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Per-run metadata passed through `run(input, { metadata })`. */\n metadata: Readonly<TMetadata>;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable JSON-safe artifact bag shared with the harness. */\n artifacts: Record<string, JsonValue>;\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\n/**\n * Adapter that executes the system under test and returns a normalized run.\n *\n * @example\n * ```ts\n * const harness: Harness<string, { status: \"approved\" | \"denied\" }> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * return normalizeHarnessRun(input, await runRefundFlow(input), context);\n * },\n * };\n * ```\n */\nexport type Harness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes the system under test and returns a normalized run. */\n run: (\n input: TInput,\n context: HarnessContext<TMetadata>,\n ) => Promise<HarnessRun<TOutput>>;\n};\n\n/** Value or promise accepted by lightweight harness callbacks. */\nexport type MaybePromise<T> = T | Promise<T>;\n\n/** Lightweight tool-call record accepted by `createHarness(...)` results. */\nexport type SimpleToolCallRecord = Omit<\n ToolCallRecord,\n \"arguments\" | \"result\" | \"error\" | \"metadata\"\n> & {\n /** Raw tool arguments accepted by `createHarness(...)` before normalization. */\n arguments?: unknown;\n /** Raw tool result accepted by `createHarness(...)` before normalization. */\n result?: unknown;\n /** Raw tool error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw tool metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n};\n\n/**\n * Lightweight result shape normalized by `createHarness(...)`.\n *\n * @example\n * ```ts\n * const result: SimpleHarnessResult<{ status: \"approved\" }> = {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { totalTokens: 260 },\n * };\n * ```\n */\nexport type SimpleHarnessResult<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> & {\n /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */\n messages?: NormalizedMessage[];\n /** Lightweight tool-call records to normalize into the session. */\n toolCalls?: SimpleToolCallRecord[];\n /** Usage summary to attach to the run. */\n usage?: UsageSummary;\n /** Timing summary to attach to the run. */\n timings?: TimingSummary;\n /** Raw artifact values to normalize and merge into the run. */\n artifacts?: Record<string, unknown>;\n /** Raw session metadata to normalize into the session. */\n metadata?: Record<string, unknown>;\n /** Raw errors to normalize into the run. */\n errors?: unknown[];\n};\n\n/** Either a complete normalized run or a lightweight result to normalize. */\nexport type HarnessResultLike<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;\n\n/** Arguments passed to the `createHarness(...)` convenience callback. */\nexport type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {\n /** Original input passed to `run(input)`. */\n input: TInput;\n /** Read-only metadata passed to `run(input, { metadata })`. */\n metadata: Readonly<TMetadata>;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable run artifact bag. */\n artifacts: HarnessContext<TMetadata>[\"artifacts\"];\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: HarnessContext<TMetadata>[\"setArtifact\"];\n};\n\n/**\n * Options for creating a lightweight custom application harness.\n *\n * @example\n * ```ts\n * const options: CreateHarnessOptions<string, { status: \"approved\" }> = {\n * name: \"refund-agent\",\n * run: async ({ input }) => ({\n * output: await classifyRefund(input),\n * }),\n * };\n * ```\n */\nexport type CreateHarnessOptions<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes application code and returns either a lightweight result or full `HarnessRun`. */\n run: (\n args: CreateHarnessRunArgs<TInput, TMetadata>,\n ) => MaybePromise<HarnessResultLike<TOutput>>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"number\" ||\n typeof value === \"boolean\"\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[]): JsonValue[] {\n return value.map((item) => {\n const normalized = toJsonValue(item);\n return normalized === undefined ? null : normalized;\n });\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValue(entryValue);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value);\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n const normalized = toJsonValue(value);\n return normalized !== undefined ? normalized : String(value);\n}\n\n/**\n * Creates a harness from the common \"run app code and return output\" shape.\n *\n * @param options - Harness name plus the callback that executes app code.\n *\n * @example\n * ```ts\n * import { createHarness } from \"vitest-evals\";\n *\n * export const refundHarness = createHarness<\n * string,\n * { status: \"approved\" | \"denied\" },\n * { expected: { status: \"approved\" | \"denied\" } }\n * >({\n * name: \"refund-agent\",\n * run: async ({ input, metadata, setArtifact }) => {\n * const result = await runRefundFlow(input, metadata);\n * const output = { status: result.status };\n *\n * setArtifact(\"case\", { expected: metadata.expected.status });\n *\n * return {\n * output,\n * toolCalls: result.toolCalls,\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * };\n * },\n * });\n * ```\n */\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n>(\n options: CreateHarnessOptions<TInput, TOutput, TMetadata>,\n): Harness<TInput, TOutput, TMetadata>;\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n>(\n options: CreateHarnessOptions<TInput, TOutput, TMetadata>,\n): Harness<TInput, TOutput, TMetadata> {\n const harness: Harness<TInput, TOutput, TMetadata> = {\n name: options.name,\n run: async (input, context) => {\n const result = await options.run({\n input,\n metadata: context.metadata,\n signal: context.signal,\n artifacts: context.artifacts,\n setArtifact: context.setArtifact,\n });\n\n return normalizeHarnessRun(input, result, context);\n },\n };\n\n return harness;\n}\n\n/**\n * Normalizes a lightweight harness result into the reporter-facing run shape.\n *\n * @param input - Original input passed to the harness.\n * @param result - Lightweight result or pre-normalized harness run.\n * @param context - Optional per-run context used to merge artifacts.\n *\n * @example\n * ```ts\n * const run = normalizeHarnessRun(\"Refund invoice inv_123\", {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * });\n *\n * expect(toolCalls(run.session)).toHaveLength(1);\n * ```\n */\nexport function normalizeHarnessRun<\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(\n input: TInput,\n result: HarnessResultLike<TOutput>,\n context?: HarnessContext<TMetadata>,\n): HarnessRun<TOutput> {\n if (isHarnessRun(result)) {\n if (\n context &&\n Object.keys(context.artifacts).length > 0 &&\n !result.artifacts\n ) {\n return {\n ...result,\n artifacts: context.artifacts,\n };\n }\n\n return result;\n }\n\n const output = result.output;\n const toolCalls = normalizeSimpleToolCalls(result.toolCalls);\n const usage = result.usage ?? {};\n const messages =\n result.messages ??\n createDefaultSessionMessages({\n input,\n output,\n toolCalls,\n });\n const metadata = result.metadata\n ? normalizeMetadata(result.metadata)\n : undefined;\n const artifacts = normalizeMergedArtifacts(\n context?.artifacts,\n result.artifacts,\n );\n\n return {\n session: {\n messages,\n ...(usage.provider ? { provider: usage.provider } : {}),\n ...(usage.model ? { model: usage.model } : {}),\n ...(metadata ? { metadata } : {}),\n },\n ...(output !== undefined ? { output } : {}),\n usage,\n ...(result.timings ? { timings: result.timings } : {}),\n ...(artifacts ? { artifacts } : {}),\n errors: normalizeSimpleErrors(result.errors),\n } as HarnessRun<TOutput>;\n}\n\nfunction createDefaultSessionMessages<TInput>({\n input,\n output,\n toolCalls: normalizedToolCalls,\n}: {\n input: TInput;\n output: JsonValue | undefined;\n toolCalls: ToolCallRecord[];\n}): NormalizedMessage[] {\n const messages: NormalizedMessage[] = [\n {\n role: \"user\",\n content: normalizeContent(input),\n },\n ];\n\n if (output !== undefined || normalizedToolCalls.length > 0) {\n messages.push({\n role: \"assistant\",\n ...(output !== undefined ? { content: normalizeContent(output) } : {}),\n ...(normalizedToolCalls.length > 0\n ? { toolCalls: normalizedToolCalls }\n : {}),\n });\n }\n\n return messages;\n}\n\nfunction normalizeSimpleToolCalls(\n calls: SimpleToolCallRecord[] | undefined,\n): ToolCallRecord[] {\n return (calls ?? []).map((call) => {\n const {\n arguments: rawArguments,\n result: rawResult,\n error: rawError,\n metadata: rawMetadata,\n ...toolCall\n } = call;\n const args = normalizeToolCallArguments(rawArguments);\n const result = toJsonValue(rawResult);\n const error = normalizeToolCallError(rawError);\n const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : undefined;\n\n return {\n ...toolCall,\n ...(args ? { arguments: args } : {}),\n ...(result !== undefined ? { result } : {}),\n ...(error ? { error } : {}),\n ...(metadata ? { metadata } : {}),\n };\n });\n}\n\nfunction normalizeToolCallArguments(\n value: unknown,\n): Record<string, JsonValue> | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const normalized = toJsonValue(value);\n return normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized)\n ? normalized\n : undefined;\n}\n\nfunction normalizeToolCallError(\n value: unknown,\n): ToolCallRecord[\"error\"] | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const serialized = serializeError(value);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\nfunction normalizeMergedArtifacts(\n contextArtifacts: Record<string, JsonValue> | undefined,\n resultArtifacts: Record<string, unknown> | undefined,\n) {\n const artifacts = {\n ...(contextArtifacts ?? {}),\n ...(resultArtifacts ? normalizeRecord(resultArtifacts) : {}),\n };\n\n return Object.keys(artifacts).length > 0 ? artifacts : undefined;\n}\n\nfunction normalizeSimpleErrors(\n errors: unknown[] | undefined,\n): Array<Record<string, JsonValue>> {\n return (errors ?? []).map((error) => {\n const normalized = toJsonValue(error);\n\n if (\n normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized) &&\n Object.keys(normalized).length > 0\n ) {\n return normalized;\n }\n\n return serializeError(error);\n });\n}\n\n/**\n * Flattens every recorded tool call from a normalized session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const names = toolCalls(result.session).map((call) => call.name);\n *\n * expect(names).toEqual([\"lookupInvoice\", \"createRefund\"]);\n * ```\n */\nexport function toolCalls(session: NormalizedSession): ToolCallRecord[] {\n return session.messages.flatMap((message) => message.toolCalls ?? []);\n}\n\n/**\n * Filters normalized session messages by role.\n *\n * @param session - Normalized session produced by a harness run.\n * @param role - Message role to keep.\n *\n * @example\n * ```ts\n * const assistantText = messagesByRole(result.session, \"assistant\")\n * .map((message) => message.content)\n * .join(\"\\n\");\n * ```\n */\nexport function messagesByRole(\n session: NormalizedSession,\n role: NormalizedMessage[\"role\"],\n): NormalizedMessage[] {\n return session.messages.filter((message) => message.role === role);\n}\n\n/**\n * Returns every normalized system message from a session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const systemPrompts = systemMessages(result.session);\n * ```\n */\nexport function systemMessages(session: NormalizedSession) {\n return messagesByRole(session, \"system\");\n}\n\n/**\n * Returns every normalized user message from a session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const firstPrompt = userMessages(result.session)[0]?.content;\n * ```\n */\nexport function userMessages(session: NormalizedSession) {\n return messagesByRole(session, \"user\");\n}\n\n/**\n * Returns every normalized assistant message from a session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const finalAnswer = assistantMessages(result.session).at(-1)?.content;\n * ```\n */\nexport function assistantMessages(session: NormalizedSession) {\n return messagesByRole(session, \"assistant\");\n}\n\n/**\n * Returns every normalized tool message from a session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const toolOutputs = toolMessages(result.session).map((message) => message.content);\n * ```\n */\nexport function toolMessages(session: NormalizedSession) {\n return messagesByRole(session, \"tool\");\n}\n\n/**\n * Attaches a partial or complete harness run to an arbitrary thrown error.\n *\n * @param error - Thrown value to wrap.\n * @param run - Partial or complete normalized harness run to preserve.\n *\n * @example\n * ```ts\n * try {\n * return await runAgent(input);\n * } catch (error) {\n * throw attachHarnessRunToError(error, partialRun);\n * }\n * ```\n */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/**\n * Reads an attached harness run back off a previously wrapped error value.\n *\n * @param error - Unknown thrown value that may contain a harness run.\n *\n * @example\n * ```ts\n * const partialRun = getHarnessRunFromError(error);\n *\n * if (partialRun) {\n * console.log(toolCalls(partialRun.session));\n * }\n * ```\n */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiVA,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,YACjB,OAAO,UAAU;AAErB;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAA+B;AACzD,SAAO,MAAM,IAAI,CAAC,SAAS;AACzB,UAAM,aAAa,YAAY,IAAI;AACnC,WAAO,eAAe,SAAY,OAAO;AAAA,EAC3C,CAAC;AACH;AAEA,SAAS,oBACP,OAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,aAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,UAAM,QAAQ,YAAY,UAAU;AACpC,QAAI,UAAU,QAAW;AACvB,iBAAW,GAAG,IAAI;AAAA,IACpB;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,kBAAkB,OAAgB,YAAoB;AACpE,SACE,UAAU,SACT,OAAO,UAAU,YAAY,OAAO,UAAU,eAC/C,cAAc,SACd,OAAQ,MAAkC,UAAU,MAAM;AAE9D;AAGO,SAAS,YAAY,OAAuC;AACjE,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,KAAK;AAAA,EACjC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,KAAK;AAAA,EAClC;AAEA,SAAO;AACT;AAGO,SAAS,gBACd,OAC2B;AAC3B,SAAO,oBAAoB,KAAK;AAClC;AAGO,SAAS,kBACd,OACuC;AACvC,QAAM,aAAa,gBAAgB,KAAK;AACxC,SAAO,OAAO,KAAK,UAAU,EAAE,SAAS,IAAI,aAAa;AAC3D;AAGO,SAAS,iBAAiB,OAA2B;AAC1D,QAAM,aAAa,YAAY,KAAK;AACpC,SAAO,eAAe,SAAY,aAAa,OAAO,KAAK;AAC7D;AAuCO,SAAS,cAKd,SACqC;AACrC,QAAM,UAA+C;AAAA,IACnD,MAAM,QAAQ;AAAA,IACd,KAAK,OAAO,OAAO,YAAY;AAC7B,YAAM,SAAS,MAAM,QAAQ,IAAI;AAAA,QAC/B;AAAA,QACA,UAAU,QAAQ;AAAA,QAClB,QAAQ,QAAQ;AAAA,QAChB,WAAW,QAAQ;AAAA,QACnB,aAAa,QAAQ;AAAA,MACvB,CAAC;AAED,aAAO,oBAAoB,OAAO,QAAQ,OAAO;AAAA,IACnD;AAAA,EACF;AAEA,SAAO;AACT;AAoBO,SAAS,oBAKd,OACA,QACA,SACqB;AACrB,MAAI,aAAa,MAAM,GAAG;AACxB,QACE,WACA,OAAO,KAAK,QAAQ,SAAS,EAAE,SAAS,KACxC,CAAC,OAAO,WACR;AACA,aAAO;AAAA,QACL,GAAG;AAAA,QACH,WAAW,QAAQ;AAAA,MACrB;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AAEA,QAAM,SAAS,OAAO;AACtB,QAAMA,aAAY,yBAAyB,OAAO,SAAS;AAC3D,QAAM,QAAQ,OAAO,SAAS,CAAC;AAC/B,QAAM,WACJ,OAAO,YACP,6BAA6B;AAAA,IAC3B;AAAA,IACA;AAAA,IACA,WAAAA;AAAA,EACF,CAAC;AACH,QAAM,WAAW,OAAO,WACpB,kBAAkB,OAAO,QAAQ,IACjC;AACJ,QAAM,YAAY;AAAA,IAChB,SAAS;AAAA,IACT,OAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,SAAS;AAAA,MACP;AAAA,MACA,GAAI,MAAM,WAAW,EAAE,UAAU,MAAM,SAAS,IAAI,CAAC;AAAA,MACrD,GAAI,MAAM,QAAQ,EAAE,OAAO,MAAM,MAAM,IAAI,CAAC;AAAA,MAC5C,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IACjC;AAAA,IACA,GAAI,WAAW,SAAY,EAAE,OAAO,IAAI,CAAC;AAAA,IACzC;AAAA,IACA,GAAI,OAAO,UAAU,EAAE,SAAS,OAAO,QAAQ,IAAI,CAAC;AAAA,IACpD,GAAI,YAAY,EAAE,UAAU,IAAI,CAAC;AAAA,IACjC,QAAQ,sBAAsB,OAAO,MAAM;AAAA,EAC7C;AACF;AAEA,SAAS,6BAAqC;AAAA,EAC5C;AAAA,EACA;AAAA,EACA,WAAW;AACb,GAIwB;AACtB,QAAM,WAAgC;AAAA,IACpC;AAAA,MACE,MAAM;AAAA,MACN,SAAS,iBAAiB,KAAK;AAAA,IACjC;AAAA,EACF;AAEA,MAAI,WAAW,UAAa,oBAAoB,SAAS,GAAG;AAC1D,aAAS,KAAK;AAAA,MACZ,MAAM;AAAA,MACN,GAAI,WAAW,SAAY,EAAE,SAAS,iBAAiB,MAAM,EAAE,IAAI,CAAC;AAAA,MACpE,GAAI,oBAAoB,SAAS,IAC7B,EAAE,WAAW,oBAAoB,IACjC,CAAC;AAAA,IACP,CAAC;AAAA,EACH;AAEA,SAAO;AACT;AAEA,SAAS,yBACP,OACkB;AAClB,UAAQ,SAAS,CAAC,GAAG,IAAI,CAAC,SAAS;AACjC,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,QAAQ;AAAA,MACR,OAAO;AAAA,MACP,UAAU;AAAA,MACV,GAAG;AAAA,IACL,IAAI;AACJ,UAAM,OAAO,2BAA2B,YAAY;AACpD,UAAM,SAAS,YAAY,SAAS;AACpC,UAAM,QAAQ,uBAAuB,QAAQ;AAC7C,UAAM,WAAW,cAAc,kBAAkB,WAAW,IAAI;AAEhE,WAAO;AAAA,MACL,GAAG;AAAA,MACH,GAAI,OAAO,EAAE,WAAW,KAAK,IAAI,CAAC;AAAA,MAClC,GAAI,WAAW,SAAY,EAAE,OAAO,IAAI,CAAC;AAAA,MACzC,GAAI,QAAQ,EAAE,MAAM,IAAI,CAAC;AAAA,MACzB,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IACjC;AAAA,EACF,CAAC;AACH;AAEA,SAAS,2BACP,OACuC;AACvC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,YAAY,KAAK;AACpC,SAAO,cACL,OAAO,eAAe,YACtB,CAAC,MAAM,QAAQ,UAAU,IACvB,aACA;AACN;AAEA,SAAS,uBACP,OACqC;AACrC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,eAAe,KAAK;AACvC,QAAM,EAAE,SAAS,MAAM,GAAG,QAAQ,IAAI;AAEtC,SAAO;AAAA,IACL,GAAG;AAAA,IACH,SAAS,OAAO,YAAY,WAAW,UAAU,OAAO,OAAO;AAAA,IAC/D,GAAI,OAAO,SAAS,WAAW,EAAE,KAAK,IAAI,CAAC;AAAA,EAC7C;AACF;AAEA,SAAS,yBACP,kBACA,iBACA;AACA,QAAM,YAAY;AAAA,IAChB,GAAI,oBAAoB,CAAC;AAAA,IACzB,GAAI,kBAAkB,gBAAgB,eAAe,IAAI,CAAC;AAAA,EAC5D;AAEA,SAAO,OAAO,KAAK,SAAS,EAAE,SAAS,IAAI,YAAY;AACzD;AAEA,SAAS,sBACP,QACkC;AAClC,UAAQ,UAAU,CAAC,GAAG,IAAI,CAAC,UAAU;AACnC,UAAM,aAAa,YAAY,KAAK;AAEpC,QACE,cACA,OAAO,eAAe,YACtB,CAAC,MAAM,QAAQ,UAAU,KACzB,OAAO,KAAK,UAAU,EAAE,SAAS,GACjC;AACA,aAAO;AAAA,IACT;AAEA,WAAO,eAAe,KAAK;AAAA,EAC7B,CAAC;AACH;AAcO,SAAS,UAAU,SAA8C;AACtE,SAAO,QAAQ,SAAS,QAAQ,CAAC,YAAY,QAAQ,aAAa,CAAC,CAAC;AACtE;AAeO,SAAS,eACd,SACA,MACqB;AACrB,SAAO,QAAQ,SAAS,OAAO,CAAC,YAAY,QAAQ,SAAS,IAAI;AACnE;AAYO,SAAS,eAAe,SAA4B;AACzD,SAAO,eAAe,SAAS,QAAQ;AACzC;AAYO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAYO,SAAS,kBAAkB,SAA4B;AAC5D,SAAO,eAAe,SAAS,WAAW;AAC5C;AAYO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAiBO,SAAS,wBACd,OACA,KACiB;AACjB,QAAM,YACJ,iBAAiB,QACb,QACA,IAAI,MAAM,OAAO,SAAS,eAAe,CAAC;AAChD,SAAO,OAAO,OAAO,WAAW;AAAA,IAC9B,gBAAgB;AAAA,EAClB,CAAC;AACH;AAgBO,SAAS,uBAAuB,OAAwC;AAC7E,MACE,SACA,OAAO,UAAU,YACjB,oBAAoB,SACpB,aAAc,MAAuC,cAAc,GACnE;AACA,WAAQ,MAAyC;AAAA,EACnD;AAEA,SAAO;AACT;AAGO,SAAS,aAAa,OAAqC;AAChE,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY;AAMlB,SACE,oBAAoB,UAAU,OAAO,KACrC,QAAQ,UAAU,KAAK,KACvB,OAAO,UAAU,UAAU,YAC3B,CAAC,MAAM,QAAQ,UAAU,KAAK,KAC9B,MAAM,QAAQ,UAAU,MAAM;AAElC;AAGO,SAAS,oBACd,OAC4B;AAC5B,SACE,QAAQ,KAAK,KACb,OAAO,UAAU,YACjB,UAAU,QACV,cAAc,SACd,MAAM,QAAS,MAAiC,QAAQ;AAE5D;AAGO,SAAS,wBACd,QACkC;AAClC,MACE,UACA,OAAO,WAAW,YAClB,MAAM,QAAS,OAAmC,MAAM,GACxD;AACA,WAAQ,OAAwD;AAAA,EAClE;AAEA,SAAO,CAAC;AACV;AAGO,SAAS,eAAe,OAA2C;AACxE,MAAI,iBAAiB,OAAO;AAC1B,WAAO;AAAA,MACL,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM;AAAA,IACN,SAAS,OAAO,KAAK;AAAA,EACvB;AACF;","names":["toolCalls"]}
|
|
1
|
+
{"version":3,"sources":["../src/harness.ts"],"sourcesContent":["/** Primitive scalar values allowed in normalized JSON-safe eval data. */\nexport type JsonPrimitive = string | number | boolean | null;\n\n/** JSON-safe value shape used by normalized sessions, artifacts, and errors. */\nexport type JsonValue =\n | JsonPrimitive\n | JsonValue[]\n | { [key: string]: JsonValue };\n\n/**\n * Normalized record for one tool call observed during a harness run.\n *\n * @example\n * ```ts\n * const call: ToolCallRecord = {\n * name: \"lookupInvoice\",\n * arguments: { invoiceId: \"inv_123\" },\n * result: { refundable: true },\n * };\n * ```\n */\nexport type ToolCallRecord = {\n /** Provider or runtime tool-call id when one is available. */\n id?: string;\n /** Tool name as exposed to the agent or application runtime. */\n name: string;\n /** JSON-safe tool arguments after provider/runtime normalization. */\n arguments?: Record<string, JsonValue>;\n /** JSON-safe tool result returned by the application tool. */\n result?: JsonValue;\n /** Normalized tool error when execution failed. */\n error?: {\n message: string;\n type?: string;\n [key: string]: JsonValue | undefined;\n };\n /** ISO timestamp for the start of tool execution. */\n startedAt?: string;\n /** ISO timestamp for the end of tool execution. */\n finishedAt?: string;\n /** Tool execution duration in milliseconds. */\n durationMs?: number;\n /** Extra JSON-safe tool metadata for reporters and custom judges. */\n metadata?: Record<string, JsonValue>;\n};\n\n/**\n * Normalized message recorded in a harness session transcript.\n *\n * @example\n * ```ts\n * const message: NormalizedMessage = {\n * role: \"assistant\",\n * content: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\" }],\n * };\n * ```\n */\nexport type NormalizedMessage = {\n /** Transcript role for the normalized message. */\n role: \"system\" | \"user\" | \"assistant\" | \"tool\";\n /** JSON-safe message content. */\n content?: JsonValue;\n /** Tool calls associated with this message. */\n toolCalls?: ToolCallRecord[];\n /** Extra JSON-safe message metadata. */\n metadata?: Record<string, JsonValue>;\n};\n\n/**\n * Provider usage summary attached to a normalized harness run.\n *\n * @example\n * ```ts\n * const usage: UsageSummary = {\n * provider: \"openai\",\n * model: \"gpt-4o-mini\",\n * inputTokens: 212,\n * outputTokens: 48,\n * totalTokens: 260,\n * };\n * ```\n */\nexport type UsageSummary = {\n /** Provider that served the application run. */\n provider?: string;\n /** Model used for the application run. */\n model?: string;\n /** Input, prompt, or request tokens consumed by the run. */\n inputTokens?: number;\n /** Output or completion tokens produced by the run. */\n outputTokens?: number;\n /** Reasoning tokens reported by providers that expose them. */\n reasoningTokens?: number;\n /** Total token count reported by the provider or adapter. */\n totalTokens?: number;\n /** Count of tool calls observed during the run. */\n toolCalls?: number;\n /** Retry count observed during the run. */\n retries?: number;\n /** Provider-specific JSON-safe usage details. Cost estimates belong here. */\n metadata?: Record<string, JsonValue>;\n};\n\n/** Timing summary attached to a normalized harness run. */\nexport type TimingSummary = {\n /** End-to-end run duration in milliseconds. */\n totalMs?: number;\n /** Extra JSON-safe timing metadata. */\n metadata?: Record<string, JsonValue>;\n};\n\n/**\n * JSON-serializable transcript produced by the system under test.\n *\n * @example\n * ```ts\n * const session: NormalizedSession = {\n * provider: \"openai\",\n * model: \"gpt-4o-mini\",\n * messages: [\n * { role: \"user\", content: \"Refund invoice inv_123\" },\n * { role: \"assistant\", content: { status: \"approved\" } },\n * ],\n * };\n * ```\n */\nexport type NormalizedSession = {\n /** Ordered normalized transcript messages. */\n messages: NormalizedMessage[];\n /** Provider that produced the session when known. */\n provider?: string;\n /** Model that produced the session when known. */\n model?: string;\n /** Extra JSON-safe session metadata. */\n metadata?: Record<string, JsonValue>;\n};\n\ntype OutputField<TOutput extends JsonValue | undefined> =\n undefined extends TOutput ? { output?: TOutput } : { output: TOutput };\n\n/**\n * Normalized result returned by every harness execution.\n *\n * @example\n * ```ts\n * const run: HarnessRun<{ status: \"approved\" }> = {\n * output: { status: \"approved\" },\n * session: {\n * messages: [\n * { role: \"user\", content: \"Refund invoice inv_123\" },\n * { role: \"assistant\", content: { status: \"approved\" } },\n * ],\n * },\n * usage: { totalTokens: 260 },\n * errors: [],\n * };\n * ```\n */\nexport type HarnessRun<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> & {\n /** Normalized transcript and provider/session metadata. */\n session: NormalizedSession;\n /** Stable provider usage units such as tokens, tools, and retries. */\n usage: UsageSummary;\n /** Optional timing summary for the run. */\n timings?: TimingSummary;\n /** JSON-safe run artifacts captured by the harness or test context. */\n artifacts?: Record<string, JsonValue>;\n /** Normalized errors captured during execution. */\n errors: Array<Record<string, JsonValue>>;\n};\n\n/** Error value with an attached partial or complete normalized harness run. */\nexport type HarnessRunError = Error & {\n /** Attached normalized harness run recovered by `getHarnessRunFromError(...)`. */\n vitestEvalsRun: HarnessRun;\n};\n\n/** Per-run metadata shape accepted by harnesses and eval tests. */\nexport type HarnessMetadata = Record<string, unknown>;\n\n/**\n * Runtime context passed from the eval fixture into a harness run.\n *\n * @example\n * ```ts\n * const harness: Harness<string> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * context.setArtifact(\"inputLength\", input.length);\n *\n * return {\n * output: undefined,\n * session: { messages: [{ role: \"user\", content: input }] },\n * usage: {},\n * errors: [],\n * };\n * },\n * };\n * ```\n */\nexport type HarnessContext<\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Per-run metadata passed through `run(input, { metadata })`. */\n metadata: Readonly<TMetadata>;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable JSON-safe artifact bag shared with the harness. */\n artifacts: Record<string, JsonValue>;\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\n/**\n * Adapter that executes the system under test and returns a normalized run.\n *\n * @example\n * ```ts\n * const harness: Harness<string, { status: \"approved\" | \"denied\" }> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * return normalizeHarnessRun(input, await runRefundFlow(input), context);\n * },\n * };\n * ```\n */\nexport type Harness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes the system under test and returns a normalized run. */\n run: (\n input: TInput,\n context: HarnessContext<TMetadata>,\n ) => Promise<HarnessRun<TOutput>>;\n};\n\n/** Value or promise accepted by lightweight harness callbacks. */\nexport type MaybePromise<T> = T | Promise<T>;\n\n/** Lightweight tool-call record accepted by `createHarness(...)` results. */\nexport type SimpleToolCallRecord = Omit<\n ToolCallRecord,\n \"arguments\" | \"result\" | \"error\" | \"metadata\"\n> & {\n /** Raw tool arguments accepted by `createHarness(...)` before normalization. */\n arguments?: unknown;\n /** Raw tool result accepted by `createHarness(...)` before normalization. */\n result?: unknown;\n /** Raw tool error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw tool metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n};\n\n/**\n * Lightweight result shape normalized by `createHarness(...)`.\n *\n * @example\n * ```ts\n * const result: SimpleHarnessResult<{ status: \"approved\" }> = {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { totalTokens: 260 },\n * };\n * ```\n */\nexport type SimpleHarnessResult<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> & {\n /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */\n messages?: NormalizedMessage[];\n /** Lightweight tool-call records to normalize into the session. */\n toolCalls?: SimpleToolCallRecord[];\n /** Usage summary to attach to the run. */\n usage?: UsageSummary;\n /** Timing summary to attach to the run. */\n timings?: TimingSummary;\n /** Raw artifact values to normalize and merge into the run. */\n artifacts?: Record<string, unknown>;\n /** Raw session metadata to normalize into the session. */\n metadata?: Record<string, unknown>;\n /** Raw errors to normalize into the run. */\n errors?: unknown[];\n};\n\n/** Either a complete normalized run or a lightweight result to normalize. */\nexport type HarnessResultLike<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;\n\n/** Arguments passed to the `createHarness(...)` convenience callback. */\nexport type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {\n /** Original input passed to `run(input)`. */\n input: TInput;\n /** Read-only metadata passed to `run(input, { metadata })`. */\n metadata: Readonly<TMetadata>;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable run artifact bag. */\n artifacts: HarnessContext<TMetadata>[\"artifacts\"];\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: HarnessContext<TMetadata>[\"setArtifact\"];\n};\n\n/**\n * Options for creating a lightweight custom application harness.\n *\n * @example\n * ```ts\n * const options: CreateHarnessOptions<string, { status: \"approved\" }> = {\n * name: \"refund-agent\",\n * run: async ({ input }) => ({\n * output: await classifyRefund(input),\n * }),\n * };\n * ```\n */\nexport type CreateHarnessOptions<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes application code and returns either a lightweight result or full `HarnessRun`. */\n run: (\n args: CreateHarnessRunArgs<TInput, TMetadata>,\n ) => MaybePromise<HarnessResultLike<TOutput>>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"number\" ||\n typeof value === \"boolean\"\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[]): JsonValue[] {\n return value.map((item) => {\n const normalized = toJsonValue(item);\n return normalized === undefined ? null : normalized;\n });\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValue(entryValue);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value);\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n const normalized = toJsonValue(value);\n return normalized !== undefined ? normalized : String(value);\n}\n\n/**\n * Creates a harness from the common \"run app code and return output\" shape.\n *\n * @param options - Harness name plus the callback that executes app code.\n *\n * @example\n * ```ts\n * import { createHarness } from \"vitest-evals\";\n *\n * export const refundHarness = createHarness<\n * string,\n * { status: \"approved\" | \"denied\" },\n * { expected: { status: \"approved\" | \"denied\" } }\n * >({\n * name: \"refund-agent\",\n * run: async ({ input, metadata, setArtifact }) => {\n * const result = await runRefundFlow(input, metadata);\n * const output = { status: result.status };\n *\n * setArtifact(\"case\", { expected: metadata.expected.status });\n *\n * return {\n * output,\n * toolCalls: result.toolCalls,\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * };\n * },\n * });\n * ```\n */\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n>(\n options: CreateHarnessOptions<TInput, TOutput, TMetadata>,\n): Harness<TInput, TOutput, TMetadata>;\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n>(\n options: CreateHarnessOptions<TInput, TOutput, TMetadata>,\n): Harness<TInput, TOutput, TMetadata> {\n const harness: Harness<TInput, TOutput, TMetadata> = {\n name: options.name,\n run: async (input, context) => {\n const result = await options.run({\n input,\n metadata: context.metadata,\n signal: context.signal,\n artifacts: context.artifacts,\n setArtifact: context.setArtifact,\n });\n\n return normalizeHarnessRun(input, result, context);\n },\n };\n\n return harness;\n}\n\n/**\n * Normalizes a lightweight harness result into the reporter-facing run shape.\n *\n * @param input - Original input passed to the harness.\n * @param result - Lightweight result or pre-normalized harness run.\n * @param context - Optional per-run context used to merge artifacts.\n *\n * @example\n * ```ts\n * const run = normalizeHarnessRun(\"Refund invoice inv_123\", {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * });\n *\n * expect(toolCalls(run.session)).toHaveLength(1);\n * ```\n */\nexport function normalizeHarnessRun<\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(\n input: TInput,\n result: HarnessResultLike<TOutput>,\n context?: HarnessContext<TMetadata>,\n): HarnessRun<TOutput> {\n if (isHarnessRun(result)) {\n if (\n context &&\n Object.keys(context.artifacts).length > 0 &&\n !result.artifacts\n ) {\n return {\n ...result,\n artifacts: context.artifacts,\n };\n }\n\n return result;\n }\n\n const output = result.output;\n const toolCalls = normalizeSimpleToolCalls(result.toolCalls);\n const usage = result.usage ?? {};\n const messages =\n result.messages ??\n createDefaultSessionMessages({\n input,\n output,\n toolCalls,\n });\n const metadata = result.metadata\n ? normalizeMetadata(result.metadata)\n : undefined;\n const artifacts = normalizeMergedArtifacts(\n context?.artifacts,\n result.artifacts,\n );\n\n return {\n session: {\n messages,\n ...(usage.provider ? { provider: usage.provider } : {}),\n ...(usage.model ? { model: usage.model } : {}),\n ...(metadata ? { metadata } : {}),\n },\n ...(output !== undefined ? { output } : {}),\n usage,\n ...(result.timings ? { timings: result.timings } : {}),\n ...(artifacts ? { artifacts } : {}),\n errors: normalizeSimpleErrors(result.errors),\n } as HarnessRun<TOutput>;\n}\n\nfunction createDefaultSessionMessages<TInput>({\n input,\n output,\n toolCalls: normalizedToolCalls,\n}: {\n input: TInput;\n output: JsonValue | undefined;\n toolCalls: ToolCallRecord[];\n}): NormalizedMessage[] {\n const messages: NormalizedMessage[] = [\n {\n role: \"user\",\n content: normalizeContent(input),\n },\n ];\n\n if (output !== undefined || normalizedToolCalls.length > 0) {\n messages.push({\n role: \"assistant\",\n ...(output !== undefined ? { content: normalizeContent(output) } : {}),\n ...(normalizedToolCalls.length > 0\n ? { toolCalls: normalizedToolCalls }\n : {}),\n });\n }\n\n return messages;\n}\n\nfunction normalizeSimpleToolCalls(\n calls: SimpleToolCallRecord[] | undefined,\n): ToolCallRecord[] {\n return (calls ?? []).map((call) => {\n const {\n arguments: rawArguments,\n result: rawResult,\n error: rawError,\n metadata: rawMetadata,\n ...toolCall\n } = call;\n const args = normalizeToolCallArguments(rawArguments);\n const result = toJsonValue(rawResult);\n const error = normalizeToolCallError(rawError);\n const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : undefined;\n\n return {\n ...toolCall,\n ...(args ? { arguments: args } : {}),\n ...(result !== undefined ? { result } : {}),\n ...(error ? { error } : {}),\n ...(metadata ? { metadata } : {}),\n };\n });\n}\n\nfunction normalizeToolCallArguments(\n value: unknown,\n): Record<string, JsonValue> | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const normalized = toJsonValue(value);\n return normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized)\n ? normalized\n : undefined;\n}\n\nfunction normalizeToolCallError(\n value: unknown,\n): ToolCallRecord[\"error\"] | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const serialized = serializeError(value);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\nfunction normalizeMergedArtifacts(\n contextArtifacts: Record<string, JsonValue> | undefined,\n resultArtifacts: Record<string, unknown> | undefined,\n) {\n const artifacts = {\n ...(contextArtifacts ?? {}),\n ...(resultArtifacts ? normalizeRecord(resultArtifacts) : {}),\n };\n\n return Object.keys(artifacts).length > 0 ? artifacts : undefined;\n}\n\nfunction normalizeSimpleErrors(\n errors: unknown[] | undefined,\n): Array<Record<string, JsonValue>> {\n return (errors ?? []).map((error) => {\n const normalized = toJsonValue(error);\n\n if (\n normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized) &&\n Object.keys(normalized).length > 0\n ) {\n return normalized;\n }\n\n return serializeError(error);\n });\n}\n\n/**\n * Flattens every recorded tool call from a normalized session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const names = toolCalls(result.session).map((call) => call.name);\n *\n * expect(names).toEqual([\"lookupInvoice\", \"createRefund\"]);\n * ```\n */\nexport function toolCalls(session: NormalizedSession): ToolCallRecord[] {\n return session.messages.flatMap((message) => message.toolCalls ?? []);\n}\n\n/**\n * Filters normalized session messages by role.\n *\n * @param session - Normalized session produced by a harness run.\n * @param role - Message role to keep.\n *\n * @example\n * ```ts\n * const assistantText = messagesByRole(result.session, \"assistant\")\n * .map((message) => message.content)\n * .join(\"\\n\");\n * ```\n */\nexport function messagesByRole(\n session: NormalizedSession,\n role: NormalizedMessage[\"role\"],\n): NormalizedMessage[] {\n return session.messages.filter((message) => message.role === role);\n}\n\nfunction hasNonEmptyMessageContent(message: NormalizedMessage) {\n return (\n message.content !== undefined &&\n (typeof message.content !== \"string\" || message.content.trim().length > 0)\n );\n}\n\n/**\n * Returns every normalized system message from a session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const systemPrompts = systemMessages(result.session);\n * ```\n */\nexport function systemMessages(session: NormalizedSession) {\n return messagesByRole(session, \"system\");\n}\n\n/**\n * Returns every normalized user message from a session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const firstPrompt = userMessages(result.session)[0]?.content;\n * ```\n */\nexport function userMessages(session: NormalizedSession) {\n return messagesByRole(session, \"user\");\n}\n\n/**\n * Returns every normalized assistant message from a session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const finalAnswer = assistantMessages(result.session).at(-1)?.content;\n * ```\n */\nexport function assistantMessages(session: NormalizedSession) {\n return messagesByRole(session, \"assistant\");\n}\n\n/**\n * Returns the latest assistant message content, ignoring empty text messages.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const finalAnswer = latestAssistantMessageContent(result.session);\n * ```\n */\nexport function latestAssistantMessageContent(session: NormalizedSession) {\n return [...assistantMessages(session)]\n .reverse()\n .find(hasNonEmptyMessageContent)?.content;\n}\n\n/**\n * Returns every normalized tool message from a session.\n *\n * @param session - Normalized session produced by a harness run.\n *\n * @example\n * ```ts\n * const toolOutputs = toolMessages(result.session).map((message) => message.content);\n * ```\n */\nexport function toolMessages(session: NormalizedSession) {\n return messagesByRole(session, \"tool\");\n}\n\n/**\n * Attaches a partial or complete harness run to an arbitrary thrown error.\n *\n * @param error - Thrown value to wrap.\n * @param run - Partial or complete normalized harness run to preserve.\n *\n * @example\n * ```ts\n * try {\n * return await runAgent(input);\n * } catch (error) {\n * throw attachHarnessRunToError(error, partialRun);\n * }\n * ```\n */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/**\n * Reads an attached harness run back off a previously wrapped error value.\n *\n * @param error - Unknown thrown value that may contain a harness run.\n *\n * @example\n * ```ts\n * const partialRun = getHarnessRunFromError(error);\n *\n * if (partialRun) {\n * console.log(toolCalls(partialRun.session));\n * }\n * ```\n */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiVA,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,YACjB,OAAO,UAAU;AAErB;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAA+B;AACzD,SAAO,MAAM,IAAI,CAAC,SAAS;AACzB,UAAM,aAAa,YAAY,IAAI;AACnC,WAAO,eAAe,SAAY,OAAO;AAAA,EAC3C,CAAC;AACH;AAEA,SAAS,oBACP,OAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,aAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,UAAM,QAAQ,YAAY,UAAU;AACpC,QAAI,UAAU,QAAW;AACvB,iBAAW,GAAG,IAAI;AAAA,IACpB;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,kBAAkB,OAAgB,YAAoB;AACpE,SACE,UAAU,SACT,OAAO,UAAU,YAAY,OAAO,UAAU,eAC/C,cAAc,SACd,OAAQ,MAAkC,UAAU,MAAM;AAE9D;AAGO,SAAS,YAAY,OAAuC;AACjE,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,KAAK;AAAA,EACjC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,KAAK;AAAA,EAClC;AAEA,SAAO;AACT;AAGO,SAAS,gBACd,OAC2B;AAC3B,SAAO,oBAAoB,KAAK;AAClC;AAGO,SAAS,kBACd,OACuC;AACvC,QAAM,aAAa,gBAAgB,KAAK;AACxC,SAAO,OAAO,KAAK,UAAU,EAAE,SAAS,IAAI,aAAa;AAC3D;AAGO,SAAS,iBAAiB,OAA2B;AAC1D,QAAM,aAAa,YAAY,KAAK;AACpC,SAAO,eAAe,SAAY,aAAa,OAAO,KAAK;AAC7D;AAuCO,SAAS,cAKd,SACqC;AACrC,QAAM,UAA+C;AAAA,IACnD,MAAM,QAAQ;AAAA,IACd,KAAK,OAAO,OAAO,YAAY;AAC7B,YAAM,SAAS,MAAM,QAAQ,IAAI;AAAA,QAC/B;AAAA,QACA,UAAU,QAAQ;AAAA,QAClB,QAAQ,QAAQ;AAAA,QAChB,WAAW,QAAQ;AAAA,QACnB,aAAa,QAAQ;AAAA,MACvB,CAAC;AAED,aAAO,oBAAoB,OAAO,QAAQ,OAAO;AAAA,IACnD;AAAA,EACF;AAEA,SAAO;AACT;AAoBO,SAAS,oBAKd,OACA,QACA,SACqB;AACrB,MAAI,aAAa,MAAM,GAAG;AACxB,QACE,WACA,OAAO,KAAK,QAAQ,SAAS,EAAE,SAAS,KACxC,CAAC,OAAO,WACR;AACA,aAAO;AAAA,QACL,GAAG;AAAA,QACH,WAAW,QAAQ;AAAA,MACrB;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AAEA,QAAM,SAAS,OAAO;AACtB,QAAMA,aAAY,yBAAyB,OAAO,SAAS;AAC3D,QAAM,QAAQ,OAAO,SAAS,CAAC;AAC/B,QAAM,WACJ,OAAO,YACP,6BAA6B;AAAA,IAC3B;AAAA,IACA;AAAA,IACA,WAAAA;AAAA,EACF,CAAC;AACH,QAAM,WAAW,OAAO,WACpB,kBAAkB,OAAO,QAAQ,IACjC;AACJ,QAAM,YAAY;AAAA,IAChB,SAAS;AAAA,IACT,OAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,SAAS;AAAA,MACP;AAAA,MACA,GAAI,MAAM,WAAW,EAAE,UAAU,MAAM,SAAS,IAAI,CAAC;AAAA,MACrD,GAAI,MAAM,QAAQ,EAAE,OAAO,MAAM,MAAM,IAAI,CAAC;AAAA,MAC5C,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IACjC;AAAA,IACA,GAAI,WAAW,SAAY,EAAE,OAAO,IAAI,CAAC;AAAA,IACzC;AAAA,IACA,GAAI,OAAO,UAAU,EAAE,SAAS,OAAO,QAAQ,IAAI,CAAC;AAAA,IACpD,GAAI,YAAY,EAAE,UAAU,IAAI,CAAC;AAAA,IACjC,QAAQ,sBAAsB,OAAO,MAAM;AAAA,EAC7C;AACF;AAEA,SAAS,6BAAqC;AAAA,EAC5C;AAAA,EACA;AAAA,EACA,WAAW;AACb,GAIwB;AACtB,QAAM,WAAgC;AAAA,IACpC;AAAA,MACE,MAAM;AAAA,MACN,SAAS,iBAAiB,KAAK;AAAA,IACjC;AAAA,EACF;AAEA,MAAI,WAAW,UAAa,oBAAoB,SAAS,GAAG;AAC1D,aAAS,KAAK;AAAA,MACZ,MAAM;AAAA,MACN,GAAI,WAAW,SAAY,EAAE,SAAS,iBAAiB,MAAM,EAAE,IAAI,CAAC;AAAA,MACpE,GAAI,oBAAoB,SAAS,IAC7B,EAAE,WAAW,oBAAoB,IACjC,CAAC;AAAA,IACP,CAAC;AAAA,EACH;AAEA,SAAO;AACT;AAEA,SAAS,yBACP,OACkB;AAClB,UAAQ,SAAS,CAAC,GAAG,IAAI,CAAC,SAAS;AACjC,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,QAAQ;AAAA,MACR,OAAO;AAAA,MACP,UAAU;AAAA,MACV,GAAG;AAAA,IACL,IAAI;AACJ,UAAM,OAAO,2BAA2B,YAAY;AACpD,UAAM,SAAS,YAAY,SAAS;AACpC,UAAM,QAAQ,uBAAuB,QAAQ;AAC7C,UAAM,WAAW,cAAc,kBAAkB,WAAW,IAAI;AAEhE,WAAO;AAAA,MACL,GAAG;AAAA,MACH,GAAI,OAAO,EAAE,WAAW,KAAK,IAAI,CAAC;AAAA,MAClC,GAAI,WAAW,SAAY,EAAE,OAAO,IAAI,CAAC;AAAA,MACzC,GAAI,QAAQ,EAAE,MAAM,IAAI,CAAC;AAAA,MACzB,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IACjC;AAAA,EACF,CAAC;AACH;AAEA,SAAS,2BACP,OACuC;AACvC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,YAAY,KAAK;AACpC,SAAO,cACL,OAAO,eAAe,YACtB,CAAC,MAAM,QAAQ,UAAU,IACvB,aACA;AACN;AAEA,SAAS,uBACP,OACqC;AACrC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,eAAe,KAAK;AACvC,QAAM,EAAE,SAAS,MAAM,GAAG,QAAQ,IAAI;AAEtC,SAAO;AAAA,IACL,GAAG;AAAA,IACH,SAAS,OAAO,YAAY,WAAW,UAAU,OAAO,OAAO;AAAA,IAC/D,GAAI,OAAO,SAAS,WAAW,EAAE,KAAK,IAAI,CAAC;AAAA,EAC7C;AACF;AAEA,SAAS,yBACP,kBACA,iBACA;AACA,QAAM,YAAY;AAAA,IAChB,GAAI,oBAAoB,CAAC;AAAA,IACzB,GAAI,kBAAkB,gBAAgB,eAAe,IAAI,CAAC;AAAA,EAC5D;AAEA,SAAO,OAAO,KAAK,SAAS,EAAE,SAAS,IAAI,YAAY;AACzD;AAEA,SAAS,sBACP,QACkC;AAClC,UAAQ,UAAU,CAAC,GAAG,IAAI,CAAC,UAAU;AACnC,UAAM,aAAa,YAAY,KAAK;AAEpC,QACE,cACA,OAAO,eAAe,YACtB,CAAC,MAAM,QAAQ,UAAU,KACzB,OAAO,KAAK,UAAU,EAAE,SAAS,GACjC;AACA,aAAO;AAAA,IACT;AAEA,WAAO,eAAe,KAAK;AAAA,EAC7B,CAAC;AACH;AAcO,SAAS,UAAU,SAA8C;AACtE,SAAO,QAAQ,SAAS,QAAQ,CAAC,YAAY,QAAQ,aAAa,CAAC,CAAC;AACtE;AAeO,SAAS,eACd,SACA,MACqB;AACrB,SAAO,QAAQ,SAAS,OAAO,CAAC,YAAY,QAAQ,SAAS,IAAI;AACnE;AAEA,SAAS,0BAA0B,SAA4B;AAC7D,SACE,QAAQ,YAAY,WACnB,OAAO,QAAQ,YAAY,YAAY,QAAQ,QAAQ,KAAK,EAAE,SAAS;AAE5E;AAYO,SAAS,eAAe,SAA4B;AACzD,SAAO,eAAe,SAAS,QAAQ;AACzC;AAYO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAYO,SAAS,kBAAkB,SAA4B;AAC5D,SAAO,eAAe,SAAS,WAAW;AAC5C;AAYO,SAAS,8BAA8B,SAA4B;AACxE,SAAO,CAAC,GAAG,kBAAkB,OAAO,CAAC,EAClC,QAAQ,EACR,KAAK,yBAAyB,GAAG;AACtC;AAYO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAiBO,SAAS,wBACd,OACA,KACiB;AACjB,QAAM,YACJ,iBAAiB,QACb,QACA,IAAI,MAAM,OAAO,SAAS,eAAe,CAAC;AAChD,SAAO,OAAO,OAAO,WAAW;AAAA,IAC9B,gBAAgB;AAAA,EAClB,CAAC;AACH;AAgBO,SAAS,uBAAuB,OAAwC;AAC7E,MACE,SACA,OAAO,UAAU,YACjB,oBAAoB,SACpB,aAAc,MAAuC,cAAc,GACnE;AACA,WAAQ,MAAyC;AAAA,EACnD;AAEA,SAAO;AACT;AAGO,SAAS,aAAa,OAAqC;AAChE,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY;AAMlB,SACE,oBAAoB,UAAU,OAAO,KACrC,QAAQ,UAAU,KAAK,KACvB,OAAO,UAAU,UAAU,YAC3B,CAAC,MAAM,QAAQ,UAAU,KAAK,KAC9B,MAAM,QAAQ,UAAU,MAAM;AAElC;AAGO,SAAS,oBACd,OAC4B;AAC5B,SACE,QAAQ,KAAK,KACb,OAAO,UAAU,YACjB,UAAU,QACV,cAAc,SACd,MAAM,QAAS,MAAiC,QAAQ;AAE5D;AAGO,SAAS,wBACd,QACkC;AAClC,MACE,UACA,OAAO,WAAW,YAClB,MAAM,QAAS,OAAmC,MAAM,GACxD;AACA,WAAQ,OAAwD;AAAA,EAClE;AAEA,SAAO,CAAC;AACV;AAGO,SAAS,eAAe,OAA2C;AACxE,MAAI,iBAAiB,OAAO;AAC1B,WAAO;AAAA,MACL,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM;AAAA,IACN,SAAS,OAAO,KAAK;AAAA,EACvB;AACF;","names":["toolCalls"]}
|
package/dist/harness.mjs
CHANGED
|
@@ -183,6 +183,9 @@ function toolCalls(session) {
|
|
|
183
183
|
function messagesByRole(session, role) {
|
|
184
184
|
return session.messages.filter((message) => message.role === role);
|
|
185
185
|
}
|
|
186
|
+
function hasNonEmptyMessageContent(message) {
|
|
187
|
+
return message.content !== void 0 && (typeof message.content !== "string" || message.content.trim().length > 0);
|
|
188
|
+
}
|
|
186
189
|
function systemMessages(session) {
|
|
187
190
|
return messagesByRole(session, "system");
|
|
188
191
|
}
|
|
@@ -192,6 +195,9 @@ function userMessages(session) {
|
|
|
192
195
|
function assistantMessages(session) {
|
|
193
196
|
return messagesByRole(session, "assistant");
|
|
194
197
|
}
|
|
198
|
+
function latestAssistantMessageContent(session) {
|
|
199
|
+
return [...assistantMessages(session)].reverse().find(hasNonEmptyMessageContent)?.content;
|
|
200
|
+
}
|
|
195
201
|
function toolMessages(session) {
|
|
196
202
|
return messagesByRole(session, "tool");
|
|
197
203
|
}
|
|
@@ -243,6 +249,7 @@ export {
|
|
|
243
249
|
hasCallableMethod,
|
|
244
250
|
isHarnessRun,
|
|
245
251
|
isNormalizedSession,
|
|
252
|
+
latestAssistantMessageContent,
|
|
246
253
|
messagesByRole,
|
|
247
254
|
normalizeContent,
|
|
248
255
|
normalizeHarnessRun,
|