vitest-evals 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -67
- package/dist/harness.d.mts +263 -1
- package/dist/harness.d.ts +263 -1
- package/dist/harness.js +306 -21
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +296 -21
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +48 -20
- package/dist/index.d.ts +48 -20
- package/dist/index.js +639 -42
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +625 -42
- package/dist/index.mjs.map +1 -1
- package/dist/internal/scoring.d.mts +3 -3
- package/dist/internal/scoring.d.ts +3 -3
- package/dist/internal/scoring.js.map +1 -1
- package/dist/internal/toolCallScorer.js +62 -2
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs +62 -2
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.d.mts +151 -0
- package/dist/judges/factualityJudge.d.ts +151 -0
- package/dist/judges/factualityJudge.js +235 -0
- package/dist/judges/factualityJudge.js.map +1 -0
- package/dist/judges/factualityJudge.mjs +208 -0
- package/dist/judges/factualityJudge.mjs.map +1 -0
- package/dist/judges/index.d.mts +3 -1
- package/dist/judges/index.d.ts +3 -1
- package/dist/judges/index.js +715 -7
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +711 -6
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.d.mts +122 -0
- package/dist/judges/judgeHarness.d.ts +122 -0
- package/dist/judges/judgeHarness.js +571 -0
- package/dist/judges/judgeHarness.js.map +1 -0
- package/dist/judges/judgeHarness.mjs +542 -0
- package/dist/judges/judgeHarness.mjs.map +1 -0
- package/dist/judges/structuredOutputJudge.d.mts +1 -0
- package/dist/judges/structuredOutputJudge.d.ts +1 -0
- package/dist/judges/toolCallJudge.d.mts +1 -0
- package/dist/judges/toolCallJudge.d.ts +1 -0
- package/dist/judges/toolCallJudge.js +62 -2
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +62 -2
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +33 -6
- package/dist/judges/types.d.ts +33 -6
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/scorers/index.js +62 -2
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs +62 -2
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js +62 -2
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs +62 -2
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy.js +76 -3
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs +76 -3
- package/dist/legacy.mjs.map +1 -1
- package/dist/replay.js +1 -1
- package/dist/replay.js.map +1 -1
- package/dist/replay.mjs +1 -1
- package/dist/replay.mjs.map +1 -1
- package/dist/reporter.d.mts +5 -0
- package/dist/reporter.d.ts +5 -0
- package/dist/reporter.js +26 -2
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs +26 -2
- package/dist/reporter.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Harness-backed AI testing on top of Vitest.
|
|
4
4
|
|
|
5
|
+
Use this package README for the core authoring model. For a guided setup path,
|
|
6
|
+
runtime-specific harness examples, replay, and GitHub Actions reporting, start
|
|
7
|
+
with the docs site: `https://vitest-evals.sentry.dev/docs`.
|
|
8
|
+
|
|
5
9
|
## Install
|
|
6
10
|
|
|
7
11
|
```sh
|
|
@@ -29,8 +33,14 @@ workflow.
|
|
|
29
33
|
- `run(input, { metadata? })` executes the harness explicitly and returns a
|
|
30
34
|
normalized `HarnessRun`
|
|
31
35
|
- the returned `result.output` is the app-facing value you assert on directly
|
|
32
|
-
- the returned `result.session` is the canonical JSON-serializable
|
|
36
|
+
- the returned `result.session` is the canonical JSON-serializable transcript for
|
|
33
37
|
reporting, replay, tool assertions, and judges
|
|
38
|
+
- the returned `result.traces` contains JSON-serializable operation spans; the
|
|
39
|
+
first-party harnesses attach run, model, and tool spans automatically, while
|
|
40
|
+
`createHarness(...)` attaches fallback run and tool spans for custom harnesses
|
|
41
|
+
that do not return traces themselves. Span attributes include typed
|
|
42
|
+
OpenTelemetry GenAI semantic keys while still allowing provider-specific
|
|
43
|
+
metadata
|
|
34
44
|
- scenario-specific judge criteria can live in `input`; use `metadata` for
|
|
35
45
|
per-run expectations or harness configuration that are not part of the
|
|
36
46
|
scenario payload
|
|
@@ -40,55 +50,28 @@ workflow.
|
|
|
40
50
|
- every judge receives `JudgeContext` with typed `input`, typed `output`, the
|
|
41
51
|
normalized run/session, tool calls, and metadata; `output` is only optional
|
|
42
52
|
when the harness output type includes `undefined`
|
|
43
|
-
- judges own their prompt, rubric,
|
|
44
|
-
`
|
|
45
|
-
when multiple judges share setup
|
|
53
|
+
- judges own their prompt, rubric, and parsing; LLM-backed judges use
|
|
54
|
+
`ctx.runJudge(...)` from a configured `judgeHarness`
|
|
46
55
|
- explicit judge assertions use
|
|
47
56
|
`await expect(result).toSatisfyJudge(judge, context)`
|
|
48
57
|
|
|
49
58
|
## Explicit Run Example
|
|
50
59
|
|
|
51
60
|
```ts
|
|
61
|
+
import { getModel } from "@mariozechner/pi-ai";
|
|
52
62
|
import { expect } from "vitest";
|
|
53
|
-
import { piAiHarness } from "@vitest-evals/harness-pi-ai";
|
|
63
|
+
import { piAiHarness, piAiJudgeHarness } from "@vitest-evals/harness-pi-ai";
|
|
54
64
|
import {
|
|
55
|
-
createJudge,
|
|
56
65
|
describeEval,
|
|
66
|
+
FactualityJudge,
|
|
57
67
|
toolCalls,
|
|
58
|
-
type JudgeContext,
|
|
59
68
|
} from "vitest-evals";
|
|
60
69
|
import { createRefundAgent } from "../src/refundAgent";
|
|
61
70
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
};
|
|
66
|
-
|
|
67
|
-
type RefundOutput = {
|
|
68
|
-
status: "approved" | "denied";
|
|
69
|
-
};
|
|
70
|
-
|
|
71
|
-
const FactualityJudge = createJudge(
|
|
72
|
-
"FactualityJudge",
|
|
73
|
-
async ({
|
|
74
|
-
input,
|
|
75
|
-
output,
|
|
76
|
-
metadata,
|
|
77
|
-
}: JudgeContext<string, RefundOutput, RefundEvalMetadata>) => {
|
|
78
|
-
const verdict = await judgeFactuality({
|
|
79
|
-
question: input,
|
|
80
|
-
answer: output,
|
|
81
|
-
expectedStatus: metadata.expectedStatus,
|
|
82
|
-
});
|
|
83
|
-
|
|
84
|
-
return {
|
|
85
|
-
score: verdict.score,
|
|
86
|
-
metadata: {
|
|
87
|
-
rationale: verdict.rationale,
|
|
88
|
-
},
|
|
89
|
-
};
|
|
90
|
-
},
|
|
91
|
-
);
|
|
71
|
+
const judgeHarness = piAiJudgeHarness({
|
|
72
|
+
model: getModel("anthropic", "claude-sonnet-4-5"),
|
|
73
|
+
temperature: 0,
|
|
74
|
+
});
|
|
92
75
|
|
|
93
76
|
describeEval(
|
|
94
77
|
"refund agent",
|
|
@@ -96,12 +79,15 @@ describeEval(
|
|
|
96
79
|
harness: piAiHarness({
|
|
97
80
|
agent: () => createRefundAgent(),
|
|
98
81
|
}),
|
|
99
|
-
|
|
82
|
+
judgeHarness,
|
|
83
|
+
judges: [FactualityJudge()],
|
|
84
|
+
judgeThreshold: 0.6,
|
|
100
85
|
},
|
|
101
86
|
(it) => {
|
|
102
87
|
it("approves a refundable invoice", async ({ run }) => {
|
|
103
88
|
const result = await run("Refund invoice inv_123", {
|
|
104
89
|
metadata: {
|
|
90
|
+
expected: "The refund request is approved.",
|
|
105
91
|
expectedStatus: "approved",
|
|
106
92
|
expectedTools: ["lookupInvoice", "createRefund"],
|
|
107
93
|
},
|
|
@@ -147,6 +133,17 @@ describeEval("refund agent", { harness }, (it) => {
|
|
|
147
133
|
});
|
|
148
134
|
```
|
|
149
135
|
|
|
136
|
+
## Terminal Reporting
|
|
137
|
+
|
|
138
|
+
The terminal reporter has two eval report levels. Normal mode prints compact
|
|
139
|
+
test, score, usage, and tool-count summaries. Info mode adds per-tool summaries,
|
|
140
|
+
arguments, timing/size metadata, replay status, and final output summaries.
|
|
141
|
+
Set `VITEST_EVALS_REPORT_LEVEL=info`, or pass `--info` through the workspace
|
|
142
|
+
eval scripts, to enable it. `--verbose` and `-v` remain aliases for
|
|
143
|
+
compatibility.
|
|
144
|
+
|
|
145
|
+
Full transcripts and spans are preserved in the Vitest JSON report metadata.
|
|
146
|
+
|
|
150
147
|
## GitHub Actions Reporting
|
|
151
148
|
|
|
152
149
|
Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
|
|
@@ -210,6 +207,7 @@ When generics are needed, use `createHarness<Input, Output, Metadata>(...)`.
|
|
|
210
207
|
import {
|
|
211
208
|
createHarness,
|
|
212
209
|
createJudge,
|
|
210
|
+
createJudgeHarness,
|
|
213
211
|
describeEval,
|
|
214
212
|
type JudgeContext,
|
|
215
213
|
} from "vitest-evals";
|
|
@@ -255,14 +253,25 @@ const appHarness = createHarness<AppEvalInput, AppOutput, AppEvalMetadata>({
|
|
|
255
253
|
},
|
|
256
254
|
});
|
|
257
255
|
|
|
256
|
+
const judgeHarness = createJudgeHarness({
|
|
257
|
+
name: "app-rubric-judge-model",
|
|
258
|
+
run: async ({ prompt }, { signal }) =>
|
|
259
|
+
promptJudgeModel({ prompt, signal }),
|
|
260
|
+
});
|
|
261
|
+
|
|
258
262
|
const AppRubricJudge = createJudge(
|
|
259
263
|
"AppRubricJudge",
|
|
260
264
|
async (ctx: JudgeContext<AppEvalInput, AppOutput, AppEvalMetadata>) => {
|
|
261
|
-
|
|
265
|
+
if (!ctx.runJudge) {
|
|
266
|
+
throw new Error("AppRubricJudge requires a configured judgeHarness.");
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
const verdict = await ctx.runJudge({
|
|
262
270
|
prompt: formatRubricPrompt({
|
|
263
271
|
output: ctx.output,
|
|
264
272
|
criteria: ctx.input.criteria,
|
|
265
273
|
}),
|
|
274
|
+
responseFormat: { type: "json" },
|
|
266
275
|
});
|
|
267
276
|
|
|
268
277
|
return parseRubricVerdict(verdict);
|
|
@@ -273,6 +282,7 @@ describeEval(
|
|
|
273
282
|
"app behavior",
|
|
274
283
|
{
|
|
275
284
|
harness: appHarness,
|
|
285
|
+
judgeHarness,
|
|
276
286
|
judges: [AppRubricJudge],
|
|
277
287
|
judgeThreshold: 0.75,
|
|
278
288
|
},
|
|
@@ -328,11 +338,26 @@ In practice, this is usually most useful for factuality, rubric, or grounded
|
|
|
328
338
|
answer checks:
|
|
329
339
|
|
|
330
340
|
```ts
|
|
331
|
-
|
|
341
|
+
import { openai } from "@ai-sdk/openai";
|
|
342
|
+
import { aiSdkJudgeHarness } from "@vitest-evals/harness-ai-sdk";
|
|
343
|
+
import { expect } from "vitest";
|
|
344
|
+
import { FactualityJudge } from "vitest-evals";
|
|
345
|
+
|
|
346
|
+
const judgeHarness = aiSdkJudgeHarness({
|
|
347
|
+
model: openai("gpt-4.1-mini"),
|
|
348
|
+
temperature: 0,
|
|
349
|
+
});
|
|
350
|
+
const factualityJudge = FactualityJudge({ judgeHarness });
|
|
351
|
+
|
|
352
|
+
await expect(result).toSatisfyJudge(factualityJudge, {
|
|
353
|
+
expected: "Paris is the capital of France.",
|
|
354
|
+
threshold: 0.6,
|
|
355
|
+
});
|
|
332
356
|
```
|
|
333
357
|
|
|
334
358
|
For lower-level cases, the matcher also accepts raw values and synthetic judge
|
|
335
|
-
context
|
|
359
|
+
context. Pass every context field the judge needs when the value did not come
|
|
360
|
+
from eval fixture `run(...)`:
|
|
336
361
|
|
|
337
362
|
```ts
|
|
338
363
|
await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
|
|
@@ -340,35 +365,75 @@ await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
|
|
|
340
365
|
});
|
|
341
366
|
```
|
|
342
367
|
|
|
343
|
-
Use
|
|
344
|
-
|
|
368
|
+
Use the built-in factuality judge when you want a model-backed factuality grade
|
|
369
|
+
over the normalized run:
|
|
345
370
|
|
|
346
371
|
```ts
|
|
347
|
-
import {
|
|
372
|
+
import { openai } from "@ai-sdk/openai";
|
|
373
|
+
import { aiSdkJudgeHarness } from "@vitest-evals/harness-ai-sdk";
|
|
374
|
+
import { FactualityJudge } from "vitest-evals";
|
|
348
375
|
|
|
349
|
-
const
|
|
350
|
-
"
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
376
|
+
export const judgeHarness = aiSdkJudgeHarness({
|
|
377
|
+
model: openai("gpt-4.1-mini"),
|
|
378
|
+
temperature: 0,
|
|
379
|
+
});
|
|
380
|
+
export const factualityJudge = FactualityJudge({ judgeHarness });
|
|
381
|
+
```
|
|
354
382
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
383
|
+
For custom judge providers, create a dedicated judge harness with the same
|
|
384
|
+
prompt contract:
|
|
385
|
+
|
|
386
|
+
```ts
|
|
387
|
+
import {
|
|
388
|
+
createJudgeHarness,
|
|
389
|
+
FactualityJudge,
|
|
390
|
+
type JudgeHarness,
|
|
391
|
+
} from "vitest-evals";
|
|
392
|
+
import { callJudgeModel } from "./judgeModel";
|
|
393
|
+
|
|
394
|
+
export const judgeHarness: JudgeHarness = createJudgeHarness({
|
|
395
|
+
name: "factuality-judge-model",
|
|
396
|
+
run: async ({ system, prompt }, { signal }) =>
|
|
397
|
+
callJudgeModel({ system, prompt, signal }),
|
|
398
|
+
});
|
|
399
|
+
|
|
400
|
+
export const factualityJudge = FactualityJudge({ judgeHarness });
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
Configure that judge harness once and reuse the same judge with any app
|
|
404
|
+
harness:
|
|
405
|
+
|
|
406
|
+
```ts
|
|
407
|
+
import { describeEval } from "vitest-evals";
|
|
408
|
+
import { aiSdkRefundHarness } from "./aiSdkRefundHarness";
|
|
409
|
+
import { piRefundHarness } from "./piRefundHarness";
|
|
410
|
+
import { factualityJudge } from "./sharedJudges";
|
|
411
|
+
|
|
412
|
+
describeEval("ai sdk refund agent", {
|
|
413
|
+
harness: aiSdkRefundHarness,
|
|
414
|
+
judges: [factualityJudge],
|
|
415
|
+
});
|
|
416
|
+
|
|
417
|
+
describeEval("pi refund agent", {
|
|
418
|
+
harness: piRefundHarness,
|
|
419
|
+
judges: [factualityJudge],
|
|
420
|
+
});
|
|
363
421
|
```
|
|
364
422
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
423
|
+
Use `createJudge(...)` for custom judges so reporter output gets a stable
|
|
424
|
+
label. Custom LLM-backed judges should provide their own judge prompt, rubric
|
|
425
|
+
text, and parser, then call `ctx.runJudge(...)` for the provider-specific model
|
|
426
|
+
request. Bind a reusable default with `createJudge({ name, judgeHarness,
|
|
427
|
+
assess })` or pass `judgeHarness` on the matcher or suite. Core curries the
|
|
428
|
+
matcher, judge, or explicit suite `judgeHarness` into that function with the
|
|
429
|
+
current run's abort signal. Matcher options win over a judge default, and a
|
|
430
|
+
judge default wins over the suite default. Explicit matcher calls can also
|
|
431
|
+
reuse a single unambiguous judge-level harness from the suite's automatic
|
|
432
|
+
judges, but automatic judges do not inherit inferred harnesses from sibling
|
|
433
|
+
judges. That inference requires those judges to share the same judge harness
|
|
434
|
+
instance. Leave `judgeHarness` unset for suites that only use deterministic
|
|
435
|
+
judges. Calling `harness.run(...)` from a judge executes the application again,
|
|
436
|
+
so use that only when a second run is intentional.
|
|
372
437
|
|
|
373
438
|
For an `EvalHarnessRun` returned by fixture `run(...)`,
|
|
374
439
|
`toSatisfyJudge(...)` uses the run's typed `output` and reuses the registered
|
|
@@ -386,6 +451,4 @@ When a judge needs richer normalized context or the configured suite harness,
|
|
|
386
451
|
type it with `JudgeContext`.
|
|
387
452
|
|
|
388
453
|
When you only need deterministic contract checks, built-ins such as
|
|
389
|
-
`StructuredOutputJudge()` and `ToolCallJudge()` are still available.
|
|
390
|
-
documentation examples intentionally use factuality/rubric judges because those
|
|
391
|
-
match the product's LLM-as-a-judge direction.
|
|
454
|
+
`StructuredOutputJudge()` and `ToolCallJudge()` are still available.
|
package/dist/harness.d.mts
CHANGED
|
@@ -4,6 +4,169 @@ type JsonPrimitive = string | number | boolean | null;
|
|
|
4
4
|
type JsonValue = JsonPrimitive | JsonValue[] | {
|
|
5
5
|
[key: string]: JsonValue;
|
|
6
6
|
};
|
|
7
|
+
/** Well-known OpenTelemetry GenAI operation names. */
|
|
8
|
+
type GenAiOperationName = "chat" | "create_agent" | "embeddings" | "execute_tool" | "generate_content" | "invoke_agent" | "invoke_workflow" | "retrieval" | "text_completion" | (string & {});
|
|
9
|
+
/** Well-known OpenTelemetry GenAI output content types. */
|
|
10
|
+
type GenAiOutputType = "image" | "json" | "speech" | "text" | (string & {});
|
|
11
|
+
/** Well-known OpenTelemetry GenAI provider names. */
|
|
12
|
+
type GenAiProviderName = "anthropic" | "aws.bedrock" | "azure.ai.inference" | "azure.ai.openai" | "cohere" | "deepseek" | "gcp.gemini" | "gcp.gen_ai" | "gcp.vertex_ai" | "groq" | "ibm.watsonx.ai" | "mistral_ai" | "openai" | "perplexity" | "x_ai" | (string & {});
|
|
13
|
+
/** Well-known OpenTelemetry GenAI token types. */
|
|
14
|
+
type GenAiTokenType = "input" | "output" | (string & {});
|
|
15
|
+
/** Well-known OpenTelemetry GenAI tool execution types. */
|
|
16
|
+
type GenAiToolType = "datastore" | "extension" | "function" | (string & {});
|
|
17
|
+
/** Typed subset of OpenTelemetry GenAI semantic attributes. */
|
|
18
|
+
type GenAiSemanticAttributes = {
|
|
19
|
+
"gen_ai.agent.description"?: string;
|
|
20
|
+
"gen_ai.agent.id"?: string;
|
|
21
|
+
"gen_ai.agent.name"?: string;
|
|
22
|
+
"gen_ai.agent.version"?: string;
|
|
23
|
+
"gen_ai.conversation.id"?: string;
|
|
24
|
+
"gen_ai.data_source.id"?: string;
|
|
25
|
+
"gen_ai.embeddings.dimension.count"?: number;
|
|
26
|
+
"gen_ai.evaluation.explanation"?: string;
|
|
27
|
+
"gen_ai.evaluation.name"?: string;
|
|
28
|
+
"gen_ai.evaluation.score.label"?: string;
|
|
29
|
+
"gen_ai.evaluation.score.value"?: number;
|
|
30
|
+
"gen_ai.input.messages"?: JsonValue;
|
|
31
|
+
"gen_ai.operation.name"?: GenAiOperationName;
|
|
32
|
+
"gen_ai.output.messages"?: JsonValue;
|
|
33
|
+
"gen_ai.output.type"?: GenAiOutputType;
|
|
34
|
+
"gen_ai.prompt.name"?: string;
|
|
35
|
+
"gen_ai.provider.name"?: GenAiProviderName;
|
|
36
|
+
"gen_ai.request.choice.count"?: number;
|
|
37
|
+
"gen_ai.request.encoding_formats"?: string[];
|
|
38
|
+
"gen_ai.request.frequency_penalty"?: number;
|
|
39
|
+
"gen_ai.request.max_tokens"?: number;
|
|
40
|
+
"gen_ai.request.model"?: string;
|
|
41
|
+
"gen_ai.request.presence_penalty"?: number;
|
|
42
|
+
"gen_ai.request.seed"?: number;
|
|
43
|
+
"gen_ai.request.stop_sequences"?: string[];
|
|
44
|
+
"gen_ai.request.stream"?: boolean;
|
|
45
|
+
"gen_ai.request.temperature"?: number;
|
|
46
|
+
"gen_ai.request.top_k"?: number;
|
|
47
|
+
"gen_ai.request.top_p"?: number;
|
|
48
|
+
"gen_ai.response.finish_reasons"?: string[];
|
|
49
|
+
"gen_ai.response.id"?: string;
|
|
50
|
+
"gen_ai.response.model"?: string;
|
|
51
|
+
"gen_ai.response.time_to_first_chunk"?: number;
|
|
52
|
+
"gen_ai.retrieval.documents"?: JsonValue;
|
|
53
|
+
"gen_ai.retrieval.query.text"?: string;
|
|
54
|
+
"gen_ai.system_instructions"?: JsonValue;
|
|
55
|
+
"gen_ai.token.type"?: GenAiTokenType;
|
|
56
|
+
"gen_ai.tool.call.arguments"?: JsonValue;
|
|
57
|
+
"gen_ai.tool.call.id"?: string;
|
|
58
|
+
"gen_ai.tool.call.result"?: JsonValue;
|
|
59
|
+
"gen_ai.tool.definitions"?: JsonValue;
|
|
60
|
+
"gen_ai.tool.description"?: string;
|
|
61
|
+
"gen_ai.tool.name"?: string;
|
|
62
|
+
"gen_ai.tool.type"?: GenAiToolType;
|
|
63
|
+
"gen_ai.usage.cache_creation.input_tokens"?: number;
|
|
64
|
+
"gen_ai.usage.cache_read.input_tokens"?: number;
|
|
65
|
+
"gen_ai.usage.input_tokens"?: number;
|
|
66
|
+
"gen_ai.usage.output_tokens"?: number;
|
|
67
|
+
"gen_ai.usage.reasoning.output_tokens"?: number;
|
|
68
|
+
"gen_ai.workflow.name"?: string;
|
|
69
|
+
};
|
|
70
|
+
/** Attribute keys defined by the OpenTelemetry GenAI semantic conventions. */
|
|
71
|
+
type GenAiSemanticAttributeKey = keyof GenAiSemanticAttributes;
|
|
72
|
+
/** Typed OpenTelemetry semantic attributes accepted on normalized spans. */
|
|
73
|
+
type OpenTelemetrySemanticAttributes = GenAiSemanticAttributes & {
|
|
74
|
+
"error.type"?: string;
|
|
75
|
+
"server.address"?: string;
|
|
76
|
+
"server.port"?: number;
|
|
77
|
+
};
|
|
78
|
+
/** Known OpenTelemetry semantic attribute keys accepted on normalized spans. */
|
|
79
|
+
type OpenTelemetrySemanticAttributeKey = keyof OpenTelemetrySemanticAttributes;
|
|
80
|
+
/** Attribute keys accepted on normalized spans. */
|
|
81
|
+
type NormalizedSpanAttributeKey = OpenTelemetrySemanticAttributeKey | (string & {});
|
|
82
|
+
/**
|
|
83
|
+
* JSON-safe span attributes. Known OpenTelemetry GenAI keys are typed while
|
|
84
|
+
* custom provider and application keys remain allowed.
|
|
85
|
+
*/
|
|
86
|
+
type NormalizedSpanAttributes = OpenTelemetrySemanticAttributes & {
|
|
87
|
+
[key: string]: JsonValue | undefined;
|
|
88
|
+
};
|
|
89
|
+
/** Event attached to one normalized span. */
|
|
90
|
+
type NormalizedSpanEvent = {
|
|
91
|
+
/** Event name emitted by the runtime or harness. */
|
|
92
|
+
name: string;
|
|
93
|
+
/** ISO timestamp for the event when available. */
|
|
94
|
+
timestamp?: string;
|
|
95
|
+
/** JSON-safe event attributes. */
|
|
96
|
+
attributes?: NormalizedSpanAttributes;
|
|
97
|
+
};
|
|
98
|
+
/** Normalized operation span captured during a harness run. */
|
|
99
|
+
type NormalizedSpan = {
|
|
100
|
+
/** Runtime or provider span id when one is available. */
|
|
101
|
+
id?: string;
|
|
102
|
+
/** Trace id this span belongs to. */
|
|
103
|
+
traceId?: string;
|
|
104
|
+
/** Parent span id when the runtime exposes hierarchy. */
|
|
105
|
+
parentId?: string;
|
|
106
|
+
/** Human-readable operation name. */
|
|
107
|
+
name: string;
|
|
108
|
+
/** Coarse operation kind used by reporters and judges. */
|
|
109
|
+
kind?: "run" | "agent" | "model" | "tool" | "guardrail" | "handoff" | "custom";
|
|
110
|
+
/** ISO timestamp for the start of the span. */
|
|
111
|
+
startedAt?: string;
|
|
112
|
+
/** ISO timestamp for the end of the span. */
|
|
113
|
+
finishedAt?: string;
|
|
114
|
+
/** Span duration in milliseconds. */
|
|
115
|
+
durationMs?: number;
|
|
116
|
+
/** Success or failure status for the span. */
|
|
117
|
+
status?: "ok" | "error";
|
|
118
|
+
/** Normalized error when the span failed. */
|
|
119
|
+
error?: {
|
|
120
|
+
message: string;
|
|
121
|
+
type?: string;
|
|
122
|
+
[key: string]: JsonValue | undefined;
|
|
123
|
+
};
|
|
124
|
+
/** JSON-safe operation attributes. */
|
|
125
|
+
attributes?: NormalizedSpanAttributes;
|
|
126
|
+
/** Events observed inside this span. */
|
|
127
|
+
events?: NormalizedSpanEvent[];
|
|
128
|
+
};
|
|
129
|
+
/** Normalized trace captured during a harness run. */
|
|
130
|
+
type NormalizedTrace = {
|
|
131
|
+
/** Runtime or provider trace id when one is available. */
|
|
132
|
+
id?: string;
|
|
133
|
+
/** Human-readable trace or workflow name. */
|
|
134
|
+
name?: string;
|
|
135
|
+
/** ISO timestamp for the start of the trace. */
|
|
136
|
+
startedAt?: string;
|
|
137
|
+
/** ISO timestamp for the end of the trace. */
|
|
138
|
+
finishedAt?: string;
|
|
139
|
+
/** Trace duration in milliseconds. */
|
|
140
|
+
durationMs?: number;
|
|
141
|
+
/** Extra JSON-safe trace metadata. */
|
|
142
|
+
metadata?: Record<string, JsonValue>;
|
|
143
|
+
/** Spans that make up this trace. */
|
|
144
|
+
spans: NormalizedSpan[];
|
|
145
|
+
};
|
|
146
|
+
/** Options for converting normalized tool calls into trace spans. */
|
|
147
|
+
type CreateToolCallSpansOptions = {
|
|
148
|
+
/** Trace id to attach to each generated tool span. */
|
|
149
|
+
traceId?: string;
|
|
150
|
+
/** Parent span id to attach to each generated tool span. */
|
|
151
|
+
parentId?: string;
|
|
152
|
+
/** Prefix used to create internal span ids instead of reusing tool-call ids. */
|
|
153
|
+
spanIdPrefix?: string;
|
|
154
|
+
};
|
|
155
|
+
/** Options for attaching a fallback run trace to a harness result. */
|
|
156
|
+
type EnsureRunTraceOptions = {
|
|
157
|
+
/** Human-readable run or harness name. */
|
|
158
|
+
name: string;
|
|
159
|
+
/** Wall-clock start time for the harness run. */
|
|
160
|
+
startedAt: Date;
|
|
161
|
+
/** Wall-clock finish time for the harness run. */
|
|
162
|
+
finishedAt: Date;
|
|
163
|
+
/** Optional trace id. A generated id is used when omitted. */
|
|
164
|
+
id?: string;
|
|
165
|
+
/** GenAI operation name to place on the root run span. */
|
|
166
|
+
operationName?: GenAiOperationName;
|
|
167
|
+
/** Optional JSON-safe source marker for the trace metadata. */
|
|
168
|
+
source?: string;
|
|
169
|
+
};
|
|
7
170
|
/**
|
|
8
171
|
* Normalized record for one tool call observed during a harness run.
|
|
9
172
|
*
|
|
@@ -160,6 +323,8 @@ type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> =
|
|
|
160
323
|
timings?: TimingSummary;
|
|
161
324
|
/** JSON-safe run artifacts captured by the harness or test context. */
|
|
162
325
|
artifacts?: Record<string, JsonValue>;
|
|
326
|
+
/** Normalized traces and spans captured during execution. */
|
|
327
|
+
traces?: NormalizedTrace[];
|
|
163
328
|
/** Normalized errors captured during execution. */
|
|
164
329
|
errors: Array<Record<string, JsonValue>>;
|
|
165
330
|
};
|
|
@@ -232,6 +397,27 @@ type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error
|
|
|
232
397
|
/** Raw tool metadata accepted by `createHarness(...)` before normalization. */
|
|
233
398
|
metadata?: Record<string, unknown>;
|
|
234
399
|
};
|
|
400
|
+
/** Lightweight span event accepted by `createHarness(...)` results. */
|
|
401
|
+
type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
|
|
402
|
+
/** Raw event attributes accepted by `createHarness(...)` before normalization. */
|
|
403
|
+
attributes?: Record<string, unknown>;
|
|
404
|
+
};
|
|
405
|
+
/** Lightweight span record accepted by `createHarness(...)` results. */
|
|
406
|
+
type SimpleSpanRecord = Omit<NormalizedSpan, "attributes" | "error" | "events"> & {
|
|
407
|
+
/** Raw span attributes accepted by `createHarness(...)` before normalization. */
|
|
408
|
+
attributes?: Record<string, unknown>;
|
|
409
|
+
/** Raw span error accepted by `createHarness(...)` before normalization. */
|
|
410
|
+
error?: unknown;
|
|
411
|
+
/** Raw span events accepted by `createHarness(...)` before normalization. */
|
|
412
|
+
events?: SimpleSpanEvent[];
|
|
413
|
+
};
|
|
414
|
+
/** Lightweight trace record accepted by `createHarness(...)` results. */
|
|
415
|
+
type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
|
|
416
|
+
/** Raw trace metadata accepted by `createHarness(...)` before normalization. */
|
|
417
|
+
metadata?: Record<string, unknown>;
|
|
418
|
+
/** Lightweight spans to normalize into the trace. */
|
|
419
|
+
spans: SimpleSpanRecord[];
|
|
420
|
+
};
|
|
235
421
|
/**
|
|
236
422
|
* Lightweight result shape normalized by `createHarness(...)`.
|
|
237
423
|
*
|
|
@@ -255,6 +441,8 @@ type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | und
|
|
|
255
441
|
timings?: TimingSummary;
|
|
256
442
|
/** Raw artifact values to normalize and merge into the run. */
|
|
257
443
|
artifacts?: Record<string, unknown>;
|
|
444
|
+
/** Lightweight traces and spans to normalize into the run. */
|
|
445
|
+
traces?: SimpleTraceRecord[];
|
|
258
446
|
/** Raw session metadata to normalize into the session. */
|
|
259
447
|
metadata?: Record<string, unknown>;
|
|
260
448
|
/** Raw errors to normalize into the run. */
|
|
@@ -354,6 +542,31 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
|
|
|
354
542
|
* ```
|
|
355
543
|
*/
|
|
356
544
|
declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
|
|
545
|
+
/**
|
|
546
|
+
* Builds a JSON-safe failed run for errors that happen before a harness can return.
|
|
547
|
+
*
|
|
548
|
+
* @param input - Original input passed to the harness.
|
|
549
|
+
* @param error - Error thrown by setup or execution.
|
|
550
|
+
* @param options - Optional artifacts to preserve on the failed run.
|
|
551
|
+
*/
|
|
552
|
+
declare function createFailedHarnessRun(input: unknown, error: unknown, options?: {
|
|
553
|
+
artifacts?: Record<string, JsonValue>;
|
|
554
|
+
}): HarnessRun;
|
|
555
|
+
/** Normalizes arbitrary span errors while preserving object-shaped messages. */
|
|
556
|
+
declare function normalizeSpanError(error: unknown): NormalizedSpan["error"] | undefined;
|
|
557
|
+
/** Normalizes raw span attributes into the JSON-safe span attribute shape. */
|
|
558
|
+
declare function normalizeSpanAttributes(attributes: Record<string, unknown>): NormalizedSpanAttributes | undefined;
|
|
559
|
+
/** Builds common OpenTelemetry GenAI usage attributes from a usage summary. */
|
|
560
|
+
declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, options?: {
|
|
561
|
+
provider?: string;
|
|
562
|
+
}): {
|
|
563
|
+
"gen_ai.provider.name": string | undefined;
|
|
564
|
+
"gen_ai.request.model": string | undefined;
|
|
565
|
+
"gen_ai.response.model": string | undefined;
|
|
566
|
+
"gen_ai.usage.input_tokens": number | undefined;
|
|
567
|
+
"gen_ai.usage.output_tokens": number | undefined;
|
|
568
|
+
"gen_ai.usage.reasoning.output_tokens": number | undefined;
|
|
569
|
+
};
|
|
357
570
|
/**
|
|
358
571
|
* Flattens every recorded tool call from a normalized session.
|
|
359
572
|
*
|
|
@@ -367,6 +580,44 @@ declare function normalizeHarnessRun<TInput = unknown, TMetadata extends Harness
|
|
|
367
580
|
* ```
|
|
368
581
|
*/
|
|
369
582
|
declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
|
|
583
|
+
/**
|
|
584
|
+
* Converts normalized tool-call records into trace spans.
|
|
585
|
+
*
|
|
586
|
+
* Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
|
|
587
|
+
* spans belong to a known trace so span ids stay internally unique.
|
|
588
|
+
*/
|
|
589
|
+
declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
|
|
590
|
+
/**
|
|
591
|
+
* Attaches a fallback run trace when a harness result does not already contain spans.
|
|
592
|
+
*
|
|
593
|
+
* This keeps custom harnesses inspectable while first-party harness packages
|
|
594
|
+
* remain free to attach richer native traces.
|
|
595
|
+
*/
|
|
596
|
+
declare function ensureRunTrace(run: HarnessRun, options: EnsureRunTraceOptions): NormalizedTrace | undefined;
|
|
597
|
+
/**
|
|
598
|
+
* Flattens every recorded span from a normalized harness run.
|
|
599
|
+
*
|
|
600
|
+
* @param run - Normalized harness run produced by a harness.
|
|
601
|
+
*
|
|
602
|
+
* @example
|
|
603
|
+
* ```ts
|
|
604
|
+
* const modelSpans = spans(result).filter((span) => span.kind === "model");
|
|
605
|
+
* ```
|
|
606
|
+
*/
|
|
607
|
+
declare function spans(run: HarnessRun): NormalizedSpan[];
|
|
608
|
+
/**
|
|
609
|
+
* Returns spans of one coarse operation kind from a normalized run.
|
|
610
|
+
*
|
|
611
|
+
* @param run - Normalized harness run produced by a harness.
|
|
612
|
+
* @param kind - Span kind to keep.
|
|
613
|
+
*/
|
|
614
|
+
declare function spansByKind(run: HarnessRun, kind: NonNullable<NormalizedSpan["kind"]>): NormalizedSpan[];
|
|
615
|
+
/**
|
|
616
|
+
* Returns every span that explicitly failed or carries a normalized error.
|
|
617
|
+
*
|
|
618
|
+
* @param run - Normalized harness run produced by a harness.
|
|
619
|
+
*/
|
|
620
|
+
declare function failedSpans(run: HarnessRun): NormalizedSpan[];
|
|
370
621
|
/**
|
|
371
622
|
* Filters normalized session messages by role.
|
|
372
623
|
*
|
|
@@ -414,6 +665,17 @@ declare function userMessages(session: NormalizedSession): NormalizedMessage[];
|
|
|
414
665
|
* ```
|
|
415
666
|
*/
|
|
416
667
|
declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
|
|
668
|
+
/**
|
|
669
|
+
* Returns the latest assistant message content, ignoring empty text messages.
|
|
670
|
+
*
|
|
671
|
+
* @param session - Normalized session produced by a harness run.
|
|
672
|
+
*
|
|
673
|
+
* @example
|
|
674
|
+
* ```ts
|
|
675
|
+
* const finalAnswer = latestAssistantMessageContent(result.session);
|
|
676
|
+
* ```
|
|
677
|
+
*/
|
|
678
|
+
declare function latestAssistantMessageContent(session: NormalizedSession): JsonValue | undefined;
|
|
417
679
|
/**
|
|
418
680
|
* Returns every normalized tool message from a session.
|
|
419
681
|
*
|
|
@@ -465,4 +727,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
465
727
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
466
728
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
467
729
|
|
|
468
|
-
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
|
730
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type GenAiOperationName, type GenAiOutputType, type GenAiProviderName, type GenAiSemanticAttributeKey, type GenAiSemanticAttributes, type GenAiTokenType, type GenAiToolType, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type NormalizedSpan, type NormalizedSpanAttributeKey, type NormalizedSpanAttributes, type NormalizedSpanEvent, type NormalizedTrace, type OpenTelemetrySemanticAttributeKey, type OpenTelemetrySemanticAttributes, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, failedSpans, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, latestAssistantMessageContent, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, spans, spansByKind, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|