vitest-evals 0.13.1 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -8
- package/dist/harness.d.mts +55 -39
- package/dist/harness.d.ts +55 -39
- package/dist/harness.js +34 -104
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +37 -104
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +5 -5
- package/dist/index.d.ts +5 -5
- package/dist/index.js +56 -117
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +59 -117
- package/dist/index.mjs.map +1 -1
- package/dist/internal/scoring.d.mts +2 -2
- package/dist/internal/scoring.d.ts +2 -2
- package/dist/internal/scoring.js.map +1 -1
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs +4 -1
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.js.map +1 -1
- package/dist/judges/factualityJudge.mjs +4 -1
- package/dist/judges/factualityJudge.mjs.map +1 -1
- package/dist/judges/index.js +47 -110
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +51 -111
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.js +47 -110
- package/dist/judges/judgeHarness.js.map +1 -1
- package/dist/judges/judgeHarness.mjs +51 -111
- package/dist/judges/judgeHarness.mjs.map +1 -1
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +4 -1
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +2 -2
- package/dist/judges/types.d.ts +2 -2
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs +4 -1
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs +4 -1
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy/shared.d.mts +1 -8
- package/dist/legacy/shared.d.ts +1 -8
- package/dist/legacy/shared.js.map +1 -1
- package/dist/legacy.js +15 -1
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs +19 -2
- package/dist/legacy.mjs.map +1 -1
- package/dist/reporter.d.mts +0 -3
- package/dist/reporter.d.ts +0 -3
- package/dist/reporter.js +10 -40
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs +14 -41
- package/dist/reporter.mjs.map +1 -1
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -34,13 +34,13 @@ workflow.
|
|
|
34
34
|
`HarnessRun`
|
|
35
35
|
- the returned `result.output` is the app-facing value you assert on directly
|
|
36
36
|
- helper assertions usually read the returned `result`, for example
|
|
37
|
-
`toolCalls(result)` or `
|
|
37
|
+
`toolCalls(result)` or `assistantMessages(result)`
|
|
38
38
|
- `result.session` is the canonical JSON-serializable transcript for reporting,
|
|
39
39
|
replay, tool assertions, and judges
|
|
40
|
-
- `result.traces` contains JSON-serializable operation spans;
|
|
41
|
-
|
|
42
|
-
`createHarness(...)` attaches fallback run
|
|
43
|
-
|
|
40
|
+
- `result.traces` contains JSON-serializable operation spans; first-party
|
|
41
|
+
harnesses attach native spans when provider/runtime data is available, while
|
|
42
|
+
`createHarness(...)` attaches a fallback run span for custom harnesses that do
|
|
43
|
+
not return traces themselves. Span attributes include typed
|
|
44
44
|
OpenTelemetry GenAI semantic keys while still allowing provider-specific
|
|
45
45
|
metadata
|
|
46
46
|
- scenario-specific judge criteria should live in `input` or explicit matcher
|
|
@@ -247,6 +247,18 @@ const appHarness = createHarness<AppEvalInput, AppOutput>({
|
|
|
247
247
|
});
|
|
248
248
|
|
|
249
249
|
return {
|
|
250
|
+
events: [
|
|
251
|
+
{
|
|
252
|
+
type: "message",
|
|
253
|
+
role: "user",
|
|
254
|
+
content: input.events.map((event) => event.type).join(", "),
|
|
255
|
+
},
|
|
256
|
+
{
|
|
257
|
+
type: "message",
|
|
258
|
+
role: "assistant",
|
|
259
|
+
content: result.replies.map((reply) => reply.text).join("\n"),
|
|
260
|
+
},
|
|
261
|
+
],
|
|
250
262
|
output: {
|
|
251
263
|
replies: result.replies,
|
|
252
264
|
sideEffects: result.sideEffects,
|
|
@@ -319,9 +331,42 @@ Use `Harness.run(...)` for the application under test. Calling
|
|
|
319
331
|
so reserve that for judges that intentionally need a second execution. Put
|
|
320
332
|
criteria on `input` when they are part of the scenario itself; pass
|
|
321
333
|
case-specific judge criteria through matcher options, or configure suite-wide
|
|
322
|
-
criteria on the judge instance.
|
|
323
|
-
|
|
324
|
-
`
|
|
334
|
+
criteria on the judge instance.
|
|
335
|
+
|
|
336
|
+
`createHarness(...)` lightweight results must return at least one normalized
|
|
337
|
+
event, either directly as `events` or from strict camelCase `messages`. Stored
|
|
338
|
+
run metadata always uses `session.events`, a flat ordered transcript:
|
|
339
|
+
|
|
340
|
+
```ts
|
|
341
|
+
events: [
|
|
342
|
+
{ type: "message", role: "user", content: input },
|
|
343
|
+
{
|
|
344
|
+
type: "tool_call",
|
|
345
|
+
id: "call_lookup",
|
|
346
|
+
name: "lookupInvoice",
|
|
347
|
+
arguments: { invoiceId: "inv_123" },
|
|
348
|
+
},
|
|
349
|
+
{
|
|
350
|
+
type: "tool_result",
|
|
351
|
+
toolCallId: "call_lookup",
|
|
352
|
+
name: "lookupInvoice",
|
|
353
|
+
content: { refundable: true },
|
|
354
|
+
},
|
|
355
|
+
{ type: "message", role: "assistant", content: output },
|
|
356
|
+
];
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
For apps that already produce message-shaped data, returning `messages` is also
|
|
360
|
+
accepted; the harness normalizer converts assistant `toolCalls`, `role: "tool"`
|
|
361
|
+
results keyed by `toolCallId`, and AI SDK `tool-call`/`tool-result` content
|
|
362
|
+
parts into the same flat `events` shape. Provider wire formats such as OpenAI
|
|
363
|
+
snake_case fields should be mapped by the harness before they reach this
|
|
364
|
+
boundary. Other provider content blocks or item streams should adapt those
|
|
365
|
+
records into `events` directly. Assertions and judges should read normalized
|
|
366
|
+
projections through helpers such as `toolCalls(result)`, `userMessages(result)`,
|
|
367
|
+
`assistantMessages(result)`, `toolMessages(result)`, and `spans(result)` instead
|
|
368
|
+
of manually walking provider payloads. Return a full `HarnessRun` only when you
|
|
369
|
+
need exact canonical `session.events`, trace, or usage control.
|
|
325
370
|
|
|
326
371
|
Provider setup and rubric parsing stay in your judge. The core
|
|
327
372
|
package only requires the judge to return a `JudgeResult` with a score and
|
package/dist/harness.d.mts
CHANGED
|
@@ -1,15 +1,6 @@
|
|
|
1
|
-
import { JsonValue, HarnessRun, GenAiOperationName,
|
|
2
|
-
export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue,
|
|
1
|
+
import { JsonValue, HarnessRun, GenAiOperationName, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, TranscriptEvent, TranscriptMessageInput, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
|
|
2
|
+
export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedError, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCall, TranscriptEvent, TranscriptMessageContentPart, TranscriptMessageEvent, TranscriptMessageInput, TranscriptMessageTextPart, TranscriptMessageToolCall, TranscriptMessageToolCallPart, TranscriptMessageToolResultPart, TranscriptToolCallEvent, TranscriptToolResultEvent, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, messagesToTranscriptEvents, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
|
|
3
3
|
|
|
4
|
-
/** Options for converting normalized tool calls into trace spans. */
|
|
5
|
-
type CreateToolCallSpansOptions = {
|
|
6
|
-
/** Trace id to attach to each generated tool span. */
|
|
7
|
-
traceId?: string;
|
|
8
|
-
/** Parent span id to attach to each generated tool span. */
|
|
9
|
-
parentId?: string;
|
|
10
|
-
/** Prefix used to create internal span ids instead of reusing tool-call ids. */
|
|
11
|
-
spanIdPrefix?: string;
|
|
12
|
-
};
|
|
13
4
|
/** Options for attaching a fallback run trace to a harness result. */
|
|
14
5
|
type EnsureRunTraceOptions = {
|
|
15
6
|
/** Human-readable run or harness name. */
|
|
@@ -44,7 +35,7 @@ type HarnessMetadata = Record<string, unknown>;
|
|
|
44
35
|
*
|
|
45
36
|
* return {
|
|
46
37
|
* output: undefined,
|
|
47
|
-
* session: {
|
|
38
|
+
* session: { events: [{ type: "message", role: "user", content: input }] },
|
|
48
39
|
* usage: {},
|
|
49
40
|
* errors: [],
|
|
50
41
|
* };
|
|
@@ -81,17 +72,6 @@ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue
|
|
|
81
72
|
};
|
|
82
73
|
/** Value or promise accepted by lightweight harness callbacks. */
|
|
83
74
|
type MaybePromise<T> = T | Promise<T>;
|
|
84
|
-
/** Lightweight tool-call record accepted by `createHarness(...)` results. */
|
|
85
|
-
type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
|
|
86
|
-
/** Raw tool arguments accepted by `createHarness(...)` before normalization. */
|
|
87
|
-
arguments?: unknown;
|
|
88
|
-
/** Raw tool result accepted by `createHarness(...)` before normalization. */
|
|
89
|
-
result?: unknown;
|
|
90
|
-
/** Raw tool error accepted by `createHarness(...)` before normalization. */
|
|
91
|
-
error?: unknown;
|
|
92
|
-
/** Raw tool metadata accepted by `createHarness(...)` before normalization. */
|
|
93
|
-
metadata?: Record<string, unknown>;
|
|
94
|
-
};
|
|
95
75
|
/** Lightweight span event accepted by `createHarness(...)` results. */
|
|
96
76
|
type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
|
|
97
77
|
/** Raw event attributes accepted by `createHarness(...)` before normalization. */
|
|
@@ -113,6 +93,16 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
|
|
|
113
93
|
/** Lightweight spans to normalize into the trace. */
|
|
114
94
|
spans: SimpleSpanRecord[];
|
|
115
95
|
};
|
|
96
|
+
/** Lightweight transcript input accepted by `createHarness(...)` results. */
|
|
97
|
+
type SimpleTranscriptInput = {
|
|
98
|
+
/** Ordered normalized transcript events for the application run. */
|
|
99
|
+
events: TranscriptEvent[];
|
|
100
|
+
messages?: never;
|
|
101
|
+
} | {
|
|
102
|
+
/** Strict camelCase message transport normalized into transcript events. */
|
|
103
|
+
messages: TranscriptMessageInput[];
|
|
104
|
+
events?: never;
|
|
105
|
+
};
|
|
116
106
|
/**
|
|
117
107
|
* Lightweight result shape normalized by `createHarness(...)`.
|
|
118
108
|
*
|
|
@@ -120,16 +110,15 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
|
|
|
120
110
|
* ```ts
|
|
121
111
|
* const result: SimpleHarnessResult<{ status: "approved" }> = {
|
|
122
112
|
* output: { status: "approved" },
|
|
123
|
-
*
|
|
113
|
+
* events: [
|
|
114
|
+
* { type: "message", role: "user", content: "Refund invoice inv_123" },
|
|
115
|
+
* { type: "message", role: "assistant", content: { status: "approved" } },
|
|
116
|
+
* ],
|
|
124
117
|
* usage: { totalTokens: 260 },
|
|
125
118
|
* };
|
|
126
119
|
* ```
|
|
127
120
|
*/
|
|
128
|
-
type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
129
|
-
/** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
|
|
130
|
-
messages?: NormalizedMessage[];
|
|
131
|
-
/** Lightweight tool-call records to normalize into the session. */
|
|
132
|
-
toolCalls?: SimpleToolCallRecord[];
|
|
121
|
+
type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & SimpleTranscriptInput & {
|
|
133
122
|
/** Usage summary to attach to the run. */
|
|
134
123
|
usage?: UsageSummary;
|
|
135
124
|
/** Timing summary to attach to the run. */
|
|
@@ -159,12 +148,17 @@ type CreateHarnessRunArgs<TInput> = {
|
|
|
159
148
|
/**
|
|
160
149
|
* Options for creating a lightweight custom application harness.
|
|
161
150
|
*
|
|
151
|
+
* Prefer this helper for custom harnesses. Implement `Harness` directly only
|
|
152
|
+
* when the callback already returns a full `HarnessRun` with canonical
|
|
153
|
+
* `session.events`.
|
|
154
|
+
*
|
|
162
155
|
* @example
|
|
163
156
|
* ```ts
|
|
164
157
|
* const options: CreateHarnessOptions<string, { status: "approved" }> = {
|
|
165
158
|
* name: "refund-agent",
|
|
166
159
|
* run: async ({ input }) => ({
|
|
167
160
|
* output: await classifyRefund(input),
|
|
161
|
+
* events: [{ type: "message", role: "user", content: input }],
|
|
168
162
|
* }),
|
|
169
163
|
* };
|
|
170
164
|
* ```
|
|
@@ -207,7 +201,22 @@ declare function normalizeContent(value: unknown): JsonValue;
|
|
|
207
201
|
*
|
|
208
202
|
* return {
|
|
209
203
|
* output,
|
|
210
|
-
*
|
|
204
|
+
* events: [
|
|
205
|
+
* { type: "message", role: "user", content: input },
|
|
206
|
+
* {
|
|
207
|
+
* type: "tool_call",
|
|
208
|
+
* id: "call_lookup",
|
|
209
|
+
* name: "lookupInvoice",
|
|
210
|
+
* arguments: { invoiceId: result.invoiceId },
|
|
211
|
+
* },
|
|
212
|
+
* {
|
|
213
|
+
* type: "tool_result",
|
|
214
|
+
* toolCallId: "call_lookup",
|
|
215
|
+
* name: "lookupInvoice",
|
|
216
|
+
* content: { refundable: result.refundable },
|
|
217
|
+
* },
|
|
218
|
+
* { type: "message", role: "assistant", content: output },
|
|
219
|
+
* ],
|
|
211
220
|
* usage: { provider: "openai", model: "gpt-4o-mini" },
|
|
212
221
|
* };
|
|
213
222
|
* },
|
|
@@ -226,7 +235,21 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
|
|
|
226
235
|
* ```ts
|
|
227
236
|
* const run = normalizeHarnessRun("Refund invoice inv_123", {
|
|
228
237
|
* output: { status: "approved" },
|
|
229
|
-
*
|
|
238
|
+
* events: [
|
|
239
|
+
* { type: "message", role: "user", content: "Refund invoice inv_123" },
|
|
240
|
+
* {
|
|
241
|
+
* type: "tool_call",
|
|
242
|
+
* id: "call_lookup",
|
|
243
|
+
* name: "lookupInvoice",
|
|
244
|
+
* arguments: { invoiceId: "inv_123" },
|
|
245
|
+
* },
|
|
246
|
+
* {
|
|
247
|
+
* type: "tool_result",
|
|
248
|
+
* toolCallId: "call_lookup",
|
|
249
|
+
* name: "lookupInvoice",
|
|
250
|
+
* content: { refundable: true },
|
|
251
|
+
* },
|
|
252
|
+
* ],
|
|
230
253
|
* usage: { provider: "openai", model: "gpt-4o-mini" },
|
|
231
254
|
* });
|
|
232
255
|
*
|
|
@@ -259,13 +282,6 @@ declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, opt
|
|
|
259
282
|
"gen_ai.usage.output_tokens": number | undefined;
|
|
260
283
|
"gen_ai.usage.reasoning.output_tokens": number | undefined;
|
|
261
284
|
};
|
|
262
|
-
/**
|
|
263
|
-
* Converts normalized tool-call records into trace spans.
|
|
264
|
-
*
|
|
265
|
-
* Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
|
|
266
|
-
* spans belong to a known trace so span ids stay internally unique.
|
|
267
|
-
*/
|
|
268
|
-
declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
|
|
269
285
|
/**
|
|
270
286
|
* Attaches a fallback run trace when a harness result does not already contain spans.
|
|
271
287
|
*
|
|
@@ -313,4 +329,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
313
329
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
314
330
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
315
331
|
|
|
316
|
-
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type
|
|
332
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleTraceRecord, type SimpleTranscriptInput, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
|
package/dist/harness.d.ts
CHANGED
|
@@ -1,15 +1,6 @@
|
|
|
1
|
-
import { JsonValue, HarnessRun, GenAiOperationName,
|
|
2
|
-
export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue,
|
|
1
|
+
import { JsonValue, HarnessRun, GenAiOperationName, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, TranscriptEvent, TranscriptMessageInput, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
|
|
2
|
+
export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedError, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCall, TranscriptEvent, TranscriptMessageContentPart, TranscriptMessageEvent, TranscriptMessageInput, TranscriptMessageTextPart, TranscriptMessageToolCall, TranscriptMessageToolCallPart, TranscriptMessageToolResultPart, TranscriptToolCallEvent, TranscriptToolResultEvent, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, messagesToTranscriptEvents, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
|
|
3
3
|
|
|
4
|
-
/** Options for converting normalized tool calls into trace spans. */
|
|
5
|
-
type CreateToolCallSpansOptions = {
|
|
6
|
-
/** Trace id to attach to each generated tool span. */
|
|
7
|
-
traceId?: string;
|
|
8
|
-
/** Parent span id to attach to each generated tool span. */
|
|
9
|
-
parentId?: string;
|
|
10
|
-
/** Prefix used to create internal span ids instead of reusing tool-call ids. */
|
|
11
|
-
spanIdPrefix?: string;
|
|
12
|
-
};
|
|
13
4
|
/** Options for attaching a fallback run trace to a harness result. */
|
|
14
5
|
type EnsureRunTraceOptions = {
|
|
15
6
|
/** Human-readable run or harness name. */
|
|
@@ -44,7 +35,7 @@ type HarnessMetadata = Record<string, unknown>;
|
|
|
44
35
|
*
|
|
45
36
|
* return {
|
|
46
37
|
* output: undefined,
|
|
47
|
-
* session: {
|
|
38
|
+
* session: { events: [{ type: "message", role: "user", content: input }] },
|
|
48
39
|
* usage: {},
|
|
49
40
|
* errors: [],
|
|
50
41
|
* };
|
|
@@ -81,17 +72,6 @@ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue
|
|
|
81
72
|
};
|
|
82
73
|
/** Value or promise accepted by lightweight harness callbacks. */
|
|
83
74
|
type MaybePromise<T> = T | Promise<T>;
|
|
84
|
-
/** Lightweight tool-call record accepted by `createHarness(...)` results. */
|
|
85
|
-
type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
|
|
86
|
-
/** Raw tool arguments accepted by `createHarness(...)` before normalization. */
|
|
87
|
-
arguments?: unknown;
|
|
88
|
-
/** Raw tool result accepted by `createHarness(...)` before normalization. */
|
|
89
|
-
result?: unknown;
|
|
90
|
-
/** Raw tool error accepted by `createHarness(...)` before normalization. */
|
|
91
|
-
error?: unknown;
|
|
92
|
-
/** Raw tool metadata accepted by `createHarness(...)` before normalization. */
|
|
93
|
-
metadata?: Record<string, unknown>;
|
|
94
|
-
};
|
|
95
75
|
/** Lightweight span event accepted by `createHarness(...)` results. */
|
|
96
76
|
type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
|
|
97
77
|
/** Raw event attributes accepted by `createHarness(...)` before normalization. */
|
|
@@ -113,6 +93,16 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
|
|
|
113
93
|
/** Lightweight spans to normalize into the trace. */
|
|
114
94
|
spans: SimpleSpanRecord[];
|
|
115
95
|
};
|
|
96
|
+
/** Lightweight transcript input accepted by `createHarness(...)` results. */
|
|
97
|
+
type SimpleTranscriptInput = {
|
|
98
|
+
/** Ordered normalized transcript events for the application run. */
|
|
99
|
+
events: TranscriptEvent[];
|
|
100
|
+
messages?: never;
|
|
101
|
+
} | {
|
|
102
|
+
/** Strict camelCase message transport normalized into transcript events. */
|
|
103
|
+
messages: TranscriptMessageInput[];
|
|
104
|
+
events?: never;
|
|
105
|
+
};
|
|
116
106
|
/**
|
|
117
107
|
* Lightweight result shape normalized by `createHarness(...)`.
|
|
118
108
|
*
|
|
@@ -120,16 +110,15 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
|
|
|
120
110
|
* ```ts
|
|
121
111
|
* const result: SimpleHarnessResult<{ status: "approved" }> = {
|
|
122
112
|
* output: { status: "approved" },
|
|
123
|
-
*
|
|
113
|
+
* events: [
|
|
114
|
+
* { type: "message", role: "user", content: "Refund invoice inv_123" },
|
|
115
|
+
* { type: "message", role: "assistant", content: { status: "approved" } },
|
|
116
|
+
* ],
|
|
124
117
|
* usage: { totalTokens: 260 },
|
|
125
118
|
* };
|
|
126
119
|
* ```
|
|
127
120
|
*/
|
|
128
|
-
type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
129
|
-
/** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
|
|
130
|
-
messages?: NormalizedMessage[];
|
|
131
|
-
/** Lightweight tool-call records to normalize into the session. */
|
|
132
|
-
toolCalls?: SimpleToolCallRecord[];
|
|
121
|
+
type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & SimpleTranscriptInput & {
|
|
133
122
|
/** Usage summary to attach to the run. */
|
|
134
123
|
usage?: UsageSummary;
|
|
135
124
|
/** Timing summary to attach to the run. */
|
|
@@ -159,12 +148,17 @@ type CreateHarnessRunArgs<TInput> = {
|
|
|
159
148
|
/**
|
|
160
149
|
* Options for creating a lightweight custom application harness.
|
|
161
150
|
*
|
|
151
|
+
* Prefer this helper for custom harnesses. Implement `Harness` directly only
|
|
152
|
+
* when the callback already returns a full `HarnessRun` with canonical
|
|
153
|
+
* `session.events`.
|
|
154
|
+
*
|
|
162
155
|
* @example
|
|
163
156
|
* ```ts
|
|
164
157
|
* const options: CreateHarnessOptions<string, { status: "approved" }> = {
|
|
165
158
|
* name: "refund-agent",
|
|
166
159
|
* run: async ({ input }) => ({
|
|
167
160
|
* output: await classifyRefund(input),
|
|
161
|
+
* events: [{ type: "message", role: "user", content: input }],
|
|
168
162
|
* }),
|
|
169
163
|
* };
|
|
170
164
|
* ```
|
|
@@ -207,7 +201,22 @@ declare function normalizeContent(value: unknown): JsonValue;
|
|
|
207
201
|
*
|
|
208
202
|
* return {
|
|
209
203
|
* output,
|
|
210
|
-
*
|
|
204
|
+
* events: [
|
|
205
|
+
* { type: "message", role: "user", content: input },
|
|
206
|
+
* {
|
|
207
|
+
* type: "tool_call",
|
|
208
|
+
* id: "call_lookup",
|
|
209
|
+
* name: "lookupInvoice",
|
|
210
|
+
* arguments: { invoiceId: result.invoiceId },
|
|
211
|
+
* },
|
|
212
|
+
* {
|
|
213
|
+
* type: "tool_result",
|
|
214
|
+
* toolCallId: "call_lookup",
|
|
215
|
+
* name: "lookupInvoice",
|
|
216
|
+
* content: { refundable: result.refundable },
|
|
217
|
+
* },
|
|
218
|
+
* { type: "message", role: "assistant", content: output },
|
|
219
|
+
* ],
|
|
211
220
|
* usage: { provider: "openai", model: "gpt-4o-mini" },
|
|
212
221
|
* };
|
|
213
222
|
* },
|
|
@@ -226,7 +235,21 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
|
|
|
226
235
|
* ```ts
|
|
227
236
|
* const run = normalizeHarnessRun("Refund invoice inv_123", {
|
|
228
237
|
* output: { status: "approved" },
|
|
229
|
-
*
|
|
238
|
+
* events: [
|
|
239
|
+
* { type: "message", role: "user", content: "Refund invoice inv_123" },
|
|
240
|
+
* {
|
|
241
|
+
* type: "tool_call",
|
|
242
|
+
* id: "call_lookup",
|
|
243
|
+
* name: "lookupInvoice",
|
|
244
|
+
* arguments: { invoiceId: "inv_123" },
|
|
245
|
+
* },
|
|
246
|
+
* {
|
|
247
|
+
* type: "tool_result",
|
|
248
|
+
* toolCallId: "call_lookup",
|
|
249
|
+
* name: "lookupInvoice",
|
|
250
|
+
* content: { refundable: true },
|
|
251
|
+
* },
|
|
252
|
+
* ],
|
|
230
253
|
* usage: { provider: "openai", model: "gpt-4o-mini" },
|
|
231
254
|
* });
|
|
232
255
|
*
|
|
@@ -259,13 +282,6 @@ declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, opt
|
|
|
259
282
|
"gen_ai.usage.output_tokens": number | undefined;
|
|
260
283
|
"gen_ai.usage.reasoning.output_tokens": number | undefined;
|
|
261
284
|
};
|
|
262
|
-
/**
|
|
263
|
-
* Converts normalized tool-call records into trace spans.
|
|
264
|
-
*
|
|
265
|
-
* Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
|
|
266
|
-
* spans belong to a known trace so span ids stay internally unique.
|
|
267
|
-
*/
|
|
268
|
-
declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
|
|
269
285
|
/**
|
|
270
286
|
* Attaches a fallback run trace when a harness result does not already contain spans.
|
|
271
287
|
*
|
|
@@ -313,4 +329,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
313
329
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
314
330
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
315
331
|
|
|
316
|
-
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type
|
|
332
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleTraceRecord, type SimpleTranscriptInput, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
|
package/dist/harness.js
CHANGED
|
@@ -25,7 +25,6 @@ __export(harness_exports, {
|
|
|
25
25
|
createFailedHarnessRun: () => createFailedHarnessRun,
|
|
26
26
|
createGenAiUsageAttributes: () => createGenAiUsageAttributes,
|
|
27
27
|
createHarness: () => createHarness,
|
|
28
|
-
createToolCallSpans: () => createToolCallSpans,
|
|
29
28
|
ensureRunTrace: () => ensureRunTrace,
|
|
30
29
|
failedSpans: () => import_core2.failedSpans,
|
|
31
30
|
getHarnessRunFromError: () => getHarnessRunFromError,
|
|
@@ -34,6 +33,7 @@ __export(harness_exports, {
|
|
|
34
33
|
isNormalizedSession: () => isNormalizedSession,
|
|
35
34
|
latestAssistantMessageContent: () => import_core2.latestAssistantMessageContent,
|
|
36
35
|
messagesByRole: () => import_core2.messagesByRole,
|
|
36
|
+
messagesToTranscriptEvents: () => import_core2.messagesToTranscriptEvents,
|
|
37
37
|
normalizeContent: () => normalizeContent,
|
|
38
38
|
normalizeHarnessRun: () => normalizeHarnessRun,
|
|
39
39
|
normalizeMetadata: () => normalizeMetadata,
|
|
@@ -177,14 +177,24 @@ function normalizeHarnessRun(input, result, context) {
|
|
|
177
177
|
}
|
|
178
178
|
return result;
|
|
179
179
|
}
|
|
180
|
+
if ("toolCalls" in result) {
|
|
181
|
+
throw new TypeError(
|
|
182
|
+
'createHarness results do not accept top-level toolCalls. Return ordered session events with type: "tool_call" and type: "tool_result" entries instead.'
|
|
183
|
+
);
|
|
184
|
+
}
|
|
180
185
|
const output = result.output;
|
|
181
|
-
const toolCalls3 = normalizeSimpleToolCalls(result.toolCalls);
|
|
182
186
|
const usage = result.usage ?? {};
|
|
183
|
-
const
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
187
|
+
const events = normalizeTranscriptInput(result);
|
|
188
|
+
if (!events) {
|
|
189
|
+
throw new TypeError(
|
|
190
|
+
"createHarness results must include ordered events or messages. Return a full HarnessRun or a lightweight result with events/messages."
|
|
191
|
+
);
|
|
192
|
+
}
|
|
193
|
+
if (events.length === 0) {
|
|
194
|
+
throw new TypeError(
|
|
195
|
+
"createHarness results must include at least one transcript event. Return ordered events or message transport inputs that normalize into events."
|
|
196
|
+
);
|
|
197
|
+
}
|
|
188
198
|
const metadata = result.metadata ? normalizeMetadata(result.metadata) : void 0;
|
|
189
199
|
const artifacts = normalizeMergedArtifacts(
|
|
190
200
|
context?.artifacts,
|
|
@@ -193,7 +203,7 @@ function normalizeHarnessRun(input, result, context) {
|
|
|
193
203
|
const traces = normalizeSimpleTraces(result.traces);
|
|
194
204
|
return {
|
|
195
205
|
session: {
|
|
196
|
-
|
|
206
|
+
events,
|
|
197
207
|
...usage.provider ? { provider: usage.provider } : {},
|
|
198
208
|
...usage.model ? { model: usage.model } : {},
|
|
199
209
|
...metadata ? { metadata } : {}
|
|
@@ -206,12 +216,24 @@ function normalizeHarnessRun(input, result, context) {
|
|
|
206
216
|
errors: normalizeSimpleErrors(result.errors)
|
|
207
217
|
};
|
|
208
218
|
}
|
|
219
|
+
function normalizeTranscriptInput(result) {
|
|
220
|
+
if ("events" in result && Array.isArray(result.events)) {
|
|
221
|
+
return result.events.map((event) => import_core.TranscriptEventSchema.parse(event));
|
|
222
|
+
}
|
|
223
|
+
if ("messages" in result && Array.isArray(result.messages)) {
|
|
224
|
+
return (0, import_core.messagesToTranscriptEvents)(result.messages).map(
|
|
225
|
+
(event) => import_core.TranscriptEventSchema.parse(event)
|
|
226
|
+
);
|
|
227
|
+
}
|
|
228
|
+
return void 0;
|
|
229
|
+
}
|
|
209
230
|
function createFailedHarnessRun(input, error, options = {}) {
|
|
210
231
|
const artifacts = options.artifacts;
|
|
211
232
|
return {
|
|
212
233
|
session: {
|
|
213
|
-
|
|
234
|
+
events: [
|
|
214
235
|
{
|
|
236
|
+
type: "message",
|
|
215
237
|
role: "user",
|
|
216
238
|
content: normalizeContent(input)
|
|
217
239
|
}
|
|
@@ -222,67 +244,6 @@ function createFailedHarnessRun(input, error, options = {}) {
|
|
|
222
244
|
errors: [serializeError(error)]
|
|
223
245
|
};
|
|
224
246
|
}
|
|
225
|
-
function createDefaultSessionMessages({
|
|
226
|
-
input,
|
|
227
|
-
output,
|
|
228
|
-
toolCalls: normalizedToolCalls
|
|
229
|
-
}) {
|
|
230
|
-
const messages = [
|
|
231
|
-
{
|
|
232
|
-
role: "user",
|
|
233
|
-
content: normalizeContent(input)
|
|
234
|
-
}
|
|
235
|
-
];
|
|
236
|
-
if (output !== void 0 || normalizedToolCalls.length > 0) {
|
|
237
|
-
messages.push({
|
|
238
|
-
role: "assistant",
|
|
239
|
-
...output !== void 0 ? { content: normalizeContent(output) } : {},
|
|
240
|
-
...normalizedToolCalls.length > 0 ? { toolCalls: normalizedToolCalls } : {}
|
|
241
|
-
});
|
|
242
|
-
}
|
|
243
|
-
return messages;
|
|
244
|
-
}
|
|
245
|
-
function normalizeSimpleToolCalls(calls) {
|
|
246
|
-
return (calls ?? []).map((call) => {
|
|
247
|
-
const {
|
|
248
|
-
arguments: rawArguments,
|
|
249
|
-
result: rawResult,
|
|
250
|
-
error: rawError,
|
|
251
|
-
metadata: rawMetadata,
|
|
252
|
-
...toolCall
|
|
253
|
-
} = call;
|
|
254
|
-
const args = normalizeToolCallArguments(rawArguments);
|
|
255
|
-
const result = toJsonValue(rawResult);
|
|
256
|
-
const error = normalizeToolCallError(rawError);
|
|
257
|
-
const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : void 0;
|
|
258
|
-
return {
|
|
259
|
-
...toolCall,
|
|
260
|
-
...args ? { arguments: args } : {},
|
|
261
|
-
...result !== void 0 ? { result } : {},
|
|
262
|
-
...error ? { error } : {},
|
|
263
|
-
...metadata ? { metadata } : {}
|
|
264
|
-
};
|
|
265
|
-
});
|
|
266
|
-
}
|
|
267
|
-
function normalizeToolCallArguments(value) {
|
|
268
|
-
if (value === void 0) {
|
|
269
|
-
return void 0;
|
|
270
|
-
}
|
|
271
|
-
const normalized = toJsonValue(value);
|
|
272
|
-
return normalized && typeof normalized === "object" && !Array.isArray(normalized) ? normalized : void 0;
|
|
273
|
-
}
|
|
274
|
-
function normalizeToolCallError(value) {
|
|
275
|
-
if (value === void 0) {
|
|
276
|
-
return void 0;
|
|
277
|
-
}
|
|
278
|
-
const serialized = serializeError(value);
|
|
279
|
-
const { message, type, ...details } = serialized;
|
|
280
|
-
return {
|
|
281
|
-
...details,
|
|
282
|
-
message: typeof message === "string" ? message : String(message),
|
|
283
|
-
...typeof type === "string" ? { type } : {}
|
|
284
|
-
};
|
|
285
|
-
}
|
|
286
247
|
function normalizeMergedArtifacts(contextArtifacts, resultArtifacts) {
|
|
287
248
|
const artifacts = {
|
|
288
249
|
...contextArtifacts ?? {},
|
|
@@ -408,32 +369,6 @@ function createGenAiUsageAttributes(usage, options = {}) {
|
|
|
408
369
|
"gen_ai.usage.reasoning.output_tokens": usage?.reasoningTokens
|
|
409
370
|
};
|
|
410
371
|
}
|
|
411
|
-
function createToolCallSpans(calls, options = {}) {
|
|
412
|
-
return calls.map((call, index) => {
|
|
413
|
-
const spanError = call.error ? normalizeSpanError(call.error) : void 0;
|
|
414
|
-
const spanId = options.spanIdPrefix ? `${options.spanIdPrefix}:${index + 1}` : call.id;
|
|
415
|
-
return {
|
|
416
|
-
...spanId ? { id: spanId } : {},
|
|
417
|
-
...options.traceId ? { traceId: options.traceId } : {},
|
|
418
|
-
...options.parentId ? { parentId: options.parentId } : {},
|
|
419
|
-
name: call.name,
|
|
420
|
-
kind: "tool",
|
|
421
|
-
...call.startedAt ? { startedAt: call.startedAt } : {},
|
|
422
|
-
...call.finishedAt ? { finishedAt: call.finishedAt } : {},
|
|
423
|
-
...call.durationMs !== void 0 ? { durationMs: call.durationMs } : {},
|
|
424
|
-
status: spanError ? "error" : "ok",
|
|
425
|
-
...spanError ? { error: spanError } : {},
|
|
426
|
-
attributes: normalizeSpanAttributes({
|
|
427
|
-
"gen_ai.operation.name": "execute_tool",
|
|
428
|
-
"gen_ai.tool.name": call.name,
|
|
429
|
-
"gen_ai.tool.type": "function",
|
|
430
|
-
...call.id ? { "gen_ai.tool.call.id": call.id } : {},
|
|
431
|
-
...call.arguments !== void 0 ? { "gen_ai.tool.call.arguments": call.arguments } : {},
|
|
432
|
-
...call.result !== void 0 ? { "gen_ai.tool.call.result": call.result } : {}
|
|
433
|
-
})
|
|
434
|
-
};
|
|
435
|
-
});
|
|
436
|
-
}
|
|
437
372
|
function ensureRunTrace(run, options) {
|
|
438
373
|
if ((0, import_core.spans)(run).length > 0) {
|
|
439
374
|
return void 0;
|
|
@@ -458,11 +393,6 @@ function ensureRunTrace(run, options) {
|
|
|
458
393
|
...createGenAiUsageAttributes(run.usage)
|
|
459
394
|
})
|
|
460
395
|
};
|
|
461
|
-
const toolSpans = createToolCallSpans((0, import_core.toolCalls)(run.session), {
|
|
462
|
-
traceId,
|
|
463
|
-
parentId: rootSpanId,
|
|
464
|
-
spanIdPrefix: `${traceId}:tool`
|
|
465
|
-
});
|
|
466
396
|
const trace = {
|
|
467
397
|
id: traceId,
|
|
468
398
|
name: options.name,
|
|
@@ -470,7 +400,7 @@ function ensureRunTrace(run, options) {
|
|
|
470
400
|
finishedAt: options.finishedAt.toISOString(),
|
|
471
401
|
durationMs,
|
|
472
402
|
...options.source ? { metadata: { source: options.source } } : {},
|
|
473
|
-
spans: [runSpan
|
|
403
|
+
spans: [runSpan]
|
|
474
404
|
};
|
|
475
405
|
run.traces = [trace];
|
|
476
406
|
return trace;
|
|
@@ -500,7 +430,7 @@ function isHarnessRun(value) {
|
|
|
500
430
|
return isNormalizedSession(candidate.session) && Boolean(candidate.usage) && typeof candidate.usage === "object" && !Array.isArray(candidate.usage) && Array.isArray(candidate.errors);
|
|
501
431
|
}
|
|
502
432
|
function isNormalizedSession(value) {
|
|
503
|
-
return
|
|
433
|
+
return import_core.NormalizedSessionSchema.safeParse(value).success;
|
|
504
434
|
}
|
|
505
435
|
function resolveHarnessRunErrors(result) {
|
|
506
436
|
if (result && typeof result === "object" && Array.isArray(result.errors)) {
|
|
@@ -527,7 +457,6 @@ function serializeError(error) {
|
|
|
527
457
|
createFailedHarnessRun,
|
|
528
458
|
createGenAiUsageAttributes,
|
|
529
459
|
createHarness,
|
|
530
|
-
createToolCallSpans,
|
|
531
460
|
ensureRunTrace,
|
|
532
461
|
failedSpans,
|
|
533
462
|
getHarnessRunFromError,
|
|
@@ -536,6 +465,7 @@ function serializeError(error) {
|
|
|
536
465
|
isNormalizedSession,
|
|
537
466
|
latestAssistantMessageContent,
|
|
538
467
|
messagesByRole,
|
|
468
|
+
messagesToTranscriptEvents,
|
|
539
469
|
normalizeContent,
|
|
540
470
|
normalizeHarnessRun,
|
|
541
471
|
normalizeMetadata,
|