vitest-evals 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -1
- package/bin/vitest-evals.js +8 -0
- package/dist/cli.d.mts +13 -0
- package/dist/cli.d.ts +13 -0
- package/dist/cli.js +83 -0
- package/dist/cli.js.map +1 -0
- package/dist/cli.mjs +55 -0
- package/dist/cli.mjs.map +1 -0
- package/dist/harness.d.mts +78 -236
- package/dist/harness.d.ts +78 -236
- package/dist/harness.js +292 -46
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +297 -46
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +5 -3
- package/dist/index.d.ts +5 -3
- package/dist/index.js +331 -52
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +335 -52
- package/dist/index.mjs.map +1 -1
- package/dist/internal/scoring.d.mts +1 -1
- package/dist/internal/scoring.d.ts +1 -1
- package/dist/internal/structuredOutputScorer.d.mts +1 -1
- package/dist/internal/structuredOutputScorer.d.ts +1 -1
- package/dist/internal/toolCallScorer.d.mts +1 -1
- package/dist/internal/toolCallScorer.d.ts +1 -1
- package/dist/internal/toolCallScorer.js +34 -12
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs +48 -12
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.d.mts +2 -1
- package/dist/judges/factualityJudge.d.ts +2 -1
- package/dist/judges/factualityJudge.js +4 -14
- package/dist/judges/factualityJudge.js.map +1 -1
- package/dist/judges/factualityJudge.mjs +18 -14
- package/dist/judges/factualityJudge.mjs.map +1 -1
- package/dist/judges/index.d.mts +1 -0
- package/dist/judges/index.d.ts +1 -0
- package/dist/judges/index.js +285 -33
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +299 -33
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.d.mts +2 -1
- package/dist/judges/judgeHarness.d.ts +2 -1
- package/dist/judges/judgeHarness.js +284 -32
- package/dist/judges/judgeHarness.js.map +1 -1
- package/dist/judges/judgeHarness.mjs +298 -32
- package/dist/judges/judgeHarness.mjs.map +1 -1
- package/dist/judges/structuredOutputJudge.d.mts +1 -0
- package/dist/judges/structuredOutputJudge.d.ts +1 -0
- package/dist/judges/toolCallJudge.d.mts +1 -0
- package/dist/judges/toolCallJudge.d.ts +1 -0
- package/dist/judges/toolCallJudge.js +34 -12
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +48 -12
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +2 -1
- package/dist/judges/types.d.ts +2 -1
- package/dist/legacy/scorers/index.js +34 -12
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs +48 -12
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js +34 -12
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs +48 -12
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy.js +40 -18
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs +54 -18
- package/dist/legacy.mjs.map +1 -1
- package/dist/replay.d.mts +1 -1
- package/dist/replay.d.ts +1 -1
- package/dist/reporter.d.mts +5 -0
- package/dist/reporter.d.ts +5 -0
- package/dist/reporter.js +30 -7
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs +44 -7
- package/dist/reporter.mjs.map +1 -1
- package/package.json +9 -1
package/README.md
CHANGED
|
@@ -33,8 +33,14 @@ workflow.
|
|
|
33
33
|
- `run(input, { metadata? })` executes the harness explicitly and returns a
|
|
34
34
|
normalized `HarnessRun`
|
|
35
35
|
- the returned `result.output` is the app-facing value you assert on directly
|
|
36
|
-
- the returned `result.session` is the canonical JSON-serializable
|
|
36
|
+
- the returned `result.session` is the canonical JSON-serializable transcript for
|
|
37
37
|
reporting, replay, tool assertions, and judges
|
|
38
|
+
- the returned `result.traces` contains JSON-serializable operation spans; the
|
|
39
|
+
first-party harnesses attach run, model, and tool spans automatically, while
|
|
40
|
+
`createHarness(...)` attaches fallback run and tool spans for custom harnesses
|
|
41
|
+
that do not return traces themselves. Span attributes include typed
|
|
42
|
+
OpenTelemetry GenAI semantic keys while still allowing provider-specific
|
|
43
|
+
metadata
|
|
38
44
|
- scenario-specific judge criteria can live in `input`; use `metadata` for
|
|
39
45
|
per-run expectations or harness configuration that are not part of the
|
|
40
46
|
scenario payload
|
|
@@ -127,6 +133,29 @@ describeEval("refund agent", { harness }, (it) => {
|
|
|
127
133
|
});
|
|
128
134
|
```
|
|
129
135
|
|
|
136
|
+
## Terminal Reporting
|
|
137
|
+
|
|
138
|
+
The terminal reporter has two eval report levels. Normal mode prints compact
|
|
139
|
+
test, score, usage, and tool-count summaries. Info mode adds per-tool summaries,
|
|
140
|
+
arguments, timing/size metadata, replay status, and final output summaries.
|
|
141
|
+
Set `VITEST_EVALS_REPORT_LEVEL=info`, or pass `--info` through the workspace
|
|
142
|
+
eval scripts, to enable it. `--verbose` and `-v` remain aliases for
|
|
143
|
+
compatibility.
|
|
144
|
+
|
|
145
|
+
Full transcripts and spans are preserved in the Vitest JSON report metadata.
|
|
146
|
+
|
|
147
|
+
## Local Report UI
|
|
148
|
+
|
|
149
|
+
The local report UI reads the same Vitest JSON artifacts and serves a React SPA
|
|
150
|
+
for drilling into runs, eval cases, harness output, sessions, tool calls,
|
|
151
|
+
scores, and trace spans.
|
|
152
|
+
|
|
153
|
+
```sh
|
|
154
|
+
pnpm exec vitest-evals serve vitest-results.json
|
|
155
|
+
pnpm exec vitest-evals serve "eval-results/*.json"
|
|
156
|
+
pnpm exec vitest-evals serve eval-results/
|
|
157
|
+
```
|
|
158
|
+
|
|
130
159
|
## GitHub Actions Reporting
|
|
131
160
|
|
|
132
161
|
Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
|
package/dist/cli.d.mts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/** Output streams used by the `vitest-evals` CLI runner. */
|
|
3
|
+
type VitestEvalsCliIo = {
|
|
4
|
+
stdout?: Pick<NodeJS.WriteStream, "write">;
|
|
5
|
+
};
|
|
6
|
+
/** Options for running the `vitest-evals` CLI. */
|
|
7
|
+
type RunVitestEvalsCliOptions = VitestEvalsCliIo & {
|
|
8
|
+
cwd?: string;
|
|
9
|
+
};
|
|
10
|
+
/** Runs the product-facing `vitest-evals` CLI. */
|
|
11
|
+
declare function runVitestEvalsCli(args?: string[], options?: RunVitestEvalsCliOptions): Promise<void>;
|
|
12
|
+
|
|
13
|
+
export { type RunVitestEvalsCliOptions, type VitestEvalsCliIo, runVitestEvalsCli };
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/** Output streams used by the `vitest-evals` CLI runner. */
|
|
3
|
+
type VitestEvalsCliIo = {
|
|
4
|
+
stdout?: Pick<NodeJS.WriteStream, "write">;
|
|
5
|
+
};
|
|
6
|
+
/** Options for running the `vitest-evals` CLI. */
|
|
7
|
+
type RunVitestEvalsCliOptions = VitestEvalsCliIo & {
|
|
8
|
+
cwd?: string;
|
|
9
|
+
};
|
|
10
|
+
/** Runs the product-facing `vitest-evals` CLI. */
|
|
11
|
+
declare function runVitestEvalsCli(args?: string[], options?: RunVitestEvalsCliOptions): Promise<void>;
|
|
12
|
+
|
|
13
|
+
export { type RunVitestEvalsCliOptions, type VitestEvalsCliIo, runVitestEvalsCli };
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
var __create = Object.create;
|
|
4
|
+
var __defProp = Object.defineProperty;
|
|
5
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
6
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
7
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
8
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
9
|
+
var __export = (target, all) => {
|
|
10
|
+
for (var name in all)
|
|
11
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
12
|
+
};
|
|
13
|
+
var __copyProps = (to, from, except, desc) => {
|
|
14
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
15
|
+
for (let key of __getOwnPropNames(from))
|
|
16
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
17
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
18
|
+
}
|
|
19
|
+
return to;
|
|
20
|
+
};
|
|
21
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
22
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
23
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
24
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
25
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
26
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
27
|
+
mod
|
|
28
|
+
));
|
|
29
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
30
|
+
|
|
31
|
+
// src/cli.ts
|
|
32
|
+
var cli_exports = {};
|
|
33
|
+
__export(cli_exports, {
|
|
34
|
+
runVitestEvalsCli: () => runVitestEvalsCli
|
|
35
|
+
});
|
|
36
|
+
module.exports = __toCommonJS(cli_exports);
|
|
37
|
+
async function runVitestEvalsCli(args = process.argv.slice(2), options = {}) {
|
|
38
|
+
const [command, ...commandArgs] = args;
|
|
39
|
+
if (!command || command === "help" || command === "--help" || command === "-h") {
|
|
40
|
+
writeLine(options.stdout, usage());
|
|
41
|
+
return;
|
|
42
|
+
}
|
|
43
|
+
switch (command) {
|
|
44
|
+
case "serve": {
|
|
45
|
+
const { runReportUiCli } = await import("@vitest-evals/report-ui");
|
|
46
|
+
await runReportUiCli(commandArgs, {
|
|
47
|
+
commandName: "vitest-evals serve",
|
|
48
|
+
cwd: options.cwd,
|
|
49
|
+
stdout: options.stdout
|
|
50
|
+
});
|
|
51
|
+
return;
|
|
52
|
+
}
|
|
53
|
+
default:
|
|
54
|
+
throw new Error(`Unknown command: ${command}
|
|
55
|
+
|
|
56
|
+
${usage()}`);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
function usage() {
|
|
60
|
+
return [
|
|
61
|
+
"Usage: vitest-evals <command>",
|
|
62
|
+
"",
|
|
63
|
+
"Commands:",
|
|
64
|
+
" serve [json | dir | glob] Serve the local report UI",
|
|
65
|
+
"",
|
|
66
|
+
"Run `vitest-evals serve --help` for report UI options."
|
|
67
|
+
].join("\n");
|
|
68
|
+
}
|
|
69
|
+
function writeLine(stdout, message) {
|
|
70
|
+
(stdout ?? process.stdout).write(`${message}
|
|
71
|
+
`);
|
|
72
|
+
}
|
|
73
|
+
if (typeof require !== "undefined" && typeof module !== "undefined" && require.main === module) {
|
|
74
|
+
runVitestEvalsCli().catch((error) => {
|
|
75
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
76
|
+
process.exitCode = 1;
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
80
|
+
0 && (module.exports = {
|
|
81
|
+
runVitestEvalsCli
|
|
82
|
+
});
|
|
83
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/** Output streams used by the `vitest-evals` CLI runner. */\nexport type VitestEvalsCliIo = {\n stdout?: Pick<NodeJS.WriteStream, \"write\">;\n};\n\n/** Options for running the `vitest-evals` CLI. */\nexport type RunVitestEvalsCliOptions = VitestEvalsCliIo & {\n cwd?: string;\n};\n\n/** Runs the product-facing `vitest-evals` CLI. */\nexport async function runVitestEvalsCli(\n args = process.argv.slice(2),\n options: RunVitestEvalsCliOptions = {},\n) {\n const [command, ...commandArgs] = args;\n\n if (\n !command ||\n command === \"help\" ||\n command === \"--help\" ||\n command === \"-h\"\n ) {\n writeLine(options.stdout, usage());\n return;\n }\n\n switch (command) {\n case \"serve\": {\n const { runReportUiCli } = await import(\"@vitest-evals/report-ui\");\n await runReportUiCli(commandArgs, {\n commandName: \"vitest-evals serve\",\n cwd: options.cwd,\n stdout: options.stdout,\n });\n return;\n }\n default:\n throw new Error(`Unknown command: ${command}\\n\\n${usage()}`);\n }\n}\n\nfunction usage() {\n return [\n \"Usage: vitest-evals <command>\",\n \"\",\n \"Commands:\",\n \" serve [json | dir | glob] Serve the local report UI\",\n \"\",\n \"Run `vitest-evals serve --help` for report UI options.\",\n ].join(\"\\n\");\n}\n\nfunction writeLine(\n stdout: Pick<NodeJS.WriteStream, \"write\"> | undefined,\n message: string,\n) {\n (stdout ?? process.stdout).write(`${message}\\n`);\n}\n\ndeclare const require: NodeJS.Require | undefined;\ndeclare const module: NodeJS.Module | undefined;\n\nif (\n typeof require !== \"undefined\" &&\n typeof module !== \"undefined\" &&\n require.main === module\n) {\n runVitestEvalsCli().catch((error) => {\n console.error(error instanceof Error ? error.message : String(error));\n process.exitCode = 1;\n });\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAaA,eAAsB,kBACpB,OAAO,QAAQ,KAAK,MAAM,CAAC,GAC3B,UAAoC,CAAC,GACrC;AACA,QAAM,CAAC,SAAS,GAAG,WAAW,IAAI;AAElC,MACE,CAAC,WACD,YAAY,UACZ,YAAY,YACZ,YAAY,MACZ;AACA,cAAU,QAAQ,QAAQ,MAAM,CAAC;AACjC;AAAA,EACF;AAEA,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,EAAE,eAAe,IAAI,MAAM,OAAO,yBAAyB;AACjE,YAAM,eAAe,aAAa;AAAA,QAChC,aAAa;AAAA,QACb,KAAK,QAAQ;AAAA,QACb,QAAQ,QAAQ;AAAA,MAClB,CAAC;AACD;AAAA,IACF;AAAA,IACA;AACE,YAAM,IAAI,MAAM,oBAAoB,OAAO;AAAA;AAAA,EAAO,MAAM,CAAC,EAAE;AAAA,EAC/D;AACF;AAEA,SAAS,QAAQ;AACf,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EAAE,KAAK,IAAI;AACb;AAEA,SAAS,UACP,QACA,SACA;AACA,GAAC,UAAU,QAAQ,QAAQ,MAAM,GAAG,OAAO;AAAA,CAAI;AACjD;AAKA,IACE,OAAO,YAAY,eACnB,OAAO,WAAW,eAClB,QAAQ,SAAS,QACjB;AACA,oBAAkB,EAAE,MAAM,CAAC,UAAU;AACnC,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,WAAW;AAAA,EACrB,CAAC;AACH;","names":[]}
|
package/dist/cli.mjs
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
3
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
4
|
+
}) : x)(function(x) {
|
|
5
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
6
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
7
|
+
});
|
|
8
|
+
|
|
9
|
+
// src/cli.ts
|
|
10
|
+
async function runVitestEvalsCli(args = process.argv.slice(2), options = {}) {
|
|
11
|
+
const [command, ...commandArgs] = args;
|
|
12
|
+
if (!command || command === "help" || command === "--help" || command === "-h") {
|
|
13
|
+
writeLine(options.stdout, usage());
|
|
14
|
+
return;
|
|
15
|
+
}
|
|
16
|
+
switch (command) {
|
|
17
|
+
case "serve": {
|
|
18
|
+
const { runReportUiCli } = await import("@vitest-evals/report-ui");
|
|
19
|
+
await runReportUiCli(commandArgs, {
|
|
20
|
+
commandName: "vitest-evals serve",
|
|
21
|
+
cwd: options.cwd,
|
|
22
|
+
stdout: options.stdout
|
|
23
|
+
});
|
|
24
|
+
return;
|
|
25
|
+
}
|
|
26
|
+
default:
|
|
27
|
+
throw new Error(`Unknown command: ${command}
|
|
28
|
+
|
|
29
|
+
${usage()}`);
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
function usage() {
|
|
33
|
+
return [
|
|
34
|
+
"Usage: vitest-evals <command>",
|
|
35
|
+
"",
|
|
36
|
+
"Commands:",
|
|
37
|
+
" serve [json | dir | glob] Serve the local report UI",
|
|
38
|
+
"",
|
|
39
|
+
"Run `vitest-evals serve --help` for report UI options."
|
|
40
|
+
].join("\n");
|
|
41
|
+
}
|
|
42
|
+
function writeLine(stdout, message) {
|
|
43
|
+
(stdout ?? process.stdout).write(`${message}
|
|
44
|
+
`);
|
|
45
|
+
}
|
|
46
|
+
if (typeof __require !== "undefined" && typeof module !== "undefined" && __require.main === module) {
|
|
47
|
+
runVitestEvalsCli().catch((error) => {
|
|
48
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
49
|
+
process.exitCode = 1;
|
|
50
|
+
});
|
|
51
|
+
}
|
|
52
|
+
export {
|
|
53
|
+
runVitestEvalsCli
|
|
54
|
+
};
|
|
55
|
+
//# sourceMappingURL=cli.mjs.map
|
package/dist/cli.mjs.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/** Output streams used by the `vitest-evals` CLI runner. */\nexport type VitestEvalsCliIo = {\n stdout?: Pick<NodeJS.WriteStream, \"write\">;\n};\n\n/** Options for running the `vitest-evals` CLI. */\nexport type RunVitestEvalsCliOptions = VitestEvalsCliIo & {\n cwd?: string;\n};\n\n/** Runs the product-facing `vitest-evals` CLI. */\nexport async function runVitestEvalsCli(\n args = process.argv.slice(2),\n options: RunVitestEvalsCliOptions = {},\n) {\n const [command, ...commandArgs] = args;\n\n if (\n !command ||\n command === \"help\" ||\n command === \"--help\" ||\n command === \"-h\"\n ) {\n writeLine(options.stdout, usage());\n return;\n }\n\n switch (command) {\n case \"serve\": {\n const { runReportUiCli } = await import(\"@vitest-evals/report-ui\");\n await runReportUiCli(commandArgs, {\n commandName: \"vitest-evals serve\",\n cwd: options.cwd,\n stdout: options.stdout,\n });\n return;\n }\n default:\n throw new Error(`Unknown command: ${command}\\n\\n${usage()}`);\n }\n}\n\nfunction usage() {\n return [\n \"Usage: vitest-evals <command>\",\n \"\",\n \"Commands:\",\n \" serve [json | dir | glob] Serve the local report UI\",\n \"\",\n \"Run `vitest-evals serve --help` for report UI options.\",\n ].join(\"\\n\");\n}\n\nfunction writeLine(\n stdout: Pick<NodeJS.WriteStream, \"write\"> | undefined,\n message: string,\n) {\n (stdout ?? process.stdout).write(`${message}\\n`);\n}\n\ndeclare const require: NodeJS.Require | undefined;\ndeclare const module: NodeJS.Module | undefined;\n\nif (\n typeof require !== \"undefined\" &&\n typeof module !== \"undefined\" &&\n require.main === module\n) {\n runVitestEvalsCli().catch((error) => {\n console.error(error instanceof Error ? error.message : String(error));\n process.exitCode = 1;\n });\n}\n"],"mappings":";;;;;;;;;AAaA,eAAsB,kBACpB,OAAO,QAAQ,KAAK,MAAM,CAAC,GAC3B,UAAoC,CAAC,GACrC;AACA,QAAM,CAAC,SAAS,GAAG,WAAW,IAAI;AAElC,MACE,CAAC,WACD,YAAY,UACZ,YAAY,YACZ,YAAY,MACZ;AACA,cAAU,QAAQ,QAAQ,MAAM,CAAC;AACjC;AAAA,EACF;AAEA,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,EAAE,eAAe,IAAI,MAAM,OAAO,yBAAyB;AACjE,YAAM,eAAe,aAAa;AAAA,QAChC,aAAa;AAAA,QACb,KAAK,QAAQ;AAAA,QACb,QAAQ,QAAQ;AAAA,MAClB,CAAC;AACD;AAAA,IACF;AAAA,IACA;AACE,YAAM,IAAI,MAAM,oBAAoB,OAAO;AAAA;AAAA,EAAO,MAAM,CAAC,EAAE;AAAA,EAC/D;AACF;AAEA,SAAS,QAAQ;AACf,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EAAE,KAAK,IAAI;AACb;AAEA,SAAS,UACP,QACA,SACA;AACA,GAAC,UAAU,QAAQ,QAAQ,MAAM,GAAG,OAAO;AAAA,CAAI;AACjD;AAKA,IACE,OAAO,cAAY,eACnB,OAAO,WAAW,eAClB,UAAQ,SAAS,QACjB;AACA,oBAAkB,EAAE,MAAM,CAAC,UAAU;AACnC,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,WAAW;AAAA,EACrB,CAAC;AACH;","names":[]}
|
package/dist/harness.d.mts
CHANGED
|
@@ -1,173 +1,35 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
import { JsonValue, HarnessRun, GenAiOperationName, ToolCallRecord, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, NormalizedMessage, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
|
|
2
|
+
export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
|
|
3
|
+
|
|
4
|
+
/** Options for converting normalized tool calls into trace spans. */
|
|
5
|
+
type CreateToolCallSpansOptions = {
|
|
6
|
+
/** Trace id to attach to each generated tool span. */
|
|
7
|
+
traceId?: string;
|
|
8
|
+
/** Parent span id to attach to each generated tool span. */
|
|
9
|
+
parentId?: string;
|
|
10
|
+
/** Prefix used to create internal span ids instead of reusing tool-call ids. */
|
|
11
|
+
spanIdPrefix?: string;
|
|
6
12
|
};
|
|
7
|
-
/**
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
* @example
|
|
11
|
-
* ```ts
|
|
12
|
-
* const call: ToolCallRecord = {
|
|
13
|
-
* name: "lookupInvoice",
|
|
14
|
-
* arguments: { invoiceId: "inv_123" },
|
|
15
|
-
* result: { refundable: true },
|
|
16
|
-
* };
|
|
17
|
-
* ```
|
|
18
|
-
*/
|
|
19
|
-
type ToolCallRecord = {
|
|
20
|
-
/** Provider or runtime tool-call id when one is available. */
|
|
21
|
-
id?: string;
|
|
22
|
-
/** Tool name as exposed to the agent or application runtime. */
|
|
13
|
+
/** Options for attaching a fallback run trace to a harness result. */
|
|
14
|
+
type EnsureRunTraceOptions = {
|
|
15
|
+
/** Human-readable run or harness name. */
|
|
23
16
|
name: string;
|
|
24
|
-
/**
|
|
25
|
-
|
|
26
|
-
/**
|
|
27
|
-
|
|
28
|
-
/**
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
/** ISO timestamp for the start of tool execution. */
|
|
35
|
-
startedAt?: string;
|
|
36
|
-
/** ISO timestamp for the end of tool execution. */
|
|
37
|
-
finishedAt?: string;
|
|
38
|
-
/** Tool execution duration in milliseconds. */
|
|
39
|
-
durationMs?: number;
|
|
40
|
-
/** Extra JSON-safe tool metadata for reporters and custom judges. */
|
|
41
|
-
metadata?: Record<string, JsonValue>;
|
|
42
|
-
};
|
|
43
|
-
/**
|
|
44
|
-
* Normalized message recorded in a harness session transcript.
|
|
45
|
-
*
|
|
46
|
-
* @example
|
|
47
|
-
* ```ts
|
|
48
|
-
* const message: NormalizedMessage = {
|
|
49
|
-
* role: "assistant",
|
|
50
|
-
* content: { status: "approved" },
|
|
51
|
-
* toolCalls: [{ name: "lookupInvoice" }],
|
|
52
|
-
* };
|
|
53
|
-
* ```
|
|
54
|
-
*/
|
|
55
|
-
type NormalizedMessage = {
|
|
56
|
-
/** Transcript role for the normalized message. */
|
|
57
|
-
role: "system" | "user" | "assistant" | "tool";
|
|
58
|
-
/** JSON-safe message content. */
|
|
59
|
-
content?: JsonValue;
|
|
60
|
-
/** Tool calls associated with this message. */
|
|
61
|
-
toolCalls?: ToolCallRecord[];
|
|
62
|
-
/** Extra JSON-safe message metadata. */
|
|
63
|
-
metadata?: Record<string, JsonValue>;
|
|
64
|
-
};
|
|
65
|
-
/**
|
|
66
|
-
* Provider usage summary attached to a normalized harness run.
|
|
67
|
-
*
|
|
68
|
-
* @example
|
|
69
|
-
* ```ts
|
|
70
|
-
* const usage: UsageSummary = {
|
|
71
|
-
* provider: "openai",
|
|
72
|
-
* model: "gpt-4o-mini",
|
|
73
|
-
* inputTokens: 212,
|
|
74
|
-
* outputTokens: 48,
|
|
75
|
-
* totalTokens: 260,
|
|
76
|
-
* };
|
|
77
|
-
* ```
|
|
78
|
-
*/
|
|
79
|
-
type UsageSummary = {
|
|
80
|
-
/** Provider that served the application run. */
|
|
81
|
-
provider?: string;
|
|
82
|
-
/** Model used for the application run. */
|
|
83
|
-
model?: string;
|
|
84
|
-
/** Input, prompt, or request tokens consumed by the run. */
|
|
85
|
-
inputTokens?: number;
|
|
86
|
-
/** Output or completion tokens produced by the run. */
|
|
87
|
-
outputTokens?: number;
|
|
88
|
-
/** Reasoning tokens reported by providers that expose them. */
|
|
89
|
-
reasoningTokens?: number;
|
|
90
|
-
/** Total token count reported by the provider or adapter. */
|
|
91
|
-
totalTokens?: number;
|
|
92
|
-
/** Count of tool calls observed during the run. */
|
|
93
|
-
toolCalls?: number;
|
|
94
|
-
/** Retry count observed during the run. */
|
|
95
|
-
retries?: number;
|
|
96
|
-
/** Provider-specific JSON-safe usage details. Cost estimates belong here. */
|
|
97
|
-
metadata?: Record<string, JsonValue>;
|
|
98
|
-
};
|
|
99
|
-
/** Timing summary attached to a normalized harness run. */
|
|
100
|
-
type TimingSummary = {
|
|
101
|
-
/** End-to-end run duration in milliseconds. */
|
|
102
|
-
totalMs?: number;
|
|
103
|
-
/** Extra JSON-safe timing metadata. */
|
|
104
|
-
metadata?: Record<string, JsonValue>;
|
|
105
|
-
};
|
|
106
|
-
/**
|
|
107
|
-
* JSON-serializable transcript produced by the system under test.
|
|
108
|
-
*
|
|
109
|
-
* @example
|
|
110
|
-
* ```ts
|
|
111
|
-
* const session: NormalizedSession = {
|
|
112
|
-
* provider: "openai",
|
|
113
|
-
* model: "gpt-4o-mini",
|
|
114
|
-
* messages: [
|
|
115
|
-
* { role: "user", content: "Refund invoice inv_123" },
|
|
116
|
-
* { role: "assistant", content: { status: "approved" } },
|
|
117
|
-
* ],
|
|
118
|
-
* };
|
|
119
|
-
* ```
|
|
120
|
-
*/
|
|
121
|
-
type NormalizedSession = {
|
|
122
|
-
/** Ordered normalized transcript messages. */
|
|
123
|
-
messages: NormalizedMessage[];
|
|
124
|
-
/** Provider that produced the session when known. */
|
|
125
|
-
provider?: string;
|
|
126
|
-
/** Model that produced the session when known. */
|
|
127
|
-
model?: string;
|
|
128
|
-
/** Extra JSON-safe session metadata. */
|
|
129
|
-
metadata?: Record<string, JsonValue>;
|
|
17
|
+
/** Wall-clock start time for the harness run. */
|
|
18
|
+
startedAt: Date;
|
|
19
|
+
/** Wall-clock finish time for the harness run. */
|
|
20
|
+
finishedAt: Date;
|
|
21
|
+
/** Optional trace id. A generated id is used when omitted. */
|
|
22
|
+
id?: string;
|
|
23
|
+
/** GenAI operation name to place on the root run span. */
|
|
24
|
+
operationName?: GenAiOperationName;
|
|
25
|
+
/** Optional JSON-safe source marker for the trace metadata. */
|
|
26
|
+
source?: string;
|
|
130
27
|
};
|
|
131
28
|
type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
|
|
132
29
|
output?: TOutput;
|
|
133
30
|
} : {
|
|
134
31
|
output: TOutput;
|
|
135
32
|
};
|
|
136
|
-
/**
|
|
137
|
-
* Normalized result returned by every harness execution.
|
|
138
|
-
*
|
|
139
|
-
* @example
|
|
140
|
-
* ```ts
|
|
141
|
-
* const run: HarnessRun<{ status: "approved" }> = {
|
|
142
|
-
* output: { status: "approved" },
|
|
143
|
-
* session: {
|
|
144
|
-
* messages: [
|
|
145
|
-
* { role: "user", content: "Refund invoice inv_123" },
|
|
146
|
-
* { role: "assistant", content: { status: "approved" } },
|
|
147
|
-
* ],
|
|
148
|
-
* },
|
|
149
|
-
* usage: { totalTokens: 260 },
|
|
150
|
-
* errors: [],
|
|
151
|
-
* };
|
|
152
|
-
* ```
|
|
153
|
-
*/
|
|
154
|
-
type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
155
|
-
/** Normalized transcript and provider/session metadata. */
|
|
156
|
-
session: NormalizedSession;
|
|
157
|
-
/** Stable provider usage units such as tokens, tools, and retries. */
|
|
158
|
-
usage: UsageSummary;
|
|
159
|
-
/** Optional timing summary for the run. */
|
|
160
|
-
timings?: TimingSummary;
|
|
161
|
-
/** JSON-safe run artifacts captured by the harness or test context. */
|
|
162
|
-
artifacts?: Record<string, JsonValue>;
|
|
163
|
-
/** Normalized errors captured during execution. */
|
|
164
|
-
errors: Array<Record<string, JsonValue>>;
|
|
165
|
-
};
|
|
166
|
-
/** Error value with an attached partial or complete normalized harness run. */
|
|
167
|
-
type HarnessRunError = Error & {
|
|
168
|
-
/** Attached normalized harness run recovered by `getHarnessRunFromError(...)`. */
|
|
169
|
-
vitestEvalsRun: HarnessRun;
|
|
170
|
-
};
|
|
171
33
|
/** Per-run metadata shape accepted by harnesses and eval tests. */
|
|
172
34
|
type HarnessMetadata = Record<string, unknown>;
|
|
173
35
|
/**
|
|
@@ -232,6 +94,27 @@ type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error
|
|
|
232
94
|
/** Raw tool metadata accepted by `createHarness(...)` before normalization. */
|
|
233
95
|
metadata?: Record<string, unknown>;
|
|
234
96
|
};
|
|
97
|
+
/** Lightweight span event accepted by `createHarness(...)` results. */
|
|
98
|
+
type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
|
|
99
|
+
/** Raw event attributes accepted by `createHarness(...)` before normalization. */
|
|
100
|
+
attributes?: Record<string, unknown>;
|
|
101
|
+
};
|
|
102
|
+
/** Lightweight span record accepted by `createHarness(...)` results. */
|
|
103
|
+
type SimpleSpanRecord = Omit<NormalizedSpan, "attributes" | "error" | "events"> & {
|
|
104
|
+
/** Raw span attributes accepted by `createHarness(...)` before normalization. */
|
|
105
|
+
attributes?: Record<string, unknown>;
|
|
106
|
+
/** Raw span error accepted by `createHarness(...)` before normalization. */
|
|
107
|
+
error?: unknown;
|
|
108
|
+
/** Raw span events accepted by `createHarness(...)` before normalization. */
|
|
109
|
+
events?: SimpleSpanEvent[];
|
|
110
|
+
};
|
|
111
|
+
/** Lightweight trace record accepted by `createHarness(...)` results. */
|
|
112
|
+
type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
|
|
113
|
+
/** Raw trace metadata accepted by `createHarness(...)` before normalization. */
|
|
114
|
+
metadata?: Record<string, unknown>;
|
|
115
|
+
/** Lightweight spans to normalize into the trace. */
|
|
116
|
+
spans: SimpleSpanRecord[];
|
|
117
|
+
};
|
|
235
118
|
/**
|
|
236
119
|
* Lightweight result shape normalized by `createHarness(...)`.
|
|
237
120
|
*
|
|
@@ -255,6 +138,8 @@ type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | und
|
|
|
255
138
|
timings?: TimingSummary;
|
|
256
139
|
/** Raw artifact values to normalize and merge into the run. */
|
|
257
140
|
artifacts?: Record<string, unknown>;
|
|
141
|
+
/** Lightweight traces and spans to normalize into the run. */
|
|
142
|
+
traces?: SimpleTraceRecord[];
|
|
258
143
|
/** Raw session metadata to normalize into the session. */
|
|
259
144
|
metadata?: Record<string, unknown>;
|
|
260
145
|
/** Raw errors to normalize into the run. */
|
|
@@ -355,87 +240,44 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
|
|
|
355
240
|
*/
|
|
356
241
|
declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
|
|
357
242
|
/**
|
|
358
|
-
*
|
|
359
|
-
*
|
|
360
|
-
* @param session - Normalized session produced by a harness run.
|
|
361
|
-
*
|
|
362
|
-
* @example
|
|
363
|
-
* ```ts
|
|
364
|
-
* const names = toolCalls(result.session).map((call) => call.name);
|
|
365
|
-
*
|
|
366
|
-
* expect(names).toEqual(["lookupInvoice", "createRefund"]);
|
|
367
|
-
* ```
|
|
368
|
-
*/
|
|
369
|
-
declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
|
|
370
|
-
/**
|
|
371
|
-
* Filters normalized session messages by role.
|
|
372
|
-
*
|
|
373
|
-
* @param session - Normalized session produced by a harness run.
|
|
374
|
-
* @param role - Message role to keep.
|
|
375
|
-
*
|
|
376
|
-
* @example
|
|
377
|
-
* ```ts
|
|
378
|
-
* const assistantText = messagesByRole(result.session, "assistant")
|
|
379
|
-
* .map((message) => message.content)
|
|
380
|
-
* .join("\n");
|
|
381
|
-
* ```
|
|
382
|
-
*/
|
|
383
|
-
declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
|
|
384
|
-
/**
|
|
385
|
-
* Returns every normalized system message from a session.
|
|
386
|
-
*
|
|
387
|
-
* @param session - Normalized session produced by a harness run.
|
|
388
|
-
*
|
|
389
|
-
* @example
|
|
390
|
-
* ```ts
|
|
391
|
-
* const systemPrompts = systemMessages(result.session);
|
|
392
|
-
* ```
|
|
393
|
-
*/
|
|
394
|
-
declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
|
|
395
|
-
/**
|
|
396
|
-
* Returns every normalized user message from a session.
|
|
243
|
+
* Builds a JSON-safe failed run for errors that happen before a harness can return.
|
|
397
244
|
*
|
|
398
|
-
* @param
|
|
399
|
-
*
|
|
400
|
-
* @
|
|
401
|
-
* ```ts
|
|
402
|
-
* const firstPrompt = userMessages(result.session)[0]?.content;
|
|
403
|
-
* ```
|
|
404
|
-
*/
|
|
405
|
-
declare function userMessages(session: NormalizedSession): NormalizedMessage[];
|
|
406
|
-
/**
|
|
407
|
-
* Returns every normalized assistant message from a session.
|
|
408
|
-
*
|
|
409
|
-
* @param session - Normalized session produced by a harness run.
|
|
410
|
-
*
|
|
411
|
-
* @example
|
|
412
|
-
* ```ts
|
|
413
|
-
* const finalAnswer = assistantMessages(result.session).at(-1)?.content;
|
|
414
|
-
* ```
|
|
245
|
+
* @param input - Original input passed to the harness.
|
|
246
|
+
* @param error - Error thrown by setup or execution.
|
|
247
|
+
* @param options - Optional artifacts to preserve on the failed run.
|
|
415
248
|
*/
|
|
416
|
-
declare function
|
|
249
|
+
declare function createFailedHarnessRun(input: unknown, error: unknown, options?: {
|
|
250
|
+
artifacts?: Record<string, JsonValue>;
|
|
251
|
+
}): HarnessRun;
|
|
252
|
+
/** Normalizes arbitrary span errors while preserving object-shaped messages. */
|
|
253
|
+
declare function normalizeSpanError(error: unknown): NormalizedSpan["error"] | undefined;
|
|
254
|
+
/** Normalizes raw span attributes into the JSON-safe span attribute shape. */
|
|
255
|
+
declare function normalizeSpanAttributes(attributes: Record<string, unknown>): NormalizedSpanAttributes | undefined;
|
|
256
|
+
/** Builds common OpenTelemetry GenAI usage attributes from a usage summary. */
|
|
257
|
+
declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, options?: {
|
|
258
|
+
provider?: string;
|
|
259
|
+
}): {
|
|
260
|
+
"gen_ai.provider.name": string | undefined;
|
|
261
|
+
"gen_ai.request.model": string | undefined;
|
|
262
|
+
"gen_ai.response.model": string | undefined;
|
|
263
|
+
"gen_ai.usage.input_tokens": number | undefined;
|
|
264
|
+
"gen_ai.usage.output_tokens": number | undefined;
|
|
265
|
+
"gen_ai.usage.reasoning.output_tokens": number | undefined;
|
|
266
|
+
};
|
|
417
267
|
/**
|
|
418
|
-
*
|
|
419
|
-
*
|
|
420
|
-
* @param session - Normalized session produced by a harness run.
|
|
268
|
+
* Converts normalized tool-call records into trace spans.
|
|
421
269
|
*
|
|
422
|
-
*
|
|
423
|
-
*
|
|
424
|
-
* const finalAnswer = latestAssistantMessageContent(result.session);
|
|
425
|
-
* ```
|
|
270
|
+
* Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
|
|
271
|
+
* spans belong to a known trace so span ids stay internally unique.
|
|
426
272
|
*/
|
|
427
|
-
declare function
|
|
273
|
+
declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
|
|
428
274
|
/**
|
|
429
|
-
*
|
|
430
|
-
*
|
|
431
|
-
* @param session - Normalized session produced by a harness run.
|
|
275
|
+
* Attaches a fallback run trace when a harness result does not already contain spans.
|
|
432
276
|
*
|
|
433
|
-
*
|
|
434
|
-
*
|
|
435
|
-
* const toolOutputs = toolMessages(result.session).map((message) => message.content);
|
|
436
|
-
* ```
|
|
277
|
+
* This keeps custom harnesses inspectable while first-party harness packages
|
|
278
|
+
* remain free to attach richer native traces.
|
|
437
279
|
*/
|
|
438
|
-
declare function
|
|
280
|
+
declare function ensureRunTrace(run: HarnessRun, options: EnsureRunTraceOptions): NormalizedTrace | undefined;
|
|
439
281
|
/**
|
|
440
282
|
* Attaches a partial or complete harness run to an arbitrary thrown error.
|
|
441
283
|
*
|
|
@@ -476,4 +318,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
476
318
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
477
319
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
478
320
|
|
|
479
|
-
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type
|
|
321
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
|