vitest-evals 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -0
- package/bin/vitest-evals.js +8 -0
- package/dist/cli.d.mts +13 -0
- package/dist/cli.d.ts +13 -0
- package/dist/cli.js +83 -0
- package/dist/cli.js.map +1 -0
- package/dist/cli.mjs +55 -0
- package/dist/cli.mjs.map +1 -0
- package/dist/harness.d.mts +4 -413
- package/dist/harness.d.ts +4 -413
- package/dist/harness.js +19 -50
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +31 -48
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +5 -3
- package/dist/index.d.ts +5 -3
- package/dist/index.js +25 -56
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +37 -54
- package/dist/index.mjs.map +1 -1
- package/dist/internal/scoring.d.mts +1 -1
- package/dist/internal/scoring.d.ts +1 -1
- package/dist/internal/structuredOutputScorer.d.mts +1 -1
- package/dist/internal/structuredOutputScorer.d.ts +1 -1
- package/dist/internal/toolCallScorer.d.mts +1 -1
- package/dist/internal/toolCallScorer.d.ts +1 -1
- package/dist/internal/toolCallScorer.js +2 -0
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs +16 -0
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.d.mts +2 -1
- package/dist/judges/factualityJudge.d.ts +2 -1
- package/dist/judges/factualityJudge.js +4 -14
- package/dist/judges/factualityJudge.js.map +1 -1
- package/dist/judges/factualityJudge.mjs +18 -14
- package/dist/judges/factualityJudge.mjs.map +1 -1
- package/dist/judges/index.d.mts +1 -0
- package/dist/judges/index.d.ts +1 -0
- package/dist/judges/index.js +11 -27
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +23 -25
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.d.mts +2 -1
- package/dist/judges/judgeHarness.d.ts +2 -1
- package/dist/judges/judgeHarness.js +10 -26
- package/dist/judges/judgeHarness.js.map +1 -1
- package/dist/judges/judgeHarness.mjs +22 -24
- package/dist/judges/judgeHarness.mjs.map +1 -1
- package/dist/judges/structuredOutputJudge.d.mts +1 -0
- package/dist/judges/structuredOutputJudge.d.ts +1 -0
- package/dist/judges/toolCallJudge.d.mts +1 -0
- package/dist/judges/toolCallJudge.d.ts +1 -0
- package/dist/judges/toolCallJudge.js +2 -0
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +16 -0
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +2 -1
- package/dist/judges/types.d.ts +2 -1
- package/dist/legacy/scorers/index.js +2 -0
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs +16 -0
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js +2 -0
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs +16 -0
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy.js +7 -5
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs +21 -5
- package/dist/legacy.mjs.map +1 -1
- package/dist/replay.d.mts +1 -1
- package/dist/replay.d.ts +1 -1
- package/dist/reporter.js +4 -5
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs +18 -5
- package/dist/reporter.mjs.map +1 -1
- package/package.json +9 -1
package/README.md
CHANGED
|
@@ -144,6 +144,18 @@ compatibility.
|
|
|
144
144
|
|
|
145
145
|
Full transcripts and spans are preserved in the Vitest JSON report metadata.
|
|
146
146
|
|
|
147
|
+
## Local Report UI
|
|
148
|
+
|
|
149
|
+
The local report UI reads the same Vitest JSON artifacts and serves a React SPA
|
|
150
|
+
for drilling into runs, eval cases, harness output, sessions, tool calls,
|
|
151
|
+
scores, and trace spans.
|
|
152
|
+
|
|
153
|
+
```sh
|
|
154
|
+
pnpm exec vitest-evals serve vitest-results.json
|
|
155
|
+
pnpm exec vitest-evals serve "eval-results/*.json"
|
|
156
|
+
pnpm exec vitest-evals serve eval-results/
|
|
157
|
+
```
|
|
158
|
+
|
|
147
159
|
## GitHub Actions Reporting
|
|
148
160
|
|
|
149
161
|
Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
|
package/dist/cli.d.mts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/** Output streams used by the `vitest-evals` CLI runner. */
|
|
3
|
+
type VitestEvalsCliIo = {
|
|
4
|
+
stdout?: Pick<NodeJS.WriteStream, "write">;
|
|
5
|
+
};
|
|
6
|
+
/** Options for running the `vitest-evals` CLI. */
|
|
7
|
+
type RunVitestEvalsCliOptions = VitestEvalsCliIo & {
|
|
8
|
+
cwd?: string;
|
|
9
|
+
};
|
|
10
|
+
/** Runs the product-facing `vitest-evals` CLI. */
|
|
11
|
+
declare function runVitestEvalsCli(args?: string[], options?: RunVitestEvalsCliOptions): Promise<void>;
|
|
12
|
+
|
|
13
|
+
export { type RunVitestEvalsCliOptions, type VitestEvalsCliIo, runVitestEvalsCli };
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/** Output streams used by the `vitest-evals` CLI runner. */
|
|
3
|
+
type VitestEvalsCliIo = {
|
|
4
|
+
stdout?: Pick<NodeJS.WriteStream, "write">;
|
|
5
|
+
};
|
|
6
|
+
/** Options for running the `vitest-evals` CLI. */
|
|
7
|
+
type RunVitestEvalsCliOptions = VitestEvalsCliIo & {
|
|
8
|
+
cwd?: string;
|
|
9
|
+
};
|
|
10
|
+
/** Runs the product-facing `vitest-evals` CLI. */
|
|
11
|
+
declare function runVitestEvalsCli(args?: string[], options?: RunVitestEvalsCliOptions): Promise<void>;
|
|
12
|
+
|
|
13
|
+
export { type RunVitestEvalsCliOptions, type VitestEvalsCliIo, runVitestEvalsCli };
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
var __create = Object.create;
|
|
4
|
+
var __defProp = Object.defineProperty;
|
|
5
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
6
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
7
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
8
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
9
|
+
var __export = (target, all) => {
|
|
10
|
+
for (var name in all)
|
|
11
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
12
|
+
};
|
|
13
|
+
var __copyProps = (to, from, except, desc) => {
|
|
14
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
15
|
+
for (let key of __getOwnPropNames(from))
|
|
16
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
17
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
18
|
+
}
|
|
19
|
+
return to;
|
|
20
|
+
};
|
|
21
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
22
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
23
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
24
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
25
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
26
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
27
|
+
mod
|
|
28
|
+
));
|
|
29
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
30
|
+
|
|
31
|
+
// src/cli.ts
|
|
32
|
+
var cli_exports = {};
|
|
33
|
+
__export(cli_exports, {
|
|
34
|
+
runVitestEvalsCli: () => runVitestEvalsCli
|
|
35
|
+
});
|
|
36
|
+
module.exports = __toCommonJS(cli_exports);
|
|
37
|
+
async function runVitestEvalsCli(args = process.argv.slice(2), options = {}) {
|
|
38
|
+
const [command, ...commandArgs] = args;
|
|
39
|
+
if (!command || command === "help" || command === "--help" || command === "-h") {
|
|
40
|
+
writeLine(options.stdout, usage());
|
|
41
|
+
return;
|
|
42
|
+
}
|
|
43
|
+
switch (command) {
|
|
44
|
+
case "serve": {
|
|
45
|
+
const { runReportUiCli } = await import("@vitest-evals/report-ui");
|
|
46
|
+
await runReportUiCli(commandArgs, {
|
|
47
|
+
commandName: "vitest-evals serve",
|
|
48
|
+
cwd: options.cwd,
|
|
49
|
+
stdout: options.stdout
|
|
50
|
+
});
|
|
51
|
+
return;
|
|
52
|
+
}
|
|
53
|
+
default:
|
|
54
|
+
throw new Error(`Unknown command: ${command}
|
|
55
|
+
|
|
56
|
+
${usage()}`);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
function usage() {
|
|
60
|
+
return [
|
|
61
|
+
"Usage: vitest-evals <command>",
|
|
62
|
+
"",
|
|
63
|
+
"Commands:",
|
|
64
|
+
" serve [json | dir | glob] Serve the local report UI",
|
|
65
|
+
"",
|
|
66
|
+
"Run `vitest-evals serve --help` for report UI options."
|
|
67
|
+
].join("\n");
|
|
68
|
+
}
|
|
69
|
+
function writeLine(stdout, message) {
|
|
70
|
+
(stdout ?? process.stdout).write(`${message}
|
|
71
|
+
`);
|
|
72
|
+
}
|
|
73
|
+
if (typeof require !== "undefined" && typeof module !== "undefined" && require.main === module) {
|
|
74
|
+
runVitestEvalsCli().catch((error) => {
|
|
75
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
76
|
+
process.exitCode = 1;
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
80
|
+
0 && (module.exports = {
|
|
81
|
+
runVitestEvalsCli
|
|
82
|
+
});
|
|
83
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/** Output streams used by the `vitest-evals` CLI runner. */\nexport type VitestEvalsCliIo = {\n stdout?: Pick<NodeJS.WriteStream, \"write\">;\n};\n\n/** Options for running the `vitest-evals` CLI. */\nexport type RunVitestEvalsCliOptions = VitestEvalsCliIo & {\n cwd?: string;\n};\n\n/** Runs the product-facing `vitest-evals` CLI. */\nexport async function runVitestEvalsCli(\n args = process.argv.slice(2),\n options: RunVitestEvalsCliOptions = {},\n) {\n const [command, ...commandArgs] = args;\n\n if (\n !command ||\n command === \"help\" ||\n command === \"--help\" ||\n command === \"-h\"\n ) {\n writeLine(options.stdout, usage());\n return;\n }\n\n switch (command) {\n case \"serve\": {\n const { runReportUiCli } = await import(\"@vitest-evals/report-ui\");\n await runReportUiCli(commandArgs, {\n commandName: \"vitest-evals serve\",\n cwd: options.cwd,\n stdout: options.stdout,\n });\n return;\n }\n default:\n throw new Error(`Unknown command: ${command}\\n\\n${usage()}`);\n }\n}\n\nfunction usage() {\n return [\n \"Usage: vitest-evals <command>\",\n \"\",\n \"Commands:\",\n \" serve [json | dir | glob] Serve the local report UI\",\n \"\",\n \"Run `vitest-evals serve --help` for report UI options.\",\n ].join(\"\\n\");\n}\n\nfunction writeLine(\n stdout: Pick<NodeJS.WriteStream, \"write\"> | undefined,\n message: string,\n) {\n (stdout ?? process.stdout).write(`${message}\\n`);\n}\n\ndeclare const require: NodeJS.Require | undefined;\ndeclare const module: NodeJS.Module | undefined;\n\nif (\n typeof require !== \"undefined\" &&\n typeof module !== \"undefined\" &&\n require.main === module\n) {\n runVitestEvalsCli().catch((error) => {\n console.error(error instanceof Error ? error.message : String(error));\n process.exitCode = 1;\n });\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAaA,eAAsB,kBACpB,OAAO,QAAQ,KAAK,MAAM,CAAC,GAC3B,UAAoC,CAAC,GACrC;AACA,QAAM,CAAC,SAAS,GAAG,WAAW,IAAI;AAElC,MACE,CAAC,WACD,YAAY,UACZ,YAAY,YACZ,YAAY,MACZ;AACA,cAAU,QAAQ,QAAQ,MAAM,CAAC;AACjC;AAAA,EACF;AAEA,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,EAAE,eAAe,IAAI,MAAM,OAAO,yBAAyB;AACjE,YAAM,eAAe,aAAa;AAAA,QAChC,aAAa;AAAA,QACb,KAAK,QAAQ;AAAA,QACb,QAAQ,QAAQ;AAAA,MAClB,CAAC;AACD;AAAA,IACF;AAAA,IACA;AACE,YAAM,IAAI,MAAM,oBAAoB,OAAO;AAAA;AAAA,EAAO,MAAM,CAAC,EAAE;AAAA,EAC/D;AACF;AAEA,SAAS,QAAQ;AACf,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EAAE,KAAK,IAAI;AACb;AAEA,SAAS,UACP,QACA,SACA;AACA,GAAC,UAAU,QAAQ,QAAQ,MAAM,GAAG,OAAO;AAAA,CAAI;AACjD;AAKA,IACE,OAAO,YAAY,eACnB,OAAO,WAAW,eAClB,QAAQ,SAAS,QACjB;AACA,oBAAkB,EAAE,MAAM,CAAC,UAAU;AACnC,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,WAAW;AAAA,EACrB,CAAC;AACH;","names":[]}
|
package/dist/cli.mjs
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
3
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
4
|
+
}) : x)(function(x) {
|
|
5
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
6
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
7
|
+
});
|
|
8
|
+
|
|
9
|
+
// src/cli.ts
|
|
10
|
+
async function runVitestEvalsCli(args = process.argv.slice(2), options = {}) {
|
|
11
|
+
const [command, ...commandArgs] = args;
|
|
12
|
+
if (!command || command === "help" || command === "--help" || command === "-h") {
|
|
13
|
+
writeLine(options.stdout, usage());
|
|
14
|
+
return;
|
|
15
|
+
}
|
|
16
|
+
switch (command) {
|
|
17
|
+
case "serve": {
|
|
18
|
+
const { runReportUiCli } = await import("@vitest-evals/report-ui");
|
|
19
|
+
await runReportUiCli(commandArgs, {
|
|
20
|
+
commandName: "vitest-evals serve",
|
|
21
|
+
cwd: options.cwd,
|
|
22
|
+
stdout: options.stdout
|
|
23
|
+
});
|
|
24
|
+
return;
|
|
25
|
+
}
|
|
26
|
+
default:
|
|
27
|
+
throw new Error(`Unknown command: ${command}
|
|
28
|
+
|
|
29
|
+
${usage()}`);
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
function usage() {
|
|
33
|
+
return [
|
|
34
|
+
"Usage: vitest-evals <command>",
|
|
35
|
+
"",
|
|
36
|
+
"Commands:",
|
|
37
|
+
" serve [json | dir | glob] Serve the local report UI",
|
|
38
|
+
"",
|
|
39
|
+
"Run `vitest-evals serve --help` for report UI options."
|
|
40
|
+
].join("\n");
|
|
41
|
+
}
|
|
42
|
+
function writeLine(stdout, message) {
|
|
43
|
+
(stdout ?? process.stdout).write(`${message}
|
|
44
|
+
`);
|
|
45
|
+
}
|
|
46
|
+
if (typeof __require !== "undefined" && typeof module !== "undefined" && __require.main === module) {
|
|
47
|
+
runVitestEvalsCli().catch((error) => {
|
|
48
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
49
|
+
process.exitCode = 1;
|
|
50
|
+
});
|
|
51
|
+
}
|
|
52
|
+
export {
|
|
53
|
+
runVitestEvalsCli
|
|
54
|
+
};
|
|
55
|
+
//# sourceMappingURL=cli.mjs.map
|
package/dist/cli.mjs.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\n\n/** Output streams used by the `vitest-evals` CLI runner. */\nexport type VitestEvalsCliIo = {\n stdout?: Pick<NodeJS.WriteStream, \"write\">;\n};\n\n/** Options for running the `vitest-evals` CLI. */\nexport type RunVitestEvalsCliOptions = VitestEvalsCliIo & {\n cwd?: string;\n};\n\n/** Runs the product-facing `vitest-evals` CLI. */\nexport async function runVitestEvalsCli(\n args = process.argv.slice(2),\n options: RunVitestEvalsCliOptions = {},\n) {\n const [command, ...commandArgs] = args;\n\n if (\n !command ||\n command === \"help\" ||\n command === \"--help\" ||\n command === \"-h\"\n ) {\n writeLine(options.stdout, usage());\n return;\n }\n\n switch (command) {\n case \"serve\": {\n const { runReportUiCli } = await import(\"@vitest-evals/report-ui\");\n await runReportUiCli(commandArgs, {\n commandName: \"vitest-evals serve\",\n cwd: options.cwd,\n stdout: options.stdout,\n });\n return;\n }\n default:\n throw new Error(`Unknown command: ${command}\\n\\n${usage()}`);\n }\n}\n\nfunction usage() {\n return [\n \"Usage: vitest-evals <command>\",\n \"\",\n \"Commands:\",\n \" serve [json | dir | glob] Serve the local report UI\",\n \"\",\n \"Run `vitest-evals serve --help` for report UI options.\",\n ].join(\"\\n\");\n}\n\nfunction writeLine(\n stdout: Pick<NodeJS.WriteStream, \"write\"> | undefined,\n message: string,\n) {\n (stdout ?? process.stdout).write(`${message}\\n`);\n}\n\ndeclare const require: NodeJS.Require | undefined;\ndeclare const module: NodeJS.Module | undefined;\n\nif (\n typeof require !== \"undefined\" &&\n typeof module !== \"undefined\" &&\n require.main === module\n) {\n runVitestEvalsCli().catch((error) => {\n console.error(error instanceof Error ? error.message : String(error));\n process.exitCode = 1;\n });\n}\n"],"mappings":";;;;;;;;;AAaA,eAAsB,kBACpB,OAAO,QAAQ,KAAK,MAAM,CAAC,GAC3B,UAAoC,CAAC,GACrC;AACA,QAAM,CAAC,SAAS,GAAG,WAAW,IAAI;AAElC,MACE,CAAC,WACD,YAAY,UACZ,YAAY,YACZ,YAAY,MACZ;AACA,cAAU,QAAQ,QAAQ,MAAM,CAAC;AACjC;AAAA,EACF;AAEA,UAAQ,SAAS;AAAA,IACf,KAAK,SAAS;AACZ,YAAM,EAAE,eAAe,IAAI,MAAM,OAAO,yBAAyB;AACjE,YAAM,eAAe,aAAa;AAAA,QAChC,aAAa;AAAA,QACb,KAAK,QAAQ;AAAA,QACb,QAAQ,QAAQ;AAAA,MAClB,CAAC;AACD;AAAA,IACF;AAAA,IACA;AACE,YAAM,IAAI,MAAM,oBAAoB,OAAO;AAAA;AAAA,EAAO,MAAM,CAAC,EAAE;AAAA,EAC/D;AACF;AAEA,SAAS,QAAQ;AACf,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EAAE,KAAK,IAAI;AACb;AAEA,SAAS,UACP,QACA,SACA;AACA,GAAC,UAAU,QAAQ,QAAQ,MAAM,GAAG,OAAO;AAAA,CAAI;AACjD;AAKA,IACE,OAAO,cAAY,eACnB,OAAO,WAAW,eAClB,UAAQ,SAAS,QACjB;AACA,oBAAkB,EAAE,MAAM,CAAC,UAAU;AACnC,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,WAAW;AAAA,EACrB,CAAC;AACH;","names":[]}
|
package/dist/harness.d.mts
CHANGED
|
@@ -1,148 +1,6 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
type JsonValue = JsonPrimitive | JsonValue[] | {
|
|
5
|
-
[key: string]: JsonValue;
|
|
6
|
-
};
|
|
7
|
-
/** Well-known OpenTelemetry GenAI operation names. */
|
|
8
|
-
type GenAiOperationName = "chat" | "create_agent" | "embeddings" | "execute_tool" | "generate_content" | "invoke_agent" | "invoke_workflow" | "retrieval" | "text_completion" | (string & {});
|
|
9
|
-
/** Well-known OpenTelemetry GenAI output content types. */
|
|
10
|
-
type GenAiOutputType = "image" | "json" | "speech" | "text" | (string & {});
|
|
11
|
-
/** Well-known OpenTelemetry GenAI provider names. */
|
|
12
|
-
type GenAiProviderName = "anthropic" | "aws.bedrock" | "azure.ai.inference" | "azure.ai.openai" | "cohere" | "deepseek" | "gcp.gemini" | "gcp.gen_ai" | "gcp.vertex_ai" | "groq" | "ibm.watsonx.ai" | "mistral_ai" | "openai" | "perplexity" | "x_ai" | (string & {});
|
|
13
|
-
/** Well-known OpenTelemetry GenAI token types. */
|
|
14
|
-
type GenAiTokenType = "input" | "output" | (string & {});
|
|
15
|
-
/** Well-known OpenTelemetry GenAI tool execution types. */
|
|
16
|
-
type GenAiToolType = "datastore" | "extension" | "function" | (string & {});
|
|
17
|
-
/** Typed subset of OpenTelemetry GenAI semantic attributes. */
|
|
18
|
-
type GenAiSemanticAttributes = {
|
|
19
|
-
"gen_ai.agent.description"?: string;
|
|
20
|
-
"gen_ai.agent.id"?: string;
|
|
21
|
-
"gen_ai.agent.name"?: string;
|
|
22
|
-
"gen_ai.agent.version"?: string;
|
|
23
|
-
"gen_ai.conversation.id"?: string;
|
|
24
|
-
"gen_ai.data_source.id"?: string;
|
|
25
|
-
"gen_ai.embeddings.dimension.count"?: number;
|
|
26
|
-
"gen_ai.evaluation.explanation"?: string;
|
|
27
|
-
"gen_ai.evaluation.name"?: string;
|
|
28
|
-
"gen_ai.evaluation.score.label"?: string;
|
|
29
|
-
"gen_ai.evaluation.score.value"?: number;
|
|
30
|
-
"gen_ai.input.messages"?: JsonValue;
|
|
31
|
-
"gen_ai.operation.name"?: GenAiOperationName;
|
|
32
|
-
"gen_ai.output.messages"?: JsonValue;
|
|
33
|
-
"gen_ai.output.type"?: GenAiOutputType;
|
|
34
|
-
"gen_ai.prompt.name"?: string;
|
|
35
|
-
"gen_ai.provider.name"?: GenAiProviderName;
|
|
36
|
-
"gen_ai.request.choice.count"?: number;
|
|
37
|
-
"gen_ai.request.encoding_formats"?: string[];
|
|
38
|
-
"gen_ai.request.frequency_penalty"?: number;
|
|
39
|
-
"gen_ai.request.max_tokens"?: number;
|
|
40
|
-
"gen_ai.request.model"?: string;
|
|
41
|
-
"gen_ai.request.presence_penalty"?: number;
|
|
42
|
-
"gen_ai.request.seed"?: number;
|
|
43
|
-
"gen_ai.request.stop_sequences"?: string[];
|
|
44
|
-
"gen_ai.request.stream"?: boolean;
|
|
45
|
-
"gen_ai.request.temperature"?: number;
|
|
46
|
-
"gen_ai.request.top_k"?: number;
|
|
47
|
-
"gen_ai.request.top_p"?: number;
|
|
48
|
-
"gen_ai.response.finish_reasons"?: string[];
|
|
49
|
-
"gen_ai.response.id"?: string;
|
|
50
|
-
"gen_ai.response.model"?: string;
|
|
51
|
-
"gen_ai.response.time_to_first_chunk"?: number;
|
|
52
|
-
"gen_ai.retrieval.documents"?: JsonValue;
|
|
53
|
-
"gen_ai.retrieval.query.text"?: string;
|
|
54
|
-
"gen_ai.system_instructions"?: JsonValue;
|
|
55
|
-
"gen_ai.token.type"?: GenAiTokenType;
|
|
56
|
-
"gen_ai.tool.call.arguments"?: JsonValue;
|
|
57
|
-
"gen_ai.tool.call.id"?: string;
|
|
58
|
-
"gen_ai.tool.call.result"?: JsonValue;
|
|
59
|
-
"gen_ai.tool.definitions"?: JsonValue;
|
|
60
|
-
"gen_ai.tool.description"?: string;
|
|
61
|
-
"gen_ai.tool.name"?: string;
|
|
62
|
-
"gen_ai.tool.type"?: GenAiToolType;
|
|
63
|
-
"gen_ai.usage.cache_creation.input_tokens"?: number;
|
|
64
|
-
"gen_ai.usage.cache_read.input_tokens"?: number;
|
|
65
|
-
"gen_ai.usage.input_tokens"?: number;
|
|
66
|
-
"gen_ai.usage.output_tokens"?: number;
|
|
67
|
-
"gen_ai.usage.reasoning.output_tokens"?: number;
|
|
68
|
-
"gen_ai.workflow.name"?: string;
|
|
69
|
-
};
|
|
70
|
-
/** Attribute keys defined by the OpenTelemetry GenAI semantic conventions. */
|
|
71
|
-
type GenAiSemanticAttributeKey = keyof GenAiSemanticAttributes;
|
|
72
|
-
/** Typed OpenTelemetry semantic attributes accepted on normalized spans. */
|
|
73
|
-
type OpenTelemetrySemanticAttributes = GenAiSemanticAttributes & {
|
|
74
|
-
"error.type"?: string;
|
|
75
|
-
"server.address"?: string;
|
|
76
|
-
"server.port"?: number;
|
|
77
|
-
};
|
|
78
|
-
/** Known OpenTelemetry semantic attribute keys accepted on normalized spans. */
|
|
79
|
-
type OpenTelemetrySemanticAttributeKey = keyof OpenTelemetrySemanticAttributes;
|
|
80
|
-
/** Attribute keys accepted on normalized spans. */
|
|
81
|
-
type NormalizedSpanAttributeKey = OpenTelemetrySemanticAttributeKey | (string & {});
|
|
82
|
-
/**
|
|
83
|
-
* JSON-safe span attributes. Known OpenTelemetry GenAI keys are typed while
|
|
84
|
-
* custom provider and application keys remain allowed.
|
|
85
|
-
*/
|
|
86
|
-
type NormalizedSpanAttributes = OpenTelemetrySemanticAttributes & {
|
|
87
|
-
[key: string]: JsonValue | undefined;
|
|
88
|
-
};
|
|
89
|
-
/** Event attached to one normalized span. */
|
|
90
|
-
type NormalizedSpanEvent = {
|
|
91
|
-
/** Event name emitted by the runtime or harness. */
|
|
92
|
-
name: string;
|
|
93
|
-
/** ISO timestamp for the event when available. */
|
|
94
|
-
timestamp?: string;
|
|
95
|
-
/** JSON-safe event attributes. */
|
|
96
|
-
attributes?: NormalizedSpanAttributes;
|
|
97
|
-
};
|
|
98
|
-
/** Normalized operation span captured during a harness run. */
|
|
99
|
-
type NormalizedSpan = {
|
|
100
|
-
/** Runtime or provider span id when one is available. */
|
|
101
|
-
id?: string;
|
|
102
|
-
/** Trace id this span belongs to. */
|
|
103
|
-
traceId?: string;
|
|
104
|
-
/** Parent span id when the runtime exposes hierarchy. */
|
|
105
|
-
parentId?: string;
|
|
106
|
-
/** Human-readable operation name. */
|
|
107
|
-
name: string;
|
|
108
|
-
/** Coarse operation kind used by reporters and judges. */
|
|
109
|
-
kind?: "run" | "agent" | "model" | "tool" | "guardrail" | "handoff" | "custom";
|
|
110
|
-
/** ISO timestamp for the start of the span. */
|
|
111
|
-
startedAt?: string;
|
|
112
|
-
/** ISO timestamp for the end of the span. */
|
|
113
|
-
finishedAt?: string;
|
|
114
|
-
/** Span duration in milliseconds. */
|
|
115
|
-
durationMs?: number;
|
|
116
|
-
/** Success or failure status for the span. */
|
|
117
|
-
status?: "ok" | "error";
|
|
118
|
-
/** Normalized error when the span failed. */
|
|
119
|
-
error?: {
|
|
120
|
-
message: string;
|
|
121
|
-
type?: string;
|
|
122
|
-
[key: string]: JsonValue | undefined;
|
|
123
|
-
};
|
|
124
|
-
/** JSON-safe operation attributes. */
|
|
125
|
-
attributes?: NormalizedSpanAttributes;
|
|
126
|
-
/** Events observed inside this span. */
|
|
127
|
-
events?: NormalizedSpanEvent[];
|
|
128
|
-
};
|
|
129
|
-
/** Normalized trace captured during a harness run. */
|
|
130
|
-
type NormalizedTrace = {
|
|
131
|
-
/** Runtime or provider trace id when one is available. */
|
|
132
|
-
id?: string;
|
|
133
|
-
/** Human-readable trace or workflow name. */
|
|
134
|
-
name?: string;
|
|
135
|
-
/** ISO timestamp for the start of the trace. */
|
|
136
|
-
startedAt?: string;
|
|
137
|
-
/** ISO timestamp for the end of the trace. */
|
|
138
|
-
finishedAt?: string;
|
|
139
|
-
/** Trace duration in milliseconds. */
|
|
140
|
-
durationMs?: number;
|
|
141
|
-
/** Extra JSON-safe trace metadata. */
|
|
142
|
-
metadata?: Record<string, JsonValue>;
|
|
143
|
-
/** Spans that make up this trace. */
|
|
144
|
-
spans: NormalizedSpan[];
|
|
145
|
-
};
|
|
1
|
+
import { JsonValue, HarnessRun, GenAiOperationName, ToolCallRecord, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, NormalizedMessage, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
|
|
2
|
+
export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
|
|
3
|
+
|
|
146
4
|
/** Options for converting normalized tool calls into trace spans. */
|
|
147
5
|
type CreateToolCallSpansOptions = {
|
|
148
6
|
/** Trace id to attach to each generated tool span. */
|
|
@@ -167,172 +25,11 @@ type EnsureRunTraceOptions = {
|
|
|
167
25
|
/** Optional JSON-safe source marker for the trace metadata. */
|
|
168
26
|
source?: string;
|
|
169
27
|
};
|
|
170
|
-
/**
|
|
171
|
-
* Normalized record for one tool call observed during a harness run.
|
|
172
|
-
*
|
|
173
|
-
* @example
|
|
174
|
-
* ```ts
|
|
175
|
-
* const call: ToolCallRecord = {
|
|
176
|
-
* name: "lookupInvoice",
|
|
177
|
-
* arguments: { invoiceId: "inv_123" },
|
|
178
|
-
* result: { refundable: true },
|
|
179
|
-
* };
|
|
180
|
-
* ```
|
|
181
|
-
*/
|
|
182
|
-
type ToolCallRecord = {
|
|
183
|
-
/** Provider or runtime tool-call id when one is available. */
|
|
184
|
-
id?: string;
|
|
185
|
-
/** Tool name as exposed to the agent or application runtime. */
|
|
186
|
-
name: string;
|
|
187
|
-
/** JSON-safe tool arguments after provider/runtime normalization. */
|
|
188
|
-
arguments?: Record<string, JsonValue>;
|
|
189
|
-
/** JSON-safe tool result returned by the application tool. */
|
|
190
|
-
result?: JsonValue;
|
|
191
|
-
/** Normalized tool error when execution failed. */
|
|
192
|
-
error?: {
|
|
193
|
-
message: string;
|
|
194
|
-
type?: string;
|
|
195
|
-
[key: string]: JsonValue | undefined;
|
|
196
|
-
};
|
|
197
|
-
/** ISO timestamp for the start of tool execution. */
|
|
198
|
-
startedAt?: string;
|
|
199
|
-
/** ISO timestamp for the end of tool execution. */
|
|
200
|
-
finishedAt?: string;
|
|
201
|
-
/** Tool execution duration in milliseconds. */
|
|
202
|
-
durationMs?: number;
|
|
203
|
-
/** Extra JSON-safe tool metadata for reporters and custom judges. */
|
|
204
|
-
metadata?: Record<string, JsonValue>;
|
|
205
|
-
};
|
|
206
|
-
/**
|
|
207
|
-
* Normalized message recorded in a harness session transcript.
|
|
208
|
-
*
|
|
209
|
-
* @example
|
|
210
|
-
* ```ts
|
|
211
|
-
* const message: NormalizedMessage = {
|
|
212
|
-
* role: "assistant",
|
|
213
|
-
* content: { status: "approved" },
|
|
214
|
-
* toolCalls: [{ name: "lookupInvoice" }],
|
|
215
|
-
* };
|
|
216
|
-
* ```
|
|
217
|
-
*/
|
|
218
|
-
type NormalizedMessage = {
|
|
219
|
-
/** Transcript role for the normalized message. */
|
|
220
|
-
role: "system" | "user" | "assistant" | "tool";
|
|
221
|
-
/** JSON-safe message content. */
|
|
222
|
-
content?: JsonValue;
|
|
223
|
-
/** Tool calls associated with this message. */
|
|
224
|
-
toolCalls?: ToolCallRecord[];
|
|
225
|
-
/** Extra JSON-safe message metadata. */
|
|
226
|
-
metadata?: Record<string, JsonValue>;
|
|
227
|
-
};
|
|
228
|
-
/**
|
|
229
|
-
* Provider usage summary attached to a normalized harness run.
|
|
230
|
-
*
|
|
231
|
-
* @example
|
|
232
|
-
* ```ts
|
|
233
|
-
* const usage: UsageSummary = {
|
|
234
|
-
* provider: "openai",
|
|
235
|
-
* model: "gpt-4o-mini",
|
|
236
|
-
* inputTokens: 212,
|
|
237
|
-
* outputTokens: 48,
|
|
238
|
-
* totalTokens: 260,
|
|
239
|
-
* };
|
|
240
|
-
* ```
|
|
241
|
-
*/
|
|
242
|
-
type UsageSummary = {
|
|
243
|
-
/** Provider that served the application run. */
|
|
244
|
-
provider?: string;
|
|
245
|
-
/** Model used for the application run. */
|
|
246
|
-
model?: string;
|
|
247
|
-
/** Input, prompt, or request tokens consumed by the run. */
|
|
248
|
-
inputTokens?: number;
|
|
249
|
-
/** Output or completion tokens produced by the run. */
|
|
250
|
-
outputTokens?: number;
|
|
251
|
-
/** Reasoning tokens reported by providers that expose them. */
|
|
252
|
-
reasoningTokens?: number;
|
|
253
|
-
/** Total token count reported by the provider or adapter. */
|
|
254
|
-
totalTokens?: number;
|
|
255
|
-
/** Count of tool calls observed during the run. */
|
|
256
|
-
toolCalls?: number;
|
|
257
|
-
/** Retry count observed during the run. */
|
|
258
|
-
retries?: number;
|
|
259
|
-
/** Provider-specific JSON-safe usage details. Cost estimates belong here. */
|
|
260
|
-
metadata?: Record<string, JsonValue>;
|
|
261
|
-
};
|
|
262
|
-
/** Timing summary attached to a normalized harness run. */
|
|
263
|
-
type TimingSummary = {
|
|
264
|
-
/** End-to-end run duration in milliseconds. */
|
|
265
|
-
totalMs?: number;
|
|
266
|
-
/** Extra JSON-safe timing metadata. */
|
|
267
|
-
metadata?: Record<string, JsonValue>;
|
|
268
|
-
};
|
|
269
|
-
/**
|
|
270
|
-
* JSON-serializable transcript produced by the system under test.
|
|
271
|
-
*
|
|
272
|
-
* @example
|
|
273
|
-
* ```ts
|
|
274
|
-
* const session: NormalizedSession = {
|
|
275
|
-
* provider: "openai",
|
|
276
|
-
* model: "gpt-4o-mini",
|
|
277
|
-
* messages: [
|
|
278
|
-
* { role: "user", content: "Refund invoice inv_123" },
|
|
279
|
-
* { role: "assistant", content: { status: "approved" } },
|
|
280
|
-
* ],
|
|
281
|
-
* };
|
|
282
|
-
* ```
|
|
283
|
-
*/
|
|
284
|
-
type NormalizedSession = {
|
|
285
|
-
/** Ordered normalized transcript messages. */
|
|
286
|
-
messages: NormalizedMessage[];
|
|
287
|
-
/** Provider that produced the session when known. */
|
|
288
|
-
provider?: string;
|
|
289
|
-
/** Model that produced the session when known. */
|
|
290
|
-
model?: string;
|
|
291
|
-
/** Extra JSON-safe session metadata. */
|
|
292
|
-
metadata?: Record<string, JsonValue>;
|
|
293
|
-
};
|
|
294
28
|
type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
|
|
295
29
|
output?: TOutput;
|
|
296
30
|
} : {
|
|
297
31
|
output: TOutput;
|
|
298
32
|
};
|
|
299
|
-
/**
|
|
300
|
-
* Normalized result returned by every harness execution.
|
|
301
|
-
*
|
|
302
|
-
* @example
|
|
303
|
-
* ```ts
|
|
304
|
-
* const run: HarnessRun<{ status: "approved" }> = {
|
|
305
|
-
* output: { status: "approved" },
|
|
306
|
-
* session: {
|
|
307
|
-
* messages: [
|
|
308
|
-
* { role: "user", content: "Refund invoice inv_123" },
|
|
309
|
-
* { role: "assistant", content: { status: "approved" } },
|
|
310
|
-
* ],
|
|
311
|
-
* },
|
|
312
|
-
* usage: { totalTokens: 260 },
|
|
313
|
-
* errors: [],
|
|
314
|
-
* };
|
|
315
|
-
* ```
|
|
316
|
-
*/
|
|
317
|
-
type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
318
|
-
/** Normalized transcript and provider/session metadata. */
|
|
319
|
-
session: NormalizedSession;
|
|
320
|
-
/** Stable provider usage units such as tokens, tools, and retries. */
|
|
321
|
-
usage: UsageSummary;
|
|
322
|
-
/** Optional timing summary for the run. */
|
|
323
|
-
timings?: TimingSummary;
|
|
324
|
-
/** JSON-safe run artifacts captured by the harness or test context. */
|
|
325
|
-
artifacts?: Record<string, JsonValue>;
|
|
326
|
-
/** Normalized traces and spans captured during execution. */
|
|
327
|
-
traces?: NormalizedTrace[];
|
|
328
|
-
/** Normalized errors captured during execution. */
|
|
329
|
-
errors: Array<Record<string, JsonValue>>;
|
|
330
|
-
};
|
|
331
|
-
/** Error value with an attached partial or complete normalized harness run. */
|
|
332
|
-
type HarnessRunError = Error & {
|
|
333
|
-
/** Attached normalized harness run recovered by `getHarnessRunFromError(...)`. */
|
|
334
|
-
vitestEvalsRun: HarnessRun;
|
|
335
|
-
};
|
|
336
33
|
/** Per-run metadata shape accepted by harnesses and eval tests. */
|
|
337
34
|
type HarnessMetadata = Record<string, unknown>;
|
|
338
35
|
/**
|
|
@@ -567,19 +264,6 @@ declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, opt
|
|
|
567
264
|
"gen_ai.usage.output_tokens": number | undefined;
|
|
568
265
|
"gen_ai.usage.reasoning.output_tokens": number | undefined;
|
|
569
266
|
};
|
|
570
|
-
/**
|
|
571
|
-
* Flattens every recorded tool call from a normalized session.
|
|
572
|
-
*
|
|
573
|
-
* @param session - Normalized session produced by a harness run.
|
|
574
|
-
*
|
|
575
|
-
* @example
|
|
576
|
-
* ```ts
|
|
577
|
-
* const names = toolCalls(result.session).map((call) => call.name);
|
|
578
|
-
*
|
|
579
|
-
* expect(names).toEqual(["lookupInvoice", "createRefund"]);
|
|
580
|
-
* ```
|
|
581
|
-
*/
|
|
582
|
-
declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
|
|
583
267
|
/**
|
|
584
268
|
* Converts normalized tool-call records into trace spans.
|
|
585
269
|
*
|
|
@@ -594,99 +278,6 @@ declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateTo
|
|
|
594
278
|
* remain free to attach richer native traces.
|
|
595
279
|
*/
|
|
596
280
|
declare function ensureRunTrace(run: HarnessRun, options: EnsureRunTraceOptions): NormalizedTrace | undefined;
|
|
597
|
-
/**
|
|
598
|
-
* Flattens every recorded span from a normalized harness run.
|
|
599
|
-
*
|
|
600
|
-
* @param run - Normalized harness run produced by a harness.
|
|
601
|
-
*
|
|
602
|
-
* @example
|
|
603
|
-
* ```ts
|
|
604
|
-
* const modelSpans = spans(result).filter((span) => span.kind === "model");
|
|
605
|
-
* ```
|
|
606
|
-
*/
|
|
607
|
-
declare function spans(run: HarnessRun): NormalizedSpan[];
|
|
608
|
-
/**
|
|
609
|
-
* Returns spans of one coarse operation kind from a normalized run.
|
|
610
|
-
*
|
|
611
|
-
* @param run - Normalized harness run produced by a harness.
|
|
612
|
-
* @param kind - Span kind to keep.
|
|
613
|
-
*/
|
|
614
|
-
declare function spansByKind(run: HarnessRun, kind: NonNullable<NormalizedSpan["kind"]>): NormalizedSpan[];
|
|
615
|
-
/**
|
|
616
|
-
* Returns every span that explicitly failed or carries a normalized error.
|
|
617
|
-
*
|
|
618
|
-
* @param run - Normalized harness run produced by a harness.
|
|
619
|
-
*/
|
|
620
|
-
declare function failedSpans(run: HarnessRun): NormalizedSpan[];
|
|
621
|
-
/**
|
|
622
|
-
* Filters normalized session messages by role.
|
|
623
|
-
*
|
|
624
|
-
* @param session - Normalized session produced by a harness run.
|
|
625
|
-
* @param role - Message role to keep.
|
|
626
|
-
*
|
|
627
|
-
* @example
|
|
628
|
-
* ```ts
|
|
629
|
-
* const assistantText = messagesByRole(result.session, "assistant")
|
|
630
|
-
* .map((message) => message.content)
|
|
631
|
-
* .join("\n");
|
|
632
|
-
* ```
|
|
633
|
-
*/
|
|
634
|
-
declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
|
|
635
|
-
/**
|
|
636
|
-
* Returns every normalized system message from a session.
|
|
637
|
-
*
|
|
638
|
-
* @param session - Normalized session produced by a harness run.
|
|
639
|
-
*
|
|
640
|
-
* @example
|
|
641
|
-
* ```ts
|
|
642
|
-
* const systemPrompts = systemMessages(result.session);
|
|
643
|
-
* ```
|
|
644
|
-
*/
|
|
645
|
-
declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
|
|
646
|
-
/**
|
|
647
|
-
* Returns every normalized user message from a session.
|
|
648
|
-
*
|
|
649
|
-
* @param session - Normalized session produced by a harness run.
|
|
650
|
-
*
|
|
651
|
-
* @example
|
|
652
|
-
* ```ts
|
|
653
|
-
* const firstPrompt = userMessages(result.session)[0]?.content;
|
|
654
|
-
* ```
|
|
655
|
-
*/
|
|
656
|
-
declare function userMessages(session: NormalizedSession): NormalizedMessage[];
|
|
657
|
-
/**
|
|
658
|
-
* Returns every normalized assistant message from a session.
|
|
659
|
-
*
|
|
660
|
-
* @param session - Normalized session produced by a harness run.
|
|
661
|
-
*
|
|
662
|
-
* @example
|
|
663
|
-
* ```ts
|
|
664
|
-
* const finalAnswer = assistantMessages(result.session).at(-1)?.content;
|
|
665
|
-
* ```
|
|
666
|
-
*/
|
|
667
|
-
declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
|
|
668
|
-
/**
|
|
669
|
-
* Returns the latest assistant message content, ignoring empty text messages.
|
|
670
|
-
*
|
|
671
|
-
* @param session - Normalized session produced by a harness run.
|
|
672
|
-
*
|
|
673
|
-
* @example
|
|
674
|
-
* ```ts
|
|
675
|
-
* const finalAnswer = latestAssistantMessageContent(result.session);
|
|
676
|
-
* ```
|
|
677
|
-
*/
|
|
678
|
-
declare function latestAssistantMessageContent(session: NormalizedSession): JsonValue | undefined;
|
|
679
|
-
/**
|
|
680
|
-
* Returns every normalized tool message from a session.
|
|
681
|
-
*
|
|
682
|
-
* @param session - Normalized session produced by a harness run.
|
|
683
|
-
*
|
|
684
|
-
* @example
|
|
685
|
-
* ```ts
|
|
686
|
-
* const toolOutputs = toolMessages(result.session).map((message) => message.content);
|
|
687
|
-
* ```
|
|
688
|
-
*/
|
|
689
|
-
declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
|
|
690
281
|
/**
|
|
691
282
|
* Attaches a partial or complete harness run to an arbitrary thrown error.
|
|
692
283
|
*
|
|
@@ -727,4 +318,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
727
318
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
728
319
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
729
320
|
|
|
730
|
-
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type
|
|
321
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
|