agentv 3.7.0 → 3.9.0
This diff shows the content changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{chunk-7YS6YNJZ.js → chunk-GC5P5HHZ.js} +127 -46
- package/dist/chunk-GC5P5HHZ.js.map +1 -0
- package/dist/{chunk-TR6H437M.js → chunk-Q2YWV4QM.js} +21 -21
- package/dist/chunk-Q2YWV4QM.js.map +1 -0
- package/dist/{chunk-XGG64VIY.js → chunk-TXDPYXHY.js} +636 -892
- package/dist/chunk-TXDPYXHY.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-VP6AXX6B.js → dist-PIOSPBKX.js} +2 -4
- package/dist/index.js +3 -3
- package/dist/{interactive-F6XECJ33.js → interactive-3VTDK5NX.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-7YS6YNJZ.js.map +0 -1
- package/dist/chunk-TR6H437M.js.map +0 -1
- package/dist/chunk-XGG64VIY.js.map +0 -1
- package/dist/{dist-VP6AXX6B.js.map → dist-PIOSPBKX.js.map} +0 -0
- package/dist/{interactive-F6XECJ33.js.map → interactive-3VTDK5NX.js.map} +0 -0
|
@@ -16,7 +16,7 @@ import {
|
|
|
16
16
|
validateEvalFile,
|
|
17
17
|
validateFileReferences,
|
|
18
18
|
validateTargetsFile
|
|
19
|
-
} from "./chunk-7YS6YNJZ.js";
|
|
19
|
+
} from "./chunk-GC5P5HHZ.js";
|
|
20
20
|
import {
|
|
21
21
|
createBuiltinRegistry,
|
|
22
22
|
createProvider,
|
|
@@ -34,7 +34,7 @@ import {
|
|
|
34
34
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
35
35
|
transpileEvalYamlFile,
|
|
36
36
|
trimBaselineResult
|
|
37
|
-
} from "./chunk-XGG64VIY.js";
|
|
37
|
+
} from "./chunk-TXDPYXHY.js";
|
|
38
38
|
import {
|
|
39
39
|
__commonJS,
|
|
40
40
|
__esm,
|
|
@@ -3714,7 +3714,6 @@ async function getPromptEvalInput(evalPath, testId) {
|
|
|
3714
3714
|
return {
|
|
3715
3715
|
test_id: evalCase.id,
|
|
3716
3716
|
input: resolveMessages(evalCase.input, fileMap),
|
|
3717
|
-
guideline_paths: evalCase.guideline_paths,
|
|
3718
3717
|
criteria: evalCase.criteria
|
|
3719
3718
|
};
|
|
3720
3719
|
}
|
|
@@ -3739,9 +3738,8 @@ async function getPromptEvalGradingBrief(evalPath, testId) {
|
|
|
3739
3738
|
if (inputText) {
|
|
3740
3739
|
lines.push(`Input: "${inputText}"`);
|
|
3741
3740
|
}
|
|
3742
|
-
|
|
3743
|
-
|
|
3744
|
-
lines.push(`Files: ${filePaths.join(", ")}`);
|
|
3741
|
+
if (evalCase.file_paths.length > 0) {
|
|
3742
|
+
lines.push(`Files: ${evalCase.file_paths.join(", ")}`);
|
|
3745
3743
|
}
|
|
3746
3744
|
if (evalCase.reference_answer) {
|
|
3747
3745
|
lines.push(`Expected: "${evalCase.reference_answer}"`);
|
|
@@ -3973,7 +3971,6 @@ var evalAssertCommand = command({
|
|
|
3973
3971
|
criteria: "",
|
|
3974
3972
|
expected_output: [],
|
|
3975
3973
|
reference_answer: "",
|
|
3976
|
-
guideline_files: [],
|
|
3977
3974
|
input_files: [],
|
|
3978
3975
|
trace: null,
|
|
3979
3976
|
token_usage: null,
|
|
@@ -4180,11 +4177,16 @@ var evalRunCommand = command({
|
|
|
4180
4177
|
type: optional(string),
|
|
4181
4178
|
long: "model",
|
|
4182
4179
|
description: 'Override model for the grader target (e.g., "openai:gpt-5-mini")'
|
|
4180
|
+
}),
|
|
4181
|
+
outputMessages: option({
|
|
4182
|
+
type: optional(string),
|
|
4183
|
+
long: "output-messages",
|
|
4184
|
+
description: 'Number of trailing messages to include in results output (default: 1, or "all")'
|
|
4183
4185
|
})
|
|
4184
4186
|
},
|
|
4185
4187
|
handler: async (args) => {
|
|
4186
4188
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4187
|
-
const { launchInteractiveWizard } = await import("./interactive-F6XECJ33.js");
|
|
4189
|
+
const { launchInteractiveWizard } = await import("./interactive-3VTDK5NX.js");
|
|
4188
4190
|
await launchInteractiveWizard();
|
|
4189
4191
|
return;
|
|
4190
4192
|
}
|
|
@@ -4220,7 +4222,8 @@ var evalRunCommand = command({
|
|
|
4220
4222
|
benchmarkJson: args.benchmarkJson,
|
|
4221
4223
|
artifacts: args.artifacts,
|
|
4222
4224
|
graderTarget: args.graderTarget,
|
|
4223
|
-
model: args.model
|
|
4225
|
+
model: args.model,
|
|
4226
|
+
outputMessages: args.outputMessages
|
|
4224
4227
|
};
|
|
4225
4228
|
await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
|
|
4226
4229
|
}
|
|
@@ -4706,10 +4709,9 @@ function exportResults(sourceFile, content, outputDir) {
|
|
|
4706
4709
|
const outputsDir = path8.join(outputDir, "outputs");
|
|
4707
4710
|
mkdirSync2(outputsDir, { recursive: true });
|
|
4708
4711
|
for (const result of patched) {
|
|
4709
|
-
|
|
4710
|
-
if (outputText) {
|
|
4712
|
+
if (result.output && result.output.length > 0) {
|
|
4711
4713
|
const id = safeTestId(result);
|
|
4712
|
-
writeFileSync3(path8.join(outputsDir, `${id}.txt`),
|
|
4714
|
+
writeFileSync3(path8.join(outputsDir, `${id}.txt`), JSON.stringify(result.output, null, 2));
|
|
4713
4715
|
}
|
|
4714
4716
|
}
|
|
4715
4717
|
}
|
|
@@ -5021,7 +5023,6 @@ function toTraceSummary(raw) {
|
|
|
5021
5023
|
return toCamelCaseDeep(raw.trace);
|
|
5022
5024
|
}
|
|
5023
5025
|
function extractCandidate(raw) {
|
|
5024
|
-
if (raw.output_text !== void 0) return raw.output_text;
|
|
5025
5026
|
if (raw.output !== void 0)
|
|
5026
5027
|
return typeof raw.output === "string" ? raw.output : JSON.stringify(raw.output);
|
|
5027
5028
|
return "";
|
|
@@ -5033,7 +5034,6 @@ function buildEvalTest(raw) {
|
|
|
5033
5034
|
input: [],
|
|
5034
5035
|
input_segments: [],
|
|
5035
5036
|
expected_output: [],
|
|
5036
|
-
guideline_paths: [],
|
|
5037
5037
|
file_paths: [],
|
|
5038
5038
|
criteria: ""
|
|
5039
5039
|
};
|
|
@@ -5071,7 +5071,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
|
|
|
5071
5071
|
target: { kind: "custom", name: raw.target ?? "unknown", config: {} },
|
|
5072
5072
|
provider: stubProvider,
|
|
5073
5073
|
attempt: 1,
|
|
5074
|
-
promptInputs: { question: ""
|
|
5074
|
+
promptInputs: { question: "" },
|
|
5075
5075
|
now: /* @__PURE__ */ new Date(),
|
|
5076
5076
|
output: Array.isArray(output) ? output : void 0,
|
|
5077
5077
|
trace,
|
|
@@ -5325,7 +5325,7 @@ function formatResultDetail(result, index, tree) {
|
|
|
5325
5325
|
}
|
|
5326
5326
|
const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
|
|
5327
5327
|
lines.push(
|
|
5328
|
-
`${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.
|
|
5328
|
+
`${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.eval_set ? ` ${c2.dim}eval-set: ${result.eval_set}${c2.reset}` : ""}`
|
|
5329
5329
|
);
|
|
5330
5330
|
if (result.error) {
|
|
5331
5331
|
lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
|
|
@@ -5499,8 +5499,8 @@ function groupResults(results, groupBy2) {
|
|
|
5499
5499
|
case "target":
|
|
5500
5500
|
key = result.target ?? "unknown";
|
|
5501
5501
|
break;
|
|
5502
|
-
case "
|
|
5503
|
-
key = result.
|
|
5502
|
+
case "eval-set":
|
|
5503
|
+
key = result.eval_set ?? "unknown";
|
|
5504
5504
|
break;
|
|
5505
5505
|
case "test-id":
|
|
5506
5506
|
key = result.test_id ?? result.eval_id ?? "unknown";
|
|
@@ -5582,10 +5582,10 @@ var traceStatsCommand = command({
|
|
|
5582
5582
|
description: "Path to JSONL result file"
|
|
5583
5583
|
}),
|
|
5584
5584
|
groupBy: option({
|
|
5585
|
-
type: optional(oneOf(["target", "
|
|
5585
|
+
type: optional(oneOf(["target", "eval-set", "test-id"])),
|
|
5586
5586
|
long: "group-by",
|
|
5587
5587
|
short: "g",
|
|
5588
|
-
description: "Group statistics by: target,
|
|
5588
|
+
description: "Group statistics by: target, eval-set, or test-id"
|
|
5589
5589
|
}),
|
|
5590
5590
|
format: option({
|
|
5591
5591
|
type: optional(oneOf(["table", "json"])),
|
|
@@ -6276,4 +6276,4 @@ export {
|
|
|
6276
6276
|
preprocessArgv,
|
|
6277
6277
|
runCli
|
|
6278
6278
|
};
|
|
6279
|
-
//# sourceMappingURL=chunk-TR6H437M.js.map
|
|
6279
|
+
//# sourceMappingURL=chunk-Q2YWV4QM.js.map
|