agentv 4.2.0 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-UXSQQHCI.js → chunk-BEFW6WZ6.js} +53 -40
- package/dist/chunk-BEFW6WZ6.js.map +1 -0
- package/dist/{chunk-HAZJO7OY.js → chunk-LTALLYDW.js} +4 -4
- package/dist/{chunk-HAZJO7OY.js.map → chunk-LTALLYDW.js.map} +1 -1
- package/dist/{chunk-XLM3RNN7.js → chunk-URQXFJEB.js} +57 -58
- package/dist/{chunk-XLM3RNN7.js.map → chunk-URQXFJEB.js.map} +1 -1
- package/dist/cli.js +3 -3
- package/dist/{dist-VVXR6TYM.js → dist-GQ2YNG7B.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-NVNOLL2H.js → interactive-UZBC7V4B.js} +3 -3
- package/dist/studio/assets/{index-D8LVkz9x.js → index-CDGReinH.js} +1 -1
- package/dist/studio/assets/{index-Cir5Hc8S.js → index-DofvSOmX.js} +1 -1
- package/dist/studio/index.html +1 -1
- package/package.json +1 -1
- package/dist/chunk-UXSQQHCI.js.map +0 -1
- package/dist/{dist-VVXR6TYM.js.map → dist-GQ2YNG7B.js.map} +0 -0
- package/dist/{interactive-NVNOLL2H.js.map → interactive-UZBC7V4B.js.map} +0 -0
|
@@ -24,7 +24,7 @@ import {
|
|
|
24
24
|
validateFileReferences,
|
|
25
25
|
validateTargetsFile,
|
|
26
26
|
writeArtifactsFromResults
|
|
27
|
-
} from "./chunk-HAZJO7OY.js";
|
|
27
|
+
} from "./chunk-LTALLYDW.js";
|
|
28
28
|
import {
|
|
29
29
|
DEFAULT_CATEGORY,
|
|
30
30
|
createBuiltinRegistry,
|
|
@@ -43,7 +43,7 @@ import {
|
|
|
43
43
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
44
44
|
transpileEvalYamlFile,
|
|
45
45
|
trimBaselineResult
|
|
46
|
-
} from "./chunk-XLM3RNN7.js";
|
|
46
|
+
} from "./chunk-URQXFJEB.js";
|
|
47
47
|
import {
|
|
48
48
|
__commonJS,
|
|
49
49
|
__esm,
|
|
@@ -4217,7 +4217,7 @@ var evalRunCommand = command({
|
|
|
4217
4217
|
},
|
|
4218
4218
|
handler: async (args) => {
|
|
4219
4219
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4220
|
-
const { launchInteractiveWizard } = await import("./interactive-NVNOLL2H.js");
|
|
4220
|
+
const { launchInteractiveWizard } = await import("./interactive-UZBC7V4B.js");
|
|
4221
4221
|
await launchInteractiveWizard();
|
|
4222
4222
|
return;
|
|
4223
4223
|
}
|
|
@@ -4441,27 +4441,15 @@ var evalBenchCommand = command({
|
|
|
4441
4441
|
type: string,
|
|
4442
4442
|
displayName: "export-dir",
|
|
4443
4443
|
description: "Export directory from pipeline input/grade"
|
|
4444
|
-
}),
|
|
4445
|
-
llmScores: option({
|
|
4446
|
-
type: optional(string),
|
|
4447
|
-
long: "llm-scores",
|
|
4448
|
-
description: "Path to LLM scores JSON file (reads from stdin if omitted)"
|
|
4449
4444
|
})
|
|
4450
4445
|
},
|
|
4451
|
-
handler: async ({ exportDir, llmScores: llmScoresPath }) => {
|
|
4446
|
+
handler: async ({ exportDir }) => {
|
|
4452
4447
|
const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
|
|
4453
4448
|
const testIds = manifest.test_ids;
|
|
4454
4449
|
const targetName = manifest.target?.name ?? "unknown";
|
|
4455
4450
|
const evalSet = manifest.dataset ?? "";
|
|
4456
4451
|
const experiment = manifest.experiment;
|
|
4457
4452
|
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4458
|
-
let stdinData;
|
|
4459
|
-
if (llmScoresPath) {
|
|
4460
|
-
stdinData = await readFile(llmScoresPath, "utf8");
|
|
4461
|
-
} else {
|
|
4462
|
-
stdinData = await readStdin();
|
|
4463
|
-
}
|
|
4464
|
-
const llmScores = stdinData ? JSON.parse(stdinData) : {};
|
|
4465
4453
|
const indexLines = [];
|
|
4466
4454
|
const allPassRates = [];
|
|
4467
4455
|
for (const testId of testIds) {
|
|
@@ -4488,14 +4476,18 @@ var evalBenchCommand = command({
|
|
|
4488
4476
|
}
|
|
4489
4477
|
} catch {
|
|
4490
4478
|
}
|
|
4491
|
-
const testLlmScores = llmScores[testId] ?? {};
|
|
4492
4479
|
const llmGradersDir = join(testDir, "llm_graders");
|
|
4493
4480
|
try {
|
|
4494
4481
|
const graderFiles = (await readdir(llmGradersDir)).filter((f) => f.endsWith(".json"));
|
|
4495
4482
|
for (const file of graderFiles) {
|
|
4496
4483
|
const graderMeta = JSON.parse(await readFile(join(llmGradersDir, file), "utf8"));
|
|
4497
4484
|
const graderName = graderMeta.name;
|
|
4498
|
-
const llmResult = testLlmScores[graderName];
|
|
4485
|
+
const diskResultPath = join(testDir, "llm_grader_results", `${graderName}.json`);
|
|
4486
|
+
let llmResult;
|
|
4487
|
+
try {
|
|
4488
|
+
llmResult = JSON.parse(await readFile(diskResultPath, "utf8"));
|
|
4489
|
+
} catch {
|
|
4490
|
+
}
|
|
4499
4491
|
if (llmResult) {
|
|
4500
4492
|
evaluators.push({
|
|
4501
4493
|
name: graderName,
|
|
@@ -4515,7 +4507,7 @@ var evalBenchCommand = command({
|
|
|
4515
4507
|
const weightedScore = totalWeight > 0 ? evaluators.reduce((sum, e) => sum + e.score * e.weight, 0) / totalWeight : 0;
|
|
4516
4508
|
const passed = allAssertions.filter((a) => a.passed).length;
|
|
4517
4509
|
const failed = allAssertions.filter((a) => !a.passed).length;
|
|
4518
|
-
const passRate = allAssertions.length > 0 ? Math.round(passed / allAssertions.length * 1e3) / 1e3 : 0;
|
|
4510
|
+
const passRate = allAssertions.length > 0 ? Math.round(passed / allAssertions.length * 1e3) / 1e3 : weightedScore >= 0.5 ? 1 : 0;
|
|
4519
4511
|
allPassRates.push(passRate);
|
|
4520
4512
|
const grading = {
|
|
4521
4513
|
assertions: allAssertions,
|
|
@@ -4608,13 +4600,6 @@ var evalBenchCommand = command({
|
|
|
4608
4600
|
console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`);
|
|
4609
4601
|
}
|
|
4610
4602
|
});
|
|
4611
|
-
async function readStdin() {
|
|
4612
|
-
const chunks = [];
|
|
4613
|
-
for await (const chunk of process.stdin) {
|
|
4614
|
-
chunks.push(chunk);
|
|
4615
|
-
}
|
|
4616
|
-
return Buffer.concat(chunks).toString("utf8").trim();
|
|
4617
|
-
}
|
|
4618
4603
|
function computeStats(values) {
|
|
4619
4604
|
if (values.length === 0) return { mean: 0, stddev: 0 };
|
|
4620
4605
|
const mean2 = values.reduce((sum, v) => sum + v, 0) / values.length;
|
|
@@ -4681,7 +4666,10 @@ async function runCodeGraders(tasks, concurrency) {
|
|
|
4681
4666
|
);
|
|
4682
4667
|
const parsed = JSON.parse(stdout);
|
|
4683
4668
|
const score = typeof parsed.score === "number" ? parsed.score : 0;
|
|
4684
|
-
const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [
|
|
4669
|
+
const assertions = Array.isArray(parsed.assertions) && parsed.assertions.length > 0 ? parsed.assertions : [
|
|
4670
|
+
...(parsed.hits ?? []).map((h) => ({ text: h, passed: true })),
|
|
4671
|
+
...(parsed.misses ?? []).map((m) => ({ text: m, passed: false }))
|
|
4672
|
+
];
|
|
4685
4673
|
const result = {
|
|
4686
4674
|
name: graderName,
|
|
4687
4675
|
type: "code-grader",
|
|
@@ -4960,7 +4948,7 @@ async function writeJson(filePath, data) {
|
|
|
4960
4948
|
}
|
|
4961
4949
|
|
|
4962
4950
|
// src/commands/pipeline/run.ts
|
|
4963
|
-
import {
|
|
4951
|
+
import { exec } from "node:child_process";
|
|
4964
4952
|
import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
|
|
4965
4953
|
import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
|
|
4966
4954
|
import { tmpdir } from "node:os";
|
|
@@ -5009,7 +4997,7 @@ var evalRunCommand2 = command({
|
|
|
5009
4997
|
workers: option({
|
|
5010
4998
|
type: optional(number),
|
|
5011
4999
|
long: "workers",
|
|
5012
|
-
description: "Parallel workers for target invocation (default:
|
|
5000
|
+
description: "Parallel workers for target invocation (default: targets.yaml workers, then 5)"
|
|
5013
5001
|
}),
|
|
5014
5002
|
experiment: option({
|
|
5015
5003
|
type: optional(string),
|
|
@@ -5037,6 +5025,7 @@ var evalRunCommand2 = command({
|
|
|
5037
5025
|
let targetInfo = null;
|
|
5038
5026
|
let targetName = "agent";
|
|
5039
5027
|
let targetKind = "agent";
|
|
5028
|
+
let targetWorkers;
|
|
5040
5029
|
try {
|
|
5041
5030
|
const selection = await selectTarget({
|
|
5042
5031
|
testFilePath: resolvedEvalPath,
|
|
@@ -5049,6 +5038,7 @@ var evalRunCommand2 = command({
|
|
|
5049
5038
|
env: process.env
|
|
5050
5039
|
});
|
|
5051
5040
|
targetName = selection.targetName;
|
|
5041
|
+
targetWorkers = selection.resolvedTarget.workers;
|
|
5052
5042
|
if (selection.resolvedTarget.kind === "cli") {
|
|
5053
5043
|
targetKind = "cli";
|
|
5054
5044
|
const config = selection.resolvedTarget.config;
|
|
@@ -5116,8 +5106,14 @@ var evalRunCommand2 = command({
|
|
|
5116
5106
|
process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
|
|
5117
5107
|
}
|
|
5118
5108
|
const mergedEnv = { ...process.env, ...envVars };
|
|
5119
|
-
const maxWorkers = workers ??
|
|
5120
|
-
|
|
5109
|
+
const maxWorkers = workers ?? targetWorkers ?? 5;
|
|
5110
|
+
let invCompleted = 0;
|
|
5111
|
+
const invTotal = testIds.length;
|
|
5112
|
+
const writeInvProgress = () => {
|
|
5113
|
+
process.stderr.write(`\rInvoking: ${invCompleted}/${invTotal} done`);
|
|
5114
|
+
};
|
|
5115
|
+
console.log(`Invoking ${invTotal} CLI target(s) (${maxWorkers} workers)...`);
|
|
5116
|
+
writeInvProgress();
|
|
5121
5117
|
const invokeTarget = async (testId) => {
|
|
5122
5118
|
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
5123
5119
|
const testDir = join4(outDir, ...subpath);
|
|
@@ -5137,12 +5133,20 @@ var evalRunCommand2 = command({
|
|
|
5137
5133
|
rendered = rendered.replace("{PROMPT}", inputText);
|
|
5138
5134
|
const start = performance.now();
|
|
5139
5135
|
try {
|
|
5140
|
-
|
|
5141
|
-
|
|
5142
|
-
|
|
5143
|
-
|
|
5144
|
-
|
|
5145
|
-
|
|
5136
|
+
await new Promise((resolveP, rejectP) => {
|
|
5137
|
+
exec(
|
|
5138
|
+
rendered,
|
|
5139
|
+
{
|
|
5140
|
+
cwd,
|
|
5141
|
+
timeout: timeoutMs,
|
|
5142
|
+
env: mergedEnv,
|
|
5143
|
+
maxBuffer: 10 * 1024 * 1024
|
|
5144
|
+
},
|
|
5145
|
+
(error) => {
|
|
5146
|
+
if (error) rejectP(error);
|
|
5147
|
+
else resolveP();
|
|
5148
|
+
}
|
|
5149
|
+
);
|
|
5146
5150
|
});
|
|
5147
5151
|
const durationMs = Math.round(performance.now() - start);
|
|
5148
5152
|
let response;
|
|
@@ -5157,7 +5161,9 @@ var evalRunCommand2 = command({
|
|
|
5157
5161
|
total_duration_seconds: Math.round(durationMs / 10) / 100,
|
|
5158
5162
|
execution_status: "ok"
|
|
5159
5163
|
});
|
|
5160
|
-
|
|
5164
|
+
process.stderr.write(`
|
|
5165
|
+
${testId}: OK (${durationMs}ms, ${response.length} chars)
|
|
5166
|
+
`);
|
|
5161
5167
|
} catch (error) {
|
|
5162
5168
|
const durationMs = Math.round(performance.now() - start);
|
|
5163
5169
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -5168,8 +5174,14 @@ var evalRunCommand2 = command({
|
|
|
5168
5174
|
total_duration_seconds: Math.round(durationMs / 10) / 100,
|
|
5169
5175
|
execution_status: "execution_error"
|
|
5170
5176
|
});
|
|
5171
|
-
|
|
5177
|
+
process.stderr.write(
|
|
5178
|
+
`
|
|
5179
|
+
${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}
|
|
5180
|
+
`
|
|
5181
|
+
);
|
|
5172
5182
|
} finally {
|
|
5183
|
+
invCompleted++;
|
|
5184
|
+
writeInvProgress();
|
|
5173
5185
|
try {
|
|
5174
5186
|
if (existsSync3(promptFile)) unlinkSync(promptFile);
|
|
5175
5187
|
if (existsSync3(outputFile)) unlinkSync(outputFile);
|
|
@@ -5188,6 +5200,7 @@ var evalRunCommand2 = command({
|
|
|
5188
5200
|
}
|
|
5189
5201
|
}
|
|
5190
5202
|
await Promise.all(pending);
|
|
5203
|
+
process.stderr.write("\n");
|
|
5191
5204
|
} else {
|
|
5192
5205
|
console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
|
|
5193
5206
|
}
|
|
@@ -8330,4 +8343,4 @@ export {
|
|
|
8330
8343
|
preprocessArgv,
|
|
8331
8344
|
runCli
|
|
8332
8345
|
};
|
|
8333
|
-
//# sourceMappingURL=chunk-UXSQQHCI.js.map
|
|
8346
|
+
//# sourceMappingURL=chunk-BEFW6WZ6.js.map
|