agentv 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,7 +24,7 @@ import {
24
24
  validateFileReferences,
25
25
  validateTargetsFile,
26
26
  writeArtifactsFromResults
27
- } from "./chunk-HAZJO7OY.js";
27
+ } from "./chunk-LTALLYDW.js";
28
28
  import {
29
29
  DEFAULT_CATEGORY,
30
30
  createBuiltinRegistry,
@@ -43,7 +43,7 @@ import {
43
43
  toSnakeCaseDeep as toSnakeCaseDeep2,
44
44
  transpileEvalYamlFile,
45
45
  trimBaselineResult
46
- } from "./chunk-XLM3RNN7.js";
46
+ } from "./chunk-URQXFJEB.js";
47
47
  import {
48
48
  __commonJS,
49
49
  __esm,
@@ -4217,7 +4217,7 @@ var evalRunCommand = command({
4217
4217
  },
4218
4218
  handler: async (args) => {
4219
4219
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4220
- const { launchInteractiveWizard } = await import("./interactive-NVNOLL2H.js");
4220
+ const { launchInteractiveWizard } = await import("./interactive-UZBC7V4B.js");
4221
4221
  await launchInteractiveWizard();
4222
4222
  return;
4223
4223
  }
@@ -4441,27 +4441,15 @@ var evalBenchCommand = command({
4441
4441
  type: string,
4442
4442
  displayName: "export-dir",
4443
4443
  description: "Export directory from pipeline input/grade"
4444
- }),
4445
- llmScores: option({
4446
- type: optional(string),
4447
- long: "llm-scores",
4448
- description: "Path to LLM scores JSON file (reads from stdin if omitted)"
4449
4444
  })
4450
4445
  },
4451
- handler: async ({ exportDir, llmScores: llmScoresPath }) => {
4446
+ handler: async ({ exportDir }) => {
4452
4447
  const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
4453
4448
  const testIds = manifest.test_ids;
4454
4449
  const targetName = manifest.target?.name ?? "unknown";
4455
4450
  const evalSet = manifest.dataset ?? "";
4456
4451
  const experiment = manifest.experiment;
4457
4452
  const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4458
- let stdinData;
4459
- if (llmScoresPath) {
4460
- stdinData = await readFile(llmScoresPath, "utf8");
4461
- } else {
4462
- stdinData = await readStdin();
4463
- }
4464
- const llmScores = stdinData ? JSON.parse(stdinData) : {};
4465
4453
  const indexLines = [];
4466
4454
  const allPassRates = [];
4467
4455
  for (const testId of testIds) {
@@ -4488,14 +4476,18 @@ var evalBenchCommand = command({
4488
4476
  }
4489
4477
  } catch {
4490
4478
  }
4491
- const testLlmScores = llmScores[testId] ?? {};
4492
4479
  const llmGradersDir = join(testDir, "llm_graders");
4493
4480
  try {
4494
4481
  const graderFiles = (await readdir(llmGradersDir)).filter((f) => f.endsWith(".json"));
4495
4482
  for (const file of graderFiles) {
4496
4483
  const graderMeta = JSON.parse(await readFile(join(llmGradersDir, file), "utf8"));
4497
4484
  const graderName = graderMeta.name;
4498
- const llmResult = testLlmScores[graderName];
4485
+ const diskResultPath = join(testDir, "llm_grader_results", `${graderName}.json`);
4486
+ let llmResult;
4487
+ try {
4488
+ llmResult = JSON.parse(await readFile(diskResultPath, "utf8"));
4489
+ } catch {
4490
+ }
4499
4491
  if (llmResult) {
4500
4492
  evaluators.push({
4501
4493
  name: graderName,
@@ -4515,7 +4507,7 @@ var evalBenchCommand = command({
4515
4507
  const weightedScore = totalWeight > 0 ? evaluators.reduce((sum, e) => sum + e.score * e.weight, 0) / totalWeight : 0;
4516
4508
  const passed = allAssertions.filter((a) => a.passed).length;
4517
4509
  const failed = allAssertions.filter((a) => !a.passed).length;
4518
- const passRate = allAssertions.length > 0 ? Math.round(passed / allAssertions.length * 1e3) / 1e3 : 0;
4510
+ const passRate = allAssertions.length > 0 ? Math.round(passed / allAssertions.length * 1e3) / 1e3 : weightedScore >= 0.5 ? 1 : 0;
4519
4511
  allPassRates.push(passRate);
4520
4512
  const grading = {
4521
4513
  assertions: allAssertions,
@@ -4608,13 +4600,6 @@ var evalBenchCommand = command({
4608
4600
  console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`);
4609
4601
  }
4610
4602
  });
4611
- async function readStdin() {
4612
- const chunks = [];
4613
- for await (const chunk of process.stdin) {
4614
- chunks.push(chunk);
4615
- }
4616
- return Buffer.concat(chunks).toString("utf8").trim();
4617
- }
4618
4603
  function computeStats(values) {
4619
4604
  if (values.length === 0) return { mean: 0, stddev: 0 };
4620
4605
  const mean2 = values.reduce((sum, v) => sum + v, 0) / values.length;
@@ -4681,7 +4666,10 @@ async function runCodeGraders(tasks, concurrency) {
4681
4666
  );
4682
4667
  const parsed = JSON.parse(stdout);
4683
4668
  const score = typeof parsed.score === "number" ? parsed.score : 0;
4684
- const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
4669
+ const assertions = Array.isArray(parsed.assertions) && parsed.assertions.length > 0 ? parsed.assertions : [
4670
+ ...(parsed.hits ?? []).map((h) => ({ text: h, passed: true })),
4671
+ ...(parsed.misses ?? []).map((m) => ({ text: m, passed: false }))
4672
+ ];
4685
4673
  const result = {
4686
4674
  name: graderName,
4687
4675
  type: "code-grader",
@@ -4960,7 +4948,7 @@ async function writeJson(filePath, data) {
4960
4948
  }
4961
4949
 
4962
4950
  // src/commands/pipeline/run.ts
4963
- import { execSync } from "node:child_process";
4951
+ import { exec } from "node:child_process";
4964
4952
  import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
4965
4953
  import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
4966
4954
  import { tmpdir } from "node:os";
@@ -5009,7 +4997,7 @@ var evalRunCommand2 = command({
5009
4997
  workers: option({
5010
4998
  type: optional(number),
5011
4999
  long: "workers",
5012
- description: "Parallel workers for target invocation (default: all tests)"
5000
+ description: "Parallel workers for target invocation (default: targets.yaml workers, then 5)"
5013
5001
  }),
5014
5002
  experiment: option({
5015
5003
  type: optional(string),
@@ -5037,6 +5025,7 @@ var evalRunCommand2 = command({
5037
5025
  let targetInfo = null;
5038
5026
  let targetName = "agent";
5039
5027
  let targetKind = "agent";
5028
+ let targetWorkers;
5040
5029
  try {
5041
5030
  const selection = await selectTarget({
5042
5031
  testFilePath: resolvedEvalPath,
@@ -5049,6 +5038,7 @@ var evalRunCommand2 = command({
5049
5038
  env: process.env
5050
5039
  });
5051
5040
  targetName = selection.targetName;
5041
+ targetWorkers = selection.resolvedTarget.workers;
5052
5042
  if (selection.resolvedTarget.kind === "cli") {
5053
5043
  targetKind = "cli";
5054
5044
  const config = selection.resolvedTarget.config;
@@ -5116,8 +5106,14 @@ var evalRunCommand2 = command({
5116
5106
  process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
5117
5107
  }
5118
5108
  const mergedEnv = { ...process.env, ...envVars };
5119
- const maxWorkers = workers ?? testIds.length;
5120
- console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
5109
+ const maxWorkers = workers ?? targetWorkers ?? 5;
5110
+ let invCompleted = 0;
5111
+ const invTotal = testIds.length;
5112
+ const writeInvProgress = () => {
5113
+ process.stderr.write(`\rInvoking: ${invCompleted}/${invTotal} done`);
5114
+ };
5115
+ console.log(`Invoking ${invTotal} CLI target(s) (${maxWorkers} workers)...`);
5116
+ writeInvProgress();
5121
5117
  const invokeTarget = async (testId) => {
5122
5118
  const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
5123
5119
  const testDir = join4(outDir, ...subpath);
@@ -5137,12 +5133,20 @@ var evalRunCommand2 = command({
5137
5133
  rendered = rendered.replace("{PROMPT}", inputText);
5138
5134
  const start = performance.now();
5139
5135
  try {
5140
- execSync(rendered, {
5141
- cwd,
5142
- timeout: timeoutMs,
5143
- env: mergedEnv,
5144
- stdio: ["pipe", "pipe", "pipe"],
5145
- maxBuffer: 10 * 1024 * 1024
5136
+ await new Promise((resolveP, rejectP) => {
5137
+ exec(
5138
+ rendered,
5139
+ {
5140
+ cwd,
5141
+ timeout: timeoutMs,
5142
+ env: mergedEnv,
5143
+ maxBuffer: 10 * 1024 * 1024
5144
+ },
5145
+ (error) => {
5146
+ if (error) rejectP(error);
5147
+ else resolveP();
5148
+ }
5149
+ );
5146
5150
  });
5147
5151
  const durationMs = Math.round(performance.now() - start);
5148
5152
  let response;
@@ -5157,7 +5161,9 @@ var evalRunCommand2 = command({
5157
5161
  total_duration_seconds: Math.round(durationMs / 10) / 100,
5158
5162
  execution_status: "ok"
5159
5163
  });
5160
- console.log(` ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
5164
+ process.stderr.write(`
5165
+ ${testId}: OK (${durationMs}ms, ${response.length} chars)
5166
+ `);
5161
5167
  } catch (error) {
5162
5168
  const durationMs = Math.round(performance.now() - start);
5163
5169
  const message = error instanceof Error ? error.message : String(error);
@@ -5168,8 +5174,14 @@ var evalRunCommand2 = command({
5168
5174
  total_duration_seconds: Math.round(durationMs / 10) / 100,
5169
5175
  execution_status: "execution_error"
5170
5176
  });
5171
- console.error(` ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}`);
5177
+ process.stderr.write(
5178
+ `
5179
+ ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}
5180
+ `
5181
+ );
5172
5182
  } finally {
5183
+ invCompleted++;
5184
+ writeInvProgress();
5173
5185
  try {
5174
5186
  if (existsSync3(promptFile)) unlinkSync(promptFile);
5175
5187
  if (existsSync3(outputFile)) unlinkSync(outputFile);
@@ -5188,6 +5200,7 @@ var evalRunCommand2 = command({
5188
5200
  }
5189
5201
  }
5190
5202
  await Promise.all(pending);
5203
+ process.stderr.write("\n");
5191
5204
  } else {
5192
5205
  console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
5193
5206
  }
@@ -8330,4 +8343,4 @@ export {
8330
8343
  preprocessArgv,
8331
8344
  runCli
8332
8345
  };
8333
- //# sourceMappingURL=chunk-UXSQQHCI.js.map
8346
+ //# sourceMappingURL=chunk-BEFW6WZ6.js.map