npm - agentv - Versions diffs - 4.2.0 → 4.3.1 - Mend

agentv 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/dist/{chunk-UXSQQHCI.js → chunk-BEFW6WZ6.js} RENAMED Viewed

@@ -24,7 +24,7 @@ import {
   validateFileReferences,
   validateTargetsFile,
   writeArtifactsFromResults
-} from "./chunk-HAZJO7OY.js";
+} from "./chunk-LTALLYDW.js";
 import {
   DEFAULT_CATEGORY,
   createBuiltinRegistry,
@@ -43,7 +43,7 @@ import {
   toSnakeCaseDeep as toSnakeCaseDeep2,
   transpileEvalYamlFile,
   trimBaselineResult
-} from "./chunk-XLM3RNN7.js";
+} from "./chunk-URQXFJEB.js";
 import {
   __commonJS,
   __esm,
@@ -4217,7 +4217,7 @@ var evalRunCommand = command({
   },
   handler: async (args) => {
     if (args.evalPaths.length === 0 && process.stdin.isTTY) {
-      const { launchInteractiveWizard } = await import("./interactive-NVNOLL2H.js");
+      const { launchInteractiveWizard } = await import("./interactive-UZBC7V4B.js");
       await launchInteractiveWizard();
       return;
     }
@@ -4441,27 +4441,15 @@ var evalBenchCommand = command({
       type: string,
       displayName: "export-dir",
       description: "Export directory from pipeline input/grade"
-    }),
-    llmScores: option({
-      type: optional(string),
-      long: "llm-scores",
-      description: "Path to LLM scores JSON file (reads from stdin if omitted)"
     })
   },
-  handler: async ({ exportDir, llmScores: llmScoresPath }) => {
+  handler: async ({ exportDir }) => {
     const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
     const testIds = manifest.test_ids;
     const targetName = manifest.target?.name ?? "unknown";
     const evalSet = manifest.dataset ?? "";
     const experiment = manifest.experiment;
     const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
-    let stdinData;
-    if (llmScoresPath) {
-      stdinData = await readFile(llmScoresPath, "utf8");
-    } else {
-      stdinData = await readStdin();
-    }
-    const llmScores = stdinData ? JSON.parse(stdinData) : {};
     const indexLines = [];
     const allPassRates = [];
     for (const testId of testIds) {
@@ -4488,14 +4476,18 @@ var evalBenchCommand = command({
         }
       } catch {
       }
-      const testLlmScores = llmScores[testId] ?? {};
       const llmGradersDir = join(testDir, "llm_graders");
       try {
         const graderFiles = (await readdir(llmGradersDir)).filter((f) => f.endsWith(".json"));
         for (const file of graderFiles) {
           const graderMeta = JSON.parse(await readFile(join(llmGradersDir, file), "utf8"));
           const graderName = graderMeta.name;
-          const llmResult = testLlmScores[graderName];
+          const diskResultPath = join(testDir, "llm_grader_results", `${graderName}.json`);
+          let llmResult;
+          try {
+            llmResult = JSON.parse(await readFile(diskResultPath, "utf8"));
+          } catch {
+          }
           if (llmResult) {
             evaluators.push({
               name: graderName,
@@ -4515,7 +4507,7 @@ var evalBenchCommand = command({
       const weightedScore = totalWeight > 0 ? evaluators.reduce((sum, e) => sum + e.score * e.weight, 0) / totalWeight : 0;
       const passed = allAssertions.filter((a) => a.passed).length;
       const failed = allAssertions.filter((a) => !a.passed).length;
-      const passRate = allAssertions.length > 0 ? Math.round(passed / allAssertions.length * 1e3) / 1e3 : 0;
+      const passRate = allAssertions.length > 0 ? Math.round(passed / allAssertions.length * 1e3) / 1e3 : weightedScore >= 0.5 ? 1 : 0;
       allPassRates.push(passRate);
       const grading = {
         assertions: allAssertions,
@@ -4608,13 +4600,6 @@ var evalBenchCommand = command({
     console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`);
   }
 });
-async function readStdin() {
-  const chunks = [];
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk);
-  }
-  return Buffer.concat(chunks).toString("utf8").trim();
-}
 function computeStats(values) {
   if (values.length === 0) return { mean: 0, stddev: 0 };
   const mean2 = values.reduce((sum, v) => sum + v, 0) / values.length;
@@ -4681,7 +4666,10 @@ async function runCodeGraders(tasks, concurrency) {
       );
       const parsed = JSON.parse(stdout);
       const score = typeof parsed.score === "number" ? parsed.score : 0;
-      const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
+      const assertions = Array.isArray(parsed.assertions) && parsed.assertions.length > 0 ? parsed.assertions : [
+        ...(parsed.hits ?? []).map((h) => ({ text: h, passed: true })),
+        ...(parsed.misses ?? []).map((m) => ({ text: m, passed: false }))
+      ];
       const result = {
         name: graderName,
         type: "code-grader",
@@ -4960,7 +4948,7 @@ async function writeJson(filePath, data) {
 }
 // src/commands/pipeline/run.ts
-import { execSync } from "node:child_process";
+import { exec } from "node:child_process";
 import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
 import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
 import { tmpdir } from "node:os";
@@ -5009,7 +4997,7 @@ var evalRunCommand2 = command({
     workers: option({
       type: optional(number),
       long: "workers",
-      description: "Parallel workers for target invocation (default: all tests)"
+      description: "Parallel workers for target invocation (default: targets.yaml workers, then 5)"
     }),
     experiment: option({
       type: optional(string),
@@ -5037,6 +5025,7 @@ var evalRunCommand2 = command({
     let targetInfo = null;
     let targetName = "agent";
     let targetKind = "agent";
+    let targetWorkers;
     try {
       const selection = await selectTarget({
         testFilePath: resolvedEvalPath,
@@ -5049,6 +5038,7 @@ var evalRunCommand2 = command({
         env: process.env
       });
       targetName = selection.targetName;
+      targetWorkers = selection.resolvedTarget.workers;
       if (selection.resolvedTarget.kind === "cli") {
         targetKind = "cli";
         const config = selection.resolvedTarget.config;
@@ -5116,8 +5106,14 @@ var evalRunCommand2 = command({
         process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
       }
       const mergedEnv = { ...process.env, ...envVars };
-      const maxWorkers = workers ?? testIds.length;
-      console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
+      const maxWorkers = workers ?? targetWorkers ?? 5;
+      let invCompleted = 0;
+      const invTotal = testIds.length;
+      const writeInvProgress = () => {
+        process.stderr.write(`\rInvoking: ${invCompleted}/${invTotal} done`);
+      };
+      console.log(`Invoking ${invTotal} CLI target(s) (${maxWorkers} workers)...`);
+      writeInvProgress();
       const invokeTarget = async (testId) => {
         const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
         const testDir = join4(outDir, ...subpath);
@@ -5137,12 +5133,20 @@ var evalRunCommand2 = command({
         rendered = rendered.replace("{PROMPT}", inputText);
         const start = performance.now();
         try {
-          execSync(rendered, {
-            cwd,
-            timeout: timeoutMs,
-            env: mergedEnv,
-            stdio: ["pipe", "pipe", "pipe"],
-            maxBuffer: 10 * 1024 * 1024
+          await new Promise((resolveP, rejectP) => {
+            exec(
+              rendered,
+              {
+                cwd,
+                timeout: timeoutMs,
+                env: mergedEnv,
+                maxBuffer: 10 * 1024 * 1024
+              },
+              (error) => {
+                if (error) rejectP(error);
+                else resolveP();
+              }
+            );
           });
           const durationMs = Math.round(performance.now() - start);
           let response;
@@ -5157,7 +5161,9 @@ var evalRunCommand2 = command({
             total_duration_seconds: Math.round(durationMs / 10) / 100,
             execution_status: "ok"
           });
-          console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
+          process.stderr.write(`
+  ${testId}: OK (${durationMs}ms, ${response.length} chars)
+`);
         } catch (error) {
           const durationMs = Math.round(performance.now() - start);
           const message = error instanceof Error ? error.message : String(error);
@@ -5168,8 +5174,14 @@ var evalRunCommand2 = command({
             total_duration_seconds: Math.round(durationMs / 10) / 100,
             execution_status: "execution_error"
           });
-          console.error(`  ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}`);
+          process.stderr.write(
+            `
+  ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}
+`
+          );
         } finally {
+          invCompleted++;
+          writeInvProgress();
           try {
             if (existsSync3(promptFile)) unlinkSync(promptFile);
             if (existsSync3(outputFile)) unlinkSync(outputFile);
@@ -5188,6 +5200,7 @@ var evalRunCommand2 = command({
         }
       }
       await Promise.all(pending);
+      process.stderr.write("\n");
     } else {
       console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
     }
@@ -8330,4 +8343,4 @@ export {
   preprocessArgv,
   runCli
 };
-//# sourceMappingURL=chunk-UXSQQHCI.js.map
+//# sourceMappingURL=chunk-BEFW6WZ6.js.map