npm - agentv - Versions diffs - 4.1.1 → 4.3.0 - Mend

agentv 4.1.1 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/{chunk-TDY2FQN5.js → chunk-ASU5L5ZW.js} RENAMED

@@ -24,7 +24,7 @@ import {
   validateFileReferences,
   validateTargetsFile,
   writeArtifactsFromResults
-} from "./chunk-QCKPJPYC.js";
+} from "./chunk-ZDJN5FSI.js";
 import {
   DEFAULT_CATEGORY,
   createBuiltinRegistry,
@@ -43,7 +43,7 @@ import {
   toSnakeCaseDeep as toSnakeCaseDeep2,
   transpileEvalYamlFile,
   trimBaselineResult
-} from "./chunk-XEAW7OQT.js";
+} from "./chunk-XLM3RNN7.js";
 import {
   __commonJS,
   __esm,
@@ -4217,7 +4217,7 @@ var evalRunCommand = command({
   },
   handler: async (args) => {
     if (args.evalPaths.length === 0 && process.stdin.isTTY) {
-      const { launchInteractiveWizard } = await import("./interactive-ASB4FU3J.js");
+      const { launchInteractiveWizard } = await import("./interactive-BKK53ETJ.js");
       await launchInteractiveWizard();
       return;
     }
@@ -4441,27 +4441,15 @@ var evalBenchCommand = command({
       type: string,
       displayName: "export-dir",
       description: "Export directory from pipeline input/grade"
-    }),
-    llmScores: option({
-      type: optional(string),
-      long: "llm-scores",
-      description: "Path to LLM scores JSON file (reads from stdin if omitted)"
     })
   },
-  handler: async ({ exportDir, llmScores: llmScoresPath }) => {
+  handler: async ({ exportDir }) => {
     const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
     const testIds = manifest.test_ids;
     const targetName = manifest.target?.name ?? "unknown";
     const evalSet = manifest.dataset ?? "";
     const experiment = manifest.experiment;
     const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
-    let stdinData;
-    if (llmScoresPath) {
-      stdinData = await readFile(llmScoresPath, "utf8");
-    } else {
-      stdinData = await readStdin();
-    }
-    const llmScores = stdinData ? JSON.parse(stdinData) : {};
     const indexLines = [];
     const allPassRates = [];
     for (const testId of testIds) {
@@ -4488,14 +4476,18 @@ var evalBenchCommand = command({
         }
       } catch {
       }
-      const testLlmScores = llmScores[testId] ?? {};
       const llmGradersDir = join(testDir, "llm_graders");
       try {
         const graderFiles = (await readdir(llmGradersDir)).filter((f) => f.endsWith(".json"));
         for (const file of graderFiles) {
           const graderMeta = JSON.parse(await readFile(join(llmGradersDir, file), "utf8"));
           const graderName = graderMeta.name;
-          const llmResult = testLlmScores[graderName];
+          const diskResultPath = join(testDir, "llm_grader_results", `${graderName}.json`);
+          let llmResult;
+          try {
+            llmResult = JSON.parse(await readFile(diskResultPath, "utf8"));
+          } catch {
+          }
           if (llmResult) {
             evaluators.push({
               name: graderName,
@@ -4515,7 +4507,7 @@ var evalBenchCommand = command({
       const weightedScore = totalWeight > 0 ? evaluators.reduce((sum, e) => sum + e.score * e.weight, 0) / totalWeight : 0;
       const passed = allAssertions.filter((a) => a.passed).length;
       const failed = allAssertions.filter((a) => !a.passed).length;
-      const passRate = allAssertions.length > 0 ? Math.round(passed / allAssertions.length * 1e3) / 1e3 : 0;
+      const passRate = allAssertions.length > 0 ? Math.round(passed / allAssertions.length * 1e3) / 1e3 : weightedScore >= 0.5 ? 1 : 0;
       allPassRates.push(passRate);
       const grading = {
         assertions: allAssertions,
@@ -4608,13 +4600,6 @@ var evalBenchCommand = command({
     console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`);
   }
 });
-async function readStdin() {
-  const chunks = [];
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk);
-  }
-  return Buffer.concat(chunks).toString("utf8").trim();
-}
 function computeStats(values) {
   if (values.length === 0) return { mean: 0, stddev: 0 };
   const mean2 = values.reduce((sum, v) => sum + v, 0) / values.length;
@@ -4628,12 +4613,118 @@ function computeStats(values) {
 // src/commands/pipeline/grade.ts
 import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile3 } from "node:fs/promises";
 import { join as join2 } from "node:path";
+var DEFAULT_CONCURRENCY = 10;
 function extractInputText(input) {
   if (!input || input.length === 0) return "";
   if (input.length === 1) return input[0].content;
   return input.map((m) => `@[${m.role}]:
 ${m.content}`).join("\n\n");
 }
+async function runCodeGraders(tasks, concurrency) {
+  let totalGraders = 0;
+  let totalPassed = 0;
+  let completed = 0;
+  const total = tasks.length;
+  if (total === 0) return { totalGraders: 0, totalPassed: 0 };
+  const writeProgress = () => {
+    process.stderr.write(`\rGrading: ${completed}/${total} done`);
+  };
+  writeProgress();
+  const executeGrader = async (task) => {
+    const { testId, testDir, resultsDir, graderFile, responseText, inputData } = task;
+    const graderConfig = JSON.parse(
+      await readFile2(join2(testDir, "code_graders", graderFile), "utf8")
+    );
+    const graderName = graderConfig.name;
+    const inputText = extractInputText(inputData.input);
+    const payload = JSON.stringify({
+      output: [{ role: "assistant", content: responseText }],
+      input: inputData.input,
+      criteria: "",
+      expected_output: [],
+      input_files: inputData.input_files ?? [],
+      trace: null,
+      token_usage: null,
+      cost_usd: null,
+      duration_ms: null,
+      start_time: null,
+      end_time: null,
+      file_changes: null,
+      workspace_path: null,
+      config: graderConfig.config ?? null,
+      metadata: inputData.metadata ?? {},
+      input_text: inputText,
+      output_text: responseText,
+      expected_output_text: ""
+    });
+    try {
+      const stdout = await executeScript(
+        graderConfig.command,
+        payload,
+        void 0,
+        graderConfig.cwd
+      );
+      const parsed = JSON.parse(stdout);
+      const score = typeof parsed.score === "number" ? parsed.score : 0;
+      const assertions = Array.isArray(parsed.assertions) && parsed.assertions.length > 0 ? parsed.assertions : [
+        ...(parsed.hits ?? []).map((h) => ({ text: h, passed: true })),
+        ...(parsed.misses ?? []).map((m) => ({ text: m, passed: false }))
+      ];
+      const result = {
+        name: graderName,
+        type: "code-grader",
+        score,
+        weight: graderConfig.weight ?? 1,
+        assertions,
+        details: parsed.details ?? {}
+      };
+      await writeFile3(
+        join2(resultsDir, `${graderName}.json`),
+        `${JSON.stringify(result, null, 2)}
+`,
+        "utf8"
+      );
+      totalGraders++;
+      if (score >= 0.5) totalPassed++;
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      process.stderr.write(`
+  ${testId}/${graderName}: ERROR \u2014 ${message}
+`);
+      const errorResult = {
+        name: graderName,
+        type: "code-grader",
+        score: 0,
+        weight: graderConfig.weight ?? 1,
+        assertions: [{ text: `Error: ${message}`, passed: false }],
+        details: { error: message }
+      };
+      await writeFile3(
+        join2(resultsDir, `${graderName}.json`),
+        `${JSON.stringify(errorResult, null, 2)}
+`,
+        "utf8"
+      );
+      totalGraders++;
+    } finally {
+      completed++;
+      writeProgress();
+    }
+  };
+  const pending = /* @__PURE__ */ new Set();
+  for (const task of tasks) {
+    const p = executeGrader(task).then(() => {
+      pending.delete(p);
+    });
+    pending.add(p);
+    if (pending.size >= concurrency) {
+      await Promise.race(pending);
+    }
+  }
+  await Promise.all(pending);
+  process.stderr.write("\n");
+  return { totalGraders, totalPassed };
+}
 var evalGradeCommand = command({
   name: "grade",
   description: "Run code-grader assertions on responses in an export directory",
@@ -4642,16 +4733,22 @@ var evalGradeCommand = command({
       type: string,
       displayName: "export-dir",
       description: "Export directory from pipeline input"
+    }),
+    concurrency: option({
+      type: optional(number),
+      long: "concurrency",
+      short: "j",
+      description: `Number of graders to run in parallel (default: ${DEFAULT_CONCURRENCY})`
     })
   },
-  handler: async ({ exportDir }) => {
+  handler: async ({ exportDir, concurrency }) => {
+    const maxWorkers = concurrency ?? DEFAULT_CONCURRENCY;
     const manifestPath = join2(exportDir, "manifest.json");
     const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
     const testIds = manifest.test_ids;
     const evalSet = manifest.dataset ?? "";
     const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
-    let totalGraders = 0;
-    let totalPassed = 0;
+    const tasks = [];
     for (const testId of testIds) {
       const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
       const testDir = join2(exportDir, ...subpath);
@@ -4668,76 +4765,10 @@ var evalGradeCommand = command({
       const responseText = await readFile2(join2(testDir, "response.md"), "utf8");
       const inputData = JSON.parse(await readFile2(join2(testDir, "input.json"), "utf8"));
       for (const graderFile of graderFiles) {
-        const graderConfig = JSON.parse(await readFile2(join2(codeGradersDir, graderFile), "utf8"));
-        const graderName = graderConfig.name;
-        const inputText = extractInputText(inputData.input);
-        const payload = JSON.stringify({
-          output: [{ role: "assistant", content: responseText }],
-          input: inputData.input,
-          criteria: "",
-          expected_output: [],
-          input_files: inputData.input_files ?? [],
-          trace: null,
-          token_usage: null,
-          cost_usd: null,
-          duration_ms: null,
-          start_time: null,
-          end_time: null,
-          file_changes: null,
-          workspace_path: null,
-          config: graderConfig.config ?? null,
-          metadata: inputData.metadata ?? {},
-          input_text: inputText,
-          output_text: responseText,
-          expected_output_text: ""
-        });
-        try {
-          const stdout = await executeScript(
-            graderConfig.command,
-            payload,
-            void 0,
-            graderConfig.cwd
-          );
-          const parsed = JSON.parse(stdout);
-          const score = typeof parsed.score === "number" ? parsed.score : 0;
-          const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
-          const result = {
-            name: graderName,
-            type: "code-grader",
-            score,
-            weight: graderConfig.weight ?? 1,
-            assertions,
-            details: parsed.details ?? {}
-          };
-          await writeFile3(
-            join2(resultsDir, `${graderName}.json`),
-            `${JSON.stringify(result, null, 2)}
-`,
-            "utf8"
-          );
-          totalGraders++;
-          if (score >= 0.5) totalPassed++;
-        } catch (error) {
-          const message = error instanceof Error ? error.message : String(error);
-          console.error(`  ${testId}/${graderName}: ERROR \u2014 ${message}`);
-          const errorResult = {
-            name: graderName,
-            type: "code-grader",
-            score: 0,
-            weight: graderConfig.weight ?? 1,
-            assertions: [{ text: `Error: ${message}`, passed: false }],
-            details: { error: message }
-          };
-          await writeFile3(
-            join2(resultsDir, `${graderName}.json`),
-            `${JSON.stringify(errorResult, null, 2)}
-`,
-            "utf8"
-          );
-          totalGraders++;
-        }
+        tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
       }
     }
+    const { totalGraders, totalPassed } = await runCodeGraders(tasks, maxWorkers);
     console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
   }
 });
@@ -5156,8 +5187,7 @@ Done. Results in ${outDir}`);
       );
       return;
     }
-    let totalGraders = 0;
-    let totalPassed = 0;
+    const graderTasks = [];
     for (const testId of testIds) {
       const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
       const testDir = join4(outDir, ...subpath);
@@ -5174,82 +5204,11 @@ Done. Results in ${outDir}`);
       const responseText = await readFile4(join4(testDir, "response.md"), "utf8");
       const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
       for (const graderFile of graderFiles) {
-        const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
-        const graderName = graderConfig.name;
-        const inputText = extractInputText2(inputData.input);
-        const payload = JSON.stringify({
-          output: [{ role: "assistant", content: responseText }],
-          input: inputData.input,
-          criteria: "",
-          expected_output: [],
-          input_files: inputData.input_files ?? [],
-          trace: null,
-          token_usage: null,
-          cost_usd: null,
-          duration_ms: null,
-          start_time: null,
-          end_time: null,
-          file_changes: null,
-          workspace_path: null,
-          config: graderConfig.config ?? null,
-          metadata: inputData.metadata ?? {},
-          input_text: inputText,
-          output_text: responseText,
-          expected_output_text: ""
-        });
-        try {
-          const stdout = await executeScript(
-            graderConfig.command,
-            payload,
-            void 0,
-            graderConfig.cwd
-          );
-          const parsed = JSON.parse(stdout);
-          const score = typeof parsed.score === "number" ? parsed.score : 0;
-          const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
-          await writeFile5(
-            join4(resultsDir, `${graderName}.json`),
-            `${JSON.stringify(
-              {
-                name: graderName,
-                type: "code-grader",
-                score,
-                weight: graderConfig.weight ?? 1,
-                assertions,
-                details: parsed.details ?? {}
-              },
-              null,
-              2
-            )}
-`,
-            "utf8"
-          );
-          totalGraders++;
-          if (score >= 0.5) totalPassed++;
-        } catch (error) {
-          const message = error instanceof Error ? error.message : String(error);
-          console.error(`  ${testId}/${graderName}: ERROR \u2014 ${message}`);
-          await writeFile5(
-            join4(resultsDir, `${graderName}.json`),
-            `${JSON.stringify(
-              {
-                name: graderName,
-                type: "code-grader",
-                score: 0,
-                weight: graderConfig.weight ?? 1,
-                assertions: [{ text: `Error: ${message}`, passed: false }],
-                details: { error: message }
-              },
-              null,
-              2
-            )}
-`,
-            "utf8"
-          );
-          totalGraders++;
-        }
+        graderTasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
       }
     }
+    const graderConcurrency = workers ?? 10;
+    const { totalGraders, totalPassed } = await runCodeGraders(graderTasks, graderConcurrency);
     console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
     console.log(`
 Done. Agent can now perform LLM grading on responses in ${outDir}`);
@@ -6743,8 +6702,8 @@ function resolveStudioDistDir() {
     path9.resolve(currentDir, "../../../../studio/dist"),
     // From dist/ → sibling apps/studio/dist (monorepo dev)
     path9.resolve(currentDir, "../../studio/dist"),
-    // Bundled inside CLI dist (published package)
-    path9.resolve(currentDir, "../studio"),
+    // Bundled inside CLI dist (published package: dist/studio/)
+    path9.resolve(currentDir, "studio"),
     // From dist/ in monorepo root context
     path9.resolve(currentDir, "../../../apps/studio/dist")
   ];
@@ -8359,4 +8318,4 @@ export {
   preprocessArgv,
   runCli
 };
-//# sourceMappingURL=chunk-TDY2FQN5.js.map
+//# sourceMappingURL=chunk-ASU5L5ZW.js.map