npm - agentv - Versions diffs - 4.38.1-next.1 → 4.39.0-next.1 - Mend

agentv 4.38.1-next.1 → 4.39.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/dist/{artifact-writer-MK5X5MSO.js → artifact-writer-VPRAQSQM.js} RENAMED Viewed

@@ -1,28 +1,31 @@
 import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
 import {
+  buildIndexArtifactEntry,
+  buildResultIndexArtifact,
+  writeArtifactsFromResults,
+  writePerTestArtifacts
+} from "./chunk-5VQPWWUI.js";
+import "./chunk-RLMXZDDC.js";
+import "./chunk-76FOHROU.js";
+import "./chunk-BPGJ4HBU.js";
+import {
+  RESULT_INDEX_FILENAME,
   aggregateRunDir,
   buildAggregateGradingArtifact,
   buildBenchmarkArtifact,
   buildGradingArtifact,
-  buildIndexArtifactEntry,
-  buildResultIndexArtifact,
   buildTestTargetKey,
   buildTimingArtifact,
   deduplicateByTestIdTarget,
   parseJsonlResults,
   writeArtifacts,
-  writeArtifactsFromResults,
-  writeInitialBenchmarkArtifact,
-  writePerTestArtifacts
-} from "./chunk-DKUAETXE.js";
-import "./chunk-NLTIK3LV.js";
-import "./chunk-QOBQ5XYF.js";
-import "./chunk-BPGJ4HBU.js";
-import "./chunk-VBHHZQS6.js";
+  writeInitialBenchmarkArtifact
+} from "./chunk-DR2ZHSBE.js";
 import "./chunk-NPVGBFF6.js";
 import "./chunk-M7BUKBAF.js";
 import "./chunk-5H446C7X.js";
 export {
+  RESULT_INDEX_FILENAME,
   aggregateRunDir,
   buildAggregateGradingArtifact,
   buildBenchmarkArtifact,
@@ -38,4 +41,4 @@ export {
   writeInitialBenchmarkArtifact,
   writePerTestArtifacts
 };
-//# sourceMappingURL=artifact-writer-MK5X5MSO.js.map
+//# sourceMappingURL=artifact-writer-VPRAQSQM.js.map

package/dist/{chunk-SMZQ7RPW.js → chunk-4NAWRNBL.js} RENAMED Viewed

@@ -1,9 +1,12 @@
 import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
 import {
   Mutex,
+  RESULT_INDEX_FILENAME,
   TARGET_FILE_CANDIDATES,
+  buildDefaultRunDir,
   c,
   clearRemoteRunTags,
+  createRunDirName,
   deleteRunTags,
   detectFileType,
   discoverEvalFiles,
@@ -36,6 +39,7 @@ import {
   resolveEvalPaths,
   resolveResultSourcePath,
   resolveRunCacheFile,
+  resolveRunManifestPath,
   runEvalCommand,
   selectTarget,
   setRemoteRunTags,
@@ -48,19 +52,11 @@ import {
   validateTargetsFile,
   validateWorkspacePaths,
   writeRunTags
-} from "./chunk-Z4BVJJXA.js";
+} from "./chunk-5JWECTVJ.js";
 import {
-  RESULT_INDEX_FILENAME,
-  aggregateRunDir,
-  buildBenchmarkArtifact,
-  buildDefaultRunDir,
-  buildTestTargetKey,
-  buildTimingArtifact,
-  createRunDirName,
-  resolveRunManifestPath,
   toSnakeCaseDeep as toSnakeCaseDeep2,
   writeArtifactsFromResults
-} from "./chunk-DKUAETXE.js";
+} from "./chunk-5VQPWWUI.js";
 import {
   DEFAULT_CATEGORY,
   deriveCategory,
@@ -69,17 +65,19 @@ import {
   getOutputFilenames,
   parseClaudeSession,
   parseCodexSession,
-  readTranscriptFile,
   runBeforeSessionHook,
   scanRepoDeps,
   syncProjects,
-  toTranscriptJsonLines,
   transpileEvalYamlFile,
   trimBaselineResult
-} from "./chunk-NLTIK3LV.js";
+} from "./chunk-RLMXZDDC.js";
 import {
   DEFAULT_THRESHOLD,
   addProject,
+  aggregateRunDir,
+  buildBenchmarkArtifact,
+  buildTestTargetKey,
+  buildTimingArtifact,
   buildTraceFromMessages,
   createBuiltinRegistry,
   discoverCopilotSessions,
@@ -93,10 +91,12 @@ import {
   loadProjectRegistry,
   loadTestSuite,
   normalizeLineEndings,
+  normalizeResultRow,
   parseAgentSkillsEvals,
   parseCopilotEvents,
   parseYamlValue,
   readTargetDefinitions,
+  readTranscriptFile,
   removeProject,
   runContainsAllAssertion,
   runContainsAnyAssertion,
@@ -111,8 +111,9 @@ import {
   runStartsWithAssertion,
   toCamelCaseDeep,
   toSnakeCaseDeep,
+  toTranscriptJsonLines,
   touchProject
-} from "./chunk-VBHHZQS6.js";
+} from "./chunk-DR2ZHSBE.js";
 import {
   __commonJS,
   __require,
@@ -3955,53 +3956,27 @@ var ASSERTION_TEMPLATES = {
   default: `#!/usr/bin/env bun
 import { defineAssertion } from '@agentv/eval';
-/** Extract text from the last message with the given role. */
-function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
-  for (let i = messages.length - 1; i >= 0; i--) {
-    const msg = messages[i];
-    if (msg.role !== role) continue;
-    if (typeof msg.content === 'string') return msg.content;
-    if (Array.isArray(msg.content)) {
-      return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
-    }
-  }
-  return '';
-}
 export default defineAssertion(({ output }) => {
   // TODO: Implement your assertion logic
-  const text = getMessageText(output ?? []);
+  const text = output ?? '';
   const pass = text.length > 0;
   return {
     pass,
-    reasoning: pass ? 'Output has content' : 'Output is empty',
+    assertions: [{ text: pass ? 'Output has content' : 'Output is empty', passed: pass }],
   };
 });
 `,
   score: `#!/usr/bin/env bun
 import { defineAssertion } from '@agentv/eval';
-/** Extract text from the last message with the given role. */
-function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
-  for (let i = messages.length - 1; i >= 0; i--) {
-    const msg = messages[i];
-    if (msg.role !== role) continue;
-    if (typeof msg.content === 'string') return msg.content;
-    if (Array.isArray(msg.content)) {
-      return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
-    }
-  }
-  return '';
-}
 export default defineAssertion(({ output }) => {
   // TODO: Implement your scoring logic (0.0 to 1.0)
-  const text = getMessageText(output ?? []);
+  const text = output ?? '';
   const score = text.length > 0 ? 1.0 : 0.0;
   return {
     pass: score >= 0.5,
     score,
-    reasoning: \`Score: \${score}\`,
+    assertions: [{ text: 'Output has content', passed: score === 1.0 }],
   };
 });
 `
@@ -4337,7 +4312,6 @@ var evalAssertCommand = command({
         question: resolvedInput,
         criteria: "",
         expected_output: [],
-        reference_answer: "",
         input_files: [],
         trace,
         token_usage: null,
@@ -4348,11 +4322,7 @@ var evalAssertCommand = command({
         file_changes: null,
         workspace_path: null,
         config: null,
-        metadata: {},
-        // Text convenience accessors (new names)
-        input_text: resolvedInput,
-        output_text: resolvedOutput,
-        expected_output_text: ""
+        metadata: {}
       },
       null,
       2
@@ -4440,6 +4410,33 @@ var evalRunCommand = command({
       long: "experiment",
       description: "Experiment label for canonical run output (default: default)"
     }),
+    resultsRepo: option({
+      type: optional(string),
+      long: "results-repo",
+      description: "Results Git repo override: current/. for the source repo, a local path, Git URL, or owner/repo"
+    }),
+    resultsBranch: option({
+      type: optional(string),
+      long: "results-branch",
+      description: "Results storage branch (default: agentv/results/v1)"
+    }),
+    resultsRemote: option({
+      type: optional(string),
+      long: "results-remote",
+      description: "Git remote name for results push/fetch (default: origin)"
+    }),
+    resultsPush: flag({
+      long: "results-push",
+      description: "Push the results branch after publishing the completed local run"
+    }),
+    noResultsPush: flag({
+      long: "no-results-push",
+      description: "Publish to the local results branch without pushing to the remote"
+    }),
+    resultsRequirePush: flag({
+      long: "results-require-push",
+      description: "Fail the eval command if the completed results branch cannot be pushed"
+    }),
     dryRun: flag({
       long: "dry-run",
       description: "Use mock provider responses instead of real LLM calls"
@@ -4515,7 +4512,7 @@ var evalRunCommand = command({
     otelBackend: option({
       type: optional(string),
       long: "otel-backend",
-      description: "Use a backend preset (langfuse, braintrust, confident)"
+      description: "Use an OTel backend resolver (langfuse, braintrust, confident, or local)"
     }),
     otelCaptureContent: flag({
       long: "otel-capture-content",
@@ -4600,7 +4597,7 @@ var evalRunCommand = command({
   },
   handler: async (args) => {
     if (args.evalPaths.length === 0 && process.stdin.isTTY) {
-      const { launchInteractiveWizard } = await import("./interactive-A7JNS2MT.js");
+      const { launchInteractiveWizard } = await import("./interactive-V2GW7A25.js");
       await launchInteractiveWizard();
       return;
     }
@@ -4609,6 +4606,10 @@ var evalRunCommand = command({
       console.error("Error: --budget-usd must be a positive number.");
       process.exit(2);
     }
+    if (args.resultsPush && args.noResultsPush) {
+      console.error("Error: --results-push and --no-results-push cannot be used together.");
+      process.exit(2);
+    }
     const rawOptions = {
       target: args.target,
       targets: args.targets,
@@ -4618,6 +4619,12 @@ var evalRunCommand = command({
       output: args.output,
       outputFormat: args.outputFormat,
       experiment: args.experiment,
+      resultsRepo: args.resultsRepo,
+      resultsBranch: args.resultsBranch,
+      resultsRemote: args.resultsRemote,
+      resultsPush: args.resultsPush,
+      noResultsPush: args.noResultsPush,
+      resultsRequirePush: args.resultsRequirePush,
       dryRun: args.dryRun,
       dryRunDelay: args.dryRunDelay,
       dryRunDelayMin: args.dryRunDelayMin,
@@ -6353,13 +6360,15 @@ function parseFilterableRecords(filePath) {
   }
   const lines = content.split("\n").filter((line) => line.trim());
   const records = [];
-  for (const line of lines) {
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
     let raw;
     try {
       raw = JSON.parse(line);
     } catch {
       continue;
     }
+    raw = normalizeResultRow(raw, { lineNumber: i + 1, sourceLabel: filePath });
     let experiment = typeof raw.experiment === "string" ? raw.experiment : void 0;
     if (!experiment) {
       const parts = filePath.split(path14.sep);
@@ -6955,7 +6964,7 @@ function searchJsonlFile(filePath, regex2, targetFilter, experimentFilter) {
     const target = typeof record.target === "string" ? record.target : void 0;
     const experiment = typeof record.experiment === "string" ? record.experiment : void 0;
     const score = typeof record.score === "number" ? record.score : void 0;
-    const testId = typeof record.test_id === "string" ? record.test_id : typeof record.source === "object" && record.source !== null ? record.source.session_id : void 0;
+    const testId = typeof record.test_id === "string" ? record.test_id : typeof record.testId === "string" ? record.testId : typeof record.source === "object" && record.source !== null ? record.source.session_id : void 0;
     if (targetFilter && target !== targetFilter) continue;
     if (experimentFilter && experiment !== experimentFilter) continue;
     const match = regex2.exec(line);
@@ -7748,12 +7757,6 @@ function computeStats(values) {
 import { mkdir as mkdir7, readFile as readFile4, readdir as readdir2, writeFile as writeFile8 } from "node:fs/promises";
 import { join as join2 } from "node:path";
 var DEFAULT_CONCURRENCY = 10;
-function extractInputText(input) {
-  if (!input || input.length === 0) return "";
-  if (input.length === 1) return input[0].content;
-  return input.map((m) => `@[${m.role}]:
-${m.content}`).join("\n\n");
-}
 async function runCodeGraders(tasks, concurrency) {
   let totalGraders = 0;
   let totalPassed = 0;
@@ -7782,7 +7785,6 @@ async function runCodeGraders(tasks, concurrency) {
   const executeCodeGrader = async (graderConfig, task) => {
     const { testId, resultsDir, responseText, inputData } = task;
     const graderName = graderConfig.name;
-    const inputText = extractInputText(inputData.input);
     const messages = [{ role: "assistant", content: responseText }];
     const trace = buildTraceFromMessages({
       input: inputData.input,
@@ -7807,10 +7809,7 @@ async function runCodeGraders(tasks, concurrency) {
       file_changes: null,
       workspace_path: null,
       config: graderConfig.config ?? null,
-      metadata: inputData.metadata ?? {},
-      input_text: inputText,
-      output_text: responseText,
-      expected_output_text: ""
+      metadata: inputData.metadata ?? {}
     });
     try {
       const stdout = await executeScript(
@@ -8215,7 +8214,7 @@ import { existsSync as existsSync7, readFileSync as readFileSync6, unlinkSync }
 import { mkdir as mkdir9, readFile as readFile6, readdir as readdir3, writeFile as writeFile10 } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
-function extractInputText2(input) {
+function extractInputText(input) {
   if (!input || input.length === 0) return "";
   if (input.length === 1) return input[0].content;
   return input.map((m) => `@[${m.role}]:
@@ -8399,7 +8398,7 @@ var evalRunCommand2 = command({
         const timeoutMs = invoke.timeout_ms ?? 12e4;
         const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
         const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
-        const inputText = extractInputText2(inputData.input);
+        const inputText = extractInputText(inputData.input);
         await writeFile10(promptFile, inputText, "utf8");
         let rendered = template;
         rendered = rendered.replace("{PROMPT_FILE}", promptFile);
@@ -15632,4 +15631,4 @@ export {
   preprocessArgv,
   runCli
 };
-//# sourceMappingURL=chunk-SMZQ7RPW.js.map
+//# sourceMappingURL=chunk-4NAWRNBL.js.map