npm - @agentgrader/core - Versions diffs - 1.0.0 → 1.1.0 - Mend

@agentgrader/core 1.0.0 → 1.1.0

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED Viewed

@@ -637,13 +637,15 @@ declare function runSingle(input: RunSingleInput): Promise<RunSingleResult>;
 interface BenchmarkInput {
     testCases: TestCase[];
     agentConfigs: AgentConfig[];
-    adapter: AgentAdapter;
+    adapter?: AgentAdapter;
+    adapters?: AgentAdapter[];
     sandboxProvider: SandboxProvider;
     db?: AgrDb;
     concurrency?: number;
     onRunUpdate?: (run: RunSingleResult & {
         testCaseId: string;
         agentConfigId: string;
+        adapterName?: string;
         status: "running" | "completed" | "failed";
     }) => void;
 }

package/dist/index.js CHANGED Viewed

@@ -836,7 +836,11 @@ ${addendum}` : addendum
   };
 }
 async function runBenchmark(input) {
-  const { testCases, agentConfigs, adapter, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
+  const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
+  const actualAdapters = adapters || (adapter ? [adapter] : []);
+  if (actualAdapters.length === 0) {
+    throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
+  }
   const generateCombinationsStep = createStep({
     id: "generateCombinations",
     inputSchema: z.any(),
@@ -846,10 +850,13 @@ async function runBenchmark(input) {
       const combinations = [];
       for (const tc of initData.testCases) {
         for (const config of initData.agentConfigs) {
-          combinations.push({
-            testCase: tc,
-            agentConfig: config
-          });
+          for (const adapterName of initData.adapterNames) {
+            combinations.push({
+              testCase: tc,
+              agentConfig: config,
+              adapterName
+            });
+          }
         }
       }
       return combinations;
@@ -860,7 +867,7 @@ async function runBenchmark(input) {
     inputSchema: z.any(),
     outputSchema: z.any(),
     execute: async ({ inputData, requestContext }) => {
-      const { testCase, agentConfig } = inputData;
+      const { testCase, agentConfig, adapterName } = inputData;
       const ctx = requestContext?.context || requestContext;
       const getVal = (key) => {
         if (ctx instanceof Map) return ctx.get(key);
@@ -868,7 +875,11 @@ async function runBenchmark(input) {
         if (typeof ctx?.get === "function") return ctx.get(key);
         return void 0;
       };
-      const adapter2 = getVal("adapter");
+      const adaptersFromCtx = getVal("adapters");
+      const singleAdapter = getVal("adapter");
+      const adapterList = adaptersFromCtx || (singleAdapter ? [singleAdapter] : []);
+      const adapter2 = adapterList.find((a) => a.name === adapterName);
+      if (!adapter2) throw new Error(`Adapter ${adapterName} not found in execution context`);
       const sandboxProvider2 = getVal("sandboxProvider");
       const db2 = getVal("db");
       const onRunUpdate2 = getVal("onRunUpdate");
@@ -878,6 +889,7 @@ async function runBenchmark(input) {
           runId,
           testCaseId: testCase.id || testCase.name,
           agentConfigId: agentConfig.id || agentConfig.name,
+          adapterName: adapter2.name,
           status: "running",
           passed: false,
           stepsCount: 0,
@@ -901,6 +913,7 @@ async function runBenchmark(input) {
             ...res2,
             testCaseId: testCase.id || testCase.name,
             agentConfigId: agentConfig.id || agentConfig.name,
+            adapterName: adapter2.name,
             status: res2.error ? "failed" : "completed"
           });
         }
@@ -921,6 +934,7 @@ async function runBenchmark(input) {
             ...failedResult,
             testCaseId: testCase.id || testCase.name,
             agentConfigId: agentConfig.id || agentConfig.name,
+            adapterName: adapter2.name,
             status: "failed"
           });
         }
@@ -936,13 +950,14 @@ async function runBenchmark(input) {
   const runState = {};
   const executionContext = /* @__PURE__ */ new Map([
     ["adapter", adapter],
+    ["adapters", actualAdapters],
     ["sandboxProvider", sandboxProvider],
     ["db", db],
     ["onRunUpdate", onRunUpdate]
   ]);
   const run = await workflow.createRun();
   const res = await run.start({
-    inputData: { testCases, agentConfigs },
+    inputData: { testCases, agentConfigs, adapterNames: actualAdapters.map((a) => a.name) },
     initialState: runState,
     requestContext: executionContext
   });

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@agentgrader/core",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
   "license": "MIT",
   "type": "module",
@@ -22,7 +22,7 @@
     "dev": "bun run src/index.ts"
   },
   "dependencies": {
-    "@agentgrader/store": "workspace:*",
+    "@agentgrader/store": "^1.0.2",
     "@mastra/core": "^1.41.0",
     "yaml": "^2.5.1",
     "zod": "^3.23.8"