@agentgrader/core 1.0.1 → 1.1.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -637,13 +637,15 @@ declare function runSingle(input: RunSingleInput): Promise<RunSingleResult>;
637
637
  interface BenchmarkInput {
638
638
  testCases: TestCase[];
639
639
  agentConfigs: AgentConfig[];
640
- adapter: AgentAdapter;
640
+ adapter?: AgentAdapter;
641
+ adapters?: AgentAdapter[];
641
642
  sandboxProvider: SandboxProvider;
642
643
  db?: AgrDb;
643
644
  concurrency?: number;
644
645
  onRunUpdate?: (run: RunSingleResult & {
645
646
  testCaseId: string;
646
647
  agentConfigId: string;
648
+ adapterName?: string;
647
649
  status: "running" | "completed" | "failed";
648
650
  }) => void;
649
651
  }
package/dist/index.js CHANGED
@@ -836,7 +836,11 @@ ${addendum}` : addendum
836
836
  };
837
837
  }
838
838
  async function runBenchmark(input) {
839
- const { testCases, agentConfigs, adapter, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
839
+ const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
840
+ const actualAdapters = adapters || (adapter ? [adapter] : []);
841
+ if (actualAdapters.length === 0) {
842
+ throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
843
+ }
840
844
  const generateCombinationsStep = createStep({
841
845
  id: "generateCombinations",
842
846
  inputSchema: z.any(),
@@ -846,10 +850,13 @@ async function runBenchmark(input) {
846
850
  const combinations = [];
847
851
  for (const tc of initData.testCases) {
848
852
  for (const config of initData.agentConfigs) {
849
- combinations.push({
850
- testCase: tc,
851
- agentConfig: config
852
- });
853
+ for (const adapterName of initData.adapterNames) {
854
+ combinations.push({
855
+ testCase: tc,
856
+ agentConfig: config,
857
+ adapterName
858
+ });
859
+ }
853
860
  }
854
861
  }
855
862
  return combinations;
@@ -860,7 +867,7 @@ async function runBenchmark(input) {
860
867
  inputSchema: z.any(),
861
868
  outputSchema: z.any(),
862
869
  execute: async ({ inputData, requestContext }) => {
863
- const { testCase, agentConfig } = inputData;
870
+ const { testCase, agentConfig, adapterName } = inputData;
864
871
  const ctx = requestContext?.context || requestContext;
865
872
  const getVal = (key) => {
866
873
  if (ctx instanceof Map) return ctx.get(key);
@@ -868,7 +875,11 @@ async function runBenchmark(input) {
868
875
  if (typeof ctx?.get === "function") return ctx.get(key);
869
876
  return void 0;
870
877
  };
871
- const adapter2 = getVal("adapter");
878
+ const adaptersFromCtx = getVal("adapters");
879
+ const singleAdapter = getVal("adapter");
880
+ const adapterList = adaptersFromCtx || (singleAdapter ? [singleAdapter] : []);
881
+ const adapter2 = adapterList.find((a) => a.name === adapterName);
882
+ if (!adapter2) throw new Error(`Adapter ${adapterName} not found in execution context`);
872
883
  const sandboxProvider2 = getVal("sandboxProvider");
873
884
  const db2 = getVal("db");
874
885
  const onRunUpdate2 = getVal("onRunUpdate");
@@ -878,6 +889,7 @@ async function runBenchmark(input) {
878
889
  runId,
879
890
  testCaseId: testCase.id || testCase.name,
880
891
  agentConfigId: agentConfig.id || agentConfig.name,
892
+ adapterName: adapter2.name,
881
893
  status: "running",
882
894
  passed: false,
883
895
  stepsCount: 0,
@@ -901,6 +913,7 @@ async function runBenchmark(input) {
901
913
  ...res2,
902
914
  testCaseId: testCase.id || testCase.name,
903
915
  agentConfigId: agentConfig.id || agentConfig.name,
916
+ adapterName: adapter2.name,
904
917
  status: res2.error ? "failed" : "completed"
905
918
  });
906
919
  }
@@ -921,6 +934,7 @@ async function runBenchmark(input) {
921
934
  ...failedResult,
922
935
  testCaseId: testCase.id || testCase.name,
923
936
  agentConfigId: agentConfig.id || agentConfig.name,
937
+ adapterName: adapter2.name,
924
938
  status: "failed"
925
939
  });
926
940
  }
@@ -936,13 +950,14 @@ async function runBenchmark(input) {
936
950
  const runState = {};
937
951
  const executionContext = /* @__PURE__ */ new Map([
938
952
  ["adapter", adapter],
953
+ ["adapters", actualAdapters],
939
954
  ["sandboxProvider", sandboxProvider],
940
955
  ["db", db],
941
956
  ["onRunUpdate", onRunUpdate]
942
957
  ]);
943
958
  const run = await workflow.createRun();
944
959
  const res = await run.start({
945
- inputData: { testCases, agentConfigs },
960
+ inputData: { testCases, agentConfigs, adapterNames: actualAdapters.map((a) => a.name) },
946
961
  initialState: runState,
947
962
  requestContext: executionContext
948
963
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agentgrader/core",
3
- "version": "1.0.1",
3
+ "version": "1.1.0",
4
4
  "description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -22,7 +22,7 @@
22
22
  "dev": "bun run src/index.ts"
23
23
  },
24
24
  "dependencies": {
25
- "@agentgrader/store": "^1.0.1",
25
+ "@agentgrader/store": "^1.0.2",
26
26
  "@mastra/core": "^1.41.0",
27
27
  "yaml": "^2.5.1",
28
28
  "zod": "^3.23.8"