@agentgrader/core 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +3 -1
- package/dist/index.js +23 -8
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -637,13 +637,15 @@ declare function runSingle(input: RunSingleInput): Promise<RunSingleResult>;
|
|
|
637
637
|
interface BenchmarkInput {
|
|
638
638
|
testCases: TestCase[];
|
|
639
639
|
agentConfigs: AgentConfig[];
|
|
640
|
-
adapter: AgentAdapter;
|
|
640
|
+
adapter?: AgentAdapter;
|
|
641
|
+
adapters?: AgentAdapter[];
|
|
641
642
|
sandboxProvider: SandboxProvider;
|
|
642
643
|
db?: AgrDb;
|
|
643
644
|
concurrency?: number;
|
|
644
645
|
onRunUpdate?: (run: RunSingleResult & {
|
|
645
646
|
testCaseId: string;
|
|
646
647
|
agentConfigId: string;
|
|
648
|
+
adapterName?: string;
|
|
647
649
|
status: "running" | "completed" | "failed";
|
|
648
650
|
}) => void;
|
|
649
651
|
}
|
package/dist/index.js
CHANGED
|
@@ -836,7 +836,11 @@ ${addendum}` : addendum
|
|
|
836
836
|
};
|
|
837
837
|
}
|
|
838
838
|
async function runBenchmark(input) {
|
|
839
|
-
const { testCases, agentConfigs, adapter, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
|
|
839
|
+
const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
|
|
840
|
+
const actualAdapters = adapters || (adapter ? [adapter] : []);
|
|
841
|
+
if (actualAdapters.length === 0) {
|
|
842
|
+
throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
|
|
843
|
+
}
|
|
840
844
|
const generateCombinationsStep = createStep({
|
|
841
845
|
id: "generateCombinations",
|
|
842
846
|
inputSchema: z.any(),
|
|
@@ -846,10 +850,13 @@ async function runBenchmark(input) {
|
|
|
846
850
|
const combinations = [];
|
|
847
851
|
for (const tc of initData.testCases) {
|
|
848
852
|
for (const config of initData.agentConfigs) {
|
|
849
|
-
combinations.push({
|
|
850
|
-
testCase: tc,
|
|
851
|
-
agentConfig: config
|
|
852
|
-
});
|
|
853
|
+
for (const adapterName of initData.adapterNames) {
|
|
854
|
+
combinations.push({
|
|
855
|
+
testCase: tc,
|
|
856
|
+
agentConfig: config,
|
|
857
|
+
adapterName
|
|
858
|
+
});
|
|
859
|
+
}
|
|
853
860
|
}
|
|
854
861
|
}
|
|
855
862
|
return combinations;
|
|
@@ -860,7 +867,7 @@ async function runBenchmark(input) {
|
|
|
860
867
|
inputSchema: z.any(),
|
|
861
868
|
outputSchema: z.any(),
|
|
862
869
|
execute: async ({ inputData, requestContext }) => {
|
|
863
|
-
const { testCase, agentConfig } = inputData;
|
|
870
|
+
const { testCase, agentConfig, adapterName } = inputData;
|
|
864
871
|
const ctx = requestContext?.context || requestContext;
|
|
865
872
|
const getVal = (key) => {
|
|
866
873
|
if (ctx instanceof Map) return ctx.get(key);
|
|
@@ -868,7 +875,11 @@ async function runBenchmark(input) {
|
|
|
868
875
|
if (typeof ctx?.get === "function") return ctx.get(key);
|
|
869
876
|
return void 0;
|
|
870
877
|
};
|
|
871
|
-
const adapter2 = getVal("adapter");
|
|
878
|
+
const adaptersFromCtx = getVal("adapters");
|
|
879
|
+
const singleAdapter = getVal("adapter");
|
|
880
|
+
const adapterList = adaptersFromCtx || (singleAdapter ? [singleAdapter] : []);
|
|
881
|
+
const adapter2 = adapterList.find((a) => a.name === adapterName);
|
|
882
|
+
if (!adapter2) throw new Error(`Adapter ${adapterName} not found in execution context`);
|
|
872
883
|
const sandboxProvider2 = getVal("sandboxProvider");
|
|
873
884
|
const db2 = getVal("db");
|
|
874
885
|
const onRunUpdate2 = getVal("onRunUpdate");
|
|
@@ -878,6 +889,7 @@ async function runBenchmark(input) {
|
|
|
878
889
|
runId,
|
|
879
890
|
testCaseId: testCase.id || testCase.name,
|
|
880
891
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
892
|
+
adapterName: adapter2.name,
|
|
881
893
|
status: "running",
|
|
882
894
|
passed: false,
|
|
883
895
|
stepsCount: 0,
|
|
@@ -901,6 +913,7 @@ async function runBenchmark(input) {
|
|
|
901
913
|
...res2,
|
|
902
914
|
testCaseId: testCase.id || testCase.name,
|
|
903
915
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
916
|
+
adapterName: adapter2.name,
|
|
904
917
|
status: res2.error ? "failed" : "completed"
|
|
905
918
|
});
|
|
906
919
|
}
|
|
@@ -921,6 +934,7 @@ async function runBenchmark(input) {
|
|
|
921
934
|
...failedResult,
|
|
922
935
|
testCaseId: testCase.id || testCase.name,
|
|
923
936
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
937
|
+
adapterName: adapter2.name,
|
|
924
938
|
status: "failed"
|
|
925
939
|
});
|
|
926
940
|
}
|
|
@@ -936,13 +950,14 @@ async function runBenchmark(input) {
|
|
|
936
950
|
const runState = {};
|
|
937
951
|
const executionContext = /* @__PURE__ */ new Map([
|
|
938
952
|
["adapter", adapter],
|
|
953
|
+
["adapters", actualAdapters],
|
|
939
954
|
["sandboxProvider", sandboxProvider],
|
|
940
955
|
["db", db],
|
|
941
956
|
["onRunUpdate", onRunUpdate]
|
|
942
957
|
]);
|
|
943
958
|
const run = await workflow.createRun();
|
|
944
959
|
const res = await run.start({
|
|
945
|
-
inputData: { testCases, agentConfigs },
|
|
960
|
+
inputData: { testCases, agentConfigs, adapterNames: actualAdapters.map((a) => a.name) },
|
|
946
961
|
initialState: runState,
|
|
947
962
|
requestContext: executionContext
|
|
948
963
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agentgrader/core",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
"dev": "bun run src/index.ts"
|
|
23
23
|
},
|
|
24
24
|
"dependencies": {
|
|
25
|
-
"@agentgrader/store": "
|
|
25
|
+
"@agentgrader/store": "^1.0.2",
|
|
26
26
|
"@mastra/core": "^1.41.0",
|
|
27
27
|
"yaml": "^2.5.1",
|
|
28
28
|
"zod": "^3.23.8"
|