@agentgrader/core 1.0.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +42 -1
- package/dist/index.js +66 -17
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -131,6 +131,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
131
131
|
id: z.ZodOptional<z.ZodString>;
|
|
132
132
|
name: z.ZodString;
|
|
133
133
|
model: z.ZodString;
|
|
134
|
+
provider: z.ZodOptional<z.ZodString>;
|
|
134
135
|
max_steps: z.ZodDefault<z.ZodNumber>;
|
|
135
136
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
136
137
|
system_prompt: z.ZodOptional<z.ZodString>;
|
|
@@ -167,6 +168,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
167
168
|
max_steps: number;
|
|
168
169
|
id?: string | undefined;
|
|
169
170
|
toolkits?: string[] | undefined;
|
|
171
|
+
provider?: string | undefined;
|
|
170
172
|
temperature?: number | undefined;
|
|
171
173
|
system_prompt?: string | undefined;
|
|
172
174
|
tools?: string[] | undefined;
|
|
@@ -184,6 +186,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
184
186
|
model: string;
|
|
185
187
|
id?: string | undefined;
|
|
186
188
|
toolkits?: string[] | undefined;
|
|
189
|
+
provider?: string | undefined;
|
|
187
190
|
max_steps?: number | undefined;
|
|
188
191
|
temperature?: number | undefined;
|
|
189
192
|
system_prompt?: string | undefined;
|
|
@@ -300,6 +303,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
300
303
|
error: z.ZodOptional<z.ZodString>;
|
|
301
304
|
finalDiff: z.ZodOptional<z.ZodString>;
|
|
302
305
|
metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
|
|
306
|
+
matrixId: z.ZodOptional<z.ZodString>;
|
|
303
307
|
createdAt: z.ZodNumber;
|
|
304
308
|
completedAt: z.ZodOptional<z.ZodNumber>;
|
|
305
309
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -319,6 +323,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
319
323
|
error?: string | undefined;
|
|
320
324
|
finalDiff?: string | undefined;
|
|
321
325
|
metrics?: Record<string, any> | undefined;
|
|
326
|
+
matrixId?: string | undefined;
|
|
322
327
|
completedAt?: number | undefined;
|
|
323
328
|
}, {
|
|
324
329
|
status: "running" | "completed" | "failed";
|
|
@@ -337,6 +342,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
337
342
|
error?: string | undefined;
|
|
338
343
|
finalDiff?: string | undefined;
|
|
339
344
|
metrics?: Record<string, any> | undefined;
|
|
345
|
+
matrixId?: string | undefined;
|
|
340
346
|
completedAt?: number | undefined;
|
|
341
347
|
}>;
|
|
342
348
|
type Run = z.infer<typeof RunSchema>;
|
|
@@ -493,6 +499,26 @@ interface ScorerResult {
|
|
|
493
499
|
passed: boolean;
|
|
494
500
|
detail: string;
|
|
495
501
|
score?: number;
|
|
502
|
+
/**
|
|
503
|
+
* non-functional code-quality signals. populated by additive scorers
|
|
504
|
+
* (e.g. staticqualityscorer, llmjudgescorer) that annotate a run without
|
|
505
|
+
* affecting `passed`/`score`. all fields optional - a scorer only fills
|
|
506
|
+
* in what it actually measured.
|
|
507
|
+
*/
|
|
508
|
+
quality?: {
|
|
509
|
+
/** total +/- lines in the agent's diff */
|
|
510
|
+
diffLines?: number;
|
|
511
|
+
/** number of files touched by the agent's diff */
|
|
512
|
+
filesModified?: number;
|
|
513
|
+
/** TODO/FIXME/HACK/XXX markers introduced by the diff */
|
|
514
|
+
todosIntroduced?: number;
|
|
515
|
+
/** linter (e.g. Biome) error+warning count on the changed files */
|
|
516
|
+
linterViolations?: number;
|
|
517
|
+
/** 0-1 holistic quality score from an LLM judge */
|
|
518
|
+
llmJudgeScore?: number;
|
|
519
|
+
/** prose rationale from an LLM judge */
|
|
520
|
+
llmJudgeDetail?: string;
|
|
521
|
+
};
|
|
496
522
|
}
|
|
497
523
|
interface Scorer {
|
|
498
524
|
readonly name: string;
|
|
@@ -618,6 +644,15 @@ interface RunSingleInput {
|
|
|
618
644
|
sandboxProvider: SandboxProvider;
|
|
619
645
|
db?: AgrDb;
|
|
620
646
|
runId: string;
|
|
647
|
+
/**
|
|
648
|
+
* additive, non-blocking scorers (e.g. staticqualityscorer,
|
|
649
|
+
* llmjudgescorer) run after the core pass/fail scoring. their results
|
|
650
|
+
* never affect `passed`/`score` - each scorer's `ScorerResult` is merged
|
|
651
|
+
* into `metrics` under its own `name`.
|
|
652
|
+
*/
|
|
653
|
+
extraScorers?: Scorer[];
|
|
654
|
+
/** links this run to an optimizer matrix run, if any */
|
|
655
|
+
matrixId?: string;
|
|
621
656
|
}
|
|
622
657
|
interface RunSingleResult {
|
|
623
658
|
runId: string;
|
|
@@ -637,13 +672,19 @@ declare function runSingle(input: RunSingleInput): Promise<RunSingleResult>;
|
|
|
637
672
|
interface BenchmarkInput {
|
|
638
673
|
testCases: TestCase[];
|
|
639
674
|
agentConfigs: AgentConfig[];
|
|
640
|
-
adapter: AgentAdapter;
|
|
675
|
+
adapter?: AgentAdapter;
|
|
676
|
+
adapters?: AgentAdapter[];
|
|
641
677
|
sandboxProvider: SandboxProvider;
|
|
642
678
|
db?: AgrDb;
|
|
643
679
|
concurrency?: number;
|
|
680
|
+
/** additive, non-blocking quality scorers run for every test case x config combination */
|
|
681
|
+
extraScorers?: Scorer[];
|
|
682
|
+
/** links every run in this benchmark to an optimizer matrix run, if any */
|
|
683
|
+
matrixId?: string;
|
|
644
684
|
onRunUpdate?: (run: RunSingleResult & {
|
|
645
685
|
testCaseId: string;
|
|
646
686
|
agentConfigId: string;
|
|
687
|
+
adapterName?: string;
|
|
647
688
|
status: "running" | "completed" | "failed";
|
|
648
689
|
}) => void;
|
|
649
690
|
}
|
package/dist/index.js
CHANGED
|
@@ -79,6 +79,7 @@ var AgentConfigSchema = z.object({
|
|
|
79
79
|
id: z.string().optional(),
|
|
80
80
|
name: z.string(),
|
|
81
81
|
model: z.string(),
|
|
82
|
+
provider: z.string().optional(),
|
|
82
83
|
max_steps: z.number().default(30),
|
|
83
84
|
temperature: z.number().optional(),
|
|
84
85
|
system_prompt: z.string().optional(),
|
|
@@ -105,8 +106,12 @@ var RunSchema = z.object({
|
|
|
105
106
|
error: z.string().optional(),
|
|
106
107
|
finalDiff: z.string().optional(),
|
|
107
108
|
// extended scoring metrics: regression (FAIL_TO_PASS/PASS_TO_PASS), diff stats,
|
|
108
|
-
// localization precision/recall,
|
|
109
|
+
// localization precision/recall, additive quality scorers (keyed by scorer
|
|
110
|
+
// name, ScorerResult-shaped), etc. Stored as JSON.
|
|
109
111
|
metrics: z.record(z.any()).optional(),
|
|
112
|
+
// links this run back to the optimizer matrix run that generated its
|
|
113
|
+
// agentConfig, if any (see @agentgrader/optimizer).
|
|
114
|
+
matrixId: z.string().optional(),
|
|
110
115
|
createdAt: z.number(),
|
|
111
116
|
completedAt: z.number().optional()
|
|
112
117
|
});
|
|
@@ -575,7 +580,7 @@ function buildSkillsPromptAddendum(skills) {
|
|
|
575
580
|
|
|
576
581
|
// src/runner/run-single.ts
|
|
577
582
|
async function runSingle(input) {
|
|
578
|
-
const { testCase, agentConfig, adapter, sandboxProvider, db, runId } = input;
|
|
583
|
+
const { testCase, agentConfig, adapter, sandboxProvider, db, runId, extraScorers, matrixId } = input;
|
|
579
584
|
const startTime = Date.now();
|
|
580
585
|
let sandbox = null;
|
|
581
586
|
let passed = false;
|
|
@@ -598,6 +603,7 @@ async function runSingle(input) {
|
|
|
598
603
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
599
604
|
sandboxProvider: sandboxProvider.name,
|
|
600
605
|
status: "running",
|
|
606
|
+
matrixId,
|
|
601
607
|
createdAt: Math.floor(startTime / 1e3)
|
|
602
608
|
});
|
|
603
609
|
}
|
|
@@ -701,6 +707,21 @@ ${addendum}` : addendum
|
|
|
701
707
|
}),
|
|
702
708
|
execute: async () => {
|
|
703
709
|
if (!sandbox) throw new Error("Sandbox not initialized");
|
|
710
|
+
if (extraScorers && extraScorers.length > 0 && agentResult) {
|
|
711
|
+
const trace2 = { runId, steps: emittedSteps };
|
|
712
|
+
for (const scorer of extraScorers) {
|
|
713
|
+
try {
|
|
714
|
+
metrics[scorer.name] = await scorer.score({
|
|
715
|
+
testCase,
|
|
716
|
+
result: agentResult,
|
|
717
|
+
trace: trace2,
|
|
718
|
+
sandbox
|
|
719
|
+
});
|
|
720
|
+
} catch (e) {
|
|
721
|
+
metrics[scorer.name] = { passed: true, detail: `Scorer error: ${e.message}` };
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
}
|
|
704
725
|
const cmdScorer = new CommandScorer();
|
|
705
726
|
const cmdResult = await cmdScorer.score({
|
|
706
727
|
testCase,
|
|
@@ -790,10 +811,17 @@ ${addendum}` : addendum
|
|
|
790
811
|
inputData: {},
|
|
791
812
|
initialState: runState
|
|
792
813
|
});
|
|
793
|
-
const
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
814
|
+
const scoreStep2 = res.steps?.score;
|
|
815
|
+
if (res.status === "success" && scoreStep2?.status === "success") {
|
|
816
|
+
const scoreOutput = scoreStep2.output;
|
|
817
|
+
passed = scoreOutput?.passed ?? false;
|
|
818
|
+
score = scoreOutput?.score ?? 0;
|
|
819
|
+
errorMsg = scoreOutput?.passed ? void 0 : scoreOutput?.detail;
|
|
820
|
+
} else {
|
|
821
|
+
passed = false;
|
|
822
|
+
score = 0;
|
|
823
|
+
errorMsg = res.error?.message ?? "Workflow did not complete successfully";
|
|
824
|
+
}
|
|
797
825
|
} catch (err) {
|
|
798
826
|
errorMsg = err.message || "Unknown execution error";
|
|
799
827
|
passed = false;
|
|
@@ -836,7 +864,11 @@ ${addendum}` : addendum
|
|
|
836
864
|
};
|
|
837
865
|
}
|
|
838
866
|
async function runBenchmark(input) {
|
|
839
|
-
const { testCases, agentConfigs, adapter, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
|
|
867
|
+
const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, extraScorers, matrixId, onRunUpdate } = input;
|
|
868
|
+
const actualAdapters = adapters || (adapter ? [adapter] : []);
|
|
869
|
+
if (actualAdapters.length === 0) {
|
|
870
|
+
throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
|
|
871
|
+
}
|
|
840
872
|
const generateCombinationsStep = createStep({
|
|
841
873
|
id: "generateCombinations",
|
|
842
874
|
inputSchema: z.any(),
|
|
@@ -846,10 +878,13 @@ async function runBenchmark(input) {
|
|
|
846
878
|
const combinations = [];
|
|
847
879
|
for (const tc of initData.testCases) {
|
|
848
880
|
for (const config of initData.agentConfigs) {
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
881
|
+
for (const adapterName of initData.adapterNames) {
|
|
882
|
+
combinations.push({
|
|
883
|
+
testCase: tc,
|
|
884
|
+
agentConfig: config,
|
|
885
|
+
adapterName
|
|
886
|
+
});
|
|
887
|
+
}
|
|
853
888
|
}
|
|
854
889
|
}
|
|
855
890
|
return combinations;
|
|
@@ -860,7 +895,7 @@ async function runBenchmark(input) {
|
|
|
860
895
|
inputSchema: z.any(),
|
|
861
896
|
outputSchema: z.any(),
|
|
862
897
|
execute: async ({ inputData, requestContext }) => {
|
|
863
|
-
const { testCase, agentConfig } = inputData;
|
|
898
|
+
const { testCase, agentConfig, adapterName } = inputData;
|
|
864
899
|
const ctx = requestContext?.context || requestContext;
|
|
865
900
|
const getVal = (key) => {
|
|
866
901
|
if (ctx instanceof Map) return ctx.get(key);
|
|
@@ -868,16 +903,23 @@ async function runBenchmark(input) {
|
|
|
868
903
|
if (typeof ctx?.get === "function") return ctx.get(key);
|
|
869
904
|
return void 0;
|
|
870
905
|
};
|
|
871
|
-
const adapter2 = getVal("adapter");
|
|
906
|
+
const adaptersFromCtx = getVal("adapters");
|
|
907
|
+
const singleAdapter = getVal("adapter");
|
|
908
|
+
const adapterList = adaptersFromCtx || (singleAdapter ? [singleAdapter] : []);
|
|
909
|
+
const adapter2 = adapterList.find((a) => a.name === adapterName);
|
|
910
|
+
if (!adapter2) throw new Error(`Adapter ${adapterName} not found in execution context`);
|
|
872
911
|
const sandboxProvider2 = getVal("sandboxProvider");
|
|
873
912
|
const db2 = getVal("db");
|
|
874
913
|
const onRunUpdate2 = getVal("onRunUpdate");
|
|
914
|
+
const extraScorers2 = getVal("extraScorers");
|
|
915
|
+
const matrixId2 = getVal("matrixId");
|
|
875
916
|
const runId = randomUUID();
|
|
876
917
|
if (onRunUpdate2) {
|
|
877
918
|
onRunUpdate2({
|
|
878
919
|
runId,
|
|
879
920
|
testCaseId: testCase.id || testCase.name,
|
|
880
921
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
922
|
+
adapterName: adapter2.name,
|
|
881
923
|
status: "running",
|
|
882
924
|
passed: false,
|
|
883
925
|
stepsCount: 0,
|
|
@@ -894,13 +936,16 @@ async function runBenchmark(input) {
|
|
|
894
936
|
adapter: adapter2,
|
|
895
937
|
sandboxProvider: sandboxProvider2,
|
|
896
938
|
db: db2,
|
|
897
|
-
runId
|
|
939
|
+
runId,
|
|
940
|
+
extraScorers: extraScorers2,
|
|
941
|
+
matrixId: matrixId2
|
|
898
942
|
});
|
|
899
943
|
if (onRunUpdate2) {
|
|
900
944
|
onRunUpdate2({
|
|
901
945
|
...res2,
|
|
902
946
|
testCaseId: testCase.id || testCase.name,
|
|
903
947
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
948
|
+
adapterName: adapter2.name,
|
|
904
949
|
status: res2.error ? "failed" : "completed"
|
|
905
950
|
});
|
|
906
951
|
}
|
|
@@ -921,6 +966,7 @@ async function runBenchmark(input) {
|
|
|
921
966
|
...failedResult,
|
|
922
967
|
testCaseId: testCase.id || testCase.name,
|
|
923
968
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
969
|
+
adapterName: adapter2.name,
|
|
924
970
|
status: "failed"
|
|
925
971
|
});
|
|
926
972
|
}
|
|
@@ -936,17 +982,20 @@ async function runBenchmark(input) {
|
|
|
936
982
|
const runState = {};
|
|
937
983
|
const executionContext = /* @__PURE__ */ new Map([
|
|
938
984
|
["adapter", adapter],
|
|
985
|
+
["adapters", actualAdapters],
|
|
939
986
|
["sandboxProvider", sandboxProvider],
|
|
940
987
|
["db", db],
|
|
941
|
-
["onRunUpdate", onRunUpdate]
|
|
988
|
+
["onRunUpdate", onRunUpdate],
|
|
989
|
+
["extraScorers", extraScorers],
|
|
990
|
+
["matrixId", matrixId]
|
|
942
991
|
]);
|
|
943
992
|
const run = await workflow.createRun();
|
|
944
993
|
const res = await run.start({
|
|
945
|
-
inputData: { testCases, agentConfigs },
|
|
994
|
+
inputData: { testCases, agentConfigs, adapterNames: actualAdapters.map((a) => a.name) },
|
|
946
995
|
initialState: runState,
|
|
947
996
|
requestContext: executionContext
|
|
948
997
|
});
|
|
949
|
-
const rawRuns = res.
|
|
998
|
+
const rawRuns = res.steps?.executeSingleRun?.output || [];
|
|
950
999
|
return {
|
|
951
1000
|
runs: Array.isArray(rawRuns) ? rawRuns : []
|
|
952
1001
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agentgrader/core",
|
|
3
|
-
"version": "1.0.1",
|
|
3
|
+
"version": "1.1.2",
|
|
4
4
|
"description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
"dev": "bun run src/index.ts"
|
|
23
23
|
},
|
|
24
24
|
"dependencies": {
|
|
25
|
-
"@agentgrader/store": "^1.0.
|
|
25
|
+
"@agentgrader/store": "^1.0.2",
|
|
26
26
|
"@mastra/core": "^1.41.0",
|
|
27
27
|
"yaml": "^2.5.1",
|
|
28
28
|
"zod": "^3.23.8"
|