@agentgrader/core 1.1.0 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +40 -0
- package/dist/index.js +46 -11
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -131,6 +131,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
131
131
|
id: z.ZodOptional<z.ZodString>;
|
|
132
132
|
name: z.ZodString;
|
|
133
133
|
model: z.ZodString;
|
|
134
|
+
provider: z.ZodOptional<z.ZodString>;
|
|
134
135
|
max_steps: z.ZodDefault<z.ZodNumber>;
|
|
135
136
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
136
137
|
system_prompt: z.ZodOptional<z.ZodString>;
|
|
@@ -167,6 +168,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
167
168
|
max_steps: number;
|
|
168
169
|
id?: string | undefined;
|
|
169
170
|
toolkits?: string[] | undefined;
|
|
171
|
+
provider?: string | undefined;
|
|
170
172
|
temperature?: number | undefined;
|
|
171
173
|
system_prompt?: string | undefined;
|
|
172
174
|
tools?: string[] | undefined;
|
|
@@ -184,6 +186,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
184
186
|
model: string;
|
|
185
187
|
id?: string | undefined;
|
|
186
188
|
toolkits?: string[] | undefined;
|
|
189
|
+
provider?: string | undefined;
|
|
187
190
|
max_steps?: number | undefined;
|
|
188
191
|
temperature?: number | undefined;
|
|
189
192
|
system_prompt?: string | undefined;
|
|
@@ -300,6 +303,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
300
303
|
error: z.ZodOptional<z.ZodString>;
|
|
301
304
|
finalDiff: z.ZodOptional<z.ZodString>;
|
|
302
305
|
metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
|
|
306
|
+
matrixId: z.ZodOptional<z.ZodString>;
|
|
303
307
|
createdAt: z.ZodNumber;
|
|
304
308
|
completedAt: z.ZodOptional<z.ZodNumber>;
|
|
305
309
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -319,6 +323,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
319
323
|
error?: string | undefined;
|
|
320
324
|
finalDiff?: string | undefined;
|
|
321
325
|
metrics?: Record<string, any> | undefined;
|
|
326
|
+
matrixId?: string | undefined;
|
|
322
327
|
completedAt?: number | undefined;
|
|
323
328
|
}, {
|
|
324
329
|
status: "running" | "completed" | "failed";
|
|
@@ -337,6 +342,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
337
342
|
error?: string | undefined;
|
|
338
343
|
finalDiff?: string | undefined;
|
|
339
344
|
metrics?: Record<string, any> | undefined;
|
|
345
|
+
matrixId?: string | undefined;
|
|
340
346
|
completedAt?: number | undefined;
|
|
341
347
|
}>;
|
|
342
348
|
type Run = z.infer<typeof RunSchema>;
|
|
@@ -493,6 +499,26 @@ interface ScorerResult {
|
|
|
493
499
|
passed: boolean;
|
|
494
500
|
detail: string;
|
|
495
501
|
score?: number;
|
|
502
|
+
/**
|
|
503
|
+
* non-functional code-quality signals. populated by additive scorers
|
|
504
|
+
* (e.g. staticqualityscorer, llmjudgescorer) that annotate a run without
|
|
505
|
+
* affecting `passed`/`score`. all fields optional - a scorer only fills
|
|
506
|
+
* in what it actually measured.
|
|
507
|
+
*/
|
|
508
|
+
quality?: {
|
|
509
|
+
/** total +/- lines in the agent's diff */
|
|
510
|
+
diffLines?: number;
|
|
511
|
+
/** number of files touched by the agent's diff */
|
|
512
|
+
filesModified?: number;
|
|
513
|
+
/** TODO/FIXME/HACK/XXX markers introduced by the diff */
|
|
514
|
+
todosIntroduced?: number;
|
|
515
|
+
/** linter (e.g. Biome) error+warning count on the changed files */
|
|
516
|
+
linterViolations?: number;
|
|
517
|
+
/** 0-1 holistic quality score from an LLM judge */
|
|
518
|
+
llmJudgeScore?: number;
|
|
519
|
+
/** prose rationale from an LLM judge */
|
|
520
|
+
llmJudgeDetail?: string;
|
|
521
|
+
};
|
|
496
522
|
}
|
|
497
523
|
interface Scorer {
|
|
498
524
|
readonly name: string;
|
|
@@ -618,6 +644,16 @@ interface RunSingleInput {
|
|
|
618
644
|
sandboxProvider: SandboxProvider;
|
|
619
645
|
db?: AgrDb;
|
|
620
646
|
runId: string;
|
|
647
|
+
/**
|
|
648
|
+
* additive, non-blocking scorers (e.g. staticqualityscorer,
|
|
649
|
+
* llmjudgescorer) run after the core pass/fail scoring. their results
|
|
650
|
+
* never affect `passed`/`score` - each scorer's `scorerresult` is merged
|
|
651
|
+
* into `metrics` under its own `name`.
|
|
652
|
+
*/
|
|
653
|
+
extraScorers?: Scorer[];
|
|
654
|
+
/** links this run to an optimizer matrix run, if any */
|
|
655
|
+
matrixId?: string;
|
|
656
|
+
onStep?: (step: StepEvent) => void;
|
|
621
657
|
}
|
|
622
658
|
interface RunSingleResult {
|
|
623
659
|
runId: string;
|
|
@@ -642,6 +678,10 @@ interface BenchmarkInput {
|
|
|
642
678
|
sandboxProvider: SandboxProvider;
|
|
643
679
|
db?: AgrDb;
|
|
644
680
|
concurrency?: number;
|
|
681
|
+
/** additive, non-blocking quality scorers run for every test case x config combination */
|
|
682
|
+
extraScorers?: Scorer[];
|
|
683
|
+
/** links every run in this benchmark to an optimizer matrix run, if any */
|
|
684
|
+
matrixId?: string;
|
|
645
685
|
onRunUpdate?: (run: RunSingleResult & {
|
|
646
686
|
testCaseId: string;
|
|
647
687
|
agentConfigId: string;
|
package/dist/index.js
CHANGED
|
@@ -79,6 +79,7 @@ var AgentConfigSchema = z.object({
|
|
|
79
79
|
id: z.string().optional(),
|
|
80
80
|
name: z.string(),
|
|
81
81
|
model: z.string(),
|
|
82
|
+
provider: z.string().optional(),
|
|
82
83
|
max_steps: z.number().default(30),
|
|
83
84
|
temperature: z.number().optional(),
|
|
84
85
|
system_prompt: z.string().optional(),
|
|
@@ -105,8 +106,12 @@ var RunSchema = z.object({
|
|
|
105
106
|
error: z.string().optional(),
|
|
106
107
|
finalDiff: z.string().optional(),
|
|
107
108
|
// extended scoring metrics: regression (FAIL_TO_PASS/PASS_TO_PASS), diff stats,
|
|
108
|
-
// localization precision/recall,
|
|
109
|
+
// localization precision/recall, additive quality scorers (keyed by scorer
|
|
110
|
+
// name, ScorerResult-shaped), etc. Stored as JSON.
|
|
109
111
|
metrics: z.record(z.any()).optional(),
|
|
112
|
+
// links this run back to the optimizer matrix run that generated its
|
|
113
|
+
// agentConfig, if any (see @agentgrader/optimizer).
|
|
114
|
+
matrixId: z.string().optional(),
|
|
110
115
|
createdAt: z.number(),
|
|
111
116
|
completedAt: z.number().optional()
|
|
112
117
|
});
|
|
@@ -575,7 +580,7 @@ function buildSkillsPromptAddendum(skills) {
|
|
|
575
580
|
|
|
576
581
|
// src/runner/run-single.ts
|
|
577
582
|
async function runSingle(input) {
|
|
578
|
-
const { testCase, agentConfig, adapter, sandboxProvider, db, runId } = input;
|
|
583
|
+
const { testCase, agentConfig, adapter, sandboxProvider, db, runId, extraScorers, matrixId } = input;
|
|
579
584
|
const startTime = Date.now();
|
|
580
585
|
let sandbox = null;
|
|
581
586
|
let passed = false;
|
|
@@ -598,6 +603,7 @@ async function runSingle(input) {
|
|
|
598
603
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
599
604
|
sandboxProvider: sandboxProvider.name,
|
|
600
605
|
status: "running",
|
|
606
|
+
matrixId,
|
|
601
607
|
createdAt: Math.floor(startTime / 1e3)
|
|
602
608
|
});
|
|
603
609
|
}
|
|
@@ -635,6 +641,7 @@ async function runSingle(input) {
|
|
|
635
641
|
tokensIn += stepEvent.tokensIn || 0;
|
|
636
642
|
tokensOut += stepEvent.tokensOut || 0;
|
|
637
643
|
costUsd += stepEvent.costUsd || 0;
|
|
644
|
+
input.onStep?.(stepEvent);
|
|
638
645
|
if (db) {
|
|
639
646
|
addTrace(db, {
|
|
640
647
|
runId,
|
|
@@ -701,6 +708,21 @@ ${addendum}` : addendum
|
|
|
701
708
|
}),
|
|
702
709
|
execute: async () => {
|
|
703
710
|
if (!sandbox) throw new Error("Sandbox not initialized");
|
|
711
|
+
if (extraScorers && extraScorers.length > 0 && agentResult) {
|
|
712
|
+
const trace2 = { runId, steps: emittedSteps };
|
|
713
|
+
for (const scorer of extraScorers) {
|
|
714
|
+
try {
|
|
715
|
+
metrics[scorer.name] = await scorer.score({
|
|
716
|
+
testCase,
|
|
717
|
+
result: agentResult,
|
|
718
|
+
trace: trace2,
|
|
719
|
+
sandbox
|
|
720
|
+
});
|
|
721
|
+
} catch (e) {
|
|
722
|
+
metrics[scorer.name] = { passed: true, detail: `Scorer error: ${e.message}` };
|
|
723
|
+
}
|
|
724
|
+
}
|
|
725
|
+
}
|
|
704
726
|
const cmdScorer = new CommandScorer();
|
|
705
727
|
const cmdResult = await cmdScorer.score({
|
|
706
728
|
testCase,
|
|
@@ -790,10 +812,17 @@ ${addendum}` : addendum
|
|
|
790
812
|
inputData: {},
|
|
791
813
|
initialState: runState
|
|
792
814
|
});
|
|
793
|
-
const
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
815
|
+
const scoreStep2 = res.steps?.score;
|
|
816
|
+
if (res.status === "success" && scoreStep2?.status === "success") {
|
|
817
|
+
const scoreOutput = scoreStep2.output;
|
|
818
|
+
passed = scoreOutput?.passed ?? false;
|
|
819
|
+
score = scoreOutput?.score ?? 0;
|
|
820
|
+
errorMsg = scoreOutput?.passed ? void 0 : scoreOutput?.detail;
|
|
821
|
+
} else {
|
|
822
|
+
passed = false;
|
|
823
|
+
score = 0;
|
|
824
|
+
errorMsg = res.error?.message ?? "Workflow did not complete successfully";
|
|
825
|
+
}
|
|
797
826
|
} catch (err) {
|
|
798
827
|
errorMsg = err.message || "Unknown execution error";
|
|
799
828
|
passed = false;
|
|
@@ -836,7 +865,7 @@ ${addendum}` : addendum
|
|
|
836
865
|
};
|
|
837
866
|
}
|
|
838
867
|
async function runBenchmark(input) {
|
|
839
|
-
const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
|
|
868
|
+
const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, extraScorers, matrixId, onRunUpdate } = input;
|
|
840
869
|
const actualAdapters = adapters || (adapter ? [adapter] : []);
|
|
841
870
|
if (actualAdapters.length === 0) {
|
|
842
871
|
throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
|
|
@@ -883,6 +912,8 @@ async function runBenchmark(input) {
|
|
|
883
912
|
const sandboxProvider2 = getVal("sandboxProvider");
|
|
884
913
|
const db2 = getVal("db");
|
|
885
914
|
const onRunUpdate2 = getVal("onRunUpdate");
|
|
915
|
+
const extraScorers2 = getVal("extraScorers");
|
|
916
|
+
const matrixId2 = getVal("matrixId");
|
|
886
917
|
const runId = randomUUID();
|
|
887
918
|
if (onRunUpdate2) {
|
|
888
919
|
onRunUpdate2({
|
|
@@ -906,7 +937,9 @@ async function runBenchmark(input) {
|
|
|
906
937
|
adapter: adapter2,
|
|
907
938
|
sandboxProvider: sandboxProvider2,
|
|
908
939
|
db: db2,
|
|
909
|
-
runId
|
|
940
|
+
runId,
|
|
941
|
+
extraScorers: extraScorers2,
|
|
942
|
+
matrixId: matrixId2
|
|
910
943
|
});
|
|
911
944
|
if (onRunUpdate2) {
|
|
912
945
|
onRunUpdate2({
|
|
@@ -953,7 +986,9 @@ async function runBenchmark(input) {
|
|
|
953
986
|
["adapters", actualAdapters],
|
|
954
987
|
["sandboxProvider", sandboxProvider],
|
|
955
988
|
["db", db],
|
|
956
|
-
["onRunUpdate", onRunUpdate]
|
|
989
|
+
["onRunUpdate", onRunUpdate],
|
|
990
|
+
["extraScorers", extraScorers],
|
|
991
|
+
["matrixId", matrixId]
|
|
957
992
|
]);
|
|
958
993
|
const run = await workflow.createRun();
|
|
959
994
|
const res = await run.start({
|
|
@@ -961,7 +996,7 @@ async function runBenchmark(input) {
|
|
|
961
996
|
initialState: runState,
|
|
962
997
|
requestContext: executionContext
|
|
963
998
|
});
|
|
964
|
-
const rawRuns = res.
|
|
999
|
+
const rawRuns = res.steps?.executeSingleRun?.output || [];
|
|
965
1000
|
return {
|
|
966
1001
|
runs: Array.isArray(rawRuns) ? rawRuns : []
|
|
967
1002
|
};
|
|
@@ -974,7 +1009,7 @@ async function validateTestCase(input) {
|
|
|
974
1009
|
checks.push(...checkStaticFields(testCase));
|
|
975
1010
|
if (!testCase.test_command) {
|
|
976
1011
|
checks.push({
|
|
977
|
-
name: "execution-checks",
|
|
1012
|
+
name: "execution-checks (skipped - no test_command)",
|
|
978
1013
|
passed: true,
|
|
979
1014
|
detail: "No test_command configured; skipping pre/post-patch execution checks."
|
|
980
1015
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agentgrader/core",
|
|
3
|
-
"version": "1.1.0",
|
|
3
|
+
"version": "1.1.3",
|
|
4
4
|
"description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
"dev": "bun run src/index.ts"
|
|
23
23
|
},
|
|
24
24
|
"dependencies": {
|
|
25
|
-
"@agentgrader/store": "^1.0.
|
|
25
|
+
"@agentgrader/store": "^1.0.3",
|
|
26
26
|
"@mastra/core": "^1.41.0",
|
|
27
27
|
"yaml": "^2.5.1",
|
|
28
28
|
"zod": "^3.23.8"
|