@agentgrader/core 1.0.1 → 1.1.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -131,6 +131,7 @@ declare const AgentConfigSchema: z.ZodObject<{
131
131
  id: z.ZodOptional<z.ZodString>;
132
132
  name: z.ZodString;
133
133
  model: z.ZodString;
134
+ provider: z.ZodOptional<z.ZodString>;
134
135
  max_steps: z.ZodDefault<z.ZodNumber>;
135
136
  temperature: z.ZodOptional<z.ZodNumber>;
136
137
  system_prompt: z.ZodOptional<z.ZodString>;
@@ -167,6 +168,7 @@ declare const AgentConfigSchema: z.ZodObject<{
167
168
  max_steps: number;
168
169
  id?: string | undefined;
169
170
  toolkits?: string[] | undefined;
171
+ provider?: string | undefined;
170
172
  temperature?: number | undefined;
171
173
  system_prompt?: string | undefined;
172
174
  tools?: string[] | undefined;
@@ -184,6 +186,7 @@ declare const AgentConfigSchema: z.ZodObject<{
184
186
  model: string;
185
187
  id?: string | undefined;
186
188
  toolkits?: string[] | undefined;
189
+ provider?: string | undefined;
187
190
  max_steps?: number | undefined;
188
191
  temperature?: number | undefined;
189
192
  system_prompt?: string | undefined;
@@ -300,6 +303,7 @@ declare const RunSchema: z.ZodObject<{
300
303
  error: z.ZodOptional<z.ZodString>;
301
304
  finalDiff: z.ZodOptional<z.ZodString>;
302
305
  metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
306
+ matrixId: z.ZodOptional<z.ZodString>;
303
307
  createdAt: z.ZodNumber;
304
308
  completedAt: z.ZodOptional<z.ZodNumber>;
305
309
  }, "strip", z.ZodTypeAny, {
@@ -319,6 +323,7 @@ declare const RunSchema: z.ZodObject<{
319
323
  error?: string | undefined;
320
324
  finalDiff?: string | undefined;
321
325
  metrics?: Record<string, any> | undefined;
326
+ matrixId?: string | undefined;
322
327
  completedAt?: number | undefined;
323
328
  }, {
324
329
  status: "running" | "completed" | "failed";
@@ -337,6 +342,7 @@ declare const RunSchema: z.ZodObject<{
337
342
  error?: string | undefined;
338
343
  finalDiff?: string | undefined;
339
344
  metrics?: Record<string, any> | undefined;
345
+ matrixId?: string | undefined;
340
346
  completedAt?: number | undefined;
341
347
  }>;
342
348
  type Run = z.infer<typeof RunSchema>;
@@ -493,6 +499,26 @@ interface ScorerResult {
493
499
  passed: boolean;
494
500
  detail: string;
495
501
  score?: number;
502
+ /**
503
+ * non-functional code-quality signals. populated by additive scorers
504
+ * (e.g. staticqualityscorer, llmjudgescorer) that annotate a run without
505
+ * affecting `passed`/`score`. all fields optional - a scorer only fills
506
+ * in what it actually measured.
507
+ */
508
+ quality?: {
509
+ /** total +/- lines in the agent's diff */
510
+ diffLines?: number;
511
+ /** number of files touched by the agent's diff */
512
+ filesModified?: number;
513
+ /** TODO/FIXME/HACK/XXX markers introduced by the diff */
514
+ todosIntroduced?: number;
515
+ /** linter (e.g. Biome) error+warning count on the changed files */
516
+ linterViolations?: number;
517
+ /** 0-1 holistic quality score from an LLM judge */
518
+ llmJudgeScore?: number;
519
+ /** prose rationale from an LLM judge */
520
+ llmJudgeDetail?: string;
521
+ };
496
522
  }
497
523
  interface Scorer {
498
524
  readonly name: string;
@@ -618,6 +644,15 @@ interface RunSingleInput {
618
644
  sandboxProvider: SandboxProvider;
619
645
  db?: AgrDb;
620
646
  runId: string;
647
+ /**
648
+ * additive, non-blocking scorers (e.g. staticqualityscorer,
649
+ * llmjudgescorer) run after the core pass/fail scoring. their results
650
+ * never affect `passed`/`score` - each scorer's `scorerresult` is merged
651
+ * into `metrics` under its own `name`.
652
+ */
653
+ extraScorers?: Scorer[];
654
+ /** links this run to an optimizer matrix run, if any */
655
+ matrixId?: string;
621
656
  }
622
657
  interface RunSingleResult {
623
658
  runId: string;
@@ -637,13 +672,19 @@ declare function runSingle(input: RunSingleInput): Promise<RunSingleResult>;
637
672
  interface BenchmarkInput {
638
673
  testCases: TestCase[];
639
674
  agentConfigs: AgentConfig[];
640
- adapter: AgentAdapter;
675
+ adapter?: AgentAdapter;
676
+ adapters?: AgentAdapter[];
641
677
  sandboxProvider: SandboxProvider;
642
678
  db?: AgrDb;
643
679
  concurrency?: number;
680
+ /** additive, non-blocking quality scorers run for every test case x config combination */
681
+ extraScorers?: Scorer[];
682
+ /** links every run in this benchmark to an optimizer matrix run, if any */
683
+ matrixId?: string;
644
684
  onRunUpdate?: (run: RunSingleResult & {
645
685
  testCaseId: string;
646
686
  agentConfigId: string;
687
+ adapterName?: string;
647
688
  status: "running" | "completed" | "failed";
648
689
  }) => void;
649
690
  }
package/dist/index.js CHANGED
@@ -79,6 +79,7 @@ var AgentConfigSchema = z.object({
79
79
  id: z.string().optional(),
80
80
  name: z.string(),
81
81
  model: z.string(),
82
+ provider: z.string().optional(),
82
83
  max_steps: z.number().default(30),
83
84
  temperature: z.number().optional(),
84
85
  system_prompt: z.string().optional(),
@@ -105,8 +106,12 @@ var RunSchema = z.object({
105
106
  error: z.string().optional(),
106
107
  finalDiff: z.string().optional(),
107
108
  // extended scoring metrics: regression (FAIL_TO_PASS/PASS_TO_PASS), diff stats,
108
- // localization precision/recall, etc. Stored as JSON.
109
+ // localization precision/recall, additive quality scorers (keyed by scorer
110
+ // name, ScorerResult-shaped), etc. Stored as JSON.
109
111
  metrics: z.record(z.any()).optional(),
112
+ // links this run back to the optimizer matrix run that generated its
113
+ // agentConfig, if any (see @agentgrader/optimizer).
114
+ matrixId: z.string().optional(),
110
115
  createdAt: z.number(),
111
116
  completedAt: z.number().optional()
112
117
  });
@@ -575,7 +580,7 @@ function buildSkillsPromptAddendum(skills) {
575
580
 
576
581
  // src/runner/run-single.ts
577
582
  async function runSingle(input) {
578
- const { testCase, agentConfig, adapter, sandboxProvider, db, runId } = input;
583
+ const { testCase, agentConfig, adapter, sandboxProvider, db, runId, extraScorers, matrixId } = input;
579
584
  const startTime = Date.now();
580
585
  let sandbox = null;
581
586
  let passed = false;
@@ -598,6 +603,7 @@ async function runSingle(input) {
598
603
  agentConfigId: agentConfig.id || agentConfig.name,
599
604
  sandboxProvider: sandboxProvider.name,
600
605
  status: "running",
606
+ matrixId,
601
607
  createdAt: Math.floor(startTime / 1e3)
602
608
  });
603
609
  }
@@ -701,6 +707,21 @@ ${addendum}` : addendum
701
707
  }),
702
708
  execute: async () => {
703
709
  if (!sandbox) throw new Error("Sandbox not initialized");
710
+ if (extraScorers && extraScorers.length > 0 && agentResult) {
711
+ const trace2 = { runId, steps: emittedSteps };
712
+ for (const scorer of extraScorers) {
713
+ try {
714
+ metrics[scorer.name] = await scorer.score({
715
+ testCase,
716
+ result: agentResult,
717
+ trace: trace2,
718
+ sandbox
719
+ });
720
+ } catch (e) {
721
+ metrics[scorer.name] = { passed: true, detail: `Scorer error: ${e.message}` };
722
+ }
723
+ }
724
+ }
704
725
  const cmdScorer = new CommandScorer();
705
726
  const cmdResult = await cmdScorer.score({
706
727
  testCase,
@@ -790,10 +811,17 @@ ${addendum}` : addendum
790
811
  inputData: {},
791
812
  initialState: runState
792
813
  });
793
- const scoreResults = res.results?.score;
794
- passed = scoreResults?.passed ?? false;
795
- score = scoreResults?.score ?? 0;
796
- errorMsg = scoreResults?.passed ? void 0 : scoreResults?.detail;
814
+ const scoreStep2 = res.steps?.score;
815
+ if (res.status === "success" && scoreStep2?.status === "success") {
816
+ const scoreOutput = scoreStep2.output;
817
+ passed = scoreOutput?.passed ?? false;
818
+ score = scoreOutput?.score ?? 0;
819
+ errorMsg = scoreOutput?.passed ? void 0 : scoreOutput?.detail;
820
+ } else {
821
+ passed = false;
822
+ score = 0;
823
+ errorMsg = res.error?.message ?? "Workflow did not complete successfully";
824
+ }
797
825
  } catch (err) {
798
826
  errorMsg = err.message || "Unknown execution error";
799
827
  passed = false;
@@ -836,7 +864,11 @@ ${addendum}` : addendum
836
864
  };
837
865
  }
838
866
  async function runBenchmark(input) {
839
- const { testCases, agentConfigs, adapter, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
867
+ const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, extraScorers, matrixId, onRunUpdate } = input;
868
+ const actualAdapters = adapters || (adapter ? [adapter] : []);
869
+ if (actualAdapters.length === 0) {
870
+ throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
871
+ }
840
872
  const generateCombinationsStep = createStep({
841
873
  id: "generateCombinations",
842
874
  inputSchema: z.any(),
@@ -846,10 +878,13 @@ async function runBenchmark(input) {
846
878
  const combinations = [];
847
879
  for (const tc of initData.testCases) {
848
880
  for (const config of initData.agentConfigs) {
849
- combinations.push({
850
- testCase: tc,
851
- agentConfig: config
852
- });
881
+ for (const adapterName of initData.adapterNames) {
882
+ combinations.push({
883
+ testCase: tc,
884
+ agentConfig: config,
885
+ adapterName
886
+ });
887
+ }
853
888
  }
854
889
  }
855
890
  return combinations;
@@ -860,7 +895,7 @@ async function runBenchmark(input) {
860
895
  inputSchema: z.any(),
861
896
  outputSchema: z.any(),
862
897
  execute: async ({ inputData, requestContext }) => {
863
- const { testCase, agentConfig } = inputData;
898
+ const { testCase, agentConfig, adapterName } = inputData;
864
899
  const ctx = requestContext?.context || requestContext;
865
900
  const getVal = (key) => {
866
901
  if (ctx instanceof Map) return ctx.get(key);
@@ -868,16 +903,23 @@ async function runBenchmark(input) {
868
903
  if (typeof ctx?.get === "function") return ctx.get(key);
869
904
  return void 0;
870
905
  };
871
- const adapter2 = getVal("adapter");
906
+ const adaptersFromCtx = getVal("adapters");
907
+ const singleAdapter = getVal("adapter");
908
+ const adapterList = adaptersFromCtx || (singleAdapter ? [singleAdapter] : []);
909
+ const adapter2 = adapterList.find((a) => a.name === adapterName);
910
+ if (!adapter2) throw new Error(`Adapter ${adapterName} not found in execution context`);
872
911
  const sandboxProvider2 = getVal("sandboxProvider");
873
912
  const db2 = getVal("db");
874
913
  const onRunUpdate2 = getVal("onRunUpdate");
914
+ const extraScorers2 = getVal("extraScorers");
915
+ const matrixId2 = getVal("matrixId");
875
916
  const runId = randomUUID();
876
917
  if (onRunUpdate2) {
877
918
  onRunUpdate2({
878
919
  runId,
879
920
  testCaseId: testCase.id || testCase.name,
880
921
  agentConfigId: agentConfig.id || agentConfig.name,
922
+ adapterName: adapter2.name,
881
923
  status: "running",
882
924
  passed: false,
883
925
  stepsCount: 0,
@@ -894,13 +936,16 @@ async function runBenchmark(input) {
894
936
  adapter: adapter2,
895
937
  sandboxProvider: sandboxProvider2,
896
938
  db: db2,
897
- runId
939
+ runId,
940
+ extraScorers: extraScorers2,
941
+ matrixId: matrixId2
898
942
  });
899
943
  if (onRunUpdate2) {
900
944
  onRunUpdate2({
901
945
  ...res2,
902
946
  testCaseId: testCase.id || testCase.name,
903
947
  agentConfigId: agentConfig.id || agentConfig.name,
948
+ adapterName: adapter2.name,
904
949
  status: res2.error ? "failed" : "completed"
905
950
  });
906
951
  }
@@ -921,6 +966,7 @@ async function runBenchmark(input) {
921
966
  ...failedResult,
922
967
  testCaseId: testCase.id || testCase.name,
923
968
  agentConfigId: agentConfig.id || agentConfig.name,
969
+ adapterName: adapter2.name,
924
970
  status: "failed"
925
971
  });
926
972
  }
@@ -936,17 +982,20 @@ async function runBenchmark(input) {
936
982
  const runState = {};
937
983
  const executionContext = /* @__PURE__ */ new Map([
938
984
  ["adapter", adapter],
985
+ ["adapters", actualAdapters],
939
986
  ["sandboxProvider", sandboxProvider],
940
987
  ["db", db],
941
- ["onRunUpdate", onRunUpdate]
988
+ ["onRunUpdate", onRunUpdate],
989
+ ["extraScorers", extraScorers],
990
+ ["matrixId", matrixId]
942
991
  ]);
943
992
  const run = await workflow.createRun();
944
993
  const res = await run.start({
945
- inputData: { testCases, agentConfigs },
994
+ inputData: { testCases, agentConfigs, adapterNames: actualAdapters.map((a) => a.name) },
946
995
  initialState: runState,
947
996
  requestContext: executionContext
948
997
  });
949
- const rawRuns = res.results?.executeSingleRunStepResult || [];
998
+ const rawRuns = res.steps?.executeSingleRun?.output || [];
950
999
  return {
951
1000
  runs: Array.isArray(rawRuns) ? rawRuns : []
952
1001
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agentgrader/core",
3
- "version": "1.0.1",
3
+ "version": "1.1.2",
4
4
  "description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -22,7 +22,7 @@
22
22
  "dev": "bun run src/index.ts"
23
23
  },
24
24
  "dependencies": {
25
- "@agentgrader/store": "^1.0.1",
25
+ "@agentgrader/store": "^1.0.2",
26
26
  "@mastra/core": "^1.41.0",
27
27
  "yaml": "^2.5.1",
28
28
  "zod": "^3.23.8"