@agentgrader/core 1.1.0 → 1.1.3

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
package/dist/index.d.ts CHANGED
@@ -131,6 +131,7 @@ declare const AgentConfigSchema: z.ZodObject<{
131
131
  id: z.ZodOptional<z.ZodString>;
132
132
  name: z.ZodString;
133
133
  model: z.ZodString;
134
+ provider: z.ZodOptional<z.ZodString>;
134
135
  max_steps: z.ZodDefault<z.ZodNumber>;
135
136
  temperature: z.ZodOptional<z.ZodNumber>;
136
137
  system_prompt: z.ZodOptional<z.ZodString>;
@@ -167,6 +168,7 @@ declare const AgentConfigSchema: z.ZodObject<{
167
168
  max_steps: number;
168
169
  id?: string | undefined;
169
170
  toolkits?: string[] | undefined;
171
+ provider?: string | undefined;
170
172
  temperature?: number | undefined;
171
173
  system_prompt?: string | undefined;
172
174
  tools?: string[] | undefined;
@@ -184,6 +186,7 @@ declare const AgentConfigSchema: z.ZodObject<{
184
186
  model: string;
185
187
  id?: string | undefined;
186
188
  toolkits?: string[] | undefined;
189
+ provider?: string | undefined;
187
190
  max_steps?: number | undefined;
188
191
  temperature?: number | undefined;
189
192
  system_prompt?: string | undefined;
@@ -300,6 +303,7 @@ declare const RunSchema: z.ZodObject<{
300
303
  error: z.ZodOptional<z.ZodString>;
301
304
  finalDiff: z.ZodOptional<z.ZodString>;
302
305
  metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
306
+ matrixId: z.ZodOptional<z.ZodString>;
303
307
  createdAt: z.ZodNumber;
304
308
  completedAt: z.ZodOptional<z.ZodNumber>;
305
309
  }, "strip", z.ZodTypeAny, {
@@ -319,6 +323,7 @@ declare const RunSchema: z.ZodObject<{
319
323
  error?: string | undefined;
320
324
  finalDiff?: string | undefined;
321
325
  metrics?: Record<string, any> | undefined;
326
+ matrixId?: string | undefined;
322
327
  completedAt?: number | undefined;
323
328
  }, {
324
329
  status: "running" | "completed" | "failed";
@@ -337,6 +342,7 @@ declare const RunSchema: z.ZodObject<{
337
342
  error?: string | undefined;
338
343
  finalDiff?: string | undefined;
339
344
  metrics?: Record<string, any> | undefined;
345
+ matrixId?: string | undefined;
340
346
  completedAt?: number | undefined;
341
347
  }>;
342
348
  type Run = z.infer<typeof RunSchema>;
@@ -493,6 +499,26 @@ interface ScorerResult {
493
499
  passed: boolean;
494
500
  detail: string;
495
501
  score?: number;
502
+ /**
503
+ * non-functional code-quality signals. populated by additive scorers
504
+ * (e.g. staticqualityscorer, llmjudgescorer) that annotate a run without
505
+ * affecting `passed`/`score`. all fields optional - a scorer only fills
506
+ * in what it actually measured.
507
+ */
508
+ quality?: {
509
+ /** total +/- lines in the agent's diff */
510
+ diffLines?: number;
511
+ /** number of files touched by the agent's diff */
512
+ filesModified?: number;
513
+ /** TODO/FIXME/HACK/XXX markers introduced by the diff */
514
+ todosIntroduced?: number;
515
+ /** linter (e.g. Biome) error+warning count on the changed files */
516
+ linterViolations?: number;
517
+ /** 0-1 holistic quality score from an LLM judge */
518
+ llmJudgeScore?: number;
519
+ /** prose rationale from an LLM judge */
520
+ llmJudgeDetail?: string;
521
+ };
496
522
  }
497
523
  interface Scorer {
498
524
  readonly name: string;
@@ -618,6 +644,16 @@ interface RunSingleInput {
618
644
  sandboxProvider: SandboxProvider;
619
645
  db?: AgrDb;
620
646
  runId: string;
647
+ /**
648
+ * additive, non-blocking scorers (e.g. staticqualityscorer,
649
+ * llmjudgescorer) run after the core pass/fail scoring. their results
650
+ * never affect `passed`/`score` - each scorer's `ScorerResult` is merged
651
+ * into `metrics` under its own `name`.
652
+ */
653
+ extraScorers?: Scorer[];
654
+ /** links this run to an optimizer matrix run, if any */
655
+ matrixId?: string;
656
+ onStep?: (step: StepEvent) => void;
621
657
  }
622
658
  interface RunSingleResult {
623
659
  runId: string;
@@ -642,6 +678,10 @@ interface BenchmarkInput {
642
678
  sandboxProvider: SandboxProvider;
643
679
  db?: AgrDb;
644
680
  concurrency?: number;
681
+ /** additive, non-blocking quality scorers run for every test case x config combination */
682
+ extraScorers?: Scorer[];
683
+ /** links every run in this benchmark to an optimizer matrix run, if any */
684
+ matrixId?: string;
645
685
  onRunUpdate?: (run: RunSingleResult & {
646
686
  testCaseId: string;
647
687
  agentConfigId: string;
package/dist/index.js CHANGED
@@ -79,6 +79,7 @@ var AgentConfigSchema = z.object({
79
79
  id: z.string().optional(),
80
80
  name: z.string(),
81
81
  model: z.string(),
82
+ provider: z.string().optional(),
82
83
  max_steps: z.number().default(30),
83
84
  temperature: z.number().optional(),
84
85
  system_prompt: z.string().optional(),
@@ -105,8 +106,12 @@ var RunSchema = z.object({
105
106
  error: z.string().optional(),
106
107
  finalDiff: z.string().optional(),
107
108
  // extended scoring metrics: regression (FAIL_TO_PASS/PASS_TO_PASS), diff stats,
108
- // localization precision/recall, etc. Stored as JSON.
109
+ // localization precision/recall, additive quality scorers (keyed by scorer
110
+ // name, ScorerResult-shaped), etc. Stored as JSON.
109
111
  metrics: z.record(z.any()).optional(),
112
+ // links this run back to the optimizer matrix run that generated its
113
+ // agentConfig, if any (see @agentgrader/optimizer).
114
+ matrixId: z.string().optional(),
110
115
  createdAt: z.number(),
111
116
  completedAt: z.number().optional()
112
117
  });
@@ -575,7 +580,7 @@ function buildSkillsPromptAddendum(skills) {
575
580
 
576
581
  // src/runner/run-single.ts
577
582
  async function runSingle(input) {
578
- const { testCase, agentConfig, adapter, sandboxProvider, db, runId } = input;
583
+ const { testCase, agentConfig, adapter, sandboxProvider, db, runId, extraScorers, matrixId } = input;
579
584
  const startTime = Date.now();
580
585
  let sandbox = null;
581
586
  let passed = false;
@@ -598,6 +603,7 @@ async function runSingle(input) {
598
603
  agentConfigId: agentConfig.id || agentConfig.name,
599
604
  sandboxProvider: sandboxProvider.name,
600
605
  status: "running",
606
+ matrixId,
601
607
  createdAt: Math.floor(startTime / 1e3)
602
608
  });
603
609
  }
@@ -635,6 +641,7 @@ async function runSingle(input) {
635
641
  tokensIn += stepEvent.tokensIn || 0;
636
642
  tokensOut += stepEvent.tokensOut || 0;
637
643
  costUsd += stepEvent.costUsd || 0;
644
+ input.onStep?.(stepEvent);
638
645
  if (db) {
639
646
  addTrace(db, {
640
647
  runId,
@@ -701,6 +708,21 @@ ${addendum}` : addendum
701
708
  }),
702
709
  execute: async () => {
703
710
  if (!sandbox) throw new Error("Sandbox not initialized");
711
+ if (extraScorers && extraScorers.length > 0 && agentResult) {
712
+ const trace2 = { runId, steps: emittedSteps };
713
+ for (const scorer of extraScorers) {
714
+ try {
715
+ metrics[scorer.name] = await scorer.score({
716
+ testCase,
717
+ result: agentResult,
718
+ trace: trace2,
719
+ sandbox
720
+ });
721
+ } catch (e) {
722
+ metrics[scorer.name] = { passed: true, detail: `Scorer error: ${e.message}` };
723
+ }
724
+ }
725
+ }
704
726
  const cmdScorer = new CommandScorer();
705
727
  const cmdResult = await cmdScorer.score({
706
728
  testCase,
@@ -790,10 +812,17 @@ ${addendum}` : addendum
790
812
  inputData: {},
791
813
  initialState: runState
792
814
  });
793
- const scoreResults = res.results?.score;
794
- passed = scoreResults?.passed ?? false;
795
- score = scoreResults?.score ?? 0;
796
- errorMsg = scoreResults?.passed ? void 0 : scoreResults?.detail;
815
+ const scoreStep2 = res.steps?.score;
816
+ if (res.status === "success" && scoreStep2?.status === "success") {
817
+ const scoreOutput = scoreStep2.output;
818
+ passed = scoreOutput?.passed ?? false;
819
+ score = scoreOutput?.score ?? 0;
820
+ errorMsg = scoreOutput?.passed ? void 0 : scoreOutput?.detail;
821
+ } else {
822
+ passed = false;
823
+ score = 0;
824
+ errorMsg = res.error?.message ?? "Workflow did not complete successfully";
825
+ }
797
826
  } catch (err) {
798
827
  errorMsg = err.message || "Unknown execution error";
799
828
  passed = false;
@@ -836,7 +865,7 @@ ${addendum}` : addendum
836
865
  };
837
866
  }
838
867
  async function runBenchmark(input) {
839
- const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
868
+ const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, extraScorers, matrixId, onRunUpdate } = input;
840
869
  const actualAdapters = adapters || (adapter ? [adapter] : []);
841
870
  if (actualAdapters.length === 0) {
842
871
  throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
@@ -883,6 +912,8 @@ async function runBenchmark(input) {
883
912
  const sandboxProvider2 = getVal("sandboxProvider");
884
913
  const db2 = getVal("db");
885
914
  const onRunUpdate2 = getVal("onRunUpdate");
915
+ const extraScorers2 = getVal("extraScorers");
916
+ const matrixId2 = getVal("matrixId");
886
917
  const runId = randomUUID();
887
918
  if (onRunUpdate2) {
888
919
  onRunUpdate2({
@@ -906,7 +937,9 @@ async function runBenchmark(input) {
906
937
  adapter: adapter2,
907
938
  sandboxProvider: sandboxProvider2,
908
939
  db: db2,
909
- runId
940
+ runId,
941
+ extraScorers: extraScorers2,
942
+ matrixId: matrixId2
910
943
  });
911
944
  if (onRunUpdate2) {
912
945
  onRunUpdate2({
@@ -953,7 +986,9 @@ async function runBenchmark(input) {
953
986
  ["adapters", actualAdapters],
954
987
  ["sandboxProvider", sandboxProvider],
955
988
  ["db", db],
956
- ["onRunUpdate", onRunUpdate]
989
+ ["onRunUpdate", onRunUpdate],
990
+ ["extraScorers", extraScorers],
991
+ ["matrixId", matrixId]
957
992
  ]);
958
993
  const run = await workflow.createRun();
959
994
  const res = await run.start({
@@ -961,7 +996,7 @@ async function runBenchmark(input) {
961
996
  initialState: runState,
962
997
  requestContext: executionContext
963
998
  });
964
- const rawRuns = res.results?.executeSingleRunStepResult || [];
999
+ const rawRuns = res.steps?.executeSingleRun?.output || [];
965
1000
  return {
966
1001
  runs: Array.isArray(rawRuns) ? rawRuns : []
967
1002
  };
@@ -974,7 +1009,7 @@ async function validateTestCase(input) {
974
1009
  checks.push(...checkStaticFields(testCase));
975
1010
  if (!testCase.test_command) {
976
1011
  checks.push({
977
- name: "execution-checks",
1012
+ name: "execution-checks (skipped - no test_command)",
978
1013
  passed: true,
979
1014
  detail: "No test_command configured; skipping pre/post-patch execution checks."
980
1015
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agentgrader/core",
3
- "version": "1.1.0",
3
+ "version": "1.1.3",
4
4
  "description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -22,7 +22,7 @@
22
22
  "dev": "bun run src/index.ts"
23
23
  },
24
24
  "dependencies": {
25
- "@agentgrader/store": "^1.0.2",
25
+ "@agentgrader/store": "^1.0.3",
26
26
  "@mastra/core": "^1.41.0",
27
27
  "yaml": "^2.5.1",
28
28
  "zod": "^3.23.8"