@agentgrader/core 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -131,6 +131,7 @@ declare const AgentConfigSchema: z.ZodObject<{
131
131
  id: z.ZodOptional<z.ZodString>;
132
132
  name: z.ZodString;
133
133
  model: z.ZodString;
134
+ provider: z.ZodOptional<z.ZodString>;
134
135
  max_steps: z.ZodDefault<z.ZodNumber>;
135
136
  temperature: z.ZodOptional<z.ZodNumber>;
136
137
  system_prompt: z.ZodOptional<z.ZodString>;
@@ -167,6 +168,7 @@ declare const AgentConfigSchema: z.ZodObject<{
167
168
  max_steps: number;
168
169
  id?: string | undefined;
169
170
  toolkits?: string[] | undefined;
171
+ provider?: string | undefined;
170
172
  temperature?: number | undefined;
171
173
  system_prompt?: string | undefined;
172
174
  tools?: string[] | undefined;
@@ -184,6 +186,7 @@ declare const AgentConfigSchema: z.ZodObject<{
184
186
  model: string;
185
187
  id?: string | undefined;
186
188
  toolkits?: string[] | undefined;
189
+ provider?: string | undefined;
187
190
  max_steps?: number | undefined;
188
191
  temperature?: number | undefined;
189
192
  system_prompt?: string | undefined;
@@ -300,6 +303,7 @@ declare const RunSchema: z.ZodObject<{
300
303
  error: z.ZodOptional<z.ZodString>;
301
304
  finalDiff: z.ZodOptional<z.ZodString>;
302
305
  metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
306
+ matrixId: z.ZodOptional<z.ZodString>;
303
307
  createdAt: z.ZodNumber;
304
308
  completedAt: z.ZodOptional<z.ZodNumber>;
305
309
  }, "strip", z.ZodTypeAny, {
@@ -319,6 +323,7 @@ declare const RunSchema: z.ZodObject<{
319
323
  error?: string | undefined;
320
324
  finalDiff?: string | undefined;
321
325
  metrics?: Record<string, any> | undefined;
326
+ matrixId?: string | undefined;
322
327
  completedAt?: number | undefined;
323
328
  }, {
324
329
  status: "running" | "completed" | "failed";
@@ -337,6 +342,7 @@ declare const RunSchema: z.ZodObject<{
337
342
  error?: string | undefined;
338
343
  finalDiff?: string | undefined;
339
344
  metrics?: Record<string, any> | undefined;
345
+ matrixId?: string | undefined;
340
346
  completedAt?: number | undefined;
341
347
  }>;
342
348
  type Run = z.infer<typeof RunSchema>;
@@ -493,6 +499,26 @@ interface ScorerResult {
493
499
  passed: boolean;
494
500
  detail: string;
495
501
  score?: number;
502
+ /**
503
+ * non-functional code-quality signals. populated by additive scorers
504
+ * (e.g. staticqualityscorer, llmjudgescorer) that annotate a run without
505
+ * affecting `passed`/`score`. all fields optional - a scorer only fills
506
+ * in what it actually measured.
507
+ */
508
+ quality?: {
509
+ /** total +/- lines in the agent's diff */
510
+ diffLines?: number;
511
+ /** number of files touched by the agent's diff */
512
+ filesModified?: number;
513
+ /** TODO/FIXME/HACK/XXX markers introduced by the diff */
514
+ todosIntroduced?: number;
515
+ /** linter (e.g. Biome) error+warning count on the changed files */
516
+ linterViolations?: number;
517
+ /** 0-1 holistic quality score from an LLM judge */
518
+ llmJudgeScore?: number;
519
+ /** prose rationale from an LLM judge */
520
+ llmJudgeDetail?: string;
521
+ };
496
522
  }
497
523
  interface Scorer {
498
524
  readonly name: string;
@@ -618,6 +644,15 @@ interface RunSingleInput {
618
644
  sandboxProvider: SandboxProvider;
619
645
  db?: AgrDb;
620
646
  runId: string;
647
+ /**
648
+ * additive, non-blocking scorers (e.g. staticqualityscorer,
649
+ * llmjudgescorer) run after the core pass/fail scoring. their results
650
+ * never affect `passed`/`score` - each scorer's `ScorerResult` is merged
651
+ * into `metrics` under its own `name`.
652
+ */
653
+ extraScorers?: Scorer[];
654
+ /** links this run to an optimizer matrix run, if any */
655
+ matrixId?: string;
621
656
  }
622
657
  interface RunSingleResult {
623
658
  runId: string;
@@ -642,6 +677,10 @@ interface BenchmarkInput {
642
677
  sandboxProvider: SandboxProvider;
643
678
  db?: AgrDb;
644
679
  concurrency?: number;
680
+ /** additive, non-blocking quality scorers run for every test case x config combination */
681
+ extraScorers?: Scorer[];
682
+ /** links every run in this benchmark to an optimizer matrix run, if any */
683
+ matrixId?: string;
645
684
  onRunUpdate?: (run: RunSingleResult & {
646
685
  testCaseId: string;
647
686
  agentConfigId: string;
package/dist/index.js CHANGED
@@ -79,6 +79,7 @@ var AgentConfigSchema = z.object({
79
79
  id: z.string().optional(),
80
80
  name: z.string(),
81
81
  model: z.string(),
82
+ provider: z.string().optional(),
82
83
  max_steps: z.number().default(30),
83
84
  temperature: z.number().optional(),
84
85
  system_prompt: z.string().optional(),
@@ -105,8 +106,12 @@ var RunSchema = z.object({
105
106
  error: z.string().optional(),
106
107
  finalDiff: z.string().optional(),
107
108
  // extended scoring metrics: regression (FAIL_TO_PASS/PASS_TO_PASS), diff stats,
108
- // localization precision/recall, etc. Stored as JSON.
109
+ // localization precision/recall, additive quality scorers (keyed by scorer
110
+ // name, ScorerResult-shaped), etc. Stored as JSON.
109
111
  metrics: z.record(z.any()).optional(),
112
+ // links this run back to the optimizer matrix run that generated its
113
+ // agentConfig, if any (see @agentgrader/optimizer).
114
+ matrixId: z.string().optional(),
110
115
  createdAt: z.number(),
111
116
  completedAt: z.number().optional()
112
117
  });
@@ -575,7 +580,7 @@ function buildSkillsPromptAddendum(skills) {
575
580
 
576
581
  // src/runner/run-single.ts
577
582
  async function runSingle(input) {
578
- const { testCase, agentConfig, adapter, sandboxProvider, db, runId } = input;
583
+ const { testCase, agentConfig, adapter, sandboxProvider, db, runId, extraScorers, matrixId } = input;
579
584
  const startTime = Date.now();
580
585
  let sandbox = null;
581
586
  let passed = false;
@@ -598,6 +603,7 @@ async function runSingle(input) {
598
603
  agentConfigId: agentConfig.id || agentConfig.name,
599
604
  sandboxProvider: sandboxProvider.name,
600
605
  status: "running",
606
+ matrixId,
601
607
  createdAt: Math.floor(startTime / 1e3)
602
608
  });
603
609
  }
@@ -701,6 +707,21 @@ ${addendum}` : addendum
701
707
  }),
702
708
  execute: async () => {
703
709
  if (!sandbox) throw new Error("Sandbox not initialized");
710
+ if (extraScorers && extraScorers.length > 0 && agentResult) {
711
+ const trace2 = { runId, steps: emittedSteps };
712
+ for (const scorer of extraScorers) {
713
+ try {
714
+ metrics[scorer.name] = await scorer.score({
715
+ testCase,
716
+ result: agentResult,
717
+ trace: trace2,
718
+ sandbox
719
+ });
720
+ } catch (e) {
721
+ metrics[scorer.name] = { passed: true, detail: `Scorer error: ${e.message}` };
722
+ }
723
+ }
724
+ }
704
725
  const cmdScorer = new CommandScorer();
705
726
  const cmdResult = await cmdScorer.score({
706
727
  testCase,
@@ -790,10 +811,17 @@ ${addendum}` : addendum
790
811
  inputData: {},
791
812
  initialState: runState
792
813
  });
793
- const scoreResults = res.results?.score;
794
- passed = scoreResults?.passed ?? false;
795
- score = scoreResults?.score ?? 0;
796
- errorMsg = scoreResults?.passed ? void 0 : scoreResults?.detail;
814
+ const scoreStep2 = res.steps?.score;
815
+ if (res.status === "success" && scoreStep2?.status === "success") {
816
+ const scoreOutput = scoreStep2.output;
817
+ passed = scoreOutput?.passed ?? false;
818
+ score = scoreOutput?.score ?? 0;
819
+ errorMsg = scoreOutput?.passed ? void 0 : scoreOutput?.detail;
820
+ } else {
821
+ passed = false;
822
+ score = 0;
823
+ errorMsg = res.error?.message ?? "Workflow did not complete successfully";
824
+ }
797
825
  } catch (err) {
798
826
  errorMsg = err.message || "Unknown execution error";
799
827
  passed = false;
@@ -836,7 +864,7 @@ ${addendum}` : addendum
836
864
  };
837
865
  }
838
866
  async function runBenchmark(input) {
839
- const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
867
+ const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, extraScorers, matrixId, onRunUpdate } = input;
840
868
  const actualAdapters = adapters || (adapter ? [adapter] : []);
841
869
  if (actualAdapters.length === 0) {
842
870
  throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
@@ -883,6 +911,8 @@ async function runBenchmark(input) {
883
911
  const sandboxProvider2 = getVal("sandboxProvider");
884
912
  const db2 = getVal("db");
885
913
  const onRunUpdate2 = getVal("onRunUpdate");
914
+ const extraScorers2 = getVal("extraScorers");
915
+ const matrixId2 = getVal("matrixId");
886
916
  const runId = randomUUID();
887
917
  if (onRunUpdate2) {
888
918
  onRunUpdate2({
@@ -906,7 +936,9 @@ async function runBenchmark(input) {
906
936
  adapter: adapter2,
907
937
  sandboxProvider: sandboxProvider2,
908
938
  db: db2,
909
- runId
939
+ runId,
940
+ extraScorers: extraScorers2,
941
+ matrixId: matrixId2
910
942
  });
911
943
  if (onRunUpdate2) {
912
944
  onRunUpdate2({
@@ -953,7 +985,9 @@ async function runBenchmark(input) {
953
985
  ["adapters", actualAdapters],
954
986
  ["sandboxProvider", sandboxProvider],
955
987
  ["db", db],
956
- ["onRunUpdate", onRunUpdate]
988
+ ["onRunUpdate", onRunUpdate],
989
+ ["extraScorers", extraScorers],
990
+ ["matrixId", matrixId]
957
991
  ]);
958
992
  const run = await workflow.createRun();
959
993
  const res = await run.start({
@@ -961,7 +995,7 @@ async function runBenchmark(input) {
961
995
  initialState: runState,
962
996
  requestContext: executionContext
963
997
  });
964
- const rawRuns = res.results?.executeSingleRunStepResult || [];
998
+ const rawRuns = res.steps?.executeSingleRun?.output || [];
965
999
  return {
966
1000
  runs: Array.isArray(rawRuns) ? rawRuns : []
967
1001
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agentgrader/core",
3
- "version": "1.1.0",
3
+ "version": "1.1.2",
4
4
  "description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
5
5
  "license": "MIT",
6
6
  "type": "module",