npm - @agentgrader/core - Versions diffs - 1.1.0 → 1.1.3 - Mend

@agentgrader/core 1.1.0 → 1.1.3

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED Viewed

@@ -131,6 +131,7 @@ declare const AgentConfigSchema: z.ZodObject<{
     id: z.ZodOptional<z.ZodString>;
     name: z.ZodString;
     model: z.ZodString;
+    provider: z.ZodOptional<z.ZodString>;
     max_steps: z.ZodDefault<z.ZodNumber>;
     temperature: z.ZodOptional<z.ZodNumber>;
     system_prompt: z.ZodOptional<z.ZodString>;
@@ -167,6 +168,7 @@ declare const AgentConfigSchema: z.ZodObject<{
     max_steps: number;
     id?: string | undefined;
     toolkits?: string[] | undefined;
+    provider?: string | undefined;
     temperature?: number | undefined;
     system_prompt?: string | undefined;
     tools?: string[] | undefined;
@@ -184,6 +186,7 @@ declare const AgentConfigSchema: z.ZodObject<{
     model: string;
     id?: string | undefined;
     toolkits?: string[] | undefined;
+    provider?: string | undefined;
     max_steps?: number | undefined;
     temperature?: number | undefined;
     system_prompt?: string | undefined;
@@ -300,6 +303,7 @@ declare const RunSchema: z.ZodObject<{
     error: z.ZodOptional<z.ZodString>;
     finalDiff: z.ZodOptional<z.ZodString>;
     metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
+    matrixId: z.ZodOptional<z.ZodString>;
     createdAt: z.ZodNumber;
     completedAt: z.ZodOptional<z.ZodNumber>;
 }, "strip", z.ZodTypeAny, {
@@ -319,6 +323,7 @@ declare const RunSchema: z.ZodObject<{
     error?: string | undefined;
     finalDiff?: string | undefined;
     metrics?: Record<string, any> | undefined;
+    matrixId?: string | undefined;
     completedAt?: number | undefined;
 }, {
     status: "running" | "completed" | "failed";
@@ -337,6 +342,7 @@ declare const RunSchema: z.ZodObject<{
     error?: string | undefined;
     finalDiff?: string | undefined;
     metrics?: Record<string, any> | undefined;
+    matrixId?: string | undefined;
     completedAt?: number | undefined;
 }>;
 type Run = z.infer<typeof RunSchema>;
@@ -493,6 +499,26 @@ interface ScorerResult {
     passed: boolean;
     detail: string;
     score?: number;
+    /**
+     * non-functional code-quality signals. populated by additive scorers
+     * (e.g. staticqualityscorer, llmjudgescorer) that annotate a run without
+     * affecting `passed`/`score`. all fields optional - a scorer only fills
+     * in what it actually measured.
+     */
+    quality?: {
+        /** total +/- lines in the agent's diff */
+        diffLines?: number;
+        /** number of files touched by the agent's diff */
+        filesModified?: number;
+        /** TODO/FIXME/HACK/XXX markers introduced by the diff */
+        todosIntroduced?: number;
+        /** linter (e.g. Biome) error+warning count on the changed files */
+        linterViolations?: number;
+        /** 0-1 holistic quality score from an LLM judge */
+        llmJudgeScore?: number;
+        /** prose rationale from an LLM judge */
+        llmJudgeDetail?: string;
+    };
 }
 interface Scorer {
     readonly name: string;
@@ -618,6 +644,16 @@ interface RunSingleInput {
     sandboxProvider: SandboxProvider;
     db?: AgrDb;
     runId: string;
+    /**
+     * additive, non-blocking scorers (e.g. staticqualityscorer,
+     * llmjudgescorer) run after the core pass/fail scoring. their results
+     * never affect `passed`/`score` - each scorer's `scorerresult` is merged
+     * into `metrics` under its own `name`.
+     */
+    extraScorers?: Scorer[];
+    /** links this run to an optimizer matrix run, if any */
+    matrixId?: string;
+    onStep?: (step: StepEvent) => void;
 }
 interface RunSingleResult {
     runId: string;
@@ -642,6 +678,10 @@ interface BenchmarkInput {
     sandboxProvider: SandboxProvider;
     db?: AgrDb;
     concurrency?: number;
+    /** additive, non-blocking quality scorers run for every test case x config combination */
+    extraScorers?: Scorer[];
+    /** links every run in this benchmark to an optimizer matrix run, if any */
+    matrixId?: string;
     onRunUpdate?: (run: RunSingleResult & {
         testCaseId: string;
         agentConfigId: string;

package/dist/index.js CHANGED Viewed

@@ -79,6 +79,7 @@ var AgentConfigSchema = z.object({
   id: z.string().optional(),
   name: z.string(),
   model: z.string(),
+  provider: z.string().optional(),
   max_steps: z.number().default(30),
   temperature: z.number().optional(),
   system_prompt: z.string().optional(),
@@ -105,8 +106,12 @@ var RunSchema = z.object({
   error: z.string().optional(),
   finalDiff: z.string().optional(),
   // extended scoring metrics: regression (FAIL_TO_PASS/PASS_TO_PASS), diff stats,
-  // localization precision/recall, etc. Stored as JSON.
+  // localization precision/recall, additive quality scorers (keyed by scorer
+  // name, ScorerResult-shaped), etc. Stored as JSON.
   metrics: z.record(z.any()).optional(),
+  // links this run back to the optimizer matrix run that generated its
+  // agentConfig, if any (see @agentgrader/optimizer).
+  matrixId: z.string().optional(),
   createdAt: z.number(),
   completedAt: z.number().optional()
 });
@@ -575,7 +580,7 @@ function buildSkillsPromptAddendum(skills) {
 // src/runner/run-single.ts
 async function runSingle(input) {
-  const { testCase, agentConfig, adapter, sandboxProvider, db, runId } = input;
+  const { testCase, agentConfig, adapter, sandboxProvider, db, runId, extraScorers, matrixId } = input;
   const startTime = Date.now();
   let sandbox = null;
   let passed = false;
@@ -598,6 +603,7 @@ async function runSingle(input) {
       agentConfigId: agentConfig.id || agentConfig.name,
       sandboxProvider: sandboxProvider.name,
       status: "running",
+      matrixId,
       createdAt: Math.floor(startTime / 1e3)
     });
   }
@@ -635,6 +641,7 @@ async function runSingle(input) {
         tokensIn += stepEvent.tokensIn || 0;
         tokensOut += stepEvent.tokensOut || 0;
         costUsd += stepEvent.costUsd || 0;
+        input.onStep?.(stepEvent);
         if (db) {
           addTrace(db, {
             runId,
@@ -701,6 +708,21 @@ ${addendum}` : addendum
     }),
     execute: async () => {
       if (!sandbox) throw new Error("Sandbox not initialized");
+      if (extraScorers && extraScorers.length > 0 && agentResult) {
+        const trace2 = { runId, steps: emittedSteps };
+        for (const scorer of extraScorers) {
+          try {
+            metrics[scorer.name] = await scorer.score({
+              testCase,
+              result: agentResult,
+              trace: trace2,
+              sandbox
+            });
+          } catch (e) {
+            metrics[scorer.name] = { passed: true, detail: `Scorer error: ${e.message}` };
+          }
+        }
+      }
       const cmdScorer = new CommandScorer();
       const cmdResult = await cmdScorer.score({
         testCase,
@@ -790,10 +812,17 @@ ${addendum}` : addendum
       inputData: {},
       initialState: runState
     });
-    const scoreResults = res.results?.score;
-    passed = scoreResults?.passed ?? false;
-    score = scoreResults?.score ?? 0;
-    errorMsg = scoreResults?.passed ? void 0 : scoreResults?.detail;
+    const scoreStep2 = res.steps?.score;
+    if (res.status === "success" && scoreStep2?.status === "success") {
+      const scoreOutput = scoreStep2.output;
+      passed = scoreOutput?.passed ?? false;
+      score = scoreOutput?.score ?? 0;
+      errorMsg = scoreOutput?.passed ? void 0 : scoreOutput?.detail;
+    } else {
+      passed = false;
+      score = 0;
+      errorMsg = res.error?.message ?? "Workflow did not complete successfully";
+    }
   } catch (err) {
     errorMsg = err.message || "Unknown execution error";
     passed = false;
@@ -836,7 +865,7 @@ ${addendum}` : addendum
   };
 }
 async function runBenchmark(input) {
-  const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
+  const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, extraScorers, matrixId, onRunUpdate } = input;
   const actualAdapters = adapters || (adapter ? [adapter] : []);
   if (actualAdapters.length === 0) {
     throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
@@ -883,6 +912,8 @@ async function runBenchmark(input) {
       const sandboxProvider2 = getVal("sandboxProvider");
       const db2 = getVal("db");
       const onRunUpdate2 = getVal("onRunUpdate");
+      const extraScorers2 = getVal("extraScorers");
+      const matrixId2 = getVal("matrixId");
       const runId = randomUUID();
       if (onRunUpdate2) {
         onRunUpdate2({
@@ -906,7 +937,9 @@ async function runBenchmark(input) {
           adapter: adapter2,
           sandboxProvider: sandboxProvider2,
           db: db2,
-          runId
+          runId,
+          extraScorers: extraScorers2,
+          matrixId: matrixId2
         });
         if (onRunUpdate2) {
           onRunUpdate2({
@@ -953,7 +986,9 @@ async function runBenchmark(input) {
     ["adapters", actualAdapters],
     ["sandboxProvider", sandboxProvider],
     ["db", db],
-    ["onRunUpdate", onRunUpdate]
+    ["onRunUpdate", onRunUpdate],
+    ["extraScorers", extraScorers],
+    ["matrixId", matrixId]
   ]);
   const run = await workflow.createRun();
   const res = await run.start({
@@ -961,7 +996,7 @@ async function runBenchmark(input) {
     initialState: runState,
     requestContext: executionContext
   });
-  const rawRuns = res.results?.executeSingleRunStepResult || [];
+  const rawRuns = res.steps?.executeSingleRun?.output || [];
   return {
     runs: Array.isArray(rawRuns) ? rawRuns : []
   };
@@ -974,7 +1009,7 @@ async function validateTestCase(input) {
   checks.push(...checkStaticFields(testCase));
   if (!testCase.test_command) {
     checks.push({
-      name: "execution-checks",
+      name: "execution-checks (skipped - no test_command)",
       passed: true,
       detail: "No test_command configured; skipping pre/post-patch execution checks."
     });

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@agentgrader/core",
-  "version": "1.1.0",
+  "version": "1.1.3",
   "description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
   "license": "MIT",
   "type": "module",
@@ -22,7 +22,7 @@
     "dev": "bun run src/index.ts"
   },
   "dependencies": {
-    "@agentgrader/store": "^1.0.2",
+    "@agentgrader/store": "^1.0.3",
     "@mastra/core": "^1.41.0",
     "yaml": "^2.5.1",
     "zod": "^3.23.8"