@agentgrader/core 1.1.2 → 1.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -72,6 +72,7 @@ declare const TestCaseSchema: z.ZodObject<{
72
72
  created_at: z.ZodOptional<z.ZodString>;
73
73
  image: z.ZodOptional<z.ZodString>;
74
74
  toolkits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
75
+ agent_config: z.ZodOptional<z.ZodString>;
75
76
  }, "strip", z.ZodTypeAny, {
76
77
  name: string;
77
78
  fixture: string;
@@ -98,6 +99,7 @@ declare const TestCaseSchema: z.ZodObject<{
98
99
  created_at?: string | undefined;
99
100
  image?: string | undefined;
100
101
  toolkits?: string[] | undefined;
102
+ agent_config?: string | undefined;
101
103
  }, {
102
104
  name: string;
103
105
  fixture: string;
@@ -124,6 +126,7 @@ declare const TestCaseSchema: z.ZodObject<{
124
126
  created_at?: string | undefined;
125
127
  image?: string | undefined;
126
128
  toolkits?: string[] | undefined;
129
+ agent_config?: string | undefined;
127
130
  }>;
128
131
  type TestCase = z.infer<typeof TestCaseSchema>;
129
132
 
@@ -653,6 +656,7 @@ interface RunSingleInput {
653
656
  extraScorers?: Scorer[];
654
657
  /** links this run to an optimizer matrix run, if any */
655
658
  matrixId?: string;
659
+ onStep?: (step: StepEvent) => void;
656
660
  }
657
661
  interface RunSingleResult {
658
662
  runId: string;
package/dist/index.js CHANGED
@@ -50,7 +50,8 @@ var TestCaseSchema = z.object({
50
50
  // paths to toolkit directories (custom CLI tools + .claude/skills/) to
51
51
  // inject into the sandbox and surface to the agent via the system prompt,
52
52
  // in addition to any toolkits configured on the agent
53
- toolkits: z.array(z.string()).optional()
53
+ toolkits: z.array(z.string()).optional(),
54
+ agent_config: z.string().optional()
54
55
  });
55
56
  var SkillFrontmatterSchema = z.object({
56
57
  /** lowercase letters, numbers, hyphens; max 64 chars */
@@ -83,7 +84,9 @@ var AgentConfigSchema = z.object({
83
84
  max_steps: z.number().default(30),
84
85
  temperature: z.number().optional(),
85
86
  system_prompt: z.string().optional(),
86
- tools: z.array(z.string()).optional(),
87
+ tools: z.array(z.string()).optional().describe(
88
+ "Optional allowlist of tool names (local: executeCommand, readFile, writeFile, submit; MCP: <mcpServerName>_<toolName>). submit is always included implicitly."
89
+ ),
87
90
  // paths to toolkit directories (custom CLI tools + .claude/skills/) to
88
91
  // inject into the sandbox and surface to the agent via the system prompt
89
92
  toolkits: z.array(z.string()).optional(),
@@ -641,6 +644,7 @@ async function runSingle(input) {
641
644
  tokensIn += stepEvent.tokensIn || 0;
642
645
  tokensOut += stepEvent.tokensOut || 0;
643
646
  costUsd += stepEvent.costUsd || 0;
647
+ input.onStep?.(stepEvent);
644
648
  if (db) {
645
649
  addTrace(db, {
646
650
  runId,
@@ -1008,7 +1012,7 @@ async function validateTestCase(input) {
1008
1012
  checks.push(...checkStaticFields(testCase));
1009
1013
  if (!testCase.test_command) {
1010
1014
  checks.push({
1011
- name: "execution-checks",
1015
+ name: "execution-checks (skipped - no test_command)",
1012
1016
  passed: true,
1013
1017
  detail: "No test_command configured; skipping pre/post-patch execution checks."
1014
1018
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agentgrader/core",
3
- "version": "1.1.2",
3
+ "version": "1.2.0",
4
4
  "description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -22,7 +22,7 @@
22
22
  "dev": "bun run src/index.ts"
23
23
  },
24
24
  "dependencies": {
25
- "@agentgrader/store": "^1.0.2",
25
+ "@agentgrader/store": "^1.0.3",
26
26
  "@mastra/core": "^1.41.0",
27
27
  "yaml": "^2.5.1",
28
28
  "zod": "^3.23.8"