@agentgrader/core 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +4 -0
- package/dist/index.js +7 -3
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -72,6 +72,7 @@ declare const TestCaseSchema: z.ZodObject<{
|
|
|
72
72
|
created_at: z.ZodOptional<z.ZodString>;
|
|
73
73
|
image: z.ZodOptional<z.ZodString>;
|
|
74
74
|
toolkits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
75
|
+
agent_config: z.ZodOptional<z.ZodString>;
|
|
75
76
|
}, "strip", z.ZodTypeAny, {
|
|
76
77
|
name: string;
|
|
77
78
|
fixture: string;
|
|
@@ -98,6 +99,7 @@ declare const TestCaseSchema: z.ZodObject<{
|
|
|
98
99
|
created_at?: string | undefined;
|
|
99
100
|
image?: string | undefined;
|
|
100
101
|
toolkits?: string[] | undefined;
|
|
102
|
+
agent_config?: string | undefined;
|
|
101
103
|
}, {
|
|
102
104
|
name: string;
|
|
103
105
|
fixture: string;
|
|
@@ -124,6 +126,7 @@ declare const TestCaseSchema: z.ZodObject<{
|
|
|
124
126
|
created_at?: string | undefined;
|
|
125
127
|
image?: string | undefined;
|
|
126
128
|
toolkits?: string[] | undefined;
|
|
129
|
+
agent_config?: string | undefined;
|
|
127
130
|
}>;
|
|
128
131
|
type TestCase = z.infer<typeof TestCaseSchema>;
|
|
129
132
|
|
|
@@ -653,6 +656,7 @@ interface RunSingleInput {
|
|
|
653
656
|
extraScorers?: Scorer[];
|
|
654
657
|
/** links this run to an optimizer matrix run, if any */
|
|
655
658
|
matrixId?: string;
|
|
659
|
+
onStep?: (step: StepEvent) => void;
|
|
656
660
|
}
|
|
657
661
|
interface RunSingleResult {
|
|
658
662
|
runId: string;
|
package/dist/index.js
CHANGED
|
@@ -50,7 +50,8 @@ var TestCaseSchema = z.object({
|
|
|
50
50
|
// paths to toolkit directories (custom CLI tools + .claude/skills/) to
|
|
51
51
|
// inject into the sandbox and surface to the agent via the system prompt,
|
|
52
52
|
// in addition to any toolkits configured on the agent
|
|
53
|
-
toolkits: z.array(z.string()).optional()
|
|
53
|
+
toolkits: z.array(z.string()).optional(),
|
|
54
|
+
agent_config: z.string().optional()
|
|
54
55
|
});
|
|
55
56
|
var SkillFrontmatterSchema = z.object({
|
|
56
57
|
/** lowercase letters, numbers, hyphens; max 64 chars */
|
|
@@ -83,7 +84,9 @@ var AgentConfigSchema = z.object({
|
|
|
83
84
|
max_steps: z.number().default(30),
|
|
84
85
|
temperature: z.number().optional(),
|
|
85
86
|
system_prompt: z.string().optional(),
|
|
86
|
-
tools: z.array(z.string()).optional()
|
|
87
|
+
tools: z.array(z.string()).optional().describe(
|
|
88
|
+
"Optional allowlist of tool names (local: executeCommand, readFile, writeFile, submit; MCP: <mcpServerName>_<toolName>). submit is always included implicitly."
|
|
89
|
+
),
|
|
87
90
|
// paths to toolkit directories (custom CLI tools + .claude/skills/) to
|
|
88
91
|
// inject into the sandbox and surface to the agent via the system prompt
|
|
89
92
|
toolkits: z.array(z.string()).optional(),
|
|
@@ -641,6 +644,7 @@ async function runSingle(input) {
|
|
|
641
644
|
tokensIn += stepEvent.tokensIn || 0;
|
|
642
645
|
tokensOut += stepEvent.tokensOut || 0;
|
|
643
646
|
costUsd += stepEvent.costUsd || 0;
|
|
647
|
+
input.onStep?.(stepEvent);
|
|
644
648
|
if (db) {
|
|
645
649
|
addTrace(db, {
|
|
646
650
|
runId,
|
|
@@ -1008,7 +1012,7 @@ async function validateTestCase(input) {
|
|
|
1008
1012
|
checks.push(...checkStaticFields(testCase));
|
|
1009
1013
|
if (!testCase.test_command) {
|
|
1010
1014
|
checks.push({
|
|
1011
|
-
name: "execution-checks",
|
|
1015
|
+
name: "execution-checks (skipped - no test_command)",
|
|
1012
1016
|
passed: true,
|
|
1013
1017
|
detail: "No test_command configured; skipping pre/post-patch execution checks."
|
|
1014
1018
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agentgrader/core",
|
|
3
|
-
"version": "1.1.2",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "Core schemas, contracts, and runner for the Agentgrader benchmarking framework",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
"dev": "bun run src/index.ts"
|
|
23
23
|
},
|
|
24
24
|
"dependencies": {
|
|
25
|
-
"@agentgrader/store": "^1.0.
|
|
25
|
+
"@agentgrader/store": "^1.0.3",
|
|
26
26
|
"@mastra/core": "^1.41.0",
|
|
27
27
|
"yaml": "^2.5.1",
|
|
28
28
|
"zod": "^3.23.8"
|