agent-tool-forge 0.4.6 → 0.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -40,10 +40,10 @@ See [docs/tui-workflow.md](docs/tui-workflow.md) for a start-to-finish walkthrou
40
40
 
41
41
  ```bash
42
42
  # Global install (available in all projects)
43
- cp -r tool-forge/skills/forge-tool ~/.claude/skills/
44
- cp -r tool-forge/skills/forge-eval ~/.claude/skills/
45
- cp -r tool-forge/skills/forge-mcp ~/.claude/skills/
46
- cp -r tool-forge/skills/forge-verifier ~/.claude/skills/
43
+ cp -r node_modules/agent-tool-forge/skills/forge-tool ~/.claude/skills/
44
+ cp -r node_modules/agent-tool-forge/skills/forge-eval ~/.claude/skills/
45
+ cp -r node_modules/agent-tool-forge/skills/forge-mcp ~/.claude/skills/
46
+ cp -r node_modules/agent-tool-forge/skills/forge-verifier ~/.claude/skills/
47
47
  ```
48
48
 
49
49
  Then in any Claude Code session:
@@ -123,23 +123,23 @@ All subpaths ship with TypeScript declarations.
123
123
 
124
124
  ```js
125
125
  import { createSidecar } from 'agent-tool-forge' // main entry
126
- import { reactLoop } from 'tool-forge/react-engine'
127
- import { createAuth } from 'tool-forge/auth'
128
- import { makeConversationStore } from 'tool-forge/conversation-store'
129
- import { mergeDefaults } from 'tool-forge/config'
130
- import { makeHitlEngine } from 'tool-forge/hitl-engine'
131
- import { makePromptStore } from 'tool-forge/prompt-store'
132
- import { makePreferenceStore } from 'tool-forge/preference-store'
133
- import { makeRateLimiter } from 'tool-forge/rate-limiter'
134
- import { getDb } from 'tool-forge/db'
135
- import { initSSE } from 'tool-forge/sse'
126
+ import { reactLoop } from 'agent-tool-forge/react-engine'
127
+ import { createAuth } from 'agent-tool-forge/auth'
128
+ import { makeConversationStore } from 'agent-tool-forge/conversation-store'
129
+ import { mergeDefaults } from 'agent-tool-forge/config'
130
+ import { makeHitlEngine } from 'agent-tool-forge/hitl-engine'
131
+ import { makePromptStore } from 'agent-tool-forge/prompt-store'
132
+ import { makePreferenceStore } from 'agent-tool-forge/preference-store'
133
+ import { makeRateLimiter } from 'agent-tool-forge/rate-limiter'
134
+ import { getDb } from 'agent-tool-forge/db'
135
+ import { initSSE } from 'agent-tool-forge/sse'
136
136
  import {
137
137
  PostgresStore,
138
138
  PostgresEvalStore,
139
139
  PostgresChatAuditStore,
140
140
  PostgresVerifierStore
141
- } from 'tool-forge/postgres-store'
142
- import { buildSidecarContext, createSidecarRouter } from 'tool-forge/forge-service'
141
+ } from 'agent-tool-forge/postgres-store'
142
+ import { buildSidecarContext, createSidecarRouter } from 'agent-tool-forge/forge-service'
143
143
  ```
144
144
 
145
145
  ---
@@ -0,0 +1,17 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "_comment": "Manual endpoint manifest. Add endpoints here when OpenAPI discovery is unavailable. Forge uses this to propose tools.",
4
+ "baseUrl": "${API_BASE_URL}",
5
+ "endpoints": [
6
+ {
7
+ "path": "/api/v1/example",
8
+ "method": "GET",
9
+ "name": "get_example",
10
+ "description": "Retrieves example data from the API. Use when the user asks for examples.",
11
+ "params": {
12
+ "id": { "type": "string", "description": "Optional filter by ID" }
13
+ },
14
+ "requiresConfirmation": false
15
+ }
16
+ ]
17
+ }
@@ -0,0 +1,106 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "_comment": "Optional configuration that front-loads answers to common skill questions. Delete fields you don't need — all are optional. The skills work via dialogue alone without this file.",
4
+
5
+ "project": {
6
+ "name": "my-project",
7
+ "toolsDir": "src/tools",
8
+ "testsDir": "src/tools/__tests__",
9
+ "evalsDir": "evals/dataset",
10
+ "barrelsFile": "src/tools/tools.exports.ts"
11
+ },
12
+
13
+ "api": {
14
+ "baseUrl": "http://localhost:3000",
15
+ "_baseUrlComment": "Base URL for MCP tool routing. Tool mcpRouting.endpoint paths are appended to this.",
16
+ "discovery": {
17
+ "type": "openapi",
18
+ "url": "http://localhost:3333/api-json",
19
+ "_comment": "Or file: { \"type\": \"openapi\", \"file\": \"openapi.json\" }"
20
+ },
21
+ "manifestPath": "api-endpoints.json"
22
+ },
23
+
24
+ "language": "typescript",
25
+
26
+ "validation": {
27
+ "library": "zod",
28
+ "_alternatives": ["pydantic", "joi", "json-schema", "struct-tags"]
29
+ },
30
+
31
+ "testing": {
32
+ "framework": "jest",
33
+ "_alternatives": ["vitest", "pytest", "go-test", "mocha"],
34
+ "command": "npx jest --passWithNoTests"
35
+ },
36
+
37
+ "typeCheck": {
38
+ "command": "npx tsc --noEmit",
39
+ "_comment": "Set 'command' to null if your stack doesn't have a type checker"
40
+ },
41
+
42
+ "auth": {
43
+ "contextField": "context.auth",
44
+ "type": "jwt",
45
+ "_alternatives": ["api-key", "oauth", "service-account"]
46
+ },
47
+
48
+ "client": {
49
+ "contextField": "context.client",
50
+ "type": "http",
51
+ "_comment": "The API client your tools use. Could be HTTP, gRPC, SDK wrapper, etc."
52
+ },
53
+
54
+ "hitl": {
55
+ "enabled": false,
56
+ "framework": null,
57
+ "_comment": "Set 'enabled' to true and specify 'framework' (e.g., 'langgraph') if you use human-in-the-loop confirmation for write tools"
58
+ },
59
+
60
+ "mcp": {
61
+ "defaultTransport": "stdio",
62
+ "_alternatives": ["streamable-http"],
63
+ "serverPrefix": "my-project",
64
+ "_comment": "Used by /forge-mcp to name the generated MCP server"
65
+ },
66
+
67
+ "evals": {
68
+ "goldenDir": "evals/dataset/golden",
69
+ "labeledDir": "evals/dataset/labeled",
70
+ "overlapMapFile": "evals/tool-overlap-map.json",
71
+ "seedManifestFile": "evals/seed-manifest.json",
72
+ "_comment": "Paths are relative to project root",
73
+ "defaultMix": {
74
+ "golden": { "total": 10 },
75
+ "labeled": { "straightforward": 3, "ambiguous": 3, "edge": 2, "adversarial": 2 }
76
+ },
77
+ "multiPass": { "passes": 3 },
78
+ "randomSample": { "aggression": "standard" }
79
+ },
80
+ "drift": {
81
+ "threshold": 0.1,
82
+ "windowSize": 5
83
+ },
84
+ "modelMatrix": [],
85
+ "_modelMatrixComment": "Add model names to compare during eval runs, e.g. ['gpt-4o-mini', 'gemini-2.0-flash', 'claude-haiku-4-5-20251001']",
86
+ "costs": {
87
+ "claude-haiku-4-5-20251001": { "input": 0.80, "output": 4.00 },
88
+ "claude-sonnet-4-6": { "input": 3.00, "output": 15.00 },
89
+ "claude-opus-4-6": { "input": 15.00, "output": 75.00 },
90
+ "gpt-4o": { "input": 2.50, "output": 10.00 },
91
+ "gpt-4o-mini": { "input": 0.15, "output": 0.60 },
92
+ "o1": { "input": 15.00, "output": 60.00 },
93
+ "o3-mini": { "input": 1.10, "output": 4.40 },
94
+ "gemini-2.0-flash": { "input": 0.10, "output": 0.40 },
95
+ "gemini-2.5-pro-exp": { "input": 1.25, "output": 10.00 },
96
+ "deepseek-chat": { "input": 0.27, "output": 1.10 }
97
+ },
98
+
99
+ "verification": {
100
+ "enabled": true,
101
+ "verifiersDir": "src/verification",
102
+ "barrelsFile": "src/verification/verifiers.exports.ts",
103
+ "orderPrefix": "A-",
104
+ "_comment": "Order categories: A=attribution, C=compliance, I=interface, R=risk, U=uncertainty"
105
+ }
106
+ }
@@ -14,7 +14,7 @@ export const CONFIG_DEFAULTS = {
14
14
  adminKey: null,
15
15
  database: { type: 'sqlite', url: null },
16
16
  conversation: { store: 'sqlite', window: 25, redis: {} },
17
- sidecar: { enabled: false, port: 8001 },
17
+ sidecar: { port: 8001 }, // port: used in direct-run mode only (node lib/forge-service.js)
18
18
  agents: [],
19
19
  rateLimit: {
20
20
  enabled: false,
@@ -100,9 +100,9 @@ export function validateConfig(raw = {}) {
100
100
  errors.push('auth.signingKey is required when auth.mode is "verify"');
101
101
  }
102
102
 
103
- // Startup validation: sidecar enabled + verify mode + no signingKey
104
- if (raw.sidecar?.enabled && raw.auth?.mode === 'verify' && !raw.auth?.signingKey) {
105
- errors.push('auth.signingKey is required when auth.mode is "verify" and sidecar is enabled. Set FORGE_JWT_KEY in .env');
103
+ // verify mode always requires a signingKey
104
+ if (raw.auth?.mode === 'verify' && !raw.auth?.signingKey) {
105
+ errors.push('auth.signingKey is required when auth.mode is "verify". Set it in forge.config.json or via a ${ENV_VAR} reference.');
106
106
  }
107
107
 
108
108
  // defaultHitlLevel
package/lib/config.d.ts CHANGED
@@ -23,9 +23,13 @@ export interface DatabaseConfig {
23
23
  }
24
24
 
25
25
  export interface AuthConfig {
26
- mode?: 'trust' | 'verify';
27
- signingKey?: string;
26
+ mode?: 'trust' | 'verify' | 'none';
27
+ signingKey?: string | null;
28
28
  claimsPath?: string;
29
+ /** Admin Bearer token. Replaces top-level `adminKey`. Supports `${VAR}` env references. */
30
+ adminToken?: string | null;
31
+ /** Metrics scrape token for /metrics. Supports `${VAR}` env references. */
32
+ metricsToken?: string | null;
29
33
  }
30
34
 
31
35
  export interface AgentConfig {
@@ -43,6 +47,26 @@ export interface AgentConfig {
43
47
  enabled?: number;
44
48
  }
45
49
 
50
+ export interface AgentRouterConfig {
51
+ endpoint?: string | null;
52
+ method?: string;
53
+ headers?: Record<string, string>;
54
+ inputField?: string;
55
+ outputField?: string;
56
+ sessionField?: string;
57
+ }
58
+
59
+ export interface GatesConfig {
60
+ passRate?: number | null;
61
+ maxCost?: number | null;
62
+ p95LatencyMs?: number | null;
63
+ }
64
+
65
+ export interface FixturesConfig {
66
+ dir?: string;
67
+ ttlDays?: number;
68
+ }
69
+
46
70
  export interface SidecarConfig {
47
71
  auth?: AuthConfig;
48
72
  defaultModel?: string;
@@ -50,14 +74,19 @@ export interface SidecarConfig {
50
74
  allowUserModelSelect?: boolean;
51
75
  allowUserHitlConfig?: boolean;
52
76
  systemPrompt?: string;
53
- adminKey?: string;
77
+ /** @deprecated Use `auth.adminToken` instead. */
78
+ adminKey?: string | null;
54
79
  conversation?: ConversationConfig;
55
80
  rateLimit?: RateLimitConfig;
56
81
  verification?: VerificationConfig;
57
82
  database?: DatabaseConfig;
58
- sidecar?: { enabled?: boolean; port?: number };
83
+ /** `port` is used in direct-run mode only (`node lib/forge-service.js`). `createSidecar()` uses `SidecarOptions.port`. */
84
+ sidecar?: { port?: number };
59
85
  agents?: AgentConfig[];
60
86
  costs?: Record<string, { input: number; output: number }>;
87
+ agent?: AgentRouterConfig;
88
+ gates?: GatesConfig;
89
+ fixtures?: FixturesConfig;
61
90
  }
62
91
 
63
92
  export const CONFIG_DEFAULTS: SidecarConfig;
@@ -67,7 +67,16 @@ const PROJECT_ROOT = resolve(__dirname, '..');
67
67
  * @returns {Promise<{ auth, promptStore, preferenceStore, conversationStore, hitlEngine, verifierRunner, agentRegistry, db, config, env, rateLimiter, configPath, evalStore, chatAuditStore, verifierStore, pgStore, _redisClient, _pgPool }>}
68
68
  */
69
69
  export async function buildSidecarContext(config, db, env = {}, opts = {}) {
70
- const auth = createAuth(config.auth);
70
+ // Resolve ${VAR} references in auth token fields at startup, not per-request.
71
+ // No fallback for signingKey: if the env var is absent, resolve to null so createAuth
72
+ // fails closed in verify mode rather than using the literal "${VAR}" string as the key.
73
+ const resolvedAuth = config.auth ? {
74
+ ...config.auth,
75
+ signingKey: resolveSecret(config.auth.signingKey, env),
76
+ adminToken: resolveSecret(config.auth.adminToken, env),
77
+ metricsToken: resolveSecret(config.auth.metricsToken, env),
78
+ } : config.auth;
79
+ const auth = createAuth(resolvedAuth);
71
80
 
72
81
  let redisClient = null;
73
82
  let pgPool = null;
@@ -103,6 +112,7 @@ export async function buildSidecarContext(config, db, env = {}, opts = {}) {
103
112
  idleTimeoutMillis: 30000,
104
113
  max: 10
105
114
  });
115
+ pgPool.on('error', err => process.stderr.write(`[forge] pg pool error: ${err.message}\n`));
106
116
  await pgPool.query(SCHEMA); // ensure all tables exist
107
117
  }
108
118
 
@@ -142,9 +152,14 @@ export async function buildSidecarContext(config, db, env = {}, opts = {}) {
142
152
  // project directory, not into the installed package.
143
153
  const configPath = opts?.configPath ?? resolve(process.cwd(), 'forge.config.json');
144
154
 
155
+ // Return resolved auth config so applyRouteAuth sees literal tokens (not ${VAR})
156
+ const resolvedConfig = resolvedAuth !== config.auth
157
+ ? { ...config, auth: resolvedAuth }
158
+ : config;
159
+
145
160
  return {
146
161
  auth, promptStore, preferenceStore, conversationStore, hitlEngine, verifierRunner,
147
- agentRegistry, db, config, env, rateLimiter, configPath,
162
+ agentRegistry, db, config: resolvedConfig, env, rateLimiter, configPath,
148
163
  evalStore, chatAuditStore, verifierStore, pgStore,
149
164
  _redisClient: redisClient, _pgPool: pgPool
150
165
  };
@@ -304,7 +319,8 @@ export function createSidecarRouter(ctx, options = {}) {
304
319
  if (sidecarPath === '/agent-api/user/preferences') {
305
320
  if (req.method === 'GET') return handleGetPreferences(req, res, ctx);
306
321
  if (req.method === 'PUT') return handlePutPreferences(req, res, ctx);
307
- else { sendJson(res, 405, { error: 'Method not allowed' }); return; }
322
+ sendJson(res, 405, { error: 'Method not allowed' });
323
+ return;
308
324
  }
309
325
  if (sidecarPath.startsWith('/agent-api/conversations')) {
310
326
  return handleConversations(req, res, ctx);
@@ -374,8 +390,14 @@ export function createSidecarRouter(ctx, options = {}) {
374
390
 
375
391
  // ── Custom routes (consumer-provided) ─────────────────────────────────
376
392
  if (customRoutes) {
377
- const handled = await customRoutes(req, res, ctx);
378
- if (handled) return;
393
+ try {
394
+ const handled = await customRoutes(req, res, ctx);
395
+ if (handled) return;
396
+ } catch (err) {
397
+ process.stderr.write(`[forge] customRoutes error: ${err.message}\n`);
398
+ if (!res.headersSent) sendJson(res, 500, { error: 'Internal server error' });
399
+ return;
400
+ }
379
401
  }
380
402
 
381
403
  // ── 404 fallback ───────────────────────────────────────────────────────
@@ -630,7 +652,8 @@ function createDirectServer() {
630
652
  if (sidecarPath === '/agent-api/user/preferences') {
631
653
  if (req.method === 'GET') return handleGetPreferences(req, res, sidecarCtx);
632
654
  if (req.method === 'PUT') return handlePutPreferences(req, res, sidecarCtx);
633
- else { json(res, 405, { error: 'Method not allowed' }); return; }
655
+ json(res, 405, { error: 'Method not allowed' });
656
+ return;
634
657
  }
635
658
  if (sidecarPath.startsWith('/agent-api/conversations')) {
636
659
  return handleConversations(req, res, sidecarCtx);
@@ -37,9 +37,15 @@ export class HitlEngine {
37
37
 
38
38
  /**
39
39
  * Retrieve and consume the paused state for a resume token.
40
- * Throws if the token has expired or does not exist.
40
+ * Returns null if the token has expired or does not exist (does not throw).
41
41
  */
42
- resume(resumeToken: string): Promise<unknown>;
42
+ resume(resumeToken: string): Promise<object | null>;
43
+
44
+ /**
45
+ * Tear down any backend connections (Redis subscriber, Postgres pool, etc.).
46
+ * Call on graceful shutdown. Synchronous.
47
+ */
48
+ destroy(): void;
43
49
  }
44
50
 
45
51
  /**
package/lib/index.js CHANGED
@@ -8,8 +8,7 @@
8
8
  */
9
9
 
10
10
  import { readFileSync, existsSync, writeFileSync } from 'fs';
11
- import { resolve, dirname } from 'path';
12
- import { fileURLToPath } from 'url';
11
+ import { resolve } from 'path';
13
12
  import { runTui } from './tui.js';
14
13
  import { addEndpointManually } from './manual-entry.js';
15
14
  import * as readline from 'readline';
@@ -18,7 +17,7 @@ const CONFIG_FILE = 'forge.config.json';
18
17
  const PENDING_SPEC_FILE = 'forge-pending-tool.json';
19
18
 
20
19
  function findProjectRoot() {
21
- return resolve(dirname(fileURLToPath(import.meta.url)), '..');
20
+ return process.cwd();
22
21
  }
23
22
 
24
23
  function loadConfig() {
package/lib/init.js CHANGED
@@ -499,7 +499,7 @@ export async function runInit(opts = {}) {
499
499
  const adminKeyValue = hasSidecar ? generateAdminKey() : null;
500
500
 
501
501
  if (hasSidecar) {
502
- raw.sidecar = { enabled: true, port: 8001 };
502
+ raw.sidecar = { port: 8001 };
503
503
  raw.adminKey = '${FORGE_ADMIN_KEY}';
504
504
  raw.auth = { mode: authMode };
505
505
  if (authMode === 'verify') {
package/lib/sidecar.d.ts CHANGED
@@ -44,11 +44,20 @@ export interface SidecarInstance {
44
44
 
45
45
  export function createSidecar(config?: Partial<SidecarConfig>, options?: SidecarOptions): Promise<SidecarInstance>;
46
46
 
47
+ export interface SidecarRouterOptions {
48
+ /** Absolute path to serve static files from for /widget/* routes. Defaults to package widget/. */
49
+ widgetDir?: string;
50
+ /** Optional async handler for /mcp routes. */
51
+ mcpHandler?: (req: object, res: object) => Promise<void> | void;
52
+ /** Called before the 404 fallback. Return true if the request was handled. */
53
+ customRoutes?: (req: object, res: object, ctx: SidecarContext) => Promise<boolean> | boolean;
54
+ }
55
+
47
56
  // Advanced consumers
48
- export function buildSidecarContext(config: SidecarConfig, db: object, env?: Record<string, string>, opts?: object): Promise<SidecarContext>;
49
- export function createSidecarRouter(ctx: SidecarContext, opts?: object): (req: object, res: object) => void;
57
+ export function buildSidecarContext(config: SidecarConfig, db: object, env?: Record<string, string>, opts?: { configPath?: string }): Promise<SidecarContext>;
58
+ export function createSidecarRouter(ctx: SidecarContext, opts?: SidecarRouterOptions): (req: object, res: object) => Promise<void>;
50
59
 
51
- export { createAuth } from './auth.js';
60
+ export { createAuth, resolveSecret, authenticateAdmin } from './auth.js';
52
61
  export type { AuthResult, AuthConfig, Authenticator } from './auth.js';
53
62
 
54
63
  export { reactLoop } from './react-engine.js';
@@ -82,8 +91,10 @@ export class AgentRegistry {
82
91
  }
83
92
 
84
93
  export class VerifierRunner {
85
- constructor(db: object, config?: object, workerPool?: object);
94
+ constructor(db: object, config?: object, pgPool?: object | null, workerPool?: object | null);
86
95
  loadFromDb(db: object): Promise<void>;
87
- run(toolName: string, args: object, result: unknown): Promise<Array<{ outcome: 'pass' | 'warn' | 'block'; message: string | null; verifier: string }>>;
96
+ registerVerifiers(toolName: string, verifiers: object[]): void;
97
+ verify(toolName: string, args: object, result: unknown): Promise<{ outcome: 'pass' | 'warn' | 'block'; message: string | null; verifierName: string | null }>;
98
+ logResult(sessionId: string, toolName: string, result: object): void;
88
99
  destroy(): void;
89
100
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-tool-forge",
3
- "version": "0.4.6",
3
+ "version": "0.4.9",
4
4
  "description": "Production LLM agent sidecar + Claude Code skill library for building, testing, and running tool-calling agents.",
5
5
  "keywords": [
6
6
  "llm",
@@ -29,6 +29,8 @@
29
29
  "files": [
30
30
  "lib",
31
31
  "widget",
32
+ "config",
33
+ "skills",
32
34
  "!lib/**/*.test.js",
33
35
  "!lib/__fixtures__",
34
36
  "!widget/**/*.test.js"
@@ -0,0 +1,69 @@
1
+ # /forge-eval — Generate Eval Suites
2
+
3
+ Generate golden and labeled eval JSON files for a named tool. Run this skill after a tool is implemented and tests are green.
4
+
5
+ ---
6
+
7
+ ## Step 1 — Identify the Tool
8
+
9
+ Ask the user which tool to generate evals for, or read it from context if `/forge-tool` just completed.
10
+
11
+ Read the tool's ToolDefinition from `tools/<name>.tool.js`:
12
+ - `name`, `description`, `schema`, `triggerPhrases`, `category`, `consequenceLevel`
13
+
14
+ ---
15
+
16
+ ## Step 2 — Generate Golden Eval Suite
17
+
18
+ Generate **5–10 golden cases** covering:
19
+ - Happy path with typical inputs
20
+ - Edge cases: empty results, boundary values, missing optional params
21
+ - Error paths: invalid input, service unavailable
22
+
23
+ Each golden case follows this schema:
24
+ ```json
25
+ {
26
+ "id": "case-001",
27
+ "description": "What this case tests",
28
+ "input": { "message": "User's natural-language request" },
29
+ "expectedTool": "<tool_name>",
30
+ "expectedArgs": { "param": "value" },
31
+ "checks": [
32
+ { "type": "tool_called", "tool": "<tool_name>" },
33
+ { "type": "arg_equals", "arg": "param", "value": "value" }
34
+ ]
35
+ }
36
+ ```
37
+
38
+ Write to `evals/<name>.golden.json` as a JSON array.
39
+
40
+ ---
41
+
42
+ ## Step 3 — Generate Labeled Eval Suite
43
+
44
+ Generate **2–3 labeled (multi-tool) scenarios** where the agent must choose between 2+ tools or sequence multiple calls:
45
+ - Scenario where the tool is the correct choice over a similar tool
46
+ - Scenario where the tool is called followed by a second tool
47
+ - Scenario where the tool should NOT be called (wrong intent)
48
+
49
+ Each labeled case:
50
+ ```json
51
+ {
52
+ "id": "labeled-001",
53
+ "description": "What this scenario tests",
54
+ "input": { "message": "User's multi-intent request" },
55
+ "label": "correct" | "incorrect" | "partial",
56
+ "expectedTools": ["<tool_name>"],
57
+ "checks": [...]
58
+ }
59
+ ```
60
+
61
+ Write to `evals/<name>.labeled.json` as a JSON array.
62
+
63
+ ---
64
+
65
+ ## Step 4 — Validate
66
+
67
+ If the command is available, run `node lib/index.js run --eval evals/<name>.golden.json --dry-run` to validate the JSON schema of the generated file.
68
+
69
+ Print a summary: N golden cases, M labeled scenarios, file paths written.