agent-tool-forge 0.4.7 → 0.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/auth.d.ts CHANGED
@@ -7,7 +7,7 @@ export interface AuthResult {
7
7
 
8
8
  export interface AuthConfig {
9
9
  mode?: 'trust' | 'verify' | 'none';
10
- signingKey?: string;
10
+ signingKey?: string | null;
11
11
  claimsPath?: string;
12
12
  adminToken?: string | null;
13
13
  metricsToken?: string | null;
@@ -97,12 +97,7 @@ export function validateConfig(raw = {}) {
97
97
 
98
98
  // auth.mode = 'verify' requires signingKey
99
99
  if (raw.auth?.mode === 'verify' && !raw.auth?.signingKey) {
100
- errors.push('auth.signingKey is required when auth.mode is "verify"');
101
- }
102
-
103
- // Startup validation: sidecar enabled + verify mode + no signingKey
104
- if (raw.sidecar?.enabled && raw.auth?.mode === 'verify' && !raw.auth?.signingKey) {
105
- errors.push('auth.signingKey is required when auth.mode is "verify" and sidecar is enabled. Set FORGE_JWT_KEY in .env');
100
+ errors.push('auth.signingKey is required when auth.mode is "verify". Set it in forge.config.json or via a ${ENV_VAR} reference.');
106
101
  }
107
102
 
108
103
  // defaultHitlLevel
@@ -67,10 +67,12 @@ const PROJECT_ROOT = resolve(__dirname, '..');
67
67
  * @returns {Promise<{ auth, promptStore, preferenceStore, conversationStore, hitlEngine, verifierRunner, agentRegistry, db, config, env, rateLimiter, configPath, evalStore, chatAuditStore, verifierStore, pgStore, _redisClient, _pgPool }>}
68
68
  */
69
69
  export async function buildSidecarContext(config, db, env = {}, opts = {}) {
70
- // Resolve ${VAR} references in auth token fields at startup, not per-request
70
+ // Resolve ${VAR} references in auth token fields at startup, not per-request.
71
+ // No fallback for signingKey: if the env var is absent, resolve to null so createAuth
72
+ // fails-closed in verify mode rather than using the literal "${VAR}" string as the key.
71
73
  const resolvedAuth = config.auth ? {
72
74
  ...config.auth,
73
- signingKey: resolveSecret(config.auth.signingKey, env) ?? config.auth.signingKey ?? null,
75
+ signingKey: resolveSecret(config.auth.signingKey, env),
74
76
  adminToken: resolveSecret(config.auth.adminToken, env),
75
77
  metricsToken: resolveSecret(config.auth.metricsToken, env),
76
78
  } : config.auth;
@@ -326,10 +328,10 @@ export function createSidecarRouter(ctx, options = {}) {
326
328
  if (sidecarPath === '/agent-api/tools' && req.method === 'GET') {
327
329
  return handleToolsList(req, res, ctx);
328
330
  }
329
- if (url.pathname.startsWith('/forge-admin/agents')) {
331
+ if (sidecarPath.startsWith('/forge-admin/agents')) {
330
332
  return handleAgents(req, res, ctx);
331
333
  }
332
- if (url.pathname.startsWith('/forge-admin/config')) {
334
+ if (sidecarPath.startsWith('/forge-admin/config')) {
333
335
  return handleAdminConfig(req, res, ctx);
334
336
  }
335
337
 
@@ -339,7 +341,7 @@ export function createSidecarRouter(ctx, options = {}) {
339
341
  try {
340
342
  sendJson(res, 200, await ctx.evalStore.getEvalSummary());
341
343
  } catch (err) {
342
- sendJson(res, 500, { error: 'Failed to fetch eval summary' });
344
+ if (!res.headersSent) sendJson(res, 500, { error: 'Failed to fetch eval summary' });
343
345
  }
344
346
  } else if (ctx.db) {
345
347
  try {
@@ -365,7 +367,7 @@ export function createSidecarRouter(ctx, options = {}) {
365
367
  try {
366
368
  sendJson(res, 200, await ctx.evalStore.listRuns(limit, offset));
367
369
  } catch (err) {
368
- sendJson(res, 500, { error: 'Failed to fetch eval runs' });
370
+ if (!res.headersSent) sendJson(res, 500, { error: 'Failed to fetch eval runs' });
369
371
  }
370
372
  } else if (ctx.db) {
371
373
  try {
@@ -381,7 +383,7 @@ export function createSidecarRouter(ctx, options = {}) {
381
383
  }
382
384
 
383
385
  // ── Widget static file serving ─────────────────────────────────────────
384
- if (url.pathname.startsWith('/widget/')) {
386
+ if (sidecarPath.startsWith('/widget/')) {
385
387
  serveWidgetFile(req, res, widgetDir, sendJson);
386
388
  return;
387
389
  }
@@ -393,7 +395,7 @@ export function createSidecarRouter(ctx, options = {}) {
393
395
  if (handled) return;
394
396
  } catch (err) {
395
397
  process.stderr.write(`[forge] customRoutes error: ${err.message}\n`);
396
- sendJson(res, 500, { error: 'Internal server error' });
398
+ if (!res.headersSent) sendJson(res, 500, { error: 'Internal server error' });
397
399
  return;
398
400
  }
399
401
  }
@@ -650,7 +652,8 @@ function createDirectServer() {
650
652
  if (sidecarPath === '/agent-api/user/preferences') {
651
653
  if (req.method === 'GET') return handleGetPreferences(req, res, sidecarCtx);
652
654
  if (req.method === 'PUT') return handlePutPreferences(req, res, sidecarCtx);
653
- else { json(res, 405, { error: 'Method not allowed' }); return; }
655
+ json(res, 405, { error: 'Method not allowed' });
656
+ return;
654
657
  }
655
658
  if (sidecarPath.startsWith('/agent-api/conversations')) {
656
659
  return handleConversations(req, res, sidecarCtx);
@@ -658,10 +661,22 @@ function createDirectServer() {
658
661
  if (sidecarPath === '/agent-api/tools' && req.method === 'GET') {
659
662
  return handleToolsList(req, res, sidecarCtx);
660
663
  }
661
- if (url.pathname.startsWith('/forge-admin/agents')) {
664
+ if (sidecarPath.startsWith('/forge-admin/agents')) {
665
+ const authCheck = applyRouteAuth(req, sidecarCtx, 2);
666
+ if (!authCheck.ok) {
667
+ if (authCheck.status === 401) res.setHeader('WWW-Authenticate', 'Bearer');
668
+ json(res, authCheck.status, { error: authCheck.error });
669
+ return;
670
+ }
662
671
  return handleAgents(req, res, sidecarCtx);
663
672
  }
664
- if (url.pathname.startsWith('/forge-admin/config')) {
673
+ if (sidecarPath.startsWith('/forge-admin/config')) {
674
+ const authCheck = applyRouteAuth(req, sidecarCtx, 2);
675
+ if (!authCheck.ok) {
676
+ if (authCheck.status === 401) res.setHeader('WWW-Authenticate', 'Bearer');
677
+ json(res, authCheck.status, { error: authCheck.error });
678
+ return;
679
+ }
665
680
  return handleAdminConfig(req, res, sidecarCtx);
666
681
  }
667
682
  }
@@ -39,13 +39,13 @@ export class HitlEngine {
39
39
  * Retrieve and consume the paused state for a resume token.
40
40
  * Returns null if the token has expired or does not exist (does not throw).
41
41
  */
42
- resume(resumeToken: string): Promise<unknown | null>;
42
+ resume(resumeToken: string): Promise<object | null>;
43
43
 
44
44
  /**
45
45
  * Tear down any backend connections (Redis subscriber, Postgres pool, etc.).
46
- * Call on graceful shutdown.
46
+ * Call on graceful shutdown. Synchronous.
47
47
  */
48
- destroy(): Promise<void>;
48
+ destroy(): void;
49
49
  }
50
50
 
51
51
  /**
package/lib/index.js CHANGED
@@ -8,8 +8,7 @@
8
8
  */
9
9
 
10
10
  import { readFileSync, existsSync, writeFileSync } from 'fs';
11
- import { resolve, dirname } from 'path';
12
- import { fileURLToPath } from 'url';
11
+ import { resolve } from 'path';
13
12
  import { runTui } from './tui.js';
14
13
  import { addEndpointManually } from './manual-entry.js';
15
14
  import * as readline from 'readline';
@@ -18,7 +17,7 @@ const CONFIG_FILE = 'forge.config.json';
18
17
  const PENDING_SPEC_FILE = 'forge-pending-tool.json';
19
18
 
20
19
  function findProjectRoot() {
21
- return resolve(dirname(fileURLToPath(import.meta.url)), '..');
20
+ return process.cwd();
22
21
  }
23
22
 
24
23
  function loadConfig() {
package/lib/init.js CHANGED
@@ -499,7 +499,7 @@ export async function runInit(opts = {}) {
499
499
  const adminKeyValue = hasSidecar ? generateAdminKey() : null;
500
500
 
501
501
  if (hasSidecar) {
502
- raw.sidecar = { enabled: true, port: 8001 };
502
+ raw.sidecar = { port: 8001 };
503
503
  raw.adminKey = '${FORGE_ADMIN_KEY}';
504
504
  raw.auth = { mode: authMode };
505
505
  if (authMode === 'verify') {
package/lib/sidecar.d.ts CHANGED
@@ -57,7 +57,7 @@ export interface SidecarRouterOptions {
57
57
  export function buildSidecarContext(config: SidecarConfig, db: object, env?: Record<string, string>, opts?: { configPath?: string }): Promise<SidecarContext>;
58
58
  export function createSidecarRouter(ctx: SidecarContext, opts?: SidecarRouterOptions): (req: object, res: object) => Promise<void>;
59
59
 
60
- export { createAuth } from './auth.js';
60
+ export { createAuth, resolveSecret, authenticateAdmin } from './auth.js';
61
61
  export type { AuthResult, AuthConfig, Authenticator } from './auth.js';
62
62
 
63
63
  export { reactLoop } from './react-engine.js';
@@ -93,6 +93,8 @@ export class AgentRegistry {
93
93
  export class VerifierRunner {
94
94
  constructor(db: object, config?: object, pgPool?: object | null, workerPool?: object | null);
95
95
  loadFromDb(db: object): Promise<void>;
96
- run(toolName: string, args: object, result: unknown): Promise<Array<{ outcome: 'pass' | 'warn' | 'block'; message: string | null; verifier: string }>>;
96
+ registerVerifiers(toolName: string, verifiers: object[]): void;
97
+ verify(toolName: string, args: object, result: unknown): Promise<{ outcome: 'pass' | 'warn' | 'block'; message: string | null; verifierName: string | null }>;
98
+ logResult(sessionId: string, toolName: string, result: object): void;
97
99
  destroy(): void;
98
100
  }
package/lib/sidecar.js CHANGED
@@ -155,7 +155,7 @@ export async function createSidecar(config = {}, options = {}) {
155
155
 
156
156
  // Re-exports for advanced consumers
157
157
  export { buildSidecarContext, createSidecarRouter } from './forge-service.js';
158
- export { createAuth } from './auth.js';
158
+ export { createAuth, resolveSecret, authenticateAdmin } from './auth.js';
159
159
  export { reactLoop } from './react-engine.js';
160
160
  export { mergeDefaults, validateConfig, CONFIG_DEFAULTS } from './config-schema.js';
161
161
  export { getDb } from './db.js';
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-tool-forge",
3
- "version": "0.4.7",
3
+ "version": "0.4.10",
4
4
  "description": "Production LLM agent sidecar + Claude Code skill library for building, testing, and running tool-calling agents.",
5
5
  "keywords": [
6
6
  "llm",
@@ -30,6 +30,7 @@
30
30
  "lib",
31
31
  "widget",
32
32
  "config",
33
+ "skills",
33
34
  "!lib/**/*.test.js",
34
35
  "!lib/__fixtures__",
35
36
  "!widget/**/*.test.js"
@@ -0,0 +1,69 @@
1
+ # /forge-eval — Generate Eval Suites
2
+
3
+ Generate golden and labeled eval JSON files for a named tool. Run this skill after a tool is implemented and tests are green.
4
+
5
+ ---
6
+
7
+ ## Step 1 — Identify the Tool
8
+
9
+ Ask the user which tool to generate evals for, or read it from context if `/forge-tool` just completed.
10
+
11
+ Read the tool's ToolDefinition from `tools/<name>.tool.js`:
12
+ - `name`, `description`, `schema`, `triggerPhrases`, `category`, `consequenceLevel`
13
+
14
+ ---
15
+
16
+ ## Step 2 — Generate Golden Eval Suite
17
+
18
+ Generate **5–10 golden cases** covering:
19
+ - Happy path with typical inputs
20
+ - Edge cases: empty results, boundary values, missing optional params
21
+ - Error paths: invalid input, service unavailable
22
+
23
+ Each golden case follows this schema:
24
+ ```json
25
+ {
26
+ "id": "case-001",
27
+ "description": "What this case tests",
28
+ "input": { "message": "User's natural-language request" },
29
+ "expectedTool": "<tool_name>",
30
+ "expectedArgs": { "param": "value" },
31
+ "checks": [
32
+ { "type": "tool_called", "tool": "<tool_name>" },
33
+ { "type": "arg_equals", "arg": "param", "value": "value" }
34
+ ]
35
+ }
36
+ ```
37
+
38
+ Write to `evals/<name>.golden.json` as a JSON array.
39
+
40
+ ---
41
+
42
+ ## Step 3 — Generate Labeled Eval Suite
43
+
44
+ Generate **2–3 labeled (multi-tool) scenarios** where the agent must choose between 2+ tools or sequence multiple calls:
45
+ - Scenario where the tool is the correct choice over a similar tool
46
+ - Scenario where the tool is called followed by a second tool
47
+ - Scenario where the tool should NOT be called (wrong intent)
48
+
49
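+ Each labeled case (`label` must be exactly one of `"correct"`, `"incorrect"`, or `"partial"`; `checks` is an array of check objects, abbreviated here to a single entry):
50
+ ```json
51
+ {
52
+ "id": "labeled-001",
53
+ "description": "What this scenario tests",
54
+ "input": { "message": "User's multi-intent request" },
55
+ "label": "correct",
56
+ "expectedTools": ["<tool_name>"],
57
+ "checks": [{ "type": "tool_called", "tool": "<tool_name>" }]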
58
+ }
59
+ ```
60
+
61
+ Write to `evals/<name>.labeled.json` as a JSON array.
62
+
63
+ ---
64
+
65
+ ## Step 4 — Validate
66
+
67
+ Run `node lib/index.js run --eval evals/<name>.golden.json --dry-run` if available to validate JSON schema.
68
+
69
+ Print a summary: N golden cases, M labeled scenarios, file paths written.
@@ -0,0 +1,265 @@
1
+ # Assertion Patterns — Deterministic Assertion Catalog
2
+
3
+ All eval assertions must be deterministic — identical results across runs given the same data. No LLM-as-judge.
4
+
5
+ ---
6
+
7
+ ## The Three Assertion Layers
8
+
9
+ ### Layer 1: responseContains (Hard Proof)
10
+
11
+ ALL values must appear in the response (substring match, case-sensitive).
12
+
13
+ **Use for:** Exact values that prove the tool returned real data. The LLM cannot guess or hallucinate these.
14
+
15
+ ```json
16
+ "responseContains": ["Paris", "72°F", "$30.05", "AAPL"]
17
+ ```
18
+
19
+ **Source values from:**
20
+ - Seed manifest: `{{seed:totals.dividends}}` → `"$30.05"`
21
+ - Live snapshot: `{{snapshot:performance.netWorth}}` → resolved at runtime
22
+ - Hardcoded: values not in either source (e.g., external ticker symbols)
23
+
24
+ **Rules:**
25
+ - Never hardcode values that depend on live data (prices, P&L, percentages)
26
+ - Use seed templates for stable values, snapshot templates for volatile ones
27
+ - Include at least 2 proof values per golden eval case
28
+
29
+ ### Layer 2: responseContainsAny (Domain Precision)
30
+
31
+ At least one value from EACH inner array (synonym group) must appear.
32
+
33
+ **Use for:** Correct vocabulary with phrasing flexibility. The agent shouldn't sound robotic, but must use the right terms.
34
+
35
+ ```json
36
+ "responseContainsAny": [
37
+ ["temperature", "degrees", "°"],
38
+ ["weather", "conditions", "forecast"]
39
+ ]
40
+ ```
41
+
42
+ This means: the response must contain at least one temperature-related term AND at least one weather-related term.
43
+
44
+ **Rules:**
45
+ - Each group = one concept with acceptable synonyms
46
+ - Keep groups small (2-4 terms) — too many dilutes the check
47
+ - Include both formal and informal variants
48
+
49
+ ### Layer 3: responseNotContains (Cop-Outs + Imprecision)
50
+
51
+ NONE of these values may appear in the response.
52
+
53
+ **Use for:**
54
+ - **Cop-outs:** `"I don't know"`, `"no information"`, `"unable to"`
55
+ - **JSON leaks:** `"fetchedAt"`, `"\"tool\":"`, `"undefined"`, `"null"`
56
+ - **Imprecision:** Wrong domain terms (e.g., "payment" when it should be "dividend")
57
+ - **Sensitive data:** `"API_KEY"`, `"OPENAI_API_KEY"`, `"Bearer "`
58
+ - **System prompt leaks:** `"AVAILABLE TOOLS"`, `"you are an AI"`
59
+
60
+ ```json
61
+ "responseNotContains": [
62
+ "I don't know", "no information", "unable to",
63
+ "fetchedAt", "\"tool\":", "undefined"
64
+ ]
65
+ ```
66
+
67
+ ---
68
+
69
+ ## Seed-Stable vs Market-Dynamic Values
70
+
71
+ Every data-derived assertion value falls into one of two categories (true constants, such as external identifiers, are simply hardcoded):
72
+
73
+ ### Seed-Stable Values
74
+
75
+ Derived from a seed data script and codified in a seed manifest file. These never change unless the seed script is re-authored.
76
+
77
+ **Template syntax:** `{{seed:path}}`
78
+
79
+ | Template | Example Resolution |
80
+ |----------|-------------------|
81
+ | `{{seed:holdings.equities[0]}}` | `"AAPL"` |
82
+ | `{{seed:quantities.AAPL.current}}` | `"7"` |
83
+ | `{{seed:totals.dividends}}` | `"$30.05"` |
84
+ | `{{seed:currency}}` | `"USD"` |
85
+
86
+ ### Market-Dynamic Values
87
+
88
+ Depend on live data (prices, P&L, allocations). Change on every run.
89
+
90
+ **Template syntax:** `{{snapshot:path}}`
91
+
92
+ | Template | Example Resolution |
93
+ |----------|-------------------|
94
+ | `{{snapshot:holdings.AAPL.value}}` | `"$1,599.50"` |
95
+ | `{{snapshot:performance.netWorth}}` | `"$13,245.00"` |
96
+ | `{{snapshot:performance.netPnlPct}}` | `"8.03%"` |
97
+
98
+ ### Resolution Rules
99
+
100
+ 1. Seed templates resolve before snapshot templates (allows mixing)
101
+ 2. If a path is missing, the individual assertion is skipped with a warning — not a hard failure
102
+ 3. Resolution happens in-memory only — eval JSON on disk is never modified
103
+
104
+ ### When to Use Each
105
+
106
+ | Value type | Source | Assertion style |
107
+ |-----------|--------|----------------|
108
+ | Fixed identifiers (names, IDs) | Seed manifest | `{{seed:...}}` |
109
+ | Fixed quantities (counts, amounts) | Seed manifest | `{{seed:...}}` |
110
+ | Domain terms | N/A | `responseContainsAny` synonym groups |
111
+ | External identifiers | N/A | Hardcoded |
112
+ | Current values (prices, P&L) | Live snapshot | `{{snapshot:...}}` |
113
+
114
+ **Rule of thumb:** If the value comes from your seed data, use `{{seed:*}}`. If it depends on live external data, use `{{snapshot:*}}`. If it's a constant, hardcode it.
115
+
116
+ ---
117
+
118
+ ## Negative Assertions
119
+
120
+ Critical for catching regressions:
121
+
122
+ ### Cop-out Detection
123
+ ```json
124
+ "responseNotContains": ["I don't know", "no information", "unable to", "I cannot"]
125
+ ```
126
+
127
+ ### JSON Leak Detection
128
+ ```json
129
+ "responseNotContains": ["fetchedAt", "\"tool\":", "\"error\":", "undefined", "null"]
130
+ ```
131
+
132
+ ### System Prompt Leak Detection
133
+ ```json
134
+ "responseNotContains": ["AVAILABLE TOOLS", "you are an AI", "system prompt"]
135
+ ```
136
+
137
+ ### Domain Imprecision
138
+ ```json
139
+ "responseNotContains": ["payment received"]
140
+ ```
141
+ (When the correct term is "dividend", catching the wrong term is a precision assertion.)
142
+
143
+ ---
144
+
145
+ ## Latency Assertions
146
+
147
+ ```json
148
+ "maxLatencyMs": 30000
149
+ ```
150
+
151
+ - Golden evals: 30s is typical (one LLM call + one tool call)
152
+ - Labeled straightforward: 30s (simple multi-tool)
153
+ - Labeled ambiguous: 30s (same)
154
+ - Edge/adversarial: 15s (should respond quickly without tool calls)
155
+
156
+ ---
157
+
158
+ ## Tool Routing Assertions
159
+
160
+ ### Exact match (golden + straightforward labeled)
161
+ ```json
162
+ "toolsCalled": ["get_weather", "get_forecast"]
163
+ ```
164
+ Both tools must be called. No more, no fewer.
165
+
166
+ ### Acceptable sets (ambiguous labeled)
167
+ ```json
168
+ "toolsAcceptable": [
169
+ ["get_weather"],
170
+ ["get_weather", "get_forecast"]
171
+ ]
172
+ ```
173
+ Either set is valid. The agent's judgment decides depth.
174
+
175
+ ### Negative routing (edge labeled)
176
+ ```json
177
+ "toolsNotCalled": ["delete_account"]
178
+ ```
179
+ This tool must NOT be called (e.g., on an injection attempt).
180
+
181
+ ### No tools needed (edge labeled)
182
+ ```json
183
+ "toolsAcceptable": [["__none__"]]
184
+ ```
185
+ The agent should answer from general knowledge without calling any tools.
186
+
187
+ ---
188
+
189
+ ## Parameter Assertions
190
+
191
+ Checks that the model passed correct arguments to the tool. This catches a failure class that routing assertions miss entirely: the model calls the right tool but passes wrong, missing, or hallucinated parameters.
192
+
193
+ ### Why This Matters
194
+
195
+ BFCL (the Berkeley Function-Calling Leaderboard) and Google ADK (Agent Development Kit) both test parameter-level accuracy. Without it, your evals only prove the model picked the right tool — not that it used it correctly. A model that calls `get_weather` with `city: "the weather"` instead of `city: "Paris"` passes all routing assertions and all response assertions (if the tool errors gracefully and the model recovers). The parameter assertion catches it.
196
+
197
+ ### Assertion Types
198
+
199
+ ```json
200
+ "toolParams": [
201
+ { "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Paris" },
202
+ { "tool": "get_weather", "paramName": "units", "assertion": "oneOf", "value": ["metric", "imperial"] }
203
+ ]
204
+ ```
205
+
206
+ | Assertion | Use When | Example |
207
+ |-----------|----------|---------|
208
+ | `equals` | Exact value known | `city` = `"Paris"` |
209
+ | `contains` | Model may normalize | `city` contains `"Tokyo"` (could be `"Tokyo"` or `"Tokyo, JP"`) |
210
+ | `oneOf` | Multiple valid values | `units` is `"metric"` or `"imperial"` |
211
+ | `exists` | Parameter must be provided | `city` was sent (any value) |
212
+ | `notExists` | Catch hallucinated params | `country_code` should not be sent if schema doesn't define it |
213
+ | `matches` | Format validation | `date` matches `^\d{4}-\d{2}-\d{2}$` |
214
+
215
+ ### Golden Eval Example
216
+
217
+ ```json
218
+ {
219
+ "id": "gs-get-weather-001",
220
+ "description": "trigger phrase — direct weather question",
221
+ "input": { "message": "What's the weather in Paris?" },
222
+ "expect": {
223
+ "toolsCalled": ["get_weather"],
224
+ "toolParams": [
225
+ { "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Paris" }
226
+ ],
227
+ "noToolErrors": true,
228
+ "responseNonEmpty": true,
229
+ "responseContains": ["Paris"],
230
+ "responseContainsAny": [["temperature", "degrees", "°"]],
231
+ "maxLatencyMs": 30000
232
+ }
233
+ }
234
+ ```
235
+
236
+ ### Multi-Tool Labeled Example
237
+
238
+ ```json
239
+ {
240
+ "id": "ls-get-weather-001",
241
+ "description": "straightforward — weather + forecast synthesis",
242
+ "difficulty": "straightforward",
243
+ "input": { "message": "What's the weather in Tokyo today and what should I expect this week?" },
244
+ "expect": {
245
+ "toolsCalled": ["get_weather", "get_forecast"],
246
+ "toolParams": [
247
+ { "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Tokyo" },
248
+ { "tool": "get_forecast", "paramName": "city", "assertion": "contains", "value": "Tokyo" }
249
+ ],
250
+ "noToolErrors": true,
251
+ "responseNonEmpty": true,
252
+ "responseContains": ["Tokyo"],
253
+ "maxLatencyMs": 30000
254
+ }
255
+ }
256
+ ```
257
+
258
+ ### Rules
259
+
260
+ - Use `contains` over `equals` by default — models normalize inputs in unpredictable ways
261
+ - Use `oneOf` for enum fields where defaults may vary by context
262
+ - Use `exists` sparingly — it's weak (any value passes). Prefer `contains` or `equals`.
263
+ - Parameter assertions are SKIPPED if the tool wasn't called (routing already failed)
264
+ - When `toolsAcceptable` is used, only assert params for tools that were actually called
265
+ - Use `{{seed:*}}` in values for data-dependent parameters: `{ "assertion": "equals", "value": "{{seed:holdings.equities[0]}}" }`