agent-tool-forge 0.4.7 → 0.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/config-schema.js +3 -3
- package/lib/forge-service.js +7 -4
- package/lib/hitl-engine.d.ts +3 -3
- package/lib/index.js +2 -3
- package/lib/init.js +1 -1
- package/lib/sidecar.d.ts +4 -2
- package/package.json +2 -1
- package/skills/forge-eval/SKILL.md +69 -0
- package/skills/forge-eval/references/assertion-patterns.md +265 -0
- package/skills/forge-eval/references/eval-types.md +262 -0
- package/skills/forge-eval/references/overlap-map.md +89 -0
- package/skills/forge-mcp/SKILL.md +62 -0
- package/skills/forge-mcp/references/mcp-templates.md +302 -0
- package/skills/forge-mcp/references/tool-to-mcp-mapping.md +108 -0
- package/skills/forge-tool/SKILL.md +112 -0
- package/skills/forge-tool/references/description-contract.md +102 -0
- package/skills/forge-tool/references/extension-points.md +120 -0
- package/skills/forge-tool/references/pending-spec.md +53 -0
- package/skills/forge-tool/references/tool-shape.md +106 -0
- package/skills/forge-verifier/SKILL.md +78 -0
- package/skills/forge-verifier/references/output-groups.md +39 -0
- package/skills/forge-verifier/references/verifier-pattern.md +83 -0
- package/skills/forge-verifier/references/verifier-stubs.md +147 -0
package/lib/config-schema.js
CHANGED
|
@@ -100,9 +100,9 @@ export function validateConfig(raw = {}) {
|
|
|
100
100
|
errors.push('auth.signingKey is required when auth.mode is "verify"');
|
|
101
101
|
}
|
|
102
102
|
|
|
103
|
-
//
|
|
104
|
-
if (raw.
|
|
105
|
-
errors.push('auth.signingKey is required when auth.mode is "verify"
|
|
103
|
+
// verify mode always requires a signingKey
|
|
104
|
+
if (raw.auth?.mode === 'verify' && !raw.auth?.signingKey) {
|
|
105
|
+
errors.push('auth.signingKey is required when auth.mode is "verify". Set it in forge.config.json or via a ${ENV_VAR} reference.');
|
|
106
106
|
}
|
|
107
107
|
|
|
108
108
|
// defaultHitlLevel
|
package/lib/forge-service.js
CHANGED
|
@@ -67,10 +67,12 @@ const PROJECT_ROOT = resolve(__dirname, '..');
|
|
|
67
67
|
* @returns {Promise<{ auth, promptStore, preferenceStore, conversationStore, hitlEngine, verifierRunner, agentRegistry, db, config, env, rateLimiter, configPath, evalStore, chatAuditStore, verifierStore, pgStore, _redisClient, _pgPool }>}
|
|
68
68
|
*/
|
|
69
69
|
export async function buildSidecarContext(config, db, env = {}, opts = {}) {
|
|
70
|
-
// Resolve ${VAR} references in auth token fields at startup, not per-request
|
|
70
|
+
// Resolve ${VAR} references in auth token fields at startup, not per-request.
|
|
71
|
+
// No fallback for signingKey: if the env var is absent, resolve to null so createAuth
|
|
72
|
+
// fails-closed in verify mode rather than using the literal "${VAR}" string as the key.
|
|
71
73
|
const resolvedAuth = config.auth ? {
|
|
72
74
|
...config.auth,
|
|
73
|
-
signingKey: resolveSecret(config.auth.signingKey, env)
|
|
75
|
+
signingKey: resolveSecret(config.auth.signingKey, env),
|
|
74
76
|
adminToken: resolveSecret(config.auth.adminToken, env),
|
|
75
77
|
metricsToken: resolveSecret(config.auth.metricsToken, env),
|
|
76
78
|
} : config.auth;
|
|
@@ -393,7 +395,7 @@ export function createSidecarRouter(ctx, options = {}) {
|
|
|
393
395
|
if (handled) return;
|
|
394
396
|
} catch (err) {
|
|
395
397
|
process.stderr.write(`[forge] customRoutes error: ${err.message}\n`);
|
|
396
|
-
sendJson(res, 500, { error: 'Internal server error' });
|
|
398
|
+
if (!res.headersSent) sendJson(res, 500, { error: 'Internal server error' });
|
|
397
399
|
return;
|
|
398
400
|
}
|
|
399
401
|
}
|
|
@@ -650,7 +652,8 @@ function createDirectServer() {
|
|
|
650
652
|
if (sidecarPath === '/agent-api/user/preferences') {
|
|
651
653
|
if (req.method === 'GET') return handleGetPreferences(req, res, sidecarCtx);
|
|
652
654
|
if (req.method === 'PUT') return handlePutPreferences(req, res, sidecarCtx);
|
|
653
|
-
|
|
655
|
+
json(res, 405, { error: 'Method not allowed' });
|
|
656
|
+
return;
|
|
654
657
|
}
|
|
655
658
|
if (sidecarPath.startsWith('/agent-api/conversations')) {
|
|
656
659
|
return handleConversations(req, res, sidecarCtx);
|
package/lib/hitl-engine.d.ts
CHANGED
|
@@ -39,13 +39,13 @@ export class HitlEngine {
|
|
|
39
39
|
* Retrieve and consume the paused state for a resume token.
|
|
40
40
|
* Returns null if the token has expired or does not exist (does not throw).
|
|
41
41
|
*/
|
|
42
|
-
resume(resumeToken: string): Promise<
|
|
42
|
+
resume(resumeToken: string): Promise<object | null>;
|
|
43
43
|
|
|
44
44
|
/**
|
|
45
45
|
* Tear down any backend connections (Redis subscriber, Postgres pool, etc.).
|
|
46
|
-
* Call on graceful shutdown.
|
|
46
|
+
* Call on graceful shutdown. Synchronous.
|
|
47
47
|
*/
|
|
48
|
-
destroy():
|
|
48
|
+
destroy(): void;
|
|
49
49
|
}
|
|
50
50
|
|
|
51
51
|
/**
|
package/lib/index.js
CHANGED
|
@@ -8,8 +8,7 @@
|
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
10
|
import { readFileSync, existsSync, writeFileSync } from 'fs';
|
|
11
|
-
import { resolve
|
|
12
|
-
import { fileURLToPath } from 'url';
|
|
11
|
+
import { resolve } from 'path';
|
|
13
12
|
import { runTui } from './tui.js';
|
|
14
13
|
import { addEndpointManually } from './manual-entry.js';
|
|
15
14
|
import * as readline from 'readline';
|
|
@@ -18,7 +17,7 @@ const CONFIG_FILE = 'forge.config.json';
|
|
|
18
17
|
const PENDING_SPEC_FILE = 'forge-pending-tool.json';
|
|
19
18
|
|
|
20
19
|
function findProjectRoot() {
|
|
21
|
-
return
|
|
20
|
+
return process.cwd();
|
|
22
21
|
}
|
|
23
22
|
|
|
24
23
|
function loadConfig() {
|
package/lib/init.js
CHANGED
|
@@ -499,7 +499,7 @@ export async function runInit(opts = {}) {
|
|
|
499
499
|
const adminKeyValue = hasSidecar ? generateAdminKey() : null;
|
|
500
500
|
|
|
501
501
|
if (hasSidecar) {
|
|
502
|
-
raw.sidecar = {
|
|
502
|
+
raw.sidecar = { port: 8001 };
|
|
503
503
|
raw.adminKey = '${FORGE_ADMIN_KEY}';
|
|
504
504
|
raw.auth = { mode: authMode };
|
|
505
505
|
if (authMode === 'verify') {
|
package/lib/sidecar.d.ts
CHANGED
|
@@ -57,7 +57,7 @@ export interface SidecarRouterOptions {
|
|
|
57
57
|
export function buildSidecarContext(config: SidecarConfig, db: object, env?: Record<string, string>, opts?: { configPath?: string }): Promise<SidecarContext>;
|
|
58
58
|
export function createSidecarRouter(ctx: SidecarContext, opts?: SidecarRouterOptions): (req: object, res: object) => Promise<void>;
|
|
59
59
|
|
|
60
|
-
export { createAuth } from './auth.js';
|
|
60
|
+
export { createAuth, resolveSecret, authenticateAdmin } from './auth.js';
|
|
61
61
|
export type { AuthResult, AuthConfig, Authenticator } from './auth.js';
|
|
62
62
|
|
|
63
63
|
export { reactLoop } from './react-engine.js';
|
|
@@ -93,6 +93,8 @@ export class AgentRegistry {
|
|
|
93
93
|
export class VerifierRunner {
|
|
94
94
|
constructor(db: object, config?: object, pgPool?: object | null, workerPool?: object | null);
|
|
95
95
|
loadFromDb(db: object): Promise<void>;
|
|
96
|
-
|
|
96
|
+
registerVerifiers(toolName: string, verifiers: object[]): void;
|
|
97
|
+
verify(toolName: string, args: object, result: unknown): Promise<{ outcome: 'pass' | 'warn' | 'block'; message: string | null; verifierName: string | null }>;
|
|
98
|
+
logResult(sessionId: string, toolName: string, result: object): void;
|
|
97
99
|
destroy(): void;
|
|
98
100
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-tool-forge",
|
|
3
|
-
"version": "0.4.7",
|
|
3
|
+
"version": "0.4.9",
|
|
4
4
|
"description": "Production LLM agent sidecar + Claude Code skill library for building, testing, and running tool-calling agents.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"llm",
|
|
@@ -30,6 +30,7 @@
|
|
|
30
30
|
"lib",
|
|
31
31
|
"widget",
|
|
32
32
|
"config",
|
|
33
|
+
"skills",
|
|
33
34
|
"!lib/**/*.test.js",
|
|
34
35
|
"!lib/__fixtures__",
|
|
35
36
|
"!widget/**/*.test.js"
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# /forge-eval — Generate Eval Suites
|
|
2
|
+
|
|
3
|
+
Generate golden and labeled eval JSON files for a named tool. Run this skill after a tool is implemented and tests are green.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Step 1 — Identify the Tool
|
|
8
|
+
|
|
9
|
+
Ask the user which tool to generate evals for, or read it from context if `/forge-tool` just completed.
|
|
10
|
+
|
|
11
|
+
Read the tool's ToolDefinition from `tools/<name>.tool.js`:
|
|
12
|
+
- `name`, `description`, `schema`, `triggerPhrases`, `category`, `consequenceLevel`
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Step 2 — Generate Golden Eval Suite
|
|
17
|
+
|
|
18
|
+
Generate **5–10 golden cases** covering:
|
|
19
|
+
- Happy path with typical inputs
|
|
20
|
+
- Edge cases: empty results, boundary values, missing optional params
|
|
21
|
+
- Error paths: invalid input, service unavailable
|
|
22
|
+
|
|
23
|
+
Each golden case follows this schema:
|
|
24
|
+
```json
|
|
25
|
+
{
|
|
26
|
+
"id": "case-001",
|
|
27
|
+
"description": "What this case tests",
|
|
28
|
+
"input": { "message": "User's natural-language request" },
|
|
29
|
+
"expectedTool": "<tool_name>",
|
|
30
|
+
"expectedArgs": { "param": "value" },
|
|
31
|
+
"checks": [
|
|
32
|
+
{ "type": "tool_called", "tool": "<tool_name>" },
|
|
33
|
+
{ "type": "arg_equals", "arg": "param", "value": "value" }
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Write to `evals/<name>.golden.json` as a JSON array.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Step 3 — Generate Labeled Eval Suite
|
|
43
|
+
|
|
44
|
+
Generate **2–3 labeled (multi-tool) scenarios** where the agent must choose between 2+ tools or sequence multiple calls:
|
|
45
|
+
- Scenario where the tool is the correct choice over a similar tool
|
|
46
|
+
- Scenario where the tool is called followed by a second tool
|
|
47
|
+
- Scenario where the tool should NOT be called (wrong intent)
|
|
48
|
+
|
|
49
|
+
Each labeled case:
|
|
50
|
+
```json
|
|
51
|
+
{
|
|
52
|
+
"id": "labeled-001",
|
|
53
|
+
"description": "What this scenario tests",
|
|
54
|
+
"input": { "message": "User's multi-intent request" },
|
|
55
|
+
"label": "correct" | "incorrect" | "partial",
|
|
56
|
+
"expectedTools": ["<tool_name>"],
|
|
57
|
+
"checks": [...]
|
|
58
|
+
}
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Write to `evals/<name>.labeled.json` as a JSON array.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Step 4 — Validate
|
|
66
|
+
|
|
67
|
+
Run `node lib/index.js run --eval evals/<name>.golden.json --dry-run` if available to validate JSON schema.
|
|
68
|
+
|
|
69
|
+
Print a summary: N golden cases, M labeled scenarios, file paths written.
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# Assertion Patterns — Deterministic Assertion Catalog
|
|
2
|
+
|
|
3
|
+
All eval assertions must be deterministic — identical results across runs given the same data. No LLM-as-judge.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## The Three Assertion Layers
|
|
8
|
+
|
|
9
|
+
### Layer 1: responseContains (Hard Proof)
|
|
10
|
+
|
|
11
|
+
ALL values must appear in the response (substring match, case-sensitive).
|
|
12
|
+
|
|
13
|
+
**Use for:** Exact values that prove the tool returned real data. The LLM cannot guess or hallucinate these.
|
|
14
|
+
|
|
15
|
+
```json
|
|
16
|
+
"responseContains": ["Paris", "72°F", "$30.05", "AAPL"]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
**Source values from:**
|
|
20
|
+
- Seed manifest: `{{seed:totals.dividends}}` → `"$30.05"`
|
|
21
|
+
- Live snapshot: `{{snapshot:performance.netWorth}}` → resolved at runtime
|
|
22
|
+
- Hardcoded: values not in either source (e.g., external ticker symbols)
|
|
23
|
+
|
|
24
|
+
**Rules:**
|
|
25
|
+
- Never hardcode values that depend on live data (prices, P&L, percentages)
|
|
26
|
+
- Use seed templates for stable values, snapshot templates for volatile ones
|
|
27
|
+
- Include at least 2 proof values per golden eval case
|
|
28
|
+
|
|
29
|
+
### Layer 2: responseContainsAny (Domain Precision)
|
|
30
|
+
|
|
31
|
+
At least one value from EACH inner array (synonym group) must appear.
|
|
32
|
+
|
|
33
|
+
**Use for:** Correct vocabulary with phrasing flexibility. The agent shouldn't sound robotic, but must use the right terms.
|
|
34
|
+
|
|
35
|
+
```json
|
|
36
|
+
"responseContainsAny": [
|
|
37
|
+
["temperature", "degrees", "°"],
|
|
38
|
+
["weather", "conditions", "forecast"]
|
|
39
|
+
]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
This means: the response must contain at least one temperature-related term AND at least one weather-related term.
|
|
43
|
+
|
|
44
|
+
**Rules:**
|
|
45
|
+
- Each group = one concept with acceptable synonyms
|
|
46
|
+
- Keep groups small (2-4 terms) — too many dilutes the check
|
|
47
|
+
- Include both formal and informal variants
|
|
48
|
+
|
|
49
|
+
### Layer 3: responseNotContains (Cop-Outs + Imprecision)
|
|
50
|
+
|
|
51
|
+
NONE of these values may appear in the response.
|
|
52
|
+
|
|
53
|
+
**Use for:**
|
|
54
|
+
- **Cop-outs:** `"I don't know"`, `"no information"`, `"unable to"`
|
|
55
|
+
- **JSON leaks:** `"fetchedAt"`, `"\"tool\":"`, `"undefined"`, `"null"`
|
|
56
|
+
- **Imprecision:** Wrong domain terms (e.g., "payment" when it should be "dividend")
|
|
57
|
+
- **Sensitive data:** `"API_KEY"`, `"OPENAI_API_KEY"`, `"Bearer "`
|
|
58
|
+
- **System prompt leaks:** `"AVAILABLE TOOLS"`, `"you are an AI"`
|
|
59
|
+
|
|
60
|
+
```json
|
|
61
|
+
"responseNotContains": [
|
|
62
|
+
"I don't know", "no information", "unable to",
|
|
63
|
+
"fetchedAt", "\"tool\":", "undefined"
|
|
64
|
+
]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Seed-Stable vs Market-Dynamic Values
|
|
70
|
+
|
|
71
|
+
Every assertion value falls into one of two categories:
|
|
72
|
+
|
|
73
|
+
### Seed-Stable Values
|
|
74
|
+
|
|
75
|
+
Derived from a seed data script and codified in a seed manifest file. These never change unless the seed script is re-authored.
|
|
76
|
+
|
|
77
|
+
**Template syntax:** `{{seed:path}}`
|
|
78
|
+
|
|
79
|
+
| Template | Example Resolution |
|
|
80
|
+
|----------|-------------------|
|
|
81
|
+
| `{{seed:holdings.equities[0]}}` | `"AAPL"` |
|
|
82
|
+
| `{{seed:quantities.AAPL.current}}` | `"7"` |
|
|
83
|
+
| `{{seed:totals.dividends}}` | `"$30.05"` |
|
|
84
|
+
| `{{seed:currency}}` | `"USD"` |
|
|
85
|
+
|
|
86
|
+
### Market-Dynamic Values
|
|
87
|
+
|
|
88
|
+
Depend on live data (prices, P&L, allocations). Change on every run.
|
|
89
|
+
|
|
90
|
+
**Template syntax:** `{{snapshot:path}}`
|
|
91
|
+
|
|
92
|
+
| Template | Example Resolution |
|
|
93
|
+
|----------|-------------------|
|
|
94
|
+
| `{{snapshot:holdings.AAPL.value}}` | `"$1,599.50"` |
|
|
95
|
+
| `{{snapshot:performance.netWorth}}` | `"$13,245.00"` |
|
|
96
|
+
| `{{snapshot:performance.netPnlPct}}` | `"8.03%"` |
|
|
97
|
+
|
|
98
|
+
### Resolution Rules
|
|
99
|
+
|
|
100
|
+
1. Seed templates resolve before snapshot templates (allows mixing)
|
|
101
|
+
2. If a path is missing, the individual assertion is skipped with a warning — not a hard failure
|
|
102
|
+
3. Resolution happens in-memory only — eval JSON on disk is never modified
|
|
103
|
+
|
|
104
|
+
### When to Use Each
|
|
105
|
+
|
|
106
|
+
| Value type | Source | Assertion style |
|
|
107
|
+
|-----------|--------|----------------|
|
|
108
|
+
| Fixed identifiers (names, IDs) | Seed manifest | `{{seed:...}}` |
|
|
109
|
+
| Fixed quantities (counts, amounts) | Seed manifest | `{{seed:...}}` |
|
|
110
|
+
| Domain terms | N/A | `responseContainsAny` synonym groups |
|
|
111
|
+
| External identifiers | N/A | Hardcoded |
|
|
112
|
+
| Current values (prices, P&L) | Live snapshot | `{{snapshot:...}}` |
|
|
113
|
+
|
|
114
|
+
**Rule of thumb:** If the value comes from your seed data, use `{{seed:*}}`. If it depends on live external data, use `{{snapshot:*}}`. If it's a constant, hardcode it.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Negative Assertions
|
|
119
|
+
|
|
120
|
+
Critical for catching regressions:
|
|
121
|
+
|
|
122
|
+
### Cop-out Detection
|
|
123
|
+
```json
|
|
124
|
+
"responseNotContains": ["I don't know", "no information", "unable to", "I cannot"]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### JSON Leak Detection
|
|
128
|
+
```json
|
|
129
|
+
"responseNotContains": ["fetchedAt", "\"tool\":", "\"error\":", "undefined", "null"]
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### System Prompt Leak Detection
|
|
133
|
+
```json
|
|
134
|
+
"responseNotContains": ["AVAILABLE TOOLS", "you are an AI", "system prompt"]
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Domain Imprecision
|
|
138
|
+
```json
|
|
139
|
+
"responseNotContains": ["payment received"]
|
|
140
|
+
```
|
|
141
|
+
(When the correct term is "dividend", catching the wrong term is a precision assertion.)
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Latency Assertions
|
|
146
|
+
|
|
147
|
+
```json
|
|
148
|
+
"maxLatencyMs": 30000
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
- Golden evals: 30s is typical (one LLM call + one tool call)
|
|
152
|
+
- Labeled straightforward: 30s (simple multi-tool)
|
|
153
|
+
- Labeled ambiguous: 30s (same)
|
|
154
|
+
- Edge/adversarial: 15s (should respond quickly without tool calls)
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Tool Routing Assertions
|
|
159
|
+
|
|
160
|
+
### Exact match (golden + straightforward labeled)
|
|
161
|
+
```json
|
|
162
|
+
"toolsCalled": ["get_weather", "get_forecast"]
|
|
163
|
+
```
|
|
164
|
+
Both tools must be called. No more, no fewer.
|
|
165
|
+
|
|
166
|
+
### Acceptable sets (ambiguous labeled)
|
|
167
|
+
```json
|
|
168
|
+
"toolsAcceptable": [
|
|
169
|
+
["get_weather"],
|
|
170
|
+
["get_weather", "get_forecast"]
|
|
171
|
+
]
|
|
172
|
+
```
|
|
173
|
+
Either set is valid. The agent's judgment decides depth.
|
|
174
|
+
|
|
175
|
+
### Negative routing (edge labeled)
|
|
176
|
+
```json
|
|
177
|
+
"toolsNotCalled": ["delete_account"]
|
|
178
|
+
```
|
|
179
|
+
This tool must NOT be called (e.g., on an injection attempt).
|
|
180
|
+
|
|
181
|
+
### No tools needed (edge labeled)
|
|
182
|
+
```json
|
|
183
|
+
"toolsAcceptable": [["__none__"]]
|
|
184
|
+
```
|
|
185
|
+
The agent should answer from general knowledge without calling any tools.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Parameter Assertions
|
|
190
|
+
|
|
191
|
+
Checks that the model passed correct arguments to the tool. This catches a failure class that routing assertions miss entirely: the model calls the right tool but passes wrong, missing, or hallucinated parameters.
|
|
192
|
+
|
|
193
|
+
### Why This Matters
|
|
194
|
+
|
|
195
|
+
BFCL and Google ADK both test parameter-level accuracy. Without it, your evals only prove the model picked the right tool — not that it used it correctly. A model that calls `get_weather` with `city: "the weather"` instead of `city: "Paris"` passes all routing assertions and all response assertions (if the tool errors gracefully and the model recovers). The parameter assertion catches it.
|
|
196
|
+
|
|
197
|
+
### Assertion Types
|
|
198
|
+
|
|
199
|
+
```json
|
|
200
|
+
"toolParams": [
|
|
201
|
+
{ "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Paris" },
|
|
202
|
+
{ "tool": "get_weather", "paramName": "units", "assertion": "oneOf", "value": ["metric", "imperial"] }
|
|
203
|
+
]
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
| Assertion | Use When | Example |
|
|
207
|
+
|-----------|----------|---------|
|
|
208
|
+
| `equals` | Exact value known | `city` = `"Paris"` |
|
|
209
|
+
| `contains` | Model may normalize | `city` contains `"Tokyo"` (could be `"Tokyo"` or `"Tokyo, JP"`) |
|
|
210
|
+
| `oneOf` | Multiple valid values | `units` is `"metric"` or `"imperial"` |
|
|
211
|
+
| `exists` | Parameter must be provided | `city` was sent (any value) |
|
|
212
|
+
| `notExists` | Catch hallucinated params | `country_code` should not be sent if schema doesn't define it |
|
|
213
|
+
| `matches` | Format validation | `date` matches `^\d{4}-\d{2}-\d{2}$` |
|
|
214
|
+
|
|
215
|
+
### Golden Eval Example
|
|
216
|
+
|
|
217
|
+
```json
|
|
218
|
+
{
|
|
219
|
+
"id": "gs-get-weather-001",
|
|
220
|
+
"description": "trigger phrase — direct weather question",
|
|
221
|
+
"input": { "message": "What's the weather in Paris?" },
|
|
222
|
+
"expect": {
|
|
223
|
+
"toolsCalled": ["get_weather"],
|
|
224
|
+
"toolParams": [
|
|
225
|
+
{ "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Paris" }
|
|
226
|
+
],
|
|
227
|
+
"noToolErrors": true,
|
|
228
|
+
"responseNonEmpty": true,
|
|
229
|
+
"responseContains": ["Paris"],
|
|
230
|
+
"responseContainsAny": [["temperature", "degrees", "°"]],
|
|
231
|
+
"maxLatencyMs": 30000
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### Multi-Tool Labeled Example
|
|
237
|
+
|
|
238
|
+
```json
|
|
239
|
+
{
|
|
240
|
+
"id": "ls-get-weather-001",
|
|
241
|
+
"description": "straightforward — weather + forecast synthesis",
|
|
242
|
+
"difficulty": "straightforward",
|
|
243
|
+
"input": { "message": "What's the weather in Tokyo today and what should I expect this week?" },
|
|
244
|
+
"expect": {
|
|
245
|
+
"toolsCalled": ["get_weather", "get_forecast"],
|
|
246
|
+
"toolParams": [
|
|
247
|
+
{ "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Tokyo" },
|
|
248
|
+
{ "tool": "get_forecast", "paramName": "city", "assertion": "contains", "value": "Tokyo" }
|
|
249
|
+
],
|
|
250
|
+
"noToolErrors": true,
|
|
251
|
+
"responseNonEmpty": true,
|
|
252
|
+
"responseContains": ["Tokyo"],
|
|
253
|
+
"maxLatencyMs": 30000
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### Rules
|
|
259
|
+
|
|
260
|
+
- Use `contains` over `equals` by default — models normalize inputs in unpredictable ways
|
|
261
|
+
- Use `oneOf` for enum fields where defaults may vary by context
|
|
262
|
+
- Use `exists` sparingly — it's weak (any value passes). Prefer `contains` or `equals`.
|
|
263
|
+
- Parameter assertions are SKIPPED if the tool wasn't called (routing already failed)
|
|
264
|
+
- When `toolsAcceptable` is used, only assert params for tools that were actually called
|
|
265
|
+
- Use `{{seed:*}}` in values for data-dependent parameters: `{ "assertion": "equals", "value": "{{seed:holdings.equities[0]}}" }`
|