agent-tool-forge 0.4.7 → 0.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/auth.d.ts +1 -1
- package/lib/config-schema.js +1 -6
- package/lib/forge-service.js +26 -11
- package/lib/hitl-engine.d.ts +3 -3
- package/lib/index.js +2 -3
- package/lib/init.js +1 -1
- package/lib/sidecar.d.ts +4 -2
- package/lib/sidecar.js +1 -1
- package/package.json +2 -1
- package/skills/forge-eval/SKILL.md +69 -0
- package/skills/forge-eval/references/assertion-patterns.md +265 -0
- package/skills/forge-eval/references/eval-types.md +262 -0
- package/skills/forge-eval/references/overlap-map.md +89 -0
- package/skills/forge-mcp/SKILL.md +62 -0
- package/skills/forge-mcp/references/mcp-templates.md +302 -0
- package/skills/forge-mcp/references/tool-to-mcp-mapping.md +108 -0
- package/skills/forge-tool/SKILL.md +112 -0
- package/skills/forge-tool/references/description-contract.md +102 -0
- package/skills/forge-tool/references/extension-points.md +120 -0
- package/skills/forge-tool/references/pending-spec.md +53 -0
- package/skills/forge-tool/references/tool-shape.md +106 -0
- package/skills/forge-verifier/SKILL.md +78 -0
- package/skills/forge-verifier/references/output-groups.md +39 -0
- package/skills/forge-verifier/references/verifier-pattern.md +83 -0
- package/skills/forge-verifier/references/verifier-stubs.md +147 -0
package/lib/auth.d.ts
CHANGED
package/lib/config-schema.js
CHANGED
|
@@ -97,12 +97,7 @@ export function validateConfig(raw = {}) {
|
|
|
97
97
|
|
|
98
98
|
// auth.mode = 'verify' requires signingKey
|
|
99
99
|
if (raw.auth?.mode === 'verify' && !raw.auth?.signingKey) {
|
|
100
|
-
errors.push('auth.signingKey is required when auth.mode is "verify"');
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
// Startup validation: sidecar enabled + verify mode + no signingKey
|
|
104
|
-
if (raw.sidecar?.enabled && raw.auth?.mode === 'verify' && !raw.auth?.signingKey) {
|
|
105
|
-
errors.push('auth.signingKey is required when auth.mode is "verify" and sidecar is enabled. Set FORGE_JWT_KEY in .env');
|
|
100
|
+
errors.push('auth.signingKey is required when auth.mode is "verify". Set it in forge.config.json or via a ${ENV_VAR} reference.');
|
|
106
101
|
}
|
|
107
102
|
|
|
108
103
|
// defaultHitlLevel
|
package/lib/forge-service.js
CHANGED
|
@@ -67,10 +67,12 @@ const PROJECT_ROOT = resolve(__dirname, '..');
|
|
|
67
67
|
* @returns {Promise<{ auth, promptStore, preferenceStore, conversationStore, hitlEngine, verifierRunner, agentRegistry, db, config, env, rateLimiter, configPath, evalStore, chatAuditStore, verifierStore, pgStore, _redisClient, _pgPool }>}
|
|
68
68
|
*/
|
|
69
69
|
export async function buildSidecarContext(config, db, env = {}, opts = {}) {
|
|
70
|
-
// Resolve ${VAR} references in auth token fields at startup, not per-request
|
|
70
|
+
// Resolve ${VAR} references in auth token fields at startup, not per-request.
|
|
71
|
+
// No fallback for signingKey: if the env var is absent, resolve to null so createAuth
|
|
72
|
+
// fails-closed in verify mode rather than using the literal "${VAR}" string as the key.
|
|
71
73
|
const resolvedAuth = config.auth ? {
|
|
72
74
|
...config.auth,
|
|
73
|
-
signingKey: resolveSecret(config.auth.signingKey, env)
|
|
75
|
+
signingKey: resolveSecret(config.auth.signingKey, env),
|
|
74
76
|
adminToken: resolveSecret(config.auth.adminToken, env),
|
|
75
77
|
metricsToken: resolveSecret(config.auth.metricsToken, env),
|
|
76
78
|
} : config.auth;
|
|
@@ -326,10 +328,10 @@ export function createSidecarRouter(ctx, options = {}) {
|
|
|
326
328
|
if (sidecarPath === '/agent-api/tools' && req.method === 'GET') {
|
|
327
329
|
return handleToolsList(req, res, ctx);
|
|
328
330
|
}
|
|
329
|
-
if (
|
|
331
|
+
if (sidecarPath.startsWith('/forge-admin/agents')) {
|
|
330
332
|
return handleAgents(req, res, ctx);
|
|
331
333
|
}
|
|
332
|
-
if (
|
|
334
|
+
if (sidecarPath.startsWith('/forge-admin/config')) {
|
|
333
335
|
return handleAdminConfig(req, res, ctx);
|
|
334
336
|
}
|
|
335
337
|
|
|
@@ -339,7 +341,7 @@ export function createSidecarRouter(ctx, options = {}) {
|
|
|
339
341
|
try {
|
|
340
342
|
sendJson(res, 200, await ctx.evalStore.getEvalSummary());
|
|
341
343
|
} catch (err) {
|
|
342
|
-
sendJson(res, 500, { error: 'Failed to fetch eval summary' });
|
|
344
|
+
if (!res.headersSent) sendJson(res, 500, { error: 'Failed to fetch eval summary' });
|
|
343
345
|
}
|
|
344
346
|
} else if (ctx.db) {
|
|
345
347
|
try {
|
|
@@ -365,7 +367,7 @@ export function createSidecarRouter(ctx, options = {}) {
|
|
|
365
367
|
try {
|
|
366
368
|
sendJson(res, 200, await ctx.evalStore.listRuns(limit, offset));
|
|
367
369
|
} catch (err) {
|
|
368
|
-
sendJson(res, 500, { error: 'Failed to fetch eval runs' });
|
|
370
|
+
if (!res.headersSent) sendJson(res, 500, { error: 'Failed to fetch eval runs' });
|
|
369
371
|
}
|
|
370
372
|
} else if (ctx.db) {
|
|
371
373
|
try {
|
|
@@ -381,7 +383,7 @@ export function createSidecarRouter(ctx, options = {}) {
|
|
|
381
383
|
}
|
|
382
384
|
|
|
383
385
|
// ── Widget static file serving ─────────────────────────────────────────
|
|
384
|
-
if (
|
|
386
|
+
if (sidecarPath.startsWith('/widget/')) {
|
|
385
387
|
serveWidgetFile(req, res, widgetDir, sendJson);
|
|
386
388
|
return;
|
|
387
389
|
}
|
|
@@ -393,7 +395,7 @@ export function createSidecarRouter(ctx, options = {}) {
|
|
|
393
395
|
if (handled) return;
|
|
394
396
|
} catch (err) {
|
|
395
397
|
process.stderr.write(`[forge] customRoutes error: ${err.message}\n`);
|
|
396
|
-
sendJson(res, 500, { error: 'Internal server error' });
|
|
398
|
+
if (!res.headersSent) sendJson(res, 500, { error: 'Internal server error' });
|
|
397
399
|
return;
|
|
398
400
|
}
|
|
399
401
|
}
|
|
@@ -650,7 +652,8 @@ function createDirectServer() {
|
|
|
650
652
|
if (sidecarPath === '/agent-api/user/preferences') {
|
|
651
653
|
if (req.method === 'GET') return handleGetPreferences(req, res, sidecarCtx);
|
|
652
654
|
if (req.method === 'PUT') return handlePutPreferences(req, res, sidecarCtx);
|
|
653
|
-
|
|
655
|
+
json(res, 405, { error: 'Method not allowed' });
|
|
656
|
+
return;
|
|
654
657
|
}
|
|
655
658
|
if (sidecarPath.startsWith('/agent-api/conversations')) {
|
|
656
659
|
return handleConversations(req, res, sidecarCtx);
|
|
@@ -658,10 +661,22 @@ function createDirectServer() {
|
|
|
658
661
|
if (sidecarPath === '/agent-api/tools' && req.method === 'GET') {
|
|
659
662
|
return handleToolsList(req, res, sidecarCtx);
|
|
660
663
|
}
|
|
661
|
-
if (
|
|
664
|
+
if (sidecarPath.startsWith('/forge-admin/agents')) {
|
|
665
|
+
const authCheck = applyRouteAuth(req, sidecarCtx, 2);
|
|
666
|
+
if (!authCheck.ok) {
|
|
667
|
+
if (authCheck.status === 401) res.setHeader('WWW-Authenticate', 'Bearer');
|
|
668
|
+
json(res, authCheck.status, { error: authCheck.error });
|
|
669
|
+
return;
|
|
670
|
+
}
|
|
662
671
|
return handleAgents(req, res, sidecarCtx);
|
|
663
672
|
}
|
|
664
|
-
if (
|
|
673
|
+
if (sidecarPath.startsWith('/forge-admin/config')) {
|
|
674
|
+
const authCheck = applyRouteAuth(req, sidecarCtx, 2);
|
|
675
|
+
if (!authCheck.ok) {
|
|
676
|
+
if (authCheck.status === 401) res.setHeader('WWW-Authenticate', 'Bearer');
|
|
677
|
+
json(res, authCheck.status, { error: authCheck.error });
|
|
678
|
+
return;
|
|
679
|
+
}
|
|
665
680
|
return handleAdminConfig(req, res, sidecarCtx);
|
|
666
681
|
}
|
|
667
682
|
}
|
package/lib/hitl-engine.d.ts
CHANGED
|
@@ -39,13 +39,13 @@ export class HitlEngine {
|
|
|
39
39
|
* Retrieve and consume the paused state for a resume token.
|
|
40
40
|
* Returns null if the token has expired or does not exist (does not throw).
|
|
41
41
|
*/
|
|
42
|
-
resume(resumeToken: string): Promise<
|
|
42
|
+
resume(resumeToken: string): Promise<object | null>;
|
|
43
43
|
|
|
44
44
|
/**
|
|
45
45
|
* Tear down any backend connections (Redis subscriber, Postgres pool, etc.).
|
|
46
|
-
* Call on graceful shutdown.
|
|
46
|
+
* Call on graceful shutdown. Synchronous.
|
|
47
47
|
*/
|
|
48
|
-
destroy():
|
|
48
|
+
destroy(): void;
|
|
49
49
|
}
|
|
50
50
|
|
|
51
51
|
/**
|
package/lib/index.js
CHANGED
|
@@ -8,8 +8,7 @@
|
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
10
|
import { readFileSync, existsSync, writeFileSync } from 'fs';
|
|
11
|
-
import { resolve
|
|
12
|
-
import { fileURLToPath } from 'url';
|
|
11
|
+
import { resolve } from 'path';
|
|
13
12
|
import { runTui } from './tui.js';
|
|
14
13
|
import { addEndpointManually } from './manual-entry.js';
|
|
15
14
|
import * as readline from 'readline';
|
|
@@ -18,7 +17,7 @@ const CONFIG_FILE = 'forge.config.json';
|
|
|
18
17
|
const PENDING_SPEC_FILE = 'forge-pending-tool.json';
|
|
19
18
|
|
|
20
19
|
function findProjectRoot() {
|
|
21
|
-
return
|
|
20
|
+
return process.cwd();
|
|
22
21
|
}
|
|
23
22
|
|
|
24
23
|
function loadConfig() {
|
package/lib/init.js
CHANGED
|
@@ -499,7 +499,7 @@ export async function runInit(opts = {}) {
|
|
|
499
499
|
const adminKeyValue = hasSidecar ? generateAdminKey() : null;
|
|
500
500
|
|
|
501
501
|
if (hasSidecar) {
|
|
502
|
-
raw.sidecar = {
|
|
502
|
+
raw.sidecar = { port: 8001 };
|
|
503
503
|
raw.adminKey = '${FORGE_ADMIN_KEY}';
|
|
504
504
|
raw.auth = { mode: authMode };
|
|
505
505
|
if (authMode === 'verify') {
|
package/lib/sidecar.d.ts
CHANGED
|
@@ -57,7 +57,7 @@ export interface SidecarRouterOptions {
|
|
|
57
57
|
export function buildSidecarContext(config: SidecarConfig, db: object, env?: Record<string, string>, opts?: { configPath?: string }): Promise<SidecarContext>;
|
|
58
58
|
export function createSidecarRouter(ctx: SidecarContext, opts?: SidecarRouterOptions): (req: object, res: object) => Promise<void>;
|
|
59
59
|
|
|
60
|
-
export { createAuth } from './auth.js';
|
|
60
|
+
export { createAuth, resolveSecret, authenticateAdmin } from './auth.js';
|
|
61
61
|
export type { AuthResult, AuthConfig, Authenticator } from './auth.js';
|
|
62
62
|
|
|
63
63
|
export { reactLoop } from './react-engine.js';
|
|
@@ -93,6 +93,8 @@ export class AgentRegistry {
|
|
|
93
93
|
export class VerifierRunner {
|
|
94
94
|
constructor(db: object, config?: object, pgPool?: object | null, workerPool?: object | null);
|
|
95
95
|
loadFromDb(db: object): Promise<void>;
|
|
96
|
-
|
|
96
|
+
registerVerifiers(toolName: string, verifiers: object[]): void;
|
|
97
|
+
verify(toolName: string, args: object, result: unknown): Promise<{ outcome: 'pass' | 'warn' | 'block'; message: string | null; verifierName: string | null }>;
|
|
98
|
+
logResult(sessionId: string, toolName: string, result: object): void;
|
|
97
99
|
destroy(): void;
|
|
98
100
|
}
|
package/lib/sidecar.js
CHANGED
|
@@ -155,7 +155,7 @@ export async function createSidecar(config = {}, options = {}) {
|
|
|
155
155
|
|
|
156
156
|
// Re-exports for advanced consumers
|
|
157
157
|
export { buildSidecarContext, createSidecarRouter } from './forge-service.js';
|
|
158
|
-
export { createAuth } from './auth.js';
|
|
158
|
+
export { createAuth, resolveSecret, authenticateAdmin } from './auth.js';
|
|
159
159
|
export { reactLoop } from './react-engine.js';
|
|
160
160
|
export { mergeDefaults, validateConfig, CONFIG_DEFAULTS } from './config-schema.js';
|
|
161
161
|
export { getDb } from './db.js';
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-tool-forge",
|
|
3
|
-
"version": "0.4.7",
|
|
3
|
+
"version": "0.4.10",
|
|
4
4
|
"description": "Production LLM agent sidecar + Claude Code skill library for building, testing, and running tool-calling agents.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"llm",
|
|
@@ -30,6 +30,7 @@
|
|
|
30
30
|
"lib",
|
|
31
31
|
"widget",
|
|
32
32
|
"config",
|
|
33
|
+
"skills",
|
|
33
34
|
"!lib/**/*.test.js",
|
|
34
35
|
"!lib/__fixtures__",
|
|
35
36
|
"!widget/**/*.test.js"
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# /forge-eval — Generate Eval Suites
|
|
2
|
+
|
|
3
|
+
Generate golden and labeled eval JSON files for a named tool. Run this skill after a tool is implemented and tests are green.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Step 1 — Identify the Tool
|
|
8
|
+
|
|
9
|
+
Ask the user which tool to generate evals for, or read it from context if `/forge-tool` just completed.
|
|
10
|
+
|
|
11
|
+
Read the tool's ToolDefinition from `tools/<name>.tool.js`:
|
|
12
|
+
- `name`, `description`, `schema`, `triggerPhrases`, `category`, `consequenceLevel`
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Step 2 — Generate Golden Eval Suite
|
|
17
|
+
|
|
18
|
+
Generate **5–10 golden cases** covering:
|
|
19
|
+
- Happy path with typical inputs
|
|
20
|
+
- Edge cases: empty results, boundary values, missing optional params
|
|
21
|
+
- Error paths: invalid input, service unavailable
|
|
22
|
+
|
|
23
|
+
Each golden case follows this schema:
|
|
24
|
+
```json
|
|
25
|
+
{
|
|
26
|
+
"id": "case-001",
|
|
27
|
+
"description": "What this case tests",
|
|
28
|
+
"input": { "message": "User's natural-language request" },
|
|
29
|
+
"expectedTool": "<tool_name>",
|
|
30
|
+
"expectedArgs": { "param": "value" },
|
|
31
|
+
"checks": [
|
|
32
|
+
{ "type": "tool_called", "tool": "<tool_name>" },
|
|
33
|
+
{ "type": "arg_equals", "arg": "param", "value": "value" }
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Write to `evals/<name>.golden.json` as a JSON array.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Step 3 — Generate Labeled Eval Suite
|
|
43
|
+
|
|
44
|
+
Generate **2–3 labeled (multi-tool) scenarios** where the agent must choose between 2+ tools or sequence multiple calls:
|
|
45
|
+
- Scenario where the tool is the correct choice over a similar tool
|
|
46
|
+
- Scenario where the tool is called followed by a second tool
|
|
47
|
+
- Scenario where the tool should NOT be called (wrong intent)
|
|
48
|
+
|
|
49
|
+
Each labeled case:
|
|
50
|
+
```json
|
|
51
|
+
{
|
|
52
|
+
"id": "labeled-001",
|
|
53
|
+
"description": "What this scenario tests",
|
|
54
|
+
"input": { "message": "User's multi-intent request" },
|
|
55
|
+
"label": "correct" | "incorrect" | "partial",
|
|
56
|
+
"expectedTools": ["<tool_name>"],
|
|
57
|
+
"checks": [...]
|
|
58
|
+
}
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Write to `evals/<name>.labeled.json` as a JSON array.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Step 4 — Validate
|
|
66
|
+
|
|
67
|
+
If the command is available, run `node lib/index.js run --eval evals/<name>.golden.json --dry-run` to validate the eval file against the JSON schema.
|
|
68
|
+
|
|
69
|
+
Print a summary: N golden cases, M labeled scenarios, file paths written.
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# Assertion Patterns — Deterministic Assertion Catalog
|
|
2
|
+
|
|
3
|
+
All eval assertions must be deterministic — identical results across runs given the same data. No LLM-as-judge.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## The Three Assertion Layers
|
|
8
|
+
|
|
9
|
+
### Layer 1: responseContains (Hard Proof)
|
|
10
|
+
|
|
11
|
+
ALL values must appear in the response (substring match, case-sensitive).
|
|
12
|
+
|
|
13
|
+
**Use for:** Exact values that prove the tool returned real data. The LLM cannot guess or hallucinate these.
|
|
14
|
+
|
|
15
|
+
```json
|
|
16
|
+
"responseContains": ["Paris", "72°F", "$30.05", "AAPL"]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
**Source values from:**
|
|
20
|
+
- Seed manifest: `{{seed:totals.dividends}}` → `"$30.05"`
|
|
21
|
+
- Live snapshot: `{{snapshot:performance.netWorth}}` → resolved at runtime
|
|
22
|
+
- Hardcoded: values not in either source (e.g., external ticker symbols)
|
|
23
|
+
|
|
24
|
+
**Rules:**
|
|
25
|
+
- Never hardcode values that depend on live data (prices, P&L, percentages)
|
|
26
|
+
- Use seed templates for stable values, snapshot templates for volatile ones
|
|
27
|
+
- Include at least 2 proof values per golden eval case
|
|
28
|
+
|
|
29
|
+
### Layer 2: responseContainsAny (Domain Precision)
|
|
30
|
+
|
|
31
|
+
At least one value from EACH inner array (synonym group) must appear.
|
|
32
|
+
|
|
33
|
+
**Use for:** Correct vocabulary with phrasing flexibility. The agent shouldn't sound robotic, but must use the right terms.
|
|
34
|
+
|
|
35
|
+
```json
|
|
36
|
+
"responseContainsAny": [
|
|
37
|
+
["temperature", "degrees", "°"],
|
|
38
|
+
["weather", "conditions", "forecast"]
|
|
39
|
+
]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
This means: the response must contain at least one temperature-related term AND at least one weather-related term.
|
|
43
|
+
|
|
44
|
+
**Rules:**
|
|
45
|
+
- Each group = one concept with acceptable synonyms
|
|
46
|
+
- Keep groups small (2-4 terms) — too many dilutes the check
|
|
47
|
+
- Include both formal and informal variants
|
|
48
|
+
|
|
49
|
+
### Layer 3: responseNotContains (Cop-Outs + Imprecision)
|
|
50
|
+
|
|
51
|
+
NONE of these values may appear in the response.
|
|
52
|
+
|
|
53
|
+
**Use for:**
|
|
54
|
+
- **Cop-outs:** `"I don't know"`, `"no information"`, `"unable to"`
|
|
55
|
+
- **JSON leaks:** `"fetchedAt"`, `"\"tool\":"`, `"undefined"`, `"null"`
|
|
56
|
+
- **Imprecision:** Wrong domain terms (e.g., "payment" when it should be "dividend")
|
|
57
|
+
- **Sensitive data:** `"API_KEY"`, `"OPENAI_API_KEY"`, `"Bearer "`
|
|
58
|
+
- **System prompt leaks:** `"AVAILABLE TOOLS"`, `"you are an AI"`
|
|
59
|
+
|
|
60
|
+
```json
|
|
61
|
+
"responseNotContains": [
|
|
62
|
+
"I don't know", "no information", "unable to",
|
|
63
|
+
"fetchedAt", "\"tool\":", "undefined"
|
|
64
|
+
]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Seed-Stable vs Market-Dynamic Values
|
|
70
|
+
|
|
71
|
+
Every assertion value falls into one of two categories:
|
|
72
|
+
|
|
73
|
+
### Seed-Stable Values
|
|
74
|
+
|
|
75
|
+
Derived from a seed data script and codified in a seed manifest file. These never change unless the seed script is re-authored.
|
|
76
|
+
|
|
77
|
+
**Template syntax:** `{{seed:path}}`
|
|
78
|
+
|
|
79
|
+
| Template | Example Resolution |
|
|
80
|
+
|----------|-------------------|
|
|
81
|
+
| `{{seed:holdings.equities[0]}}` | `"AAPL"` |
|
|
82
|
+
| `{{seed:quantities.AAPL.current}}` | `"7"` |
|
|
83
|
+
| `{{seed:totals.dividends}}` | `"$30.05"` |
|
|
84
|
+
| `{{seed:currency}}` | `"USD"` |
|
|
85
|
+
|
|
86
|
+
### Market-Dynamic Values
|
|
87
|
+
|
|
88
|
+
These values depend on live data (prices, P&L, allocations) and can change on every run.
|
|
89
|
+
|
|
90
|
+
**Template syntax:** `{{snapshot:path}}`
|
|
91
|
+
|
|
92
|
+
| Template | Example Resolution |
|
|
93
|
+
|----------|-------------------|
|
|
94
|
+
| `{{snapshot:holdings.AAPL.value}}` | `"$1,599.50"` |
|
|
95
|
+
| `{{snapshot:performance.netWorth}}` | `"$13,245.00"` |
|
|
96
|
+
| `{{snapshot:performance.netPnlPct}}` | `"8.03%"` |
|
|
97
|
+
|
|
98
|
+
### Resolution Rules
|
|
99
|
+
|
|
100
|
+
1. Seed templates resolve before snapshot templates (allows mixing)
|
|
101
|
+
2. If a path is missing, the individual assertion is skipped with a warning — not a hard failure
|
|
102
|
+
3. Resolution happens in-memory only — eval JSON on disk is never modified
|
|
103
|
+
|
|
104
|
+
### When to Use Each
|
|
105
|
+
|
|
106
|
+
| Value type | Source | Assertion style |
|
|
107
|
+
|-----------|--------|----------------|
|
|
108
|
+
| Fixed identifiers (names, IDs) | Seed manifest | `{{seed:...}}` |
|
|
109
|
+
| Fixed quantities (counts, amounts) | Seed manifest | `{{seed:...}}` |
|
|
110
|
+
| Domain terms | N/A | `responseContainsAny` synonym groups |
|
|
111
|
+
| External identifiers | N/A | Hardcoded |
|
|
112
|
+
| Current values (prices, P&L) | Live snapshot | `{{snapshot:...}}` |
|
|
113
|
+
|
|
114
|
+
**Rule of thumb:** If the value comes from your seed data, use `{{seed:*}}`. If it depends on live external data, use `{{snapshot:*}}`. If it's a constant, hardcode it.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Negative Assertions
|
|
119
|
+
|
|
120
|
+
Critical for catching regressions:
|
|
121
|
+
|
|
122
|
+
### Cop-out Detection
|
|
123
|
+
```json
|
|
124
|
+
"responseNotContains": ["I don't know", "no information", "unable to", "I cannot"]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### JSON Leak Detection
|
|
128
|
+
```json
|
|
129
|
+
"responseNotContains": ["fetchedAt", "\"tool\":", "\"error\":", "undefined", "null"]
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### System Prompt Leak Detection
|
|
133
|
+
```json
|
|
134
|
+
"responseNotContains": ["AVAILABLE TOOLS", "you are an AI", "system prompt"]
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Domain Imprecision
|
|
138
|
+
```json
|
|
139
|
+
"responseNotContains": ["payment received"]
|
|
140
|
+
```
|
|
141
|
+
(When the correct term is "dividend", catching the wrong term is a precision assertion.)
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Latency Assertions
|
|
146
|
+
|
|
147
|
+
```json
|
|
148
|
+
"maxLatencyMs": 30000
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
- Golden evals: 30s is typical (one LLM call + one tool call)
|
|
152
|
+
- Labeled straightforward: 30s (simple multi-tool)
|
|
153
|
+
- Labeled ambiguous: 30s (same)
|
|
154
|
+
- Edge/adversarial: 15s (should respond quickly without tool calls)
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Tool Routing Assertions
|
|
159
|
+
|
|
160
|
+
### Exact match (golden + straightforward labeled)
|
|
161
|
+
```json
|
|
162
|
+
"toolsCalled": ["get_weather", "get_forecast"]
|
|
163
|
+
```
|
|
164
|
+
Both tools must be called. No more, no fewer.
|
|
165
|
+
|
|
166
|
+
### Acceptable sets (ambiguous labeled)
|
|
167
|
+
```json
|
|
168
|
+
"toolsAcceptable": [
|
|
169
|
+
["get_weather"],
|
|
170
|
+
["get_weather", "get_forecast"]
|
|
171
|
+
]
|
|
172
|
+
```
|
|
173
|
+
Either set is valid. The agent's judgment decides depth.
|
|
174
|
+
|
|
175
|
+
### Negative routing (edge labeled)
|
|
176
|
+
```json
|
|
177
|
+
"toolsNotCalled": ["delete_account"]
|
|
178
|
+
```
|
|
179
|
+
This tool must NOT be called (e.g., on an injection attempt).
|
|
180
|
+
|
|
181
|
+
### No tools needed (edge labeled)
|
|
182
|
+
```json
|
|
183
|
+
"toolsAcceptable": [["__none__"]]
|
|
184
|
+
```
|
|
185
|
+
The agent should answer from general knowledge without calling any tools.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Parameter Assertions
|
|
190
|
+
|
|
191
|
+
Checks that the model passed correct arguments to the tool. This catches a failure class that routing assertions miss entirely: the model calls the right tool but passes wrong, missing, or hallucinated parameters.
|
|
192
|
+
|
|
193
|
+
### Why This Matters
|
|
194
|
+
|
|
195
|
+
BFCL (the Berkeley Function-Calling Leaderboard) and Google ADK (Agent Development Kit) both test parameter-level accuracy. Without it, your evals only prove the model picked the right tool — not that it used it correctly. A model that calls `get_weather` with `city: "the weather"` instead of `city: "Paris"` still passes every routing assertion, and can even pass every response assertion if the tool errors gracefully and the model recovers. The parameter assertion catches it.
|
|
196
|
+
|
|
197
|
+
### Assertion Types
|
|
198
|
+
|
|
199
|
+
```json
|
|
200
|
+
"toolParams": [
|
|
201
|
+
{ "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Paris" },
|
|
202
|
+
{ "tool": "get_weather", "paramName": "units", "assertion": "oneOf", "value": ["metric", "imperial"] }
|
|
203
|
+
]
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
| Assertion | Use When | Example |
|
|
207
|
+
|-----------|----------|---------|
|
|
208
|
+
| `equals` | Exact value known | `city` = `"Paris"` |
|
|
209
|
+
| `contains` | Model may normalize | `city` contains `"Tokyo"` (could be `"Tokyo"` or `"Tokyo, JP"`) |
|
|
210
|
+
| `oneOf` | Multiple valid values | `units` is `"metric"` or `"imperial"` |
|
|
211
|
+
| `exists` | Parameter must be provided | `city` was sent (any value) |
|
|
212
|
+
| `notExists` | Catch hallucinated params | `country_code` should not be sent if schema doesn't define it |
|
|
213
|
+
| `matches` | Format validation | `date` matches `^\d{4}-\d{2}-\d{2}$` |
|
|
214
|
+
|
|
215
|
+
### Golden Eval Example
|
|
216
|
+
|
|
217
|
+
```json
|
|
218
|
+
{
|
|
219
|
+
"id": "gs-get-weather-001",
|
|
220
|
+
"description": "trigger phrase — direct weather question",
|
|
221
|
+
"input": { "message": "What's the weather in Paris?" },
|
|
222
|
+
"expect": {
|
|
223
|
+
"toolsCalled": ["get_weather"],
|
|
224
|
+
"toolParams": [
|
|
225
|
+
{ "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Paris" }
|
|
226
|
+
],
|
|
227
|
+
"noToolErrors": true,
|
|
228
|
+
"responseNonEmpty": true,
|
|
229
|
+
"responseContains": ["Paris"],
|
|
230
|
+
"responseContainsAny": [["temperature", "degrees", "°"]],
|
|
231
|
+
"maxLatencyMs": 30000
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### Multi-Tool Labeled Example
|
|
237
|
+
|
|
238
|
+
```json
|
|
239
|
+
{
|
|
240
|
+
"id": "ls-get-weather-001",
|
|
241
|
+
"description": "straightforward — weather + forecast synthesis",
|
|
242
|
+
"difficulty": "straightforward",
|
|
243
|
+
"input": { "message": "What's the weather in Tokyo today and what should I expect this week?" },
|
|
244
|
+
"expect": {
|
|
245
|
+
"toolsCalled": ["get_weather", "get_forecast"],
|
|
246
|
+
"toolParams": [
|
|
247
|
+
{ "tool": "get_weather", "paramName": "city", "assertion": "contains", "value": "Tokyo" },
|
|
248
|
+
{ "tool": "get_forecast", "paramName": "city", "assertion": "contains", "value": "Tokyo" }
|
|
249
|
+
],
|
|
250
|
+
"noToolErrors": true,
|
|
251
|
+
"responseNonEmpty": true,
|
|
252
|
+
"responseContains": ["Tokyo"],
|
|
253
|
+
"maxLatencyMs": 30000
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### Rules
|
|
259
|
+
|
|
260
|
+
- Use `contains` over `equals` by default — models normalize inputs in unpredictable ways
|
|
261
|
+
- Use `oneOf` for enum fields where defaults may vary by context
|
|
262
|
+
- Use `exists` sparingly — it's weak (any value passes). Prefer `contains` or `equals`.
|
|
263
|
+
- Parameter assertions are SKIPPED if the tool wasn't called (routing already failed)
|
|
264
|
+
- When `toolsAcceptable` is used, only assert params for tools that were actually called
|
|
265
|
+
- Use `{{seed:*}}` in values for data-dependent parameters: `{ "assertion": "equals", "value": "{{seed:holdings.equities[0]}}" }`
|