agent-tool-forge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/lib/agent-registry.js +170 -0
  4. package/lib/api-client.js +792 -0
  5. package/lib/api-loader.js +260 -0
  6. package/lib/auth.d.ts +25 -0
  7. package/lib/auth.js +158 -0
  8. package/lib/checks/check-adapter.js +172 -0
  9. package/lib/checks/compose.js +42 -0
  10. package/lib/checks/content-match.js +14 -0
  11. package/lib/checks/cost-budget.js +11 -0
  12. package/lib/checks/index.js +18 -0
  13. package/lib/checks/json-valid.js +15 -0
  14. package/lib/checks/latency.js +11 -0
  15. package/lib/checks/length-bounds.js +17 -0
  16. package/lib/checks/negative-match.js +14 -0
  17. package/lib/checks/no-hallucinated-numbers.js +63 -0
  18. package/lib/checks/non-empty.js +34 -0
  19. package/lib/checks/regex-match.js +12 -0
  20. package/lib/checks/run-checks.js +84 -0
  21. package/lib/checks/schema-match.js +26 -0
  22. package/lib/checks/tool-call-count.js +16 -0
  23. package/lib/checks/tool-selection.js +34 -0
  24. package/lib/checks/types.js +45 -0
  25. package/lib/comparison/compare.js +86 -0
  26. package/lib/comparison/format.js +104 -0
  27. package/lib/comparison/index.js +6 -0
  28. package/lib/comparison/statistics.js +59 -0
  29. package/lib/comparison/types.js +41 -0
  30. package/lib/config-schema.js +200 -0
  31. package/lib/config.d.ts +66 -0
  32. package/lib/conversation-store.d.ts +77 -0
  33. package/lib/conversation-store.js +443 -0
  34. package/lib/db.d.ts +6 -0
  35. package/lib/db.js +1112 -0
  36. package/lib/dep-check.js +99 -0
  37. package/lib/drift-background.js +61 -0
  38. package/lib/drift-monitor.js +187 -0
  39. package/lib/eval-runner.js +566 -0
  40. package/lib/fixtures/fixture-store.js +161 -0
  41. package/lib/fixtures/index.js +11 -0
  42. package/lib/forge-engine.js +982 -0
  43. package/lib/forge-eval-generator.js +417 -0
  44. package/lib/forge-file-writer.js +386 -0
  45. package/lib/forge-service-client.js +190 -0
  46. package/lib/forge-service.d.ts +4 -0
  47. package/lib/forge-service.js +655 -0
  48. package/lib/forge-verifier-generator.js +271 -0
  49. package/lib/handlers/admin.js +151 -0
  50. package/lib/handlers/agents.js +229 -0
  51. package/lib/handlers/chat-resume.js +334 -0
  52. package/lib/handlers/chat-sync.js +320 -0
  53. package/lib/handlers/chat.js +320 -0
  54. package/lib/handlers/conversations.js +92 -0
  55. package/lib/handlers/preferences.js +88 -0
  56. package/lib/handlers/tools-list.js +58 -0
  57. package/lib/hitl-engine.d.ts +60 -0
  58. package/lib/hitl-engine.js +261 -0
  59. package/lib/http-utils.js +92 -0
  60. package/lib/index.d.ts +20 -0
  61. package/lib/index.js +141 -0
  62. package/lib/init.js +636 -0
  63. package/lib/manual-entry.js +59 -0
  64. package/lib/mcp-server.js +252 -0
  65. package/lib/output-groups.js +54 -0
  66. package/lib/postgres-store.d.ts +31 -0
  67. package/lib/postgres-store.js +465 -0
  68. package/lib/preference-store.d.ts +47 -0
  69. package/lib/preference-store.js +79 -0
  70. package/lib/prompt-store.d.ts +42 -0
  71. package/lib/prompt-store.js +60 -0
  72. package/lib/rate-limiter.d.ts +30 -0
  73. package/lib/rate-limiter.js +104 -0
  74. package/lib/react-engine.d.ts +110 -0
  75. package/lib/react-engine.js +337 -0
  76. package/lib/runner/cli.js +156 -0
  77. package/lib/runner/cost-estimator.js +71 -0
  78. package/lib/runner/gate.js +46 -0
  79. package/lib/runner/index.js +165 -0
  80. package/lib/sidecar.d.ts +83 -0
  81. package/lib/sidecar.js +161 -0
  82. package/lib/sse.d.ts +15 -0
  83. package/lib/sse.js +30 -0
  84. package/lib/tools-scanner.js +91 -0
  85. package/lib/tui.js +253 -0
  86. package/lib/verifier-report.js +78 -0
  87. package/lib/verifier-runner.js +338 -0
  88. package/lib/verifier-scanner.js +70 -0
  89. package/lib/verifier-worker-pool.js +196 -0
  90. package/lib/views/chat.js +340 -0
  91. package/lib/views/endpoints.js +203 -0
  92. package/lib/views/eval-run.js +206 -0
  93. package/lib/views/forge-agent.js +538 -0
  94. package/lib/views/forge.js +410 -0
  95. package/lib/views/main-menu.js +275 -0
  96. package/lib/views/mediation.js +381 -0
  97. package/lib/views/model-compare.js +430 -0
  98. package/lib/views/model-comparison.js +333 -0
  99. package/lib/views/onboarding.js +470 -0
  100. package/lib/views/performance.js +237 -0
  101. package/lib/views/run-evals.js +205 -0
  102. package/lib/views/settings.js +829 -0
  103. package/lib/views/tools-evals.js +514 -0
  104. package/lib/views/verifier-coverage.js +617 -0
  105. package/lib/workers/verifier-worker.js +52 -0
  106. package/package.json +123 -0
  107. package/widget/forge-chat.js +789 -0
@@ -0,0 +1,260 @@
1
+ /**
2
+ * API Loader — Fetches endpoints from OpenAPI (URL or file) and manifest.
3
+ * Merges and dedupes by path+method.
4
+ */
5
+
6
+ import { readFileSync, existsSync } from 'fs';
7
+ import { resolve } from 'path';
8
+
9
+ /**
10
+ * @typedef {Object} ApiEndpoint
11
+ * @property {string} path - API path (e.g. /api/v1/holdings)
12
+ * @property {string} method - HTTP method (GET, POST, etc.)
13
+ * @property {string} [name] - Suggested tool name (snake_case)
14
+ * @property {string} [description] - Suggested tool description
15
+ * @property {Record<string,unknown>} [params] - Parameter schema
16
+ * @property {boolean} [requiresConfirmation] - HITL gate for write ops
17
+ * @property {string} [source] - 'openapi' | 'manifest'
18
+ */
19
+
20
/**
 * Derive a snake_case tool name from an endpoint path and HTTP method.
 *
 * The result is `<verb>_<last path segment>`: GET maps to `get`, POST maps to
 * `create`, and every other method is used lowercased. A `{param}` placeholder
 * in the last segment becomes `by_id`; a path with no segments falls back to
 * `resource`. Any character outside `[a-z0-9_]` is replaced with `_`.
 *
 * @param {string} path - API path (e.g. /api/v1/holdings)
 * @param {string} method - HTTP method, in any letter case
 * @returns {string}
 */
function deriveName(path, method) {
  const parts = path
    .replace(/^\/+/, '')
    .replace(/\/$/, '')
    .split('/')
    .filter(Boolean);
  const last = parts[parts.length - 1] || 'resource';
  const base = last.replace(/\{[^}]+\}/g, 'by_id');
  // Normalize the method first so lowercase input (e.g. "post" taken verbatim
  // from a manifest) maps to the same verb as its uppercase form.
  const upperMethod = String(method).toUpperCase();
  let verb;
  if (upperMethod === 'GET') {
    verb = 'get';
  } else if (upperMethod === 'POST') {
    verb = 'create';
  } else {
    verb = upperMethod.toLowerCase();
  }
  return `${verb}_${base}`.replace(/-/g, '_').replace(/[^a-z0-9_]/gi, '_').toLowerCase();
}
37
+
38
/**
 * Convert the `paths` section of a parsed OpenAPI 3.x document into a flat
 * list of ApiEndpoint records, one per (path, HTTP method) operation.
 *
 * Only get/post/put/patch/delete operations are considered. If the first
 * `servers` entry is a relative base (starts with `/`), it is prefixed to every
 * path; absolute server URLs are ignored. Operation-level parameters take
 * precedence over path-level parameters with the same name.
 *
 * @param {object} spec - Parsed OpenAPI JSON
 * @returns {ApiEndpoint[]}
 */
function parseOpenApiPaths(spec) {
  const HTTP_METHODS = ['get', 'post', 'put', 'patch', 'delete'];
  const WRITE_METHODS = ['post', 'put', 'patch', 'delete'];

  const serverUrl = spec.servers?.[0]?.url?.replace(/\/$/, '') || '';
  // Only use relative-path server bases (e.g. /api/v1); ignore full URLs
  const basePath = serverUrl.startsWith('/') ? serverUrl : '';

  const describeParam = (param) => ({
    type: param.schema?.type || 'string',
    description: param.description
  });

  const collected = [];
  Object.entries(spec.paths || {}).forEach(([rawPath, pathItem]) => {
    if (pathItem === null || typeof pathItem !== 'object') return;

    const relativePath = rawPath.startsWith('/') ? rawPath : `/${rawPath}`;
    const fullPath = `${basePath}${relativePath}`;

    for (const verb of HTTP_METHODS) {
      const operation = pathItem[verb];
      if (!operation) continue;

      const upperVerb = verb.toUpperCase();
      const name = deriveName(relativePath, upperVerb);

      const params = {};
      for (const param of operation.parameters || []) {
        if (param?.name) params[param.name] = describeParam(param);
      }
      // Path-level parameters only fill in names the operation did not define.
      for (const param of pathItem.parameters || []) {
        if (param?.name && !params[param.name]) params[param.name] = describeParam(param);
      }

      collected.push({
        path: fullPath,
        method: upperVerb,
        name,
        description: operation.summary || operation.description || `${upperVerb} ${fullPath}`,
        params: Object.keys(params).length ? params : undefined,
        requiresConfirmation: WRITE_METHODS.includes(verb),
        source: 'openapi'
      });
    }
  });
  return collected;
}
91
+
92
/**
 * Fetch an OpenAPI document over HTTP and convert it to endpoints.
 *
 * The request is aborted after 10 seconds. A non-2xx response is reported as
 * an Error that includes the status code and the URL.
 *
 * @param {string} url - Location of the OpenAPI JSON document
 * @param {Record<string,string>} [headers] - Extra request headers (e.g. auth)
 * @returns {Promise<ApiEndpoint[]>}
 */
async function loadFromOpenApiUrl(url, headers = {}) {
  const requestOptions = {
    signal: AbortSignal.timeout(10000),
    headers,
  };
  const response = await fetch(url, requestOptions);
  if (!response.ok) {
    throw new Error(`OpenAPI fetch failed: ${response.status} ${url}`);
  }
  const document = await response.json();
  return parseOpenApiPaths(document);
}
106
+
107
+ /**
108
+ * Load OpenAPI spec from file.
109
+ * @param {string} filePath
110
+ * @returns {ApiEndpoint[]}
111
+ */
112
+ function loadFromOpenApiFile(filePath) {
113
+ const abs = resolve(process.cwd(), filePath);
114
+ if (!existsSync(abs)) return [];
115
+ const raw = readFileSync(abs, 'utf-8');
116
+ let spec;
117
+ try { spec = JSON.parse(raw); } catch (err) {
118
+ throw new Error(`Failed to parse OpenAPI file ${filePath}: ${err.message}`);
119
+ }
120
+ return parseOpenApiPaths(spec);
121
+ }
122
+
123
/**
 * Load endpoints from a manifest JSON file.
 *
 * The path is resolved against the current working directory. A missing file,
 * or a manifest without an `endpoints` array, yields an empty list. Each entry
 * is normalized: the method defaults to GET and is upper-cased, and a missing
 * name or description is derived from that normalized method and the path.
 *
 * @param {string} manifestPath
 * @returns {ApiEndpoint[]}
 * @throws {Error} When the file exists but is not valid JSON.
 */
function loadFromManifest(manifestPath) {
  const abs = resolve(process.cwd(), manifestPath);
  if (!existsSync(abs)) return [];
  const raw = readFileSync(abs, 'utf-8');
  let manifest;
  try {
    manifest = JSON.parse(raw);
  } catch (err) {
    throw new Error(`Failed to parse manifest ${manifestPath}: ${err.message}`);
  }
  // Tolerate `null` manifests and non-array `endpoints` instead of crashing.
  const entries = Array.isArray(manifest?.endpoints) ? manifest.endpoints : [];
  return entries.map((e) => {
    // Normalize once so the method field, the derived name and the default
    // description all agree (previously the latter two used the raw casing).
    const method = (e.method || 'GET').toUpperCase();
    return {
      path: e.path,
      method,
      name: e.name || deriveName(e.path, method),
      description: e.description || `${method} ${e.path}`,
      params: e.params,
      requiresConfirmation: e.requiresConfirmation ?? false,
      source: 'manifest'
    };
  });
}
147
+
148
/**
 * Combine several endpoint lists into one, keeping a single entry per
 * `METHOD:path` key. When a key occurs more than once, the last occurrence
 * wins (so a manifest passed after OpenAPI overrides it) while the position
 * of the first occurrence is kept.
 *
 * @param {ApiEndpoint[][]} arrays
 * @returns {ApiEndpoint[]}
 */
function mergeEndpoints(...arrays) {
  const keyedEntries = arrays
    .flat()
    .map((endpoint) => [`${endpoint.method}:${endpoint.path}`, endpoint]);
  return [...new Map(keyedEntries).values()];
}
163
+
164
/**
 * Parse a JSON string without throwing.
 *
 * @param {string} str - Text expected to contain JSON
 * @returns {object|null} The parsed value, or null when parsing fails
 */
function safeParseJson(str) {
  let parsed = null;
  try {
    parsed = JSON.parse(str);
  } catch {
    // Deliberate best-effort: invalid JSON is reported to the caller as null.
    parsed = null;
  }
  return parsed;
}
172
+
173
/**
 * Put a path into canonical form for comparison: guarantee a leading slash,
 * drop one trailing slash, and lowercase the result. Falsy input yields ''.
 *
 * @param {string} p
 * @returns {string}
 */
function normalizePath(p) {
  if (!p) {
    return '';
  }
  const rooted = p.startsWith('/') ? p : `/${p}`;
  const trimmed = rooted.endsWith('/') ? rooted.slice(0, -1) : rooted;
  return trimmed.toLowerCase();
}
183
+
184
/**
 * Split the endpoints of an OpenAPI spec into those that already have a
 * promoted tool and those that do not.
 *
 * An endpoint counts as covered when some row in `tool_registry` with
 * lifecycle_state 'promoted' has a spec_json whose mcpRouting.endpoint and
 * mcpRouting.method match it. Paths are compared after normalizePath() and
 * methods after upper-casing.
 *
 * @param {object|null} spec - Parsed OpenAPI spec object
 * @param {import('better-sqlite3').Database} db
 * @returns {{ covered: ApiEndpoint[]; uncovered: ApiEndpoint[]; total: number }}
 */
export function computeCoverage(spec, db) {
  if (!spec?.paths) {
    return { covered: [], uncovered: [], total: 0 };
  }

  const endpoints = parseOpenApiPaths(spec);

  const promotedRows = db
    .prepare(`SELECT spec_json FROM tool_registry WHERE lifecycle_state = 'promoted'`)
    .all();

  // Collect "METHOD:normalizedPath" keys for every promoted tool with routing info
  const coveredKeys = new Set();
  promotedRows.forEach((row) => {
    const routing = safeParseJson(row.spec_json)?.mcpRouting;
    if (!routing?.endpoint) return;
    const routedPath = normalizePath(routing.endpoint);
    const routedMethod = (routing.method || '').toUpperCase();
    if (routedPath && routedMethod) {
      coveredKeys.add(`${routedMethod}:${routedPath}`);
    }
  });

  const covered = [];
  const uncovered = [];
  for (const endpoint of endpoints) {
    const lookupKey = `${endpoint.method.toUpperCase()}:${normalizePath(endpoint.path)}`;
    const bucket = coveredKeys.has(lookupKey) ? covered : uncovered;
    bucket.push(endpoint);
  }

  return { covered, uncovered, total: endpoints.length };
}
228
+
229
/**
 * Gather every API endpoint described by the config's api section.
 *
 * Sources are read in this order: OpenAPI URL, OpenAPI file, manifest. Later
 * sources override earlier ones for the same method+path. A failing OpenAPI
 * URL is logged to stderr and skipped; file and manifest errors propagate.
 *
 * @param {object} config - forge.config.json api section
 * @returns {Promise<ApiEndpoint[]>}
 */
export async function loadApis(config) {
  const { discovery, manifestPath } = config ?? {};
  const collected = [];

  const isOpenApi = discovery?.type === 'openapi';

  if (isOpenApi && discovery.url) {
    try {
      collected.push(...(await loadFromOpenApiUrl(discovery.url, discovery.headers)));
    } catch (err) {
      console.error(`OpenAPI URL failed: ${err.message}`);
    }
  }

  if (isOpenApi && discovery.file) {
    collected.push(...loadFromOpenApiFile(discovery.file));
  }

  if (manifestPath) {
    collected.push(...loadFromManifest(manifestPath));
  }

  return mergeEndpoints(collected);
}
package/lib/auth.d.ts ADDED
@@ -0,0 +1,25 @@
1
/** Outcome of authenticating a request with a JWT. */
export interface AuthResult {
  /** True only when a token was found, accepted, and the user-id claim was present. */
  authenticated: boolean;
  /** The user-id claim converted to a string, or null when authentication failed. */
  userId: string | null;
  /** Decoded JWT payload, or null when the token was missing or rejected before decoding. */
  claims: Record<string, unknown> | null;
  /** Human-readable failure reason, or null on success. */
  error: string | null;
}

/** Options accepted by createAuth(). */
export interface AuthConfig {
  /** 'trust' decodes the payload without checking the signature; 'verify' checks it. Defaults to 'trust'. */
  mode?: 'trust' | 'verify';
  /** Key used to check the token signature in 'verify' mode. */
  signingKey?: string;
  /** Dot-separated path of the claim holding the user id. Defaults to 'sub'. */
  claimsPath?: string;
}

/** Object returned by createAuth(). */
export interface Authenticator {
  /** Authenticate a request using its Authorization header or `?token=` query parameter. */
  authenticate(req: object): AuthResult;
}

/** Create an authenticator from config. */
export function createAuth(authConfig?: AuthConfig): Authenticator;

/** Outcome of authenticateAdmin(). */
export interface AdminAuthResult {
  /** True when the Bearer token equals the admin key. */
  authenticated: boolean;
  /** Human-readable failure reason, or null on success. */
  error: string | null;
}

/** Compare the request's Bearer token with the configured admin key. */
export function authenticateAdmin(req: object, adminKey: string): AdminAuthResult;
package/lib/auth.js ADDED
@@ -0,0 +1,158 @@
1
+ /**
2
+ * Auth module — configurable JWT authentication for the forge sidecar.
3
+ *
4
+ * Two modes:
5
+ * trust — decode JWT payload without verifying signature (fast, for local dev / behind reverse proxy)
6
+ * verify — verify HMAC-SHA256 (HS256) or RSA-SHA256 (RS256) signature via Node.js built-in crypto
7
+ *
8
+ * No external JWT library required.
9
+ */
10
+
11
+ import { createHmac, createVerify, timingSafeEqual } from 'crypto';
12
+
13
+ /**
14
+ * @typedef {{ authenticated: boolean, userId: string|null, claims: object|null, error: string|null }} AuthResult
15
+ */
16
+
17
/**
 * Create an authenticator from config.
 *
 * The returned `authenticate(req)` looks for a JWT in the `Authorization:
 * Bearer …` header first and in the `?token=` query parameter second. In
 * 'verify' mode the signature is checked (HS256 or RS256) and the `exp` / `nbf`
 * claims are enforced; in 'trust' mode the payload is only decoded. The user
 * id is read from the claim at `claimsPath`.
 *
 * `authenticate` never throws for bad tokens; every failure is reported through
 * the `error` field of the result.
 *
 * @param {{ mode: 'verify'|'trust', signingKey?: string, claimsPath?: string }} authConfig
 * @returns {{ authenticate(req): AuthResult }}
 */
export function createAuth(authConfig = {}) {
  const mode = authConfig.mode ?? 'trust';
  const signingKey = authConfig.signingKey ?? null;
  const claimsPath = authConfig.claimsPath ?? 'sub';

  // Build a failed AuthResult with the given reason.
  const reject = (error, claims = null) => ({ authenticated: false, userId: null, claims, error });

  // A PEM-encoded key is an asymmetric key. It must never be accepted as an
  // HMAC secret: a public key is not secret, so anyone could forge HS256 tokens
  // with it (the JWT "algorithm confusion" attack).
  const keyIsPem =
    (typeof signingKey === 'string' || Buffer.isBuffer(signingKey)) &&
    signingKey.includes('-----BEGIN ');

  return {
    authenticate(req) {
      let token = null;

      // Priority 1: Authorization header
      const authHeader = req.headers?.authorization;
      if (authHeader?.startsWith('Bearer ')) {
        token = authHeader.slice(7);
      }

      // Priority 2: ?token= query param (for EventSource which can't set headers)
      if (!token && req.url) {
        try {
          const url = new URL(req.url, 'http://localhost');
          token = url.searchParams.get('token');
        } catch { /* malformed URL — treated as "no token" below */ }
      }

      if (!token) {
        return reject('Missing token');
      }

      const parts = token.split('.');
      if (parts.length !== 3) {
        return reject('Malformed JWT');
      }

      // Verify signature if in verify mode
      if (mode === 'verify') {
        if (!signingKey) {
          return reject('No signing key configured');
        }

        let header;
        try {
          header = JSON.parse(base64UrlDecode(parts[0]));
        } catch {
          return reject('Invalid JWT header');
        }
        // JSON such as `null` or `7` parses fine but is not a JOSE header.
        if (header === null || typeof header !== 'object') {
          return reject('Invalid JWT header');
        }

        const sigInput = `${parts[0]}.${parts[1]}`;
        const signature = parts[2];
        const alg = header.alg ?? 'HS256';

        if (alg === 'HS256') {
          if (keyIsPem) {
            return reject('Algorithm HS256 is not allowed with an asymmetric signing key');
          }
          const expected = base64UrlEncode(
            createHmac('sha256', signingKey).update(sigInput).digest()
          );
          const expectedBuf = Buffer.from(expected);
          const signatureBuf = Buffer.from(signature);
          // timingSafeEqual requires equal lengths, so check that first.
          if (expectedBuf.length !== signatureBuf.length || !timingSafeEqual(expectedBuf, signatureBuf)) {
            return reject('Invalid signature');
          }
        } else if (alg === 'RS256') {
          let signatureOk = false;
          try {
            const verifier = createVerify('RSA-SHA256');
            verifier.update(sigInput);
            signatureOk = verifier.verify(signingKey, base64UrlToBuffer(signature));
          } catch {
            // verify() throws when the configured key cannot be parsed as an RSA
            // key; report that as a failed verification instead of crashing.
            signatureOk = false;
          }
          if (!signatureOk) {
            return reject('Invalid signature');
          }
        } else {
          return reject(`Unsupported algorithm: ${alg}`);
        }
      }

      // Decode payload
      let claims;
      try {
        claims = JSON.parse(base64UrlDecode(parts[1]));
      } catch {
        return reject('Invalid JWT payload');
      }

      // Enforce the registered time claims once the signature is trusted.
      if (mode === 'verify') {
        const nowSeconds = Math.floor(Date.now() / 1000);
        if (typeof claims?.exp === 'number' && nowSeconds >= claims.exp) {
          return reject('Token expired');
        }
        if (typeof claims?.nbf === 'number' && nowSeconds < claims.nbf) {
          return reject('Token not yet valid');
        }
      }

      const userId = extractClaim(claims, claimsPath);
      if (!userId) {
        return reject(`Claim "${claimsPath}" not found in token`, claims);
      }

      return { authenticated: true, userId: String(userId), claims, error: null };
    }
  };
}
109
+
110
/**
 * Admin auth — checks that the request's Bearer token equals the admin key.
 * The comparison is constant-time for equal-length inputs.
 *
 * @param {import('http').IncomingMessage} req
 * @param {string} adminKey
 * @returns {{ authenticated: boolean, error: string|null }}
 */
export function authenticateAdmin(req, adminKey) {
  const deny = (error) => ({ authenticated: false, error });

  if (!adminKey) {
    return deny('No admin key configured');
  }

  const authorization = req.headers?.authorization;
  const hasBearer = Boolean(authorization) && authorization.startsWith('Bearer ');
  if (!hasBearer) {
    return deny('Missing or invalid Authorization header');
  }

  const presented = Buffer.from(authorization.slice(7));
  const expected = Buffer.from(adminKey);
  // timingSafeEqual throws on unequal lengths, so the length test must come first.
  const matches = presented.length === expected.length && timingSafeEqual(presented, expected);

  return matches ? { authenticated: true, error: null } : deny('Invalid admin key');
}
132
+
133
+ // ── Helpers ──────────────────────────────────────────────────────────────────
134
+
135
/**
 * Decode a base64url string (the `-` / `_` alphabet used by JWTs) to UTF-8 text.
 * @param {string} str
 * @returns {string}
 */
function base64UrlDecode(str) {
  const standardAlphabet = str.split('-').join('+').split('_').join('/');
  const bytes = Buffer.from(standardAlphabet, 'base64');
  return bytes.toString('utf-8');
}
139
+
140
/**
 * Encode bytes as base64url: standard base64 with `+` → `-`, `/` → `_`,
 * and trailing `=` padding removed.
 * @param {Buffer|Uint8Array} buf
 * @returns {string}
 */
function base64UrlEncode(buf) {
  const standard = Buffer.from(buf).toString('base64');
  const urlSafe = standard.replace(/[+/]/g, (ch) => (ch === '+' ? '-' : '_'));
  return urlSafe.replace(/=+$/, '');
}
144
+
145
/**
 * Decode a base64url string to raw bytes.
 * @param {string} str
 * @returns {Buffer}
 */
function base64UrlToBuffer(str) {
  const standardAlphabet = str.split('-').join('+').split('_').join('/');
  return Buffer.from(standardAlphabet, 'base64');
}
149
+
150
/**
 * Look up a value inside a claims object by a dot-separated path.
 * @param {object} claims - Decoded JWT payload
 * @param {string} path - e.g. "sub" or "user.id"
 * @returns {*} The value found, or null when any step of the path is missing
 */
function extractClaim(claims, path) {
  const keys = path.split('.');
  let current = claims;
  for (let i = 0; i < keys.length; i += 1) {
    const canDescend = current != null && typeof current === 'object';
    if (!canDescend) {
      return null;
    }
    current = current[keys[i]];
  }
  return current === undefined || current === null ? null : current;
}
@@ -0,0 +1,172 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Translate an eval case's `expect` block, together with what actually
 * happened during the run, into the RunChecksInput consumed by runChecks().
 *
 * Only checks that the eval case asks for are populated; `responseText` is
 * always set. `responseContainsAny` and `toolsAcceptable` are not mapped here —
 * they are evaluated by checkResponseContainsAnyGroups() and
 * checkToolsAcceptable().
 *
 * @param {Object} evalCase - the eval case object from the eval JSON file
 * @param {Object} runMeta - runtime data from executing the case
 * @param {string[]} runMeta.toolsCalled - actual tools called during execution
 * @param {string} runMeta.responseText - the model's response text
 * @param {number} [runMeta.latencyMs] - round-trip latency in ms
 * @param {number} [runMeta.cost] - actual cost in USD
 * @returns {import('./types.js').RunChecksInput}
 */
export function checkAdapter(evalCase, runMeta) {
  const expectations = evalCase.expect ?? {};
  const { toolsCalled = [], responseText = '', latencyMs, cost } = runMeta;

  const wants = (key) => expectations[key] !== undefined;
  const asList = (value) => (Array.isArray(value) ? value : [value]);

  // Response text is needed for most content checks, so it is always present.
  /** @type {import('./types.js').RunChecksInput} */
  const input = { responseText };

  // Strict tool selection: expected vs. actual tool names.
  if (wants('toolsCalled')) {
    input.expectedTools = asList(expectations.toolsCalled);
    input.actualTools = toolsCalled;
  }

  // Every listed substring must appear / must not appear.
  if (wants('responseContains')) {
    input.mustContain = asList(expectations.responseContains);
  }
  if (wants('responseNotContains')) {
    input.mustNotContain = asList(expectations.responseNotContains);
  }

  // nonEmpty is a text-only flag. It is requested either by expect.responseNonEmpty
  // or by evalCase.requiresPreamble (which lives on the case, not inside expect).
  if (expectations.responseNonEmpty || evalCase.requiresPreamble === true) {
    input.nonEmpty = true;
  }

  // Latency and cost are only checked when both the measurement and the limit exist.
  if (latencyMs !== undefined && wants('maxLatencyMs')) {
    input.latencyMs = latencyMs;
    input.maxLatencyMs = expectations.maxLatencyMs;
  }
  if (cost !== undefined && wants('maxCost')) {
    input.actualCost = cost;
    input.maxCost = expectations.maxCost;
  }

  // Tool call count bounds.
  if (wants('minToolCalls') || wants('maxToolCalls')) {
    input.actualToolCallCount = toolsCalled.length;
    for (const bound of ['minToolCalls', 'maxToolCalls']) {
      if (wants(bound)) input[bound] = expectations[bound];
    }
  }

  if (expectations.jsonValid) {
    input.jsonValid = true;
  }

  // Schema validation inputs travel together.
  if (wants('schemaData')) {
    input.schemaData = expectations.schemaData;
    input.requiredKeys = expectations.requiredKeys ?? [];
    if (expectations.typeChecks) input.typeChecks = expectations.typeChecks;
  }

  // Fields copied through unchanged: length bounds, regex, custom cop-out phrases.
  for (const key of ['minLength', 'maxLength', 'regexPattern', 'copOutPhrases']) {
    if (wants(key)) input[key] = expectations[key];
  }

  return input;
}
111
+
112
/**
 * Evaluate a `responseContainsAny` expectation.
 *
 * `groups` is normally string[][]: every inner group must contribute at least
 * one substring that occurs in the response (case-sensitive). A flat string[]
 * is treated as a single group. An empty or missing `groups` always passes.
 *
 * @param {string} responseText
 * @param {string[][]} groups
 * @returns {{ pass: boolean, reason?: string }}
 */
export function checkResponseContainsAnyGroups(responseText, groups) {
  if (!groups?.length) {
    return { pass: true };
  }

  // A flat list of strings is shorthand for one group.
  const groupList = Array.isArray(groups[0]) ? groups : [groups];

  const reasons = groupList
    .filter((candidates) => !candidates.some((needle) => responseText.includes(needle)))
    .map((candidates) => `response should contain any of [${candidates.join(', ')}]`);

  return reasons.length === 0
    ? { pass: true }
    : { pass: false, reason: reasons.join('; ') };
}
139
+
140
/**
 * Evaluate a `toolsAcceptable` expectation.
 *
 * `acceptable` lists alternative tool sets. The run passes when the tools that
 * were called equal one of them as a set (order and duplicates are ignored).
 * A set containing the token '__none__' also matches a run with no tool calls.
 * An empty or missing `acceptable` always passes.
 *
 * @param {string[]} actualTools
 * @param {string[][]} acceptable
 * @returns {{ pass: boolean, reason?: string }}
 */
export function checkToolsAcceptable(actualTools, acceptable) {
  if (!acceptable?.length) {
    return { pass: true };
  }

  const matchesCalledTools = (candidate) => {
    const wanted = new Set(candidate);
    const called = new Set(actualTools);
    return wanted.size === called.size && [...wanted].every((tool) => called.has(tool));
  };

  for (const candidate of acceptable) {
    const allowsNoTools = candidate.includes('__none__') && actualTools.length === 0;
    if (allowsNoTools || matchesCalledTools(candidate)) {
      return { pass: true };
    }
  }

  return {
    pass: false,
    reason: `tools: [${actualTools.join(', ')}] not in any acceptable set`
  };
}
@@ -0,0 +1,42 @@
1
+ // Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Combine graders with AND semantics: the composed grader passes only when
 * every grader passes. All graders are run (concurrently) on the same input;
 * on failure the reasons of the failing graders are joined with '; '.
 *
 * @param {Array<(input: unknown) => Promise<{pass: boolean, reason?: string}>>} graders
 * @returns {(input: unknown) => Promise<{pass: boolean, reason?: string}>}
 */
export function all(graders) {
  return async (input) => {
    const outcomes = await Promise.all(graders.map((grade) => grade(input)));

    let everyPassed = true;
    const failureReasons = [];
    for (const outcome of outcomes) {
      if (outcome.pass) continue;
      everyPassed = false;
      if (outcome.reason) failureReasons.push(outcome.reason);
    }

    return everyPassed
      ? { pass: true }
      : { pass: false, reason: failureReasons.join('; ') };
  };
}
17
+
18
/**
 * Combine graders with OR semantics: the composed grader passes as soon as at
 * least one grader passes. All graders are run (concurrently) on the same
 * input; when none pass, their reasons are joined with ' | '.
 *
 * @param {Array<(input: unknown) => Promise<{pass: boolean, reason?: string}>>} graders
 * @returns {(input: unknown) => Promise<{pass: boolean, reason?: string}>}
 */
export function any(graders) {
  return async (input) => {
    const outcomes = await Promise.all(graders.map((grade) => grade(input)));

    const reasons = [];
    for (const outcome of outcomes) {
      if (outcome.pass) {
        return { pass: true };
      }
      if (outcome.reason) reasons.push(outcome.reason);
    }

    return { pass: false, reason: reasons.join(' | ') };
  };
}
30
+
31
/**
 * Negate a grader: the returned grader passes exactly when the wrapped
 * grader fails, and fails with a fixed reason when the wrapped grader passes.
 *
 * @param {(input: unknown) => Promise<{pass: boolean, reason?: string}>} grader
 * @returns {(input: unknown) => Promise<{pass: boolean, reason?: string}>}
 */
export function not(grader) {
  return async (input) => {
    const { pass: originalPassed } = await grader(input);
    return originalPassed
      ? { pass: false, reason: `Expected grader to fail but it passed` }
      : { pass: true };
  };
}
@@ -0,0 +1,14 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Verify that the response contains every required substring, ignoring case.
 * @param {{responseText: string, mustContain: string[]}} input
 * @returns {import('./types.js').EvalResult} Failing results list the missing substrings.
 */
export function contentMatch({ responseText, mustContain }) {
  const haystack = responseText.toLowerCase();
  const isAbsent = (needle) => !haystack.includes(needle.toLowerCase());
  const missing = mustContain.filter(isAbsent);

  if (missing.length > 0) {
    return { pass: false, reason: `Missing from response: ${missing.join(', ')}` };
  }
  return { pass: true };
}
@@ -0,0 +1,11 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Check that the actual cost does not exceed the budget (equal cost passes).
 * @param {{actualCost: number, maxCost: number}} input - costs in the same currency unit
 * @returns {import('./types.js').EvalResult} Failing results report both amounts to 6 decimals.
 */
export function costBudget({ actualCost, maxCost }) {
  const withinBudget = actualCost <= maxCost;
  if (withinBudget) {
    return { pass: true };
  }
  const spent = actualCost.toFixed(6);
  const allowed = maxCost.toFixed(6);
  return { pass: false, reason: `Cost $${spent} exceeded budget $${allowed}` };
}