agent-tool-forge 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +209 -0
- package/lib/agent-registry.js +170 -0
- package/lib/api-client.js +792 -0
- package/lib/api-loader.js +260 -0
- package/lib/auth.d.ts +25 -0
- package/lib/auth.js +158 -0
- package/lib/checks/check-adapter.js +172 -0
- package/lib/checks/compose.js +42 -0
- package/lib/checks/content-match.js +14 -0
- package/lib/checks/cost-budget.js +11 -0
- package/lib/checks/index.js +18 -0
- package/lib/checks/json-valid.js +15 -0
- package/lib/checks/latency.js +11 -0
- package/lib/checks/length-bounds.js +17 -0
- package/lib/checks/negative-match.js +14 -0
- package/lib/checks/no-hallucinated-numbers.js +63 -0
- package/lib/checks/non-empty.js +34 -0
- package/lib/checks/regex-match.js +12 -0
- package/lib/checks/run-checks.js +84 -0
- package/lib/checks/schema-match.js +26 -0
- package/lib/checks/tool-call-count.js +16 -0
- package/lib/checks/tool-selection.js +34 -0
- package/lib/checks/types.js +45 -0
- package/lib/comparison/compare.js +86 -0
- package/lib/comparison/format.js +104 -0
- package/lib/comparison/index.js +6 -0
- package/lib/comparison/statistics.js +59 -0
- package/lib/comparison/types.js +41 -0
- package/lib/config-schema.js +200 -0
- package/lib/config.d.ts +66 -0
- package/lib/conversation-store.d.ts +77 -0
- package/lib/conversation-store.js +443 -0
- package/lib/db.d.ts +6 -0
- package/lib/db.js +1112 -0
- package/lib/dep-check.js +99 -0
- package/lib/drift-background.js +61 -0
- package/lib/drift-monitor.js +187 -0
- package/lib/eval-runner.js +566 -0
- package/lib/fixtures/fixture-store.js +161 -0
- package/lib/fixtures/index.js +11 -0
- package/lib/forge-engine.js +982 -0
- package/lib/forge-eval-generator.js +417 -0
- package/lib/forge-file-writer.js +386 -0
- package/lib/forge-service-client.js +190 -0
- package/lib/forge-service.d.ts +4 -0
- package/lib/forge-service.js +655 -0
- package/lib/forge-verifier-generator.js +271 -0
- package/lib/handlers/admin.js +151 -0
- package/lib/handlers/agents.js +229 -0
- package/lib/handlers/chat-resume.js +334 -0
- package/lib/handlers/chat-sync.js +320 -0
- package/lib/handlers/chat.js +320 -0
- package/lib/handlers/conversations.js +92 -0
- package/lib/handlers/preferences.js +88 -0
- package/lib/handlers/tools-list.js +58 -0
- package/lib/hitl-engine.d.ts +60 -0
- package/lib/hitl-engine.js +261 -0
- package/lib/http-utils.js +92 -0
- package/lib/index.d.ts +20 -0
- package/lib/index.js +141 -0
- package/lib/init.js +636 -0
- package/lib/manual-entry.js +59 -0
- package/lib/mcp-server.js +252 -0
- package/lib/output-groups.js +54 -0
- package/lib/postgres-store.d.ts +31 -0
- package/lib/postgres-store.js +465 -0
- package/lib/preference-store.d.ts +47 -0
- package/lib/preference-store.js +79 -0
- package/lib/prompt-store.d.ts +42 -0
- package/lib/prompt-store.js +60 -0
- package/lib/rate-limiter.d.ts +30 -0
- package/lib/rate-limiter.js +104 -0
- package/lib/react-engine.d.ts +110 -0
- package/lib/react-engine.js +337 -0
- package/lib/runner/cli.js +156 -0
- package/lib/runner/cost-estimator.js +71 -0
- package/lib/runner/gate.js +46 -0
- package/lib/runner/index.js +165 -0
- package/lib/sidecar.d.ts +83 -0
- package/lib/sidecar.js +161 -0
- package/lib/sse.d.ts +15 -0
- package/lib/sse.js +30 -0
- package/lib/tools-scanner.js +91 -0
- package/lib/tui.js +253 -0
- package/lib/verifier-report.js +78 -0
- package/lib/verifier-runner.js +338 -0
- package/lib/verifier-scanner.js +70 -0
- package/lib/verifier-worker-pool.js +196 -0
- package/lib/views/chat.js +340 -0
- package/lib/views/endpoints.js +203 -0
- package/lib/views/eval-run.js +206 -0
- package/lib/views/forge-agent.js +538 -0
- package/lib/views/forge.js +410 -0
- package/lib/views/main-menu.js +275 -0
- package/lib/views/mediation.js +381 -0
- package/lib/views/model-compare.js +430 -0
- package/lib/views/model-comparison.js +333 -0
- package/lib/views/onboarding.js +470 -0
- package/lib/views/performance.js +237 -0
- package/lib/views/run-evals.js +205 -0
- package/lib/views/settings.js +829 -0
- package/lib/views/tools-evals.js +514 -0
- package/lib/views/verifier-coverage.js +617 -0
- package/lib/workers/verifier-worker.js +52 -0
- package/package.json +123 -0
- package/widget/forge-chat.js +789 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* API Loader — Fetches endpoints from OpenAPI (URL or file) and manifest.
|
|
3
|
+
* Merges and dedupes by path+method.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { readFileSync, existsSync } from 'fs';
|
|
7
|
+
import { resolve } from 'path';
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* @typedef {Object} ApiEndpoint
|
|
11
|
+
* @property {string} path - API path (e.g. /api/v1/holdings)
|
|
12
|
+
* @property {string} method - HTTP method (GET, POST, etc.)
|
|
13
|
+
* @property {string} [name] - Suggested tool name (snake_case)
|
|
14
|
+
* @property {string} [description] - Suggested tool description
|
|
15
|
+
* @property {Record<string,unknown>} [params] - Parameter schema
|
|
16
|
+
* @property {boolean} [requiresConfirmation] - HITL gate for write ops
|
|
17
|
+
* @property {string} [source] - 'openapi' | 'manifest'
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
 * Derive a snake_case tool name from an endpoint path and HTTP method.
 *
 * The name has the form `<verb>_<last path segment>`. GET maps to `get`,
 * POST maps to `create`, and any other method maps to its lower-cased name.
 * A templated segment such as `{id}` becomes `by_id`; a path with no
 * segments falls back to `resource`. Every character outside `[a-z0-9_]`
 * is replaced with `_`.
 *
 * The method is matched case-insensitively so `post` and `POST` yield the
 * same name (manifest entries are not guaranteed to be upper-case).
 *
 * @param {string} path - API path (e.g. /api/v1/holdings)
 * @param {string} method - HTTP method, in any letter case
 * @returns {string}
 */
function deriveName(path, method) {
  // Splitting and dropping empty segments also discards leading/trailing slashes.
  const segments = path.split('/').filter(Boolean);
  const last = segments[segments.length - 1] || 'resource';
  const base = last.replace(/\{[^}]+\}/g, 'by_id');
  const upperMethod = method.toUpperCase();
  const verb = upperMethod === 'GET' ? 'get' : upperMethod === 'POST' ? 'create' : method.toLowerCase();
  return `${verb}_${base}`.replace(/[^a-z0-9_]/gi, '_').toLowerCase();
}
|
|
37
|
+
|
|
38
|
+
/**
 * Parse OpenAPI 3.x paths into an ApiEndpoint array.
 *
 * One endpoint is produced per (path, method) pair for the methods
 * get/post/put/patch/delete. Operation-level parameters take precedence over
 * path-level parameters with the same name. When the first server URL is a
 * relative path (starts with `/`) it is prefixed to every endpoint path;
 * absolute server URLs are ignored.
 *
 * Returns an empty array when the spec is missing or has no `paths` object.
 *
 * @param {object} spec - Parsed OpenAPI JSON
 * @returns {ApiEndpoint[]}
 */
function parseOpenApiPaths(spec) {
  const endpoints = [];
  const paths = spec?.paths;
  if (typeof paths !== 'object' || paths === null) return endpoints;

  const serverUrl = spec.servers?.[0]?.url;
  const rawBase = typeof serverUrl === 'string' ? serverUrl.replace(/\/$/, '') : '';
  // Only use relative-path server bases (e.g. /api/v1); ignore full URLs
  const basePath = rawBase.startsWith('/') ? rawBase : '';
  const methods = ['get', 'post', 'put', 'patch', 'delete'];

  for (const [path, pathItem] of Object.entries(paths)) {
    if (typeof pathItem !== 'object' || pathItem === null) continue;
    const relativePath = path.startsWith('/') ? path : `/${path}`;
    const fullPath = `${basePath}${relativePath}`;

    for (const method of methods) {
      const op = pathItem[method];
      if (!op) continue;
      const verb = method.toUpperCase();

      // A Map keeps parameter names such as "toString" or "__proto__" from
      // colliding with properties inherited from Object.prototype.
      const collected = new Map();
      const addParams = (list, overwrite) => {
        if (!Array.isArray(list)) return;
        for (const p of list) {
          if (!p?.name) continue;
          const key = String(p.name);
          if (!overwrite && collected.has(key)) continue;
          collected.set(key, {
            type: p.schema?.type || 'string',
            description: p.description,
          });
        }
      };
      addParams(op.parameters, true);
      // Path-level parameters only fill in names the operation did not declare.
      addParams(pathItem.parameters, false);

      endpoints.push({
        path: fullPath,
        method: verb,
        name: deriveName(relativePath, verb),
        description: op.summary || op.description || `${verb} ${fullPath}`,
        params: collected.size ? Object.fromEntries(collected) : undefined,
        // Every supported method other than GET mutates state.
        requiresConfirmation: method !== 'get',
        source: 'openapi',
      });
    }
  }
  return endpoints;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Fetch an OpenAPI document over HTTP and convert it to endpoints.
 *
 * The request is aborted after 10 seconds. A non-2xx response raises an
 * Error carrying the status code and URL.
 *
 * @param {string} url - Location of the OpenAPI JSON document
 * @param {Record<string,string>} [headers] - Extra request headers
 * @returns {Promise<ApiEndpoint[]>}
 */
async function loadFromOpenApiUrl(url, headers = {}) {
  const requestInit = { signal: AbortSignal.timeout(10000), headers };
  const response = await fetch(url, requestInit);
  if (response.ok) {
    const document = await response.json();
    return parseOpenApiPaths(document);
  }
  throw new Error(`OpenAPI fetch failed: ${response.status} ${url}`);
}
|
|
106
|
+
|
|
107
|
+
/**
 * Read an OpenAPI JSON document from disk and convert it to endpoints.
 *
 * The path is resolved against the current working directory. A missing file
 * yields an empty list; a file that is not valid JSON raises an Error naming
 * the file.
 *
 * @param {string} filePath
 * @returns {ApiEndpoint[]}
 */
function loadFromOpenApiFile(filePath) {
  const absolutePath = resolve(process.cwd(), filePath);
  if (existsSync(absolutePath)) {
    const contents = readFileSync(absolutePath, 'utf-8');
    let document;
    try {
      document = JSON.parse(contents);
    } catch (err) {
      throw new Error(`Failed to parse OpenAPI file ${filePath}: ${err.message}`);
    }
    return parseOpenApiPaths(document);
  }
  return [];
}
|
|
122
|
+
|
|
123
|
+
/**
 * Load endpoints from a manifest JSON file.
 *
 * The path is resolved against the current working directory. A missing file
 * yields an empty list; invalid JSON raises an Error naming the manifest.
 * A manifest without an `endpoints` array also yields an empty list.
 *
 * Each entry's method defaults to GET and is upper-cased once, so the
 * `method`, derived `name`, and default `description` all agree. Entries
 * without a non-empty string `path` are skipped because they cannot be
 * routed or deduplicated.
 *
 * @param {string} manifestPath
 * @returns {ApiEndpoint[]}
 */
function loadFromManifest(manifestPath) {
  const abs = resolve(process.cwd(), manifestPath);
  if (!existsSync(abs)) return [];
  const raw = readFileSync(abs, 'utf-8');
  let manifest;
  try {
    manifest = JSON.parse(raw);
  } catch (err) {
    throw new Error(`Failed to parse manifest ${manifestPath}: ${err.message}`);
  }

  // JSON such as `null` or `{ "endpoints": {} }` parses fine but has no usable list.
  const entries = Array.isArray(manifest?.endpoints) ? manifest.endpoints : [];
  return entries
    .filter((e) => typeof e?.path === 'string' && e.path !== '')
    .map((e) => {
      const method = String(e.method || 'GET').toUpperCase();
      return {
        path: e.path,
        method,
        name: e.name || deriveName(e.path, method),
        description: e.description || `${method} ${e.path}`,
        params: e.params,
        requiresConfirmation: e.requiresConfirmation ?? false,
        source: 'manifest',
      };
    });
}
|
|
147
|
+
|
|
148
|
+
/**
 * Combine endpoint lists into one, keeping a single entry per
 * `METHOD:path` key. When a key appears more than once the later entry
 * replaces the earlier one while keeping the earlier one's position, so a
 * list passed later (e.g. manifest) overrides one passed earlier (e.g. openapi).
 *
 * @param {...ApiEndpoint[]} arrays - One or more endpoint lists
 * @returns {ApiEndpoint[]}
 */
function mergeEndpoints(...arrays) {
  const keyedEntries = arrays.flatMap((list) =>
    Array.from(list, (endpoint) => [`${endpoint.method}:${endpoint.path}`, endpoint])
  );
  return [...new Map(keyedEntries).values()];
}
|
|
163
|
+
|
|
164
|
+
/**
 * Parse a JSON string, yielding `null` instead of throwing when the text is
 * not valid JSON.
 *
 * @param {string} str
 * @returns {object|null}
 */
function safeParseJson(str) {
  let parsed = null;
  try {
    parsed = JSON.parse(str);
  } catch {
    // Invalid JSON is an expected input here; report it as null.
  }
  return parsed;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Normalize a path for comparison: ensure a leading slash, trim trailing
 * slashes, and lowercase.
 *
 * The root path stays `/` rather than collapsing to an empty string, so
 * callers that treat an empty result as "no path" do not discard it.
 * A missing or empty input returns `''`.
 *
 * @param {string} p
 * @returns {string}
 */
function normalizePath(p) {
  if (!p) return '';
  const withSlash = p.startsWith('/') ? p : `/${p}`;
  const trimmed = withSlash.replace(/\/+$/, '');
  // An all-slash path trims to '' — that is the root, not "no path".
  return (trimmed || '/').toLowerCase();
}
|
|
183
|
+
|
|
184
|
+
/**
 * Report which endpoints of an OpenAPI spec are backed by a promoted tool.
 *
 * Promoted tools are read from the `tool_registry` table; each row's
 * `spec_json` is parsed and its `mcpRouting.endpoint` / `mcpRouting.method`
 * form a `METHOD:normalizedPath` key. Rows with unparseable JSON or without
 * both a path and a method are ignored. A spec endpoint counts as covered
 * when its own key is among those.
 *
 * @param {object|null} spec - Parsed OpenAPI spec object
 * @param {import('better-sqlite3').Database} db
 * @returns {{ covered: ApiEndpoint[]; uncovered: ApiEndpoint[]; total: number }}
 */
export function computeCoverage(spec, db) {
  if (!spec?.paths) return { covered: [], uncovered: [], total: 0 };

  const specEndpoints = parseOpenApiPaths(spec);

  const statement = db.prepare(
    `SELECT spec_json FROM tool_registry WHERE lifecycle_state = 'promoted'`
  );

  // Keys of the form "METHOD:normalizedPath" for every routable promoted tool.
  const promotedKeys = new Set();
  statement.all().forEach((row) => {
    const routing = safeParseJson(row.spec_json)?.mcpRouting;
    if (!routing?.endpoint) return;
    const routedPath = normalizePath(routing.endpoint);
    const routedMethod = (routing.method || '').toUpperCase();
    if (routedPath && routedMethod) {
      promotedKeys.add(`${routedMethod}:${routedPath}`);
    }
  });

  const report = { covered: [], uncovered: [], total: specEndpoints.length };
  for (const endpoint of specEndpoints) {
    const endpointKey = `${endpoint.method.toUpperCase()}:${normalizePath(endpoint.path)}`;
    const bucket = promotedKeys.has(endpointKey) ? report.covered : report.uncovered;
    bucket.push(endpoint);
  }
  return report;
}
|
|
228
|
+
|
|
229
|
+
/**
 * Load every API source named in the config and merge the results.
 *
 * Sources are read in this order: OpenAPI URL, OpenAPI file, manifest. The
 * OpenAPI sources are only consulted when `discovery.type` is `'openapi'`.
 * A failing URL fetch is logged to stderr and skipped; errors from the file
 * and manifest loaders propagate to the caller.
 *
 * @param {object} config - forge.config.json api section
 * @returns {Promise<ApiEndpoint[]>}
 */
export async function loadApis(config) {
  const discovery = config?.discovery;
  const manifestPath = config?.manifestPath;
  const usesOpenApi = discovery?.type === 'openapi';
  const collected = [];

  if (usesOpenApi && discovery.url) {
    try {
      collected.push(...(await loadFromOpenApiUrl(discovery.url, discovery.headers)));
    } catch (err) {
      console.error(`OpenAPI URL failed: ${err.message}`);
    }
  }

  if (usesOpenApi && discovery.file) {
    collected.push(...loadFromOpenApiFile(discovery.file));
  }

  if (manifestPath) {
    collected.push(...loadFromManifest(manifestPath));
  }

  return mergeEndpoints(collected);
}
|
package/lib/auth.d.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/** Outcome of authenticating a request's JWT. */
export interface AuthResult {
  /** True only when a token was accepted and a user id was extracted. */
  authenticated: boolean;
  /** Value of the configured claim, as a string; null on failure. */
  userId: string | null;
  /** Decoded token payload when it could be decoded; otherwise null. */
  claims: Record<string, unknown> | null;
  /** Human-readable failure reason; null on success. */
  error: string | null;
}

/** Options accepted by {@link createAuth}. */
export interface AuthConfig {
  /** 'trust' decodes without checking the signature; 'verify' checks it. Defaults to 'trust'. */
  mode?: 'trust' | 'verify';
  /** Key used to check signatures in 'verify' mode. */
  signingKey?: string;
  /** Dot-separated path of the claim holding the user id. Defaults to 'sub'. */
  claimsPath?: string;
}

/** Object returned by {@link createAuth}. */
export interface Authenticator {
  /** Authenticate a request carrying a JWT in its Authorization header or `token` query parameter. */
  authenticate(req: object): AuthResult;
}

/** Create an authenticator from config. */
export function createAuth(authConfig?: AuthConfig): Authenticator;

/** Outcome of an admin Bearer-key check. */
export interface AdminAuthResult {
  /** True when the presented key matches the admin key. */
  authenticated: boolean;
  /** Human-readable failure reason; null on success. */
  error: string | null;
}

/** Compare the request's Bearer token against the configured admin key. */
export function authenticateAdmin(req: object, adminKey: string): AdminAuthResult;
|
package/lib/auth.js
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Auth module — configurable JWT authentication for the forge sidecar.
|
|
3
|
+
*
|
|
4
|
+
* Two modes:
|
|
5
|
+
* trust — decode JWT payload without verifying signature (fast, for local dev / behind reverse proxy)
|
|
6
|
+
* verify — verify HMAC-SHA256 (HS256) or RSA-SHA256 (RS256) signature via Node.js built-in crypto
|
|
7
|
+
*
|
|
8
|
+
* No external JWT library required.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { createHmac, createVerify, timingSafeEqual } from 'crypto';
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* @typedef {{ authenticated: boolean, userId: string|null, claims: object|null, error: string|null }} AuthResult
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/**
 * Create an authenticator from config.
 *
 * The token is read from the `Authorization: Bearer` header, falling back to
 * the `?token=` query parameter.
 *
 * In `trust` mode the payload is decoded without any verification. In
 * `verify` mode:
 *   - the signature is checked with HS256 or RS256;
 *   - the algorithm is pinned by the configured key, never chosen by the
 *     token: a PEM/asymmetric key only accepts RS256 and any other key only
 *     accepts HS256. This prevents a public key from being abused as an
 *     HMAC secret (algorithm-confusion attack);
 *   - numeric `exp` and `nbf` claims are enforced.
 *
 * `authenticate` never throws for malformed tokens; it reports the problem
 * in `error`.
 *
 * @param {{ mode?: 'verify'|'trust', signingKey?: string, claimsPath?: string }} authConfig
 * @returns {{ authenticate(req): AuthResult }}
 */
export function createAuth(authConfig = {}) {
  const mode = authConfig.mode ?? 'trust';
  const signingKey = authConfig.signingKey ?? null;
  const claimsPath = authConfig.claimsPath ?? 'sub';

  // Decide once which algorithm this key is allowed to verify.
  const keyIsAsymmetric = typeof signingKey === 'string'
    ? signingKey.includes('-----BEGIN ')
    : signingKey?.type === 'public' || signingKey?.type === 'private';
  const permittedAlg = keyIsAsymmetric ? 'RS256' : 'HS256';

  const fail = (error, claims = null) => ({ authenticated: false, userId: null, claims, error });

  return {
    authenticate(req) {
      let token = null;

      // Priority 1: Authorization header
      const authHeader = req.headers?.authorization;
      if (authHeader?.startsWith('Bearer ')) {
        token = authHeader.slice(7);
      }

      // Priority 2: ?token= query param (for EventSource which can't set headers)
      if (!token && req.url) {
        try {
          const url = new URL(req.url, 'http://localhost');
          token = url.searchParams.get('token');
        } catch { /* malformed URL */ }
      }

      if (!token) return fail('Missing token');

      const parts = token.split('.');
      if (parts.length !== 3) return fail('Malformed JWT');

      // Verify signature if in verify mode
      if (mode === 'verify') {
        if (!signingKey) return fail('No signing key configured');

        let header;
        try {
          header = JSON.parse(base64UrlDecode(parts[0]));
        } catch {
          return fail('Invalid JWT header');
        }
        // JSON such as `null` or `"x"` parses but is not a header object.
        if (header === null || typeof header !== 'object') {
          return fail('Invalid JWT header');
        }

        const sigInput = `${parts[0]}.${parts[1]}`;
        const signature = parts[2];
        const alg = header.alg ?? 'HS256';

        if (alg !== 'HS256' && alg !== 'RS256') {
          return fail(`Unsupported algorithm: ${alg}`);
        }
        if (alg !== permittedAlg) {
          return fail(`Algorithm ${alg} not permitted for the configured signing key`);
        }

        if (alg === 'HS256') {
          const expected = base64UrlEncode(
            createHmac('sha256', signingKey).update(sigInput).digest()
          );
          const expectedBuf = Buffer.from(expected);
          const signatureBuf = Buffer.from(signature);
          // Both sides are fixed-length encodings of a digest, so the length check leaks nothing secret.
          if (expectedBuf.length !== signatureBuf.length || !timingSafeEqual(expectedBuf, signatureBuf)) {
            return fail('Invalid signature');
          }
        } else {
          let valid = false;
          try {
            const verifier = createVerify('RSA-SHA256');
            verifier.update(sigInput);
            valid = verifier.verify(signingKey, base64UrlToBuffer(signature));
          } catch {
            // An unusable key or signature is an authentication failure, not a crash.
            valid = false;
          }
          if (!valid) return fail('Invalid signature');
        }
      }

      // Decode payload
      let claims;
      try {
        claims = JSON.parse(base64UrlDecode(parts[1]));
      } catch {
        return fail('Invalid JWT payload');
      }

      // Time-based claims are only meaningful once the signature is trusted.
      if (mode === 'verify') {
        const nowSeconds = Math.floor(Date.now() / 1000);
        if (typeof claims?.exp === 'number' && nowSeconds >= claims.exp) {
          return fail('Token expired');
        }
        if (typeof claims?.nbf === 'number' && nowSeconds < claims.nbf) {
          return fail('Token not yet valid');
        }
      }

      const userId = extractClaim(claims, claimsPath);
      if (!userId) {
        return fail(`Claim "${claimsPath}" not found in token`, claims);
      }

      return { authenticated: true, userId: String(userId), claims, error: null };
    }
  };
}
|
|
109
|
+
|
|
110
|
+
/**
 * Admin auth — compare the request's Bearer token against the admin key.
 *
 * The comparison is done on fixed-length HMAC-SHA256 digests of both values
 * with `timingSafeEqual`, so neither the content nor the length of the admin
 * key is revealed through timing.
 *
 * @param {import('http').IncomingMessage} req
 * @param {string} adminKey
 * @returns {{ authenticated: boolean, error: string|null }}
 */
export function authenticateAdmin(req, adminKey) {
  if (!adminKey) {
    return { authenticated: false, error: 'No admin key configured' };
  }
  const authHeader = req.headers?.authorization;
  if (typeof authHeader !== 'string' || !authHeader.startsWith('Bearer ')) {
    return { authenticated: false, error: 'Missing or invalid Authorization header' };
  }
  const token = authHeader.slice(7);

  // Digests are always 32 bytes, which removes the length pre-check that
  // would otherwise let a caller probe the key length.
  const digestOf = (value) => createHmac('sha256', String(adminKey)).update(value).digest();
  if (!timingSafeEqual(digestOf(token), digestOf(String(adminKey)))) {
    return { authenticated: false, error: 'Invalid admin key' };
  }
  return { authenticated: true, error: null };
}
|
|
132
|
+
|
|
133
|
+
// ── Helpers ──────────────────────────────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
/**
 * Decode a base64url string to UTF-8 text.
 *
 * The URL-safe characters `-` and `_` are mapped back to `+` and `/` before
 * decoding; padding is not required.
 *
 * @param {string} str
 * @returns {string}
 */
function base64UrlDecode(str) {
  const standardAlphabet = str.replaceAll('-', '+').replaceAll('_', '/');
  const bytes = Buffer.from(standardAlphabet, 'base64');
  return bytes.toString('utf-8');
}
|
|
139
|
+
|
|
140
|
+
/**
 * Encode bytes as unpadded base64url text (`-` and `_` in place of `+` and
 * `/`, no trailing `=`).
 *
 * @param {Buffer|Uint8Array|string} buf
 * @returns {string}
 */
function base64UrlEncode(buf) {
  const bytes = Buffer.from(buf);
  return bytes.toString('base64url');
}
|
|
144
|
+
|
|
145
|
+
/**
 * Decode a base64url string to raw bytes.
 *
 * The URL-safe characters `-` and `_` are mapped back to `+` and `/` before
 * decoding; padding is not required.
 *
 * @param {string} str
 * @returns {Buffer}
 */
function base64UrlToBuffer(str) {
  const standardAlphabet = str.replaceAll('-', '+').replaceAll('_', '/');
  return Buffer.from(standardAlphabet, 'base64');
}
|
|
149
|
+
|
|
150
|
+
/**
 * Look up a value inside decoded claims by a dot-separated path.
 *
 * Each path segment is used as a property key on the current value. The walk
 * stops with `null` as soon as the current value is not an object. A missing
 * or nullish final value is also reported as `null`.
 *
 * @param {unknown} claims - Decoded token payload
 * @param {string} path - Dot-separated property path (e.g. `user.id`)
 * @returns {unknown|null}
 */
function extractClaim(claims, path) {
  let current = claims;
  for (const key of path.split('.')) {
    const traversable = typeof current === 'object' && current !== null;
    if (!traversable) return null;
    current = current[key];
  }
  return current === undefined || current === null ? null : current;
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Translate an eval case's expectations plus the data gathered while running
 * it into the input object consumed by runChecks().
 *
 * Only fields whose expectation is present are set. `responseContainsAny` and
 * `toolsAcceptable` are not mapped here; they are evaluated by the dedicated
 * helpers exported from this module.
 *
 * @param {Object} evalCase - the eval case object from the eval JSON file
 * @param {Object} runMeta - runtime data from executing the case
 * @param {string[]} runMeta.toolsCalled - actual tools called during execution
 * @param {string} runMeta.responseText - the model's response text
 * @param {number} [runMeta.latencyMs] - round-trip latency in ms
 * @param {number} [runMeta.cost] - actual cost in USD
 * @returns {import('./types.js').RunChecksInput}
 */
export function checkAdapter(evalCase, runMeta) {
  const expect = evalCase.expect ?? {};
  const { toolsCalled = [], responseText = '', latencyMs, cost } = runMeta;

  const isSet = (value) => value !== undefined;
  const asList = (value) => (Array.isArray(value) ? value : [value]);

  /** @type {import('./types.js').RunChecksInput} */
  const input = { responseText };

  // Strict tool selection: expected list versus what was actually called.
  if (isSet(expect.toolsCalled)) {
    input.expectedTools = asList(expect.toolsCalled);
    input.actualTools = toolsCalled;
  }

  // Every listed substring must appear / must not appear.
  if (isSet(expect.responseContains)) {
    input.mustContain = asList(expect.responseContains);
  }
  if (isSet(expect.responseNotContains)) {
    input.mustNotContain = asList(expect.responseNotContains);
  }

  // Text presence is required either by the expectation itself or by
  // requiresPreamble, which lives on the case rather than inside `expect`.
  // This flag is text-only; callers wanting "text OR tool call" semantics
  // must also inspect toolsCalled.
  if (expect.responseNonEmpty || evalCase.requiresPreamble === true) {
    input.nonEmpty = true;
  }

  // Measured values are only forwarded together with their limits.
  if (isSet(latencyMs) && isSet(expect.maxLatencyMs)) {
    input.latencyMs = latencyMs;
    input.maxLatencyMs = expect.maxLatencyMs;
  }
  if (isSet(cost) && isSet(expect.maxCost)) {
    input.actualCost = cost;
    input.maxCost = expect.maxCost;
  }

  const hasMin = isSet(expect.minToolCalls);
  const hasMax = isSet(expect.maxToolCalls);
  if (hasMin || hasMax) {
    input.actualToolCallCount = toolsCalled.length;
    if (hasMin) input.minToolCalls = expect.minToolCalls;
    if (hasMax) input.maxToolCalls = expect.maxToolCalls;
  }

  if (expect.jsonValid) {
    input.jsonValid = true;
  }

  if (isSet(expect.schemaData)) {
    input.schemaData = expect.schemaData;
    input.requiredKeys = expect.requiredKeys ?? [];
    if (expect.typeChecks) input.typeChecks = expect.typeChecks;
  }

  // Expectations that are copied through under the same name.
  for (const field of ['minLength', 'maxLength', 'regexPattern', 'copOutPhrases']) {
    if (isSet(expect[field])) input[field] = expect[field];
  }

  return input;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Evaluate a `responseContainsAny` expectation.
 *
 * `groups` is a list of groups; the check passes when every group has at
 * least one member that occurs in `responseText` (case-sensitive substring
 * match). A flat list of strings is treated as a single group. An empty or
 * missing `groups` passes.
 *
 * @param {string} responseText
 * @param {string[][]|string[]} groups
 * @returns {{ pass: boolean, reason?: string }}
 */
export function checkResponseContainsAnyGroups(responseText, groups) {
  if (!groups?.length) return { pass: true };

  // The shape is decided by the first element only.
  const groupList = Array.isArray(groups[0]) ? groups : [groups];

  const reasons = groupList
    .filter((candidates) => candidates.every((text) => !responseText.includes(text)))
    .map((candidates) => `response should contain any of [${candidates.join(', ')}]`);

  return reasons.length === 0
    ? { pass: true }
    : { pass: false, reason: reasons.join('; ') };
}
|
|
139
|
+
|
|
140
|
+
/**
 * Evaluate a `toolsAcceptable` expectation.
 *
 * `acceptable` lists the tool sets that are allowed. The check passes when
 * the tools actually called match any one of them as a set (order and
 * duplicates are ignored). A set containing the token `'__none__'` also
 * matches a run in which no tools were called. An empty or missing
 * `acceptable` passes.
 *
 * @param {string[]} actualTools
 * @param {string[][]} acceptable
 * @returns {{ pass: boolean, reason?: string }}
 */
export function checkToolsAcceptable(actualTools, acceptable) {
  if (!acceptable?.length) return { pass: true };

  const matchesActual = (candidate) => {
    if (candidate.includes('__none__') && actualTools.length === 0) return true;
    const wanted = new Set(candidate);
    const called = new Set(actualTools);
    return wanted.size === called.size && [...wanted].every((tool) => called.has(tool));
  };

  if (acceptable.some(matchesActual)) return { pass: true };

  const reason = `tools: [${actualTools.join(', ')}] not in any acceptable set`;
  return { pass: false, reason };
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
// Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Build a grader that passes only when every supplied grader passes.
 *
 * All graders are started together and awaited with Promise.all. On failure
 * the non-empty reasons of the failing graders are joined with `'; '`.
 *
 * @param {Array<(input: unknown) => Promise<{pass: boolean, reason?: string}>>} graders
 * @returns {(input: unknown) => Promise<{pass: boolean, reason?: string}>}
 */
export function all(graders) {
  return async (input) => {
    const outcomes = await Promise.all(graders.map((grade) => grade(input)));

    let everyPassed = true;
    const reasons = [];
    for (const outcome of outcomes) {
      if (outcome.pass) continue;
      everyPassed = false;
      if (outcome.reason) reasons.push(outcome.reason);
    }

    if (everyPassed) return { pass: true };
    return { pass: false, reason: reasons.join('; ') };
  };
}
|
|
17
|
+
|
|
18
|
+
/**
 * Build a grader that passes when at least one supplied grader passes.
 *
 * All graders are started together and awaited with Promise.all. When none
 * pass, their non-empty reasons are joined with `' | '`.
 *
 * @param {Array<(input: unknown) => Promise<{pass: boolean, reason?: string}>>} graders
 * @returns {(input: unknown) => Promise<{pass: boolean, reason?: string}>}
 */
export function any(graders) {
  return async (input) => {
    const outcomes = await Promise.all(graders.map((grade) => grade(input)));

    const reasons = [];
    for (const outcome of outcomes) {
      if (outcome.pass) return { pass: true };
      if (outcome.reason) reasons.push(outcome.reason);
    }
    return { pass: false, reason: reasons.join(' | ') };
  };
}
|
|
30
|
+
|
|
31
|
+
/**
 * Build a grader that inverts another grader: it passes when the wrapped
 * grader fails, and fails with a fixed reason when the wrapped grader passes.
 *
 * @param {(input: unknown) => Promise<{pass: boolean, reason?: string}>} grader
 * @returns {(input: unknown) => Promise<{pass: boolean, reason?: string}>}
 */
export function not(grader) {
  return async (input) => {
    const { pass: innerPassed } = await grader(input);
    return innerPassed
      ? { pass: false, reason: 'Expected grader to fail but it passed' }
      : { pass: true };
  };
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Verify that every required substring occurs in the response text,
 * ignoring letter case. On failure the reason lists the missing substrings
 * in their original spelling.
 *
 * @param {{responseText: string, mustContain: string[]}} input
 * @returns {import('./types.js').EvalResult}
 */
export function contentMatch({ responseText, mustContain }) {
  const haystack = responseText.toLowerCase();

  const missing = [];
  for (const needle of mustContain) {
    if (!haystack.includes(needle.toLowerCase())) {
      missing.push(needle);
    }
  }

  return missing.length > 0
    ? { pass: false, reason: `Missing from response: ${missing.join(', ')}` }
    : { pass: true };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Check that the actual cost does not exceed the budget.
 *
 * A cost equal to the budget passes. On failure both amounts are reported
 * with six decimal places.
 *
 * @param {{actualCost: number, maxCost: number}} input
 * @returns {import('./types.js').EvalResult}
 */
export function costBudget({ actualCost, maxCost }) {
  const withinBudget = actualCost <= maxCost;
  if (withinBudget) return { pass: true };

  const spent = actualCost.toFixed(6);
  const allowed = maxCost.toFixed(6);
  return { pass: false, reason: `Cost $${spent} exceeded budget $${allowed}` };
}
|