agent-tool-forge 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +209 -0
- package/lib/agent-registry.js +170 -0
- package/lib/api-client.js +792 -0
- package/lib/api-loader.js +260 -0
- package/lib/auth.d.ts +25 -0
- package/lib/auth.js +158 -0
- package/lib/checks/check-adapter.js +172 -0
- package/lib/checks/compose.js +42 -0
- package/lib/checks/content-match.js +14 -0
- package/lib/checks/cost-budget.js +11 -0
- package/lib/checks/index.js +18 -0
- package/lib/checks/json-valid.js +15 -0
- package/lib/checks/latency.js +11 -0
- package/lib/checks/length-bounds.js +17 -0
- package/lib/checks/negative-match.js +14 -0
- package/lib/checks/no-hallucinated-numbers.js +63 -0
- package/lib/checks/non-empty.js +34 -0
- package/lib/checks/regex-match.js +12 -0
- package/lib/checks/run-checks.js +84 -0
- package/lib/checks/schema-match.js +26 -0
- package/lib/checks/tool-call-count.js +16 -0
- package/lib/checks/tool-selection.js +34 -0
- package/lib/checks/types.js +45 -0
- package/lib/comparison/compare.js +86 -0
- package/lib/comparison/format.js +104 -0
- package/lib/comparison/index.js +6 -0
- package/lib/comparison/statistics.js +59 -0
- package/lib/comparison/types.js +41 -0
- package/lib/config-schema.js +200 -0
- package/lib/config.d.ts +66 -0
- package/lib/conversation-store.d.ts +77 -0
- package/lib/conversation-store.js +443 -0
- package/lib/db.d.ts +6 -0
- package/lib/db.js +1112 -0
- package/lib/dep-check.js +99 -0
- package/lib/drift-background.js +61 -0
- package/lib/drift-monitor.js +187 -0
- package/lib/eval-runner.js +566 -0
- package/lib/fixtures/fixture-store.js +161 -0
- package/lib/fixtures/index.js +11 -0
- package/lib/forge-engine.js +982 -0
- package/lib/forge-eval-generator.js +417 -0
- package/lib/forge-file-writer.js +386 -0
- package/lib/forge-service-client.js +190 -0
- package/lib/forge-service.d.ts +4 -0
- package/lib/forge-service.js +655 -0
- package/lib/forge-verifier-generator.js +271 -0
- package/lib/handlers/admin.js +151 -0
- package/lib/handlers/agents.js +229 -0
- package/lib/handlers/chat-resume.js +334 -0
- package/lib/handlers/chat-sync.js +320 -0
- package/lib/handlers/chat.js +320 -0
- package/lib/handlers/conversations.js +92 -0
- package/lib/handlers/preferences.js +88 -0
- package/lib/handlers/tools-list.js +58 -0
- package/lib/hitl-engine.d.ts +60 -0
- package/lib/hitl-engine.js +261 -0
- package/lib/http-utils.js +92 -0
- package/lib/index.d.ts +20 -0
- package/lib/index.js +141 -0
- package/lib/init.js +636 -0
- package/lib/manual-entry.js +59 -0
- package/lib/mcp-server.js +252 -0
- package/lib/output-groups.js +54 -0
- package/lib/postgres-store.d.ts +31 -0
- package/lib/postgres-store.js +465 -0
- package/lib/preference-store.d.ts +47 -0
- package/lib/preference-store.js +79 -0
- package/lib/prompt-store.d.ts +42 -0
- package/lib/prompt-store.js +60 -0
- package/lib/rate-limiter.d.ts +30 -0
- package/lib/rate-limiter.js +104 -0
- package/lib/react-engine.d.ts +110 -0
- package/lib/react-engine.js +337 -0
- package/lib/runner/cli.js +156 -0
- package/lib/runner/cost-estimator.js +71 -0
- package/lib/runner/gate.js +46 -0
- package/lib/runner/index.js +165 -0
- package/lib/sidecar.d.ts +83 -0
- package/lib/sidecar.js +161 -0
- package/lib/sse.d.ts +15 -0
- package/lib/sse.js +30 -0
- package/lib/tools-scanner.js +91 -0
- package/lib/tui.js +253 -0
- package/lib/verifier-report.js +78 -0
- package/lib/verifier-runner.js +338 -0
- package/lib/verifier-scanner.js +70 -0
- package/lib/verifier-worker-pool.js +196 -0
- package/lib/views/chat.js +340 -0
- package/lib/views/endpoints.js +203 -0
- package/lib/views/eval-run.js +206 -0
- package/lib/views/forge-agent.js +538 -0
- package/lib/views/forge.js +410 -0
- package/lib/views/main-menu.js +275 -0
- package/lib/views/mediation.js +381 -0
- package/lib/views/model-compare.js +430 -0
- package/lib/views/model-comparison.js +333 -0
- package/lib/views/onboarding.js +470 -0
- package/lib/views/performance.js +237 -0
- package/lib/views/run-evals.js +205 -0
- package/lib/views/settings.js +829 -0
- package/lib/views/tools-evals.js +514 -0
- package/lib/views/verifier-coverage.js +617 -0
- package/lib/workers/verifier-worker.js +52 -0
- package/package.json +123 -0
- package/widget/forge-chat.js +789 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI handler for `node lib/index.js run`.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* node lib/index.js run --eval <path> [--record] [--replay] [--suite <name>]
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFileSync, existsSync } from 'node:fs';
|
|
9
|
+
import { resolve } from 'node:path';
|
|
10
|
+
import { runEvalSuite } from './index.js';
|
|
11
|
+
|
|
12
|
+
/**
 * Parse the argument list of the `run` command.
 *
 * Recognised flags: `--eval <path>`, `--suite <name>`, `--record`, `--replay`.
 * A value-taking flag that is not followed by a non-empty value is ignored,
 * and unknown arguments are skipped silently.
 *
 * @param {string[]} args - arguments following the `run` sub-command
 * @returns {{record: boolean, replay: boolean, evalPath: string|null, suite: string|null}}
 */
function parseArgs(args) {
  const parsed = { record: false, replay: false, evalPath: null, suite: null };
  let idx = 0;
  while (idx < args.length) {
    const flag = args[idx];
    const value = args[idx + 1];
    switch (flag) {
      case '--eval':
        if (value) {
          parsed.evalPath = value;
          idx += 1; // the value has been consumed
        }
        break;
      case '--suite':
        if (value) {
          parsed.suite = value;
          idx += 1; // the value has been consumed
        }
        break;
      case '--record':
        parsed.record = true;
        break;
      case '--replay':
        parsed.replay = true;
        break;
      default:
        break;
    }
    idx += 1;
  }
  return parsed;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Read `forge.config.json` from the current working directory.
 *
 * A missing file yields an empty config. A file that cannot be read or
 * parsed also yields an empty config, after a warning is written to stderr.
 *
 * @returns {object} the parsed configuration, or `{}`
 */
function loadConfig() {
  const file = resolve(process.cwd(), 'forge.config.json');
  if (existsSync(file)) {
    try {
      const text = readFileSync(file, 'utf8');
      return JSON.parse(text);
    } catch (err) {
      console.error(`Warning: forge.config.json is invalid JSON: ${err.message}`);
    }
  }
  return {};
}
|
|
33
|
+
|
|
34
|
+
/**
 * Build the function the eval runner uses to talk to the agent over HTTP.
 *
 * Reads `config.agent`:
 *   - `endpoint`    (required) URL the request is sent to
 *   - `method`      HTTP method, default `POST`
 *   - `headers`     extra headers merged over `Content-Type: application/json`
 *   - `inputField`  request body key that carries the message, default `message`
 *   - `outputField` response body key that carries the reply, default `text`
 *
 * The request body is always sent, so a method that forbids a body (GET/HEAD)
 * makes `fetch` fail and surfaces as "Agent request failed".
 *
 * @param {object} config - parsed forge.config.json
 * @returns {Promise<(message: string) => Promise<{responseText: string, toolsCalled: string[], latencyMs: number}>>}
 * @throws {Error} when `agent.endpoint` is not configured
 */
async function buildAgentFn(config) {
  const agentConfig = config.agent ?? {};
  const endpoint = agentConfig.endpoint;
  if (!endpoint) {
    throw new Error('No agent.endpoint configured in forge.config.json.\nAdd: { "agent": { "endpoint": "http://localhost:8001/agent-api/chat-sync" } }');
  }

  const method = agentConfig.method ?? 'POST';
  const headers = { 'Content-Type': 'application/json', ...(agentConfig.headers ?? {}) };
  const inputField = agentConfig.inputField ?? 'message';
  const outputField = agentConfig.outputField ?? 'text';

  return async (message) => {
    const t0 = Date.now();
    const body = JSON.stringify({ [inputField]: message });

    let res;
    try {
      res = await fetch(endpoint, { method, headers, body });
    } catch (err) {
      // Keep the underlying network error reachable for debugging.
      throw new Error(`Agent request failed: ${err.message}`, { cause: err });
    }
    if (!res.ok) throw new Error(`Agent returned ${res.status}`);

    let data;
    try {
      data = await res.json();
    } catch (err) {
      throw new Error(`Agent at ${endpoint} returned non-JSON response (status ${res.status})`, { cause: err });
    }
    const latencyMs = Date.now() - t0;

    // A valid JSON body may still be `null` or a primitive; treat it as an
    // empty payload instead of crashing on the property lookups below.
    const payload = data !== null && typeof data === 'object' ? data : {};
    return {
      responseText: payload[outputField] ?? '',
      toolsCalled: payload.toolsCalled ?? [],
      latencyMs,
    };
  };
}
|
|
70
|
+
|
|
71
|
+
/**
 * Execute the `run` CLI command: run one eval file against the configured
 * agent, print a report, and terminate the process.
 *
 * Exit code is 1 when arguments are missing, the eval file does not exist,
 * the agent is not configured, the run throws, a gate fails, or any case
 * fails; otherwise 0.
 *
 * @param {string[]} args - arguments following the `run` sub-command
 * @returns {Promise<void>} never resolves normally; the process exits first
 */
export async function runCli(args) {
  const cli = parseArgs(args);

  if (!cli.evalPath) {
    console.error('Usage: node lib/index.js run --eval <path> [--record] [--replay] [--suite <name>]');
    process.exit(1);
  }

  const evalPath = resolve(process.cwd(), cli.evalPath);
  if (!existsSync(evalPath)) {
    console.error(`Eval file not found: ${evalPath}`);
    process.exit(1);
  }

  // Settings from forge.config.json, with the runner's defaults.
  const config = loadConfig();
  const gates = config.gates ?? {};
  const fixturesDir = resolve(process.cwd(), config.fixtures?.dir ?? '.forge-fixtures');
  const ttlDays = config.fixtures?.ttlDays ?? 30;

  console.log(`\nRunning evals: ${cli.evalPath}`);
  if (cli.record) console.log(' [record mode] Saving fixtures');
  if (cli.replay) console.log(' [replay mode] Using cached fixtures where available');

  let agentFn;
  try {
    agentFn = await buildAgentFn(config);
  } catch (err) {
    console.error(`\nConfiguration error: ${err.message}`);
    process.exit(1);
  }

  let summary;
  try {
    const runOptions = {
      record: cli.record,
      replay: cli.replay,
      fixturesDir,
      ttlDays,
      gates,
      suiteName: cli.suite,
    };
    summary = await runEvalSuite(evalPath, agentFn, runOptions);
  } catch (err) {
    console.error(`\nEval run failed: ${err.message}`);
    process.exit(1);
  }

  // Headline: pass count plus whichever optional metrics are non-zero.
  const { passed, failed, skipped, passRate, p95LatencyMs, totalCost, suiteName } = summary;
  const headline = [
    `\n${suiteName ? `[${suiteName}] ` : ''}${failed === 0 ? '✓' : '✗'} ${passed}/${passed + failed} passed (${(passRate * 100).toFixed(1)}%)`,
  ];
  if (skipped > 0) headline.push(`, ${skipped} skipped`);
  if (p95LatencyMs > 0) headline.push(`, p95 latency: ${p95LatencyMs}ms`);
  if (totalCost > 0) headline.push(`, est. cost: $${totalCost.toFixed(6)}`);
  console.log(headline.join(''));

  // Failing cases are listed whether or not gates are configured.
  const failing = summary.cases.filter((c) => c.status === 'failed');
  if (failing.length > 0) {
    console.log('\nFailing cases:');
    failing.forEach((c) => {
      console.log(` ✗ ${c.id ?? '(unnamed)'}: ${c.reason}`);
    });
  }

  const gateReport = summary.gates;
  if (gateReport) {
    console.log('\nGate results:');
    for (const { pass, gate, actual, threshold } of gateReport.results) {
      console.log(`${pass ? ' ✓' : ' ✗'} ${gate}: ${actual} (threshold: ${threshold})`);
    }
    if (gateReport.pass) {
      console.log('\n✓ All gates passed');
    } else {
      console.log('\n✗ Gates failed — build should be blocked');
      process.exit(1);
    }
  }

  if (failing.length > 0) {
    process.exit(1);
  }

  process.exit(0);
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
// Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Per-million-token costs for common models.
 * Format: { input: $/M tokens, output: $/M tokens }
 */
const MODEL_COSTS = {
  'claude-opus-4-6': { input: 15, output: 75 },
  'claude-sonnet-4-6': { input: 3, output: 15 },
  'claude-haiku-4-5-20251001': { input: 0.8, output: 4 },
  'claude-3-5-sonnet-20241022': { input: 3, output: 15 },
  'claude-3-5-haiku-20241022': { input: 0.8, output: 4 },
  'claude-3-opus-20240229': { input: 15, output: 75 },
  'gpt-4o': { input: 2.5, output: 10 },
  'gpt-4o-mini': { input: 0.15, output: 0.6 },
  'gemini-2.0-flash': { input: 0.1, output: 0.4 },
  'gemini-1.5-pro': { input: 1.25, output: 5 },
  'deepseek-chat': { input: 0.27, output: 1.1 },
};

/** Pricing used for models missing from MODEL_COSTS (same as claude-sonnet-4-6). */
const DEFAULT_MODEL_COST = { input: 3, output: 15 };

const TOKENS_PER_MILLION = 1_000_000;

/**
 * Look up pricing for a model, falling back to DEFAULT_MODEL_COST.
 *
 * Only own keys of MODEL_COSTS count: a plain property read would also
 * resolve inherited names such as `toString` or `constructor`, bypass the
 * fallback, and produce NaN costs.
 *
 * @param {string} modelName
 * @returns {{input: number, output: number}}
 */
function lookupModelCost(modelName) {
  return Object.hasOwn(MODEL_COSTS, modelName) ? MODEL_COSTS[modelName] : DEFAULT_MODEL_COST;
}

/**
 * Compute the actual cost of a single LLM call from observed token counts.
 *
 * @param {number} inputTokens
 * @param {number} outputTokens
 * @param {string} modelName - unknown models are priced at the default rate
 * @returns {number} cost in USD
 */
export function computeActualCost(inputTokens, outputTokens, modelName) {
  const costs = lookupModelCost(modelName);
  return (inputTokens / TOKENS_PER_MILLION) * costs.input + (outputTokens / TOKENS_PER_MILLION) * costs.output;
}

/**
 * Estimate the cost of running an eval suite.
 * Assumes ~500 input tokens and ~200 output tokens per call unless overridden.
 *
 * @param {number} caseCount - number of eval cases
 * @param {number} trialCount - number of trials per case
 * @param {string} modelName - model name (used for cost lookup; unknown models use the default rate)
 * @param {{avgInputTokens?: number, avgOutputTokens?: number}} [options]
 * @returns {{totalCalls: number, estimatedCostUsd: number, perCallCostUsd: number, modelName: string, summary: string}}
 */
export function estimateCost(caseCount, trialCount, modelName, options = {}) {
  const { avgInputTokens = 500, avgOutputTokens = 200 } = options;
  const totalCalls = caseCount * trialCount;

  const costs = lookupModelCost(modelName);

  const totalInputTokens = totalCalls * avgInputTokens;
  const totalOutputTokens = totalCalls * avgOutputTokens;

  const estimatedCostUsd =
    (totalInputTokens / TOKENS_PER_MILLION) * costs.input +
    (totalOutputTokens / TOKENS_PER_MILLION) * costs.output;

  // Guard the division so an empty suite reports 0 rather than NaN.
  const perCallCostUsd = totalCalls > 0 ? estimatedCostUsd / totalCalls : 0;

  const summary = `${totalCalls} calls × ${modelName} ≈ $${estimatedCostUsd.toFixed(4)} USD (est. ${avgInputTokens}in/${avgOutputTokens}out tokens/call)`;

  return {
    totalCalls,
    estimatedCostUsd,
    perCallCostUsd,
    modelName,
    summary,
  };
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
// Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Check a run summary against the configured quality gates.
 *
 * Each gate whose threshold is not `undefined` produces one result entry,
 * in the fixed order passRate, maxCost, p95LatencyMs:
 *   - passRate:     summary.passRate must be >= the threshold
 *   - maxCost:      summary.totalCost must be <= the threshold
 *   - p95LatencyMs: summary.p95LatencyMs must be <= the threshold
 *
 * With no gates configured the overall result passes with an empty list.
 *
 * @param {{passRate: number, totalCost: number, p95LatencyMs: number, totalCases: number}} summary
 * @param {{passRate?: number, maxCost?: number, p95LatencyMs?: number}} gates
 * @returns {{pass: boolean, results: Array<{gate: string, threshold: number, actual: number, pass: boolean}>}}
 */
export function evaluateGates(summary, gates) {
  const atLeast = (actual, limit) => actual >= limit;
  const atMost = (actual, limit) => actual <= limit;

  // gate name → summary field it is measured against, and the comparison.
  const specs = [
    { gate: 'passRate', field: 'passRate', meets: atLeast },
    { gate: 'maxCost', field: 'totalCost', meets: atMost },
    { gate: 'p95LatencyMs', field: 'p95LatencyMs', meets: atMost },
  ];

  const results = [];
  for (const { gate, field, meets } of specs) {
    const threshold = gates[gate];
    if (threshold === undefined) continue;
    const actual = summary[field];
    results.push({ gate, threshold, actual, pass: meets(actual, threshold) });
  }

  // `every` is true for an empty list, so an ungated run passes.
  return { pass: results.every((entry) => entry.pass), results };
}
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Programmatic eval runner API.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { runEvalSuite } from './lib/runner/index.js';
|
|
6
|
+
* const summary = await runEvalSuite('./evals/my-tool.golden.json', async (message) => {
|
|
7
|
+
* const res = await fetch('http://localhost:8001/agent-api/chat', { ... });
|
|
8
|
+
* return { responseText: ..., toolsCalled: [], latencyMs: ... };
|
|
9
|
+
* });
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { readFile } from 'node:fs/promises';
|
|
13
|
+
import { runChecks } from '../checks/run-checks.js';
|
|
14
|
+
import { checkAdapter, checkResponseContainsAnyGroups, checkToolsAcceptable } from '../checks/check-adapter.js';
|
|
15
|
+
import { evaluateGates } from './gate.js';
|
|
16
|
+
import { writeFixture, readFixture, sortKeysDeep } from '../fixtures/fixture-store.js';
|
|
17
|
+
|
|
18
|
+
/**
 * Run an eval suite programmatically.
 *
 * Cases are executed one at a time, in file order. A case without
 * `input.message` is counted as skipped. With `replay`, a fixture hit is
 * checked instead of calling the agent; fixture hits contribute neither
 * latency nor cost. With `record`, every live agent response is written as a
 * fixture (a failed write only logs a warning).
 *
 * An agent call that throws, or resolves to `null`/`undefined`, fails that
 * single case; it does not abort the run.
 *
 * @param {string} evalFilePath - path to eval JSON file (a JSON array of cases)
 * @param {(message: string) => Promise<{responseText: string, toolsCalled: string[], latencyMs?: number, cost?: number}>} agentFn
 * @param {{
 *   record?: boolean,
 *   replay?: boolean,
 *   fixturesDir?: string,
 *   ttlDays?: number,
 *   gates?: {passRate?: number, maxCost?: number, p95LatencyMs?: number},
 *   suiteName?: string,
 * }} [opts]
 * @returns {Promise<{total: number, passed: number, failed: number, skipped: number, passRate: number, p95LatencyMs: number, totalCost: number, cases: object[], suiteName?: string, gates?: object}>}
 * @throws {Error} when the eval file cannot be read or parsed, or is not a JSON array
 */
export async function runEvalSuite(evalFilePath, agentFn, opts = {}) {
  const { record = false, replay = false, fixturesDir = '.forge-fixtures', ttlDays = 30, gates = {}, suiteName } = opts;

  // Load eval cases
  let cases;
  try {
    const raw = await readFile(evalFilePath, 'utf8');
    cases = JSON.parse(raw);
  } catch (err) {
    throw new Error(`Failed to load eval file ${evalFilePath}: ${err.message}`, { cause: err });
  }

  if (!Array.isArray(cases)) {
    throw new Error(`Eval file must contain a JSON array of cases`);
  }

  let passed = 0;
  let failed = 0;
  let skipped = 0;
  let totalCost = 0;
  const caseResults = [];
  const allLatencies = [];

  for (const evalCase of cases) {
    // `evalCase?.` tolerates a null entry in the array: it is skipped like
    // any other case without a message instead of aborting the run.
    const message = evalCase?.input?.message ?? '';
    if (!message) {
      skipped++;
      caseResults.push({ id: evalCase?.id, status: 'skipped', reason: 'no input message' });
      continue;
    }

    // Fixture identity is shared by replay and record, and only computed
    // when one of them is enabled.
    const fixtureKey = record || replay
      ? {
        caseId: evalCase.id ?? message.slice(0, 40),
        configHash: JSON.stringify(sortKeysDeep(evalCase.expect ?? {})),
      }
      : null;

    // Fixture replay
    if (replay) {
      const hit = await readFixture(fixturesDir, fixtureKey.caseId, fixtureKey.configHash, { ttlDays });
      if (hit.status === 'hit') {
        const { responseText, toolsCalled } = hit.output;
        const failures = checkCase(evalCase, { responseText, toolsCalled });
        const casePassed = failures.length === 0;
        if (casePassed) passed++; else failed++;
        // Note: fixture hits do not contribute latency or cost — p95LatencyMs and totalCost
        // reflect live-only cases. Latency/cost gates trivially pass on fully-cached runs.
        caseResults.push({ id: evalCase.id, status: casePassed ? 'passed' : 'failed', reason: failures.join('; ') || null, fromFixture: true });
        continue;
      }
    }

    // Call agent
    let result;
    try {
      result = await agentFn(message);
    } catch (err) {
      failed++;
      caseResults.push({ id: evalCase.id, status: 'failed', reason: `Agent error: ${err.message}` });
      continue;
    }

    // Destructuring null/undefined would throw outside the try/catch above
    // and abort the whole suite; record it as a failure of this case instead.
    if (result == null) {
      failed++;
      caseResults.push({ id: evalCase.id, status: 'failed', reason: 'Agent error: agent function returned no result' });
      continue;
    }

    const { responseText = '', toolsCalled = [], latencyMs, cost } = result;

    // Record fixture
    if (record) {
      const { caseId, configHash } = fixtureKey;
      await writeFixture(fixturesDir, caseId, configHash, { responseText, toolsCalled }).catch((err) => {
        console.warn(`[forge] Failed to write fixture for case "${caseId}": ${err.message}`);
      });
    }

    if (latencyMs !== undefined) allLatencies.push(latencyMs);
    if (cost !== undefined) totalCost += cost;

    const failures = checkCase(evalCase, { responseText, toolsCalled, latencyMs, cost });
    const casePassed = failures.length === 0;
    if (casePassed) passed++; else failed++;
    caseResults.push({ id: evalCase.id, status: casePassed ? 'passed' : 'failed', reason: failures.join('; ') || null });
  }

  const total = cases.length;
  const ran = passed + failed;
  // Skipped cases are excluded from the pass rate.
  const passRate = ran > 0 ? passed / ran : 0;

  // Compute p95 latency (0 when no live case reported a latency).
  const sortedLatencies = [...allLatencies].sort((a, b) => a - b);
  const p95Index = Math.floor((sortedLatencies.length - 1) * 0.95);
  const p95LatencyMs = sortedLatencies[p95Index] ?? 0;

  const summary = { passRate, totalCost, p95LatencyMs, totalCases: total };

  // Gate evaluation — only gates with a non-null threshold are active.
  let gateResult;
  if (Object.keys(gates).some(k => gates[k] != null)) {
    const activeGates = Object.fromEntries(Object.entries(gates).filter(([, v]) => v != null));
    gateResult = evaluateGates(summary, activeGates);
  }

  return {
    total,
    passed,
    failed,
    skipped,
    passRate,
    p95LatencyMs,
    totalCost,
    cases: caseResults,
    ...(suiteName ? { suiteName } : {}),
    ...(gateResult ? { gates: gateResult } : {}),
  };
}
|
|
140
|
+
|
|
141
|
+
/**
 * Evaluate every assertion of one eval case against an agent response.
 *
 * Runs the adapted check set first, then the two expectations handled here
 * directly: `expect.responseContainsAny` and `expect.toolsAcceptable`.
 *
 * @param {object} evalCase
 * @param {{responseText: string, toolsCalled: string[], latencyMs?: number, cost?: number}} meta
 * @returns {string[]} one reason per failed assertion; empty when the case passes
 */
function checkCase(evalCase, { responseText, toolsCalled, latencyMs, cost }) {
  const adapted = checkAdapter(evalCase, { toolsCalled, responseText, latencyMs, cost });
  const { checks } = runChecks(adapted);

  // A failing check without its own reason is reported by name.
  const failures = Object.entries(checks)
    .filter(([, outcome]) => !outcome.pass)
    .map(([name, outcome]) => outcome.reason ?? `${name} failed`);

  const { responseContainsAny, toolsAcceptable } = evalCase.expect ?? {};

  if (responseContainsAny?.length) {
    const groupsOutcome = checkResponseContainsAnyGroups(responseText, responseContainsAny);
    if (!groupsOutcome.pass) failures.push(groupsOutcome.reason);
  }

  if (toolsAcceptable !== undefined) {
    const toolsOutcome = checkToolsAcceptable(toolsCalled, toolsAcceptable);
    if (!toolsOutcome.pass) failures.push(toolsOutcome.reason);
  }

  return failures;
}
|
package/lib/sidecar.d.ts
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import type { Server } from 'http';
|
|
2
|
+
import type { SidecarConfig } from './config-schema.js';
|
|
3
|
+
import type { AuthResult, AuthConfig, Authenticator } from './auth.js';
|
|
4
|
+
import type { ConversationMessage, SessionSummary, ConversationStore } from './conversation-store.js';
|
|
5
|
+
import type { ReactEvent, ReactLoopParams } from './react-engine.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Runtime options accepted as the second argument of `createSidecar`.
 * Every field is optional; the defaults noted below are the ones applied by
 * `createSidecar` in sidecar.js.
 */
export interface SidecarOptions {
|
|
8
|
+
/** Port to listen on. Defaults to 8001. */
port?: number;
|
|
9
|
+
/** Bind address. Defaults to '0.0.0.0'. */
host?: string;
|
|
10
|
+
/** SQLite database path. Defaults to ':memory:'. */
dbPath?: string;
|
|
11
|
+
/** Environment variables handed to the context builder. Defaults to process.env. */
env?: Record<string, string>;
|
|
12
|
+
/** Start listening before `createSidecar` resolves. Defaults to true. */
autoListen?: boolean;
|
|
13
|
+
/** Start the background drift monitor. Defaults to false. */
enableDrift?: boolean;
|
|
14
|
+
/** Custom widget directory; forwarded to the router only when set. */
widgetDir?: string;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/**
 * Shared service container returned by `buildSidecarContext` and passed to
 * `createSidecarRouter`; also exposed as `SidecarInstance.ctx`.
 */
export interface SidecarContext {
|
|
18
|
+
auth: Authenticator;
|
|
19
|
+
promptStore: object;
|
|
20
|
+
preferenceStore: object;
|
|
21
|
+
conversationStore: ConversationStore;
|
|
22
|
+
/** `createSidecar` calls `seedFromConfig()` on this once at startup. */
agentRegistry: object;
|
|
23
|
+
/** May be null; when it has a `destroy` method, sidecar `close()` calls it. */
verifierRunner: object | null;
|
|
24
|
+
hitlEngine: object | null;
|
|
25
|
+
rateLimiter: object | null;
|
|
26
|
+
db: object;
|
|
27
|
+
config: SidecarConfig;
|
|
28
|
+
env: Record<string, string>;
|
|
29
|
+
configPath?: string;
|
|
30
|
+
/**
 * Additional untyped entries. Sidecar `close()` reads `_redisClient` and
 * `_pgPool` through this signature and shuts them down when present.
 */
[key: string]: unknown;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/** Handle resolved by `createSidecar`. */
export interface SidecarInstance {
|
|
34
|
+
/** The Node HTTP server; already listening unless `autoListen` was false. */
server: Server;
|
|
35
|
+
/** The service container the request router was built from. */
ctx: SidecarContext;
|
|
36
|
+
/**
 * Stop the server and release the drift monitor, verifier runner, optional
 * Redis/Postgres clients and the database. Resolves after teardown, or after
 * a forced teardown if the server has not closed within 2 seconds.
 */
close(): Promise<void>;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
export function createSidecar(config?: Partial<SidecarConfig>, options?: SidecarOptions): Promise<SidecarInstance>;
|
|
40
|
+
|
|
41
|
+
// Advanced consumers
|
|
42
|
+
export function buildSidecarContext(config: SidecarConfig, db: object, env?: Record<string, string>, opts?: object): Promise<SidecarContext>;
|
|
43
|
+
export function createSidecarRouter(ctx: SidecarContext, opts?: object): (req: object, res: object) => void;
|
|
44
|
+
|
|
45
|
+
export { createAuth } from './auth.js';
|
|
46
|
+
export type { AuthResult, AuthConfig, Authenticator } from './auth.js';
|
|
47
|
+
|
|
48
|
+
export { reactLoop } from './react-engine.js';
|
|
49
|
+
export type { ReactEvent, ReactLoopParams, TextEvent, TextDeltaEvent, ToolCallEvent, ToolResultEvent, ToolWarningEvent, HitlEvent, ErrorEvent, DoneEvent } from './react-engine.js';
|
|
50
|
+
|
|
51
|
+
export { mergeDefaults, validateConfig, CONFIG_DEFAULTS } from './config-schema.js';
|
|
52
|
+
export type { SidecarConfig, AgentConfig, RateLimitConfig, VerificationConfig } from './config-schema.js';
|
|
53
|
+
|
|
54
|
+
export { makeConversationStore } from './conversation-store.js';
|
|
55
|
+
export type { ConversationMessage, SessionSummary, ConversationStore } from './conversation-store.js';
|
|
56
|
+
|
|
57
|
+
export function getDb(path: string): object;
|
|
58
|
+
export function initSSE(res: object): { write(event: string, data: unknown): void; close(): void };
|
|
59
|
+
export function makePromptStore(config: object, db: object): object;
|
|
60
|
+
export function makePreferenceStore(config: object, db: object): object;
|
|
61
|
+
export function makeHitlEngine(config: object, db: object, redis?: object, pgPool?: object): object;
|
|
62
|
+
export function makeAgentRegistry(config: object, db: object): object;
|
|
63
|
+
|
|
64
|
+
/**
 * Type declaration for the `AgentRegistry` class that sidecar.js re-exports
 * from './agent-registry.js'.
 * NOTE(review): the implementation is not visible from this file — method
 * semantics below are declared signatures only; confirm against
 * agent-registry.js before relying on them.
 */
export class AgentRegistry {
|
|
65
|
+
constructor(config: object, db: object);
|
|
66
|
+
resolveAgent(agentId: string | null): Promise<object | null>;
|
|
67
|
+
getAgent(agentId: string): Promise<object | null>;
|
|
68
|
+
getAllAgents(): Promise<object[]>;
|
|
69
|
+
upsertAgent(agent: object): Promise<void>;
|
|
70
|
+
setDefault(agentId: string): Promise<void>;
|
|
71
|
+
deleteAgent(agentId: string): Promise<void>;
|
|
72
|
+
/** Called once by `createSidecar` right after the context is built. */
seedFromConfig(): Promise<void>;
|
|
73
|
+
filterTools(tools: object[]): object[];
|
|
74
|
+
buildAgentConfig(config: object, agent: object | null): object;
|
|
75
|
+
resolveSystemPrompt(agent: object | null, promptStore: object, config: object): Promise<string>;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/**
 * Type declaration for the `VerifierRunner` class that sidecar.js re-exports
 * from './verifier-runner.js'.
 * NOTE(review): the implementation is not visible from this file — confirm
 * method semantics against verifier-runner.js.
 */
export class VerifierRunner {
|
|
79
|
+
constructor(db: object, config?: object, workerPool?: object);
|
|
80
|
+
loadFromDb(db: object): Promise<void>;
|
|
81
|
+
/** Resolves to one entry per verifier, each with a 'pass' | 'warn' | 'block' outcome. */
run(toolName: string, args: object, result: unknown): Promise<Array<{ outcome: 'pass' | 'warn' | 'block'; message: string | null; verifier: string }>>;
|
|
82
|
+
/** Invoked by sidecar `close()` during teardown. */
destroy(): void;
|
|
83
|
+
}
|
package/lib/sidecar.js
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sidecar — Library entry point for agent-tool-forge.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { createSidecar } from 'agent-tool-forge';
|
|
6
|
+
* const sidecar = await createSidecar({ ... }, { port: 8001 });
|
|
7
|
+
* // sidecar.server, sidecar.ctx, sidecar.close()
|
|
8
|
+
*
|
|
9
|
+
* Advanced:
|
|
10
|
+
* import { buildSidecarContext, createSidecarRouter, mergeDefaults, getDb } from 'agent-tool-forge';
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { createServer as createHttpServer } from 'http';
|
|
14
|
+
import { resolve } from 'path';
|
|
15
|
+
import { getDb } from './db.js';
|
|
16
|
+
import { mergeDefaults, validateConfig } from './config-schema.js';
|
|
17
|
+
import { buildSidecarContext, createSidecarRouter } from './forge-service.js';
|
|
18
|
+
import { createDriftMonitor } from './drift-background.js';
|
|
19
|
+
|
|
20
|
+
/**
 * Create a fully configured sidecar instance.
 *
 * Startup order: merge defaults → validate → open database → build context →
 * seed agents → build router and HTTP server → optional drift monitor →
 * optional listen. If any step after the database is opened fails, everything
 * acquired so far is released before the error is rethrown, because the
 * caller never receives an instance it could close.
 *
 * @param {object} config — sidecar configuration (auth, agents, model, etc.)
 * @param {object} [options]
 * @param {number} [options.port=8001] — port to listen on
 * @param {string} [options.host='0.0.0.0'] — bind address
 * @param {string} [options.dbPath=':memory:'] — SQLite database path
 * @param {Record<string, string>} [options.env] — environment variables (defaults to process.env)
 * @param {boolean} [options.autoListen=true] — start listening immediately
 * @param {boolean} [options.enableDrift=false] — start background drift monitor
 * @param {string} [options.widgetDir] — custom widget directory
 * @returns {Promise<{ server: import('http').Server, ctx: object, close: () => Promise<void> }>}
 * @throws {Error} when the merged config is invalid, or when startup fails
 */
export async function createSidecar(config = {}, options = {}) {
  const {
    port = 8001,
    host = '0.0.0.0',
    dbPath = ':memory:',
    env = process.env,
    autoListen = true,
    enableDrift = false,
    widgetDir,
  } = options;

  // Merge defaults first so validateConfig sees a fully-populated object.
  // Validating the raw user config risks false positives on missing-but-defaulted
  // fields (e.g. auth.mode absent in raw → would fail "must be one of" if
  // validateConfig checked before defaults were applied).
  const merged = mergeDefaults(config);
  const { valid, errors } = validateConfig(merged);
  if (!valid) {
    throw new Error(`Invalid sidecar config: ${errors.join('; ')}`);
  }

  // Initialize database with WAL mode
  const db = getDb(dbPath);
  try {
    db.pragma('journal_mode = WAL');
  } catch {
    // WAL not supported on all platforms — continue without it
  }

  // Filled in during startup; close() tolerates any of them still being null
  // so it can also clean up after a partially completed startup.
  let ctx = null;
  let server = null;
  let driftMonitor = null;
  // Memoised so repeated close() calls share one teardown and all resolve
  // only once it has actually finished.
  let closePromise = null;

  // Best-effort release of every connection; one failing step must not stop
  // the remaining ones from being attempted.
  async function teardownConnections() {
    try { if (ctx?.verifierRunner?.destroy) ctx.verifierRunner.destroy(); } catch { /* ignore */ }
    try { if (ctx?._redisClient) await ctx._redisClient.quit(); } catch { /* ignore */ }
    try { if (ctx?._pgPool) await ctx._pgPool.end(); } catch { /* ignore */ }
    try { db.close(); } catch { /* already closed */ }
  }

  // close() tears down everything cleanly
  function close() {
    if (closePromise) return closePromise;

    if (driftMonitor) {
      driftMonitor.stop();
      driftMonitor = null;
    }

    closePromise = new Promise((res) => {
      let settled = false;
      let timer;
      const finish = async () => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        await teardownConnections();
        res();
      };

      if (!server) {
        // Startup failed before the HTTP server existed.
        finish();
        return;
      }

      server.close(() => finish());
      // Idle keep-alive sockets would otherwise hold the server open until
      // the timeout below on Node versions whose close() does not drop them.
      server.closeIdleConnections?.();

      // Force-resolve after 2s if connections linger — do NOT call process.exit()
      // in a library module as it would kill the host application.
      timer = setTimeout(() => {
        if (settled) return;
        console.error('[forge-sidecar] close() timed out after 2s — forcing resolve');
        finish();
      }, 2000);
      // Ensure the timeout doesn't keep the event loop alive if finish() runs first
      if (timer.unref) timer.unref();
    });
    return closePromise;
  }

  try {
    // Build sidecar context (async — may create Redis/Postgres clients)
    ctx = await buildSidecarContext(merged, db, env);

    // Seed agents from config
    await ctx.agentRegistry.seedFromConfig();

    // Build request handler
    const routerOpts = {};
    if (widgetDir) routerOpts.widgetDir = widgetDir;
    const router = createSidecarRouter(ctx, routerOpts);

    // Create HTTP server
    server = createHttpServer(router);

    // Optional drift monitor
    if (enableDrift) {
      driftMonitor = createDriftMonitor(merged, db);
      driftMonitor.start();
    }

    // Optionally start listening
    if (autoListen) {
      await new Promise((res, rej) => {
        // Use once() so the error listener is removed after firing and doesn't
        // become a ghost listener that fires on unrelated future errors.
        server.once('error', rej);
        server.listen(port, host, () => {
          // Remove the one-time error listener if listen succeeded, so it doesn't
          // linger as a ghost listener on the now-live server.
          server.removeListener('error', rej);
          res();
        });
      });
    }
  } catch (err) {
    // Release whatever was acquired; the startup error is the one to report,
    // so a secondary cleanup failure is deliberately not allowed to mask it.
    try {
      await close();
    } catch { /* keep the original startup error */ }
    throw err;
  }

  return { server, ctx, close };
}
|
|
148
|
+
|
|
149
|
+
// Re-exports for advanced consumers
|
|
150
|
+
export { buildSidecarContext, createSidecarRouter } from './forge-service.js';
|
|
151
|
+
export { createAuth } from './auth.js';
|
|
152
|
+
export { reactLoop } from './react-engine.js';
|
|
153
|
+
export { mergeDefaults, validateConfig, CONFIG_DEFAULTS } from './config-schema.js';
|
|
154
|
+
export { getDb } from './db.js';
|
|
155
|
+
export { initSSE } from './sse.js';
|
|
156
|
+
export { VerifierRunner } from './verifier-runner.js';
|
|
157
|
+
export { makePromptStore } from './prompt-store.js';
|
|
158
|
+
export { makePreferenceStore } from './preference-store.js';
|
|
159
|
+
export { makeConversationStore } from './conversation-store.js';
|
|
160
|
+
export { makeHitlEngine } from './hitl-engine.js';
|
|
161
|
+
export { makeAgentRegistry, AgentRegistry } from './agent-registry.js';
|