agent-tool-forge 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +209 -0
- package/lib/agent-registry.js +170 -0
- package/lib/api-client.js +792 -0
- package/lib/api-loader.js +260 -0
- package/lib/auth.d.ts +25 -0
- package/lib/auth.js +158 -0
- package/lib/checks/check-adapter.js +172 -0
- package/lib/checks/compose.js +42 -0
- package/lib/checks/content-match.js +14 -0
- package/lib/checks/cost-budget.js +11 -0
- package/lib/checks/index.js +18 -0
- package/lib/checks/json-valid.js +15 -0
- package/lib/checks/latency.js +11 -0
- package/lib/checks/length-bounds.js +17 -0
- package/lib/checks/negative-match.js +14 -0
- package/lib/checks/no-hallucinated-numbers.js +63 -0
- package/lib/checks/non-empty.js +34 -0
- package/lib/checks/regex-match.js +12 -0
- package/lib/checks/run-checks.js +84 -0
- package/lib/checks/schema-match.js +26 -0
- package/lib/checks/tool-call-count.js +16 -0
- package/lib/checks/tool-selection.js +34 -0
- package/lib/checks/types.js +45 -0
- package/lib/comparison/compare.js +86 -0
- package/lib/comparison/format.js +104 -0
- package/lib/comparison/index.js +6 -0
- package/lib/comparison/statistics.js +59 -0
- package/lib/comparison/types.js +41 -0
- package/lib/config-schema.js +200 -0
- package/lib/config.d.ts +66 -0
- package/lib/conversation-store.d.ts +77 -0
- package/lib/conversation-store.js +443 -0
- package/lib/db.d.ts +6 -0
- package/lib/db.js +1112 -0
- package/lib/dep-check.js +99 -0
- package/lib/drift-background.js +61 -0
- package/lib/drift-monitor.js +187 -0
- package/lib/eval-runner.js +566 -0
- package/lib/fixtures/fixture-store.js +161 -0
- package/lib/fixtures/index.js +11 -0
- package/lib/forge-engine.js +982 -0
- package/lib/forge-eval-generator.js +417 -0
- package/lib/forge-file-writer.js +386 -0
- package/lib/forge-service-client.js +190 -0
- package/lib/forge-service.d.ts +4 -0
- package/lib/forge-service.js +655 -0
- package/lib/forge-verifier-generator.js +271 -0
- package/lib/handlers/admin.js +151 -0
- package/lib/handlers/agents.js +229 -0
- package/lib/handlers/chat-resume.js +334 -0
- package/lib/handlers/chat-sync.js +320 -0
- package/lib/handlers/chat.js +320 -0
- package/lib/handlers/conversations.js +92 -0
- package/lib/handlers/preferences.js +88 -0
- package/lib/handlers/tools-list.js +58 -0
- package/lib/hitl-engine.d.ts +60 -0
- package/lib/hitl-engine.js +261 -0
- package/lib/http-utils.js +92 -0
- package/lib/index.d.ts +20 -0
- package/lib/index.js +141 -0
- package/lib/init.js +636 -0
- package/lib/manual-entry.js +59 -0
- package/lib/mcp-server.js +252 -0
- package/lib/output-groups.js +54 -0
- package/lib/postgres-store.d.ts +31 -0
- package/lib/postgres-store.js +465 -0
- package/lib/preference-store.d.ts +47 -0
- package/lib/preference-store.js +79 -0
- package/lib/prompt-store.d.ts +42 -0
- package/lib/prompt-store.js +60 -0
- package/lib/rate-limiter.d.ts +30 -0
- package/lib/rate-limiter.js +104 -0
- package/lib/react-engine.d.ts +110 -0
- package/lib/react-engine.js +337 -0
- package/lib/runner/cli.js +156 -0
- package/lib/runner/cost-estimator.js +71 -0
- package/lib/runner/gate.js +46 -0
- package/lib/runner/index.js +165 -0
- package/lib/sidecar.d.ts +83 -0
- package/lib/sidecar.js +161 -0
- package/lib/sse.d.ts +15 -0
- package/lib/sse.js +30 -0
- package/lib/tools-scanner.js +91 -0
- package/lib/tui.js +253 -0
- package/lib/verifier-report.js +78 -0
- package/lib/verifier-runner.js +338 -0
- package/lib/verifier-scanner.js +70 -0
- package/lib/verifier-worker-pool.js +196 -0
- package/lib/views/chat.js +340 -0
- package/lib/views/endpoints.js +203 -0
- package/lib/views/eval-run.js +206 -0
- package/lib/views/forge-agent.js +538 -0
- package/lib/views/forge.js +410 -0
- package/lib/views/main-menu.js +275 -0
- package/lib/views/mediation.js +381 -0
- package/lib/views/model-compare.js +430 -0
- package/lib/views/model-comparison.js +333 -0
- package/lib/views/onboarding.js +470 -0
- package/lib/views/performance.js +237 -0
- package/lib/views/run-evals.js +205 -0
- package/lib/views/settings.js +829 -0
- package/lib/views/tools-evals.js +514 -0
- package/lib/views/verifier-coverage.js +617 -0
- package/lib/workers/verifier-worker.js +52 -0
- package/package.json +123 -0
- package/widget/forge-chat.js +789 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
export { contentMatch } from './content-match.js';
|
|
5
|
+
export { negativeMatch } from './negative-match.js';
|
|
6
|
+
export { toolSelection } from './tool-selection.js';
|
|
7
|
+
export { latency } from './latency.js';
|
|
8
|
+
export { jsonValid } from './json-valid.js';
|
|
9
|
+
export { schemaMatch } from './schema-match.js';
|
|
10
|
+
export { nonEmpty, DEFAULT_COP_OUT_PHRASES } from './non-empty.js';
|
|
11
|
+
export { lengthBounds } from './length-bounds.js';
|
|
12
|
+
export { regexMatch } from './regex-match.js';
|
|
13
|
+
export { toolCallCount } from './tool-call-count.js';
|
|
14
|
+
export { costBudget } from './cost-budget.js';
|
|
15
|
+
export { runChecks } from './run-checks.js';
|
|
16
|
+
export { noHallucinatedNumbers } from './no-hallucinated-numbers.js';
|
|
17
|
+
export { all, any, not } from './compose.js';
|
|
18
|
+
export { checkAdapter, checkResponseContainsAnyGroups, checkToolsAcceptable } from './check-adapter.js';
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Check that the response is a string holding syntactically valid JSON.
 *
 * Non-string input is rejected before parsing: JSON.parse converts its
 * argument to a string first, so values such as null, 42 or true would
 * otherwise be reported as valid JSON.
 *
 * @param {{responseText: string}} input
 * @returns {import('./types.js').EvalResult} `{ pass: true }` when the text parses,
 *   otherwise `{ pass: false, reason }` with a reason starting with "Invalid JSON: ".
 */
export function jsonValid({ responseText }) {
  if (typeof responseText !== 'string') {
    // typeof null is 'object'; name it explicitly so the reason is not misleading.
    const actualType = responseText === null ? 'null' : typeof responseText;
    return { pass: false, reason: `Invalid JSON: expected a string response, got ${actualType}` };
  }
  try {
    JSON.parse(responseText);
    return { pass: true };
  } catch (e) {
    return { pass: false, reason: `Invalid JSON: ${e.message}` };
  }
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Latency budget check: passes when the measured latency is less than or
 * equal to the allowed maximum.
 *
 * @param {{latencyMs: number, maxLatencyMs: number}} input
 * @returns {import('./types.js').EvalResult}
 */
export function latency({ latencyMs, maxLatencyMs }) {
  const withinBudget = latencyMs <= maxLatencyMs;
  return withinBudget
    ? { pass: true }
    : { pass: false, reason: `Latency ${latencyMs}ms exceeded max ${maxLatencyMs}ms` };
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Check the response length (String#length, i.e. UTF-16 code units) against
 * optional lower and upper bounds. A bound left undefined is not enforced;
 * the minimum is evaluated before the maximum.
 *
 * @param {{responseText: string, minLength?: number, maxLength?: number}} input
 * @returns {import('./types.js').EvalResult}
 */
export function lengthBounds({ responseText, minLength, maxLength }) {
  const { length } = responseText;

  const tooShort = minLength !== undefined && length < minLength;
  if (tooShort) {
    return { pass: false, reason: `Response length ${length} is below minimum ${minLength}` };
  }

  const tooLong = maxLength !== undefined && length > maxLength;
  if (tooLong) {
    return { pass: false, reason: `Response length ${length} exceeds maximum ${maxLength}` };
  }

  return { pass: true };
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Verify that none of the forbidden substrings occur in the response.
 * Both sides are lower-cased before comparison, so matching is case-insensitive.
 * Offending entries are reported in the reason exactly as they were supplied.
 *
 * @param {{responseText: string, mustNotContain: string[]}} input
 * @returns {import('./types.js').EvalResult}
 */
export function negativeMatch({ responseText, mustNotContain }) {
  const haystack = responseText.toLowerCase();

  const offenders = [];
  for (const forbidden of mustNotContain) {
    if (haystack.includes(forbidden.toLowerCase())) {
      offenders.push(forbidden);
    }
  }

  return offenders.length > 0
    ? { pass: false, reason: `Forbidden content found: ${offenders.join(', ')}` }
    : { pass: true };
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
// Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Pull every numeric literal out of a piece of text.
 *
 * Recognised shape: optional leading minus, integer digits, optional decimal
 * fraction, optional exponent (case-insensitive "e"). Grouping separators are
 * not understood, so "1,234" yields 1 and 234.
 *
 * @param {string} text
 * @returns {number[]} the parsed values in order of appearance
 */
function extractNumbers(text) {
  const numberPattern = /-?\d+(?:\.\d+)?(?:e[+-]?\d+)?/gi;

  const parsed = [];
  for (const token of text.match(numberPattern) ?? []) {
    const value = Number(token);
    if (!Number.isNaN(value)) {
      parsed.push(value);
    }
  }
  return parsed;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Collect every number reachable from a value.
 *
 * Numbers are returned as-is, strings are scanned with extractNumbers, and
 * arrays / plain objects are walked recursively (object keys are ignored,
 * only values are visited). Anything else contributes nothing.
 *
 * @param {unknown} value
 * @returns {number[]}
 */
function extractNumbersDeep(value) {
  switch (typeof value) {
    case 'number':
      return [value];
    case 'string':
      return extractNumbers(value);
    case 'object': {
      if (value === null) return [];
      const children = Array.isArray(value) ? value : Object.values(value);
      return children.flatMap(extractNumbersDeep);
    }
    default:
      return [];
  }
}
|
|
28
|
+
|
|
29
|
+
/**
 * Verify that every number mentioned in the response can be traced back to a
 * number found somewhere in the tool results.
 *
 * A response number counts as matched when its relative error against some
 * source number is at most `tolerance`. When the source number is 0 the
 * relative error is undefined, so the absolute value of the response number
 * is compared against `tolerance` instead.
 *
 * @param {{responseText: string, toolResults: unknown, tolerance?: number}} input
 * @returns {{pass: boolean, hallucinated: number[], matched: number[], reason?: string}}
 */
export function noHallucinatedNumbers({ responseText, toolResults, tolerance = 0.01 }) {
  const responseNumbers = extractNumbers(responseText);
  const sourceNumbers = extractNumbersDeep(toolResults);

  // True when `candidate` is close enough to at least one source number.
  const isGrounded = (candidate) => {
    for (const src of sourceNumbers) {
      if (src === 0) {
        if (candidate === 0 || Math.abs(candidate) <= tolerance) return true;
      } else if (Math.abs(candidate - src) / Math.abs(src) <= tolerance) {
        return true;
      }
    }
    return false;
  };

  const matched = [];
  const hallucinated = [];
  for (const candidate of responseNumbers) {
    (isGrounded(candidate) ? matched : hallucinated).push(candidate);
  }

  if (hallucinated.length === 0) {
    return { pass: true, hallucinated: [], matched };
  }

  return {
    pass: false,
    hallucinated,
    matched,
    reason: `Hallucinated numbers not found in tool results: ${hallucinated.join(', ')}`,
  };
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Phrases that mark a response as a refusal / non-answer when nonEmpty is
 * called without an explicit `copOutPhrases` list.
 */
export const DEFAULT_COP_OUT_PHRASES = [
  "i'm sorry",
  "i cannot",
  "i can't",
  "i don't know",
  "i am not able to",
  "as an ai",
  "as a language model",
  "i don't have access",
  "i don't have information",
  "i'm not able to",
  "i am unable to",
  "unfortunately, i",
  "i apologize",
];

/**
 * Check that the response carries actual content: it must not be missing or
 * whitespace-only, and it must not contain any cop-out phrase
 * (compared case-insensitively). Only the first matching phrase is reported.
 *
 * @param {{responseText: string, copOutPhrases?: string[]}} input
 * @returns {import('./types.js').EvalResult}
 */
export function nonEmpty({ responseText, copOutPhrases = DEFAULT_COP_OUT_PHRASES }) {
  const isBlank = !responseText || responseText.trim().length === 0;
  if (isBlank) {
    return { pass: false, reason: 'Response is empty' };
  }

  const normalized = responseText.toLowerCase();
  const offending = copOutPhrases.find((candidate) => normalized.includes(candidate.toLowerCase()));

  return offending
    ? { pass: false, reason: `Response contains cop-out phrase: "${offending}"` }
    : { pass: true };
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Check that the response matches a regular expression.
 *
 * The pattern is always copied into a fresh RegExp before testing. A RegExp
 * carrying the `g` or `y` flag is stateful: `.test()` resumes from
 * `lastIndex` and advances it, so reusing the caller's object would make
 * repeated checks alternate between pass and fail and would mutate the
 * caller's regex. The copy keeps the same source and flags and starts at
 * `lastIndex` 0.
 *
 * @param {{responseText: string, pattern: string|RegExp}} input
 * @returns {import('./types.js').EvalResult}
 */
export function regexMatch({ responseText, pattern }) {
  // new RegExp(x) accepts both a source string and an existing RegExp (which it clones).
  const regex = new RegExp(pattern);
  if (regex.test(responseText)) return { pass: true };
  return { pass: false, reason: `Response did not match pattern: ${pattern}` };
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
import { contentMatch } from './content-match.js';
|
|
5
|
+
import { negativeMatch } from './negative-match.js';
|
|
6
|
+
import { toolSelection } from './tool-selection.js';
|
|
7
|
+
import { latency } from './latency.js';
|
|
8
|
+
import { jsonValid } from './json-valid.js';
|
|
9
|
+
import { schemaMatch } from './schema-match.js';
|
|
10
|
+
import { nonEmpty } from './non-empty.js';
|
|
11
|
+
import { lengthBounds } from './length-bounds.js';
|
|
12
|
+
import { regexMatch } from './regex-match.js';
|
|
13
|
+
import { toolCallCount } from './tool-call-count.js';
|
|
14
|
+
import { costBudget } from './cost-budget.js';
|
|
15
|
+
import { noHallucinatedNumbers } from './no-hallucinated-numbers.js';
|
|
16
|
+
|
|
17
|
+
/**
 * Run every check whose inputs are present and aggregate the outcomes.
 *
 * A check is skipped (and absent from `checks`) unless the fields it needs
 * were supplied; text-based checks additionally require `responseText` to be
 * defined. The suite passes when no executed check failed, which includes the
 * case where nothing was executed at all.
 *
 * @param {import('./types.js').RunChecksInput} input
 * @returns {import('./types.js').CheckSuiteResult}
 */
export function runChecks(input) {
  const { responseText } = input;
  const hasResponse = responseText !== undefined;
  const checks = {};

  if (hasResponse && input.mustContain?.length) {
    checks.contentMatch = contentMatch({ responseText, mustContain: input.mustContain });
  }

  if (hasResponse && input.mustNotContain?.length) {
    checks.negativeMatch = negativeMatch({ responseText, mustNotContain: input.mustNotContain });
  }

  if (input.expectedTools !== undefined && input.actualTools !== undefined) {
    checks.toolSelection = toolSelection({
      expected: input.expectedTools,
      actual: input.actualTools,
      mode: input.toolSelectionMode,
    });
  }

  if (input.latencyMs !== undefined && input.maxLatencyMs !== undefined) {
    checks.latency = latency({ latencyMs: input.latencyMs, maxLatencyMs: input.maxLatencyMs });
  }

  if (hasResponse && input.jsonValid) {
    checks.jsonValid = jsonValid({ responseText });
  }

  // Schema validation needs data plus at least one kind of constraint.
  const hasSchemaConstraint = input.requiredKeys !== undefined || Boolean(input.typeChecks);
  if (input.schemaData && hasSchemaConstraint) {
    checks.schemaMatch = schemaMatch({
      data: input.schemaData,
      requiredKeys: input.requiredKeys ?? [],
      typeChecks: input.typeChecks,
    });
  }

  if (hasResponse && input.nonEmpty) {
    checks.nonEmpty = nonEmpty({ responseText, copOutPhrases: input.copOutPhrases });
  }

  const hasLengthBound = input.minLength !== undefined || input.maxLength !== undefined;
  if (hasResponse && hasLengthBound) {
    checks.lengthBounds = lengthBounds({
      responseText,
      minLength: input.minLength,
      maxLength: input.maxLength,
    });
  }

  if (hasResponse && input.regexPattern) {
    checks.regexMatch = regexMatch({ responseText, pattern: input.regexPattern });
  }

  const hasToolCallBound = input.minToolCalls !== undefined || input.maxToolCalls !== undefined;
  if (input.actualToolCallCount !== undefined && hasToolCallBound) {
    checks.toolCallCount = toolCallCount({
      actual: input.actualToolCallCount,
      min: input.minToolCalls,
      max: input.maxToolCalls,
    });
  }

  if (input.actualCost !== undefined && input.maxCost !== undefined) {
    checks.costBudget = costBudget({ actualCost: input.actualCost, maxCost: input.maxCost });
  }

  if (hasResponse && input.toolResults !== undefined) {
    checks.noHallucinatedNumbers = noHallucinatedNumbers({
      responseText,
      toolResults: input.toolResults,
      tolerance: input.tolerance,
    });
  }

  // Tally the outcomes of whichever checks actually ran.
  const outcomes = Object.values(checks);
  let passed = 0;
  let failed = 0;
  for (const outcome of outcomes) {
    if (outcome.pass) {
      passed += 1;
    } else {
      failed += 1;
    }
  }

  return { pass: failed === 0, checks, total: outcomes.length, passed, failed };
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Lightweight shape check for an object.
 *
 * First every required key must be present (tested with the `in` operator,
 * so inherited properties count). Then, if `typeChecks` is given, the
 * `typeof` of each listed property must equal the expected type string.
 * Missing keys are reported before, and instead of, type mismatches.
 *
 * @param {{data: Object, requiredKeys: string[], typeChecks?: Object.<string, string>}} input
 * @returns {import('./types.js').EvalResult}
 */
export function schemaMatch({ data, requiredKeys, typeChecks }) {
  const absent = [];
  for (const key of requiredKeys) {
    if (!(key in data)) absent.push(key);
  }
  if (absent.length > 0) {
    return { pass: false, reason: `Missing keys: ${absent.join(', ')}` };
  }

  if (!typeChecks) {
    return { pass: true };
  }

  const mismatches = Object.entries(typeChecks)
    .filter(([key, expectedType]) => typeof data[key] !== expectedType)
    .map(([key, expectedType]) => `${key}: expected ${expectedType}, got ${typeof data[key]}`);

  return mismatches.length > 0
    ? { pass: false, reason: `Type mismatches: ${mismatches.join('; ')}` }
    : { pass: true };
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Check that the number of tool calls lies within optional bounds.
 * A bound left undefined is not enforced; the minimum is evaluated first.
 *
 * @param {{actual: number, min?: number, max?: number}} input
 * @returns {import('./types.js').EvalResult}
 */
export function toolCallCount({ actual, min, max }) {
  let reason;
  if (min !== undefined && actual < min) {
    reason = `Tool call count ${actual} is below minimum ${min}`;
  } else if (max !== undefined && actual > max) {
    reason = `Tool call count ${actual} exceeds maximum ${max}`;
  }

  return reason === undefined ? { pass: true } : { pass: false, reason };
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Compare the tools an agent actually called with the tools it was expected
 * to call. Call order and repetition are not considered in any mode.
 *
 * - 'subset':   every expected tool must have been called; extra calls are fine.
 * - 'superset': every called tool must be among the expected ones; unused
 *               expected tools are fine.
 * - any other mode (including the default 'strict' and 'unordered'):
 *               both conditions must hold.
 *
 * @param {{expected: string[], actual: string[], mode?: 'strict'|'subset'|'superset'|'unordered'}} input
 * @returns {import('./types.js').EvalResult}
 */
export function toolSelection({ expected, actual, mode = 'strict' }) {
  const wanted = new Set(expected);
  const called = new Set(actual);

  // Evaluated lazily so each mode only touches the list it needs.
  const findMissing = () => expected.filter((tool) => !called.has(tool));
  const findUnexpected = () => actual.filter((tool) => !wanted.has(tool));

  switch (mode) {
    case 'subset': {
      const missing = findMissing();
      return missing.length === 0
        ? { pass: true }
        : { pass: false, reason: `Expected tools not called: ${missing.join(', ')}` };
    }
    case 'superset': {
      const unexpected = findUnexpected();
      return unexpected.length === 0
        ? { pass: true }
        : { pass: false, reason: `Unexpected tools called: ${unexpected.join(', ')}` };
    }
    default: {
      const missing = findMissing();
      const extra = findUnexpected();
      if (missing.length === 0 && extra.length === 0) {
        return { pass: true };
      }
      const parts = [];
      if (missing.length) parts.push(`missing: ${missing.join(', ')}`);
      if (extra.length) parts.push(`unexpected: ${extra.join(', ')}`);
      return { pass: false, reason: parts.join('; ') };
    }
  }
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
// Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @typedef {Object} EvalResult
|
|
6
|
+
* @property {boolean} pass
|
|
7
|
+
* @property {string} [reason]
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* @typedef {Object} CheckSuiteResult
|
|
12
|
+
* @property {boolean} pass - true only if all checks passed
|
|
13
|
+
* @property {Object.<string, EvalResult>} checks - named check results
|
|
14
|
+
* @property {number} total
|
|
15
|
+
* @property {number} passed
|
|
16
|
+
* @property {number} failed
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* @typedef {Object} RunChecksInput
|
|
21
|
+
* @property {string} [responseText]
|
|
22
|
+
* @property {string[]} [mustContain]
|
|
23
|
+
* @property {string[]} [mustNotContain]
|
|
24
|
+
* @property {string[]} [expectedTools]
|
|
25
|
+
* @property {string[]} [actualTools]
|
|
26
|
+
* @property {'strict'|'subset'|'superset'|'unordered'} [toolSelectionMode]
|
|
27
|
+
* @property {boolean} [nonEmpty]
|
|
28
|
+
* @property {string[]} [copOutPhrases]
|
|
29
|
+
* @property {boolean} [jsonValid]
|
|
30
|
+
* @property {Object} [schemaData]
|
|
31
|
+
* @property {string[]} [requiredKeys]
|
|
32
|
+
* @property {Object.<string, string>} [typeChecks]
|
|
33
|
+
* @property {number} [minLength]
|
|
34
|
+
* @property {number} [maxLength]
|
|
35
|
+
* @property {string|RegExp} [regexPattern]
|
|
36
|
+
* @property {unknown} [toolResults]
|
|
37
|
+
* @property {number} [tolerance]
|
|
38
|
+
* @property {number} [latencyMs]
|
|
39
|
+
* @property {number} [maxLatencyMs]
|
|
40
|
+
* @property {number} [actualToolCallCount]
|
|
41
|
+
* @property {number} [minToolCalls]
|
|
42
|
+
* @property {number} [maxToolCalls]
|
|
43
|
+
* @property {number} [actualCost]
|
|
44
|
+
* @property {number} [maxCost]
|
|
45
|
+
*/
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
// Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Compare two eval runs, classifying each case as regression/improvement/unchanged/added/removed.
 *
 * A case present in both runs is a regression or improvement when the
 * absolute change in its pass rate reaches `options.significanceThreshold`
 * (default 0.1); otherwise it is unchanged. Cases present in only one run
 * are reported as added or removed and carry only the pass rate of the run
 * they appear in.
 *
 * @param {import('./types.js').RunSummary} baseRun
 * @param {import('./types.js').RunSummary} compareRun
 * @param {import('./types.js').ComparisonOptions} [options]
 * @returns {import('./types.js').RunComparison}
 */
export function compareRuns(baseRun, compareRun, options = {}) {
  const significanceThreshold = options.significanceThreshold ?? 0.1;

  // Pass rates are quotients of small integers, so a change that is
  // mathematically equal to the threshold can come out a few ULPs short
  // (3/10 - 2/10 === 0.09999999999999998). The slack keeps such boundary
  // cases on the "significant" side, as the >= comparison intends.
  const EPSILON = 1e-9;

  const baseCases = baseRun.cases ?? {};
  const compareCases = compareRun.cases ?? {};
  const baseCaseIds = new Set(Object.keys(baseCases));
  const compareCaseIds = new Set(Object.keys(compareCases));
  // Base ids first, then ids that only exist in the compare run.
  const allCaseIds = new Set([...baseCaseIds, ...compareCaseIds]);

  const cases = [];
  const counts = { regressions: 0, improvements: 0, unchanged: 0, added: 0, removed: 0 };

  for (const caseId of allCaseIds) {
    if (!baseCaseIds.has(caseId)) {
      cases.push({ caseId, status: 'added', comparePassRate: getPassRate(compareCases[caseId]) });
      counts.added += 1;
      continue;
    }
    if (!compareCaseIds.has(caseId)) {
      cases.push({ caseId, status: 'removed', basePassRate: getPassRate(baseCases[caseId]) });
      counts.removed += 1;
      continue;
    }

    const basePassRate = getPassRate(baseCases[caseId]);
    const comparePassRate = getPassRate(compareCases[caseId]);
    const diff = comparePassRate - basePassRate;

    let status = 'unchanged';
    if (Math.abs(diff) >= significanceThreshold - EPSILON) {
      status = diff < 0 ? 'regression' : 'improvement';
    }

    if (status === 'regression') counts.regressions += 1;
    else if (status === 'improvement') counts.improvements += 1;
    else counts.unchanged += 1;

    const baseMeanLatencyMs = getMeanLatency(baseCases[caseId]);
    const compareMeanLatencyMs = getMeanLatency(compareCases[caseId]);

    cases.push({ caseId, status, basePassRate, comparePassRate, baseMeanLatencyMs, compareMeanLatencyMs });
  }

  return {
    base: baseRun,
    compare: compareRun,
    cases,
    ...counts,
  };
}

/**
 * Fraction of trials that passed; 0 when there are no trials.
 *
 * @param {{pass: boolean}[]} trials
 * @returns {number}
 */
function getPassRate(trials) {
  if (!trials?.length) return 0;
  return trials.filter(t => t.pass).length / trials.length;
}

/**
 * Arithmetic mean of the trial latencies; 0 when there are no trials.
 * A trial without `latencyMs` contributes 0 to the sum but still counts
 * towards the divisor.
 *
 * @param {{latencyMs?: number}[]} trials
 * @returns {number}
 */
function getMeanLatency(trials) {
  if (!trials?.length) return 0;
  const latencies = trials.map(t => t.latencyMs ?? 0);
  return latencies.reduce((s, l) => s + l, 0) / latencies.length;
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
// Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
import chalk from 'chalk';
|
|
5
|
+
|
|
6
|
+
/**
 * Render a run comparison as a multi-line, human-readable report.
 *
 * The report has a header naming both runs, an overall pass-rate line, case
 * counters, then one section per regression and improvement list. Unchanged
 * cases are listed only when `verbose` is set. With `noColor` the styling
 * calls are routed through a pass-through stand-in instead of chalk.
 *
 * @param {import('./types.js').RunComparison} comparison
 * @param {{noColor?: boolean, verbose?: boolean}} [options]
 * @returns {string}
 */
export function formatComparisonReport(comparison, options = {}) {
  const { noColor = false, verbose = false } = options;
  const c = noColor ? makeNoColor() : chalk;

  const { base, compare } = comparison;
  const percent = (ratio, digits) => (ratio * 100).toFixed(digits);
  const casesWithStatus = (status) => comparison.cases.filter((entry) => entry.status === status);

  const out = [];

  // Header
  out.push(c.bold('\n=== Eval Run Comparison ==='));
  out.push(`Base: ${base.modelName} (${base.runId})`);
  out.push(`Compare: ${compare.modelName} (${compare.runId})`);
  out.push('');

  // Summary stats
  const rateDiff = compare.passRate - base.passRate;
  const rateStr = rateDiff >= 0
    ? c.green(`+${percent(rateDiff, 1)}%`)
    : c.red(`${percent(rateDiff, 1)}%`);
  out.push(`Pass rate: ${percent(base.passRate, 1)}% → ${percent(compare.passRate, 1)}% (${rateStr})`);

  const regressionLabel = comparison.regressions > 0
    ? c.red(`${comparison.regressions} regressions`)
    : '0 regressions';
  const improvementLabel = comparison.improvements > 0
    ? c.green(`${comparison.improvements} improvements`)
    : '0 improvements';
  out.push(`Cases: ${regressionLabel}, ${improvementLabel}, ${comparison.unchanged} unchanged`);

  if (comparison.added > 0) out.push(` + ${comparison.added} new cases added`);
  if (comparison.removed > 0) out.push(` - ${comparison.removed} cases removed`);
  out.push('');

  // Regressions first, then improvements; each section is omitted when empty.
  const changeSections = [
    { status: 'regression', heading: () => c.red.bold('Regressions:'), bullet: () => c.red('✗') },
    { status: 'improvement', heading: () => c.green.bold('Improvements:'), bullet: () => c.green('✓') },
  ];
  for (const section of changeSections) {
    const entries = casesWithStatus(section.status);
    if (entries.length === 0) continue;

    out.push(section.heading());
    for (const entry of entries) {
      out.push(` ${section.bullet()} ${entry.caseId}: ${percent(entry.basePassRate, 0)}% → ${percent(entry.comparePassRate, 0)}%`);
    }
    out.push('');
  }

  // Verbose: show all unchanged cases too
  if (verbose) {
    const steady = casesWithStatus('unchanged');
    if (steady.length > 0) {
      out.push(c.gray('Unchanged:'));
      for (const entry of steady) {
        out.push(` ${c.gray('·')} ${entry.caseId}: ${percent(entry.comparePassRate, 0)}%`);
      }
      out.push('');
    }
  }

  return out.join('\n');
}
|
|
77
|
+
|
|
78
|
+
/**
 * Build a stand-in for chalk that applies no styling.
 *
 * The result is a callable Proxy: reading any property yields another such
 * proxy (so chains like `c.red.bold` work to any depth), and calling it
 * returns its first argument unchanged, or '' when that argument is null or
 * undefined. Tagged-template calls are assembled with String.raw.
 *
 * @returns {typeof chalk}
 */
function makeNoColor() {
  const passthrough = (...args) => {
    const [first, ...substitutions] = args;
    // Tagged template literal call: the first argument is a TemplateStringsArray.
    if (Array.isArray(first) && first.raw) {
      return String.raw({ raw: first.raw }, ...substitutions);
    }
    return first ?? '';
  };

  const handler = {
    // Any property access hands back a fresh pass-through proxy.
    get() {
      return new Proxy(passthrough, handler);
    },
    apply(_target, _thisArg, args) {
      return passthrough(...args);
    },
  };

  return new Proxy(passthrough, handler);
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
// Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
export { wilsonInterval, computeTrialStats, computeAllTrialStats } from './statistics.js';
|
|
5
|
+
export { compareRuns } from './compare.js';
|
|
6
|
+
export { formatComparisonReport } from './format.js';
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
// Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
|
|
2
|
+
// MIT License — see LICENSE
|
|
3
|
+
|
|
4
|
+
/**
 * Wilson score confidence interval for a binomial proportion.
 *
 * Returns all zeros when there are no trials. Otherwise the bounds are
 * clamped to the [0, 1] range; `center` is the interval midpoint before
 * clamping.
 *
 * @param {number} passes - number of successes
 * @param {number} total - total trials
 * @param {number} [z=1.96] - z-score (1.96 = 95% CI)
 * @returns {{lower: number, upper: number, center: number}}
 */
export function wilsonInterval(passes, total, z = 1.96) {
  if (total === 0) {
    return { lower: 0, upper: 0, center: 0 };
  }

  const proportion = passes / total;
  const zSquared = z * z;
  const denominator = 1 + zSquared / total;

  const center = (proportion + zSquared / (2 * total)) / denominator;
  const spread = Math.sqrt(proportion * (1 - proportion) / total + zSquared / (4 * total * total));
  const margin = (z / denominator) * spread;

  const lower = Math.max(0, center - margin);
  const upper = Math.min(1, center + margin);
  return { lower, upper, center };
}
|
|
24
|
+
|
|
25
|
+
/**
 * Summarise the trials of a single case: pass rate with its Wilson 95%
 * bounds, plus mean and 95th-percentile latency.
 *
 * Trials without `latencyMs` are treated as 0 ms. The percentile is read
 * from the ascending-sorted latencies at index floor((n - 1) * 0.95).
 * An empty trial list yields all zeros.
 *
 * @param {{pass: boolean, latencyMs?: number}[]} trials
 * @returns {{passRate: number, lower95: number, upper95: number, meanLatencyMs: number, p95LatencyMs: number}}
 */
export function computeTrialStats(trials) {
  const trialCount = trials.length;
  if (trialCount === 0) {
    return { passRate: 0, lower95: 0, upper95: 0, meanLatencyMs: 0, p95LatencyMs: 0 };
  }

  let passes = 0;
  for (const trial of trials) {
    if (trial.pass) passes += 1;
  }
  const interval = wilsonInterval(passes, trialCount);

  const sortedLatencies = trials.map((trial) => trial.latencyMs ?? 0);
  sortedLatencies.sort((a, b) => a - b);

  let latencySum = 0;
  for (const value of sortedLatencies) {
    latencySum += value;
  }

  const lastIndex = sortedLatencies.length - 1;
  const p95Index = Math.min(Math.floor(lastIndex * 0.95), lastIndex);

  return {
    passRate: passes / trialCount,
    lower95: interval.lower,
    upper95: interval.upper,
    meanLatencyMs: latencySum / sortedLatencies.length,
    p95LatencyMs: sortedLatencies[p95Index] ?? 0,
  };
}
|
|
49
|
+
|
|
50
|
+
/**
 * Apply computeTrialStats to every case of a run.
 *
 * @param {Object.<string, {pass: boolean, latencyMs?: number}[]>} allTrials - trials keyed by case id
 * @returns {Object.<string, ReturnType<typeof computeTrialStats>>} stats keyed by the same case ids
 */
export function computeAllTrialStats(allTrials) {
  const statsByCase = [];
  for (const [caseId, trials] of Object.entries(allTrials)) {
    statsByCase.push([caseId, computeTrialStats(trials)]);
  }
  return Object.fromEntries(statsByCase);
}
|