agent-tool-forge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107):
  1. package/LICENSE +21 -0
  2. package/README.md +209 -0
  3. package/lib/agent-registry.js +170 -0
  4. package/lib/api-client.js +792 -0
  5. package/lib/api-loader.js +260 -0
  6. package/lib/auth.d.ts +25 -0
  7. package/lib/auth.js +158 -0
  8. package/lib/checks/check-adapter.js +172 -0
  9. package/lib/checks/compose.js +42 -0
  10. package/lib/checks/content-match.js +14 -0
  11. package/lib/checks/cost-budget.js +11 -0
  12. package/lib/checks/index.js +18 -0
  13. package/lib/checks/json-valid.js +15 -0
  14. package/lib/checks/latency.js +11 -0
  15. package/lib/checks/length-bounds.js +17 -0
  16. package/lib/checks/negative-match.js +14 -0
  17. package/lib/checks/no-hallucinated-numbers.js +63 -0
  18. package/lib/checks/non-empty.js +34 -0
  19. package/lib/checks/regex-match.js +12 -0
  20. package/lib/checks/run-checks.js +84 -0
  21. package/lib/checks/schema-match.js +26 -0
  22. package/lib/checks/tool-call-count.js +16 -0
  23. package/lib/checks/tool-selection.js +34 -0
  24. package/lib/checks/types.js +45 -0
  25. package/lib/comparison/compare.js +86 -0
  26. package/lib/comparison/format.js +104 -0
  27. package/lib/comparison/index.js +6 -0
  28. package/lib/comparison/statistics.js +59 -0
  29. package/lib/comparison/types.js +41 -0
  30. package/lib/config-schema.js +200 -0
  31. package/lib/config.d.ts +66 -0
  32. package/lib/conversation-store.d.ts +77 -0
  33. package/lib/conversation-store.js +443 -0
  34. package/lib/db.d.ts +6 -0
  35. package/lib/db.js +1112 -0
  36. package/lib/dep-check.js +99 -0
  37. package/lib/drift-background.js +61 -0
  38. package/lib/drift-monitor.js +187 -0
  39. package/lib/eval-runner.js +566 -0
  40. package/lib/fixtures/fixture-store.js +161 -0
  41. package/lib/fixtures/index.js +11 -0
  42. package/lib/forge-engine.js +982 -0
  43. package/lib/forge-eval-generator.js +417 -0
  44. package/lib/forge-file-writer.js +386 -0
  45. package/lib/forge-service-client.js +190 -0
  46. package/lib/forge-service.d.ts +4 -0
  47. package/lib/forge-service.js +655 -0
  48. package/lib/forge-verifier-generator.js +271 -0
  49. package/lib/handlers/admin.js +151 -0
  50. package/lib/handlers/agents.js +229 -0
  51. package/lib/handlers/chat-resume.js +334 -0
  52. package/lib/handlers/chat-sync.js +320 -0
  53. package/lib/handlers/chat.js +320 -0
  54. package/lib/handlers/conversations.js +92 -0
  55. package/lib/handlers/preferences.js +88 -0
  56. package/lib/handlers/tools-list.js +58 -0
  57. package/lib/hitl-engine.d.ts +60 -0
  58. package/lib/hitl-engine.js +261 -0
  59. package/lib/http-utils.js +92 -0
  60. package/lib/index.d.ts +20 -0
  61. package/lib/index.js +141 -0
  62. package/lib/init.js +636 -0
  63. package/lib/manual-entry.js +59 -0
  64. package/lib/mcp-server.js +252 -0
  65. package/lib/output-groups.js +54 -0
  66. package/lib/postgres-store.d.ts +31 -0
  67. package/lib/postgres-store.js +465 -0
  68. package/lib/preference-store.d.ts +47 -0
  69. package/lib/preference-store.js +79 -0
  70. package/lib/prompt-store.d.ts +42 -0
  71. package/lib/prompt-store.js +60 -0
  72. package/lib/rate-limiter.d.ts +30 -0
  73. package/lib/rate-limiter.js +104 -0
  74. package/lib/react-engine.d.ts +110 -0
  75. package/lib/react-engine.js +337 -0
  76. package/lib/runner/cli.js +156 -0
  77. package/lib/runner/cost-estimator.js +71 -0
  78. package/lib/runner/gate.js +46 -0
  79. package/lib/runner/index.js +165 -0
  80. package/lib/sidecar.d.ts +83 -0
  81. package/lib/sidecar.js +161 -0
  82. package/lib/sse.d.ts +15 -0
  83. package/lib/sse.js +30 -0
  84. package/lib/tools-scanner.js +91 -0
  85. package/lib/tui.js +253 -0
  86. package/lib/verifier-report.js +78 -0
  87. package/lib/verifier-runner.js +338 -0
  88. package/lib/verifier-scanner.js +70 -0
  89. package/lib/verifier-worker-pool.js +196 -0
  90. package/lib/views/chat.js +340 -0
  91. package/lib/views/endpoints.js +203 -0
  92. package/lib/views/eval-run.js +206 -0
  93. package/lib/views/forge-agent.js +538 -0
  94. package/lib/views/forge.js +410 -0
  95. package/lib/views/main-menu.js +275 -0
  96. package/lib/views/mediation.js +381 -0
  97. package/lib/views/model-compare.js +430 -0
  98. package/lib/views/model-comparison.js +333 -0
  99. package/lib/views/onboarding.js +470 -0
  100. package/lib/views/performance.js +237 -0
  101. package/lib/views/run-evals.js +205 -0
  102. package/lib/views/settings.js +829 -0
  103. package/lib/views/tools-evals.js +514 -0
  104. package/lib/views/verifier-coverage.js +617 -0
  105. package/lib/workers/verifier-worker.js +52 -0
  106. package/package.json +123 -0
  107. package/widget/forge-chat.js +789 -0
@@ -0,0 +1,18 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
+ export { contentMatch } from './content-match.js';
5
+ export { negativeMatch } from './negative-match.js';
6
+ export { toolSelection } from './tool-selection.js';
7
+ export { latency } from './latency.js';
8
+ export { jsonValid } from './json-valid.js';
9
+ export { schemaMatch } from './schema-match.js';
10
+ export { nonEmpty, DEFAULT_COP_OUT_PHRASES } from './non-empty.js';
11
+ export { lengthBounds } from './length-bounds.js';
12
+ export { regexMatch } from './regex-match.js';
13
+ export { toolCallCount } from './tool-call-count.js';
14
+ export { costBudget } from './cost-budget.js';
15
+ export { runChecks } from './run-checks.js';
16
+ export { noHallucinatedNumbers } from './no-hallucinated-numbers.js';
17
+ export { all, any, not } from './compose.js';
18
+ export { checkAdapter, checkResponseContainsAnyGroups, checkToolsAcceptable } from './check-adapter.js';
@@ -0,0 +1,15 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Check that the response text is a syntactically valid JSON document.
 *
 * Non-string input is rejected before parsing: `JSON.parse` converts its
 * argument to a string first, so `null`, numbers and booleans would otherwise
 * be reported as valid JSON even though no JSON text was produced.
 *
 * @param {{responseText: string}} input
 * @returns {import('./types.js').EvalResult} `{ pass: true }` when the text
 *   parses, otherwise `{ pass: false, reason }` carrying the parser message.
 */
export function jsonValid({ responseText }) {
  if (typeof responseText !== 'string') {
    const got = responseText === null ? 'null' : typeof responseText;
    return { pass: false, reason: `Invalid JSON: expected a string, got ${got}` };
  }
  try {
    JSON.parse(responseText);
    return { pass: true };
  } catch (e) {
    return { pass: false, reason: `Invalid JSON: ${e.message}` };
  }
}
@@ -0,0 +1,11 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Pass when the measured latency does not exceed the allowed maximum
 * (a latency exactly equal to the maximum passes).
 *
 * @param {{latencyMs: number, maxLatencyMs: number}} input
 * @returns {import('./types.js').EvalResult}
 */
export function latency({ latencyMs, maxLatencyMs }) {
  const withinBudget = latencyMs <= maxLatencyMs;
  return withinBudget
    ? { pass: true }
    : { pass: false, reason: `Latency ${latencyMs}ms exceeded max ${maxLatencyMs}ms` };
}
@@ -0,0 +1,17 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Check the response's `length` against optional lower and upper bounds.
 * A bound left `undefined` is not enforced; the minimum is checked first.
 *
 * @param {{responseText: string, minLength?: number, maxLength?: number}} input
 * @returns {import('./types.js').EvalResult}
 */
export function lengthBounds({ responseText, minLength, maxLength }) {
  const { length } = responseText;

  const tooShort = minLength !== undefined && length < minLength;
  if (tooShort) {
    return { pass: false, reason: `Response length ${length} is below minimum ${minLength}` };
  }

  const tooLong = maxLength !== undefined && length > maxLength;
  if (tooLong) {
    return { pass: false, reason: `Response length ${length} exceeds maximum ${maxLength}` };
  }

  return { pass: true };
}
@@ -0,0 +1,14 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Fail when the response contains any forbidden substring.
 * Both the response and each substring are lower-cased before comparison,
 * and every offending substring is listed in the failure reason.
 *
 * @param {{responseText: string, mustNotContain: string[]}} input
 * @returns {import('./types.js').EvalResult}
 */
export function negativeMatch({ responseText, mustNotContain }) {
  const haystack = responseText.toLowerCase();
  const offenders = [];
  for (const needle of mustNotContain) {
    if (haystack.includes(needle.toLowerCase())) {
      offenders.push(needle);
    }
  }
  return offenders.length > 0
    ? { pass: false, reason: `Forbidden content found: ${offenders.join(', ')}` }
    : { pass: true };
}
@@ -0,0 +1,63 @@
1
+ // Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Pull every numeric literal out of a string.
 * Recognizes an optional leading minus, an optional fractional part and an
 * optional exponent (case-insensitive `e`).
 *
 * @param {string} text
 * @returns {number[]} Parsed numbers in order of appearance.
 */
function extractNumbers(text) {
  const numberPattern = /-?\d+(?:\.\d+)?(?:e[+-]?\d+)?/gi;
  const tokens = text.match(numberPattern) ?? [];
  const numbers = [];
  for (const token of tokens) {
    const parsed = Number(token);
    if (!isNaN(parsed)) {
      numbers.push(parsed);
    }
  }
  return numbers;
}
13
+
14
/**
 * Collect numbers from an arbitrary value.
 * Numbers are returned as-is, strings are scanned with `extractNumbers`, and
 * arrays / plain objects are walked recursively through their elements or
 * property values. Every other value contributes nothing.
 *
 * @param {unknown} value
 * @returns {number[]}
 */
function extractNumbersDeep(value) {
  switch (typeof value) {
    case 'number':
      return [value];
    case 'string':
      return extractNumbers(value);
    case 'object': {
      // typeof null is 'object', so it has to be excluded explicitly.
      if (value === null) return [];
      const children = Array.isArray(value) ? value : Object.values(value);
      return children.flatMap(extractNumbersDeep);
    }
    default:
      return [];
  }
}
28
+
29
/**
 * Verify that every number mentioned in the response is backed by a number
 * found in the tool results.
 *
 * A response number counts as backed when its relative difference from some
 * source number is at most `tolerance`. Against a source value of zero the
 * comparison is absolute (`|num| <= tolerance`), and zero always matches zero.
 *
 * @param {{responseText: string, toolResults: unknown, tolerance?: number}} input
 * @returns {{pass: boolean, hallucinated: number[], matched: number[], reason?: string}}
 */
export function noHallucinatedNumbers({ responseText, toolResults, tolerance = 0.01 }) {
  const claimed = extractNumbers(responseText);
  const grounded = extractNumbersDeep(toolResults);

  const isSupported = (candidate) =>
    grounded.some((reference) => {
      if (reference === 0) {
        // Relative error is undefined against zero; fall back to an absolute bound.
        return candidate === 0 || Math.abs(candidate) <= tolerance;
      }
      return Math.abs(candidate - reference) / Math.abs(reference) <= tolerance;
    });

  const matched = [];
  const hallucinated = [];
  for (const candidate of claimed) {
    (isSupported(candidate) ? matched : hallucinated).push(candidate);
  }

  if (hallucinated.length > 0) {
    return {
      pass: false,
      hallucinated,
      matched,
      reason: `Hallucinated numbers not found in tool results: ${hallucinated.join(', ')}`,
    };
  }
  return { pass: true, hallucinated: [], matched };
}
@@ -0,0 +1,34 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Phrases used by `nonEmpty` as its default cop-out list. Matching is
 * case-insensitive, so entries are written in lower case.
 */
export const DEFAULT_COP_OUT_PHRASES = [
  "i'm sorry",
  "i cannot",
  "i can't",
  "i don't know",
  "i am not able to",
  "as an ai",
  "as a language model",
  "i don't have access",
  "i don't have information",
  "i'm not able to",
  "i am unable to",
  "unfortunately, i",
  "i apologize",
];

/**
 * Fail when the response is empty / whitespace-only, or contains one of the
 * cop-out phrases.
 *
 * Matching is case-insensitive and apostrophe-insensitive: typographic
 * apostrophes (U+2018, U+2019, U+02BC, U+FF07) are folded to the ASCII `'`
 * on both sides, so "I’m sorry" is caught by the phrase "i'm sorry".
 *
 * @param {{responseText: string, copOutPhrases?: string[]}} input
 * @returns {import('./types.js').EvalResult} On a phrase hit the reason quotes
 *   the phrase exactly as it appears in `copOutPhrases`.
 */
export function nonEmpty({ responseText, copOutPhrases = DEFAULT_COP_OUT_PHRASES }) {
  if (!responseText || responseText.trim().length === 0) {
    return { pass: false, reason: 'Response is empty' };
  }
  // Curly apostrophes would otherwise let a refusal slip past the ASCII phrase list.
  const normalize = (s) => s.toLowerCase().replace(/[\u2018\u2019\u02BC\uFF07]/g, "'");
  const haystack = normalize(responseText);
  const found = copOutPhrases.find((phrase) => haystack.includes(normalize(phrase)));
  if (found) {
    return { pass: false, reason: `Response contains cop-out phrase: "${found}"` };
  }
  return { pass: true };
}
@@ -0,0 +1,12 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Pass when the response matches the given pattern.
 *
 * The match is always run against a fresh RegExp built from `pattern`.
 * A caller-supplied RegExp with the `g` or `y` flag keeps `lastIndex` between
 * `test()` calls, so testing the shared instance directly would start later
 * calls mid-string and make the same check alternate between pass and fail.
 * Copying also leaves the caller's RegExp untouched.
 *
 * @param {{responseText: string, pattern: string|RegExp}} input
 * @returns {import('./types.js').EvalResult}
 */
export function regexMatch({ responseText, pattern }) {
  // new RegExp(regex) copies source and flags and starts with lastIndex = 0.
  const regex = new RegExp(pattern);
  if (regex.test(responseText)) return { pass: true };
  return { pass: false, reason: `Response did not match pattern: ${pattern}` };
}
@@ -0,0 +1,84 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
+ import { contentMatch } from './content-match.js';
5
+ import { negativeMatch } from './negative-match.js';
6
+ import { toolSelection } from './tool-selection.js';
7
+ import { latency } from './latency.js';
8
+ import { jsonValid } from './json-valid.js';
9
+ import { schemaMatch } from './schema-match.js';
10
+ import { nonEmpty } from './non-empty.js';
11
+ import { lengthBounds } from './length-bounds.js';
12
+ import { regexMatch } from './regex-match.js';
13
+ import { toolCallCount } from './tool-call-count.js';
14
+ import { costBudget } from './cost-budget.js';
15
+ import { noHallucinatedNumbers } from './no-hallucinated-numbers.js';
16
+
17
/**
 * Run every check whose inputs are present and summarize the outcome.
 *
 * Each check is gated on its own input fields; a check whose fields are
 * missing is skipped and does not appear in `checks`. The suite passes only
 * when no executed check failed (an empty suite therefore passes).
 *
 * @param {import('./types.js').RunChecksInput} input
 * @returns {import('./types.js').CheckSuiteResult}
 */
export function runChecks(input) {
  const {
    responseText,
    mustContain,
    mustNotContain,
    expectedTools,
    actualTools,
    toolSelectionMode,
    latencyMs,
    maxLatencyMs,
    jsonValid: wantsJsonCheck,
    schemaData,
    requiredKeys,
    typeChecks,
    nonEmpty: wantsNonEmptyCheck,
    copOutPhrases,
    minLength,
    maxLength,
    regexPattern,
    actualToolCallCount,
    minToolCalls,
    maxToolCalls,
    actualCost,
    maxCost,
    toolResults,
    tolerance,
  } = input;

  const hasText = responseText !== undefined;
  const checks = {};

  if (hasText && mustContain?.length) {
    checks.contentMatch = contentMatch({ responseText, mustContain });
  }

  if (hasText && mustNotContain?.length) {
    checks.negativeMatch = negativeMatch({ responseText, mustNotContain });
  }

  if (expectedTools !== undefined && actualTools !== undefined) {
    checks.toolSelection = toolSelection({ expected: expectedTools, actual: actualTools, mode: toolSelectionMode });
  }

  if (latencyMs !== undefined && maxLatencyMs !== undefined) {
    checks.latency = latency({ latencyMs, maxLatencyMs });
  }

  if (hasText && wantsJsonCheck) {
    checks.jsonValid = jsonValid({ responseText });
  }

  if (schemaData && (requiredKeys !== undefined || typeChecks)) {
    checks.schemaMatch = schemaMatch({ data: schemaData, requiredKeys: requiredKeys ?? [], typeChecks });
  }

  if (hasText && wantsNonEmptyCheck) {
    checks.nonEmpty = nonEmpty({ responseText, copOutPhrases });
  }

  if (hasText && (minLength !== undefined || maxLength !== undefined)) {
    checks.lengthBounds = lengthBounds({ responseText, minLength, maxLength });
  }

  if (hasText && regexPattern) {
    checks.regexMatch = regexMatch({ responseText, pattern: regexPattern });
  }

  if (actualToolCallCount !== undefined && (minToolCalls !== undefined || maxToolCalls !== undefined)) {
    checks.toolCallCount = toolCallCount({ actual: actualToolCallCount, min: minToolCalls, max: maxToolCalls });
  }

  if (actualCost !== undefined && maxCost !== undefined) {
    checks.costBudget = costBudget({ actualCost, maxCost });
  }

  if (hasText && toolResults !== undefined) {
    checks.noHallucinatedNumbers = noHallucinatedNumbers({ responseText, toolResults, tolerance });
  }

  // Tally the executed checks.
  const outcomes = Object.values(checks);
  const total = outcomes.length;
  let passed = 0;
  for (const outcome of outcomes) {
    if (outcome.pass) passed += 1;
  }
  const failed = total - passed;

  return { pass: failed === 0, checks, total, passed, failed };
}
@@ -0,0 +1,26 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Check that `data` has the required keys and that selected properties have
 * the expected types.
 *
 * Key presence uses the `in` operator, so inherited properties count.
 * Because `in` throws on non-objects, a primitive `data` is reported as
 * missing all required keys, and `null` / `undefined` data fails outright
 * instead of raising a TypeError.
 *
 * Type checks compare against `typeof`. In addition, the expected type names
 * `'array'` (matched with `Array.isArray`) and `'null'` (matched with
 * `=== null`) are accepted, since `typeof` reports both as `'object'`.
 * Expecting `'object'` for an array or null still passes, as before.
 *
 * @param {{data: Object, requiredKeys: string[], typeChecks?: Object.<string, string>}} input
 * @returns {import('./types.js').EvalResult}
 */
export function schemaMatch({ data, requiredKeys, typeChecks }) {
  if (data === null || data === undefined) {
    return { pass: false, reason: `Data is ${data}; expected an object` };
  }

  // `in` is only legal on objects and functions.
  const supportsIn = typeof data === 'object' || typeof data === 'function';
  const missingKeys = requiredKeys.filter((k) => !(supportsIn && k in data));
  if (missingKeys.length > 0) {
    return { pass: false, reason: `Missing keys: ${missingKeys.join(', ')}` };
  }

  if (typeChecks) {
    const typeErrors = [];
    for (const [key, expectedType] of Object.entries(typeChecks)) {
      const value = data[key];
      const actualType = typeof value;
      const matches =
        actualType === expectedType ||
        (expectedType === 'array' && Array.isArray(value)) ||
        (expectedType === 'null' && value === null);
      if (!matches) {
        typeErrors.push(`${key}: expected ${expectedType}, got ${actualType}`);
      }
    }
    if (typeErrors.length > 0) {
      return { pass: false, reason: `Type mismatches: ${typeErrors.join('; ')}` };
    }
  }

  return { pass: true };
}
@@ -0,0 +1,16 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Check the number of tool calls against optional inclusive bounds.
 * A bound left `undefined` is not enforced; the minimum is reported first
 * when both would fail.
 *
 * @param {{actual: number, min?: number, max?: number}} input
 * @returns {import('./types.js').EvalResult}
 */
export function toolCallCount({ actual, min, max }) {
  const belowMin = min !== undefined && actual < min;
  const aboveMax = max !== undefined && actual > max;

  if (belowMin) {
    return { pass: false, reason: `Tool call count ${actual} is below minimum ${min}` };
  }
  if (aboveMax) {
    return { pass: false, reason: `Tool call count ${actual} exceeds maximum ${max}` };
  }
  return { pass: true };
}
@@ -0,0 +1,34 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Compare the tools that were called against the tools that were expected.
 *
 * Modes:
 *  - `'subset'`   — every expected tool must have been called; extras are allowed.
 *  - `'superset'` — every called tool must be expected; omissions are allowed.
 *  - anything else (`'strict'`, `'unordered'`) — the two lists must contain the
 *    same set of tool names; order and duplicates are not considered.
 *
 * @param {{expected: string[], actual: string[], mode?: 'strict'|'subset'|'superset'|'unordered'}} input
 * @returns {import('./types.js').EvalResult}
 */
export function toolSelection({ expected, actual, mode = 'strict' }) {
  const wanted = new Set(expected);
  const called = new Set(actual);

  // Computed lazily so each mode only touches the list it actually needs.
  const findMissing = () => expected.filter((tool) => !called.has(tool));
  const findUnexpected = () => actual.filter((tool) => !wanted.has(tool));

  switch (mode) {
    case 'subset': {
      const missing = findMissing();
      return missing.length === 0
        ? { pass: true }
        : { pass: false, reason: `Expected tools not called: ${missing.join(', ')}` };
    }
    case 'superset': {
      const unexpected = findUnexpected();
      return unexpected.length === 0
        ? { pass: true }
        : { pass: false, reason: `Unexpected tools called: ${unexpected.join(', ')}` };
    }
    default: {
      const missing = findMissing();
      const extra = findUnexpected();
      if (missing.length === 0 && extra.length === 0) {
        return { pass: true };
      }
      const parts = [];
      if (missing.length) parts.push(`missing: ${missing.join(', ')}`);
      if (extra.length) parts.push(`unexpected: ${extra.join(', ')}`);
      return { pass: false, reason: parts.join('; ') };
    }
  }
}
@@ -0,0 +1,45 @@
1
+ // Adapted from evalkit by wkhori (https://github.com/wkhori/evalkit)
2
+ // MIT License — see LICENSE
3
+
4
+ /**
5
+ * @typedef {Object} EvalResult
6
+ * @property {boolean} pass
7
+ * @property {string} [reason]
8
+ */
9
+
10
+ /**
11
+ * @typedef {Object} CheckSuiteResult
12
+ * @property {boolean} pass - true only if all checks passed
13
+ * @property {Object.<string, EvalResult>} checks - named check results
14
+ * @property {number} total
15
+ * @property {number} passed
16
+ * @property {number} failed
17
+ */
18
+
19
+ /**
20
+ * @typedef {Object} RunChecksInput
21
+ * @property {string} [responseText]
22
+ * @property {string[]} [mustContain]
23
+ * @property {string[]} [mustNotContain]
24
+ * @property {string[]} [expectedTools]
25
+ * @property {string[]} [actualTools]
26
+ * @property {'strict'|'subset'|'superset'|'unordered'} [toolSelectionMode]
27
+ * @property {boolean} [nonEmpty]
28
+ * @property {string[]} [copOutPhrases]
29
+ * @property {boolean} [jsonValid]
30
+ * @property {Object} [schemaData]
31
+ * @property {string[]} [requiredKeys]
32
+ * @property {Object.<string, string>} [typeChecks]
33
+ * @property {number} [minLength]
34
+ * @property {number} [maxLength]
35
+ * @property {string|RegExp} [regexPattern]
36
+ * @property {unknown} [toolResults]
37
+ * @property {number} [tolerance]
38
+ * @property {number} [latencyMs]
39
+ * @property {number} [maxLatencyMs]
40
+ * @property {number} [actualToolCallCount]
41
+ * @property {number} [minToolCalls]
42
+ * @property {number} [maxToolCalls]
43
+ * @property {number} [actualCost]
44
+ * @property {number} [maxCost]
45
+ */
@@ -0,0 +1,86 @@
1
+ // Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Compare two eval runs, classifying each case as
 * regression / improvement / unchanged / added / removed.
 *
 * A case present in both runs is a regression or improvement when the
 * absolute change in pass rate reaches `options.significanceThreshold`
 * (default 0.1); otherwise it is unchanged.
 *
 * The threshold comparison is tolerant of floating-point rounding: pass rates
 * are ratios, and e.g. `0.9 - 0.8` evaluates to `0.09999999999999998`, which a
 * plain `>= 0.1` would treat differently from `0.7 - 0.8`. Identical pass
 * rates are always unchanged, even with a threshold of 0.
 *
 * @param {import('./types.js').RunSummary} baseRun
 * @param {import('./types.js').RunSummary} compareRun
 * @param {import('./types.js').ComparisonOptions} [options]
 * @returns {import('./types.js').RunComparison}
 */
export function compareRuns(baseRun, compareRun, options = {}) {
  const significanceThreshold = options.significanceThreshold ?? 0.1;
  // Slack absorbing rounding error in pass-rate arithmetic.
  const EPSILON = 1e-9;

  const baseCaseIds = new Set(Object.keys(baseRun.cases ?? {}));
  const compareCaseIds = new Set(Object.keys(compareRun.cases ?? {}));
  const allCaseIds = new Set([...baseCaseIds, ...compareCaseIds]);

  const cases = [];
  let regressions = 0;
  let improvements = 0;
  let unchanged = 0;
  let added = 0;
  let removed = 0;

  for (const caseId of allCaseIds) {
    if (!baseCaseIds.has(caseId)) {
      cases.push({ caseId, status: 'added', comparePassRate: getPassRate(compareRun.cases[caseId]) });
      added++;
      continue;
    }
    if (!compareCaseIds.has(caseId)) {
      cases.push({ caseId, status: 'removed', basePassRate: getPassRate(baseRun.cases[caseId]) });
      removed++;
      continue;
    }

    const basePassRate = getPassRate(baseRun.cases[caseId]);
    const comparePassRate = getPassRate(compareRun.cases[caseId]);
    const diff = comparePassRate - basePassRate;
    const magnitude = Math.abs(diff);

    let status = 'unchanged';
    if (magnitude > EPSILON && magnitude + EPSILON >= significanceThreshold) {
      status = diff < 0 ? 'regression' : 'improvement';
    }

    if (status === 'regression') regressions++;
    else if (status === 'improvement') improvements++;
    else unchanged++;

    const baseMeanLatencyMs = getMeanLatency(baseRun.cases[caseId]);
    const compareMeanLatencyMs = getMeanLatency(compareRun.cases[caseId]);

    cases.push({ caseId, status, basePassRate, comparePassRate, baseMeanLatencyMs, compareMeanLatencyMs });
  }

  return {
    base: baseRun,
    compare: compareRun,
    cases,
    regressions,
    improvements,
    unchanged,
    added,
    removed,
  };
}

/**
 * Fraction of trials that passed; 0 when there are no trials.
 * @param {{pass: boolean}[]} trials
 * @returns {number}
 */
function getPassRate(trials) {
  if (!trials?.length) return 0;
  return trials.filter((t) => t.pass).length / trials.length;
}

/**
 * Mean latency over all trials; 0 when there are no trials.
 * A trial without `latencyMs` contributes 0 to the mean.
 * @param {{latencyMs?: number}[]} trials
 * @returns {number}
 */
function getMeanLatency(trials) {
  if (!trials?.length) return 0;
  const latencies = trials.map((t) => t.latencyMs ?? 0);
  return latencies.reduce((s, l) => s + l, 0) / latencies.length;
}
@@ -0,0 +1,104 @@
1
+ // Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
2
+ // MIT License — see LICENSE
3
+
4
+ import chalk from 'chalk';
5
+
6
/**
 * Render a run comparison as a multi-line text report.
 *
 * The report has a header naming both runs, an overall pass-rate line, case
 * counts, and one section each for regressions and improvements (a section is
 * omitted when it has no cases). With `verbose`, unchanged cases are listed too.
 * With `noColor`, a pass-through stand-in replaces chalk so no styling is applied.
 *
 * @param {import('./types.js').RunComparison} comparison
 * @param {{noColor?: boolean, verbose?: boolean}} [options]
 * @returns {string}
 */
export function formatComparisonReport(comparison, options = {}) {
  const { noColor = false, verbose = false } = options;
  const paint = noColor ? makeNoColor() : chalk;
  const { base, compare } = comparison;

  const asPercent = (rate, digits) => (rate * 100).toFixed(digits);
  const casesWithStatus = (status) => comparison.cases.filter((entry) => entry.status === status);

  // Header
  const out = [
    paint.bold('\n=== Eval Run Comparison ==='),
    `Base: ${base.modelName} (${base.runId})`,
    `Compare: ${compare.modelName} (${compare.runId})`,
    '',
  ];

  // Summary stats
  const delta = compare.passRate - base.passRate;
  const deltaText = `${asPercent(delta, 1)}%`;
  const deltaLabel = delta >= 0 ? paint.green(`+${deltaText}`) : paint.red(deltaText);
  out.push(`Pass rate: ${asPercent(base.passRate, 1)}% → ${asPercent(compare.passRate, 1)}% (${deltaLabel})`);

  const regressionLabel = comparison.regressions > 0
    ? paint.red(`${comparison.regressions} regressions`)
    : '0 regressions';
  const improvementLabel = comparison.improvements > 0
    ? paint.green(`${comparison.improvements} improvements`)
    : '0 improvements';
  out.push(`Cases: ${regressionLabel}, ${improvementLabel}, ${comparison.unchanged} unchanged`);

  if (comparison.added > 0) out.push(` + ${comparison.added} new cases added`);
  if (comparison.removed > 0) out.push(` - ${comparison.removed} cases removed`);
  out.push('');

  // Regressions and improvements share one layout: heading, one line per case, blank line.
  const pushChangedSection = (status, makeHeading, makeMarker) => {
    const hits = casesWithStatus(status);
    if (hits.length === 0) return;
    out.push(makeHeading());
    for (const entry of hits) {
      const before = asPercent(entry.basePassRate, 0);
      const after = asPercent(entry.comparePassRate, 0);
      out.push(` ${makeMarker()} ${entry.caseId}: ${before}% → ${after}%`);
    }
    out.push('');
  };

  pushChangedSection('regression', () => paint.red.bold('Regressions:'), () => paint.red('✗'));
  pushChangedSection('improvement', () => paint.green.bold('Improvements:'), () => paint.green('✓'));

  // Verbose: show all unchanged cases too
  if (verbose) {
    const steady = casesWithStatus('unchanged');
    if (steady.length > 0) {
      out.push(paint.gray('Unchanged:'));
      for (const entry of steady) {
        out.push(` ${paint.gray('·')} ${entry.caseId}: ${asPercent(entry.comparePassRate, 0)}%`);
      }
      out.push('');
    }
  }

  return out.join('\n');
}
77
+
78
/**
 * Build a stand-in for chalk that applies no styling.
 *
 * The result is a callable Proxy: calling it returns its first argument
 * (or `''` when that is null/undefined), and reading any property yields
 * another such proxy, so chains like `c.red.bold('x')` work to any depth.
 * Tagged-template calls are reassembled with `String.raw`.
 *
 * @returns {typeof chalk}
 */
function makeNoColor() {
  const passthrough = (first, ...substitutions) => {
    // Tagged template literal call: the first argument is a TemplateStringsArray.
    const isTemplateCall = Array.isArray(first) && first.raw;
    if (isTemplateCall) {
      return String.raw({ raw: first.raw }, ...substitutions);
    }
    return first ?? '';
  };

  const handler = {
    // Any property access produces a fresh pass-through proxy (unlimited chaining).
    get: () => new Proxy(passthrough, handler),
    apply: (_target, _thisArg, callArgs) => passthrough(...callArgs),
  };

  return new Proxy(passthrough, handler);
}
@@ -0,0 +1,6 @@
1
+ // Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
2
+ // MIT License — see LICENSE
3
+
4
+ export { wilsonInterval, computeTrialStats, computeAllTrialStats } from './statistics.js';
5
+ export { compareRuns } from './compare.js';
6
+ export { formatComparisonReport } from './format.js';
@@ -0,0 +1,59 @@
1
+ // Adapted from agent-eval-kit by FlanaganSe (https://github.com/FlanaganSe/agent-eval-kit)
2
+ // MIT License — see LICENSE
3
+
4
/**
 * Wilson score confidence interval for a binomial proportion.
 * Returns all zeros when there are no trials; the bounds are clamped to [0, 1].
 *
 * @param {number} passes - number of successes
 * @param {number} total - total trials
 * @param {number} [z=1.96] - z-score (1.96 = 95% CI)
 * @returns {{lower: number, upper: number, center: number}}
 */
export function wilsonInterval(passes, total, z = 1.96) {
  if (total === 0) {
    return { lower: 0, upper: 0, center: 0 };
  }

  const n = total;
  const observedRate = passes / total;
  const zSquared = z * z;
  const denominator = 1 + zSquared / n;

  const center = (observedRate + zSquared / (2 * n)) / denominator;
  const spread = Math.sqrt(observedRate * (1 - observedRate) / n + zSquared / (4 * n * n));
  const margin = (z / denominator) * spread;

  const lower = Math.max(0, center - margin);
  const upper = Math.min(1, center + margin);
  return { lower, upper, center };
}
24
+
25
/**
 * Summarize a list of trial results: pass rate with its 95% Wilson bounds,
 * mean latency, and a p95 latency picked from the sorted latencies at index
 * `floor((n - 1) * 0.95)`. A trial without `latencyMs` counts as 0.
 * An empty list yields all zeros.
 *
 * @param {{pass: boolean, latencyMs?: number}[]} trials
 * @returns {{passRate: number, lower95: number, upper95: number, meanLatencyMs: number, p95LatencyMs: number}}
 */
export function computeTrialStats(trials) {
  const count = trials.length;
  if (count === 0) {
    return { passRate: 0, lower95: 0, upper95: 0, meanLatencyMs: 0, p95LatencyMs: 0 };
  }

  let passes = 0;
  for (const trial of trials) {
    if (trial.pass) passes += 1;
  }
  const { lower, upper } = wilsonInterval(passes, count);

  const sortedLatencies = trials.map((trial) => trial.latencyMs ?? 0).sort((a, b) => a - b);
  let latencySum = 0;
  for (const value of sortedLatencies) {
    latencySum += value;
  }

  const rank = Math.min(Math.floor((count - 1) * 0.95), count - 1);

  return {
    passRate: passes / count,
    lower95: lower,
    upper95: upper,
    meanLatencyMs: latencySum / count,
    p95LatencyMs: sortedLatencies[rank] ?? 0,
  };
}
49
+
50
/**
 * Apply `computeTrialStats` to every case of a run, keyed by case id.
 *
 * @param {Object.<string, {pass: boolean, latencyMs?: number}[]>} allTrials
 * @returns {Object.<string, ReturnType<typeof computeTrialStats>>}
 */
export function computeAllTrialStats(allTrials) {
  const pairs = [];
  for (const caseId of Object.keys(allTrials)) {
    pairs.push([caseId, computeTrialStats(allTrials[caseId])]);
  }
  return Object.fromEntries(pairs);
}
+ }