@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -10
- package/dist/cli/index.js +34 -11
- package/dist/fixtures/mcp.d.ts +6 -6
- package/dist/fixtures/mcp.js +5 -5
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +79 -45
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +215 -1168
- package/dist/index.d.ts +215 -1168
- package/dist/index.js +79 -45
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +107 -7
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +9 -6
- package/src/reporters/ui-dist/app.js +0 -174
- package/src/reporters/ui-dist/index.html +0 -28
- package/src/reporters/ui-dist/styles.css +0 -1
package/dist/index.cjs
CHANGED
|
@@ -3127,7 +3127,7 @@ var init_dist3 = __esm({
|
|
|
3127
3127
|
}
|
|
3128
3128
|
});
|
|
3129
3129
|
var MCPHostCapabilitiesSchema = zod.z.object({
|
|
3130
|
-
sampling: zod.z.record(zod.z.unknown()).optional(),
|
|
3130
|
+
sampling: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
|
|
3131
3131
|
roots: zod.z.object({
|
|
3132
3132
|
listChanged: zod.z.boolean()
|
|
3133
3133
|
}).optional()
|
|
@@ -3186,7 +3186,7 @@ var HttpConfigSchema = zod.z.object({
|
|
|
3186
3186
|
}
|
|
3187
3187
|
return true;
|
|
3188
3188
|
}),
|
|
3189
|
-
headers: zod.z.record(zod.z.string()).optional(),
|
|
3189
|
+
headers: zod.z.record(zod.z.string(), zod.z.string()).optional(),
|
|
3190
3190
|
capabilities: MCPHostCapabilitiesSchema.optional(),
|
|
3191
3191
|
connectTimeoutMs: zod.z.number().positive().optional(),
|
|
3192
3192
|
requestTimeoutMs: zod.z.number().positive().optional(),
|
|
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
|
|
|
4407
4407
|
|
|
4408
4408
|
// package.json
|
|
4409
4409
|
var package_default = {
|
|
4410
|
-
version: "1.0.0-beta.3"};
|
|
4410
|
+
version: "1.0.0-beta.5"};
|
|
4411
4411
|
|
|
4412
4412
|
// src/mcp/clientFactory.ts
|
|
4413
4413
|
function getRetryAfterDelayMs(err) {
|
|
@@ -5151,7 +5151,7 @@ function validateToolCalls(response, expectation) {
|
|
|
5151
5151
|
if (!isSimulationResult(response)) {
|
|
5152
5152
|
return {
|
|
5153
5153
|
pass: false,
|
|
5154
|
-
message: "toolsTriggered expectation requires
|
|
5154
|
+
message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
|
|
5155
5155
|
};
|
|
5156
5156
|
}
|
|
5157
5157
|
const actual = response.toolCalls;
|
|
@@ -5211,7 +5211,7 @@ function validateToolCallCount(response, options) {
|
|
|
5211
5211
|
if (!isSimulationResult(response)) {
|
|
5212
5212
|
return {
|
|
5213
5213
|
pass: false,
|
|
5214
|
-
message: "toolCallCount expectation requires
|
|
5214
|
+
message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
|
|
5215
5215
|
};
|
|
5216
5216
|
}
|
|
5217
5217
|
const count = response.toolCalls.length;
|
|
@@ -6333,7 +6333,7 @@ function getAuthConfigFromEnv() {
|
|
|
6333
6333
|
}
|
|
6334
6334
|
return void 0;
|
|
6335
6335
|
}
|
|
6336
|
-
var
|
|
6336
|
+
var MCPHostConfigSchema = zod.z.object({
|
|
6337
6337
|
provider: zod.z.enum([
|
|
6338
6338
|
"openai",
|
|
6339
6339
|
"anthropic",
|
|
@@ -6402,7 +6402,7 @@ var EvalExpectBlockSchema = zod.z.object({
|
|
|
6402
6402
|
calls: zod.z.array(
|
|
6403
6403
|
zod.z.object({
|
|
6404
6404
|
name: zod.z.string(),
|
|
6405
|
-
arguments: zod.z.record(zod.z.unknown()).optional(),
|
|
6405
|
+
arguments: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
|
|
6406
6406
|
required: zod.z.boolean().optional()
|
|
6407
6407
|
})
|
|
6408
6408
|
),
|
|
@@ -6418,12 +6418,12 @@ var EvalExpectBlockSchema = zod.z.object({
|
|
|
6418
6418
|
var EvalCaseSchema = zod.z.object({
|
|
6419
6419
|
id: zod.z.string().min(1, "id must not be empty"),
|
|
6420
6420
|
description: zod.z.string().optional(),
|
|
6421
|
-
mode: zod.z.enum(["direct", "
|
|
6421
|
+
mode: zod.z.enum(["direct", "mcp_host"]).optional(),
|
|
6422
6422
|
toolName: zod.z.string().min(1, "toolName must not be empty").optional(),
|
|
6423
|
-
args: zod.z.record(zod.z.unknown()).optional(),
|
|
6423
|
+
args: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
|
|
6424
6424
|
scenario: zod.z.string().optional(),
|
|
6425
|
-
|
|
6426
|
-
metadata: zod.z.record(zod.z.unknown()).optional(),
|
|
6425
|
+
mcpHostConfig: MCPHostConfigSchema.optional(),
|
|
6426
|
+
metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
|
|
6427
6427
|
iterations: zod.z.number().int().min(1).optional(),
|
|
6428
6428
|
accuracyThreshold: zod.z.number().min(0).max(1).optional(),
|
|
6429
6429
|
judgeReps: zod.z.number().int().min(1).optional(),
|
|
@@ -6435,7 +6435,7 @@ var EvalDatasetSchema = zod.z.object({
|
|
|
6435
6435
|
name: zod.z.string().min(1, "name must not be empty"),
|
|
6436
6436
|
description: zod.z.string().optional(),
|
|
6437
6437
|
cases: zod.z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
|
|
6438
|
-
metadata: zod.z.record(zod.z.unknown()).optional()
|
|
6438
|
+
metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional()
|
|
6439
6439
|
});
|
|
6440
6440
|
function validateEvalCase(evalCase) {
|
|
6441
6441
|
return EvalCaseSchema.parse(evalCase);
|
|
@@ -6473,30 +6473,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
|
|
|
6473
6473
|
return dataset;
|
|
6474
6474
|
}
|
|
6475
6475
|
|
|
6476
|
-
// src/evals/
|
|
6476
|
+
// src/evals/mcpHost/adapters/vercel.ts
|
|
6477
6477
|
function enrichErrorMessage(err, provider) {
|
|
6478
6478
|
const raw = err instanceof Error ? err.message : String(err);
|
|
6479
6479
|
if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
|
|
6480
|
-
return `
|
|
6481
|
-
Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/
|
|
6480
|
+
return `MCP host simulation failed: required package not installed.
|
|
6481
|
+
Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
|
|
6482
6482
|
}
|
|
6483
6483
|
if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
|
|
6484
|
-
return `
|
|
6484
|
+
return `MCP host simulation failed: authentication error.
|
|
6485
6485
|
Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
|
|
6486
6486
|
}
|
|
6487
6487
|
if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
|
|
6488
|
-
return `
|
|
6488
|
+
return `MCP host simulation failed: model not found.
|
|
6489
6489
|
Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
|
|
6490
6490
|
}
|
|
6491
6491
|
if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
|
|
6492
|
-
return `
|
|
6492
|
+
return `MCP host simulation failed: network error.
|
|
6493
6493
|
Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
|
|
6494
6494
|
}
|
|
6495
6495
|
if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
|
|
6496
|
-
return `
|
|
6496
|
+
return `MCP host simulation failed: rate limited.
|
|
6497
6497
|
Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
|
|
6498
6498
|
}
|
|
6499
|
-
return `
|
|
6499
|
+
return `MCP host simulation failed: ${raw}`;
|
|
6500
6500
|
}
|
|
6501
6501
|
async function loadModel(provider, model) {
|
|
6502
6502
|
switch (provider) {
|
|
@@ -6626,7 +6626,7 @@ function createVercelOrchestrator() {
|
|
|
6626
6626
|
};
|
|
6627
6627
|
}
|
|
6628
6628
|
|
|
6629
|
-
// src/evals/
|
|
6629
|
+
// src/evals/mcpHost/mcpHostSimulation.ts
|
|
6630
6630
|
var vercelOrchestrator = createVercelOrchestrator();
|
|
6631
6631
|
var allProviders = [
|
|
6632
6632
|
"openai",
|
|
@@ -6642,7 +6642,7 @@ var allProviders = [
|
|
|
6642
6642
|
var simulatorRegistry = new Map(
|
|
6643
6643
|
allProviders.map((p) => [p, vercelOrchestrator])
|
|
6644
6644
|
);
|
|
6645
|
-
async function
|
|
6645
|
+
async function simulateMCPHost(mcp, scenario, config) {
|
|
6646
6646
|
const simulator = simulatorRegistry.get(config.provider);
|
|
6647
6647
|
if (!simulator) {
|
|
6648
6648
|
throw new Error(
|
|
@@ -6664,7 +6664,7 @@ function getMissingDependencyMessage(provider) {
|
|
|
6664
6664
|
deepseek: "npm install ai @ai-sdk/deepseek",
|
|
6665
6665
|
openrouter: "npm install ai @openrouter/ai-sdk-provider",
|
|
6666
6666
|
xai: "npm install ai @ai-sdk/xai",
|
|
6667
|
-
"vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/
|
|
6667
|
+
"vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
|
|
6668
6668
|
};
|
|
6669
6669
|
const pkg = packageMap[provider];
|
|
6670
6670
|
return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
|
|
@@ -6707,24 +6707,24 @@ async function execFileNoThrow(file, args) {
|
|
|
6707
6707
|
async function executeToolCall(evalCase, mcp) {
|
|
6708
6708
|
const mode = evalCase.mode || "direct";
|
|
6709
6709
|
try {
|
|
6710
|
-
if (mode === "
|
|
6710
|
+
if (mode === "mcp_host") {
|
|
6711
6711
|
if (!evalCase.scenario) {
|
|
6712
6712
|
throw new Error(
|
|
6713
|
-
`Eval case ${evalCase.id}: scenario is required for
|
|
6713
|
+
`Eval case ${evalCase.id}: scenario is required for mcp_host mode`
|
|
6714
6714
|
);
|
|
6715
6715
|
}
|
|
6716
|
-
if (!evalCase.
|
|
6716
|
+
if (!evalCase.mcpHostConfig) {
|
|
6717
6717
|
throw new Error(
|
|
6718
|
-
`Eval case ${evalCase.id}:
|
|
6718
|
+
`Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
|
|
6719
6719
|
);
|
|
6720
6720
|
}
|
|
6721
|
-
const simulationResult = await
|
|
6721
|
+
const simulationResult = await simulateMCPHost(
|
|
6722
6722
|
mcp,
|
|
6723
6723
|
evalCase.scenario,
|
|
6724
|
-
evalCase.
|
|
6724
|
+
evalCase.mcpHostConfig
|
|
6725
6725
|
);
|
|
6726
6726
|
if (!simulationResult.success) {
|
|
6727
|
-
throw new Error(simulationResult.error || "
|
|
6727
|
+
throw new Error(simulationResult.error || "MCP host simulation failed");
|
|
6728
6728
|
}
|
|
6729
6729
|
return { response: simulationResult };
|
|
6730
6730
|
} else {
|
|
@@ -6866,12 +6866,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6866
6866
|
}
|
|
6867
6867
|
return { expectations: results, toolPrecision, toolRecall };
|
|
6868
6868
|
}
|
|
6869
|
+
function isMCPHostSimulationResult(value) {
|
|
6870
|
+
return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
|
|
6871
|
+
}
|
|
6869
6872
|
async function runSingleIteration(evalCase, context, options) {
|
|
6870
6873
|
const startTime = Date.now();
|
|
6871
6874
|
const { response, error } = await executeToolCall(evalCase, context.mcp);
|
|
6872
6875
|
let expectationResults = {};
|
|
6873
6876
|
let toolPrecision;
|
|
6874
6877
|
let toolRecall;
|
|
6878
|
+
let mcpHostTrace;
|
|
6875
6879
|
if (!error && evalCase.expect) {
|
|
6876
6880
|
const {
|
|
6877
6881
|
expectations,
|
|
@@ -6886,11 +6890,28 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6886
6890
|
expectationResults = expectations;
|
|
6887
6891
|
toolPrecision = tp;
|
|
6888
6892
|
toolRecall = tr;
|
|
6893
|
+
if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
|
|
6894
|
+
const expectedNames = new Set(
|
|
6895
|
+
evalCase.expect.toolsTriggered.calls.map((c) => c.name)
|
|
6896
|
+
);
|
|
6897
|
+
const requiredNames = new Set(
|
|
6898
|
+
evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
|
|
6899
|
+
);
|
|
6900
|
+
const calledNames = new Set(response.toolCalls.map((c) => c.name));
|
|
6901
|
+
mcpHostTrace = {
|
|
6902
|
+
calls: response.toolCalls.map((call) => ({
|
|
6903
|
+
name: call.name,
|
|
6904
|
+
arguments: call.arguments,
|
|
6905
|
+
status: expectedNames.has(call.name) ? "expected" : "unexpected"
|
|
6906
|
+
})),
|
|
6907
|
+
missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
|
|
6908
|
+
};
|
|
6909
|
+
}
|
|
6889
6910
|
}
|
|
6890
6911
|
return {
|
|
6891
6912
|
id: evalCase.id,
|
|
6892
6913
|
datasetName: options.datasetName ?? "single-case",
|
|
6893
|
-
toolName: evalCase.
|
|
6914
|
+
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6894
6915
|
source: "eval",
|
|
6895
6916
|
pass: didCasePass(error, expectationResults),
|
|
6896
6917
|
response,
|
|
@@ -6901,7 +6922,8 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6901
6922
|
durationMs: Date.now() - startTime,
|
|
6902
6923
|
tags: evalCase.tags,
|
|
6903
6924
|
toolPrecision,
|
|
6904
|
-
toolRecall
|
|
6925
|
+
toolRecall,
|
|
6926
|
+
mcpHostTrace
|
|
6905
6927
|
};
|
|
6906
6928
|
}
|
|
6907
6929
|
function isInfrastructureError(err) {
|
|
@@ -6958,7 +6980,7 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6958
6980
|
const baseResult = lastResult ?? {
|
|
6959
6981
|
id: evalCase.id,
|
|
6960
6982
|
datasetName: options.datasetName ?? "single-case",
|
|
6961
|
-
toolName: evalCase.
|
|
6983
|
+
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6962
6984
|
source: "eval",
|
|
6963
6985
|
pass: false,
|
|
6964
6986
|
error: iterationResults[0]?.error,
|
|
@@ -6972,12 +6994,25 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6972
6994
|
...baseResult,
|
|
6973
6995
|
pass: assertionPassRate >= threshold,
|
|
6974
6996
|
assertionPassRate,
|
|
6997
|
+
assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
|
|
6975
6998
|
infrastructureErrorRate,
|
|
6976
6999
|
iterationResults,
|
|
6977
7000
|
infrastructureErrorCount: infraErrors.length,
|
|
6978
7001
|
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
6979
7002
|
};
|
|
6980
7003
|
}
|
|
7004
|
+
function wilsonCI(k, n) {
|
|
7005
|
+
if (n < 2) return void 0;
|
|
7006
|
+
const z5 = 1.96;
|
|
7007
|
+
const z22 = z5 * z5;
|
|
7008
|
+
const \u00F1 = n + z22;
|
|
7009
|
+
const p\u0303 = (k + z22 / 2) / \u00F1;
|
|
7010
|
+
const margin = z5 * Math.sqrt(p\u0303 * (1 - p\u0303) / \u00F1);
|
|
7011
|
+
return {
|
|
7012
|
+
lower: Math.max(0, p\u0303 - margin),
|
|
7013
|
+
upper: Math.min(1, p\u0303 + margin)
|
|
7014
|
+
};
|
|
7015
|
+
}
|
|
6981
7016
|
async function runWithConcurrency(tasks, limit) {
|
|
6982
7017
|
const results = new Array(tasks.length);
|
|
6983
7018
|
let index = 0;
|
|
@@ -7007,7 +7042,7 @@ async function runEvalDataset(options, context) {
|
|
|
7007
7042
|
filterTags,
|
|
7008
7043
|
saveResultsTo,
|
|
7009
7044
|
baselineResultsFrom,
|
|
7010
|
-
|
|
7045
|
+
mcpHostModel,
|
|
7011
7046
|
judgeModel
|
|
7012
7047
|
} = options;
|
|
7013
7048
|
const startTime = Date.now();
|
|
@@ -7017,7 +7052,7 @@ async function runEvalDataset(options, context) {
|
|
|
7017
7052
|
};
|
|
7018
7053
|
const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
|
|
7019
7054
|
const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
|
|
7020
|
-
const effectiveIterations = c.mode === "
|
|
7055
|
+
const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
|
|
7021
7056
|
const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
|
|
7022
7057
|
return sum + effectiveIterations * judgeReps;
|
|
7023
7058
|
}, 0);
|
|
@@ -7027,12 +7062,12 @@ async function runEvalDataset(options, context) {
|
|
|
7027
7062
|
);
|
|
7028
7063
|
}
|
|
7029
7064
|
const tasks = casesToRun.map((evalCase) => async () => {
|
|
7030
|
-
const withIterations = evalCase.mode === "
|
|
7031
|
-
if (evalCase.mode === "
|
|
7065
|
+
const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
|
|
7066
|
+
if (evalCase.mode === "mcp_host") {
|
|
7032
7067
|
const effectiveIterations = withIterations.iterations ?? 1;
|
|
7033
7068
|
if (effectiveIterations > 1 && effectiveIterations < 10) {
|
|
7034
7069
|
console.warn(
|
|
7035
|
-
`[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in
|
|
7070
|
+
`[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
|
|
7036
7071
|
);
|
|
7037
7072
|
}
|
|
7038
7073
|
}
|
|
@@ -7064,7 +7099,7 @@ async function runEvalDataset(options, context) {
|
|
|
7064
7099
|
gitHash,
|
|
7065
7100
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7066
7101
|
packageVersion: package_default.version,
|
|
7067
|
-
...
|
|
7102
|
+
...mcpHostModel !== void 0 && { mcpHostModel },
|
|
7068
7103
|
...judgeModel !== void 0 && { judgeModel }
|
|
7069
7104
|
};
|
|
7070
7105
|
const result = {
|
|
@@ -7109,12 +7144,12 @@ async function runEvalDataset(options, context) {
|
|
|
7109
7144
|
);
|
|
7110
7145
|
}
|
|
7111
7146
|
}
|
|
7112
|
-
const
|
|
7147
|
+
const mcpHostCases = caseResults.filter(
|
|
7113
7148
|
(r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
|
|
7114
7149
|
);
|
|
7115
|
-
if (
|
|
7116
|
-
const avgPrec =
|
|
7117
|
-
const avgRecall =
|
|
7150
|
+
if (mcpHostCases.length > 0) {
|
|
7151
|
+
const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
|
|
7152
|
+
const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
|
|
7118
7153
|
result.datasetToolPrecision = avgPrec;
|
|
7119
7154
|
result.datasetToolRecall = avgRecall;
|
|
7120
7155
|
result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
|
|
@@ -7182,7 +7217,6 @@ async function runServerComparison(options, contextA, contextB) {
|
|
|
7182
7217
|
bWins,
|
|
7183
7218
|
ties,
|
|
7184
7219
|
bothFail,
|
|
7185
|
-
bothFailCount: bothFail,
|
|
7186
7220
|
decidedCases,
|
|
7187
7221
|
failureAlignment: total > 0 ? bothFail / total : 0,
|
|
7188
7222
|
aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
|
|
@@ -7411,7 +7445,7 @@ exports.runEvalCase = runEvalCase;
|
|
|
7411
7445
|
exports.runEvalDataset = runEvalDataset;
|
|
7412
7446
|
exports.runServerComparison = runServerComparison;
|
|
7413
7447
|
exports.saveBaseline = saveBaseline;
|
|
7414
|
-
exports.
|
|
7448
|
+
exports.simulateMCPHost = simulateMCPHost;
|
|
7415
7449
|
exports.test = test;
|
|
7416
7450
|
exports.validateAccessToken = validateAccessToken;
|
|
7417
7451
|
exports.validateError = validateError;
|