@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -10
- package/dist/cli/index.js +34 -11
- package/dist/fixtures/mcp.d.ts +6 -6
- package/dist/fixtures/mcp.js +5 -5
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +64 -43
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +199 -1168
- package/dist/index.d.ts +199 -1168
- package/dist/index.js +64 -43
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +107 -7
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +9 -6
- package/src/reporters/ui-dist/app.js +0 -174
- package/src/reporters/ui-dist/index.html +0 -28
- package/src/reporters/ui-dist/styles.css +0 -1
package/dist/index.cjs
CHANGED
|
@@ -3127,7 +3127,7 @@ var init_dist3 = __esm({
|
|
|
3127
3127
|
}
|
|
3128
3128
|
});
|
|
3129
3129
|
var MCPHostCapabilitiesSchema = zod.z.object({
|
|
3130
|
-
sampling: zod.z.record(zod.z.unknown()).optional(),
|
|
3130
|
+
sampling: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
|
|
3131
3131
|
roots: zod.z.object({
|
|
3132
3132
|
listChanged: zod.z.boolean()
|
|
3133
3133
|
}).optional()
|
|
@@ -3186,7 +3186,7 @@ var HttpConfigSchema = zod.z.object({
|
|
|
3186
3186
|
}
|
|
3187
3187
|
return true;
|
|
3188
3188
|
}),
|
|
3189
|
-
headers: zod.z.record(zod.z.string()).optional(),
|
|
3189
|
+
headers: zod.z.record(zod.z.string(), zod.z.string()).optional(),
|
|
3190
3190
|
capabilities: MCPHostCapabilitiesSchema.optional(),
|
|
3191
3191
|
connectTimeoutMs: zod.z.number().positive().optional(),
|
|
3192
3192
|
requestTimeoutMs: zod.z.number().positive().optional(),
|
|
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
|
|
|
4407
4407
|
|
|
4408
4408
|
// package.json
|
|
4409
4409
|
var package_default = {
|
|
4410
|
-
version: "1.0.0-beta.3"};
|
|
4410
|
+
version: "1.0.0-beta.4"};
|
|
4411
4411
|
|
|
4412
4412
|
// src/mcp/clientFactory.ts
|
|
4413
4413
|
function getRetryAfterDelayMs(err) {
|
|
@@ -5151,7 +5151,7 @@ function validateToolCalls(response, expectation) {
|
|
|
5151
5151
|
if (!isSimulationResult(response)) {
|
|
5152
5152
|
return {
|
|
5153
5153
|
pass: false,
|
|
5154
|
-
message: "toolsTriggered expectation requires
|
|
5154
|
+
message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
|
|
5155
5155
|
};
|
|
5156
5156
|
}
|
|
5157
5157
|
const actual = response.toolCalls;
|
|
@@ -5211,7 +5211,7 @@ function validateToolCallCount(response, options) {
|
|
|
5211
5211
|
if (!isSimulationResult(response)) {
|
|
5212
5212
|
return {
|
|
5213
5213
|
pass: false,
|
|
5214
|
-
message: "toolCallCount expectation requires
|
|
5214
|
+
message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
|
|
5215
5215
|
};
|
|
5216
5216
|
}
|
|
5217
5217
|
const count = response.toolCalls.length;
|
|
@@ -6333,7 +6333,7 @@ function getAuthConfigFromEnv() {
|
|
|
6333
6333
|
}
|
|
6334
6334
|
return void 0;
|
|
6335
6335
|
}
|
|
6336
|
-
var
|
|
6336
|
+
var MCPHostConfigSchema = zod.z.object({
|
|
6337
6337
|
provider: zod.z.enum([
|
|
6338
6338
|
"openai",
|
|
6339
6339
|
"anthropic",
|
|
@@ -6402,7 +6402,7 @@ var EvalExpectBlockSchema = zod.z.object({
|
|
|
6402
6402
|
calls: zod.z.array(
|
|
6403
6403
|
zod.z.object({
|
|
6404
6404
|
name: zod.z.string(),
|
|
6405
|
-
arguments: zod.z.record(zod.z.unknown()).optional(),
|
|
6405
|
+
arguments: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
|
|
6406
6406
|
required: zod.z.boolean().optional()
|
|
6407
6407
|
})
|
|
6408
6408
|
),
|
|
@@ -6418,12 +6418,12 @@ var EvalExpectBlockSchema = zod.z.object({
|
|
|
6418
6418
|
var EvalCaseSchema = zod.z.object({
|
|
6419
6419
|
id: zod.z.string().min(1, "id must not be empty"),
|
|
6420
6420
|
description: zod.z.string().optional(),
|
|
6421
|
-
mode: zod.z.enum(["direct", "
|
|
6421
|
+
mode: zod.z.enum(["direct", "mcp_host"]).optional(),
|
|
6422
6422
|
toolName: zod.z.string().min(1, "toolName must not be empty").optional(),
|
|
6423
|
-
args: zod.z.record(zod.z.unknown()).optional(),
|
|
6423
|
+
args: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
|
|
6424
6424
|
scenario: zod.z.string().optional(),
|
|
6425
|
-
|
|
6426
|
-
metadata: zod.z.record(zod.z.unknown()).optional(),
|
|
6425
|
+
mcpHostConfig: MCPHostConfigSchema.optional(),
|
|
6426
|
+
metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
|
|
6427
6427
|
iterations: zod.z.number().int().min(1).optional(),
|
|
6428
6428
|
accuracyThreshold: zod.z.number().min(0).max(1).optional(),
|
|
6429
6429
|
judgeReps: zod.z.number().int().min(1).optional(),
|
|
@@ -6435,7 +6435,7 @@ var EvalDatasetSchema = zod.z.object({
|
|
|
6435
6435
|
name: zod.z.string().min(1, "name must not be empty"),
|
|
6436
6436
|
description: zod.z.string().optional(),
|
|
6437
6437
|
cases: zod.z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
|
|
6438
|
-
metadata: zod.z.record(zod.z.unknown()).optional()
|
|
6438
|
+
metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional()
|
|
6439
6439
|
});
|
|
6440
6440
|
function validateEvalCase(evalCase) {
|
|
6441
6441
|
return EvalCaseSchema.parse(evalCase);
|
|
@@ -6473,30 +6473,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
|
|
|
6473
6473
|
return dataset;
|
|
6474
6474
|
}
|
|
6475
6475
|
|
|
6476
|
-
// src/evals/
|
|
6476
|
+
// src/evals/mcpHost/adapters/vercel.ts
|
|
6477
6477
|
function enrichErrorMessage(err, provider) {
|
|
6478
6478
|
const raw = err instanceof Error ? err.message : String(err);
|
|
6479
6479
|
if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
|
|
6480
|
-
return `
|
|
6481
|
-
Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/
|
|
6480
|
+
return `MCP host simulation failed: required package not installed.
|
|
6481
|
+
Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
|
|
6482
6482
|
}
|
|
6483
6483
|
if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
|
|
6484
|
-
return `
|
|
6484
|
+
return `MCP host simulation failed: authentication error.
|
|
6485
6485
|
Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
|
|
6486
6486
|
}
|
|
6487
6487
|
if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
|
|
6488
|
-
return `
|
|
6488
|
+
return `MCP host simulation failed: model not found.
|
|
6489
6489
|
Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
|
|
6490
6490
|
}
|
|
6491
6491
|
if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
|
|
6492
|
-
return `
|
|
6492
|
+
return `MCP host simulation failed: network error.
|
|
6493
6493
|
Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
|
|
6494
6494
|
}
|
|
6495
6495
|
if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
|
|
6496
|
-
return `
|
|
6496
|
+
return `MCP host simulation failed: rate limited.
|
|
6497
6497
|
Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
|
|
6498
6498
|
}
|
|
6499
|
-
return `
|
|
6499
|
+
return `MCP host simulation failed: ${raw}`;
|
|
6500
6500
|
}
|
|
6501
6501
|
async function loadModel(provider, model) {
|
|
6502
6502
|
switch (provider) {
|
|
@@ -6626,7 +6626,7 @@ function createVercelOrchestrator() {
|
|
|
6626
6626
|
};
|
|
6627
6627
|
}
|
|
6628
6628
|
|
|
6629
|
-
// src/evals/
|
|
6629
|
+
// src/evals/mcpHost/mcpHostSimulation.ts
|
|
6630
6630
|
var vercelOrchestrator = createVercelOrchestrator();
|
|
6631
6631
|
var allProviders = [
|
|
6632
6632
|
"openai",
|
|
@@ -6642,7 +6642,7 @@ var allProviders = [
|
|
|
6642
6642
|
var simulatorRegistry = new Map(
|
|
6643
6643
|
allProviders.map((p) => [p, vercelOrchestrator])
|
|
6644
6644
|
);
|
|
6645
|
-
async function
|
|
6645
|
+
async function simulateMCPHost(mcp, scenario, config) {
|
|
6646
6646
|
const simulator = simulatorRegistry.get(config.provider);
|
|
6647
6647
|
if (!simulator) {
|
|
6648
6648
|
throw new Error(
|
|
@@ -6664,7 +6664,7 @@ function getMissingDependencyMessage(provider) {
|
|
|
6664
6664
|
deepseek: "npm install ai @ai-sdk/deepseek",
|
|
6665
6665
|
openrouter: "npm install ai @openrouter/ai-sdk-provider",
|
|
6666
6666
|
xai: "npm install ai @ai-sdk/xai",
|
|
6667
|
-
"vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/
|
|
6667
|
+
"vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
|
|
6668
6668
|
};
|
|
6669
6669
|
const pkg = packageMap[provider];
|
|
6670
6670
|
return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
|
|
@@ -6707,24 +6707,24 @@ async function execFileNoThrow(file, args) {
|
|
|
6707
6707
|
async function executeToolCall(evalCase, mcp) {
|
|
6708
6708
|
const mode = evalCase.mode || "direct";
|
|
6709
6709
|
try {
|
|
6710
|
-
if (mode === "
|
|
6710
|
+
if (mode === "mcp_host") {
|
|
6711
6711
|
if (!evalCase.scenario) {
|
|
6712
6712
|
throw new Error(
|
|
6713
|
-
`Eval case ${evalCase.id}: scenario is required for
|
|
6713
|
+
`Eval case ${evalCase.id}: scenario is required for mcp_host mode`
|
|
6714
6714
|
);
|
|
6715
6715
|
}
|
|
6716
|
-
if (!evalCase.
|
|
6716
|
+
if (!evalCase.mcpHostConfig) {
|
|
6717
6717
|
throw new Error(
|
|
6718
|
-
`Eval case ${evalCase.id}:
|
|
6718
|
+
`Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
|
|
6719
6719
|
);
|
|
6720
6720
|
}
|
|
6721
|
-
const simulationResult = await
|
|
6721
|
+
const simulationResult = await simulateMCPHost(
|
|
6722
6722
|
mcp,
|
|
6723
6723
|
evalCase.scenario,
|
|
6724
|
-
evalCase.
|
|
6724
|
+
evalCase.mcpHostConfig
|
|
6725
6725
|
);
|
|
6726
6726
|
if (!simulationResult.success) {
|
|
6727
|
-
throw new Error(simulationResult.error || "
|
|
6727
|
+
throw new Error(simulationResult.error || "MCP host simulation failed");
|
|
6728
6728
|
}
|
|
6729
6729
|
return { response: simulationResult };
|
|
6730
6730
|
} else {
|
|
@@ -6866,12 +6866,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
|
|
|
6866
6866
|
}
|
|
6867
6867
|
return { expectations: results, toolPrecision, toolRecall };
|
|
6868
6868
|
}
|
|
6869
|
+
function isMCPHostSimulationResult(value) {
|
|
6870
|
+
return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
|
|
6871
|
+
}
|
|
6869
6872
|
async function runSingleIteration(evalCase, context, options) {
|
|
6870
6873
|
const startTime = Date.now();
|
|
6871
6874
|
const { response, error } = await executeToolCall(evalCase, context.mcp);
|
|
6872
6875
|
let expectationResults = {};
|
|
6873
6876
|
let toolPrecision;
|
|
6874
6877
|
let toolRecall;
|
|
6878
|
+
let mcpHostTrace;
|
|
6875
6879
|
if (!error && evalCase.expect) {
|
|
6876
6880
|
const {
|
|
6877
6881
|
expectations,
|
|
@@ -6886,6 +6890,23 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6886
6890
|
expectationResults = expectations;
|
|
6887
6891
|
toolPrecision = tp;
|
|
6888
6892
|
toolRecall = tr;
|
|
6893
|
+
if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
|
|
6894
|
+
const expectedNames = new Set(
|
|
6895
|
+
evalCase.expect.toolsTriggered.calls.map((c) => c.name)
|
|
6896
|
+
);
|
|
6897
|
+
const requiredNames = new Set(
|
|
6898
|
+
evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
|
|
6899
|
+
);
|
|
6900
|
+
const calledNames = new Set(response.toolCalls.map((c) => c.name));
|
|
6901
|
+
mcpHostTrace = {
|
|
6902
|
+
calls: response.toolCalls.map((call) => ({
|
|
6903
|
+
name: call.name,
|
|
6904
|
+
arguments: call.arguments,
|
|
6905
|
+
status: expectedNames.has(call.name) ? "expected" : "unexpected"
|
|
6906
|
+
})),
|
|
6907
|
+
missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
|
|
6908
|
+
};
|
|
6909
|
+
}
|
|
6889
6910
|
}
|
|
6890
6911
|
return {
|
|
6891
6912
|
id: evalCase.id,
|
|
@@ -6901,7 +6922,8 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6901
6922
|
durationMs: Date.now() - startTime,
|
|
6902
6923
|
tags: evalCase.tags,
|
|
6903
6924
|
toolPrecision,
|
|
6904
|
-
toolRecall
|
|
6925
|
+
toolRecall,
|
|
6926
|
+
mcpHostTrace
|
|
6905
6927
|
};
|
|
6906
6928
|
}
|
|
6907
6929
|
function isInfrastructureError(err) {
|
|
@@ -7007,7 +7029,7 @@ async function runEvalDataset(options, context) {
|
|
|
7007
7029
|
filterTags,
|
|
7008
7030
|
saveResultsTo,
|
|
7009
7031
|
baselineResultsFrom,
|
|
7010
|
-
|
|
7032
|
+
mcpHostModel,
|
|
7011
7033
|
judgeModel
|
|
7012
7034
|
} = options;
|
|
7013
7035
|
const startTime = Date.now();
|
|
@@ -7017,7 +7039,7 @@ async function runEvalDataset(options, context) {
|
|
|
7017
7039
|
};
|
|
7018
7040
|
const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
|
|
7019
7041
|
const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
|
|
7020
|
-
const effectiveIterations = c.mode === "
|
|
7042
|
+
const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
|
|
7021
7043
|
const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
|
|
7022
7044
|
return sum + effectiveIterations * judgeReps;
|
|
7023
7045
|
}, 0);
|
|
@@ -7027,12 +7049,12 @@ async function runEvalDataset(options, context) {
|
|
|
7027
7049
|
);
|
|
7028
7050
|
}
|
|
7029
7051
|
const tasks = casesToRun.map((evalCase) => async () => {
|
|
7030
|
-
const withIterations = evalCase.mode === "
|
|
7031
|
-
if (evalCase.mode === "
|
|
7052
|
+
const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
|
|
7053
|
+
if (evalCase.mode === "mcp_host") {
|
|
7032
7054
|
const effectiveIterations = withIterations.iterations ?? 1;
|
|
7033
7055
|
if (effectiveIterations > 1 && effectiveIterations < 10) {
|
|
7034
7056
|
console.warn(
|
|
7035
|
-
`[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in
|
|
7057
|
+
`[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
|
|
7036
7058
|
);
|
|
7037
7059
|
}
|
|
7038
7060
|
}
|
|
@@ -7064,7 +7086,7 @@ async function runEvalDataset(options, context) {
|
|
|
7064
7086
|
gitHash,
|
|
7065
7087
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7066
7088
|
packageVersion: package_default.version,
|
|
7067
|
-
...
|
|
7089
|
+
...mcpHostModel !== void 0 && { mcpHostModel },
|
|
7068
7090
|
...judgeModel !== void 0 && { judgeModel }
|
|
7069
7091
|
};
|
|
7070
7092
|
const result = {
|
|
@@ -7109,12 +7131,12 @@ async function runEvalDataset(options, context) {
|
|
|
7109
7131
|
);
|
|
7110
7132
|
}
|
|
7111
7133
|
}
|
|
7112
|
-
const
|
|
7134
|
+
const mcpHostCases = caseResults.filter(
|
|
7113
7135
|
(r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
|
|
7114
7136
|
);
|
|
7115
|
-
if (
|
|
7116
|
-
const avgPrec =
|
|
7117
|
-
const avgRecall =
|
|
7137
|
+
if (mcpHostCases.length > 0) {
|
|
7138
|
+
const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
|
|
7139
|
+
const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
|
|
7118
7140
|
result.datasetToolPrecision = avgPrec;
|
|
7119
7141
|
result.datasetToolRecall = avgRecall;
|
|
7120
7142
|
result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
|
|
@@ -7182,7 +7204,6 @@ async function runServerComparison(options, contextA, contextB) {
|
|
|
7182
7204
|
bWins,
|
|
7183
7205
|
ties,
|
|
7184
7206
|
bothFail,
|
|
7185
|
-
bothFailCount: bothFail,
|
|
7186
7207
|
decidedCases,
|
|
7187
7208
|
failureAlignment: total > 0 ? bothFail / total : 0,
|
|
7188
7209
|
aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
|
|
@@ -7411,7 +7432,7 @@ exports.runEvalCase = runEvalCase;
|
|
|
7411
7432
|
exports.runEvalDataset = runEvalDataset;
|
|
7412
7433
|
exports.runServerComparison = runServerComparison;
|
|
7413
7434
|
exports.saveBaseline = saveBaseline;
|
|
7414
|
-
exports.
|
|
7435
|
+
exports.simulateMCPHost = simulateMCPHost;
|
|
7415
7436
|
exports.test = test;
|
|
7416
7437
|
exports.validateAccessToken = validateAccessToken;
|
|
7417
7438
|
exports.validateError = validateError;
|