@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -3127,7 +3127,7 @@ var init_dist3 = __esm({
3127
3127
  }
3128
3128
  });
3129
3129
  var MCPHostCapabilitiesSchema = zod.z.object({
3130
- sampling: zod.z.record(zod.z.unknown()).optional(),
3130
+ sampling: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
3131
3131
  roots: zod.z.object({
3132
3132
  listChanged: zod.z.boolean()
3133
3133
  }).optional()
@@ -3186,7 +3186,7 @@ var HttpConfigSchema = zod.z.object({
3186
3186
  }
3187
3187
  return true;
3188
3188
  }),
3189
- headers: zod.z.record(zod.z.string()).optional(),
3189
+ headers: zod.z.record(zod.z.string(), zod.z.string()).optional(),
3190
3190
  capabilities: MCPHostCapabilitiesSchema.optional(),
3191
3191
  connectTimeoutMs: zod.z.number().positive().optional(),
3192
3192
  requestTimeoutMs: zod.z.number().positive().optional(),
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
4407
4407
 
4408
4408
  // package.json
4409
4409
  var package_default = {
4410
- version: "1.0.0-beta.3"};
4410
+ version: "1.0.0-beta.5"};
4411
4411
 
4412
4412
  // src/mcp/clientFactory.ts
4413
4413
  function getRetryAfterDelayMs(err) {
@@ -5151,7 +5151,7 @@ function validateToolCalls(response, expectation) {
5151
5151
  if (!isSimulationResult(response)) {
5152
5152
  return {
5153
5153
  pass: false,
5154
- message: "toolsTriggered expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5154
+ message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5155
5155
  };
5156
5156
  }
5157
5157
  const actual = response.toolCalls;
@@ -5211,7 +5211,7 @@ function validateToolCallCount(response, options) {
5211
5211
  if (!isSimulationResult(response)) {
5212
5212
  return {
5213
5213
  pass: false,
5214
- message: "toolCallCount expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5214
+ message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5215
5215
  };
5216
5216
  }
5217
5217
  const count = response.toolCalls.length;
@@ -6333,7 +6333,7 @@ function getAuthConfigFromEnv() {
6333
6333
  }
6334
6334
  return void 0;
6335
6335
  }
6336
- var LLMHostConfigSchema = zod.z.object({
6336
+ var MCPHostConfigSchema = zod.z.object({
6337
6337
  provider: zod.z.enum([
6338
6338
  "openai",
6339
6339
  "anthropic",
@@ -6402,7 +6402,7 @@ var EvalExpectBlockSchema = zod.z.object({
6402
6402
  calls: zod.z.array(
6403
6403
  zod.z.object({
6404
6404
  name: zod.z.string(),
6405
- arguments: zod.z.record(zod.z.unknown()).optional(),
6405
+ arguments: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6406
6406
  required: zod.z.boolean().optional()
6407
6407
  })
6408
6408
  ),
@@ -6418,12 +6418,12 @@ var EvalExpectBlockSchema = zod.z.object({
6418
6418
  var EvalCaseSchema = zod.z.object({
6419
6419
  id: zod.z.string().min(1, "id must not be empty"),
6420
6420
  description: zod.z.string().optional(),
6421
- mode: zod.z.enum(["direct", "llm_host"]).optional(),
6421
+ mode: zod.z.enum(["direct", "mcp_host"]).optional(),
6422
6422
  toolName: zod.z.string().min(1, "toolName must not be empty").optional(),
6423
- args: zod.z.record(zod.z.unknown()).optional(),
6423
+ args: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6424
6424
  scenario: zod.z.string().optional(),
6425
- llmHostConfig: LLMHostConfigSchema.optional(),
6426
- metadata: zod.z.record(zod.z.unknown()).optional(),
6425
+ mcpHostConfig: MCPHostConfigSchema.optional(),
6426
+ metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6427
6427
  iterations: zod.z.number().int().min(1).optional(),
6428
6428
  accuracyThreshold: zod.z.number().min(0).max(1).optional(),
6429
6429
  judgeReps: zod.z.number().int().min(1).optional(),
@@ -6435,7 +6435,7 @@ var EvalDatasetSchema = zod.z.object({
6435
6435
  name: zod.z.string().min(1, "name must not be empty"),
6436
6436
  description: zod.z.string().optional(),
6437
6437
  cases: zod.z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
6438
- metadata: zod.z.record(zod.z.unknown()).optional()
6438
+ metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional()
6439
6439
  });
6440
6440
  function validateEvalCase(evalCase) {
6441
6441
  return EvalCaseSchema.parse(evalCase);
@@ -6473,30 +6473,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
6473
6473
  return dataset;
6474
6474
  }
6475
6475
 
6476
- // src/evals/llmHost/adapters/vercel.ts
6476
+ // src/evals/mcpHost/adapters/vercel.ts
6477
6477
  function enrichErrorMessage(err, provider) {
6478
6478
  const raw = err instanceof Error ? err.message : String(err);
6479
6479
  if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
6480
- return `LLM host simulation failed: required package not installed.
6481
- Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/llm-host.md for install instructions.`;
6480
+ return `MCP host simulation failed: required package not installed.
6481
+ Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
6482
6482
  }
6483
6483
  if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
6484
- return `LLM host simulation failed: authentication error.
6484
+ return `MCP host simulation failed: authentication error.
6485
6485
  Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
6486
6486
  }
6487
6487
  if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
6488
- return `LLM host simulation failed: model not found.
6488
+ return `MCP host simulation failed: model not found.
6489
6489
  Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
6490
6490
  }
6491
6491
  if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
6492
- return `LLM host simulation failed: network error.
6492
+ return `MCP host simulation failed: network error.
6493
6493
  Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
6494
6494
  }
6495
6495
  if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
6496
- return `LLM host simulation failed: rate limited.
6496
+ return `MCP host simulation failed: rate limited.
6497
6497
  Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
6498
6498
  }
6499
- return `LLM host simulation failed: ${raw}`;
6499
+ return `MCP host simulation failed: ${raw}`;
6500
6500
  }
6501
6501
  async function loadModel(provider, model) {
6502
6502
  switch (provider) {
@@ -6626,7 +6626,7 @@ function createVercelOrchestrator() {
6626
6626
  };
6627
6627
  }
6628
6628
 
6629
- // src/evals/llmHost/llmHostSimulation.ts
6629
+ // src/evals/mcpHost/mcpHostSimulation.ts
6630
6630
  var vercelOrchestrator = createVercelOrchestrator();
6631
6631
  var allProviders = [
6632
6632
  "openai",
@@ -6642,7 +6642,7 @@ var allProviders = [
6642
6642
  var simulatorRegistry = new Map(
6643
6643
  allProviders.map((p) => [p, vercelOrchestrator])
6644
6644
  );
6645
- async function simulateLLMHost(mcp, scenario, config) {
6645
+ async function simulateMCPHost(mcp, scenario, config) {
6646
6646
  const simulator = simulatorRegistry.get(config.provider);
6647
6647
  if (!simulator) {
6648
6648
  throw new Error(
@@ -6664,7 +6664,7 @@ function getMissingDependencyMessage(provider) {
6664
6664
  deepseek: "npm install ai @ai-sdk/deepseek",
6665
6665
  openrouter: "npm install ai @openrouter/ai-sdk-provider",
6666
6666
  xai: "npm install ai @ai-sdk/xai",
6667
- "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/llm-host.md)"
6667
+ "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
6668
6668
  };
6669
6669
  const pkg = packageMap[provider];
6670
6670
  return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
@@ -6707,24 +6707,24 @@ async function execFileNoThrow(file, args) {
6707
6707
  async function executeToolCall(evalCase, mcp) {
6708
6708
  const mode = evalCase.mode || "direct";
6709
6709
  try {
6710
- if (mode === "llm_host") {
6710
+ if (mode === "mcp_host") {
6711
6711
  if (!evalCase.scenario) {
6712
6712
  throw new Error(
6713
- `Eval case ${evalCase.id}: scenario is required for llm_host mode`
6713
+ `Eval case ${evalCase.id}: scenario is required for mcp_host mode`
6714
6714
  );
6715
6715
  }
6716
- if (!evalCase.llmHostConfig) {
6716
+ if (!evalCase.mcpHostConfig) {
6717
6717
  throw new Error(
6718
- `Eval case ${evalCase.id}: llmHostConfig is required for llm_host mode`
6718
+ `Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
6719
6719
  );
6720
6720
  }
6721
- const simulationResult = await simulateLLMHost(
6721
+ const simulationResult = await simulateMCPHost(
6722
6722
  mcp,
6723
6723
  evalCase.scenario,
6724
- evalCase.llmHostConfig
6724
+ evalCase.mcpHostConfig
6725
6725
  );
6726
6726
  if (!simulationResult.success) {
6727
- throw new Error(simulationResult.error || "LLM host simulation failed");
6727
+ throw new Error(simulationResult.error || "MCP host simulation failed");
6728
6728
  }
6729
6729
  return { response: simulationResult };
6730
6730
  } else {
@@ -6866,12 +6866,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6866
6866
  }
6867
6867
  return { expectations: results, toolPrecision, toolRecall };
6868
6868
  }
6869
+ function isMCPHostSimulationResult(value) {
6870
+ return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6871
+ }
6869
6872
  async function runSingleIteration(evalCase, context, options) {
6870
6873
  const startTime = Date.now();
6871
6874
  const { response, error } = await executeToolCall(evalCase, context.mcp);
6872
6875
  let expectationResults = {};
6873
6876
  let toolPrecision;
6874
6877
  let toolRecall;
6878
+ let mcpHostTrace;
6875
6879
  if (!error && evalCase.expect) {
6876
6880
  const {
6877
6881
  expectations,
@@ -6886,11 +6890,28 @@ async function runSingleIteration(evalCase, context, options) {
6886
6890
  expectationResults = expectations;
6887
6891
  toolPrecision = tp;
6888
6892
  toolRecall = tr;
6893
+ if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
6894
+ const expectedNames = new Set(
6895
+ evalCase.expect.toolsTriggered.calls.map((c) => c.name)
6896
+ );
6897
+ const requiredNames = new Set(
6898
+ evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
6899
+ );
6900
+ const calledNames = new Set(response.toolCalls.map((c) => c.name));
6901
+ mcpHostTrace = {
6902
+ calls: response.toolCalls.map((call) => ({
6903
+ name: call.name,
6904
+ arguments: call.arguments,
6905
+ status: expectedNames.has(call.name) ? "expected" : "unexpected"
6906
+ })),
6907
+ missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
6908
+ };
6909
+ }
6889
6910
  }
6890
6911
  return {
6891
6912
  id: evalCase.id,
6892
6913
  datasetName: options.datasetName ?? "single-case",
6893
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6914
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6894
6915
  source: "eval",
6895
6916
  pass: didCasePass(error, expectationResults),
6896
6917
  response,
@@ -6901,7 +6922,8 @@ async function runSingleIteration(evalCase, context, options) {
6901
6922
  durationMs: Date.now() - startTime,
6902
6923
  tags: evalCase.tags,
6903
6924
  toolPrecision,
6904
- toolRecall
6925
+ toolRecall,
6926
+ mcpHostTrace
6905
6927
  };
6906
6928
  }
6907
6929
  function isInfrastructureError(err) {
@@ -6958,7 +6980,7 @@ async function runEvalCase(evalCase, context, options = {}) {
6958
6980
  const baseResult = lastResult ?? {
6959
6981
  id: evalCase.id,
6960
6982
  datasetName: options.datasetName ?? "single-case",
6961
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6983
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6962
6984
  source: "eval",
6963
6985
  pass: false,
6964
6986
  error: iterationResults[0]?.error,
@@ -6972,12 +6994,25 @@ async function runEvalCase(evalCase, context, options = {}) {
6972
6994
  ...baseResult,
6973
6995
  pass: assertionPassRate >= threshold,
6974
6996
  assertionPassRate,
6997
+ assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
6975
6998
  infrastructureErrorRate,
6976
6999
  iterationResults,
6977
7000
  infrastructureErrorCount: infraErrors.length,
6978
7001
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
6979
7002
  };
6980
7003
  }
7004
+ function wilsonCI(k, n) {
7005
+ if (n < 2) return void 0;
7006
+ const z5 = 1.96;
7007
+ const z22 = z5 * z5;
7008
+ const \u00F1 = n + z22;
7009
+ const p\u0303 = (k + z22 / 2) / \u00F1;
7010
+ const margin = z5 * Math.sqrt(p\u0303 * (1 - p\u0303) / \u00F1);
7011
+ return {
7012
+ lower: Math.max(0, p\u0303 - margin),
7013
+ upper: Math.min(1, p\u0303 + margin)
7014
+ };
7015
+ }
6981
7016
  async function runWithConcurrency(tasks, limit) {
6982
7017
  const results = new Array(tasks.length);
6983
7018
  let index = 0;
@@ -7007,7 +7042,7 @@ async function runEvalDataset(options, context) {
7007
7042
  filterTags,
7008
7043
  saveResultsTo,
7009
7044
  baselineResultsFrom,
7010
- llmHostModel,
7045
+ mcpHostModel,
7011
7046
  judgeModel
7012
7047
  } = options;
7013
7048
  const startTime = Date.now();
@@ -7017,7 +7052,7 @@ async function runEvalDataset(options, context) {
7017
7052
  };
7018
7053
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
7019
7054
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
7020
- const effectiveIterations = c.mode === "llm_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7055
+ const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7021
7056
  const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
7022
7057
  return sum + effectiveIterations * judgeReps;
7023
7058
  }, 0);
@@ -7027,12 +7062,12 @@ async function runEvalDataset(options, context) {
7027
7062
  );
7028
7063
  }
7029
7064
  const tasks = casesToRun.map((evalCase) => async () => {
7030
- const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7031
- if (evalCase.mode === "llm_host") {
7065
+ const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7066
+ if (evalCase.mode === "mcp_host") {
7032
7067
  const effectiveIterations = withIterations.iterations ?? 1;
7033
7068
  if (effectiveIterations > 1 && effectiveIterations < 10) {
7034
7069
  console.warn(
7035
- `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7070
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7036
7071
  );
7037
7072
  }
7038
7073
  }
@@ -7064,7 +7099,7 @@ async function runEvalDataset(options, context) {
7064
7099
  gitHash,
7065
7100
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7066
7101
  packageVersion: package_default.version,
7067
- ...llmHostModel !== void 0 && { llmHostModel },
7102
+ ...mcpHostModel !== void 0 && { mcpHostModel },
7068
7103
  ...judgeModel !== void 0 && { judgeModel }
7069
7104
  };
7070
7105
  const result = {
@@ -7109,12 +7144,12 @@ async function runEvalDataset(options, context) {
7109
7144
  );
7110
7145
  }
7111
7146
  }
7112
- const llmHostCases = caseResults.filter(
7147
+ const mcpHostCases = caseResults.filter(
7113
7148
  (r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
7114
7149
  );
7115
- if (llmHostCases.length > 0) {
7116
- const avgPrec = llmHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / llmHostCases.length;
7117
- const avgRecall = llmHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / llmHostCases.length;
7150
+ if (mcpHostCases.length > 0) {
7151
+ const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
7152
+ const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
7118
7153
  result.datasetToolPrecision = avgPrec;
7119
7154
  result.datasetToolRecall = avgRecall;
7120
7155
  result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
@@ -7182,7 +7217,6 @@ async function runServerComparison(options, contextA, contextB) {
7182
7217
  bWins,
7183
7218
  ties,
7184
7219
  bothFail,
7185
- bothFailCount: bothFail,
7186
7220
  decidedCases,
7187
7221
  failureAlignment: total > 0 ? bothFail / total : 0,
7188
7222
  aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
@@ -7411,7 +7445,7 @@ exports.runEvalCase = runEvalCase;
7411
7445
  exports.runEvalDataset = runEvalDataset;
7412
7446
  exports.runServerComparison = runServerComparison;
7413
7447
  exports.saveBaseline = saveBaseline;
7414
- exports.simulateLLMHost = simulateLLMHost;
7448
+ exports.simulateMCPHost = simulateMCPHost;
7415
7449
  exports.test = test;
7416
7450
  exports.validateAccessToken = validateAccessToken;
7417
7451
  exports.validateError = validateError;