@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -3127,7 +3127,7 @@ var init_dist3 = __esm({
3127
3127
  }
3128
3128
  });
3129
3129
  var MCPHostCapabilitiesSchema = zod.z.object({
3130
- sampling: zod.z.record(zod.z.unknown()).optional(),
3130
+ sampling: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
3131
3131
  roots: zod.z.object({
3132
3132
  listChanged: zod.z.boolean()
3133
3133
  }).optional()
@@ -3186,7 +3186,7 @@ var HttpConfigSchema = zod.z.object({
3186
3186
  }
3187
3187
  return true;
3188
3188
  }),
3189
- headers: zod.z.record(zod.z.string()).optional(),
3189
+ headers: zod.z.record(zod.z.string(), zod.z.string()).optional(),
3190
3190
  capabilities: MCPHostCapabilitiesSchema.optional(),
3191
3191
  connectTimeoutMs: zod.z.number().positive().optional(),
3192
3192
  requestTimeoutMs: zod.z.number().positive().optional(),
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
4407
4407
 
4408
4408
  // package.json
4409
4409
  var package_default = {
4410
- version: "1.0.0-beta.3"};
4410
+ version: "1.0.0-beta.4"};
4411
4411
 
4412
4412
  // src/mcp/clientFactory.ts
4413
4413
  function getRetryAfterDelayMs(err) {
@@ -5151,7 +5151,7 @@ function validateToolCalls(response, expectation) {
5151
5151
  if (!isSimulationResult(response)) {
5152
5152
  return {
5153
5153
  pass: false,
5154
- message: "toolsTriggered expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5154
+ message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5155
5155
  };
5156
5156
  }
5157
5157
  const actual = response.toolCalls;
@@ -5211,7 +5211,7 @@ function validateToolCallCount(response, options) {
5211
5211
  if (!isSimulationResult(response)) {
5212
5212
  return {
5213
5213
  pass: false,
5214
- message: "toolCallCount expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5214
+ message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5215
5215
  };
5216
5216
  }
5217
5217
  const count = response.toolCalls.length;
@@ -6333,7 +6333,7 @@ function getAuthConfigFromEnv() {
6333
6333
  }
6334
6334
  return void 0;
6335
6335
  }
6336
- var LLMHostConfigSchema = zod.z.object({
6336
+ var MCPHostConfigSchema = zod.z.object({
6337
6337
  provider: zod.z.enum([
6338
6338
  "openai",
6339
6339
  "anthropic",
@@ -6402,7 +6402,7 @@ var EvalExpectBlockSchema = zod.z.object({
6402
6402
  calls: zod.z.array(
6403
6403
  zod.z.object({
6404
6404
  name: zod.z.string(),
6405
- arguments: zod.z.record(zod.z.unknown()).optional(),
6405
+ arguments: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6406
6406
  required: zod.z.boolean().optional()
6407
6407
  })
6408
6408
  ),
@@ -6418,12 +6418,12 @@ var EvalExpectBlockSchema = zod.z.object({
6418
6418
  var EvalCaseSchema = zod.z.object({
6419
6419
  id: zod.z.string().min(1, "id must not be empty"),
6420
6420
  description: zod.z.string().optional(),
6421
- mode: zod.z.enum(["direct", "llm_host"]).optional(),
6421
+ mode: zod.z.enum(["direct", "mcp_host"]).optional(),
6422
6422
  toolName: zod.z.string().min(1, "toolName must not be empty").optional(),
6423
- args: zod.z.record(zod.z.unknown()).optional(),
6423
+ args: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6424
6424
  scenario: zod.z.string().optional(),
6425
- llmHostConfig: LLMHostConfigSchema.optional(),
6426
- metadata: zod.z.record(zod.z.unknown()).optional(),
6425
+ mcpHostConfig: MCPHostConfigSchema.optional(),
6426
+ metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6427
6427
  iterations: zod.z.number().int().min(1).optional(),
6428
6428
  accuracyThreshold: zod.z.number().min(0).max(1).optional(),
6429
6429
  judgeReps: zod.z.number().int().min(1).optional(),
@@ -6435,7 +6435,7 @@ var EvalDatasetSchema = zod.z.object({
6435
6435
  name: zod.z.string().min(1, "name must not be empty"),
6436
6436
  description: zod.z.string().optional(),
6437
6437
  cases: zod.z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
6438
- metadata: zod.z.record(zod.z.unknown()).optional()
6438
+ metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional()
6439
6439
  });
6440
6440
  function validateEvalCase(evalCase) {
6441
6441
  return EvalCaseSchema.parse(evalCase);
@@ -6473,30 +6473,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
6473
6473
  return dataset;
6474
6474
  }
6475
6475
 
6476
- // src/evals/llmHost/adapters/vercel.ts
6476
+ // src/evals/mcpHost/adapters/vercel.ts
6477
6477
  function enrichErrorMessage(err, provider) {
6478
6478
  const raw = err instanceof Error ? err.message : String(err);
6479
6479
  if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
6480
- return `LLM host simulation failed: required package not installed.
6481
- Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/llm-host.md for install instructions.`;
6480
+ return `MCP host simulation failed: required package not installed.
6481
+ Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
6482
6482
  }
6483
6483
  if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
6484
- return `LLM host simulation failed: authentication error.
6484
+ return `MCP host simulation failed: authentication error.
6485
6485
  Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
6486
6486
  }
6487
6487
  if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
6488
- return `LLM host simulation failed: model not found.
6488
+ return `MCP host simulation failed: model not found.
6489
6489
  Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
6490
6490
  }
6491
6491
  if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
6492
- return `LLM host simulation failed: network error.
6492
+ return `MCP host simulation failed: network error.
6493
6493
  Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
6494
6494
  }
6495
6495
  if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
6496
- return `LLM host simulation failed: rate limited.
6496
+ return `MCP host simulation failed: rate limited.
6497
6497
  Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
6498
6498
  }
6499
- return `LLM host simulation failed: ${raw}`;
6499
+ return `MCP host simulation failed: ${raw}`;
6500
6500
  }
6501
6501
  async function loadModel(provider, model) {
6502
6502
  switch (provider) {
@@ -6626,7 +6626,7 @@ function createVercelOrchestrator() {
6626
6626
  };
6627
6627
  }
6628
6628
 
6629
- // src/evals/llmHost/llmHostSimulation.ts
6629
+ // src/evals/mcpHost/mcpHostSimulation.ts
6630
6630
  var vercelOrchestrator = createVercelOrchestrator();
6631
6631
  var allProviders = [
6632
6632
  "openai",
@@ -6642,7 +6642,7 @@ var allProviders = [
6642
6642
  var simulatorRegistry = new Map(
6643
6643
  allProviders.map((p) => [p, vercelOrchestrator])
6644
6644
  );
6645
- async function simulateLLMHost(mcp, scenario, config) {
6645
+ async function simulateMCPHost(mcp, scenario, config) {
6646
6646
  const simulator = simulatorRegistry.get(config.provider);
6647
6647
  if (!simulator) {
6648
6648
  throw new Error(
@@ -6664,7 +6664,7 @@ function getMissingDependencyMessage(provider) {
6664
6664
  deepseek: "npm install ai @ai-sdk/deepseek",
6665
6665
  openrouter: "npm install ai @openrouter/ai-sdk-provider",
6666
6666
  xai: "npm install ai @ai-sdk/xai",
6667
- "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/llm-host.md)"
6667
+ "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
6668
6668
  };
6669
6669
  const pkg = packageMap[provider];
6670
6670
  return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
@@ -6707,24 +6707,24 @@ async function execFileNoThrow(file, args) {
6707
6707
  async function executeToolCall(evalCase, mcp) {
6708
6708
  const mode = evalCase.mode || "direct";
6709
6709
  try {
6710
- if (mode === "llm_host") {
6710
+ if (mode === "mcp_host") {
6711
6711
  if (!evalCase.scenario) {
6712
6712
  throw new Error(
6713
- `Eval case ${evalCase.id}: scenario is required for llm_host mode`
6713
+ `Eval case ${evalCase.id}: scenario is required for mcp_host mode`
6714
6714
  );
6715
6715
  }
6716
- if (!evalCase.llmHostConfig) {
6716
+ if (!evalCase.mcpHostConfig) {
6717
6717
  throw new Error(
6718
- `Eval case ${evalCase.id}: llmHostConfig is required for llm_host mode`
6718
+ `Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
6719
6719
  );
6720
6720
  }
6721
- const simulationResult = await simulateLLMHost(
6721
+ const simulationResult = await simulateMCPHost(
6722
6722
  mcp,
6723
6723
  evalCase.scenario,
6724
- evalCase.llmHostConfig
6724
+ evalCase.mcpHostConfig
6725
6725
  );
6726
6726
  if (!simulationResult.success) {
6727
- throw new Error(simulationResult.error || "LLM host simulation failed");
6727
+ throw new Error(simulationResult.error || "MCP host simulation failed");
6728
6728
  }
6729
6729
  return { response: simulationResult };
6730
6730
  } else {
@@ -6866,12 +6866,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6866
6866
  }
6867
6867
  return { expectations: results, toolPrecision, toolRecall };
6868
6868
  }
6869
+ function isMCPHostSimulationResult(value) {
6870
+ return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6871
+ }
6869
6872
  async function runSingleIteration(evalCase, context, options) {
6870
6873
  const startTime = Date.now();
6871
6874
  const { response, error } = await executeToolCall(evalCase, context.mcp);
6872
6875
  let expectationResults = {};
6873
6876
  let toolPrecision;
6874
6877
  let toolRecall;
6878
+ let mcpHostTrace;
6875
6879
  if (!error && evalCase.expect) {
6876
6880
  const {
6877
6881
  expectations,
@@ -6886,6 +6890,23 @@ async function runSingleIteration(evalCase, context, options) {
6886
6890
  expectationResults = expectations;
6887
6891
  toolPrecision = tp;
6888
6892
  toolRecall = tr;
6893
+ if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
6894
+ const expectedNames = new Set(
6895
+ evalCase.expect.toolsTriggered.calls.map((c) => c.name)
6896
+ );
6897
+ const requiredNames = new Set(
6898
+ evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
6899
+ );
6900
+ const calledNames = new Set(response.toolCalls.map((c) => c.name));
6901
+ mcpHostTrace = {
6902
+ calls: response.toolCalls.map((call) => ({
6903
+ name: call.name,
6904
+ arguments: call.arguments,
6905
+ status: expectedNames.has(call.name) ? "expected" : "unexpected"
6906
+ })),
6907
+ missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
6908
+ };
6909
+ }
6889
6910
  }
6890
6911
  return {
6891
6912
  id: evalCase.id,
@@ -6901,7 +6922,8 @@ async function runSingleIteration(evalCase, context, options) {
6901
6922
  durationMs: Date.now() - startTime,
6902
6923
  tags: evalCase.tags,
6903
6924
  toolPrecision,
6904
- toolRecall
6925
+ toolRecall,
6926
+ mcpHostTrace
6905
6927
  };
6906
6928
  }
6907
6929
  function isInfrastructureError(err) {
@@ -7007,7 +7029,7 @@ async function runEvalDataset(options, context) {
7007
7029
  filterTags,
7008
7030
  saveResultsTo,
7009
7031
  baselineResultsFrom,
7010
- llmHostModel,
7032
+ mcpHostModel,
7011
7033
  judgeModel
7012
7034
  } = options;
7013
7035
  const startTime = Date.now();
@@ -7017,7 +7039,7 @@ async function runEvalDataset(options, context) {
7017
7039
  };
7018
7040
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
7019
7041
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
7020
- const effectiveIterations = c.mode === "llm_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7042
+ const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7021
7043
  const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
7022
7044
  return sum + effectiveIterations * judgeReps;
7023
7045
  }, 0);
@@ -7027,12 +7049,12 @@ async function runEvalDataset(options, context) {
7027
7049
  );
7028
7050
  }
7029
7051
  const tasks = casesToRun.map((evalCase) => async () => {
7030
- const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7031
- if (evalCase.mode === "llm_host") {
7052
+ const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7053
+ if (evalCase.mode === "mcp_host") {
7032
7054
  const effectiveIterations = withIterations.iterations ?? 1;
7033
7055
  if (effectiveIterations > 1 && effectiveIterations < 10) {
7034
7056
  console.warn(
7035
- `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7057
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7036
7058
  );
7037
7059
  }
7038
7060
  }
@@ -7064,7 +7086,7 @@ async function runEvalDataset(options, context) {
7064
7086
  gitHash,
7065
7087
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7066
7088
  packageVersion: package_default.version,
7067
- ...llmHostModel !== void 0 && { llmHostModel },
7089
+ ...mcpHostModel !== void 0 && { mcpHostModel },
7068
7090
  ...judgeModel !== void 0 && { judgeModel }
7069
7091
  };
7070
7092
  const result = {
@@ -7109,12 +7131,12 @@ async function runEvalDataset(options, context) {
7109
7131
  );
7110
7132
  }
7111
7133
  }
7112
- const llmHostCases = caseResults.filter(
7134
+ const mcpHostCases = caseResults.filter(
7113
7135
  (r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
7114
7136
  );
7115
- if (llmHostCases.length > 0) {
7116
- const avgPrec = llmHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / llmHostCases.length;
7117
- const avgRecall = llmHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / llmHostCases.length;
7137
+ if (mcpHostCases.length > 0) {
7138
+ const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
7139
+ const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
7118
7140
  result.datasetToolPrecision = avgPrec;
7119
7141
  result.datasetToolRecall = avgRecall;
7120
7142
  result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
@@ -7182,7 +7204,6 @@ async function runServerComparison(options, contextA, contextB) {
7182
7204
  bWins,
7183
7205
  ties,
7184
7206
  bothFail,
7185
- bothFailCount: bothFail,
7186
7207
  decidedCases,
7187
7208
  failureAlignment: total > 0 ? bothFail / total : 0,
7188
7209
  aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
@@ -7411,7 +7432,7 @@ exports.runEvalCase = runEvalCase;
7411
7432
  exports.runEvalDataset = runEvalDataset;
7412
7433
  exports.runServerComparison = runServerComparison;
7413
7434
  exports.saveBaseline = saveBaseline;
7414
- exports.simulateLLMHost = simulateLLMHost;
7435
+ exports.simulateMCPHost = simulateMCPHost;
7415
7436
  exports.test = test;
7416
7437
  exports.validateAccessToken = validateAccessToken;
7417
7438
  exports.validateError = validateError;