@gleanwork/mcp-server-tester 1.0.0-beta.2 → 1.0.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -3127,7 +3127,7 @@ var init_dist3 = __esm({
3127
3127
  }
3128
3128
  });
3129
3129
  var MCPHostCapabilitiesSchema = zod.z.object({
3130
- sampling: zod.z.record(zod.z.unknown()).optional(),
3130
+ sampling: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
3131
3131
  roots: zod.z.object({
3132
3132
  listChanged: zod.z.boolean()
3133
3133
  }).optional()
@@ -3186,7 +3186,7 @@ var HttpConfigSchema = zod.z.object({
3186
3186
  }
3187
3187
  return true;
3188
3188
  }),
3189
- headers: zod.z.record(zod.z.string()).optional(),
3189
+ headers: zod.z.record(zod.z.string(), zod.z.string()).optional(),
3190
3190
  capabilities: MCPHostCapabilitiesSchema.optional(),
3191
3191
  connectTimeoutMs: zod.z.number().positive().optional(),
3192
3192
  requestTimeoutMs: zod.z.number().positive().optional(),
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
4407
4407
 
4408
4408
  // package.json
4409
4409
  var package_default = {
4410
- version: "1.0.0-beta.2"};
4410
+ version: "1.0.0-beta.4"};
4411
4411
 
4412
4412
  // src/mcp/clientFactory.ts
4413
4413
  function getRetryAfterDelayMs(err) {
@@ -4498,7 +4498,10 @@ async function createMCPClientForConfig(config, options) {
4498
4498
  validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
4499
4499
  );
4500
4500
  } else if (isHttpConfig(validatedConfig)) {
4501
- const headers = { ...validatedConfig.headers };
4501
+ const headers = {
4502
+ "User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
4503
+ ...validatedConfig.headers
4504
+ };
4502
4505
  if (validatedConfig.auth?.clientCredentials && !options?.authProvider) {
4503
4506
  const ccConfig = validatedConfig.auth.clientCredentials;
4504
4507
  const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
@@ -5148,7 +5151,7 @@ function validateToolCalls(response, expectation) {
5148
5151
  if (!isSimulationResult(response)) {
5149
5152
  return {
5150
5153
  pass: false,
5151
- message: "toolsTriggered expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5154
+ message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5152
5155
  };
5153
5156
  }
5154
5157
  const actual = response.toolCalls;
@@ -5208,7 +5211,7 @@ function validateToolCallCount(response, options) {
5208
5211
  if (!isSimulationResult(response)) {
5209
5212
  return {
5210
5213
  pass: false,
5211
- message: "toolCallCount expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5214
+ message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5212
5215
  };
5213
5216
  }
5214
5217
  const count = response.toolCalls.length;
@@ -6330,7 +6333,7 @@ function getAuthConfigFromEnv() {
6330
6333
  }
6331
6334
  return void 0;
6332
6335
  }
6333
- var LLMHostConfigSchema = zod.z.object({
6336
+ var MCPHostConfigSchema = zod.z.object({
6334
6337
  provider: zod.z.enum([
6335
6338
  "openai",
6336
6339
  "anthropic",
@@ -6399,7 +6402,7 @@ var EvalExpectBlockSchema = zod.z.object({
6399
6402
  calls: zod.z.array(
6400
6403
  zod.z.object({
6401
6404
  name: zod.z.string(),
6402
- arguments: zod.z.record(zod.z.unknown()).optional(),
6405
+ arguments: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6403
6406
  required: zod.z.boolean().optional()
6404
6407
  })
6405
6408
  ),
@@ -6415,12 +6418,12 @@ var EvalExpectBlockSchema = zod.z.object({
6415
6418
  var EvalCaseSchema = zod.z.object({
6416
6419
  id: zod.z.string().min(1, "id must not be empty"),
6417
6420
  description: zod.z.string().optional(),
6418
- mode: zod.z.enum(["direct", "llm_host"]).optional(),
6421
+ mode: zod.z.enum(["direct", "mcp_host"]).optional(),
6419
6422
  toolName: zod.z.string().min(1, "toolName must not be empty").optional(),
6420
- args: zod.z.record(zod.z.unknown()).optional(),
6423
+ args: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6421
6424
  scenario: zod.z.string().optional(),
6422
- llmHostConfig: LLMHostConfigSchema.optional(),
6423
- metadata: zod.z.record(zod.z.unknown()).optional(),
6425
+ mcpHostConfig: MCPHostConfigSchema.optional(),
6426
+ metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional(),
6424
6427
  iterations: zod.z.number().int().min(1).optional(),
6425
6428
  accuracyThreshold: zod.z.number().min(0).max(1).optional(),
6426
6429
  judgeReps: zod.z.number().int().min(1).optional(),
@@ -6432,7 +6435,7 @@ var EvalDatasetSchema = zod.z.object({
6432
6435
  name: zod.z.string().min(1, "name must not be empty"),
6433
6436
  description: zod.z.string().optional(),
6434
6437
  cases: zod.z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
6435
- metadata: zod.z.record(zod.z.unknown()).optional()
6438
+ metadata: zod.z.record(zod.z.string(), zod.z.unknown()).optional()
6436
6439
  });
6437
6440
  function validateEvalCase(evalCase) {
6438
6441
  return EvalCaseSchema.parse(evalCase);
@@ -6470,30 +6473,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
6470
6473
  return dataset;
6471
6474
  }
6472
6475
 
6473
- // src/evals/llmHost/adapters/vercel.ts
6476
+ // src/evals/mcpHost/adapters/vercel.ts
6474
6477
  function enrichErrorMessage(err, provider) {
6475
6478
  const raw = err instanceof Error ? err.message : String(err);
6476
6479
  if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
6477
- return `LLM host simulation failed: required package not installed.
6478
- Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/llm-host.md for install instructions.`;
6480
+ return `MCP host simulation failed: required package not installed.
6481
+ Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
6479
6482
  }
6480
6483
  if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
6481
- return `LLM host simulation failed: authentication error.
6484
+ return `MCP host simulation failed: authentication error.
6482
6485
  Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
6483
6486
  }
6484
6487
  if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
6485
- return `LLM host simulation failed: model not found.
6488
+ return `MCP host simulation failed: model not found.
6486
6489
  Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
6487
6490
  }
6488
6491
  if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
6489
- return `LLM host simulation failed: network error.
6492
+ return `MCP host simulation failed: network error.
6490
6493
  Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
6491
6494
  }
6492
6495
  if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
6493
- return `LLM host simulation failed: rate limited.
6496
+ return `MCP host simulation failed: rate limited.
6494
6497
  Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
6495
6498
  }
6496
- return `LLM host simulation failed: ${raw}`;
6499
+ return `MCP host simulation failed: ${raw}`;
6497
6500
  }
6498
6501
  async function loadModel(provider, model) {
6499
6502
  switch (provider) {
@@ -6623,7 +6626,7 @@ function createVercelOrchestrator() {
6623
6626
  };
6624
6627
  }
6625
6628
 
6626
- // src/evals/llmHost/llmHostSimulation.ts
6629
+ // src/evals/mcpHost/mcpHostSimulation.ts
6627
6630
  var vercelOrchestrator = createVercelOrchestrator();
6628
6631
  var allProviders = [
6629
6632
  "openai",
@@ -6639,7 +6642,7 @@ var allProviders = [
6639
6642
  var simulatorRegistry = new Map(
6640
6643
  allProviders.map((p) => [p, vercelOrchestrator])
6641
6644
  );
6642
- async function simulateLLMHost(mcp, scenario, config) {
6645
+ async function simulateMCPHost(mcp, scenario, config) {
6643
6646
  const simulator = simulatorRegistry.get(config.provider);
6644
6647
  if (!simulator) {
6645
6648
  throw new Error(
@@ -6661,7 +6664,7 @@ function getMissingDependencyMessage(provider) {
6661
6664
  deepseek: "npm install ai @ai-sdk/deepseek",
6662
6665
  openrouter: "npm install ai @openrouter/ai-sdk-provider",
6663
6666
  xai: "npm install ai @ai-sdk/xai",
6664
- "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/llm-host.md)"
6667
+ "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
6665
6668
  };
6666
6669
  const pkg = packageMap[provider];
6667
6670
  return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
@@ -6704,24 +6707,24 @@ async function execFileNoThrow(file, args) {
6704
6707
  async function executeToolCall(evalCase, mcp) {
6705
6708
  const mode = evalCase.mode || "direct";
6706
6709
  try {
6707
- if (mode === "llm_host") {
6710
+ if (mode === "mcp_host") {
6708
6711
  if (!evalCase.scenario) {
6709
6712
  throw new Error(
6710
- `Eval case ${evalCase.id}: scenario is required for llm_host mode`
6713
+ `Eval case ${evalCase.id}: scenario is required for mcp_host mode`
6711
6714
  );
6712
6715
  }
6713
- if (!evalCase.llmHostConfig) {
6716
+ if (!evalCase.mcpHostConfig) {
6714
6717
  throw new Error(
6715
- `Eval case ${evalCase.id}: llmHostConfig is required for llm_host mode`
6718
+ `Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
6716
6719
  );
6717
6720
  }
6718
- const simulationResult = await simulateLLMHost(
6721
+ const simulationResult = await simulateMCPHost(
6719
6722
  mcp,
6720
6723
  evalCase.scenario,
6721
- evalCase.llmHostConfig
6724
+ evalCase.mcpHostConfig
6722
6725
  );
6723
6726
  if (!simulationResult.success) {
6724
- throw new Error(simulationResult.error || "LLM host simulation failed");
6727
+ throw new Error(simulationResult.error || "MCP host simulation failed");
6725
6728
  }
6726
6729
  return { response: simulationResult };
6727
6730
  } else {
@@ -6863,12 +6866,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6863
6866
  }
6864
6867
  return { expectations: results, toolPrecision, toolRecall };
6865
6868
  }
6869
+ function isMCPHostSimulationResult(value) {
6870
+ return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6871
+ }
6866
6872
  async function runSingleIteration(evalCase, context, options) {
6867
6873
  const startTime = Date.now();
6868
6874
  const { response, error } = await executeToolCall(evalCase, context.mcp);
6869
6875
  let expectationResults = {};
6870
6876
  let toolPrecision;
6871
6877
  let toolRecall;
6878
+ let mcpHostTrace;
6872
6879
  if (!error && evalCase.expect) {
6873
6880
  const {
6874
6881
  expectations,
@@ -6883,6 +6890,23 @@ async function runSingleIteration(evalCase, context, options) {
6883
6890
  expectationResults = expectations;
6884
6891
  toolPrecision = tp;
6885
6892
  toolRecall = tr;
6893
+ if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
6894
+ const expectedNames = new Set(
6895
+ evalCase.expect.toolsTriggered.calls.map((c) => c.name)
6896
+ );
6897
+ const requiredNames = new Set(
6898
+ evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
6899
+ );
6900
+ const calledNames = new Set(response.toolCalls.map((c) => c.name));
6901
+ mcpHostTrace = {
6902
+ calls: response.toolCalls.map((call) => ({
6903
+ name: call.name,
6904
+ arguments: call.arguments,
6905
+ status: expectedNames.has(call.name) ? "expected" : "unexpected"
6906
+ })),
6907
+ missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
6908
+ };
6909
+ }
6886
6910
  }
6887
6911
  return {
6888
6912
  id: evalCase.id,
@@ -6898,7 +6922,8 @@ async function runSingleIteration(evalCase, context, options) {
6898
6922
  durationMs: Date.now() - startTime,
6899
6923
  tags: evalCase.tags,
6900
6924
  toolPrecision,
6901
- toolRecall
6925
+ toolRecall,
6926
+ mcpHostTrace
6902
6927
  };
6903
6928
  }
6904
6929
  function isInfrastructureError(err) {
@@ -6951,7 +6976,6 @@ async function runEvalCase(evalCase, context, options = {}) {
6951
6976
  const passCount = assertionResults.filter((r) => r.pass).length;
6952
6977
  const assertionPassRate = assertionResults.length > 0 ? passCount / assertionResults.length : 0;
6953
6978
  const infrastructureErrorRate = infraErrors.length / iterations;
6954
- const accuracy = assertionPassRate;
6955
6979
  const threshold = evalCase.accuracyThreshold ?? 1;
6956
6980
  const baseResult = lastResult ?? {
6957
6981
  id: evalCase.id,
@@ -6968,10 +6992,9 @@ async function runEvalCase(evalCase, context, options = {}) {
6968
6992
  };
6969
6993
  return {
6970
6994
  ...baseResult,
6971
- pass: accuracy >= threshold,
6995
+ pass: assertionPassRate >= threshold,
6972
6996
  assertionPassRate,
6973
6997
  infrastructureErrorRate,
6974
- accuracy,
6975
6998
  iterationResults,
6976
6999
  infrastructureErrorCount: infraErrors.length,
6977
7000
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
@@ -7006,7 +7029,7 @@ async function runEvalDataset(options, context) {
7006
7029
  filterTags,
7007
7030
  saveResultsTo,
7008
7031
  baselineResultsFrom,
7009
- llmHostModel,
7032
+ mcpHostModel,
7010
7033
  judgeModel
7011
7034
  } = options;
7012
7035
  const startTime = Date.now();
@@ -7016,7 +7039,7 @@ async function runEvalDataset(options, context) {
7016
7039
  };
7017
7040
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
7018
7041
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
7019
- const effectiveIterations = c.mode === "llm_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7042
+ const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7020
7043
  const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
7021
7044
  return sum + effectiveIterations * judgeReps;
7022
7045
  }, 0);
@@ -7026,12 +7049,12 @@ async function runEvalDataset(options, context) {
7026
7049
  );
7027
7050
  }
7028
7051
  const tasks = casesToRun.map((evalCase) => async () => {
7029
- const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7030
- if (evalCase.mode === "llm_host") {
7052
+ const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7053
+ if (evalCase.mode === "mcp_host") {
7031
7054
  const effectiveIterations = withIterations.iterations ?? 1;
7032
7055
  if (effectiveIterations > 1 && effectiveIterations < 10) {
7033
7056
  console.warn(
7034
- `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7057
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7035
7058
  );
7036
7059
  }
7037
7060
  }
@@ -7063,7 +7086,7 @@ async function runEvalDataset(options, context) {
7063
7086
  gitHash,
7064
7087
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7065
7088
  packageVersion: package_default.version,
7066
- ...llmHostModel !== void 0 && { llmHostModel },
7089
+ ...mcpHostModel !== void 0 && { mcpHostModel },
7067
7090
  ...judgeModel !== void 0 && { judgeModel }
7068
7091
  };
7069
7092
  const result = {
@@ -7108,12 +7131,12 @@ async function runEvalDataset(options, context) {
7108
7131
  );
7109
7132
  }
7110
7133
  }
7111
- const llmHostCases = caseResults.filter(
7134
+ const mcpHostCases = caseResults.filter(
7112
7135
  (r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
7113
7136
  );
7114
- if (llmHostCases.length > 0) {
7115
- const avgPrec = llmHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / llmHostCases.length;
7116
- const avgRecall = llmHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / llmHostCases.length;
7137
+ if (mcpHostCases.length > 0) {
7138
+ const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
7139
+ const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
7117
7140
  result.datasetToolPrecision = avgPrec;
7118
7141
  result.datasetToolRecall = avgRecall;
7119
7142
  result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
@@ -7181,7 +7204,6 @@ async function runServerComparison(options, contextA, contextB) {
7181
7204
  bWins,
7182
7205
  ties,
7183
7206
  bothFail,
7184
- bothFailCount: bothFail,
7185
7207
  decidedCases,
7186
7208
  failureAlignment: total > 0 ? bothFail / total : 0,
7187
7209
  aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
@@ -7410,7 +7432,7 @@ exports.runEvalCase = runEvalCase;
7410
7432
  exports.runEvalDataset = runEvalDataset;
7411
7433
  exports.runServerComparison = runServerComparison;
7412
7434
  exports.saveBaseline = saveBaseline;
7413
- exports.simulateLLMHost = simulateLLMHost;
7435
+ exports.simulateMCPHost = simulateMCPHost;
7414
7436
  exports.test = test;
7415
7437
  exports.validateAccessToken = validateAccessToken;
7416
7438
  exports.validateError = validateError;