@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.5

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3100,7 +3100,7 @@ var init_dist3 = __esm({
3100
3100
  }
3101
3101
  });
3102
3102
  var MCPHostCapabilitiesSchema = z.object({
3103
- sampling: z.record(z.unknown()).optional(),
3103
+ sampling: z.record(z.string(), z.unknown()).optional(),
3104
3104
  roots: z.object({
3105
3105
  listChanged: z.boolean()
3106
3106
  }).optional()
@@ -3159,7 +3159,7 @@ var HttpConfigSchema = z.object({
3159
3159
  }
3160
3160
  return true;
3161
3161
  }),
3162
- headers: z.record(z.string()).optional(),
3162
+ headers: z.record(z.string(), z.string()).optional(),
3163
3163
  capabilities: MCPHostCapabilitiesSchema.optional(),
3164
3164
  connectTimeoutMs: z.number().positive().optional(),
3165
3165
  requestTimeoutMs: z.number().positive().optional(),
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
4380
4380
 
4381
4381
  // package.json
4382
4382
  var package_default = {
4383
- version: "1.0.0-beta.3"};
4383
+ version: "1.0.0-beta.5"};
4384
4384
 
4385
4385
  // src/mcp/clientFactory.ts
4386
4386
  function getRetryAfterDelayMs(err) {
@@ -5124,7 +5124,7 @@ function validateToolCalls(response, expectation) {
5124
5124
  if (!isSimulationResult(response)) {
5125
5125
  return {
5126
5126
  pass: false,
5127
- message: "toolsTriggered expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5127
+ message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5128
5128
  };
5129
5129
  }
5130
5130
  const actual = response.toolCalls;
@@ -5184,7 +5184,7 @@ function validateToolCallCount(response, options) {
5184
5184
  if (!isSimulationResult(response)) {
5185
5185
  return {
5186
5186
  pass: false,
5187
- message: "toolCallCount expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5187
+ message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5188
5188
  };
5189
5189
  }
5190
5190
  const count = response.toolCalls.length;
@@ -6306,7 +6306,7 @@ function getAuthConfigFromEnv() {
6306
6306
  }
6307
6307
  return void 0;
6308
6308
  }
6309
- var LLMHostConfigSchema = z.object({
6309
+ var MCPHostConfigSchema = z.object({
6310
6310
  provider: z.enum([
6311
6311
  "openai",
6312
6312
  "anthropic",
@@ -6375,7 +6375,7 @@ var EvalExpectBlockSchema = z.object({
6375
6375
  calls: z.array(
6376
6376
  z.object({
6377
6377
  name: z.string(),
6378
- arguments: z.record(z.unknown()).optional(),
6378
+ arguments: z.record(z.string(), z.unknown()).optional(),
6379
6379
  required: z.boolean().optional()
6380
6380
  })
6381
6381
  ),
@@ -6391,12 +6391,12 @@ var EvalExpectBlockSchema = z.object({
6391
6391
  var EvalCaseSchema = z.object({
6392
6392
  id: z.string().min(1, "id must not be empty"),
6393
6393
  description: z.string().optional(),
6394
- mode: z.enum(["direct", "llm_host"]).optional(),
6394
+ mode: z.enum(["direct", "mcp_host"]).optional(),
6395
6395
  toolName: z.string().min(1, "toolName must not be empty").optional(),
6396
- args: z.record(z.unknown()).optional(),
6396
+ args: z.record(z.string(), z.unknown()).optional(),
6397
6397
  scenario: z.string().optional(),
6398
- llmHostConfig: LLMHostConfigSchema.optional(),
6399
- metadata: z.record(z.unknown()).optional(),
6398
+ mcpHostConfig: MCPHostConfigSchema.optional(),
6399
+ metadata: z.record(z.string(), z.unknown()).optional(),
6400
6400
  iterations: z.number().int().min(1).optional(),
6401
6401
  accuracyThreshold: z.number().min(0).max(1).optional(),
6402
6402
  judgeReps: z.number().int().min(1).optional(),
@@ -6408,7 +6408,7 @@ var EvalDatasetSchema = z.object({
6408
6408
  name: z.string().min(1, "name must not be empty"),
6409
6409
  description: z.string().optional(),
6410
6410
  cases: z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
6411
- metadata: z.record(z.unknown()).optional()
6411
+ metadata: z.record(z.string(), z.unknown()).optional()
6412
6412
  });
6413
6413
  function validateEvalCase(evalCase) {
6414
6414
  return EvalCaseSchema.parse(evalCase);
@@ -6446,30 +6446,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
6446
6446
  return dataset;
6447
6447
  }
6448
6448
 
6449
- // src/evals/llmHost/adapters/vercel.ts
6449
+ // src/evals/mcpHost/adapters/vercel.ts
6450
6450
  function enrichErrorMessage(err, provider) {
6451
6451
  const raw = err instanceof Error ? err.message : String(err);
6452
6452
  if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
6453
- return `LLM host simulation failed: required package not installed.
6454
- Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/llm-host.md for install instructions.`;
6453
+ return `MCP host simulation failed: required package not installed.
6454
+ Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
6455
6455
  }
6456
6456
  if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
6457
- return `LLM host simulation failed: authentication error.
6457
+ return `MCP host simulation failed: authentication error.
6458
6458
  Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
6459
6459
  }
6460
6460
  if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
6461
- return `LLM host simulation failed: model not found.
6461
+ return `MCP host simulation failed: model not found.
6462
6462
  Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
6463
6463
  }
6464
6464
  if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
6465
- return `LLM host simulation failed: network error.
6465
+ return `MCP host simulation failed: network error.
6466
6466
  Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
6467
6467
  }
6468
6468
  if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
6469
- return `LLM host simulation failed: rate limited.
6469
+ return `MCP host simulation failed: rate limited.
6470
6470
  Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
6471
6471
  }
6472
- return `LLM host simulation failed: ${raw}`;
6472
+ return `MCP host simulation failed: ${raw}`;
6473
6473
  }
6474
6474
  async function loadModel(provider, model) {
6475
6475
  switch (provider) {
@@ -6599,7 +6599,7 @@ function createVercelOrchestrator() {
6599
6599
  };
6600
6600
  }
6601
6601
 
6602
- // src/evals/llmHost/llmHostSimulation.ts
6602
+ // src/evals/mcpHost/mcpHostSimulation.ts
6603
6603
  var vercelOrchestrator = createVercelOrchestrator();
6604
6604
  var allProviders = [
6605
6605
  "openai",
@@ -6615,7 +6615,7 @@ var allProviders = [
6615
6615
  var simulatorRegistry = new Map(
6616
6616
  allProviders.map((p) => [p, vercelOrchestrator])
6617
6617
  );
6618
- async function simulateLLMHost(mcp, scenario, config) {
6618
+ async function simulateMCPHost(mcp, scenario, config) {
6619
6619
  const simulator = simulatorRegistry.get(config.provider);
6620
6620
  if (!simulator) {
6621
6621
  throw new Error(
@@ -6637,7 +6637,7 @@ function getMissingDependencyMessage(provider) {
6637
6637
  deepseek: "npm install ai @ai-sdk/deepseek",
6638
6638
  openrouter: "npm install ai @openrouter/ai-sdk-provider",
6639
6639
  xai: "npm install ai @ai-sdk/xai",
6640
- "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/llm-host.md)"
6640
+ "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
6641
6641
  };
6642
6642
  const pkg = packageMap[provider];
6643
6643
  return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
@@ -6680,24 +6680,24 @@ async function execFileNoThrow(file, args) {
6680
6680
  async function executeToolCall(evalCase, mcp) {
6681
6681
  const mode = evalCase.mode || "direct";
6682
6682
  try {
6683
- if (mode === "llm_host") {
6683
+ if (mode === "mcp_host") {
6684
6684
  if (!evalCase.scenario) {
6685
6685
  throw new Error(
6686
- `Eval case ${evalCase.id}: scenario is required for llm_host mode`
6686
+ `Eval case ${evalCase.id}: scenario is required for mcp_host mode`
6687
6687
  );
6688
6688
  }
6689
- if (!evalCase.llmHostConfig) {
6689
+ if (!evalCase.mcpHostConfig) {
6690
6690
  throw new Error(
6691
- `Eval case ${evalCase.id}: llmHostConfig is required for llm_host mode`
6691
+ `Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
6692
6692
  );
6693
6693
  }
6694
- const simulationResult = await simulateLLMHost(
6694
+ const simulationResult = await simulateMCPHost(
6695
6695
  mcp,
6696
6696
  evalCase.scenario,
6697
- evalCase.llmHostConfig
6697
+ evalCase.mcpHostConfig
6698
6698
  );
6699
6699
  if (!simulationResult.success) {
6700
- throw new Error(simulationResult.error || "LLM host simulation failed");
6700
+ throw new Error(simulationResult.error || "MCP host simulation failed");
6701
6701
  }
6702
6702
  return { response: simulationResult };
6703
6703
  } else {
@@ -6839,12 +6839,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6839
6839
  }
6840
6840
  return { expectations: results, toolPrecision, toolRecall };
6841
6841
  }
6842
+ function isMCPHostSimulationResult(value) {
6843
+ return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6844
+ }
6842
6845
  async function runSingleIteration(evalCase, context, options) {
6843
6846
  const startTime = Date.now();
6844
6847
  const { response, error } = await executeToolCall(evalCase, context.mcp);
6845
6848
  let expectationResults = {};
6846
6849
  let toolPrecision;
6847
6850
  let toolRecall;
6851
+ let mcpHostTrace;
6848
6852
  if (!error && evalCase.expect) {
6849
6853
  const {
6850
6854
  expectations,
@@ -6859,11 +6863,28 @@ async function runSingleIteration(evalCase, context, options) {
6859
6863
  expectationResults = expectations;
6860
6864
  toolPrecision = tp;
6861
6865
  toolRecall = tr;
6866
+ if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
6867
+ const expectedNames = new Set(
6868
+ evalCase.expect.toolsTriggered.calls.map((c) => c.name)
6869
+ );
6870
+ const requiredNames = new Set(
6871
+ evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
6872
+ );
6873
+ const calledNames = new Set(response.toolCalls.map((c) => c.name));
6874
+ mcpHostTrace = {
6875
+ calls: response.toolCalls.map((call) => ({
6876
+ name: call.name,
6877
+ arguments: call.arguments,
6878
+ status: expectedNames.has(call.name) ? "expected" : "unexpected"
6879
+ })),
6880
+ missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
6881
+ };
6882
+ }
6862
6883
  }
6863
6884
  return {
6864
6885
  id: evalCase.id,
6865
6886
  datasetName: options.datasetName ?? "single-case",
6866
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6887
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6867
6888
  source: "eval",
6868
6889
  pass: didCasePass(error, expectationResults),
6869
6890
  response,
@@ -6874,7 +6895,8 @@ async function runSingleIteration(evalCase, context, options) {
6874
6895
  durationMs: Date.now() - startTime,
6875
6896
  tags: evalCase.tags,
6876
6897
  toolPrecision,
6877
- toolRecall
6898
+ toolRecall,
6899
+ mcpHostTrace
6878
6900
  };
6879
6901
  }
6880
6902
  function isInfrastructureError(err) {
@@ -6931,7 +6953,7 @@ async function runEvalCase(evalCase, context, options = {}) {
6931
6953
  const baseResult = lastResult ?? {
6932
6954
  id: evalCase.id,
6933
6955
  datasetName: options.datasetName ?? "single-case",
6934
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6956
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6935
6957
  source: "eval",
6936
6958
  pass: false,
6937
6959
  error: iterationResults[0]?.error,
@@ -6945,12 +6967,25 @@ async function runEvalCase(evalCase, context, options = {}) {
6945
6967
  ...baseResult,
6946
6968
  pass: assertionPassRate >= threshold,
6947
6969
  assertionPassRate,
6970
+ assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
6948
6971
  infrastructureErrorRate,
6949
6972
  iterationResults,
6950
6973
  infrastructureErrorCount: infraErrors.length,
6951
6974
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
6952
6975
  };
6953
6976
  }
6977
+ function wilsonCI(k, n) {
6978
+ if (n < 2) return void 0;
6979
+ const z5 = 1.96;
6980
+ const z22 = z5 * z5;
6981
+ const \u00F1 = n + z22;
6982
+ const p\u0303 = (k + z22 / 2) / \u00F1;
6983
+ const margin = z5 * Math.sqrt(p\u0303 * (1 - p\u0303) / \u00F1);
6984
+ return {
6985
+ lower: Math.max(0, p\u0303 - margin),
6986
+ upper: Math.min(1, p\u0303 + margin)
6987
+ };
6988
+ }
6954
6989
  async function runWithConcurrency(tasks, limit) {
6955
6990
  const results = new Array(tasks.length);
6956
6991
  let index = 0;
@@ -6980,7 +7015,7 @@ async function runEvalDataset(options, context) {
6980
7015
  filterTags,
6981
7016
  saveResultsTo,
6982
7017
  baselineResultsFrom,
6983
- llmHostModel,
7018
+ mcpHostModel,
6984
7019
  judgeModel
6985
7020
  } = options;
6986
7021
  const startTime = Date.now();
@@ -6990,7 +7025,7 @@ async function runEvalDataset(options, context) {
6990
7025
  };
6991
7026
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
6992
7027
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
6993
- const effectiveIterations = c.mode === "llm_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7028
+ const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
6994
7029
  const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
6995
7030
  return sum + effectiveIterations * judgeReps;
6996
7031
  }, 0);
@@ -7000,12 +7035,12 @@ async function runEvalDataset(options, context) {
7000
7035
  );
7001
7036
  }
7002
7037
  const tasks = casesToRun.map((evalCase) => async () => {
7003
- const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7004
- if (evalCase.mode === "llm_host") {
7038
+ const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7039
+ if (evalCase.mode === "mcp_host") {
7005
7040
  const effectiveIterations = withIterations.iterations ?? 1;
7006
7041
  if (effectiveIterations > 1 && effectiveIterations < 10) {
7007
7042
  console.warn(
7008
- `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7043
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7009
7044
  );
7010
7045
  }
7011
7046
  }
@@ -7037,7 +7072,7 @@ async function runEvalDataset(options, context) {
7037
7072
  gitHash,
7038
7073
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7039
7074
  packageVersion: package_default.version,
7040
- ...llmHostModel !== void 0 && { llmHostModel },
7075
+ ...mcpHostModel !== void 0 && { mcpHostModel },
7041
7076
  ...judgeModel !== void 0 && { judgeModel }
7042
7077
  };
7043
7078
  const result = {
@@ -7082,12 +7117,12 @@ async function runEvalDataset(options, context) {
7082
7117
  );
7083
7118
  }
7084
7119
  }
7085
- const llmHostCases = caseResults.filter(
7120
+ const mcpHostCases = caseResults.filter(
7086
7121
  (r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
7087
7122
  );
7088
- if (llmHostCases.length > 0) {
7089
- const avgPrec = llmHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / llmHostCases.length;
7090
- const avgRecall = llmHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / llmHostCases.length;
7123
+ if (mcpHostCases.length > 0) {
7124
+ const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
7125
+ const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
7091
7126
  result.datasetToolPrecision = avgPrec;
7092
7127
  result.datasetToolRecall = avgRecall;
7093
7128
  result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
@@ -7155,7 +7190,6 @@ async function runServerComparison(options, contextA, contextB) {
7155
7190
  bWins,
7156
7191
  ties,
7157
7192
  bothFail,
7158
- bothFailCount: bothFail,
7159
7193
  decidedCases,
7160
7194
  failureAlignment: total > 0 ? bothFail / total : 0,
7161
7195
  aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
@@ -7339,6 +7373,6 @@ function formatCapabilities(capabilities) {
7339
7373
  return parts.length > 0 ? parts.join(", ") : "none declared";
7340
7374
  }
7341
7375
 
7342
- export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7376
+ export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7343
7377
  //# sourceMappingURL=index.js.map
7344
7378
  //# sourceMappingURL=index.js.map