@gleanwork/mcp-server-tester 1.0.0-beta.2 → 1.0.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3100,7 +3100,7 @@ var init_dist3 = __esm({
3100
3100
  }
3101
3101
  });
3102
3102
  var MCPHostCapabilitiesSchema = z.object({
3103
- sampling: z.record(z.unknown()).optional(),
3103
+ sampling: z.record(z.string(), z.unknown()).optional(),
3104
3104
  roots: z.object({
3105
3105
  listChanged: z.boolean()
3106
3106
  }).optional()
@@ -3159,7 +3159,7 @@ var HttpConfigSchema = z.object({
3159
3159
  }
3160
3160
  return true;
3161
3161
  }),
3162
- headers: z.record(z.string()).optional(),
3162
+ headers: z.record(z.string(), z.string()).optional(),
3163
3163
  capabilities: MCPHostCapabilitiesSchema.optional(),
3164
3164
  connectTimeoutMs: z.number().positive().optional(),
3165
3165
  requestTimeoutMs: z.number().positive().optional(),
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
4380
4380
 
4381
4381
  // package.json
4382
4382
  var package_default = {
4383
- version: "1.0.0-beta.2"};
4383
+ version: "1.0.0-beta.4"};
4384
4384
 
4385
4385
  // src/mcp/clientFactory.ts
4386
4386
  function getRetryAfterDelayMs(err) {
@@ -4471,7 +4471,10 @@ async function createMCPClientForConfig(config, options) {
4471
4471
  validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
4472
4472
  );
4473
4473
  } else if (isHttpConfig(validatedConfig)) {
4474
- const headers = { ...validatedConfig.headers };
4474
+ const headers = {
4475
+ "User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
4476
+ ...validatedConfig.headers
4477
+ };
4475
4478
  if (validatedConfig.auth?.clientCredentials && !options?.authProvider) {
4476
4479
  const ccConfig = validatedConfig.auth.clientCredentials;
4477
4480
  const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
@@ -5121,7 +5124,7 @@ function validateToolCalls(response, expectation) {
5121
5124
  if (!isSimulationResult(response)) {
5122
5125
  return {
5123
5126
  pass: false,
5124
- message: "toolsTriggered expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5127
+ message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5125
5128
  };
5126
5129
  }
5127
5130
  const actual = response.toolCalls;
@@ -5181,7 +5184,7 @@ function validateToolCallCount(response, options) {
5181
5184
  if (!isSimulationResult(response)) {
5182
5185
  return {
5183
5186
  pass: false,
5184
- message: "toolCallCount expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5187
+ message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5185
5188
  };
5186
5189
  }
5187
5190
  const count = response.toolCalls.length;
@@ -6303,7 +6306,7 @@ function getAuthConfigFromEnv() {
6303
6306
  }
6304
6307
  return void 0;
6305
6308
  }
6306
- var LLMHostConfigSchema = z.object({
6309
+ var MCPHostConfigSchema = z.object({
6307
6310
  provider: z.enum([
6308
6311
  "openai",
6309
6312
  "anthropic",
@@ -6372,7 +6375,7 @@ var EvalExpectBlockSchema = z.object({
6372
6375
  calls: z.array(
6373
6376
  z.object({
6374
6377
  name: z.string(),
6375
- arguments: z.record(z.unknown()).optional(),
6378
+ arguments: z.record(z.string(), z.unknown()).optional(),
6376
6379
  required: z.boolean().optional()
6377
6380
  })
6378
6381
  ),
@@ -6388,12 +6391,12 @@ var EvalExpectBlockSchema = z.object({
6388
6391
  var EvalCaseSchema = z.object({
6389
6392
  id: z.string().min(1, "id must not be empty"),
6390
6393
  description: z.string().optional(),
6391
- mode: z.enum(["direct", "llm_host"]).optional(),
6394
+ mode: z.enum(["direct", "mcp_host"]).optional(),
6392
6395
  toolName: z.string().min(1, "toolName must not be empty").optional(),
6393
- args: z.record(z.unknown()).optional(),
6396
+ args: z.record(z.string(), z.unknown()).optional(),
6394
6397
  scenario: z.string().optional(),
6395
- llmHostConfig: LLMHostConfigSchema.optional(),
6396
- metadata: z.record(z.unknown()).optional(),
6398
+ mcpHostConfig: MCPHostConfigSchema.optional(),
6399
+ metadata: z.record(z.string(), z.unknown()).optional(),
6397
6400
  iterations: z.number().int().min(1).optional(),
6398
6401
  accuracyThreshold: z.number().min(0).max(1).optional(),
6399
6402
  judgeReps: z.number().int().min(1).optional(),
@@ -6405,7 +6408,7 @@ var EvalDatasetSchema = z.object({
6405
6408
  name: z.string().min(1, "name must not be empty"),
6406
6409
  description: z.string().optional(),
6407
6410
  cases: z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
6408
- metadata: z.record(z.unknown()).optional()
6411
+ metadata: z.record(z.string(), z.unknown()).optional()
6409
6412
  });
6410
6413
  function validateEvalCase(evalCase) {
6411
6414
  return EvalCaseSchema.parse(evalCase);
@@ -6443,30 +6446,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
6443
6446
  return dataset;
6444
6447
  }
6445
6448
 
6446
- // src/evals/llmHost/adapters/vercel.ts
6449
+ // src/evals/mcpHost/adapters/vercel.ts
6447
6450
  function enrichErrorMessage(err, provider) {
6448
6451
  const raw = err instanceof Error ? err.message : String(err);
6449
6452
  if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
6450
- return `LLM host simulation failed: required package not installed.
6451
- Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/llm-host.md for install instructions.`;
6453
+ return `MCP host simulation failed: required package not installed.
6454
+ Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
6452
6455
  }
6453
6456
  if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
6454
- return `LLM host simulation failed: authentication error.
6457
+ return `MCP host simulation failed: authentication error.
6455
6458
  Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
6456
6459
  }
6457
6460
  if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
6458
- return `LLM host simulation failed: model not found.
6461
+ return `MCP host simulation failed: model not found.
6459
6462
  Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
6460
6463
  }
6461
6464
  if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
6462
- return `LLM host simulation failed: network error.
6465
+ return `MCP host simulation failed: network error.
6463
6466
  Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
6464
6467
  }
6465
6468
  if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
6466
- return `LLM host simulation failed: rate limited.
6469
+ return `MCP host simulation failed: rate limited.
6467
6470
  Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
6468
6471
  }
6469
- return `LLM host simulation failed: ${raw}`;
6472
+ return `MCP host simulation failed: ${raw}`;
6470
6473
  }
6471
6474
  async function loadModel(provider, model) {
6472
6475
  switch (provider) {
@@ -6596,7 +6599,7 @@ function createVercelOrchestrator() {
6596
6599
  };
6597
6600
  }
6598
6601
 
6599
- // src/evals/llmHost/llmHostSimulation.ts
6602
+ // src/evals/mcpHost/mcpHostSimulation.ts
6600
6603
  var vercelOrchestrator = createVercelOrchestrator();
6601
6604
  var allProviders = [
6602
6605
  "openai",
@@ -6612,7 +6615,7 @@ var allProviders = [
6612
6615
  var simulatorRegistry = new Map(
6613
6616
  allProviders.map((p) => [p, vercelOrchestrator])
6614
6617
  );
6615
- async function simulateLLMHost(mcp, scenario, config) {
6618
+ async function simulateMCPHost(mcp, scenario, config) {
6616
6619
  const simulator = simulatorRegistry.get(config.provider);
6617
6620
  if (!simulator) {
6618
6621
  throw new Error(
@@ -6634,7 +6637,7 @@ function getMissingDependencyMessage(provider) {
6634
6637
  deepseek: "npm install ai @ai-sdk/deepseek",
6635
6638
  openrouter: "npm install ai @openrouter/ai-sdk-provider",
6636
6639
  xai: "npm install ai @ai-sdk/xai",
6637
- "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/llm-host.md)"
6640
+ "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
6638
6641
  };
6639
6642
  const pkg = packageMap[provider];
6640
6643
  return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
@@ -6677,24 +6680,24 @@ async function execFileNoThrow(file, args) {
6677
6680
  async function executeToolCall(evalCase, mcp) {
6678
6681
  const mode = evalCase.mode || "direct";
6679
6682
  try {
6680
- if (mode === "llm_host") {
6683
+ if (mode === "mcp_host") {
6681
6684
  if (!evalCase.scenario) {
6682
6685
  throw new Error(
6683
- `Eval case ${evalCase.id}: scenario is required for llm_host mode`
6686
+ `Eval case ${evalCase.id}: scenario is required for mcp_host mode`
6684
6687
  );
6685
6688
  }
6686
- if (!evalCase.llmHostConfig) {
6689
+ if (!evalCase.mcpHostConfig) {
6687
6690
  throw new Error(
6688
- `Eval case ${evalCase.id}: llmHostConfig is required for llm_host mode`
6691
+ `Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
6689
6692
  );
6690
6693
  }
6691
- const simulationResult = await simulateLLMHost(
6694
+ const simulationResult = await simulateMCPHost(
6692
6695
  mcp,
6693
6696
  evalCase.scenario,
6694
- evalCase.llmHostConfig
6697
+ evalCase.mcpHostConfig
6695
6698
  );
6696
6699
  if (!simulationResult.success) {
6697
- throw new Error(simulationResult.error || "LLM host simulation failed");
6700
+ throw new Error(simulationResult.error || "MCP host simulation failed");
6698
6701
  }
6699
6702
  return { response: simulationResult };
6700
6703
  } else {
@@ -6836,12 +6839,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6836
6839
  }
6837
6840
  return { expectations: results, toolPrecision, toolRecall };
6838
6841
  }
6842
+ function isMCPHostSimulationResult(value) {
6843
+ return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6844
+ }
6839
6845
  async function runSingleIteration(evalCase, context, options) {
6840
6846
  const startTime = Date.now();
6841
6847
  const { response, error } = await executeToolCall(evalCase, context.mcp);
6842
6848
  let expectationResults = {};
6843
6849
  let toolPrecision;
6844
6850
  let toolRecall;
6851
+ let mcpHostTrace;
6845
6852
  if (!error && evalCase.expect) {
6846
6853
  const {
6847
6854
  expectations,
@@ -6856,6 +6863,23 @@ async function runSingleIteration(evalCase, context, options) {
6856
6863
  expectationResults = expectations;
6857
6864
  toolPrecision = tp;
6858
6865
  toolRecall = tr;
6866
+ if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
6867
+ const expectedNames = new Set(
6868
+ evalCase.expect.toolsTriggered.calls.map((c) => c.name)
6869
+ );
6870
+ const requiredNames = new Set(
6871
+ evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
6872
+ );
6873
+ const calledNames = new Set(response.toolCalls.map((c) => c.name));
6874
+ mcpHostTrace = {
6875
+ calls: response.toolCalls.map((call) => ({
6876
+ name: call.name,
6877
+ arguments: call.arguments,
6878
+ status: expectedNames.has(call.name) ? "expected" : "unexpected"
6879
+ })),
6880
+ missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
6881
+ };
6882
+ }
6859
6883
  }
6860
6884
  return {
6861
6885
  id: evalCase.id,
@@ -6871,7 +6895,8 @@ async function runSingleIteration(evalCase, context, options) {
6871
6895
  durationMs: Date.now() - startTime,
6872
6896
  tags: evalCase.tags,
6873
6897
  toolPrecision,
6874
- toolRecall
6898
+ toolRecall,
6899
+ mcpHostTrace
6875
6900
  };
6876
6901
  }
6877
6902
  function isInfrastructureError(err) {
@@ -6924,7 +6949,6 @@ async function runEvalCase(evalCase, context, options = {}) {
6924
6949
  const passCount = assertionResults.filter((r) => r.pass).length;
6925
6950
  const assertionPassRate = assertionResults.length > 0 ? passCount / assertionResults.length : 0;
6926
6951
  const infrastructureErrorRate = infraErrors.length / iterations;
6927
- const accuracy = assertionPassRate;
6928
6952
  const threshold = evalCase.accuracyThreshold ?? 1;
6929
6953
  const baseResult = lastResult ?? {
6930
6954
  id: evalCase.id,
@@ -6941,10 +6965,9 @@ async function runEvalCase(evalCase, context, options = {}) {
6941
6965
  };
6942
6966
  return {
6943
6967
  ...baseResult,
6944
- pass: accuracy >= threshold,
6968
+ pass: assertionPassRate >= threshold,
6945
6969
  assertionPassRate,
6946
6970
  infrastructureErrorRate,
6947
- accuracy,
6948
6971
  iterationResults,
6949
6972
  infrastructureErrorCount: infraErrors.length,
6950
6973
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
@@ -6979,7 +7002,7 @@ async function runEvalDataset(options, context) {
6979
7002
  filterTags,
6980
7003
  saveResultsTo,
6981
7004
  baselineResultsFrom,
6982
- llmHostModel,
7005
+ mcpHostModel,
6983
7006
  judgeModel
6984
7007
  } = options;
6985
7008
  const startTime = Date.now();
@@ -6989,7 +7012,7 @@ async function runEvalDataset(options, context) {
6989
7012
  };
6990
7013
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
6991
7014
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
6992
- const effectiveIterations = c.mode === "llm_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7015
+ const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
6993
7016
  const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
6994
7017
  return sum + effectiveIterations * judgeReps;
6995
7018
  }, 0);
@@ -6999,12 +7022,12 @@ async function runEvalDataset(options, context) {
6999
7022
  );
7000
7023
  }
7001
7024
  const tasks = casesToRun.map((evalCase) => async () => {
7002
- const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7003
- if (evalCase.mode === "llm_host") {
7025
+ const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7026
+ if (evalCase.mode === "mcp_host") {
7004
7027
  const effectiveIterations = withIterations.iterations ?? 1;
7005
7028
  if (effectiveIterations > 1 && effectiveIterations < 10) {
7006
7029
  console.warn(
7007
- `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7030
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7008
7031
  );
7009
7032
  }
7010
7033
  }
@@ -7036,7 +7059,7 @@ async function runEvalDataset(options, context) {
7036
7059
  gitHash,
7037
7060
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7038
7061
  packageVersion: package_default.version,
7039
- ...llmHostModel !== void 0 && { llmHostModel },
7062
+ ...mcpHostModel !== void 0 && { mcpHostModel },
7040
7063
  ...judgeModel !== void 0 && { judgeModel }
7041
7064
  };
7042
7065
  const result = {
@@ -7081,12 +7104,12 @@ async function runEvalDataset(options, context) {
7081
7104
  );
7082
7105
  }
7083
7106
  }
7084
- const llmHostCases = caseResults.filter(
7107
+ const mcpHostCases = caseResults.filter(
7085
7108
  (r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
7086
7109
  );
7087
- if (llmHostCases.length > 0) {
7088
- const avgPrec = llmHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / llmHostCases.length;
7089
- const avgRecall = llmHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / llmHostCases.length;
7110
+ if (mcpHostCases.length > 0) {
7111
+ const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
7112
+ const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
7090
7113
  result.datasetToolPrecision = avgPrec;
7091
7114
  result.datasetToolRecall = avgRecall;
7092
7115
  result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
@@ -7154,7 +7177,6 @@ async function runServerComparison(options, contextA, contextB) {
7154
7177
  bWins,
7155
7178
  ties,
7156
7179
  bothFail,
7157
- bothFailCount: bothFail,
7158
7180
  decidedCases,
7159
7181
  failureAlignment: total > 0 ? bothFail / total : 0,
7160
7182
  aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
@@ -7338,6 +7360,6 @@ function formatCapabilities(capabilities) {
7338
7360
  return parts.length > 0 ? parts.join(", ") : "none declared";
7339
7361
  }
7340
7362
 
7341
- export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7363
+ export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7342
7364
  //# sourceMappingURL=index.js.map
7343
7365
  //# sourceMappingURL=index.js.map