@gleanwork/mcp-server-tester 1.0.0-beta.3 → 1.0.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3100,7 +3100,7 @@ var init_dist3 = __esm({
3100
3100
  }
3101
3101
  });
3102
3102
  var MCPHostCapabilitiesSchema = z.object({
3103
- sampling: z.record(z.unknown()).optional(),
3103
+ sampling: z.record(z.string(), z.unknown()).optional(),
3104
3104
  roots: z.object({
3105
3105
  listChanged: z.boolean()
3106
3106
  }).optional()
@@ -3159,7 +3159,7 @@ var HttpConfigSchema = z.object({
3159
3159
  }
3160
3160
  return true;
3161
3161
  }),
3162
- headers: z.record(z.string()).optional(),
3162
+ headers: z.record(z.string(), z.string()).optional(),
3163
3163
  capabilities: MCPHostCapabilitiesSchema.optional(),
3164
3164
  connectTimeoutMs: z.number().positive().optional(),
3165
3165
  requestTimeoutMs: z.number().positive().optional(),
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
4380
4380
 
4381
4381
  // package.json
4382
4382
  var package_default = {
4383
- version: "1.0.0-beta.3"};
4383
+ version: "1.0.0-beta.4"};
4384
4384
 
4385
4385
  // src/mcp/clientFactory.ts
4386
4386
  function getRetryAfterDelayMs(err) {
@@ -5124,7 +5124,7 @@ function validateToolCalls(response, expectation) {
5124
5124
  if (!isSimulationResult(response)) {
5125
5125
  return {
5126
5126
  pass: false,
5127
- message: "toolsTriggered expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5127
+ message: "toolsTriggered expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5128
5128
  };
5129
5129
  }
5130
5130
  const actual = response.toolCalls;
@@ -5184,7 +5184,7 @@ function validateToolCallCount(response, options) {
5184
5184
  if (!isSimulationResult(response)) {
5185
5185
  return {
5186
5186
  pass: false,
5187
- message: "toolCallCount expectation requires llm_host mode \u2014 response must be an LLMHostSimulationResult"
5187
+ message: "toolCallCount expectation requires mcp_host mode \u2014 response must be an MCPHostSimulationResult"
5188
5188
  };
5189
5189
  }
5190
5190
  const count = response.toolCalls.length;
@@ -6306,7 +6306,7 @@ function getAuthConfigFromEnv() {
6306
6306
  }
6307
6307
  return void 0;
6308
6308
  }
6309
- var LLMHostConfigSchema = z.object({
6309
+ var MCPHostConfigSchema = z.object({
6310
6310
  provider: z.enum([
6311
6311
  "openai",
6312
6312
  "anthropic",
@@ -6375,7 +6375,7 @@ var EvalExpectBlockSchema = z.object({
6375
6375
  calls: z.array(
6376
6376
  z.object({
6377
6377
  name: z.string(),
6378
- arguments: z.record(z.unknown()).optional(),
6378
+ arguments: z.record(z.string(), z.unknown()).optional(),
6379
6379
  required: z.boolean().optional()
6380
6380
  })
6381
6381
  ),
@@ -6391,12 +6391,12 @@ var EvalExpectBlockSchema = z.object({
6391
6391
  var EvalCaseSchema = z.object({
6392
6392
  id: z.string().min(1, "id must not be empty"),
6393
6393
  description: z.string().optional(),
6394
- mode: z.enum(["direct", "llm_host"]).optional(),
6394
+ mode: z.enum(["direct", "mcp_host"]).optional(),
6395
6395
  toolName: z.string().min(1, "toolName must not be empty").optional(),
6396
- args: z.record(z.unknown()).optional(),
6396
+ args: z.record(z.string(), z.unknown()).optional(),
6397
6397
  scenario: z.string().optional(),
6398
- llmHostConfig: LLMHostConfigSchema.optional(),
6399
- metadata: z.record(z.unknown()).optional(),
6398
+ mcpHostConfig: MCPHostConfigSchema.optional(),
6399
+ metadata: z.record(z.string(), z.unknown()).optional(),
6400
6400
  iterations: z.number().int().min(1).optional(),
6401
6401
  accuracyThreshold: z.number().min(0).max(1).optional(),
6402
6402
  judgeReps: z.number().int().min(1).optional(),
@@ -6408,7 +6408,7 @@ var EvalDatasetSchema = z.object({
6408
6408
  name: z.string().min(1, "name must not be empty"),
6409
6409
  description: z.string().optional(),
6410
6410
  cases: z.array(EvalCaseSchema).min(1, "dataset must have at least one case"),
6411
- metadata: z.record(z.unknown()).optional()
6411
+ metadata: z.record(z.string(), z.unknown()).optional()
6412
6412
  });
6413
6413
  function validateEvalCase(evalCase) {
6414
6414
  return EvalCaseSchema.parse(evalCase);
@@ -6446,30 +6446,30 @@ function loadEvalDatasetFromObject(data, options = {}) {
6446
6446
  return dataset;
6447
6447
  }
6448
6448
 
6449
- // src/evals/llmHost/adapters/vercel.ts
6449
+ // src/evals/mcpHost/adapters/vercel.ts
6450
6450
  function enrichErrorMessage(err, provider) {
6451
6451
  const raw = err instanceof Error ? err.message : String(err);
6452
6452
  if (raw.includes("Cannot find module") || raw.includes("ERR_MODULE_NOT_FOUND")) {
6453
- return `LLM host simulation failed: required package not installed.
6454
- Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/llm-host.md for install instructions.`;
6453
+ return `MCP host simulation failed: required package not installed.
6454
+ Hint: run \`getMissingDependencyMessage('${provider}')\` or check docs/mcp-host.md for install instructions.`;
6455
6455
  }
6456
6456
  if (raw.includes("401") || raw.includes("Unauthorized") || raw.includes("API key") || raw.includes("api_key")) {
6457
- return `LLM host simulation failed: authentication error.
6457
+ return `MCP host simulation failed: authentication error.
6458
6458
  Hint: check your API key environment variable (e.g. ANTHROPIC_API_KEY, GOOGLE_APPLICATION_CREDENTIALS).`;
6459
6459
  }
6460
6460
  if (raw.includes("404") || raw.includes("Not Found") || raw.toLowerCase().includes("model") && raw.toLowerCase().includes("not found")) {
6461
- return `LLM host simulation failed: model not found.
6461
+ return `MCP host simulation failed: model not found.
6462
6462
  Hint: check the model name format for your provider. For vertex-anthropic use 'claude-3-5-haiku@20241022' (with @).`;
6463
6463
  }
6464
6464
  if (raw.includes("ENOTFOUND") || raw.includes("fetch failed") || raw.includes("ECONNREFUSED")) {
6465
- return `LLM host simulation failed: network error.
6465
+ return `MCP host simulation failed: network error.
6466
6466
  Hint: check network connectivity and whether the provider's API endpoint is reachable from this machine.`;
6467
6467
  }
6468
6468
  if (raw.includes("429") || raw.toLowerCase().includes("rate limit") || raw.includes("Too Many Requests")) {
6469
- return `LLM host simulation failed: rate limited.
6469
+ return `MCP host simulation failed: rate limited.
6470
6470
  Hint: reduce concurrency, add delays between iterations, or upgrade your API plan.`;
6471
6471
  }
6472
- return `LLM host simulation failed: ${raw}`;
6472
+ return `MCP host simulation failed: ${raw}`;
6473
6473
  }
6474
6474
  async function loadModel(provider, model) {
6475
6475
  switch (provider) {
@@ -6599,7 +6599,7 @@ function createVercelOrchestrator() {
6599
6599
  };
6600
6600
  }
6601
6601
 
6602
- // src/evals/llmHost/llmHostSimulation.ts
6602
+ // src/evals/mcpHost/mcpHostSimulation.ts
6603
6603
  var vercelOrchestrator = createVercelOrchestrator();
6604
6604
  var allProviders = [
6605
6605
  "openai",
@@ -6615,7 +6615,7 @@ var allProviders = [
6615
6615
  var simulatorRegistry = new Map(
6616
6616
  allProviders.map((p) => [p, vercelOrchestrator])
6617
6617
  );
6618
- async function simulateLLMHost(mcp, scenario, config) {
6618
+ async function simulateMCPHost(mcp, scenario, config) {
6619
6619
  const simulator = simulatorRegistry.get(config.provider);
6620
6620
  if (!simulator) {
6621
6621
  throw new Error(
@@ -6637,7 +6637,7 @@ function getMissingDependencyMessage(provider) {
6637
6637
  deepseek: "npm install ai @ai-sdk/deepseek",
6638
6638
  openrouter: "npm install ai @openrouter/ai-sdk-provider",
6639
6639
  xai: "npm install ai @ai-sdk/xai",
6640
- "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/llm-host.md)"
6640
+ "vertex-anthropic": "npm install ai @ai-sdk/google-vertex (requires Application Default Credentials \u2014 see docs/mcp-host.md)"
6641
6641
  };
6642
6642
  const pkg = packageMap[provider];
6643
6643
  return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
@@ -6680,24 +6680,24 @@ async function execFileNoThrow(file, args) {
6680
6680
  async function executeToolCall(evalCase, mcp) {
6681
6681
  const mode = evalCase.mode || "direct";
6682
6682
  try {
6683
- if (mode === "llm_host") {
6683
+ if (mode === "mcp_host") {
6684
6684
  if (!evalCase.scenario) {
6685
6685
  throw new Error(
6686
- `Eval case ${evalCase.id}: scenario is required for llm_host mode`
6686
+ `Eval case ${evalCase.id}: scenario is required for mcp_host mode`
6687
6687
  );
6688
6688
  }
6689
- if (!evalCase.llmHostConfig) {
6689
+ if (!evalCase.mcpHostConfig) {
6690
6690
  throw new Error(
6691
- `Eval case ${evalCase.id}: llmHostConfig is required for llm_host mode`
6691
+ `Eval case ${evalCase.id}: mcpHostConfig is required for mcp_host mode`
6692
6692
  );
6693
6693
  }
6694
- const simulationResult = await simulateLLMHost(
6694
+ const simulationResult = await simulateMCPHost(
6695
6695
  mcp,
6696
6696
  evalCase.scenario,
6697
- evalCase.llmHostConfig
6697
+ evalCase.mcpHostConfig
6698
6698
  );
6699
6699
  if (!simulationResult.success) {
6700
- throw new Error(simulationResult.error || "LLM host simulation failed");
6700
+ throw new Error(simulationResult.error || "MCP host simulation failed");
6701
6701
  }
6702
6702
  return { response: simulationResult };
6703
6703
  } else {
@@ -6839,12 +6839,16 @@ async function runExpectBlockValidations(expectBlock, response, config) {
6839
6839
  }
6840
6840
  return { expectations: results, toolPrecision, toolRecall };
6841
6841
  }
6842
+ function isMCPHostSimulationResult(value) {
6843
+ return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
6844
+ }
6842
6845
  async function runSingleIteration(evalCase, context, options) {
6843
6846
  const startTime = Date.now();
6844
6847
  const { response, error } = await executeToolCall(evalCase, context.mcp);
6845
6848
  let expectationResults = {};
6846
6849
  let toolPrecision;
6847
6850
  let toolRecall;
6851
+ let mcpHostTrace;
6848
6852
  if (!error && evalCase.expect) {
6849
6853
  const {
6850
6854
  expectations,
@@ -6859,6 +6863,23 @@ async function runSingleIteration(evalCase, context, options) {
6859
6863
  expectationResults = expectations;
6860
6864
  toolPrecision = tp;
6861
6865
  toolRecall = tr;
6866
+ if (evalCase.expect.toolsTriggered !== void 0 && isMCPHostSimulationResult(response)) {
6867
+ const expectedNames = new Set(
6868
+ evalCase.expect.toolsTriggered.calls.map((c) => c.name)
6869
+ );
6870
+ const requiredNames = new Set(
6871
+ evalCase.expect.toolsTriggered.calls.filter((c) => c.required !== false).map((c) => c.name)
6872
+ );
6873
+ const calledNames = new Set(response.toolCalls.map((c) => c.name));
6874
+ mcpHostTrace = {
6875
+ calls: response.toolCalls.map((call) => ({
6876
+ name: call.name,
6877
+ arguments: call.arguments,
6878
+ status: expectedNames.has(call.name) ? "expected" : "unexpected"
6879
+ })),
6880
+ missed: Array.from(requiredNames).filter((name15) => !calledNames.has(name15)).map((name15) => ({ name: name15 }))
6881
+ };
6882
+ }
6862
6883
  }
6863
6884
  return {
6864
6885
  id: evalCase.id,
@@ -6874,7 +6895,8 @@ async function runSingleIteration(evalCase, context, options) {
6874
6895
  durationMs: Date.now() - startTime,
6875
6896
  tags: evalCase.tags,
6876
6897
  toolPrecision,
6877
- toolRecall
6898
+ toolRecall,
6899
+ mcpHostTrace
6878
6900
  };
6879
6901
  }
6880
6902
  function isInfrastructureError(err) {
@@ -6980,7 +7002,7 @@ async function runEvalDataset(options, context) {
6980
7002
  filterTags,
6981
7003
  saveResultsTo,
6982
7004
  baselineResultsFrom,
6983
- llmHostModel,
7005
+ mcpHostModel,
6984
7006
  judgeModel
6985
7007
  } = options;
6986
7008
  const startTime = Date.now();
@@ -6990,7 +7012,7 @@ async function runEvalDataset(options, context) {
6990
7012
  };
6991
7013
  const casesToRun = filterTags && filterTags.length > 0 ? dataset.cases.filter((c) => c.tags?.some((t) => filterTags.includes(t))) : dataset.cases;
6992
7014
  const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
6993
- const effectiveIterations = c.mode === "llm_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
7015
+ const effectiveIterations = c.mode === "mcp_host" ? c.iterations ?? defaultLlmIterations ?? 1 : c.iterations ?? 1;
6994
7016
  const judgeReps = c.expect?.passesJudge != null ? c.expect.passesJudge.reps ?? c.judgeReps ?? defaultJudgeReps ?? 1 : 0;
6995
7017
  return sum + effectiveIterations * judgeReps;
6996
7018
  }, 0);
@@ -7000,12 +7022,12 @@ async function runEvalDataset(options, context) {
7000
7022
  );
7001
7023
  }
7002
7024
  const tasks = casesToRun.map((evalCase) => async () => {
7003
- const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7004
- if (evalCase.mode === "llm_host") {
7025
+ const withIterations = evalCase.mode === "mcp_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7026
+ if (evalCase.mode === "mcp_host") {
7005
7027
  const effectiveIterations = withIterations.iterations ?? 1;
7006
7028
  if (effectiveIterations > 1 && effectiveIterations < 10) {
7007
7029
  console.warn(
7008
- `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7030
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7009
7031
  );
7010
7032
  }
7011
7033
  }
@@ -7037,7 +7059,7 @@ async function runEvalDataset(options, context) {
7037
7059
  gitHash,
7038
7060
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7039
7061
  packageVersion: package_default.version,
7040
- ...llmHostModel !== void 0 && { llmHostModel },
7062
+ ...mcpHostModel !== void 0 && { mcpHostModel },
7041
7063
  ...judgeModel !== void 0 && { judgeModel }
7042
7064
  };
7043
7065
  const result = {
@@ -7082,12 +7104,12 @@ async function runEvalDataset(options, context) {
7082
7104
  );
7083
7105
  }
7084
7106
  }
7085
- const llmHostCases = caseResults.filter(
7107
+ const mcpHostCases = caseResults.filter(
7086
7108
  (r) => r.toolPrecision !== void 0 || r.toolRecall !== void 0
7087
7109
  );
7088
- if (llmHostCases.length > 0) {
7089
- const avgPrec = llmHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / llmHostCases.length;
7090
- const avgRecall = llmHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / llmHostCases.length;
7110
+ if (mcpHostCases.length > 0) {
7111
+ const avgPrec = mcpHostCases.reduce((s, r) => s + (r.toolPrecision ?? 0), 0) / mcpHostCases.length;
7112
+ const avgRecall = mcpHostCases.reduce((s, r) => s + (r.toolRecall ?? 0), 0) / mcpHostCases.length;
7091
7113
  result.datasetToolPrecision = avgPrec;
7092
7114
  result.datasetToolRecall = avgRecall;
7093
7115
  result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
@@ -7155,7 +7177,6 @@ async function runServerComparison(options, contextA, contextB) {
7155
7177
  bWins,
7156
7178
  ties,
7157
7179
  bothFail,
7158
- bothFailCount: bothFail,
7159
7180
  decidedCases,
7160
7181
  failureAlignment: total > 0 ? bothFail / total : 0,
7161
7182
  aWinRate: decidedCases > 0 ? aWins / decidedCases : 0,
@@ -7339,6 +7360,6 @@ function formatCapabilities(capabilities) {
7339
7360
  return parts.length > 0 ? parts.join(", ") : "none declared";
7340
7361
  }
7341
7362
 
7342
- export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateLLMHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7363
+ export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7343
7364
  //# sourceMappingURL=index.js.map
7344
7365
  //# sourceMappingURL=index.js.map