@gleanwork/mcp-server-tester 1.0.0-beta.8 → 1.0.1-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4384,7 +4384,7 @@ function escapeHtml(text) {
4384
4384
 
4385
4385
  // package.json
4386
4386
  var package_default = {
4387
- version: "1.0.0-beta.8"};
4387
+ version: "1.0.1-beta.0"};
4388
4388
 
4389
4389
  // src/mcp/clientFactory.ts
4390
4390
  function getRetryAfterDelayMs(err) {
@@ -4603,6 +4603,17 @@ async function createMCPClientForConfig(config, options) {
4603
4603
  }
4604
4604
  async function closeMCPClient(client) {
4605
4605
  try {
4606
+ const transport = client.transport;
4607
+ if (transport instanceof StreamableHTTPClientTransport) {
4608
+ try {
4609
+ await transport.terminateSession();
4610
+ } catch (sessionError) {
4611
+ debugClient(
4612
+ "Error terminating session: %s",
4613
+ sessionError instanceof Error ? sessionError.message : String(sessionError)
4614
+ );
4615
+ }
4616
+ }
4606
4617
  await client.close();
4607
4618
  } catch (error) {
4608
4619
  debugClient(
@@ -4831,11 +4842,13 @@ function validateSchema(response, schema, options = {}) {
4831
4842
  } catch (error) {
4832
4843
  const zodError = error;
4833
4844
  const issues = formatZodIssues(zodError);
4845
+ const text = stringifyResponse(response);
4834
4846
  return {
4835
4847
  pass: false,
4836
4848
  message: `Response does not match schema: ${issues}`,
4837
4849
  details: {
4838
- issues: zodError.issues
4850
+ issues: zodError.issues,
4851
+ textPreview: truncateForDisplay2(text)
4839
4852
  }
4840
4853
  };
4841
4854
  }
@@ -4888,6 +4901,12 @@ function formatZodIssues(error) {
4888
4901
  });
4889
4902
  return issues.join("; ");
4890
4903
  }
4904
+ function truncateForDisplay2(str, maxLength = 200) {
4905
+ if (str.length <= maxLength) {
4906
+ return str;
4907
+ }
4908
+ return str.slice(0, maxLength) + "... (truncated)";
4909
+ }
4891
4910
 
4892
4911
  // src/assertions/validators/text.ts
4893
4912
  function validateText(response, expected, options = {}) {
@@ -4914,11 +4933,11 @@ function validateText(response, expected, options = {}) {
4914
4933
  details: {
4915
4934
  missing,
4916
4935
  textLength: text.length,
4917
- textPreview: truncateForDisplay2(text)
4936
+ textPreview: truncateForDisplay3(text)
4918
4937
  }
4919
4938
  };
4920
4939
  }
4921
- function truncateForDisplay2(str, maxLength = 200) {
4940
+ function truncateForDisplay3(str, maxLength = 200) {
4922
4941
  if (str.length <= maxLength) {
4923
4942
  return str;
4924
4943
  }
@@ -4950,7 +4969,7 @@ function validatePattern(response, patterns, options = {}) {
4950
4969
  details: {
4951
4970
  unmatched,
4952
4971
  textLength: text.length,
4953
- textPreview: truncateForDisplay3(text)
4972
+ textPreview: truncateForDisplay4(text)
4954
4973
  }
4955
4974
  };
4956
4975
  }
@@ -4970,7 +4989,7 @@ function patternToString(pattern) {
4970
4989
  }
4971
4990
  return `/${pattern}/`;
4972
4991
  }
4973
- function truncateForDisplay3(str, maxLength = 200) {
4992
+ function truncateForDisplay4(str, maxLength = 200) {
4974
4993
  if (str.length <= maxLength) {
4975
4994
  return str;
4976
4995
  }
@@ -4993,7 +5012,7 @@ function validateError(response, expected = true) {
4993
5012
  pass: false,
4994
5013
  message: "Expected an error response but got success",
4995
5014
  details: {
4996
- textPreview: truncateForDisplay4(extractText2(response))
5015
+ textPreview: truncateForDisplay5(extractText2(response))
4997
5016
  }
4998
5017
  };
4999
5018
  } else {
@@ -5005,7 +5024,7 @@ function validateError(response, expected = true) {
5005
5024
  }
5006
5025
  return {
5007
5026
  pass: false,
5008
- message: `Expected a success response but got error: "${truncateForDisplay4(errorMessage)}"`,
5027
+ message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
5009
5028
  details: {
5010
5029
  errorMessage
5011
5030
  }
@@ -5018,7 +5037,7 @@ function validateError(response, expected = true) {
5018
5037
  pass: false,
5019
5038
  message: `Expected an error containing "${expectedMessages[0]}" but got success`,
5020
5039
  details: {
5021
- textPreview: truncateForDisplay4(extractText2(response))
5040
+ textPreview: truncateForDisplay5(extractText2(response))
5022
5041
  }
5023
5042
  };
5024
5043
  }
@@ -5040,7 +5059,7 @@ function validateError(response, expected = true) {
5040
5059
  }
5041
5060
  };
5042
5061
  }
5043
- function truncateForDisplay4(str, maxLength = 200) {
5062
+ function truncateForDisplay5(str, maxLength = 200) {
5044
5063
  if (str.length <= maxLength) {
5045
5064
  return str;
5046
5065
  }
@@ -5158,6 +5177,10 @@ function validateToolCalls(response, expectation) {
5158
5177
  return {
5159
5178
  pass: false,
5160
5179
  message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
5180
+ details: {
5181
+ actual: actual.map((c) => c.name),
5182
+ expected: expected.name
5183
+ },
5161
5184
  metrics
5162
5185
  };
5163
5186
  }
@@ -5174,6 +5197,10 @@ function validateToolCalls(response, expectation) {
5174
5197
  return {
5175
5198
  pass: false,
5176
5199
  message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
5200
+ details: {
5201
+ actual: actual.map((c) => c.name),
5202
+ expected: expected.name
5203
+ },
5177
5204
  metrics
5178
5205
  };
5179
5206
  }
@@ -5186,6 +5213,10 @@ function validateToolCalls(response, expectation) {
5186
5213
  return {
5187
5214
  pass: false,
5188
5215
  message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
5216
+ details: {
5217
+ actual: actual.map((c) => c.name),
5218
+ unexpected: unexpected.map((c) => c.name)
5219
+ },
5189
5220
  metrics
5190
5221
  };
5191
5222
  }
@@ -5204,19 +5235,22 @@ function validateToolCallCount(response, options) {
5204
5235
  if (exact !== void 0 && count !== exact) {
5205
5236
  return {
5206
5237
  pass: false,
5207
- message: `Expected exactly ${exact} tool call(s), but got ${count}`
5238
+ message: `Expected exactly ${exact} tool call(s), but got ${count}`,
5239
+ details: { actual: count, expected: exact }
5208
5240
  };
5209
5241
  }
5210
5242
  if (min !== void 0 && count < min) {
5211
5243
  return {
5212
5244
  pass: false,
5213
- message: `Expected at least ${min} tool call(s), but got ${count}`
5245
+ message: `Expected at least ${min} tool call(s), but got ${count}`,
5246
+ details: { actual: count, min }
5214
5247
  };
5215
5248
  }
5216
5249
  if (max !== void 0 && count > max) {
5217
5250
  return {
5218
5251
  pass: false,
5219
- message: `Expected at most ${max} tool call(s), but got ${count}`
5252
+ message: `Expected at most ${max} tool call(s), but got ${count}`,
5253
+ details: { actual: count, max }
5220
5254
  };
5221
5255
  }
5222
5256
  return {
@@ -5730,7 +5764,9 @@ function createJudge(config = {}) {
5730
5764
  case "google":
5731
5765
  return createGoogleJudge(config);
5732
5766
  default:
5733
- throw new Error(`Unsupported LLM provider: ${String(provider)}`);
5767
+ throw new Error(
5768
+ `Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
5769
+ );
5734
5770
  }
5735
5771
  }
5736
5772
 
@@ -6055,12 +6091,19 @@ function toMatchToolResponse(received, expected) {
6055
6091
  // src/assertions/matchers/toMatchToolSchema.ts
6056
6092
  function toMatchToolSchema(received, schema, options = {}) {
6057
6093
  const result = validateSchema(received, schema, options);
6094
+ const preview = result.details?.textPreview;
6058
6095
  return {
6059
6096
  pass: result.pass,
6060
6097
  message: () => {
6061
6098
  if (this.isNot) {
6062
6099
  return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
6063
6100
  }
6101
+ if (!result.pass && preview) {
6102
+ return `${result.message}
6103
+
6104
+ Actual response (truncated):
6105
+ ${preview}`;
6106
+ }
6064
6107
  return result.message;
6065
6108
  }
6066
6109
  };
@@ -6069,6 +6112,7 @@ function toMatchToolSchema(received, schema, options = {}) {
6069
6112
  // src/assertions/matchers/toContainToolText.ts
6070
6113
  function toContainToolText(received, expected, options = {}) {
6071
6114
  const result = validateText(received, expected, options);
6115
+ const preview = result.details?.textPreview;
6072
6116
  return {
6073
6117
  pass: result.pass,
6074
6118
  message: () => {
@@ -6076,6 +6120,12 @@ function toContainToolText(received, expected, options = {}) {
6076
6120
  const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
6077
6121
  return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
6078
6122
  }
6123
+ if (!result.pass && preview) {
6124
+ return `${result.message}
6125
+
6126
+ Actual response (truncated):
6127
+ ${preview}`;
6128
+ }
6079
6129
  return result.message;
6080
6130
  }
6081
6131
  };
@@ -6084,12 +6134,19 @@ function toContainToolText(received, expected, options = {}) {
6084
6134
  // src/assertions/matchers/toMatchToolPattern.ts
6085
6135
  function toMatchToolPattern(received, patterns, options = {}) {
6086
6136
  const result = validatePattern(received, patterns, options);
6137
+ const preview = result.details?.textPreview;
6087
6138
  return {
6088
6139
  pass: result.pass,
6089
6140
  message: () => {
6090
6141
  if (this.isNot) {
6091
6142
  return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
6092
6143
  }
6144
+ if (!result.pass && preview) {
6145
+ return `${result.message}
6146
+
6147
+ Actual response (truncated):
6148
+ ${preview}`;
6149
+ }
6093
6150
  return result.message;
6094
6151
  }
6095
6152
  };
@@ -6874,6 +6931,12 @@ function createVercelOrchestrator() {
6874
6931
  });
6875
6932
  const totalDurationMs = Date.now() - llmStart;
6876
6933
  const llmDurationMs = totalDurationMs - mcpDurationMs;
6934
+ const hostUsage = result.usage ? {
6935
+ inputTokens: result.usage.promptTokens ?? 0,
6936
+ outputTokens: result.usage.completionTokens ?? 0,
6937
+ totalCostUsd: 0,
6938
+ durationMs: llmDurationMs
6939
+ } : void 0;
6877
6940
  const conversationHistory = (result.steps ?? []).map((step) => ({
6878
6941
  role: step.toolCalls?.length > 0 ? "tool" : "assistant",
6879
6942
  content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
@@ -6885,7 +6948,8 @@ function createVercelOrchestrator() {
6885
6948
  scenario,
6886
6949
  llmDurationMs,
6887
6950
  mcpDurationMs,
6888
- conversationHistory
6951
+ conversationHistory,
6952
+ usage: hostUsage
6889
6953
  };
6890
6954
  } catch (err) {
6891
6955
  return {
@@ -6903,6 +6967,7 @@ function parseStreamJson(stdout) {
6903
6967
  const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
6904
6968
  const toolCalls = [];
6905
6969
  const textParts = [];
6970
+ let usage;
6906
6971
  const conversationHistory = [];
6907
6972
  for (const line of lines) {
6908
6973
  let event;
@@ -6935,16 +7000,28 @@ function parseStreamJson(stdout) {
6935
7000
  }
6936
7001
  }
6937
7002
  }
6938
- if (event.type === "result" && typeof event.result === "string") {
6939
- if (textParts.length === 0) {
7003
+ if (event.type === "result") {
7004
+ if (typeof event.result === "string" && textParts.length === 0) {
6940
7005
  textParts.push(event.result);
6941
7006
  }
7007
+ if (event.usage) {
7008
+ usage = {
7009
+ inputTokens: event.usage.input_tokens ?? 0,
7010
+ outputTokens: event.usage.output_tokens ?? 0,
7011
+ totalCostUsd: event.total_cost_usd ?? 0,
7012
+ durationMs: event.duration_ms ?? 0,
7013
+ durationApiMs: event.duration_api_ms,
7014
+ cacheReadInputTokens: event.usage.cache_read_input_tokens,
7015
+ cacheCreationInputTokens: event.usage.cache_creation_input_tokens
7016
+ };
7017
+ }
6942
7018
  }
6943
7019
  if (event.type === "result" && event.is_error === true) {
6944
7020
  return {
6945
7021
  success: false,
6946
7022
  toolCalls,
6947
- error: typeof event.result === "string" ? event.result : "CLI host reported an error"
7023
+ error: typeof event.result === "string" ? event.result : "CLI host reported an error",
7024
+ usage
6948
7025
  };
6949
7026
  }
6950
7027
  }
@@ -6956,7 +7033,8 @@ function parseStreamJson(stdout) {
6956
7033
  success: true,
6957
7034
  toolCalls,
6958
7035
  response: response || void 0,
6959
- conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
7036
+ conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
7037
+ usage
6960
7038
  };
6961
7039
  }
6962
7040
  function createJsonParser(paths) {
@@ -7221,6 +7299,32 @@ async function execFileNoThrow(file, args) {
7221
7299
  }
7222
7300
  }
7223
7301
 
7302
+ // src/utils/usageUtils.ts
7303
+ function optionalSum(a, b) {
7304
+ if (a === void 0 && b === void 0) return void 0;
7305
+ return (a ?? 0) + (b ?? 0);
7306
+ }
7307
+ function sumUsage(a, b) {
7308
+ if (!a && !b) return void 0;
7309
+ if (!a) return b ? { ...b } : void 0;
7310
+ if (!b) return { ...a };
7311
+ return {
7312
+ inputTokens: a.inputTokens + b.inputTokens,
7313
+ outputTokens: a.outputTokens + b.outputTokens,
7314
+ totalCostUsd: a.totalCostUsd + b.totalCostUsd,
7315
+ durationMs: a.durationMs + b.durationMs,
7316
+ durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
7317
+ cacheReadInputTokens: optionalSum(
7318
+ a.cacheReadInputTokens,
7319
+ b.cacheReadInputTokens
7320
+ ),
7321
+ cacheCreationInputTokens: optionalSum(
7322
+ a.cacheCreationInputTokens,
7323
+ b.cacheCreationInputTokens
7324
+ )
7325
+ };
7326
+ }
7327
+
7224
7328
  // src/evals/evalRunner.ts
7225
7329
  async function executeToolCall(evalCase, mcp) {
7226
7330
  const mode = evalCase.mode || "direct";
@@ -7466,6 +7570,7 @@ async function runSingleIteration(evalCase, context, options) {
7466
7570
  };
7467
7571
  }
7468
7572
  }
7573
+ const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
7469
7574
  return {
7470
7575
  id: evalCase.id,
7471
7576
  datasetName: options.datasetName ?? "single-case",
@@ -7482,7 +7587,8 @@ async function runSingleIteration(evalCase, context, options) {
7482
7587
  tags: evalCase.tags,
7483
7588
  toolPrecision,
7484
7589
  toolRecall,
7485
- mcpHostTrace
7590
+ mcpHostTrace,
7591
+ hostUsage
7486
7592
  };
7487
7593
  }
7488
7594
  function isInfrastructureError(err) {
@@ -7498,7 +7604,7 @@ function isInfrastructureError(err) {
7498
7604
  } else {
7499
7605
  return false;
7500
7606
  }
7501
- return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
7607
+ return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
7502
7608
  msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
7503
7609
  }
7504
7610
  async function runEvalCase(evalCase, context, options = {}) {
@@ -7518,7 +7624,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7518
7624
  durationMs: result.durationMs,
7519
7625
  error: result.error,
7520
7626
  isInfrastructureError: infraError,
7521
- mcpHostTrace: result.mcpHostTrace
7627
+ mcpHostTrace: result.mcpHostTrace,
7628
+ hostUsage: result.hostUsage
7522
7629
  });
7523
7630
  } catch (err) {
7524
7631
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7551,6 +7658,10 @@ async function runEvalCase(evalCase, context, options = {}) {
7551
7658
  durationMs: 0,
7552
7659
  tags: evalCase.tags
7553
7660
  };
7661
+ const totalHostUsage = iterationResults.reduce(
7662
+ (acc, r) => sumUsage(acc, r.hostUsage),
7663
+ void 0
7664
+ );
7554
7665
  return {
7555
7666
  ...baseResult,
7556
7667
  pass: assertionPassRate >= threshold,
@@ -7559,7 +7670,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7559
7670
  infrastructureErrorRate,
7560
7671
  iterationResults,
7561
7672
  infrastructureErrorCount: infraErrors.length,
7562
- durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7673
+ durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
7674
+ hostUsage: totalHostUsage
7563
7675
  };
7564
7676
  }
7565
7677
  function wilsonCI(k, n) {
@@ -7669,13 +7781,18 @@ async function runEvalDataset(options, context) {
7669
7781
  ...mcpHostModel !== void 0 && { mcpHostModel },
7670
7782
  ...judgeModel !== void 0 && { judgeModel }
7671
7783
  };
7784
+ const runHostUsage = caseResults.reduce(
7785
+ (acc, r) => sumUsage(acc, r.hostUsage),
7786
+ void 0
7787
+ );
7672
7788
  const result = {
7673
7789
  total,
7674
7790
  passed,
7675
7791
  failed: total - passed,
7676
7792
  caseResults,
7677
7793
  durationMs: Date.now() - startTime,
7678
- metadata
7794
+ metadata,
7795
+ totalHostUsage: runHostUsage
7679
7796
  };
7680
7797
  if (baselineResultsFrom) {
7681
7798
  try {
@@ -7969,6 +8086,6 @@ function formatCapabilities(capabilities) {
7969
8086
  return parts.length > 0 ? parts.join(", ") : "none declared";
7970
8087
  }
7971
8088
 
7972
- export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
8089
+ export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, refreshAccessToken, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
7973
8090
  //# sourceMappingURL=index.js.map
7974
8091
  //# sourceMappingURL=index.js.map