@gleanwork/mcp-server-tester 1.0.0-beta.8 → 1.0.1-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4411,7 +4411,7 @@ function escapeHtml(text) {
4411
4411
 
4412
4412
  // package.json
4413
4413
  var package_default = {
4414
- version: "1.0.0-beta.8"};
4414
+ version: "1.0.1-beta.0"};
4415
4415
 
4416
4416
  // src/mcp/clientFactory.ts
4417
4417
  function getRetryAfterDelayMs(err) {
@@ -4630,6 +4630,17 @@ async function createMCPClientForConfig(config, options) {
4630
4630
  }
4631
4631
  async function closeMCPClient(client) {
4632
4632
  try {
4633
+ const transport = client.transport;
4634
+ if (transport instanceof streamableHttp_js.StreamableHTTPClientTransport) {
4635
+ try {
4636
+ await transport.terminateSession();
4637
+ } catch (sessionError) {
4638
+ debugClient(
4639
+ "Error terminating session: %s",
4640
+ sessionError instanceof Error ? sessionError.message : String(sessionError)
4641
+ );
4642
+ }
4643
+ }
4633
4644
  await client.close();
4634
4645
  } catch (error) {
4635
4646
  debugClient(
@@ -4858,11 +4869,13 @@ function validateSchema(response, schema, options = {}) {
4858
4869
  } catch (error) {
4859
4870
  const zodError = error;
4860
4871
  const issues = formatZodIssues(zodError);
4872
+ const text = stringifyResponse(response);
4861
4873
  return {
4862
4874
  pass: false,
4863
4875
  message: `Response does not match schema: ${issues}`,
4864
4876
  details: {
4865
- issues: zodError.issues
4877
+ issues: zodError.issues,
4878
+ textPreview: truncateForDisplay2(text)
4866
4879
  }
4867
4880
  };
4868
4881
  }
@@ -4915,6 +4928,12 @@ function formatZodIssues(error) {
4915
4928
  });
4916
4929
  return issues.join("; ");
4917
4930
  }
4931
+ function truncateForDisplay2(str, maxLength = 200) {
4932
+ if (str.length <= maxLength) {
4933
+ return str;
4934
+ }
4935
+ return str.slice(0, maxLength) + "... (truncated)";
4936
+ }
4918
4937
 
4919
4938
  // src/assertions/validators/text.ts
4920
4939
  function validateText(response, expected, options = {}) {
@@ -4941,11 +4960,11 @@ function validateText(response, expected, options = {}) {
4941
4960
  details: {
4942
4961
  missing,
4943
4962
  textLength: text.length,
4944
- textPreview: truncateForDisplay2(text)
4963
+ textPreview: truncateForDisplay3(text)
4945
4964
  }
4946
4965
  };
4947
4966
  }
4948
- function truncateForDisplay2(str, maxLength = 200) {
4967
+ function truncateForDisplay3(str, maxLength = 200) {
4949
4968
  if (str.length <= maxLength) {
4950
4969
  return str;
4951
4970
  }
@@ -4977,7 +4996,7 @@ function validatePattern(response, patterns, options = {}) {
4977
4996
  details: {
4978
4997
  unmatched,
4979
4998
  textLength: text.length,
4980
- textPreview: truncateForDisplay3(text)
4999
+ textPreview: truncateForDisplay4(text)
4981
5000
  }
4982
5001
  };
4983
5002
  }
@@ -4997,7 +5016,7 @@ function patternToString(pattern) {
4997
5016
  }
4998
5017
  return `/${pattern}/`;
4999
5018
  }
5000
- function truncateForDisplay3(str, maxLength = 200) {
5019
+ function truncateForDisplay4(str, maxLength = 200) {
5001
5020
  if (str.length <= maxLength) {
5002
5021
  return str;
5003
5022
  }
@@ -5020,7 +5039,7 @@ function validateError(response, expected = true) {
5020
5039
  pass: false,
5021
5040
  message: "Expected an error response but got success",
5022
5041
  details: {
5023
- textPreview: truncateForDisplay4(extractText2(response))
5042
+ textPreview: truncateForDisplay5(extractText2(response))
5024
5043
  }
5025
5044
  };
5026
5045
  } else {
@@ -5032,7 +5051,7 @@ function validateError(response, expected = true) {
5032
5051
  }
5033
5052
  return {
5034
5053
  pass: false,
5035
- message: `Expected a success response but got error: "${truncateForDisplay4(errorMessage)}"`,
5054
+ message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
5036
5055
  details: {
5037
5056
  errorMessage
5038
5057
  }
@@ -5045,7 +5064,7 @@ function validateError(response, expected = true) {
5045
5064
  pass: false,
5046
5065
  message: `Expected an error containing "${expectedMessages[0]}" but got success`,
5047
5066
  details: {
5048
- textPreview: truncateForDisplay4(extractText2(response))
5067
+ textPreview: truncateForDisplay5(extractText2(response))
5049
5068
  }
5050
5069
  };
5051
5070
  }
@@ -5067,7 +5086,7 @@ function validateError(response, expected = true) {
5067
5086
  }
5068
5087
  };
5069
5088
  }
5070
- function truncateForDisplay4(str, maxLength = 200) {
5089
+ function truncateForDisplay5(str, maxLength = 200) {
5071
5090
  if (str.length <= maxLength) {
5072
5091
  return str;
5073
5092
  }
@@ -5185,6 +5204,10 @@ function validateToolCalls(response, expectation) {
5185
5204
  return {
5186
5205
  pass: false,
5187
5206
  message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
5207
+ details: {
5208
+ actual: actual.map((c) => c.name),
5209
+ expected: expected.name
5210
+ },
5188
5211
  metrics
5189
5212
  };
5190
5213
  }
@@ -5201,6 +5224,10 @@ function validateToolCalls(response, expectation) {
5201
5224
  return {
5202
5225
  pass: false,
5203
5226
  message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
5227
+ details: {
5228
+ actual: actual.map((c) => c.name),
5229
+ expected: expected.name
5230
+ },
5204
5231
  metrics
5205
5232
  };
5206
5233
  }
@@ -5213,6 +5240,10 @@ function validateToolCalls(response, expectation) {
5213
5240
  return {
5214
5241
  pass: false,
5215
5242
  message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
5243
+ details: {
5244
+ actual: actual.map((c) => c.name),
5245
+ unexpected: unexpected.map((c) => c.name)
5246
+ },
5216
5247
  metrics
5217
5248
  };
5218
5249
  }
@@ -5231,19 +5262,22 @@ function validateToolCallCount(response, options) {
5231
5262
  if (exact !== void 0 && count !== exact) {
5232
5263
  return {
5233
5264
  pass: false,
5234
- message: `Expected exactly ${exact} tool call(s), but got ${count}`
5265
+ message: `Expected exactly ${exact} tool call(s), but got ${count}`,
5266
+ details: { actual: count, expected: exact }
5235
5267
  };
5236
5268
  }
5237
5269
  if (min !== void 0 && count < min) {
5238
5270
  return {
5239
5271
  pass: false,
5240
- message: `Expected at least ${min} tool call(s), but got ${count}`
5272
+ message: `Expected at least ${min} tool call(s), but got ${count}`,
5273
+ details: { actual: count, min }
5241
5274
  };
5242
5275
  }
5243
5276
  if (max !== void 0 && count > max) {
5244
5277
  return {
5245
5278
  pass: false,
5246
- message: `Expected at most ${max} tool call(s), but got ${count}`
5279
+ message: `Expected at most ${max} tool call(s), but got ${count}`,
5280
+ details: { actual: count, max }
5247
5281
  };
5248
5282
  }
5249
5283
  return {
@@ -5757,7 +5791,9 @@ function createJudge(config = {}) {
5757
5791
  case "google":
5758
5792
  return createGoogleJudge(config);
5759
5793
  default:
5760
- throw new Error(`Unsupported LLM provider: ${String(provider)}`);
5794
+ throw new Error(
5795
+ `Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
5796
+ );
5761
5797
  }
5762
5798
  }
5763
5799
 
@@ -6082,12 +6118,19 @@ function toMatchToolResponse(received, expected) {
6082
6118
  // src/assertions/matchers/toMatchToolSchema.ts
6083
6119
  function toMatchToolSchema(received, schema, options = {}) {
6084
6120
  const result = validateSchema(received, schema, options);
6121
+ const preview = result.details?.textPreview;
6085
6122
  return {
6086
6123
  pass: result.pass,
6087
6124
  message: () => {
6088
6125
  if (this.isNot) {
6089
6126
  return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
6090
6127
  }
6128
+ if (!result.pass && preview) {
6129
+ return `${result.message}
6130
+
6131
+ Actual response (truncated):
6132
+ ${preview}`;
6133
+ }
6091
6134
  return result.message;
6092
6135
  }
6093
6136
  };
@@ -6096,6 +6139,7 @@ function toMatchToolSchema(received, schema, options = {}) {
6096
6139
  // src/assertions/matchers/toContainToolText.ts
6097
6140
  function toContainToolText(received, expected, options = {}) {
6098
6141
  const result = validateText(received, expected, options);
6142
+ const preview = result.details?.textPreview;
6099
6143
  return {
6100
6144
  pass: result.pass,
6101
6145
  message: () => {
@@ -6103,6 +6147,12 @@ function toContainToolText(received, expected, options = {}) {
6103
6147
  const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
6104
6148
  return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
6105
6149
  }
6150
+ if (!result.pass && preview) {
6151
+ return `${result.message}
6152
+
6153
+ Actual response (truncated):
6154
+ ${preview}`;
6155
+ }
6106
6156
  return result.message;
6107
6157
  }
6108
6158
  };
@@ -6111,12 +6161,19 @@ function toContainToolText(received, expected, options = {}) {
6111
6161
  // src/assertions/matchers/toMatchToolPattern.ts
6112
6162
  function toMatchToolPattern(received, patterns, options = {}) {
6113
6163
  const result = validatePattern(received, patterns, options);
6164
+ const preview = result.details?.textPreview;
6114
6165
  return {
6115
6166
  pass: result.pass,
6116
6167
  message: () => {
6117
6168
  if (this.isNot) {
6118
6169
  return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
6119
6170
  }
6171
+ if (!result.pass && preview) {
6172
+ return `${result.message}
6173
+
6174
+ Actual response (truncated):
6175
+ ${preview}`;
6176
+ }
6120
6177
  return result.message;
6121
6178
  }
6122
6179
  };
@@ -6901,6 +6958,12 @@ function createVercelOrchestrator() {
6901
6958
  });
6902
6959
  const totalDurationMs = Date.now() - llmStart;
6903
6960
  const llmDurationMs = totalDurationMs - mcpDurationMs;
6961
+ const hostUsage = result.usage ? {
6962
+ inputTokens: result.usage.promptTokens ?? 0,
6963
+ outputTokens: result.usage.completionTokens ?? 0,
6964
+ totalCostUsd: 0,
6965
+ durationMs: llmDurationMs
6966
+ } : void 0;
6904
6967
  const conversationHistory = (result.steps ?? []).map((step) => ({
6905
6968
  role: step.toolCalls?.length > 0 ? "tool" : "assistant",
6906
6969
  content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
@@ -6912,7 +6975,8 @@ function createVercelOrchestrator() {
6912
6975
  scenario,
6913
6976
  llmDurationMs,
6914
6977
  mcpDurationMs,
6915
- conversationHistory
6978
+ conversationHistory,
6979
+ usage: hostUsage
6916
6980
  };
6917
6981
  } catch (err) {
6918
6982
  return {
@@ -6930,6 +6994,7 @@ function parseStreamJson(stdout) {
6930
6994
  const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
6931
6995
  const toolCalls = [];
6932
6996
  const textParts = [];
6997
+ let usage;
6933
6998
  const conversationHistory = [];
6934
6999
  for (const line of lines) {
6935
7000
  let event;
@@ -6962,16 +7027,28 @@ function parseStreamJson(stdout) {
6962
7027
  }
6963
7028
  }
6964
7029
  }
6965
- if (event.type === "result" && typeof event.result === "string") {
6966
- if (textParts.length === 0) {
7030
+ if (event.type === "result") {
7031
+ if (typeof event.result === "string" && textParts.length === 0) {
6967
7032
  textParts.push(event.result);
6968
7033
  }
7034
+ if (event.usage) {
7035
+ usage = {
7036
+ inputTokens: event.usage.input_tokens ?? 0,
7037
+ outputTokens: event.usage.output_tokens ?? 0,
7038
+ totalCostUsd: event.total_cost_usd ?? 0,
7039
+ durationMs: event.duration_ms ?? 0,
7040
+ durationApiMs: event.duration_api_ms,
7041
+ cacheReadInputTokens: event.usage.cache_read_input_tokens,
7042
+ cacheCreationInputTokens: event.usage.cache_creation_input_tokens
7043
+ };
7044
+ }
6969
7045
  }
6970
7046
  if (event.type === "result" && event.is_error === true) {
6971
7047
  return {
6972
7048
  success: false,
6973
7049
  toolCalls,
6974
- error: typeof event.result === "string" ? event.result : "CLI host reported an error"
7050
+ error: typeof event.result === "string" ? event.result : "CLI host reported an error",
7051
+ usage
6975
7052
  };
6976
7053
  }
6977
7054
  }
@@ -6983,7 +7060,8 @@ function parseStreamJson(stdout) {
6983
7060
  success: true,
6984
7061
  toolCalls,
6985
7062
  response: response || void 0,
6986
- conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
7063
+ conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
7064
+ usage
6987
7065
  };
6988
7066
  }
6989
7067
  function createJsonParser(paths) {
@@ -7248,6 +7326,32 @@ async function execFileNoThrow(file, args) {
7248
7326
  }
7249
7327
  }
7250
7328
 
7329
+ // src/utils/usageUtils.ts
7330
+ function optionalSum(a, b) {
7331
+ if (a === void 0 && b === void 0) return void 0;
7332
+ return (a ?? 0) + (b ?? 0);
7333
+ }
7334
+ function sumUsage(a, b) {
7335
+ if (!a && !b) return void 0;
7336
+ if (!a) return b ? { ...b } : void 0;
7337
+ if (!b) return { ...a };
7338
+ return {
7339
+ inputTokens: a.inputTokens + b.inputTokens,
7340
+ outputTokens: a.outputTokens + b.outputTokens,
7341
+ totalCostUsd: a.totalCostUsd + b.totalCostUsd,
7342
+ durationMs: a.durationMs + b.durationMs,
7343
+ durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
7344
+ cacheReadInputTokens: optionalSum(
7345
+ a.cacheReadInputTokens,
7346
+ b.cacheReadInputTokens
7347
+ ),
7348
+ cacheCreationInputTokens: optionalSum(
7349
+ a.cacheCreationInputTokens,
7350
+ b.cacheCreationInputTokens
7351
+ )
7352
+ };
7353
+ }
7354
+
7251
7355
  // src/evals/evalRunner.ts
7252
7356
  async function executeToolCall(evalCase, mcp) {
7253
7357
  const mode = evalCase.mode || "direct";
@@ -7493,6 +7597,7 @@ async function runSingleIteration(evalCase, context, options) {
7493
7597
  };
7494
7598
  }
7495
7599
  }
7600
+ const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
7496
7601
  return {
7497
7602
  id: evalCase.id,
7498
7603
  datasetName: options.datasetName ?? "single-case",
@@ -7509,7 +7614,8 @@ async function runSingleIteration(evalCase, context, options) {
7509
7614
  tags: evalCase.tags,
7510
7615
  toolPrecision,
7511
7616
  toolRecall,
7512
- mcpHostTrace
7617
+ mcpHostTrace,
7618
+ hostUsage
7513
7619
  };
7514
7620
  }
7515
7621
  function isInfrastructureError(err) {
@@ -7525,7 +7631,7 @@ function isInfrastructureError(err) {
7525
7631
  } else {
7526
7632
  return false;
7527
7633
  }
7528
- return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
7634
+ return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
7529
7635
  msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
7530
7636
  }
7531
7637
  async function runEvalCase(evalCase, context, options = {}) {
@@ -7545,7 +7651,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7545
7651
  durationMs: result.durationMs,
7546
7652
  error: result.error,
7547
7653
  isInfrastructureError: infraError,
7548
- mcpHostTrace: result.mcpHostTrace
7654
+ mcpHostTrace: result.mcpHostTrace,
7655
+ hostUsage: result.hostUsage
7549
7656
  });
7550
7657
  } catch (err) {
7551
7658
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7578,6 +7685,10 @@ async function runEvalCase(evalCase, context, options = {}) {
7578
7685
  durationMs: 0,
7579
7686
  tags: evalCase.tags
7580
7687
  };
7688
+ const totalHostUsage = iterationResults.reduce(
7689
+ (acc, r) => sumUsage(acc, r.hostUsage),
7690
+ void 0
7691
+ );
7581
7692
  return {
7582
7693
  ...baseResult,
7583
7694
  pass: assertionPassRate >= threshold,
@@ -7586,7 +7697,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7586
7697
  infrastructureErrorRate,
7587
7698
  iterationResults,
7588
7699
  infrastructureErrorCount: infraErrors.length,
7589
- durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7700
+ durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
7701
+ hostUsage: totalHostUsage
7590
7702
  };
7591
7703
  }
7592
7704
  function wilsonCI(k, n) {
@@ -7696,13 +7808,18 @@ async function runEvalDataset(options, context) {
7696
7808
  ...mcpHostModel !== void 0 && { mcpHostModel },
7697
7809
  ...judgeModel !== void 0 && { judgeModel }
7698
7810
  };
7811
+ const runHostUsage = caseResults.reduce(
7812
+ (acc, r) => sumUsage(acc, r.hostUsage),
7813
+ void 0
7814
+ );
7699
7815
  const result = {
7700
7816
  total,
7701
7817
  passed,
7702
7818
  failed: total - passed,
7703
7819
  caseResults,
7704
7820
  durationMs: Date.now() - startTime,
7705
- metadata
7821
+ metadata,
7822
+ totalHostUsage: runHostUsage
7706
7823
  };
7707
7824
  if (baselineResultsFrom) {
7708
7825
  try {
@@ -8037,6 +8154,7 @@ exports.normalizeWhitespace = normalizeWhitespace;
8037
8154
  exports.performClientCredentialsFlow = performClientCredentialsFlow;
8038
8155
  exports.performOAuthSetup = performOAuthSetup;
8039
8156
  exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
8157
+ exports.refreshAccessToken = refreshAccessToken;
8040
8158
  exports.registerJudge = registerJudge;
8041
8159
  exports.resolveRubric = resolveRubric;
8042
8160
  exports.runConformanceChecks = runConformanceChecks;