@gleanwork/mcp-server-tester 1.0.0-beta.8 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4411,7 +4411,7 @@ function escapeHtml(text) {
4411
4411
 
4412
4412
  // package.json
4413
4413
  var package_default = {
4414
- version: "1.0.0-beta.8"};
4414
+ version: "1.0.1"};
4415
4415
 
4416
4416
  // src/mcp/clientFactory.ts
4417
4417
  function getRetryAfterDelayMs(err) {
@@ -4630,6 +4630,17 @@ async function createMCPClientForConfig(config, options) {
4630
4630
  }
4631
4631
  async function closeMCPClient(client) {
4632
4632
  try {
4633
+ const transport = client.transport;
4634
+ if (transport instanceof streamableHttp_js.StreamableHTTPClientTransport) {
4635
+ try {
4636
+ await transport.terminateSession();
4637
+ } catch (sessionError) {
4638
+ debugClient(
4639
+ "Error terminating session: %s",
4640
+ sessionError instanceof Error ? sessionError.message : String(sessionError)
4641
+ );
4642
+ }
4643
+ }
4633
4644
  await client.close();
4634
4645
  } catch (error) {
4635
4646
  debugClient(
@@ -4858,11 +4869,13 @@ function validateSchema(response, schema, options = {}) {
4858
4869
  } catch (error) {
4859
4870
  const zodError = error;
4860
4871
  const issues = formatZodIssues(zodError);
4872
+ const text = stringifyResponse(response);
4861
4873
  return {
4862
4874
  pass: false,
4863
4875
  message: `Response does not match schema: ${issues}`,
4864
4876
  details: {
4865
- issues: zodError.issues
4877
+ issues: zodError.issues,
4878
+ textPreview: truncateForDisplay2(text)
4866
4879
  }
4867
4880
  };
4868
4881
  }
@@ -4915,6 +4928,12 @@ function formatZodIssues(error) {
4915
4928
  });
4916
4929
  return issues.join("; ");
4917
4930
  }
4931
/**
 * Shortens a string for inclusion in assertion failure details.
 *
 * Strings whose length is at most `maxLength` UTF-16 code units are returned
 * unchanged. Longer strings are cut to at most `maxLength` code units and
 * suffixed with "... (truncated)" (the suffix is not counted in `maxLength`).
 *
 * @param {string} str - Text to shorten.
 * @param {number} [maxLength=200] - Maximum number of code units to keep.
 * @returns {string} The original string, or its truncated form with the marker.
 */
function truncateForDisplay2(str, maxLength = 200) {
  if (str.length <= maxLength) {
    return str;
  }
  let end = maxLength;
  // A plain slice can cut between the two halves of a surrogate pair (emoji,
  // astral-plane characters) and leave a lone high surrogate that renders as
  // a broken character. If the last kept unit is a high surrogate, drop it.
  const lastKeptUnit = str.charCodeAt(end - 1);
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    end -= 1;
  }
  return str.slice(0, end) + "... (truncated)";
}
4918
4937
 
4919
4938
  // src/assertions/validators/text.ts
4920
4939
  function validateText(response, expected, options = {}) {
@@ -4941,11 +4960,11 @@ function validateText(response, expected, options = {}) {
4941
4960
  details: {
4942
4961
  missing,
4943
4962
  textLength: text.length,
4944
- textPreview: truncateForDisplay2(text)
4963
+ textPreview: truncateForDisplay3(text)
4945
4964
  }
4946
4965
  };
4947
4966
  }
4948
- function truncateForDisplay2(str, maxLength = 200) {
4967
+ function truncateForDisplay3(str, maxLength = 200) {
4949
4968
  if (str.length <= maxLength) {
4950
4969
  return str;
4951
4970
  }
@@ -4977,7 +4996,7 @@ function validatePattern(response, patterns, options = {}) {
4977
4996
  details: {
4978
4997
  unmatched,
4979
4998
  textLength: text.length,
4980
- textPreview: truncateForDisplay3(text)
4999
+ textPreview: truncateForDisplay4(text)
4981
5000
  }
4982
5001
  };
4983
5002
  }
@@ -4997,7 +5016,7 @@ function patternToString(pattern) {
4997
5016
  }
4998
5017
  return `/${pattern}/`;
4999
5018
  }
5000
- function truncateForDisplay3(str, maxLength = 200) {
5019
+ function truncateForDisplay4(str, maxLength = 200) {
5001
5020
  if (str.length <= maxLength) {
5002
5021
  return str;
5003
5022
  }
@@ -5020,7 +5039,7 @@ function validateError(response, expected = true) {
5020
5039
  pass: false,
5021
5040
  message: "Expected an error response but got success",
5022
5041
  details: {
5023
- textPreview: truncateForDisplay4(extractText2(response))
5042
+ textPreview: truncateForDisplay5(extractText2(response))
5024
5043
  }
5025
5044
  };
5026
5045
  } else {
@@ -5032,7 +5051,7 @@ function validateError(response, expected = true) {
5032
5051
  }
5033
5052
  return {
5034
5053
  pass: false,
5035
- message: `Expected a success response but got error: "${truncateForDisplay4(errorMessage)}"`,
5054
+ message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
5036
5055
  details: {
5037
5056
  errorMessage
5038
5057
  }
@@ -5045,7 +5064,7 @@ function validateError(response, expected = true) {
5045
5064
  pass: false,
5046
5065
  message: `Expected an error containing "${expectedMessages[0]}" but got success`,
5047
5066
  details: {
5048
- textPreview: truncateForDisplay4(extractText2(response))
5067
+ textPreview: truncateForDisplay5(extractText2(response))
5049
5068
  }
5050
5069
  };
5051
5070
  }
@@ -5067,7 +5086,7 @@ function validateError(response, expected = true) {
5067
5086
  }
5068
5087
  };
5069
5088
  }
5070
- function truncateForDisplay4(str, maxLength = 200) {
5089
+ function truncateForDisplay5(str, maxLength = 200) {
5071
5090
  if (str.length <= maxLength) {
5072
5091
  return str;
5073
5092
  }
@@ -5185,6 +5204,10 @@ function validateToolCalls(response, expectation) {
5185
5204
  return {
5186
5205
  pass: false,
5187
5206
  message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
5207
+ details: {
5208
+ actual: actual.map((c) => c.name),
5209
+ expected: expected.name
5210
+ },
5188
5211
  metrics
5189
5212
  };
5190
5213
  }
@@ -5201,6 +5224,10 @@ function validateToolCalls(response, expectation) {
5201
5224
  return {
5202
5225
  pass: false,
5203
5226
  message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
5227
+ details: {
5228
+ actual: actual.map((c) => c.name),
5229
+ expected: expected.name
5230
+ },
5204
5231
  metrics
5205
5232
  };
5206
5233
  }
@@ -5213,6 +5240,10 @@ function validateToolCalls(response, expectation) {
5213
5240
  return {
5214
5241
  pass: false,
5215
5242
  message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
5243
+ details: {
5244
+ actual: actual.map((c) => c.name),
5245
+ unexpected: unexpected.map((c) => c.name)
5246
+ },
5216
5247
  metrics
5217
5248
  };
5218
5249
  }
@@ -5231,19 +5262,22 @@ function validateToolCallCount(response, options) {
5231
5262
  if (exact !== void 0 && count !== exact) {
5232
5263
  return {
5233
5264
  pass: false,
5234
- message: `Expected exactly ${exact} tool call(s), but got ${count}`
5265
+ message: `Expected exactly ${exact} tool call(s), but got ${count}`,
5266
+ details: { actual: count, expected: exact }
5235
5267
  };
5236
5268
  }
5237
5269
  if (min !== void 0 && count < min) {
5238
5270
  return {
5239
5271
  pass: false,
5240
- message: `Expected at least ${min} tool call(s), but got ${count}`
5272
+ message: `Expected at least ${min} tool call(s), but got ${count}`,
5273
+ details: { actual: count, min }
5241
5274
  };
5242
5275
  }
5243
5276
  if (max !== void 0 && count > max) {
5244
5277
  return {
5245
5278
  pass: false,
5246
- message: `Expected at most ${max} tool call(s), but got ${count}`
5279
+ message: `Expected at most ${max} tool call(s), but got ${count}`,
5280
+ details: { actual: count, max }
5247
5281
  };
5248
5282
  }
5249
5283
  return {
@@ -5757,7 +5791,9 @@ function createJudge(config = {}) {
5757
5791
  case "google":
5758
5792
  return createGoogleJudge(config);
5759
5793
  default:
5760
- throw new Error(`Unsupported LLM provider: ${String(provider)}`);
5794
+ throw new Error(
5795
+ `Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
5796
+ );
5761
5797
  }
5762
5798
  }
5763
5799
 
@@ -6082,12 +6118,19 @@ function toMatchToolResponse(received, expected) {
6082
6118
  // src/assertions/matchers/toMatchToolSchema.ts
6083
6119
  function toMatchToolSchema(received, schema, options = {}) {
6084
6120
  const result = validateSchema(received, schema, options);
6121
+ const preview = result.details?.textPreview;
6085
6122
  return {
6086
6123
  pass: result.pass,
6087
6124
  message: () => {
6088
6125
  if (this.isNot) {
6089
6126
  return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
6090
6127
  }
6128
+ if (!result.pass && preview) {
6129
+ return `${result.message}
6130
+
6131
+ Actual response (truncated):
6132
+ ${preview}`;
6133
+ }
6091
6134
  return result.message;
6092
6135
  }
6093
6136
  };
@@ -6096,6 +6139,7 @@ function toMatchToolSchema(received, schema, options = {}) {
6096
6139
  // src/assertions/matchers/toContainToolText.ts
6097
6140
  function toContainToolText(received, expected, options = {}) {
6098
6141
  const result = validateText(received, expected, options);
6142
+ const preview = result.details?.textPreview;
6099
6143
  return {
6100
6144
  pass: result.pass,
6101
6145
  message: () => {
@@ -6103,6 +6147,12 @@ function toContainToolText(received, expected, options = {}) {
6103
6147
  const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
6104
6148
  return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
6105
6149
  }
6150
+ if (!result.pass && preview) {
6151
+ return `${result.message}
6152
+
6153
+ Actual response (truncated):
6154
+ ${preview}`;
6155
+ }
6106
6156
  return result.message;
6107
6157
  }
6108
6158
  };
@@ -6111,12 +6161,19 @@ function toContainToolText(received, expected, options = {}) {
6111
6161
  // src/assertions/matchers/toMatchToolPattern.ts
6112
6162
  function toMatchToolPattern(received, patterns, options = {}) {
6113
6163
  const result = validatePattern(received, patterns, options);
6164
+ const preview = result.details?.textPreview;
6114
6165
  return {
6115
6166
  pass: result.pass,
6116
6167
  message: () => {
6117
6168
  if (this.isNot) {
6118
6169
  return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
6119
6170
  }
6171
+ if (!result.pass && preview) {
6172
+ return `${result.message}
6173
+
6174
+ Actual response (truncated):
6175
+ ${preview}`;
6176
+ }
6120
6177
  return result.message;
6121
6178
  }
6122
6179
  };
@@ -6901,6 +6958,12 @@ function createVercelOrchestrator() {
6901
6958
  });
6902
6959
  const totalDurationMs = Date.now() - llmStart;
6903
6960
  const llmDurationMs = totalDurationMs - mcpDurationMs;
6961
+ const hostUsage = result.usage ? {
6962
+ inputTokens: result.usage.promptTokens ?? 0,
6963
+ outputTokens: result.usage.completionTokens ?? 0,
6964
+ totalCostUsd: 0,
6965
+ durationMs: llmDurationMs
6966
+ } : void 0;
6904
6967
  const conversationHistory = (result.steps ?? []).map((step) => ({
6905
6968
  role: step.toolCalls?.length > 0 ? "tool" : "assistant",
6906
6969
  content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
@@ -6912,7 +6975,8 @@ function createVercelOrchestrator() {
6912
6975
  scenario,
6913
6976
  llmDurationMs,
6914
6977
  mcpDurationMs,
6915
- conversationHistory
6978
+ conversationHistory,
6979
+ usage: hostUsage
6916
6980
  };
6917
6981
  } catch (err) {
6918
6982
  return {
@@ -7248,6 +7312,32 @@ async function execFileNoThrow(file, args) {
7248
7312
  }
7249
7313
  }
7250
7314
 
7315
+ // src/utils/usageUtils.ts
7316
/**
 * Adds two numbers where either one may be absent.
 *
 * @param {number|undefined} a - First addend.
 * @param {number|undefined} b - Second addend.
 * @returns {number|undefined} `undefined` only when both inputs are
 *   `undefined`; otherwise the sum, with a nullish side counted as 0.
 */
function optionalSum(a, b) {
  const bothAbsent = a === undefined && b === undefined;
  if (bothAbsent) {
    return undefined;
  }
  const left = a ?? 0;
  const right = b ?? 0;
  return left + right;
}
7320
/**
 * Combines two usage records into a single new record.
 *
 * - Both missing: returns `undefined`.
 * - Exactly one present: returns a shallow copy of it.
 * - Both present: `inputTokens`, `outputTokens`, `totalCostUsd` and
 *   `durationMs` are added directly; `durationApiMs`,
 *   `cacheReadInputTokens` and `cacheCreationInputTokens` go through
 *   `optionalSum`, so they stay `undefined` when neither side has them.
 *
 * Neither input is mutated.
 *
 * @param {object|undefined} a - First usage record, if any.
 * @param {object|undefined} b - Second usage record, if any.
 * @returns {object|undefined} The combined record, or `undefined`.
 */
function sumUsage(a, b) {
  if (!a && !b) return void 0;
  // Past the guard above at least one side is set, so a missing `a`
  // implies `b` is present and no further check on `b` is needed.
  if (!a) return { ...b };
  if (!b) return { ...a };
  return {
    inputTokens: a.inputTokens + b.inputTokens,
    outputTokens: a.outputTokens + b.outputTokens,
    totalCostUsd: a.totalCostUsd + b.totalCostUsd,
    durationMs: a.durationMs + b.durationMs,
    durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
    cacheReadInputTokens: optionalSum(
      a.cacheReadInputTokens,
      b.cacheReadInputTokens
    ),
    cacheCreationInputTokens: optionalSum(
      a.cacheCreationInputTokens,
      b.cacheCreationInputTokens
    )
  };
}
7340
+
7251
7341
  // src/evals/evalRunner.ts
7252
7342
  async function executeToolCall(evalCase, mcp) {
7253
7343
  const mode = evalCase.mode || "direct";
@@ -7493,6 +7583,7 @@ async function runSingleIteration(evalCase, context, options) {
7493
7583
  };
7494
7584
  }
7495
7585
  }
7586
+ const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
7496
7587
  return {
7497
7588
  id: evalCase.id,
7498
7589
  datasetName: options.datasetName ?? "single-case",
@@ -7509,7 +7600,8 @@ async function runSingleIteration(evalCase, context, options) {
7509
7600
  tags: evalCase.tags,
7510
7601
  toolPrecision,
7511
7602
  toolRecall,
7512
- mcpHostTrace
7603
+ mcpHostTrace,
7604
+ hostUsage
7513
7605
  };
7514
7606
  }
7515
7607
  function isInfrastructureError(err) {
@@ -7525,7 +7617,7 @@ function isInfrastructureError(err) {
7525
7617
  } else {
7526
7618
  return false;
7527
7619
  }
7528
- return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
7620
+ return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
7529
7621
  msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
7530
7622
  }
7531
7623
  async function runEvalCase(evalCase, context, options = {}) {
@@ -7545,7 +7637,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7545
7637
  durationMs: result.durationMs,
7546
7638
  error: result.error,
7547
7639
  isInfrastructureError: infraError,
7548
- mcpHostTrace: result.mcpHostTrace
7640
+ mcpHostTrace: result.mcpHostTrace,
7641
+ hostUsage: result.hostUsage
7549
7642
  });
7550
7643
  } catch (err) {
7551
7644
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7578,6 +7671,10 @@ async function runEvalCase(evalCase, context, options = {}) {
7578
7671
  durationMs: 0,
7579
7672
  tags: evalCase.tags
7580
7673
  };
7674
+ const totalHostUsage = iterationResults.reduce(
7675
+ (acc, r) => sumUsage(acc, r.hostUsage),
7676
+ void 0
7677
+ );
7581
7678
  return {
7582
7679
  ...baseResult,
7583
7680
  pass: assertionPassRate >= threshold,
@@ -7586,7 +7683,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7586
7683
  infrastructureErrorRate,
7587
7684
  iterationResults,
7588
7685
  infrastructureErrorCount: infraErrors.length,
7589
- durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7686
+ durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
7687
+ hostUsage: totalHostUsage
7590
7688
  };
7591
7689
  }
7592
7690
  function wilsonCI(k, n) {
@@ -7696,13 +7794,18 @@ async function runEvalDataset(options, context) {
7696
7794
  ...mcpHostModel !== void 0 && { mcpHostModel },
7697
7795
  ...judgeModel !== void 0 && { judgeModel }
7698
7796
  };
7797
+ const runHostUsage = caseResults.reduce(
7798
+ (acc, r) => sumUsage(acc, r.hostUsage),
7799
+ void 0
7800
+ );
7699
7801
  const result = {
7700
7802
  total,
7701
7803
  passed,
7702
7804
  failed: total - passed,
7703
7805
  caseResults,
7704
7806
  durationMs: Date.now() - startTime,
7705
- metadata
7807
+ metadata,
7808
+ totalHostUsage: runHostUsage
7706
7809
  };
7707
7810
  if (baselineResultsFrom) {
7708
7811
  try {
@@ -8037,6 +8140,7 @@ exports.normalizeWhitespace = normalizeWhitespace;
8037
8140
  exports.performClientCredentialsFlow = performClientCredentialsFlow;
8038
8141
  exports.performOAuthSetup = performOAuthSetup;
8039
8142
  exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
8143
+ exports.refreshAccessToken = refreshAccessToken;
8040
8144
  exports.registerJudge = registerJudge;
8041
8145
  exports.resolveRubric = resolveRubric;
8042
8146
  exports.runConformanceChecks = runConformanceChecks;