@gleanwork/mcp-server-tester 1.0.0-beta.4 → 1.0.0-beta.6

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
4407
4407
 
4408
4408
  // package.json
4409
4409
  var package_default = {
4410
- version: "1.0.0-beta.4"};
4410
+ version: "1.0.0-beta.6"};
4411
4411
 
4412
4412
  // src/mcp/clientFactory.ts
4413
4413
  function getRetryAfterDelayMs(err) {
@@ -6669,9 +6669,16 @@ function getMissingDependencyMessage(provider) {
6669
6669
  const pkg = packageMap[provider];
6670
6670
  return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
6671
6671
  }
6672
- async function saveBaseline(result, filePath) {
6672
+ async function saveBaseline(result, filePath, options = {}) {
6673
+ const { omitResponses = true } = options;
6674
+ const toSave = omitResponses ? {
6675
+ ...result,
6676
+ caseResults: result.caseResults.map(
6677
+ ({ response: _response, ...rest }) => rest
6678
+ )
6679
+ } : result;
6673
6680
  await fs$1.mkdir(path2.dirname(filePath), { recursive: true });
6674
- await fs$1.writeFile(filePath, JSON.stringify(result, null, 2), "utf8");
6681
+ await fs$1.writeFile(filePath, JSON.stringify(toSave, null, 2), "utf8");
6675
6682
  }
6676
6683
  async function loadBaseline(filePath) {
6677
6684
  const raw = await fs$1.readFile(filePath, "utf8");
@@ -6911,7 +6918,7 @@ async function runSingleIteration(evalCase, context, options) {
6911
6918
  return {
6912
6919
  id: evalCase.id,
6913
6920
  datasetName: options.datasetName ?? "single-case",
6914
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6921
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6915
6922
  source: "eval",
6916
6923
  pass: didCasePass(error, expectationResults),
6917
6924
  response,
@@ -6939,7 +6946,8 @@ function isInfrastructureError(err) {
6939
6946
  } else {
6940
6947
  return false;
6941
6948
  }
6942
- return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
6949
+ return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow: the LLM couldn't run, so this is not a tool discoverability failure
6950
+ msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
6943
6951
  }
6944
6952
  async function runEvalCase(evalCase, context, options = {}) {
6945
6953
  const iterations = evalCase.iterations ?? 1;
@@ -6957,7 +6965,8 @@ async function runEvalCase(evalCase, context, options = {}) {
6957
6965
  pass: result.pass,
6958
6966
  durationMs: result.durationMs,
6959
6967
  error: result.error,
6960
- isInfrastructureError: infraError
6968
+ isInfrastructureError: infraError,
6969
+ mcpHostTrace: result.mcpHostTrace
6961
6970
  });
6962
6971
  } catch (err) {
6963
6972
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -6980,7 +6989,7 @@ async function runEvalCase(evalCase, context, options = {}) {
6980
6989
  const baseResult = lastResult ?? {
6981
6990
  id: evalCase.id,
6982
6991
  datasetName: options.datasetName ?? "single-case",
6983
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6992
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6984
6993
  source: "eval",
6985
6994
  pass: false,
6986
6995
  error: iterationResults[0]?.error,
@@ -6994,12 +7003,25 @@ async function runEvalCase(evalCase, context, options = {}) {
6994
7003
  ...baseResult,
6995
7004
  pass: assertionPassRate >= threshold,
6996
7005
  assertionPassRate,
7006
+ assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
6997
7007
  infrastructureErrorRate,
6998
7008
  iterationResults,
6999
7009
  infrastructureErrorCount: infraErrors.length,
7000
7010
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7001
7011
  };
7002
7012
  }
7013
+ function wilsonCI(k, n) {
7014
+ if (n < 2) return void 0;
7015
+ const z5 = 1.96;
7016
+ const z22 = z5 * z5;
7017
+ const \u00F1 = n + z22;
7018
+ const p\u0303 = (k + z22 / 2) / \u00F1;
7019
+ const margin = z5 * Math.sqrt(p\u0303 * (1 - p\u0303) / \u00F1);
7020
+ return {
7021
+ lower: Math.max(0, p\u0303 - margin),
7022
+ upper: Math.min(1, p\u0303 + margin)
7023
+ };
7024
+ }
7003
7025
  async function runWithConcurrency(tasks, limit) {
7004
7026
  const results = new Array(tasks.length);
7005
7027
  let index = 0;
@@ -7028,6 +7050,7 @@ async function runEvalDataset(options, context) {
7028
7050
  onCaseComplete,
7029
7051
  filterTags,
7030
7052
  saveResultsTo,
7053
+ omitResponsesFromBaseline = true,
7031
7054
  baselineResultsFrom,
7032
7055
  mcpHostModel,
7033
7056
  judgeModel
@@ -7142,7 +7165,9 @@ async function runEvalDataset(options, context) {
7142
7165
  result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
7143
7166
  }
7144
7167
  if (saveResultsTo) {
7145
- await saveBaseline(result, saveResultsTo);
7168
+ await saveBaseline(result, saveResultsTo, {
7169
+ omitResponses: omitResponsesFromBaseline
7170
+ });
7146
7171
  }
7147
7172
  if (context.testInfo) {
7148
7173
  await context.testInfo.attach("mcp-test-results", {