@gleanwork/mcp-server-tester 1.0.0-beta.4 → 1.0.0-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
4407
4407
 
4408
4408
  // package.json
4409
4409
  var package_default = {
4410
- version: "1.0.0-beta.4"};
4410
+ version: "1.0.0-beta.5"};
4411
4411
 
4412
4412
  // src/mcp/clientFactory.ts
4413
4413
  function getRetryAfterDelayMs(err) {
@@ -6911,7 +6911,7 @@ async function runSingleIteration(evalCase, context, options) {
6911
6911
  return {
6912
6912
  id: evalCase.id,
6913
6913
  datasetName: options.datasetName ?? "single-case",
6914
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6914
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6915
6915
  source: "eval",
6916
6916
  pass: didCasePass(error, expectationResults),
6917
6917
  response,
@@ -6980,7 +6980,7 @@ async function runEvalCase(evalCase, context, options = {}) {
6980
6980
  const baseResult = lastResult ?? {
6981
6981
  id: evalCase.id,
6982
6982
  datasetName: options.datasetName ?? "single-case",
6983
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6983
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6984
6984
  source: "eval",
6985
6985
  pass: false,
6986
6986
  error: iterationResults[0]?.error,
@@ -6994,12 +6994,25 @@ async function runEvalCase(evalCase, context, options = {}) {
6994
6994
  ...baseResult,
6995
6995
  pass: assertionPassRate >= threshold,
6996
6996
  assertionPassRate,
6997
+ assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
6997
6998
  infrastructureErrorRate,
6998
6999
  iterationResults,
6999
7000
  infrastructureErrorCount: infraErrors.length,
7000
7001
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7001
7002
  };
7002
7003
  }
7004
/**
 * Computes the 95% Wilson score confidence interval for a binomial proportion.
 *
 * The interval is centred on (k + z^2 / 2) / (n + z^2) with half-width
 * (z / (n + z^2)) * sqrt(k * (n - k) / n + z^2 / 4), where z = 1.96.
 * The previous implementation used z * sqrt(p * (1 - p) / (n + z^2)) with the
 * adjusted centre p, which is the (wider) Agresti-Coull interval rather than
 * the Wilson score interval the function is named after.
 *
 * @param {number} k - Number of successes (passing observations).
 * @param {number} n - Total number of observations.
 * @returns {{lower: number, upper: number} | undefined} Bounds clamped to
 *   [0, 1], or undefined when fewer than two observations are available.
 */
function wilsonCI(k, n) {
  // With fewer than two observations no interval is reported.
  if (n < 2) return void 0;
  const z = 1.96;
  const zSquared = z * z;
  const adjustedN = n + zSquared;
  const center = (k + zSquared / 2) / adjustedN;
  // k * (n - k) / n equals n * pHat * (1 - pHat) with pHat = k / n.
  const margin = (z / adjustedN) * Math.sqrt((k * (n - k)) / n + zSquared / 4);
  return {
    // Clamp to guard against floating-point drift just outside [0, 1].
    lower: Math.max(0, center - margin),
    upper: Math.min(1, center + margin)
  };
}
7003
7016
  async function runWithConcurrency(tasks, limit) {
7004
7017
  const results = new Array(tasks.length);
7005
7018
  let index = 0;