@gleanwork/mcp-server-tester 1.0.0-beta.4 → 1.0.0-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -3309,6 +3309,22 @@ interface EvalCaseResult {
3309
3309
  * the denominator so that environment reliability does not inflate this metric.
3310
3310
  */
3311
3311
  assertionPassRate?: number;
3312
+ /**
3313
+ * 95% Wilson score confidence interval for `assertionPassRate`.
3314
+ * Only present when the case was run with `iterations > 1`.
3315
+ *
3316
+ * Interpret as: the true pass rate is likely between `lower` and `upper`.
3317
+ * Wider intervals mean fewer iterations were run; run more iterations to narrow them.
3318
+ *
3319
+ * @example { lower: 0.4, upper: 0.9 } // 7/10 passes → 70% ± wide CI
3320
+ * @example { lower: 0.56, upper: 0.81 } // 35/50 passes → 70% ± narrow CI
3321
+ */
3322
+ assertionPassRateCI?: {
3323
+ /** Lower bound of the 95% confidence interval (0–1) */
3324
+ lower: number;
3325
+ /** Upper bound of the 95% confidence interval (0–1) */
3326
+ upper: number;
3327
+ };
3312
3328
  /**
3313
3329
  * Infrastructure error rate (0–1): infra errors divided by total iterations.
3314
3330
  * Only present when the case was run with `iterations > 1`.
package/dist/index.d.ts CHANGED
@@ -3309,6 +3309,22 @@ interface EvalCaseResult {
3309
3309
  * the denominator so that environment reliability does not inflate this metric.
3310
3310
  */
3311
3311
  assertionPassRate?: number;
3312
+ /**
3313
+ * 95% Wilson score confidence interval for `assertionPassRate`.
3314
+ * Only present when the case was run with `iterations > 1`.
3315
+ *
3316
+ * Interpret as: the true pass rate is likely between `lower` and `upper`.
3317
+ * Wider intervals mean fewer iterations were run; run more iterations to narrow them.
3318
+ *
3319
+ * @example { lower: 0.4, upper: 0.9 } // 7/10 passes → 70% ± wide CI
3320
+ * @example { lower: 0.56, upper: 0.81 } // 35/50 passes → 70% ± narrow CI
3321
+ */
3322
+ assertionPassRateCI?: {
3323
+ /** Lower bound of the 95% confidence interval (0–1) */
3324
+ lower: number;
3325
+ /** Upper bound of the 95% confidence interval (0–1) */
3326
+ upper: number;
3327
+ };
3312
3328
  /**
3313
3329
  * Infrastructure error rate (0–1): infra errors divided by total iterations.
3314
3330
  * Only present when the case was run with `iterations > 1`.
package/dist/index.js CHANGED
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
4380
4380
 
4381
4381
  // package.json
4382
4382
  var package_default = {
4383
- version: "1.0.0-beta.4"};
4383
+ version: "1.0.0-beta.5"};
4384
4384
 
4385
4385
  // src/mcp/clientFactory.ts
4386
4386
  function getRetryAfterDelayMs(err) {
@@ -6884,7 +6884,7 @@ async function runSingleIteration(evalCase, context, options) {
6884
6884
  return {
6885
6885
  id: evalCase.id,
6886
6886
  datasetName: options.datasetName ?? "single-case",
6887
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6887
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6888
6888
  source: "eval",
6889
6889
  pass: didCasePass(error, expectationResults),
6890
6890
  response,
@@ -6953,7 +6953,7 @@ async function runEvalCase(evalCase, context, options = {}) {
6953
6953
  const baseResult = lastResult ?? {
6954
6954
  id: evalCase.id,
6955
6955
  datasetName: options.datasetName ?? "single-case",
6956
- toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
6956
+ toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
6957
6957
  source: "eval",
6958
6958
  pass: false,
6959
6959
  error: iterationResults[0]?.error,
@@ -6967,12 +6967,25 @@ async function runEvalCase(evalCase, context, options = {}) {
6967
6967
  ...baseResult,
6968
6968
  pass: assertionPassRate >= threshold,
6969
6969
  assertionPassRate,
6970
+ assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
6970
6971
  infrastructureErrorRate,
6971
6972
  iterationResults,
6972
6973
  infrastructureErrorCount: infraErrors.length,
6973
6974
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
6974
6975
  };
6975
6976
  }
6977
/**
 * Computes the 95% Wilson score confidence interval for a binomial proportion.
 *
 * The interval is centred on `(k + z²/2) / (n + z²)` with half-width
 * `z * sqrt(k(n - k)/n + z²/4) / (n + z²)`, using z = 1.96.
 *
 * The previous implementation used `z * sqrt(p̃(1 - p̃) / ñ)` as the margin,
 * which is the Agresti–Coull ("adjusted Wald") interval: same centre, but
 * systematically wider than the Wilson score interval this function is
 * named for.
 *
 * @param k Number of successes (passing results).
 * @param n Number of trials.
 * @returns `{ lower, upper }`, both clamped to [0, 1], or `undefined` when
 *   `n < 2`.
 */
function wilsonCI(k, n) {
  if (n < 2) return void 0;
  const z = 1.96;
  const zSquared = z * z;
  const denominator = n + zSquared;
  const center = (k + zSquared / 2) / denominator;
  // k(n - k)/n equals n * p̂ * (1 - p̂) without forming p̂ = k/n separately.
  const halfWidth = (z * Math.sqrt((k * (n - k)) / n + zSquared / 4)) / denominator;
  return {
    // Clamping only guards against floating-point spill at k = 0 or k = n.
    lower: Math.max(0, center - halfWidth),
    upper: Math.min(1, center + halfWidth)
  };
}
6976
6989
  async function runWithConcurrency(tasks, limit) {
6977
6990
  const results = new Array(tasks.length);
6978
6991
  let index = 0;