npm - @gleanwork/mcp-server-tester - Versions diffs - 1.0.0-beta.4 → 1.0.0-beta.5 - Mend

@gleanwork/mcp-server-tester 1.0.0-beta.4 → 1.0.0-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/dist/cli/index.js +1 -1
package/dist/fixtures/mcp.js +1 -1
package/dist/fixtures/mcp.js.map +1 -1
package/dist/index.cjs +16 -3
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +16 -0
package/dist/index.d.ts +16 -0
package/dist/index.js +16 -3
package/dist/index.js.map +1 -1
package/dist/reporters/ui-dist/app.js +12 -12
package/dist/reporters/ui-dist/styles.css +1 -1
package/package.json +1 -1

package/dist/index.d.cts CHANGED Viewed

@@ -3309,6 +3309,22 @@ interface EvalCaseResult {
      * the denominator so that environment reliability does not inflate this metric.
      */
     assertionPassRate?: number;
+    /**
+     * 95% confidence interval for `assertionPassRate` (Agresti–Coull form: Wilson-score centre, bounds clamped to 0–1).
+     * Only present when the case was run with `iterations > 1`.
+     *
+     * Interpret as: the true pass rate is likely between `lower` and `upper`.
+     * Wider intervals mean fewer iterations were run; run more iterations to narrow them.
+     *
+     * @example { lower: 0.39, upper: 0.90 } // 7/10 passes → 70% ± wide CI
+     * @example { lower: 0.56, upper: 0.81 } // 35/50 passes → 70% ± narrow CI
+     */
+    assertionPassRateCI?: {
+        /** Lower bound of the 95% confidence interval (0–1) */
+        lower: number;
+        /** Upper bound of the 95% confidence interval (0–1) */
+        upper: number;
+    };
     /**
      * Infrastructure error rate (0–1): infra errors divided by total iterations.
      * Only present when the case was run with `iterations > 1`.

package/dist/index.d.ts CHANGED Viewed

@@ -3309,6 +3309,22 @@ interface EvalCaseResult {
      * the denominator so that environment reliability does not inflate this metric.
      */
     assertionPassRate?: number;
+    /**
+     * 95% confidence interval for `assertionPassRate` (Agresti–Coull form: Wilson-score centre, bounds clamped to 0–1).
+     * Only present when the case was run with `iterations > 1`.
+     *
+     * Interpret as: the true pass rate is likely between `lower` and `upper`.
+     * Wider intervals mean fewer iterations were run; run more iterations to narrow them.
+     *
+     * @example { lower: 0.39, upper: 0.90 } // 7/10 passes → 70% ± wide CI
+     * @example { lower: 0.56, upper: 0.81 } // 35/50 passes → 70% ± narrow CI
+     */
+    assertionPassRateCI?: {
+        /** Lower bound of the 95% confidence interval (0–1) */
+        lower: number;
+        /** Upper bound of the 95% confidence interval (0–1) */
+        upper: number;
+    };
     /**
      * Infrastructure error rate (0–1): infra errors divided by total iterations.
      * Only present when the case was run with `iterations > 1`.

package/dist/index.js CHANGED Viewed

@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
 // package.json
 var package_default = {
-  version: "1.0.0-beta.4"};
+  version: "1.0.0-beta.5"};
 // src/mcp/clientFactory.ts
 function getRetryAfterDelayMs(err) {
@@ -6884,7 +6884,7 @@ async function runSingleIteration(evalCase, context, options) {
   return {
     id: evalCase.id,
     datasetName: options.datasetName ?? "single-case",
-    toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
+    toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
     source: "eval",
     pass: didCasePass(error, expectationResults),
     response,
@@ -6953,7 +6953,7 @@ async function runEvalCase(evalCase, context, options = {}) {
   const baseResult = lastResult ?? {
     id: evalCase.id,
     datasetName: options.datasetName ?? "single-case",
-    toolName: evalCase.toolName ?? evalCase.scenario ?? "unknown",
+    toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
     source: "eval",
     pass: false,
     error: iterationResults[0]?.error,
@@ -6967,12 +6967,25 @@ async function runEvalCase(evalCase, context, options = {}) {
     ...baseResult,
     pass: assertionPassRate >= threshold,
     assertionPassRate,
+    assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
     infrastructureErrorRate,
     iterationResults,
     infrastructureErrorCount: infraErrors.length,
     durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
   };
 }
+function wilsonCI(k, n) {
+  if (n < 2) return void 0;
+  const z5 = 1.96;
+  const z22 = z5 * z5;
+  const \u00F1 = n + z22;
+  const p\u0303 = (k + z22 / 2) / \u00F1;
+  const margin = z5 * Math.sqrt(p\u0303 * (1 - p\u0303) / \u00F1);
+  return {
+    lower: Math.max(0, p\u0303 - margin),
+    upper: Math.min(1, p\u0303 + margin)
+  };
+}
 async function runWithConcurrency(tasks, limit) {
   const results = new Array(tasks.length);
   let index = 0;