@gleanwork/mcp-server-tester 1.0.0-beta.4 → 1.0.0-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.js +1 -1
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +16 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +16 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.js +16 -3
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +12 -12
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -3309,6 +3309,22 @@ interface EvalCaseResult {
|
|
|
3309
3309
|
* the denominator so that environment reliability does not inflate this metric.
|
|
3310
3310
|
*/
|
|
3311
3311
|
assertionPassRate?: number;
|
|
3312
|
+
/**
|
|
3313
|
+
* 95% Wilson score confidence interval for `assertionPassRate`.
|
|
3314
|
+
* Only present when the case was run with `iterations > 1`.
|
|
3315
|
+
*
|
|
3316
|
+
* Interpret as: the true pass rate is likely between `lower` and `upper`.
|
|
3317
|
+
* Wider intervals mean fewer iterations were run; run more iterations to narrow them.
|
|
3318
|
+
*
|
|
3319
|
+
* @example { lower: 0.40, upper: 0.89 } // 7/10 passes → 70% ± wide CI
|
|
3320
|
+
* @example { lower: 0.56, upper: 0.81 } // 35/50 passes → 70% ± narrow CI
|
|
3321
|
+
*/
|
|
3322
|
+
assertionPassRateCI?: {
|
|
3323
|
+
/** Lower bound of the 95% confidence interval (0–1) */
|
|
3324
|
+
lower: number;
|
|
3325
|
+
/** Upper bound of the 95% confidence interval (0–1) */
|
|
3326
|
+
upper: number;
|
|
3327
|
+
};
|
|
3312
3328
|
/**
|
|
3313
3329
|
* Infrastructure error rate (0–1): infra errors divided by total iterations.
|
|
3314
3330
|
* Only present when the case was run with `iterations > 1`.
|
package/dist/index.d.ts
CHANGED
|
@@ -3309,6 +3309,22 @@ interface EvalCaseResult {
|
|
|
3309
3309
|
* the denominator so that environment reliability does not inflate this metric.
|
|
3310
3310
|
*/
|
|
3311
3311
|
assertionPassRate?: number;
|
|
3312
|
+
/**
|
|
3313
|
+
* 95% Wilson score confidence interval for `assertionPassRate`.
|
|
3314
|
+
* Only present when the case was run with `iterations > 1`.
|
|
3315
|
+
*
|
|
3316
|
+
* Interpret as: the true pass rate is likely between `lower` and `upper`.
|
|
3317
|
+
* Wider intervals mean fewer iterations were run; run more iterations to narrow them.
|
|
3318
|
+
*
|
|
3319
|
+
* @example { lower: 0.40, upper: 0.89 } // 7/10 passes → 70% ± wide CI
|
|
3320
|
+
* @example { lower: 0.56, upper: 0.81 } // 35/50 passes → 70% ± narrow CI
|
|
3321
|
+
*/
|
|
3322
|
+
assertionPassRateCI?: {
|
|
3323
|
+
/** Lower bound of the 95% confidence interval (0–1) */
|
|
3324
|
+
lower: number;
|
|
3325
|
+
/** Upper bound of the 95% confidence interval (0–1) */
|
|
3326
|
+
upper: number;
|
|
3327
|
+
};
|
|
3312
3328
|
/**
|
|
3313
3329
|
* Infrastructure error rate (0–1): infra errors divided by total iterations.
|
|
3314
3330
|
* Only present when the case was run with `iterations > 1`.
|
package/dist/index.js
CHANGED
|
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
|
|
|
4380
4380
|
|
|
4381
4381
|
// package.json
|
|
4382
4382
|
var package_default = {
|
|
4383
|
-
version: "1.0.0-beta.4"};
|
|
4383
|
+
version: "1.0.0-beta.5"};
|
|
4384
4384
|
|
|
4385
4385
|
// src/mcp/clientFactory.ts
|
|
4386
4386
|
function getRetryAfterDelayMs(err) {
|
|
@@ -6884,7 +6884,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6884
6884
|
return {
|
|
6885
6885
|
id: evalCase.id,
|
|
6886
6886
|
datasetName: options.datasetName ?? "single-case",
|
|
6887
|
-
toolName: evalCase.
|
|
6887
|
+
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6888
6888
|
source: "eval",
|
|
6889
6889
|
pass: didCasePass(error, expectationResults),
|
|
6890
6890
|
response,
|
|
@@ -6953,7 +6953,7 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6953
6953
|
const baseResult = lastResult ?? {
|
|
6954
6954
|
id: evalCase.id,
|
|
6955
6955
|
datasetName: options.datasetName ?? "single-case",
|
|
6956
|
-
toolName: evalCase.
|
|
6956
|
+
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6957
6957
|
source: "eval",
|
|
6958
6958
|
pass: false,
|
|
6959
6959
|
error: iterationResults[0]?.error,
|
|
@@ -6967,12 +6967,25 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6967
6967
|
...baseResult,
|
|
6968
6968
|
pass: assertionPassRate >= threshold,
|
|
6969
6969
|
assertionPassRate,
|
|
6970
|
+
assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
|
|
6970
6971
|
infrastructureErrorRate,
|
|
6971
6972
|
iterationResults,
|
|
6972
6973
|
infrastructureErrorCount: infraErrors.length,
|
|
6973
6974
|
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
6974
6975
|
};
|
|
6975
6976
|
}
|
|
6977
|
+
/**
 * 95% Wilson score confidence interval for a binomial proportion.
 *
 * The previous body used the Agresti–Coull approximation
 * (`z * sqrt(p̃(1 - p̃) / ñ)`), which shares the Wilson centre but is always
 * at least as wide. This version uses the actual Wilson half-width so the
 * result matches the function name.
 *
 * @param {number} k - Number of successes (passing iterations).
 * @param {number} n - Number of trials (iterations that were counted).
 * @returns {{ lower: number, upper: number } | undefined} Bounds clamped to
 *   [0, 1], or `undefined` when fewer than two trials were supplied.
 */
function wilsonCI(k, n) {
  // A single observation (or none) carries no usable spread information.
  if (n < 2) return void 0;

  const z = 1.96; // two-sided 95% normal quantile
  const zSquared = z * z;
  const denominator = n + zSquared;

  // Wilson centre: (p̂ + z²/2n) / (1 + z²/n), rewritten in counts.
  const centre = (k + zSquared / 2) / denominator;

  // Wilson half-width: z / (1 + z²/n) * sqrt(p̂(1 - p̂)/n + z²/4n²),
  // algebraically simplified to avoid dividing by n twice.
  const margin = (z / denominator) * Math.sqrt((k * (n - k)) / n + zSquared / 4);

  return {
    // Clamp to absorb floating-point overshoot at k = 0 and k = n.
    lower: Math.max(0, centre - margin),
    upper: Math.min(1, centre + margin)
  };
}
|
|
6976
6989
|
async function runWithConcurrency(tasks, limit) {
|
|
6977
6990
|
const results = new Array(tasks.length);
|
|
6978
6991
|
let index = 0;
|