@gleanwork/mcp-server-tester 1.0.0-beta.4 → 1.0.0-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.js +1 -1
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +16 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +16 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.js +16 -3
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +12 -12
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
|
|
|
4407
4407
|
|
|
4408
4408
|
// package.json
|
|
4409
4409
|
var package_default = {
|
|
4410
|
-
version: "1.0.0-beta.4"};
|
|
4410
|
+
version: "1.0.0-beta.5"};
|
|
4411
4411
|
|
|
4412
4412
|
// src/mcp/clientFactory.ts
|
|
4413
4413
|
function getRetryAfterDelayMs(err) {
|
|
@@ -6911,7 +6911,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6911
6911
|
return {
|
|
6912
6912
|
id: evalCase.id,
|
|
6913
6913
|
datasetName: options.datasetName ?? "single-case",
|
|
6914
|
-
toolName: evalCase.
|
|
6914
|
+
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6915
6915
|
source: "eval",
|
|
6916
6916
|
pass: didCasePass(error, expectationResults),
|
|
6917
6917
|
response,
|
|
@@ -6980,7 +6980,7 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6980
6980
|
const baseResult = lastResult ?? {
|
|
6981
6981
|
id: evalCase.id,
|
|
6982
6982
|
datasetName: options.datasetName ?? "single-case",
|
|
6983
|
-
toolName: evalCase.
|
|
6983
|
+
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6984
6984
|
source: "eval",
|
|
6985
6985
|
pass: false,
|
|
6986
6986
|
error: iterationResults[0]?.error,
|
|
@@ -6994,12 +6994,25 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6994
6994
|
...baseResult,
|
|
6995
6995
|
pass: assertionPassRate >= threshold,
|
|
6996
6996
|
assertionPassRate,
|
|
6997
|
+
assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
|
|
6997
6998
|
infrastructureErrorRate,
|
|
6998
6999
|
iterationResults,
|
|
6999
7000
|
infrastructureErrorCount: infraErrors.length,
|
|
7000
7001
|
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
7001
7002
|
};
|
|
7002
7003
|
}
|
|
7004
|
+
function wilsonCI(k, n) {
|
|
7005
|
+
if (n < 2) return void 0;
|
|
7006
|
+
const z5 = 1.96;
|
|
7007
|
+
const z22 = z5 * z5;
|
|
7008
|
+
const \u00F1 = n + z22;
|
|
7009
|
+
const p\u0303 = (k + z22 / 2) / \u00F1;
|
|
7010
|
+
const margin = z5 * Math.sqrt(p\u0303 * (1 - p\u0303) / \u00F1);
|
|
7011
|
+
return {
|
|
7012
|
+
lower: Math.max(0, p\u0303 - margin),
|
|
7013
|
+
upper: Math.min(1, p\u0303 + margin)
|
|
7014
|
+
};
|
|
7015
|
+
}
|
|
7003
7016
|
async function runWithConcurrency(tasks, limit) {
|
|
7004
7017
|
const results = new Array(tasks.length);
|
|
7005
7018
|
let index = 0;
|