@gleanwork/mcp-server-tester 1.0.0-beta.4 → 1.0.0-beta.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.js +1 -1
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +33 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +59 -2
- package/dist/index.d.ts +59 -2
- package/dist/index.js +33 -8
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +12 -12
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
|
|
|
4407
4407
|
|
|
4408
4408
|
// package.json
|
|
4409
4409
|
var package_default = {
|
|
4410
|
-
version: "1.0.0-beta.4"};
|
|
4410
|
+
version: "1.0.0-beta.6"};
|
|
4411
4411
|
|
|
4412
4412
|
// src/mcp/clientFactory.ts
|
|
4413
4413
|
function getRetryAfterDelayMs(err) {
|
|
@@ -6669,9 +6669,16 @@ function getMissingDependencyMessage(provider) {
|
|
|
6669
6669
|
const pkg = packageMap[provider];
|
|
6670
6670
|
return pkg ? `${String(provider)} provider requires: ${pkg}` : `Unknown provider: ${String(provider)}`;
|
|
6671
6671
|
}
|
|
6672
|
-
async function saveBaseline(result, filePath) {
|
|
6672
|
+
async function saveBaseline(result, filePath, options = {}) {
|
|
6673
|
+
const { omitResponses = true } = options;
|
|
6674
|
+
const toSave = omitResponses ? {
|
|
6675
|
+
...result,
|
|
6676
|
+
caseResults: result.caseResults.map(
|
|
6677
|
+
({ response: _response, ...rest }) => rest
|
|
6678
|
+
)
|
|
6679
|
+
} : result;
|
|
6673
6680
|
await fs$1.mkdir(path2.dirname(filePath), { recursive: true });
|
|
6674
|
-
await fs$1.writeFile(filePath, JSON.stringify(result, null, 2), "utf8");
|
|
6681
|
+
await fs$1.writeFile(filePath, JSON.stringify(toSave, null, 2), "utf8");
|
|
6675
6682
|
}
|
|
6676
6683
|
async function loadBaseline(filePath) {
|
|
6677
6684
|
const raw = await fs$1.readFile(filePath, "utf8");
|
|
@@ -6911,7 +6918,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
6911
6918
|
return {
|
|
6912
6919
|
id: evalCase.id,
|
|
6913
6920
|
datasetName: options.datasetName ?? "single-case",
|
|
6914
|
-
toolName: evalCase.toolName,
|
|
6921
|
+
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6915
6922
|
source: "eval",
|
|
6916
6923
|
pass: didCasePass(error, expectationResults),
|
|
6917
6924
|
response,
|
|
@@ -6939,7 +6946,8 @@ function isInfrastructureError(err) {
|
|
|
6939
6946
|
} else {
|
|
6940
6947
|
return false;
|
|
6941
6948
|
}
|
|
6942
|
-
return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
6949
|
+
return name15 === "AbortError" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
|
|
6950
|
+
msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
6943
6951
|
}
|
|
6944
6952
|
async function runEvalCase(evalCase, context, options = {}) {
|
|
6945
6953
|
const iterations = evalCase.iterations ?? 1;
|
|
@@ -6957,7 +6965,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6957
6965
|
pass: result.pass,
|
|
6958
6966
|
durationMs: result.durationMs,
|
|
6959
6967
|
error: result.error,
|
|
6960
|
-
isInfrastructureError: infraError
|
|
6968
|
+
isInfrastructureError: infraError,
|
|
6969
|
+
mcpHostTrace: result.mcpHostTrace
|
|
6961
6970
|
});
|
|
6962
6971
|
} catch (err) {
|
|
6963
6972
|
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
@@ -6980,7 +6989,7 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6980
6989
|
const baseResult = lastResult ?? {
|
|
6981
6990
|
id: evalCase.id,
|
|
6982
6991
|
datasetName: options.datasetName ?? "single-case",
|
|
6983
|
-
toolName: evalCase.toolName,
|
|
6992
|
+
toolName: evalCase.scenario != null ? "mcp_host" : evalCase.toolName ?? "unknown",
|
|
6984
6993
|
source: "eval",
|
|
6985
6994
|
pass: false,
|
|
6986
6995
|
error: iterationResults[0]?.error,
|
|
@@ -6994,12 +7003,25 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6994
7003
|
...baseResult,
|
|
6995
7004
|
pass: assertionPassRate >= threshold,
|
|
6996
7005
|
assertionPassRate,
|
|
7006
|
+
assertionPassRateCI: wilsonCI(passCount, assertionResults.length),
|
|
6997
7007
|
infrastructureErrorRate,
|
|
6998
7008
|
iterationResults,
|
|
6999
7009
|
infrastructureErrorCount: infraErrors.length,
|
|
7000
7010
|
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
7001
7011
|
};
|
|
7002
7012
|
}
|
|
7013
|
+
function wilsonCI(k, n) {
|
|
7014
|
+
if (n < 2) return void 0;
|
|
7015
|
+
const z5 = 1.96;
|
|
7016
|
+
const z22 = z5 * z5;
|
|
7017
|
+
const \u00F1 = n + z22;
|
|
7018
|
+
const p\u0303 = (k + z22 / 2) / \u00F1;
|
|
7019
|
+
const margin = z5 * Math.sqrt(p\u0303 * (1 - p\u0303) / \u00F1);
|
|
7020
|
+
return {
|
|
7021
|
+
lower: Math.max(0, p\u0303 - margin),
|
|
7022
|
+
upper: Math.min(1, p\u0303 + margin)
|
|
7023
|
+
};
|
|
7024
|
+
}
|
|
7003
7025
|
async function runWithConcurrency(tasks, limit) {
|
|
7004
7026
|
const results = new Array(tasks.length);
|
|
7005
7027
|
let index = 0;
|
|
@@ -7028,6 +7050,7 @@ async function runEvalDataset(options, context) {
|
|
|
7028
7050
|
onCaseComplete,
|
|
7029
7051
|
filterTags,
|
|
7030
7052
|
saveResultsTo,
|
|
7053
|
+
omitResponsesFromBaseline = true,
|
|
7031
7054
|
baselineResultsFrom,
|
|
7032
7055
|
mcpHostModel,
|
|
7033
7056
|
judgeModel
|
|
@@ -7142,7 +7165,9 @@ async function runEvalDataset(options, context) {
|
|
|
7142
7165
|
result.datasetToolF1 = avgPrec + avgRecall > 0 ? 2 * avgPrec * avgRecall / (avgPrec + avgRecall) : 0;
|
|
7143
7166
|
}
|
|
7144
7167
|
if (saveResultsTo) {
|
|
7145
|
-
await saveBaseline(result, saveResultsTo);
|
|
7168
|
+
await saveBaseline(result, saveResultsTo, {
|
|
7169
|
+
omitResponses: omitResponsesFromBaseline
|
|
7170
|
+
});
|
|
7146
7171
|
}
|
|
7147
7172
|
if (context.testInfo) {
|
|
7148
7173
|
await context.testInfo.attach("mcp-test-results", {
|