@gleanwork/mcp-server-tester 1.0.0-beta.1 → 1.0.0-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -12
- package/dist/cli/index.js +5 -2
- package/dist/fixtures/mcp.d.ts +8 -0
- package/dist/fixtures/mcp.js +17 -3
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +30 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -7
- package/dist/index.d.ts +9 -7
- package/dist/index.js +30 -8
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +4 -4
- package/package.json +3 -3
- package/src/reporters/ui-dist/app.js +4 -4
package/dist/index.cjs
CHANGED
|
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
|
|
|
4407
4407
|
|
|
4408
4408
|
// package.json
|
|
4409
4409
|
var package_default = {
|
|
4410
|
-
version: "1.0.0-beta.1"};
|
|
4410
|
+
version: "1.0.0-beta.3"};
|
|
4411
4411
|
|
|
4412
4412
|
// src/mcp/clientFactory.ts
|
|
4413
4413
|
function getRetryAfterDelayMs(err) {
|
|
@@ -4498,7 +4498,10 @@ async function createMCPClientForConfig(config, options) {
|
|
|
4498
4498
|
validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
|
|
4499
4499
|
);
|
|
4500
4500
|
} else if (isHttpConfig(validatedConfig)) {
|
|
4501
|
-
const headers = {
|
|
4501
|
+
const headers = {
|
|
4502
|
+
"User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
|
|
4503
|
+
...validatedConfig.headers
|
|
4504
|
+
};
|
|
4502
4505
|
if (validatedConfig.auth?.clientCredentials && !options?.authProvider) {
|
|
4503
4506
|
const ccConfig = validatedConfig.auth.clientCredentials;
|
|
4504
4507
|
const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
|
|
@@ -5915,7 +5918,18 @@ function applySanitizers(value, sanitizers) {
|
|
|
5915
5918
|
continue;
|
|
5916
5919
|
}
|
|
5917
5920
|
if (isRegexSanitizer(sanitizer)) {
|
|
5918
|
-
|
|
5921
|
+
let pattern;
|
|
5922
|
+
if (sanitizer.pattern instanceof RegExp) {
|
|
5923
|
+
pattern = sanitizer.pattern;
|
|
5924
|
+
} else {
|
|
5925
|
+
try {
|
|
5926
|
+
pattern = new RegExp(sanitizer.pattern, "g");
|
|
5927
|
+
} catch {
|
|
5928
|
+
throw new Error(
|
|
5929
|
+
`toMatchToolSnapshot: invalid regex pattern "${sanitizer.pattern}" in sanitizer`
|
|
5930
|
+
);
|
|
5931
|
+
}
|
|
5932
|
+
}
|
|
5919
5933
|
const replacement = sanitizer.replacement ?? "[SANITIZED]";
|
|
5920
5934
|
result = result.replace(pattern, replacement);
|
|
5921
5935
|
continue;
|
|
@@ -6940,7 +6954,6 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6940
6954
|
const passCount = assertionResults.filter((r) => r.pass).length;
|
|
6941
6955
|
const assertionPassRate = assertionResults.length > 0 ? passCount / assertionResults.length : 0;
|
|
6942
6956
|
const infrastructureErrorRate = infraErrors.length / iterations;
|
|
6943
|
-
const accuracy = assertionPassRate;
|
|
6944
6957
|
const threshold = evalCase.accuracyThreshold ?? 1;
|
|
6945
6958
|
const baseResult = lastResult ?? {
|
|
6946
6959
|
id: evalCase.id,
|
|
@@ -6957,10 +6970,9 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6957
6970
|
};
|
|
6958
6971
|
return {
|
|
6959
6972
|
...baseResult,
|
|
6960
|
-
pass:
|
|
6973
|
+
pass: assertionPassRate >= threshold,
|
|
6961
6974
|
assertionPassRate,
|
|
6962
6975
|
infrastructureErrorRate,
|
|
6963
|
-
accuracy,
|
|
6964
6976
|
iterationResults,
|
|
6965
6977
|
infrastructureErrorCount: infraErrors.length,
|
|
6966
6978
|
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
@@ -7018,9 +7030,9 @@ async function runEvalDataset(options, context) {
|
|
|
7018
7030
|
const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
|
|
7019
7031
|
if (evalCase.mode === "llm_host") {
|
|
7020
7032
|
const effectiveIterations = withIterations.iterations ?? 1;
|
|
7021
|
-
if (effectiveIterations < 10) {
|
|
7033
|
+
if (effectiveIterations > 1 && effectiveIterations < 10) {
|
|
7022
7034
|
console.warn(
|
|
7023
|
-
`[mcp-server-tester] Eval case "${evalCase.id}"
|
|
7035
|
+
`[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
|
|
7024
7036
|
);
|
|
7025
7037
|
}
|
|
7026
7038
|
}
|
|
@@ -7068,6 +7080,16 @@ async function runEvalDataset(options, context) {
|
|
|
7068
7080
|
const baseline = await loadBaseline(baselineResultsFrom);
|
|
7069
7081
|
const baselinePassRate = baseline.total > 0 ? baseline.passed / baseline.total : 0;
|
|
7070
7082
|
const baselineMap = buildBaselinePassMap(baseline);
|
|
7083
|
+
const currentCaseIds = result.caseResults.map((cr) => cr.id);
|
|
7084
|
+
const unmatchedCount = currentCaseIds.filter(
|
|
7085
|
+
(id) => !baselineMap.has(id)
|
|
7086
|
+
).length;
|
|
7087
|
+
const unmatchedRatio = currentCaseIds.length > 0 ? unmatchedCount / currentCaseIds.length : 0;
|
|
7088
|
+
if (unmatchedRatio > 0.2) {
|
|
7089
|
+
console.warn(
|
|
7090
|
+
`[mcp-server-tester] Baseline comparison: ${unmatchedCount} of ${currentCaseIds.length} cases (${Math.round(unmatchedRatio * 100)}%) have no baseline entry. This may indicate the dataset structure has changed. Results for unmatched cases cannot be compared.`
|
|
7091
|
+
);
|
|
7092
|
+
}
|
|
7071
7093
|
for (const cr of result.caseResults) {
|
|
7072
7094
|
const baselinePass = baselineMap.get(cr.id);
|
|
7073
7095
|
if (baselinePass !== void 0) {
|