@gleanwork/mcp-server-tester 1.0.0-beta.1 → 1.0.0-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -12
- package/dist/cli/index.js +5 -2
- package/dist/fixtures/mcp.d.ts +8 -0
- package/dist/fixtures/mcp.js +17 -3
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +30 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -7
- package/dist/index.d.ts +9 -7
- package/dist/index.js +30 -8
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +4 -4
- package/package.json +3 -3
- package/src/reporters/ui-dist/app.js +4 -4
package/dist/index.d.cts
CHANGED
|
@@ -2486,6 +2486,14 @@ declare function toMatchToolPattern(this: {
|
|
|
2486
2486
|
/**
|
|
2487
2487
|
* Creates the toMatchToolSnapshot matcher function
|
|
2488
2488
|
*
|
|
2489
|
+
* @remarks
|
|
2490
|
+
* **Requires Playwright test context.** This matcher calls `expect(content).toMatchSnapshot()`
|
|
2491
|
+
* internally, which only works inside a Playwright test (i.e., when `testInfo` is available).
|
|
2492
|
+
* Calling it outside a Playwright test will throw a cryptic context error.
|
|
2493
|
+
*
|
|
2494
|
+
* To test sanitizer logic without a Playwright context, use the exported `applySanitizers`
|
|
2495
|
+
* function directly.
|
|
2496
|
+
*
|
|
2489
2497
|
* Note: This is an async matcher that uses Playwright's snapshot testing.
|
|
2490
2498
|
*/
|
|
2491
2499
|
declare function toMatchToolSnapshot(this: {
|
|
@@ -2896,7 +2904,7 @@ interface EvalCase {
|
|
|
2896
2904
|
metadata?: Record<string, unknown>;
|
|
2897
2905
|
/**
|
|
2898
2906
|
* Number of times to run this case and compute an accuracy score.
|
|
2899
|
-
* When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
|
|
2907
|
+
* When > 1, `EvalCaseResult.assertionPassRate` is populated and `pass` is determined
|
|
2900
2908
|
* by `accuracyThreshold` rather than a single run.
|
|
2901
2909
|
* @default 1
|
|
2902
2910
|
*/
|
|
@@ -4282,12 +4290,6 @@ interface EvalCaseResult {
|
|
|
4282
4290
|
* Only present when the case was run with `iterations > 1`.
|
|
4283
4291
|
*/
|
|
4284
4292
|
infrastructureErrorRate?: number;
|
|
4285
|
-
/**
|
|
4286
|
-
* Accuracy score (0–1) across all iterations.
|
|
4287
|
-
* Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
|
|
4288
|
-
* @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
|
|
4289
|
-
*/
|
|
4290
|
-
accuracy?: number;
|
|
4291
4293
|
/**
|
|
4292
4294
|
* Per-iteration pass/fail breakdown.
|
|
4293
4295
|
* Only present when the case was run with `iterations > 1`.
|
package/dist/index.d.ts
CHANGED
|
@@ -2486,6 +2486,14 @@ declare function toMatchToolPattern(this: {
|
|
|
2486
2486
|
/**
|
|
2487
2487
|
* Creates the toMatchToolSnapshot matcher function
|
|
2488
2488
|
*
|
|
2489
|
+
* @remarks
|
|
2490
|
+
* **Requires Playwright test context.** This matcher calls `expect(content).toMatchSnapshot()`
|
|
2491
|
+
* internally, which only works inside a Playwright test (i.e., when `testInfo` is available).
|
|
2492
|
+
* Calling it outside a Playwright test will throw a cryptic context error.
|
|
2493
|
+
*
|
|
2494
|
+
* To test sanitizer logic without a Playwright context, use the exported `applySanitizers`
|
|
2495
|
+
* function directly.
|
|
2496
|
+
*
|
|
2489
2497
|
* Note: This is an async matcher that uses Playwright's snapshot testing.
|
|
2490
2498
|
*/
|
|
2491
2499
|
declare function toMatchToolSnapshot(this: {
|
|
@@ -2896,7 +2904,7 @@ interface EvalCase {
|
|
|
2896
2904
|
metadata?: Record<string, unknown>;
|
|
2897
2905
|
/**
|
|
2898
2906
|
* Number of times to run this case and compute an accuracy score.
|
|
2899
|
-
* When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
|
|
2907
|
+
* When > 1, `EvalCaseResult.assertionPassRate` is populated and `pass` is determined
|
|
2900
2908
|
* by `accuracyThreshold` rather than a single run.
|
|
2901
2909
|
* @default 1
|
|
2902
2910
|
*/
|
|
@@ -4282,12 +4290,6 @@ interface EvalCaseResult {
|
|
|
4282
4290
|
* Only present when the case was run with `iterations > 1`.
|
|
4283
4291
|
*/
|
|
4284
4292
|
infrastructureErrorRate?: number;
|
|
4285
|
-
/**
|
|
4286
|
-
* Accuracy score (0–1) across all iterations.
|
|
4287
|
-
* Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
|
|
4288
|
-
* @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
|
|
4289
|
-
*/
|
|
4290
|
-
accuracy?: number;
|
|
4291
4293
|
/**
|
|
4292
4294
|
* Per-iteration pass/fail breakdown.
|
|
4293
4295
|
* Only present when the case was run with `iterations > 1`.
|
package/dist/index.js
CHANGED
|
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
|
|
|
4380
4380
|
|
|
4381
4381
|
// package.json
|
|
4382
4382
|
var package_default = {
|
|
4383
|
-
version: "1.0.0-beta.1"};
|
|
4383
|
+
version: "1.0.0-beta.3"};
|
|
4384
4384
|
|
|
4385
4385
|
// src/mcp/clientFactory.ts
|
|
4386
4386
|
function getRetryAfterDelayMs(err) {
|
|
@@ -4471,7 +4471,10 @@ async function createMCPClientForConfig(config, options) {
|
|
|
4471
4471
|
validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
|
|
4472
4472
|
);
|
|
4473
4473
|
} else if (isHttpConfig(validatedConfig)) {
|
|
4474
|
-
const headers = {
|
|
4474
|
+
const headers = {
|
|
4475
|
+
"User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
|
|
4476
|
+
...validatedConfig.headers
|
|
4477
|
+
};
|
|
4475
4478
|
if (validatedConfig.auth?.clientCredentials && !options?.authProvider) {
|
|
4476
4479
|
const ccConfig = validatedConfig.auth.clientCredentials;
|
|
4477
4480
|
const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
|
|
@@ -5888,7 +5891,18 @@ function applySanitizers(value, sanitizers) {
|
|
|
5888
5891
|
continue;
|
|
5889
5892
|
}
|
|
5890
5893
|
if (isRegexSanitizer(sanitizer)) {
|
|
5891
|
-
|
|
5894
|
+
let pattern;
|
|
5895
|
+
if (sanitizer.pattern instanceof RegExp) {
|
|
5896
|
+
pattern = sanitizer.pattern;
|
|
5897
|
+
} else {
|
|
5898
|
+
try {
|
|
5899
|
+
pattern = new RegExp(sanitizer.pattern, "g");
|
|
5900
|
+
} catch {
|
|
5901
|
+
throw new Error(
|
|
5902
|
+
`toMatchToolSnapshot: invalid regex pattern "${sanitizer.pattern}" in sanitizer`
|
|
5903
|
+
);
|
|
5904
|
+
}
|
|
5905
|
+
}
|
|
5892
5906
|
const replacement = sanitizer.replacement ?? "[SANITIZED]";
|
|
5893
5907
|
result = result.replace(pattern, replacement);
|
|
5894
5908
|
continue;
|
|
@@ -6913,7 +6927,6 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6913
6927
|
const passCount = assertionResults.filter((r) => r.pass).length;
|
|
6914
6928
|
const assertionPassRate = assertionResults.length > 0 ? passCount / assertionResults.length : 0;
|
|
6915
6929
|
const infrastructureErrorRate = infraErrors.length / iterations;
|
|
6916
|
-
const accuracy = assertionPassRate;
|
|
6917
6930
|
const threshold = evalCase.accuracyThreshold ?? 1;
|
|
6918
6931
|
const baseResult = lastResult ?? {
|
|
6919
6932
|
id: evalCase.id,
|
|
@@ -6930,10 +6943,9 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
6930
6943
|
};
|
|
6931
6944
|
return {
|
|
6932
6945
|
...baseResult,
|
|
6933
|
-
pass: accuracy >= threshold,
|
|
6946
|
+
pass: assertionPassRate >= threshold,
|
|
6934
6947
|
assertionPassRate,
|
|
6935
6948
|
infrastructureErrorRate,
|
|
6936
|
-
accuracy,
|
|
6937
6949
|
iterationResults,
|
|
6938
6950
|
infrastructureErrorCount: infraErrors.length,
|
|
6939
6951
|
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
@@ -6991,9 +7003,9 @@ async function runEvalDataset(options, context) {
|
|
|
6991
7003
|
const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
|
|
6992
7004
|
if (evalCase.mode === "llm_host") {
|
|
6993
7005
|
const effectiveIterations = withIterations.iterations ?? 1;
|
|
6994
|
-
if (effectiveIterations < 10) {
|
|
7006
|
+
if (effectiveIterations > 1 && effectiveIterations < 10) {
|
|
6995
7007
|
console.warn(
|
|
6996
|
-
`[mcp-server-tester] Eval case "${evalCase.id}"
|
|
7008
|
+
`[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
|
|
6997
7009
|
);
|
|
6998
7010
|
}
|
|
6999
7011
|
}
|
|
@@ -7041,6 +7053,16 @@ async function runEvalDataset(options, context) {
|
|
|
7041
7053
|
const baseline = await loadBaseline(baselineResultsFrom);
|
|
7042
7054
|
const baselinePassRate = baseline.total > 0 ? baseline.passed / baseline.total : 0;
|
|
7043
7055
|
const baselineMap = buildBaselinePassMap(baseline);
|
|
7056
|
+
const currentCaseIds = result.caseResults.map((cr) => cr.id);
|
|
7057
|
+
const unmatchedCount = currentCaseIds.filter(
|
|
7058
|
+
(id) => !baselineMap.has(id)
|
|
7059
|
+
).length;
|
|
7060
|
+
const unmatchedRatio = currentCaseIds.length > 0 ? unmatchedCount / currentCaseIds.length : 0;
|
|
7061
|
+
if (unmatchedRatio > 0.2) {
|
|
7062
|
+
console.warn(
|
|
7063
|
+
`[mcp-server-tester] Baseline comparison: ${unmatchedCount} of ${currentCaseIds.length} cases (${Math.round(unmatchedRatio * 100)}%) have no baseline entry. This may indicate the dataset structure has changed. Results for unmatched cases cannot be compared.`
|
|
7064
|
+
);
|
|
7065
|
+
}
|
|
7044
7066
|
for (const cr of result.caseResults) {
|
|
7045
7067
|
const baselinePass = baselineMap.get(cr.id);
|
|
7046
7068
|
if (baselinePass !== void 0) {
|