@gleanwork/mcp-server-tester 1.0.0-beta.1 → 1.0.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -2486,6 +2486,14 @@ declare function toMatchToolPattern(this: {
2486
2486
  /**
2487
2487
  * Creates the toMatchToolSnapshot matcher function
2488
2488
  *
2489
+ * @remarks
2490
+ * **Requires Playwright test context.** This matcher calls `expect(content).toMatchSnapshot()`
2491
+ * internally, which only works inside a Playwright test (i.e., when `testInfo` is available).
2492
+ * Calling it outside a Playwright test will throw a cryptic context error.
2493
+ *
2494
+ * To test sanitizer logic without a Playwright context, use the exported `applySanitizers`
2495
+ * function directly.
2496
+ *
2489
2497
  * Note: This is an async matcher that uses Playwright's snapshot testing.
2490
2498
  */
2491
2499
  declare function toMatchToolSnapshot(this: {
@@ -2896,7 +2904,7 @@ interface EvalCase {
2896
2904
  metadata?: Record<string, unknown>;
2897
2905
  /**
2898
2906
  * Number of times to run this case and compute an accuracy score.
2899
- * When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
2907
+ * When > 1, `EvalCaseResult.assertionPassRate` is populated and `pass` is determined
2900
2908
  * by `accuracyThreshold` rather than a single run.
2901
2909
  * @default 1
2902
2910
  */
@@ -4282,12 +4290,6 @@ interface EvalCaseResult {
4282
4290
  * Only present when the case was run with `iterations > 1`.
4283
4291
  */
4284
4292
  infrastructureErrorRate?: number;
4285
- /**
4286
- * Accuracy score (0–1) across all iterations.
4287
- * Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
4288
- * @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
4289
- */
4290
- accuracy?: number;
4291
4293
  /**
4292
4294
  * Per-iteration pass/fail breakdown.
4293
4295
  * Only present when the case was run with `iterations > 1`.
package/dist/index.d.ts CHANGED
@@ -2486,6 +2486,14 @@ declare function toMatchToolPattern(this: {
2486
2486
  /**
2487
2487
  * Creates the toMatchToolSnapshot matcher function
2488
2488
  *
2489
+ * @remarks
2490
+ * **Requires Playwright test context.** This matcher calls `expect(content).toMatchSnapshot()`
2491
+ * internally, which only works inside a Playwright test (i.e., when `testInfo` is available).
2492
+ * Calling it outside a Playwright test will throw a cryptic context error.
2493
+ *
2494
+ * To test sanitizer logic without a Playwright context, use the exported `applySanitizers`
2495
+ * function directly.
2496
+ *
2489
2497
  * Note: This is an async matcher that uses Playwright's snapshot testing.
2490
2498
  */
2491
2499
  declare function toMatchToolSnapshot(this: {
@@ -2896,7 +2904,7 @@ interface EvalCase {
2896
2904
  metadata?: Record<string, unknown>;
2897
2905
  /**
2898
2906
  * Number of times to run this case and compute an accuracy score.
2899
- * When > 1, `EvalCaseResult.accuracy` is populated and `pass` is determined
2907
+ * When > 1, `EvalCaseResult.assertionPassRate` is populated and `pass` is determined
2900
2908
  * by `accuracyThreshold` rather than a single run.
2901
2909
  * @default 1
2902
2910
  */
@@ -4282,12 +4290,6 @@ interface EvalCaseResult {
4282
4290
  * Only present when the case was run with `iterations > 1`.
4283
4291
  */
4284
4292
  infrastructureErrorRate?: number;
4285
- /**
4286
- * Accuracy score (0–1) across all iterations.
4287
- * Alias for `assertionPassRate`. Only present when the case was run with `iterations > 1`.
4288
- * @deprecated Use `assertionPassRate` for clarity; this field is kept for backward compatibility.
4289
- */
4290
- accuracy?: number;
4291
4293
  /**
4292
4294
  * Per-iteration pass/fail breakdown.
4293
4295
  * Only present when the case was run with `iterations > 1`.
package/dist/index.js CHANGED
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
4380
4380
 
4381
4381
  // package.json
4382
4382
  var package_default = {
4383
- version: "1.0.0-beta.1"};
4383
+ version: "1.0.0-beta.3"};
4384
4384
 
4385
4385
  // src/mcp/clientFactory.ts
4386
4386
  function getRetryAfterDelayMs(err) {
@@ -4471,7 +4471,10 @@ async function createMCPClientForConfig(config, options) {
4471
4471
  validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
4472
4472
  );
4473
4473
  } else if (isHttpConfig(validatedConfig)) {
4474
- const headers = { ...validatedConfig.headers };
4474
+ const headers = {
4475
+ "User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
4476
+ ...validatedConfig.headers
4477
+ };
4475
4478
  if (validatedConfig.auth?.clientCredentials && !options?.authProvider) {
4476
4479
  const ccConfig = validatedConfig.auth.clientCredentials;
4477
4480
  const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
@@ -5888,7 +5891,18 @@ function applySanitizers(value, sanitizers) {
5888
5891
  continue;
5889
5892
  }
5890
5893
  if (isRegexSanitizer(sanitizer)) {
5891
- const pattern = sanitizer.pattern instanceof RegExp ? sanitizer.pattern : new RegExp(sanitizer.pattern, "g");
5894
+ let pattern;
5895
+ if (sanitizer.pattern instanceof RegExp) {
5896
+ pattern = sanitizer.pattern;
5897
+ } else {
5898
+ try {
5899
+ pattern = new RegExp(sanitizer.pattern, "g");
5900
+ } catch {
5901
+ throw new Error(
5902
+ `toMatchToolSnapshot: invalid regex pattern "${sanitizer.pattern}" in sanitizer`
5903
+ );
5904
+ }
5905
+ }
5892
5906
  const replacement = sanitizer.replacement ?? "[SANITIZED]";
5893
5907
  result = result.replace(pattern, replacement);
5894
5908
  continue;
@@ -6913,7 +6927,6 @@ async function runEvalCase(evalCase, context, options = {}) {
6913
6927
  const passCount = assertionResults.filter((r) => r.pass).length;
6914
6928
  const assertionPassRate = assertionResults.length > 0 ? passCount / assertionResults.length : 0;
6915
6929
  const infrastructureErrorRate = infraErrors.length / iterations;
6916
- const accuracy = assertionPassRate;
6917
6930
  const threshold = evalCase.accuracyThreshold ?? 1;
6918
6931
  const baseResult = lastResult ?? {
6919
6932
  id: evalCase.id,
@@ -6930,10 +6943,9 @@ async function runEvalCase(evalCase, context, options = {}) {
6930
6943
  };
6931
6944
  return {
6932
6945
  ...baseResult,
6933
- pass: accuracy >= threshold,
6946
+ pass: assertionPassRate >= threshold,
6934
6947
  assertionPassRate,
6935
6948
  infrastructureErrorRate,
6936
- accuracy,
6937
6949
  iterationResults,
6938
6950
  infrastructureErrorCount: infraErrors.length,
6939
6951
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
@@ -6991,9 +7003,9 @@ async function runEvalDataset(options, context) {
6991
7003
  const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
6992
7004
  if (evalCase.mode === "llm_host") {
6993
7005
  const effectiveIterations = withIterations.iterations ?? 1;
6994
- if (effectiveIterations < 10) {
7006
+ if (effectiveIterations > 1 && effectiveIterations < 10) {
6995
7007
  console.warn(
6996
- `[mcp-server-tester] Eval case "${evalCase.id}" uses llm_host mode with only ${effectiveIterations} iteration(s). The evals guide recommends >= 10 iterations. See docs/evals-guide.md for guidance on statistical reliability.`
7008
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
6997
7009
  );
6998
7010
  }
6999
7011
  }
@@ -7041,6 +7053,16 @@ async function runEvalDataset(options, context) {
7041
7053
  const baseline = await loadBaseline(baselineResultsFrom);
7042
7054
  const baselinePassRate = baseline.total > 0 ? baseline.passed / baseline.total : 0;
7043
7055
  const baselineMap = buildBaselinePassMap(baseline);
7056
+ const currentCaseIds = result.caseResults.map((cr) => cr.id);
7057
+ const unmatchedCount = currentCaseIds.filter(
7058
+ (id) => !baselineMap.has(id)
7059
+ ).length;
7060
+ const unmatchedRatio = currentCaseIds.length > 0 ? unmatchedCount / currentCaseIds.length : 0;
7061
+ if (unmatchedRatio > 0.2) {
7062
+ console.warn(
7063
+ `[mcp-server-tester] Baseline comparison: ${unmatchedCount} of ${currentCaseIds.length} cases (${Math.round(unmatchedRatio * 100)}%) have no baseline entry. This may indicate the dataset structure has changed. Results for unmatched cases cannot be compared.`
7064
+ );
7065
+ }
7044
7066
  for (const cr of result.caseResults) {
7045
7067
  const baselinePass = baselineMap.get(cr.id);
7046
7068
  if (baselinePass !== void 0) {