@gleanwork/mcp-server-tester 1.0.0-beta.1 → 1.0.0-beta.3

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
4407
4407
 
4408
4408
  // package.json
4409
4409
  var package_default = {
4410
- version: "1.0.0-beta.1"};
4410
+ version: "1.0.0-beta.3"};
4411
4411
 
4412
4412
  // src/mcp/clientFactory.ts
4413
4413
  function getRetryAfterDelayMs(err) {
@@ -4498,7 +4498,10 @@ async function createMCPClientForConfig(config, options) {
4498
4498
  validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
4499
4499
  );
4500
4500
  } else if (isHttpConfig(validatedConfig)) {
4501
- const headers = { ...validatedConfig.headers };
4501
+ const headers = {
4502
+ "User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
4503
+ ...validatedConfig.headers
4504
+ };
4502
4505
  if (validatedConfig.auth?.clientCredentials && !options?.authProvider) {
4503
4506
  const ccConfig = validatedConfig.auth.clientCredentials;
4504
4507
  const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
@@ -5915,7 +5918,18 @@ function applySanitizers(value, sanitizers) {
5915
5918
  continue;
5916
5919
  }
5917
5920
  if (isRegexSanitizer(sanitizer)) {
5918
- const pattern = sanitizer.pattern instanceof RegExp ? sanitizer.pattern : new RegExp(sanitizer.pattern, "g");
5921
+ let pattern;
5922
+ if (sanitizer.pattern instanceof RegExp) {
5923
+ pattern = sanitizer.pattern;
5924
+ } else {
5925
+ try {
5926
+ pattern = new RegExp(sanitizer.pattern, "g");
5927
+ } catch {
5928
+ throw new Error(
5929
+ `toMatchToolSnapshot: invalid regex pattern "${sanitizer.pattern}" in sanitizer`
5930
+ );
5931
+ }
5932
+ }
5919
5933
  const replacement = sanitizer.replacement ?? "[SANITIZED]";
5920
5934
  result = result.replace(pattern, replacement);
5921
5935
  continue;
@@ -6940,7 +6954,6 @@ async function runEvalCase(evalCase, context, options = {}) {
6940
6954
  const passCount = assertionResults.filter((r) => r.pass).length;
6941
6955
  const assertionPassRate = assertionResults.length > 0 ? passCount / assertionResults.length : 0;
6942
6956
  const infrastructureErrorRate = infraErrors.length / iterations;
6943
- const accuracy = assertionPassRate;
6944
6957
  const threshold = evalCase.accuracyThreshold ?? 1;
6945
6958
  const baseResult = lastResult ?? {
6946
6959
  id: evalCase.id,
@@ -6957,10 +6970,9 @@ async function runEvalCase(evalCase, context, options = {}) {
6957
6970
  };
6958
6971
  return {
6959
6972
  ...baseResult,
6960
- pass: accuracy >= threshold,
6973
+ pass: assertionPassRate >= threshold,
6961
6974
  assertionPassRate,
6962
6975
  infrastructureErrorRate,
6963
- accuracy,
6964
6976
  iterationResults,
6965
6977
  infrastructureErrorCount: infraErrors.length,
6966
6978
  durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
@@ -7018,9 +7030,9 @@ async function runEvalDataset(options, context) {
7018
7030
  const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7019
7031
  if (evalCase.mode === "llm_host") {
7020
7032
  const effectiveIterations = withIterations.iterations ?? 1;
7021
- if (effectiveIterations < 10) {
7033
+ if (effectiveIterations > 1 && effectiveIterations < 10) {
7022
7034
  console.warn(
7023
- `[mcp-server-tester] Eval case "${evalCase.id}" uses llm_host mode with only ${effectiveIterations} iteration(s). The evals guide recommends >= 10 iterations. See docs/evals-guide.md for guidance on statistical reliability.`
7035
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7024
7036
  );
7025
7037
  }
7026
7038
  }
@@ -7068,6 +7080,16 @@ async function runEvalDataset(options, context) {
7068
7080
  const baseline = await loadBaseline(baselineResultsFrom);
7069
7081
  const baselinePassRate = baseline.total > 0 ? baseline.passed / baseline.total : 0;
7070
7082
  const baselineMap = buildBaselinePassMap(baseline);
7083
+ const currentCaseIds = result.caseResults.map((cr) => cr.id);
7084
+ const unmatchedCount = currentCaseIds.filter(
7085
+ (id) => !baselineMap.has(id)
7086
+ ).length;
7087
+ const unmatchedRatio = currentCaseIds.length > 0 ? unmatchedCount / currentCaseIds.length : 0;
7088
+ if (unmatchedRatio > 0.2) {
7089
+ console.warn(
7090
+ `[mcp-server-tester] Baseline comparison: ${unmatchedCount} of ${currentCaseIds.length} cases (${Math.round(unmatchedRatio * 100)}%) have no baseline entry. This may indicate the dataset structure has changed. Results for unmatched cases cannot be compared.`
7091
+ );
7092
+ }
7071
7093
  for (const cr of result.caseResults) {
7072
7094
  const baselinePass = baselineMap.get(cr.id);
7073
7095
  if (baselinePass !== void 0) {