npm - @gleanwork/mcp-server-tester - Versions diffs - 1.0.0-beta.1 → 1.0.0-beta.3 - Mend

@gleanwork/mcp-server-tester 1.0.0-beta.1 → 1.0.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/README.md +13 -12
package/dist/cli/index.js +5 -2
package/dist/fixtures/mcp.d.ts +8 -0
package/dist/fixtures/mcp.js +17 -3
package/dist/fixtures/mcp.js.map +1 -1
package/dist/index.cjs +30 -8
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +9 -7
package/dist/index.d.ts +9 -7
package/dist/index.js +30 -8
package/dist/index.js.map +1 -1
package/dist/reporters/ui-dist/app.js +4 -4
package/package.json +3 -3
package/src/reporters/ui-dist/app.js +4 -4

package/dist/index.cjs CHANGED

@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
 // package.json
 var package_default = {
-  version: "1.0.0-beta.1"};
+  version: "1.0.0-beta.3"};
 // src/mcp/clientFactory.ts
 function getRetryAfterDelayMs(err) {
@@ -4498,7 +4498,10 @@ async function createMCPClientForConfig(config, options) {
       validatedConfig.connectTimeoutMs !== void 0 ? { timeout: validatedConfig.connectTimeoutMs } : void 0
     );
   } else if (isHttpConfig(validatedConfig)) {
-    const headers = { ...validatedConfig.headers };
+    const headers = {
+      "User-Agent": `@gleanwork/mcp-server-tester/${package_default.version}`,
+      ...validatedConfig.headers
+    };
     if (validatedConfig.auth?.clientCredentials && !options?.authProvider) {
       const ccConfig = validatedConfig.auth.clientCredentials;
       const clientId = ccConfig.clientId ?? process.env["MCP_CLIENT_ID"];
@@ -5915,7 +5918,18 @@ function applySanitizers(value, sanitizers) {
       continue;
     }
     if (isRegexSanitizer(sanitizer)) {
-      const pattern = sanitizer.pattern instanceof RegExp ? sanitizer.pattern : new RegExp(sanitizer.pattern, "g");
+      let pattern;
+      if (sanitizer.pattern instanceof RegExp) {
+        pattern = sanitizer.pattern;
+      } else {
+        try {
+          pattern = new RegExp(sanitizer.pattern, "g");
+        } catch {
+          throw new Error(
+            `toMatchToolSnapshot: invalid regex pattern "${sanitizer.pattern}" in sanitizer`
+          );
+        }
+      }
       const replacement = sanitizer.replacement ?? "[SANITIZED]";
       result = result.replace(pattern, replacement);
       continue;
@@ -6940,7 +6954,6 @@ async function runEvalCase(evalCase, context, options = {}) {
   const passCount = assertionResults.filter((r) => r.pass).length;
   const assertionPassRate = assertionResults.length > 0 ? passCount / assertionResults.length : 0;
   const infrastructureErrorRate = infraErrors.length / iterations;
-  const accuracy = assertionPassRate;
   const threshold = evalCase.accuracyThreshold ?? 1;
   const baseResult = lastResult ?? {
     id: evalCase.id,
@@ -6957,10 +6970,9 @@ async function runEvalCase(evalCase, context, options = {}) {
   };
   return {
     ...baseResult,
-    pass: accuracy >= threshold,
+    pass: assertionPassRate >= threshold,
     assertionPassRate,
     infrastructureErrorRate,
-    accuracy,
     iterationResults,
     infrastructureErrorCount: infraErrors.length,
     durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
@@ -7018,9 +7030,9 @@ async function runEvalDataset(options, context) {
     const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
     if (evalCase.mode === "llm_host") {
       const effectiveIterations = withIterations.iterations ?? 1;
-      if (effectiveIterations < 10) {
+      if (effectiveIterations > 1 && effectiveIterations < 10) {
         console.warn(
-          `[mcp-server-tester] Eval case "${evalCase.id}" uses llm_host mode with only ${effectiveIterations} iteration(s). The evals guide recommends >= 10 iterations. See docs/evals-guide.md for guidance on statistical reliability.`
+          `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
         );
       }
     }
@@ -7068,6 +7080,16 @@ async function runEvalDataset(options, context) {
       const baseline = await loadBaseline(baselineResultsFrom);
       const baselinePassRate = baseline.total > 0 ? baseline.passed / baseline.total : 0;
       const baselineMap = buildBaselinePassMap(baseline);
+      const currentCaseIds = result.caseResults.map((cr) => cr.id);
+      const unmatchedCount = currentCaseIds.filter(
+        (id) => !baselineMap.has(id)
+      ).length;
+      const unmatchedRatio = currentCaseIds.length > 0 ? unmatchedCount / currentCaseIds.length : 0;
+      if (unmatchedRatio > 0.2) {
+        console.warn(
+          `[mcp-server-tester] Baseline comparison: ${unmatchedCount} of ${currentCaseIds.length} cases (${Math.round(unmatchedRatio * 100)}%) have no baseline entry. This may indicate the dataset structure has changed. Results for unmatched cases cannot be compared.`
+        );
+      }
       for (const cr of result.caseResults) {
         const baselinePass = baselineMap.get(cr.id);
         if (baselinePass !== void 0) {