@gleanwork/mcp-server-tester 1.0.0-beta.1 → 1.0.0-beta.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4407,7 +4407,7 @@ function escapeHtml(text) {
4407
4407
 
4408
4408
  // package.json
4409
4409
  var package_default = {
4410
- version: "1.0.0-beta.1"};
4410
+ version: "1.0.0-beta.2"};
4411
4411
 
4412
4412
  // src/mcp/clientFactory.ts
4413
4413
  function getRetryAfterDelayMs(err) {
@@ -5915,7 +5915,18 @@ function applySanitizers(value, sanitizers) {
5915
5915
  continue;
5916
5916
  }
5917
5917
  if (isRegexSanitizer(sanitizer)) {
5918
- const pattern = sanitizer.pattern instanceof RegExp ? sanitizer.pattern : new RegExp(sanitizer.pattern, "g");
5918
+ let pattern;
5919
+ if (sanitizer.pattern instanceof RegExp) {
5920
+ pattern = sanitizer.pattern;
5921
+ } else {
5922
+ try {
5923
+ pattern = new RegExp(sanitizer.pattern, "g");
5924
+ } catch {
5925
+ throw new Error(
5926
+ `toMatchToolSnapshot: invalid regex pattern "${sanitizer.pattern}" in sanitizer`
5927
+ );
5928
+ }
5929
+ }
5919
5930
  const replacement = sanitizer.replacement ?? "[SANITIZED]";
5920
5931
  result = result.replace(pattern, replacement);
5921
5932
  continue;
@@ -7018,9 +7029,9 @@ async function runEvalDataset(options, context) {
7018
7029
  const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
7019
7030
  if (evalCase.mode === "llm_host") {
7020
7031
  const effectiveIterations = withIterations.iterations ?? 1;
7021
- if (effectiveIterations < 10) {
7032
+ if (effectiveIterations > 1 && effectiveIterations < 10) {
7022
7033
  console.warn(
7023
- `[mcp-server-tester] Eval case "${evalCase.id}" uses llm_host mode with only ${effectiveIterations} iteration(s). The evals guide recommends >= 10 iterations. See docs/evals-guide.md for guidance on statistical reliability.`
7034
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
7024
7035
  );
7025
7036
  }
7026
7037
  }
@@ -7068,6 +7079,16 @@ async function runEvalDataset(options, context) {
7068
7079
  const baseline = await loadBaseline(baselineResultsFrom);
7069
7080
  const baselinePassRate = baseline.total > 0 ? baseline.passed / baseline.total : 0;
7070
7081
  const baselineMap = buildBaselinePassMap(baseline);
7082
+ const currentCaseIds = result.caseResults.map((cr) => cr.id);
7083
+ const unmatchedCount = currentCaseIds.filter(
7084
+ (id) => !baselineMap.has(id)
7085
+ ).length;
7086
+ const unmatchedRatio = currentCaseIds.length > 0 ? unmatchedCount / currentCaseIds.length : 0;
7087
+ if (unmatchedRatio > 0.2) {
7088
+ console.warn(
7089
+ `[mcp-server-tester] Baseline comparison: ${unmatchedCount} of ${currentCaseIds.length} cases (${Math.round(unmatchedRatio * 100)}%) have no baseline entry. This may indicate the dataset structure has changed. Results for unmatched cases cannot be compared.`
7090
+ );
7091
+ }
7071
7092
  for (const cr of result.caseResults) {
7072
7093
  const baselinePass = baselineMap.get(cr.id);
7073
7094
  if (baselinePass !== void 0) {