@gleanwork/mcp-server-tester 1.0.0-beta.1 → 1.0.0-beta.2

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4380,7 +4380,7 @@ function escapeHtml(text) {
4380
4380
 
4381
4381
  // package.json
4382
4382
  var package_default = {
4383
- version: "1.0.0-beta.1"};
4383
+ version: "1.0.0-beta.2"};
4384
4384
 
4385
4385
  // src/mcp/clientFactory.ts
4386
4386
  function getRetryAfterDelayMs(err) {
@@ -5888,7 +5888,18 @@ function applySanitizers(value, sanitizers) {
5888
5888
  continue;
5889
5889
  }
5890
5890
  if (isRegexSanitizer(sanitizer)) {
5891
- const pattern = sanitizer.pattern instanceof RegExp ? sanitizer.pattern : new RegExp(sanitizer.pattern, "g");
5891
+ let pattern;
5892
+ if (sanitizer.pattern instanceof RegExp) {
5893
+ pattern = sanitizer.pattern;
5894
+ } else {
5895
+ try {
5896
+ pattern = new RegExp(sanitizer.pattern, "g");
5897
+ } catch {
5898
+ throw new Error(
5899
+ `toMatchToolSnapshot: invalid regex pattern "${sanitizer.pattern}" in sanitizer`
5900
+ );
5901
+ }
5902
+ }
5892
5903
  const replacement = sanitizer.replacement ?? "[SANITIZED]";
5893
5904
  result = result.replace(pattern, replacement);
5894
5905
  continue;
@@ -6991,9 +7002,9 @@ async function runEvalDataset(options, context) {
6991
7002
  const withIterations = evalCase.mode === "llm_host" && evalCase.iterations === void 0 && defaultLlmIterations !== void 0 ? { ...evalCase, iterations: defaultLlmIterations } : evalCase;
6992
7003
  if (evalCase.mode === "llm_host") {
6993
7004
  const effectiveIterations = withIterations.iterations ?? 1;
6994
- if (effectiveIterations < 10) {
7005
+ if (effectiveIterations > 1 && effectiveIterations < 10) {
6995
7006
  console.warn(
6996
- `[mcp-server-tester] Eval case "${evalCase.id}" uses llm_host mode with only ${effectiveIterations} iteration(s). The evals guide recommends >= 10 iterations. See docs/evals-guide.md for guidance on statistical reliability.`
7007
+ `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in llm_host mode may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
6997
7008
  );
6998
7009
  }
6999
7010
  }
@@ -7041,6 +7052,16 @@ async function runEvalDataset(options, context) {
7041
7052
  const baseline = await loadBaseline(baselineResultsFrom);
7042
7053
  const baselinePassRate = baseline.total > 0 ? baseline.passed / baseline.total : 0;
7043
7054
  const baselineMap = buildBaselinePassMap(baseline);
7055
+ const currentCaseIds = result.caseResults.map((cr) => cr.id);
7056
+ const unmatchedCount = currentCaseIds.filter(
7057
+ (id) => !baselineMap.has(id)
7058
+ ).length;
7059
+ const unmatchedRatio = currentCaseIds.length > 0 ? unmatchedCount / currentCaseIds.length : 0;
7060
+ if (unmatchedRatio > 0.2) {
7061
+ console.warn(
7062
+ `[mcp-server-tester] Baseline comparison: ${unmatchedCount} of ${currentCaseIds.length} cases (${Math.round(unmatchedRatio * 100)}%) have no baseline entry. This may indicate the dataset structure has changed. Results for unmatched cases cannot be compared.`
7063
+ );
7064
+ }
7044
7065
  for (const cr of result.caseResults) {
7045
7066
  const baselinePass = baselineMap.get(cr.id);
7046
7067
  if (baselinePass !== void 0) {