@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  import { expect as expect$1, test as test$1 } from '@playwright/test';
2
- import { query } from '@anthropic-ai/claude-agent-sdk';
3
2
  import { z } from 'zod';
3
+ import { query } from '@anthropic-ai/claude-agent-sdk';
4
4
  import { Client } from '@modelcontextprotocol/sdk/client/index.js';
5
5
  import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
6
6
  import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';
@@ -237,11 +237,13 @@ function validateSchema(response, schema, options = {}) {
237
237
  } catch (error) {
238
238
  const zodError = error;
239
239
  const issues = formatZodIssues(zodError);
240
+ const text = stringifyResponse(response);
240
241
  return {
241
242
  pass: false,
242
243
  message: `Response does not match schema: ${issues}`,
243
244
  details: {
244
- issues: zodError.issues
245
+ issues: zodError.issues,
246
+ textPreview: truncateForDisplay2(text)
245
247
  }
246
248
  };
247
249
  }
@@ -294,16 +296,29 @@ function formatZodIssues(error) {
294
296
  });
295
297
  return issues.join("; ");
296
298
  }
299
+ function truncateForDisplay2(str, maxLength = 200) {
300
+ if (str.length <= maxLength) {
301
+ return str;
302
+ }
303
+ return str.slice(0, maxLength) + "... (truncated)";
304
+ }
297
305
 
298
306
  // src/assertions/matchers/toMatchToolSchema.ts
299
307
  function toMatchToolSchema(received, schema, options = {}) {
300
308
  const result = validateSchema(received, schema, options);
309
+ const preview = result.details?.textPreview;
301
310
  return {
302
311
  pass: result.pass,
303
312
  message: () => {
304
313
  if (this.isNot) {
305
314
  return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
306
315
  }
316
+ if (!result.pass && preview) {
317
+ return `${result.message}
318
+
319
+ Actual response (truncated):
320
+ ${preview}`;
321
+ }
307
322
  return result.message;
308
323
  }
309
324
  };
@@ -334,11 +349,11 @@ function validateText(response, expected, options = {}) {
334
349
  details: {
335
350
  missing,
336
351
  textLength: text.length,
337
- textPreview: truncateForDisplay2(text)
352
+ textPreview: truncateForDisplay3(text)
338
353
  }
339
354
  };
340
355
  }
341
- function truncateForDisplay2(str, maxLength = 200) {
356
+ function truncateForDisplay3(str, maxLength = 200) {
342
357
  if (str.length <= maxLength) {
343
358
  return str;
344
359
  }
@@ -348,6 +363,7 @@ function truncateForDisplay2(str, maxLength = 200) {
348
363
  // src/assertions/matchers/toContainToolText.ts
349
364
  function toContainToolText(received, expected, options = {}) {
350
365
  const result = validateText(received, expected, options);
366
+ const preview = result.details?.textPreview;
351
367
  return {
352
368
  pass: result.pass,
353
369
  message: () => {
@@ -355,6 +371,12 @@ function toContainToolText(received, expected, options = {}) {
355
371
  const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
356
372
  return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
357
373
  }
374
+ if (!result.pass && preview) {
375
+ return `${result.message}
376
+
377
+ Actual response (truncated):
378
+ ${preview}`;
379
+ }
358
380
  return result.message;
359
381
  }
360
382
  };
@@ -385,7 +407,7 @@ function validatePattern(response, patterns, options = {}) {
385
407
  details: {
386
408
  unmatched,
387
409
  textLength: text.length,
388
- textPreview: truncateForDisplay3(text)
410
+ textPreview: truncateForDisplay4(text)
389
411
  }
390
412
  };
391
413
  }
@@ -405,7 +427,7 @@ function patternToString(pattern) {
405
427
  }
406
428
  return `/${pattern}/`;
407
429
  }
408
- function truncateForDisplay3(str, maxLength = 200) {
430
+ function truncateForDisplay4(str, maxLength = 200) {
409
431
  if (str.length <= maxLength) {
410
432
  return str;
411
433
  }
@@ -415,12 +437,19 @@ function truncateForDisplay3(str, maxLength = 200) {
415
437
  // src/assertions/matchers/toMatchToolPattern.ts
416
438
  function toMatchToolPattern(received, patterns, options = {}) {
417
439
  const result = validatePattern(received, patterns, options);
440
+ const preview = result.details?.textPreview;
418
441
  return {
419
442
  pass: result.pass,
420
443
  message: () => {
421
444
  if (this.isNot) {
422
445
  return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
423
446
  }
447
+ if (!result.pass && preview) {
448
+ return `${result.message}
449
+
450
+ Actual response (truncated):
451
+ ${preview}`;
452
+ }
424
453
  return result.message;
425
454
  }
426
455
  };
@@ -567,7 +596,7 @@ function validateError(response, expected = true) {
567
596
  pass: false,
568
597
  message: "Expected an error response but got success",
569
598
  details: {
570
- textPreview: truncateForDisplay4(extractText2(response))
599
+ textPreview: truncateForDisplay5(extractText2(response))
571
600
  }
572
601
  };
573
602
  } else {
@@ -579,7 +608,7 @@ function validateError(response, expected = true) {
579
608
  }
580
609
  return {
581
610
  pass: false,
582
- message: `Expected a success response but got error: "${truncateForDisplay4(errorMessage)}"`,
611
+ message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
583
612
  details: {
584
613
  errorMessage
585
614
  }
@@ -592,7 +621,7 @@ function validateError(response, expected = true) {
592
621
  pass: false,
593
622
  message: `Expected an error containing "${expectedMessages[0]}" but got success`,
594
623
  details: {
595
- textPreview: truncateForDisplay4(extractText2(response))
624
+ textPreview: truncateForDisplay5(extractText2(response))
596
625
  }
597
626
  };
598
627
  }
@@ -614,7 +643,7 @@ function validateError(response, expected = true) {
614
643
  }
615
644
  };
616
645
  }
617
- function truncateForDisplay4(str, maxLength = 200) {
646
+ function truncateForDisplay5(str, maxLength = 200) {
618
647
  if (str.length <= maxLength) {
619
648
  return str;
620
649
  }
@@ -662,7 +691,175 @@ var JudgeResponseSchema = z.object({
662
691
  reasoning: z.string()
663
692
  });
664
693
 
665
- // src/judge/claudeAgentJudge.ts
694
+ // src/judge/anthropicJudge.ts
695
+ function createAnthropicJudge(config = {}) {
696
+ const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
697
+ const apiKey = process.env[apiKeyEnvVar];
698
+ if (!apiKey) {
699
+ throw new Error(
700
+ `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
701
+ );
702
+ }
703
+ const model = config.model ?? "claude-sonnet-4-20250514";
704
+ const maxTokens = config.maxTokens ?? 1e3;
705
+ const temperature = config.temperature ?? 0;
706
+ return {
707
+ async evaluate(candidate, reference, rubric) {
708
+ let anthropicModule;
709
+ try {
710
+ anthropicModule = await import('@anthropic-ai/sdk');
711
+ } catch (err) {
712
+ throw new Error(
713
+ `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
714
+ Original error: ${err instanceof Error ? err.message : String(err)}`
715
+ );
716
+ }
717
+ const client = new anthropicModule.default({ apiKey });
718
+ const prompt = buildJudgePrompt(candidate, reference, rubric);
719
+ const startTime = Date.now();
720
+ const response = await client.messages.create({
721
+ model,
722
+ max_tokens: maxTokens,
723
+ temperature,
724
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
725
+ messages: [{ role: "user", content: prompt }]
726
+ });
727
+ const durationMs = Date.now() - startTime;
728
+ const textBlock = response.content.find(
729
+ (b) => b.type === "text"
730
+ );
731
+ const text = textBlock?.text ?? "";
732
+ const parsed = parseJudgeResponse(text);
733
+ return {
734
+ pass: parsed.pass,
735
+ score: parsed.score,
736
+ reasoning: parsed.reasoning,
737
+ usage: {
738
+ inputTokens: response.usage?.input_tokens ?? 0,
739
+ outputTokens: response.usage?.output_tokens ?? 0,
740
+ totalCostUsd: 0,
741
+ durationMs
742
+ }
743
+ };
744
+ }
745
+ };
746
+ }
747
+ function buildJudgePrompt(candidate, reference, rubric) {
748
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
749
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
750
+ return `Rubric:
751
+ ${rubric}
752
+
753
+ <candidate_response>
754
+ ${candidateStr}
755
+ </candidate_response>
756
+
757
+ <reference_answer>
758
+ ${referenceStr ?? "No reference provided."}
759
+ </reference_answer>
760
+
761
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
762
+ }
763
+ function parseJudgeResponse(text) {
764
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
765
+ let parsed;
766
+ try {
767
+ parsed = JSON.parse(cleaned);
768
+ } catch {
769
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
770
+ }
771
+ const result = JudgeResponseSchema.safeParse(parsed);
772
+ if (!result.success) {
773
+ throw new Error(
774
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
775
+ Validation errors: ${JSON.stringify(result.error.issues)}`
776
+ );
777
+ }
778
+ return result.data;
779
+ }
780
+
781
+ // src/judge/vertexAnthropicJudge.ts
782
+ function createVertexAnthropicJudge(config = {}) {
783
+ const model = config.model ?? "claude-sonnet-4-20250514";
784
+ const maxTokens = config.maxTokens ?? 1e3;
785
+ const temperature = config.temperature ?? 0;
786
+ return {
787
+ async evaluate(candidate, reference, rubric) {
788
+ let vertexModule;
789
+ try {
790
+ vertexModule = await import('@anthropic-ai/vertex-sdk');
791
+ } catch (err) {
792
+ throw new Error(
793
+ `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
794
+ Original error: ${err instanceof Error ? err.message : String(err)}`
795
+ );
796
+ }
797
+ const client = new vertexModule.AnthropicVertex({
798
+ projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
799
+ region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
800
+ });
801
+ const prompt = buildJudgePrompt2(candidate, reference, rubric);
802
+ const startTime = Date.now();
803
+ const response = await client.messages.create({
804
+ model,
805
+ max_tokens: maxTokens,
806
+ temperature,
807
+ system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
808
+ messages: [{ role: "user", content: prompt }]
809
+ });
810
+ const durationMs = Date.now() - startTime;
811
+ const textBlock = response.content.find(
812
+ (b) => b.type === "text"
813
+ );
814
+ const text = textBlock?.text ?? "";
815
+ const parsed = parseJudgeResponse2(text);
816
+ return {
817
+ pass: parsed.pass,
818
+ score: parsed.score,
819
+ reasoning: parsed.reasoning,
820
+ usage: {
821
+ inputTokens: response.usage?.input_tokens ?? 0,
822
+ outputTokens: response.usage?.output_tokens ?? 0,
823
+ totalCostUsd: 0,
824
+ durationMs
825
+ }
826
+ };
827
+ }
828
+ };
829
+ }
830
+ function buildJudgePrompt2(candidate, reference, rubric) {
831
+ const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
832
+ const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
833
+ return `Rubric:
834
+ ${rubric}
835
+
836
+ <candidate_response>
837
+ ${candidateStr}
838
+ </candidate_response>
839
+
840
+ <reference_answer>
841
+ ${referenceStr ?? "No reference provided."}
842
+ </reference_answer>
843
+
844
+ Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
845
+ }
846
+ function parseJudgeResponse2(text) {
847
+ const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
848
+ let parsed;
849
+ try {
850
+ parsed = JSON.parse(cleaned);
851
+ } catch {
852
+ throw new Error(`Failed to parse judge response as JSON: ${text}`);
853
+ }
854
+ const result = JudgeResponseSchema.safeParse(parsed);
855
+ if (!result.success) {
856
+ throw new Error(
857
+ `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
858
+ Validation errors: ${JSON.stringify(result.error.issues)}`
859
+ );
860
+ }
861
+ return result.data;
862
+ }
666
863
  function createClaudeAgentJudge(config) {
667
864
  const model = config.model ?? "claude-sonnet-4-20250514";
668
865
  const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
@@ -680,7 +877,7 @@ function createClaudeAgentJudge(config) {
680
877
  exceedsMaxToolOutputSize: true
681
878
  };
682
879
  }
683
- const prompt = buildJudgePrompt(candidate, reference, rubric);
880
+ const prompt = buildJudgePrompt3(candidate, reference, rubric);
684
881
  try {
685
882
  let resultMessage;
686
883
  for await (const message of query({
@@ -712,7 +909,7 @@ function createClaudeAgentJudge(config) {
712
909
  );
713
910
  }
714
911
  const responseText = resultMessage.result ?? "";
715
- const parsed = parseJudgeResponse(responseText);
912
+ const parsed = parseJudgeResponse3(responseText);
716
913
  const usage = {
717
914
  inputTokens: resultMessage.usage?.input_tokens ?? 0,
718
915
  outputTokens: resultMessage.usage?.output_tokens ?? 0,
@@ -741,7 +938,7 @@ function createClaudeAgentJudge(config) {
741
938
  function buildSystemPrompt() {
742
939
  return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
743
940
  }
744
- function buildJudgePrompt(candidate, reference, rubric) {
941
+ function buildJudgePrompt3(candidate, reference, rubric) {
745
942
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
746
943
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
747
944
  const parts = [];
@@ -758,7 +955,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
758
955
  );
759
956
  return parts.join("");
760
957
  }
761
- function parseJudgeResponse(text) {
958
+ function parseJudgeResponse3(text) {
762
959
  let jsonText = text.trim();
763
960
  if (jsonText.startsWith("```json")) {
764
961
  jsonText = jsonText.slice(7);
@@ -815,7 +1012,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
815
1012
  );
816
1013
  }
817
1014
  const client = new openaiModule.default({ apiKey });
818
- const prompt = buildJudgePrompt2(candidate, reference, rubric);
1015
+ const prompt = buildJudgePrompt4(candidate, reference, rubric);
819
1016
  const startTime = Date.now();
820
1017
  const completion = await client.chat.completions.create({
821
1018
  model,
@@ -831,7 +1028,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
831
1028
  });
832
1029
  const durationMs = Date.now() - startTime;
833
1030
  const text = completion.choices[0]?.message.content ?? "";
834
- const parsed = parseJudgeResponse2(text);
1031
+ const parsed = parseJudgeResponse4(text);
835
1032
  return {
836
1033
  pass: parsed.pass,
837
1034
  score: parsed.score,
@@ -846,7 +1043,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
846
1043
  }
847
1044
  };
848
1045
  }
849
- function buildJudgePrompt2(candidate, reference, rubric) {
1046
+ function buildJudgePrompt4(candidate, reference, rubric) {
850
1047
  const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
851
1048
  const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
852
1049
  return `Rubric:
@@ -862,7 +1059,7 @@ ${referenceStr ?? "No reference provided."}
862
1059
 
863
1060
  Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
864
1061
  }
865
- function parseJudgeResponse2(text) {
1062
+ function parseJudgeResponse4(text) {
866
1063
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
867
1064
  let parsed;
868
1065
  try {
@@ -964,14 +1161,33 @@ function createJudge(config = {}) {
964
1161
  const provider = config.provider ?? "anthropic";
965
1162
  switch (provider) {
966
1163
  case "anthropic":
1164
+ return createAnthropicJudge(config);
1165
+ case "vertex-anthropic":
1166
+ return createVertexAnthropicJudge(config);
1167
+ case "anthropic-agent-sdk":
967
1168
  return createClaudeAgentJudge(config);
968
1169
  case "openai":
969
1170
  return createOpenAIJudge(config);
970
1171
  case "google":
971
1172
  return createGoogleJudge(config);
972
1173
  default:
973
- throw new Error(`Unsupported LLM provider: ${String(provider)}`);
1174
+ throw new Error(
1175
+ `Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
1176
+ );
1177
+ }
1178
+ }
1179
+
1180
+ // src/judge/judgeRegistry.ts
1181
+ var registry = /* @__PURE__ */ new Map();
1182
+ function getRegisteredJudge(name) {
1183
+ const executor = registry.get(name);
1184
+ if (!executor) {
1185
+ const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
1186
+ throw new Error(
1187
+ `Judge "${name}" is not registered.${available} Register it with registerJudge() before tests run.`
1188
+ );
974
1189
  }
1190
+ return executor;
975
1191
  }
976
1192
 
977
1193
  // src/assertions/validators/judge.ts
@@ -982,6 +1198,7 @@ function computeStdDev(scores, mean) {
982
1198
  }
983
1199
  async function validateJudge(response, config) {
984
1200
  const {
1201
+ judge: judgeName,
985
1202
  rubric,
986
1203
  reference,
987
1204
  threshold = 0.7,
@@ -994,6 +1211,29 @@ async function validateJudge(response, config) {
994
1211
  maxBudgetUsd,
995
1212
  maxToolOutputSize
996
1213
  } = config;
1214
+ if (judgeName !== void 0) {
1215
+ try {
1216
+ const executor = getRegisteredJudge(judgeName);
1217
+ const judgeResult = await executor(response, reference ?? void 0);
1218
+ const score = judgeResult.score;
1219
+ const passed = score >= threshold;
1220
+ return {
1221
+ pass: passed,
1222
+ message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
1223
+ };
1224
+ } catch (err) {
1225
+ return {
1226
+ pass: false,
1227
+ message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
1228
+ };
1229
+ }
1230
+ }
1231
+ if (rubric === void 0) {
1232
+ return {
1233
+ pass: false,
1234
+ message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
1235
+ };
1236
+ }
997
1237
  const resolvedRubric = resolveRubric(rubric);
998
1238
  const judgeConfig = {
999
1239
  ...provider !== void 0 && { provider },
@@ -1040,11 +1280,17 @@ async function validateJudge(response, config) {
1040
1280
  return {
1041
1281
  pass: passed,
1042
1282
  message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
1043
- details: reps > 1 ? {
1044
- scores,
1045
- scoreStdDev: stdDev,
1046
- highVariance
1047
- } : void 0
1283
+ details: {
1284
+ score: meanScore,
1285
+ reasoning: lastReasoning,
1286
+ judgeProvider: provider ?? "anthropic",
1287
+ judgeModel: model,
1288
+ ...reps > 1 && {
1289
+ scores,
1290
+ scoreStdDev: stdDev,
1291
+ highVariance
1292
+ }
1293
+ }
1048
1294
  };
1049
1295
  } catch (err) {
1050
1296
  return {
@@ -1056,31 +1302,68 @@ async function validateJudge(response, config) {
1056
1302
 
1057
1303
  // src/assertions/matchers/toPassToolJudge.ts
1058
1304
  var DEFAULT_PASSING_THRESHOLD = 0.7;
1059
- async function toPassToolJudge(received, rubric, options = {}) {
1305
+ async function runSingleJudge(received, rubric, options) {
1060
1306
  const {
1061
1307
  reference = null,
1062
1308
  passingThreshold = DEFAULT_PASSING_THRESHOLD,
1063
1309
  reps,
1064
1310
  provider,
1065
- model
1311
+ model,
1312
+ judge
1066
1313
  } = options;
1067
1314
  const validation = await validateJudge(received, {
1068
- rubric,
1315
+ ...rubric !== void 0 && { rubric },
1069
1316
  reference: reference ?? void 0,
1070
1317
  threshold: passingThreshold,
1071
1318
  ...reps !== void 0 && { reps },
1072
1319
  ...provider !== void 0 && { provider },
1073
- ...model !== void 0 && { model }
1320
+ ...model !== void 0 && { model },
1321
+ ...judge !== void 0 && { judge }
1074
1322
  });
1323
+ return { pass: validation.pass, message: validation.message };
1324
+ }
1325
+ async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
1326
+ if (Array.isArray(rubricOrOptions)) {
1327
+ const results = await Promise.all(
1328
+ rubricOrOptions.map(async (judgeConfig) => {
1329
+ const { rubric: r, ...opts } = judgeConfig;
1330
+ return runSingleJudge(received, r, opts);
1331
+ })
1332
+ );
1333
+ const allPassed = results.every((r) => r.pass);
1334
+ const passCount = results.filter((r) => r.pass).length;
1335
+ const summary = `${passCount}/${results.length} judges passed`;
1336
+ const details = results.map((r) => r.message).join("\n");
1337
+ if (this.isNot) {
1338
+ return {
1339
+ pass: !allPassed,
1340
+ message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
1341
+ };
1342
+ }
1343
+ return {
1344
+ pass: allPassed,
1345
+ message: () => `${summary}
1346
+ ${details}`
1347
+ };
1348
+ }
1349
+ let rubric;
1350
+ let options;
1351
+ if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
1352
+ rubric = rubricOrOptions;
1353
+ options = maybeOptions ?? {};
1354
+ } else {
1355
+ options = rubricOrOptions;
1356
+ }
1357
+ const result = await runSingleJudge(received, rubric, options);
1075
1358
  if (this.isNot) {
1076
1359
  return {
1077
- pass: !validation.pass,
1078
- message: () => validation.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
1360
+ pass: !result.pass,
1361
+ message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
1079
1362
  };
1080
1363
  }
1081
1364
  return {
1082
- pass: validation.pass,
1083
- message: () => validation.message
1365
+ pass: result.pass,
1366
+ message: () => result.message
1084
1367
  };
1085
1368
  }
1086
1369
 
@@ -1188,9 +1471,17 @@ async function toSatisfyToolPredicate(received, predicate, description) {
1188
1471
  function isSimulationResult(value) {
1189
1472
  return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
1190
1473
  }
1474
+ function isPatternMatcher(v) {
1475
+ return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
1476
+ }
1191
1477
  function partialMatch(actual, expected) {
1192
1478
  return Object.entries(expected).every(([k, v]) => {
1193
1479
  const actualVal = actual[k];
1480
+ if (isPatternMatcher(v)) {
1481
+ if (typeof actualVal !== "string") return false;
1482
+ const re = new RegExp(v.$pattern, v.$flags);
1483
+ return re.test(actualVal);
1484
+ }
1194
1485
  if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
1195
1486
  return partialMatch(
1196
1487
  actualVal,
@@ -1237,6 +1528,10 @@ function validateToolCalls(response, expectation) {
1237
1528
  return {
1238
1529
  pass: false,
1239
1530
  message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
1531
+ details: {
1532
+ actual: actual.map((c) => c.name),
1533
+ expected: expected.name
1534
+ },
1240
1535
  metrics
1241
1536
  };
1242
1537
  }
@@ -1253,6 +1548,10 @@ function validateToolCalls(response, expectation) {
1253
1548
  return {
1254
1549
  pass: false,
1255
1550
  message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
1551
+ details: {
1552
+ actual: actual.map((c) => c.name),
1553
+ expected: expected.name
1554
+ },
1256
1555
  metrics
1257
1556
  };
1258
1557
  }
@@ -1265,6 +1564,10 @@ function validateToolCalls(response, expectation) {
1265
1564
  return {
1266
1565
  pass: false,
1267
1566
  message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
1567
+ details: {
1568
+ actual: actual.map((c) => c.name),
1569
+ unexpected: unexpected.map((c) => c.name)
1570
+ },
1268
1571
  metrics
1269
1572
  };
1270
1573
  }
@@ -1283,19 +1586,22 @@ function validateToolCallCount(response, options) {
1283
1586
  if (exact !== void 0 && count !== exact) {
1284
1587
  return {
1285
1588
  pass: false,
1286
- message: `Expected exactly ${exact} tool call(s), but got ${count}`
1589
+ message: `Expected exactly ${exact} tool call(s), but got ${count}`,
1590
+ details: { actual: count, expected: exact }
1287
1591
  };
1288
1592
  }
1289
1593
  if (min !== void 0 && count < min) {
1290
1594
  return {
1291
1595
  pass: false,
1292
- message: `Expected at least ${min} tool call(s), but got ${count}`
1596
+ message: `Expected at least ${min} tool call(s), but got ${count}`,
1597
+ details: { actual: count, min }
1293
1598
  };
1294
1599
  }
1295
1600
  if (max !== void 0 && count > max) {
1296
1601
  return {
1297
1602
  pass: false,
1298
- message: `Expected at most ${max} tool call(s), but got ${count}`
1603
+ message: `Expected at most ${max} tool call(s), but got ${count}`,
1604
+ details: { actual: count, max }
1299
1605
  };
1300
1606
  }
1301
1607
  return {
@@ -1434,7 +1740,7 @@ var debugHttp = createDebug(`${NAMESPACE}:http`);
1434
1740
 
1435
1741
  // package.json
1436
1742
  var package_default = {
1437
- version: "1.0.0-beta.7"};
1743
+ version: "1.0.0"};
1438
1744
  var debug = createDebug("mcp-server-tester:oauth-flow");
1439
1745
  async function generatePKCE() {
1440
1746
  const codeVerifier = oauth.generateRandomCodeVerifier();
@@ -1815,6 +2121,17 @@ async function createMCPClientForConfig(config, options) {
1815
2121
  }
1816
2122
  async function closeMCPClient(client) {
1817
2123
  try {
2124
+ const transport = client.transport;
2125
+ if (transport instanceof StreamableHTTPClientTransport) {
2126
+ try {
2127
+ await transport.terminateSession();
2128
+ } catch (sessionError) {
2129
+ debugClient(
2130
+ "Error terminating session: %s",
2131
+ sessionError instanceof Error ? sessionError.message : String(sessionError)
2132
+ );
2133
+ }
2134
+ }
1818
2135
  await client.close();
1819
2136
  } catch (error) {
1820
2137
  debugClient(