@gleanwork/mcp-server-tester 1.0.0-beta.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +354 -37
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +721 -76
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +533 -116
- package/dist/index.d.ts +533 -116
- package/dist/index.js +719 -78
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +11 -6
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/dist/fixtures/mcp.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { expect as expect$1, test as test$1 } from '@playwright/test';
|
|
2
|
-
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
3
2
|
import { z } from 'zod';
|
|
3
|
+
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
4
4
|
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
5
5
|
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
|
|
6
6
|
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';
|
|
@@ -237,11 +237,13 @@ function validateSchema(response, schema, options = {}) {
|
|
|
237
237
|
} catch (error) {
|
|
238
238
|
const zodError = error;
|
|
239
239
|
const issues = formatZodIssues(zodError);
|
|
240
|
+
const text = stringifyResponse(response);
|
|
240
241
|
return {
|
|
241
242
|
pass: false,
|
|
242
243
|
message: `Response does not match schema: ${issues}`,
|
|
243
244
|
details: {
|
|
244
|
-
issues: zodError.issues
|
|
245
|
+
issues: zodError.issues,
|
|
246
|
+
textPreview: truncateForDisplay2(text)
|
|
245
247
|
}
|
|
246
248
|
};
|
|
247
249
|
}
|
|
@@ -294,16 +296,29 @@ function formatZodIssues(error) {
|
|
|
294
296
|
});
|
|
295
297
|
return issues.join("; ");
|
|
296
298
|
}
|
|
299
|
+
/**
 * Shorten a string so it can be embedded in an assertion failure message.
 *
 * Strings no longer than `maxLength` UTF-16 code units are returned unchanged.
 * Longer strings are cut and suffixed with "... (truncated)".
 *
 * @param {string} str - Text to shorten.
 * @param {number} [maxLength=200] - Maximum number of code units to keep.
 * @returns {string} The original string, or its truncated form with a marker.
 */
function truncateForDisplay2(str, maxLength = 200) {
  if (str.length <= maxLength) {
    return str;
  }
  let end = maxLength;
  // If the cut would land between the two halves of a surrogate pair, drop the
  // dangling high surrogate so the preview never contains a broken character.
  const lastUnit = str.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return str.slice(0, end) + "... (truncated)";
}
|
|
297
305
|
|
|
298
306
|
// src/assertions/matchers/toMatchToolSchema.ts
|
|
299
307
|
function toMatchToolSchema(received, schema, options = {}) {
|
|
300
308
|
const result = validateSchema(received, schema, options);
|
|
309
|
+
const preview = result.details?.textPreview;
|
|
301
310
|
return {
|
|
302
311
|
pass: result.pass,
|
|
303
312
|
message: () => {
|
|
304
313
|
if (this.isNot) {
|
|
305
314
|
return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
|
|
306
315
|
}
|
|
316
|
+
if (!result.pass && preview) {
|
|
317
|
+
return `${result.message}
|
|
318
|
+
|
|
319
|
+
Actual response (truncated):
|
|
320
|
+
${preview}`;
|
|
321
|
+
}
|
|
307
322
|
return result.message;
|
|
308
323
|
}
|
|
309
324
|
};
|
|
@@ -334,11 +349,11 @@ function validateText(response, expected, options = {}) {
|
|
|
334
349
|
details: {
|
|
335
350
|
missing,
|
|
336
351
|
textLength: text.length,
|
|
337
|
-
textPreview:
|
|
352
|
+
textPreview: truncateForDisplay3(text)
|
|
338
353
|
}
|
|
339
354
|
};
|
|
340
355
|
}
|
|
341
|
-
function
|
|
356
|
+
function truncateForDisplay3(str, maxLength = 200) {
|
|
342
357
|
if (str.length <= maxLength) {
|
|
343
358
|
return str;
|
|
344
359
|
}
|
|
@@ -348,6 +363,7 @@ function truncateForDisplay2(str, maxLength = 200) {
|
|
|
348
363
|
// src/assertions/matchers/toContainToolText.ts
|
|
349
364
|
function toContainToolText(received, expected, options = {}) {
|
|
350
365
|
const result = validateText(received, expected, options);
|
|
366
|
+
const preview = result.details?.textPreview;
|
|
351
367
|
return {
|
|
352
368
|
pass: result.pass,
|
|
353
369
|
message: () => {
|
|
@@ -355,6 +371,12 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
355
371
|
const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
|
|
356
372
|
return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
|
|
357
373
|
}
|
|
374
|
+
if (!result.pass && preview) {
|
|
375
|
+
return `${result.message}
|
|
376
|
+
|
|
377
|
+
Actual response (truncated):
|
|
378
|
+
${preview}`;
|
|
379
|
+
}
|
|
358
380
|
return result.message;
|
|
359
381
|
}
|
|
360
382
|
};
|
|
@@ -385,7 +407,7 @@ function validatePattern(response, patterns, options = {}) {
|
|
|
385
407
|
details: {
|
|
386
408
|
unmatched,
|
|
387
409
|
textLength: text.length,
|
|
388
|
-
textPreview:
|
|
410
|
+
textPreview: truncateForDisplay4(text)
|
|
389
411
|
}
|
|
390
412
|
};
|
|
391
413
|
}
|
|
@@ -405,7 +427,7 @@ function patternToString(pattern) {
|
|
|
405
427
|
}
|
|
406
428
|
return `/${pattern}/`;
|
|
407
429
|
}
|
|
408
|
-
function
|
|
430
|
+
function truncateForDisplay4(str, maxLength = 200) {
|
|
409
431
|
if (str.length <= maxLength) {
|
|
410
432
|
return str;
|
|
411
433
|
}
|
|
@@ -415,12 +437,19 @@ function truncateForDisplay3(str, maxLength = 200) {
|
|
|
415
437
|
// src/assertions/matchers/toMatchToolPattern.ts
|
|
416
438
|
function toMatchToolPattern(received, patterns, options = {}) {
|
|
417
439
|
const result = validatePattern(received, patterns, options);
|
|
440
|
+
const preview = result.details?.textPreview;
|
|
418
441
|
return {
|
|
419
442
|
pass: result.pass,
|
|
420
443
|
message: () => {
|
|
421
444
|
if (this.isNot) {
|
|
422
445
|
return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
|
|
423
446
|
}
|
|
447
|
+
if (!result.pass && preview) {
|
|
448
|
+
return `${result.message}
|
|
449
|
+
|
|
450
|
+
Actual response (truncated):
|
|
451
|
+
${preview}`;
|
|
452
|
+
}
|
|
424
453
|
return result.message;
|
|
425
454
|
}
|
|
426
455
|
};
|
|
@@ -567,7 +596,7 @@ function validateError(response, expected = true) {
|
|
|
567
596
|
pass: false,
|
|
568
597
|
message: "Expected an error response but got success",
|
|
569
598
|
details: {
|
|
570
|
-
textPreview:
|
|
599
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
571
600
|
}
|
|
572
601
|
};
|
|
573
602
|
} else {
|
|
@@ -579,7 +608,7 @@ function validateError(response, expected = true) {
|
|
|
579
608
|
}
|
|
580
609
|
return {
|
|
581
610
|
pass: false,
|
|
582
|
-
message: `Expected a success response but got error: "${
|
|
611
|
+
message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
|
|
583
612
|
details: {
|
|
584
613
|
errorMessage
|
|
585
614
|
}
|
|
@@ -592,7 +621,7 @@ function validateError(response, expected = true) {
|
|
|
592
621
|
pass: false,
|
|
593
622
|
message: `Expected an error containing "${expectedMessages[0]}" but got success`,
|
|
594
623
|
details: {
|
|
595
|
-
textPreview:
|
|
624
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
596
625
|
}
|
|
597
626
|
};
|
|
598
627
|
}
|
|
@@ -614,7 +643,7 @@ function validateError(response, expected = true) {
|
|
|
614
643
|
}
|
|
615
644
|
};
|
|
616
645
|
}
|
|
617
|
-
function
|
|
646
|
+
function truncateForDisplay5(str, maxLength = 200) {
|
|
618
647
|
if (str.length <= maxLength) {
|
|
619
648
|
return str;
|
|
620
649
|
}
|
|
@@ -662,7 +691,175 @@ var JudgeResponseSchema = z.object({
|
|
|
662
691
|
reasoning: z.string()
|
|
663
692
|
});
|
|
664
693
|
|
|
665
|
-
// src/judge/
|
|
694
|
+
// src/judge/anthropicJudge.ts
|
|
695
|
+
/**
 * Create a judge that evaluates responses through the Anthropic Messages API.
 *
 * The API key is read from the environment variable named by
 * `config.apiKeyEnvVar` (default "ANTHROPIC_API_KEY") when the judge is
 * created. The `@anthropic-ai/sdk` package is imported lazily on the first
 * `evaluate` call.
 *
 * @param {object} [config]
 * @param {string} [config.apiKeyEnvVar] - Name of the env var holding the API key.
 * @param {string} [config.model] - Model id; defaults to "claude-sonnet-4-20250514".
 * @param {number} [config.maxTokens] - Response token cap; defaults to 1000.
 * @param {number} [config.temperature] - Sampling temperature; defaults to 0.
 * @returns {{ evaluate: Function }} Judge object with an async `evaluate(candidate, reference, rubric)`.
 * @throws {Error} If the API key environment variable is not set.
 */
function createAnthropicJudge(config = {}) {
  const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
  const apiKey = process.env[apiKeyEnvVar];
  if (!apiKey) {
    throw new Error(
      `Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
    );
  }
  const model = config.model ?? "claude-sonnet-4-20250514";
  const maxTokens = config.maxTokens ?? 1e3;
  const temperature = config.temperature ?? 0;
  return {
    async evaluate(candidate, reference, rubric) {
      let anthropicModule;
      try {
        anthropicModule = await import('@anthropic-ai/sdk');
      } catch (err) {
        // Keep the original failure attached so the real import error and its
        // stack remain inspectable via `error.cause`.
        throw new Error(
          `Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
Original error: ${err instanceof Error ? err.message : String(err)}`,
          { cause: err }
        );
      }
      const client = new anthropicModule.default({ apiKey });
      const prompt = buildJudgePrompt(candidate, reference, rubric);
      const startTime = Date.now();
      const response = await client.messages.create({
        model,
        max_tokens: maxTokens,
        temperature,
        system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
        messages: [{ role: "user", content: prompt }]
      });
      const durationMs = Date.now() - startTime;
      // Only the first text block is used; an absent block becomes "" and is
      // rejected by the parser below.
      const textBlock = response.content.find(
        (b) => b.type === "text"
      );
      const text = textBlock?.text ?? "";
      const parsed = parseJudgeResponse(text);
      return {
        pass: parsed.pass,
        score: parsed.score,
        reasoning: parsed.reasoning,
        usage: {
          inputTokens: response.usage?.input_tokens ?? 0,
          outputTokens: response.usage?.output_tokens ?? 0,
          // Cost is not computed here; it is always reported as 0.
          totalCostUsd: 0,
          durationMs
        }
      };
    }
  };
}
|
|
747
|
+
/**
 * Assemble the user prompt sent to the judge model.
 *
 * Non-string values are rendered as 2-space-indented JSON. When no reference
 * is supplied (null/undefined), the reference section reads
 * "No reference provided.".
 *
 * @param {unknown} candidate - Response under evaluation.
 * @param {unknown} reference - Optional reference answer.
 * @param {string} rubric - Evaluation criteria.
 * @returns {string} The full prompt text.
 */
function buildJudgePrompt(candidate, reference, rubric) {
  const render = (value) => {
    if (typeof value === "string") {
      return value;
    }
    return JSON.stringify(value, null, 2);
  };
  const candidateStr = render(candidate);
  let referenceStr = null;
  if (reference !== null && reference !== void 0) {
    referenceStr = render(reference);
  }
  const sections = [
    "Rubric:",
    `${rubric}`,
    "",
    "<candidate_response>",
    `${candidateStr}`,
    "</candidate_response>",
    "",
    "<reference_answer>",
    `${referenceStr ?? "No reference provided."}`,
    "</reference_answer>",
    "",
    'Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}'
  ];
  return sections.join("\n");
}
|
|
763
|
+
/**
 * Parse and validate the judge model's reply.
 *
 * Markdown code fences are stripped before parsing. If the remaining text is
 * not valid JSON, the span from the first "{" to the last "}" is tried once
 * more, so a reply that wraps the JSON object in prose is still accepted.
 * The parsed value is then validated against `JudgeResponseSchema`.
 *
 * @param {string} text - Raw text returned by the model.
 * @returns {object} The validated data produced by `JudgeResponseSchema`.
 * @throws {Error} If no JSON can be parsed (the original SyntaxError is
 *   attached as `cause`) or the parsed value fails schema validation.
 */
function parseJudgeResponse(text) {
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
  let parsed;
  try {
    parsed = JSON.parse(cleaned);
  } catch (parseError) {
    const start = cleaned.indexOf("{");
    const end = cleaned.lastIndexOf("}");
    if (start === -1 || end <= start) {
      throw new Error(`Failed to parse judge response as JSON: ${text}`, { cause: parseError });
    }
    try {
      // Second chance: the model added commentary around the JSON object.
      parsed = JSON.parse(cleaned.slice(start, end + 1));
    } catch {
      throw new Error(`Failed to parse judge response as JSON: ${text}`, { cause: parseError });
    }
  }
  const result = JudgeResponseSchema.safeParse(parsed);
  if (!result.success) {
    throw new Error(
      `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
Validation errors: ${JSON.stringify(result.error.issues)}`
    );
  }
  return result.data;
}
|
|
780
|
+
|
|
781
|
+
// src/judge/vertexAnthropicJudge.ts
|
|
782
|
+
/**
 * Create a judge that evaluates responses using Anthropic models on Vertex AI.
 *
 * The `@anthropic-ai/vertex-sdk` package is imported lazily on the first
 * `evaluate` call. The project id is read from GOOGLE_VERTEX_PROJECT (falling
 * back to CLOUD_ML_PROJECT_ID) and the region from GOOGLE_VERTEX_LOCATION
 * (default "us-east5").
 *
 * @param {object} [config]
 * @param {string} [config.model] - Model id; defaults to "claude-sonnet-4-20250514".
 * @param {number} [config.maxTokens] - Response token cap; defaults to 1000.
 * @param {number} [config.temperature] - Sampling temperature; defaults to 0.
 * @returns {{ evaluate: Function }} Judge object with an async `evaluate(candidate, reference, rubric)`.
 */
function createVertexAnthropicJudge(config = {}) {
  const model = config.model ?? "claude-sonnet-4-20250514";
  const maxTokens = config.maxTokens ?? 1e3;
  const temperature = config.temperature ?? 0;
  return {
    async evaluate(candidate, reference, rubric) {
      let vertexModule;
      try {
        vertexModule = await import('@anthropic-ai/vertex-sdk');
      } catch (err) {
        // Keep the original failure attached so the real import error and its
        // stack remain inspectable via `error.cause`.
        throw new Error(
          `Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
Original error: ${err instanceof Error ? err.message : String(err)}`,
          { cause: err }
        );
      }
      const client = new vertexModule.AnthropicVertex({
        projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
        region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
      });
      const prompt = buildJudgePrompt2(candidate, reference, rubric);
      const startTime = Date.now();
      const response = await client.messages.create({
        model,
        max_tokens: maxTokens,
        temperature,
        system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
        messages: [{ role: "user", content: prompt }]
      });
      const durationMs = Date.now() - startTime;
      // Only the first text block is used; an absent block becomes "" and is
      // rejected by the parser below.
      const textBlock = response.content.find(
        (b) => b.type === "text"
      );
      const text = textBlock?.text ?? "";
      const parsed = parseJudgeResponse2(text);
      return {
        pass: parsed.pass,
        score: parsed.score,
        reasoning: parsed.reasoning,
        usage: {
          inputTokens: response.usage?.input_tokens ?? 0,
          outputTokens: response.usage?.output_tokens ?? 0,
          // Cost is not computed here; it is always reported as 0.
          totalCostUsd: 0,
          durationMs
        }
      };
    }
  };
}
|
|
830
|
+
/**
 * Assemble the user prompt sent to the Vertex-hosted judge model.
 *
 * Non-string values are rendered as 2-space-indented JSON. When no reference
 * is supplied (null/undefined), the reference section reads
 * "No reference provided.".
 *
 * @param {unknown} candidate - Response under evaluation.
 * @param {unknown} reference - Optional reference answer.
 * @param {string} rubric - Evaluation criteria.
 * @returns {string} The full prompt text.
 */
function buildJudgePrompt2(candidate, reference, rubric) {
  const render = (value) => {
    if (typeof value === "string") {
      return value;
    }
    return JSON.stringify(value, null, 2);
  };
  const candidateStr = render(candidate);
  let referenceStr = null;
  if (reference !== null && reference !== void 0) {
    referenceStr = render(reference);
  }
  const sections = [
    "Rubric:",
    `${rubric}`,
    "",
    "<candidate_response>",
    `${candidateStr}`,
    "</candidate_response>",
    "",
    "<reference_answer>",
    `${referenceStr ?? "No reference provided."}`,
    "</reference_answer>",
    "",
    'Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}'
  ];
  return sections.join("\n");
}
|
|
846
|
+
/**
 * Parse and validate the Vertex judge model's reply.
 *
 * Markdown code fences are stripped before parsing. If the remaining text is
 * not valid JSON, the span from the first "{" to the last "}" is tried once
 * more, so a reply that wraps the JSON object in prose is still accepted.
 * The parsed value is then validated against `JudgeResponseSchema`.
 *
 * @param {string} text - Raw text returned by the model.
 * @returns {object} The validated data produced by `JudgeResponseSchema`.
 * @throws {Error} If no JSON can be parsed (the original SyntaxError is
 *   attached as `cause`) or the parsed value fails schema validation.
 */
function parseJudgeResponse2(text) {
  const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
  let parsed;
  try {
    parsed = JSON.parse(cleaned);
  } catch (parseError) {
    const start = cleaned.indexOf("{");
    const end = cleaned.lastIndexOf("}");
    if (start === -1 || end <= start) {
      throw new Error(`Failed to parse judge response as JSON: ${text}`, { cause: parseError });
    }
    try {
      // Second chance: the model added commentary around the JSON object.
      parsed = JSON.parse(cleaned.slice(start, end + 1));
    } catch {
      throw new Error(`Failed to parse judge response as JSON: ${text}`, { cause: parseError });
    }
  }
  const result = JudgeResponseSchema.safeParse(parsed);
  if (!result.success) {
    throw new Error(
      `Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
Validation errors: ${JSON.stringify(result.error.issues)}`
    );
  }
  return result.data;
}
|
|
666
863
|
function createClaudeAgentJudge(config) {
|
|
667
864
|
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
668
865
|
const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
|
|
@@ -680,7 +877,7 @@ function createClaudeAgentJudge(config) {
|
|
|
680
877
|
exceedsMaxToolOutputSize: true
|
|
681
878
|
};
|
|
682
879
|
}
|
|
683
|
-
const prompt =
|
|
880
|
+
const prompt = buildJudgePrompt3(candidate, reference, rubric);
|
|
684
881
|
try {
|
|
685
882
|
let resultMessage;
|
|
686
883
|
for await (const message of query({
|
|
@@ -712,7 +909,7 @@ function createClaudeAgentJudge(config) {
|
|
|
712
909
|
);
|
|
713
910
|
}
|
|
714
911
|
const responseText = resultMessage.result ?? "";
|
|
715
|
-
const parsed =
|
|
912
|
+
const parsed = parseJudgeResponse3(responseText);
|
|
716
913
|
const usage = {
|
|
717
914
|
inputTokens: resultMessage.usage?.input_tokens ?? 0,
|
|
718
915
|
outputTokens: resultMessage.usage?.output_tokens ?? 0,
|
|
@@ -741,7 +938,7 @@ function createClaudeAgentJudge(config) {
|
|
|
741
938
|
function buildSystemPrompt() {
|
|
742
939
|
return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
|
|
743
940
|
}
|
|
744
|
-
function
|
|
941
|
+
function buildJudgePrompt3(candidate, reference, rubric) {
|
|
745
942
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
746
943
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
747
944
|
const parts = [];
|
|
@@ -758,7 +955,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
|
|
|
758
955
|
);
|
|
759
956
|
return parts.join("");
|
|
760
957
|
}
|
|
761
|
-
function
|
|
958
|
+
function parseJudgeResponse3(text) {
|
|
762
959
|
let jsonText = text.trim();
|
|
763
960
|
if (jsonText.startsWith("```json")) {
|
|
764
961
|
jsonText = jsonText.slice(7);
|
|
@@ -815,7 +1012,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
815
1012
|
);
|
|
816
1013
|
}
|
|
817
1014
|
const client = new openaiModule.default({ apiKey });
|
|
818
|
-
const prompt =
|
|
1015
|
+
const prompt = buildJudgePrompt4(candidate, reference, rubric);
|
|
819
1016
|
const startTime = Date.now();
|
|
820
1017
|
const completion = await client.chat.completions.create({
|
|
821
1018
|
model,
|
|
@@ -831,7 +1028,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
831
1028
|
});
|
|
832
1029
|
const durationMs = Date.now() - startTime;
|
|
833
1030
|
const text = completion.choices[0]?.message.content ?? "";
|
|
834
|
-
const parsed =
|
|
1031
|
+
const parsed = parseJudgeResponse4(text);
|
|
835
1032
|
return {
|
|
836
1033
|
pass: parsed.pass,
|
|
837
1034
|
score: parsed.score,
|
|
@@ -846,7 +1043,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
846
1043
|
}
|
|
847
1044
|
};
|
|
848
1045
|
}
|
|
849
|
-
function
|
|
1046
|
+
function buildJudgePrompt4(candidate, reference, rubric) {
|
|
850
1047
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
851
1048
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
852
1049
|
return `Rubric:
|
|
@@ -862,7 +1059,7 @@ ${referenceStr ?? "No reference provided."}
|
|
|
862
1059
|
|
|
863
1060
|
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
864
1061
|
}
|
|
865
|
-
function
|
|
1062
|
+
function parseJudgeResponse4(text) {
|
|
866
1063
|
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
867
1064
|
let parsed;
|
|
868
1065
|
try {
|
|
@@ -964,14 +1161,33 @@ function createJudge(config = {}) {
|
|
|
964
1161
|
const provider = config.provider ?? "anthropic";
|
|
965
1162
|
switch (provider) {
|
|
966
1163
|
case "anthropic":
|
|
1164
|
+
return createAnthropicJudge(config);
|
|
1165
|
+
case "vertex-anthropic":
|
|
1166
|
+
return createVertexAnthropicJudge(config);
|
|
1167
|
+
case "anthropic-agent-sdk":
|
|
967
1168
|
return createClaudeAgentJudge(config);
|
|
968
1169
|
case "openai":
|
|
969
1170
|
return createOpenAIJudge(config);
|
|
970
1171
|
case "google":
|
|
971
1172
|
return createGoogleJudge(config);
|
|
972
1173
|
default:
|
|
973
|
-
throw new Error(
|
|
1174
|
+
throw new Error(
|
|
1175
|
+
`Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
|
|
1176
|
+
);
|
|
1177
|
+
}
|
|
1178
|
+
}
|
|
1179
|
+
|
|
1180
|
+
// src/judge/judgeRegistry.ts
|
|
1181
|
+
var registry = /* @__PURE__ */ new Map();
|
|
1182
|
+
/**
 * Look up a custom judge executor in the module-level `registry` map.
 *
 * @param {string} name - Key the judge was registered under.
 * @returns {*} The value stored in `registry` for `name`.
 * @throws {Error} If nothing truthy is stored under `name`; the message lists
 *   the currently registered names, or states that none are registered.
 */
function getRegisteredJudge(name) {
  const executor = registry.get(name);
  if (executor) {
    return executor;
  }
  let available = " No judges are registered.";
  if (registry.size > 0) {
    const names = Array.from(registry.keys());
    available = ` Available judges: ${names.join(", ")}`;
  }
  throw new Error(
    `Judge "${name}" is not registered.${available} Register it with registerJudge() before tests run.`
  );
}
|
|
976
1192
|
|
|
977
1193
|
// src/assertions/validators/judge.ts
|
|
@@ -982,6 +1198,7 @@ function computeStdDev(scores, mean) {
|
|
|
982
1198
|
}
|
|
983
1199
|
async function validateJudge(response, config) {
|
|
984
1200
|
const {
|
|
1201
|
+
judge: judgeName,
|
|
985
1202
|
rubric,
|
|
986
1203
|
reference,
|
|
987
1204
|
threshold = 0.7,
|
|
@@ -994,6 +1211,29 @@ async function validateJudge(response, config) {
|
|
|
994
1211
|
maxBudgetUsd,
|
|
995
1212
|
maxToolOutputSize
|
|
996
1213
|
} = config;
|
|
1214
|
+
if (judgeName !== void 0) {
|
|
1215
|
+
try {
|
|
1216
|
+
const executor = getRegisteredJudge(judgeName);
|
|
1217
|
+
const judgeResult = await executor(response, reference ?? void 0);
|
|
1218
|
+
const score = judgeResult.score;
|
|
1219
|
+
const passed = score >= threshold;
|
|
1220
|
+
return {
|
|
1221
|
+
pass: passed,
|
|
1222
|
+
message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
|
|
1223
|
+
};
|
|
1224
|
+
} catch (err) {
|
|
1225
|
+
return {
|
|
1226
|
+
pass: false,
|
|
1227
|
+
message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
|
|
1228
|
+
};
|
|
1229
|
+
}
|
|
1230
|
+
}
|
|
1231
|
+
if (rubric === void 0) {
|
|
1232
|
+
return {
|
|
1233
|
+
pass: false,
|
|
1234
|
+
message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
|
|
1235
|
+
};
|
|
1236
|
+
}
|
|
997
1237
|
const resolvedRubric = resolveRubric(rubric);
|
|
998
1238
|
const judgeConfig = {
|
|
999
1239
|
...provider !== void 0 && { provider },
|
|
@@ -1040,11 +1280,17 @@ async function validateJudge(response, config) {
|
|
|
1040
1280
|
return {
|
|
1041
1281
|
pass: passed,
|
|
1042
1282
|
message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
|
|
1043
|
-
details:
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1283
|
+
details: {
|
|
1284
|
+
score: meanScore,
|
|
1285
|
+
reasoning: lastReasoning,
|
|
1286
|
+
judgeProvider: provider ?? "anthropic",
|
|
1287
|
+
judgeModel: model,
|
|
1288
|
+
...reps > 1 && {
|
|
1289
|
+
scores,
|
|
1290
|
+
scoreStdDev: stdDev,
|
|
1291
|
+
highVariance
|
|
1292
|
+
}
|
|
1293
|
+
}
|
|
1048
1294
|
};
|
|
1049
1295
|
} catch (err) {
|
|
1050
1296
|
return {
|
|
@@ -1056,31 +1302,68 @@ async function validateJudge(response, config) {
|
|
|
1056
1302
|
|
|
1057
1303
|
// src/assertions/matchers/toPassToolJudge.ts
|
|
1058
1304
|
var DEFAULT_PASSING_THRESHOLD = 0.7;
|
|
1059
|
-
async function
|
|
1305
|
+
/**
 * Run one judge evaluation through `validateJudge` and reduce its result to
 * the `{ pass, message }` pair the matcher needs.
 *
 * Optional settings (`rubric`, `reps`, `provider`, `model`, `judge`) are only
 * forwarded when they are not `undefined`. A `null` reference is forwarded as
 * `undefined`.
 *
 * @param {unknown} received - Response under evaluation.
 * @param {*} rubric - Rubric to evaluate against; may be undefined.
 * @param {object} options - Judge options (reference, passingThreshold, reps, provider, model, judge).
 * @returns {Promise<{pass: *, message: *}>} `pass` and `message` taken from the validation result.
 */
async function runSingleJudge(received, rubric, options) {
  const reference = options.reference === void 0 ? null : options.reference;
  const passingThreshold = options.passingThreshold === void 0 ? DEFAULT_PASSING_THRESHOLD : options.passingThreshold;
  const { reps, provider, model, judge } = options;

  // Keys are added in a fixed order: rubric, reference, threshold, reps,
  // provider, model, judge.
  const judgeConfig = {};
  if (rubric !== void 0) {
    judgeConfig.rubric = rubric;
  }
  judgeConfig.reference = reference ?? void 0;
  judgeConfig.threshold = passingThreshold;
  if (reps !== void 0) {
    judgeConfig.reps = reps;
  }
  if (provider !== void 0) {
    judgeConfig.provider = provider;
  }
  if (model !== void 0) {
    judgeConfig.model = model;
  }
  if (judge !== void 0) {
    judgeConfig.judge = judge;
  }

  const { pass, message } = await validateJudge(received, judgeConfig);
  return { pass, message };
}
|
|
1325
|
+
/**
 * Custom expect matcher: asserts that a response passes LLM/custom judge
 * evaluation.
 *
 * Call shapes:
 *  - `toPassToolJudge(rubric, options?)` where rubric is a string or an object
 *    with a `text` property;
 *  - `toPassToolJudge(options)` with an options object only;
 *  - `toPassToolJudge([config, ...])` where each config carries its own
 *    `rubric` plus options; every judge must pass, and they run concurrently.
 *
 * `pass` always reports the raw outcome (did the judge(s) pass). Negation for
 * `.not` is applied by the expect framework, so it must not be inverted here;
 * `this.isNot` only selects which message is shown. This matches the other
 * matchers in this file.
 *
 * @param {unknown} received - Response under evaluation.
 * @param {*} rubricOrOptions - Rubric, options object, or array of judge configs.
 * @param {object} [maybeOptions] - Options when the second argument is a rubric.
 * @returns {Promise<{pass: boolean, message: () => string}>}
 */
async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
  if (Array.isArray(rubricOrOptions)) {
    const results = await Promise.all(
      rubricOrOptions.map(async (judgeConfig) => {
        const { rubric: r, ...opts } = judgeConfig;
        return runSingleJudge(received, r, opts);
      })
    );
    const allPassed = results.every((r) => r.pass);
    const passCount = results.filter((r) => r.pass).length;
    const summary = `${passCount}/${results.length} judges passed`;
    const details = results.map((r) => r.message).join("\n");
    if (this.isNot) {
      return {
        // Raw outcome; the framework inverts it for `.not`.
        pass: allPassed,
        message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
      };
    }
    return {
      pass: allPassed,
      message: () => `${summary}
${details}`
    };
  }
  let rubric;
  let options;
  if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
    rubric = rubricOrOptions;
    options = maybeOptions ?? {};
  } else {
    // A missing argument falls back to empty options instead of crashing on
    // destructuring, so the underlying validation can report the problem.
    options = rubricOrOptions ?? {};
  }
  const result = await runSingleJudge(received, rubric, options);
  if (this.isNot) {
    return {
      // Raw outcome; the framework inverts it for `.not`.
      pass: result.pass,
      message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
    };
  }
  return {
    pass: result.pass,
    message: () => result.message
  };
}
|
|
1086
1369
|
|
|
@@ -1188,9 +1471,17 @@ async function toSatisfyToolPredicate(received, predicate, description) {
|
|
|
1188
1471
|
function isSimulationResult(value) {
|
|
1189
1472
|
return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
|
|
1190
1473
|
}
|
|
1474
|
+
/**
 * Report whether a value is a pattern-matcher descriptor: a non-null object
 * that has a `$pattern` property (own or inherited) whose value is a string.
 *
 * @param {unknown} v - Value to inspect.
 * @returns {boolean} True when `v` has a string `$pattern`.
 */
function isPatternMatcher(v) {
  if (v === null || typeof v !== "object") {
    return false;
  }
  if (!("$pattern" in v)) {
    return false;
  }
  return typeof v.$pattern === "string";
}
|
|
1191
1477
|
function partialMatch(actual, expected) {
|
|
1192
1478
|
return Object.entries(expected).every(([k, v]) => {
|
|
1193
1479
|
const actualVal = actual[k];
|
|
1480
|
+
if (isPatternMatcher(v)) {
|
|
1481
|
+
if (typeof actualVal !== "string") return false;
|
|
1482
|
+
const re = new RegExp(v.$pattern, v.$flags);
|
|
1483
|
+
return re.test(actualVal);
|
|
1484
|
+
}
|
|
1194
1485
|
if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
|
|
1195
1486
|
return partialMatch(
|
|
1196
1487
|
actualVal,
|
|
@@ -1237,6 +1528,10 @@ function validateToolCalls(response, expectation) {
|
|
|
1237
1528
|
return {
|
|
1238
1529
|
pass: false,
|
|
1239
1530
|
message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
|
|
1531
|
+
details: {
|
|
1532
|
+
actual: actual.map((c) => c.name),
|
|
1533
|
+
expected: expected.name
|
|
1534
|
+
},
|
|
1240
1535
|
metrics
|
|
1241
1536
|
};
|
|
1242
1537
|
}
|
|
@@ -1253,6 +1548,10 @@ function validateToolCalls(response, expectation) {
|
|
|
1253
1548
|
return {
|
|
1254
1549
|
pass: false,
|
|
1255
1550
|
message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
|
|
1551
|
+
details: {
|
|
1552
|
+
actual: actual.map((c) => c.name),
|
|
1553
|
+
expected: expected.name
|
|
1554
|
+
},
|
|
1256
1555
|
metrics
|
|
1257
1556
|
};
|
|
1258
1557
|
}
|
|
@@ -1265,6 +1564,10 @@ function validateToolCalls(response, expectation) {
|
|
|
1265
1564
|
return {
|
|
1266
1565
|
pass: false,
|
|
1267
1566
|
message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
|
|
1567
|
+
details: {
|
|
1568
|
+
actual: actual.map((c) => c.name),
|
|
1569
|
+
unexpected: unexpected.map((c) => c.name)
|
|
1570
|
+
},
|
|
1268
1571
|
metrics
|
|
1269
1572
|
};
|
|
1270
1573
|
}
|
|
@@ -1283,19 +1586,22 @@ function validateToolCallCount(response, options) {
|
|
|
1283
1586
|
if (exact !== void 0 && count !== exact) {
|
|
1284
1587
|
return {
|
|
1285
1588
|
pass: false,
|
|
1286
|
-
message: `Expected exactly ${exact} tool call(s), but got ${count}
|
|
1589
|
+
message: `Expected exactly ${exact} tool call(s), but got ${count}`,
|
|
1590
|
+
details: { actual: count, expected: exact }
|
|
1287
1591
|
};
|
|
1288
1592
|
}
|
|
1289
1593
|
if (min !== void 0 && count < min) {
|
|
1290
1594
|
return {
|
|
1291
1595
|
pass: false,
|
|
1292
|
-
message: `Expected at least ${min} tool call(s), but got ${count}
|
|
1596
|
+
message: `Expected at least ${min} tool call(s), but got ${count}`,
|
|
1597
|
+
details: { actual: count, min }
|
|
1293
1598
|
};
|
|
1294
1599
|
}
|
|
1295
1600
|
if (max !== void 0 && count > max) {
|
|
1296
1601
|
return {
|
|
1297
1602
|
pass: false,
|
|
1298
|
-
message: `Expected at most ${max} tool call(s), but got ${count}
|
|
1603
|
+
message: `Expected at most ${max} tool call(s), but got ${count}`,
|
|
1604
|
+
details: { actual: count, max }
|
|
1299
1605
|
};
|
|
1300
1606
|
}
|
|
1301
1607
|
return {
|
|
@@ -1434,7 +1740,7 @@ var debugHttp = createDebug(`${NAMESPACE}:http`);
|
|
|
1434
1740
|
|
|
1435
1741
|
// package.json
|
|
1436
1742
|
var package_default = {
|
|
1437
|
-
version: "1.0.0
|
|
1743
|
+
version: "1.0.0"};
|
|
1438
1744
|
var debug = createDebug("mcp-server-tester:oauth-flow");
|
|
1439
1745
|
async function generatePKCE() {
|
|
1440
1746
|
const codeVerifier = oauth.generateRandomCodeVerifier();
|
|
@@ -1815,6 +2121,17 @@ async function createMCPClientForConfig(config, options) {
|
|
|
1815
2121
|
}
|
|
1816
2122
|
async function closeMCPClient(client) {
|
|
1817
2123
|
try {
|
|
2124
|
+
const transport = client.transport;
|
|
2125
|
+
if (transport instanceof StreamableHTTPClientTransport) {
|
|
2126
|
+
try {
|
|
2127
|
+
await transport.terminateSession();
|
|
2128
|
+
} catch (sessionError) {
|
|
2129
|
+
debugClient(
|
|
2130
|
+
"Error terminating session: %s",
|
|
2131
|
+
sessionError instanceof Error ? sessionError.message : String(sessionError)
|
|
2132
|
+
);
|
|
2133
|
+
}
|
|
2134
|
+
}
|
|
1818
2135
|
await client.close();
|
|
1819
2136
|
} catch (error) {
|
|
1820
2137
|
debugClient(
|