@gleanwork/mcp-server-tester 1.0.0-beta.6 → 1.0.0-beta.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.d.ts +33 -8
- package/dist/fixtures/mcp.js +284 -24
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +649 -62
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +504 -115
- package/dist/index.d.ts +504 -115
- package/dist/index.js +648 -64
- package/dist/index.js.map +1 -1
- package/dist/reporters/ui-dist/app.js +8 -134
- package/dist/reporters/ui-dist/styles.css +1 -1
- package/package.json +12 -7
- package/dist/reporters/mcpReporter.d.cts +0 -90
- package/dist/reporters/mcpReporter.d.ts +0 -90
package/dist/cli/index.js
CHANGED
package/dist/fixtures/mcp.d.ts
CHANGED
|
@@ -214,7 +214,7 @@ type RubricSpec = BuiltInRubric | {
|
|
|
214
214
|
};
|
|
215
215
|
|
|
216
216
|
/** Valid LLM judge provider kinds. */
|
|
217
|
-
type ProviderKind = 'anthropic' | 'openai' | 'google';
|
|
217
|
+
type ProviderKind = 'anthropic' | 'vertex-anthropic' | 'anthropic-agent-sdk' | 'openai' | 'google';
|
|
218
218
|
|
|
219
219
|
/**
|
|
220
220
|
* Tool call validators for mcp_host simulation results.
|
|
@@ -258,6 +258,12 @@ interface JudgeMatcherOptions {
|
|
|
258
258
|
provider?: ProviderKind;
|
|
259
259
|
/** Override the judge model */
|
|
260
260
|
model?: string;
|
|
261
|
+
/**
|
|
262
|
+
* Name of a registered custom judge executor.
|
|
263
|
+
* When set, the named judge handles the entire evaluation pipeline
|
|
264
|
+
* and its `pass` result is authoritative.
|
|
265
|
+
*/
|
|
266
|
+
judge?: string;
|
|
261
267
|
}
|
|
262
268
|
/**
|
|
263
269
|
* Declaration merging for Playwright matchers
|
|
@@ -348,21 +354,30 @@ declare global {
|
|
|
348
354
|
*/
|
|
349
355
|
toBeToolError(expected?: boolean | string | string[]): R;
|
|
350
356
|
/**
|
|
351
|
-
* Validates that a response passes LLM-as-judge evaluation
|
|
357
|
+
* Validates that a response passes LLM-as-judge evaluation.
|
|
352
358
|
*
|
|
353
|
-
*
|
|
354
|
-
*
|
|
359
|
+
* Two call signatures:
|
|
360
|
+
* - With rubric: `toPassToolJudge(rubric, options?)` — built-in LLM judge
|
|
361
|
+
* - With named judge: `toPassToolJudge({ judge: 'name' })` — custom judge executor
|
|
355
362
|
*
|
|
356
363
|
* @example
|
|
357
364
|
* ```typescript
|
|
365
|
+
* // Built-in LLM judge with rubric
|
|
358
366
|
* expect(result).toPassToolJudge('Response should be helpful and accurate');
|
|
359
|
-
* expect(result).toPassToolJudge('
|
|
367
|
+
* expect(result).toPassToolJudge('correctness', {
|
|
360
368
|
* reference: expectedOutput,
|
|
361
369
|
* passingThreshold: 0.8,
|
|
362
370
|
* });
|
|
371
|
+
*
|
|
372
|
+
* // Named custom judge (registered via registerJudge)
|
|
373
|
+
* expect(result).toPassToolJudge({ judge: 'glean-completeness' });
|
|
363
374
|
* ```
|
|
364
375
|
*/
|
|
365
376
|
toPassToolJudge(rubric: RubricSpec, options?: JudgeMatcherOptions): Promise<R>;
|
|
377
|
+
toPassToolJudge(options: JudgeMatcherOptions): Promise<R>;
|
|
378
|
+
toPassToolJudge(judges: Array<JudgeMatcherOptions & {
|
|
379
|
+
rubric?: RubricSpec;
|
|
380
|
+
}>): Promise<R>;
|
|
366
381
|
/**
|
|
367
382
|
* Validates that a response meets size constraints
|
|
368
383
|
*
|
|
@@ -452,16 +467,26 @@ type ToolPredicate = (response: unknown, text: string) => boolean | PredicateRes
|
|
|
452
467
|
* Validates that a response passes LLM-as-judge evaluation.
|
|
453
468
|
* Delegates evaluation logic to validateJudge() for consistency
|
|
454
469
|
* with the validator/matcher duality pattern.
|
|
470
|
+
*
|
|
471
|
+
* Supports three call signatures:
|
|
472
|
+
* - toPassToolJudge(rubric, options?) — built-in LLM judge with rubric
|
|
473
|
+
* - toPassToolJudge({ judge: 'name', ... }) — named custom judge
|
|
474
|
+
* - toPassToolJudge([...judges]) — multi-judge (all must pass)
|
|
455
475
|
*/
|
|
456
476
|
|
|
457
477
|
/**
|
|
458
|
-
*
|
|
478
|
+
* The toPassToolJudge matcher function.
|
|
459
479
|
*
|
|
460
|
-
*
|
|
480
|
+
* Accepts either:
|
|
481
|
+
* (received, rubric, options?) — rubric-based LLM judge
|
|
482
|
+
* (received, options) — named custom judge (options.judge required)
|
|
483
|
+
* (received, judges[]) — multi-judge (all must pass)
|
|
461
484
|
*/
|
|
462
485
|
declare function toPassToolJudge(this: {
|
|
463
486
|
isNot: boolean;
|
|
464
|
-
}, received: unknown,
|
|
487
|
+
}, received: unknown, rubricOrOptions: RubricSpec | JudgeMatcherOptions | Array<JudgeMatcherOptions & {
|
|
488
|
+
rubric?: RubricSpec;
|
|
489
|
+
}>, maybeOptions?: JudgeMatcherOptions): Promise<{
|
|
465
490
|
pass: boolean;
|
|
466
491
|
message: () => string;
|
|
467
492
|
}>;
|
package/dist/fixtures/mcp.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { expect as expect$1, test as test$1 } from '@playwright/test';
|
|
2
|
-
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
3
2
|
import { z } from 'zod';
|
|
3
|
+
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
4
4
|
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
5
5
|
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
|
|
6
6
|
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';
|
|
@@ -662,7 +662,175 @@ var JudgeResponseSchema = z.object({
|
|
|
662
662
|
reasoning: z.string()
|
|
663
663
|
});
|
|
664
664
|
|
|
665
|
-
// src/judge/
|
|
665
|
+
// src/judge/anthropicJudge.ts
|
|
666
|
+
function createAnthropicJudge(config = {}) {
|
|
667
|
+
const apiKeyEnvVar = config.apiKeyEnvVar ?? "ANTHROPIC_API_KEY";
|
|
668
|
+
const apiKey = process.env[apiKeyEnvVar];
|
|
669
|
+
if (!apiKey) {
|
|
670
|
+
throw new Error(
|
|
671
|
+
`Anthropic judge requires an API key. Set the ${apiKeyEnvVar} environment variable.`
|
|
672
|
+
);
|
|
673
|
+
}
|
|
674
|
+
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
675
|
+
const maxTokens = config.maxTokens ?? 1e3;
|
|
676
|
+
const temperature = config.temperature ?? 0;
|
|
677
|
+
return {
|
|
678
|
+
async evaluate(candidate, reference, rubric) {
|
|
679
|
+
let anthropicModule;
|
|
680
|
+
try {
|
|
681
|
+
anthropicModule = await import('@anthropic-ai/sdk');
|
|
682
|
+
} catch (err) {
|
|
683
|
+
throw new Error(
|
|
684
|
+
`Anthropic judge requires the \`@anthropic-ai/sdk\` package. Install it with: npm install @anthropic-ai/sdk
|
|
685
|
+
Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
686
|
+
);
|
|
687
|
+
}
|
|
688
|
+
const client = new anthropicModule.default({ apiKey });
|
|
689
|
+
const prompt = buildJudgePrompt(candidate, reference, rubric);
|
|
690
|
+
const startTime = Date.now();
|
|
691
|
+
const response = await client.messages.create({
|
|
692
|
+
model,
|
|
693
|
+
max_tokens: maxTokens,
|
|
694
|
+
temperature,
|
|
695
|
+
system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
|
|
696
|
+
messages: [{ role: "user", content: prompt }]
|
|
697
|
+
});
|
|
698
|
+
const durationMs = Date.now() - startTime;
|
|
699
|
+
const textBlock = response.content.find(
|
|
700
|
+
(b) => b.type === "text"
|
|
701
|
+
);
|
|
702
|
+
const text = textBlock?.text ?? "";
|
|
703
|
+
const parsed = parseJudgeResponse(text);
|
|
704
|
+
return {
|
|
705
|
+
pass: parsed.pass,
|
|
706
|
+
score: parsed.score,
|
|
707
|
+
reasoning: parsed.reasoning,
|
|
708
|
+
usage: {
|
|
709
|
+
inputTokens: response.usage?.input_tokens ?? 0,
|
|
710
|
+
outputTokens: response.usage?.output_tokens ?? 0,
|
|
711
|
+
totalCostUsd: 0,
|
|
712
|
+
durationMs
|
|
713
|
+
}
|
|
714
|
+
};
|
|
715
|
+
}
|
|
716
|
+
};
|
|
717
|
+
}
|
|
718
|
+
function buildJudgePrompt(candidate, reference, rubric) {
|
|
719
|
+
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
720
|
+
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
721
|
+
return `Rubric:
|
|
722
|
+
${rubric}
|
|
723
|
+
|
|
724
|
+
<candidate_response>
|
|
725
|
+
${candidateStr}
|
|
726
|
+
</candidate_response>
|
|
727
|
+
|
|
728
|
+
<reference_answer>
|
|
729
|
+
${referenceStr ?? "No reference provided."}
|
|
730
|
+
</reference_answer>
|
|
731
|
+
|
|
732
|
+
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
733
|
+
}
|
|
734
|
+
function parseJudgeResponse(text) {
|
|
735
|
+
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
736
|
+
let parsed;
|
|
737
|
+
try {
|
|
738
|
+
parsed = JSON.parse(cleaned);
|
|
739
|
+
} catch {
|
|
740
|
+
throw new Error(`Failed to parse judge response as JSON: ${text}`);
|
|
741
|
+
}
|
|
742
|
+
const result = JudgeResponseSchema.safeParse(parsed);
|
|
743
|
+
if (!result.success) {
|
|
744
|
+
throw new Error(
|
|
745
|
+
`Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
|
|
746
|
+
Validation errors: ${JSON.stringify(result.error.issues)}`
|
|
747
|
+
);
|
|
748
|
+
}
|
|
749
|
+
return result.data;
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
// src/judge/vertexAnthropicJudge.ts
|
|
753
|
+
function createVertexAnthropicJudge(config = {}) {
|
|
754
|
+
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
755
|
+
const maxTokens = config.maxTokens ?? 1e3;
|
|
756
|
+
const temperature = config.temperature ?? 0;
|
|
757
|
+
return {
|
|
758
|
+
async evaluate(candidate, reference, rubric) {
|
|
759
|
+
let vertexModule;
|
|
760
|
+
try {
|
|
761
|
+
vertexModule = await import('@anthropic-ai/vertex-sdk');
|
|
762
|
+
} catch (err) {
|
|
763
|
+
throw new Error(
|
|
764
|
+
`Vertex Anthropic judge requires the \`@anthropic-ai/vertex-sdk\` package. Install it with: npm install @anthropic-ai/vertex-sdk
|
|
765
|
+
Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
766
|
+
);
|
|
767
|
+
}
|
|
768
|
+
const client = new vertexModule.AnthropicVertex({
|
|
769
|
+
projectId: process.env.GOOGLE_VERTEX_PROJECT ?? process.env.CLOUD_ML_PROJECT_ID,
|
|
770
|
+
region: process.env.GOOGLE_VERTEX_LOCATION ?? "us-east5"
|
|
771
|
+
});
|
|
772
|
+
const prompt = buildJudgePrompt2(candidate, reference, rubric);
|
|
773
|
+
const startTime = Date.now();
|
|
774
|
+
const response = await client.messages.create({
|
|
775
|
+
model,
|
|
776
|
+
max_tokens: maxTokens,
|
|
777
|
+
temperature,
|
|
778
|
+
system: 'You are an expert evaluator. Respond with valid JSON only: {"pass": true|false, "score": 0.0-1.0, "reasoning": "explanation"}',
|
|
779
|
+
messages: [{ role: "user", content: prompt }]
|
|
780
|
+
});
|
|
781
|
+
const durationMs = Date.now() - startTime;
|
|
782
|
+
const textBlock = response.content.find(
|
|
783
|
+
(b) => b.type === "text"
|
|
784
|
+
);
|
|
785
|
+
const text = textBlock?.text ?? "";
|
|
786
|
+
const parsed = parseJudgeResponse2(text);
|
|
787
|
+
return {
|
|
788
|
+
pass: parsed.pass,
|
|
789
|
+
score: parsed.score,
|
|
790
|
+
reasoning: parsed.reasoning,
|
|
791
|
+
usage: {
|
|
792
|
+
inputTokens: response.usage?.input_tokens ?? 0,
|
|
793
|
+
outputTokens: response.usage?.output_tokens ?? 0,
|
|
794
|
+
totalCostUsd: 0,
|
|
795
|
+
durationMs
|
|
796
|
+
}
|
|
797
|
+
};
|
|
798
|
+
}
|
|
799
|
+
};
|
|
800
|
+
}
|
|
801
|
+
function buildJudgePrompt2(candidate, reference, rubric) {
|
|
802
|
+
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
803
|
+
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
804
|
+
return `Rubric:
|
|
805
|
+
${rubric}
|
|
806
|
+
|
|
807
|
+
<candidate_response>
|
|
808
|
+
${candidateStr}
|
|
809
|
+
</candidate_response>
|
|
810
|
+
|
|
811
|
+
<reference_answer>
|
|
812
|
+
${referenceStr ?? "No reference provided."}
|
|
813
|
+
</reference_answer>
|
|
814
|
+
|
|
815
|
+
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
816
|
+
}
|
|
817
|
+
function parseJudgeResponse2(text) {
|
|
818
|
+
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
819
|
+
let parsed;
|
|
820
|
+
try {
|
|
821
|
+
parsed = JSON.parse(cleaned);
|
|
822
|
+
} catch {
|
|
823
|
+
throw new Error(`Failed to parse judge response as JSON: ${text}`);
|
|
824
|
+
}
|
|
825
|
+
const result = JudgeResponseSchema.safeParse(parsed);
|
|
826
|
+
if (!result.success) {
|
|
827
|
+
throw new Error(
|
|
828
|
+
`Judge returned invalid response. Expected {pass, score, reasoning} but got: ${cleaned.slice(0, 500)}
|
|
829
|
+
Validation errors: ${JSON.stringify(result.error.issues)}`
|
|
830
|
+
);
|
|
831
|
+
}
|
|
832
|
+
return result.data;
|
|
833
|
+
}
|
|
666
834
|
function createClaudeAgentJudge(config) {
|
|
667
835
|
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
668
836
|
const maxBudgetUsd = config.maxBudgetUsd ?? 0.1;
|
|
@@ -680,7 +848,7 @@ function createClaudeAgentJudge(config) {
|
|
|
680
848
|
exceedsMaxToolOutputSize: true
|
|
681
849
|
};
|
|
682
850
|
}
|
|
683
|
-
const prompt =
|
|
851
|
+
const prompt = buildJudgePrompt3(candidate, reference, rubric);
|
|
684
852
|
try {
|
|
685
853
|
let resultMessage;
|
|
686
854
|
for await (const message of query({
|
|
@@ -712,7 +880,7 @@ function createClaudeAgentJudge(config) {
|
|
|
712
880
|
);
|
|
713
881
|
}
|
|
714
882
|
const responseText = resultMessage.result ?? "";
|
|
715
|
-
const parsed =
|
|
883
|
+
const parsed = parseJudgeResponse3(responseText);
|
|
716
884
|
const usage = {
|
|
717
885
|
inputTokens: resultMessage.usage?.input_tokens ?? 0,
|
|
718
886
|
outputTokens: resultMessage.usage?.output_tokens ?? 0,
|
|
@@ -741,7 +909,7 @@ function createClaudeAgentJudge(config) {
|
|
|
741
909
|
function buildSystemPrompt() {
|
|
742
910
|
return 'You are an expert evaluator. Evaluate the candidate response based on the rubric provided. Respond ONLY with valid JSON in this exact format: {"pass": boolean, "score": number (0-1), "reasoning": string}. Do not include any other text, markdown formatting, or code blocks.';
|
|
743
911
|
}
|
|
744
|
-
function
|
|
912
|
+
function buildJudgePrompt3(candidate, reference, rubric) {
|
|
745
913
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
746
914
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
747
915
|
const parts = [];
|
|
@@ -758,7 +926,7 @@ function buildJudgePrompt(candidate, reference, rubric) {
|
|
|
758
926
|
);
|
|
759
927
|
return parts.join("");
|
|
760
928
|
}
|
|
761
|
-
function
|
|
929
|
+
function parseJudgeResponse3(text) {
|
|
762
930
|
let jsonText = text.trim();
|
|
763
931
|
if (jsonText.startsWith("```json")) {
|
|
764
932
|
jsonText = jsonText.slice(7);
|
|
@@ -815,7 +983,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
815
983
|
);
|
|
816
984
|
}
|
|
817
985
|
const client = new openaiModule.default({ apiKey });
|
|
818
|
-
const prompt =
|
|
986
|
+
const prompt = buildJudgePrompt4(candidate, reference, rubric);
|
|
819
987
|
const startTime = Date.now();
|
|
820
988
|
const completion = await client.chat.completions.create({
|
|
821
989
|
model,
|
|
@@ -831,7 +999,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
831
999
|
});
|
|
832
1000
|
const durationMs = Date.now() - startTime;
|
|
833
1001
|
const text = completion.choices[0]?.message.content ?? "";
|
|
834
|
-
const parsed =
|
|
1002
|
+
const parsed = parseJudgeResponse4(text);
|
|
835
1003
|
return {
|
|
836
1004
|
pass: parsed.pass,
|
|
837
1005
|
score: parsed.score,
|
|
@@ -846,7 +1014,7 @@ Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
|
846
1014
|
}
|
|
847
1015
|
};
|
|
848
1016
|
}
|
|
849
|
-
function
|
|
1017
|
+
function buildJudgePrompt4(candidate, reference, rubric) {
|
|
850
1018
|
const candidateStr = typeof candidate === "string" ? candidate : JSON.stringify(candidate, null, 2);
|
|
851
1019
|
const referenceStr = reference !== null && reference !== void 0 ? typeof reference === "string" ? reference : JSON.stringify(reference, null, 2) : null;
|
|
852
1020
|
return `Rubric:
|
|
@@ -862,7 +1030,7 @@ ${referenceStr ?? "No reference provided."}
|
|
|
862
1030
|
|
|
863
1031
|
Evaluate and return JSON: {"pass": boolean, "score": number (0-1), "reasoning": string}`;
|
|
864
1032
|
}
|
|
865
|
-
function
|
|
1033
|
+
function parseJudgeResponse4(text) {
|
|
866
1034
|
const cleaned = text.replace(/```json\n?/g, "").replace(/```\n?/g, "").trim();
|
|
867
1035
|
let parsed;
|
|
868
1036
|
try {
|
|
@@ -964,6 +1132,10 @@ function createJudge(config = {}) {
|
|
|
964
1132
|
const provider = config.provider ?? "anthropic";
|
|
965
1133
|
switch (provider) {
|
|
966
1134
|
case "anthropic":
|
|
1135
|
+
return createAnthropicJudge(config);
|
|
1136
|
+
case "vertex-anthropic":
|
|
1137
|
+
return createVertexAnthropicJudge(config);
|
|
1138
|
+
case "anthropic-agent-sdk":
|
|
967
1139
|
return createClaudeAgentJudge(config);
|
|
968
1140
|
case "openai":
|
|
969
1141
|
return createOpenAIJudge(config);
|
|
@@ -974,6 +1146,19 @@ function createJudge(config = {}) {
|
|
|
974
1146
|
}
|
|
975
1147
|
}
|
|
976
1148
|
|
|
1149
|
+
// src/judge/judgeRegistry.ts
|
|
1150
|
+
var registry = /* @__PURE__ */ new Map();
|
|
1151
|
+
function getRegisteredJudge(name) {
|
|
1152
|
+
const executor = registry.get(name);
|
|
1153
|
+
if (!executor) {
|
|
1154
|
+
const available = registry.size > 0 ? ` Available judges: ${[...registry.keys()].join(", ")}` : " No judges are registered.";
|
|
1155
|
+
throw new Error(
|
|
1156
|
+
`Judge "${name}" is not registered.${available} Register it with registerJudge() before tests run.`
|
|
1157
|
+
);
|
|
1158
|
+
}
|
|
1159
|
+
return executor;
|
|
1160
|
+
}
|
|
1161
|
+
|
|
977
1162
|
// src/assertions/validators/judge.ts
|
|
978
1163
|
function computeStdDev(scores, mean) {
|
|
979
1164
|
if (scores.length <= 1) return 0;
|
|
@@ -982,6 +1167,7 @@ function computeStdDev(scores, mean) {
|
|
|
982
1167
|
}
|
|
983
1168
|
async function validateJudge(response, config) {
|
|
984
1169
|
const {
|
|
1170
|
+
judge: judgeName,
|
|
985
1171
|
rubric,
|
|
986
1172
|
reference,
|
|
987
1173
|
threshold = 0.7,
|
|
@@ -994,6 +1180,29 @@ async function validateJudge(response, config) {
|
|
|
994
1180
|
maxBudgetUsd,
|
|
995
1181
|
maxToolOutputSize
|
|
996
1182
|
} = config;
|
|
1183
|
+
if (judgeName !== void 0) {
|
|
1184
|
+
try {
|
|
1185
|
+
const executor = getRegisteredJudge(judgeName);
|
|
1186
|
+
const judgeResult = await executor(response, reference ?? void 0);
|
|
1187
|
+
const score = judgeResult.score;
|
|
1188
|
+
const passed = score >= threshold;
|
|
1189
|
+
return {
|
|
1190
|
+
pass: passed,
|
|
1191
|
+
message: passed ? `Custom judge "${judgeName}" passed with score ${score.toFixed(2)}` : `Custom judge "${judgeName}" failed with score ${score.toFixed(2)} (threshold: ${threshold}). ${judgeResult.reasoning ?? ""}`
|
|
1192
|
+
};
|
|
1193
|
+
} catch (err) {
|
|
1194
|
+
return {
|
|
1195
|
+
pass: false,
|
|
1196
|
+
message: `Custom judge "${judgeName}" error: ${err instanceof Error ? err.message : String(err)}`
|
|
1197
|
+
};
|
|
1198
|
+
}
|
|
1199
|
+
}
|
|
1200
|
+
if (rubric === void 0) {
|
|
1201
|
+
return {
|
|
1202
|
+
pass: false,
|
|
1203
|
+
message: 'Judge evaluation failed: either "judge" or "rubric" must be provided'
|
|
1204
|
+
};
|
|
1205
|
+
}
|
|
997
1206
|
const resolvedRubric = resolveRubric(rubric);
|
|
998
1207
|
const judgeConfig = {
|
|
999
1208
|
...provider !== void 0 && { provider },
|
|
@@ -1040,11 +1249,17 @@ async function validateJudge(response, config) {
|
|
|
1040
1249
|
return {
|
|
1041
1250
|
pass: passed,
|
|
1042
1251
|
message: passed ? `Judge passed with score ${meanScore.toFixed(2)}${repNote}` : `Judge failed with score ${meanScore.toFixed(2)} (threshold: ${threshold})${repNote}. ${lastReasoning ?? ""}`,
|
|
1043
|
-
details:
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1252
|
+
details: {
|
|
1253
|
+
score: meanScore,
|
|
1254
|
+
reasoning: lastReasoning,
|
|
1255
|
+
judgeProvider: provider ?? "anthropic",
|
|
1256
|
+
judgeModel: model,
|
|
1257
|
+
...reps > 1 && {
|
|
1258
|
+
scores,
|
|
1259
|
+
scoreStdDev: stdDev,
|
|
1260
|
+
highVariance
|
|
1261
|
+
}
|
|
1262
|
+
}
|
|
1048
1263
|
};
|
|
1049
1264
|
} catch (err) {
|
|
1050
1265
|
return {
|
|
@@ -1056,31 +1271,68 @@ async function validateJudge(response, config) {
|
|
|
1056
1271
|
|
|
1057
1272
|
// src/assertions/matchers/toPassToolJudge.ts
|
|
1058
1273
|
var DEFAULT_PASSING_THRESHOLD = 0.7;
|
|
1059
|
-
async function
|
|
1274
|
+
async function runSingleJudge(received, rubric, options) {
|
|
1060
1275
|
const {
|
|
1061
1276
|
reference = null,
|
|
1062
1277
|
passingThreshold = DEFAULT_PASSING_THRESHOLD,
|
|
1063
1278
|
reps,
|
|
1064
1279
|
provider,
|
|
1065
|
-
model
|
|
1280
|
+
model,
|
|
1281
|
+
judge
|
|
1066
1282
|
} = options;
|
|
1067
1283
|
const validation = await validateJudge(received, {
|
|
1068
|
-
rubric,
|
|
1284
|
+
...rubric !== void 0 && { rubric },
|
|
1069
1285
|
reference: reference ?? void 0,
|
|
1070
1286
|
threshold: passingThreshold,
|
|
1071
1287
|
...reps !== void 0 && { reps },
|
|
1072
1288
|
...provider !== void 0 && { provider },
|
|
1073
|
-
...model !== void 0 && { model }
|
|
1289
|
+
...model !== void 0 && { model },
|
|
1290
|
+
...judge !== void 0 && { judge }
|
|
1074
1291
|
});
|
|
1292
|
+
return { pass: validation.pass, message: validation.message };
|
|
1293
|
+
}
|
|
1294
|
+
async function toPassToolJudge(received, rubricOrOptions, maybeOptions) {
|
|
1295
|
+
if (Array.isArray(rubricOrOptions)) {
|
|
1296
|
+
const results = await Promise.all(
|
|
1297
|
+
rubricOrOptions.map(async (judgeConfig) => {
|
|
1298
|
+
const { rubric: r, ...opts } = judgeConfig;
|
|
1299
|
+
return runSingleJudge(received, r, opts);
|
|
1300
|
+
})
|
|
1301
|
+
);
|
|
1302
|
+
const allPassed = results.every((r) => r.pass);
|
|
1303
|
+
const passCount = results.filter((r) => r.pass).length;
|
|
1304
|
+
const summary = `${passCount}/${results.length} judges passed`;
|
|
1305
|
+
const details = results.map((r) => r.message).join("\n");
|
|
1306
|
+
if (this.isNot) {
|
|
1307
|
+
return {
|
|
1308
|
+
pass: !allPassed,
|
|
1309
|
+
message: () => allPassed ? `Expected all judges to fail, but ${summary}` : `Judges failed as expected: ${summary}`
|
|
1310
|
+
};
|
|
1311
|
+
}
|
|
1312
|
+
return {
|
|
1313
|
+
pass: allPassed,
|
|
1314
|
+
message: () => `${summary}
|
|
1315
|
+
${details}`
|
|
1316
|
+
};
|
|
1317
|
+
}
|
|
1318
|
+
let rubric;
|
|
1319
|
+
let options;
|
|
1320
|
+
if (typeof rubricOrOptions === "string" || typeof rubricOrOptions === "object" && rubricOrOptions !== null && "text" in rubricOrOptions) {
|
|
1321
|
+
rubric = rubricOrOptions;
|
|
1322
|
+
options = maybeOptions ?? {};
|
|
1323
|
+
} else {
|
|
1324
|
+
options = rubricOrOptions;
|
|
1325
|
+
}
|
|
1326
|
+
const result = await runSingleJudge(received, rubric, options);
|
|
1075
1327
|
if (this.isNot) {
|
|
1076
1328
|
return {
|
|
1077
|
-
pass: !
|
|
1078
|
-
message: () =>
|
|
1329
|
+
pass: !result.pass,
|
|
1330
|
+
message: () => result.pass ? `Expected judge evaluation to fail, but it passed` : `Judge evaluation failed as expected`
|
|
1079
1331
|
};
|
|
1080
1332
|
}
|
|
1081
1333
|
return {
|
|
1082
|
-
pass:
|
|
1083
|
-
message: () =>
|
|
1334
|
+
pass: result.pass,
|
|
1335
|
+
message: () => result.message
|
|
1084
1336
|
};
|
|
1085
1337
|
}
|
|
1086
1338
|
|
|
@@ -1188,9 +1440,17 @@ async function toSatisfyToolPredicate(received, predicate, description) {
|
|
|
1188
1440
|
function isSimulationResult(value) {
|
|
1189
1441
|
return typeof value === "object" && value !== null && "success" in value && "toolCalls" in value && Array.isArray(value.toolCalls);
|
|
1190
1442
|
}
|
|
1443
|
+
function isPatternMatcher(v) {
|
|
1444
|
+
return typeof v === "object" && v !== null && "$pattern" in v && typeof v["$pattern"] === "string";
|
|
1445
|
+
}
|
|
1191
1446
|
function partialMatch(actual, expected) {
|
|
1192
1447
|
return Object.entries(expected).every(([k, v]) => {
|
|
1193
1448
|
const actualVal = actual[k];
|
|
1449
|
+
if (isPatternMatcher(v)) {
|
|
1450
|
+
if (typeof actualVal !== "string") return false;
|
|
1451
|
+
const re = new RegExp(v.$pattern, v.$flags);
|
|
1452
|
+
return re.test(actualVal);
|
|
1453
|
+
}
|
|
1194
1454
|
if (typeof v === "object" && v !== null && typeof actualVal === "object" && actualVal !== null) {
|
|
1195
1455
|
return partialMatch(
|
|
1196
1456
|
actualVal,
|
|
@@ -1434,7 +1694,7 @@ var debugHttp = createDebug(`${NAMESPACE}:http`);
|
|
|
1434
1694
|
|
|
1435
1695
|
// package.json
|
|
1436
1696
|
var package_default = {
|
|
1437
|
-
version: "1.0.0-beta.
|
|
1697
|
+
version: "1.0.0-beta.8"};
|
|
1438
1698
|
var debug = createDebug("mcp-server-tester:oauth-flow");
|
|
1439
1699
|
async function generatePKCE() {
|
|
1440
1700
|
const codeVerifier = oauth.generateRandomCodeVerifier();
|