@gleanwork/mcp-server-tester 1.0.0-beta.8 → 1.0.1-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.js +71 -14
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +142 -24
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +118 -16
- package/dist/index.d.ts +118 -16
- package/dist/index.js +142 -25
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +34 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +90 -0
- package/dist/reporters/mcpReporter.d.ts +90 -0
- package/dist/reporters/mcpReporter.js +34 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -4384,7 +4384,7 @@ function escapeHtml(text) {
|
|
|
4384
4384
|
|
|
4385
4385
|
// package.json
|
|
4386
4386
|
var package_default = {
|
|
4387
|
-
version: "1.0.0-beta.8"};
|
|
4387
|
+
version: "1.0.1-beta.0"};
|
|
4388
4388
|
|
|
4389
4389
|
// src/mcp/clientFactory.ts
|
|
4390
4390
|
function getRetryAfterDelayMs(err) {
|
|
@@ -4603,6 +4603,17 @@ async function createMCPClientForConfig(config, options) {
|
|
|
4603
4603
|
}
|
|
4604
4604
|
async function closeMCPClient(client) {
|
|
4605
4605
|
try {
|
|
4606
|
+
const transport = client.transport;
|
|
4607
|
+
if (transport instanceof StreamableHTTPClientTransport) {
|
|
4608
|
+
try {
|
|
4609
|
+
await transport.terminateSession();
|
|
4610
|
+
} catch (sessionError) {
|
|
4611
|
+
debugClient(
|
|
4612
|
+
"Error terminating session: %s",
|
|
4613
|
+
sessionError instanceof Error ? sessionError.message : String(sessionError)
|
|
4614
|
+
);
|
|
4615
|
+
}
|
|
4616
|
+
}
|
|
4606
4617
|
await client.close();
|
|
4607
4618
|
} catch (error) {
|
|
4608
4619
|
debugClient(
|
|
@@ -4831,11 +4842,13 @@ function validateSchema(response, schema, options = {}) {
|
|
|
4831
4842
|
} catch (error) {
|
|
4832
4843
|
const zodError = error;
|
|
4833
4844
|
const issues = formatZodIssues(zodError);
|
|
4845
|
+
const text = stringifyResponse(response);
|
|
4834
4846
|
return {
|
|
4835
4847
|
pass: false,
|
|
4836
4848
|
message: `Response does not match schema: ${issues}`,
|
|
4837
4849
|
details: {
|
|
4838
|
-
issues: zodError.issues
|
|
4850
|
+
issues: zodError.issues,
|
|
4851
|
+
textPreview: truncateForDisplay2(text)
|
|
4839
4852
|
}
|
|
4840
4853
|
};
|
|
4841
4854
|
}
|
|
@@ -4888,6 +4901,12 @@ function formatZodIssues(error) {
|
|
|
4888
4901
|
});
|
|
4889
4902
|
return issues.join("; ");
|
|
4890
4903
|
}
|
|
4904
|
+
function truncateForDisplay2(str, maxLength = 200) {
|
|
4905
|
+
if (str.length <= maxLength) {
|
|
4906
|
+
return str;
|
|
4907
|
+
}
|
|
4908
|
+
return str.slice(0, maxLength) + "... (truncated)";
|
|
4909
|
+
}
|
|
4891
4910
|
|
|
4892
4911
|
// src/assertions/validators/text.ts
|
|
4893
4912
|
function validateText(response, expected, options = {}) {
|
|
@@ -4914,11 +4933,11 @@ function validateText(response, expected, options = {}) {
|
|
|
4914
4933
|
details: {
|
|
4915
4934
|
missing,
|
|
4916
4935
|
textLength: text.length,
|
|
4917
|
-
textPreview:
|
|
4936
|
+
textPreview: truncateForDisplay3(text)
|
|
4918
4937
|
}
|
|
4919
4938
|
};
|
|
4920
4939
|
}
|
|
4921
|
-
function
|
|
4940
|
+
function truncateForDisplay3(str, maxLength = 200) {
|
|
4922
4941
|
if (str.length <= maxLength) {
|
|
4923
4942
|
return str;
|
|
4924
4943
|
}
|
|
@@ -4950,7 +4969,7 @@ function validatePattern(response, patterns, options = {}) {
|
|
|
4950
4969
|
details: {
|
|
4951
4970
|
unmatched,
|
|
4952
4971
|
textLength: text.length,
|
|
4953
|
-
textPreview:
|
|
4972
|
+
textPreview: truncateForDisplay4(text)
|
|
4954
4973
|
}
|
|
4955
4974
|
};
|
|
4956
4975
|
}
|
|
@@ -4970,7 +4989,7 @@ function patternToString(pattern) {
|
|
|
4970
4989
|
}
|
|
4971
4990
|
return `/${pattern}/`;
|
|
4972
4991
|
}
|
|
4973
|
-
function
|
|
4992
|
+
function truncateForDisplay4(str, maxLength = 200) {
|
|
4974
4993
|
if (str.length <= maxLength) {
|
|
4975
4994
|
return str;
|
|
4976
4995
|
}
|
|
@@ -4993,7 +5012,7 @@ function validateError(response, expected = true) {
|
|
|
4993
5012
|
pass: false,
|
|
4994
5013
|
message: "Expected an error response but got success",
|
|
4995
5014
|
details: {
|
|
4996
|
-
textPreview:
|
|
5015
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
4997
5016
|
}
|
|
4998
5017
|
};
|
|
4999
5018
|
} else {
|
|
@@ -5005,7 +5024,7 @@ function validateError(response, expected = true) {
|
|
|
5005
5024
|
}
|
|
5006
5025
|
return {
|
|
5007
5026
|
pass: false,
|
|
5008
|
-
message: `Expected a success response but got error: "${
|
|
5027
|
+
message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
|
|
5009
5028
|
details: {
|
|
5010
5029
|
errorMessage
|
|
5011
5030
|
}
|
|
@@ -5018,7 +5037,7 @@ function validateError(response, expected = true) {
|
|
|
5018
5037
|
pass: false,
|
|
5019
5038
|
message: `Expected an error containing "${expectedMessages[0]}" but got success`,
|
|
5020
5039
|
details: {
|
|
5021
|
-
textPreview:
|
|
5040
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
5022
5041
|
}
|
|
5023
5042
|
};
|
|
5024
5043
|
}
|
|
@@ -5040,7 +5059,7 @@ function validateError(response, expected = true) {
|
|
|
5040
5059
|
}
|
|
5041
5060
|
};
|
|
5042
5061
|
}
|
|
5043
|
-
function
|
|
5062
|
+
function truncateForDisplay5(str, maxLength = 200) {
|
|
5044
5063
|
if (str.length <= maxLength) {
|
|
5045
5064
|
return str;
|
|
5046
5065
|
}
|
|
@@ -5158,6 +5177,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5158
5177
|
return {
|
|
5159
5178
|
pass: false,
|
|
5160
5179
|
message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
|
|
5180
|
+
details: {
|
|
5181
|
+
actual: actual.map((c) => c.name),
|
|
5182
|
+
expected: expected.name
|
|
5183
|
+
},
|
|
5161
5184
|
metrics
|
|
5162
5185
|
};
|
|
5163
5186
|
}
|
|
@@ -5174,6 +5197,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5174
5197
|
return {
|
|
5175
5198
|
pass: false,
|
|
5176
5199
|
message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
|
|
5200
|
+
details: {
|
|
5201
|
+
actual: actual.map((c) => c.name),
|
|
5202
|
+
expected: expected.name
|
|
5203
|
+
},
|
|
5177
5204
|
metrics
|
|
5178
5205
|
};
|
|
5179
5206
|
}
|
|
@@ -5186,6 +5213,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5186
5213
|
return {
|
|
5187
5214
|
pass: false,
|
|
5188
5215
|
message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
|
|
5216
|
+
details: {
|
|
5217
|
+
actual: actual.map((c) => c.name),
|
|
5218
|
+
unexpected: unexpected.map((c) => c.name)
|
|
5219
|
+
},
|
|
5189
5220
|
metrics
|
|
5190
5221
|
};
|
|
5191
5222
|
}
|
|
@@ -5204,19 +5235,22 @@ function validateToolCallCount(response, options) {
|
|
|
5204
5235
|
if (exact !== void 0 && count !== exact) {
|
|
5205
5236
|
return {
|
|
5206
5237
|
pass: false,
|
|
5207
|
-
message: `Expected exactly ${exact} tool call(s), but got ${count}`
|
|
5238
|
+
message: `Expected exactly ${exact} tool call(s), but got ${count}`,
|
|
5239
|
+
details: { actual: count, expected: exact }
|
|
5208
5240
|
};
|
|
5209
5241
|
}
|
|
5210
5242
|
if (min !== void 0 && count < min) {
|
|
5211
5243
|
return {
|
|
5212
5244
|
pass: false,
|
|
5213
|
-
message: `Expected at least ${min} tool call(s), but got ${count}`
|
|
5245
|
+
message: `Expected at least ${min} tool call(s), but got ${count}`,
|
|
5246
|
+
details: { actual: count, min }
|
|
5214
5247
|
};
|
|
5215
5248
|
}
|
|
5216
5249
|
if (max !== void 0 && count > max) {
|
|
5217
5250
|
return {
|
|
5218
5251
|
pass: false,
|
|
5219
|
-
message: `Expected at most ${max} tool call(s), but got ${count}`
|
|
5252
|
+
message: `Expected at most ${max} tool call(s), but got ${count}`,
|
|
5253
|
+
details: { actual: count, max }
|
|
5220
5254
|
};
|
|
5221
5255
|
}
|
|
5222
5256
|
return {
|
|
@@ -5730,7 +5764,9 @@ function createJudge(config = {}) {
|
|
|
5730
5764
|
case "google":
|
|
5731
5765
|
return createGoogleJudge(config);
|
|
5732
5766
|
default:
|
|
5733
|
-
throw new Error(
|
|
5767
|
+
throw new Error(
|
|
5768
|
+
`Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
|
|
5769
|
+
);
|
|
5734
5770
|
}
|
|
5735
5771
|
}
|
|
5736
5772
|
|
|
@@ -6055,12 +6091,19 @@ function toMatchToolResponse(received, expected) {
|
|
|
6055
6091
|
// src/assertions/matchers/toMatchToolSchema.ts
|
|
6056
6092
|
function toMatchToolSchema(received, schema, options = {}) {
|
|
6057
6093
|
const result = validateSchema(received, schema, options);
|
|
6094
|
+
const preview = result.details?.textPreview;
|
|
6058
6095
|
return {
|
|
6059
6096
|
pass: result.pass,
|
|
6060
6097
|
message: () => {
|
|
6061
6098
|
if (this.isNot) {
|
|
6062
6099
|
return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
|
|
6063
6100
|
}
|
|
6101
|
+
if (!result.pass && preview) {
|
|
6102
|
+
return `${result.message}
|
|
6103
|
+
|
|
6104
|
+
Actual response (truncated):
|
|
6105
|
+
${preview}`;
|
|
6106
|
+
}
|
|
6064
6107
|
return result.message;
|
|
6065
6108
|
}
|
|
6066
6109
|
};
|
|
@@ -6069,6 +6112,7 @@ function toMatchToolSchema(received, schema, options = {}) {
|
|
|
6069
6112
|
// src/assertions/matchers/toContainToolText.ts
|
|
6070
6113
|
function toContainToolText(received, expected, options = {}) {
|
|
6071
6114
|
const result = validateText(received, expected, options);
|
|
6115
|
+
const preview = result.details?.textPreview;
|
|
6072
6116
|
return {
|
|
6073
6117
|
pass: result.pass,
|
|
6074
6118
|
message: () => {
|
|
@@ -6076,6 +6120,12 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
6076
6120
|
const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
|
|
6077
6121
|
return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
|
|
6078
6122
|
}
|
|
6123
|
+
if (!result.pass && preview) {
|
|
6124
|
+
return `${result.message}
|
|
6125
|
+
|
|
6126
|
+
Actual response (truncated):
|
|
6127
|
+
${preview}`;
|
|
6128
|
+
}
|
|
6079
6129
|
return result.message;
|
|
6080
6130
|
}
|
|
6081
6131
|
};
|
|
@@ -6084,12 +6134,19 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
6084
6134
|
// src/assertions/matchers/toMatchToolPattern.ts
|
|
6085
6135
|
function toMatchToolPattern(received, patterns, options = {}) {
|
|
6086
6136
|
const result = validatePattern(received, patterns, options);
|
|
6137
|
+
const preview = result.details?.textPreview;
|
|
6087
6138
|
return {
|
|
6088
6139
|
pass: result.pass,
|
|
6089
6140
|
message: () => {
|
|
6090
6141
|
if (this.isNot) {
|
|
6091
6142
|
return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
|
|
6092
6143
|
}
|
|
6144
|
+
if (!result.pass && preview) {
|
|
6145
|
+
return `${result.message}
|
|
6146
|
+
|
|
6147
|
+
Actual response (truncated):
|
|
6148
|
+
${preview}`;
|
|
6149
|
+
}
|
|
6093
6150
|
return result.message;
|
|
6094
6151
|
}
|
|
6095
6152
|
};
|
|
@@ -6874,6 +6931,12 @@ function createVercelOrchestrator() {
|
|
|
6874
6931
|
});
|
|
6875
6932
|
const totalDurationMs = Date.now() - llmStart;
|
|
6876
6933
|
const llmDurationMs = totalDurationMs - mcpDurationMs;
|
|
6934
|
+
const hostUsage = result.usage ? {
|
|
6935
|
+
inputTokens: result.usage.promptTokens ?? 0,
|
|
6936
|
+
outputTokens: result.usage.completionTokens ?? 0,
|
|
6937
|
+
totalCostUsd: 0,
|
|
6938
|
+
durationMs: llmDurationMs
|
|
6939
|
+
} : void 0;
|
|
6877
6940
|
const conversationHistory = (result.steps ?? []).map((step) => ({
|
|
6878
6941
|
role: step.toolCalls?.length > 0 ? "tool" : "assistant",
|
|
6879
6942
|
content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
|
|
@@ -6885,7 +6948,8 @@ function createVercelOrchestrator() {
|
|
|
6885
6948
|
scenario,
|
|
6886
6949
|
llmDurationMs,
|
|
6887
6950
|
mcpDurationMs,
|
|
6888
|
-
conversationHistory
|
|
6951
|
+
conversationHistory,
|
|
6952
|
+
usage: hostUsage
|
|
6889
6953
|
};
|
|
6890
6954
|
} catch (err) {
|
|
6891
6955
|
return {
|
|
@@ -6903,6 +6967,7 @@ function parseStreamJson(stdout) {
|
|
|
6903
6967
|
const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
|
|
6904
6968
|
const toolCalls = [];
|
|
6905
6969
|
const textParts = [];
|
|
6970
|
+
let usage;
|
|
6906
6971
|
const conversationHistory = [];
|
|
6907
6972
|
for (const line of lines) {
|
|
6908
6973
|
let event;
|
|
@@ -6935,16 +7000,28 @@ function parseStreamJson(stdout) {
|
|
|
6935
7000
|
}
|
|
6936
7001
|
}
|
|
6937
7002
|
}
|
|
6938
|
-
if (event.type === "result"
|
|
6939
|
-
if (textParts.length === 0) {
|
|
7003
|
+
if (event.type === "result") {
|
|
7004
|
+
if (typeof event.result === "string" && textParts.length === 0) {
|
|
6940
7005
|
textParts.push(event.result);
|
|
6941
7006
|
}
|
|
7007
|
+
if (event.usage) {
|
|
7008
|
+
usage = {
|
|
7009
|
+
inputTokens: event.usage.input_tokens ?? 0,
|
|
7010
|
+
outputTokens: event.usage.output_tokens ?? 0,
|
|
7011
|
+
totalCostUsd: event.total_cost_usd ?? 0,
|
|
7012
|
+
durationMs: event.duration_ms ?? 0,
|
|
7013
|
+
durationApiMs: event.duration_api_ms,
|
|
7014
|
+
cacheReadInputTokens: event.usage.cache_read_input_tokens,
|
|
7015
|
+
cacheCreationInputTokens: event.usage.cache_creation_input_tokens
|
|
7016
|
+
};
|
|
7017
|
+
}
|
|
6942
7018
|
}
|
|
6943
7019
|
if (event.type === "result" && event.is_error === true) {
|
|
6944
7020
|
return {
|
|
6945
7021
|
success: false,
|
|
6946
7022
|
toolCalls,
|
|
6947
|
-
error: typeof event.result === "string" ? event.result : "CLI host reported an error"
|
|
7023
|
+
error: typeof event.result === "string" ? event.result : "CLI host reported an error",
|
|
7024
|
+
usage
|
|
6948
7025
|
};
|
|
6949
7026
|
}
|
|
6950
7027
|
}
|
|
@@ -6956,7 +7033,8 @@ function parseStreamJson(stdout) {
|
|
|
6956
7033
|
success: true,
|
|
6957
7034
|
toolCalls,
|
|
6958
7035
|
response: response || void 0,
|
|
6959
|
-
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
|
|
7036
|
+
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
|
|
7037
|
+
usage
|
|
6960
7038
|
};
|
|
6961
7039
|
}
|
|
6962
7040
|
function createJsonParser(paths) {
|
|
@@ -7221,6 +7299,32 @@ async function execFileNoThrow(file, args) {
|
|
|
7221
7299
|
}
|
|
7222
7300
|
}
|
|
7223
7301
|
|
|
7302
|
+
// src/utils/usageUtils.ts
|
|
7303
|
+
function optionalSum(a, b) {
|
|
7304
|
+
if (a === void 0 && b === void 0) return void 0;
|
|
7305
|
+
return (a ?? 0) + (b ?? 0);
|
|
7306
|
+
}
|
|
7307
|
+
function sumUsage(a, b) {
|
|
7308
|
+
if (!a && !b) return void 0;
|
|
7309
|
+
if (!a) return b ? { ...b } : void 0;
|
|
7310
|
+
if (!b) return { ...a };
|
|
7311
|
+
return {
|
|
7312
|
+
inputTokens: a.inputTokens + b.inputTokens,
|
|
7313
|
+
outputTokens: a.outputTokens + b.outputTokens,
|
|
7314
|
+
totalCostUsd: a.totalCostUsd + b.totalCostUsd,
|
|
7315
|
+
durationMs: a.durationMs + b.durationMs,
|
|
7316
|
+
durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
|
|
7317
|
+
cacheReadInputTokens: optionalSum(
|
|
7318
|
+
a.cacheReadInputTokens,
|
|
7319
|
+
b.cacheReadInputTokens
|
|
7320
|
+
),
|
|
7321
|
+
cacheCreationInputTokens: optionalSum(
|
|
7322
|
+
a.cacheCreationInputTokens,
|
|
7323
|
+
b.cacheCreationInputTokens
|
|
7324
|
+
)
|
|
7325
|
+
};
|
|
7326
|
+
}
|
|
7327
|
+
|
|
7224
7328
|
// src/evals/evalRunner.ts
|
|
7225
7329
|
async function executeToolCall(evalCase, mcp) {
|
|
7226
7330
|
const mode = evalCase.mode || "direct";
|
|
@@ -7466,6 +7570,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7466
7570
|
};
|
|
7467
7571
|
}
|
|
7468
7572
|
}
|
|
7573
|
+
const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
|
|
7469
7574
|
return {
|
|
7470
7575
|
id: evalCase.id,
|
|
7471
7576
|
datasetName: options.datasetName ?? "single-case",
|
|
@@ -7482,7 +7587,8 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7482
7587
|
tags: evalCase.tags,
|
|
7483
7588
|
toolPrecision,
|
|
7484
7589
|
toolRecall,
|
|
7485
|
-
mcpHostTrace
|
|
7590
|
+
mcpHostTrace,
|
|
7591
|
+
hostUsage
|
|
7486
7592
|
};
|
|
7487
7593
|
}
|
|
7488
7594
|
function isInfrastructureError(err) {
|
|
@@ -7498,7 +7604,7 @@ function isInfrastructureError(err) {
|
|
|
7498
7604
|
} else {
|
|
7499
7605
|
return false;
|
|
7500
7606
|
}
|
|
7501
|
-
return name15 === "
|
|
7607
|
+
return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
|
|
7502
7608
|
msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
7503
7609
|
}
|
|
7504
7610
|
async function runEvalCase(evalCase, context, options = {}) {
|
|
@@ -7518,7 +7624,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7518
7624
|
durationMs: result.durationMs,
|
|
7519
7625
|
error: result.error,
|
|
7520
7626
|
isInfrastructureError: infraError,
|
|
7521
|
-
mcpHostTrace: result.mcpHostTrace
|
|
7627
|
+
mcpHostTrace: result.mcpHostTrace,
|
|
7628
|
+
hostUsage: result.hostUsage
|
|
7522
7629
|
});
|
|
7523
7630
|
} catch (err) {
|
|
7524
7631
|
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
@@ -7551,6 +7658,10 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7551
7658
|
durationMs: 0,
|
|
7552
7659
|
tags: evalCase.tags
|
|
7553
7660
|
};
|
|
7661
|
+
const totalHostUsage = iterationResults.reduce(
|
|
7662
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7663
|
+
void 0
|
|
7664
|
+
);
|
|
7554
7665
|
return {
|
|
7555
7666
|
...baseResult,
|
|
7556
7667
|
pass: assertionPassRate >= threshold,
|
|
@@ -7559,7 +7670,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7559
7670
|
infrastructureErrorRate,
|
|
7560
7671
|
iterationResults,
|
|
7561
7672
|
infrastructureErrorCount: infraErrors.length,
|
|
7562
|
-
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
7673
|
+
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
|
|
7674
|
+
hostUsage: totalHostUsage
|
|
7563
7675
|
};
|
|
7564
7676
|
}
|
|
7565
7677
|
function wilsonCI(k, n) {
|
|
@@ -7669,13 +7781,18 @@ async function runEvalDataset(options, context) {
|
|
|
7669
7781
|
...mcpHostModel !== void 0 && { mcpHostModel },
|
|
7670
7782
|
...judgeModel !== void 0 && { judgeModel }
|
|
7671
7783
|
};
|
|
7784
|
+
const runHostUsage = caseResults.reduce(
|
|
7785
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7786
|
+
void 0
|
|
7787
|
+
);
|
|
7672
7788
|
const result = {
|
|
7673
7789
|
total,
|
|
7674
7790
|
passed,
|
|
7675
7791
|
failed: total - passed,
|
|
7676
7792
|
caseResults,
|
|
7677
7793
|
durationMs: Date.now() - startTime,
|
|
7678
|
-
metadata
|
|
7794
|
+
metadata,
|
|
7795
|
+
totalHostUsage: runHostUsage
|
|
7679
7796
|
};
|
|
7680
7797
|
if (baselineResultsFrom) {
|
|
7681
7798
|
try {
|
|
@@ -7969,6 +8086,6 @@ function formatCapabilities(capabilities) {
|
|
|
7969
8086
|
return parts.length > 0 ? parts.join(", ") : "none declared";
|
|
7970
8087
|
}
|
|
7971
8088
|
|
|
7972
|
-
export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
8089
|
+
export { BUILT_IN_RUBRICS, CLIOAuthClient, DiscoveryError, ENV_VAR_NAMES, EvalCaseSchema, EvalDatasetSchema, MCPConfigSchema, MCP_PROTOCOL_VERSION, PlaywrightOAuthClientProvider, SnapshotSanitizers, clearJudgeRegistry, closeMCPClient, createJudge, createMCPClientForConfig, createMCPFixture, createTokenAuthHeaders, discoverAuthorizationServer, discoverProtectedResource, expect, extractText, getMissingDependencyMessage, getRegisteredJudge, getResponseSizeBytes, hasValidTokens, injectTokens, isBuiltInRubric, isHttpConfig, isProviderAvailable, isStdioConfig, isTokenExpired, isTokenExpiringSoon, loadBaseline, loadEvalDataset, loadEvalDatasetFromObject, loadTokens, loadTokensFromEnv, test2 as mcpAuthTest, normalizeToolResponse, normalizeWhitespace, performClientCredentialsFlow, performOAuthSetup, performOAuthSetupIfNeeded, refreshAccessToken, registerJudge, resolveRubric, runConformanceChecks, runEvalCase, runEvalDataset, runServerComparison, saveBaseline, simulateMCPHost, test, validateAccessToken, validateError, validateEvalCase, validateEvalDataset, validateJudge, validateMCPConfig, validatePattern, validateResponse, validateSchema, validateSize, validateText, validateToolCallCount, validateToolCalls };
|
|
7973
8090
|
//# sourceMappingURL=index.js.map
|
|
7974
8091
|
//# sourceMappingURL=index.js.map
|