@gleanwork/mcp-server-tester 1.0.0-beta.8 → 1.0.1-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.js +71 -14
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +142 -24
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +118 -16
- package/dist/index.d.ts +118 -16
- package/dist/index.js +142 -25
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +34 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +90 -0
- package/dist/reporters/mcpReporter.d.ts +90 -0
- package/dist/reporters/mcpReporter.js +34 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -4411,7 +4411,7 @@ function escapeHtml(text) {
|
|
|
4411
4411
|
|
|
4412
4412
|
// package.json
|
|
4413
4413
|
var package_default = {
|
|
4414
|
-
version: "1.0.
|
|
4414
|
+
version: "1.0.1-beta.0"};
|
|
4415
4415
|
|
|
4416
4416
|
// src/mcp/clientFactory.ts
|
|
4417
4417
|
function getRetryAfterDelayMs(err) {
|
|
@@ -4630,6 +4630,17 @@ async function createMCPClientForConfig(config, options) {
|
|
|
4630
4630
|
}
|
|
4631
4631
|
async function closeMCPClient(client) {
|
|
4632
4632
|
try {
|
|
4633
|
+
const transport = client.transport;
|
|
4634
|
+
if (transport instanceof streamableHttp_js.StreamableHTTPClientTransport) {
|
|
4635
|
+
try {
|
|
4636
|
+
await transport.terminateSession();
|
|
4637
|
+
} catch (sessionError) {
|
|
4638
|
+
debugClient(
|
|
4639
|
+
"Error terminating session: %s",
|
|
4640
|
+
sessionError instanceof Error ? sessionError.message : String(sessionError)
|
|
4641
|
+
);
|
|
4642
|
+
}
|
|
4643
|
+
}
|
|
4633
4644
|
await client.close();
|
|
4634
4645
|
} catch (error) {
|
|
4635
4646
|
debugClient(
|
|
@@ -4858,11 +4869,13 @@ function validateSchema(response, schema, options = {}) {
|
|
|
4858
4869
|
} catch (error) {
|
|
4859
4870
|
const zodError = error;
|
|
4860
4871
|
const issues = formatZodIssues(zodError);
|
|
4872
|
+
const text = stringifyResponse(response);
|
|
4861
4873
|
return {
|
|
4862
4874
|
pass: false,
|
|
4863
4875
|
message: `Response does not match schema: ${issues}`,
|
|
4864
4876
|
details: {
|
|
4865
|
-
issues: zodError.issues
|
|
4877
|
+
issues: zodError.issues,
|
|
4878
|
+
textPreview: truncateForDisplay2(text)
|
|
4866
4879
|
}
|
|
4867
4880
|
};
|
|
4868
4881
|
}
|
|
@@ -4915,6 +4928,12 @@ function formatZodIssues(error) {
|
|
|
4915
4928
|
});
|
|
4916
4929
|
return issues.join("; ");
|
|
4917
4930
|
}
|
|
4931
|
+
function truncateForDisplay2(str, maxLength = 200) {
|
|
4932
|
+
if (str.length <= maxLength) {
|
|
4933
|
+
return str;
|
|
4934
|
+
}
|
|
4935
|
+
return str.slice(0, maxLength) + "... (truncated)";
|
|
4936
|
+
}
|
|
4918
4937
|
|
|
4919
4938
|
// src/assertions/validators/text.ts
|
|
4920
4939
|
function validateText(response, expected, options = {}) {
|
|
@@ -4941,11 +4960,11 @@ function validateText(response, expected, options = {}) {
|
|
|
4941
4960
|
details: {
|
|
4942
4961
|
missing,
|
|
4943
4962
|
textLength: text.length,
|
|
4944
|
-
textPreview:
|
|
4963
|
+
textPreview: truncateForDisplay3(text)
|
|
4945
4964
|
}
|
|
4946
4965
|
};
|
|
4947
4966
|
}
|
|
4948
|
-
function
|
|
4967
|
+
function truncateForDisplay3(str, maxLength = 200) {
|
|
4949
4968
|
if (str.length <= maxLength) {
|
|
4950
4969
|
return str;
|
|
4951
4970
|
}
|
|
@@ -4977,7 +4996,7 @@ function validatePattern(response, patterns, options = {}) {
|
|
|
4977
4996
|
details: {
|
|
4978
4997
|
unmatched,
|
|
4979
4998
|
textLength: text.length,
|
|
4980
|
-
textPreview:
|
|
4999
|
+
textPreview: truncateForDisplay4(text)
|
|
4981
5000
|
}
|
|
4982
5001
|
};
|
|
4983
5002
|
}
|
|
@@ -4997,7 +5016,7 @@ function patternToString(pattern) {
|
|
|
4997
5016
|
}
|
|
4998
5017
|
return `/${pattern}/`;
|
|
4999
5018
|
}
|
|
5000
|
-
function
|
|
5019
|
+
function truncateForDisplay4(str, maxLength = 200) {
|
|
5001
5020
|
if (str.length <= maxLength) {
|
|
5002
5021
|
return str;
|
|
5003
5022
|
}
|
|
@@ -5020,7 +5039,7 @@ function validateError(response, expected = true) {
|
|
|
5020
5039
|
pass: false,
|
|
5021
5040
|
message: "Expected an error response but got success",
|
|
5022
5041
|
details: {
|
|
5023
|
-
textPreview:
|
|
5042
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
5024
5043
|
}
|
|
5025
5044
|
};
|
|
5026
5045
|
} else {
|
|
@@ -5032,7 +5051,7 @@ function validateError(response, expected = true) {
|
|
|
5032
5051
|
}
|
|
5033
5052
|
return {
|
|
5034
5053
|
pass: false,
|
|
5035
|
-
message: `Expected a success response but got error: "${
|
|
5054
|
+
message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
|
|
5036
5055
|
details: {
|
|
5037
5056
|
errorMessage
|
|
5038
5057
|
}
|
|
@@ -5045,7 +5064,7 @@ function validateError(response, expected = true) {
|
|
|
5045
5064
|
pass: false,
|
|
5046
5065
|
message: `Expected an error containing "${expectedMessages[0]}" but got success`,
|
|
5047
5066
|
details: {
|
|
5048
|
-
textPreview:
|
|
5067
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
5049
5068
|
}
|
|
5050
5069
|
};
|
|
5051
5070
|
}
|
|
@@ -5067,7 +5086,7 @@ function validateError(response, expected = true) {
|
|
|
5067
5086
|
}
|
|
5068
5087
|
};
|
|
5069
5088
|
}
|
|
5070
|
-
function
|
|
5089
|
+
function truncateForDisplay5(str, maxLength = 200) {
|
|
5071
5090
|
if (str.length <= maxLength) {
|
|
5072
5091
|
return str;
|
|
5073
5092
|
}
|
|
@@ -5185,6 +5204,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5185
5204
|
return {
|
|
5186
5205
|
pass: false,
|
|
5187
5206
|
message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
|
|
5207
|
+
details: {
|
|
5208
|
+
actual: actual.map((c) => c.name),
|
|
5209
|
+
expected: expected.name
|
|
5210
|
+
},
|
|
5188
5211
|
metrics
|
|
5189
5212
|
};
|
|
5190
5213
|
}
|
|
@@ -5201,6 +5224,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5201
5224
|
return {
|
|
5202
5225
|
pass: false,
|
|
5203
5226
|
message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
|
|
5227
|
+
details: {
|
|
5228
|
+
actual: actual.map((c) => c.name),
|
|
5229
|
+
expected: expected.name
|
|
5230
|
+
},
|
|
5204
5231
|
metrics
|
|
5205
5232
|
};
|
|
5206
5233
|
}
|
|
@@ -5213,6 +5240,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5213
5240
|
return {
|
|
5214
5241
|
pass: false,
|
|
5215
5242
|
message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
|
|
5243
|
+
details: {
|
|
5244
|
+
actual: actual.map((c) => c.name),
|
|
5245
|
+
unexpected: unexpected.map((c) => c.name)
|
|
5246
|
+
},
|
|
5216
5247
|
metrics
|
|
5217
5248
|
};
|
|
5218
5249
|
}
|
|
@@ -5231,19 +5262,22 @@ function validateToolCallCount(response, options) {
|
|
|
5231
5262
|
if (exact !== void 0 && count !== exact) {
|
|
5232
5263
|
return {
|
|
5233
5264
|
pass: false,
|
|
5234
|
-
message: `Expected exactly ${exact} tool call(s), but got ${count}
|
|
5265
|
+
message: `Expected exactly ${exact} tool call(s), but got ${count}`,
|
|
5266
|
+
details: { actual: count, expected: exact }
|
|
5235
5267
|
};
|
|
5236
5268
|
}
|
|
5237
5269
|
if (min !== void 0 && count < min) {
|
|
5238
5270
|
return {
|
|
5239
5271
|
pass: false,
|
|
5240
|
-
message: `Expected at least ${min} tool call(s), but got ${count}
|
|
5272
|
+
message: `Expected at least ${min} tool call(s), but got ${count}`,
|
|
5273
|
+
details: { actual: count, min }
|
|
5241
5274
|
};
|
|
5242
5275
|
}
|
|
5243
5276
|
if (max !== void 0 && count > max) {
|
|
5244
5277
|
return {
|
|
5245
5278
|
pass: false,
|
|
5246
|
-
message: `Expected at most ${max} tool call(s), but got ${count}
|
|
5279
|
+
message: `Expected at most ${max} tool call(s), but got ${count}`,
|
|
5280
|
+
details: { actual: count, max }
|
|
5247
5281
|
};
|
|
5248
5282
|
}
|
|
5249
5283
|
return {
|
|
@@ -5757,7 +5791,9 @@ function createJudge(config = {}) {
|
|
|
5757
5791
|
case "google":
|
|
5758
5792
|
return createGoogleJudge(config);
|
|
5759
5793
|
default:
|
|
5760
|
-
throw new Error(
|
|
5794
|
+
throw new Error(
|
|
5795
|
+
`Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
|
|
5796
|
+
);
|
|
5761
5797
|
}
|
|
5762
5798
|
}
|
|
5763
5799
|
|
|
@@ -6082,12 +6118,19 @@ function toMatchToolResponse(received, expected) {
|
|
|
6082
6118
|
// src/assertions/matchers/toMatchToolSchema.ts
|
|
6083
6119
|
function toMatchToolSchema(received, schema, options = {}) {
|
|
6084
6120
|
const result = validateSchema(received, schema, options);
|
|
6121
|
+
const preview = result.details?.textPreview;
|
|
6085
6122
|
return {
|
|
6086
6123
|
pass: result.pass,
|
|
6087
6124
|
message: () => {
|
|
6088
6125
|
if (this.isNot) {
|
|
6089
6126
|
return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
|
|
6090
6127
|
}
|
|
6128
|
+
if (!result.pass && preview) {
|
|
6129
|
+
return `${result.message}
|
|
6130
|
+
|
|
6131
|
+
Actual response (truncated):
|
|
6132
|
+
${preview}`;
|
|
6133
|
+
}
|
|
6091
6134
|
return result.message;
|
|
6092
6135
|
}
|
|
6093
6136
|
};
|
|
@@ -6096,6 +6139,7 @@ function toMatchToolSchema(received, schema, options = {}) {
|
|
|
6096
6139
|
// src/assertions/matchers/toContainToolText.ts
|
|
6097
6140
|
function toContainToolText(received, expected, options = {}) {
|
|
6098
6141
|
const result = validateText(received, expected, options);
|
|
6142
|
+
const preview = result.details?.textPreview;
|
|
6099
6143
|
return {
|
|
6100
6144
|
pass: result.pass,
|
|
6101
6145
|
message: () => {
|
|
@@ -6103,6 +6147,12 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
6103
6147
|
const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
|
|
6104
6148
|
return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
|
|
6105
6149
|
}
|
|
6150
|
+
if (!result.pass && preview) {
|
|
6151
|
+
return `${result.message}
|
|
6152
|
+
|
|
6153
|
+
Actual response (truncated):
|
|
6154
|
+
${preview}`;
|
|
6155
|
+
}
|
|
6106
6156
|
return result.message;
|
|
6107
6157
|
}
|
|
6108
6158
|
};
|
|
@@ -6111,12 +6161,19 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
6111
6161
|
// src/assertions/matchers/toMatchToolPattern.ts
|
|
6112
6162
|
function toMatchToolPattern(received, patterns, options = {}) {
|
|
6113
6163
|
const result = validatePattern(received, patterns, options);
|
|
6164
|
+
const preview = result.details?.textPreview;
|
|
6114
6165
|
return {
|
|
6115
6166
|
pass: result.pass,
|
|
6116
6167
|
message: () => {
|
|
6117
6168
|
if (this.isNot) {
|
|
6118
6169
|
return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
|
|
6119
6170
|
}
|
|
6171
|
+
if (!result.pass && preview) {
|
|
6172
|
+
return `${result.message}
|
|
6173
|
+
|
|
6174
|
+
Actual response (truncated):
|
|
6175
|
+
${preview}`;
|
|
6176
|
+
}
|
|
6120
6177
|
return result.message;
|
|
6121
6178
|
}
|
|
6122
6179
|
};
|
|
@@ -6901,6 +6958,12 @@ function createVercelOrchestrator() {
|
|
|
6901
6958
|
});
|
|
6902
6959
|
const totalDurationMs = Date.now() - llmStart;
|
|
6903
6960
|
const llmDurationMs = totalDurationMs - mcpDurationMs;
|
|
6961
|
+
const hostUsage = result.usage ? {
|
|
6962
|
+
inputTokens: result.usage.promptTokens ?? 0,
|
|
6963
|
+
outputTokens: result.usage.completionTokens ?? 0,
|
|
6964
|
+
totalCostUsd: 0,
|
|
6965
|
+
durationMs: llmDurationMs
|
|
6966
|
+
} : void 0;
|
|
6904
6967
|
const conversationHistory = (result.steps ?? []).map((step) => ({
|
|
6905
6968
|
role: step.toolCalls?.length > 0 ? "tool" : "assistant",
|
|
6906
6969
|
content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
|
|
@@ -6912,7 +6975,8 @@ function createVercelOrchestrator() {
|
|
|
6912
6975
|
scenario,
|
|
6913
6976
|
llmDurationMs,
|
|
6914
6977
|
mcpDurationMs,
|
|
6915
|
-
conversationHistory
|
|
6978
|
+
conversationHistory,
|
|
6979
|
+
usage: hostUsage
|
|
6916
6980
|
};
|
|
6917
6981
|
} catch (err) {
|
|
6918
6982
|
return {
|
|
@@ -6930,6 +6994,7 @@ function parseStreamJson(stdout) {
|
|
|
6930
6994
|
const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
|
|
6931
6995
|
const toolCalls = [];
|
|
6932
6996
|
const textParts = [];
|
|
6997
|
+
let usage;
|
|
6933
6998
|
const conversationHistory = [];
|
|
6934
6999
|
for (const line of lines) {
|
|
6935
7000
|
let event;
|
|
@@ -6962,16 +7027,28 @@ function parseStreamJson(stdout) {
|
|
|
6962
7027
|
}
|
|
6963
7028
|
}
|
|
6964
7029
|
}
|
|
6965
|
-
if (event.type === "result"
|
|
6966
|
-
if (textParts.length === 0) {
|
|
7030
|
+
if (event.type === "result") {
|
|
7031
|
+
if (typeof event.result === "string" && textParts.length === 0) {
|
|
6967
7032
|
textParts.push(event.result);
|
|
6968
7033
|
}
|
|
7034
|
+
if (event.usage) {
|
|
7035
|
+
usage = {
|
|
7036
|
+
inputTokens: event.usage.input_tokens ?? 0,
|
|
7037
|
+
outputTokens: event.usage.output_tokens ?? 0,
|
|
7038
|
+
totalCostUsd: event.total_cost_usd ?? 0,
|
|
7039
|
+
durationMs: event.duration_ms ?? 0,
|
|
7040
|
+
durationApiMs: event.duration_api_ms,
|
|
7041
|
+
cacheReadInputTokens: event.usage.cache_read_input_tokens,
|
|
7042
|
+
cacheCreationInputTokens: event.usage.cache_creation_input_tokens
|
|
7043
|
+
};
|
|
7044
|
+
}
|
|
6969
7045
|
}
|
|
6970
7046
|
if (event.type === "result" && event.is_error === true) {
|
|
6971
7047
|
return {
|
|
6972
7048
|
success: false,
|
|
6973
7049
|
toolCalls,
|
|
6974
|
-
error: typeof event.result === "string" ? event.result : "CLI host reported an error"
|
|
7050
|
+
error: typeof event.result === "string" ? event.result : "CLI host reported an error",
|
|
7051
|
+
usage
|
|
6975
7052
|
};
|
|
6976
7053
|
}
|
|
6977
7054
|
}
|
|
@@ -6983,7 +7060,8 @@ function parseStreamJson(stdout) {
|
|
|
6983
7060
|
success: true,
|
|
6984
7061
|
toolCalls,
|
|
6985
7062
|
response: response || void 0,
|
|
6986
|
-
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
|
|
7063
|
+
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
|
|
7064
|
+
usage
|
|
6987
7065
|
};
|
|
6988
7066
|
}
|
|
6989
7067
|
function createJsonParser(paths) {
|
|
@@ -7248,6 +7326,32 @@ async function execFileNoThrow(file, args) {
|
|
|
7248
7326
|
}
|
|
7249
7327
|
}
|
|
7250
7328
|
|
|
7329
|
+
// src/utils/usageUtils.ts
|
|
7330
|
+
function optionalSum(a, b) {
|
|
7331
|
+
if (a === void 0 && b === void 0) return void 0;
|
|
7332
|
+
return (a ?? 0) + (b ?? 0);
|
|
7333
|
+
}
|
|
7334
|
+
function sumUsage(a, b) {
|
|
7335
|
+
if (!a && !b) return void 0;
|
|
7336
|
+
if (!a) return b ? { ...b } : void 0;
|
|
7337
|
+
if (!b) return { ...a };
|
|
7338
|
+
return {
|
|
7339
|
+
inputTokens: a.inputTokens + b.inputTokens,
|
|
7340
|
+
outputTokens: a.outputTokens + b.outputTokens,
|
|
7341
|
+
totalCostUsd: a.totalCostUsd + b.totalCostUsd,
|
|
7342
|
+
durationMs: a.durationMs + b.durationMs,
|
|
7343
|
+
durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
|
|
7344
|
+
cacheReadInputTokens: optionalSum(
|
|
7345
|
+
a.cacheReadInputTokens,
|
|
7346
|
+
b.cacheReadInputTokens
|
|
7347
|
+
),
|
|
7348
|
+
cacheCreationInputTokens: optionalSum(
|
|
7349
|
+
a.cacheCreationInputTokens,
|
|
7350
|
+
b.cacheCreationInputTokens
|
|
7351
|
+
)
|
|
7352
|
+
};
|
|
7353
|
+
}
|
|
7354
|
+
|
|
7251
7355
|
// src/evals/evalRunner.ts
|
|
7252
7356
|
async function executeToolCall(evalCase, mcp) {
|
|
7253
7357
|
const mode = evalCase.mode || "direct";
|
|
@@ -7493,6 +7597,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7493
7597
|
};
|
|
7494
7598
|
}
|
|
7495
7599
|
}
|
|
7600
|
+
const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
|
|
7496
7601
|
return {
|
|
7497
7602
|
id: evalCase.id,
|
|
7498
7603
|
datasetName: options.datasetName ?? "single-case",
|
|
@@ -7509,7 +7614,8 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7509
7614
|
tags: evalCase.tags,
|
|
7510
7615
|
toolPrecision,
|
|
7511
7616
|
toolRecall,
|
|
7512
|
-
mcpHostTrace
|
|
7617
|
+
mcpHostTrace,
|
|
7618
|
+
hostUsage
|
|
7513
7619
|
};
|
|
7514
7620
|
}
|
|
7515
7621
|
function isInfrastructureError(err) {
|
|
@@ -7525,7 +7631,7 @@ function isInfrastructureError(err) {
|
|
|
7525
7631
|
} else {
|
|
7526
7632
|
return false;
|
|
7527
7633
|
}
|
|
7528
|
-
return name15 === "
|
|
7634
|
+
return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
|
|
7529
7635
|
msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
7530
7636
|
}
|
|
7531
7637
|
async function runEvalCase(evalCase, context, options = {}) {
|
|
@@ -7545,7 +7651,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7545
7651
|
durationMs: result.durationMs,
|
|
7546
7652
|
error: result.error,
|
|
7547
7653
|
isInfrastructureError: infraError,
|
|
7548
|
-
mcpHostTrace: result.mcpHostTrace
|
|
7654
|
+
mcpHostTrace: result.mcpHostTrace,
|
|
7655
|
+
hostUsage: result.hostUsage
|
|
7549
7656
|
});
|
|
7550
7657
|
} catch (err) {
|
|
7551
7658
|
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
@@ -7578,6 +7685,10 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7578
7685
|
durationMs: 0,
|
|
7579
7686
|
tags: evalCase.tags
|
|
7580
7687
|
};
|
|
7688
|
+
const totalHostUsage = iterationResults.reduce(
|
|
7689
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7690
|
+
void 0
|
|
7691
|
+
);
|
|
7581
7692
|
return {
|
|
7582
7693
|
...baseResult,
|
|
7583
7694
|
pass: assertionPassRate >= threshold,
|
|
@@ -7586,7 +7697,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7586
7697
|
infrastructureErrorRate,
|
|
7587
7698
|
iterationResults,
|
|
7588
7699
|
infrastructureErrorCount: infraErrors.length,
|
|
7589
|
-
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
7700
|
+
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
|
|
7701
|
+
hostUsage: totalHostUsage
|
|
7590
7702
|
};
|
|
7591
7703
|
}
|
|
7592
7704
|
function wilsonCI(k, n) {
|
|
@@ -7696,13 +7808,18 @@ async function runEvalDataset(options, context) {
|
|
|
7696
7808
|
...mcpHostModel !== void 0 && { mcpHostModel },
|
|
7697
7809
|
...judgeModel !== void 0 && { judgeModel }
|
|
7698
7810
|
};
|
|
7811
|
+
const runHostUsage = caseResults.reduce(
|
|
7812
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7813
|
+
void 0
|
|
7814
|
+
);
|
|
7699
7815
|
const result = {
|
|
7700
7816
|
total,
|
|
7701
7817
|
passed,
|
|
7702
7818
|
failed: total - passed,
|
|
7703
7819
|
caseResults,
|
|
7704
7820
|
durationMs: Date.now() - startTime,
|
|
7705
|
-
metadata
|
|
7821
|
+
metadata,
|
|
7822
|
+
totalHostUsage: runHostUsage
|
|
7706
7823
|
};
|
|
7707
7824
|
if (baselineResultsFrom) {
|
|
7708
7825
|
try {
|
|
@@ -8037,6 +8154,7 @@ exports.normalizeWhitespace = normalizeWhitespace;
|
|
|
8037
8154
|
exports.performClientCredentialsFlow = performClientCredentialsFlow;
|
|
8038
8155
|
exports.performOAuthSetup = performOAuthSetup;
|
|
8039
8156
|
exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
|
|
8157
|
+
exports.refreshAccessToken = refreshAccessToken;
|
|
8040
8158
|
exports.registerJudge = registerJudge;
|
|
8041
8159
|
exports.resolveRubric = resolveRubric;
|
|
8042
8160
|
exports.runConformanceChecks = runConformanceChecks;
|