@gleanwork/mcp-server-tester 1.0.0-beta.8 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/cli/index.js +12 -1
- package/dist/fixtures/mcp.js +71 -14
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +124 -20
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +118 -16
- package/dist/index.d.ts +118 -16
- package/dist/index.js +124 -21
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +34 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +90 -0
- package/dist/reporters/mcpReporter.d.ts +90 -0
- package/dist/reporters/mcpReporter.js +34 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -4411,7 +4411,7 @@ function escapeHtml(text) {
|
|
|
4411
4411
|
|
|
4412
4412
|
// package.json
|
|
4413
4413
|
var package_default = {
|
|
4414
|
-
version: "1.0.0-beta.8"};
|
|
4414
|
+
version: "1.0.1"};
|
|
4415
4415
|
|
|
4416
4416
|
// src/mcp/clientFactory.ts
|
|
4417
4417
|
function getRetryAfterDelayMs(err) {
|
|
@@ -4630,6 +4630,17 @@ async function createMCPClientForConfig(config, options) {
|
|
|
4630
4630
|
}
|
|
4631
4631
|
async function closeMCPClient(client) {
|
|
4632
4632
|
try {
|
|
4633
|
+
const transport = client.transport;
|
|
4634
|
+
if (transport instanceof streamableHttp_js.StreamableHTTPClientTransport) {
|
|
4635
|
+
try {
|
|
4636
|
+
await transport.terminateSession();
|
|
4637
|
+
} catch (sessionError) {
|
|
4638
|
+
debugClient(
|
|
4639
|
+
"Error terminating session: %s",
|
|
4640
|
+
sessionError instanceof Error ? sessionError.message : String(sessionError)
|
|
4641
|
+
);
|
|
4642
|
+
}
|
|
4643
|
+
}
|
|
4633
4644
|
await client.close();
|
|
4634
4645
|
} catch (error) {
|
|
4635
4646
|
debugClient(
|
|
@@ -4858,11 +4869,13 @@ function validateSchema(response, schema, options = {}) {
|
|
|
4858
4869
|
} catch (error) {
|
|
4859
4870
|
const zodError = error;
|
|
4860
4871
|
const issues = formatZodIssues(zodError);
|
|
4872
|
+
const text = stringifyResponse(response);
|
|
4861
4873
|
return {
|
|
4862
4874
|
pass: false,
|
|
4863
4875
|
message: `Response does not match schema: ${issues}`,
|
|
4864
4876
|
details: {
|
|
4865
|
-
issues: zodError.issues
|
|
4877
|
+
issues: zodError.issues,
|
|
4878
|
+
textPreview: truncateForDisplay2(text)
|
|
4866
4879
|
}
|
|
4867
4880
|
};
|
|
4868
4881
|
}
|
|
@@ -4915,6 +4928,12 @@ function formatZodIssues(error) {
|
|
|
4915
4928
|
});
|
|
4916
4929
|
return issues.join("; ");
|
|
4917
4930
|
}
|
|
4931
|
+
function truncateForDisplay2(str, maxLength = 200) {
|
|
4932
|
+
if (str.length <= maxLength) {
|
|
4933
|
+
return str;
|
|
4934
|
+
}
|
|
4935
|
+
return str.slice(0, maxLength) + "... (truncated)";
|
|
4936
|
+
}
|
|
4918
4937
|
|
|
4919
4938
|
// src/assertions/validators/text.ts
|
|
4920
4939
|
function validateText(response, expected, options = {}) {
|
|
@@ -4941,11 +4960,11 @@ function validateText(response, expected, options = {}) {
|
|
|
4941
4960
|
details: {
|
|
4942
4961
|
missing,
|
|
4943
4962
|
textLength: text.length,
|
|
4944
|
-
textPreview:
|
|
4963
|
+
textPreview: truncateForDisplay3(text)
|
|
4945
4964
|
}
|
|
4946
4965
|
};
|
|
4947
4966
|
}
|
|
4948
|
-
function
|
|
4967
|
+
function truncateForDisplay3(str, maxLength = 200) {
|
|
4949
4968
|
if (str.length <= maxLength) {
|
|
4950
4969
|
return str;
|
|
4951
4970
|
}
|
|
@@ -4977,7 +4996,7 @@ function validatePattern(response, patterns, options = {}) {
|
|
|
4977
4996
|
details: {
|
|
4978
4997
|
unmatched,
|
|
4979
4998
|
textLength: text.length,
|
|
4980
|
-
textPreview:
|
|
4999
|
+
textPreview: truncateForDisplay4(text)
|
|
4981
5000
|
}
|
|
4982
5001
|
};
|
|
4983
5002
|
}
|
|
@@ -4997,7 +5016,7 @@ function patternToString(pattern) {
|
|
|
4997
5016
|
}
|
|
4998
5017
|
return `/${pattern}/`;
|
|
4999
5018
|
}
|
|
5000
|
-
function
|
|
5019
|
+
function truncateForDisplay4(str, maxLength = 200) {
|
|
5001
5020
|
if (str.length <= maxLength) {
|
|
5002
5021
|
return str;
|
|
5003
5022
|
}
|
|
@@ -5020,7 +5039,7 @@ function validateError(response, expected = true) {
|
|
|
5020
5039
|
pass: false,
|
|
5021
5040
|
message: "Expected an error response but got success",
|
|
5022
5041
|
details: {
|
|
5023
|
-
textPreview:
|
|
5042
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
5024
5043
|
}
|
|
5025
5044
|
};
|
|
5026
5045
|
} else {
|
|
@@ -5032,7 +5051,7 @@ function validateError(response, expected = true) {
|
|
|
5032
5051
|
}
|
|
5033
5052
|
return {
|
|
5034
5053
|
pass: false,
|
|
5035
|
-
message: `Expected a success response but got error: "${
|
|
5054
|
+
message: `Expected a success response but got error: "${truncateForDisplay5(errorMessage)}"`,
|
|
5036
5055
|
details: {
|
|
5037
5056
|
errorMessage
|
|
5038
5057
|
}
|
|
@@ -5045,7 +5064,7 @@ function validateError(response, expected = true) {
|
|
|
5045
5064
|
pass: false,
|
|
5046
5065
|
message: `Expected an error containing "${expectedMessages[0]}" but got success`,
|
|
5047
5066
|
details: {
|
|
5048
|
-
textPreview:
|
|
5067
|
+
textPreview: truncateForDisplay5(extractText2(response))
|
|
5049
5068
|
}
|
|
5050
5069
|
};
|
|
5051
5070
|
}
|
|
@@ -5067,7 +5086,7 @@ function validateError(response, expected = true) {
|
|
|
5067
5086
|
}
|
|
5068
5087
|
};
|
|
5069
5088
|
}
|
|
5070
|
-
function
|
|
5089
|
+
function truncateForDisplay5(str, maxLength = 200) {
|
|
5071
5090
|
if (str.length <= maxLength) {
|
|
5072
5091
|
return str;
|
|
5073
5092
|
}
|
|
@@ -5185,6 +5204,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5185
5204
|
return {
|
|
5186
5205
|
pass: false,
|
|
5187
5206
|
message: `Expected tool '${expected.name}' to be called in sequence (starting from position ${searchFrom}), but it was not found`,
|
|
5207
|
+
details: {
|
|
5208
|
+
actual: actual.map((c) => c.name),
|
|
5209
|
+
expected: expected.name
|
|
5210
|
+
},
|
|
5188
5211
|
metrics
|
|
5189
5212
|
};
|
|
5190
5213
|
}
|
|
@@ -5201,6 +5224,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5201
5224
|
return {
|
|
5202
5225
|
pass: false,
|
|
5203
5226
|
message: `Expected tool '${expected.name}'${argsNote} to be called, but it was not`,
|
|
5227
|
+
details: {
|
|
5228
|
+
actual: actual.map((c) => c.name),
|
|
5229
|
+
expected: expected.name
|
|
5230
|
+
},
|
|
5204
5231
|
metrics
|
|
5205
5232
|
};
|
|
5206
5233
|
}
|
|
@@ -5213,6 +5240,10 @@ function validateToolCalls(response, expectation) {
|
|
|
5213
5240
|
return {
|
|
5214
5241
|
pass: false,
|
|
5215
5242
|
message: `Unexpected tool calls: ${names}. Only ${[...allowedNames].map((n) => `'${n}'`).join(", ")} are allowed`,
|
|
5243
|
+
details: {
|
|
5244
|
+
actual: actual.map((c) => c.name),
|
|
5245
|
+
unexpected: unexpected.map((c) => c.name)
|
|
5246
|
+
},
|
|
5216
5247
|
metrics
|
|
5217
5248
|
};
|
|
5218
5249
|
}
|
|
@@ -5231,19 +5262,22 @@ function validateToolCallCount(response, options) {
|
|
|
5231
5262
|
if (exact !== void 0 && count !== exact) {
|
|
5232
5263
|
return {
|
|
5233
5264
|
pass: false,
|
|
5234
|
-
message: `Expected exactly ${exact} tool call(s), but got ${count}`
|
|
5265
|
+
message: `Expected exactly ${exact} tool call(s), but got ${count}`,
|
|
5266
|
+
details: { actual: count, expected: exact }
|
|
5235
5267
|
};
|
|
5236
5268
|
}
|
|
5237
5269
|
if (min !== void 0 && count < min) {
|
|
5238
5270
|
return {
|
|
5239
5271
|
pass: false,
|
|
5240
|
-
message: `Expected at least ${min} tool call(s), but got ${count}`
|
|
5272
|
+
message: `Expected at least ${min} tool call(s), but got ${count}`,
|
|
5273
|
+
details: { actual: count, min }
|
|
5241
5274
|
};
|
|
5242
5275
|
}
|
|
5243
5276
|
if (max !== void 0 && count > max) {
|
|
5244
5277
|
return {
|
|
5245
5278
|
pass: false,
|
|
5246
|
-
message: `Expected at most ${max} tool call(s), but got ${count}`
|
|
5279
|
+
message: `Expected at most ${max} tool call(s), but got ${count}`,
|
|
5280
|
+
details: { actual: count, max }
|
|
5247
5281
|
};
|
|
5248
5282
|
}
|
|
5249
5283
|
return {
|
|
@@ -5757,7 +5791,9 @@ function createJudge(config = {}) {
|
|
|
5757
5791
|
case "google":
|
|
5758
5792
|
return createGoogleJudge(config);
|
|
5759
5793
|
default:
|
|
5760
|
-
throw new Error(
|
|
5794
|
+
throw new Error(
|
|
5795
|
+
`Unsupported LLM provider: ${String(provider)}. Valid providers: 'anthropic', 'vertex-anthropic', 'anthropic-agent-sdk', 'openai', 'google'`
|
|
5796
|
+
);
|
|
5761
5797
|
}
|
|
5762
5798
|
}
|
|
5763
5799
|
|
|
@@ -6082,12 +6118,19 @@ function toMatchToolResponse(received, expected) {
|
|
|
6082
6118
|
// src/assertions/matchers/toMatchToolSchema.ts
|
|
6083
6119
|
function toMatchToolSchema(received, schema, options = {}) {
|
|
6084
6120
|
const result = validateSchema(received, schema, options);
|
|
6121
|
+
const preview = result.details?.textPreview;
|
|
6085
6122
|
return {
|
|
6086
6123
|
pass: result.pass,
|
|
6087
6124
|
message: () => {
|
|
6088
6125
|
if (this.isNot) {
|
|
6089
6126
|
return result.pass ? "Expected response NOT to match schema, but it did" : result.message;
|
|
6090
6127
|
}
|
|
6128
|
+
if (!result.pass && preview) {
|
|
6129
|
+
return `${result.message}
|
|
6130
|
+
|
|
6131
|
+
Actual response (truncated):
|
|
6132
|
+
${preview}`;
|
|
6133
|
+
}
|
|
6091
6134
|
return result.message;
|
|
6092
6135
|
}
|
|
6093
6136
|
};
|
|
@@ -6096,6 +6139,7 @@ function toMatchToolSchema(received, schema, options = {}) {
|
|
|
6096
6139
|
// src/assertions/matchers/toContainToolText.ts
|
|
6097
6140
|
function toContainToolText(received, expected, options = {}) {
|
|
6098
6141
|
const result = validateText(received, expected, options);
|
|
6142
|
+
const preview = result.details?.textPreview;
|
|
6099
6143
|
return {
|
|
6100
6144
|
pass: result.pass,
|
|
6101
6145
|
message: () => {
|
|
@@ -6103,6 +6147,12 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
6103
6147
|
const expectedStr = Array.isArray(expected) ? expected.map((s) => `"${s}"`).join(", ") : `"${expected}"`;
|
|
6104
6148
|
return result.pass ? `Expected response NOT to contain ${expectedStr}, but it did` : result.message;
|
|
6105
6149
|
}
|
|
6150
|
+
if (!result.pass && preview) {
|
|
6151
|
+
return `${result.message}
|
|
6152
|
+
|
|
6153
|
+
Actual response (truncated):
|
|
6154
|
+
${preview}`;
|
|
6155
|
+
}
|
|
6106
6156
|
return result.message;
|
|
6107
6157
|
}
|
|
6108
6158
|
};
|
|
@@ -6111,12 +6161,19 @@ function toContainToolText(received, expected, options = {}) {
|
|
|
6111
6161
|
// src/assertions/matchers/toMatchToolPattern.ts
|
|
6112
6162
|
function toMatchToolPattern(received, patterns, options = {}) {
|
|
6113
6163
|
const result = validatePattern(received, patterns, options);
|
|
6164
|
+
const preview = result.details?.textPreview;
|
|
6114
6165
|
return {
|
|
6115
6166
|
pass: result.pass,
|
|
6116
6167
|
message: () => {
|
|
6117
6168
|
if (this.isNot) {
|
|
6118
6169
|
return result.pass ? "Expected response NOT to match pattern(s), but it did" : result.message;
|
|
6119
6170
|
}
|
|
6171
|
+
if (!result.pass && preview) {
|
|
6172
|
+
return `${result.message}
|
|
6173
|
+
|
|
6174
|
+
Actual response (truncated):
|
|
6175
|
+
${preview}`;
|
|
6176
|
+
}
|
|
6120
6177
|
return result.message;
|
|
6121
6178
|
}
|
|
6122
6179
|
};
|
|
@@ -6901,6 +6958,12 @@ function createVercelOrchestrator() {
|
|
|
6901
6958
|
});
|
|
6902
6959
|
const totalDurationMs = Date.now() - llmStart;
|
|
6903
6960
|
const llmDurationMs = totalDurationMs - mcpDurationMs;
|
|
6961
|
+
const hostUsage = result.usage ? {
|
|
6962
|
+
inputTokens: result.usage.promptTokens ?? 0,
|
|
6963
|
+
outputTokens: result.usage.completionTokens ?? 0,
|
|
6964
|
+
totalCostUsd: 0,
|
|
6965
|
+
durationMs: llmDurationMs
|
|
6966
|
+
} : void 0;
|
|
6904
6967
|
const conversationHistory = (result.steps ?? []).map((step) => ({
|
|
6905
6968
|
role: step.toolCalls?.length > 0 ? "tool" : "assistant",
|
|
6906
6969
|
content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
|
|
@@ -6912,7 +6975,8 @@ function createVercelOrchestrator() {
|
|
|
6912
6975
|
scenario,
|
|
6913
6976
|
llmDurationMs,
|
|
6914
6977
|
mcpDurationMs,
|
|
6915
|
-
conversationHistory
|
|
6978
|
+
conversationHistory,
|
|
6979
|
+
usage: hostUsage
|
|
6916
6980
|
};
|
|
6917
6981
|
} catch (err) {
|
|
6918
6982
|
return {
|
|
@@ -7248,6 +7312,32 @@ async function execFileNoThrow(file, args) {
|
|
|
7248
7312
|
}
|
|
7249
7313
|
}
|
|
7250
7314
|
|
|
7315
|
+
// src/utils/usageUtils.ts
|
|
7316
|
+
function optionalSum(a, b) {
|
|
7317
|
+
if (a === void 0 && b === void 0) return void 0;
|
|
7318
|
+
return (a ?? 0) + (b ?? 0);
|
|
7319
|
+
}
|
|
7320
|
+
function sumUsage(a, b) {
|
|
7321
|
+
if (!a && !b) return void 0;
|
|
7322
|
+
if (!a) return b ? { ...b } : void 0;
|
|
7323
|
+
if (!b) return { ...a };
|
|
7324
|
+
return {
|
|
7325
|
+
inputTokens: a.inputTokens + b.inputTokens,
|
|
7326
|
+
outputTokens: a.outputTokens + b.outputTokens,
|
|
7327
|
+
totalCostUsd: a.totalCostUsd + b.totalCostUsd,
|
|
7328
|
+
durationMs: a.durationMs + b.durationMs,
|
|
7329
|
+
durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
|
|
7330
|
+
cacheReadInputTokens: optionalSum(
|
|
7331
|
+
a.cacheReadInputTokens,
|
|
7332
|
+
b.cacheReadInputTokens
|
|
7333
|
+
),
|
|
7334
|
+
cacheCreationInputTokens: optionalSum(
|
|
7335
|
+
a.cacheCreationInputTokens,
|
|
7336
|
+
b.cacheCreationInputTokens
|
|
7337
|
+
)
|
|
7338
|
+
};
|
|
7339
|
+
}
|
|
7340
|
+
|
|
7251
7341
|
// src/evals/evalRunner.ts
|
|
7252
7342
|
async function executeToolCall(evalCase, mcp) {
|
|
7253
7343
|
const mode = evalCase.mode || "direct";
|
|
@@ -7493,6 +7583,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7493
7583
|
};
|
|
7494
7584
|
}
|
|
7495
7585
|
}
|
|
7586
|
+
const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
|
|
7496
7587
|
return {
|
|
7497
7588
|
id: evalCase.id,
|
|
7498
7589
|
datasetName: options.datasetName ?? "single-case",
|
|
@@ -7509,7 +7600,8 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7509
7600
|
tags: evalCase.tags,
|
|
7510
7601
|
toolPrecision,
|
|
7511
7602
|
toolRecall,
|
|
7512
|
-
mcpHostTrace
|
|
7603
|
+
mcpHostTrace,
|
|
7604
|
+
hostUsage
|
|
7513
7605
|
};
|
|
7514
7606
|
}
|
|
7515
7607
|
function isInfrastructureError(err) {
|
|
@@ -7525,7 +7617,7 @@ function isInfrastructureError(err) {
|
|
|
7525
7617
|
} else {
|
|
7526
7618
|
return false;
|
|
7527
7619
|
}
|
|
7528
|
-
return name15 === "
|
|
7620
|
+
return name15?.toLowerCase() === "aborterror" || msg.includes("econnreset") || msg.includes("etimedout") || msg.includes("econnrefused") || msg.includes("rate limit") || msg.includes("429") || msg.includes("503") || msg.includes("network") || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
|
|
7529
7621
|
msg.includes("prompt is too long") || msg.includes("context length exceeded") || msg.includes("maximum context length") || msg.includes("context_length_exceeded") || msg.includes("tokens > ") || code.includes("econnreset") || code.includes("etimedout") || code.includes("econnrefused");
|
|
7530
7622
|
}
|
|
7531
7623
|
async function runEvalCase(evalCase, context, options = {}) {
|
|
@@ -7545,7 +7637,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7545
7637
|
durationMs: result.durationMs,
|
|
7546
7638
|
error: result.error,
|
|
7547
7639
|
isInfrastructureError: infraError,
|
|
7548
|
-
mcpHostTrace: result.mcpHostTrace
|
|
7640
|
+
mcpHostTrace: result.mcpHostTrace,
|
|
7641
|
+
hostUsage: result.hostUsage
|
|
7549
7642
|
});
|
|
7550
7643
|
} catch (err) {
|
|
7551
7644
|
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
@@ -7578,6 +7671,10 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7578
7671
|
durationMs: 0,
|
|
7579
7672
|
tags: evalCase.tags
|
|
7580
7673
|
};
|
|
7674
|
+
const totalHostUsage = iterationResults.reduce(
|
|
7675
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7676
|
+
void 0
|
|
7677
|
+
);
|
|
7581
7678
|
return {
|
|
7582
7679
|
...baseResult,
|
|
7583
7680
|
pass: assertionPassRate >= threshold,
|
|
@@ -7586,7 +7683,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7586
7683
|
infrastructureErrorRate,
|
|
7587
7684
|
iterationResults,
|
|
7588
7685
|
infrastructureErrorCount: infraErrors.length,
|
|
7589
|
-
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
7686
|
+
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
|
|
7687
|
+
hostUsage: totalHostUsage
|
|
7590
7688
|
};
|
|
7591
7689
|
}
|
|
7592
7690
|
function wilsonCI(k, n) {
|
|
@@ -7696,13 +7794,18 @@ async function runEvalDataset(options, context) {
|
|
|
7696
7794
|
...mcpHostModel !== void 0 && { mcpHostModel },
|
|
7697
7795
|
...judgeModel !== void 0 && { judgeModel }
|
|
7698
7796
|
};
|
|
7797
|
+
const runHostUsage = caseResults.reduce(
|
|
7798
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7799
|
+
void 0
|
|
7800
|
+
);
|
|
7699
7801
|
const result = {
|
|
7700
7802
|
total,
|
|
7701
7803
|
passed,
|
|
7702
7804
|
failed: total - passed,
|
|
7703
7805
|
caseResults,
|
|
7704
7806
|
durationMs: Date.now() - startTime,
|
|
7705
|
-
metadata
|
|
7807
|
+
metadata,
|
|
7808
|
+
totalHostUsage: runHostUsage
|
|
7706
7809
|
};
|
|
7707
7810
|
if (baselineResultsFrom) {
|
|
7708
7811
|
try {
|
|
@@ -8037,6 +8140,7 @@ exports.normalizeWhitespace = normalizeWhitespace;
|
|
|
8037
8140
|
exports.performClientCredentialsFlow = performClientCredentialsFlow;
|
|
8038
8141
|
exports.performOAuthSetup = performOAuthSetup;
|
|
8039
8142
|
exports.performOAuthSetupIfNeeded = performOAuthSetupIfNeeded;
|
|
8143
|
+
exports.refreshAccessToken = refreshAccessToken;
|
|
8040
8144
|
exports.registerJudge = registerJudge;
|
|
8041
8145
|
exports.resolveRubric = resolveRubric;
|
|
8042
8146
|
exports.runConformanceChecks = runConformanceChecks;
|