@gleanwork/mcp-server-tester 1.0.0 → 1.0.1-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1 -1
- package/dist/fixtures/mcp.js +1 -1
- package/dist/fixtures/mcp.js.map +1 -1
- package/dist/index.cjs +70 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +88 -14
- package/dist/index.d.ts +88 -14
- package/dist/index.js +70 -10
- package/dist/index.js.map +1 -1
- package/dist/reporters/mcpReporter.cjs +34 -1
- package/dist/reporters/mcpReporter.cjs.map +1 -1
- package/dist/reporters/mcpReporter.d.cts +90 -0
- package/dist/reporters/mcpReporter.d.ts +90 -0
- package/dist/reporters/mcpReporter.js +34 -1
- package/dist/reporters/mcpReporter.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -4411,7 +4411,7 @@ function escapeHtml(text) {
|
|
|
4411
4411
|
|
|
4412
4412
|
// package.json
|
|
4413
4413
|
var package_default = {
|
|
4414
|
-
version: "1.0.0"};
|
|
4414
|
+
version: "1.0.1-beta.0"};
|
|
4415
4415
|
|
|
4416
4416
|
// src/mcp/clientFactory.ts
|
|
4417
4417
|
function getRetryAfterDelayMs(err) {
|
|
@@ -6958,6 +6958,12 @@ function createVercelOrchestrator() {
|
|
|
6958
6958
|
});
|
|
6959
6959
|
const totalDurationMs = Date.now() - llmStart;
|
|
6960
6960
|
const llmDurationMs = totalDurationMs - mcpDurationMs;
|
|
6961
|
+
const hostUsage = result.usage ? {
|
|
6962
|
+
inputTokens: result.usage.promptTokens ?? 0,
|
|
6963
|
+
outputTokens: result.usage.completionTokens ?? 0,
|
|
6964
|
+
totalCostUsd: 0,
|
|
6965
|
+
durationMs: llmDurationMs
|
|
6966
|
+
} : void 0;
|
|
6961
6967
|
const conversationHistory = (result.steps ?? []).map((step) => ({
|
|
6962
6968
|
role: step.toolCalls?.length > 0 ? "tool" : "assistant",
|
|
6963
6969
|
content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
|
|
@@ -6969,7 +6975,8 @@ function createVercelOrchestrator() {
|
|
|
6969
6975
|
scenario,
|
|
6970
6976
|
llmDurationMs,
|
|
6971
6977
|
mcpDurationMs,
|
|
6972
|
-
conversationHistory
|
|
6978
|
+
conversationHistory,
|
|
6979
|
+
usage: hostUsage
|
|
6973
6980
|
};
|
|
6974
6981
|
} catch (err) {
|
|
6975
6982
|
return {
|
|
@@ -6987,6 +6994,7 @@ function parseStreamJson(stdout) {
|
|
|
6987
6994
|
const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
|
|
6988
6995
|
const toolCalls = [];
|
|
6989
6996
|
const textParts = [];
|
|
6997
|
+
let usage;
|
|
6990
6998
|
const conversationHistory = [];
|
|
6991
6999
|
for (const line of lines) {
|
|
6992
7000
|
let event;
|
|
@@ -7019,16 +7027,28 @@ function parseStreamJson(stdout) {
|
|
|
7019
7027
|
}
|
|
7020
7028
|
}
|
|
7021
7029
|
}
|
|
7022
|
-
if (event.type === "result"
|
|
7023
|
-
if (textParts.length === 0) {
|
|
7030
|
+
if (event.type === "result") {
|
|
7031
|
+
if (typeof event.result === "string" && textParts.length === 0) {
|
|
7024
7032
|
textParts.push(event.result);
|
|
7025
7033
|
}
|
|
7034
|
+
if (event.usage) {
|
|
7035
|
+
usage = {
|
|
7036
|
+
inputTokens: event.usage.input_tokens ?? 0,
|
|
7037
|
+
outputTokens: event.usage.output_tokens ?? 0,
|
|
7038
|
+
totalCostUsd: event.total_cost_usd ?? 0,
|
|
7039
|
+
durationMs: event.duration_ms ?? 0,
|
|
7040
|
+
durationApiMs: event.duration_api_ms,
|
|
7041
|
+
cacheReadInputTokens: event.usage.cache_read_input_tokens,
|
|
7042
|
+
cacheCreationInputTokens: event.usage.cache_creation_input_tokens
|
|
7043
|
+
};
|
|
7044
|
+
}
|
|
7026
7045
|
}
|
|
7027
7046
|
if (event.type === "result" && event.is_error === true) {
|
|
7028
7047
|
return {
|
|
7029
7048
|
success: false,
|
|
7030
7049
|
toolCalls,
|
|
7031
|
-
error: typeof event.result === "string" ? event.result : "CLI host reported an error"
|
|
7050
|
+
error: typeof event.result === "string" ? event.result : "CLI host reported an error",
|
|
7051
|
+
usage
|
|
7032
7052
|
};
|
|
7033
7053
|
}
|
|
7034
7054
|
}
|
|
@@ -7040,7 +7060,8 @@ function parseStreamJson(stdout) {
|
|
|
7040
7060
|
success: true,
|
|
7041
7061
|
toolCalls,
|
|
7042
7062
|
response: response || void 0,
|
|
7043
|
-
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
|
|
7063
|
+
conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
|
|
7064
|
+
usage
|
|
7044
7065
|
};
|
|
7045
7066
|
}
|
|
7046
7067
|
function createJsonParser(paths) {
|
|
@@ -7305,6 +7326,32 @@ async function execFileNoThrow(file, args) {
|
|
|
7305
7326
|
}
|
|
7306
7327
|
}
|
|
7307
7328
|
|
|
7329
|
+
// src/utils/usageUtils.ts
|
|
7330
|
+
function optionalSum(a, b) {
|
|
7331
|
+
if (a === void 0 && b === void 0) return void 0;
|
|
7332
|
+
return (a ?? 0) + (b ?? 0);
|
|
7333
|
+
}
|
|
7334
|
+
function sumUsage(a, b) {
|
|
7335
|
+
if (!a && !b) return void 0;
|
|
7336
|
+
if (!a) return b ? { ...b } : void 0;
|
|
7337
|
+
if (!b) return { ...a };
|
|
7338
|
+
return {
|
|
7339
|
+
inputTokens: a.inputTokens + b.inputTokens,
|
|
7340
|
+
outputTokens: a.outputTokens + b.outputTokens,
|
|
7341
|
+
totalCostUsd: a.totalCostUsd + b.totalCostUsd,
|
|
7342
|
+
durationMs: a.durationMs + b.durationMs,
|
|
7343
|
+
durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
|
|
7344
|
+
cacheReadInputTokens: optionalSum(
|
|
7345
|
+
a.cacheReadInputTokens,
|
|
7346
|
+
b.cacheReadInputTokens
|
|
7347
|
+
),
|
|
7348
|
+
cacheCreationInputTokens: optionalSum(
|
|
7349
|
+
a.cacheCreationInputTokens,
|
|
7350
|
+
b.cacheCreationInputTokens
|
|
7351
|
+
)
|
|
7352
|
+
};
|
|
7353
|
+
}
|
|
7354
|
+
|
|
7308
7355
|
// src/evals/evalRunner.ts
|
|
7309
7356
|
async function executeToolCall(evalCase, mcp) {
|
|
7310
7357
|
const mode = evalCase.mode || "direct";
|
|
@@ -7550,6 +7597,7 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7550
7597
|
};
|
|
7551
7598
|
}
|
|
7552
7599
|
}
|
|
7600
|
+
const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
|
|
7553
7601
|
return {
|
|
7554
7602
|
id: evalCase.id,
|
|
7555
7603
|
datasetName: options.datasetName ?? "single-case",
|
|
@@ -7566,7 +7614,8 @@ async function runSingleIteration(evalCase, context, options) {
|
|
|
7566
7614
|
tags: evalCase.tags,
|
|
7567
7615
|
toolPrecision,
|
|
7568
7616
|
toolRecall,
|
|
7569
|
-
mcpHostTrace
|
|
7617
|
+
mcpHostTrace,
|
|
7618
|
+
hostUsage
|
|
7570
7619
|
};
|
|
7571
7620
|
}
|
|
7572
7621
|
function isInfrastructureError(err) {
|
|
@@ -7602,7 +7651,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7602
7651
|
durationMs: result.durationMs,
|
|
7603
7652
|
error: result.error,
|
|
7604
7653
|
isInfrastructureError: infraError,
|
|
7605
|
-
mcpHostTrace: result.mcpHostTrace
|
|
7654
|
+
mcpHostTrace: result.mcpHostTrace,
|
|
7655
|
+
hostUsage: result.hostUsage
|
|
7606
7656
|
});
|
|
7607
7657
|
} catch (err) {
|
|
7608
7658
|
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
@@ -7635,6 +7685,10 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7635
7685
|
durationMs: 0,
|
|
7636
7686
|
tags: evalCase.tags
|
|
7637
7687
|
};
|
|
7688
|
+
const totalHostUsage = iterationResults.reduce(
|
|
7689
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7690
|
+
void 0
|
|
7691
|
+
);
|
|
7638
7692
|
return {
|
|
7639
7693
|
...baseResult,
|
|
7640
7694
|
pass: assertionPassRate >= threshold,
|
|
@@ -7643,7 +7697,8 @@ async function runEvalCase(evalCase, context, options = {}) {
|
|
|
7643
7697
|
infrastructureErrorRate,
|
|
7644
7698
|
iterationResults,
|
|
7645
7699
|
infrastructureErrorCount: infraErrors.length,
|
|
7646
|
-
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
|
|
7700
|
+
durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
|
|
7701
|
+
hostUsage: totalHostUsage
|
|
7647
7702
|
};
|
|
7648
7703
|
}
|
|
7649
7704
|
function wilsonCI(k, n) {
|
|
@@ -7753,13 +7808,18 @@ async function runEvalDataset(options, context) {
|
|
|
7753
7808
|
...mcpHostModel !== void 0 && { mcpHostModel },
|
|
7754
7809
|
...judgeModel !== void 0 && { judgeModel }
|
|
7755
7810
|
};
|
|
7811
|
+
const runHostUsage = caseResults.reduce(
|
|
7812
|
+
(acc, r) => sumUsage(acc, r.hostUsage),
|
|
7813
|
+
void 0
|
|
7814
|
+
);
|
|
7756
7815
|
const result = {
|
|
7757
7816
|
total,
|
|
7758
7817
|
passed,
|
|
7759
7818
|
failed: total - passed,
|
|
7760
7819
|
caseResults,
|
|
7761
7820
|
durationMs: Date.now() - startTime,
|
|
7762
|
-
metadata
|
|
7821
|
+
metadata,
|
|
7822
|
+
totalHostUsage: runHostUsage
|
|
7763
7823
|
};
|
|
7764
7824
|
if (baselineResultsFrom) {
|
|
7765
7825
|
try {
|