@gleanwork/mcp-server-tester 1.0.0 → 1.0.1-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4411,7 +4411,7 @@ function escapeHtml(text) {
4411
4411
 
4412
4412
  // package.json
4413
4413
  var package_default = {
4414
- version: "1.0.0"};
4414
+ version: "1.0.1-beta.0"};
4415
4415
 
4416
4416
  // src/mcp/clientFactory.ts
4417
4417
  function getRetryAfterDelayMs(err) {
@@ -6958,6 +6958,12 @@ function createVercelOrchestrator() {
6958
6958
  });
6959
6959
  const totalDurationMs = Date.now() - llmStart;
6960
6960
  const llmDurationMs = totalDurationMs - mcpDurationMs;
6961
+ const hostUsage = result.usage ? {
6962
+ inputTokens: result.usage.promptTokens ?? 0,
6963
+ outputTokens: result.usage.completionTokens ?? 0,
6964
+ totalCostUsd: 0,
6965
+ durationMs: llmDurationMs
6966
+ } : void 0;
6961
6967
  const conversationHistory = (result.steps ?? []).map((step) => ({
6962
6968
  role: step.toolCalls?.length > 0 ? "tool" : "assistant",
6963
6969
  content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
@@ -6969,7 +6975,8 @@ function createVercelOrchestrator() {
6969
6975
  scenario,
6970
6976
  llmDurationMs,
6971
6977
  mcpDurationMs,
6972
- conversationHistory
6978
+ conversationHistory,
6979
+ usage: hostUsage
6973
6980
  };
6974
6981
  } catch (err) {
6975
6982
  return {
@@ -6987,6 +6994,7 @@ function parseStreamJson(stdout) {
6987
6994
  const lines = stdout.split("\n").filter((line) => line.trim().length > 0);
6988
6995
  const toolCalls = [];
6989
6996
  const textParts = [];
6997
+ let usage;
6990
6998
  const conversationHistory = [];
6991
6999
  for (const line of lines) {
6992
7000
  let event;
@@ -7019,16 +7027,28 @@ function parseStreamJson(stdout) {
7019
7027
  }
7020
7028
  }
7021
7029
  }
7022
- if (event.type === "result" && typeof event.result === "string") {
7023
- if (textParts.length === 0) {
7030
+ if (event.type === "result") {
7031
+ if (typeof event.result === "string" && textParts.length === 0) {
7024
7032
  textParts.push(event.result);
7025
7033
  }
7034
+ if (event.usage) {
7035
+ usage = {
7036
+ inputTokens: event.usage.input_tokens ?? 0,
7037
+ outputTokens: event.usage.output_tokens ?? 0,
7038
+ totalCostUsd: event.total_cost_usd ?? 0,
7039
+ durationMs: event.duration_ms ?? 0,
7040
+ durationApiMs: event.duration_api_ms,
7041
+ cacheReadInputTokens: event.usage.cache_read_input_tokens,
7042
+ cacheCreationInputTokens: event.usage.cache_creation_input_tokens
7043
+ };
7044
+ }
7026
7045
  }
7027
7046
  if (event.type === "result" && event.is_error === true) {
7028
7047
  return {
7029
7048
  success: false,
7030
7049
  toolCalls,
7031
- error: typeof event.result === "string" ? event.result : "CLI host reported an error"
7050
+ error: typeof event.result === "string" ? event.result : "CLI host reported an error",
7051
+ usage
7032
7052
  };
7033
7053
  }
7034
7054
  }
@@ -7040,7 +7060,8 @@ function parseStreamJson(stdout) {
7040
7060
  success: true,
7041
7061
  toolCalls,
7042
7062
  response: response || void 0,
7043
- conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0
7063
+ conversationHistory: conversationHistory.length > 0 ? conversationHistory : void 0,
7064
+ usage
7044
7065
  };
7045
7066
  }
7046
7067
  function createJsonParser(paths) {
@@ -7305,6 +7326,32 @@ async function execFileNoThrow(file, args) {
7305
7326
  }
7306
7327
  }
7307
7328
 
7329
+ // src/utils/usageUtils.ts
7330
+ function optionalSum(a, b) {
7331
+ if (a === void 0 && b === void 0) return void 0;
7332
+ return (a ?? 0) + (b ?? 0);
7333
+ }
7334
+ function sumUsage(a, b) {
7335
+ if (!a && !b) return void 0;
7336
+ if (!a) return b ? { ...b } : void 0;
7337
+ if (!b) return { ...a };
7338
+ return {
7339
+ inputTokens: a.inputTokens + b.inputTokens,
7340
+ outputTokens: a.outputTokens + b.outputTokens,
7341
+ totalCostUsd: a.totalCostUsd + b.totalCostUsd,
7342
+ durationMs: a.durationMs + b.durationMs,
7343
+ durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
7344
+ cacheReadInputTokens: optionalSum(
7345
+ a.cacheReadInputTokens,
7346
+ b.cacheReadInputTokens
7347
+ ),
7348
+ cacheCreationInputTokens: optionalSum(
7349
+ a.cacheCreationInputTokens,
7350
+ b.cacheCreationInputTokens
7351
+ )
7352
+ };
7353
+ }
7354
+
7308
7355
  // src/evals/evalRunner.ts
7309
7356
  async function executeToolCall(evalCase, mcp) {
7310
7357
  const mode = evalCase.mode || "direct";
@@ -7550,6 +7597,7 @@ async function runSingleIteration(evalCase, context, options) {
7550
7597
  };
7551
7598
  }
7552
7599
  }
7600
+ const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
7553
7601
  return {
7554
7602
  id: evalCase.id,
7555
7603
  datasetName: options.datasetName ?? "single-case",
@@ -7566,7 +7614,8 @@ async function runSingleIteration(evalCase, context, options) {
7566
7614
  tags: evalCase.tags,
7567
7615
  toolPrecision,
7568
7616
  toolRecall,
7569
- mcpHostTrace
7617
+ mcpHostTrace,
7618
+ hostUsage
7570
7619
  };
7571
7620
  }
7572
7621
  function isInfrastructureError(err) {
@@ -7602,7 +7651,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7602
7651
  durationMs: result.durationMs,
7603
7652
  error: result.error,
7604
7653
  isInfrastructureError: infraError,
7605
- mcpHostTrace: result.mcpHostTrace
7654
+ mcpHostTrace: result.mcpHostTrace,
7655
+ hostUsage: result.hostUsage
7606
7656
  });
7607
7657
  } catch (err) {
7608
7658
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7635,6 +7685,10 @@ async function runEvalCase(evalCase, context, options = {}) {
7635
7685
  durationMs: 0,
7636
7686
  tags: evalCase.tags
7637
7687
  };
7688
+ const totalHostUsage = iterationResults.reduce(
7689
+ (acc, r) => sumUsage(acc, r.hostUsage),
7690
+ void 0
7691
+ );
7638
7692
  return {
7639
7693
  ...baseResult,
7640
7694
  pass: assertionPassRate >= threshold,
@@ -7643,7 +7697,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7643
7697
  infrastructureErrorRate,
7644
7698
  iterationResults,
7645
7699
  infrastructureErrorCount: infraErrors.length,
7646
- durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7700
+ durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
7701
+ hostUsage: totalHostUsage
7647
7702
  };
7648
7703
  }
7649
7704
  function wilsonCI(k, n) {
@@ -7753,13 +7808,18 @@ async function runEvalDataset(options, context) {
7753
7808
  ...mcpHostModel !== void 0 && { mcpHostModel },
7754
7809
  ...judgeModel !== void 0 && { judgeModel }
7755
7810
  };
7811
+ const runHostUsage = caseResults.reduce(
7812
+ (acc, r) => sumUsage(acc, r.hostUsage),
7813
+ void 0
7814
+ );
7756
7815
  const result = {
7757
7816
  total,
7758
7817
  passed,
7759
7818
  failed: total - passed,
7760
7819
  caseResults,
7761
7820
  durationMs: Date.now() - startTime,
7762
- metadata
7821
+ metadata,
7822
+ totalHostUsage: runHostUsage
7763
7823
  };
7764
7824
  if (baselineResultsFrom) {
7765
7825
  try {