@gleanwork/mcp-server-tester 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -4411,7 +4411,7 @@ function escapeHtml(text) {
4411
4411
 
4412
4412
  // package.json
4413
4413
  var package_default = {
4414
- version: "1.0.0"};
4414
+ version: "1.0.1"};
4415
4415
 
4416
4416
  // src/mcp/clientFactory.ts
4417
4417
  function getRetryAfterDelayMs(err) {
@@ -6958,6 +6958,12 @@ function createVercelOrchestrator() {
6958
6958
  });
6959
6959
  const totalDurationMs = Date.now() - llmStart;
6960
6960
  const llmDurationMs = totalDurationMs - mcpDurationMs;
6961
+ const hostUsage = result.usage ? {
6962
+ inputTokens: result.usage.promptTokens ?? 0,
6963
+ outputTokens: result.usage.completionTokens ?? 0,
6964
+ totalCostUsd: 0,
6965
+ durationMs: llmDurationMs
6966
+ } : void 0;
6961
6967
  const conversationHistory = (result.steps ?? []).map((step) => ({
6962
6968
  role: step.toolCalls?.length > 0 ? "tool" : "assistant",
6963
6969
  content: step.toolCalls?.length > 0 ? JSON.stringify(step.toolResults) : step.text ?? ""
@@ -6969,7 +6975,8 @@ function createVercelOrchestrator() {
6969
6975
  scenario,
6970
6976
  llmDurationMs,
6971
6977
  mcpDurationMs,
6972
- conversationHistory
6978
+ conversationHistory,
6979
+ usage: hostUsage
6973
6980
  };
6974
6981
  } catch (err) {
6975
6982
  return {
@@ -7305,6 +7312,32 @@ async function execFileNoThrow(file, args) {
7305
7312
  }
7306
7313
  }
7307
7314
 
7315
+ // src/utils/usageUtils.ts
7316
/**
 * Adds two optional numeric counters.
 *
 * A counter is "absent" when it is `null` or `undefined`. The absence check
 * uses `== null` so it agrees with the `??` fallbacks used for the addends;
 * a strict `=== undefined` check would turn two `null` inputs into `0`.
 *
 * @param {number | null | undefined} a - First counter, if present.
 * @param {number | null | undefined} b - Second counter, if present.
 * @returns {number | undefined} The sum with a missing side counted as 0, or
 *   `undefined` when both sides are absent.
 */
function optionalSum(a, b) {
  if (a == null && b == null) return undefined;
  return (a ?? 0) + (b ?? 0);
}

/**
 * Combines two usage records into a new record of per-field totals.
 *
 * Written so it can be used directly as a reducer seeded with `undefined`:
 * a missing side yields a shallow copy of the other side, and two missing
 * sides yield `undefined`. Neither input is mutated.
 *
 * @param {object | null | undefined} a - Accumulated usage so far.
 * @param {object | null | undefined} b - Usage to add.
 * @returns {object | undefined} A new record with `inputTokens`,
 *   `outputTokens`, `totalCostUsd` and `durationMs` summed, plus
 *   `durationApiMs`, `cacheReadInputTokens` and `cacheCreationInputTokens`
 *   summed when at least one side reports them (otherwise `undefined`).
 */
function sumUsage(a, b) {
  if (!a && !b) return undefined;
  // Past the guard above, whichever side is missing implies the other exists.
  if (!a) return { ...b };
  if (!b) return { ...a };
  return {
    // `?? 0` keeps one record with a missing counter from turning the whole
    // running total into NaN.
    inputTokens: (a.inputTokens ?? 0) + (b.inputTokens ?? 0),
    outputTokens: (a.outputTokens ?? 0) + (b.outputTokens ?? 0),
    totalCostUsd: (a.totalCostUsd ?? 0) + (b.totalCostUsd ?? 0),
    durationMs: (a.durationMs ?? 0) + (b.durationMs ?? 0),
    durationApiMs: optionalSum(a.durationApiMs, b.durationApiMs),
    cacheReadInputTokens: optionalSum(
      a.cacheReadInputTokens,
      b.cacheReadInputTokens
    ),
    cacheCreationInputTokens: optionalSum(
      a.cacheCreationInputTokens,
      b.cacheCreationInputTokens
    )
  };
}
7340
+
7308
7341
  // src/evals/evalRunner.ts
7309
7342
  async function executeToolCall(evalCase, mcp) {
7310
7343
  const mode = evalCase.mode || "direct";
@@ -7550,6 +7583,7 @@ async function runSingleIteration(evalCase, context, options) {
7550
7583
  };
7551
7584
  }
7552
7585
  }
7586
+ const hostUsage = isMCPHostSimulationResult(response) && response.usage ? response.usage : void 0;
7553
7587
  return {
7554
7588
  id: evalCase.id,
7555
7589
  datasetName: options.datasetName ?? "single-case",
@@ -7566,7 +7600,8 @@ async function runSingleIteration(evalCase, context, options) {
7566
7600
  tags: evalCase.tags,
7567
7601
  toolPrecision,
7568
7602
  toolRecall,
7569
- mcpHostTrace
7603
+ mcpHostTrace,
7604
+ hostUsage
7570
7605
  };
7571
7606
  }
7572
7607
  function isInfrastructureError(err) {
@@ -7602,7 +7637,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7602
7637
  durationMs: result.durationMs,
7603
7638
  error: result.error,
7604
7639
  isInfrastructureError: infraError,
7605
- mcpHostTrace: result.mcpHostTrace
7640
+ mcpHostTrace: result.mcpHostTrace,
7641
+ hostUsage: result.hostUsage
7606
7642
  });
7607
7643
  } catch (err) {
7608
7644
  const errorMessage = err instanceof Error ? err.message : String(err);
@@ -7635,6 +7671,10 @@ async function runEvalCase(evalCase, context, options = {}) {
7635
7671
  durationMs: 0,
7636
7672
  tags: evalCase.tags
7637
7673
  };
7674
+ const totalHostUsage = iterationResults.reduce(
7675
+ (acc, r) => sumUsage(acc, r.hostUsage),
7676
+ void 0
7677
+ );
7638
7678
  return {
7639
7679
  ...baseResult,
7640
7680
  pass: assertionPassRate >= threshold,
@@ -7643,7 +7683,8 @@ async function runEvalCase(evalCase, context, options = {}) {
7643
7683
  infrastructureErrorRate,
7644
7684
  iterationResults,
7645
7685
  infrastructureErrorCount: infraErrors.length,
7646
- durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0)
7686
+ durationMs: iterationResults.reduce((sum, r) => sum + r.durationMs, 0),
7687
+ hostUsage: totalHostUsage
7647
7688
  };
7648
7689
  }
7649
7690
  function wilsonCI(k, n) {
@@ -7753,13 +7794,18 @@ async function runEvalDataset(options, context) {
7753
7794
  ...mcpHostModel !== void 0 && { mcpHostModel },
7754
7795
  ...judgeModel !== void 0 && { judgeModel }
7755
7796
  };
7797
+ const runHostUsage = caseResults.reduce(
7798
+ (acc, r) => sumUsage(acc, r.hostUsage),
7799
+ void 0
7800
+ );
7756
7801
  const result = {
7757
7802
  total,
7758
7803
  passed,
7759
7804
  failed: total - passed,
7760
7805
  caseResults,
7761
7806
  durationMs: Date.now() - startTime,
7762
- metadata
7807
+ metadata,
7808
+ totalHostUsage: runHostUsage
7763
7809
  };
7764
7810
  if (baselineResultsFrom) {
7765
7811
  try {