@wix/evalforge-evaluator 0.85.0 → 0.86.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types7 = require("@wix/evalforge-types");
27
+ var import_evalforge_types6 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -399,7 +399,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
399
399
  }
400
400
 
401
401
  // src/run-scenario/index.ts
402
- var import_evalforge_types5 = require("@wix/evalforge-types");
402
+ var import_evalforge_types4 = require("@wix/evalforge-types");
403
403
  var import_eval_assertions = require("@wix/eval-assertions");
404
404
 
405
405
  // src/run-scenario/environment.ts
@@ -636,9 +636,6 @@ function getAdapter(runCommand) {
636
636
  return adapter;
637
637
  }
638
638
 
639
- // src/run-scenario/agents/claude-code/claude-code-adapter.ts
640
- var import_evalforge_types4 = require("@wix/evalforge-types");
641
-
642
639
  // src/run-scenario/agents/claude-code/execute.ts
643
640
  var import_evalforge_types3 = require("@wix/evalforge-types");
644
641
  var import_crypto = require("crypto");
@@ -695,21 +692,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
695
692
  }
696
693
 
697
694
  // src/run-scenario/agents/claude-code/execute.ts
698
- var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
699
- function calculateStepCost(inputTokens, outputTokens, modelName) {
700
- const model = import_evalforge_types3.AVAILABLE_MODELS.find(
701
- (m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
702
- modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
703
- );
704
- if (!model) {
705
- const inputCost2 = inputTokens / 1e6 * 3;
706
- const outputCost2 = outputTokens / 1e6 * 15;
707
- return inputCost2 + outputCost2;
708
- }
709
- const inputCost = inputTokens / 1e6 * model.pricing.inputPer1M;
710
- const outputCost = outputTokens / 1e6 * model.pricing.outputPer1M;
711
- return inputCost + outputCost;
712
- }
695
+ var DEFAULT_MODEL = import_evalforge_types3.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
713
696
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
714
697
  console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
715
698
  if (tracePushUrl) {
@@ -922,6 +905,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
922
905
  }
923
906
  const startTime = /* @__PURE__ */ new Date();
924
907
  const allMessages = [];
908
+ const { mkdir: mkdirAsync, writeFile: writeFile4 } = await import("fs/promises");
909
+ const claudeDir = `${options.cwd}/.claude`;
910
+ await mkdirAsync(claudeDir, { recursive: true });
911
+ await writeFile4(`${claudeDir}/settings.json`, "{}", {
912
+ flag: "wx"
913
+ }).catch(() => {
914
+ });
925
915
  if (options.mcps && options.mcps.length > 0) {
926
916
  await writeMcpToFilesystem(options.cwd, options.mcps);
927
917
  }
@@ -1448,7 +1438,14 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
1448
1438
  startTime,
1449
1439
  endTime
1450
1440
  );
1451
- const outputText = extractFinalOutput(allMessages);
1441
+ const rawOutput = extractFinalOutput(allMessages);
1442
+ const isError = sdkResult?.subtype !== "success";
1443
+ let outputText = rawOutput;
1444
+ if (!rawOutput && isError) {
1445
+ const hasErrors = sdkResult && "errors" in sdkResult && sdkResult.errors?.length;
1446
+ const errorDetails = hasErrors ? sdkResult.errors.join("; ") : sdkResult?.subtype ?? "unknown";
1447
+ outputText = `[ERROR] Agent execution failed: ${errorDetails}`;
1448
+ }
1452
1449
  const usage = extractTotalUsage(sdkResult);
1453
1450
  const llmTrace = buildLLMTraceFromSteps(
1454
1451
  steps,
@@ -1590,12 +1587,13 @@ function extractTotalUsage(result) {
1590
1587
  };
1591
1588
  }
1592
1589
  function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1590
+ const totalCost = usage.costUsd ?? 0;
1591
+ const totalStepTokens = steps.reduce(
1592
+ (sum, s) => sum + s.usage.totalTokens,
1593
+ 0
1594
+ );
1593
1595
  const traceSteps = steps.map((step, index) => {
1594
- const stepCost = calculateStepCost(
1595
- step.usage.inputTokens,
1596
- step.usage.outputTokens,
1597
- model
1598
- );
1596
+ const proportion = totalStepTokens > 0 ? step.usage.totalTokens / totalStepTokens : 0;
1599
1597
  return {
1600
1598
  id: (0, import_crypto.randomUUID)(),
1601
1599
  stepNumber: index + 1,
@@ -1609,7 +1607,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1609
1607
  completion: step.usage.outputTokens,
1610
1608
  total: step.usage.totalTokens
1611
1609
  },
1612
- costUsd: stepCost,
1610
+ costUsd: totalCost * proportion,
1613
1611
  toolName: step.toolCalls?.[0]?.toolName,
1614
1612
  toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
1615
1613
  outputPreview: step.text?.slice(0, 200),
@@ -1622,13 +1620,12 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1622
1620
  completion: traceSteps.reduce((sum, s) => sum + s.tokenUsage.completion, 0),
1623
1621
  total: traceSteps.reduce((sum, s) => sum + s.tokenUsage.total, 0)
1624
1622
  };
1625
- const stepsTotalCost = traceSteps.reduce((sum, s) => sum + s.costUsd, 0);
1626
1623
  const finalTokens = {
1627
1624
  prompt: usage.inputTokens > 0 ? usage.inputTokens : stepsTokens.prompt,
1628
1625
  completion: usage.outputTokens > 0 ? usage.outputTokens : stepsTokens.completion,
1629
1626
  total: usage.totalTokens > 0 ? usage.totalTokens : stepsTokens.total
1630
1627
  };
1631
- const finalCost = usage.costUsd !== void 0 && usage.costUsd > 0 ? usage.costUsd : stepsTotalCost;
1628
+ const finalCost = totalCost;
1632
1629
  const summary = {
1633
1630
  totalSteps: traceSteps.length,
1634
1631
  totalDurationMs,
@@ -1674,7 +1671,7 @@ var ClaudeCodeAdapter = class {
1674
1671
  mcps,
1675
1672
  subAgents
1676
1673
  } = context;
1677
- const modelForSdk = modelConfig?.model ? import_evalforge_types4.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
1674
+ const modelForSdk = modelConfig?.model;
1678
1675
  const options = {
1679
1676
  cwd,
1680
1677
  model: modelForSdk,
@@ -2523,8 +2520,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2523
2520
  }))
2524
2521
  };
2525
2522
  const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
2523
+ const defaultJudgeModel = import_evalforge_types4.AVAILABLE_MODEL_IDS[0];
2526
2524
  const assertionContext = {
2527
2525
  workDir,
2526
+ defaultJudgeModel,
2528
2527
  llmConfig: {
2529
2528
  baseUrl: config.aiGatewayUrl,
2530
2529
  headers: judgeHeaders
@@ -2536,10 +2535,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2536
2535
  assertionContext
2537
2536
  ) : [];
2538
2537
  const passed = assertionResults.filter(
2539
- (r) => r.status === import_evalforge_types5.AssertionResultStatus.PASSED
2538
+ (r) => r.status === import_evalforge_types4.AssertionResultStatus.PASSED
2540
2539
  ).length;
2541
2540
  const failed = assertionResults.filter(
2542
- (r) => r.status === import_evalforge_types5.AssertionResultStatus.FAILED
2541
+ (r) => r.status === import_evalforge_types4.AssertionResultStatus.FAILED
2543
2542
  ).length;
2544
2543
  const total = assertionResults.length;
2545
2544
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -2553,7 +2552,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2553
2552
  }
2554
2553
 
2555
2554
  // src/error-reporter.ts
2556
- var import_evalforge_types6 = require("@wix/evalforge-types");
2555
+ var import_evalforge_types5 = require("@wix/evalforge-types");
2557
2556
  function formatError(error, phase, context) {
2558
2557
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
2559
2558
  if (error instanceof Error) {
@@ -2802,7 +2801,7 @@ async function runEvaluation(projectId2, evalRunId2) {
2802
2801
  };
2803
2802
  try {
2804
2803
  await api.updateEvalRun(projectId2, evalRunId2, {
2805
- status: import_evalforge_types7.EvalStatus.COMPLETED,
2804
+ status: import_evalforge_types6.EvalStatus.COMPLETED,
2806
2805
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
2807
2806
  });
2808
2807
  } catch (updateErr) {
@@ -2843,7 +2842,7 @@ runEvaluation(projectId, evalRunId).then(() => {
2843
2842
  authToken: config.authToken
2844
2843
  });
2845
2844
  await api.updateEvalRun(projectId, evalRunId, {
2846
- status: import_evalforge_types7.EvalStatus.FAILED,
2845
+ status: import_evalforge_types6.EvalStatus.FAILED,
2847
2846
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
2848
2847
  jobError,
2849
2848
  jobStatus: "FAILED"
@@ -2866,7 +2865,7 @@ runEvaluation(projectId, evalRunId).then(() => {
2866
2865
  authToken
2867
2866
  });
2868
2867
  await api.updateEvalRun(projectId, evalRunId, {
2869
- status: import_evalforge_types7.EvalStatus.FAILED,
2868
+ status: import_evalforge_types6.EvalStatus.FAILED,
2870
2869
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
2871
2870
  jobError: `Config load failed, then: ${jobError}`,
2872
2871
  jobStatus: "FAILED"