@wix/evalforge-evaluator 0.85.0 → 0.86.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -379,7 +379,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
379
379
  }
380
380
 
381
381
  // src/run-scenario/index.ts
382
- import { AssertionResultStatus } from "@wix/evalforge-types";
382
+ import {
383
+ AssertionResultStatus,
384
+ AVAILABLE_MODEL_IDS
385
+ } from "@wix/evalforge-types";
383
386
  import {
384
387
  evaluateAssertions as evaluateAssertionsBase
385
388
  } from "@wix/eval-assertions";
@@ -618,15 +621,12 @@ function getAdapter(runCommand) {
618
621
  return adapter;
619
622
  }
620
623
 
621
- // src/run-scenario/agents/claude-code/claude-code-adapter.ts
622
- import { AVAILABLE_MODELS_MAP } from "@wix/evalforge-types";
623
-
624
624
  // src/run-scenario/agents/claude-code/execute.ts
625
625
  import {
626
+ ClaudeModel,
626
627
  LLMStepType,
627
628
  LiveTraceEventType,
628
- TRACE_EVENT_PREFIX,
629
- AVAILABLE_MODELS
629
+ TRACE_EVENT_PREFIX
630
630
  } from "@wix/evalforge-types";
631
631
  import { randomUUID } from "crypto";
632
632
 
@@ -682,21 +682,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
682
682
  }
683
683
 
684
684
  // src/run-scenario/agents/claude-code/execute.ts
685
- var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
686
- function calculateStepCost(inputTokens, outputTokens, modelName) {
687
- const model = AVAILABLE_MODELS.find(
688
- (m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
689
- modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
690
- );
691
- if (!model) {
692
- const inputCost2 = inputTokens / 1e6 * 3;
693
- const outputCost2 = outputTokens / 1e6 * 15;
694
- return inputCost2 + outputCost2;
695
- }
696
- const inputCost = inputTokens / 1e6 * model.pricing.inputPer1M;
697
- const outputCost = outputTokens / 1e6 * model.pricing.outputPer1M;
698
- return inputCost + outputCost;
699
- }
685
+ var DEFAULT_MODEL = ClaudeModel.CLAUDE_4_5_SONNET_1_0;
700
686
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
701
687
  console.log(`${TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
702
688
  if (tracePushUrl) {
@@ -909,6 +895,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
909
895
  }
910
896
  const startTime = /* @__PURE__ */ new Date();
911
897
  const allMessages = [];
898
+ const { mkdir: mkdirAsync, writeFile: writeFile4 } = await import("fs/promises");
899
+ const claudeDir = `${options.cwd}/.claude`;
900
+ await mkdirAsync(claudeDir, { recursive: true });
901
+ await writeFile4(`${claudeDir}/settings.json`, "{}", {
902
+ flag: "wx"
903
+ }).catch(() => {
904
+ });
912
905
  if (options.mcps && options.mcps.length > 0) {
913
906
  await writeMcpToFilesystem(options.cwd, options.mcps);
914
907
  }
@@ -1435,7 +1428,14 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
1435
1428
  startTime,
1436
1429
  endTime
1437
1430
  );
1438
- const outputText = extractFinalOutput(allMessages);
1431
+ const rawOutput = extractFinalOutput(allMessages);
1432
+ const isError = sdkResult?.subtype !== "success";
1433
+ let outputText = rawOutput;
1434
+ if (!rawOutput && isError) {
1435
+ const hasErrors = sdkResult && "errors" in sdkResult && sdkResult.errors?.length;
1436
+ const errorDetails = hasErrors ? sdkResult.errors.join("; ") : sdkResult?.subtype ?? "unknown";
1437
+ outputText = `[ERROR] Agent execution failed: ${errorDetails}`;
1438
+ }
1439
1439
  const usage = extractTotalUsage(sdkResult);
1440
1440
  const llmTrace = buildLLMTraceFromSteps(
1441
1441
  steps,
@@ -1577,12 +1577,13 @@ function extractTotalUsage(result) {
1577
1577
  };
1578
1578
  }
1579
1579
  function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1580
+ const totalCost = usage.costUsd ?? 0;
1581
+ const totalStepTokens = steps.reduce(
1582
+ (sum, s) => sum + s.usage.totalTokens,
1583
+ 0
1584
+ );
1580
1585
  const traceSteps = steps.map((step, index) => {
1581
- const stepCost = calculateStepCost(
1582
- step.usage.inputTokens,
1583
- step.usage.outputTokens,
1584
- model
1585
- );
1586
+ const proportion = totalStepTokens > 0 ? step.usage.totalTokens / totalStepTokens : 0;
1586
1587
  return {
1587
1588
  id: randomUUID(),
1588
1589
  stepNumber: index + 1,
@@ -1596,7 +1597,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1596
1597
  completion: step.usage.outputTokens,
1597
1598
  total: step.usage.totalTokens
1598
1599
  },
1599
- costUsd: stepCost,
1600
+ costUsd: totalCost * proportion,
1600
1601
  toolName: step.toolCalls?.[0]?.toolName,
1601
1602
  toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
1602
1603
  outputPreview: step.text?.slice(0, 200),
@@ -1609,13 +1610,12 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1609
1610
  completion: traceSteps.reduce((sum, s) => sum + s.tokenUsage.completion, 0),
1610
1611
  total: traceSteps.reduce((sum, s) => sum + s.tokenUsage.total, 0)
1611
1612
  };
1612
- const stepsTotalCost = traceSteps.reduce((sum, s) => sum + s.costUsd, 0);
1613
1613
  const finalTokens = {
1614
1614
  prompt: usage.inputTokens > 0 ? usage.inputTokens : stepsTokens.prompt,
1615
1615
  completion: usage.outputTokens > 0 ? usage.outputTokens : stepsTokens.completion,
1616
1616
  total: usage.totalTokens > 0 ? usage.totalTokens : stepsTokens.total
1617
1617
  };
1618
- const finalCost = usage.costUsd !== void 0 && usage.costUsd > 0 ? usage.costUsd : stepsTotalCost;
1618
+ const finalCost = totalCost;
1619
1619
  const summary = {
1620
1620
  totalSteps: traceSteps.length,
1621
1621
  totalDurationMs,
@@ -1661,7 +1661,7 @@ var ClaudeCodeAdapter = class {
1661
1661
  mcps,
1662
1662
  subAgents
1663
1663
  } = context;
1664
- const modelForSdk = modelConfig?.model ? AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
1664
+ const modelForSdk = modelConfig?.model;
1665
1665
  const options = {
1666
1666
  cwd,
1667
1667
  model: modelForSdk,
@@ -2510,8 +2510,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
2510
2510
  }))
2511
2511
  };
2512
2512
  const { "x-wix-ai-gateway-stream": _stream, ...judgeHeaders } = config.aiGatewayHeaders;
2513
+ const defaultJudgeModel = AVAILABLE_MODEL_IDS[0];
2513
2514
  const assertionContext = {
2514
2515
  workDir,
2516
+ defaultJudgeModel,
2515
2517
  llmConfig: {
2516
2518
  baseUrl: config.aiGatewayUrl,
2517
2519
  headers: judgeHeaders