@wix/evalforge-evaluator 0.32.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -6591,6 +6591,20 @@ var import_crypto = require("crypto");
  var import_promises3 = require("fs/promises");
  var import_path5 = require("path");
  var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
+ function calculateStepCost(inputTokens, outputTokens, modelName) {
+ const model = import_evalforge_types.AVAILABLE_MODELS.find(
+ (m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
+ modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
+ );
+ if (!model) {
+ const inputCost2 = inputTokens / 1e6 * 3;
+ const outputCost2 = outputTokens / 1e6 * 15;
+ return inputCost2 + outputCost2;
+ }
+ const inputCost = inputTokens / 1e6 * model.pricing.inputPer1M;
+ const outputCost = outputTokens / 1e6 * model.pricing.outputPer1M;
+ return inputCost + outputCost;
+ }
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
  console.log(`${import_evalforge_types.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
  if (tracePushUrl) {
@@ -7022,15 +7036,22 @@ async function executeWithClaudeCode(skill, scenario, options) {
  const sdkPromise = (async () => {
  const evaluatorPromptSuffix = `

- IMPORTANT: This is an automated evaluation run. Execute the requested changes immediately without asking for confirmation. Do not ask "would you like me to proceed?" or similar questions - just implement the solution directly.`;
+ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
+ 1. Execute the requested changes immediately without asking for confirmation.
+ 2. Do NOT ask "would you like me to proceed?" or similar questions.
+ 3. Do NOT use the Task tool to delegate simple operations - do them directly yourself.
+ 4. Keep your approach simple and direct - avoid excessive planning.
+ 5. Make targeted edits using Read and Edit tools rather than exploring the entire codebase.
+ 6. If you encounter an error, fix it directly rather than starting over.`;
  const fullPrompt = scenario.triggerPrompt + evaluatorPromptSuffix;
  for await (const message of query({
  prompt: fullPrompt,
  options: queryOptions
  })) {
  messageCount++;
+ const receivedAt = /* @__PURE__ */ new Date();
  console.log("[SDK Message]", JSON.stringify(message, null, 2));
- allMessages.push(message);
+ allMessages.push({ message, receivedAt });
  if (messageCount <= 3) {
  console.error(
  "[DEBUG-H5] SDK message received",
@@ -7297,7 +7318,11 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
  }
  const endTime = /* @__PURE__ */ new Date();
  const totalDurationMs = endTime.getTime() - startTime.getTime();
- const { steps, result: sdkResult } = processMessages(allMessages, startTime);
+ const { steps, result: sdkResult } = processMessages(
+ allMessages,
+ startTime,
+ endTime
+ );
  const outputText = extractFinalOutput(allMessages);
  const usage = extractTotalUsage(sdkResult);
  const llmTrace = buildLLMTraceFromSteps(
@@ -7348,25 +7373,36 @@ function isAssistantMessage(message) {
  function isResultMessage(message) {
  return message.type === "result";
  }
- function processMessages(messages, startTime) {
+ function processMessages(timestampedMessages, startTime, endTime) {
  const steps = [];
  let result;
- let stepIndex = 0;
- const assistantMessages = /* @__PURE__ */ new Map();
- for (const message of messages) {
+ const assistantMessageGroups = /* @__PURE__ */ new Map();
+ for (const { message, receivedAt } of timestampedMessages) {
  if (isAssistantMessage(message)) {
  const uuid3 = message.uuid;
- if (!assistantMessages.has(uuid3)) {
- assistantMessages.set(uuid3, []);
+ if (!assistantMessageGroups.has(uuid3)) {
+ assistantMessageGroups.set(uuid3, {
+ messages: [],
+ firstReceivedAt: receivedAt,
+ lastReceivedAt: receivedAt
+ });
  }
- assistantMessages.get(uuid3).push(message);
+ const group = assistantMessageGroups.get(uuid3);
+ group.messages.push(message);
+ group.lastReceivedAt = receivedAt;
  } else if (isResultMessage(message)) {
  result = message;
  }
  }
- for (const groupedMessages of assistantMessages.values()) {
- const lastMessage = groupedMessages[groupedMessages.length - 1];
- const stepStartTime = new Date(startTime.getTime() + stepIndex * 100);
+ const sortedGroups = Array.from(assistantMessageGroups.values()).sort(
+ (a, b) => a.firstReceivedAt.getTime() - b.firstReceivedAt.getTime()
+ );
+ for (let i = 0; i < sortedGroups.length; i++) {
+ const group = sortedGroups[i];
+ const lastMessage = group.messages[group.messages.length - 1];
+ const stepStartTime = group.firstReceivedAt;
+ const nextStepStartTime = i < sortedGroups.length - 1 ? sortedGroups[i + 1].firstReceivedAt : endTime;
+ const durationMs = nextStepStartTime.getTime() - stepStartTime.getTime();
  const usage = lastMessage.message.usage;
  const inputTokens = usage.input_tokens;
  const outputTokens = usage.output_tokens;
@@ -7392,17 +7428,9 @@ function processMessages(messages, startTime) {
  finishReason: mapStopReason(lastMessage.message.stop_reason),
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
  startedAt: stepStartTime,
- durationMs: 0
- // Will be updated from result
+ durationMs: Math.max(0, durationMs)
+ // Ensure non-negative
  });
- stepIndex++;
- }
- if (result && steps.length > 0) {
- const totalDuration = result.duration_ms;
- const durationPerStep = Math.floor(totalDuration / steps.length);
- for (const step of steps) {
- step.durationMs = durationPerStep;
- }
  }
  return { steps, result };
  }
@@ -7419,9 +7447,9 @@ function mapStopReason(stopReason) {
  return "unknown";
  }
  }
- function extractFinalOutput(messages) {
- for (let i = messages.length - 1; i >= 0; i--) {
- const message = messages[i];
+ function extractFinalOutput(timestampedMessages) {
+ for (let i = timestampedMessages.length - 1; i >= 0; i--) {
+ const { message } = timestampedMessages[i];
  if (isAssistantMessage(message)) {
  for (const block of message.message.content) {
  if (block.type === "text" && block.text) {
@@ -7445,42 +7473,56 @@ function extractTotalUsage(result) {
  };
  }
  function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
- const traceSteps = steps.map((step, index) => ({
- id: (0, import_crypto.randomUUID)(),
- stepNumber: index + 1,
- type: step.toolCalls?.length ? import_evalforge_types.LLMStepType.TOOL_USE : import_evalforge_types.LLMStepType.COMPLETION,
- model,
- provider: "anthropic",
- startedAt: step.startedAt.toISOString(),
- durationMs: step.durationMs,
- tokenUsage: {
- prompt: step.usage.inputTokens,
- completion: step.usage.outputTokens,
- total: step.usage.totalTokens
- },
- costUsd: 0,
- // Individual step costs not available
- toolName: step.toolCalls?.[0]?.toolName,
- toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
- outputPreview: step.text?.slice(0, 200),
- success: step.finishReason !== "error",
- error: step.finishReason === "error" ? "Generation failed" : void 0
- }));
+ const traceSteps = steps.map((step, index) => {
+ const stepCost = calculateStepCost(
+ step.usage.inputTokens,
+ step.usage.outputTokens,
+ model
+ );
+ return {
+ id: (0, import_crypto.randomUUID)(),
+ stepNumber: index + 1,
+ type: step.toolCalls?.length ? import_evalforge_types.LLMStepType.TOOL_USE : import_evalforge_types.LLMStepType.COMPLETION,
+ model,
+ provider: "anthropic",
+ startedAt: step.startedAt.toISOString(),
+ durationMs: step.durationMs,
+ tokenUsage: {
+ prompt: step.usage.inputTokens,
+ completion: step.usage.outputTokens,
+ total: step.usage.totalTokens
+ },
+ costUsd: stepCost,
+ toolName: step.toolCalls?.[0]?.toolName,
+ toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
+ outputPreview: step.text?.slice(0, 200),
+ success: step.finishReason !== "error",
+ error: step.finishReason === "error" ? "Generation failed" : void 0
+ };
+ });
+ const stepsTokens = {
+ prompt: traceSteps.reduce((sum, s) => sum + s.tokenUsage.prompt, 0),
+ completion: traceSteps.reduce((sum, s) => sum + s.tokenUsage.completion, 0),
+ total: traceSteps.reduce((sum, s) => sum + s.tokenUsage.total, 0)
+ };
+ const stepsTotalCost = traceSteps.reduce((sum, s) => sum + s.costUsd, 0);
+ const finalTokens = {
+ prompt: usage.inputTokens > 0 ? usage.inputTokens : stepsTokens.prompt,
+ completion: usage.outputTokens > 0 ? usage.outputTokens : stepsTokens.completion,
+ total: usage.totalTokens > 0 ? usage.totalTokens : stepsTokens.total
+ };
+ const finalCost = usage.costUsd !== void 0 && usage.costUsd > 0 ? usage.costUsd : stepsTotalCost;
  const summary = {
  totalSteps: traceSteps.length,
  totalDurationMs,
- totalTokens: {
- prompt: usage.inputTokens,
- completion: usage.outputTokens,
- total: usage.totalTokens
- },
- totalCostUsd: usage.costUsd || 0,
+ totalTokens: finalTokens,
+ totalCostUsd: finalCost,
  modelBreakdown: {
  [model]: {
  count: traceSteps.length,
  durationMs: totalDurationMs,
- tokens: usage.totalTokens,
- costUsd: usage.costUsd || 0
+ tokens: finalTokens.total,
+ costUsd: finalCost
  }
  },
  modelsUsed: [model]
@@ -7495,21 +7537,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
  // src/run-scenario/file-diff.ts
  var import_fs6 = require("fs");
  var import_path6 = require("path");
- var IGNORED_PATTERNS = [
- "node_modules",
- ".git",
- ".claude",
- ".cursor",
- "dist",
- "build",
- ".next",
- ".turbo",
- "__pycache__",
- ".pytest_cache",
- ".venv",
- "venv",
- ".DS_Store"
- ];
+ var IGNORED_PATTERNS = [];
  var BINARY_EXTENSIONS = [
  ".png",
  ".jpg",