@wix/evalforge-evaluator 0.32.0 → 0.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +98 -70
- package/build/index.js.map +3 -3
- package/build/index.mjs +100 -71
- package/build/index.mjs.map +3 -3
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -6591,6 +6591,20 @@ var import_crypto = require("crypto");
|
|
|
6591
6591
|
var import_promises3 = require("fs/promises");
|
|
6592
6592
|
var import_path5 = require("path");
|
|
6593
6593
|
var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
|
|
6594
|
+
/**
 * Estimate the USD cost of a single LLM step from its token counts.
 *
 * Pricing is looked up in `import_evalforge_types.AVAILABLE_MODELS`:
 *   1. by exact match on the entry's `name` or `providerModelId`;
 *   2. otherwise by model family, so an alias such as
 *      "claude-3-5-sonnet-latest" resolves to a dated entry such as
 *      "claude-3-5-sonnet-20241022".
 * When no entry matches, fallback pricing of $3 per 1M input tokens and
 * $15 per 1M output tokens is applied.
 *
 * @param {number} inputTokens - Prompt tokens consumed by the step.
 * @param {number} outputTokens - Completion tokens produced by the step.
 * @param {string} modelName - Model name, provider model id, or family alias.
 * @returns {number} Estimated cost in USD.
 */
function calculateStepCost(inputTokens, outputTokens, modelName) {
  const FALLBACK_PRICING = { inputPer1M: 3, outputPer1M: 15 };
  // Checked in order; the first family contained in `modelName` wins.
  const MODEL_FAMILIES = ["claude-3-5-sonnet", "claude-4-sonnet", "claude-4-opus"];
  const models = import_evalforge_types.AVAILABLE_MODELS;
  let model;
  // A non-string model name cannot match anything; fall through to the
  // fallback pricing instead of throwing on `.includes`.
  if (typeof modelName === "string") {
    // Exact match must be evaluated on its own. The previous single expression
    // `a || b || c ? x : y` parsed as `(a || b || c) ? x : y`, so an exact
    // match was discarded unless the entry was also a claude-3-5-sonnet model.
    model = models.find(
      (m) => m.name === modelName || m.providerModelId === modelName
    );
    if (!model) {
      const family = MODEL_FAMILIES.find((f) => modelName.includes(f));
      if (family) {
        model = models.find((m) => m.providerModelId?.includes(family));
      }
    }
  }
  const pricing = model ? model.pricing : FALLBACK_PRICING;
  const inputCost = inputTokens / 1e6 * pricing.inputPer1M;
  const outputCost = outputTokens / 1e6 * pricing.outputPer1M;
  return inputCost + outputCost;
}
|
|
6594
6608
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
6595
6609
|
console.log(`${import_evalforge_types.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
6596
6610
|
if (tracePushUrl) {
|
|
@@ -7022,15 +7036,22 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
7022
7036
|
const sdkPromise = (async () => {
|
|
7023
7037
|
const evaluatorPromptSuffix = `
|
|
7024
7038
|
|
|
7025
|
-
IMPORTANT: This is an automated evaluation run.
|
|
7039
|
+
IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
7040
|
+
1. Execute the requested changes immediately without asking for confirmation.
|
|
7041
|
+
2. Do NOT ask "would you like me to proceed?" or similar questions.
|
|
7042
|
+
3. Do NOT use the Task tool to delegate simple operations - do them directly yourself.
|
|
7043
|
+
4. Keep your approach simple and direct - avoid excessive planning.
|
|
7044
|
+
5. Make targeted edits using Read and Edit tools rather than exploring the entire codebase.
|
|
7045
|
+
6. If you encounter an error, fix it directly rather than starting over.`;
|
|
7026
7046
|
const fullPrompt = scenario.triggerPrompt + evaluatorPromptSuffix;
|
|
7027
7047
|
for await (const message of query({
|
|
7028
7048
|
prompt: fullPrompt,
|
|
7029
7049
|
options: queryOptions
|
|
7030
7050
|
})) {
|
|
7031
7051
|
messageCount++;
|
|
7052
|
+
const receivedAt = /* @__PURE__ */ new Date();
|
|
7032
7053
|
console.log("[SDK Message]", JSON.stringify(message, null, 2));
|
|
7033
|
-
allMessages.push(message);
|
|
7054
|
+
allMessages.push({ message, receivedAt });
|
|
7034
7055
|
if (messageCount <= 3) {
|
|
7035
7056
|
console.error(
|
|
7036
7057
|
"[DEBUG-H5] SDK message received",
|
|
@@ -7297,7 +7318,11 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7297
7318
|
}
|
|
7298
7319
|
const endTime = /* @__PURE__ */ new Date();
|
|
7299
7320
|
const totalDurationMs = endTime.getTime() - startTime.getTime();
|
|
7300
|
-
const { steps, result: sdkResult } = processMessages(
|
|
7321
|
+
const { steps, result: sdkResult } = processMessages(
|
|
7322
|
+
allMessages,
|
|
7323
|
+
startTime,
|
|
7324
|
+
endTime
|
|
7325
|
+
);
|
|
7301
7326
|
const outputText = extractFinalOutput(allMessages);
|
|
7302
7327
|
const usage = extractTotalUsage(sdkResult);
|
|
7303
7328
|
const llmTrace = buildLLMTraceFromSteps(
|
|
@@ -7348,25 +7373,36 @@ function isAssistantMessage(message) {
|
|
|
7348
7373
|
// Returns true when the given SDK message's `type` field is exactly "result".
function isResultMessage(message) {
|
|
7349
7374
|
return message.type === "result";
|
|
7350
7375
|
}
|
|
7351
|
-
function processMessages(
|
|
7376
|
+
function processMessages(timestampedMessages, startTime, endTime) {
|
|
7352
7377
|
const steps = [];
|
|
7353
7378
|
let result;
|
|
7354
|
-
|
|
7355
|
-
const
|
|
7356
|
-
for (const message of messages) {
|
|
7379
|
+
const assistantMessageGroups = /* @__PURE__ */ new Map();
|
|
7380
|
+
for (const { message, receivedAt } of timestampedMessages) {
|
|
7357
7381
|
if (isAssistantMessage(message)) {
|
|
7358
7382
|
const uuid3 = message.uuid;
|
|
7359
|
-
if (!
|
|
7360
|
-
|
|
7383
|
+
if (!assistantMessageGroups.has(uuid3)) {
|
|
7384
|
+
assistantMessageGroups.set(uuid3, {
|
|
7385
|
+
messages: [],
|
|
7386
|
+
firstReceivedAt: receivedAt,
|
|
7387
|
+
lastReceivedAt: receivedAt
|
|
7388
|
+
});
|
|
7361
7389
|
}
|
|
7362
|
-
|
|
7390
|
+
const group = assistantMessageGroups.get(uuid3);
|
|
7391
|
+
group.messages.push(message);
|
|
7392
|
+
group.lastReceivedAt = receivedAt;
|
|
7363
7393
|
} else if (isResultMessage(message)) {
|
|
7364
7394
|
result = message;
|
|
7365
7395
|
}
|
|
7366
7396
|
}
|
|
7367
|
-
|
|
7368
|
-
|
|
7369
|
-
|
|
7397
|
+
const sortedGroups = Array.from(assistantMessageGroups.values()).sort(
|
|
7398
|
+
(a, b) => a.firstReceivedAt.getTime() - b.firstReceivedAt.getTime()
|
|
7399
|
+
);
|
|
7400
|
+
for (let i = 0; i < sortedGroups.length; i++) {
|
|
7401
|
+
const group = sortedGroups[i];
|
|
7402
|
+
const lastMessage = group.messages[group.messages.length - 1];
|
|
7403
|
+
const stepStartTime = group.firstReceivedAt;
|
|
7404
|
+
const nextStepStartTime = i < sortedGroups.length - 1 ? sortedGroups[i + 1].firstReceivedAt : endTime;
|
|
7405
|
+
const durationMs = nextStepStartTime.getTime() - stepStartTime.getTime();
|
|
7370
7406
|
const usage = lastMessage.message.usage;
|
|
7371
7407
|
const inputTokens = usage.input_tokens;
|
|
7372
7408
|
const outputTokens = usage.output_tokens;
|
|
@@ -7392,17 +7428,9 @@ function processMessages(messages, startTime) {
|
|
|
7392
7428
|
finishReason: mapStopReason(lastMessage.message.stop_reason),
|
|
7393
7429
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
7394
7430
|
startedAt: stepStartTime,
|
|
7395
|
-
durationMs: 0
|
|
7396
|
-
//
|
|
7431
|
+
durationMs: Math.max(0, durationMs)
|
|
7432
|
+
// Ensure non-negative
|
|
7397
7433
|
});
|
|
7398
|
-
stepIndex++;
|
|
7399
|
-
}
|
|
7400
|
-
if (result && steps.length > 0) {
|
|
7401
|
-
const totalDuration = result.duration_ms;
|
|
7402
|
-
const durationPerStep = Math.floor(totalDuration / steps.length);
|
|
7403
|
-
for (const step of steps) {
|
|
7404
|
-
step.durationMs = durationPerStep;
|
|
7405
|
-
}
|
|
7406
7434
|
}
|
|
7407
7435
|
return { steps, result };
|
|
7408
7436
|
}
|
|
@@ -7419,9 +7447,9 @@ function mapStopReason(stopReason) {
|
|
|
7419
7447
|
return "unknown";
|
|
7420
7448
|
}
|
|
7421
7449
|
}
|
|
7422
|
-
function extractFinalOutput(
|
|
7423
|
-
for (let i =
|
|
7424
|
-
const message =
|
|
7450
|
+
function extractFinalOutput(timestampedMessages) {
|
|
7451
|
+
for (let i = timestampedMessages.length - 1; i >= 0; i--) {
|
|
7452
|
+
const { message } = timestampedMessages[i];
|
|
7425
7453
|
if (isAssistantMessage(message)) {
|
|
7426
7454
|
for (const block of message.message.content) {
|
|
7427
7455
|
if (block.type === "text" && block.text) {
|
|
@@ -7445,42 +7473,56 @@ function extractTotalUsage(result) {
|
|
|
7445
7473
|
};
|
|
7446
7474
|
}
|
|
7447
7475
|
function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
7448
|
-
const traceSteps = steps.map((step, index) =>
|
|
7449
|
-
|
|
7450
|
-
|
|
7451
|
-
|
|
7452
|
-
|
|
7453
|
-
|
|
7454
|
-
|
|
7455
|
-
|
|
7456
|
-
|
|
7457
|
-
|
|
7458
|
-
|
|
7459
|
-
|
|
7460
|
-
|
|
7461
|
-
|
|
7462
|
-
|
|
7463
|
-
|
|
7464
|
-
|
|
7465
|
-
|
|
7466
|
-
|
|
7467
|
-
|
|
7468
|
-
|
|
7476
|
+
const traceSteps = steps.map((step, index) => {
|
|
7477
|
+
const stepCost = calculateStepCost(
|
|
7478
|
+
step.usage.inputTokens,
|
|
7479
|
+
step.usage.outputTokens,
|
|
7480
|
+
model
|
|
7481
|
+
);
|
|
7482
|
+
return {
|
|
7483
|
+
id: (0, import_crypto.randomUUID)(),
|
|
7484
|
+
stepNumber: index + 1,
|
|
7485
|
+
type: step.toolCalls?.length ? import_evalforge_types.LLMStepType.TOOL_USE : import_evalforge_types.LLMStepType.COMPLETION,
|
|
7486
|
+
model,
|
|
7487
|
+
provider: "anthropic",
|
|
7488
|
+
startedAt: step.startedAt.toISOString(),
|
|
7489
|
+
durationMs: step.durationMs,
|
|
7490
|
+
tokenUsage: {
|
|
7491
|
+
prompt: step.usage.inputTokens,
|
|
7492
|
+
completion: step.usage.outputTokens,
|
|
7493
|
+
total: step.usage.totalTokens
|
|
7494
|
+
},
|
|
7495
|
+
costUsd: stepCost,
|
|
7496
|
+
toolName: step.toolCalls?.[0]?.toolName,
|
|
7497
|
+
toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
|
|
7498
|
+
outputPreview: step.text?.slice(0, 200),
|
|
7499
|
+
success: step.finishReason !== "error",
|
|
7500
|
+
error: step.finishReason === "error" ? "Generation failed" : void 0
|
|
7501
|
+
};
|
|
7502
|
+
});
|
|
7503
|
+
const stepsTokens = {
|
|
7504
|
+
prompt: traceSteps.reduce((sum, s) => sum + s.tokenUsage.prompt, 0),
|
|
7505
|
+
completion: traceSteps.reduce((sum, s) => sum + s.tokenUsage.completion, 0),
|
|
7506
|
+
total: traceSteps.reduce((sum, s) => sum + s.tokenUsage.total, 0)
|
|
7507
|
+
};
|
|
7508
|
+
const stepsTotalCost = traceSteps.reduce((sum, s) => sum + s.costUsd, 0);
|
|
7509
|
+
const finalTokens = {
|
|
7510
|
+
prompt: usage.inputTokens > 0 ? usage.inputTokens : stepsTokens.prompt,
|
|
7511
|
+
completion: usage.outputTokens > 0 ? usage.outputTokens : stepsTokens.completion,
|
|
7512
|
+
total: usage.totalTokens > 0 ? usage.totalTokens : stepsTokens.total
|
|
7513
|
+
};
|
|
7514
|
+
const finalCost = usage.costUsd !== void 0 && usage.costUsd > 0 ? usage.costUsd : stepsTotalCost;
|
|
7469
7515
|
const summary = {
|
|
7470
7516
|
totalSteps: traceSteps.length,
|
|
7471
7517
|
totalDurationMs,
|
|
7472
|
-
totalTokens:
|
|
7473
|
-
|
|
7474
|
-
completion: usage.outputTokens,
|
|
7475
|
-
total: usage.totalTokens
|
|
7476
|
-
},
|
|
7477
|
-
totalCostUsd: usage.costUsd || 0,
|
|
7518
|
+
totalTokens: finalTokens,
|
|
7519
|
+
totalCostUsd: finalCost,
|
|
7478
7520
|
modelBreakdown: {
|
|
7479
7521
|
[model]: {
|
|
7480
7522
|
count: traceSteps.length,
|
|
7481
7523
|
durationMs: totalDurationMs,
|
|
7482
|
-
tokens:
|
|
7483
|
-
costUsd:
|
|
7524
|
+
tokens: finalTokens.total,
|
|
7525
|
+
costUsd: finalCost
|
|
7484
7526
|
}
|
|
7485
7527
|
},
|
|
7486
7528
|
modelsUsed: [model]
|
|
@@ -7495,21 +7537,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
7495
7537
|
// src/run-scenario/file-diff.ts
|
|
7496
7538
|
var import_fs6 = require("fs");
|
|
7497
7539
|
var import_path6 = require("path");
|
|
7498
|
-
var IGNORED_PATTERNS = [
|
|
7499
|
-
"node_modules",
|
|
7500
|
-
".git",
|
|
7501
|
-
".claude",
|
|
7502
|
-
".cursor",
|
|
7503
|
-
"dist",
|
|
7504
|
-
"build",
|
|
7505
|
-
".next",
|
|
7506
|
-
".turbo",
|
|
7507
|
-
"__pycache__",
|
|
7508
|
-
".pytest_cache",
|
|
7509
|
-
".venv",
|
|
7510
|
-
"venv",
|
|
7511
|
-
".DS_Store"
|
|
7512
|
-
];
|
|
7540
|
+
var IGNORED_PATTERNS = [];
|
|
7513
7541
|
var BINARY_EXTENSIONS = [
|
|
7514
7542
|
".png",
|
|
7515
7543
|
".jpg",
|