@wix/evalforge-evaluator 0.107.0 → 0.109.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +128 -50
- package/build/index.js.map +2 -2
- package/build/index.mjs +129 -50
- package/build/index.mjs.map +2 -2
- package/build/types/run-scenario/types.d.ts +2 -0
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -1769,6 +1769,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
|
|
|
1769
1769
|
let toolArgs;
|
|
1770
1770
|
let outputPreview;
|
|
1771
1771
|
let filePath;
|
|
1772
|
+
let thinking;
|
|
1772
1773
|
for (const block of message.message.content) {
|
|
1773
1774
|
if (block.type === "tool_use") {
|
|
1774
1775
|
type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
|
|
@@ -1785,6 +1786,15 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
|
|
|
1785
1786
|
}
|
|
1786
1787
|
} else if (block.type === "text") {
|
|
1787
1788
|
outputPreview = block.text.slice(0, 500);
|
|
1789
|
+
if (!toolName) {
|
|
1790
|
+
type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
|
|
1791
|
+
}
|
|
1792
|
+
} else if (block.type === "thinking") {
|
|
1793
|
+
const thinkingBlock = block;
|
|
1794
|
+
thinking = thinkingBlock.thinking.slice(0, 500);
|
|
1795
|
+
if (!outputPreview && !toolName) {
|
|
1796
|
+
type = import_evalforge_types4.LiveTraceEventType.THINKING;
|
|
1797
|
+
}
|
|
1788
1798
|
}
|
|
1789
1799
|
}
|
|
1790
1800
|
return {
|
|
@@ -1799,6 +1809,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
|
|
|
1799
1809
|
toolArgs,
|
|
1800
1810
|
outputPreview,
|
|
1801
1811
|
filePath,
|
|
1812
|
+
thinking,
|
|
1802
1813
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1803
1814
|
isComplete
|
|
1804
1815
|
};
|
|
@@ -2424,28 +2435,53 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
2424
2435
|
const sortedGroups = Array.from(assistantMessageGroups.values()).sort(
|
|
2425
2436
|
(a, b) => a.firstReceivedAt.getTime() - b.firstReceivedAt.getTime()
|
|
2426
2437
|
);
|
|
2427
|
-
|
|
2428
|
-
|
|
2429
|
-
const
|
|
2430
|
-
const
|
|
2431
|
-
const
|
|
2438
|
+
const mergedTurns = [];
|
|
2439
|
+
for (const group of sortedGroups) {
|
|
2440
|
+
const lastMsg = group.messages[group.messages.length - 1];
|
|
2441
|
+
const inputTokens = lastMsg.message.usage.input_tokens;
|
|
2442
|
+
const prev = mergedTurns[mergedTurns.length - 1];
|
|
2443
|
+
const prevLastMsg = prev?.groups[prev.groups.length - 1].messages.at(-1);
|
|
2444
|
+
const prevInputTokens = prevLastMsg?.message.usage.input_tokens;
|
|
2445
|
+
if (prev && prevInputTokens === inputTokens) {
|
|
2446
|
+
prev.groups.push(group);
|
|
2447
|
+
prev.lastReceivedAt = group.lastReceivedAt;
|
|
2448
|
+
} else {
|
|
2449
|
+
mergedTurns.push({
|
|
2450
|
+
groups: [group],
|
|
2451
|
+
firstReceivedAt: group.firstReceivedAt,
|
|
2452
|
+
lastReceivedAt: group.lastReceivedAt
|
|
2453
|
+
});
|
|
2454
|
+
}
|
|
2455
|
+
}
|
|
2456
|
+
for (let i = 0; i < mergedTurns.length; i++) {
|
|
2457
|
+
const turn = mergedTurns[i];
|
|
2458
|
+
const stepStartTime = turn.firstReceivedAt;
|
|
2459
|
+
const nextStepStartTime = i < mergedTurns.length - 1 ? mergedTurns[i + 1].firstReceivedAt : endTime;
|
|
2432
2460
|
const durationMs = nextStepStartTime.getTime() - stepStartTime.getTime();
|
|
2433
|
-
const usage = lastMessage.message.usage;
|
|
2434
|
-
const inputTokens = usage.input_tokens;
|
|
2435
|
-
const outputTokens = usage.output_tokens;
|
|
2436
2461
|
let text2 = "";
|
|
2437
2462
|
let thinking = "";
|
|
2438
2463
|
const toolCalls = [];
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2464
|
+
let lastStopReason = null;
|
|
2465
|
+
let inputTokens = 0;
|
|
2466
|
+
let outputTokens = 0;
|
|
2467
|
+
let cacheReadTokens;
|
|
2468
|
+
let cacheWriteTokens;
|
|
2469
|
+
for (const group of turn.groups) {
|
|
2470
|
+
const lastMessage = group.messages[group.messages.length - 1];
|
|
2471
|
+
lastStopReason = lastMessage.message.stop_reason;
|
|
2472
|
+
const msgUsage = lastMessage.message.usage;
|
|
2473
|
+
inputTokens = msgUsage.input_tokens;
|
|
2474
|
+
outputTokens = msgUsage.output_tokens;
|
|
2475
|
+
cacheReadTokens = msgUsage.cache_read_input_tokens ?? void 0;
|
|
2476
|
+
cacheWriteTokens = msgUsage.cache_creation_input_tokens ?? void 0;
|
|
2477
|
+
for (const block of lastMessage.message.content) {
|
|
2478
|
+
if (block.type === "text") {
|
|
2479
|
+
text2 += block.text;
|
|
2480
|
+
} else if (block.type === "thinking") {
|
|
2481
|
+
thinking += block.thinking;
|
|
2482
|
+
} else if (block.type === "tool_use") {
|
|
2483
|
+
toolCalls.push({ toolName: block.name, args: block.input });
|
|
2484
|
+
}
|
|
2449
2485
|
}
|
|
2450
2486
|
}
|
|
2451
2487
|
steps.push({
|
|
@@ -2454,13 +2490,14 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
2454
2490
|
usage: {
|
|
2455
2491
|
inputTokens,
|
|
2456
2492
|
outputTokens,
|
|
2457
|
-
totalTokens: inputTokens + outputTokens
|
|
2493
|
+
totalTokens: inputTokens + outputTokens,
|
|
2494
|
+
cacheReadTokens: cacheReadTokens || void 0,
|
|
2495
|
+
cacheWriteTokens: cacheWriteTokens || void 0
|
|
2458
2496
|
},
|
|
2459
|
-
finishReason: mapStopReason(
|
|
2497
|
+
finishReason: mapStopReason(lastStopReason),
|
|
2460
2498
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
2461
2499
|
startedAt: stepStartTime,
|
|
2462
2500
|
durationMs: Math.max(0, durationMs)
|
|
2463
|
-
// Ensure non-negative
|
|
2464
2501
|
});
|
|
2465
2502
|
}
|
|
2466
2503
|
return { steps, result };
|
|
@@ -2500,35 +2537,35 @@ function extractTotalUsage(result) {
|
|
|
2500
2537
|
inputTokens: usage.input_tokens,
|
|
2501
2538
|
outputTokens: usage.output_tokens,
|
|
2502
2539
|
totalTokens: usage.input_tokens + usage.output_tokens,
|
|
2503
|
-
costUsd: result.total_cost_usd
|
|
2540
|
+
costUsd: result.total_cost_usd,
|
|
2541
|
+
cacheReadTokens: usage.cache_read_input_tokens || void 0,
|
|
2542
|
+
cacheWriteTokens: usage.cache_creation_input_tokens || void 0,
|
|
2543
|
+
durationApiMs: result.duration_api_ms || void 0
|
|
2504
2544
|
};
|
|
2505
2545
|
}
|
|
2506
2546
|
function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
2507
2547
|
const totalCost = usage.costUsd ?? 0;
|
|
2508
|
-
const
|
|
2509
|
-
(sum, s) => sum + s.usage.
|
|
2548
|
+
const totalStepInputTokens = steps.reduce(
|
|
2549
|
+
(sum, s) => sum + s.usage.inputTokens,
|
|
2510
2550
|
0
|
|
2511
2551
|
);
|
|
2512
2552
|
const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
|
|
2513
|
-
const
|
|
2553
|
+
const inputTokensDuplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
|
|
2514
2554
|
const traceSteps = steps.map((step, index) => {
|
|
2515
|
-
let
|
|
2555
|
+
let stepPromptTokens;
|
|
2556
|
+
let stepOutputTokens;
|
|
2516
2557
|
let proportion;
|
|
2517
|
-
if (
|
|
2518
|
-
proportion = step.usage.totalTokens / totalStepTokens;
|
|
2519
|
-
tokenUsage = {
|
|
2520
|
-
prompt: step.usage.inputTokens,
|
|
2521
|
-
completion: step.usage.outputTokens,
|
|
2522
|
-
total: step.usage.totalTokens
|
|
2523
|
-
};
|
|
2524
|
-
} else {
|
|
2558
|
+
if (inputTokensDuplicated) {
|
|
2525
2559
|
proportion = totalStepDuration > 0 ? step.durationMs / totalStepDuration : 0;
|
|
2526
|
-
|
|
2527
|
-
|
|
2528
|
-
|
|
2529
|
-
|
|
2530
|
-
|
|
2560
|
+
stepPromptTokens = Math.round(usage.inputTokens * proportion);
|
|
2561
|
+
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
2562
|
+
} else {
|
|
2563
|
+
proportion = totalStepInputTokens > 0 ? step.usage.inputTokens / totalStepInputTokens : 0;
|
|
2564
|
+
stepPromptTokens = step.usage.inputTokens;
|
|
2565
|
+
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
2531
2566
|
}
|
|
2567
|
+
const stepTotalTokens = stepPromptTokens + stepOutputTokens;
|
|
2568
|
+
const costProportion = proportion;
|
|
2532
2569
|
const stepType = step.toolCalls?.length ? import_evalforge_types4.LLMStepType.TOOL_USE : step.thinking && !step.text ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
2533
2570
|
return {
|
|
2534
2571
|
id: (0, import_crypto.randomUUID)(),
|
|
@@ -2538,11 +2575,15 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2538
2575
|
provider: "anthropic",
|
|
2539
2576
|
startedAt: step.startedAt.toISOString(),
|
|
2540
2577
|
durationMs: step.durationMs,
|
|
2541
|
-
tokenUsage
|
|
2542
|
-
|
|
2578
|
+
tokenUsage: {
|
|
2579
|
+
prompt: stepPromptTokens,
|
|
2580
|
+
completion: stepOutputTokens,
|
|
2581
|
+
total: stepTotalTokens
|
|
2582
|
+
},
|
|
2583
|
+
costUsd: totalCost * costProportion,
|
|
2543
2584
|
toolName: step.toolCalls?.[0]?.toolName,
|
|
2544
2585
|
toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
|
|
2545
|
-
outputPreview: step.text?.slice(0, 200),
|
|
2586
|
+
outputPreview: (step.text || step.thinking)?.slice(0, 200),
|
|
2546
2587
|
success: step.finishReason !== "error",
|
|
2547
2588
|
error: step.finishReason === "error" ? "Generation failed" : void 0
|
|
2548
2589
|
};
|
|
@@ -2552,21 +2593,35 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2552
2593
|
completion: usage.outputTokens,
|
|
2553
2594
|
total: usage.totalTokens
|
|
2554
2595
|
};
|
|
2555
|
-
const
|
|
2596
|
+
const stepTypeBreakdown = {};
|
|
2597
|
+
for (const ts of traceSteps) {
|
|
2598
|
+
const entry = stepTypeBreakdown[ts.type] ?? {
|
|
2599
|
+
count: 0,
|
|
2600
|
+
durationMs: 0,
|
|
2601
|
+
tokens: 0,
|
|
2602
|
+
costUsd: 0
|
|
2603
|
+
};
|
|
2604
|
+
entry.count += 1;
|
|
2605
|
+
entry.durationMs += ts.durationMs;
|
|
2606
|
+
entry.tokens += ts.tokenUsage.total;
|
|
2607
|
+
entry.costUsd += ts.costUsd;
|
|
2608
|
+
stepTypeBreakdown[ts.type] = entry;
|
|
2609
|
+
}
|
|
2556
2610
|
const summary = {
|
|
2557
2611
|
totalSteps: traceSteps.length,
|
|
2558
2612
|
totalDurationMs,
|
|
2559
2613
|
totalTokens: finalTokens,
|
|
2560
|
-
totalCostUsd:
|
|
2614
|
+
totalCostUsd: totalCost,
|
|
2561
2615
|
modelBreakdown: {
|
|
2562
2616
|
[model]: {
|
|
2563
2617
|
count: traceSteps.length,
|
|
2564
2618
|
durationMs: totalDurationMs,
|
|
2565
2619
|
tokens: finalTokens.total,
|
|
2566
|
-
costUsd:
|
|
2620
|
+
costUsd: totalCost
|
|
2567
2621
|
}
|
|
2568
2622
|
},
|
|
2569
|
-
modelsUsed: [model]
|
|
2623
|
+
modelsUsed: [model],
|
|
2624
|
+
stepTypeBreakdown
|
|
2570
2625
|
};
|
|
2571
2626
|
return {
|
|
2572
2627
|
id: (0, import_crypto.randomUUID)(),
|
|
@@ -2606,6 +2661,7 @@ var ClaudeCodeAdapter = class {
|
|
|
2606
2661
|
model: modelForSdk,
|
|
2607
2662
|
temperature: modelConfig?.temperature,
|
|
2608
2663
|
maxTokens: modelConfig?.maxTokens,
|
|
2664
|
+
maxTurns: modelConfig?.maxTurns,
|
|
2609
2665
|
aiGatewayUrl,
|
|
2610
2666
|
aiGatewayHeaders,
|
|
2611
2667
|
traceContext,
|
|
@@ -42638,7 +42694,6 @@ var MODEL_PRICING = {
|
|
|
42638
42694
|
CLAUDE_4_OPUS_1_0: { input: 15, output: 75 },
|
|
42639
42695
|
CLAUDE_4_SONNET_1_0: { input: 3, output: 15 },
|
|
42640
42696
|
// Anthropic — Claude 3.x
|
|
42641
|
-
CLAUDE_3_7_SONNET_1_0: { input: 3, output: 15 },
|
|
42642
42697
|
CLAUDE_3_5_SONNET_2_0: { input: 3, output: 15 },
|
|
42643
42698
|
CLAUDE_3_5_SONNET_1_0: { input: 3, output: 15 },
|
|
42644
42699
|
CLAUDE_3_HAIKU_1_0: { input: 0.25, output: 1.25 },
|
|
@@ -42756,7 +42811,7 @@ function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
|
|
|
42756
42811
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
42757
42812
|
var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
42758
42813
|
var PROVIDER_OPENAI = "openai";
|
|
42759
|
-
var
|
|
42814
|
+
var DEFAULT_MAX_TOOL_STEPS = 25;
|
|
42760
42815
|
function createModel(modelId, baseUrl, headers) {
|
|
42761
42816
|
const isClaudeModel = import_evalforge_types6.AVAILABLE_CLAUDE_MODEL_IDS.includes(
|
|
42762
42817
|
modelId
|
|
@@ -42774,6 +42829,9 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
42774
42829
|
apiKey: "proxy-auth",
|
|
42775
42830
|
headers
|
|
42776
42831
|
});
|
|
42832
|
+
if (import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelId)) {
|
|
42833
|
+
return openai2.responses(modelId);
|
|
42834
|
+
}
|
|
42777
42835
|
return openai2.chat(modelId);
|
|
42778
42836
|
}
|
|
42779
42837
|
function isClaudeModelId(modelId) {
|
|
@@ -42810,14 +42868,34 @@ async function executeWithAiSdk(context2) {
|
|
|
42810
42868
|
emitStartEvent(traceContext, startTime);
|
|
42811
42869
|
}
|
|
42812
42870
|
try {
|
|
42871
|
+
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
42872
|
+
const isResponsesAPI = import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelConfig.model);
|
|
42873
|
+
const supportsThinking = isAnthropic || isResponsesAPI;
|
|
42874
|
+
const providerOpts = {
|
|
42875
|
+
...isAnthropic && {
|
|
42876
|
+
anthropic: {
|
|
42877
|
+
thinking: { type: "enabled", budgetTokens: 1e4 }
|
|
42878
|
+
}
|
|
42879
|
+
},
|
|
42880
|
+
...isResponsesAPI && {
|
|
42881
|
+
openai: {
|
|
42882
|
+
// Prevent the SDK from sending item_reference inputs that the proxy can't forward
|
|
42883
|
+
store: false,
|
|
42884
|
+
forceReasoning: true,
|
|
42885
|
+
reasoningEffort: "high",
|
|
42886
|
+
reasoningSummary: "detailed"
|
|
42887
|
+
}
|
|
42888
|
+
}
|
|
42889
|
+
};
|
|
42813
42890
|
const result = await generateText({
|
|
42814
42891
|
model,
|
|
42815
42892
|
system: systemPrompt,
|
|
42816
42893
|
prompt: scenario.triggerPrompt,
|
|
42817
|
-
temperature: modelConfig.temperature,
|
|
42894
|
+
temperature: supportsThinking ? void 0 : modelConfig.temperature,
|
|
42818
42895
|
maxOutputTokens: modelConfig.maxTokens,
|
|
42819
42896
|
tools: mcpTools,
|
|
42820
|
-
stopWhen: mcpTools ? stepCountIs(
|
|
42897
|
+
stopWhen: mcpTools ? stepCountIs(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : stepCountIs(1),
|
|
42898
|
+
providerOptions: providerOpts
|
|
42821
42899
|
});
|
|
42822
42900
|
const durationMs = Date.now() - startTime;
|
|
42823
42901
|
const usage = {
|