@wix/evalforge-evaluator 0.107.0 → 0.109.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +128 -50
- package/build/index.js.map +2 -2
- package/build/index.mjs +129 -50
- package/build/index.mjs.map +2 -2
- package/build/types/run-scenario/types.d.ts +2 -0
- package/package.json +4 -4
package/build/index.mjs
CHANGED
@@ -1789,6 +1789,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
   let toolArgs;
   let outputPreview;
   let filePath;
+  let thinking;
   for (const block of message.message.content) {
     if (block.type === "tool_use") {
       type = LiveTraceEventType.TOOL_USE;
@@ -1805,6 +1806,15 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
       }
     } else if (block.type === "text") {
       outputPreview = block.text.slice(0, 500);
+      if (!toolName) {
+        type = LiveTraceEventType.COMPLETION;
+      }
+    } else if (block.type === "thinking") {
+      const thinkingBlock = block;
+      thinking = thinkingBlock.thinking.slice(0, 500);
+      if (!outputPreview && !toolName) {
+        type = LiveTraceEventType.THINKING;
+      }
     }
   }
   return {
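
The new branches above extend live-trace classification: a text block with no preceding tool call now marks the event as COMPLETION, and a message whose content is only thinking blocks becomes a THINKING event, with tool use still taking precedence. A minimal sketch of that precedence, using simplified stand-ins for the package's block and event types:

// Simplified stand-ins for the bundle's content blocks and
// LiveTraceEventType; tool use wins over text, and thinking only
// applies when no text or tool call was seen first.
type Block =
  | { type: "tool_use"; name: string }
  | { type: "text"; text: string }
  | { type: "thinking"; thinking: string };

function classify(blocks: Block[]): "TOOL_USE" | "COMPLETION" | "THINKING" {
  let type: "TOOL_USE" | "COMPLETION" | "THINKING" = "COMPLETION";
  let sawTool = false;
  let sawText = false;
  for (const block of blocks) {
    if (block.type === "tool_use") {
      sawTool = true;
      type = "TOOL_USE";
    } else if (block.type === "text") {
      sawText = true;
      if (!sawTool) type = "COMPLETION";
    } else if (block.type === "thinking" && !sawText && !sawTool) {
      type = "THINKING";
    }
  }
  return type;
}
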
@@ -1819,6 +1829,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
     toolArgs,
     outputPreview,
     filePath,
+    thinking,
     timestamp: (/* @__PURE__ */ new Date()).toISOString(),
     isComplete
   };
@@ -2444,28 +2455,53 @@ function processMessages(timestampedMessages, startTime, endTime)
   const sortedGroups = Array.from(assistantMessageGroups.values()).sort(
     (a, b) => a.firstReceivedAt.getTime() - b.firstReceivedAt.getTime()
   );
-
-
-  const
-  const
-  const
+  const mergedTurns = [];
+  for (const group of sortedGroups) {
+    const lastMsg = group.messages[group.messages.length - 1];
+    const inputTokens = lastMsg.message.usage.input_tokens;
+    const prev = mergedTurns[mergedTurns.length - 1];
+    const prevLastMsg = prev?.groups[prev.groups.length - 1].messages.at(-1);
+    const prevInputTokens = prevLastMsg?.message.usage.input_tokens;
+    if (prev && prevInputTokens === inputTokens) {
+      prev.groups.push(group);
+      prev.lastReceivedAt = group.lastReceivedAt;
+    } else {
+      mergedTurns.push({
+        groups: [group],
+        firstReceivedAt: group.firstReceivedAt,
+        lastReceivedAt: group.lastReceivedAt
+      });
+    }
+  }
+  for (let i = 0; i < mergedTurns.length; i++) {
+    const turn = mergedTurns[i];
+    const stepStartTime = turn.firstReceivedAt;
+    const nextStepStartTime = i < mergedTurns.length - 1 ? mergedTurns[i + 1].firstReceivedAt : endTime;
     const durationMs = nextStepStartTime.getTime() - stepStartTime.getTime();
-    const usage = lastMessage.message.usage;
-    const inputTokens = usage.input_tokens;
-    const outputTokens = usage.output_tokens;
     let text2 = "";
     let thinking = "";
     const toolCalls = [];
-
-
-
-
-
-
-
-
-
-
+    let lastStopReason = null;
+    let inputTokens = 0;
+    let outputTokens = 0;
+    let cacheReadTokens;
+    let cacheWriteTokens;
+    for (const group of turn.groups) {
+      const lastMessage = group.messages[group.messages.length - 1];
+      lastStopReason = lastMessage.message.stop_reason;
+      const msgUsage = lastMessage.message.usage;
+      inputTokens = msgUsage.input_tokens;
+      outputTokens = msgUsage.output_tokens;
+      cacheReadTokens = msgUsage.cache_read_input_tokens ?? void 0;
+      cacheWriteTokens = msgUsage.cache_creation_input_tokens ?? void 0;
+      for (const block of lastMessage.message.content) {
+        if (block.type === "text") {
+          text2 += block.text;
+        } else if (block.type === "thinking") {
+          thinking += block.thinking;
+        } else if (block.type === "tool_use") {
+          toolCalls.push({ toolName: block.name, args: block.input });
+        }
+      }
     }
     steps.push({
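
The rewritten processMessages pass first merges consecutive assistant message groups whose last messages report the same input_tokens into a single turn, treating an unchanged prompt size as evidence that the groups belong to one API call, and only then computes per-turn timing and usage. A self-contained sketch of that merge under the same assumption, with a pared-down group shape standing in for the bundle's own:

// Pared-down stand-ins for the bundle's assistant message groups.
interface Group {
  inputTokens: number; // usage.input_tokens of the group's last message
  firstReceivedAt: Date;
  lastReceivedAt: Date;
}
interface Turn {
  groups: Group[];
  firstReceivedAt: Date;
  lastReceivedAt: Date;
}

function mergeTurns(sortedGroups: Group[]): Turn[] {
  const turns: Turn[] = [];
  for (const group of sortedGroups) {
    const prev = turns[turns.length - 1];
    const prevTokens = prev?.groups[prev.groups.length - 1]?.inputTokens;
    if (prev && prevTokens === group.inputTokens) {
      // Same prompt size as the previous group: treat it as the same turn.
      prev.groups.push(group);
      prev.lastReceivedAt = group.lastReceivedAt;
    } else {
      turns.push({
        groups: [group],
        firstReceivedAt: group.firstReceivedAt,
        lastReceivedAt: group.lastReceivedAt
      });
    }
  }
  return turns;
}
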
@@ -2474,13 +2510,14 @@ function processMessages(timestampedMessages, startTime, endTime)
       usage: {
         inputTokens,
         outputTokens,
-        totalTokens: inputTokens + outputTokens
+        totalTokens: inputTokens + outputTokens,
+        cacheReadTokens: cacheReadTokens || void 0,
+        cacheWriteTokens: cacheWriteTokens || void 0
       },
-      finishReason: mapStopReason(
+      finishReason: mapStopReason(lastStopReason),
       toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
       startedAt: stepStartTime,
       durationMs: Math.max(0, durationMs)
-      // Ensure non-negative
     });
   }
   return { steps, result };
@@ -2520,35 +2557,35 @@ function extractTotalUsage(result)
     inputTokens: usage.input_tokens,
     outputTokens: usage.output_tokens,
     totalTokens: usage.input_tokens + usage.output_tokens,
-    costUsd: result.total_cost_usd
+    costUsd: result.total_cost_usd,
+    cacheReadTokens: usage.cache_read_input_tokens || void 0,
+    cacheWriteTokens: usage.cache_creation_input_tokens || void 0,
+    durationApiMs: result.duration_api_ms || void 0
   };
 }
 function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
   const totalCost = usage.costUsd ?? 0;
-  const
-    (sum, s) => sum + s.usage.
+  const totalStepInputTokens = steps.reduce(
+    (sum, s) => sum + s.usage.inputTokens,
     0
   );
   const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
-  const
+  const inputTokensDuplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
   const traceSteps = steps.map((step, index) => {
-    let
+    let stepPromptTokens;
+    let stepOutputTokens;
     let proportion;
-    if (
-      proportion = step.usage.totalTokens / totalStepTokens;
-      tokenUsage = {
-        prompt: step.usage.inputTokens,
-        completion: step.usage.outputTokens,
-        total: step.usage.totalTokens
-      };
-    } else {
+    if (inputTokensDuplicated) {
       proportion = totalStepDuration > 0 ? step.durationMs / totalStepDuration : 0;
-
-
-
-
-
+      stepPromptTokens = Math.round(usage.inputTokens * proportion);
+      stepOutputTokens = Math.round(usage.outputTokens * proportion);
+    } else {
+      proportion = totalStepInputTokens > 0 ? step.usage.inputTokens / totalStepInputTokens : 0;
+      stepPromptTokens = step.usage.inputTokens;
+      stepOutputTokens = Math.round(usage.outputTokens * proportion);
     }
+    const stepTotalTokens = stepPromptTokens + stepOutputTokens;
+    const costProportion = proportion;
     const stepType = step.toolCalls?.length ? LLMStepType.TOOL_USE : step.thinking && !step.text ? LLMStepType.THINKING : LLMStepType.COMPLETION;
     return {
       id: randomUUID(),
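
The token accounting in buildLLMTraceFromSteps is now defensive: when the per-step input tokens sum to more than 1.2 times the run-level total (which happens when each step re-reports the whole conversation prompt), prompt and output tokens are re-attributed by each step's share of wall-clock duration; otherwise each step keeps its own prompt tokens and output tokens are split by prompt share. A worked sketch of both branches with made-up numbers:

// Worked example of the attribution heuristic above; all values invented.
const run = { inputTokens: 1000, outputTokens: 300 };
const steps = [
  { inputTokens: 900, durationMs: 2000 },
  { inputTokens: 950, durationMs: 1000 }
];

const totalStepInput = steps.reduce((sum, s) => sum + s.inputTokens, 0); // 1850
const duplicated = run.inputTokens > 0 && totalStepInput > run.inputTokens * 1.2; // true
const totalDuration = steps.reduce((sum, s) => sum + s.durationMs, 0); // 3000

for (const step of steps) {
  const proportion = duplicated
    ? step.durationMs / totalDuration      // split by time share
    : step.inputTokens / totalStepInput;   // split by prompt share
  const prompt = duplicated
    ? Math.round(run.inputTokens * proportion) // 667 then 333
    : step.inputTokens;
  const completion = Math.round(run.outputTokens * proportion); // 200 then 100
  console.log({ prompt, completion, total: prompt + completion });
}
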
@@ -2558,11 +2595,15 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model)
       provider: "anthropic",
       startedAt: step.startedAt.toISOString(),
       durationMs: step.durationMs,
-      tokenUsage
-
+      tokenUsage: {
+        prompt: stepPromptTokens,
+        completion: stepOutputTokens,
+        total: stepTotalTokens
+      },
+      costUsd: totalCost * costProportion,
       toolName: step.toolCalls?.[0]?.toolName,
       toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
-      outputPreview: step.text?.slice(0, 200),
+      outputPreview: (step.text || step.thinking)?.slice(0, 200),
       success: step.finishReason !== "error",
       error: step.finishReason === "error" ? "Generation failed" : void 0
     };
@@ -2572,21 +2613,35 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model)
     completion: usage.outputTokens,
     total: usage.totalTokens
   };
-  const
+  const stepTypeBreakdown = {};
+  for (const ts of traceSteps) {
+    const entry = stepTypeBreakdown[ts.type] ?? {
+      count: 0,
+      durationMs: 0,
+      tokens: 0,
+      costUsd: 0
+    };
+    entry.count += 1;
+    entry.durationMs += ts.durationMs;
+    entry.tokens += ts.tokenUsage.total;
+    entry.costUsd += ts.costUsd;
+    stepTypeBreakdown[ts.type] = entry;
+  }
   const summary = {
     totalSteps: traceSteps.length,
     totalDurationMs,
     totalTokens: finalTokens,
-    totalCostUsd:
+    totalCostUsd: totalCost,
     modelBreakdown: {
       [model]: {
         count: traceSteps.length,
         durationMs: totalDurationMs,
         tokens: finalTokens.total,
-        costUsd:
+        costUsd: totalCost
       }
     },
-    modelsUsed: [model]
+    modelsUsed: [model],
+    stepTypeBreakdown
   };
   return {
     id: randomUUID(),
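
The summary object gains a stepTypeBreakdown keyed by step type, accumulated from the trace steps in the loop above. Purely for illustration, a run with two tool-use steps and one completion step might produce a value shaped like:

// Hypothetical values; only the shape mirrors the diff.
const stepTypeBreakdown = {
  TOOL_USE: { count: 2, durationMs: 3400, tokens: 1200, costUsd: 0.012 },
  COMPLETION: { count: 1, durationMs: 900, tokens: 450, costUsd: 0.004 }
};
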
@@ -2626,6 +2681,7 @@ var ClaudeCodeAdapter = class {
       model: modelForSdk,
       temperature: modelConfig?.temperature,
       maxTokens: modelConfig?.maxTokens,
+      maxTurns: modelConfig?.maxTurns,
       aiGatewayUrl,
       aiGatewayHeaders,
       traceContext,
@@ -40080,6 +40136,7 @@ var openai = createOpenAI();
 // src/run-scenario/agents/simple-agent/execute.ts
 import {
   AVAILABLE_CLAUDE_MODEL_IDS,
+  OPENAI_RESPONSES_MODEL_IDS,
   LLMStepType as LLMStepType2,
   LiveTraceEventType as LiveTraceEventType2
 } from "@wix/evalforge-types";
@@ -42662,7 +42719,6 @@ var MODEL_PRICING = {
   CLAUDE_4_OPUS_1_0: { input: 15, output: 75 },
   CLAUDE_4_SONNET_1_0: { input: 3, output: 15 },
   // Anthropic — Claude 3.x
-  CLAUDE_3_7_SONNET_1_0: { input: 3, output: 15 },
   CLAUDE_3_5_SONNET_2_0: { input: 3, output: 15 },
   CLAUDE_3_5_SONNET_1_0: { input: 3, output: 15 },
   CLAUDE_3_HAIKU_1_0: { input: 0.25, output: 1.25 },
@@ -42780,7 +42836,7 @@ function estimateStepTimestamp(startMs, stepIndex, totalSteps)
 // src/run-scenario/agents/simple-agent/execute.ts
 var PROVIDER_ANTHROPIC2 = "anthropic";
 var PROVIDER_OPENAI = "openai";
-var
+var DEFAULT_MAX_TOOL_STEPS = 25;
 function createModel(modelId, baseUrl, headers) {
   const isClaudeModel = AVAILABLE_CLAUDE_MODEL_IDS.includes(
     modelId
@@ -42798,6 +42854,9 @@ function createModel(modelId, baseUrl, headers)
     apiKey: "proxy-auth",
     headers
   });
+  if (OPENAI_RESPONSES_MODEL_IDS.has(modelId)) {
+    return openai2.responses(modelId);
+  }
   return openai2.chat(modelId);
 }
 function isClaudeModelId(modelId) {
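
createModel now routes certain OpenAI models through the provider's Responses API factory instead of Chat Completions. A minimal sketch of the same routing with the standard @ai-sdk/openai provider, where RESPONSES_MODELS is an illustrative stand-in for the package's OPENAI_RESPONSES_MODEL_IDS set:

import { createOpenAI } from "@ai-sdk/openai";

// Illustrative stand-in for OPENAI_RESPONSES_MODEL_IDS.
const RESPONSES_MODELS = new Set(["o3", "gpt-5"]);

function pickOpenAIModel(
  modelId: string,
  baseURL: string,
  headers?: Record<string, string>
) {
  const openai = createOpenAI({ baseURL, apiKey: "proxy-auth", headers });
  // Reasoning-oriented models go through the Responses API; everything
  // else stays on Chat Completions, mirroring the branch in the diff.
  return RESPONSES_MODELS.has(modelId)
    ? openai.responses(modelId)
    : openai.chat(modelId);
}
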
@@ -42834,14 +42893,34 @@ async function executeWithAiSdk(context2)
     emitStartEvent(traceContext, startTime);
   }
   try {
+    const isAnthropic = provider === PROVIDER_ANTHROPIC2;
+    const isResponsesAPI = OPENAI_RESPONSES_MODEL_IDS.has(modelConfig.model);
+    const supportsThinking = isAnthropic || isResponsesAPI;
+    const providerOpts = {
+      ...isAnthropic && {
+        anthropic: {
+          thinking: { type: "enabled", budgetTokens: 1e4 }
+        }
+      },
+      ...isResponsesAPI && {
+        openai: {
+          // Prevent the SDK from sending item_reference inputs that the proxy can't forward
+          store: false,
+          forceReasoning: true,
+          reasoningEffort: "high",
+          reasoningSummary: "detailed"
+        }
+      }
+    };
     const result = await generateText({
       model,
       system: systemPrompt,
       prompt: scenario.triggerPrompt,
-      temperature: modelConfig.temperature,
+      temperature: supportsThinking ? void 0 : modelConfig.temperature,
       maxOutputTokens: modelConfig.maxTokens,
       tools: mcpTools,
-      stopWhen: mcpTools ? stepCountIs(
+      stopWhen: mcpTools ? stepCountIs(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : stepCountIs(1),
+      providerOptions: providerOpts
     });
     const durationMs = Date.now() - startTime;
     const usage = {
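
executeWithAiSdk now builds providerOptions that enable extended thinking for Anthropic models (and reasoning settings for OpenAI Responses models), and it unsets temperature when thinking is on, since Anthropic's API rejects temperature overrides while extended thinking is enabled. A minimal standalone sketch of the Anthropic shape of that call; the model ID, prompts, and budget are placeholders:

import { generateText, stepCountIs } from "ai";
import { anthropic } from "@ai-sdk/anthropic";

const result = await generateText({
  model: anthropic("claude-sonnet-4-20250514"), // placeholder model ID
  system: "You are a test agent.",
  prompt: "Summarize the scenario.",
  maxOutputTokens: 4096,
  // Mirrors DEFAULT_MAX_TOOL_STEPS when tools are attached.
  stopWhen: stepCountIs(25),
  providerOptions: {
    anthropic: {
      // Extended thinking with a 10k-token budget, as in the diff.
      thinking: { type: "enabled", budgetTokens: 10000 }
    }
  }
});
console.log(result.text);
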