@wix/evalforge-evaluator 0.106.0 → 0.108.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js
CHANGED
|
@@ -1505,6 +1505,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
1505
1505
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
1506
1506
|
var import_promises5 = require("fs/promises");
|
|
1507
1507
|
var import_path6 = require("path");
|
|
1508
|
+
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
1508
1509
|
var AGENTS_DIR = ".claude/agents";
|
|
1509
1510
|
function toAgentFilename(name26, index, nameCount) {
|
|
1510
1511
|
const base = (name26 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -1512,7 +1513,34 @@ function toAgentFilename(name26, index, nameCount) {
|
|
|
1512
1513
|
nameCount.set(base, count + 1);
|
|
1513
1514
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
1514
1515
|
}
|
|
1515
|
-
async function
|
|
1516
|
+
/**
 * Resolve the markdown content to write for a single sub-agent.
 *
 * When `agent.source` is set, the content is obtained by calling `fetchFn`
 * with that source descriptor; otherwise the inline `agent.subAgentMd` is used.
 *
 * @param {object} agent - Sub-agent definition. Reads `name`, `source`
 *   (with `owner`, `repo`, `path`, `ref` used for logging) and `subAgentMd`.
 * @param {Function} fetchFn - Async function called as
 *   `fetchFn(agent.source, { userAgent: "EvalForge-Evaluator" })`; its resolved
 *   value is returned unchanged.
 * @returns {Promise<string>} The fetched content, or the inline content
 *   (an empty string when no inline content is present).
 * @throws {Error} When `fetchFn` rejects; the original error is attached as `cause`.
 */
async function resolveSubAgentContent(agent, fetchFn) {
  if (agent.source) {
    let content;
    // Only the fetch itself is guarded so that unrelated failures are not
    // misreported as GitHub fetch errors.
    try {
      content = await fetchFn(agent.source, {
        userAgent: "EvalForge-Evaluator"
      });
    } catch (error48) {
      const message = error48 instanceof Error ? error48.message : "Unknown error";
      console.error(
        `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
      );
      // Keep the original error (and its stack) reachable for debugging.
      throw new Error(
        `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`,
        { cause: error48 }
      );
    }
    console.log(
      `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
    );
    return content;
  }
  if (!agent.subAgentMd) {
    console.warn(
      `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
    );
  }
  // Normalise missing inline content to "" so the caller really writes a blank
  // file, as the warning promises, instead of handing undefined/null to writeFile.
  return agent.subAgentMd ?? "";
}
|
|
1543
|
+
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
1516
1544
|
if (subAgents.length === 0) return;
|
|
1517
1545
|
const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
|
|
1518
1546
|
await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
|
|
@@ -1520,7 +1548,8 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
|
1520
1548
|
for (const [i, agent] of subAgents.entries()) {
|
|
1521
1549
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
1522
1550
|
const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
|
|
1523
|
-
await (
|
|
1551
|
+
const content = await resolveSubAgentContent(agent, fetchFn);
|
|
1552
|
+
await (0, import_promises5.writeFile)(filePath, content, "utf8");
|
|
1524
1553
|
}
|
|
1525
1554
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
1526
1555
|
}
|
|
@@ -1740,6 +1769,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
|
|
|
1740
1769
|
let toolArgs;
|
|
1741
1770
|
let outputPreview;
|
|
1742
1771
|
let filePath;
|
|
1772
|
+
let thinking;
|
|
1743
1773
|
for (const block of message.message.content) {
|
|
1744
1774
|
if (block.type === "tool_use") {
|
|
1745
1775
|
type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
|
|
@@ -1756,6 +1786,15 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
|
|
|
1756
1786
|
}
|
|
1757
1787
|
} else if (block.type === "text") {
|
|
1758
1788
|
outputPreview = block.text.slice(0, 500);
|
|
1789
|
+
if (!toolName) {
|
|
1790
|
+
type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
|
|
1791
|
+
}
|
|
1792
|
+
} else if (block.type === "thinking") {
|
|
1793
|
+
const thinkingBlock = block;
|
|
1794
|
+
thinking = thinkingBlock.thinking.slice(0, 500);
|
|
1795
|
+
if (!outputPreview && !toolName) {
|
|
1796
|
+
type = import_evalforge_types4.LiveTraceEventType.THINKING;
|
|
1797
|
+
}
|
|
1759
1798
|
}
|
|
1760
1799
|
}
|
|
1761
1800
|
return {
|
|
@@ -1770,6 +1809,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
|
|
|
1770
1809
|
toolArgs,
|
|
1771
1810
|
outputPreview,
|
|
1772
1811
|
filePath,
|
|
1812
|
+
thinking,
|
|
1773
1813
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1774
1814
|
isComplete
|
|
1775
1815
|
};
|
|
@@ -2395,28 +2435,53 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
2395
2435
|
const sortedGroups = Array.from(assistantMessageGroups.values()).sort(
|
|
2396
2436
|
(a, b) => a.firstReceivedAt.getTime() - b.firstReceivedAt.getTime()
|
|
2397
2437
|
);
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
const
|
|
2401
|
-
const
|
|
2402
|
-
const
|
|
2438
|
+
const mergedTurns = [];
|
|
2439
|
+
for (const group of sortedGroups) {
|
|
2440
|
+
const lastMsg = group.messages[group.messages.length - 1];
|
|
2441
|
+
const inputTokens = lastMsg.message.usage.input_tokens;
|
|
2442
|
+
const prev = mergedTurns[mergedTurns.length - 1];
|
|
2443
|
+
const prevLastMsg = prev?.groups[prev.groups.length - 1].messages.at(-1);
|
|
2444
|
+
const prevInputTokens = prevLastMsg?.message.usage.input_tokens;
|
|
2445
|
+
if (prev && prevInputTokens === inputTokens) {
|
|
2446
|
+
prev.groups.push(group);
|
|
2447
|
+
prev.lastReceivedAt = group.lastReceivedAt;
|
|
2448
|
+
} else {
|
|
2449
|
+
mergedTurns.push({
|
|
2450
|
+
groups: [group],
|
|
2451
|
+
firstReceivedAt: group.firstReceivedAt,
|
|
2452
|
+
lastReceivedAt: group.lastReceivedAt
|
|
2453
|
+
});
|
|
2454
|
+
}
|
|
2455
|
+
}
|
|
2456
|
+
for (let i = 0; i < mergedTurns.length; i++) {
|
|
2457
|
+
const turn = mergedTurns[i];
|
|
2458
|
+
const stepStartTime = turn.firstReceivedAt;
|
|
2459
|
+
const nextStepStartTime = i < mergedTurns.length - 1 ? mergedTurns[i + 1].firstReceivedAt : endTime;
|
|
2403
2460
|
const durationMs = nextStepStartTime.getTime() - stepStartTime.getTime();
|
|
2404
|
-
const usage = lastMessage.message.usage;
|
|
2405
|
-
const inputTokens = usage.input_tokens;
|
|
2406
|
-
const outputTokens = usage.output_tokens;
|
|
2407
2461
|
let text2 = "";
|
|
2408
2462
|
let thinking = "";
|
|
2409
2463
|
const toolCalls = [];
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
|
|
2413
|
-
|
|
2414
|
-
|
|
2415
|
-
|
|
2416
|
-
|
|
2417
|
-
|
|
2418
|
-
|
|
2419
|
-
|
|
2464
|
+
let lastStopReason = null;
|
|
2465
|
+
let inputTokens = 0;
|
|
2466
|
+
let outputTokens = 0;
|
|
2467
|
+
let cacheReadTokens;
|
|
2468
|
+
let cacheWriteTokens;
|
|
2469
|
+
for (const group of turn.groups) {
|
|
2470
|
+
const lastMessage = group.messages[group.messages.length - 1];
|
|
2471
|
+
lastStopReason = lastMessage.message.stop_reason;
|
|
2472
|
+
const msgUsage = lastMessage.message.usage;
|
|
2473
|
+
inputTokens = msgUsage.input_tokens;
|
|
2474
|
+
outputTokens = msgUsage.output_tokens;
|
|
2475
|
+
cacheReadTokens = msgUsage.cache_read_input_tokens ?? void 0;
|
|
2476
|
+
cacheWriteTokens = msgUsage.cache_creation_input_tokens ?? void 0;
|
|
2477
|
+
for (const block of lastMessage.message.content) {
|
|
2478
|
+
if (block.type === "text") {
|
|
2479
|
+
text2 += block.text;
|
|
2480
|
+
} else if (block.type === "thinking") {
|
|
2481
|
+
thinking += block.thinking;
|
|
2482
|
+
} else if (block.type === "tool_use") {
|
|
2483
|
+
toolCalls.push({ toolName: block.name, args: block.input });
|
|
2484
|
+
}
|
|
2420
2485
|
}
|
|
2421
2486
|
}
|
|
2422
2487
|
steps.push({
|
|
@@ -2425,13 +2490,14 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
2425
2490
|
usage: {
|
|
2426
2491
|
inputTokens,
|
|
2427
2492
|
outputTokens,
|
|
2428
|
-
totalTokens: inputTokens + outputTokens
|
|
2493
|
+
totalTokens: inputTokens + outputTokens,
|
|
2494
|
+
cacheReadTokens: cacheReadTokens || void 0,
|
|
2495
|
+
cacheWriteTokens: cacheWriteTokens || void 0
|
|
2429
2496
|
},
|
|
2430
|
-
finishReason: mapStopReason(
|
|
2497
|
+
finishReason: mapStopReason(lastStopReason),
|
|
2431
2498
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
2432
2499
|
startedAt: stepStartTime,
|
|
2433
2500
|
durationMs: Math.max(0, durationMs)
|
|
2434
|
-
// Ensure non-negative
|
|
2435
2501
|
});
|
|
2436
2502
|
}
|
|
2437
2503
|
return { steps, result };
|
|
@@ -2471,35 +2537,35 @@ function extractTotalUsage(result) {
|
|
|
2471
2537
|
inputTokens: usage.input_tokens,
|
|
2472
2538
|
outputTokens: usage.output_tokens,
|
|
2473
2539
|
totalTokens: usage.input_tokens + usage.output_tokens,
|
|
2474
|
-
costUsd: result.total_cost_usd
|
|
2540
|
+
costUsd: result.total_cost_usd,
|
|
2541
|
+
cacheReadTokens: usage.cache_read_input_tokens || void 0,
|
|
2542
|
+
cacheWriteTokens: usage.cache_creation_input_tokens || void 0,
|
|
2543
|
+
durationApiMs: result.duration_api_ms || void 0
|
|
2475
2544
|
};
|
|
2476
2545
|
}
|
|
2477
2546
|
function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
2478
2547
|
const totalCost = usage.costUsd ?? 0;
|
|
2479
|
-
const
|
|
2480
|
-
(sum, s) => sum + s.usage.
|
|
2548
|
+
const totalStepInputTokens = steps.reduce(
|
|
2549
|
+
(sum, s) => sum + s.usage.inputTokens,
|
|
2481
2550
|
0
|
|
2482
2551
|
);
|
|
2483
2552
|
const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
|
|
2484
|
-
const
|
|
2553
|
+
const inputTokensDuplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
|
|
2485
2554
|
const traceSteps = steps.map((step, index) => {
|
|
2486
|
-
let
|
|
2555
|
+
let stepPromptTokens;
|
|
2556
|
+
let stepOutputTokens;
|
|
2487
2557
|
let proportion;
|
|
2488
|
-
if (
|
|
2489
|
-
proportion = step.usage.totalTokens / totalStepTokens;
|
|
2490
|
-
tokenUsage = {
|
|
2491
|
-
prompt: step.usage.inputTokens,
|
|
2492
|
-
completion: step.usage.outputTokens,
|
|
2493
|
-
total: step.usage.totalTokens
|
|
2494
|
-
};
|
|
2495
|
-
} else {
|
|
2558
|
+
if (inputTokensDuplicated) {
|
|
2496
2559
|
proportion = totalStepDuration > 0 ? step.durationMs / totalStepDuration : 0;
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2560
|
+
stepPromptTokens = Math.round(usage.inputTokens * proportion);
|
|
2561
|
+
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
2562
|
+
} else {
|
|
2563
|
+
proportion = totalStepInputTokens > 0 ? step.usage.inputTokens / totalStepInputTokens : 0;
|
|
2564
|
+
stepPromptTokens = step.usage.inputTokens;
|
|
2565
|
+
stepOutputTokens = Math.round(usage.outputTokens * proportion);
|
|
2502
2566
|
}
|
|
2567
|
+
const stepTotalTokens = stepPromptTokens + stepOutputTokens;
|
|
2568
|
+
const costProportion = proportion;
|
|
2503
2569
|
const stepType = step.toolCalls?.length ? import_evalforge_types4.LLMStepType.TOOL_USE : step.thinking && !step.text ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
2504
2570
|
return {
|
|
2505
2571
|
id: (0, import_crypto.randomUUID)(),
|
|
@@ -2509,11 +2575,15 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2509
2575
|
provider: "anthropic",
|
|
2510
2576
|
startedAt: step.startedAt.toISOString(),
|
|
2511
2577
|
durationMs: step.durationMs,
|
|
2512
|
-
tokenUsage
|
|
2513
|
-
|
|
2578
|
+
tokenUsage: {
|
|
2579
|
+
prompt: stepPromptTokens,
|
|
2580
|
+
completion: stepOutputTokens,
|
|
2581
|
+
total: stepTotalTokens
|
|
2582
|
+
},
|
|
2583
|
+
costUsd: totalCost * costProportion,
|
|
2514
2584
|
toolName: step.toolCalls?.[0]?.toolName,
|
|
2515
2585
|
toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
|
|
2516
|
-
outputPreview: step.text?.slice(0, 200),
|
|
2586
|
+
outputPreview: (step.text || step.thinking)?.slice(0, 200),
|
|
2517
2587
|
success: step.finishReason !== "error",
|
|
2518
2588
|
error: step.finishReason === "error" ? "Generation failed" : void 0
|
|
2519
2589
|
};
|
|
@@ -2523,21 +2593,35 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2523
2593
|
completion: usage.outputTokens,
|
|
2524
2594
|
total: usage.totalTokens
|
|
2525
2595
|
};
|
|
2526
|
-
const
|
|
2596
|
+
const stepTypeBreakdown = {};
|
|
2597
|
+
for (const ts of traceSteps) {
|
|
2598
|
+
const entry = stepTypeBreakdown[ts.type] ?? {
|
|
2599
|
+
count: 0,
|
|
2600
|
+
durationMs: 0,
|
|
2601
|
+
tokens: 0,
|
|
2602
|
+
costUsd: 0
|
|
2603
|
+
};
|
|
2604
|
+
entry.count += 1;
|
|
2605
|
+
entry.durationMs += ts.durationMs;
|
|
2606
|
+
entry.tokens += ts.tokenUsage.total;
|
|
2607
|
+
entry.costUsd += ts.costUsd;
|
|
2608
|
+
stepTypeBreakdown[ts.type] = entry;
|
|
2609
|
+
}
|
|
2527
2610
|
const summary = {
|
|
2528
2611
|
totalSteps: traceSteps.length,
|
|
2529
2612
|
totalDurationMs,
|
|
2530
2613
|
totalTokens: finalTokens,
|
|
2531
|
-
totalCostUsd:
|
|
2614
|
+
totalCostUsd: totalCost,
|
|
2532
2615
|
modelBreakdown: {
|
|
2533
2616
|
[model]: {
|
|
2534
2617
|
count: traceSteps.length,
|
|
2535
2618
|
durationMs: totalDurationMs,
|
|
2536
2619
|
tokens: finalTokens.total,
|
|
2537
|
-
costUsd:
|
|
2620
|
+
costUsd: totalCost
|
|
2538
2621
|
}
|
|
2539
2622
|
},
|
|
2540
|
-
modelsUsed: [model]
|
|
2623
|
+
modelsUsed: [model],
|
|
2624
|
+
stepTypeBreakdown
|
|
2541
2625
|
};
|
|
2542
2626
|
return {
|
|
2543
2627
|
id: (0, import_crypto.randomUUID)(),
|
|
@@ -42745,6 +42829,9 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
42745
42829
|
apiKey: "proxy-auth",
|
|
42746
42830
|
headers
|
|
42747
42831
|
});
|
|
42832
|
+
if (import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelId)) {
|
|
42833
|
+
return openai2.responses(modelId);
|
|
42834
|
+
}
|
|
42748
42835
|
return openai2.chat(modelId);
|
|
42749
42836
|
}
|
|
42750
42837
|
function isClaudeModelId(modelId) {
|
|
@@ -42781,14 +42868,34 @@ async function executeWithAiSdk(context2) {
|
|
|
42781
42868
|
emitStartEvent(traceContext, startTime);
|
|
42782
42869
|
}
|
|
42783
42870
|
try {
|
|
42871
|
+
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
42872
|
+
const isResponsesAPI = import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelConfig.model);
|
|
42873
|
+
const supportsThinking = isAnthropic || isResponsesAPI;
|
|
42874
|
+
const providerOpts = {
|
|
42875
|
+
...isAnthropic && {
|
|
42876
|
+
anthropic: {
|
|
42877
|
+
thinking: { type: "enabled", budgetTokens: 1e4 }
|
|
42878
|
+
}
|
|
42879
|
+
},
|
|
42880
|
+
...isResponsesAPI && {
|
|
42881
|
+
openai: {
|
|
42882
|
+
// Prevent the SDK from sending item_reference inputs that the proxy can't forward
|
|
42883
|
+
store: false,
|
|
42884
|
+
forceReasoning: true,
|
|
42885
|
+
reasoningEffort: "high",
|
|
42886
|
+
reasoningSummary: "detailed"
|
|
42887
|
+
}
|
|
42888
|
+
}
|
|
42889
|
+
};
|
|
42784
42890
|
const result = await generateText({
|
|
42785
42891
|
model,
|
|
42786
42892
|
system: systemPrompt,
|
|
42787
42893
|
prompt: scenario.triggerPrompt,
|
|
42788
|
-
temperature: modelConfig.temperature,
|
|
42894
|
+
temperature: supportsThinking ? void 0 : modelConfig.temperature,
|
|
42789
42895
|
maxOutputTokens: modelConfig.maxTokens,
|
|
42790
42896
|
tools: mcpTools,
|
|
42791
|
-
stopWhen: mcpTools ? stepCountIs(MAX_TOOL_STEPS) : stepCountIs(1)
|
|
42897
|
+
stopWhen: mcpTools ? stepCountIs(MAX_TOOL_STEPS) : stepCountIs(1),
|
|
42898
|
+
providerOptions: providerOpts
|
|
42792
42899
|
});
|
|
42793
42900
|
const durationMs = Date.now() - startTime;
|
|
42794
42901
|
const usage = {
|