@wix/evalforge-evaluator 0.119.0 → 0.120.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +45 -54
- package/build/index.js.map +2 -2
- package/build/index.mjs +45 -54
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/simple-agent/build-conversation.d.ts +1 -1
- package/build/types/run-scenario/agents/simple-agent/execute.d.ts +1 -1
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -55,6 +55,7 @@ function loadConfig() {
|
|
|
55
55
|
aiGatewayHeaders[key] = value;
|
|
56
56
|
}
|
|
57
57
|
}
|
|
58
|
+
aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
|
|
58
59
|
const tracePushUrl = process.env.TRACE_PUSH_URL;
|
|
59
60
|
const routeHeader = process.env.EVAL_ROUTE_HEADER;
|
|
60
61
|
const authToken = process.env.EVAL_AUTH_TOKEN;
|
|
@@ -3404,7 +3405,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
|
|
|
3404
3405
|
}
|
|
3405
3406
|
|
|
3406
3407
|
// src/run-scenario/agents/simple-agent/build-conversation.ts
|
|
3407
|
-
function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
3408
|
+
function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestamps) {
|
|
3408
3409
|
const messages = [];
|
|
3409
3410
|
messages.push({
|
|
3410
3411
|
role: "user",
|
|
@@ -3413,11 +3414,9 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
|
3413
3414
|
});
|
|
3414
3415
|
for (let i = 0; i < steps.length; i++) {
|
|
3415
3416
|
const step = steps[i];
|
|
3416
|
-
const stepTimestamp = estimateStepTimestamp(
|
|
3417
|
-
executionStartMs,
|
|
3418
|
-
i,
|
|
3419
|
-
steps.length
|
|
3420
|
-
);
|
|
3417
|
+
const stepTimestamp = new Date(
|
|
3418
|
+
stepTimestamps[i] ?? executionStartMs
|
|
3419
|
+
).toISOString();
|
|
3421
3420
|
const assistantContent = [];
|
|
3422
3421
|
if (step.reasoningText) {
|
|
3423
3422
|
assistantContent.push({ type: "thinking", thinking: step.reasoningText });
|
|
@@ -3460,10 +3459,6 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
|
3460
3459
|
}
|
|
3461
3460
|
return messages;
|
|
3462
3461
|
}
|
|
3463
|
-
function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
|
|
3464
|
-
const offset = totalSteps > 1 ? (stepIndex + 1) / totalSteps : 1;
|
|
3465
|
-
return new Date(startMs + Math.round(offset * 1e3)).toISOString();
|
|
3466
|
-
}
|
|
3467
3462
|
|
|
3468
3463
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3469
3464
|
var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
@@ -3548,6 +3543,7 @@ async function executeWithAiSdk(context) {
|
|
|
3548
3543
|
}
|
|
3549
3544
|
}
|
|
3550
3545
|
};
|
|
3546
|
+
const stepTimestamps = [];
|
|
3551
3547
|
const result = await (0, import_ai.generateText)({
|
|
3552
3548
|
model,
|
|
3553
3549
|
system: systemPrompt,
|
|
@@ -3556,7 +3552,34 @@ async function executeWithAiSdk(context) {
|
|
|
3556
3552
|
maxOutputTokens: modelConfig.maxTokens,
|
|
3557
3553
|
tools: mcpTools,
|
|
3558
3554
|
stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
|
|
3559
|
-
providerOptions: providerOpts
|
|
3555
|
+
providerOptions: providerOpts,
|
|
3556
|
+
onStepFinish: (step) => {
|
|
3557
|
+
stepTimestamps.push(Date.now());
|
|
3558
|
+
if (traceContext) {
|
|
3559
|
+
const isToolStep = step.toolCalls.length > 0;
|
|
3560
|
+
const firstToolCall = step.toolCalls[0];
|
|
3561
|
+
emitTraceEvent(
|
|
3562
|
+
{
|
|
3563
|
+
evalRunId: traceContext.evalRunId,
|
|
3564
|
+
scenarioId: traceContext.scenarioId,
|
|
3565
|
+
scenarioName: traceContext.scenarioName,
|
|
3566
|
+
targetId: traceContext.targetId,
|
|
3567
|
+
targetName: traceContext.targetName,
|
|
3568
|
+
stepNumber: stepTimestamps.length,
|
|
3569
|
+
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
3570
|
+
toolName: firstToolCall?.toolName,
|
|
3571
|
+
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
3572
|
+
outputPreview: step.text?.slice(0, 500),
|
|
3573
|
+
elapsedMs: Date.now() - startTime,
|
|
3574
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3575
|
+
isComplete: false
|
|
3576
|
+
},
|
|
3577
|
+
traceContext.tracePushUrl,
|
|
3578
|
+
traceContext.routeHeader,
|
|
3579
|
+
traceContext.authToken
|
|
3580
|
+
);
|
|
3581
|
+
}
|
|
3582
|
+
}
|
|
3560
3583
|
});
|
|
3561
3584
|
const durationMs = Date.now() - startTime;
|
|
3562
3585
|
const usage = {
|
|
@@ -3570,16 +3593,17 @@ async function executeWithAiSdk(context) {
|
|
|
3570
3593
|
usage,
|
|
3571
3594
|
modelConfig.model,
|
|
3572
3595
|
provider,
|
|
3573
|
-
startTime
|
|
3596
|
+
startTime,
|
|
3597
|
+
stepTimestamps
|
|
3574
3598
|
);
|
|
3575
3599
|
if (traceContext) {
|
|
3576
|
-
emitStepEvents(traceContext, result.steps, startTime);
|
|
3577
|
-
emitCompletionEvent(traceContext, result.steps.length + 1);
|
|
3600
|
+
emitCompletionEvent(traceContext, stepTimestamps.length + 1);
|
|
3578
3601
|
}
|
|
3579
3602
|
const conversation = buildConversation3(
|
|
3580
3603
|
scenario.triggerPrompt,
|
|
3581
3604
|
result.steps,
|
|
3582
|
-
startTime
|
|
3605
|
+
startTime,
|
|
3606
|
+
stepTimestamps
|
|
3583
3607
|
);
|
|
3584
3608
|
return {
|
|
3585
3609
|
outputText: result.text,
|
|
@@ -3620,20 +3644,16 @@ function findToolResultError(step) {
|
|
|
3620
3644
|
}
|
|
3621
3645
|
return null;
|
|
3622
3646
|
}
|
|
3623
|
-
function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
|
|
3624
|
-
const totalStepTokens = steps.reduce(
|
|
3625
|
-
(sum, s) => sum + (s.usage.totalTokens ?? 0),
|
|
3626
|
-
0
|
|
3627
|
-
);
|
|
3647
|
+
function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs, stepTimestamps) {
|
|
3628
3648
|
const traceSteps = steps.map((step, i) => {
|
|
3629
|
-
const
|
|
3630
|
-
const
|
|
3631
|
-
const stepDurationMs =
|
|
3649
|
+
const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
|
|
3650
|
+
const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
|
|
3651
|
+
const stepDurationMs = stepFinishedAt - stepStartedAt;
|
|
3632
3652
|
const firstToolCall = step.toolCalls[0];
|
|
3633
3653
|
const tokenUsage = {
|
|
3634
3654
|
prompt: step.usage.inputTokens ?? 0,
|
|
3635
3655
|
completion: step.usage.outputTokens ?? 0,
|
|
3636
|
-
total:
|
|
3656
|
+
total: step.usage.totalTokens ?? 0
|
|
3637
3657
|
};
|
|
3638
3658
|
const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
|
|
3639
3659
|
const toolResultError = findToolResultError(step);
|
|
@@ -3644,9 +3664,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
3644
3664
|
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
3645
3665
|
model: modelId,
|
|
3646
3666
|
provider,
|
|
3647
|
-
startedAt: new Date(
|
|
3648
|
-
executionStartMs + Math.round(totalDurationMs * (i / Math.max(steps.length, 1)))
|
|
3649
|
-
).toISOString(),
|
|
3667
|
+
startedAt: new Date(stepStartedAt).toISOString(),
|
|
3650
3668
|
durationMs: stepDurationMs,
|
|
3651
3669
|
tokenUsage,
|
|
3652
3670
|
costUsd,
|
|
@@ -3704,33 +3722,6 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
3704
3722
|
traceContext.authToken
|
|
3705
3723
|
);
|
|
3706
3724
|
}
|
|
3707
|
-
function emitStepEvents(traceContext, steps, startTime) {
|
|
3708
|
-
for (let i = 0; i < steps.length; i++) {
|
|
3709
|
-
const step = steps[i];
|
|
3710
|
-
const isToolStep = step.toolCalls.length > 0;
|
|
3711
|
-
const firstToolCall = step.toolCalls[0];
|
|
3712
|
-
emitTraceEvent(
|
|
3713
|
-
{
|
|
3714
|
-
evalRunId: traceContext.evalRunId,
|
|
3715
|
-
scenarioId: traceContext.scenarioId,
|
|
3716
|
-
scenarioName: traceContext.scenarioName,
|
|
3717
|
-
targetId: traceContext.targetId,
|
|
3718
|
-
targetName: traceContext.targetName,
|
|
3719
|
-
stepNumber: i + 1,
|
|
3720
|
-
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
3721
|
-
toolName: firstToolCall?.toolName,
|
|
3722
|
-
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
3723
|
-
outputPreview: step.text?.slice(0, 500),
|
|
3724
|
-
elapsedMs: Date.now() - startTime,
|
|
3725
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3726
|
-
isComplete: false
|
|
3727
|
-
},
|
|
3728
|
-
traceContext.tracePushUrl,
|
|
3729
|
-
traceContext.routeHeader,
|
|
3730
|
-
traceContext.authToken
|
|
3731
|
-
);
|
|
3732
|
-
}
|
|
3733
|
-
}
|
|
3734
3725
|
function emitCompletionEvent(traceContext, stepNumber) {
|
|
3735
3726
|
emitTraceEvent(
|
|
3736
3727
|
{
|