@wix/evalforge-evaluator 0.119.0 → 0.121.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +48 -144
- package/build/index.js.map +2 -2
- package/build/index.mjs +48 -144
- package/build/index.mjs.map +3 -3
- package/build/types/api-client.d.ts +1 -2
- package/build/types/fetch-evaluation-data.d.ts +1 -5
- package/build/types/run-scenario/agents/simple-agent/build-conversation.d.ts +1 -1
- package/build/types/run-scenario/agents/simple-agent/execute.d.ts +1 -1
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -55,6 +55,7 @@ function loadConfig() {
|
|
|
55
55
|
aiGatewayHeaders[key] = value;
|
|
56
56
|
}
|
|
57
57
|
}
|
|
58
|
+
aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
|
|
58
59
|
const tracePushUrl = process.env.TRACE_PUSH_URL;
|
|
59
60
|
const routeHeader = process.env.EVAL_ROUTE_HEADER;
|
|
60
61
|
const authToken = process.env.EVAL_AUTH_TOKEN;
|
|
@@ -185,9 +186,6 @@ function createApiClient(serverUrl, options = "") {
|
|
|
185
186
|
getPreset(projectId2, id) {
|
|
186
187
|
return fetchJson(`/projects/${projectId2}/presets/${id}`);
|
|
187
188
|
},
|
|
188
|
-
getAssertion(projectId2, id) {
|
|
189
|
-
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
190
|
-
},
|
|
191
189
|
addResult(projectId2, evalRunId2, result) {
|
|
192
190
|
return postJson(
|
|
193
191
|
`/projects/${projectId2}/eval-runs/${evalRunId2}/results`,
|
|
@@ -249,9 +247,6 @@ function resolveValue(value, placeholders) {
|
|
|
249
247
|
}
|
|
250
248
|
return value;
|
|
251
249
|
}
|
|
252
|
-
function resolvePlaceholdersInString(text, placeholders) {
|
|
253
|
-
return resolveValue(text, placeholders);
|
|
254
|
-
}
|
|
255
250
|
|
|
256
251
|
// src/fetch-evaluation-data.ts
|
|
257
252
|
function parseSkillNamesFromParams(value) {
|
|
@@ -264,59 +259,6 @@ function parseSkillNamesFromParams(value) {
|
|
|
264
259
|
}
|
|
265
260
|
return [];
|
|
266
261
|
}
|
|
267
|
-
function applyParamsToAssertion(assertion, params) {
|
|
268
|
-
if (!params || Object.keys(params).length === 0) {
|
|
269
|
-
return assertion;
|
|
270
|
-
}
|
|
271
|
-
if (assertion.type === "llm_judge") {
|
|
272
|
-
const stringParams = {};
|
|
273
|
-
for (const [key, value] of Object.entries(params)) {
|
|
274
|
-
stringParams[key] = String(value ?? "");
|
|
275
|
-
}
|
|
276
|
-
const prompt = resolvePlaceholdersInString(assertion.prompt, stringParams);
|
|
277
|
-
return {
|
|
278
|
-
...assertion,
|
|
279
|
-
prompt,
|
|
280
|
-
...params.model !== void 0 && { model: params.model },
|
|
281
|
-
...params.maxTokens !== void 0 && {
|
|
282
|
-
maxTokens: params.maxTokens
|
|
283
|
-
},
|
|
284
|
-
...params.temperature !== void 0 && {
|
|
285
|
-
temperature: params.temperature
|
|
286
|
-
},
|
|
287
|
-
...params.minScore !== void 0 && {
|
|
288
|
-
minScore: params.minScore
|
|
289
|
-
}
|
|
290
|
-
};
|
|
291
|
-
}
|
|
292
|
-
if (assertion.type === "time_limit" && params.maxDurationMs !== void 0) {
|
|
293
|
-
return {
|
|
294
|
-
...assertion,
|
|
295
|
-
maxDurationMs: params.maxDurationMs
|
|
296
|
-
};
|
|
297
|
-
}
|
|
298
|
-
if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
|
|
299
|
-
return {
|
|
300
|
-
...assertion,
|
|
301
|
-
skillNames: parseSkillNamesFromParams(params.skillNames)
|
|
302
|
-
};
|
|
303
|
-
}
|
|
304
|
-
if (assertion.type === "tool_called_with_param") {
|
|
305
|
-
return {
|
|
306
|
-
...assertion,
|
|
307
|
-
...params.toolName !== void 0 && {
|
|
308
|
-
toolName: params.toolName
|
|
309
|
-
},
|
|
310
|
-
...params.expectedParams !== void 0 && {
|
|
311
|
-
expectedParams: params.expectedParams
|
|
312
|
-
},
|
|
313
|
-
...params.requireSuccess !== void 0 && {
|
|
314
|
-
requireSuccess: params.requireSuccess
|
|
315
|
-
}
|
|
316
|
-
};
|
|
317
|
-
}
|
|
318
|
-
return { ...assertion, ...params };
|
|
319
|
-
}
|
|
320
262
|
function resolveSystemAssertion(assertionId, params) {
|
|
321
263
|
const systemAssertion = import_evalforge_types.SYSTEM_ASSERTIONS[assertionId];
|
|
322
264
|
let baseAssertion;
|
|
@@ -371,18 +313,6 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
371
313
|
}
|
|
372
314
|
return baseAssertion;
|
|
373
315
|
}
|
|
374
|
-
function customAssertionToAssertion(ca, params) {
|
|
375
|
-
const config = ca.config;
|
|
376
|
-
const baseAssertion = {
|
|
377
|
-
type: "llm_judge",
|
|
378
|
-
prompt: config?.prompt ?? "",
|
|
379
|
-
minScore: config?.minScore,
|
|
380
|
-
model: config?.model,
|
|
381
|
-
maxTokens: config?.maxTokens,
|
|
382
|
-
temperature: config?.temperature
|
|
383
|
-
};
|
|
384
|
-
return applyParamsToAssertion(baseAssertion, params);
|
|
385
|
-
}
|
|
386
316
|
async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
387
317
|
const evalRun = await api.getEvalRun(projectId2, evalRunId2);
|
|
388
318
|
const scenarios = await Promise.all(
|
|
@@ -452,30 +382,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
452
382
|
templateIds.map((id) => api.getTemplate(projectId2, id))
|
|
453
383
|
) : [];
|
|
454
384
|
const templateMap = new Map(templates.map((t) => [t.id, t]));
|
|
455
|
-
const assertionIds = [
|
|
456
|
-
...new Set(
|
|
457
|
-
scenarios.flatMap((s) => s.assertionLinks ?? []).map((link) => link.assertionId).filter((id) => !(0, import_evalforge_types.isSystemAssertionId)(id))
|
|
458
|
-
)
|
|
459
|
-
];
|
|
460
|
-
const assertions = assertionIds.length > 0 ? await Promise.all(
|
|
461
|
-
assertionIds.map((id) => api.getAssertion(projectId2, id))
|
|
462
|
-
) : [];
|
|
463
|
-
const assertionMap = new Map(assertions.map((a) => [a.id, a]));
|
|
464
385
|
const scenarioItems = scenarios.map((scenario) => {
|
|
465
386
|
const resolvedAssertions = (scenario.assertionLinks ?? []).map((link) => {
|
|
466
387
|
const { assertionId, params } = link;
|
|
467
|
-
if ((0, import_evalforge_types.isSystemAssertionId)(assertionId)) {
|
|
468
|
-
return resolveSystemAssertion(
|
|
469
|
-
assertionId,
|
|
470
|
-
params
|
|
471
|
-
);
|
|
472
|
-
}
|
|
473
|
-
const customAssertion = assertionMap.get(assertionId);
|
|
474
|
-
if (!customAssertion) {
|
|
388
|
+
if (!(0, import_evalforge_types.isSystemAssertionId)(assertionId)) {
|
|
475
389
|
return null;
|
|
476
390
|
}
|
|
477
|
-
return customAssertionToAssertion(
|
|
478
|
-
customAssertion,
|
|
391
|
+
return resolveSystemAssertion(
|
|
392
|
+
assertionId,
|
|
479
393
|
params
|
|
480
394
|
);
|
|
481
395
|
}).filter((a) => a !== null);
|
|
@@ -3404,7 +3318,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
|
|
|
3404
3318
|
}
|
|
3405
3319
|
|
|
3406
3320
|
// src/run-scenario/agents/simple-agent/build-conversation.ts
|
|
3407
|
-
function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
3321
|
+
function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestamps) {
|
|
3408
3322
|
const messages = [];
|
|
3409
3323
|
messages.push({
|
|
3410
3324
|
role: "user",
|
|
@@ -3413,11 +3327,9 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
|
3413
3327
|
});
|
|
3414
3328
|
for (let i = 0; i < steps.length; i++) {
|
|
3415
3329
|
const step = steps[i];
|
|
3416
|
-
const stepTimestamp = estimateStepTimestamp(
|
|
3417
|
-
executionStartMs,
|
|
3418
|
-
i,
|
|
3419
|
-
steps.length
|
|
3420
|
-
);
|
|
3330
|
+
const stepTimestamp = new Date(
|
|
3331
|
+
stepTimestamps[i] ?? executionStartMs
|
|
3332
|
+
).toISOString();
|
|
3421
3333
|
const assistantContent = [];
|
|
3422
3334
|
if (step.reasoningText) {
|
|
3423
3335
|
assistantContent.push({ type: "thinking", thinking: step.reasoningText });
|
|
@@ -3460,10 +3372,6 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
|
|
|
3460
3372
|
}
|
|
3461
3373
|
return messages;
|
|
3462
3374
|
}
|
|
3463
|
-
function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
|
|
3464
|
-
const offset = totalSteps > 1 ? (stepIndex + 1) / totalSteps : 1;
|
|
3465
|
-
return new Date(startMs + Math.round(offset * 1e3)).toISOString();
|
|
3466
|
-
}
|
|
3467
3375
|
|
|
3468
3376
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3469
3377
|
var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
@@ -3548,6 +3456,7 @@ async function executeWithAiSdk(context) {
|
|
|
3548
3456
|
}
|
|
3549
3457
|
}
|
|
3550
3458
|
};
|
|
3459
|
+
const stepTimestamps = [];
|
|
3551
3460
|
const result = await (0, import_ai.generateText)({
|
|
3552
3461
|
model,
|
|
3553
3462
|
system: systemPrompt,
|
|
@@ -3556,7 +3465,34 @@ async function executeWithAiSdk(context) {
|
|
|
3556
3465
|
maxOutputTokens: modelConfig.maxTokens,
|
|
3557
3466
|
tools: mcpTools,
|
|
3558
3467
|
stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
|
|
3559
|
-
providerOptions: providerOpts
|
|
3468
|
+
providerOptions: providerOpts,
|
|
3469
|
+
onStepFinish: (step) => {
|
|
3470
|
+
stepTimestamps.push(Date.now());
|
|
3471
|
+
if (traceContext) {
|
|
3472
|
+
const isToolStep = step.toolCalls.length > 0;
|
|
3473
|
+
const firstToolCall = step.toolCalls[0];
|
|
3474
|
+
emitTraceEvent(
|
|
3475
|
+
{
|
|
3476
|
+
evalRunId: traceContext.evalRunId,
|
|
3477
|
+
scenarioId: traceContext.scenarioId,
|
|
3478
|
+
scenarioName: traceContext.scenarioName,
|
|
3479
|
+
targetId: traceContext.targetId,
|
|
3480
|
+
targetName: traceContext.targetName,
|
|
3481
|
+
stepNumber: stepTimestamps.length,
|
|
3482
|
+
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
3483
|
+
toolName: firstToolCall?.toolName,
|
|
3484
|
+
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
3485
|
+
outputPreview: step.text?.slice(0, 500),
|
|
3486
|
+
elapsedMs: Date.now() - startTime,
|
|
3487
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3488
|
+
isComplete: false
|
|
3489
|
+
},
|
|
3490
|
+
traceContext.tracePushUrl,
|
|
3491
|
+
traceContext.routeHeader,
|
|
3492
|
+
traceContext.authToken
|
|
3493
|
+
);
|
|
3494
|
+
}
|
|
3495
|
+
}
|
|
3560
3496
|
});
|
|
3561
3497
|
const durationMs = Date.now() - startTime;
|
|
3562
3498
|
const usage = {
|
|
@@ -3570,16 +3506,17 @@ async function executeWithAiSdk(context) {
|
|
|
3570
3506
|
usage,
|
|
3571
3507
|
modelConfig.model,
|
|
3572
3508
|
provider,
|
|
3573
|
-
startTime
|
|
3509
|
+
startTime,
|
|
3510
|
+
stepTimestamps
|
|
3574
3511
|
);
|
|
3575
3512
|
if (traceContext) {
|
|
3576
|
-
emitStepEvents(traceContext, result.steps, startTime);
|
|
3577
|
-
emitCompletionEvent(traceContext, result.steps.length + 1);
|
|
3513
|
+
emitCompletionEvent(traceContext, stepTimestamps.length + 1);
|
|
3578
3514
|
}
|
|
3579
3515
|
const conversation = buildConversation3(
|
|
3580
3516
|
scenario.triggerPrompt,
|
|
3581
3517
|
result.steps,
|
|
3582
|
-
startTime
|
|
3518
|
+
startTime,
|
|
3519
|
+
stepTimestamps
|
|
3583
3520
|
);
|
|
3584
3521
|
return {
|
|
3585
3522
|
outputText: result.text,
|
|
@@ -3620,20 +3557,16 @@ function findToolResultError(step) {
|
|
|
3620
3557
|
}
|
|
3621
3558
|
return null;
|
|
3622
3559
|
}
|
|
3623
|
-
function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
|
|
3624
|
-
const totalStepTokens = steps.reduce(
|
|
3625
|
-
(sum, s) => sum + (s.usage.totalTokens ?? 0),
|
|
3626
|
-
0
|
|
3627
|
-
);
|
|
3560
|
+
function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs, stepTimestamps) {
|
|
3628
3561
|
const traceSteps = steps.map((step, i) => {
|
|
3629
|
-
const
|
|
3630
|
-
const
|
|
3631
|
-
const stepDurationMs =
|
|
3562
|
+
const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
|
|
3563
|
+
const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
|
|
3564
|
+
const stepDurationMs = stepFinishedAt - stepStartedAt;
|
|
3632
3565
|
const firstToolCall = step.toolCalls[0];
|
|
3633
3566
|
const tokenUsage = {
|
|
3634
3567
|
prompt: step.usage.inputTokens ?? 0,
|
|
3635
3568
|
completion: step.usage.outputTokens ?? 0,
|
|
3636
|
-
total:
|
|
3569
|
+
total: step.usage.totalTokens ?? 0
|
|
3637
3570
|
};
|
|
3638
3571
|
const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
|
|
3639
3572
|
const toolResultError = findToolResultError(step);
|
|
@@ -3644,9 +3577,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
3644
3577
|
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
3645
3578
|
model: modelId,
|
|
3646
3579
|
provider,
|
|
3647
|
-
startedAt: new Date(
|
|
3648
|
-
executionStartMs + Math.round(totalDurationMs * (i / Math.max(steps.length, 1)))
|
|
3649
|
-
).toISOString(),
|
|
3580
|
+
startedAt: new Date(stepStartedAt).toISOString(),
|
|
3650
3581
|
durationMs: stepDurationMs,
|
|
3651
3582
|
tokenUsage,
|
|
3652
3583
|
costUsd,
|
|
@@ -3704,33 +3635,6 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
3704
3635
|
traceContext.authToken
|
|
3705
3636
|
);
|
|
3706
3637
|
}
|
|
3707
|
-
function emitStepEvents(traceContext, steps, startTime) {
|
|
3708
|
-
for (let i = 0; i < steps.length; i++) {
|
|
3709
|
-
const step = steps[i];
|
|
3710
|
-
const isToolStep = step.toolCalls.length > 0;
|
|
3711
|
-
const firstToolCall = step.toolCalls[0];
|
|
3712
|
-
emitTraceEvent(
|
|
3713
|
-
{
|
|
3714
|
-
evalRunId: traceContext.evalRunId,
|
|
3715
|
-
scenarioId: traceContext.scenarioId,
|
|
3716
|
-
scenarioName: traceContext.scenarioName,
|
|
3717
|
-
targetId: traceContext.targetId,
|
|
3718
|
-
targetName: traceContext.targetName,
|
|
3719
|
-
stepNumber: i + 1,
|
|
3720
|
-
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
3721
|
-
toolName: firstToolCall?.toolName,
|
|
3722
|
-
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
3723
|
-
outputPreview: step.text?.slice(0, 500),
|
|
3724
|
-
elapsedMs: Date.now() - startTime,
|
|
3725
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3726
|
-
isComplete: false
|
|
3727
|
-
},
|
|
3728
|
-
traceContext.tracePushUrl,
|
|
3729
|
-
traceContext.routeHeader,
|
|
3730
|
-
traceContext.authToken
|
|
3731
|
-
);
|
|
3732
|
-
}
|
|
3733
|
-
}
|
|
3734
3638
|
function emitCompletionEvent(traceContext, stepNumber) {
|
|
3735
3639
|
emitTraceEvent(
|
|
3736
3640
|
{
|