@wix/evalforge-evaluator 0.119.0 → 0.121.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -55,6 +55,7 @@ function loadConfig() {
55
55
  aiGatewayHeaders[key] = value;
56
56
  }
57
57
  }
58
+ aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
58
59
  const tracePushUrl = process.env.TRACE_PUSH_URL;
59
60
  const routeHeader = process.env.EVAL_ROUTE_HEADER;
60
61
  const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -185,9 +186,6 @@ function createApiClient(serverUrl, options = "") {
185
186
  getPreset(projectId2, id) {
186
187
  return fetchJson(`/projects/${projectId2}/presets/${id}`);
187
188
  },
188
- getAssertion(projectId2, id) {
189
- return fetchJson(`/projects/${projectId2}/assertions/${id}`);
190
- },
191
189
  addResult(projectId2, evalRunId2, result) {
192
190
  return postJson(
193
191
  `/projects/${projectId2}/eval-runs/${evalRunId2}/results`,
@@ -249,9 +247,6 @@ function resolveValue(value, placeholders) {
249
247
  }
250
248
  return value;
251
249
  }
252
- function resolvePlaceholdersInString(text, placeholders) {
253
- return resolveValue(text, placeholders);
254
- }
255
250
 
256
251
  // src/fetch-evaluation-data.ts
257
252
  function parseSkillNamesFromParams(value) {
@@ -264,59 +259,6 @@ function parseSkillNamesFromParams(value) {
264
259
  }
265
260
  return [];
266
261
  }
267
- function applyParamsToAssertion(assertion, params) {
268
- if (!params || Object.keys(params).length === 0) {
269
- return assertion;
270
- }
271
- if (assertion.type === "llm_judge") {
272
- const stringParams = {};
273
- for (const [key, value] of Object.entries(params)) {
274
- stringParams[key] = String(value ?? "");
275
- }
276
- const prompt = resolvePlaceholdersInString(assertion.prompt, stringParams);
277
- return {
278
- ...assertion,
279
- prompt,
280
- ...params.model !== void 0 && { model: params.model },
281
- ...params.maxTokens !== void 0 && {
282
- maxTokens: params.maxTokens
283
- },
284
- ...params.temperature !== void 0 && {
285
- temperature: params.temperature
286
- },
287
- ...params.minScore !== void 0 && {
288
- minScore: params.minScore
289
- }
290
- };
291
- }
292
- if (assertion.type === "time_limit" && params.maxDurationMs !== void 0) {
293
- return {
294
- ...assertion,
295
- maxDurationMs: params.maxDurationMs
296
- };
297
- }
298
- if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
299
- return {
300
- ...assertion,
301
- skillNames: parseSkillNamesFromParams(params.skillNames)
302
- };
303
- }
304
- if (assertion.type === "tool_called_with_param") {
305
- return {
306
- ...assertion,
307
- ...params.toolName !== void 0 && {
308
- toolName: params.toolName
309
- },
310
- ...params.expectedParams !== void 0 && {
311
- expectedParams: params.expectedParams
312
- },
313
- ...params.requireSuccess !== void 0 && {
314
- requireSuccess: params.requireSuccess
315
- }
316
- };
317
- }
318
- return { ...assertion, ...params };
319
- }
320
262
  function resolveSystemAssertion(assertionId, params) {
321
263
  const systemAssertion = import_evalforge_types.SYSTEM_ASSERTIONS[assertionId];
322
264
  let baseAssertion;
@@ -371,18 +313,6 @@ function resolveSystemAssertion(assertionId, params) {
371
313
  }
372
314
  return baseAssertion;
373
315
  }
374
- function customAssertionToAssertion(ca, params) {
375
- const config = ca.config;
376
- const baseAssertion = {
377
- type: "llm_judge",
378
- prompt: config?.prompt ?? "",
379
- minScore: config?.minScore,
380
- model: config?.model,
381
- maxTokens: config?.maxTokens,
382
- temperature: config?.temperature
383
- };
384
- return applyParamsToAssertion(baseAssertion, params);
385
- }
386
316
  async function fetchEvaluationData(api, projectId2, evalRunId2) {
387
317
  const evalRun = await api.getEvalRun(projectId2, evalRunId2);
388
318
  const scenarios = await Promise.all(
@@ -452,30 +382,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
452
382
  templateIds.map((id) => api.getTemplate(projectId2, id))
453
383
  ) : [];
454
384
  const templateMap = new Map(templates.map((t) => [t.id, t]));
455
- const assertionIds = [
456
- ...new Set(
457
- scenarios.flatMap((s) => s.assertionLinks ?? []).map((link) => link.assertionId).filter((id) => !(0, import_evalforge_types.isSystemAssertionId)(id))
458
- )
459
- ];
460
- const assertions = assertionIds.length > 0 ? await Promise.all(
461
- assertionIds.map((id) => api.getAssertion(projectId2, id))
462
- ) : [];
463
- const assertionMap = new Map(assertions.map((a) => [a.id, a]));
464
385
  const scenarioItems = scenarios.map((scenario) => {
465
386
  const resolvedAssertions = (scenario.assertionLinks ?? []).map((link) => {
466
387
  const { assertionId, params } = link;
467
- if ((0, import_evalforge_types.isSystemAssertionId)(assertionId)) {
468
- return resolveSystemAssertion(
469
- assertionId,
470
- params
471
- );
472
- }
473
- const customAssertion = assertionMap.get(assertionId);
474
- if (!customAssertion) {
388
+ if (!(0, import_evalforge_types.isSystemAssertionId)(assertionId)) {
475
389
  return null;
476
390
  }
477
- return customAssertionToAssertion(
478
- customAssertion,
391
+ return resolveSystemAssertion(
392
+ assertionId,
479
393
  params
480
394
  );
481
395
  }).filter((a) => a !== null);
@@ -3404,7 +3318,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
3404
3318
  }
3405
3319
 
3406
3320
  // src/run-scenario/agents/simple-agent/build-conversation.ts
3407
- function buildConversation3(triggerPrompt, steps, executionStartMs) {
3321
+ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestamps) {
3408
3322
  const messages = [];
3409
3323
  messages.push({
3410
3324
  role: "user",
@@ -3413,11 +3327,9 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3413
3327
  });
3414
3328
  for (let i = 0; i < steps.length; i++) {
3415
3329
  const step = steps[i];
3416
- const stepTimestamp = estimateStepTimestamp(
3417
- executionStartMs,
3418
- i,
3419
- steps.length
3420
- );
3330
+ const stepTimestamp = new Date(
3331
+ stepTimestamps[i] ?? executionStartMs
3332
+ ).toISOString();
3421
3333
  const assistantContent = [];
3422
3334
  if (step.reasoningText) {
3423
3335
  assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3460,10 +3372,6 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3460
3372
  }
3461
3373
  return messages;
3462
3374
  }
3463
- function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
3464
- const offset = totalSteps > 1 ? (stepIndex + 1) / totalSteps : 1;
3465
- return new Date(startMs + Math.round(offset * 1e3)).toISOString();
3466
- }
3467
3375
 
3468
3376
  // src/run-scenario/agents/simple-agent/execute.ts
3469
3377
  var PROVIDER_ANTHROPIC2 = "anthropic";
@@ -3548,6 +3456,7 @@ async function executeWithAiSdk(context) {
3548
3456
  }
3549
3457
  }
3550
3458
  };
3459
+ const stepTimestamps = [];
3551
3460
  const result = await (0, import_ai.generateText)({
3552
3461
  model,
3553
3462
  system: systemPrompt,
@@ -3556,7 +3465,34 @@ async function executeWithAiSdk(context) {
3556
3465
  maxOutputTokens: modelConfig.maxTokens,
3557
3466
  tools: mcpTools,
3558
3467
  stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
3559
- providerOptions: providerOpts
3468
+ providerOptions: providerOpts,
3469
+ onStepFinish: (step) => {
3470
+ stepTimestamps.push(Date.now());
3471
+ if (traceContext) {
3472
+ const isToolStep = step.toolCalls.length > 0;
3473
+ const firstToolCall = step.toolCalls[0];
3474
+ emitTraceEvent(
3475
+ {
3476
+ evalRunId: traceContext.evalRunId,
3477
+ scenarioId: traceContext.scenarioId,
3478
+ scenarioName: traceContext.scenarioName,
3479
+ targetId: traceContext.targetId,
3480
+ targetName: traceContext.targetName,
3481
+ stepNumber: stepTimestamps.length,
3482
+ type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
3483
+ toolName: firstToolCall?.toolName,
3484
+ toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
3485
+ outputPreview: step.text?.slice(0, 500),
3486
+ elapsedMs: Date.now() - startTime,
3487
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3488
+ isComplete: false
3489
+ },
3490
+ traceContext.tracePushUrl,
3491
+ traceContext.routeHeader,
3492
+ traceContext.authToken
3493
+ );
3494
+ }
3495
+ }
3560
3496
  });
3561
3497
  const durationMs = Date.now() - startTime;
3562
3498
  const usage = {
@@ -3570,16 +3506,17 @@ async function executeWithAiSdk(context) {
3570
3506
  usage,
3571
3507
  modelConfig.model,
3572
3508
  provider,
3573
- startTime
3509
+ startTime,
3510
+ stepTimestamps
3574
3511
  );
3575
3512
  if (traceContext) {
3576
- emitStepEvents(traceContext, result.steps, startTime);
3577
- emitCompletionEvent(traceContext, result.steps.length + 1);
3513
+ emitCompletionEvent(traceContext, stepTimestamps.length + 1);
3578
3514
  }
3579
3515
  const conversation = buildConversation3(
3580
3516
  scenario.triggerPrompt,
3581
3517
  result.steps,
3582
- startTime
3518
+ startTime,
3519
+ stepTimestamps
3583
3520
  );
3584
3521
  return {
3585
3522
  outputText: result.text,
@@ -3620,20 +3557,16 @@ function findToolResultError(step) {
3620
3557
  }
3621
3558
  return null;
3622
3559
  }
3623
- function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
3624
- const totalStepTokens = steps.reduce(
3625
- (sum, s) => sum + (s.usage.totalTokens ?? 0),
3626
- 0
3627
- );
3560
+ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs, stepTimestamps) {
3628
3561
  const traceSteps = steps.map((step, i) => {
3629
- const stepTokens = step.usage.totalTokens ?? 0;
3630
- const proportion = totalStepTokens > 0 ? stepTokens / totalStepTokens : 0;
3631
- const stepDurationMs = Math.round(totalDurationMs * proportion);
3562
+ const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
3563
+ const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
3564
+ const stepDurationMs = stepFinishedAt - stepStartedAt;
3632
3565
  const firstToolCall = step.toolCalls[0];
3633
3566
  const tokenUsage = {
3634
3567
  prompt: step.usage.inputTokens ?? 0,
3635
3568
  completion: step.usage.outputTokens ?? 0,
3636
- total: stepTokens
3569
+ total: step.usage.totalTokens ?? 0
3637
3570
  };
3638
3571
  const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
3639
3572
  const toolResultError = findToolResultError(step);
@@ -3644,9 +3577,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
3644
3577
  type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
3645
3578
  model: modelId,
3646
3579
  provider,
3647
- startedAt: new Date(
3648
- executionStartMs + Math.round(totalDurationMs * (i / Math.max(steps.length, 1)))
3649
- ).toISOString(),
3580
+ startedAt: new Date(stepStartedAt).toISOString(),
3650
3581
  durationMs: stepDurationMs,
3651
3582
  tokenUsage,
3652
3583
  costUsd,
@@ -3704,33 +3635,6 @@ function emitStartEvent(traceContext, startTime) {
3704
3635
  traceContext.authToken
3705
3636
  );
3706
3637
  }
3707
- function emitStepEvents(traceContext, steps, startTime) {
3708
- for (let i = 0; i < steps.length; i++) {
3709
- const step = steps[i];
3710
- const isToolStep = step.toolCalls.length > 0;
3711
- const firstToolCall = step.toolCalls[0];
3712
- emitTraceEvent(
3713
- {
3714
- evalRunId: traceContext.evalRunId,
3715
- scenarioId: traceContext.scenarioId,
3716
- scenarioName: traceContext.scenarioName,
3717
- targetId: traceContext.targetId,
3718
- targetName: traceContext.targetName,
3719
- stepNumber: i + 1,
3720
- type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
3721
- toolName: firstToolCall?.toolName,
3722
- toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
3723
- outputPreview: step.text?.slice(0, 500),
3724
- elapsedMs: Date.now() - startTime,
3725
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3726
- isComplete: false
3727
- },
3728
- traceContext.tracePushUrl,
3729
- traceContext.routeHeader,
3730
- traceContext.authToken
3731
- );
3732
- }
3733
- }
3734
3638
  function emitCompletionEvent(traceContext, stepNumber) {
3735
3639
  emitTraceEvent(
3736
3640
  {