@wix/evalforge-evaluator 0.132.0 → 0.134.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -3466,9 +3466,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
3466
3466
  });
3467
3467
  for (let i = 0; i < steps.length; i++) {
3468
3468
  const step = steps[i];
3469
- const stepTimestamp = new Date(
3470
- stepTimestamps[i] ?? executionStartMs
3471
- ).toISOString();
3469
+ const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
3470
+ const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
3471
+ const assistantTimestamp = new Date(stepStartedAt).toISOString();
3472
+ const toolResultTimestamp = new Date(stepFinishedAt).toISOString();
3472
3473
  const assistantContent = [];
3473
3474
  if (step.reasoningText) {
3474
3475
  assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3488,7 +3489,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
3488
3489
  messages.push({
3489
3490
  role: "assistant",
3490
3491
  content: assistantContent,
3491
- timestamp: stepTimestamp
3492
+ timestamp: assistantTimestamp
3492
3493
  });
3493
3494
  }
3494
3495
  if (step.toolResults.length > 0) {
@@ -3505,7 +3506,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
3505
3506
  messages.push({
3506
3507
  role: "user",
3507
3508
  content: resultBlocks,
3508
- timestamp: stepTimestamp
3509
+ timestamp: toolResultTimestamp
3509
3510
  });
3510
3511
  }
3511
3512
  }
@@ -4678,6 +4679,57 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
4678
4679
  };
4679
4680
  }
4680
4681
 
4682
+ // src/evaluation-loop.ts
4683
+ var import_crypto5 = require("crypto");
4684
+ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4685
+ const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4686
+ let completedExecutions = 0;
4687
+ const totalExecutions = scenarioItems.length * runsPerScenario;
4688
+ const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
4689
+ const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
4690
+ for (const { scenario, template, resolvedAssertions } of scenarioItems) {
4691
+ for (let iterationIndex = 0; iterationIndex < runsPerScenario; iterationIndex++) {
4692
+ const iterLabel = runsPerScenario > 1 ? ` [run ${iterationIndex + 1}/${runsPerScenario}]` : "";
4693
+ callbacks.onProgress(completedExecutions + 1, totalExecutions, iterLabel);
4694
+ let scenarioResult = null;
4695
+ try {
4696
+ scenarioResult = await callbacks.runScenario(
4697
+ scenario,
4698
+ template,
4699
+ resolvedAssertions
4700
+ );
4701
+ } catch (err) {
4702
+ const errorMsg = err instanceof Error ? err.message : String(err);
4703
+ console.error(
4704
+ `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
4705
+ );
4706
+ const errorResult = {
4707
+ id: (0, import_crypto5.randomUUID)(),
4708
+ targetId,
4709
+ targetName,
4710
+ scenarioId: scenario.id,
4711
+ scenarioName: scenario.name,
4712
+ assertionResults: [],
4713
+ passed: 0,
4714
+ failed: 0,
4715
+ passRate: 0,
4716
+ duration: 0,
4717
+ outputText: `Execution error: ${errorMsg}`,
4718
+ startedAt: (/* @__PURE__ */ new Date()).toISOString(),
4719
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
4720
+ iterationIndex
4721
+ };
4722
+ await callbacks.addResult(errorResult);
4723
+ }
4724
+ if (scenarioResult !== null) {
4725
+ await callbacks.addResult({ ...scenarioResult, iterationIndex });
4726
+ }
4727
+ completedExecutions++;
4728
+ }
4729
+ }
4730
+ return { completedExecutions, totalExecutions };
4731
+ }
4732
+
4681
4733
  // src/error-reporter.ts
4682
4734
  var import_evalforge_types14 = require("@wix/evalforge-types");
4683
4735
  function formatError(error, phase, context) {
@@ -4858,69 +4910,57 @@ async function runEvaluation(projectId2, evalRunId2) {
4858
4910
  `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
4859
4911
  );
4860
4912
  }
4861
- let completedScenarios = 0;
4862
- const totalScenarios = scenarioItems.length;
4863
- for (const { scenario, template, resolvedAssertions } of scenarioItems) {
4864
- state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4865
- state.currentContext = {
4866
- projectId: projectId2,
4867
- evalRunId: evalRunId2,
4868
- scenarioId: scenario.id,
4869
- scenarioName: scenario.name,
4870
- presetId: evalData.evalRun.presetId,
4871
- presetName: evalData.presetName,
4872
- agentId: agent?.id,
4873
- agentName: agent?.name,
4874
- progress: `${completedScenarios + 1}/${totalScenarios}`
4875
- };
4876
- const skillNames = evalData.skills.map((s) => s.name).join(", ");
4877
- console.log(
4878
- "[Evaluator] Running scenario with preset:",
4879
- evalData.presetName,
4880
- skillNames ? `(${skillNames})` : "",
4881
- agent ? `with agent: ${agent.name}` : "",
4882
- `(${completedScenarios + 1}/${totalScenarios})`
4883
- );
4884
- try {
4885
- const result = await runScenario(
4886
- config,
4887
- evalRunId2,
4888
- scenario,
4889
- evalData,
4890
- template,
4891
- resolvedAssertions
4892
- );
4893
- console.log("[Evaluator] Scenario completed, adding result");
4894
- state.currentPhase = ExecutionPhase.ADD_RESULT;
4895
- state.currentContext = {
4896
- ...state.currentContext,
4897
- resultId: result.id
4898
- };
4899
- await api.addResult(projectId2, evalRunId2, result);
4900
- completedScenarios++;
4901
- } catch (err) {
4902
- const errorMsg = err instanceof Error ? err.message : String(err);
4903
- const errorStack = err instanceof Error ? err.stack : void 0;
4904
- console.error(
4905
- "[Evaluator] Failed to run scenario with preset:",
4906
- evalData.presetName,
4907
- "Error:",
4908
- errorMsg
4909
- );
4910
- if (errorStack) {
4911
- console.error("[Evaluator] Stack trace:", errorStack);
4913
+ const skillNames = evalData.skills.map((s) => s.name).join(", ");
4914
+ const { completedExecutions, totalExecutions } = await runEvaluationLoop(
4915
+ scenarioItems,
4916
+ evalData,
4917
+ {
4918
+ runScenario: (scenario, template, resolvedAssertions) => {
4919
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4920
+ state.currentContext = {
4921
+ projectId: projectId2,
4922
+ evalRunId: evalRunId2,
4923
+ scenarioId: scenario.id,
4924
+ scenarioName: scenario.name,
4925
+ presetId: evalData.evalRun.presetId,
4926
+ presetName: evalData.presetName,
4927
+ agentId: agent?.id,
4928
+ agentName: agent?.name
4929
+ };
4930
+ return runScenario(
4931
+ config,
4932
+ evalRunId2,
4933
+ scenario,
4934
+ evalData,
4935
+ template,
4936
+ resolvedAssertions
4937
+ );
4938
+ },
4939
+ addResult: async (result) => {
4940
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
4941
+ state.currentContext = {
4942
+ ...state.currentContext,
4943
+ resultId: result.id
4944
+ };
4945
+ await api.addResult(projectId2, evalRunId2, result);
4946
+ },
4947
+ onProgress: (completed, total, iterLabel) => {
4948
+ console.log(
4949
+ "[Evaluator] Running scenario with preset:",
4950
+ evalData.presetName,
4951
+ skillNames ? `(${skillNames})` : "",
4952
+ agent ? `with agent: ${agent.name}` : "",
4953
+ `(${completed}/${total})${iterLabel}`
4954
+ );
4912
4955
  }
4913
- throw new Error(
4914
- `[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
4915
- );
4916
4956
  }
4917
- }
4957
+ );
4918
4958
  state.currentPhase = ExecutionPhase.UPDATE_STATUS;
4919
4959
  state.currentContext = {
4920
4960
  projectId: projectId2,
4921
4961
  evalRunId: evalRunId2,
4922
- completedScenarios,
4923
- totalScenarios
4962
+ completedExecutions,
4963
+ totalExecutions
4924
4964
  };
4925
4965
  try {
4926
4966
  await api.updateEvalRun(projectId2, evalRunId2, {