@wix/evalforge-evaluator 0.132.0 → 0.134.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -3474,9 +3474,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
3474
3474
  });
3475
3475
  for (let i = 0; i < steps.length; i++) {
3476
3476
  const step = steps[i];
3477
- const stepTimestamp = new Date(
3478
- stepTimestamps[i] ?? executionStartMs
3479
- ).toISOString();
3477
+ const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
3478
+ const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
3479
+ const assistantTimestamp = new Date(stepStartedAt).toISOString();
3480
+ const toolResultTimestamp = new Date(stepFinishedAt).toISOString();
3480
3481
  const assistantContent = [];
3481
3482
  if (step.reasoningText) {
3482
3483
  assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3496,7 +3497,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
3496
3497
  messages.push({
3497
3498
  role: "assistant",
3498
3499
  content: assistantContent,
3499
- timestamp: stepTimestamp
3500
+ timestamp: assistantTimestamp
3500
3501
  });
3501
3502
  }
3502
3503
  if (step.toolResults.length > 0) {
@@ -3513,7 +3514,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
3513
3514
  messages.push({
3514
3515
  role: "user",
3515
3516
  content: resultBlocks,
3516
- timestamp: stepTimestamp
3517
+ timestamp: toolResultTimestamp
3517
3518
  });
3518
3519
  }
3519
3520
  }
@@ -4686,6 +4687,57 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
4686
4687
  };
4687
4688
  }
4688
4689
 
4690
+ // src/evaluation-loop.ts
4691
+ import { randomUUID as randomUUID5 } from "crypto";
4692
/**
 * Runs every scenario item `runsPerScenario` times, strictly one after the
 * other, and reports each execution through the supplied callbacks.
 *
 * A failure thrown by `callbacks.runScenario` does not abort the loop: it is
 * logged and recorded as a synthetic error result (no assertions, pass rate 0)
 * so the remaining scenarios and iterations still run. Failures thrown by
 * `callbacks.addResult` or `callbacks.onProgress` are NOT caught here and
 * propagate to the caller.
 *
 * @param scenarioItems Iterable-with-length of `{ scenario, template, resolvedAssertions }`.
 * @param evalData Evaluation data; reads `evalRun.runsPerScenario`, `evalRun.presetId`,
 *   `evalRun.id`, `presetName` and the optional `agent` (`id`, `name`).
 * @param callbacks `{ runScenario, addResult, onProgress }` hooks supplied by the caller.
 * @returns `{ completedExecutions, totalExecutions }` — counts of executions
 *   processed (successful or recorded as errors) and planned.
 */
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
  const totalExecutions = scenarioItems.length * runsPerScenario;
  // Identity attached to synthetic error results: preset first, then agent,
  // then the eval run itself as a last resort.
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
  let completedExecutions = 0;

  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
    for (let iterationIndex = 0; iterationIndex < runsPerScenario; iterationIndex++) {
      // Only label iterations when there is more than one run per scenario.
      const iterLabel = runsPerScenario > 1 ? ` [run ${iterationIndex + 1}/${runsPerScenario}]` : "";
      callbacks.onProgress(completedExecutions + 1, totalExecutions, iterLabel);

      // Captured before the run so an error result reflects when the
      // execution actually started rather than when it failed.
      const startedAtMs = Date.now();
      let scenarioResult = null;
      try {
        scenarioResult = await callbacks.runScenario(
          scenario,
          template,
          resolvedAssertions
        );
      } catch (err) {
        // Single clock read so completedAt and duration agree with each other.
        const failedAtMs = Date.now();
        const errorMsg = err instanceof Error ? err.message : String(err);
        console.error(
          `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
        );
        const errorResult = {
          id: randomUUID5(),
          targetId,
          targetName,
          scenarioId: scenario.id,
          scenarioName: scenario.name,
          assertionResults: [],
          passed: 0,
          failed: 0,
          passRate: 0,
          // NOTE(review): assumes `duration` is expressed in milliseconds like
          // the timestamps it is derived from — confirm against the result schema.
          // Clamped because the wall clock may be adjusted backwards mid-run.
          duration: Math.max(0, failedAtMs - startedAtMs),
          outputText: `Execution error: ${errorMsg}`,
          startedAt: new Date(startedAtMs).toISOString(),
          completedAt: new Date(failedAtMs).toISOString(),
          iterationIndex
        };
        await callbacks.addResult(errorResult);
      }

      // Success path: tag the scenario's own result with the iteration index.
      // Kept outside the try so an addResult failure is never mistaken for a
      // scenario failure and recorded twice.
      if (scenarioResult !== null) {
        await callbacks.addResult({ ...scenarioResult, iterationIndex });
      }
      completedExecutions++;
    }
  }

  return { completedExecutions, totalExecutions };
}
4740
+
4689
4741
  // src/error-reporter.ts
4690
4742
  import { EvalStatus } from "@wix/evalforge-types";
4691
4743
  function formatError(error, phase, context) {
@@ -4866,69 +4918,57 @@ async function runEvaluation(projectId2, evalRunId2) {
4866
4918
  `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
4867
4919
  );
4868
4920
  }
4869
- let completedScenarios = 0;
4870
- const totalScenarios = scenarioItems.length;
4871
- for (const { scenario, template, resolvedAssertions } of scenarioItems) {
4872
- state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4873
- state.currentContext = {
4874
- projectId: projectId2,
4875
- evalRunId: evalRunId2,
4876
- scenarioId: scenario.id,
4877
- scenarioName: scenario.name,
4878
- presetId: evalData.evalRun.presetId,
4879
- presetName: evalData.presetName,
4880
- agentId: agent?.id,
4881
- agentName: agent?.name,
4882
- progress: `${completedScenarios + 1}/${totalScenarios}`
4883
- };
4884
- const skillNames = evalData.skills.map((s) => s.name).join(", ");
4885
- console.log(
4886
- "[Evaluator] Running scenario with preset:",
4887
- evalData.presetName,
4888
- skillNames ? `(${skillNames})` : "",
4889
- agent ? `with agent: ${agent.name}` : "",
4890
- `(${completedScenarios + 1}/${totalScenarios})`
4891
- );
4892
- try {
4893
- const result = await runScenario(
4894
- config,
4895
- evalRunId2,
4896
- scenario,
4897
- evalData,
4898
- template,
4899
- resolvedAssertions
4900
- );
4901
- console.log("[Evaluator] Scenario completed, adding result");
4902
- state.currentPhase = ExecutionPhase.ADD_RESULT;
4903
- state.currentContext = {
4904
- ...state.currentContext,
4905
- resultId: result.id
4906
- };
4907
- await api.addResult(projectId2, evalRunId2, result);
4908
- completedScenarios++;
4909
- } catch (err) {
4910
- const errorMsg = err instanceof Error ? err.message : String(err);
4911
- const errorStack = err instanceof Error ? err.stack : void 0;
4912
- console.error(
4913
- "[Evaluator] Failed to run scenario with preset:",
4914
- evalData.presetName,
4915
- "Error:",
4916
- errorMsg
4917
- );
4918
- if (errorStack) {
4919
- console.error("[Evaluator] Stack trace:", errorStack);
4921
+ const skillNames = evalData.skills.map((s) => s.name).join(", ");
4922
+ const { completedExecutions, totalExecutions } = await runEvaluationLoop(
4923
+ scenarioItems,
4924
+ evalData,
4925
+ {
4926
+ runScenario: (scenario, template, resolvedAssertions) => {
4927
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4928
+ state.currentContext = {
4929
+ projectId: projectId2,
4930
+ evalRunId: evalRunId2,
4931
+ scenarioId: scenario.id,
4932
+ scenarioName: scenario.name,
4933
+ presetId: evalData.evalRun.presetId,
4934
+ presetName: evalData.presetName,
4935
+ agentId: agent?.id,
4936
+ agentName: agent?.name
4937
+ };
4938
+ return runScenario(
4939
+ config,
4940
+ evalRunId2,
4941
+ scenario,
4942
+ evalData,
4943
+ template,
4944
+ resolvedAssertions
4945
+ );
4946
+ },
4947
+ addResult: async (result) => {
4948
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
4949
+ state.currentContext = {
4950
+ ...state.currentContext,
4951
+ resultId: result.id
4952
+ };
4953
+ await api.addResult(projectId2, evalRunId2, result);
4954
+ },
4955
+ onProgress: (completed, total, iterLabel) => {
4956
+ console.log(
4957
+ "[Evaluator] Running scenario with preset:",
4958
+ evalData.presetName,
4959
+ skillNames ? `(${skillNames})` : "",
4960
+ agent ? `with agent: ${agent.name}` : "",
4961
+ `(${completed}/${total})${iterLabel}`
4962
+ );
4920
4963
  }
4921
- throw new Error(
4922
- `[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
4923
- );
4924
4964
  }
4925
- }
4965
+ );
4926
4966
  state.currentPhase = ExecutionPhase.UPDATE_STATUS;
4927
4967
  state.currentContext = {
4928
4968
  projectId: projectId2,
4929
4969
  evalRunId: evalRunId2,
4930
- completedScenarios,
4931
- totalScenarios
4970
+ completedExecutions,
4971
+ totalExecutions
4932
4972
  };
4933
4973
  try {
4934
4974
  await api.updateEvalRun(projectId2, evalRunId2, {