@wix/evalforge-evaluator 0.132.0 → 0.133.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -4678,6 +4678,57 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
4678
4678
  };
4679
4679
  }
4680
4680
 
4681
+ // src/evaluation-loop.ts
4682
+ var import_crypto5 = require("crypto");
4683
+ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4684
+ const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4685
+ let completedExecutions = 0;
4686
+ const totalExecutions = scenarioItems.length * runsPerScenario;
4687
+ const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
4688
+ const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
4689
+ for (const { scenario, template, resolvedAssertions } of scenarioItems) {
4690
+ for (let iterationIndex = 0; iterationIndex < runsPerScenario; iterationIndex++) {
4691
+ const iterLabel = runsPerScenario > 1 ? ` [run ${iterationIndex + 1}/${runsPerScenario}]` : "";
4692
+ callbacks.onProgress(completedExecutions + 1, totalExecutions, iterLabel);
4693
+ let scenarioResult = null;
4694
+ try {
4695
+ scenarioResult = await callbacks.runScenario(
4696
+ scenario,
4697
+ template,
4698
+ resolvedAssertions
4699
+ );
4700
+ } catch (err) {
4701
+ const errorMsg = err instanceof Error ? err.message : String(err);
4702
+ console.error(
4703
+ `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
4704
+ );
4705
+ const errorResult = {
4706
+ id: (0, import_crypto5.randomUUID)(),
4707
+ targetId,
4708
+ targetName,
4709
+ scenarioId: scenario.id,
4710
+ scenarioName: scenario.name,
4711
+ assertionResults: [],
4712
+ passed: 0,
4713
+ failed: 0,
4714
+ passRate: 0,
4715
+ duration: 0,
4716
+ outputText: `Execution error: ${errorMsg}`,
4717
+ startedAt: (/* @__PURE__ */ new Date()).toISOString(),
4718
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
4719
+ iterationIndex
4720
+ };
4721
+ await callbacks.addResult(errorResult);
4722
+ }
4723
+ if (scenarioResult !== null) {
4724
+ await callbacks.addResult({ ...scenarioResult, iterationIndex });
4725
+ }
4726
+ completedExecutions++;
4727
+ }
4728
+ }
4729
+ return { completedExecutions, totalExecutions };
4730
+ }
4731
+
4681
4732
  // src/error-reporter.ts
4682
4733
  var import_evalforge_types14 = require("@wix/evalforge-types");
4683
4734
  function formatError(error, phase, context) {
@@ -4858,69 +4909,57 @@ async function runEvaluation(projectId2, evalRunId2) {
4858
4909
  `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
4859
4910
  );
4860
4911
  }
4861
- let completedScenarios = 0;
4862
- const totalScenarios = scenarioItems.length;
4863
- for (const { scenario, template, resolvedAssertions } of scenarioItems) {
4864
- state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4865
- state.currentContext = {
4866
- projectId: projectId2,
4867
- evalRunId: evalRunId2,
4868
- scenarioId: scenario.id,
4869
- scenarioName: scenario.name,
4870
- presetId: evalData.evalRun.presetId,
4871
- presetName: evalData.presetName,
4872
- agentId: agent?.id,
4873
- agentName: agent?.name,
4874
- progress: `${completedScenarios + 1}/${totalScenarios}`
4875
- };
4876
- const skillNames = evalData.skills.map((s) => s.name).join(", ");
4877
- console.log(
4878
- "[Evaluator] Running scenario with preset:",
4879
- evalData.presetName,
4880
- skillNames ? `(${skillNames})` : "",
4881
- agent ? `with agent: ${agent.name}` : "",
4882
- `(${completedScenarios + 1}/${totalScenarios})`
4883
- );
4884
- try {
4885
- const result = await runScenario(
4886
- config,
4887
- evalRunId2,
4888
- scenario,
4889
- evalData,
4890
- template,
4891
- resolvedAssertions
4892
- );
4893
- console.log("[Evaluator] Scenario completed, adding result");
4894
- state.currentPhase = ExecutionPhase.ADD_RESULT;
4895
- state.currentContext = {
4896
- ...state.currentContext,
4897
- resultId: result.id
4898
- };
4899
- await api.addResult(projectId2, evalRunId2, result);
4900
- completedScenarios++;
4901
- } catch (err) {
4902
- const errorMsg = err instanceof Error ? err.message : String(err);
4903
- const errorStack = err instanceof Error ? err.stack : void 0;
4904
- console.error(
4905
- "[Evaluator] Failed to run scenario with preset:",
4906
- evalData.presetName,
4907
- "Error:",
4908
- errorMsg
4909
- );
4910
- if (errorStack) {
4911
- console.error("[Evaluator] Stack trace:", errorStack);
4912
+ const skillNames = evalData.skills.map((s) => s.name).join(", ");
4913
+ const { completedExecutions, totalExecutions } = await runEvaluationLoop(
4914
+ scenarioItems,
4915
+ evalData,
4916
+ {
4917
+ runScenario: (scenario, template, resolvedAssertions) => {
4918
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4919
+ state.currentContext = {
4920
+ projectId: projectId2,
4921
+ evalRunId: evalRunId2,
4922
+ scenarioId: scenario.id,
4923
+ scenarioName: scenario.name,
4924
+ presetId: evalData.evalRun.presetId,
4925
+ presetName: evalData.presetName,
4926
+ agentId: agent?.id,
4927
+ agentName: agent?.name
4928
+ };
4929
+ return runScenario(
4930
+ config,
4931
+ evalRunId2,
4932
+ scenario,
4933
+ evalData,
4934
+ template,
4935
+ resolvedAssertions
4936
+ );
4937
+ },
4938
+ addResult: async (result) => {
4939
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
4940
+ state.currentContext = {
4941
+ ...state.currentContext,
4942
+ resultId: result.id
4943
+ };
4944
+ await api.addResult(projectId2, evalRunId2, result);
4945
+ },
4946
+ onProgress: (completed, total, iterLabel) => {
4947
+ console.log(
4948
+ "[Evaluator] Running scenario with preset:",
4949
+ evalData.presetName,
4950
+ skillNames ? `(${skillNames})` : "",
4951
+ agent ? `with agent: ${agent.name}` : "",
4952
+ `(${completed}/${total})${iterLabel}`
4953
+ );
4912
4954
  }
4913
- throw new Error(
4914
- `[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
4915
- );
4916
4955
  }
4917
- }
4956
+ );
4918
4957
  state.currentPhase = ExecutionPhase.UPDATE_STATUS;
4919
4958
  state.currentContext = {
4920
4959
  projectId: projectId2,
4921
4960
  evalRunId: evalRunId2,
4922
- completedScenarios,
4923
- totalScenarios
4961
+ completedExecutions,
4962
+ totalExecutions
4924
4963
  };
4925
4964
  try {
4926
4965
  await api.updateEvalRun(projectId2, evalRunId2, {