@wix/evalforge-evaluator 0.140.0 → 0.142.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -287,13 +287,16 @@ function resolveSystemAssertion(assertionId, params) {
287
287
  }
288
288
  };
289
289
  break;
290
- case "build_passed":
290
+ case "build_passed": {
291
+ const rawCmd = params?.command;
292
+ const command = typeof rawCmd === "string" && (0, import_evalforge_types.isAllowedBuildCommandString)(rawCmd) ? rawCmd.trim() : void 0;
291
293
  baseAssertion = {
292
294
  type: "build_passed",
293
- command: params?.command ?? void 0,
295
+ ...command !== void 0 && { command },
294
296
  expectedExitCode: params?.expectedExitCode ?? void 0
295
297
  };
296
298
  break;
299
+ }
297
300
  case "time_limit":
298
301
  baseAssertion = {
299
302
  type: "time_limit",
@@ -4754,6 +4757,7 @@ var import_crypto5 = require("crypto");
4754
4757
  async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4755
4758
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4756
4759
  let completedExecutions = 0;
4760
+ let erroredExecutions = 0;
4757
4761
  const totalExecutions = scenarioItems.length * runsPerScenario;
4758
4762
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
4759
4763
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -4790,6 +4794,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4790
4794
  iterationIndex
4791
4795
  };
4792
4796
  await callbacks.addResult(errorResult);
4797
+ erroredExecutions++;
4793
4798
  }
4794
4799
  if (scenarioResult !== null) {
4795
4800
  await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -4797,7 +4802,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4797
4802
  completedExecutions++;
4798
4803
  }
4799
4804
  }
4800
- return { completedExecutions, totalExecutions };
4805
+ return { completedExecutions, totalExecutions, erroredExecutions };
4801
4806
  }
4802
4807
 
4803
4808
  // src/error-reporter.ts
@@ -4981,50 +4986,46 @@ async function runEvaluation(projectId2, evalRunId2) {
4981
4986
  );
4982
4987
  }
4983
4988
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
4984
- const { completedExecutions, totalExecutions } = await runEvaluationLoop(
4985
- scenarioItems,
4986
- evalData,
4987
- {
4988
- runScenario: (scenario, template, resolvedAssertions) => {
4989
- state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4990
- state.currentContext = {
4991
- projectId: projectId2,
4992
- evalRunId: evalRunId2,
4993
- scenarioId: scenario.id,
4994
- scenarioName: scenario.name,
4995
- presetId: evalData.evalRun.presetId,
4996
- presetName: evalData.presetName,
4997
- agentId: agent?.id,
4998
- agentName: agent?.name
4999
- };
5000
- return runScenario(
5001
- config,
5002
- evalRunId2,
5003
- scenario,
5004
- evalData,
5005
- template,
5006
- resolvedAssertions
5007
- );
5008
- },
5009
- addResult: async (result) => {
5010
- state.currentPhase = ExecutionPhase.ADD_RESULT;
5011
- state.currentContext = {
5012
- ...state.currentContext,
5013
- resultId: result.id
5014
- };
5015
- await api.addResult(projectId2, evalRunId2, result);
5016
- },
5017
- onProgress: (completed, total, iterLabel) => {
5018
- console.log(
5019
- "[Evaluator] Running scenario with preset:",
5020
- evalData.presetName,
5021
- skillNames ? `(${skillNames})` : "",
5022
- agent ? `with agent: ${agent.name}` : "",
5023
- `(${completed}/${total})${iterLabel}`
5024
- );
5025
- }
4989
+ const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
4990
+ runScenario: (scenario, template, resolvedAssertions) => {
4991
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4992
+ state.currentContext = {
4993
+ projectId: projectId2,
4994
+ evalRunId: evalRunId2,
4995
+ scenarioId: scenario.id,
4996
+ scenarioName: scenario.name,
4997
+ presetId: evalData.evalRun.presetId,
4998
+ presetName: evalData.presetName,
4999
+ agentId: agent?.id,
5000
+ agentName: agent?.name
5001
+ };
5002
+ return runScenario(
5003
+ config,
5004
+ evalRunId2,
5005
+ scenario,
5006
+ evalData,
5007
+ template,
5008
+ resolvedAssertions
5009
+ );
5010
+ },
5011
+ addResult: async (result) => {
5012
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
5013
+ state.currentContext = {
5014
+ ...state.currentContext,
5015
+ resultId: result.id
5016
+ };
5017
+ await api.addResult(projectId2, evalRunId2, result);
5018
+ },
5019
+ onProgress: (completed, total, iterLabel) => {
5020
+ console.log(
5021
+ "[Evaluator] Running scenario with preset:",
5022
+ evalData.presetName,
5023
+ skillNames ? `(${skillNames})` : "",
5024
+ agent ? `with agent: ${agent.name}` : "",
5025
+ `(${completed}/${total})${iterLabel}`
5026
+ );
5026
5027
  }
5027
- );
5028
+ });
5028
5029
  state.currentPhase = ExecutionPhase.UPDATE_STATUS;
5029
5030
  state.currentContext = {
5030
5031
  projectId: projectId2,
@@ -5032,14 +5033,16 @@ async function runEvaluation(projectId2, evalRunId2) {
5032
5033
  completedExecutions,
5033
5034
  totalExecutions
5034
5035
  };
5036
+ const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
5037
+ const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
5035
5038
  try {
5036
5039
  await api.updateEvalRun(projectId2, evalRunId2, {
5037
- status: import_evalforge_types15.EvalStatus.COMPLETED,
5040
+ status: finalStatus,
5038
5041
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
5039
5042
  });
5040
5043
  } catch (updateErr) {
5041
5044
  throw new Error(
5042
- `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
5045
+ `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to ${finalStatus}: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
5043
5046
  );
5044
5047
  }
5045
5048
  }