@wix/evalforge-evaluator 0.140.0 → 0.142.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -183,7 +183,8 @@ function createApiClient(serverUrl, options = "") {
183
183
  // src/fetch-evaluation-data.ts
184
184
  import {
185
185
  isSystemAssertionId,
186
- SYSTEM_ASSERTIONS
186
+ SYSTEM_ASSERTIONS,
187
+ isAllowedBuildCommandString
187
188
  } from "@wix/evalforge-types";
188
189
 
189
190
  // src/resolve-placeholders.ts
@@ -267,13 +268,16 @@ function resolveSystemAssertion(assertionId, params) {
267
268
  }
268
269
  };
269
270
  break;
270
- case "build_passed":
271
+ case "build_passed": {
272
+ const rawCmd = params?.command;
273
+ const command = typeof rawCmd === "string" && isAllowedBuildCommandString(rawCmd) ? rawCmd.trim() : void 0;
271
274
  baseAssertion = {
272
275
  type: "build_passed",
273
- command: params?.command ?? void 0,
276
+ ...command !== void 0 && { command },
274
277
  expectedExitCode: params?.expectedExitCode ?? void 0
275
278
  };
276
279
  break;
280
+ }
277
281
  case "time_limit":
278
282
  baseAssertion = {
279
283
  type: "time_limit",
@@ -4762,6 +4766,7 @@ import { randomUUID as randomUUID5 } from "crypto";
4762
4766
  async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4763
4767
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
4764
4768
  let completedExecutions = 0;
4769
+ let erroredExecutions = 0;
4765
4770
  const totalExecutions = scenarioItems.length * runsPerScenario;
4766
4771
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
4767
4772
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
@@ -4798,6 +4803,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4798
4803
  iterationIndex
4799
4804
  };
4800
4805
  await callbacks.addResult(errorResult);
4806
+ erroredExecutions++;
4801
4807
  }
4802
4808
  if (scenarioResult !== null) {
4803
4809
  await callbacks.addResult({ ...scenarioResult, iterationIndex });
@@ -4805,7 +4811,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
4805
4811
  completedExecutions++;
4806
4812
  }
4807
4813
  }
4808
- return { completedExecutions, totalExecutions };
4814
+ return { completedExecutions, totalExecutions, erroredExecutions };
4809
4815
  }
4810
4816
 
4811
4817
  // src/error-reporter.ts
@@ -4989,50 +4995,46 @@ async function runEvaluation(projectId2, evalRunId2) {
4989
4995
  );
4990
4996
  }
4991
4997
  const skillNames = evalData.skills.map((s) => s.name).join(", ");
4992
- const { completedExecutions, totalExecutions } = await runEvaluationLoop(
4993
- scenarioItems,
4994
- evalData,
4995
- {
4996
- runScenario: (scenario, template, resolvedAssertions) => {
4997
- state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
4998
- state.currentContext = {
4999
- projectId: projectId2,
5000
- evalRunId: evalRunId2,
5001
- scenarioId: scenario.id,
5002
- scenarioName: scenario.name,
5003
- presetId: evalData.evalRun.presetId,
5004
- presetName: evalData.presetName,
5005
- agentId: agent?.id,
5006
- agentName: agent?.name
5007
- };
5008
- return runScenario(
5009
- config,
5010
- evalRunId2,
5011
- scenario,
5012
- evalData,
5013
- template,
5014
- resolvedAssertions
5015
- );
5016
- },
5017
- addResult: async (result) => {
5018
- state.currentPhase = ExecutionPhase.ADD_RESULT;
5019
- state.currentContext = {
5020
- ...state.currentContext,
5021
- resultId: result.id
5022
- };
5023
- await api.addResult(projectId2, evalRunId2, result);
5024
- },
5025
- onProgress: (completed, total, iterLabel) => {
5026
- console.log(
5027
- "[Evaluator] Running scenario with preset:",
5028
- evalData.presetName,
5029
- skillNames ? `(${skillNames})` : "",
5030
- agent ? `with agent: ${agent.name}` : "",
5031
- `(${completed}/${total})${iterLabel}`
5032
- );
5033
- }
4998
+ const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
4999
+ runScenario: (scenario, template, resolvedAssertions) => {
5000
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
5001
+ state.currentContext = {
5002
+ projectId: projectId2,
5003
+ evalRunId: evalRunId2,
5004
+ scenarioId: scenario.id,
5005
+ scenarioName: scenario.name,
5006
+ presetId: evalData.evalRun.presetId,
5007
+ presetName: evalData.presetName,
5008
+ agentId: agent?.id,
5009
+ agentName: agent?.name
5010
+ };
5011
+ return runScenario(
5012
+ config,
5013
+ evalRunId2,
5014
+ scenario,
5015
+ evalData,
5016
+ template,
5017
+ resolvedAssertions
5018
+ );
5019
+ },
5020
+ addResult: async (result) => {
5021
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
5022
+ state.currentContext = {
5023
+ ...state.currentContext,
5024
+ resultId: result.id
5025
+ };
5026
+ await api.addResult(projectId2, evalRunId2, result);
5027
+ },
5028
+ onProgress: (completed, total, iterLabel) => {
5029
+ console.log(
5030
+ "[Evaluator] Running scenario with preset:",
5031
+ evalData.presetName,
5032
+ skillNames ? `(${skillNames})` : "",
5033
+ agent ? `with agent: ${agent.name}` : "",
5034
+ `(${completed}/${total})${iterLabel}`
5035
+ );
5034
5036
  }
5035
- );
5037
+ });
5036
5038
  state.currentPhase = ExecutionPhase.UPDATE_STATUS;
5037
5039
  state.currentContext = {
5038
5040
  projectId: projectId2,
@@ -5040,14 +5042,16 @@ async function runEvaluation(projectId2, evalRunId2) {
5040
5042
  completedExecutions,
5041
5043
  totalExecutions
5042
5044
  };
5045
+ const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
5046
+ const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
5043
5047
  try {
5044
5048
  await api.updateEvalRun(projectId2, evalRunId2, {
5045
- status: EvalStatus2.COMPLETED,
5049
+ status: finalStatus,
5046
5050
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
5047
5051
  });
5048
5052
  } catch (updateErr) {
5049
5053
  throw new Error(
5050
- `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
5054
+ `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to ${finalStatus}: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
5051
5055
  );
5052
5056
  }
5053
5057
  }