@wix/evalforge-evaluator 0.141.0 → 0.143.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +46 -46
- package/build/index.js.map +2 -2
- package/build/index.mjs +46 -46
- package/build/index.mjs.map +2 -2
- package/build/types/evaluation-loop.d.ts +1 -0
- package/package.json +5 -5
package/build/index.mjs
CHANGED
|
@@ -4766,6 +4766,7 @@ import { randomUUID as randomUUID5 } from "crypto";
|
|
|
4766
4766
|
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
4767
4767
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4768
4768
|
let completedExecutions = 0;
|
|
4769
|
+
let erroredExecutions = 0;
|
|
4769
4770
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4770
4771
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
4771
4772
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -4802,6 +4803,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4802
4803
|
iterationIndex
|
|
4803
4804
|
};
|
|
4804
4805
|
await callbacks.addResult(errorResult);
|
|
4806
|
+
erroredExecutions++;
|
|
4805
4807
|
}
|
|
4806
4808
|
if (scenarioResult !== null) {
|
|
4807
4809
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -4809,7 +4811,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4809
4811
|
completedExecutions++;
|
|
4810
4812
|
}
|
|
4811
4813
|
}
|
|
4812
|
-
return { completedExecutions, totalExecutions };
|
|
4814
|
+
return { completedExecutions, totalExecutions, erroredExecutions };
|
|
4813
4815
|
}
|
|
4814
4816
|
|
|
4815
4817
|
// src/error-reporter.ts
|
|
@@ -4993,50 +4995,46 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
4993
4995
|
);
|
|
4994
4996
|
}
|
|
4995
4997
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
4996
|
-
const { completedExecutions, totalExecutions } = await runEvaluationLoop(
|
|
4997
|
-
|
|
4998
|
-
|
|
4999
|
-
|
|
5000
|
-
|
|
5001
|
-
|
|
5002
|
-
|
|
5003
|
-
|
|
5004
|
-
|
|
5005
|
-
|
|
5006
|
-
|
|
5007
|
-
|
|
5008
|
-
|
|
5009
|
-
|
|
5010
|
-
|
|
5011
|
-
|
|
5012
|
-
|
|
5013
|
-
|
|
5014
|
-
|
|
5015
|
-
|
|
5016
|
-
|
|
5017
|
-
|
|
5018
|
-
|
|
5019
|
-
|
|
5020
|
-
|
|
5021
|
-
|
|
5022
|
-
|
|
5023
|
-
|
|
5024
|
-
|
|
5025
|
-
|
|
5026
|
-
|
|
5027
|
-
|
|
5028
|
-
|
|
5029
|
-
|
|
5030
|
-
|
|
5031
|
-
|
|
5032
|
-
|
|
5033
|
-
|
|
5034
|
-
agent ? `with agent: ${agent.name}` : "",
|
|
5035
|
-
`(${completed}/${total})${iterLabel}`
|
|
5036
|
-
);
|
|
5037
|
-
}
|
|
4998
|
+
const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
|
|
4999
|
+
runScenario: (scenario, template, resolvedAssertions) => {
|
|
5000
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
5001
|
+
state.currentContext = {
|
|
5002
|
+
projectId: projectId2,
|
|
5003
|
+
evalRunId: evalRunId2,
|
|
5004
|
+
scenarioId: scenario.id,
|
|
5005
|
+
scenarioName: scenario.name,
|
|
5006
|
+
presetId: evalData.evalRun.presetId,
|
|
5007
|
+
presetName: evalData.presetName,
|
|
5008
|
+
agentId: agent?.id,
|
|
5009
|
+
agentName: agent?.name
|
|
5010
|
+
};
|
|
5011
|
+
return runScenario(
|
|
5012
|
+
config,
|
|
5013
|
+
evalRunId2,
|
|
5014
|
+
scenario,
|
|
5015
|
+
evalData,
|
|
5016
|
+
template,
|
|
5017
|
+
resolvedAssertions
|
|
5018
|
+
);
|
|
5019
|
+
},
|
|
5020
|
+
addResult: async (result) => {
|
|
5021
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
5022
|
+
state.currentContext = {
|
|
5023
|
+
...state.currentContext,
|
|
5024
|
+
resultId: result.id
|
|
5025
|
+
};
|
|
5026
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
5027
|
+
},
|
|
5028
|
+
onProgress: (completed, total, iterLabel) => {
|
|
5029
|
+
console.log(
|
|
5030
|
+
"[Evaluator] Running scenario with preset:",
|
|
5031
|
+
evalData.presetName,
|
|
5032
|
+
skillNames ? `(${skillNames})` : "",
|
|
5033
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
5034
|
+
`(${completed}/${total})${iterLabel}`
|
|
5035
|
+
);
|
|
5038
5036
|
}
|
|
5039
|
-
);
|
|
5037
|
+
});
|
|
5040
5038
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
5041
5039
|
state.currentContext = {
|
|
5042
5040
|
projectId: projectId2,
|
|
@@ -5044,14 +5042,16 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5044
5042
|
completedExecutions,
|
|
5045
5043
|
totalExecutions
|
|
5046
5044
|
};
|
|
5045
|
+
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5046
|
+
const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
|
|
5047
5047
|
try {
|
|
5048
5048
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5049
|
-
status:
|
|
5049
|
+
status: finalStatus,
|
|
5050
5050
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5051
5051
|
});
|
|
5052
5052
|
} catch (updateErr) {
|
|
5053
5053
|
throw new Error(
|
|
5054
|
-
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to
|
|
5054
|
+
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to ${finalStatus}: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
|
|
5055
5055
|
);
|
|
5056
5056
|
}
|
|
5057
5057
|
}
|