@wix/evalforge-evaluator 0.141.0 → 0.143.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +46 -46
- package/build/index.js.map +2 -2
- package/build/index.mjs +46 -46
- package/build/index.mjs.map +2 -2
- package/build/types/evaluation-loop.d.ts +1 -0
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -4757,6 +4757,7 @@ var import_crypto5 = require("crypto");
|
|
|
4757
4757
|
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
4758
4758
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4759
4759
|
let completedExecutions = 0;
|
|
4760
|
+
let erroredExecutions = 0;
|
|
4760
4761
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4761
4762
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
4762
4763
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -4793,6 +4794,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4793
4794
|
iterationIndex
|
|
4794
4795
|
};
|
|
4795
4796
|
await callbacks.addResult(errorResult);
|
|
4797
|
+
erroredExecutions++;
|
|
4796
4798
|
}
|
|
4797
4799
|
if (scenarioResult !== null) {
|
|
4798
4800
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -4800,7 +4802,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4800
4802
|
completedExecutions++;
|
|
4801
4803
|
}
|
|
4802
4804
|
}
|
|
4803
|
-
return { completedExecutions, totalExecutions };
|
|
4805
|
+
return { completedExecutions, totalExecutions, erroredExecutions };
|
|
4804
4806
|
}
|
|
4805
4807
|
|
|
4806
4808
|
// src/error-reporter.ts
|
|
@@ -4984,50 +4986,46 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
4984
4986
|
);
|
|
4985
4987
|
}
|
|
4986
4988
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
4987
|
-
const { completedExecutions, totalExecutions } = await runEvaluationLoop(
|
|
4988
|
-
|
|
4989
|
-
|
|
4990
|
-
|
|
4991
|
-
|
|
4992
|
-
|
|
4993
|
-
|
|
4994
|
-
|
|
4995
|
-
|
|
4996
|
-
|
|
4997
|
-
|
|
4998
|
-
|
|
4999
|
-
|
|
5000
|
-
|
|
5001
|
-
|
|
5002
|
-
|
|
5003
|
-
|
|
5004
|
-
|
|
5005
|
-
|
|
5006
|
-
|
|
5007
|
-
|
|
5008
|
-
|
|
5009
|
-
|
|
5010
|
-
|
|
5011
|
-
|
|
5012
|
-
|
|
5013
|
-
|
|
5014
|
-
|
|
5015
|
-
|
|
5016
|
-
|
|
5017
|
-
|
|
5018
|
-
|
|
5019
|
-
|
|
5020
|
-
|
|
5021
|
-
|
|
5022
|
-
|
|
5023
|
-
|
|
5024
|
-
|
|
5025
|
-
agent ? `with agent: ${agent.name}` : "",
|
|
5026
|
-
`(${completed}/${total})${iterLabel}`
|
|
5027
|
-
);
|
|
5028
|
-
}
|
|
4989
|
+
const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
|
|
4990
|
+
runScenario: (scenario, template, resolvedAssertions) => {
|
|
4991
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
4992
|
+
state.currentContext = {
|
|
4993
|
+
projectId: projectId2,
|
|
4994
|
+
evalRunId: evalRunId2,
|
|
4995
|
+
scenarioId: scenario.id,
|
|
4996
|
+
scenarioName: scenario.name,
|
|
4997
|
+
presetId: evalData.evalRun.presetId,
|
|
4998
|
+
presetName: evalData.presetName,
|
|
4999
|
+
agentId: agent?.id,
|
|
5000
|
+
agentName: agent?.name
|
|
5001
|
+
};
|
|
5002
|
+
return runScenario(
|
|
5003
|
+
config,
|
|
5004
|
+
evalRunId2,
|
|
5005
|
+
scenario,
|
|
5006
|
+
evalData,
|
|
5007
|
+
template,
|
|
5008
|
+
resolvedAssertions
|
|
5009
|
+
);
|
|
5010
|
+
},
|
|
5011
|
+
addResult: async (result) => {
|
|
5012
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
5013
|
+
state.currentContext = {
|
|
5014
|
+
...state.currentContext,
|
|
5015
|
+
resultId: result.id
|
|
5016
|
+
};
|
|
5017
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
5018
|
+
},
|
|
5019
|
+
onProgress: (completed, total, iterLabel) => {
|
|
5020
|
+
console.log(
|
|
5021
|
+
"[Evaluator] Running scenario with preset:",
|
|
5022
|
+
evalData.presetName,
|
|
5023
|
+
skillNames ? `(${skillNames})` : "",
|
|
5024
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
5025
|
+
`(${completed}/${total})${iterLabel}`
|
|
5026
|
+
);
|
|
5029
5027
|
}
|
|
5030
|
-
);
|
|
5028
|
+
});
|
|
5031
5029
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
5032
5030
|
state.currentContext = {
|
|
5033
5031
|
projectId: projectId2,
|
|
@@ -5035,14 +5033,16 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5035
5033
|
completedExecutions,
|
|
5036
5034
|
totalExecutions
|
|
5037
5035
|
};
|
|
5036
|
+
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5037
|
+
const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
|
|
5038
5038
|
try {
|
|
5039
5039
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5040
|
-
status: import_evalforge_types15.EvalStatus.COMPLETED,
|
|
5040
|
+
status: finalStatus,
|
|
5041
5041
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5042
5042
|
});
|
|
5043
5043
|
} catch (updateErr) {
|
|
5044
5044
|
throw new Error(
|
|
5045
|
-
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
|
|
5045
|
+
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to ${finalStatus}: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
|
|
5046
5046
|
);
|
|
5047
5047
|
}
|
|
5048
5048
|
}
|