@wix/evalforge-evaluator 0.140.0 → 0.142.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +51 -48
- package/build/index.js.map +2 -2
- package/build/index.mjs +53 -49
- package/build/index.mjs.map +2 -2
- package/build/types/evaluation-loop.d.ts +1 -0
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -287,13 +287,16 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
287
287
|
}
|
|
288
288
|
};
|
|
289
289
|
break;
|
|
290
|
-
case "build_passed":
|
|
290
|
+
case "build_passed": {
|
|
291
|
+
const rawCmd = params?.command;
|
|
292
|
+
const command = typeof rawCmd === "string" && (0, import_evalforge_types.isAllowedBuildCommandString)(rawCmd) ? rawCmd.trim() : void 0;
|
|
291
293
|
baseAssertion = {
|
|
292
294
|
type: "build_passed",
|
|
293
|
-
command
|
|
295
|
+
...command !== void 0 && { command },
|
|
294
296
|
expectedExitCode: params?.expectedExitCode ?? void 0
|
|
295
297
|
};
|
|
296
298
|
break;
|
|
299
|
+
}
|
|
297
300
|
case "time_limit":
|
|
298
301
|
baseAssertion = {
|
|
299
302
|
type: "time_limit",
|
|
@@ -4754,6 +4757,7 @@ var import_crypto5 = require("crypto");
|
|
|
4754
4757
|
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
4755
4758
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4756
4759
|
let completedExecutions = 0;
|
|
4760
|
+
let erroredExecutions = 0;
|
|
4757
4761
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4758
4762
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
4759
4763
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -4790,6 +4794,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4790
4794
|
iterationIndex
|
|
4791
4795
|
};
|
|
4792
4796
|
await callbacks.addResult(errorResult);
|
|
4797
|
+
erroredExecutions++;
|
|
4793
4798
|
}
|
|
4794
4799
|
if (scenarioResult !== null) {
|
|
4795
4800
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -4797,7 +4802,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4797
4802
|
completedExecutions++;
|
|
4798
4803
|
}
|
|
4799
4804
|
}
|
|
4800
|
-
return { completedExecutions, totalExecutions };
|
|
4805
|
+
return { completedExecutions, totalExecutions, erroredExecutions };
|
|
4801
4806
|
}
|
|
4802
4807
|
|
|
4803
4808
|
// src/error-reporter.ts
|
|
@@ -4981,50 +4986,46 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
4981
4986
|
);
|
|
4982
4987
|
}
|
|
4983
4988
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
4984
|
-
const { completedExecutions, totalExecutions } = await runEvaluationLoop(
|
|
4985
|
-
|
|
4986
|
-
|
|
4987
|
-
|
|
4988
|
-
|
|
4989
|
-
|
|
4990
|
-
|
|
4991
|
-
|
|
4992
|
-
|
|
4993
|
-
|
|
4994
|
-
|
|
4995
|
-
|
|
4996
|
-
|
|
4997
|
-
|
|
4998
|
-
|
|
4999
|
-
|
|
5000
|
-
|
|
5001
|
-
|
|
5002
|
-
|
|
5003
|
-
|
|
5004
|
-
|
|
5005
|
-
|
|
5006
|
-
|
|
5007
|
-
|
|
5008
|
-
|
|
5009
|
-
|
|
5010
|
-
|
|
5011
|
-
|
|
5012
|
-
|
|
5013
|
-
|
|
5014
|
-
|
|
5015
|
-
|
|
5016
|
-
|
|
5017
|
-
|
|
5018
|
-
|
|
5019
|
-
|
|
5020
|
-
|
|
5021
|
-
|
|
5022
|
-
agent ? `with agent: ${agent.name}` : "",
|
|
5023
|
-
`(${completed}/${total})${iterLabel}`
|
|
5024
|
-
);
|
|
5025
|
-
}
|
|
4989
|
+
const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
|
|
4990
|
+
runScenario: (scenario, template, resolvedAssertions) => {
|
|
4991
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
4992
|
+
state.currentContext = {
|
|
4993
|
+
projectId: projectId2,
|
|
4994
|
+
evalRunId: evalRunId2,
|
|
4995
|
+
scenarioId: scenario.id,
|
|
4996
|
+
scenarioName: scenario.name,
|
|
4997
|
+
presetId: evalData.evalRun.presetId,
|
|
4998
|
+
presetName: evalData.presetName,
|
|
4999
|
+
agentId: agent?.id,
|
|
5000
|
+
agentName: agent?.name
|
|
5001
|
+
};
|
|
5002
|
+
return runScenario(
|
|
5003
|
+
config,
|
|
5004
|
+
evalRunId2,
|
|
5005
|
+
scenario,
|
|
5006
|
+
evalData,
|
|
5007
|
+
template,
|
|
5008
|
+
resolvedAssertions
|
|
5009
|
+
);
|
|
5010
|
+
},
|
|
5011
|
+
addResult: async (result) => {
|
|
5012
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
5013
|
+
state.currentContext = {
|
|
5014
|
+
...state.currentContext,
|
|
5015
|
+
resultId: result.id
|
|
5016
|
+
};
|
|
5017
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
5018
|
+
},
|
|
5019
|
+
onProgress: (completed, total, iterLabel) => {
|
|
5020
|
+
console.log(
|
|
5021
|
+
"[Evaluator] Running scenario with preset:",
|
|
5022
|
+
evalData.presetName,
|
|
5023
|
+
skillNames ? `(${skillNames})` : "",
|
|
5024
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
5025
|
+
`(${completed}/${total})${iterLabel}`
|
|
5026
|
+
);
|
|
5026
5027
|
}
|
|
5027
|
-
);
|
|
5028
|
+
});
|
|
5028
5029
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
5029
5030
|
state.currentContext = {
|
|
5030
5031
|
projectId: projectId2,
|
|
@@ -5032,14 +5033,16 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5032
5033
|
completedExecutions,
|
|
5033
5034
|
totalExecutions
|
|
5034
5035
|
};
|
|
5036
|
+
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5037
|
+
const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
|
|
5035
5038
|
try {
|
|
5036
5039
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5037
|
-
status: import_evalforge_types15.EvalStatus.COMPLETED,
|
|
5040
|
+
status: finalStatus,
|
|
5038
5041
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5039
5042
|
});
|
|
5040
5043
|
} catch (updateErr) {
|
|
5041
5044
|
throw new Error(
|
|
5042
|
-
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
|
|
5045
|
+
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to ${finalStatus}: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
|
|
5043
5046
|
);
|
|
5044
5047
|
}
|
|
5045
5048
|
}
|