@wix/evalforge-evaluator 0.140.0 → 0.142.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/build/index.js +51 -48
- package/build/index.js.map +2 -2
- package/build/index.mjs +53 -49
- package/build/index.mjs.map +2 -2
- package/build/types/evaluation-loop.d.ts +1 -0
- package/package.json +5 -5
package/build/index.mjs
CHANGED
|
@@ -183,7 +183,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
183
183
|
// src/fetch-evaluation-data.ts
|
|
184
184
|
import {
|
|
185
185
|
isSystemAssertionId,
|
|
186
|
-
SYSTEM_ASSERTIONS
|
|
186
|
+
SYSTEM_ASSERTIONS,
|
|
187
|
+
isAllowedBuildCommandString
|
|
187
188
|
} from "@wix/evalforge-types";
|
|
188
189
|
|
|
189
190
|
// src/resolve-placeholders.ts
|
|
@@ -267,13 +268,16 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
267
268
|
}
|
|
268
269
|
};
|
|
269
270
|
break;
|
|
270
|
-
case "build_passed":
|
|
271
|
+
case "build_passed": {
|
|
272
|
+
const rawCmd = params?.command;
|
|
273
|
+
const command = typeof rawCmd === "string" && isAllowedBuildCommandString(rawCmd) ? rawCmd.trim() : void 0;
|
|
271
274
|
baseAssertion = {
|
|
272
275
|
type: "build_passed",
|
|
273
|
-
command
|
|
276
|
+
...command !== void 0 && { command },
|
|
274
277
|
expectedExitCode: params?.expectedExitCode ?? void 0
|
|
275
278
|
};
|
|
276
279
|
break;
|
|
280
|
+
}
|
|
277
281
|
case "time_limit":
|
|
278
282
|
baseAssertion = {
|
|
279
283
|
type: "time_limit",
|
|
@@ -4762,6 +4766,7 @@ import { randomUUID as randomUUID5 } from "crypto";
|
|
|
4762
4766
|
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
4763
4767
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
4764
4768
|
let completedExecutions = 0;
|
|
4769
|
+
let erroredExecutions = 0;
|
|
4765
4770
|
const totalExecutions = scenarioItems.length * runsPerScenario;
|
|
4766
4771
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
4767
4772
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
@@ -4798,6 +4803,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4798
4803
|
iterationIndex
|
|
4799
4804
|
};
|
|
4800
4805
|
await callbacks.addResult(errorResult);
|
|
4806
|
+
erroredExecutions++;
|
|
4801
4807
|
}
|
|
4802
4808
|
if (scenarioResult !== null) {
|
|
4803
4809
|
await callbacks.addResult({ ...scenarioResult, iterationIndex });
|
|
@@ -4805,7 +4811,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
4805
4811
|
completedExecutions++;
|
|
4806
4812
|
}
|
|
4807
4813
|
}
|
|
4808
|
-
return { completedExecutions, totalExecutions };
|
|
4814
|
+
return { completedExecutions, totalExecutions, erroredExecutions };
|
|
4809
4815
|
}
|
|
4810
4816
|
|
|
4811
4817
|
// src/error-reporter.ts
|
|
@@ -4989,50 +4995,46 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
4989
4995
|
);
|
|
4990
4996
|
}
|
|
4991
4997
|
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
4992
|
-
const { completedExecutions, totalExecutions } = await runEvaluationLoop(
|
|
4993
|
-
|
|
4994
|
-
|
|
4995
|
-
|
|
4996
|
-
|
|
4997
|
-
|
|
4998
|
-
|
|
4999
|
-
|
|
5000
|
-
|
|
5001
|
-
|
|
5002
|
-
|
|
5003
|
-
|
|
5004
|
-
|
|
5005
|
-
|
|
5006
|
-
|
|
5007
|
-
|
|
5008
|
-
|
|
5009
|
-
|
|
5010
|
-
|
|
5011
|
-
|
|
5012
|
-
|
|
5013
|
-
|
|
5014
|
-
|
|
5015
|
-
|
|
5016
|
-
|
|
5017
|
-
|
|
5018
|
-
|
|
5019
|
-
|
|
5020
|
-
|
|
5021
|
-
|
|
5022
|
-
|
|
5023
|
-
|
|
5024
|
-
|
|
5025
|
-
|
|
5026
|
-
|
|
5027
|
-
|
|
5028
|
-
|
|
5029
|
-
|
|
5030
|
-
agent ? `with agent: ${agent.name}` : "",
|
|
5031
|
-
`(${completed}/${total})${iterLabel}`
|
|
5032
|
-
);
|
|
5033
|
-
}
|
|
4998
|
+
const { completedExecutions, totalExecutions, erroredExecutions } = await runEvaluationLoop(scenarioItems, evalData, {
|
|
4999
|
+
runScenario: (scenario, template, resolvedAssertions) => {
|
|
5000
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
5001
|
+
state.currentContext = {
|
|
5002
|
+
projectId: projectId2,
|
|
5003
|
+
evalRunId: evalRunId2,
|
|
5004
|
+
scenarioId: scenario.id,
|
|
5005
|
+
scenarioName: scenario.name,
|
|
5006
|
+
presetId: evalData.evalRun.presetId,
|
|
5007
|
+
presetName: evalData.presetName,
|
|
5008
|
+
agentId: agent?.id,
|
|
5009
|
+
agentName: agent?.name
|
|
5010
|
+
};
|
|
5011
|
+
return runScenario(
|
|
5012
|
+
config,
|
|
5013
|
+
evalRunId2,
|
|
5014
|
+
scenario,
|
|
5015
|
+
evalData,
|
|
5016
|
+
template,
|
|
5017
|
+
resolvedAssertions
|
|
5018
|
+
);
|
|
5019
|
+
},
|
|
5020
|
+
addResult: async (result) => {
|
|
5021
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
5022
|
+
state.currentContext = {
|
|
5023
|
+
...state.currentContext,
|
|
5024
|
+
resultId: result.id
|
|
5025
|
+
};
|
|
5026
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
5027
|
+
},
|
|
5028
|
+
onProgress: (completed, total, iterLabel) => {
|
|
5029
|
+
console.log(
|
|
5030
|
+
"[Evaluator] Running scenario with preset:",
|
|
5031
|
+
evalData.presetName,
|
|
5032
|
+
skillNames ? `(${skillNames})` : "",
|
|
5033
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
5034
|
+
`(${completed}/${total})${iterLabel}`
|
|
5035
|
+
);
|
|
5034
5036
|
}
|
|
5035
|
-
);
|
|
5037
|
+
});
|
|
5036
5038
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
5037
5039
|
state.currentContext = {
|
|
5038
5040
|
projectId: projectId2,
|
|
@@ -5040,14 +5042,16 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
5040
5042
|
completedExecutions,
|
|
5041
5043
|
totalExecutions
|
|
5042
5044
|
};
|
|
5045
|
+
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
5046
|
+
const finalStatus = allFailed ? EvalStatus2.FAILED : EvalStatus2.COMPLETED;
|
|
5043
5047
|
try {
|
|
5044
5048
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
5045
|
-
status:
|
|
5049
|
+
status: finalStatus,
|
|
5046
5050
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
5047
5051
|
});
|
|
5048
5052
|
} catch (updateErr) {
|
|
5049
5053
|
throw new Error(
|
|
5050
|
-
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to
|
|
5054
|
+
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to ${finalStatus}: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
|
|
5051
5055
|
);
|
|
5052
5056
|
}
|
|
5053
5057
|
}
|