@wix/evalforge-evaluator 0.132.0 → 0.133.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +96 -57
- package/build/index.js.map +4 -4
- package/build/index.mjs +96 -57
- package/build/index.mjs.map +4 -4
- package/build/types/evaluation-loop.d.ts +23 -0
- package/package.json +5 -5
package/build/index.mjs
CHANGED
|
@@ -4686,6 +4686,57 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
4686
4686
|
};
|
|
4687
4687
|
}
|
|
4688
4688
|
|
|
4689
|
+
// src/evaluation-loop.ts
|
|
4690
|
+
import { randomUUID as randomUUID5 } from "crypto";
|
|
4691
|
+
/**
 * Runs every scenario item `runsPerScenario` times, one after another, and hands
 * each outcome to `callbacks.addResult`.
 *
 * A rejection from `callbacks.runScenario` does not abort the loop: it is logged
 * (message and, when available, stack trace) and recorded as a placeholder error
 * result with no assertion results and zeroed counters. A rejection from
 * `callbacks.addResult` or a throw from `callbacks.onProgress` is NOT caught here
 * and propagates to the caller.
 *
 * @param {Array<{scenario: {id: *, name: *}, template: *, resolvedAssertions: *}>} scenarioItems
 *   Items to execute; each is destructured into the three arguments of `runScenario`.
 * @param {object} evalData
 *   Reads `evalRun.runsPerScenario` (falls back to 1 when nullish), `evalRun.presetId`,
 *   `evalRun.id`, `agent?.id`, `agent?.name` and `presetName`.
 * @param {{runScenario: Function, addResult: Function, onProgress: Function}} callbacks
 *   `onProgress(current, total, iterLabel)` is called before each execution,
 *   `runScenario(scenario, template, resolvedAssertions)` performs it, and
 *   `addResult(result)` receives the result tagged with `iterationIndex`.
 * @returns {Promise<{completedExecutions: number, totalExecutions: number}>}
 */
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
  let completedExecutions = 0;
  const totalExecutions = scenarioItems.length * runsPerScenario;
  // Identity attached to placeholder error results: preset first, then agent, then the run itself.
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
    for (let iterationIndex = 0; iterationIndex < runsPerScenario; iterationIndex++) {
      // The " [run i/n]" suffix is only shown when a scenario is repeated.
      const iterLabel = runsPerScenario > 1 ? ` [run ${iterationIndex + 1}/${runsPerScenario}]` : "";
      callbacks.onProgress(completedExecutions + 1, totalExecutions, iterLabel);
      let scenarioResult = null;
      try {
        scenarioResult = await callbacks.runScenario(
          scenario,
          template,
          resolvedAssertions
        );
      } catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        const errorStack = err instanceof Error ? err.stack : undefined;
        console.error(
          `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
        );
        // Keep the stack in the logs; the recorded result only carries the message.
        if (errorStack) {
          console.error("[Evaluator] Stack trace:", errorStack);
        }
        // One instant for both timestamps so they agree with the reported duration of 0.
        const failedAt = new Date().toISOString();
        const errorResult = {
          id: randomUUID5(),
          targetId,
          targetName,
          scenarioId: scenario.id,
          scenarioName: scenario.name,
          assertionResults: [],
          passed: 0,
          failed: 0,
          passRate: 0,
          duration: 0,
          outputText: `Execution error: ${errorMsg}`,
          startedAt: failedAt,
          completedAt: failedAt,
          iterationIndex
        };
        // Deliberately outside any nested try: a failure to persist must surface to the caller.
        await callbacks.addResult(errorResult);
      }
      // Stays null after a failed run, so the error path never records a second result.
      if (scenarioResult !== null) {
        await callbacks.addResult({ ...scenarioResult, iterationIndex });
      }
      completedExecutions++;
    }
  }
  return { completedExecutions, totalExecutions };
}
|
|
4739
|
+
|
|
4689
4740
|
// src/error-reporter.ts
|
|
4690
4741
|
import { EvalStatus } from "@wix/evalforge-types";
|
|
4691
4742
|
function formatError(error, phase, context) {
|
|
@@ -4866,69 +4917,57 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
4866
4917
|
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
4867
4918
|
);
|
|
4868
4919
|
}
|
|
4869
|
-
|
|
4870
|
-
const
|
|
4871
|
-
|
|
4872
|
-
|
|
4873
|
-
|
|
4874
|
-
|
|
4875
|
-
|
|
4876
|
-
|
|
4877
|
-
|
|
4878
|
-
|
|
4879
|
-
|
|
4880
|
-
|
|
4881
|
-
|
|
4882
|
-
|
|
4883
|
-
|
|
4884
|
-
|
|
4885
|
-
|
|
4886
|
-
|
|
4887
|
-
|
|
4888
|
-
|
|
4889
|
-
|
|
4890
|
-
|
|
4891
|
-
|
|
4892
|
-
|
|
4893
|
-
|
|
4894
|
-
|
|
4895
|
-
|
|
4896
|
-
|
|
4897
|
-
|
|
4898
|
-
|
|
4899
|
-
|
|
4900
|
-
|
|
4901
|
-
|
|
4902
|
-
|
|
4903
|
-
|
|
4904
|
-
|
|
4905
|
-
|
|
4906
|
-
|
|
4907
|
-
|
|
4908
|
-
|
|
4909
|
-
|
|
4910
|
-
|
|
4911
|
-
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
4912
|
-
console.error(
|
|
4913
|
-
"[Evaluator] Failed to run scenario with preset:",
|
|
4914
|
-
evalData.presetName,
|
|
4915
|
-
"Error:",
|
|
4916
|
-
errorMsg
|
|
4917
|
-
);
|
|
4918
|
-
if (errorStack) {
|
|
4919
|
-
console.error("[Evaluator] Stack trace:", errorStack);
|
|
4920
|
+
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
4921
|
+
const { completedExecutions, totalExecutions } = await runEvaluationLoop(
|
|
4922
|
+
scenarioItems,
|
|
4923
|
+
evalData,
|
|
4924
|
+
{
|
|
4925
|
+
runScenario: (scenario, template, resolvedAssertions) => {
|
|
4926
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
4927
|
+
state.currentContext = {
|
|
4928
|
+
projectId: projectId2,
|
|
4929
|
+
evalRunId: evalRunId2,
|
|
4930
|
+
scenarioId: scenario.id,
|
|
4931
|
+
scenarioName: scenario.name,
|
|
4932
|
+
presetId: evalData.evalRun.presetId,
|
|
4933
|
+
presetName: evalData.presetName,
|
|
4934
|
+
agentId: agent?.id,
|
|
4935
|
+
agentName: agent?.name
|
|
4936
|
+
};
|
|
4937
|
+
return runScenario(
|
|
4938
|
+
config,
|
|
4939
|
+
evalRunId2,
|
|
4940
|
+
scenario,
|
|
4941
|
+
evalData,
|
|
4942
|
+
template,
|
|
4943
|
+
resolvedAssertions
|
|
4944
|
+
);
|
|
4945
|
+
},
|
|
4946
|
+
addResult: async (result) => {
|
|
4947
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
4948
|
+
state.currentContext = {
|
|
4949
|
+
...state.currentContext,
|
|
4950
|
+
resultId: result.id
|
|
4951
|
+
};
|
|
4952
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
4953
|
+
},
|
|
4954
|
+
onProgress: (completed, total, iterLabel) => {
|
|
4955
|
+
console.log(
|
|
4956
|
+
"[Evaluator] Running scenario with preset:",
|
|
4957
|
+
evalData.presetName,
|
|
4958
|
+
skillNames ? `(${skillNames})` : "",
|
|
4959
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
4960
|
+
`(${completed}/${total})${iterLabel}`
|
|
4961
|
+
);
|
|
4920
4962
|
}
|
|
4921
|
-
throw new Error(
|
|
4922
|
-
`[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
|
|
4923
|
-
);
|
|
4924
4963
|
}
|
|
4925
|
-
|
|
4964
|
+
);
|
|
4926
4965
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
4927
4966
|
state.currentContext = {
|
|
4928
4967
|
projectId: projectId2,
|
|
4929
4968
|
evalRunId: evalRunId2,
|
|
4930
|
-
|
|
4931
|
-
|
|
4969
|
+
completedExecutions,
|
|
4970
|
+
totalExecutions
|
|
4932
4971
|
};
|
|
4933
4972
|
try {
|
|
4934
4973
|
await api.updateEvalRun(projectId2, evalRunId2, {
|