@wix/evalforge-evaluator 0.131.0 → 0.133.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +96 -57
- package/build/index.js.map +4 -4
- package/build/index.mjs +96 -57
- package/build/index.mjs.map +4 -4
- package/build/types/evaluation-loop.d.ts +23 -0
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -4678,6 +4678,57 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
4678
4678
|
};
|
|
4679
4679
|
}
|
|
4680
4680
|
|
|
4681
|
+
// src/evaluation-loop.ts
|
|
4682
|
+
var import_crypto5 = require("crypto");
|
|
4683
|
+
/**
 * Runs every scenario `runsPerScenario` times, strictly one after another,
 * and reports each outcome through the supplied callbacks.
 *
 * A failure thrown by `callbacks.runScenario` does not abort the loop: it is
 * logged and converted into a synthetic "error result" that is passed to
 * `callbacks.addResult`, after which the next iteration starts. Errors thrown
 * by `callbacks.addResult` or `callbacks.onProgress` are NOT caught here and
 * propagate to the caller.
 *
 * @param {Array<{scenario: object, template: *, resolvedAssertions: *}>} scenarioItems
 *   Items to execute; `scenario.id` and `scenario.name` are read when an
 *   error result has to be built.
 * @param {object} evalData
 *   Reads `evalRun.runsPerScenario` (defaults to 1 when nullish),
 *   `evalRun.presetId`, `evalRun.id`, `presetName` and the optional `agent`.
 * @param {object} callbacks
 * @param {Function} callbacks.runScenario
 *   Called as `(scenario, template, resolvedAssertions)`; its awaited value
 *   is the scenario result.
 * @param {Function} callbacks.addResult
 *   Called (and awaited) with each result, tagged with `iterationIndex`.
 * @param {Function} callbacks.onProgress
 *   Called as `(current, total, iterLabel)` before each iteration, where
 *   `current` is 1-based.
 * @returns {Promise<{completedExecutions: number, totalExecutions: number}>}
 */
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
  const totalExecutions = scenarioItems.length * runsPerScenario;
  // Identity used only for synthetic error results: preset first, then agent,
  // finally the eval run itself.
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
  let completedExecutions = 0;

  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
    for (let iterationIndex = 0; iterationIndex < runsPerScenario; iterationIndex++) {
      // The label is only shown when a scenario is executed more than once.
      const iterLabel = runsPerScenario > 1 ? ` [run ${iterationIndex + 1}/${runsPerScenario}]` : "";
      callbacks.onProgress(completedExecutions + 1, totalExecutions, iterLabel);

      let scenarioResult = null;
      try {
        scenarioResult = await callbacks.runScenario(
          scenario,
          template,
          resolvedAssertions
        );
      } catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        const errorStack = err instanceof Error ? err.stack : void 0;
        console.error(
          `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
        );
        // Keep the stack trace in the logs; the message alone rarely tells
        // where inside the scenario run the failure originated.
        if (errorStack) {
          console.error("[Evaluator] Stack trace:", errorStack);
        }
        // One timestamp for both fields so they agree with `duration: 0`.
        const failedAt = new Date().toISOString();
        await callbacks.addResult({
          id: (0, import_crypto5.randomUUID)(),
          targetId,
          targetName,
          scenarioId: scenario.id,
          scenarioName: scenario.name,
          assertionResults: [],
          passed: 0,
          failed: 0,
          passRate: 0,
          duration: 0,
          outputText: `Execution error: ${errorMsg}`,
          startedAt: failedAt,
          completedAt: failedAt,
          iterationIndex
        });
      }

      // Deliberately outside the try block: a failing addResult for a
      // successful run must not be re-recorded as a scenario error.
      if (scenarioResult !== null) {
        await callbacks.addResult({ ...scenarioResult, iterationIndex });
      }
      completedExecutions++;
    }
  }

  return { completedExecutions, totalExecutions };
}
|
|
4731
|
+
|
|
4681
4732
|
// src/error-reporter.ts
|
|
4682
4733
|
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
4683
4734
|
function formatError(error, phase, context) {
|
|
@@ -4858,69 +4909,57 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
4858
4909
|
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
4859
4910
|
);
|
|
4860
4911
|
}
|
|
4861
|
-
|
|
4862
|
-
const
|
|
4863
|
-
|
|
4864
|
-
|
|
4865
|
-
|
|
4866
|
-
|
|
4867
|
-
|
|
4868
|
-
|
|
4869
|
-
|
|
4870
|
-
|
|
4871
|
-
|
|
4872
|
-
|
|
4873
|
-
|
|
4874
|
-
|
|
4875
|
-
|
|
4876
|
-
|
|
4877
|
-
|
|
4878
|
-
|
|
4879
|
-
|
|
4880
|
-
|
|
4881
|
-
|
|
4882
|
-
|
|
4883
|
-
|
|
4884
|
-
|
|
4885
|
-
|
|
4886
|
-
|
|
4887
|
-
|
|
4888
|
-
|
|
4889
|
-
|
|
4890
|
-
|
|
4891
|
-
|
|
4892
|
-
|
|
4893
|
-
|
|
4894
|
-
|
|
4895
|
-
|
|
4896
|
-
|
|
4897
|
-
|
|
4898
|
-
|
|
4899
|
-
|
|
4900
|
-
|
|
4901
|
-
|
|
4902
|
-
|
|
4903
|
-
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
4904
|
-
console.error(
|
|
4905
|
-
"[Evaluator] Failed to run scenario with preset:",
|
|
4906
|
-
evalData.presetName,
|
|
4907
|
-
"Error:",
|
|
4908
|
-
errorMsg
|
|
4909
|
-
);
|
|
4910
|
-
if (errorStack) {
|
|
4911
|
-
console.error("[Evaluator] Stack trace:", errorStack);
|
|
4912
|
+
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
4913
|
+
const { completedExecutions, totalExecutions } = await runEvaluationLoop(
|
|
4914
|
+
scenarioItems,
|
|
4915
|
+
evalData,
|
|
4916
|
+
{
|
|
4917
|
+
runScenario: (scenario, template, resolvedAssertions) => {
|
|
4918
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
4919
|
+
state.currentContext = {
|
|
4920
|
+
projectId: projectId2,
|
|
4921
|
+
evalRunId: evalRunId2,
|
|
4922
|
+
scenarioId: scenario.id,
|
|
4923
|
+
scenarioName: scenario.name,
|
|
4924
|
+
presetId: evalData.evalRun.presetId,
|
|
4925
|
+
presetName: evalData.presetName,
|
|
4926
|
+
agentId: agent?.id,
|
|
4927
|
+
agentName: agent?.name
|
|
4928
|
+
};
|
|
4929
|
+
return runScenario(
|
|
4930
|
+
config,
|
|
4931
|
+
evalRunId2,
|
|
4932
|
+
scenario,
|
|
4933
|
+
evalData,
|
|
4934
|
+
template,
|
|
4935
|
+
resolvedAssertions
|
|
4936
|
+
);
|
|
4937
|
+
},
|
|
4938
|
+
addResult: async (result) => {
|
|
4939
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
4940
|
+
state.currentContext = {
|
|
4941
|
+
...state.currentContext,
|
|
4942
|
+
resultId: result.id
|
|
4943
|
+
};
|
|
4944
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
4945
|
+
},
|
|
4946
|
+
onProgress: (completed, total, iterLabel) => {
|
|
4947
|
+
console.log(
|
|
4948
|
+
"[Evaluator] Running scenario with preset:",
|
|
4949
|
+
evalData.presetName,
|
|
4950
|
+
skillNames ? `(${skillNames})` : "",
|
|
4951
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
4952
|
+
`(${completed}/${total})${iterLabel}`
|
|
4953
|
+
);
|
|
4912
4954
|
}
|
|
4913
|
-
throw new Error(
|
|
4914
|
-
`[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
|
|
4915
|
-
);
|
|
4916
4955
|
}
|
|
4917
|
-
|
|
4956
|
+
);
|
|
4918
4957
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
4919
4958
|
state.currentContext = {
|
|
4920
4959
|
projectId: projectId2,
|
|
4921
4960
|
evalRunId: evalRunId2,
|
|
4922
|
-
|
|
4923
|
-
|
|
4961
|
+
completedExecutions,
|
|
4962
|
+
totalExecutions
|
|
4924
4963
|
};
|
|
4925
4964
|
try {
|
|
4926
4965
|
await api.updateEvalRun(projectId2, evalRunId2, {
|