@wix/evalforge-evaluator 0.132.0 → 0.134.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +102 -62
- package/build/index.js.map +4 -4
- package/build/index.mjs +102 -62
- package/build/index.mjs.map +4 -4
- package/build/types/evaluation-loop.d.ts +23 -0
- package/package.json +5 -5
package/build/index.mjs
CHANGED
|
@@ -3474,9 +3474,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3474
3474
|
});
|
|
3475
3475
|
for (let i = 0; i < steps.length; i++) {
|
|
3476
3476
|
const step = steps[i];
|
|
3477
|
-
const
|
|
3478
|
-
|
|
3479
|
-
).toISOString();
|
|
3477
|
+
const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
|
|
3478
|
+
const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
|
|
3479
|
+
const assistantTimestamp = new Date(stepStartedAt).toISOString();
|
|
3480
|
+
const toolResultTimestamp = new Date(stepFinishedAt).toISOString();
|
|
3480
3481
|
const assistantContent = [];
|
|
3481
3482
|
if (step.reasoningText) {
|
|
3482
3483
|
assistantContent.push({ type: "thinking", thinking: step.reasoningText });
|
|
@@ -3496,7 +3497,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3496
3497
|
messages.push({
|
|
3497
3498
|
role: "assistant",
|
|
3498
3499
|
content: assistantContent,
|
|
3499
|
-
timestamp:
|
|
3500
|
+
timestamp: assistantTimestamp
|
|
3500
3501
|
});
|
|
3501
3502
|
}
|
|
3502
3503
|
if (step.toolResults.length > 0) {
|
|
@@ -3513,7 +3514,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3513
3514
|
messages.push({
|
|
3514
3515
|
role: "user",
|
|
3515
3516
|
content: resultBlocks,
|
|
3516
|
-
timestamp:
|
|
3517
|
+
timestamp: toolResultTimestamp
|
|
3517
3518
|
});
|
|
3518
3519
|
}
|
|
3519
3520
|
}
|
|
@@ -4686,6 +4687,57 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
4686
4687
|
};
|
|
4687
4688
|
}
|
|
4688
4689
|
|
|
4690
|
+
// src/evaluation-loop.ts
|
|
4691
|
+
import { randomUUID as randomUUID5 } from "crypto";
|
|
4692
|
+
/**
 * Runs every scenario item `runsPerScenario` times, strictly one iteration at
 * a time, and records the outcome of each iteration via `callbacks.addResult`.
 *
 * A rejection/throw from `callbacks.runScenario` does not abort the loop: it
 * is logged (message and, when available, stack trace) and recorded as an
 * error result with no assertion results, after which the loop moves on.
 * Errors thrown by `callbacks.onProgress` or `callbacks.addResult` are NOT
 * caught here and propagate to the caller.
 *
 * @param {Array<{scenario: {id: *, name: string}, template: *, resolvedAssertions: *}>} scenarioItems
 *   Items to execute; `template` and `resolvedAssertions` are passed through
 *   to `callbacks.runScenario` untouched.
 * @param {object} evalData
 *   Reads `evalRun.runsPerScenario` (falls back to 1 when null/undefined),
 *   `evalRun.presetId`, `evalRun.id`, `agent?.id`, `agent?.name` and
 *   `presetName`.
 * @param {object} callbacks
 * @param {(current: number, total: number, iterLabel: string) => void} callbacks.onProgress
 *   Called before each iteration with the 1-based position of that iteration.
 * @param {(scenario: *, template: *, resolvedAssertions: *) => Promise<*>} callbacks.runScenario
 *   Executes one iteration; a `null` result means "nothing to record".
 * @param {(result: object) => Promise<void>} callbacks.addResult
 *   Persists a result; every recorded result carries `iterationIndex`.
 * @returns {Promise<{completedExecutions: number, totalExecutions: number}>}
 *   `completedExecutions` counts every iteration attempted, including failed
 *   ones and ones that produced `null`.
 */
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
  let completedExecutions = 0;
  const totalExecutions = scenarioItems.length * runsPerScenario;
  // Identity attached to synthesized error results: preset first, then agent,
  // then the eval run itself as a last resort.
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";

  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
    for (let iterationIndex = 0; iterationIndex < runsPerScenario; iterationIndex++) {
      // The label is only shown when a scenario is actually repeated.
      const iterLabel = runsPerScenario > 1 ? ` [run ${iterationIndex + 1}/${runsPerScenario}]` : "";
      callbacks.onProgress(completedExecutions + 1, totalExecutions, iterLabel);

      let scenarioResult = null;
      try {
        scenarioResult = await callbacks.runScenario(
          scenario,
          template,
          resolvedAssertions
        );
      } catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        console.error(
          `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
        );
        // Keep the stack in the logs; the message alone rarely locates the fault.
        if (err instanceof Error && err.stack) {
          console.error("[Evaluator] Stack trace:", err.stack);
        }
        // One timestamp for both fields so they always agree with `duration: 0`
        // (two separate Date reads can straddle a millisecond boundary).
        const failedAt = new Date().toISOString();
        const errorResult = {
          id: randomUUID5(),
          targetId,
          targetName,
          scenarioId: scenario.id,
          scenarioName: scenario.name,
          assertionResults: [],
          passed: 0,
          failed: 0,
          passRate: 0,
          duration: 0,
          outputText: `Execution error: ${errorMsg}`,
          startedAt: failedAt,
          completedAt: failedAt,
          iterationIndex
        };
        await callbacks.addResult(errorResult);
      }

      // On failure scenarioResult is still null, so nothing is recorded twice.
      if (scenarioResult !== null) {
        await callbacks.addResult({ ...scenarioResult, iterationIndex });
      }
      completedExecutions++;
    }
  }

  return { completedExecutions, totalExecutions };
}
|
|
4740
|
+
|
|
4689
4741
|
// src/error-reporter.ts
|
|
4690
4742
|
import { EvalStatus } from "@wix/evalforge-types";
|
|
4691
4743
|
function formatError(error, phase, context) {
|
|
@@ -4866,69 +4918,57 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
4866
4918
|
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
4867
4919
|
);
|
|
4868
4920
|
}
|
|
4869
|
-
|
|
4870
|
-
const
|
|
4871
|
-
|
|
4872
|
-
|
|
4873
|
-
|
|
4874
|
-
|
|
4875
|
-
|
|
4876
|
-
|
|
4877
|
-
|
|
4878
|
-
|
|
4879
|
-
|
|
4880
|
-
|
|
4881
|
-
|
|
4882
|
-
|
|
4883
|
-
|
|
4884
|
-
|
|
4885
|
-
|
|
4886
|
-
|
|
4887
|
-
|
|
4888
|
-
|
|
4889
|
-
|
|
4890
|
-
|
|
4891
|
-
|
|
4892
|
-
|
|
4893
|
-
|
|
4894
|
-
|
|
4895
|
-
|
|
4896
|
-
|
|
4897
|
-
|
|
4898
|
-
|
|
4899
|
-
|
|
4900
|
-
|
|
4901
|
-
|
|
4902
|
-
|
|
4903
|
-
|
|
4904
|
-
|
|
4905
|
-
|
|
4906
|
-
|
|
4907
|
-
|
|
4908
|
-
|
|
4909
|
-
|
|
4910
|
-
|
|
4911
|
-
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
4912
|
-
console.error(
|
|
4913
|
-
"[Evaluator] Failed to run scenario with preset:",
|
|
4914
|
-
evalData.presetName,
|
|
4915
|
-
"Error:",
|
|
4916
|
-
errorMsg
|
|
4917
|
-
);
|
|
4918
|
-
if (errorStack) {
|
|
4919
|
-
console.error("[Evaluator] Stack trace:", errorStack);
|
|
4921
|
+
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
4922
|
+
const { completedExecutions, totalExecutions } = await runEvaluationLoop(
|
|
4923
|
+
scenarioItems,
|
|
4924
|
+
evalData,
|
|
4925
|
+
{
|
|
4926
|
+
runScenario: (scenario, template, resolvedAssertions) => {
|
|
4927
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
4928
|
+
state.currentContext = {
|
|
4929
|
+
projectId: projectId2,
|
|
4930
|
+
evalRunId: evalRunId2,
|
|
4931
|
+
scenarioId: scenario.id,
|
|
4932
|
+
scenarioName: scenario.name,
|
|
4933
|
+
presetId: evalData.evalRun.presetId,
|
|
4934
|
+
presetName: evalData.presetName,
|
|
4935
|
+
agentId: agent?.id,
|
|
4936
|
+
agentName: agent?.name
|
|
4937
|
+
};
|
|
4938
|
+
return runScenario(
|
|
4939
|
+
config,
|
|
4940
|
+
evalRunId2,
|
|
4941
|
+
scenario,
|
|
4942
|
+
evalData,
|
|
4943
|
+
template,
|
|
4944
|
+
resolvedAssertions
|
|
4945
|
+
);
|
|
4946
|
+
},
|
|
4947
|
+
addResult: async (result) => {
|
|
4948
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
4949
|
+
state.currentContext = {
|
|
4950
|
+
...state.currentContext,
|
|
4951
|
+
resultId: result.id
|
|
4952
|
+
};
|
|
4953
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
4954
|
+
},
|
|
4955
|
+
onProgress: (completed, total, iterLabel) => {
|
|
4956
|
+
console.log(
|
|
4957
|
+
"[Evaluator] Running scenario with preset:",
|
|
4958
|
+
evalData.presetName,
|
|
4959
|
+
skillNames ? `(${skillNames})` : "",
|
|
4960
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
4961
|
+
`(${completed}/${total})${iterLabel}`
|
|
4962
|
+
);
|
|
4920
4963
|
}
|
|
4921
|
-
throw new Error(
|
|
4922
|
-
`[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
|
|
4923
|
-
);
|
|
4924
4964
|
}
|
|
4925
|
-
|
|
4965
|
+
);
|
|
4926
4966
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
4927
4967
|
state.currentContext = {
|
|
4928
4968
|
projectId: projectId2,
|
|
4929
4969
|
evalRunId: evalRunId2,
|
|
4930
|
-
|
|
4931
|
-
|
|
4970
|
+
completedExecutions,
|
|
4971
|
+
totalExecutions
|
|
4932
4972
|
};
|
|
4933
4973
|
try {
|
|
4934
4974
|
await api.updateEvalRun(projectId2, evalRunId2, {
|