@wix/evalforge-evaluator 0.132.0 → 0.134.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +102 -62
- package/build/index.js.map +4 -4
- package/build/index.mjs +102 -62
- package/build/index.mjs.map +4 -4
- package/build/types/evaluation-loop.d.ts +23 -0
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -3466,9 +3466,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3466
3466
|
});
|
|
3467
3467
|
for (let i = 0; i < steps.length; i++) {
|
|
3468
3468
|
const step = steps[i];
|
|
3469
|
-
const
|
|
3470
|
-
|
|
3471
|
-
).toISOString();
|
|
3469
|
+
const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
|
|
3470
|
+
const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
|
|
3471
|
+
const assistantTimestamp = new Date(stepStartedAt).toISOString();
|
|
3472
|
+
const toolResultTimestamp = new Date(stepFinishedAt).toISOString();
|
|
3472
3473
|
const assistantContent = [];
|
|
3473
3474
|
if (step.reasoningText) {
|
|
3474
3475
|
assistantContent.push({ type: "thinking", thinking: step.reasoningText });
|
|
@@ -3488,7 +3489,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3488
3489
|
messages.push({
|
|
3489
3490
|
role: "assistant",
|
|
3490
3491
|
content: assistantContent,
|
|
3491
|
-
timestamp:
|
|
3492
|
+
timestamp: assistantTimestamp
|
|
3492
3493
|
});
|
|
3493
3494
|
}
|
|
3494
3495
|
if (step.toolResults.length > 0) {
|
|
@@ -3505,7 +3506,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3505
3506
|
messages.push({
|
|
3506
3507
|
role: "user",
|
|
3507
3508
|
content: resultBlocks,
|
|
3508
|
-
timestamp:
|
|
3509
|
+
timestamp: toolResultTimestamp
|
|
3509
3510
|
});
|
|
3510
3511
|
}
|
|
3511
3512
|
}
|
|
@@ -4678,6 +4679,57 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
4678
4679
|
};
|
|
4679
4680
|
}
|
|
4680
4681
|
|
|
4682
|
+
// src/evaluation-loop.ts
|
|
4683
|
+
var import_crypto5 = require("crypto");
|
|
4684
|
+
/**
 * Runs every scenario item `runsPerScenario` times, sequentially, and reports
 * each outcome through `callbacks.addResult`.
 *
 * A failure thrown by `callbacks.runScenario` does not abort the loop: it is
 * logged and recorded as a synthetic error result (no assertion results,
 * zero counts, `outputText` carrying the error message), after which the
 * loop continues with the next iteration. Errors thrown by
 * `callbacks.addResult` or `callbacks.onProgress` are NOT caught here and
 * propagate to the caller.
 *
 * @param {Array<{scenario: object, template: *, resolvedAssertions: *}>} scenarioItems
 *   Items to execute; `scenario.id` and `scenario.name` are read for error results.
 * @param {object} evalData
 *   Reads `evalRun.runsPerScenario` (defaults to 1 when nullish), `evalRun.presetId`,
 *   `evalRun.id`, `presetName`, and the optional `agent.id` / `agent.name`.
 * @param {{runScenario: Function, addResult: Function, onProgress: Function}} callbacks
 *   `runScenario(scenario, template, resolvedAssertions)` produces a result,
 *   `addResult(result)` persists it, and
 *   `onProgress(current, total, iterLabel)` is invoked before each iteration.
 * @returns {Promise<{completedExecutions: number, totalExecutions: number}>}
 */
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
  const totalExecutions = scenarioItems.length * runsPerScenario;
  // Target identity used for synthetic error results: preset first, then agent,
  // falling back to the eval run id.
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
  const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
  let completedExecutions = 0;

  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
    for (let iterationIndex = 0; iterationIndex < runsPerScenario; iterationIndex++) {
      // The label is only shown when a scenario is executed more than once.
      const iterLabel = runsPerScenario > 1 ? ` [run ${iterationIndex + 1}/${runsPerScenario}]` : "";
      callbacks.onProgress(completedExecutions + 1, totalExecutions, iterLabel);

      let scenarioResult = null;
      try {
        scenarioResult = await callbacks.runScenario(
          scenario,
          template,
          resolvedAssertions
        );
      } catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        const errorStack = err instanceof Error ? err.stack : void 0;
        console.error(
          `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
        );
        // Keep the stack in the logs; the message alone rarely pinpoints the failure.
        if (errorStack) {
          console.error("[Evaluator] Stack trace:", errorStack);
        }
        // One timestamp for both fields so the record agrees with `duration: 0`.
        const failedAt = (/* @__PURE__ */ new Date()).toISOString();
        const errorResult = {
          id: (0, import_crypto5.randomUUID)(),
          targetId,
          targetName,
          scenarioId: scenario.id,
          scenarioName: scenario.name,
          assertionResults: [],
          passed: 0,
          failed: 0,
          passRate: 0,
          duration: 0,
          outputText: `Execution error: ${errorMsg}`,
          startedAt: failedAt,
          completedAt: failedAt,
          iterationIndex
        };
        await callbacks.addResult(errorResult);
      }

      // Successful runs are tagged with the iteration they belong to.
      if (scenarioResult !== null) {
        await callbacks.addResult({ ...scenarioResult, iterationIndex });
      }
      completedExecutions++;
    }
  }

  return { completedExecutions, totalExecutions };
}
|
|
4732
|
+
|
|
4681
4733
|
// src/error-reporter.ts
|
|
4682
4734
|
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
4683
4735
|
function formatError(error, phase, context) {
|
|
@@ -4858,69 +4910,57 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
4858
4910
|
`[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
4859
4911
|
);
|
|
4860
4912
|
}
|
|
4861
|
-
|
|
4862
|
-
const
|
|
4863
|
-
|
|
4864
|
-
|
|
4865
|
-
|
|
4866
|
-
|
|
4867
|
-
|
|
4868
|
-
|
|
4869
|
-
|
|
4870
|
-
|
|
4871
|
-
|
|
4872
|
-
|
|
4873
|
-
|
|
4874
|
-
|
|
4875
|
-
|
|
4876
|
-
|
|
4877
|
-
|
|
4878
|
-
|
|
4879
|
-
|
|
4880
|
-
|
|
4881
|
-
|
|
4882
|
-
|
|
4883
|
-
|
|
4884
|
-
|
|
4885
|
-
|
|
4886
|
-
|
|
4887
|
-
|
|
4888
|
-
|
|
4889
|
-
|
|
4890
|
-
|
|
4891
|
-
|
|
4892
|
-
|
|
4893
|
-
|
|
4894
|
-
|
|
4895
|
-
|
|
4896
|
-
|
|
4897
|
-
|
|
4898
|
-
|
|
4899
|
-
|
|
4900
|
-
|
|
4901
|
-
|
|
4902
|
-
|
|
4903
|
-
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
4904
|
-
console.error(
|
|
4905
|
-
"[Evaluator] Failed to run scenario with preset:",
|
|
4906
|
-
evalData.presetName,
|
|
4907
|
-
"Error:",
|
|
4908
|
-
errorMsg
|
|
4909
|
-
);
|
|
4910
|
-
if (errorStack) {
|
|
4911
|
-
console.error("[Evaluator] Stack trace:", errorStack);
|
|
4913
|
+
const skillNames = evalData.skills.map((s) => s.name).join(", ");
|
|
4914
|
+
const { completedExecutions, totalExecutions } = await runEvaluationLoop(
|
|
4915
|
+
scenarioItems,
|
|
4916
|
+
evalData,
|
|
4917
|
+
{
|
|
4918
|
+
runScenario: (scenario, template, resolvedAssertions) => {
|
|
4919
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
4920
|
+
state.currentContext = {
|
|
4921
|
+
projectId: projectId2,
|
|
4922
|
+
evalRunId: evalRunId2,
|
|
4923
|
+
scenarioId: scenario.id,
|
|
4924
|
+
scenarioName: scenario.name,
|
|
4925
|
+
presetId: evalData.evalRun.presetId,
|
|
4926
|
+
presetName: evalData.presetName,
|
|
4927
|
+
agentId: agent?.id,
|
|
4928
|
+
agentName: agent?.name
|
|
4929
|
+
};
|
|
4930
|
+
return runScenario(
|
|
4931
|
+
config,
|
|
4932
|
+
evalRunId2,
|
|
4933
|
+
scenario,
|
|
4934
|
+
evalData,
|
|
4935
|
+
template,
|
|
4936
|
+
resolvedAssertions
|
|
4937
|
+
);
|
|
4938
|
+
},
|
|
4939
|
+
addResult: async (result) => {
|
|
4940
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
4941
|
+
state.currentContext = {
|
|
4942
|
+
...state.currentContext,
|
|
4943
|
+
resultId: result.id
|
|
4944
|
+
};
|
|
4945
|
+
await api.addResult(projectId2, evalRunId2, result);
|
|
4946
|
+
},
|
|
4947
|
+
onProgress: (completed, total, iterLabel) => {
|
|
4948
|
+
console.log(
|
|
4949
|
+
"[Evaluator] Running scenario with preset:",
|
|
4950
|
+
evalData.presetName,
|
|
4951
|
+
skillNames ? `(${skillNames})` : "",
|
|
4952
|
+
agent ? `with agent: ${agent.name}` : "",
|
|
4953
|
+
`(${completed}/${total})${iterLabel}`
|
|
4954
|
+
);
|
|
4912
4955
|
}
|
|
4913
|
-
throw new Error(
|
|
4914
|
-
`[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
|
|
4915
|
-
);
|
|
4916
4956
|
}
|
|
4917
|
-
|
|
4957
|
+
);
|
|
4918
4958
|
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
4919
4959
|
state.currentContext = {
|
|
4920
4960
|
projectId: projectId2,
|
|
4921
4961
|
evalRunId: evalRunId2,
|
|
4922
|
-
|
|
4923
|
-
|
|
4962
|
+
completedExecutions,
|
|
4963
|
+
totalExecutions
|
|
4924
4964
|
};
|
|
4925
4965
|
try {
|
|
4926
4966
|
await api.updateEvalRun(projectId2, evalRunId2, {
|