@wix/evalforge-evaluator 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +203 -23
- package/build/index.js.map +4 -4
- package/build/index.mjs +203 -23
- package/build/index.mjs.map +4 -4
- package/build/types/error-reporter.d.ts +77 -0
- package/build/types/index.d.ts +7 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -6620,6 +6620,67 @@ async function runScenario(config, evalRunId2, scenario, target, template) {
|
|
|
6620
6620
|
};
|
|
6621
6621
|
}
|
|
6622
6622
|
|
|
6623
|
+
// src/error-reporter.ts
|
|
6624
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
6625
|
+
/**
 * Normalizes any thrown value into a plain error-details record.
 *
 * Three shapes of input are handled:
 *  - real `Error` instances: message, stack and class name are copied over;
 *  - error-like objects (non-null objects with a string `message`), such as
 *    rejections from another realm or plain `{ message, stack }` payloads:
 *    their `message` is used instead of the useless "[object Object]";
 *  - everything else: the value is stringified and `typeof` is reported.
 *
 * @param {unknown} error - The value that was thrown or rejected.
 * @param {string} [phase] - Label of the execution phase that failed.
 * @param {object} [context] - Extra diagnostic key/value pairs, passed through as-is.
 * @returns {{message: string, stack?: string, errorType: string, phase: *, context: *, timestamp: string}}
 *   Record stamped with the current time in ISO-8601 form.
 */
function formatError(error, phase, context) {
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  if (error instanceof Error) {
    return {
      message: error.message,
      stack: error.stack,
      // Prefer the class name; fall back to `name` if `constructor` was
      // tampered with or belongs to an anonymous class.
      errorType: error.constructor?.name || error.name || "Error",
      phase,
      context,
      timestamp
    };
  }
  const isErrorLike = typeof error === "object" && error !== null && typeof error.message === "string";
  if (isErrorLike) {
    const details = {
      message: error.message,
      errorType: typeof error.name === "string" && error.name ? error.name : typeof error,
      phase,
      context,
      timestamp
    };
    if (typeof error.stack === "string") {
      details.stack = error.stack;
    }
    return details;
  }
  let message;
  try {
    message = String(error);
  } catch (stringifyErr) {
    // String() throws for objects with no usable toString/valueOf
    // (e.g. Object.create(null)); never let error reporting itself throw.
    message = Object.prototype.toString.call(error);
  }
  return {
    message,
    errorType: typeof error,
    phase,
    context,
    timestamp
  };
}
|
|
6645
|
+
/**
 * Renders an error-details record as a single human-readable string.
 *
 * Layout (segments joined by a single space):
 *   "[Phase: <phase>]"            - only when `details.phase` is truthy
 *   "<errorType>: <message>"      - or just "<message>" when the type is
 *                                   missing or the generic "Error"
 *   "\nContext: <json>"           - only when `details.context` has keys
 *   "\nStack:\n<first 6 lines>"   - only when a stack string is present
 *
 * This function never throws on unserializable context: it is meant to run
 * inside failure handlers, where a secondary exception would mask the
 * original error.
 *
 * @param {{message: string, errorType?: string, phase?: string, context?: object, stack?: string}} details
 * @returns {string} The formatted description.
 */
function formatErrorForJobError(details) {
  const parts = [];
  if (details.phase) {
    parts.push(`[Phase: ${details.phase}]`);
  }
  if (details.errorType && details.errorType !== "Error") {
    parts.push(`${details.errorType}: ${details.message}`);
  } else {
    parts.push(details.message);
  }
  if (details.context && Object.keys(details.context).length > 0) {
    let serializedContext;
    try {
      serializedContext = JSON.stringify(details.context);
    } catch (serializeErr) {
      // JSON.stringify throws on circular references and BigInt values.
      const reason = serializeErr instanceof Error ? serializeErr.message : String(serializeErr);
      serializedContext = `[unserializable context: ${reason}]`;
    }
    parts.push(`\nContext: ${serializedContext}`);
  }
  if (typeof details.stack === "string" && details.stack) {
    // Keep the output compact: only the first six lines of the stack.
    const stackLines = details.stack.split("\n").slice(0, 6);
    parts.push(`\nStack:\n${stackLines.join("\n")}`);
  }
  return parts.join(" ");
}
|
|
6667
|
+
/**
 * Labels for the stages of an evaluation run. They are embedded in thrown
 * error messages as a "[<phase>]" prefix so a failure can be attributed to
 * a stage. Frozen so that no caller can alter the shared table at runtime.
 *
 * Declared with `var` to match the bundler's top-level declaration style.
 */
var ExecutionPhase = Object.freeze({
  CONFIG: "config-loading",
  API_CLIENT: "api-client-creation",
  FETCH_EVAL_RUN: "fetch-eval-run",
  FETCH_SKILLS: "fetch-skills",
  FETCH_AGENT: "fetch-agent",
  FETCH_SCENARIOS: "fetch-scenarios",
  VALIDATION: "validation",
  PREPARE_WORKSPACE: "prepare-workspace",
  EXECUTE_SKILL: "execute-skill",
  EXECUTE_AGENT: "execute-agent",
  CLAUDE_SDK_IMPORT: "claude-sdk-import",
  CLAUDE_SDK_EXECUTION: "claude-sdk-execution",
  ADD_RESULT: "add-result",
  UPDATE_STATUS: "update-status"
});
|
|
6683
|
+
|
|
6623
6684
|
// src/index.ts
|
|
6624
6685
|
console.error(
|
|
6625
6686
|
"[EVALUATOR-BOOT] Module loading started",
|
|
@@ -6627,13 +6688,22 @@ console.error(
|
|
|
6627
6688
|
);
|
|
6628
6689
|
console.error("[EVALUATOR-BOOT] All static imports successful");
|
|
6629
6690
|
async function runEvaluation(projectId2, evalRunId2) {
|
|
6691
|
+
const state = {
|
|
6692
|
+
config: null,
|
|
6693
|
+
api: null,
|
|
6694
|
+
currentPhase: ExecutionPhase.CONFIG,
|
|
6695
|
+
currentContext: { projectId: projectId2, evalRunId: evalRunId2 }
|
|
6696
|
+
};
|
|
6630
6697
|
console.error(
|
|
6631
6698
|
"[DEBUG-H1] runEvaluation entry",
|
|
6632
6699
|
JSON.stringify({ projectId: projectId2, evalRunId: evalRunId2, timestamp: Date.now() })
|
|
6633
6700
|
);
|
|
6701
|
+
state.currentPhase = ExecutionPhase.CONFIG;
|
|
6702
|
+
state.currentContext = { projectId: projectId2, evalRunId: evalRunId2 };
|
|
6634
6703
|
let config;
|
|
6635
6704
|
try {
|
|
6636
6705
|
config = loadConfig();
|
|
6706
|
+
state.config = config;
|
|
6637
6707
|
console.error(
|
|
6638
6708
|
"[DEBUG-H1] loadConfig SUCCESS",
|
|
6639
6709
|
JSON.stringify({
|
|
@@ -6649,10 +6719,13 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
6649
6719
|
"[DEBUG-H1] loadConfig FAILED",
|
|
6650
6720
|
JSON.stringify({
|
|
6651
6721
|
error: configErr instanceof Error ? configErr.message : String(configErr),
|
|
6722
|
+
stack: configErr instanceof Error ? configErr.stack : void 0,
|
|
6652
6723
|
timestamp: Date.now()
|
|
6653
6724
|
})
|
|
6654
6725
|
);
|
|
6655
|
-
throw
|
|
6726
|
+
throw new Error(
|
|
6727
|
+
`[${ExecutionPhase.CONFIG}] ${configErr instanceof Error ? configErr.message : String(configErr)}`
|
|
6728
|
+
);
|
|
6656
6729
|
}
|
|
6657
6730
|
console.log("[Evaluator] Config loaded", {
|
|
6658
6731
|
serverUrl: config.serverUrl,
|
|
@@ -6661,11 +6734,22 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
6661
6734
|
hasAiGatewayHeaders: Object.keys(config.aiGatewayHeaders).length > 0,
|
|
6662
6735
|
hasRouteHeader: !!config.routeHeader
|
|
6663
6736
|
});
|
|
6664
|
-
|
|
6665
|
-
|
|
6666
|
-
|
|
6667
|
-
|
|
6668
|
-
|
|
6737
|
+
state.currentPhase = ExecutionPhase.API_CLIENT;
|
|
6738
|
+
let api;
|
|
6739
|
+
try {
|
|
6740
|
+
api = createApiClient(config.serverUrl, {
|
|
6741
|
+
apiPrefix: config.apiPrefix,
|
|
6742
|
+
routeHeader: config.routeHeader,
|
|
6743
|
+
authToken: config.authToken
|
|
6744
|
+
});
|
|
6745
|
+
state.api = api;
|
|
6746
|
+
} catch (apiErr) {
|
|
6747
|
+
throw new Error(
|
|
6748
|
+
`[${ExecutionPhase.API_CLIENT}] Failed to create API client: ${apiErr instanceof Error ? apiErr.message : String(apiErr)}`
|
|
6749
|
+
);
|
|
6750
|
+
}
|
|
6751
|
+
state.currentPhase = ExecutionPhase.FETCH_EVAL_RUN;
|
|
6752
|
+
state.currentContext = { projectId: projectId2, evalRunId: evalRunId2, serverUrl: config.serverUrl };
|
|
6669
6753
|
console.error(
|
|
6670
6754
|
"[DEBUG-H2] fetchEvaluationData START",
|
|
6671
6755
|
JSON.stringify({ serverUrl: config.serverUrl, timestamp: Date.now() })
|
|
@@ -6684,32 +6768,61 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
6684
6768
|
})
|
|
6685
6769
|
);
|
|
6686
6770
|
} catch (fetchErr) {
|
|
6771
|
+
const errorMsg = fetchErr instanceof Error ? fetchErr.message : String(fetchErr);
|
|
6687
6772
|
console.error(
|
|
6688
6773
|
"[DEBUG-H2] fetchEvaluationData FAILED",
|
|
6689
6774
|
JSON.stringify({
|
|
6690
|
-
error:
|
|
6775
|
+
error: errorMsg,
|
|
6776
|
+
stack: fetchErr instanceof Error ? fetchErr.stack : void 0,
|
|
6691
6777
|
timestamp: Date.now()
|
|
6692
6778
|
})
|
|
6693
6779
|
);
|
|
6694
|
-
throw
|
|
6780
|
+
throw new Error(
|
|
6781
|
+
`[${ExecutionPhase.FETCH_EVAL_RUN}] Failed to fetch evaluation data: ${errorMsg}`
|
|
6782
|
+
);
|
|
6695
6783
|
}
|
|
6696
6784
|
const { codeAgent, skills, scenarioItems } = evalData;
|
|
6785
|
+
state.currentPhase = ExecutionPhase.VALIDATION;
|
|
6786
|
+
state.currentContext = {
|
|
6787
|
+
projectId: projectId2,
|
|
6788
|
+
evalRunId: evalRunId2,
|
|
6789
|
+
scenarioCount: scenarioItems.length,
|
|
6790
|
+
skillCount: skills.length,
|
|
6791
|
+
hasAgent: !!codeAgent,
|
|
6792
|
+
agentId: evalData.evalRun.agentId,
|
|
6793
|
+
skillsGroupId: evalData.evalRun.skillsGroupId
|
|
6794
|
+
};
|
|
6697
6795
|
if (scenarioItems.length > 0 && skills.length === 0) {
|
|
6698
6796
|
throw new Error(
|
|
6699
|
-
|
|
6797
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
|
|
6700
6798
|
);
|
|
6701
6799
|
}
|
|
6702
6800
|
if (scenarioItems.length > 0 && skills.length > 0 && !codeAgent) {
|
|
6703
6801
|
throw new Error(
|
|
6704
|
-
|
|
6802
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no code agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
6705
6803
|
);
|
|
6706
6804
|
}
|
|
6805
|
+
let completedScenarios = 0;
|
|
6806
|
+
const totalScenarios = scenarioItems.length * skills.length;
|
|
6707
6807
|
for (const { scenario, template } of scenarioItems) {
|
|
6708
6808
|
for (const skill of skills) {
|
|
6809
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
6810
|
+
state.currentContext = {
|
|
6811
|
+
projectId: projectId2,
|
|
6812
|
+
evalRunId: evalRunId2,
|
|
6813
|
+
scenarioId: scenario.id,
|
|
6814
|
+
scenarioName: scenario.name,
|
|
6815
|
+
skillId: skill.id,
|
|
6816
|
+
skillName: skill.name,
|
|
6817
|
+
agentId: codeAgent?.id,
|
|
6818
|
+
agentName: codeAgent?.name,
|
|
6819
|
+
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
6820
|
+
};
|
|
6709
6821
|
console.log(
|
|
6710
6822
|
"[Evaluator] Running skill:",
|
|
6711
6823
|
skill.name,
|
|
6712
|
-
codeAgent ? `with agent: ${codeAgent.name}` : ""
|
|
6824
|
+
codeAgent ? `with agent: ${codeAgent.name}` : "",
|
|
6825
|
+
`(${completedScenarios + 1}/${totalScenarios})`
|
|
6713
6826
|
);
|
|
6714
6827
|
try {
|
|
6715
6828
|
const result = await runScenario(
|
|
@@ -6720,17 +6833,48 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
6720
6833
|
template
|
|
6721
6834
|
);
|
|
6722
6835
|
console.log("[Evaluator] Skill completed, adding result");
|
|
6836
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
6837
|
+
state.currentContext = {
|
|
6838
|
+
...state.currentContext,
|
|
6839
|
+
resultId: result.id
|
|
6840
|
+
};
|
|
6723
6841
|
await api.addResult(projectId2, evalRunId2, result);
|
|
6842
|
+
completedScenarios++;
|
|
6724
6843
|
} catch (err) {
|
|
6725
|
-
|
|
6726
|
-
|
|
6844
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
6845
|
+
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
6846
|
+
console.error(
|
|
6847
|
+
"[Evaluator] Failed to run skill:",
|
|
6848
|
+
skill.name,
|
|
6849
|
+
"Error:",
|
|
6850
|
+
errorMsg
|
|
6851
|
+
);
|
|
6852
|
+
if (errorStack) {
|
|
6853
|
+
console.error("[Evaluator] Stack trace:", errorStack);
|
|
6854
|
+
}
|
|
6855
|
+
throw new Error(
|
|
6856
|
+
`[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
|
|
6857
|
+
);
|
|
6727
6858
|
}
|
|
6728
6859
|
}
|
|
6729
6860
|
}
|
|
6730
|
-
|
|
6731
|
-
|
|
6732
|
-
|
|
6733
|
-
|
|
6861
|
+
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
6862
|
+
state.currentContext = {
|
|
6863
|
+
projectId: projectId2,
|
|
6864
|
+
evalRunId: evalRunId2,
|
|
6865
|
+
completedScenarios,
|
|
6866
|
+
totalScenarios
|
|
6867
|
+
};
|
|
6868
|
+
try {
|
|
6869
|
+
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
6870
|
+
status: import_evalforge_types4.EvalStatus.COMPLETED,
|
|
6871
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
6872
|
+
});
|
|
6873
|
+
} catch (updateErr) {
|
|
6874
|
+
throw new Error(
|
|
6875
|
+
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
|
|
6876
|
+
);
|
|
6877
|
+
}
|
|
6734
6878
|
}
|
|
6735
6879
|
var projectId = process.argv[2];
|
|
6736
6880
|
var evalRunId = process.argv[3];
|
|
@@ -6746,7 +6890,16 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
6746
6890
|
console.error("[EVALUATOR-BOOT] runEvaluation completed successfully");
|
|
6747
6891
|
process.exit(0);
|
|
6748
6892
|
}).catch(async (err) => {
|
|
6749
|
-
|
|
6893
|
+
const errorDetails = formatError(err, "main-execution", {
|
|
6894
|
+
projectId,
|
|
6895
|
+
evalRunId
|
|
6896
|
+
});
|
|
6897
|
+
const jobError = formatErrorForJobError(errorDetails);
|
|
6898
|
+
console.error("[EVALUATOR-BOOT] runEvaluation FAILED");
|
|
6899
|
+
console.error(
|
|
6900
|
+
"[EVALUATOR-BOOT] Error details:",
|
|
6901
|
+
JSON.stringify(errorDetails, null, 2)
|
|
6902
|
+
);
|
|
6750
6903
|
try {
|
|
6751
6904
|
const config = loadConfig();
|
|
6752
6905
|
const api = createApiClient(config.serverUrl, {
|
|
@@ -6755,15 +6908,42 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
6755
6908
|
authToken: config.authToken
|
|
6756
6909
|
});
|
|
6757
6910
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
6758
|
-
status:
|
|
6759
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
6911
|
+
status: import_evalforge_types4.EvalStatus.FAILED,
|
|
6912
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6913
|
+
jobError,
|
|
6914
|
+
jobStatus: "FAILED"
|
|
6760
6915
|
});
|
|
6761
|
-
console.error(
|
|
6916
|
+
console.error(
|
|
6917
|
+
"[EVALUATOR-BOOT] Updated eval run status to FAILED with error details"
|
|
6918
|
+
);
|
|
6762
6919
|
} catch (updateErr) {
|
|
6763
6920
|
console.error(
|
|
6764
6921
|
"[EVALUATOR-BOOT] Failed to update eval run status:",
|
|
6765
|
-
updateErr
|
|
6922
|
+
updateErr instanceof Error ? updateErr.message : String(updateErr)
|
|
6766
6923
|
);
|
|
6924
|
+
try {
|
|
6925
|
+
const serverUrl = process.env.EVAL_SERVER_URL;
|
|
6926
|
+
const authToken = process.env.EVAL_AUTH_TOKEN;
|
|
6927
|
+
const routeHeader = process.env.EVAL_ROUTE_HEADER;
|
|
6928
|
+
if (serverUrl) {
|
|
6929
|
+
const api = createApiClient(serverUrl, {
|
|
6930
|
+
routeHeader,
|
|
6931
|
+
authToken
|
|
6932
|
+
});
|
|
6933
|
+
await api.updateEvalRun(projectId, evalRunId, {
|
|
6934
|
+
status: import_evalforge_types4.EvalStatus.FAILED,
|
|
6935
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6936
|
+
jobError: `Config load failed, then: ${jobError}`,
|
|
6937
|
+
jobStatus: "FAILED"
|
|
6938
|
+
});
|
|
6939
|
+
console.error("[EVALUATOR-BOOT] Fallback: Updated status to FAILED");
|
|
6940
|
+
}
|
|
6941
|
+
} catch (fallbackErr) {
|
|
6942
|
+
console.error(
|
|
6943
|
+
"[EVALUATOR-BOOT] Fallback also failed:",
|
|
6944
|
+
fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)
|
|
6945
|
+
);
|
|
6946
|
+
}
|
|
6767
6947
|
}
|
|
6768
6948
|
process.exit(1);
|
|
6769
6949
|
});
|