@wix/evalforge-evaluator 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +203 -23
- package/build/index.js.map +4 -4
- package/build/index.mjs +203 -23
- package/build/index.mjs.map +4 -4
- package/build/types/error-reporter.d.ts +77 -0
- package/build/types/index.d.ts +7 -0
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import { EvalStatus } from "@wix/evalforge-types";
|
|
4
|
+
import { EvalStatus as EvalStatus2 } from "@wix/evalforge-types";
|
|
5
5
|
|
|
6
6
|
// src/config.ts
|
|
7
7
|
function loadConfig() {
|
|
@@ -6603,6 +6603,67 @@ async function runScenario(config, evalRunId2, scenario, target, template) {
|
|
|
6603
6603
|
};
|
|
6604
6604
|
}
|
|
6605
6605
|
|
|
6606
|
+
// src/error-reporter.ts
|
|
6607
|
+
import { EvalStatus } from "@wix/evalforge-types";
|
|
6608
|
+
/**
 * Produce a human-readable description of a thrown value that is not an Error.
 * Never throws: an error formatter that itself throws would hide the failure
 * it was asked to report.
 *
 * @param {unknown} value - The thrown/rejected value.
 * @returns {string} `String(value)` when that is informative, otherwise the
 *   value's JSON form, otherwise its `Object.prototype.toString` tag.
 */
function describeThrownValue(value) {
  try {
    const text = String(value);
    // "[object Object]" carries no information; try JSON below instead.
    if (text !== "[object Object]") {
      return text;
    }
  } catch {
    // String() throws for null-prototype objects and for throwing toString();
    // fall through to the other strategies.
  }
  try {
    const json = JSON.stringify(value);
    if (json !== void 0) {
      return json;
    }
  } catch {
    // Circular structures and BigInt members make JSON.stringify throw.
  }
  return Object.prototype.toString.call(value);
}

/**
 * Normalize any thrown value into a structured error-details record.
 *
 * @param {unknown} error - The caught value; may or may not be an Error.
 * @param {string} phase - Label of the execution phase in which it was caught.
 * @param {object} [context] - Extra key/value data to attach to the record.
 * @returns {{message: string, stack?: string, errorType: string, phase: string,
 *   context: object|undefined, timestamp: string}} Details record. `stack` is
 *   only present when `error` is an Error instance; `timestamp` is the ISO-8601
 *   time at which this function ran.
 */
function formatError(error, phase, context) {
  const timestamp = new Date().toISOString();
  if (error instanceof Error) {
    return {
      message: error.message,
      stack: error.stack,
      // Prefer the constructor name (keeps subclass names even when `name` was
      // never set), but tolerate a missing constructor or an anonymous class.
      errorType: error.constructor?.name || error.name || "Error",
      phase,
      context,
      timestamp
    };
  }
  return {
    message: describeThrownValue(error),
    errorType: typeof error,
    phase,
    context,
    timestamp
  };
}
|
|
6628
|
+
/**
 * Flatten an error-details record into one string suitable for a job error field.
 *
 * Layout (sections joined by a single space):
 *   `[Phase: <phase>] <errorType>: <message> \nContext: <json> \nStack:\n<first 6 stack lines>`
 * The phase, context and stack sections are omitted when absent/empty, and the
 * `<errorType>: ` prefix is omitted when the type is missing or the generic "Error".
 *
 * This function is used while reporting a failure, so it must not throw:
 * unserializable context and non-string stacks are tolerated.
 *
 * @param {{message: string, errorType?: string, phase?: string,
 *   context?: object, stack?: string}} details - Record to render.
 * @returns {string} The rendered error text.
 */
function formatErrorForJobError(details) {
  const parts = [];
  if (details.phase) {
    parts.push(`[Phase: ${details.phase}]`);
  }
  if (details.errorType && details.errorType !== "Error") {
    parts.push(`${details.errorType}: ${details.message}`);
  } else {
    parts.push(details.message);
  }
  if (details.context && Object.keys(details.context).length > 0) {
    let serialized;
    try {
      serialized = JSON.stringify(details.context);
    } catch {
      // Circular references / BigInt values cannot be serialized; keep the
      // key names so the report still hints at what context was attached.
      serialized = `[unserializable context; keys: ${Object.keys(details.context).join(", ")}]`;
    }
    parts.push(`\nContext: ${serialized}`);
  }
  // `stack` is normally a string, but it is a writable property and user code
  // may have replaced it with something else.
  if (typeof details.stack === "string" && details.stack.length > 0) {
    // Cap the trace at 6 lines to keep the job error compact.
    const stackLines = details.stack.split("\n").slice(0, 6);
    parts.push(`\nStack:\n${stackLines.join("\n")}`);
  }
  return parts.join(" ");
}
|
|
6650
|
+
/**
 * Labels for the stages of an evaluation run. The values are the strings that
 * appear inside the `[<phase>]` prefix of error messages built from them.
 * Frozen so the shared label table cannot be modified at runtime.
 */
var ExecutionPhase = Object.freeze({
  CONFIG: "config-loading",
  API_CLIENT: "api-client-creation",
  FETCH_EVAL_RUN: "fetch-eval-run",
  FETCH_SKILLS: "fetch-skills",
  FETCH_AGENT: "fetch-agent",
  FETCH_SCENARIOS: "fetch-scenarios",
  VALIDATION: "validation",
  PREPARE_WORKSPACE: "prepare-workspace",
  EXECUTE_SKILL: "execute-skill",
  EXECUTE_AGENT: "execute-agent",
  CLAUDE_SDK_IMPORT: "claude-sdk-import",
  CLAUDE_SDK_EXECUTION: "claude-sdk-execution",
  ADD_RESULT: "add-result",
  UPDATE_STATUS: "update-status"
});
|
|
6666
|
+
|
|
6606
6667
|
// src/index.ts
|
|
6607
6668
|
console.error(
|
|
6608
6669
|
"[EVALUATOR-BOOT] Module loading started",
|
|
@@ -6610,13 +6671,22 @@ console.error(
|
|
|
6610
6671
|
);
|
|
6611
6672
|
console.error("[EVALUATOR-BOOT] All static imports successful");
|
|
6612
6673
|
async function runEvaluation(projectId2, evalRunId2) {
|
|
6674
|
+
const state = {
|
|
6675
|
+
config: null,
|
|
6676
|
+
api: null,
|
|
6677
|
+
currentPhase: ExecutionPhase.CONFIG,
|
|
6678
|
+
currentContext: { projectId: projectId2, evalRunId: evalRunId2 }
|
|
6679
|
+
};
|
|
6613
6680
|
console.error(
|
|
6614
6681
|
"[DEBUG-H1] runEvaluation entry",
|
|
6615
6682
|
JSON.stringify({ projectId: projectId2, evalRunId: evalRunId2, timestamp: Date.now() })
|
|
6616
6683
|
);
|
|
6684
|
+
state.currentPhase = ExecutionPhase.CONFIG;
|
|
6685
|
+
state.currentContext = { projectId: projectId2, evalRunId: evalRunId2 };
|
|
6617
6686
|
let config;
|
|
6618
6687
|
try {
|
|
6619
6688
|
config = loadConfig();
|
|
6689
|
+
state.config = config;
|
|
6620
6690
|
console.error(
|
|
6621
6691
|
"[DEBUG-H1] loadConfig SUCCESS",
|
|
6622
6692
|
JSON.stringify({
|
|
@@ -6632,10 +6702,13 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
6632
6702
|
"[DEBUG-H1] loadConfig FAILED",
|
|
6633
6703
|
JSON.stringify({
|
|
6634
6704
|
error: configErr instanceof Error ? configErr.message : String(configErr),
|
|
6705
|
+
stack: configErr instanceof Error ? configErr.stack : void 0,
|
|
6635
6706
|
timestamp: Date.now()
|
|
6636
6707
|
})
|
|
6637
6708
|
);
|
|
6638
|
-
throw
|
|
6709
|
+
throw new Error(
|
|
6710
|
+
`[${ExecutionPhase.CONFIG}] ${configErr instanceof Error ? configErr.message : String(configErr)}`
|
|
6711
|
+
);
|
|
6639
6712
|
}
|
|
6640
6713
|
console.log("[Evaluator] Config loaded", {
|
|
6641
6714
|
serverUrl: config.serverUrl,
|
|
@@ -6644,11 +6717,22 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
6644
6717
|
hasAiGatewayHeaders: Object.keys(config.aiGatewayHeaders).length > 0,
|
|
6645
6718
|
hasRouteHeader: !!config.routeHeader
|
|
6646
6719
|
});
|
|
6647
|
-
|
|
6648
|
-
|
|
6649
|
-
|
|
6650
|
-
|
|
6651
|
-
|
|
6720
|
+
state.currentPhase = ExecutionPhase.API_CLIENT;
|
|
6721
|
+
let api;
|
|
6722
|
+
try {
|
|
6723
|
+
api = createApiClient(config.serverUrl, {
|
|
6724
|
+
apiPrefix: config.apiPrefix,
|
|
6725
|
+
routeHeader: config.routeHeader,
|
|
6726
|
+
authToken: config.authToken
|
|
6727
|
+
});
|
|
6728
|
+
state.api = api;
|
|
6729
|
+
} catch (apiErr) {
|
|
6730
|
+
throw new Error(
|
|
6731
|
+
`[${ExecutionPhase.API_CLIENT}] Failed to create API client: ${apiErr instanceof Error ? apiErr.message : String(apiErr)}`
|
|
6732
|
+
);
|
|
6733
|
+
}
|
|
6734
|
+
state.currentPhase = ExecutionPhase.FETCH_EVAL_RUN;
|
|
6735
|
+
state.currentContext = { projectId: projectId2, evalRunId: evalRunId2, serverUrl: config.serverUrl };
|
|
6652
6736
|
console.error(
|
|
6653
6737
|
"[DEBUG-H2] fetchEvaluationData START",
|
|
6654
6738
|
JSON.stringify({ serverUrl: config.serverUrl, timestamp: Date.now() })
|
|
@@ -6667,32 +6751,61 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
6667
6751
|
})
|
|
6668
6752
|
);
|
|
6669
6753
|
} catch (fetchErr) {
|
|
6754
|
+
const errorMsg = fetchErr instanceof Error ? fetchErr.message : String(fetchErr);
|
|
6670
6755
|
console.error(
|
|
6671
6756
|
"[DEBUG-H2] fetchEvaluationData FAILED",
|
|
6672
6757
|
JSON.stringify({
|
|
6673
|
-
error:
|
|
6758
|
+
error: errorMsg,
|
|
6759
|
+
stack: fetchErr instanceof Error ? fetchErr.stack : void 0,
|
|
6674
6760
|
timestamp: Date.now()
|
|
6675
6761
|
})
|
|
6676
6762
|
);
|
|
6677
|
-
throw
|
|
6763
|
+
throw new Error(
|
|
6764
|
+
`[${ExecutionPhase.FETCH_EVAL_RUN}] Failed to fetch evaluation data: ${errorMsg}`
|
|
6765
|
+
);
|
|
6678
6766
|
}
|
|
6679
6767
|
const { codeAgent, skills, scenarioItems } = evalData;
|
|
6768
|
+
state.currentPhase = ExecutionPhase.VALIDATION;
|
|
6769
|
+
state.currentContext = {
|
|
6770
|
+
projectId: projectId2,
|
|
6771
|
+
evalRunId: evalRunId2,
|
|
6772
|
+
scenarioCount: scenarioItems.length,
|
|
6773
|
+
skillCount: skills.length,
|
|
6774
|
+
hasAgent: !!codeAgent,
|
|
6775
|
+
agentId: evalData.evalRun.agentId,
|
|
6776
|
+
skillsGroupId: evalData.evalRun.skillsGroupId
|
|
6777
|
+
};
|
|
6680
6778
|
if (scenarioItems.length > 0 && skills.length === 0) {
|
|
6681
6779
|
throw new Error(
|
|
6682
|
-
|
|
6780
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
|
|
6683
6781
|
);
|
|
6684
6782
|
}
|
|
6685
6783
|
if (scenarioItems.length > 0 && skills.length > 0 && !codeAgent) {
|
|
6686
6784
|
throw new Error(
|
|
6687
|
-
|
|
6785
|
+
`[${ExecutionPhase.VALIDATION}] Eval run has no code agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
|
|
6688
6786
|
);
|
|
6689
6787
|
}
|
|
6788
|
+
let completedScenarios = 0;
|
|
6789
|
+
const totalScenarios = scenarioItems.length * skills.length;
|
|
6690
6790
|
for (const { scenario, template } of scenarioItems) {
|
|
6691
6791
|
for (const skill of skills) {
|
|
6792
|
+
state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
|
|
6793
|
+
state.currentContext = {
|
|
6794
|
+
projectId: projectId2,
|
|
6795
|
+
evalRunId: evalRunId2,
|
|
6796
|
+
scenarioId: scenario.id,
|
|
6797
|
+
scenarioName: scenario.name,
|
|
6798
|
+
skillId: skill.id,
|
|
6799
|
+
skillName: skill.name,
|
|
6800
|
+
agentId: codeAgent?.id,
|
|
6801
|
+
agentName: codeAgent?.name,
|
|
6802
|
+
progress: `${completedScenarios + 1}/${totalScenarios}`
|
|
6803
|
+
};
|
|
6692
6804
|
console.log(
|
|
6693
6805
|
"[Evaluator] Running skill:",
|
|
6694
6806
|
skill.name,
|
|
6695
|
-
codeAgent ? `with agent: ${codeAgent.name}` : ""
|
|
6807
|
+
codeAgent ? `with agent: ${codeAgent.name}` : "",
|
|
6808
|
+
`(${completedScenarios + 1}/${totalScenarios})`
|
|
6696
6809
|
);
|
|
6697
6810
|
try {
|
|
6698
6811
|
const result = await runScenario(
|
|
@@ -6703,17 +6816,48 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
6703
6816
|
template
|
|
6704
6817
|
);
|
|
6705
6818
|
console.log("[Evaluator] Skill completed, adding result");
|
|
6819
|
+
state.currentPhase = ExecutionPhase.ADD_RESULT;
|
|
6820
|
+
state.currentContext = {
|
|
6821
|
+
...state.currentContext,
|
|
6822
|
+
resultId: result.id
|
|
6823
|
+
};
|
|
6706
6824
|
await api.addResult(projectId2, evalRunId2, result);
|
|
6825
|
+
completedScenarios++;
|
|
6707
6826
|
} catch (err) {
|
|
6708
|
-
|
|
6709
|
-
|
|
6827
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
6828
|
+
const errorStack = err instanceof Error ? err.stack : void 0;
|
|
6829
|
+
console.error(
|
|
6830
|
+
"[Evaluator] Failed to run skill:",
|
|
6831
|
+
skill.name,
|
|
6832
|
+
"Error:",
|
|
6833
|
+
errorMsg
|
|
6834
|
+
);
|
|
6835
|
+
if (errorStack) {
|
|
6836
|
+
console.error("[Evaluator] Stack trace:", errorStack);
|
|
6837
|
+
}
|
|
6838
|
+
throw new Error(
|
|
6839
|
+
`[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
|
|
6840
|
+
);
|
|
6710
6841
|
}
|
|
6711
6842
|
}
|
|
6712
6843
|
}
|
|
6713
|
-
|
|
6714
|
-
|
|
6715
|
-
|
|
6716
|
-
|
|
6844
|
+
state.currentPhase = ExecutionPhase.UPDATE_STATUS;
|
|
6845
|
+
state.currentContext = {
|
|
6846
|
+
projectId: projectId2,
|
|
6847
|
+
evalRunId: evalRunId2,
|
|
6848
|
+
completedScenarios,
|
|
6849
|
+
totalScenarios
|
|
6850
|
+
};
|
|
6851
|
+
try {
|
|
6852
|
+
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
6853
|
+
status: EvalStatus2.COMPLETED,
|
|
6854
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
6855
|
+
});
|
|
6856
|
+
} catch (updateErr) {
|
|
6857
|
+
throw new Error(
|
|
6858
|
+
`[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
|
|
6859
|
+
);
|
|
6860
|
+
}
|
|
6717
6861
|
}
|
|
6718
6862
|
var projectId = process.argv[2];
|
|
6719
6863
|
var evalRunId = process.argv[3];
|
|
@@ -6729,7 +6873,16 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
6729
6873
|
console.error("[EVALUATOR-BOOT] runEvaluation completed successfully");
|
|
6730
6874
|
process.exit(0);
|
|
6731
6875
|
}).catch(async (err) => {
|
|
6732
|
-
|
|
6876
|
+
const errorDetails = formatError(err, "main-execution", {
|
|
6877
|
+
projectId,
|
|
6878
|
+
evalRunId
|
|
6879
|
+
});
|
|
6880
|
+
const jobError = formatErrorForJobError(errorDetails);
|
|
6881
|
+
console.error("[EVALUATOR-BOOT] runEvaluation FAILED");
|
|
6882
|
+
console.error(
|
|
6883
|
+
"[EVALUATOR-BOOT] Error details:",
|
|
6884
|
+
JSON.stringify(errorDetails, null, 2)
|
|
6885
|
+
);
|
|
6733
6886
|
try {
|
|
6734
6887
|
const config = loadConfig();
|
|
6735
6888
|
const api = createApiClient(config.serverUrl, {
|
|
@@ -6738,15 +6891,42 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
6738
6891
|
authToken: config.authToken
|
|
6739
6892
|
});
|
|
6740
6893
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
6741
|
-
status:
|
|
6742
|
-
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
6894
|
+
status: EvalStatus2.FAILED,
|
|
6895
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6896
|
+
jobError,
|
|
6897
|
+
jobStatus: "FAILED"
|
|
6743
6898
|
});
|
|
6744
|
-
console.error(
|
|
6899
|
+
console.error(
|
|
6900
|
+
"[EVALUATOR-BOOT] Updated eval run status to FAILED with error details"
|
|
6901
|
+
);
|
|
6745
6902
|
} catch (updateErr) {
|
|
6746
6903
|
console.error(
|
|
6747
6904
|
"[EVALUATOR-BOOT] Failed to update eval run status:",
|
|
6748
|
-
updateErr
|
|
6905
|
+
updateErr instanceof Error ? updateErr.message : String(updateErr)
|
|
6749
6906
|
);
|
|
6907
|
+
try {
|
|
6908
|
+
const serverUrl = process.env.EVAL_SERVER_URL;
|
|
6909
|
+
const authToken = process.env.EVAL_AUTH_TOKEN;
|
|
6910
|
+
const routeHeader = process.env.EVAL_ROUTE_HEADER;
|
|
6911
|
+
if (serverUrl) {
|
|
6912
|
+
const api = createApiClient(serverUrl, {
|
|
6913
|
+
routeHeader,
|
|
6914
|
+
authToken
|
|
6915
|
+
});
|
|
6916
|
+
await api.updateEvalRun(projectId, evalRunId, {
|
|
6917
|
+
status: EvalStatus2.FAILED,
|
|
6918
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6919
|
+
jobError: `Config load failed, then: ${jobError}`,
|
|
6920
|
+
jobStatus: "FAILED"
|
|
6921
|
+
});
|
|
6922
|
+
console.error("[EVALUATOR-BOOT] Fallback: Updated status to FAILED");
|
|
6923
|
+
}
|
|
6924
|
+
} catch (fallbackErr) {
|
|
6925
|
+
console.error(
|
|
6926
|
+
"[EVALUATOR-BOOT] Fallback also failed:",
|
|
6927
|
+
fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)
|
|
6928
|
+
);
|
|
6929
|
+
}
|
|
6750
6930
|
}
|
|
6751
6931
|
process.exit(1);
|
|
6752
6932
|
});
|