@wix/evalforge-evaluator 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types3 = require("@wix/evalforge-types");
27
+ var import_evalforge_types4 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -6620,6 +6620,67 @@ async function runScenario(config, evalRunId2, scenario, target, template) {
6620
6620
  };
6621
6621
  }
6622
6622
 
6623
+ // src/error-reporter.ts
6624
+ var import_evalforge_types3 = require("@wix/evalforge-types");
6625
+ function formatError(error, phase, context) {
6626
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString();
6627
+ if (error instanceof Error) {
6628
+ return {
6629
+ message: error.message,
6630
+ stack: error.stack,
6631
+ errorType: error.constructor.name,
6632
+ phase,
6633
+ context,
6634
+ timestamp
6635
+ };
6636
+ }
6637
+ return {
6638
+ message: String(error),
6639
+ errorType: typeof error,
6640
+ phase,
6641
+ context,
6642
+ timestamp
6643
+ };
6644
+ }
6645
+ function formatErrorForJobError(details) {
6646
+ const parts = [];
6647
+ if (details.phase) {
6648
+ parts.push(`[Phase: ${details.phase}]`);
6649
+ }
6650
+ if (details.errorType && details.errorType !== "Error") {
6651
+ parts.push(`${details.errorType}: ${details.message}`);
6652
+ } else {
6653
+ parts.push(details.message);
6654
+ }
6655
+ if (details.context && Object.keys(details.context).length > 0) {
6656
+ parts.push(`
6657
+ Context: ${JSON.stringify(details.context)}`);
6658
+ }
6659
+ if (details.stack) {
6660
+ const stackLines = details.stack.split("\n").slice(0, 6);
6661
+ parts.push(`
6662
+ Stack:
6663
+ ${stackLines.join("\n")}`);
6664
+ }
6665
+ return parts.join(" ");
6666
+ }
6667
+ var ExecutionPhase = {
6668
+ CONFIG: "config-loading",
6669
+ API_CLIENT: "api-client-creation",
6670
+ FETCH_EVAL_RUN: "fetch-eval-run",
6671
+ FETCH_SKILLS: "fetch-skills",
6672
+ FETCH_AGENT: "fetch-agent",
6673
+ FETCH_SCENARIOS: "fetch-scenarios",
6674
+ VALIDATION: "validation",
6675
+ PREPARE_WORKSPACE: "prepare-workspace",
6676
+ EXECUTE_SKILL: "execute-skill",
6677
+ EXECUTE_AGENT: "execute-agent",
6678
+ CLAUDE_SDK_IMPORT: "claude-sdk-import",
6679
+ CLAUDE_SDK_EXECUTION: "claude-sdk-execution",
6680
+ ADD_RESULT: "add-result",
6681
+ UPDATE_STATUS: "update-status"
6682
+ };
6683
+
6623
6684
  // src/index.ts
6624
6685
  console.error(
6625
6686
  "[EVALUATOR-BOOT] Module loading started",
@@ -6627,13 +6688,22 @@ console.error(
6627
6688
  );
6628
6689
  console.error("[EVALUATOR-BOOT] All static imports successful");
6629
6690
  async function runEvaluation(projectId2, evalRunId2) {
6691
+ const state = {
6692
+ config: null,
6693
+ api: null,
6694
+ currentPhase: ExecutionPhase.CONFIG,
6695
+ currentContext: { projectId: projectId2, evalRunId: evalRunId2 }
6696
+ };
6630
6697
  console.error(
6631
6698
  "[DEBUG-H1] runEvaluation entry",
6632
6699
  JSON.stringify({ projectId: projectId2, evalRunId: evalRunId2, timestamp: Date.now() })
6633
6700
  );
6701
+ state.currentPhase = ExecutionPhase.CONFIG;
6702
+ state.currentContext = { projectId: projectId2, evalRunId: evalRunId2 };
6634
6703
  let config;
6635
6704
  try {
6636
6705
  config = loadConfig();
6706
+ state.config = config;
6637
6707
  console.error(
6638
6708
  "[DEBUG-H1] loadConfig SUCCESS",
6639
6709
  JSON.stringify({
@@ -6649,10 +6719,13 @@ async function runEvaluation(projectId2, evalRunId2) {
6649
6719
  "[DEBUG-H1] loadConfig FAILED",
6650
6720
  JSON.stringify({
6651
6721
  error: configErr instanceof Error ? configErr.message : String(configErr),
6722
+ stack: configErr instanceof Error ? configErr.stack : void 0,
6652
6723
  timestamp: Date.now()
6653
6724
  })
6654
6725
  );
6655
- throw configErr;
6726
+ throw new Error(
6727
+ `[${ExecutionPhase.CONFIG}] ${configErr instanceof Error ? configErr.message : String(configErr)}`
6728
+ );
6656
6729
  }
6657
6730
  console.log("[Evaluator] Config loaded", {
6658
6731
  serverUrl: config.serverUrl,
@@ -6661,11 +6734,22 @@ async function runEvaluation(projectId2, evalRunId2) {
6661
6734
  hasAiGatewayHeaders: Object.keys(config.aiGatewayHeaders).length > 0,
6662
6735
  hasRouteHeader: !!config.routeHeader
6663
6736
  });
6664
- const api = createApiClient(config.serverUrl, {
6665
- apiPrefix: config.apiPrefix,
6666
- routeHeader: config.routeHeader,
6667
- authToken: config.authToken
6668
- });
6737
+ state.currentPhase = ExecutionPhase.API_CLIENT;
6738
+ let api;
6739
+ try {
6740
+ api = createApiClient(config.serverUrl, {
6741
+ apiPrefix: config.apiPrefix,
6742
+ routeHeader: config.routeHeader,
6743
+ authToken: config.authToken
6744
+ });
6745
+ state.api = api;
6746
+ } catch (apiErr) {
6747
+ throw new Error(
6748
+ `[${ExecutionPhase.API_CLIENT}] Failed to create API client: ${apiErr instanceof Error ? apiErr.message : String(apiErr)}`
6749
+ );
6750
+ }
6751
+ state.currentPhase = ExecutionPhase.FETCH_EVAL_RUN;
6752
+ state.currentContext = { projectId: projectId2, evalRunId: evalRunId2, serverUrl: config.serverUrl };
6669
6753
  console.error(
6670
6754
  "[DEBUG-H2] fetchEvaluationData START",
6671
6755
  JSON.stringify({ serverUrl: config.serverUrl, timestamp: Date.now() })
@@ -6684,32 +6768,61 @@ async function runEvaluation(projectId2, evalRunId2) {
6684
6768
  })
6685
6769
  );
6686
6770
  } catch (fetchErr) {
6771
+ const errorMsg = fetchErr instanceof Error ? fetchErr.message : String(fetchErr);
6687
6772
  console.error(
6688
6773
  "[DEBUG-H2] fetchEvaluationData FAILED",
6689
6774
  JSON.stringify({
6690
- error: fetchErr instanceof Error ? fetchErr.message : String(fetchErr),
6775
+ error: errorMsg,
6776
+ stack: fetchErr instanceof Error ? fetchErr.stack : void 0,
6691
6777
  timestamp: Date.now()
6692
6778
  })
6693
6779
  );
6694
- throw fetchErr;
6780
+ throw new Error(
6781
+ `[${ExecutionPhase.FETCH_EVAL_RUN}] Failed to fetch evaluation data: ${errorMsg}`
6782
+ );
6695
6783
  }
6696
6784
  const { codeAgent, skills, scenarioItems } = evalData;
6785
+ state.currentPhase = ExecutionPhase.VALIDATION;
6786
+ state.currentContext = {
6787
+ projectId: projectId2,
6788
+ evalRunId: evalRunId2,
6789
+ scenarioCount: scenarioItems.length,
6790
+ skillCount: skills.length,
6791
+ hasAgent: !!codeAgent,
6792
+ agentId: evalData.evalRun.agentId,
6793
+ skillsGroupId: evalData.evalRun.skillsGroupId
6794
+ };
6697
6795
  if (scenarioItems.length > 0 && skills.length === 0) {
6698
6796
  throw new Error(
6699
- "Eval run has no skills: set skillsGroupId and ensure the group has skills"
6797
+ `[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
6700
6798
  );
6701
6799
  }
6702
6800
  if (scenarioItems.length > 0 && skills.length > 0 && !codeAgent) {
6703
6801
  throw new Error(
6704
- "Eval run has no code agent: set agentId for skill-based runs"
6802
+ `[${ExecutionPhase.VALIDATION}] Eval run has no code agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
6705
6803
  );
6706
6804
  }
6805
+ let completedScenarios = 0;
6806
+ const totalScenarios = scenarioItems.length * skills.length;
6707
6807
  for (const { scenario, template } of scenarioItems) {
6708
6808
  for (const skill of skills) {
6809
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
6810
+ state.currentContext = {
6811
+ projectId: projectId2,
6812
+ evalRunId: evalRunId2,
6813
+ scenarioId: scenario.id,
6814
+ scenarioName: scenario.name,
6815
+ skillId: skill.id,
6816
+ skillName: skill.name,
6817
+ agentId: codeAgent?.id,
6818
+ agentName: codeAgent?.name,
6819
+ progress: `${completedScenarios + 1}/${totalScenarios}`
6820
+ };
6709
6821
  console.log(
6710
6822
  "[Evaluator] Running skill:",
6711
6823
  skill.name,
6712
- codeAgent ? `with agent: ${codeAgent.name}` : ""
6824
+ codeAgent ? `with agent: ${codeAgent.name}` : "",
6825
+ `(${completedScenarios + 1}/${totalScenarios})`
6713
6826
  );
6714
6827
  try {
6715
6828
  const result = await runScenario(
@@ -6720,17 +6833,48 @@ async function runEvaluation(projectId2, evalRunId2) {
6720
6833
  template
6721
6834
  );
6722
6835
  console.log("[Evaluator] Skill completed, adding result");
6836
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
6837
+ state.currentContext = {
6838
+ ...state.currentContext,
6839
+ resultId: result.id
6840
+ };
6723
6841
  await api.addResult(projectId2, evalRunId2, result);
6842
+ completedScenarios++;
6724
6843
  } catch (err) {
6725
- console.error("[Evaluator] Failed to run skill:", skill.name, err);
6726
- throw err;
6844
+ const errorMsg = err instanceof Error ? err.message : String(err);
6845
+ const errorStack = err instanceof Error ? err.stack : void 0;
6846
+ console.error(
6847
+ "[Evaluator] Failed to run skill:",
6848
+ skill.name,
6849
+ "Error:",
6850
+ errorMsg
6851
+ );
6852
+ if (errorStack) {
6853
+ console.error("[Evaluator] Stack trace:", errorStack);
6854
+ }
6855
+ throw new Error(
6856
+ `[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
6857
+ );
6727
6858
  }
6728
6859
  }
6729
6860
  }
6730
- await api.updateEvalRun(projectId2, evalRunId2, {
6731
- status: import_evalforge_types3.EvalStatus.COMPLETED,
6732
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
6733
- });
6861
+ state.currentPhase = ExecutionPhase.UPDATE_STATUS;
6862
+ state.currentContext = {
6863
+ projectId: projectId2,
6864
+ evalRunId: evalRunId2,
6865
+ completedScenarios,
6866
+ totalScenarios
6867
+ };
6868
+ try {
6869
+ await api.updateEvalRun(projectId2, evalRunId2, {
6870
+ status: import_evalforge_types4.EvalStatus.COMPLETED,
6871
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
6872
+ });
6873
+ } catch (updateErr) {
6874
+ throw new Error(
6875
+ `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
6876
+ );
6877
+ }
6734
6878
  }
6735
6879
  var projectId = process.argv[2];
6736
6880
  var evalRunId = process.argv[3];
@@ -6746,7 +6890,16 @@ runEvaluation(projectId, evalRunId).then(() => {
6746
6890
  console.error("[EVALUATOR-BOOT] runEvaluation completed successfully");
6747
6891
  process.exit(0);
6748
6892
  }).catch(async (err) => {
6749
- console.error("[EVALUATOR-BOOT] runEvaluation FAILED:", err);
6893
+ const errorDetails = formatError(err, "main-execution", {
6894
+ projectId,
6895
+ evalRunId
6896
+ });
6897
+ const jobError = formatErrorForJobError(errorDetails);
6898
+ console.error("[EVALUATOR-BOOT] runEvaluation FAILED");
6899
+ console.error(
6900
+ "[EVALUATOR-BOOT] Error details:",
6901
+ JSON.stringify(errorDetails, null, 2)
6902
+ );
6750
6903
  try {
6751
6904
  const config = loadConfig();
6752
6905
  const api = createApiClient(config.serverUrl, {
@@ -6755,15 +6908,42 @@ runEvaluation(projectId, evalRunId).then(() => {
6755
6908
  authToken: config.authToken
6756
6909
  });
6757
6910
  await api.updateEvalRun(projectId, evalRunId, {
6758
- status: import_evalforge_types3.EvalStatus.FAILED,
6759
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
6911
+ status: import_evalforge_types4.EvalStatus.FAILED,
6912
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
6913
+ jobError,
6914
+ jobStatus: "FAILED"
6760
6915
  });
6761
- console.error("[EVALUATOR-BOOT] Updated eval run status to FAILED");
6916
+ console.error(
6917
+ "[EVALUATOR-BOOT] Updated eval run status to FAILED with error details"
6918
+ );
6762
6919
  } catch (updateErr) {
6763
6920
  console.error(
6764
6921
  "[EVALUATOR-BOOT] Failed to update eval run status:",
6765
- updateErr
6922
+ updateErr instanceof Error ? updateErr.message : String(updateErr)
6766
6923
  );
6924
+ try {
6925
+ const serverUrl = process.env.EVAL_SERVER_URL;
6926
+ const authToken = process.env.EVAL_AUTH_TOKEN;
6927
+ const routeHeader = process.env.EVAL_ROUTE_HEADER;
6928
+ if (serverUrl) {
6929
+ const api = createApiClient(serverUrl, {
6930
+ routeHeader,
6931
+ authToken
6932
+ });
6933
+ await api.updateEvalRun(projectId, evalRunId, {
6934
+ status: import_evalforge_types4.EvalStatus.FAILED,
6935
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
6936
+ jobError: `Config load failed, then: ${jobError}`,
6937
+ jobStatus: "FAILED"
6938
+ });
6939
+ console.error("[EVALUATOR-BOOT] Fallback: Updated status to FAILED");
6940
+ }
6941
+ } catch (fallbackErr) {
6942
+ console.error(
6943
+ "[EVALUATOR-BOOT] Fallback also failed:",
6944
+ fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)
6945
+ );
6946
+ }
6767
6947
  }
6768
6948
  process.exit(1);
6769
6949
  });