@wix/evalforge-evaluator 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types3 = require("@wix/evalforge-types");
27
+ var import_evalforge_types4 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -6268,45 +6268,94 @@ async function executeWithClaudeCode(skill, scenario, options) {
6268
6268
  if (options.maxTokens !== void 0) {
6269
6269
  queryOptions.maxTokens = options.maxTokens;
6270
6270
  }
6271
- for await (const message of query({
6272
- prompt: scenario.triggerPrompt,
6273
- options: queryOptions
6274
- })) {
6275
- messageCount++;
6276
- console.log("[SDK Message]", JSON.stringify(message, null, 2));
6277
- allMessages.push(message);
6278
- if (messageCount <= 3) {
6279
- console.error(
6280
- "[DEBUG-H5] SDK message received",
6281
- JSON.stringify({
6282
- messageCount,
6283
- type: message.type,
6284
- timestamp: Date.now()
6285
- })
6286
- );
6271
+ try {
6272
+ for await (const message of query({
6273
+ prompt: scenario.triggerPrompt,
6274
+ options: queryOptions
6275
+ })) {
6276
+ messageCount++;
6277
+ console.log("[SDK Message]", JSON.stringify(message, null, 2));
6278
+ allMessages.push(message);
6279
+ if (messageCount <= 3) {
6280
+ console.error(
6281
+ "[DEBUG-H5] SDK message received",
6282
+ JSON.stringify({
6283
+ messageCount,
6284
+ type: message.type,
6285
+ timestamp: Date.now()
6286
+ })
6287
+ );
6288
+ }
6289
+ if (traceContext && isAssistantMessage(message)) {
6290
+ traceStepNumber++;
6291
+ const traceEvent = createTraceEventFromMessage(
6292
+ message,
6293
+ traceContext,
6294
+ traceStepNumber,
6295
+ false
6296
+ // Not complete yet
6297
+ );
6298
+ emitTraceEvent(
6299
+ traceEvent,
6300
+ traceContext.tracePushUrl,
6301
+ traceContext.routeHeader,
6302
+ traceContext.authToken
6303
+ );
6304
+ }
6287
6305
  }
6288
- if (traceContext && isAssistantMessage(message)) {
6289
- traceStepNumber++;
6290
- const traceEvent = createTraceEventFromMessage(
6291
- message,
6292
- traceContext,
6293
- traceStepNumber,
6294
- false
6295
- // Not complete yet
6296
- );
6297
- emitTraceEvent(
6298
- traceEvent,
6299
- traceContext.tracePushUrl,
6300
- traceContext.routeHeader,
6301
- traceContext.authToken
6302
- );
6306
+ console.log(
6307
+ "[executeWithClaudeCode] Claude Agent SDK query completed, received",
6308
+ allMessages.length,
6309
+ "messages"
6310
+ );
6311
+ } catch (sdkError) {
6312
+ const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
6313
+ const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
6314
+ console.error("[executeWithClaudeCode] Claude SDK execution FAILED");
6315
+ console.error("[executeWithClaudeCode] Error message:", errorMessage);
6316
+ if (errorStack) {
6317
+ console.error("[executeWithClaudeCode] Stack trace:", errorStack);
6318
+ }
6319
+ if (sdkError && typeof sdkError === "object") {
6320
+ const errObj = sdkError;
6321
+ const extraInfo = {};
6322
+ for (const key of [
6323
+ "code",
6324
+ "status",
6325
+ "stderr",
6326
+ "stdout",
6327
+ "exitCode",
6328
+ "signal",
6329
+ "cause"
6330
+ ]) {
6331
+ if (key in errObj && errObj[key] !== void 0) {
6332
+ extraInfo[key] = errObj[key];
6333
+ }
6334
+ }
6335
+ if (Object.keys(extraInfo).length > 0) {
6336
+ console.error(
6337
+ "[executeWithClaudeCode] Additional error info:",
6338
+ JSON.stringify(extraInfo)
6339
+ );
6340
+ }
6303
6341
  }
6342
+ console.error(
6343
+ "[executeWithClaudeCode] Context:",
6344
+ JSON.stringify({
6345
+ skillId: skill.id,
6346
+ skillName: skill.name,
6347
+ scenarioId: scenario.id,
6348
+ scenarioName: scenario.name,
6349
+ messagesReceived: messageCount,
6350
+ cwd: options.cwd,
6351
+ model: options.model || DEFAULT_MODEL
6352
+ })
6353
+ );
6354
+ throw new Error(
6355
+ `Claude SDK execution failed after ${messageCount} messages: ${errorMessage}` + (errorStack ? `
6356
+ Stack: ${errorStack.split("\n").slice(0, 3).join("\n")}` : "")
6357
+ );
6304
6358
  }
6305
- console.log(
6306
- "[executeWithClaudeCode] Claude Agent SDK query completed, received",
6307
- allMessages.length,
6308
- "messages"
6309
- );
6310
6359
  if (traceContext) {
6311
6360
  emitTraceEvent(
6312
6361
  {
@@ -6620,6 +6669,67 @@ async function runScenario(config, evalRunId2, scenario, target, template) {
6620
6669
  };
6621
6670
  }
6622
6671
 
6672
+ // src/error-reporter.ts
6673
+ var import_evalforge_types3 = require("@wix/evalforge-types");
6674
+ function formatError(error, phase, context) {
6675
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString();
6676
+ if (error instanceof Error) {
6677
+ return {
6678
+ message: error.message,
6679
+ stack: error.stack,
6680
+ errorType: error.constructor.name,
6681
+ phase,
6682
+ context,
6683
+ timestamp
6684
+ };
6685
+ }
6686
+ return {
6687
+ message: String(error),
6688
+ errorType: typeof error,
6689
+ phase,
6690
+ context,
6691
+ timestamp
6692
+ };
6693
+ }
6694
+ function formatErrorForJobError(details) {
6695
+ const parts = [];
6696
+ if (details.phase) {
6697
+ parts.push(`[Phase: ${details.phase}]`);
6698
+ }
6699
+ if (details.errorType && details.errorType !== "Error") {
6700
+ parts.push(`${details.errorType}: ${details.message}`);
6701
+ } else {
6702
+ parts.push(details.message);
6703
+ }
6704
+ if (details.context && Object.keys(details.context).length > 0) {
6705
+ parts.push(`
6706
+ Context: ${JSON.stringify(details.context)}`);
6707
+ }
6708
+ if (details.stack) {
6709
+ const stackLines = details.stack.split("\n").slice(0, 6);
6710
+ parts.push(`
6711
+ Stack:
6712
+ ${stackLines.join("\n")}`);
6713
+ }
6714
+ return parts.join(" ");
6715
+ }
6716
+ var ExecutionPhase = {
6717
+ CONFIG: "config-loading",
6718
+ API_CLIENT: "api-client-creation",
6719
+ FETCH_EVAL_RUN: "fetch-eval-run",
6720
+ FETCH_SKILLS: "fetch-skills",
6721
+ FETCH_AGENT: "fetch-agent",
6722
+ FETCH_SCENARIOS: "fetch-scenarios",
6723
+ VALIDATION: "validation",
6724
+ PREPARE_WORKSPACE: "prepare-workspace",
6725
+ EXECUTE_SKILL: "execute-skill",
6726
+ EXECUTE_AGENT: "execute-agent",
6727
+ CLAUDE_SDK_IMPORT: "claude-sdk-import",
6728
+ CLAUDE_SDK_EXECUTION: "claude-sdk-execution",
6729
+ ADD_RESULT: "add-result",
6730
+ UPDATE_STATUS: "update-status"
6731
+ };
6732
+
6623
6733
  // src/index.ts
6624
6734
  console.error(
6625
6735
  "[EVALUATOR-BOOT] Module loading started",
@@ -6627,13 +6737,22 @@ console.error(
6627
6737
  );
6628
6738
  console.error("[EVALUATOR-BOOT] All static imports successful");
6629
6739
  async function runEvaluation(projectId2, evalRunId2) {
6740
+ const state = {
6741
+ config: null,
6742
+ api: null,
6743
+ currentPhase: ExecutionPhase.CONFIG,
6744
+ currentContext: { projectId: projectId2, evalRunId: evalRunId2 }
6745
+ };
6630
6746
  console.error(
6631
6747
  "[DEBUG-H1] runEvaluation entry",
6632
6748
  JSON.stringify({ projectId: projectId2, evalRunId: evalRunId2, timestamp: Date.now() })
6633
6749
  );
6750
+ state.currentPhase = ExecutionPhase.CONFIG;
6751
+ state.currentContext = { projectId: projectId2, evalRunId: evalRunId2 };
6634
6752
  let config;
6635
6753
  try {
6636
6754
  config = loadConfig();
6755
+ state.config = config;
6637
6756
  console.error(
6638
6757
  "[DEBUG-H1] loadConfig SUCCESS",
6639
6758
  JSON.stringify({
@@ -6649,10 +6768,13 @@ async function runEvaluation(projectId2, evalRunId2) {
6649
6768
  "[DEBUG-H1] loadConfig FAILED",
6650
6769
  JSON.stringify({
6651
6770
  error: configErr instanceof Error ? configErr.message : String(configErr),
6771
+ stack: configErr instanceof Error ? configErr.stack : void 0,
6652
6772
  timestamp: Date.now()
6653
6773
  })
6654
6774
  );
6655
- throw configErr;
6775
+ throw new Error(
6776
+ `[${ExecutionPhase.CONFIG}] ${configErr instanceof Error ? configErr.message : String(configErr)}`
6777
+ );
6656
6778
  }
6657
6779
  console.log("[Evaluator] Config loaded", {
6658
6780
  serverUrl: config.serverUrl,
@@ -6661,11 +6783,22 @@ async function runEvaluation(projectId2, evalRunId2) {
6661
6783
  hasAiGatewayHeaders: Object.keys(config.aiGatewayHeaders).length > 0,
6662
6784
  hasRouteHeader: !!config.routeHeader
6663
6785
  });
6664
- const api = createApiClient(config.serverUrl, {
6665
- apiPrefix: config.apiPrefix,
6666
- routeHeader: config.routeHeader,
6667
- authToken: config.authToken
6668
- });
6786
+ state.currentPhase = ExecutionPhase.API_CLIENT;
6787
+ let api;
6788
+ try {
6789
+ api = createApiClient(config.serverUrl, {
6790
+ apiPrefix: config.apiPrefix,
6791
+ routeHeader: config.routeHeader,
6792
+ authToken: config.authToken
6793
+ });
6794
+ state.api = api;
6795
+ } catch (apiErr) {
6796
+ throw new Error(
6797
+ `[${ExecutionPhase.API_CLIENT}] Failed to create API client: ${apiErr instanceof Error ? apiErr.message : String(apiErr)}`
6798
+ );
6799
+ }
6800
+ state.currentPhase = ExecutionPhase.FETCH_EVAL_RUN;
6801
+ state.currentContext = { projectId: projectId2, evalRunId: evalRunId2, serverUrl: config.serverUrl };
6669
6802
  console.error(
6670
6803
  "[DEBUG-H2] fetchEvaluationData START",
6671
6804
  JSON.stringify({ serverUrl: config.serverUrl, timestamp: Date.now() })
@@ -6684,32 +6817,61 @@ async function runEvaluation(projectId2, evalRunId2) {
6684
6817
  })
6685
6818
  );
6686
6819
  } catch (fetchErr) {
6820
+ const errorMsg = fetchErr instanceof Error ? fetchErr.message : String(fetchErr);
6687
6821
  console.error(
6688
6822
  "[DEBUG-H2] fetchEvaluationData FAILED",
6689
6823
  JSON.stringify({
6690
- error: fetchErr instanceof Error ? fetchErr.message : String(fetchErr),
6824
+ error: errorMsg,
6825
+ stack: fetchErr instanceof Error ? fetchErr.stack : void 0,
6691
6826
  timestamp: Date.now()
6692
6827
  })
6693
6828
  );
6694
- throw fetchErr;
6829
+ throw new Error(
6830
+ `[${ExecutionPhase.FETCH_EVAL_RUN}] Failed to fetch evaluation data: ${errorMsg}`
6831
+ );
6695
6832
  }
6696
6833
  const { codeAgent, skills, scenarioItems } = evalData;
6834
+ state.currentPhase = ExecutionPhase.VALIDATION;
6835
+ state.currentContext = {
6836
+ projectId: projectId2,
6837
+ evalRunId: evalRunId2,
6838
+ scenarioCount: scenarioItems.length,
6839
+ skillCount: skills.length,
6840
+ hasAgent: !!codeAgent,
6841
+ agentId: evalData.evalRun.agentId,
6842
+ skillsGroupId: evalData.evalRun.skillsGroupId
6843
+ };
6697
6844
  if (scenarioItems.length > 0 && skills.length === 0) {
6698
6845
  throw new Error(
6699
- "Eval run has no skills: set skillsGroupId and ensure the group has skills"
6846
+ `[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
6700
6847
  );
6701
6848
  }
6702
6849
  if (scenarioItems.length > 0 && skills.length > 0 && !codeAgent) {
6703
6850
  throw new Error(
6704
- "Eval run has no code agent: set agentId for skill-based runs"
6851
+ `[${ExecutionPhase.VALIDATION}] Eval run has no code agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
6705
6852
  );
6706
6853
  }
6854
+ let completedScenarios = 0;
6855
+ const totalScenarios = scenarioItems.length * skills.length;
6707
6856
  for (const { scenario, template } of scenarioItems) {
6708
6857
  for (const skill of skills) {
6858
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
6859
+ state.currentContext = {
6860
+ projectId: projectId2,
6861
+ evalRunId: evalRunId2,
6862
+ scenarioId: scenario.id,
6863
+ scenarioName: scenario.name,
6864
+ skillId: skill.id,
6865
+ skillName: skill.name,
6866
+ agentId: codeAgent?.id,
6867
+ agentName: codeAgent?.name,
6868
+ progress: `${completedScenarios + 1}/${totalScenarios}`
6869
+ };
6709
6870
  console.log(
6710
6871
  "[Evaluator] Running skill:",
6711
6872
  skill.name,
6712
- codeAgent ? `with agent: ${codeAgent.name}` : ""
6873
+ codeAgent ? `with agent: ${codeAgent.name}` : "",
6874
+ `(${completedScenarios + 1}/${totalScenarios})`
6713
6875
  );
6714
6876
  try {
6715
6877
  const result = await runScenario(
@@ -6720,17 +6882,48 @@ async function runEvaluation(projectId2, evalRunId2) {
6720
6882
  template
6721
6883
  );
6722
6884
  console.log("[Evaluator] Skill completed, adding result");
6885
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
6886
+ state.currentContext = {
6887
+ ...state.currentContext,
6888
+ resultId: result.id
6889
+ };
6723
6890
  await api.addResult(projectId2, evalRunId2, result);
6891
+ completedScenarios++;
6724
6892
  } catch (err) {
6725
- console.error("[Evaluator] Failed to run skill:", skill.name, err);
6726
- throw err;
6893
+ const errorMsg = err instanceof Error ? err.message : String(err);
6894
+ const errorStack = err instanceof Error ? err.stack : void 0;
6895
+ console.error(
6896
+ "[Evaluator] Failed to run skill:",
6897
+ skill.name,
6898
+ "Error:",
6899
+ errorMsg
6900
+ );
6901
+ if (errorStack) {
6902
+ console.error("[Evaluator] Stack trace:", errorStack);
6903
+ }
6904
+ throw new Error(
6905
+ `[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
6906
+ );
6727
6907
  }
6728
6908
  }
6729
6909
  }
6730
- await api.updateEvalRun(projectId2, evalRunId2, {
6731
- status: import_evalforge_types3.EvalStatus.COMPLETED,
6732
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
6733
- });
6910
+ state.currentPhase = ExecutionPhase.UPDATE_STATUS;
6911
+ state.currentContext = {
6912
+ projectId: projectId2,
6913
+ evalRunId: evalRunId2,
6914
+ completedScenarios,
6915
+ totalScenarios
6916
+ };
6917
+ try {
6918
+ await api.updateEvalRun(projectId2, evalRunId2, {
6919
+ status: import_evalforge_types4.EvalStatus.COMPLETED,
6920
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
6921
+ });
6922
+ } catch (updateErr) {
6923
+ throw new Error(
6924
+ `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
6925
+ );
6926
+ }
6734
6927
  }
6735
6928
  var projectId = process.argv[2];
6736
6929
  var evalRunId = process.argv[3];
@@ -6746,7 +6939,16 @@ runEvaluation(projectId, evalRunId).then(() => {
6746
6939
  console.error("[EVALUATOR-BOOT] runEvaluation completed successfully");
6747
6940
  process.exit(0);
6748
6941
  }).catch(async (err) => {
6749
- console.error("[EVALUATOR-BOOT] runEvaluation FAILED:", err);
6942
+ const errorDetails = formatError(err, "main-execution", {
6943
+ projectId,
6944
+ evalRunId
6945
+ });
6946
+ const jobError = formatErrorForJobError(errorDetails);
6947
+ console.error("[EVALUATOR-BOOT] runEvaluation FAILED");
6948
+ console.error(
6949
+ "[EVALUATOR-BOOT] Error details:",
6950
+ JSON.stringify(errorDetails, null, 2)
6951
+ );
6750
6952
  try {
6751
6953
  const config = loadConfig();
6752
6954
  const api = createApiClient(config.serverUrl, {
@@ -6755,15 +6957,42 @@ runEvaluation(projectId, evalRunId).then(() => {
6755
6957
  authToken: config.authToken
6756
6958
  });
6757
6959
  await api.updateEvalRun(projectId, evalRunId, {
6758
- status: import_evalforge_types3.EvalStatus.FAILED,
6759
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
6960
+ status: import_evalforge_types4.EvalStatus.FAILED,
6961
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
6962
+ jobError,
6963
+ jobStatus: "FAILED"
6760
6964
  });
6761
- console.error("[EVALUATOR-BOOT] Updated eval run status to FAILED");
6965
+ console.error(
6966
+ "[EVALUATOR-BOOT] Updated eval run status to FAILED with error details"
6967
+ );
6762
6968
  } catch (updateErr) {
6763
6969
  console.error(
6764
6970
  "[EVALUATOR-BOOT] Failed to update eval run status:",
6765
- updateErr
6971
+ updateErr instanceof Error ? updateErr.message : String(updateErr)
6766
6972
  );
6973
+ try {
6974
+ const serverUrl = process.env.EVAL_SERVER_URL;
6975
+ const authToken = process.env.EVAL_AUTH_TOKEN;
6976
+ const routeHeader = process.env.EVAL_ROUTE_HEADER;
6977
+ if (serverUrl) {
6978
+ const api = createApiClient(serverUrl, {
6979
+ routeHeader,
6980
+ authToken
6981
+ });
6982
+ await api.updateEvalRun(projectId, evalRunId, {
6983
+ status: import_evalforge_types4.EvalStatus.FAILED,
6984
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
6985
+ jobError: `Config load failed, then: ${jobError}`,
6986
+ jobStatus: "FAILED"
6987
+ });
6988
+ console.error("[EVALUATOR-BOOT] Fallback: Updated status to FAILED");
6989
+ }
6990
+ } catch (fallbackErr) {
6991
+ console.error(
6992
+ "[EVALUATOR-BOOT] Fallback also failed:",
6993
+ fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)
6994
+ );
6995
+ }
6767
6996
  }
6768
6997
  process.exit(1);
6769
6998
  });