@wix/evalforge-evaluator 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/index.ts
4
- import { EvalStatus } from "@wix/evalforge-types";
4
+ import { EvalStatus as EvalStatus2 } from "@wix/evalforge-types";
5
5
 
6
6
  // src/config.ts
7
7
  function loadConfig() {
@@ -6251,45 +6251,94 @@ async function executeWithClaudeCode(skill, scenario, options) {
6251
6251
  if (options.maxTokens !== void 0) {
6252
6252
  queryOptions.maxTokens = options.maxTokens;
6253
6253
  }
6254
- for await (const message of query({
6255
- prompt: scenario.triggerPrompt,
6256
- options: queryOptions
6257
- })) {
6258
- messageCount++;
6259
- console.log("[SDK Message]", JSON.stringify(message, null, 2));
6260
- allMessages.push(message);
6261
- if (messageCount <= 3) {
6262
- console.error(
6263
- "[DEBUG-H5] SDK message received",
6264
- JSON.stringify({
6265
- messageCount,
6266
- type: message.type,
6267
- timestamp: Date.now()
6268
- })
6269
- );
6254
+ try {
6255
+ for await (const message of query({
6256
+ prompt: scenario.triggerPrompt,
6257
+ options: queryOptions
6258
+ })) {
6259
+ messageCount++;
6260
+ console.log("[SDK Message]", JSON.stringify(message, null, 2));
6261
+ allMessages.push(message);
6262
+ if (messageCount <= 3) {
6263
+ console.error(
6264
+ "[DEBUG-H5] SDK message received",
6265
+ JSON.stringify({
6266
+ messageCount,
6267
+ type: message.type,
6268
+ timestamp: Date.now()
6269
+ })
6270
+ );
6271
+ }
6272
+ if (traceContext && isAssistantMessage(message)) {
6273
+ traceStepNumber++;
6274
+ const traceEvent = createTraceEventFromMessage(
6275
+ message,
6276
+ traceContext,
6277
+ traceStepNumber,
6278
+ false
6279
+ // Not complete yet
6280
+ );
6281
+ emitTraceEvent(
6282
+ traceEvent,
6283
+ traceContext.tracePushUrl,
6284
+ traceContext.routeHeader,
6285
+ traceContext.authToken
6286
+ );
6287
+ }
6270
6288
  }
6271
- if (traceContext && isAssistantMessage(message)) {
6272
- traceStepNumber++;
6273
- const traceEvent = createTraceEventFromMessage(
6274
- message,
6275
- traceContext,
6276
- traceStepNumber,
6277
- false
6278
- // Not complete yet
6279
- );
6280
- emitTraceEvent(
6281
- traceEvent,
6282
- traceContext.tracePushUrl,
6283
- traceContext.routeHeader,
6284
- traceContext.authToken
6285
- );
6289
+ console.log(
6290
+ "[executeWithClaudeCode] Claude Agent SDK query completed, received",
6291
+ allMessages.length,
6292
+ "messages"
6293
+ );
6294
+ } catch (sdkError) {
6295
+ const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
6296
+ const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
6297
+ console.error("[executeWithClaudeCode] Claude SDK execution FAILED");
6298
+ console.error("[executeWithClaudeCode] Error message:", errorMessage);
6299
+ if (errorStack) {
6300
+ console.error("[executeWithClaudeCode] Stack trace:", errorStack);
6301
+ }
6302
+ if (sdkError && typeof sdkError === "object") {
6303
+ const errObj = sdkError;
6304
+ const extraInfo = {};
6305
+ for (const key of [
6306
+ "code",
6307
+ "status",
6308
+ "stderr",
6309
+ "stdout",
6310
+ "exitCode",
6311
+ "signal",
6312
+ "cause"
6313
+ ]) {
6314
+ if (key in errObj && errObj[key] !== void 0) {
6315
+ extraInfo[key] = errObj[key];
6316
+ }
6317
+ }
6318
+ if (Object.keys(extraInfo).length > 0) {
6319
+ console.error(
6320
+ "[executeWithClaudeCode] Additional error info:",
6321
+ JSON.stringify(extraInfo)
6322
+ );
6323
+ }
6286
6324
  }
6325
+ console.error(
6326
+ "[executeWithClaudeCode] Context:",
6327
+ JSON.stringify({
6328
+ skillId: skill.id,
6329
+ skillName: skill.name,
6330
+ scenarioId: scenario.id,
6331
+ scenarioName: scenario.name,
6332
+ messagesReceived: messageCount,
6333
+ cwd: options.cwd,
6334
+ model: options.model || DEFAULT_MODEL
6335
+ })
6336
+ );
6337
+ throw new Error(
6338
+ `Claude SDK execution failed after ${messageCount} messages: ${errorMessage}` + (errorStack ? `
6339
+ Stack: ${errorStack.split("\n").slice(0, 3).join("\n")}` : "")
6340
+ );
6287
6341
  }
6288
- console.log(
6289
- "[executeWithClaudeCode] Claude Agent SDK query completed, received",
6290
- allMessages.length,
6291
- "messages"
6292
- );
6293
6342
  if (traceContext) {
6294
6343
  emitTraceEvent(
6295
6344
  {
@@ -6603,6 +6652,67 @@ async function runScenario(config, evalRunId2, scenario, target, template) {
6603
6652
  };
6604
6653
  }
6605
6654
 
6655
+ // src/error-reporter.ts
6656
+ import { EvalStatus } from "@wix/evalforge-types";
6657
+ function formatError(error, phase, context) {
6658
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString();
6659
+ if (error instanceof Error) {
6660
+ return {
6661
+ message: error.message,
6662
+ stack: error.stack,
6663
+ errorType: error.constructor.name,
6664
+ phase,
6665
+ context,
6666
+ timestamp
6667
+ };
6668
+ }
6669
+ return {
6670
+ message: String(error),
6671
+ errorType: typeof error,
6672
+ phase,
6673
+ context,
6674
+ timestamp
6675
+ };
6676
+ }
6677
+ function formatErrorForJobError(details) {
6678
+ const parts = [];
6679
+ if (details.phase) {
6680
+ parts.push(`[Phase: ${details.phase}]`);
6681
+ }
6682
+ if (details.errorType && details.errorType !== "Error") {
6683
+ parts.push(`${details.errorType}: ${details.message}`);
6684
+ } else {
6685
+ parts.push(details.message);
6686
+ }
6687
+ if (details.context && Object.keys(details.context).length > 0) {
6688
+ parts.push(`
6689
+ Context: ${JSON.stringify(details.context)}`);
6690
+ }
6691
+ if (details.stack) {
6692
+ const stackLines = details.stack.split("\n").slice(0, 6);
6693
+ parts.push(`
6694
+ Stack:
6695
+ ${stackLines.join("\n")}`);
6696
+ }
6697
+ return parts.join(" ");
6698
+ }
6699
+ var ExecutionPhase = {
6700
+ CONFIG: "config-loading",
6701
+ API_CLIENT: "api-client-creation",
6702
+ FETCH_EVAL_RUN: "fetch-eval-run",
6703
+ FETCH_SKILLS: "fetch-skills",
6704
+ FETCH_AGENT: "fetch-agent",
6705
+ FETCH_SCENARIOS: "fetch-scenarios",
6706
+ VALIDATION: "validation",
6707
+ PREPARE_WORKSPACE: "prepare-workspace",
6708
+ EXECUTE_SKILL: "execute-skill",
6709
+ EXECUTE_AGENT: "execute-agent",
6710
+ CLAUDE_SDK_IMPORT: "claude-sdk-import",
6711
+ CLAUDE_SDK_EXECUTION: "claude-sdk-execution",
6712
+ ADD_RESULT: "add-result",
6713
+ UPDATE_STATUS: "update-status"
6714
+ };
6715
+
6606
6716
  // src/index.ts
6607
6717
  console.error(
6608
6718
  "[EVALUATOR-BOOT] Module loading started",
@@ -6610,13 +6720,22 @@ console.error(
6610
6720
  );
6611
6721
  console.error("[EVALUATOR-BOOT] All static imports successful");
6612
6722
  async function runEvaluation(projectId2, evalRunId2) {
6723
+ const state = {
6724
+ config: null,
6725
+ api: null,
6726
+ currentPhase: ExecutionPhase.CONFIG,
6727
+ currentContext: { projectId: projectId2, evalRunId: evalRunId2 }
6728
+ };
6613
6729
  console.error(
6614
6730
  "[DEBUG-H1] runEvaluation entry",
6615
6731
  JSON.stringify({ projectId: projectId2, evalRunId: evalRunId2, timestamp: Date.now() })
6616
6732
  );
6733
+ state.currentPhase = ExecutionPhase.CONFIG;
6734
+ state.currentContext = { projectId: projectId2, evalRunId: evalRunId2 };
6617
6735
  let config;
6618
6736
  try {
6619
6737
  config = loadConfig();
6738
+ state.config = config;
6620
6739
  console.error(
6621
6740
  "[DEBUG-H1] loadConfig SUCCESS",
6622
6741
  JSON.stringify({
@@ -6632,10 +6751,13 @@ async function runEvaluation(projectId2, evalRunId2) {
6632
6751
  "[DEBUG-H1] loadConfig FAILED",
6633
6752
  JSON.stringify({
6634
6753
  error: configErr instanceof Error ? configErr.message : String(configErr),
6754
+ stack: configErr instanceof Error ? configErr.stack : void 0,
6635
6755
  timestamp: Date.now()
6636
6756
  })
6637
6757
  );
6638
- throw configErr;
6758
+ throw new Error(
6759
+ `[${ExecutionPhase.CONFIG}] ${configErr instanceof Error ? configErr.message : String(configErr)}`
6760
+ );
6639
6761
  }
6640
6762
  console.log("[Evaluator] Config loaded", {
6641
6763
  serverUrl: config.serverUrl,
@@ -6644,11 +6766,22 @@ async function runEvaluation(projectId2, evalRunId2) {
6644
6766
  hasAiGatewayHeaders: Object.keys(config.aiGatewayHeaders).length > 0,
6645
6767
  hasRouteHeader: !!config.routeHeader
6646
6768
  });
6647
- const api = createApiClient(config.serverUrl, {
6648
- apiPrefix: config.apiPrefix,
6649
- routeHeader: config.routeHeader,
6650
- authToken: config.authToken
6651
- });
6769
+ state.currentPhase = ExecutionPhase.API_CLIENT;
6770
+ let api;
6771
+ try {
6772
+ api = createApiClient(config.serverUrl, {
6773
+ apiPrefix: config.apiPrefix,
6774
+ routeHeader: config.routeHeader,
6775
+ authToken: config.authToken
6776
+ });
6777
+ state.api = api;
6778
+ } catch (apiErr) {
6779
+ throw new Error(
6780
+ `[${ExecutionPhase.API_CLIENT}] Failed to create API client: ${apiErr instanceof Error ? apiErr.message : String(apiErr)}`
6781
+ );
6782
+ }
6783
+ state.currentPhase = ExecutionPhase.FETCH_EVAL_RUN;
6784
+ state.currentContext = { projectId: projectId2, evalRunId: evalRunId2, serverUrl: config.serverUrl };
6652
6785
  console.error(
6653
6786
  "[DEBUG-H2] fetchEvaluationData START",
6654
6787
  JSON.stringify({ serverUrl: config.serverUrl, timestamp: Date.now() })
@@ -6667,32 +6800,61 @@ async function runEvaluation(projectId2, evalRunId2) {
6667
6800
  })
6668
6801
  );
6669
6802
  } catch (fetchErr) {
6803
+ const errorMsg = fetchErr instanceof Error ? fetchErr.message : String(fetchErr);
6670
6804
  console.error(
6671
6805
  "[DEBUG-H2] fetchEvaluationData FAILED",
6672
6806
  JSON.stringify({
6673
- error: fetchErr instanceof Error ? fetchErr.message : String(fetchErr),
6807
+ error: errorMsg,
6808
+ stack: fetchErr instanceof Error ? fetchErr.stack : void 0,
6674
6809
  timestamp: Date.now()
6675
6810
  })
6676
6811
  );
6677
- throw fetchErr;
6812
+ throw new Error(
6813
+ `[${ExecutionPhase.FETCH_EVAL_RUN}] Failed to fetch evaluation data: ${errorMsg}`
6814
+ );
6678
6815
  }
6679
6816
  const { codeAgent, skills, scenarioItems } = evalData;
6817
+ state.currentPhase = ExecutionPhase.VALIDATION;
6818
+ state.currentContext = {
6819
+ projectId: projectId2,
6820
+ evalRunId: evalRunId2,
6821
+ scenarioCount: scenarioItems.length,
6822
+ skillCount: skills.length,
6823
+ hasAgent: !!codeAgent,
6824
+ agentId: evalData.evalRun.agentId,
6825
+ skillsGroupId: evalData.evalRun.skillsGroupId
6826
+ };
6680
6827
  if (scenarioItems.length > 0 && skills.length === 0) {
6681
6828
  throw new Error(
6682
- "Eval run has no skills: set skillsGroupId and ensure the group has skills"
6829
+ `[${ExecutionPhase.VALIDATION}] Eval run has no skills: set skillsGroupId and ensure the group has skills. (skillsGroupId: ${evalData.evalRun.skillsGroupId || "not set"})`
6683
6830
  );
6684
6831
  }
6685
6832
  if (scenarioItems.length > 0 && skills.length > 0 && !codeAgent) {
6686
6833
  throw new Error(
6687
- "Eval run has no code agent: set agentId for skill-based runs"
6834
+ `[${ExecutionPhase.VALIDATION}] Eval run has no code agent: set agentId for skill-based runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
6688
6835
  );
6689
6836
  }
6837
+ let completedScenarios = 0;
6838
+ const totalScenarios = scenarioItems.length * skills.length;
6690
6839
  for (const { scenario, template } of scenarioItems) {
6691
6840
  for (const skill of skills) {
6841
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
6842
+ state.currentContext = {
6843
+ projectId: projectId2,
6844
+ evalRunId: evalRunId2,
6845
+ scenarioId: scenario.id,
6846
+ scenarioName: scenario.name,
6847
+ skillId: skill.id,
6848
+ skillName: skill.name,
6849
+ agentId: codeAgent?.id,
6850
+ agentName: codeAgent?.name,
6851
+ progress: `${completedScenarios + 1}/${totalScenarios}`
6852
+ };
6692
6853
  console.log(
6693
6854
  "[Evaluator] Running skill:",
6694
6855
  skill.name,
6695
- codeAgent ? `with agent: ${codeAgent.name}` : ""
6856
+ codeAgent ? `with agent: ${codeAgent.name}` : "",
6857
+ `(${completedScenarios + 1}/${totalScenarios})`
6696
6858
  );
6697
6859
  try {
6698
6860
  const result = await runScenario(
@@ -6703,17 +6865,48 @@ async function runEvaluation(projectId2, evalRunId2) {
6703
6865
  template
6704
6866
  );
6705
6867
  console.log("[Evaluator] Skill completed, adding result");
6868
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
6869
+ state.currentContext = {
6870
+ ...state.currentContext,
6871
+ resultId: result.id
6872
+ };
6706
6873
  await api.addResult(projectId2, evalRunId2, result);
6874
+ completedScenarios++;
6707
6875
  } catch (err) {
6708
- console.error("[Evaluator] Failed to run skill:", skill.name, err);
6709
- throw err;
6876
+ const errorMsg = err instanceof Error ? err.message : String(err);
6877
+ const errorStack = err instanceof Error ? err.stack : void 0;
6878
+ console.error(
6879
+ "[Evaluator] Failed to run skill:",
6880
+ skill.name,
6881
+ "Error:",
6882
+ errorMsg
6883
+ );
6884
+ if (errorStack) {
6885
+ console.error("[Evaluator] Stack trace:", errorStack);
6886
+ }
6887
+ throw new Error(
6888
+ `[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
6889
+ );
6710
6890
  }
6711
6891
  }
6712
6892
  }
6713
- await api.updateEvalRun(projectId2, evalRunId2, {
6714
- status: EvalStatus.COMPLETED,
6715
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
6716
- });
6893
+ state.currentPhase = ExecutionPhase.UPDATE_STATUS;
6894
+ state.currentContext = {
6895
+ projectId: projectId2,
6896
+ evalRunId: evalRunId2,
6897
+ completedScenarios,
6898
+ totalScenarios
6899
+ };
6900
+ try {
6901
+ await api.updateEvalRun(projectId2, evalRunId2, {
6902
+ status: EvalStatus2.COMPLETED,
6903
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
6904
+ });
6905
+ } catch (updateErr) {
6906
+ throw new Error(
6907
+ `[${ExecutionPhase.UPDATE_STATUS}] Failed to update eval run status to COMPLETED: ${updateErr instanceof Error ? updateErr.message : String(updateErr)}`
6908
+ );
6909
+ }
6717
6910
  }
6718
6911
  var projectId = process.argv[2];
6719
6912
  var evalRunId = process.argv[3];
@@ -6729,7 +6922,16 @@ runEvaluation(projectId, evalRunId).then(() => {
6729
6922
  console.error("[EVALUATOR-BOOT] runEvaluation completed successfully");
6730
6923
  process.exit(0);
6731
6924
  }).catch(async (err) => {
6732
- console.error("[EVALUATOR-BOOT] runEvaluation FAILED:", err);
6925
+ const errorDetails = formatError(err, "main-execution", {
6926
+ projectId,
6927
+ evalRunId
6928
+ });
6929
+ const jobError = formatErrorForJobError(errorDetails);
6930
+ console.error("[EVALUATOR-BOOT] runEvaluation FAILED");
6931
+ console.error(
6932
+ "[EVALUATOR-BOOT] Error details:",
6933
+ JSON.stringify(errorDetails, null, 2)
6934
+ );
6733
6935
  try {
6734
6936
  const config = loadConfig();
6735
6937
  const api = createApiClient(config.serverUrl, {
@@ -6738,15 +6940,42 @@ runEvaluation(projectId, evalRunId).then(() => {
6738
6940
  authToken: config.authToken
6739
6941
  });
6740
6942
  await api.updateEvalRun(projectId, evalRunId, {
6741
- status: EvalStatus.FAILED,
6742
- completedAt: (/* @__PURE__ */ new Date()).toISOString()
6943
+ status: EvalStatus2.FAILED,
6944
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
6945
+ jobError,
6946
+ jobStatus: "FAILED"
6743
6947
  });
6744
- console.error("[EVALUATOR-BOOT] Updated eval run status to FAILED");
6948
+ console.error(
6949
+ "[EVALUATOR-BOOT] Updated eval run status to FAILED with error details"
6950
+ );
6745
6951
  } catch (updateErr) {
6746
6952
  console.error(
6747
6953
  "[EVALUATOR-BOOT] Failed to update eval run status:",
6748
- updateErr
6954
+ updateErr instanceof Error ? updateErr.message : String(updateErr)
6749
6955
  );
6956
+ try {
6957
+ const serverUrl = process.env.EVAL_SERVER_URL;
6958
+ const authToken = process.env.EVAL_AUTH_TOKEN;
6959
+ const routeHeader = process.env.EVAL_ROUTE_HEADER;
6960
+ if (serverUrl) {
6961
+ const api = createApiClient(serverUrl, {
6962
+ routeHeader,
6963
+ authToken
6964
+ });
6965
+ await api.updateEvalRun(projectId, evalRunId, {
6966
+ status: EvalStatus2.FAILED,
6967
+ completedAt: (/* @__PURE__ */ new Date()).toISOString(),
6968
+ jobError: `Config load failed, then: ${jobError}`,
6969
+ jobStatus: "FAILED"
6970
+ });
6971
+ console.error("[EVALUATOR-BOOT] Fallback: Updated status to FAILED");
6972
+ }
6973
+ } catch (fallbackErr) {
6974
+ console.error(
6975
+ "[EVALUATOR-BOOT] Fallback also failed:",
6976
+ fallbackErr instanceof Error ? fallbackErr.message : String(fallbackErr)
6977
+ );
6978
+ }
6750
6979
  }
6751
6980
  process.exit(1);
6752
6981
  });