@wix/evalforge-evaluator 0.115.0 → 0.117.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1199,6 +1199,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1199
1199
  outputPreview: `Message type: ${message.type}`
1200
1200
  };
1201
1201
  }
1202
+ async function prepareClaudeCodeEnvironment(cwd, skills, options) {
1203
+ const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1204
+ const claudeDir = `${cwd}/.claude`;
1205
+ await mkdirAsync(claudeDir, { recursive: true });
1206
+ await writeFile6(`${claudeDir}/settings.json`, "{}", {
1207
+ flag: "wx"
1208
+ }).catch(() => {
1209
+ });
1210
+ if (options.mcps && options.mcps.length > 0) {
1211
+ await writeMcpToFilesystem(cwd, options.mcps);
1212
+ }
1213
+ if (options.subAgents && options.subAgents.length > 0) {
1214
+ await writeSubAgentsToFilesystem(cwd, options.subAgents);
1215
+ }
1216
+ if (options.rules && options.rules.length > 0) {
1217
+ await writeRulesToFilesystem(cwd, options.rules);
1218
+ }
1219
+ try {
1220
+ await writeSkillsToFilesystem(cwd, skills);
1221
+ } catch (writeError) {
1222
+ throw new Error(
1223
+ `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
1224
+ );
1225
+ }
1226
+ }
1202
1227
  async function executeWithClaudeCode(skills, scenario, options) {
1203
1228
  const skillNames = skills.map((s) => s.name).join(", ");
1204
1229
  console.log("[executeWithClaudeCode] Starting execution", {
@@ -1222,29 +1247,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
1222
1247
  }
1223
1248
  const startTime = /* @__PURE__ */ new Date();
1224
1249
  const allMessages = [];
1225
- const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1226
- const claudeDir = `${options.cwd}/.claude`;
1227
- await mkdirAsync(claudeDir, { recursive: true });
1228
- await writeFile6(`${claudeDir}/settings.json`, "{}", {
1229
- flag: "wx"
1230
- }).catch(() => {
1231
- });
1232
- if (options.mcps && options.mcps.length > 0) {
1233
- await writeMcpToFilesystem(options.cwd, options.mcps);
1234
- }
1235
- if (options.subAgents && options.subAgents.length > 0) {
1236
- await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
1237
- }
1238
- if (options.rules && options.rules.length > 0) {
1239
- await writeRulesToFilesystem(options.cwd, options.rules);
1240
- }
1241
- try {
1242
- await writeSkillsToFilesystem(options.cwd, skills);
1243
- } catch (writeError) {
1244
- throw new Error(
1245
- `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
1246
- );
1247
- }
1248
1250
  const sdkEnv = buildSdkEnvironment(options);
1249
1251
  let traceStepNumber = 0;
1250
1252
  const traceContext = options.traceContext;
@@ -2086,6 +2088,17 @@ var ClaudeCodeAdapter = class {
2086
2088
  id = "claude-code";
2087
2089
  name = "Claude Code";
2088
2090
  supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
2091
+ /**
2092
+ * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
2093
+ * before the baseline snapshot is taken.
2094
+ */
2095
+ async prepareEnvironment(context) {
2096
+ await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
2097
+ mcps: context.mcps,
2098
+ subAgents: context.subAgents,
2099
+ rules: context.rules
2100
+ });
2101
+ }
2089
2102
  /**
2090
2103
  * Execute a skill using the Claude Code SDK.
2091
2104
  *
@@ -2736,37 +2749,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2736
2749
  return null;
2737
2750
  }
2738
2751
  }
2739
- async function executeWithOpenCode(skills, scenario, options) {
2740
- const skillNames = skills.map((s) => s.name).join(", ");
2741
- console.log("[executeWithOpenCode] Starting execution", {
2742
- skillCount: skills.length,
2743
- skillNames,
2744
- scenarioId: scenario.id,
2745
- scenarioName: scenario.name,
2746
- cwd: options.cwd,
2747
- aiGatewayUrl: options.aiGatewayUrl,
2748
- hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2749
- model: options.model
2750
- });
2751
- const startTime = /* @__PURE__ */ new Date();
2752
+ async function prepareOpenCodeEnvironment(cwd, skills, options) {
2752
2753
  if (options.mcps && options.mcps.length > 0) {
2753
2754
  console.log(
2754
2755
  `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
2755
2756
  );
2756
2757
  }
2757
2758
  if (options.subAgents && options.subAgents.length > 0) {
2758
- await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
2759
+ await writeSubAgentsToFilesystem2(cwd, options.subAgents);
2759
2760
  }
2760
2761
  if (options.rules && options.rules.length > 0) {
2761
- await writeRulesToFilesystem(options.cwd, options.rules);
2762
+ await writeRulesToFilesystem(cwd, options.rules);
2762
2763
  }
2763
2764
  try {
2764
- await writeSkillsToFilesystem2(options.cwd, skills);
2765
+ await writeSkillsToFilesystem2(cwd, skills);
2765
2766
  } catch (writeError) {
2766
2767
  throw new Error(
2767
2768
  `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
2768
2769
  );
2769
2770
  }
2771
+ }
2772
+ async function executeWithOpenCode(skills, scenario, options) {
2773
+ const skillNames = skills.map((s) => s.name).join(", ");
2774
+ console.log("[executeWithOpenCode] Starting execution", {
2775
+ skillCount: skills.length,
2776
+ skillNames,
2777
+ scenarioId: scenario.id,
2778
+ scenarioName: scenario.name,
2779
+ cwd: options.cwd,
2780
+ aiGatewayUrl: options.aiGatewayUrl,
2781
+ hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2782
+ model: options.model
2783
+ });
2784
+ const startTime = /* @__PURE__ */ new Date();
2770
2785
  const maxTurns = options.maxTurns ?? 10;
2771
2786
  const { config, providerID, modelID } = await buildOpenCodeConfig({
2772
2787
  model: options.model,
@@ -3097,9 +3112,24 @@ async function executeWithOpenCode(skills, scenario, options) {
3097
3112
  const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
3098
3113
  const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
3099
3114
  const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
3115
+ const causeDetails = [];
3116
+ let current = sdkError;
3117
+ while (current instanceof Error && current.cause) {
3118
+ current = current.cause;
3119
+ if (current instanceof Error) {
3120
+ causeDetails.push(`${current.name}: ${current.message}`);
3121
+ } else {
3122
+ causeDetails.push(String(current));
3123
+ }
3124
+ }
3125
+ const causeChain = causeDetails.length > 0 ? `
3126
+ Cause chain: ${causeDetails.join(" -> ")}` : "";
3100
3127
  console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
3101
3128
  console.error("[SDK-ERROR] Error name:", errorName);
3102
3129
  console.error("[SDK-ERROR] Error message:", errorMessage);
3130
+ if (causeDetails.length > 0) {
3131
+ console.error("[SDK-ERROR] Cause chain:", causeDetails.join(" -> "));
3132
+ }
3103
3133
  if (errorStack) {
3104
3134
  console.error("[SDK-ERROR] Stack:", errorStack);
3105
3135
  }
@@ -3116,7 +3146,10 @@ async function executeWithOpenCode(skills, scenario, options) {
3116
3146
  outputPreview: JSON.stringify({
3117
3147
  event: "sdk-execution-failed",
3118
3148
  error: errorMessage,
3119
- errorName
3149
+ errorName,
3150
+ ...causeDetails.length > 0 && {
3151
+ causeChain: causeDetails.join(" -> ")
3152
+ }
3120
3153
  }).slice(0, 2e3),
3121
3154
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3122
3155
  isComplete: true
@@ -3127,7 +3160,7 @@ async function executeWithOpenCode(skills, scenario, options) {
3127
3160
  );
3128
3161
  }
3129
3162
  throw new Error(
3130
- `OpenCode SDK execution failed: ${errorMessage}` + (errorStack ? `
3163
+ `OpenCode SDK execution failed: ${errorMessage}` + causeChain + (errorStack ? `
3131
3164
  Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
3132
3165
  );
3133
3166
  } finally {
@@ -3146,6 +3179,13 @@ var OpenCodeAdapter = class {
3146
3179
  id = "opencode";
3147
3180
  name = "OpenCode";
3148
3181
  supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
3182
+ async prepareEnvironment(context) {
3183
+ await prepareOpenCodeEnvironment(context.cwd, context.skills, {
3184
+ mcps: context.mcps,
3185
+ subAgents: context.subAgents,
3186
+ rules: context.rules
3187
+ });
3188
+ }
3149
3189
  async execute(context) {
3150
3190
  const {
3151
3191
  skills,
@@ -4273,6 +4313,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
4273
4313
  };
4274
4314
 
4275
4315
  // src/run-scenario/file-diff.ts
4316
+ function deriveInfrastructurePaths(prePrep, postPrep) {
4317
+ const infraPaths = /* @__PURE__ */ new Set();
4318
+ for (const path2 of Object.keys(postPrep)) {
4319
+ if (prePrep[path2] === void 0 || prePrep[path2] !== postPrep[path2]) {
4320
+ infraPaths.add(path2);
4321
+ }
4322
+ }
4323
+ return infraPaths;
4324
+ }
4276
4325
  var IGNORED_PATTERNS = [
4277
4326
  "node_modules",
4278
4327
  ".git",
@@ -4376,7 +4425,7 @@ function generateDiffLines(before, after) {
4376
4425
  }
4377
4426
  return result;
4378
4427
  }
4379
- function diffSnapshots(before, after) {
4428
+ function diffSnapshots(before, after, infrastructurePaths) {
4380
4429
  const diffs = [];
4381
4430
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4382
4431
  for (const path2 of allPaths) {
@@ -4390,7 +4439,8 @@ function diffSnapshots(before, after) {
4390
4439
  path: path2,
4391
4440
  expected: beforeContent,
4392
4441
  actual: afterContent,
4393
- diffLines: diffLines2
4442
+ diffLines: diffLines2,
4443
+ ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4394
4444
  });
4395
4445
  }
4396
4446
  const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
@@ -4414,7 +4464,7 @@ function diffSnapshots(before, after) {
4414
4464
  result.sort((a, b) => a.path.localeCompare(b.path));
4415
4465
  return result;
4416
4466
  }
4417
- function extractTemplateFiles(before, after) {
4467
+ function extractTemplateFiles(before, after, infrastructurePaths) {
4418
4468
  const files = [];
4419
4469
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4420
4470
  for (const path2 of allPaths) {
@@ -4434,7 +4484,8 @@ function extractTemplateFiles(before, after) {
4434
4484
  files.push({
4435
4485
  path: path2,
4436
4486
  content: afterContent,
4437
- status
4487
+ status,
4488
+ ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4438
4489
  });
4439
4490
  }
4440
4491
  files.sort((a, b) => a.path.localeCompare(b.path));
@@ -4450,7 +4501,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4450
4501
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
4451
4502
  const adapter = getAdapter(identifier);
4452
4503
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4453
- const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
4454
4504
  const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
4455
4505
  const targetName = evalData.presetName || agent?.name || "";
4456
4506
  const executionContext = {
@@ -4475,11 +4525,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4475
4525
  rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
4476
4526
  systemPrompt: agent?.systemPrompt
4477
4527
  };
4528
+ const hasPrepare = !!adapter.prepareEnvironment;
4529
+ const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
4530
+ if (hasPrepare) {
4531
+ await adapter.prepareEnvironment(executionContext);
4532
+ }
4533
+ const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
4534
+ const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
4478
4535
  const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
4479
4536
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
4480
4537
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
4481
- const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
4482
- const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
4538
+ const fileDiffs = diffSnapshots(
4539
+ beforeSnapshot,
4540
+ afterSnapshot,
4541
+ infrastructurePaths
4542
+ );
4543
+ const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
4483
4544
  return {
4484
4545
  id: (0, import_crypto4.randomUUID)(),
4485
4546
  targetId,