@wix/evalforge-evaluator 0.114.0 → 0.116.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1199,6 +1199,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1199
1199
  outputPreview: `Message type: ${message.type}`
1200
1200
  };
1201
1201
  }
1202
+ async function prepareClaudeCodeEnvironment(cwd, skills, options) {
1203
+ const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1204
+ const claudeDir = `${cwd}/.claude`;
1205
+ await mkdirAsync(claudeDir, { recursive: true });
1206
+ await writeFile6(`${claudeDir}/settings.json`, "{}", {
1207
+ flag: "wx"
1208
+ }).catch(() => {
1209
+ });
1210
+ if (options.mcps && options.mcps.length > 0) {
1211
+ await writeMcpToFilesystem(cwd, options.mcps);
1212
+ }
1213
+ if (options.subAgents && options.subAgents.length > 0) {
1214
+ await writeSubAgentsToFilesystem(cwd, options.subAgents);
1215
+ }
1216
+ if (options.rules && options.rules.length > 0) {
1217
+ await writeRulesToFilesystem(cwd, options.rules);
1218
+ }
1219
+ try {
1220
+ await writeSkillsToFilesystem(cwd, skills);
1221
+ } catch (writeError) {
1222
+ throw new Error(
1223
+ `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
1224
+ );
1225
+ }
1226
+ }
1202
1227
  async function executeWithClaudeCode(skills, scenario, options) {
1203
1228
  const skillNames = skills.map((s) => s.name).join(", ");
1204
1229
  console.log("[executeWithClaudeCode] Starting execution", {
@@ -1222,29 +1247,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
1222
1247
  }
1223
1248
  const startTime = /* @__PURE__ */ new Date();
1224
1249
  const allMessages = [];
1225
- const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1226
- const claudeDir = `${options.cwd}/.claude`;
1227
- await mkdirAsync(claudeDir, { recursive: true });
1228
- await writeFile6(`${claudeDir}/settings.json`, "{}", {
1229
- flag: "wx"
1230
- }).catch(() => {
1231
- });
1232
- if (options.mcps && options.mcps.length > 0) {
1233
- await writeMcpToFilesystem(options.cwd, options.mcps);
1234
- }
1235
- if (options.subAgents && options.subAgents.length > 0) {
1236
- await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
1237
- }
1238
- if (options.rules && options.rules.length > 0) {
1239
- await writeRulesToFilesystem(options.cwd, options.rules);
1240
- }
1241
- try {
1242
- await writeSkillsToFilesystem(options.cwd, skills);
1243
- } catch (writeError) {
1244
- throw new Error(
1245
- `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
1246
- );
1247
- }
1248
1250
  const sdkEnv = buildSdkEnvironment(options);
1249
1251
  let traceStepNumber = 0;
1250
1252
  const traceContext = options.traceContext;
@@ -2086,6 +2088,17 @@ var ClaudeCodeAdapter = class {
2086
2088
  id = "claude-code";
2087
2089
  name = "Claude Code";
2088
2090
  supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
2091
+ /**
2092
+ * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
2093
+ * before the baseline snapshot is taken.
2094
+ */
2095
+ async prepareEnvironment(context) {
2096
+ await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
2097
+ mcps: context.mcps,
2098
+ subAgents: context.subAgents,
2099
+ rules: context.rules
2100
+ });
2101
+ }
2089
2102
  /**
2090
2103
  * Execute a skill using the Claude Code SDK.
2091
2104
  *
@@ -2149,6 +2162,7 @@ defaultRegistry.register(claudeCodeAdapter);
2149
2162
  var import_evalforge_types9 = require("@wix/evalforge-types");
2150
2163
 
2151
2164
  // src/run-scenario/agents/opencode/execute.ts
2165
+ var import_os3 = require("os");
2152
2166
  var import_evalforge_types8 = require("@wix/evalforge-types");
2153
2167
 
2154
2168
  // src/run-scenario/agents/opencode/write-skills.ts
@@ -2651,6 +2665,13 @@ function buildConversation2(messages) {
2651
2665
 
2652
2666
  // src/run-scenario/agents/opencode/execute.ts
2653
2667
  var DEFAULT_MODEL3 = `anthropic/${import_evalforge_types8.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
2668
+ function ensureOpenCodeInPath() {
2669
+ const opencodeBin = `${(0, import_os3.homedir)()}/.opencode/bin`;
2670
+ const currentPath = process.env.PATH || "";
2671
+ if (!currentPath.includes(opencodeBin)) {
2672
+ process.env.PATH = `${opencodeBin}:${currentPath}`;
2673
+ }
2674
+ }
2654
2675
  function extractToolAction(toolName, args) {
2655
2676
  if (!toolName) return "Using tool...";
2656
2677
  const a = args;
@@ -2728,37 +2749,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2728
2749
  return null;
2729
2750
  }
2730
2751
  }
2731
- async function executeWithOpenCode(skills, scenario, options) {
2732
- const skillNames = skills.map((s) => s.name).join(", ");
2733
- console.log("[executeWithOpenCode] Starting execution", {
2734
- skillCount: skills.length,
2735
- skillNames,
2736
- scenarioId: scenario.id,
2737
- scenarioName: scenario.name,
2738
- cwd: options.cwd,
2739
- aiGatewayUrl: options.aiGatewayUrl,
2740
- hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2741
- model: options.model
2742
- });
2743
- const startTime = /* @__PURE__ */ new Date();
2752
+ async function prepareOpenCodeEnvironment(cwd, skills, options) {
2744
2753
  if (options.mcps && options.mcps.length > 0) {
2745
2754
  console.log(
2746
2755
  `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
2747
2756
  );
2748
2757
  }
2749
2758
  if (options.subAgents && options.subAgents.length > 0) {
2750
- await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
2759
+ await writeSubAgentsToFilesystem2(cwd, options.subAgents);
2751
2760
  }
2752
2761
  if (options.rules && options.rules.length > 0) {
2753
- await writeRulesToFilesystem(options.cwd, options.rules);
2762
+ await writeRulesToFilesystem(cwd, options.rules);
2754
2763
  }
2755
2764
  try {
2756
- await writeSkillsToFilesystem2(options.cwd, skills);
2765
+ await writeSkillsToFilesystem2(cwd, skills);
2757
2766
  } catch (writeError) {
2758
2767
  throw new Error(
2759
2768
  `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
2760
2769
  );
2761
2770
  }
2771
+ }
2772
+ async function executeWithOpenCode(skills, scenario, options) {
2773
+ const skillNames = skills.map((s) => s.name).join(", ");
2774
+ console.log("[executeWithOpenCode] Starting execution", {
2775
+ skillCount: skills.length,
2776
+ skillNames,
2777
+ scenarioId: scenario.id,
2778
+ scenarioName: scenario.name,
2779
+ cwd: options.cwd,
2780
+ aiGatewayUrl: options.aiGatewayUrl,
2781
+ hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2782
+ model: options.model
2783
+ });
2784
+ const startTime = /* @__PURE__ */ new Date();
2762
2785
  const maxTurns = options.maxTurns ?? 10;
2763
2786
  const { config, providerID, modelID } = await buildOpenCodeConfig({
2764
2787
  model: options.model,
@@ -2806,6 +2829,7 @@ async function executeWithOpenCode(skills, scenario, options) {
2806
2829
  }
2807
2830
  let server;
2808
2831
  try {
2832
+ ensureOpenCodeInPath();
2809
2833
  console.log("[SDK-DEBUG] Starting OpenCode server...");
2810
2834
  server = await createOpencodeServer({
2811
2835
  config,
@@ -3137,6 +3161,13 @@ var OpenCodeAdapter = class {
3137
3161
  id = "opencode";
3138
3162
  name = "OpenCode";
3139
3163
  supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
3164
+ async prepareEnvironment(context) {
3165
+ await prepareOpenCodeEnvironment(context.cwd, context.skills, {
3166
+ mcps: context.mcps,
3167
+ subAgents: context.subAgents,
3168
+ rules: context.rules
3169
+ });
3170
+ }
3140
3171
  async execute(context) {
3141
3172
  const {
3142
3173
  skills,
@@ -4264,6 +4295,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
4264
4295
  };
4265
4296
 
4266
4297
  // src/run-scenario/file-diff.ts
4298
+ function deriveInfrastructurePaths(prePrep, postPrep) {
4299
+ const infraPaths = /* @__PURE__ */ new Set();
4300
+ for (const path2 of Object.keys(postPrep)) {
4301
+ if (prePrep[path2] === void 0 || prePrep[path2] !== postPrep[path2]) {
4302
+ infraPaths.add(path2);
4303
+ }
4304
+ }
4305
+ return infraPaths;
4306
+ }
4267
4307
  var IGNORED_PATTERNS = [
4268
4308
  "node_modules",
4269
4309
  ".git",
@@ -4367,7 +4407,7 @@ function generateDiffLines(before, after) {
4367
4407
  }
4368
4408
  return result;
4369
4409
  }
4370
- function diffSnapshots(before, after) {
4410
+ function diffSnapshots(before, after, infrastructurePaths) {
4371
4411
  const diffs = [];
4372
4412
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4373
4413
  for (const path2 of allPaths) {
@@ -4381,7 +4421,8 @@ function diffSnapshots(before, after) {
4381
4421
  path: path2,
4382
4422
  expected: beforeContent,
4383
4423
  actual: afterContent,
4384
- diffLines: diffLines2
4424
+ diffLines: diffLines2,
4425
+ ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4385
4426
  });
4386
4427
  }
4387
4428
  const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
@@ -4405,7 +4446,7 @@ function diffSnapshots(before, after) {
4405
4446
  result.sort((a, b) => a.path.localeCompare(b.path));
4406
4447
  return result;
4407
4448
  }
4408
- function extractTemplateFiles(before, after) {
4449
+ function extractTemplateFiles(before, after, infrastructurePaths) {
4409
4450
  const files = [];
4410
4451
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4411
4452
  for (const path2 of allPaths) {
@@ -4425,7 +4466,8 @@ function extractTemplateFiles(before, after) {
4425
4466
  files.push({
4426
4467
  path: path2,
4427
4468
  content: afterContent,
4428
- status
4469
+ status,
4470
+ ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4429
4471
  });
4430
4472
  }
4431
4473
  files.sort((a, b) => a.path.localeCompare(b.path));
@@ -4441,7 +4483,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4441
4483
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
4442
4484
  const adapter = getAdapter(identifier);
4443
4485
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4444
- const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
4445
4486
  const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
4446
4487
  const targetName = evalData.presetName || agent?.name || "";
4447
4488
  const executionContext = {
@@ -4466,11 +4507,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4466
4507
  rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
4467
4508
  systemPrompt: agent?.systemPrompt
4468
4509
  };
4510
+ const hasPrepare = !!adapter.prepareEnvironment;
4511
+ const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
4512
+ if (hasPrepare) {
4513
+ await adapter.prepareEnvironment(executionContext);
4514
+ }
4515
+ const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
4516
+ const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
4469
4517
  const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
4470
4518
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
4471
4519
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
4472
- const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
4473
- const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
4520
+ const fileDiffs = diffSnapshots(
4521
+ beforeSnapshot,
4522
+ afterSnapshot,
4523
+ infrastructurePaths
4524
+ );
4525
+ const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
4474
4526
  return {
4475
4527
  id: (0, import_crypto4.randomUUID)(),
4476
4528
  targetId,