@wix/evalforge-evaluator 0.114.0 → 0.116.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -1191,6 +1191,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1191
1191
  outputPreview: `Message type: ${message.type}`
1192
1192
  };
1193
1193
  }
1194
+ async function prepareClaudeCodeEnvironment(cwd, skills, options) {
1195
+ const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1196
+ const claudeDir = `${cwd}/.claude`;
1197
+ await mkdirAsync(claudeDir, { recursive: true });
1198
+ await writeFile6(`${claudeDir}/settings.json`, "{}", {
1199
+ flag: "wx"
1200
+ }).catch(() => {
1201
+ });
1202
+ if (options.mcps && options.mcps.length > 0) {
1203
+ await writeMcpToFilesystem(cwd, options.mcps);
1204
+ }
1205
+ if (options.subAgents && options.subAgents.length > 0) {
1206
+ await writeSubAgentsToFilesystem(cwd, options.subAgents);
1207
+ }
1208
+ if (options.rules && options.rules.length > 0) {
1209
+ await writeRulesToFilesystem(cwd, options.rules);
1210
+ }
1211
+ try {
1212
+ await writeSkillsToFilesystem(cwd, skills);
1213
+ } catch (writeError) {
1214
+ throw new Error(
1215
+ `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
1216
+ );
1217
+ }
1218
+ }
1194
1219
  async function executeWithClaudeCode(skills, scenario, options) {
1195
1220
  const skillNames = skills.map((s) => s.name).join(", ");
1196
1221
  console.log("[executeWithClaudeCode] Starting execution", {
@@ -1214,29 +1239,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
1214
1239
  }
1215
1240
  const startTime = /* @__PURE__ */ new Date();
1216
1241
  const allMessages = [];
1217
- const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1218
- const claudeDir = `${options.cwd}/.claude`;
1219
- await mkdirAsync(claudeDir, { recursive: true });
1220
- await writeFile6(`${claudeDir}/settings.json`, "{}", {
1221
- flag: "wx"
1222
- }).catch(() => {
1223
- });
1224
- if (options.mcps && options.mcps.length > 0) {
1225
- await writeMcpToFilesystem(options.cwd, options.mcps);
1226
- }
1227
- if (options.subAgents && options.subAgents.length > 0) {
1228
- await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
1229
- }
1230
- if (options.rules && options.rules.length > 0) {
1231
- await writeRulesToFilesystem(options.cwd, options.rules);
1232
- }
1233
- try {
1234
- await writeSkillsToFilesystem(options.cwd, skills);
1235
- } catch (writeError) {
1236
- throw new Error(
1237
- `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
1238
- );
1239
- }
1240
1242
  const sdkEnv = buildSdkEnvironment(options);
1241
1243
  let traceStepNumber = 0;
1242
1244
  const traceContext = options.traceContext;
@@ -2078,6 +2080,17 @@ var ClaudeCodeAdapter = class {
2078
2080
  id = "claude-code";
2079
2081
  name = "Claude Code";
2080
2082
  supportedCommands = [AgentRunCommand.CLAUDE];
2083
+ /**
2084
+ * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
2085
+ * before the baseline snapshot is taken.
2086
+ */
2087
+ async prepareEnvironment(context) {
2088
+ await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
2089
+ mcps: context.mcps,
2090
+ subAgents: context.subAgents,
2091
+ rules: context.rules
2092
+ });
2093
+ }
2081
2094
  /**
2082
2095
  * Execute a skill using the Claude Code SDK.
2083
2096
  *
@@ -2141,6 +2154,7 @@ defaultRegistry.register(claudeCodeAdapter);
2141
2154
  import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
2142
2155
 
2143
2156
  // src/run-scenario/agents/opencode/execute.ts
2157
+ import { homedir as homedir2 } from "os";
2144
2158
  import {
2145
2159
  ClaudeModel as ClaudeModel3,
2146
2160
  DEFAULT_EVALUATOR_SYSTEM_PROMPT as DEFAULT_EVALUATOR_SYSTEM_PROMPT2,
@@ -2652,6 +2666,13 @@ function buildConversation2(messages) {
2652
2666
 
2653
2667
  // src/run-scenario/agents/opencode/execute.ts
2654
2668
  var DEFAULT_MODEL3 = `anthropic/${ClaudeModel3.CLAUDE_4_5_SONNET_1_0}`;
2669
+ function ensureOpenCodeInPath() {
2670
+ const opencodeBin = `${homedir2()}/.opencode/bin`;
2671
+ const currentPath = process.env.PATH || "";
2672
+ if (!currentPath.includes(opencodeBin)) {
2673
+ process.env.PATH = `${opencodeBin}:${currentPath}`;
2674
+ }
2675
+ }
2655
2676
  function extractToolAction(toolName, args) {
2656
2677
  if (!toolName) return "Using tool...";
2657
2678
  const a = args;
@@ -2729,37 +2750,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2729
2750
  return null;
2730
2751
  }
2731
2752
  }
2732
- async function executeWithOpenCode(skills, scenario, options) {
2733
- const skillNames = skills.map((s) => s.name).join(", ");
2734
- console.log("[executeWithOpenCode] Starting execution", {
2735
- skillCount: skills.length,
2736
- skillNames,
2737
- scenarioId: scenario.id,
2738
- scenarioName: scenario.name,
2739
- cwd: options.cwd,
2740
- aiGatewayUrl: options.aiGatewayUrl,
2741
- hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2742
- model: options.model
2743
- });
2744
- const startTime = /* @__PURE__ */ new Date();
2753
+ async function prepareOpenCodeEnvironment(cwd, skills, options) {
2745
2754
  if (options.mcps && options.mcps.length > 0) {
2746
2755
  console.log(
2747
2756
  `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
2748
2757
  );
2749
2758
  }
2750
2759
  if (options.subAgents && options.subAgents.length > 0) {
2751
- await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
2760
+ await writeSubAgentsToFilesystem2(cwd, options.subAgents);
2752
2761
  }
2753
2762
  if (options.rules && options.rules.length > 0) {
2754
- await writeRulesToFilesystem(options.cwd, options.rules);
2763
+ await writeRulesToFilesystem(cwd, options.rules);
2755
2764
  }
2756
2765
  try {
2757
- await writeSkillsToFilesystem2(options.cwd, skills);
2766
+ await writeSkillsToFilesystem2(cwd, skills);
2758
2767
  } catch (writeError) {
2759
2768
  throw new Error(
2760
2769
  `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
2761
2770
  );
2762
2771
  }
2772
+ }
2773
+ async function executeWithOpenCode(skills, scenario, options) {
2774
+ const skillNames = skills.map((s) => s.name).join(", ");
2775
+ console.log("[executeWithOpenCode] Starting execution", {
2776
+ skillCount: skills.length,
2777
+ skillNames,
2778
+ scenarioId: scenario.id,
2779
+ scenarioName: scenario.name,
2780
+ cwd: options.cwd,
2781
+ aiGatewayUrl: options.aiGatewayUrl,
2782
+ hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2783
+ model: options.model
2784
+ });
2785
+ const startTime = /* @__PURE__ */ new Date();
2763
2786
  const maxTurns = options.maxTurns ?? 10;
2764
2787
  const { config, providerID, modelID } = await buildOpenCodeConfig({
2765
2788
  model: options.model,
@@ -2807,6 +2830,7 @@ async function executeWithOpenCode(skills, scenario, options) {
2807
2830
  }
2808
2831
  let server;
2809
2832
  try {
2833
+ ensureOpenCodeInPath();
2810
2834
  console.log("[SDK-DEBUG] Starting OpenCode server...");
2811
2835
  server = await createOpencodeServer({
2812
2836
  config,
@@ -3138,6 +3162,13 @@ var OpenCodeAdapter = class {
3138
3162
  id = "opencode";
3139
3163
  name = "OpenCode";
3140
3164
  supportedCommands = [AgentRunCommand2.OPENCODE];
3165
+ async prepareEnvironment(context) {
3166
+ await prepareOpenCodeEnvironment(context.cwd, context.skills, {
3167
+ mcps: context.mcps,
3168
+ subAgents: context.subAgents,
3169
+ rules: context.rules
3170
+ });
3171
+ }
3141
3172
  async execute(context) {
3142
3173
  const {
3143
3174
  skills,
@@ -4273,6 +4304,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
4273
4304
  };
4274
4305
 
4275
4306
  // src/run-scenario/file-diff.ts
4307
+ function deriveInfrastructurePaths(prePrep, postPrep) {
4308
+ const infraPaths = /* @__PURE__ */ new Set();
4309
+ for (const path2 of Object.keys(postPrep)) {
4310
+ if (prePrep[path2] === void 0 || prePrep[path2] !== postPrep[path2]) {
4311
+ infraPaths.add(path2);
4312
+ }
4313
+ }
4314
+ return infraPaths;
4315
+ }
4276
4316
  var IGNORED_PATTERNS = [
4277
4317
  "node_modules",
4278
4318
  ".git",
@@ -4376,7 +4416,7 @@ function generateDiffLines(before, after) {
4376
4416
  }
4377
4417
  return result;
4378
4418
  }
4379
- function diffSnapshots(before, after) {
4419
+ function diffSnapshots(before, after, infrastructurePaths) {
4380
4420
  const diffs = [];
4381
4421
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4382
4422
  for (const path2 of allPaths) {
@@ -4390,7 +4430,8 @@ function diffSnapshots(before, after) {
4390
4430
  path: path2,
4391
4431
  expected: beforeContent,
4392
4432
  actual: afterContent,
4393
- diffLines: diffLines2
4433
+ diffLines: diffLines2,
4434
+ ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4394
4435
  });
4395
4436
  }
4396
4437
  const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
@@ -4414,7 +4455,7 @@ function diffSnapshots(before, after) {
4414
4455
  result.sort((a, b) => a.path.localeCompare(b.path));
4415
4456
  return result;
4416
4457
  }
4417
- function extractTemplateFiles(before, after) {
4458
+ function extractTemplateFiles(before, after, infrastructurePaths) {
4418
4459
  const files = [];
4419
4460
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4420
4461
  for (const path2 of allPaths) {
@@ -4434,7 +4475,8 @@ function extractTemplateFiles(before, after) {
4434
4475
  files.push({
4435
4476
  path: path2,
4436
4477
  content: afterContent,
4437
- status
4478
+ status,
4479
+ ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4438
4480
  });
4439
4481
  }
4440
4482
  files.sort((a, b) => a.path.localeCompare(b.path));
@@ -4450,7 +4492,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4450
4492
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
4451
4493
  const adapter = getAdapter(identifier);
4452
4494
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4453
- const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
4454
4495
  const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
4455
4496
  const targetName = evalData.presetName || agent?.name || "";
4456
4497
  const executionContext = {
@@ -4475,11 +4516,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4475
4516
  rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
4476
4517
  systemPrompt: agent?.systemPrompt
4477
4518
  };
4519
+ const hasPrepare = !!adapter.prepareEnvironment;
4520
+ const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
4521
+ if (hasPrepare) {
4522
+ await adapter.prepareEnvironment(executionContext);
4523
+ }
4524
+ const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
4525
+ const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
4478
4526
  const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
4479
4527
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
4480
4528
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
4481
- const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
4482
- const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
4529
+ const fileDiffs = diffSnapshots(
4530
+ beforeSnapshot,
4531
+ afterSnapshot,
4532
+ infrastructurePaths
4533
+ );
4534
+ const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
4483
4535
  return {
4484
4536
  id: randomUUID4(),
4485
4537
  targetId,