@wix/evalforge-evaluator 0.115.0 → 0.116.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1199,6 +1199,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1199
1199
  outputPreview: `Message type: ${message.type}`
1200
1200
  };
1201
1201
  }
1202
/**
 * Write the Claude Code infrastructure files into a working directory.
 *
 * Creates `<cwd>/.claude`, seeds an empty `settings.json` in it when the file
 * does not exist yet, then writes MCPs, sub-agents and rules (each only when
 * the matching option is a non-empty array) and finally the skills.
 *
 * @param {string} cwd - Directory that receives the files.
 * @param {Array} skills - Skills forwarded to `writeSkillsToFilesystem`.
 * @param {{ mcps?: Array, subAgents?: Array, rules?: Array }} [options] -
 *   Optional extras; omitted or empty entries are skipped.
 * @returns {Promise<void>}
 * @throws {Error} When writing the skills fails; the original error is
 *   attached as `cause`.
 */
async function prepareClaudeCodeEnvironment(cwd, skills, options = {}) {
  const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
  const claudeDir = `${cwd}/.claude`;
  await mkdirAsync(claudeDir, { recursive: true });
  try {
    // "wx" refuses to overwrite, so an existing settings.json is left untouched.
    await writeFile6(`${claudeDir}/settings.json`, "{}", { flag: "wx" });
  } catch (settingsError) {
    // EEXIST is the expected outcome when the file is already there. Seeding
    // stays best-effort for anything else, but the failure is surfaced
    // instead of being silently discarded.
    if (settingsError?.code !== "EEXIST") {
      console.warn(
        `[prepareClaudeCodeEnvironment] Could not create ${claudeDir}/settings.json: ${settingsError instanceof Error ? settingsError.message : String(settingsError)}`
      );
    }
  }
  if (options.mcps && options.mcps.length > 0) {
    await writeMcpToFilesystem(cwd, options.mcps);
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem(cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem(cwd, skills);
  } catch (writeError) {
    // Keep the message format callers already see, but retain the root cause.
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
}
1202
1227
  async function executeWithClaudeCode(skills, scenario, options) {
1203
1228
  const skillNames = skills.map((s) => s.name).join(", ");
1204
1229
  console.log("[executeWithClaudeCode] Starting execution", {
@@ -1222,29 +1247,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
1222
1247
  }
1223
1248
  const startTime = /* @__PURE__ */ new Date();
1224
1249
  const allMessages = [];
1225
- const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1226
- const claudeDir = `${options.cwd}/.claude`;
1227
- await mkdirAsync(claudeDir, { recursive: true });
1228
- await writeFile6(`${claudeDir}/settings.json`, "{}", {
1229
- flag: "wx"
1230
- }).catch(() => {
1231
- });
1232
- if (options.mcps && options.mcps.length > 0) {
1233
- await writeMcpToFilesystem(options.cwd, options.mcps);
1234
- }
1235
- if (options.subAgents && options.subAgents.length > 0) {
1236
- await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
1237
- }
1238
- if (options.rules && options.rules.length > 0) {
1239
- await writeRulesToFilesystem(options.cwd, options.rules);
1240
- }
1241
- try {
1242
- await writeSkillsToFilesystem(options.cwd, skills);
1243
- } catch (writeError) {
1244
- throw new Error(
1245
- `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
1246
- );
1247
- }
1248
1250
  const sdkEnv = buildSdkEnvironment(options);
1249
1251
  let traceStepNumber = 0;
1250
1252
  const traceContext = options.traceContext;
@@ -2086,6 +2088,17 @@ var ClaudeCodeAdapter = class {
2086
2088
  id = "claude-code";
2087
2089
  name = "Claude Code";
2088
2090
  supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
2091
+ /**
2092
+ * Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
2093
+ * before the baseline snapshot is taken.
2094
+ */
2095
+ async prepareEnvironment(context) {
2096
+ await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
2097
+ mcps: context.mcps,
2098
+ subAgents: context.subAgents,
2099
+ rules: context.rules
2100
+ });
2101
+ }
2089
2102
  /**
2090
2103
  * Execute a skill using the Claude Code SDK.
2091
2104
  *
@@ -2736,37 +2749,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2736
2749
  return null;
2737
2750
  }
2738
2751
  }
2739
- async function executeWithOpenCode(skills, scenario, options) {
2740
- const skillNames = skills.map((s) => s.name).join(", ");
2741
- console.log("[executeWithOpenCode] Starting execution", {
2742
- skillCount: skills.length,
2743
- skillNames,
2744
- scenarioId: scenario.id,
2745
- scenarioName: scenario.name,
2746
- cwd: options.cwd,
2747
- aiGatewayUrl: options.aiGatewayUrl,
2748
- hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2749
- model: options.model
2750
- });
2751
- const startTime = /* @__PURE__ */ new Date();
2752
/**
 * Write the OpenCode infrastructure files into a working directory.
 *
 * MCPs are only announced with a log line here. Sub-agents and rules are
 * written when the matching option is a non-empty array, and the skills are
 * always written.
 *
 * @param {string} cwd - Directory that receives the files.
 * @param {Array} skills - Skills forwarded to `writeSkillsToFilesystem2`.
 * @param {{ mcps?: Array, subAgents?: Array, rules?: Array }} [options] -
 *   Optional extras; omitted or empty entries are skipped.
 * @returns {Promise<void>}
 * @throws {Error} When writing the skills fails; the original error is
 *   attached as `cause`.
 */
async function prepareOpenCodeEnvironment(cwd, skills, options = {}) {
  if (options.mcps && options.mcps.length > 0) {
    console.log(
      `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
    );
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem2(cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem2(cwd, skills);
  } catch (writeError) {
    // Keep the message format callers already see, but retain the root cause.
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
}
2772
+ async function executeWithOpenCode(skills, scenario, options) {
2773
+ const skillNames = skills.map((s) => s.name).join(", ");
2774
+ console.log("[executeWithOpenCode] Starting execution", {
2775
+ skillCount: skills.length,
2776
+ skillNames,
2777
+ scenarioId: scenario.id,
2778
+ scenarioName: scenario.name,
2779
+ cwd: options.cwd,
2780
+ aiGatewayUrl: options.aiGatewayUrl,
2781
+ hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2782
+ model: options.model
2783
+ });
2784
+ const startTime = /* @__PURE__ */ new Date();
2770
2785
  const maxTurns = options.maxTurns ?? 10;
2771
2786
  const { config, providerID, modelID } = await buildOpenCodeConfig({
2772
2787
  model: options.model,
@@ -3146,6 +3161,13 @@ var OpenCodeAdapter = class {
3146
3161
  id = "opencode";
3147
3162
  name = "OpenCode";
3148
3163
  supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
3164
+ async prepareEnvironment(context) {
3165
+ await prepareOpenCodeEnvironment(context.cwd, context.skills, {
3166
+ mcps: context.mcps,
3167
+ subAgents: context.subAgents,
3168
+ rules: context.rules
3169
+ });
3170
+ }
3149
3171
  async execute(context) {
3150
3172
  const {
3151
3173
  skills,
@@ -4273,6 +4295,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
4273
4295
  };
4274
4296
 
4275
4297
  // src/run-scenario/file-diff.ts
4298
/**
 * Work out which paths differ between two directory snapshots taken before
 * and after environment preparation.
 *
 * A path from `postPrep` is included when `prePrep` has no value for it or
 * holds a different value. Paths present only in `prePrep` are not reported.
 *
 * @param {Object<string, *>} prePrep - Snapshot keyed by path, taken first.
 * @param {Object<string, *>} postPrep - Snapshot keyed by path, taken second.
 * @returns {Set<string>} The new or changed paths, in `postPrep` key order.
 */
function deriveInfrastructurePaths(prePrep, postPrep) {
  const touched = Object.keys(postPrep).filter((filePath) => {
    const earlier = prePrep[filePath];
    return earlier === void 0 || earlier !== postPrep[filePath];
  });
  return new Set(touched);
}
4276
4307
  var IGNORED_PATTERNS = [
4277
4308
  "node_modules",
4278
4309
  ".git",
@@ -4376,7 +4407,7 @@ function generateDiffLines(before, after) {
4376
4407
  }
4377
4408
  return result;
4378
4409
  }
4379
- function diffSnapshots(before, after) {
4410
+ function diffSnapshots(before, after, infrastructurePaths) {
4380
4411
  const diffs = [];
4381
4412
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4382
4413
  for (const path2 of allPaths) {
@@ -4390,7 +4421,8 @@ function diffSnapshots(before, after) {
4390
4421
  path: path2,
4391
4422
  expected: beforeContent,
4392
4423
  actual: afterContent,
4393
- diffLines: diffLines2
4424
+ diffLines: diffLines2,
4425
+ ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4394
4426
  });
4395
4427
  }
4396
4428
  const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
@@ -4414,7 +4446,7 @@ function diffSnapshots(before, after) {
4414
4446
  result.sort((a, b) => a.path.localeCompare(b.path));
4415
4447
  return result;
4416
4448
  }
4417
- function extractTemplateFiles(before, after) {
4449
+ function extractTemplateFiles(before, after, infrastructurePaths) {
4418
4450
  const files = [];
4419
4451
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4420
4452
  for (const path2 of allPaths) {
@@ -4434,7 +4466,8 @@ function extractTemplateFiles(before, after) {
4434
4466
  files.push({
4435
4467
  path: path2,
4436
4468
  content: afterContent,
4437
- status
4469
+ status,
4470
+ ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4438
4471
  });
4439
4472
  }
4440
4473
  files.sort((a, b) => a.path.localeCompare(b.path));
@@ -4450,7 +4483,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4450
4483
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
4451
4484
  const adapter = getAdapter(identifier);
4452
4485
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4453
- const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
4454
4486
  const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
4455
4487
  const targetName = evalData.presetName || agent?.name || "";
4456
4488
  const executionContext = {
@@ -4475,11 +4507,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4475
4507
  rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
4476
4508
  systemPrompt: agent?.systemPrompt
4477
4509
  };
4510
+ const hasPrepare = !!adapter.prepareEnvironment;
4511
+ const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
4512
+ if (hasPrepare) {
4513
+ await adapter.prepareEnvironment(executionContext);
4514
+ }
4515
+ const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
4516
+ const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
4478
4517
  const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
4479
4518
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
4480
4519
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
4481
- const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
4482
- const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
4520
+ const fileDiffs = diffSnapshots(
4521
+ beforeSnapshot,
4522
+ afterSnapshot,
4523
+ infrastructurePaths
4524
+ );
4525
+ const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
4483
4526
  return {
4484
4527
  id: (0, import_crypto4.randomUUID)(),
4485
4528
  targetId,