@wix/evalforge-evaluator 0.170.0 → 0.172.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -90,8 +90,8 @@ function createApiClient(serverUrl, options = "") {
90
90
  }
91
91
  return headers;
92
92
  }
93
- async function fetchJson(path2) {
94
- const url = `${serverUrl}${apiPrefix}${pathPrefix}${path2}`;
93
+ async function fetchJson(path3) {
94
+ const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
95
95
  console.error(`[API] GET ${url}`);
96
96
  const headers = buildHeaders();
97
97
  const response = await fetch(url, {
@@ -105,8 +105,8 @@ function createApiClient(serverUrl, options = "") {
105
105
  }
106
106
  return response.json();
107
107
  }
108
- async function postJson(path2, body) {
109
- const url = `${serverUrl}${apiPrefix}${pathPrefix}${path2}`;
108
+ async function postJson(path3, body) {
109
+ const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
110
110
  console.error(`[API] POST ${url}`);
111
111
  const response = await fetch(url, {
112
112
  method: "POST",
@@ -120,8 +120,8 @@ function createApiClient(serverUrl, options = "") {
120
120
  );
121
121
  }
122
122
  }
123
- async function deleteRequest(path2) {
124
- const url = `${serverUrl}${apiPrefix}${pathPrefix}${path2}`;
123
+ async function deleteRequest(path3) {
124
+ const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
125
125
  console.error(`[API] DELETE ${url}`);
126
126
  const headers = buildHeaders();
127
127
  const response = await fetch(url, {
@@ -135,8 +135,8 @@ function createApiClient(serverUrl, options = "") {
135
135
  );
136
136
  }
137
137
  }
138
- async function putJson(path2, body) {
139
- const url = `${serverUrl}${apiPrefix}${pathPrefix}${path2}`;
138
+ async function putJson(path3, body) {
139
+ const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
140
140
  console.error(`[API] PUT ${url}`);
141
141
  const response = await fetch(url, {
142
142
  method: "PUT",
@@ -432,10 +432,10 @@ var import_evalforge_types13 = require("@wix/evalforge-types");
432
432
  var import_eval_assertions = require("@wix/eval-assertions");
433
433
 
434
434
  // src/run-scenario/environment.ts
435
- var import_fs = require("fs");
435
+ var import_fs2 = require("fs");
436
436
  var import_promises2 = require("fs/promises");
437
437
  var import_os = require("os");
438
- var import_path2 = __toESM(require("path"));
438
+ var import_path3 = __toESM(require("path"));
439
439
  var import_evalforge_github_client = require("@wix/evalforge-github-client");
440
440
 
441
441
  // src/run-scenario/utils/write-files.ts
@@ -455,6 +455,84 @@ async function writeFilesToDirectory(targetDir, files) {
455
455
  }
456
456
  }
457
457
 
458
+ // src/run-scenario/install-dependencies.ts
459
+ var import_fs = require("fs");
460
+ var import_crypto = require("crypto");
461
+ var import_path2 = __toESM(require("path"));
462
+ var import_child_process = require("child_process");
463
/**
 * Chooses the install command for a project directory based on which
 * lockfile is present.
 *
 * Lockfiles are probed in a fixed priority order (pnpm, then npm, then yarn);
 * the first one found wins. When none exists, a plain `npm install` with
 * lenient flags is used and `package.json` becomes the cache source file.
 *
 * @param {string} workDir - Directory expected to contain the project files.
 * @returns {{cmd: string, args: string[], cacheSourceFile: string}} The
 *   executable, its arguments, and the file name the caller hashes to derive
 *   a cache key.
 */
function detectPackageManager(workDir) {
  // Built per call so callers always get fresh `args` arrays.
  const lockfileCandidates = [
    ["pnpm-lock.yaml", "pnpm", ["install", "--frozen-lockfile"]],
    ["package-lock.json", "npm", ["ci"]],
    ["yarn.lock", "yarn", ["install", "--frozen-lockfile"]]
  ];
  for (const [lockfileName, cmd, args] of lockfileCandidates) {
    const lockfilePath = import_path2.default.join(workDir, lockfileName);
    if ((0, import_fs.existsSync)(lockfilePath)) {
      return { cmd, args, cacheSourceFile: lockfileName };
    }
  }
  // No lockfile: fall back to a non-frozen npm install keyed on package.json.
  return {
    cmd: "npm",
    args: ["install", "--legacy-peer-deps", "--prefer-offline", "--no-fund", "--no-audit"],
    cacheSourceFile: "package.json"
  };
}
475
/**
 * Recursively copies the directory `src` to `dest`, keeping symbolic links
 * as links with their original (possibly relative) targets.
 *
 * On macOS the copy is delegated to `cp` with `-c` so the files are copied
 * via clonefile(2); on every other platform `fs.cpSync` is used.
 *
 * @param {string} src - Directory to copy from.
 * @param {string} dest - Destination path; callers are expected to pass a
 *   path that does not exist yet.
 * @throws Propagates whatever `execFileSync` / `cpSync` throw on failure.
 */
function cloneDirectory(src, dest) {
  if (process.platform === "darwin") {
    // Use `-R`, not `-r`: per the BSD cp manual `-r` is a synonym for `-RL`,
    // which follows symlinks and would turn entries such as
    // node_modules/.bin/* into plain file copies.
    (0, import_child_process.execFileSync)("cp", ["-Rc", src, dest]);
    return;
  }
  // `verbatimSymlinks` keeps relative link targets untouched. Without it
  // cpSync resolves them against the source directory, leaving the copy with
  // absolute links that point back into `src`.
  (0, import_fs.cpSync)(src, dest, { recursive: true, verbatimSymlinks: true });
}
482
/**
 * Installs dependencies for `workDir`, reusing a cached `node_modules` tree
 * when one exists for the same dependency manifest.
 *
 * The cache key is the first 16 hex characters of the SHA-256 of
 * `pm.cacheSourceFile`. On a hit the cached `node_modules` (and a cached
 * `yarn.lock`, if any) are copied into `workDir` and nothing is installed.
 * On a miss the package manager runs and the result is saved to the cache.
 *
 * Install and cache-save failures are logged, not thrown: the function
 * returns normally in both cases.
 *
 * @param {string} workDir - Project directory containing `pm.cacheSourceFile`.
 * @param {Function} exec - Called as `exec(cmd, args, options)`; expected to
 *   throw when the install fails.
 * @param {string} cacheBase - Root directory under which cache entries live.
 * @param {{cmd: string, args: string[], cacheSourceFile: string}} pm -
 *   Install command description.
 * @returns {void}
 * @throws If `pm.cacheSourceFile` cannot be read from `workDir`.
 */
function installWithCache(workDir, exec, cacheBase, pm) {
  const sourceContent = (0, import_fs.readFileSync)(import_path2.default.join(workDir, pm.cacheSourceFile), "utf-8");
  const cacheKey = (0, import_crypto.createHash)("sha256").update(sourceContent).digest("hex").slice(0, 16);
  const cachedNodeModules = import_path2.default.join(cacheBase, cacheKey, "node_modules");
  const targetNodeModules = import_path2.default.join(workDir, "node_modules");
  const cacheDir = import_path2.default.dirname(cachedNodeModules);
  const cachedYarnLock = import_path2.default.join(cacheDir, "yarn.lock");

  if ((0, import_fs.existsSync)(cachedNodeModules)) {
    console.log(`[environment] Restoring node_modules from cache (key: ${cacheKey})`);
    if (!(0, import_fs.existsSync)(targetNodeModules)) {
      cloneDirectory(cachedNodeModules, targetNodeModules);
    }
    if ((0, import_fs.existsSync)(cachedYarnLock)) {
      (0, import_fs.copyFileSync)(cachedYarnLock, import_path2.default.join(workDir, "yarn.lock"));
    }
    return;
  }

  console.log(`[environment] Running ${pm.cmd} ${pm.args.join(" ")} in ${workDir} (cache key: ${cacheKey})`);
  try {
    exec(pm.cmd, pm.args, { cwd: workDir, stdio: "inherit", timeout: 18e4, env: { ...process.env, NODE_ENV: "development" } });
  } catch (err) {
    console.error("[environment] Dependency installation failed:", err instanceof Error ? err.message : String(err));
    return;
  }

  console.log("[environment] Dependency installation complete \u2014 saving to cache");
  // A cache hit is decided solely by the existence of `cachedNodeModules`,
  // so that path must never hold a half-copied tree. Copy into a sibling
  // staging directory first and publish it with a single rename.
  const stagingNodeModules = `${cachedNodeModules}.partial-${process.pid}`;
  try {
    // Drop leftovers from an earlier interrupted attempt with the same name.
    (0, import_fs.rmSync)(stagingNodeModules, { recursive: true, force: true });
    (0, import_fs.mkdirSync)(cacheDir, { recursive: true });
    const yarnLockPath = import_path2.default.join(workDir, "yarn.lock");
    if ((0, import_fs.existsSync)(yarnLockPath)) {
      (0, import_fs.copyFileSync)(yarnLockPath, cachedYarnLock);
    }
    cloneDirectory(targetNodeModules, stagingNodeModules);
    (0, import_fs.renameSync)(stagingNodeModules, cachedNodeModules);
  } catch (err) {
    console.error("[environment] Failed to save to cache (installation still succeeded):", err instanceof Error ? err.message : String(err));
    try {
      (0, import_fs.rmSync)(stagingNodeModules, { recursive: true, force: true });
    } catch {
      // Best effort: a stale staging directory is never treated as a cache hit.
    }
  }
}
518
/**
 * Installs the dependencies of the project in `workDir`, if it has a
 * `package.json`.
 *
 * With `cacheBase` set, the work is handed to `installWithCache`; otherwise
 * the detected package manager is run directly. Install failures are logged
 * and swallowed, so the returned promise resolves either way.
 *
 * @param {string} workDir - Project directory.
 * @param {Function} [exec] - Called as `exec(cmd, args, options)`; defaults
 *   to `child_process.execFileSync`.
 * @param {string} [cacheBase] - Root directory of the node_modules cache;
 *   when falsy no cache is used.
 * @returns {Promise<void>}
 */
async function installDependencies(workDir, exec = import_child_process.execFileSync, cacheBase) {
  const manifestPath = import_path2.default.join(workDir, "package.json");
  const hasManifest = (0, import_fs.existsSync)(manifestPath);
  if (!hasManifest) {
    // Nothing to install for directories without a package.json.
    return;
  }

  const pm = detectPackageManager(workDir);
  if (cacheBase) {
    installWithCache(workDir, exec, cacheBase, pm);
    return;
  }

  const { cmd, args } = pm;
  console.log(`[environment] Running ${cmd} ${args.join(" ")} in ${workDir}`);
  try {
    // 18e4 = 180000; passed through as the `timeout` option of `exec`.
    const runOptions = {
      cwd: workDir,
      stdio: "inherit",
      timeout: 18e4,
      env: { ...process.env, NODE_ENV: "development" }
    };
    exec(cmd, args, runOptions);
    console.log("[environment] Dependency installation complete");
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : String(failure);
    console.error("[environment] Dependency installation failed:", reason);
  }
}
535
+
458
536
  // src/run-scenario/environment.ts
459
537
  async function fetchAndWriteTemplateFiles(template, workDir) {
460
538
  let sourceFiles = [];
@@ -475,27 +553,27 @@ async function fetchAndWriteTemplateFiles(template, workDir) {
475
553
  const content = ef.gitSource ? await (0, import_evalforge_github_client.fetchGitHubFile)(ef.gitSource, {
476
554
  userAgent: "EvalForge-Evaluator"
477
555
  }) : ef.content ?? "";
478
- const dest = import_path2.default.resolve(workDir, ef.path);
479
- if (!dest.startsWith(workDir + import_path2.sep)) {
556
+ const dest = import_path3.default.resolve(workDir, ef.path);
557
+ if (!dest.startsWith(workDir + import_path3.sep)) {
480
558
  throw new Error(
481
559
  `Extra file path escapes working directory: "${ef.path}"`
482
560
  );
483
561
  }
484
- await (0, import_promises2.mkdir)(import_path2.default.dirname(dest), { recursive: true });
562
+ await (0, import_promises2.mkdir)(import_path3.default.dirname(dest), { recursive: true });
485
563
  await (0, import_promises2.writeFile)(dest, content, "utf8");
486
564
  })
487
565
  );
488
566
  }
489
567
  function writeWixEnvFile(workDir) {
490
- const configPath = import_path2.default.join(workDir, "wix.config.json");
491
- if (!(0, import_fs.existsSync)(configPath)) {
568
+ const configPath = import_path3.default.join(workDir, "wix.config.json");
569
+ if (!(0, import_fs2.existsSync)(configPath)) {
492
570
  return;
493
571
  }
494
572
  try {
495
- const config = JSON.parse((0, import_fs.readFileSync)(configPath, "utf-8"));
573
+ const config = JSON.parse((0, import_fs2.readFileSync)(configPath, "utf-8"));
496
574
  if (config.appId) {
497
- (0, import_fs.writeFileSync)(
498
- import_path2.default.join(workDir, ".env"),
575
+ (0, import_fs2.writeFileSync)(
576
+ import_path3.default.join(workDir, ".env"),
499
577
  `WIX_CLIENT_ID=${config.appId}
500
578
  `,
501
579
  "utf-8"
@@ -507,34 +585,36 @@ function writeWixEnvFile(workDir) {
507
585
  }
508
586
  }
509
587
  async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId, template) {
510
- const baseDir = config.evaluationsDir ?? import_path2.default.join((0, import_os.tmpdir)(), "evalforge-evaluations");
588
+ const baseDir = config.evaluationsDir ?? import_path3.default.join((0, import_os.tmpdir)(), "evalforge-evaluations");
589
+ const nodeModulesCacheDir = import_path3.default.join(baseDir, "_node_modules_cache");
511
590
  if (template) {
512
591
  if (!config.evaluationsDir) {
513
592
  console.warn(
514
593
  "Template specified but EVALUATIONS_DIR not set, using temp directory"
515
594
  );
516
595
  }
517
- const workDir2 = import_path2.default.join(baseDir, `${evalRunId2}_${targetId}`);
518
- if ((0, import_fs.existsSync)(workDir2)) {
519
- (0, import_fs.rmSync)(workDir2, { recursive: true });
596
+ const workDir2 = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}`);
597
+ if ((0, import_fs2.existsSync)(workDir2)) {
598
+ (0, import_fs2.rmSync)(workDir2, { recursive: true });
520
599
  }
521
- (0, import_fs.mkdirSync)(workDir2, { recursive: true });
600
+ (0, import_fs2.mkdirSync)(workDir2, { recursive: true });
522
601
  await fetchAndWriteTemplateFiles(template, workDir2);
523
602
  console.log(`Template files written to ${workDir2}`);
524
603
  writeWixEnvFile(workDir2);
604
+ await installDependencies(workDir2, void 0, nodeModulesCacheDir);
525
605
  return workDir2;
526
606
  }
527
- const workDir = import_path2.default.join(baseDir, `${evalRunId2}_${targetId}_${scenarioId}`);
528
- if ((0, import_fs.existsSync)(workDir)) {
529
- (0, import_fs.rmSync)(workDir, { recursive: true });
607
+ const workDir = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}_${scenarioId}`);
608
+ if ((0, import_fs2.existsSync)(workDir)) {
609
+ (0, import_fs2.rmSync)(workDir, { recursive: true });
530
610
  }
531
- (0, import_fs.mkdirSync)(workDir, { recursive: true });
611
+ (0, import_fs2.mkdirSync)(workDir, { recursive: true });
532
612
  console.log(`Empty working directory created at ${workDir}`);
533
613
  return workDir;
534
614
  }
535
615
 
536
616
  // src/run-scenario/run-agent-with-context.ts
537
- var import_crypto4 = require("crypto");
617
+ var import_crypto5 = require("crypto");
538
618
 
539
619
  // src/run-scenario/agents/registry.ts
540
620
  var AgentAdapterRegistry = class {
@@ -643,7 +723,7 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
643
723
 
644
724
  // src/run-scenario/agents/claude-code/write-skills.ts
645
725
  var import_promises3 = require("fs/promises");
646
- var import_path3 = require("path");
726
+ var import_path4 = require("path");
647
727
  var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
648
728
  async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
649
729
  await Promise.all(
@@ -652,7 +732,7 @@ async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_g
652
732
  }
653
733
  async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
654
734
  const skillName = skill.name;
655
- const skillDir = (0, import_path3.join)(cwd, ".claude", "skills", skillName);
735
+ const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
656
736
  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
657
737
  const version = skill.latestVersion;
658
738
  if (version?.files && version.files.length > 0) {
@@ -692,18 +772,18 @@ function resolveTimeoutMs(maxTurns, maxDurationMs) {
692
772
  }
693
773
 
694
774
  // src/run-scenario/agents/claude-code/execute.ts
695
- var import_crypto = require("crypto");
775
+ var import_crypto2 = require("crypto");
696
776
 
697
777
  // src/run-scenario/agents/claude-code/write-mcp.ts
698
778
  var import_promises5 = require("fs/promises");
699
- var import_path5 = require("path");
779
+ var import_path6 = require("path");
700
780
  var import_evalforge_types2 = require("@wix/evalforge-types");
701
781
 
702
782
  // src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
703
783
  var import_promises4 = require("fs/promises");
704
- var import_path4 = require("path");
784
+ var import_path5 = require("path");
705
785
  var import_os2 = require("os");
706
- var WIX_AUTH_FILE = (0, import_path4.join)((0, import_os2.homedir)(), ".wix", "auth", "api-key.json");
786
+ var WIX_AUTH_FILE = (0, import_path5.join)((0, import_os2.homedir)(), ".wix", "auth", "api-key.json");
707
787
  async function loadWixAuthPlaceholders(authFilePath = WIX_AUTH_FILE) {
708
788
  try {
709
789
  const content = await (0, import_promises4.readFile)(authFilePath, "utf-8");
@@ -762,14 +842,14 @@ async function writeMcpToFilesystem(cwd, mcps) {
762
842
  null,
763
843
  2
764
844
  );
765
- const filePath = (0, import_path5.join)(cwd, ".mcp.json");
845
+ const filePath = (0, import_path6.join)(cwd, ".mcp.json");
766
846
  await (0, import_promises5.writeFile)(filePath, content, "utf8");
767
847
  console.log(`[MCP] Written to ${filePath}`);
768
848
  }
769
849
 
770
850
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
771
851
  var import_promises6 = require("fs/promises");
772
- var import_path6 = require("path");
852
+ var import_path7 = require("path");
773
853
  var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
774
854
  var AGENTS_DIR = ".claude/agents";
775
855
  function toAgentFilename(name, index, nameCount) {
@@ -807,12 +887,12 @@ async function resolveSubAgentContent(agent, fetchFn) {
807
887
  }
808
888
  async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
809
889
  if (subAgents.length === 0) return;
810
- const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
890
+ const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
811
891
  await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
812
892
  const nameCount = /* @__PURE__ */ new Map();
813
893
  for (const [i, agent] of subAgents.entries()) {
814
894
  const filename = toAgentFilename(agent.name, i, nameCount);
815
- const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
895
+ const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
816
896
  const content = await resolveSubAgentContent(agent, fetchFn);
817
897
  await (0, import_promises6.writeFile)(filePath, content, "utf8");
818
898
  }
@@ -821,7 +901,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalf
821
901
 
822
902
  // src/run-scenario/agents/claude-code/write-rules.ts
823
903
  var import_promises7 = require("fs/promises");
824
- var import_path7 = require("path");
904
+ var import_path8 = require("path");
825
905
  var CURSOR_RULES_DIR = ".cursor/rules";
826
906
  function toRuleFilename(name, index, nameCount) {
827
907
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `rule-${index}`;
@@ -854,9 +934,9 @@ function validateGenericDirectory(dir, cwd) {
854
934
  `Generic rule directory may not contain "..", got: "${dir}"`
855
935
  );
856
936
  }
857
- const normalizedCwd = cwd.endsWith(import_path7.sep) ? cwd.slice(0, -1) : cwd;
858
- const resolved = (0, import_path7.resolve)(normalizedCwd, trimmed);
859
- if (!resolved.startsWith(normalizedCwd + import_path7.sep)) {
937
+ const normalizedCwd = cwd.endsWith(import_path8.sep) ? cwd.slice(0, -1) : cwd;
938
+ const resolved = (0, import_path8.resolve)(normalizedCwd, trimmed);
939
+ if (!resolved.startsWith(normalizedCwd + import_path8.sep)) {
860
940
  throw new Error(
861
941
  `Generic rule directory escapes the working directory: "${dir}"`
862
942
  );
@@ -870,20 +950,20 @@ async function writeRulesToFilesystem(cwd, rules) {
870
950
  for (const [i, rule] of rules.entries()) {
871
951
  switch (rule.ruleType) {
872
952
  case "claude-md": {
873
- await appendToFile((0, import_path7.join)(cwd, "CLAUDE.md"), rule.content);
953
+ await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
874
954
  break;
875
955
  }
876
956
  case "agents-md": {
877
- await appendToFile((0, import_path7.join)(cwd, "AGENTS.md"), rule.content);
957
+ await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
878
958
  break;
879
959
  }
880
960
  case "cursor-rule": {
881
961
  if (!hasCursorRules) {
882
- await (0, import_promises7.mkdir)((0, import_path7.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
962
+ await (0, import_promises7.mkdir)((0, import_path8.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
883
963
  hasCursorRules = true;
884
964
  }
885
965
  const filename = toRuleFilename(rule.name, i, nameCount);
886
- const filePath = (0, import_path7.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
966
+ const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
887
967
  await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
888
968
  break;
889
969
  }
@@ -892,10 +972,10 @@ async function writeRulesToFilesystem(cwd, rules) {
892
972
  rule.directory ?? ".opencode/rules",
893
973
  cwd
894
974
  );
895
- const dirPath = (0, import_path7.join)(cwd, directory);
975
+ const dirPath = (0, import_path8.join)(cwd, directory);
896
976
  await (0, import_promises7.mkdir)(dirPath, { recursive: true });
897
977
  const filename = toRuleFilename(rule.name, i, nameCount);
898
- await (0, import_promises7.writeFile)((0, import_path7.join)(dirPath, `${filename}.md`), rule.content, "utf8");
978
+ await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
899
979
  break;
900
980
  }
901
981
  default: {
@@ -1066,8 +1146,8 @@ function extractToolActionDescription(toolName, toolArgs) {
1066
1146
  }
1067
1147
  }
1068
1148
  if (toolName === "LS" || toolName === "ls" || toolName === "ListFiles") {
1069
- const path2 = args.path || args.directory || ".";
1070
- return `Listing: ${String(path2).slice(0, 50)}`;
1149
+ const path3 = args.path || args.directory || ".";
1150
+ return `Listing: ${String(path3).slice(0, 50)}`;
1071
1151
  }
1072
1152
  if ((toolName === "Read" || toolName === "read" || toolName === "View") && (args.file_path || args.path || args.target_file)) {
1073
1153
  const filePath = String(
@@ -1950,7 +2030,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1950
2030
  const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
1951
2031
  if (hasThinking && (hasText || toolCallCount > 0)) {
1952
2032
  subSteps.push({
1953
- id: (0, import_crypto.randomUUID)(),
2033
+ id: (0, import_crypto2.randomUUID)(),
1954
2034
  stepNumber: 0,
1955
2035
  // renumbered below
1956
2036
  turnIndex,
@@ -1980,7 +2060,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1980
2060
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
1981
2061
  const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
1982
2062
  subSteps.push({
1983
- id: (0, import_crypto.randomUUID)(),
2063
+ id: (0, import_crypto2.randomUUID)(),
1984
2064
  stepNumber: 0,
1985
2065
  turnIndex,
1986
2066
  type: import_evalforge_types4.LLMStepType.TOOL_USE,
@@ -2010,7 +2090,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2010
2090
  }
2011
2091
  if (hasText && toolCallCount > 0) {
2012
2092
  subSteps.push({
2013
- id: (0, import_crypto.randomUUID)(),
2093
+ id: (0, import_crypto2.randomUUID)(),
2014
2094
  stepNumber: 0,
2015
2095
  turnIndex,
2016
2096
  type: import_evalforge_types4.LLMStepType.COMPLETION,
@@ -2032,7 +2112,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2032
2112
  if (subSteps.length === 0) {
2033
2113
  const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
2034
2114
  subSteps.push({
2035
- id: (0, import_crypto.randomUUID)(),
2115
+ id: (0, import_crypto2.randomUUID)(),
2036
2116
  stepNumber: 0,
2037
2117
  turnIndex,
2038
2118
  type: stepType,
@@ -2090,7 +2170,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2090
2170
  stepTypeBreakdown
2091
2171
  };
2092
2172
  return {
2093
- id: (0, import_crypto.randomUUID)(),
2173
+ id: (0, import_crypto2.randomUUID)(),
2094
2174
  steps: traceSteps,
2095
2175
  summary
2096
2176
  };
@@ -2192,7 +2272,7 @@ defaultRegistry.register(claudeCodeAdapter);
2192
2272
  var import_evalforge_types9 = require("@wix/evalforge-types");
2193
2273
 
2194
2274
  // src/run-scenario/agents/opencode/execute.ts
2195
- var import_child_process = require("child_process");
2275
+ var import_child_process2 = require("child_process");
2196
2276
  var import_evalforge_types8 = require("@wix/evalforge-types");
2197
2277
 
2198
2278
  // src/run-scenario/agents/opencode/types.ts
@@ -2206,7 +2286,7 @@ function tryParseJson(text) {
2206
2286
 
2207
2287
  // src/run-scenario/agents/opencode/write-skills.ts
2208
2288
  var import_promises8 = require("fs/promises");
2209
- var import_path8 = require("path");
2289
+ var import_path9 = require("path");
2210
2290
  var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
2211
2291
  async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
2212
2292
  await Promise.all(
@@ -2215,7 +2295,7 @@ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_
2215
2295
  }
2216
2296
  async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
2217
2297
  const skillName = skill.name;
2218
- const skillDir = (0, import_path8.join)(cwd, ".opencode", "skills", skillName);
2298
+ const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
2219
2299
  await (0, import_promises8.mkdir)(skillDir, { recursive: true });
2220
2300
  const version = skill.latestVersion;
2221
2301
  if (version?.files && version.files.length > 0) {
@@ -2248,7 +2328,7 @@ async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
2248
2328
 
2249
2329
  // src/run-scenario/agents/opencode/write-sub-agents.ts
2250
2330
  var import_promises9 = require("fs/promises");
2251
- var import_path9 = require("path");
2331
+ var import_path10 = require("path");
2252
2332
  var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
2253
2333
  var AGENTS_DIR2 = ".opencode/agents";
2254
2334
  function toAgentFilename2(name, index, nameCount) {
@@ -2286,12 +2366,12 @@ async function resolveSubAgentContent2(agent, fetchFn) {
2286
2366
  }
2287
2367
  async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
2288
2368
  if (subAgents.length === 0) return;
2289
- const agentsDir = (0, import_path9.join)(cwd, AGENTS_DIR2);
2369
+ const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
2290
2370
  await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
2291
2371
  const nameCount = /* @__PURE__ */ new Map();
2292
2372
  for (const [i, agent] of subAgents.entries()) {
2293
2373
  const filename = toAgentFilename2(agent.name, i, nameCount);
2294
- const filePath = (0, import_path9.join)(agentsDir, `${filename}.md`);
2374
+ const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
2295
2375
  const content = await resolveSubAgentContent2(agent, fetchFn);
2296
2376
  await (0, import_promises9.writeFile)(filePath, content, "utf8");
2297
2377
  }
@@ -2454,7 +2534,7 @@ async function buildOpenCodeEnv(options) {
2454
2534
 
2455
2535
  // src/run-scenario/agents/opencode/build-trace.ts
2456
2536
  var import_evalforge_types7 = require("@wix/evalforge-types");
2457
- var import_crypto2 = require("crypto");
2537
+ var import_crypto3 = require("crypto");
2458
2538
  function toCanonicalModelId(modelId) {
2459
2539
  const slashIndex = modelId.indexOf("/");
2460
2540
  return slashIndex > 0 ? modelId.slice(slashIndex + 1) : modelId;
@@ -2530,7 +2610,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2530
2610
  const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
2531
2611
  if (hasThinking && (hasText || toolCallCount > 0)) {
2532
2612
  subSteps.push({
2533
- id: (0, import_crypto2.randomUUID)(),
2613
+ id: (0, import_crypto3.randomUUID)(),
2534
2614
  stepNumber: 0,
2535
2615
  turnIndex,
2536
2616
  type: import_evalforge_types7.LLMStepType.THINKING,
@@ -2559,7 +2639,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2559
2639
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
2560
2640
  const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
2561
2641
  subSteps.push({
2562
- id: (0, import_crypto2.randomUUID)(),
2642
+ id: (0, import_crypto3.randomUUID)(),
2563
2643
  stepNumber: 0,
2564
2644
  turnIndex,
2565
2645
  type: import_evalforge_types7.LLMStepType.TOOL_USE,
@@ -2589,7 +2669,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2589
2669
  }
2590
2670
  if (hasText && toolCallCount > 0) {
2591
2671
  subSteps.push({
2592
- id: (0, import_crypto2.randomUUID)(),
2672
+ id: (0, import_crypto3.randomUUID)(),
2593
2673
  stepNumber: 0,
2594
2674
  turnIndex,
2595
2675
  type: import_evalforge_types7.LLMStepType.COMPLETION,
@@ -2611,7 +2691,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2611
2691
  if (subSteps.length === 0) {
2612
2692
  const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
2613
2693
  subSteps.push({
2614
- id: (0, import_crypto2.randomUUID)(),
2694
+ id: (0, import_crypto3.randomUUID)(),
2615
2695
  stepNumber: 0,
2616
2696
  turnIndex,
2617
2697
  type: stepType,
@@ -2680,7 +2760,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2680
2760
  stepTypeBreakdown
2681
2761
  };
2682
2762
  return {
2683
- id: (0, import_crypto2.randomUUID)(),
2763
+ id: (0, import_crypto3.randomUUID)(),
2684
2764
  steps: allSteps,
2685
2765
  summary
2686
2766
  };
@@ -2755,7 +2835,7 @@ function buildConversation2(timestampedEvents) {
2755
2835
 
2756
2836
  // src/run-scenario/agents/opencode/execute.ts
2757
2837
  var import_promises10 = require("fs/promises");
2758
- var import_path10 = require("path");
2838
+ var import_path11 = require("path");
2759
2839
  var KILL_GRACE_PERIOD_MS = 5e3;
2760
2840
  var IDLE_TIMEOUT_MS = 12e4;
2761
2841
  var IDLE_CHECK_INTERVAL_MS = 15e3;
@@ -2780,14 +2860,14 @@ function extractToolAction(toolName, args) {
2780
2860
  return `Using ${toolName}...`;
2781
2861
  }
2782
2862
  async function writePromptImages(cwd, images) {
2783
- const imagesDir = (0, import_path10.join)(cwd, "prompt-images");
2863
+ const imagesDir = (0, import_path11.join)(cwd, "prompt-images");
2784
2864
  await (0, import_promises10.mkdir)(imagesDir, { recursive: true });
2785
2865
  const filePaths = [];
2786
2866
  for (let i = 0; i < images.length; i++) {
2787
2867
  const img = images[i];
2788
2868
  const ext = img.mediaType.split("/")[1] || "png";
2789
2869
  const filename = `image-${i}.${ext}`;
2790
- const filepath = (0, import_path10.join)(imagesDir, filename);
2870
+ const filepath = (0, import_path11.join)(imagesDir, filename);
2791
2871
  const buffer = Buffer.from(img.base64, "base64");
2792
2872
  await (0, import_promises10.writeFile)(filepath, buffer);
2793
2873
  filePaths.push(filepath);
@@ -2880,10 +2960,10 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
2880
2960
  }
2881
2961
  }
2882
2962
  async function writeSystemPromptRule(cwd, systemPrompt) {
2883
- const rulesDir = (0, import_path10.join)(cwd, ".opencode", "rules");
2963
+ const rulesDir = (0, import_path11.join)(cwd, ".opencode", "rules");
2884
2964
  await (0, import_promises10.mkdir)(rulesDir, { recursive: true });
2885
2965
  await (0, import_promises10.writeFile)(
2886
- (0, import_path10.join)(rulesDir, "evalforge-system-prompt.md"),
2966
+ (0, import_path11.join)(rulesDir, "evalforge-system-prompt.md"),
2887
2967
  systemPrompt,
2888
2968
  "utf-8"
2889
2969
  );
@@ -2986,7 +3066,7 @@ function spawnOpenCodeProcess(opts) {
2986
3066
  };
2987
3067
  let child;
2988
3068
  try {
2989
- child = (0, import_child_process.spawn)("opencode", args, {
3069
+ child = (0, import_child_process2.spawn)("opencode", args, {
2990
3070
  cwd,
2991
3071
  env,
2992
3072
  stdio: ["ignore", "pipe", "pipe"],
@@ -3471,7 +3551,7 @@ var import_anthropic = require("@ai-sdk/anthropic");
3471
3551
  var import_google = require("@ai-sdk/google");
3472
3552
  var import_openai = require("@ai-sdk/openai");
3473
3553
  var import_evalforge_types11 = require("@wix/evalforge-types");
3474
- var import_crypto3 = require("crypto");
3554
+ var import_crypto4 = require("crypto");
3475
3555
 
3476
3556
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
3477
3557
  var import_mcp = require("@ai-sdk/mcp");
@@ -4087,7 +4167,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
4087
4167
  const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
4088
4168
  const toolResultError = findToolResultError(step);
4089
4169
  return {
4090
- id: (0, import_crypto3.randomUUID)(),
4170
+ id: (0, import_crypto4.randomUUID)(),
4091
4171
  stepNumber: i + 1,
4092
4172
  turnIndex: i,
4093
4173
  type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
@@ -4111,7 +4191,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
4111
4191
  total: totalUsage.totalTokens
4112
4192
  };
4113
4193
  return {
4114
- id: (0, import_crypto3.randomUUID)(),
4194
+ id: (0, import_crypto4.randomUUID)(),
4115
4195
  steps: traceSteps,
4116
4196
  summary: {
4117
4197
  totalSteps: traceSteps.length,
@@ -4186,8 +4266,8 @@ var simpleAgentAdapter = new SimpleAgentAdapter();
4186
4266
  defaultRegistry.register(simpleAgentAdapter);
4187
4267
 
4188
4268
  // src/run-scenario/file-diff.ts
4189
- var import_fs2 = require("fs");
4190
- var import_path11 = require("path");
4269
+ var import_fs3 = require("fs");
4270
+ var import_path12 = require("path");
4191
4271
 
4192
4272
  // ../../node_modules/diff/lib/index.mjs
4193
4273
  function Diff() {
@@ -4291,11 +4371,11 @@ Diff.prototype = {
4291
4371
  }
4292
4372
  }
4293
4373
  },
4294
- addToPath: function addToPath(path2, added, removed, oldPosInc, options) {
4295
- var last = path2.lastComponent;
4374
+ addToPath: function addToPath(path3, added, removed, oldPosInc, options) {
4375
+ var last = path3.lastComponent;
4296
4376
  if (last && !options.oneChangePerToken && last.added === added && last.removed === removed) {
4297
4377
  return {
4298
- oldPos: path2.oldPos + oldPosInc,
4378
+ oldPos: path3.oldPos + oldPosInc,
4299
4379
  lastComponent: {
4300
4380
  count: last.count + 1,
4301
4381
  added,
@@ -4305,7 +4385,7 @@ Diff.prototype = {
4305
4385
  };
4306
4386
  } else {
4307
4387
  return {
4308
- oldPos: path2.oldPos + oldPosInc,
4388
+ oldPos: path3.oldPos + oldPosInc,
4309
4389
  lastComponent: {
4310
4390
  count: 1,
4311
4391
  added,
@@ -4745,9 +4825,9 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
4745
4825
  // src/run-scenario/file-diff.ts
4746
4826
  function deriveInfrastructurePaths(prePrep, postPrep) {
4747
4827
  const infraPaths = /* @__PURE__ */ new Set();
4748
- for (const path2 of Object.keys(postPrep)) {
4749
- if (prePrep[path2] === void 0 || prePrep[path2] !== postPrep[path2]) {
4750
- infraPaths.add(path2);
4828
+ for (const path3 of Object.keys(postPrep)) {
4829
+ if (prePrep[path3] === void 0 || prePrep[path3] !== postPrep[path3]) {
4830
+ infraPaths.add(path3);
4751
4831
  }
4752
4832
  }
4753
4833
  return infraPaths;
@@ -4807,13 +4887,13 @@ function isBinaryFile(filename) {
4807
4887
  function snapshotDirectory(dir, baseDir) {
4808
4888
  const snapshot = {};
4809
4889
  const base = baseDir || dir;
4810
- if (!(0, import_fs2.existsSync)(dir)) {
4890
+ if (!(0, import_fs3.existsSync)(dir)) {
4811
4891
  return snapshot;
4812
4892
  }
4813
- const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
4893
+ const entries = (0, import_fs3.readdirSync)(dir, { withFileTypes: true });
4814
4894
  for (const entry of entries) {
4815
- const fullPath = (0, import_path11.join)(dir, entry.name);
4816
- const relativePath = (0, import_path11.relative)(base, fullPath);
4895
+ const fullPath = (0, import_path12.join)(dir, entry.name);
4896
+ const relativePath = (0, import_path12.relative)(base, fullPath);
4817
4897
  if (shouldIgnore(entry.name)) {
4818
4898
  continue;
4819
4899
  }
@@ -4825,11 +4905,11 @@ function snapshotDirectory(dir, baseDir) {
4825
4905
  continue;
4826
4906
  }
4827
4907
  try {
4828
- const stats = (0, import_fs2.statSync)(fullPath);
4908
+ const stats = (0, import_fs3.statSync)(fullPath);
4829
4909
  if (stats.size > MAX_FILE_SIZE) {
4830
4910
  continue;
4831
4911
  }
4832
- const content = (0, import_fs2.readFileSync)(fullPath, "utf-8");
4912
+ const content = (0, import_fs3.readFileSync)(fullPath, "utf-8");
4833
4913
  snapshot[relativePath] = content;
4834
4914
  } catch {
4835
4915
  continue;
@@ -4858,19 +4938,19 @@ function generateDiffLines(before, after) {
4858
4938
  function diffSnapshots(before, after, infrastructurePaths) {
4859
4939
  const diffs = [];
4860
4940
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4861
- for (const path2 of allPaths) {
4862
- const beforeContent = before[path2] ?? "";
4863
- const afterContent = after[path2] ?? "";
4864
- if (before[path2] !== void 0 && beforeContent === afterContent) {
4941
+ for (const path3 of allPaths) {
4942
+ const beforeContent = before[path3] ?? "";
4943
+ const afterContent = after[path3] ?? "";
4944
+ if (before[path3] !== void 0 && beforeContent === afterContent) {
4865
4945
  continue;
4866
4946
  }
4867
4947
  const diffLines2 = generateDiffLines(beforeContent, afterContent);
4868
4948
  diffs.push({
4869
- path: path2,
4949
+ path: path3,
4870
4950
  expected: beforeContent,
4871
4951
  actual: afterContent,
4872
4952
  diffLines: diffLines2,
4873
- ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4953
+ ...infrastructurePaths?.has(path3) && { isInfrastructure: true }
4874
4954
  });
4875
4955
  }
4876
4956
  const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
@@ -4897,9 +4977,9 @@ function diffSnapshots(before, after, infrastructurePaths) {
4897
4977
  function extractTemplateFiles(before, after, infrastructurePaths) {
4898
4978
  const files = [];
4899
4979
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4900
- for (const path2 of allPaths) {
4901
- const beforeContent = before[path2];
4902
- const afterContent = after[path2];
4980
+ for (const path3 of allPaths) {
4981
+ const beforeContent = before[path3];
4982
+ const afterContent = after[path3];
4903
4983
  if (afterContent === void 0) {
4904
4984
  continue;
4905
4985
  }
@@ -4912,10 +4992,10 @@ function extractTemplateFiles(before, after, infrastructurePaths) {
4912
4992
  status = "unchanged";
4913
4993
  }
4914
4994
  files.push({
4915
- path: path2,
4995
+ path: path3,
4916
4996
  content: afterContent,
4917
4997
  status,
4918
- ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4998
+ ...infrastructurePaths?.has(path3) && { isInfrastructure: true }
4919
4999
  });
4920
5000
  }
4921
5001
  files.sort((a, b) => a.path.localeCompare(b.path));
@@ -4999,7 +5079,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4999
5079
  }
5000
5080
  } : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
5001
5081
  return {
5002
- id: (0, import_crypto4.randomUUID)(),
5082
+ id: (0, import_crypto5.randomUUID)(),
5003
5083
  targetId,
5004
5084
  targetName,
5005
5085
  scenarioId: scenario.id,
@@ -5020,6 +5100,24 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
5020
5100
  // src/run-scenario/index.ts
5021
5101
  async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
5022
5102
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
5103
+ const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
5104
+ if (template) {
5105
+ console.log(
5106
+ (0, import_evalforge_types13.formatTraceEventLine)({
5107
+ evalRunId: evalRunId2,
5108
+ scenarioId: scenario.id,
5109
+ scenarioName: scenario.name,
5110
+ targetId,
5111
+ targetName,
5112
+ stepNumber: 0,
5113
+ type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
5114
+ outputPreview: "Setting up environment (installing dependencies)...",
5115
+ elapsedMs: 0,
5116
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
5117
+ isComplete: false
5118
+ })
5119
+ );
5120
+ }
5023
5121
  const workDir = await prepareWorkingDirectory(
5024
5122
  config,
5025
5123
  evalRunId2,
@@ -5083,7 +5181,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
5083
5181
  }
5084
5182
 
5085
5183
  // src/evaluation-loop.ts
5086
- var import_crypto5 = require("crypto");
5184
+ var import_crypto6 = require("crypto");
5087
5185
  async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5088
5186
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
5089
5187
  let completedExecutions = 0;
@@ -5109,7 +5207,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5109
5207
  `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
5110
5208
  );
5111
5209
  const errorResult = {
5112
- id: (0, import_crypto5.randomUUID)(),
5210
+ id: (0, import_crypto6.randomUUID)(),
5113
5211
  targetId,
5114
5212
  targetName,
5115
5213
  scenarioId: scenario.id,