@wix/evalforge-evaluator 0.170.0 → 0.171.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -90,8 +90,8 @@ function createApiClient(serverUrl, options = "") {
90
90
  }
91
91
  return headers;
92
92
  }
93
- async function fetchJson(path2) {
94
- const url = `${serverUrl}${apiPrefix}${pathPrefix}${path2}`;
93
+ async function fetchJson(path3) {
94
+ const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
95
95
  console.error(`[API] GET ${url}`);
96
96
  const headers = buildHeaders();
97
97
  const response = await fetch(url, {
@@ -105,8 +105,8 @@ function createApiClient(serverUrl, options = "") {
105
105
  }
106
106
  return response.json();
107
107
  }
108
- async function postJson(path2, body) {
109
- const url = `${serverUrl}${apiPrefix}${pathPrefix}${path2}`;
108
+ async function postJson(path3, body) {
109
+ const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
110
110
  console.error(`[API] POST ${url}`);
111
111
  const response = await fetch(url, {
112
112
  method: "POST",
@@ -120,8 +120,8 @@ function createApiClient(serverUrl, options = "") {
120
120
  );
121
121
  }
122
122
  }
123
- async function deleteRequest(path2) {
124
- const url = `${serverUrl}${apiPrefix}${pathPrefix}${path2}`;
123
+ async function deleteRequest(path3) {
124
+ const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
125
125
  console.error(`[API] DELETE ${url}`);
126
126
  const headers = buildHeaders();
127
127
  const response = await fetch(url, {
@@ -135,8 +135,8 @@ function createApiClient(serverUrl, options = "") {
135
135
  );
136
136
  }
137
137
  }
138
- async function putJson(path2, body) {
139
- const url = `${serverUrl}${apiPrefix}${pathPrefix}${path2}`;
138
+ async function putJson(path3, body) {
139
+ const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
140
140
  console.error(`[API] PUT ${url}`);
141
141
  const response = await fetch(url, {
142
142
  method: "PUT",
@@ -432,10 +432,10 @@ var import_evalforge_types13 = require("@wix/evalforge-types");
432
432
  var import_eval_assertions = require("@wix/eval-assertions");
433
433
 
434
434
  // src/run-scenario/environment.ts
435
- var import_fs = require("fs");
435
+ var import_fs2 = require("fs");
436
436
  var import_promises2 = require("fs/promises");
437
437
  var import_os = require("os");
438
- var import_path2 = __toESM(require("path"));
438
+ var import_path3 = __toESM(require("path"));
439
439
  var import_evalforge_github_client = require("@wix/evalforge-github-client");
440
440
 
441
441
  // src/run-scenario/utils/write-files.ts
@@ -455,6 +455,86 @@ async function writeFilesToDirectory(targetDir, files) {
455
455
  }
456
456
  }
457
457
 
458
+ // src/run-scenario/install-dependencies.ts
459
+ var import_fs = require("fs");
460
+ var import_crypto = require("crypto");
461
+ var import_path2 = __toESM(require("path"));
462
+ var import_child_process = require("child_process");
463
+ function detectPackageManager(workDir) {
464
+ if ((0, import_fs.existsSync)(import_path2.default.join(workDir, "pnpm-lock.yaml"))) {
465
+ return { cmd: "pnpm", args: ["install", "--frozen-lockfile"], cacheSourceFile: "pnpm-lock.yaml" };
466
+ }
467
+ if ((0, import_fs.existsSync)(import_path2.default.join(workDir, "package-lock.json"))) {
468
+ return { cmd: "npm", args: ["ci"], cacheSourceFile: "package-lock.json" };
469
+ }
470
+ if ((0, import_fs.existsSync)(import_path2.default.join(workDir, "yarn.lock"))) {
471
+ return { cmd: "yarn", args: ["install", "--frozen-lockfile"], cacheSourceFile: "yarn.lock" };
472
+ }
473
+ (0, import_fs.writeFileSync)(import_path2.default.join(workDir, "yarn.lock"), "", "utf-8");
474
+ return { cmd: "yarn", args: ["install"], cacheSourceFile: "package.json" };
475
+ }
476
+ function cloneDirectory(src, dest) {
477
+ if (process.platform === "darwin") {
478
+ (0, import_child_process.execFileSync)("cp", ["-rc", src, dest]);
479
+ } else {
480
+ (0, import_fs.cpSync)(src, dest, { recursive: true });
481
+ }
482
+ }
483
+ function installWithCache(workDir, exec, cacheBase, pm) {
484
+ const sourceContent = (0, import_fs.readFileSync)(import_path2.default.join(workDir, pm.cacheSourceFile), "utf-8");
485
+ const cacheKey = (0, import_crypto.createHash)("sha256").update(sourceContent).digest("hex").slice(0, 16);
486
+ const cachedNodeModules = import_path2.default.join(cacheBase, cacheKey, "node_modules");
487
+ const targetNodeModules = import_path2.default.join(workDir, "node_modules");
488
+ const cacheDir = import_path2.default.dirname(cachedNodeModules);
489
+ const cachedYarnLock = import_path2.default.join(cacheDir, "yarn.lock");
490
+ if ((0, import_fs.existsSync)(cachedNodeModules)) {
491
+ console.log(`[environment] Restoring node_modules from cache (key: ${cacheKey})`);
492
+ if (!(0, import_fs.existsSync)(targetNodeModules)) {
493
+ cloneDirectory(cachedNodeModules, targetNodeModules);
494
+ }
495
+ if ((0, import_fs.existsSync)(cachedYarnLock)) {
496
+ (0, import_fs.copyFileSync)(cachedYarnLock, import_path2.default.join(workDir, "yarn.lock"));
497
+ }
498
+ return;
499
+ }
500
+ console.log(`[environment] Running ${pm.cmd} ${pm.args.join(" ")} in ${workDir} (cache key: ${cacheKey})`);
501
+ try {
502
+ exec(pm.cmd, pm.args, { cwd: workDir, stdio: "inherit", timeout: 18e4 });
503
+ } catch (err) {
504
+ console.error("[environment] Dependency installation failed:", err instanceof Error ? err.message : String(err));
505
+ return;
506
+ }
507
+ console.log("[environment] Dependency installation complete \u2014 saving to cache");
508
+ try {
509
+ (0, import_fs.mkdirSync)(cacheDir, { recursive: true });
510
+ const yarnLockPath = import_path2.default.join(workDir, "yarn.lock");
511
+ if ((0, import_fs.existsSync)(yarnLockPath)) {
512
+ (0, import_fs.copyFileSync)(yarnLockPath, cachedYarnLock);
513
+ }
514
+ (0, import_fs.renameSync)(targetNodeModules, cachedNodeModules);
515
+ cloneDirectory(cachedNodeModules, targetNodeModules);
516
+ } catch (err) {
517
+ console.error("[environment] Failed to save to cache (installation still succeeded):", err instanceof Error ? err.message : String(err));
518
+ }
519
+ }
520
+ async function installDependencies(workDir, exec = import_child_process.execFileSync, cacheBase) {
521
+ if (!(0, import_fs.existsSync)(import_path2.default.join(workDir, "package.json"))) {
522
+ return;
523
+ }
524
+ const pm = detectPackageManager(workDir);
525
+ if (cacheBase) {
526
+ installWithCache(workDir, exec, cacheBase, pm);
527
+ return;
528
+ }
529
+ console.log(`[environment] Running ${pm.cmd} ${pm.args.join(" ")} in ${workDir}`);
530
+ try {
531
+ exec(pm.cmd, pm.args, { cwd: workDir, stdio: "inherit", timeout: 18e4 });
532
+ console.log("[environment] Dependency installation complete");
533
+ } catch (err) {
534
+ console.error("[environment] Dependency installation failed:", err instanceof Error ? err.message : String(err));
535
+ }
536
+ }
537
+
458
538
  // src/run-scenario/environment.ts
459
539
  async function fetchAndWriteTemplateFiles(template, workDir) {
460
540
  let sourceFiles = [];
@@ -475,27 +555,27 @@ async function fetchAndWriteTemplateFiles(template, workDir) {
475
555
  const content = ef.gitSource ? await (0, import_evalforge_github_client.fetchGitHubFile)(ef.gitSource, {
476
556
  userAgent: "EvalForge-Evaluator"
477
557
  }) : ef.content ?? "";
478
- const dest = import_path2.default.resolve(workDir, ef.path);
479
- if (!dest.startsWith(workDir + import_path2.sep)) {
558
+ const dest = import_path3.default.resolve(workDir, ef.path);
559
+ if (!dest.startsWith(workDir + import_path3.sep)) {
480
560
  throw new Error(
481
561
  `Extra file path escapes working directory: "${ef.path}"`
482
562
  );
483
563
  }
484
- await (0, import_promises2.mkdir)(import_path2.default.dirname(dest), { recursive: true });
564
+ await (0, import_promises2.mkdir)(import_path3.default.dirname(dest), { recursive: true });
485
565
  await (0, import_promises2.writeFile)(dest, content, "utf8");
486
566
  })
487
567
  );
488
568
  }
489
569
  function writeWixEnvFile(workDir) {
490
- const configPath = import_path2.default.join(workDir, "wix.config.json");
491
- if (!(0, import_fs.existsSync)(configPath)) {
570
+ const configPath = import_path3.default.join(workDir, "wix.config.json");
571
+ if (!(0, import_fs2.existsSync)(configPath)) {
492
572
  return;
493
573
  }
494
574
  try {
495
- const config = JSON.parse((0, import_fs.readFileSync)(configPath, "utf-8"));
575
+ const config = JSON.parse((0, import_fs2.readFileSync)(configPath, "utf-8"));
496
576
  if (config.appId) {
497
- (0, import_fs.writeFileSync)(
498
- import_path2.default.join(workDir, ".env"),
577
+ (0, import_fs2.writeFileSync)(
578
+ import_path3.default.join(workDir, ".env"),
499
579
  `WIX_CLIENT_ID=${config.appId}
500
580
  `,
501
581
  "utf-8"
@@ -507,34 +587,36 @@ function writeWixEnvFile(workDir) {
507
587
  }
508
588
  }
509
589
  async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId, template) {
510
- const baseDir = config.evaluationsDir ?? import_path2.default.join((0, import_os.tmpdir)(), "evalforge-evaluations");
590
+ const baseDir = config.evaluationsDir ?? import_path3.default.join((0, import_os.tmpdir)(), "evalforge-evaluations");
591
+ const nodeModulesCacheDir = import_path3.default.join(baseDir, "_node_modules_cache");
511
592
  if (template) {
512
593
  if (!config.evaluationsDir) {
513
594
  console.warn(
514
595
  "Template specified but EVALUATIONS_DIR not set, using temp directory"
515
596
  );
516
597
  }
517
- const workDir2 = import_path2.default.join(baseDir, `${evalRunId2}_${targetId}`);
518
- if ((0, import_fs.existsSync)(workDir2)) {
519
- (0, import_fs.rmSync)(workDir2, { recursive: true });
598
+ const workDir2 = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}`);
599
+ if ((0, import_fs2.existsSync)(workDir2)) {
600
+ (0, import_fs2.rmSync)(workDir2, { recursive: true });
520
601
  }
521
- (0, import_fs.mkdirSync)(workDir2, { recursive: true });
602
+ (0, import_fs2.mkdirSync)(workDir2, { recursive: true });
522
603
  await fetchAndWriteTemplateFiles(template, workDir2);
523
604
  console.log(`Template files written to ${workDir2}`);
524
605
  writeWixEnvFile(workDir2);
606
+ await installDependencies(workDir2, void 0, nodeModulesCacheDir);
525
607
  return workDir2;
526
608
  }
527
- const workDir = import_path2.default.join(baseDir, `${evalRunId2}_${targetId}_${scenarioId}`);
528
- if ((0, import_fs.existsSync)(workDir)) {
529
- (0, import_fs.rmSync)(workDir, { recursive: true });
609
+ const workDir = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}_${scenarioId}`);
610
+ if ((0, import_fs2.existsSync)(workDir)) {
611
+ (0, import_fs2.rmSync)(workDir, { recursive: true });
530
612
  }
531
- (0, import_fs.mkdirSync)(workDir, { recursive: true });
613
+ (0, import_fs2.mkdirSync)(workDir, { recursive: true });
532
614
  console.log(`Empty working directory created at ${workDir}`);
533
615
  return workDir;
534
616
  }
535
617
 
536
618
  // src/run-scenario/run-agent-with-context.ts
537
- var import_crypto4 = require("crypto");
619
+ var import_crypto5 = require("crypto");
538
620
 
539
621
  // src/run-scenario/agents/registry.ts
540
622
  var AgentAdapterRegistry = class {
@@ -643,7 +725,7 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
643
725
 
644
726
  // src/run-scenario/agents/claude-code/write-skills.ts
645
727
  var import_promises3 = require("fs/promises");
646
- var import_path3 = require("path");
728
+ var import_path4 = require("path");
647
729
  var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
648
730
  async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
649
731
  await Promise.all(
@@ -652,7 +734,7 @@ async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_g
652
734
  }
653
735
  async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
654
736
  const skillName = skill.name;
655
- const skillDir = (0, import_path3.join)(cwd, ".claude", "skills", skillName);
737
+ const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
656
738
  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
657
739
  const version = skill.latestVersion;
658
740
  if (version?.files && version.files.length > 0) {
@@ -692,18 +774,18 @@ function resolveTimeoutMs(maxTurns, maxDurationMs) {
692
774
  }
693
775
 
694
776
  // src/run-scenario/agents/claude-code/execute.ts
695
- var import_crypto = require("crypto");
777
+ var import_crypto2 = require("crypto");
696
778
 
697
779
  // src/run-scenario/agents/claude-code/write-mcp.ts
698
780
  var import_promises5 = require("fs/promises");
699
- var import_path5 = require("path");
781
+ var import_path6 = require("path");
700
782
  var import_evalforge_types2 = require("@wix/evalforge-types");
701
783
 
702
784
  // src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
703
785
  var import_promises4 = require("fs/promises");
704
- var import_path4 = require("path");
786
+ var import_path5 = require("path");
705
787
  var import_os2 = require("os");
706
- var WIX_AUTH_FILE = (0, import_path4.join)((0, import_os2.homedir)(), ".wix", "auth", "api-key.json");
788
+ var WIX_AUTH_FILE = (0, import_path5.join)((0, import_os2.homedir)(), ".wix", "auth", "api-key.json");
707
789
  async function loadWixAuthPlaceholders(authFilePath = WIX_AUTH_FILE) {
708
790
  try {
709
791
  const content = await (0, import_promises4.readFile)(authFilePath, "utf-8");
@@ -762,14 +844,14 @@ async function writeMcpToFilesystem(cwd, mcps) {
762
844
  null,
763
845
  2
764
846
  );
765
- const filePath = (0, import_path5.join)(cwd, ".mcp.json");
847
+ const filePath = (0, import_path6.join)(cwd, ".mcp.json");
766
848
  await (0, import_promises5.writeFile)(filePath, content, "utf8");
767
849
  console.log(`[MCP] Written to ${filePath}`);
768
850
  }
769
851
 
770
852
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
771
853
  var import_promises6 = require("fs/promises");
772
- var import_path6 = require("path");
854
+ var import_path7 = require("path");
773
855
  var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
774
856
  var AGENTS_DIR = ".claude/agents";
775
857
  function toAgentFilename(name, index, nameCount) {
@@ -807,12 +889,12 @@ async function resolveSubAgentContent(agent, fetchFn) {
807
889
  }
808
890
  async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
809
891
  if (subAgents.length === 0) return;
810
- const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
892
+ const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
811
893
  await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
812
894
  const nameCount = /* @__PURE__ */ new Map();
813
895
  for (const [i, agent] of subAgents.entries()) {
814
896
  const filename = toAgentFilename(agent.name, i, nameCount);
815
- const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
897
+ const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
816
898
  const content = await resolveSubAgentContent(agent, fetchFn);
817
899
  await (0, import_promises6.writeFile)(filePath, content, "utf8");
818
900
  }
@@ -821,7 +903,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalf
821
903
 
822
904
  // src/run-scenario/agents/claude-code/write-rules.ts
823
905
  var import_promises7 = require("fs/promises");
824
- var import_path7 = require("path");
906
+ var import_path8 = require("path");
825
907
  var CURSOR_RULES_DIR = ".cursor/rules";
826
908
  function toRuleFilename(name, index, nameCount) {
827
909
  const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `rule-${index}`;
@@ -854,9 +936,9 @@ function validateGenericDirectory(dir, cwd) {
854
936
  `Generic rule directory may not contain "..", got: "${dir}"`
855
937
  );
856
938
  }
857
- const normalizedCwd = cwd.endsWith(import_path7.sep) ? cwd.slice(0, -1) : cwd;
858
- const resolved = (0, import_path7.resolve)(normalizedCwd, trimmed);
859
- if (!resolved.startsWith(normalizedCwd + import_path7.sep)) {
939
+ const normalizedCwd = cwd.endsWith(import_path8.sep) ? cwd.slice(0, -1) : cwd;
940
+ const resolved = (0, import_path8.resolve)(normalizedCwd, trimmed);
941
+ if (!resolved.startsWith(normalizedCwd + import_path8.sep)) {
860
942
  throw new Error(
861
943
  `Generic rule directory escapes the working directory: "${dir}"`
862
944
  );
@@ -870,20 +952,20 @@ async function writeRulesToFilesystem(cwd, rules) {
870
952
  for (const [i, rule] of rules.entries()) {
871
953
  switch (rule.ruleType) {
872
954
  case "claude-md": {
873
- await appendToFile((0, import_path7.join)(cwd, "CLAUDE.md"), rule.content);
955
+ await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
874
956
  break;
875
957
  }
876
958
  case "agents-md": {
877
- await appendToFile((0, import_path7.join)(cwd, "AGENTS.md"), rule.content);
959
+ await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
878
960
  break;
879
961
  }
880
962
  case "cursor-rule": {
881
963
  if (!hasCursorRules) {
882
- await (0, import_promises7.mkdir)((0, import_path7.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
964
+ await (0, import_promises7.mkdir)((0, import_path8.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
883
965
  hasCursorRules = true;
884
966
  }
885
967
  const filename = toRuleFilename(rule.name, i, nameCount);
886
- const filePath = (0, import_path7.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
968
+ const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
887
969
  await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
888
970
  break;
889
971
  }
@@ -892,10 +974,10 @@ async function writeRulesToFilesystem(cwd, rules) {
892
974
  rule.directory ?? ".opencode/rules",
893
975
  cwd
894
976
  );
895
- const dirPath = (0, import_path7.join)(cwd, directory);
977
+ const dirPath = (0, import_path8.join)(cwd, directory);
896
978
  await (0, import_promises7.mkdir)(dirPath, { recursive: true });
897
979
  const filename = toRuleFilename(rule.name, i, nameCount);
898
- await (0, import_promises7.writeFile)((0, import_path7.join)(dirPath, `${filename}.md`), rule.content, "utf8");
980
+ await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
899
981
  break;
900
982
  }
901
983
  default: {
@@ -1066,8 +1148,8 @@ function extractToolActionDescription(toolName, toolArgs) {
1066
1148
  }
1067
1149
  }
1068
1150
  if (toolName === "LS" || toolName === "ls" || toolName === "ListFiles") {
1069
- const path2 = args.path || args.directory || ".";
1070
- return `Listing: ${String(path2).slice(0, 50)}`;
1151
+ const path3 = args.path || args.directory || ".";
1152
+ return `Listing: ${String(path3).slice(0, 50)}`;
1071
1153
  }
1072
1154
  if ((toolName === "Read" || toolName === "read" || toolName === "View") && (args.file_path || args.path || args.target_file)) {
1073
1155
  const filePath = String(
@@ -1950,7 +2032,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1950
2032
  const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
1951
2033
  if (hasThinking && (hasText || toolCallCount > 0)) {
1952
2034
  subSteps.push({
1953
- id: (0, import_crypto.randomUUID)(),
2035
+ id: (0, import_crypto2.randomUUID)(),
1954
2036
  stepNumber: 0,
1955
2037
  // renumbered below
1956
2038
  turnIndex,
@@ -1980,7 +2062,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1980
2062
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
1981
2063
  const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
1982
2064
  subSteps.push({
1983
- id: (0, import_crypto.randomUUID)(),
2065
+ id: (0, import_crypto2.randomUUID)(),
1984
2066
  stepNumber: 0,
1985
2067
  turnIndex,
1986
2068
  type: import_evalforge_types4.LLMStepType.TOOL_USE,
@@ -2010,7 +2092,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2010
2092
  }
2011
2093
  if (hasText && toolCallCount > 0) {
2012
2094
  subSteps.push({
2013
- id: (0, import_crypto.randomUUID)(),
2095
+ id: (0, import_crypto2.randomUUID)(),
2014
2096
  stepNumber: 0,
2015
2097
  turnIndex,
2016
2098
  type: import_evalforge_types4.LLMStepType.COMPLETION,
@@ -2032,7 +2114,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2032
2114
  if (subSteps.length === 0) {
2033
2115
  const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
2034
2116
  subSteps.push({
2035
- id: (0, import_crypto.randomUUID)(),
2117
+ id: (0, import_crypto2.randomUUID)(),
2036
2118
  stepNumber: 0,
2037
2119
  turnIndex,
2038
2120
  type: stepType,
@@ -2090,7 +2172,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2090
2172
  stepTypeBreakdown
2091
2173
  };
2092
2174
  return {
2093
- id: (0, import_crypto.randomUUID)(),
2175
+ id: (0, import_crypto2.randomUUID)(),
2094
2176
  steps: traceSteps,
2095
2177
  summary
2096
2178
  };
@@ -2192,7 +2274,7 @@ defaultRegistry.register(claudeCodeAdapter);
2192
2274
  var import_evalforge_types9 = require("@wix/evalforge-types");
2193
2275
 
2194
2276
  // src/run-scenario/agents/opencode/execute.ts
2195
- var import_child_process = require("child_process");
2277
+ var import_child_process2 = require("child_process");
2196
2278
  var import_evalforge_types8 = require("@wix/evalforge-types");
2197
2279
 
2198
2280
  // src/run-scenario/agents/opencode/types.ts
@@ -2206,7 +2288,7 @@ function tryParseJson(text) {
2206
2288
 
2207
2289
  // src/run-scenario/agents/opencode/write-skills.ts
2208
2290
  var import_promises8 = require("fs/promises");
2209
- var import_path8 = require("path");
2291
+ var import_path9 = require("path");
2210
2292
  var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
2211
2293
  async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
2212
2294
  await Promise.all(
@@ -2215,7 +2297,7 @@ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_
2215
2297
  }
2216
2298
  async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
2217
2299
  const skillName = skill.name;
2218
- const skillDir = (0, import_path8.join)(cwd, ".opencode", "skills", skillName);
2300
+ const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
2219
2301
  await (0, import_promises8.mkdir)(skillDir, { recursive: true });
2220
2302
  const version = skill.latestVersion;
2221
2303
  if (version?.files && version.files.length > 0) {
@@ -2248,7 +2330,7 @@ async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
2248
2330
 
2249
2331
  // src/run-scenario/agents/opencode/write-sub-agents.ts
2250
2332
  var import_promises9 = require("fs/promises");
2251
- var import_path9 = require("path");
2333
+ var import_path10 = require("path");
2252
2334
  var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
2253
2335
  var AGENTS_DIR2 = ".opencode/agents";
2254
2336
  function toAgentFilename2(name, index, nameCount) {
@@ -2286,12 +2368,12 @@ async function resolveSubAgentContent2(agent, fetchFn) {
2286
2368
  }
2287
2369
  async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
2288
2370
  if (subAgents.length === 0) return;
2289
- const agentsDir = (0, import_path9.join)(cwd, AGENTS_DIR2);
2371
+ const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
2290
2372
  await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
2291
2373
  const nameCount = /* @__PURE__ */ new Map();
2292
2374
  for (const [i, agent] of subAgents.entries()) {
2293
2375
  const filename = toAgentFilename2(agent.name, i, nameCount);
2294
- const filePath = (0, import_path9.join)(agentsDir, `${filename}.md`);
2376
+ const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
2295
2377
  const content = await resolveSubAgentContent2(agent, fetchFn);
2296
2378
  await (0, import_promises9.writeFile)(filePath, content, "utf8");
2297
2379
  }
@@ -2454,7 +2536,7 @@ async function buildOpenCodeEnv(options) {
2454
2536
 
2455
2537
  // src/run-scenario/agents/opencode/build-trace.ts
2456
2538
  var import_evalforge_types7 = require("@wix/evalforge-types");
2457
- var import_crypto2 = require("crypto");
2539
+ var import_crypto3 = require("crypto");
2458
2540
  function toCanonicalModelId(modelId) {
2459
2541
  const slashIndex = modelId.indexOf("/");
2460
2542
  return slashIndex > 0 ? modelId.slice(slashIndex + 1) : modelId;
@@ -2530,7 +2612,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2530
2612
  const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
2531
2613
  if (hasThinking && (hasText || toolCallCount > 0)) {
2532
2614
  subSteps.push({
2533
- id: (0, import_crypto2.randomUUID)(),
2615
+ id: (0, import_crypto3.randomUUID)(),
2534
2616
  stepNumber: 0,
2535
2617
  turnIndex,
2536
2618
  type: import_evalforge_types7.LLMStepType.THINKING,
@@ -2559,7 +2641,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2559
2641
  const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
2560
2642
  const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
2561
2643
  subSteps.push({
2562
- id: (0, import_crypto2.randomUUID)(),
2644
+ id: (0, import_crypto3.randomUUID)(),
2563
2645
  stepNumber: 0,
2564
2646
  turnIndex,
2565
2647
  type: import_evalforge_types7.LLMStepType.TOOL_USE,
@@ -2589,7 +2671,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2589
2671
  }
2590
2672
  if (hasText && toolCallCount > 0) {
2591
2673
  subSteps.push({
2592
- id: (0, import_crypto2.randomUUID)(),
2674
+ id: (0, import_crypto3.randomUUID)(),
2593
2675
  stepNumber: 0,
2594
2676
  turnIndex,
2595
2677
  type: import_evalforge_types7.LLMStepType.COMPLETION,
@@ -2611,7 +2693,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2611
2693
  if (subSteps.length === 0) {
2612
2694
  const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
2613
2695
  subSteps.push({
2614
- id: (0, import_crypto2.randomUUID)(),
2696
+ id: (0, import_crypto3.randomUUID)(),
2615
2697
  stepNumber: 0,
2616
2698
  turnIndex,
2617
2699
  type: stepType,
@@ -2680,7 +2762,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
2680
2762
  stepTypeBreakdown
2681
2763
  };
2682
2764
  return {
2683
- id: (0, import_crypto2.randomUUID)(),
2765
+ id: (0, import_crypto3.randomUUID)(),
2684
2766
  steps: allSteps,
2685
2767
  summary
2686
2768
  };
@@ -2755,7 +2837,7 @@ function buildConversation2(timestampedEvents) {
2755
2837
 
2756
2838
  // src/run-scenario/agents/opencode/execute.ts
2757
2839
  var import_promises10 = require("fs/promises");
2758
- var import_path10 = require("path");
2840
+ var import_path11 = require("path");
2759
2841
  var KILL_GRACE_PERIOD_MS = 5e3;
2760
2842
  var IDLE_TIMEOUT_MS = 12e4;
2761
2843
  var IDLE_CHECK_INTERVAL_MS = 15e3;
@@ -2780,14 +2862,14 @@ function extractToolAction(toolName, args) {
2780
2862
  return `Using ${toolName}...`;
2781
2863
  }
2782
2864
  async function writePromptImages(cwd, images) {
2783
- const imagesDir = (0, import_path10.join)(cwd, "prompt-images");
2865
+ const imagesDir = (0, import_path11.join)(cwd, "prompt-images");
2784
2866
  await (0, import_promises10.mkdir)(imagesDir, { recursive: true });
2785
2867
  const filePaths = [];
2786
2868
  for (let i = 0; i < images.length; i++) {
2787
2869
  const img = images[i];
2788
2870
  const ext = img.mediaType.split("/")[1] || "png";
2789
2871
  const filename = `image-${i}.${ext}`;
2790
- const filepath = (0, import_path10.join)(imagesDir, filename);
2872
+ const filepath = (0, import_path11.join)(imagesDir, filename);
2791
2873
  const buffer = Buffer.from(img.base64, "base64");
2792
2874
  await (0, import_promises10.writeFile)(filepath, buffer);
2793
2875
  filePaths.push(filepath);
@@ -2880,10 +2962,10 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
2880
2962
  }
2881
2963
  }
2882
2964
  async function writeSystemPromptRule(cwd, systemPrompt) {
2883
- const rulesDir = (0, import_path10.join)(cwd, ".opencode", "rules");
2965
+ const rulesDir = (0, import_path11.join)(cwd, ".opencode", "rules");
2884
2966
  await (0, import_promises10.mkdir)(rulesDir, { recursive: true });
2885
2967
  await (0, import_promises10.writeFile)(
2886
- (0, import_path10.join)(rulesDir, "evalforge-system-prompt.md"),
2968
+ (0, import_path11.join)(rulesDir, "evalforge-system-prompt.md"),
2887
2969
  systemPrompt,
2888
2970
  "utf-8"
2889
2971
  );
@@ -2986,7 +3068,7 @@ function spawnOpenCodeProcess(opts) {
2986
3068
  };
2987
3069
  let child;
2988
3070
  try {
2989
- child = (0, import_child_process.spawn)("opencode", args, {
3071
+ child = (0, import_child_process2.spawn)("opencode", args, {
2990
3072
  cwd,
2991
3073
  env,
2992
3074
  stdio: ["ignore", "pipe", "pipe"],
@@ -3471,7 +3553,7 @@ var import_anthropic = require("@ai-sdk/anthropic");
3471
3553
  var import_google = require("@ai-sdk/google");
3472
3554
  var import_openai = require("@ai-sdk/openai");
3473
3555
  var import_evalforge_types11 = require("@wix/evalforge-types");
3474
- var import_crypto3 = require("crypto");
3556
+ var import_crypto4 = require("crypto");
3475
3557
 
3476
3558
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
3477
3559
  var import_mcp = require("@ai-sdk/mcp");
@@ -4087,7 +4169,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
4087
4169
  const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
4088
4170
  const toolResultError = findToolResultError(step);
4089
4171
  return {
4090
- id: (0, import_crypto3.randomUUID)(),
4172
+ id: (0, import_crypto4.randomUUID)(),
4091
4173
  stepNumber: i + 1,
4092
4174
  turnIndex: i,
4093
4175
  type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
@@ -4111,7 +4193,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
4111
4193
  total: totalUsage.totalTokens
4112
4194
  };
4113
4195
  return {
4114
- id: (0, import_crypto3.randomUUID)(),
4196
+ id: (0, import_crypto4.randomUUID)(),
4115
4197
  steps: traceSteps,
4116
4198
  summary: {
4117
4199
  totalSteps: traceSteps.length,
@@ -4186,8 +4268,8 @@ var simpleAgentAdapter = new SimpleAgentAdapter();
4186
4268
  defaultRegistry.register(simpleAgentAdapter);
4187
4269
 
4188
4270
  // src/run-scenario/file-diff.ts
4189
- var import_fs2 = require("fs");
4190
- var import_path11 = require("path");
4271
+ var import_fs3 = require("fs");
4272
+ var import_path12 = require("path");
4191
4273
 
4192
4274
  // ../../node_modules/diff/lib/index.mjs
4193
4275
  function Diff() {
@@ -4291,11 +4373,11 @@ Diff.prototype = {
4291
4373
  }
4292
4374
  }
4293
4375
  },
4294
- addToPath: function addToPath(path2, added, removed, oldPosInc, options) {
4295
- var last = path2.lastComponent;
4376
+ addToPath: function addToPath(path3, added, removed, oldPosInc, options) {
4377
+ var last = path3.lastComponent;
4296
4378
  if (last && !options.oneChangePerToken && last.added === added && last.removed === removed) {
4297
4379
  return {
4298
- oldPos: path2.oldPos + oldPosInc,
4380
+ oldPos: path3.oldPos + oldPosInc,
4299
4381
  lastComponent: {
4300
4382
  count: last.count + 1,
4301
4383
  added,
@@ -4305,7 +4387,7 @@ Diff.prototype = {
4305
4387
  };
4306
4388
  } else {
4307
4389
  return {
4308
- oldPos: path2.oldPos + oldPosInc,
4390
+ oldPos: path3.oldPos + oldPosInc,
4309
4391
  lastComponent: {
4310
4392
  count: 1,
4311
4393
  added,
@@ -4745,9 +4827,9 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
4745
4827
  // src/run-scenario/file-diff.ts
4746
4828
  function deriveInfrastructurePaths(prePrep, postPrep) {
4747
4829
  const infraPaths = /* @__PURE__ */ new Set();
4748
- for (const path2 of Object.keys(postPrep)) {
4749
- if (prePrep[path2] === void 0 || prePrep[path2] !== postPrep[path2]) {
4750
- infraPaths.add(path2);
4830
+ for (const path3 of Object.keys(postPrep)) {
4831
+ if (prePrep[path3] === void 0 || prePrep[path3] !== postPrep[path3]) {
4832
+ infraPaths.add(path3);
4751
4833
  }
4752
4834
  }
4753
4835
  return infraPaths;
@@ -4807,13 +4889,13 @@ function isBinaryFile(filename) {
4807
4889
  function snapshotDirectory(dir, baseDir) {
4808
4890
  const snapshot = {};
4809
4891
  const base = baseDir || dir;
4810
- if (!(0, import_fs2.existsSync)(dir)) {
4892
+ if (!(0, import_fs3.existsSync)(dir)) {
4811
4893
  return snapshot;
4812
4894
  }
4813
- const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
4895
+ const entries = (0, import_fs3.readdirSync)(dir, { withFileTypes: true });
4814
4896
  for (const entry of entries) {
4815
- const fullPath = (0, import_path11.join)(dir, entry.name);
4816
- const relativePath = (0, import_path11.relative)(base, fullPath);
4897
+ const fullPath = (0, import_path12.join)(dir, entry.name);
4898
+ const relativePath = (0, import_path12.relative)(base, fullPath);
4817
4899
  if (shouldIgnore(entry.name)) {
4818
4900
  continue;
4819
4901
  }
@@ -4825,11 +4907,11 @@ function snapshotDirectory(dir, baseDir) {
4825
4907
  continue;
4826
4908
  }
4827
4909
  try {
4828
- const stats = (0, import_fs2.statSync)(fullPath);
4910
+ const stats = (0, import_fs3.statSync)(fullPath);
4829
4911
  if (stats.size > MAX_FILE_SIZE) {
4830
4912
  continue;
4831
4913
  }
4832
- const content = (0, import_fs2.readFileSync)(fullPath, "utf-8");
4914
+ const content = (0, import_fs3.readFileSync)(fullPath, "utf-8");
4833
4915
  snapshot[relativePath] = content;
4834
4916
  } catch {
4835
4917
  continue;
@@ -4858,19 +4940,19 @@ function generateDiffLines(before, after) {
4858
4940
  function diffSnapshots(before, after, infrastructurePaths) {
4859
4941
  const diffs = [];
4860
4942
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4861
- for (const path2 of allPaths) {
4862
- const beforeContent = before[path2] ?? "";
4863
- const afterContent = after[path2] ?? "";
4864
- if (before[path2] !== void 0 && beforeContent === afterContent) {
4943
+ for (const path3 of allPaths) {
4944
+ const beforeContent = before[path3] ?? "";
4945
+ const afterContent = after[path3] ?? "";
4946
+ if (before[path3] !== void 0 && beforeContent === afterContent) {
4865
4947
  continue;
4866
4948
  }
4867
4949
  const diffLines2 = generateDiffLines(beforeContent, afterContent);
4868
4950
  diffs.push({
4869
- path: path2,
4951
+ path: path3,
4870
4952
  expected: beforeContent,
4871
4953
  actual: afterContent,
4872
4954
  diffLines: diffLines2,
4873
- ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
4955
+ ...infrastructurePaths?.has(path3) && { isInfrastructure: true }
4874
4956
  });
4875
4957
  }
4876
4958
  const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
@@ -4897,9 +4979,9 @@ function diffSnapshots(before, after, infrastructurePaths) {
4897
4979
  function extractTemplateFiles(before, after, infrastructurePaths) {
4898
4980
  const files = [];
4899
4981
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
4900
- for (const path2 of allPaths) {
4901
- const beforeContent = before[path2];
4902
- const afterContent = after[path2];
4982
+ for (const path3 of allPaths) {
4983
+ const beforeContent = before[path3];
4984
+ const afterContent = after[path3];
4903
4985
  if (afterContent === void 0) {
4904
4986
  continue;
4905
4987
  }
@@ -4912,10 +4994,10 @@ function extractTemplateFiles(before, after, infrastructurePaths) {
4912
4994
  status = "unchanged";
4913
4995
  }
4914
4996
  files.push({
4915
- path: path2,
4997
+ path: path3,
4916
4998
  content: afterContent,
4917
4999
  status,
4918
- ...infrastructurePaths?.has(path2) && { isInfrastructure: true }
5000
+ ...infrastructurePaths?.has(path3) && { isInfrastructure: true }
4919
5001
  });
4920
5002
  }
4921
5003
  files.sort((a, b) => a.path.localeCompare(b.path));
@@ -4999,7 +5081,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4999
5081
  }
5000
5082
  } : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
5001
5083
  return {
5002
- id: (0, import_crypto4.randomUUID)(),
5084
+ id: (0, import_crypto5.randomUUID)(),
5003
5085
  targetId,
5004
5086
  targetName,
5005
5087
  scenarioId: scenario.id,
@@ -5020,6 +5102,24 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
5020
5102
  // src/run-scenario/index.ts
5021
5103
  async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
5022
5104
  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
5105
+ const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
5106
+ if (template) {
5107
+ console.log(
5108
+ (0, import_evalforge_types13.formatTraceEventLine)({
5109
+ evalRunId: evalRunId2,
5110
+ scenarioId: scenario.id,
5111
+ scenarioName: scenario.name,
5112
+ targetId,
5113
+ targetName,
5114
+ stepNumber: 0,
5115
+ type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
5116
+ outputPreview: "Setting up environment (installing dependencies)...",
5117
+ elapsedMs: 0,
5118
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
5119
+ isComplete: false
5120
+ })
5121
+ );
5122
+ }
5023
5123
  const workDir = await prepareWorkingDirectory(
5024
5124
  config,
5025
5125
  evalRunId2,
@@ -5083,7 +5183,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
5083
5183
  }
5084
5184
 
5085
5185
  // src/evaluation-loop.ts
5086
- var import_crypto5 = require("crypto");
5186
+ var import_crypto6 = require("crypto");
5087
5187
  async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5088
5188
  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
5089
5189
  let completedExecutions = 0;
@@ -5109,7 +5209,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
5109
5209
  `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
5110
5210
  );
5111
5211
  const errorResult = {
5112
- id: (0, import_crypto5.randomUUID)(),
5212
+ id: (0, import_crypto6.randomUUID)(),
5113
5213
  targetId,
5114
5214
  targetName,
5115
5215
  scenarioId: scenario.id,