@wix/evalforge-evaluator 0.170.0 → 0.171.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +205 -105
- package/build/index.js.map +4 -4
- package/build/index.mjs +158 -48
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/install-dependencies.d.ts +11 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -90,8 +90,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
90
90
|
}
|
|
91
91
|
return headers;
|
|
92
92
|
}
|
|
93
|
-
async function fetchJson(
|
|
94
|
-
const url = `${serverUrl}${apiPrefix}${pathPrefix}${
|
|
93
|
+
async function fetchJson(path3) {
|
|
94
|
+
const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
|
|
95
95
|
console.error(`[API] GET ${url}`);
|
|
96
96
|
const headers = buildHeaders();
|
|
97
97
|
const response = await fetch(url, {
|
|
@@ -105,8 +105,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
105
105
|
}
|
|
106
106
|
return response.json();
|
|
107
107
|
}
|
|
108
|
-
async function postJson(
|
|
109
|
-
const url = `${serverUrl}${apiPrefix}${pathPrefix}${
|
|
108
|
+
async function postJson(path3, body) {
|
|
109
|
+
const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
|
|
110
110
|
console.error(`[API] POST ${url}`);
|
|
111
111
|
const response = await fetch(url, {
|
|
112
112
|
method: "POST",
|
|
@@ -120,8 +120,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
120
120
|
);
|
|
121
121
|
}
|
|
122
122
|
}
|
|
123
|
-
async function deleteRequest(
|
|
124
|
-
const url = `${serverUrl}${apiPrefix}${pathPrefix}${
|
|
123
|
+
async function deleteRequest(path3) {
|
|
124
|
+
const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
|
|
125
125
|
console.error(`[API] DELETE ${url}`);
|
|
126
126
|
const headers = buildHeaders();
|
|
127
127
|
const response = await fetch(url, {
|
|
@@ -135,8 +135,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
135
135
|
);
|
|
136
136
|
}
|
|
137
137
|
}
|
|
138
|
-
async function putJson(
|
|
139
|
-
const url = `${serverUrl}${apiPrefix}${pathPrefix}${
|
|
138
|
+
async function putJson(path3, body) {
|
|
139
|
+
const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
|
|
140
140
|
console.error(`[API] PUT ${url}`);
|
|
141
141
|
const response = await fetch(url, {
|
|
142
142
|
method: "PUT",
|
|
@@ -432,10 +432,10 @@ var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
|
432
432
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
433
433
|
|
|
434
434
|
// src/run-scenario/environment.ts
|
|
435
|
-
var
|
|
435
|
+
var import_fs2 = require("fs");
|
|
436
436
|
var import_promises2 = require("fs/promises");
|
|
437
437
|
var import_os = require("os");
|
|
438
|
-
var
|
|
438
|
+
var import_path3 = __toESM(require("path"));
|
|
439
439
|
var import_evalforge_github_client = require("@wix/evalforge-github-client");
|
|
440
440
|
|
|
441
441
|
// src/run-scenario/utils/write-files.ts
|
|
@@ -455,6 +455,86 @@ async function writeFilesToDirectory(targetDir, files) {
|
|
|
455
455
|
}
|
|
456
456
|
}
|
|
457
457
|
|
|
458
|
+
// src/run-scenario/install-dependencies.ts
|
|
459
|
+
var import_fs = require("fs");
|
|
460
|
+
var import_crypto = require("crypto");
|
|
461
|
+
var import_path2 = __toESM(require("path"));
|
|
462
|
+
var import_child_process = require("child_process");
|
|
463
|
+
function detectPackageManager(workDir) {
|
|
464
|
+
if ((0, import_fs.existsSync)(import_path2.default.join(workDir, "pnpm-lock.yaml"))) {
|
|
465
|
+
return { cmd: "pnpm", args: ["install", "--frozen-lockfile"], cacheSourceFile: "pnpm-lock.yaml" };
|
|
466
|
+
}
|
|
467
|
+
if ((0, import_fs.existsSync)(import_path2.default.join(workDir, "package-lock.json"))) {
|
|
468
|
+
return { cmd: "npm", args: ["ci"], cacheSourceFile: "package-lock.json" };
|
|
469
|
+
}
|
|
470
|
+
if ((0, import_fs.existsSync)(import_path2.default.join(workDir, "yarn.lock"))) {
|
|
471
|
+
return { cmd: "yarn", args: ["install", "--frozen-lockfile"], cacheSourceFile: "yarn.lock" };
|
|
472
|
+
}
|
|
473
|
+
(0, import_fs.writeFileSync)(import_path2.default.join(workDir, "yarn.lock"), "", "utf-8");
|
|
474
|
+
return { cmd: "yarn", args: ["install"], cacheSourceFile: "package.json" };
|
|
475
|
+
}
|
|
476
|
+
function cloneDirectory(src, dest) {
|
|
477
|
+
if (process.platform === "darwin") {
|
|
478
|
+
(0, import_child_process.execFileSync)("cp", ["-rc", src, dest]);
|
|
479
|
+
} else {
|
|
480
|
+
(0, import_fs.cpSync)(src, dest, { recursive: true });
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
function installWithCache(workDir, exec, cacheBase, pm) {
|
|
484
|
+
const sourceContent = (0, import_fs.readFileSync)(import_path2.default.join(workDir, pm.cacheSourceFile), "utf-8");
|
|
485
|
+
const cacheKey = (0, import_crypto.createHash)("sha256").update(sourceContent).digest("hex").slice(0, 16);
|
|
486
|
+
const cachedNodeModules = import_path2.default.join(cacheBase, cacheKey, "node_modules");
|
|
487
|
+
const targetNodeModules = import_path2.default.join(workDir, "node_modules");
|
|
488
|
+
const cacheDir = import_path2.default.dirname(cachedNodeModules);
|
|
489
|
+
const cachedYarnLock = import_path2.default.join(cacheDir, "yarn.lock");
|
|
490
|
+
if ((0, import_fs.existsSync)(cachedNodeModules)) {
|
|
491
|
+
console.log(`[environment] Restoring node_modules from cache (key: ${cacheKey})`);
|
|
492
|
+
if (!(0, import_fs.existsSync)(targetNodeModules)) {
|
|
493
|
+
cloneDirectory(cachedNodeModules, targetNodeModules);
|
|
494
|
+
}
|
|
495
|
+
if ((0, import_fs.existsSync)(cachedYarnLock)) {
|
|
496
|
+
(0, import_fs.copyFileSync)(cachedYarnLock, import_path2.default.join(workDir, "yarn.lock"));
|
|
497
|
+
}
|
|
498
|
+
return;
|
|
499
|
+
}
|
|
500
|
+
console.log(`[environment] Running ${pm.cmd} ${pm.args.join(" ")} in ${workDir} (cache key: ${cacheKey})`);
|
|
501
|
+
try {
|
|
502
|
+
exec(pm.cmd, pm.args, { cwd: workDir, stdio: "inherit", timeout: 18e4 });
|
|
503
|
+
} catch (err) {
|
|
504
|
+
console.error("[environment] Dependency installation failed:", err instanceof Error ? err.message : String(err));
|
|
505
|
+
return;
|
|
506
|
+
}
|
|
507
|
+
console.log("[environment] Dependency installation complete \u2014 saving to cache");
|
|
508
|
+
try {
|
|
509
|
+
(0, import_fs.mkdirSync)(cacheDir, { recursive: true });
|
|
510
|
+
const yarnLockPath = import_path2.default.join(workDir, "yarn.lock");
|
|
511
|
+
if ((0, import_fs.existsSync)(yarnLockPath)) {
|
|
512
|
+
(0, import_fs.copyFileSync)(yarnLockPath, cachedYarnLock);
|
|
513
|
+
}
|
|
514
|
+
(0, import_fs.renameSync)(targetNodeModules, cachedNodeModules);
|
|
515
|
+
cloneDirectory(cachedNodeModules, targetNodeModules);
|
|
516
|
+
} catch (err) {
|
|
517
|
+
console.error("[environment] Failed to save to cache (installation still succeeded):", err instanceof Error ? err.message : String(err));
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
async function installDependencies(workDir, exec = import_child_process.execFileSync, cacheBase) {
|
|
521
|
+
if (!(0, import_fs.existsSync)(import_path2.default.join(workDir, "package.json"))) {
|
|
522
|
+
return;
|
|
523
|
+
}
|
|
524
|
+
const pm = detectPackageManager(workDir);
|
|
525
|
+
if (cacheBase) {
|
|
526
|
+
installWithCache(workDir, exec, cacheBase, pm);
|
|
527
|
+
return;
|
|
528
|
+
}
|
|
529
|
+
console.log(`[environment] Running ${pm.cmd} ${pm.args.join(" ")} in ${workDir}`);
|
|
530
|
+
try {
|
|
531
|
+
exec(pm.cmd, pm.args, { cwd: workDir, stdio: "inherit", timeout: 18e4 });
|
|
532
|
+
console.log("[environment] Dependency installation complete");
|
|
533
|
+
} catch (err) {
|
|
534
|
+
console.error("[environment] Dependency installation failed:", err instanceof Error ? err.message : String(err));
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
|
|
458
538
|
// src/run-scenario/environment.ts
|
|
459
539
|
async function fetchAndWriteTemplateFiles(template, workDir) {
|
|
460
540
|
let sourceFiles = [];
|
|
@@ -475,27 +555,27 @@ async function fetchAndWriteTemplateFiles(template, workDir) {
|
|
|
475
555
|
const content = ef.gitSource ? await (0, import_evalforge_github_client.fetchGitHubFile)(ef.gitSource, {
|
|
476
556
|
userAgent: "EvalForge-Evaluator"
|
|
477
557
|
}) : ef.content ?? "";
|
|
478
|
-
const dest =
|
|
479
|
-
if (!dest.startsWith(workDir +
|
|
558
|
+
const dest = import_path3.default.resolve(workDir, ef.path);
|
|
559
|
+
if (!dest.startsWith(workDir + import_path3.sep)) {
|
|
480
560
|
throw new Error(
|
|
481
561
|
`Extra file path escapes working directory: "${ef.path}"`
|
|
482
562
|
);
|
|
483
563
|
}
|
|
484
|
-
await (0, import_promises2.mkdir)(
|
|
564
|
+
await (0, import_promises2.mkdir)(import_path3.default.dirname(dest), { recursive: true });
|
|
485
565
|
await (0, import_promises2.writeFile)(dest, content, "utf8");
|
|
486
566
|
})
|
|
487
567
|
);
|
|
488
568
|
}
|
|
489
569
|
function writeWixEnvFile(workDir) {
|
|
490
|
-
const configPath =
|
|
491
|
-
if (!(0,
|
|
570
|
+
const configPath = import_path3.default.join(workDir, "wix.config.json");
|
|
571
|
+
if (!(0, import_fs2.existsSync)(configPath)) {
|
|
492
572
|
return;
|
|
493
573
|
}
|
|
494
574
|
try {
|
|
495
|
-
const config = JSON.parse((0,
|
|
575
|
+
const config = JSON.parse((0, import_fs2.readFileSync)(configPath, "utf-8"));
|
|
496
576
|
if (config.appId) {
|
|
497
|
-
(0,
|
|
498
|
-
|
|
577
|
+
(0, import_fs2.writeFileSync)(
|
|
578
|
+
import_path3.default.join(workDir, ".env"),
|
|
499
579
|
`WIX_CLIENT_ID=${config.appId}
|
|
500
580
|
`,
|
|
501
581
|
"utf-8"
|
|
@@ -507,34 +587,36 @@ function writeWixEnvFile(workDir) {
|
|
|
507
587
|
}
|
|
508
588
|
}
|
|
509
589
|
async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId, template) {
|
|
510
|
-
const baseDir = config.evaluationsDir ??
|
|
590
|
+
const baseDir = config.evaluationsDir ?? import_path3.default.join((0, import_os.tmpdir)(), "evalforge-evaluations");
|
|
591
|
+
const nodeModulesCacheDir = import_path3.default.join(baseDir, "_node_modules_cache");
|
|
511
592
|
if (template) {
|
|
512
593
|
if (!config.evaluationsDir) {
|
|
513
594
|
console.warn(
|
|
514
595
|
"Template specified but EVALUATIONS_DIR not set, using temp directory"
|
|
515
596
|
);
|
|
516
597
|
}
|
|
517
|
-
const workDir2 =
|
|
518
|
-
if ((0,
|
|
519
|
-
(0,
|
|
598
|
+
const workDir2 = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}`);
|
|
599
|
+
if ((0, import_fs2.existsSync)(workDir2)) {
|
|
600
|
+
(0, import_fs2.rmSync)(workDir2, { recursive: true });
|
|
520
601
|
}
|
|
521
|
-
(0,
|
|
602
|
+
(0, import_fs2.mkdirSync)(workDir2, { recursive: true });
|
|
522
603
|
await fetchAndWriteTemplateFiles(template, workDir2);
|
|
523
604
|
console.log(`Template files written to ${workDir2}`);
|
|
524
605
|
writeWixEnvFile(workDir2);
|
|
606
|
+
await installDependencies(workDir2, void 0, nodeModulesCacheDir);
|
|
525
607
|
return workDir2;
|
|
526
608
|
}
|
|
527
|
-
const workDir =
|
|
528
|
-
if ((0,
|
|
529
|
-
(0,
|
|
609
|
+
const workDir = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}_${scenarioId}`);
|
|
610
|
+
if ((0, import_fs2.existsSync)(workDir)) {
|
|
611
|
+
(0, import_fs2.rmSync)(workDir, { recursive: true });
|
|
530
612
|
}
|
|
531
|
-
(0,
|
|
613
|
+
(0, import_fs2.mkdirSync)(workDir, { recursive: true });
|
|
532
614
|
console.log(`Empty working directory created at ${workDir}`);
|
|
533
615
|
return workDir;
|
|
534
616
|
}
|
|
535
617
|
|
|
536
618
|
// src/run-scenario/run-agent-with-context.ts
|
|
537
|
-
var
|
|
619
|
+
var import_crypto5 = require("crypto");
|
|
538
620
|
|
|
539
621
|
// src/run-scenario/agents/registry.ts
|
|
540
622
|
var AgentAdapterRegistry = class {
|
|
@@ -643,7 +725,7 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
|
643
725
|
|
|
644
726
|
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
645
727
|
var import_promises3 = require("fs/promises");
|
|
646
|
-
var
|
|
728
|
+
var import_path4 = require("path");
|
|
647
729
|
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
648
730
|
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
649
731
|
await Promise.all(
|
|
@@ -652,7 +734,7 @@ async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_g
|
|
|
652
734
|
}
|
|
653
735
|
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
654
736
|
const skillName = skill.name;
|
|
655
|
-
const skillDir = (0,
|
|
737
|
+
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
|
|
656
738
|
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
657
739
|
const version = skill.latestVersion;
|
|
658
740
|
if (version?.files && version.files.length > 0) {
|
|
@@ -692,18 +774,18 @@ function resolveTimeoutMs(maxTurns, maxDurationMs) {
|
|
|
692
774
|
}
|
|
693
775
|
|
|
694
776
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
695
|
-
var
|
|
777
|
+
var import_crypto2 = require("crypto");
|
|
696
778
|
|
|
697
779
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
698
780
|
var import_promises5 = require("fs/promises");
|
|
699
|
-
var
|
|
781
|
+
var import_path6 = require("path");
|
|
700
782
|
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
701
783
|
|
|
702
784
|
// src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
|
|
703
785
|
var import_promises4 = require("fs/promises");
|
|
704
|
-
var
|
|
786
|
+
var import_path5 = require("path");
|
|
705
787
|
var import_os2 = require("os");
|
|
706
|
-
var WIX_AUTH_FILE = (0,
|
|
788
|
+
var WIX_AUTH_FILE = (0, import_path5.join)((0, import_os2.homedir)(), ".wix", "auth", "api-key.json");
|
|
707
789
|
async function loadWixAuthPlaceholders(authFilePath = WIX_AUTH_FILE) {
|
|
708
790
|
try {
|
|
709
791
|
const content = await (0, import_promises4.readFile)(authFilePath, "utf-8");
|
|
@@ -762,14 +844,14 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
762
844
|
null,
|
|
763
845
|
2
|
|
764
846
|
);
|
|
765
|
-
const filePath = (0,
|
|
847
|
+
const filePath = (0, import_path6.join)(cwd, ".mcp.json");
|
|
766
848
|
await (0, import_promises5.writeFile)(filePath, content, "utf8");
|
|
767
849
|
console.log(`[MCP] Written to ${filePath}`);
|
|
768
850
|
}
|
|
769
851
|
|
|
770
852
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
771
853
|
var import_promises6 = require("fs/promises");
|
|
772
|
-
var
|
|
854
|
+
var import_path7 = require("path");
|
|
773
855
|
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
774
856
|
var AGENTS_DIR = ".claude/agents";
|
|
775
857
|
function toAgentFilename(name, index, nameCount) {
|
|
@@ -807,12 +889,12 @@ async function resolveSubAgentContent(agent, fetchFn) {
|
|
|
807
889
|
}
|
|
808
890
|
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
809
891
|
if (subAgents.length === 0) return;
|
|
810
|
-
const agentsDir = (0,
|
|
892
|
+
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
811
893
|
await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
|
|
812
894
|
const nameCount = /* @__PURE__ */ new Map();
|
|
813
895
|
for (const [i, agent] of subAgents.entries()) {
|
|
814
896
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
815
|
-
const filePath = (0,
|
|
897
|
+
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
816
898
|
const content = await resolveSubAgentContent(agent, fetchFn);
|
|
817
899
|
await (0, import_promises6.writeFile)(filePath, content, "utf8");
|
|
818
900
|
}
|
|
@@ -821,7 +903,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalf
|
|
|
821
903
|
|
|
822
904
|
// src/run-scenario/agents/claude-code/write-rules.ts
|
|
823
905
|
var import_promises7 = require("fs/promises");
|
|
824
|
-
var
|
|
906
|
+
var import_path8 = require("path");
|
|
825
907
|
var CURSOR_RULES_DIR = ".cursor/rules";
|
|
826
908
|
function toRuleFilename(name, index, nameCount) {
|
|
827
909
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `rule-${index}`;
|
|
@@ -854,9 +936,9 @@ function validateGenericDirectory(dir, cwd) {
|
|
|
854
936
|
`Generic rule directory may not contain "..", got: "${dir}"`
|
|
855
937
|
);
|
|
856
938
|
}
|
|
857
|
-
const normalizedCwd = cwd.endsWith(
|
|
858
|
-
const resolved = (0,
|
|
859
|
-
if (!resolved.startsWith(normalizedCwd +
|
|
939
|
+
const normalizedCwd = cwd.endsWith(import_path8.sep) ? cwd.slice(0, -1) : cwd;
|
|
940
|
+
const resolved = (0, import_path8.resolve)(normalizedCwd, trimmed);
|
|
941
|
+
if (!resolved.startsWith(normalizedCwd + import_path8.sep)) {
|
|
860
942
|
throw new Error(
|
|
861
943
|
`Generic rule directory escapes the working directory: "${dir}"`
|
|
862
944
|
);
|
|
@@ -870,20 +952,20 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
870
952
|
for (const [i, rule] of rules.entries()) {
|
|
871
953
|
switch (rule.ruleType) {
|
|
872
954
|
case "claude-md": {
|
|
873
|
-
await appendToFile((0,
|
|
955
|
+
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
|
|
874
956
|
break;
|
|
875
957
|
}
|
|
876
958
|
case "agents-md": {
|
|
877
|
-
await appendToFile((0,
|
|
959
|
+
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
|
|
878
960
|
break;
|
|
879
961
|
}
|
|
880
962
|
case "cursor-rule": {
|
|
881
963
|
if (!hasCursorRules) {
|
|
882
|
-
await (0, import_promises7.mkdir)((0,
|
|
964
|
+
await (0, import_promises7.mkdir)((0, import_path8.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
|
|
883
965
|
hasCursorRules = true;
|
|
884
966
|
}
|
|
885
967
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
886
|
-
const filePath = (0,
|
|
968
|
+
const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
887
969
|
await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
|
|
888
970
|
break;
|
|
889
971
|
}
|
|
@@ -892,10 +974,10 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
892
974
|
rule.directory ?? ".opencode/rules",
|
|
893
975
|
cwd
|
|
894
976
|
);
|
|
895
|
-
const dirPath = (0,
|
|
977
|
+
const dirPath = (0, import_path8.join)(cwd, directory);
|
|
896
978
|
await (0, import_promises7.mkdir)(dirPath, { recursive: true });
|
|
897
979
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
898
|
-
await (0, import_promises7.writeFile)((0,
|
|
980
|
+
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
|
|
899
981
|
break;
|
|
900
982
|
}
|
|
901
983
|
default: {
|
|
@@ -1066,8 +1148,8 @@ function extractToolActionDescription(toolName, toolArgs) {
|
|
|
1066
1148
|
}
|
|
1067
1149
|
}
|
|
1068
1150
|
if (toolName === "LS" || toolName === "ls" || toolName === "ListFiles") {
|
|
1069
|
-
const
|
|
1070
|
-
return `Listing: ${String(
|
|
1151
|
+
const path3 = args.path || args.directory || ".";
|
|
1152
|
+
return `Listing: ${String(path3).slice(0, 50)}`;
|
|
1071
1153
|
}
|
|
1072
1154
|
if ((toolName === "Read" || toolName === "read" || toolName === "View") && (args.file_path || args.path || args.target_file)) {
|
|
1073
1155
|
const filePath = String(
|
|
@@ -1950,7 +2032,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1950
2032
|
const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
|
|
1951
2033
|
if (hasThinking && (hasText || toolCallCount > 0)) {
|
|
1952
2034
|
subSteps.push({
|
|
1953
|
-
id: (0,
|
|
2035
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
1954
2036
|
stepNumber: 0,
|
|
1955
2037
|
// renumbered below
|
|
1956
2038
|
turnIndex,
|
|
@@ -1980,7 +2062,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1980
2062
|
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
1981
2063
|
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
1982
2064
|
subSteps.push({
|
|
1983
|
-
id: (0,
|
|
2065
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
1984
2066
|
stepNumber: 0,
|
|
1985
2067
|
turnIndex,
|
|
1986
2068
|
type: import_evalforge_types4.LLMStepType.TOOL_USE,
|
|
@@ -2010,7 +2092,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2010
2092
|
}
|
|
2011
2093
|
if (hasText && toolCallCount > 0) {
|
|
2012
2094
|
subSteps.push({
|
|
2013
|
-
id: (0,
|
|
2095
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
2014
2096
|
stepNumber: 0,
|
|
2015
2097
|
turnIndex,
|
|
2016
2098
|
type: import_evalforge_types4.LLMStepType.COMPLETION,
|
|
@@ -2032,7 +2114,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2032
2114
|
if (subSteps.length === 0) {
|
|
2033
2115
|
const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
2034
2116
|
subSteps.push({
|
|
2035
|
-
id: (0,
|
|
2117
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
2036
2118
|
stepNumber: 0,
|
|
2037
2119
|
turnIndex,
|
|
2038
2120
|
type: stepType,
|
|
@@ -2090,7 +2172,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2090
2172
|
stepTypeBreakdown
|
|
2091
2173
|
};
|
|
2092
2174
|
return {
|
|
2093
|
-
id: (0,
|
|
2175
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
2094
2176
|
steps: traceSteps,
|
|
2095
2177
|
summary
|
|
2096
2178
|
};
|
|
@@ -2192,7 +2274,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
2192
2274
|
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
2193
2275
|
|
|
2194
2276
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2195
|
-
var
|
|
2277
|
+
var import_child_process2 = require("child_process");
|
|
2196
2278
|
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
2197
2279
|
|
|
2198
2280
|
// src/run-scenario/agents/opencode/types.ts
|
|
@@ -2206,7 +2288,7 @@ function tryParseJson(text) {
|
|
|
2206
2288
|
|
|
2207
2289
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
2208
2290
|
var import_promises8 = require("fs/promises");
|
|
2209
|
-
var
|
|
2291
|
+
var import_path9 = require("path");
|
|
2210
2292
|
var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
|
|
2211
2293
|
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
|
|
2212
2294
|
await Promise.all(
|
|
@@ -2215,7 +2297,7 @@ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_
|
|
|
2215
2297
|
}
|
|
2216
2298
|
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
2217
2299
|
const skillName = skill.name;
|
|
2218
|
-
const skillDir = (0,
|
|
2300
|
+
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
|
|
2219
2301
|
await (0, import_promises8.mkdir)(skillDir, { recursive: true });
|
|
2220
2302
|
const version = skill.latestVersion;
|
|
2221
2303
|
if (version?.files && version.files.length > 0) {
|
|
@@ -2248,7 +2330,7 @@ async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
|
2248
2330
|
|
|
2249
2331
|
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
2250
2332
|
var import_promises9 = require("fs/promises");
|
|
2251
|
-
var
|
|
2333
|
+
var import_path10 = require("path");
|
|
2252
2334
|
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
2253
2335
|
var AGENTS_DIR2 = ".opencode/agents";
|
|
2254
2336
|
function toAgentFilename2(name, index, nameCount) {
|
|
@@ -2286,12 +2368,12 @@ async function resolveSubAgentContent2(agent, fetchFn) {
|
|
|
2286
2368
|
}
|
|
2287
2369
|
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
|
|
2288
2370
|
if (subAgents.length === 0) return;
|
|
2289
|
-
const agentsDir = (0,
|
|
2371
|
+
const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
|
|
2290
2372
|
await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
|
|
2291
2373
|
const nameCount = /* @__PURE__ */ new Map();
|
|
2292
2374
|
for (const [i, agent] of subAgents.entries()) {
|
|
2293
2375
|
const filename = toAgentFilename2(agent.name, i, nameCount);
|
|
2294
|
-
const filePath = (0,
|
|
2376
|
+
const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
|
|
2295
2377
|
const content = await resolveSubAgentContent2(agent, fetchFn);
|
|
2296
2378
|
await (0, import_promises9.writeFile)(filePath, content, "utf8");
|
|
2297
2379
|
}
|
|
@@ -2454,7 +2536,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
2454
2536
|
|
|
2455
2537
|
// src/run-scenario/agents/opencode/build-trace.ts
|
|
2456
2538
|
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2457
|
-
var
|
|
2539
|
+
var import_crypto3 = require("crypto");
|
|
2458
2540
|
function toCanonicalModelId(modelId) {
|
|
2459
2541
|
const slashIndex = modelId.indexOf("/");
|
|
2460
2542
|
return slashIndex > 0 ? modelId.slice(slashIndex + 1) : modelId;
|
|
@@ -2530,7 +2612,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2530
2612
|
const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
|
|
2531
2613
|
if (hasThinking && (hasText || toolCallCount > 0)) {
|
|
2532
2614
|
subSteps.push({
|
|
2533
|
-
id: (0,
|
|
2615
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2534
2616
|
stepNumber: 0,
|
|
2535
2617
|
turnIndex,
|
|
2536
2618
|
type: import_evalforge_types7.LLMStepType.THINKING,
|
|
@@ -2559,7 +2641,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2559
2641
|
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
2560
2642
|
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
2561
2643
|
subSteps.push({
|
|
2562
|
-
id: (0,
|
|
2644
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2563
2645
|
stepNumber: 0,
|
|
2564
2646
|
turnIndex,
|
|
2565
2647
|
type: import_evalforge_types7.LLMStepType.TOOL_USE,
|
|
@@ -2589,7 +2671,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2589
2671
|
}
|
|
2590
2672
|
if (hasText && toolCallCount > 0) {
|
|
2591
2673
|
subSteps.push({
|
|
2592
|
-
id: (0,
|
|
2674
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2593
2675
|
stepNumber: 0,
|
|
2594
2676
|
turnIndex,
|
|
2595
2677
|
type: import_evalforge_types7.LLMStepType.COMPLETION,
|
|
@@ -2611,7 +2693,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2611
2693
|
if (subSteps.length === 0) {
|
|
2612
2694
|
const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
|
|
2613
2695
|
subSteps.push({
|
|
2614
|
-
id: (0,
|
|
2696
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2615
2697
|
stepNumber: 0,
|
|
2616
2698
|
turnIndex,
|
|
2617
2699
|
type: stepType,
|
|
@@ -2680,7 +2762,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2680
2762
|
stepTypeBreakdown
|
|
2681
2763
|
};
|
|
2682
2764
|
return {
|
|
2683
|
-
id: (0,
|
|
2765
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2684
2766
|
steps: allSteps,
|
|
2685
2767
|
summary
|
|
2686
2768
|
};
|
|
@@ -2755,7 +2837,7 @@ function buildConversation2(timestampedEvents) {
|
|
|
2755
2837
|
|
|
2756
2838
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2757
2839
|
var import_promises10 = require("fs/promises");
|
|
2758
|
-
var
|
|
2840
|
+
var import_path11 = require("path");
|
|
2759
2841
|
var KILL_GRACE_PERIOD_MS = 5e3;
|
|
2760
2842
|
var IDLE_TIMEOUT_MS = 12e4;
|
|
2761
2843
|
var IDLE_CHECK_INTERVAL_MS = 15e3;
|
|
@@ -2780,14 +2862,14 @@ function extractToolAction(toolName, args) {
|
|
|
2780
2862
|
return `Using ${toolName}...`;
|
|
2781
2863
|
}
|
|
2782
2864
|
async function writePromptImages(cwd, images) {
|
|
2783
|
-
const imagesDir = (0,
|
|
2865
|
+
const imagesDir = (0, import_path11.join)(cwd, "prompt-images");
|
|
2784
2866
|
await (0, import_promises10.mkdir)(imagesDir, { recursive: true });
|
|
2785
2867
|
const filePaths = [];
|
|
2786
2868
|
for (let i = 0; i < images.length; i++) {
|
|
2787
2869
|
const img = images[i];
|
|
2788
2870
|
const ext = img.mediaType.split("/")[1] || "png";
|
|
2789
2871
|
const filename = `image-${i}.${ext}`;
|
|
2790
|
-
const filepath = (0,
|
|
2872
|
+
const filepath = (0, import_path11.join)(imagesDir, filename);
|
|
2791
2873
|
const buffer = Buffer.from(img.base64, "base64");
|
|
2792
2874
|
await (0, import_promises10.writeFile)(filepath, buffer);
|
|
2793
2875
|
filePaths.push(filepath);
|
|
@@ -2880,10 +2962,10 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
|
2880
2962
|
}
|
|
2881
2963
|
}
|
|
2882
2964
|
async function writeSystemPromptRule(cwd, systemPrompt) {
|
|
2883
|
-
const rulesDir = (0,
|
|
2965
|
+
const rulesDir = (0, import_path11.join)(cwd, ".opencode", "rules");
|
|
2884
2966
|
await (0, import_promises10.mkdir)(rulesDir, { recursive: true });
|
|
2885
2967
|
await (0, import_promises10.writeFile)(
|
|
2886
|
-
(0,
|
|
2968
|
+
(0, import_path11.join)(rulesDir, "evalforge-system-prompt.md"),
|
|
2887
2969
|
systemPrompt,
|
|
2888
2970
|
"utf-8"
|
|
2889
2971
|
);
|
|
@@ -2986,7 +3068,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
2986
3068
|
};
|
|
2987
3069
|
let child;
|
|
2988
3070
|
try {
|
|
2989
|
-
child = (0,
|
|
3071
|
+
child = (0, import_child_process2.spawn)("opencode", args, {
|
|
2990
3072
|
cwd,
|
|
2991
3073
|
env,
|
|
2992
3074
|
stdio: ["ignore", "pipe", "pipe"],
|
|
@@ -3471,7 +3553,7 @@ var import_anthropic = require("@ai-sdk/anthropic");
|
|
|
3471
3553
|
var import_google = require("@ai-sdk/google");
|
|
3472
3554
|
var import_openai = require("@ai-sdk/openai");
|
|
3473
3555
|
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
3474
|
-
var
|
|
3556
|
+
var import_crypto4 = require("crypto");
|
|
3475
3557
|
|
|
3476
3558
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
3477
3559
|
var import_mcp = require("@ai-sdk/mcp");
|
|
@@ -4087,7 +4169,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
4087
4169
|
const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
|
|
4088
4170
|
const toolResultError = findToolResultError(step);
|
|
4089
4171
|
return {
|
|
4090
|
-
id: (0,
|
|
4172
|
+
id: (0, import_crypto4.randomUUID)(),
|
|
4091
4173
|
stepNumber: i + 1,
|
|
4092
4174
|
turnIndex: i,
|
|
4093
4175
|
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
@@ -4111,7 +4193,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
4111
4193
|
total: totalUsage.totalTokens
|
|
4112
4194
|
};
|
|
4113
4195
|
return {
|
|
4114
|
-
id: (0,
|
|
4196
|
+
id: (0, import_crypto4.randomUUID)(),
|
|
4115
4197
|
steps: traceSteps,
|
|
4116
4198
|
summary: {
|
|
4117
4199
|
totalSteps: traceSteps.length,
|
|
@@ -4186,8 +4268,8 @@ var simpleAgentAdapter = new SimpleAgentAdapter();
|
|
|
4186
4268
|
defaultRegistry.register(simpleAgentAdapter);
|
|
4187
4269
|
|
|
4188
4270
|
// src/run-scenario/file-diff.ts
|
|
4189
|
-
var
|
|
4190
|
-
var
|
|
4271
|
+
var import_fs3 = require("fs");
|
|
4272
|
+
var import_path12 = require("path");
|
|
4191
4273
|
|
|
4192
4274
|
// ../../node_modules/diff/lib/index.mjs
|
|
4193
4275
|
function Diff() {
|
|
@@ -4291,11 +4373,11 @@ Diff.prototype = {
|
|
|
4291
4373
|
}
|
|
4292
4374
|
}
|
|
4293
4375
|
},
|
|
4294
|
-
addToPath: function addToPath(
|
|
4295
|
-
var last =
|
|
4376
|
+
addToPath: function addToPath(path3, added, removed, oldPosInc, options) {
|
|
4377
|
+
var last = path3.lastComponent;
|
|
4296
4378
|
if (last && !options.oneChangePerToken && last.added === added && last.removed === removed) {
|
|
4297
4379
|
return {
|
|
4298
|
-
oldPos:
|
|
4380
|
+
oldPos: path3.oldPos + oldPosInc,
|
|
4299
4381
|
lastComponent: {
|
|
4300
4382
|
count: last.count + 1,
|
|
4301
4383
|
added,
|
|
@@ -4305,7 +4387,7 @@ Diff.prototype = {
|
|
|
4305
4387
|
};
|
|
4306
4388
|
} else {
|
|
4307
4389
|
return {
|
|
4308
|
-
oldPos:
|
|
4390
|
+
oldPos: path3.oldPos + oldPosInc,
|
|
4309
4391
|
lastComponent: {
|
|
4310
4392
|
count: 1,
|
|
4311
4393
|
added,
|
|
@@ -4745,9 +4827,9 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
4745
4827
|
// src/run-scenario/file-diff.ts
|
|
4746
4828
|
function deriveInfrastructurePaths(prePrep, postPrep) {
|
|
4747
4829
|
const infraPaths = /* @__PURE__ */ new Set();
|
|
4748
|
-
for (const
|
|
4749
|
-
if (prePrep[
|
|
4750
|
-
infraPaths.add(
|
|
4830
|
+
for (const path3 of Object.keys(postPrep)) {
|
|
4831
|
+
if (prePrep[path3] === void 0 || prePrep[path3] !== postPrep[path3]) {
|
|
4832
|
+
infraPaths.add(path3);
|
|
4751
4833
|
}
|
|
4752
4834
|
}
|
|
4753
4835
|
return infraPaths;
|
|
@@ -4807,13 +4889,13 @@ function isBinaryFile(filename) {
|
|
|
4807
4889
|
function snapshotDirectory(dir, baseDir) {
|
|
4808
4890
|
const snapshot = {};
|
|
4809
4891
|
const base = baseDir || dir;
|
|
4810
|
-
if (!(0,
|
|
4892
|
+
if (!(0, import_fs3.existsSync)(dir)) {
|
|
4811
4893
|
return snapshot;
|
|
4812
4894
|
}
|
|
4813
|
-
const entries = (0,
|
|
4895
|
+
const entries = (0, import_fs3.readdirSync)(dir, { withFileTypes: true });
|
|
4814
4896
|
for (const entry of entries) {
|
|
4815
|
-
const fullPath = (0,
|
|
4816
|
-
const relativePath = (0,
|
|
4897
|
+
const fullPath = (0, import_path12.join)(dir, entry.name);
|
|
4898
|
+
const relativePath = (0, import_path12.relative)(base, fullPath);
|
|
4817
4899
|
if (shouldIgnore(entry.name)) {
|
|
4818
4900
|
continue;
|
|
4819
4901
|
}
|
|
@@ -4825,11 +4907,11 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
4825
4907
|
continue;
|
|
4826
4908
|
}
|
|
4827
4909
|
try {
|
|
4828
|
-
const stats = (0,
|
|
4910
|
+
const stats = (0, import_fs3.statSync)(fullPath);
|
|
4829
4911
|
if (stats.size > MAX_FILE_SIZE) {
|
|
4830
4912
|
continue;
|
|
4831
4913
|
}
|
|
4832
|
-
const content = (0,
|
|
4914
|
+
const content = (0, import_fs3.readFileSync)(fullPath, "utf-8");
|
|
4833
4915
|
snapshot[relativePath] = content;
|
|
4834
4916
|
} catch {
|
|
4835
4917
|
continue;
|
|
@@ -4858,19 +4940,19 @@ function generateDiffLines(before, after) {
|
|
|
4858
4940
|
function diffSnapshots(before, after, infrastructurePaths) {
|
|
4859
4941
|
const diffs = [];
|
|
4860
4942
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4861
|
-
for (const
|
|
4862
|
-
const beforeContent = before[
|
|
4863
|
-
const afterContent = after[
|
|
4864
|
-
if (before[
|
|
4943
|
+
for (const path3 of allPaths) {
|
|
4944
|
+
const beforeContent = before[path3] ?? "";
|
|
4945
|
+
const afterContent = after[path3] ?? "";
|
|
4946
|
+
if (before[path3] !== void 0 && beforeContent === afterContent) {
|
|
4865
4947
|
continue;
|
|
4866
4948
|
}
|
|
4867
4949
|
const diffLines2 = generateDiffLines(beforeContent, afterContent);
|
|
4868
4950
|
diffs.push({
|
|
4869
|
-
path:
|
|
4951
|
+
path: path3,
|
|
4870
4952
|
expected: beforeContent,
|
|
4871
4953
|
actual: afterContent,
|
|
4872
4954
|
diffLines: diffLines2,
|
|
4873
|
-
...infrastructurePaths?.has(
|
|
4955
|
+
...infrastructurePaths?.has(path3) && { isInfrastructure: true }
|
|
4874
4956
|
});
|
|
4875
4957
|
}
|
|
4876
4958
|
const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
|
|
@@ -4897,9 +4979,9 @@ function diffSnapshots(before, after, infrastructurePaths) {
|
|
|
4897
4979
|
function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
4898
4980
|
const files = [];
|
|
4899
4981
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4900
|
-
for (const
|
|
4901
|
-
const beforeContent = before[
|
|
4902
|
-
const afterContent = after[
|
|
4982
|
+
for (const path3 of allPaths) {
|
|
4983
|
+
const beforeContent = before[path3];
|
|
4984
|
+
const afterContent = after[path3];
|
|
4903
4985
|
if (afterContent === void 0) {
|
|
4904
4986
|
continue;
|
|
4905
4987
|
}
|
|
@@ -4912,10 +4994,10 @@ function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
|
4912
4994
|
status = "unchanged";
|
|
4913
4995
|
}
|
|
4914
4996
|
files.push({
|
|
4915
|
-
path:
|
|
4997
|
+
path: path3,
|
|
4916
4998
|
content: afterContent,
|
|
4917
4999
|
status,
|
|
4918
|
-
...infrastructurePaths?.has(
|
|
5000
|
+
...infrastructurePaths?.has(path3) && { isInfrastructure: true }
|
|
4919
5001
|
});
|
|
4920
5002
|
}
|
|
4921
5003
|
files.sort((a, b) => a.path.localeCompare(b.path));
|
|
@@ -4999,7 +5081,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4999
5081
|
}
|
|
5000
5082
|
} : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
|
|
5001
5083
|
return {
|
|
5002
|
-
id: (0,
|
|
5084
|
+
id: (0, import_crypto5.randomUUID)(),
|
|
5003
5085
|
targetId,
|
|
5004
5086
|
targetName,
|
|
5005
5087
|
scenarioId: scenario.id,
|
|
@@ -5020,6 +5102,24 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
5020
5102
|
// src/run-scenario/index.ts
|
|
5021
5103
|
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
5022
5104
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
5105
|
+
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
5106
|
+
if (template) {
|
|
5107
|
+
console.log(
|
|
5108
|
+
(0, import_evalforge_types13.formatTraceEventLine)({
|
|
5109
|
+
evalRunId: evalRunId2,
|
|
5110
|
+
scenarioId: scenario.id,
|
|
5111
|
+
scenarioName: scenario.name,
|
|
5112
|
+
targetId,
|
|
5113
|
+
targetName,
|
|
5114
|
+
stepNumber: 0,
|
|
5115
|
+
type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
|
|
5116
|
+
outputPreview: "Setting up environment (installing dependencies)...",
|
|
5117
|
+
elapsedMs: 0,
|
|
5118
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5119
|
+
isComplete: false
|
|
5120
|
+
})
|
|
5121
|
+
);
|
|
5122
|
+
}
|
|
5023
5123
|
const workDir = await prepareWorkingDirectory(
|
|
5024
5124
|
config,
|
|
5025
5125
|
evalRunId2,
|
|
@@ -5083,7 +5183,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
5083
5183
|
}
|
|
5084
5184
|
|
|
5085
5185
|
// src/evaluation-loop.ts
|
|
5086
|
-
var
|
|
5186
|
+
var import_crypto6 = require("crypto");
|
|
5087
5187
|
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
5088
5188
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
5089
5189
|
let completedExecutions = 0;
|
|
@@ -5109,7 +5209,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
5109
5209
|
`[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
|
|
5110
5210
|
);
|
|
5111
5211
|
const errorResult = {
|
|
5112
|
-
id: (0,
|
|
5212
|
+
id: (0, import_crypto6.randomUUID)(),
|
|
5113
5213
|
targetId,
|
|
5114
5214
|
targetName,
|
|
5115
5215
|
scenarioId: scenario.id,
|