@wix/evalforge-evaluator 0.170.0 → 0.172.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +203 -105
- package/build/index.js.map +4 -4
- package/build/index.mjs +153 -47
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/install-dependencies.d.ts +11 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -90,8 +90,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
90
90
|
}
|
|
91
91
|
return headers;
|
|
92
92
|
}
|
|
93
|
-
async function fetchJson(
|
|
94
|
-
const url = `${serverUrl}${apiPrefix}${pathPrefix}${
|
|
93
|
+
async function fetchJson(path3) {
|
|
94
|
+
const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
|
|
95
95
|
console.error(`[API] GET ${url}`);
|
|
96
96
|
const headers = buildHeaders();
|
|
97
97
|
const response = await fetch(url, {
|
|
@@ -105,8 +105,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
105
105
|
}
|
|
106
106
|
return response.json();
|
|
107
107
|
}
|
|
108
|
-
async function postJson(
|
|
109
|
-
const url = `${serverUrl}${apiPrefix}${pathPrefix}${
|
|
108
|
+
async function postJson(path3, body) {
|
|
109
|
+
const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
|
|
110
110
|
console.error(`[API] POST ${url}`);
|
|
111
111
|
const response = await fetch(url, {
|
|
112
112
|
method: "POST",
|
|
@@ -120,8 +120,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
120
120
|
);
|
|
121
121
|
}
|
|
122
122
|
}
|
|
123
|
-
async function deleteRequest(
|
|
124
|
-
const url = `${serverUrl}${apiPrefix}${pathPrefix}${
|
|
123
|
+
async function deleteRequest(path3) {
|
|
124
|
+
const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
|
|
125
125
|
console.error(`[API] DELETE ${url}`);
|
|
126
126
|
const headers = buildHeaders();
|
|
127
127
|
const response = await fetch(url, {
|
|
@@ -135,8 +135,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
135
135
|
);
|
|
136
136
|
}
|
|
137
137
|
}
|
|
138
|
-
async function putJson(
|
|
139
|
-
const url = `${serverUrl}${apiPrefix}${pathPrefix}${
|
|
138
|
+
async function putJson(path3, body) {
|
|
139
|
+
const url = `${serverUrl}${apiPrefix}${pathPrefix}${path3}`;
|
|
140
140
|
console.error(`[API] PUT ${url}`);
|
|
141
141
|
const response = await fetch(url, {
|
|
142
142
|
method: "PUT",
|
|
@@ -432,10 +432,10 @@ var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
|
432
432
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
433
433
|
|
|
434
434
|
// src/run-scenario/environment.ts
|
|
435
|
-
var
|
|
435
|
+
var import_fs2 = require("fs");
|
|
436
436
|
var import_promises2 = require("fs/promises");
|
|
437
437
|
var import_os = require("os");
|
|
438
|
-
var
|
|
438
|
+
var import_path3 = __toESM(require("path"));
|
|
439
439
|
var import_evalforge_github_client = require("@wix/evalforge-github-client");
|
|
440
440
|
|
|
441
441
|
// src/run-scenario/utils/write-files.ts
|
|
@@ -455,6 +455,84 @@ async function writeFilesToDirectory(targetDir, files) {
|
|
|
455
455
|
}
|
|
456
456
|
}
|
|
457
457
|
|
|
458
|
+
// src/run-scenario/install-dependencies.ts
|
|
459
|
+
var import_fs = require("fs");
|
|
460
|
+
var import_crypto = require("crypto");
|
|
461
|
+
var import_path2 = __toESM(require("path"));
|
|
462
|
+
var import_child_process = require("child_process");
|
|
463
|
+
/**
 * Chooses the install command for a project directory from the lockfile it contains.
 *
 * Lockfiles are probed in a fixed order (pnpm, then npm, then yarn) and the first
 * one that exists wins. When none is present, a plain `npm install` is returned and
 * `package.json` is reported as the cache source file.
 *
 * @param {string} workDir - Directory to inspect for lockfiles.
 * @returns {{ cmd: string, args: string[], cacheSourceFile: string }} The executable,
 *   its arguments, and the name of the file (relative to `workDir`) that selected it.
 */
function detectPackageManager(workDir) {
  // Order matters: earlier rows take precedence when several lockfiles coexist.
  const lockfileTable = [
    ["pnpm-lock.yaml", "pnpm", ["install", "--frozen-lockfile"]],
    ["package-lock.json", "npm", ["ci"]],
    ["yarn.lock", "yarn", ["install", "--frozen-lockfile"]]
  ];
  for (const [lockfile, cmd, args] of lockfileTable) {
    const lockfilePath = import_path2.default.join(workDir, lockfile);
    if (import_fs.existsSync(lockfilePath)) {
      return { cmd, args, cacheSourceFile: lockfile };
    }
  }
  // No lockfile at all: fall back to a non-frozen npm install keyed on package.json.
  return {
    cmd: "npm",
    args: ["install", "--legacy-peer-deps", "--prefer-offline", "--no-fund", "--no-audit"],
    cacheSourceFile: "package.json"
  };
}
|
|
475
|
+
/**
 * Recursively copies the directory tree at `src` to `dest`.
 *
 * On macOS the copy is delegated to `cp -rc` (`-c` asks `cp` to clone files instead of
 * copying their data). On every other platform `fs.cpSync` is used.
 *
 * Symbolic links are copied verbatim. Without `verbatimSymlinks`, `fs.cpSync` rewrites
 * every relative link into an absolute path that points back into `src`; a copied
 * `node_modules` (with its `.bin` links, or a pnpm layout) would then keep referring to
 * the original tree and break as soon as that tree is removed.
 *
 * @param {string} src - Existing directory to copy.
 * @param {string} dest - Destination path; expected not to exist yet.
 * @throws {Error} If the underlying `cp` process or `fs.cpSync` call fails.
 */
function cloneDirectory(src, dest) {
  if (process.platform === "darwin") {
    (0, import_child_process.execFileSync)("cp", ["-rc", src, dest]);
    return;
  }
  (0, import_fs.cpSync)(src, dest, { recursive: true, verbatimSymlinks: true });
}
|
|
482
|
+
/**
 * Installs dependencies for `workDir`, reusing a cached `node_modules` when one exists.
 *
 * The cache key is the first 16 hex characters of the SHA-256 of `pm.cacheSourceFile`
 * (read from `workDir`). Cache entries live at `<cacheBase>/<key>/node_modules`, with an
 * optional `<cacheBase>/<key>/yarn.lock` beside them.
 *
 * - Cache hit: `node_modules` is cloned into `workDir` (skipped if it already exists
 *   there) and a cached `yarn.lock`, if any, is copied into `workDir`. `exec` is not called.
 * - Cache miss: `exec` runs the package manager. If it throws, the error is logged and
 *   the function returns without touching the cache. Otherwise the fresh `node_modules`
 *   is saved to the cache; a failure to save is logged and not rethrown.
 *
 * The entry is published atomically: the tree is first cloned into a uniquely named
 * staging directory inside the cache entry and then renamed to `node_modules`. A copy
 * that fails midway therefore never leaves a partial `node_modules` that later runs
 * would mistake for a valid cache hit.
 *
 * @param {string} workDir - Project directory containing `pm.cacheSourceFile`.
 * @param {Function} exec - Called as `exec(cmd, args, options)`; expected to throw on failure.
 * @param {string} cacheBase - Root directory holding all cache entries.
 * @param {{ cmd: string, args: string[], cacheSourceFile: string }} pm - Install command descriptor.
 * @throws {Error} If `pm.cacheSourceFile` cannot be read, or restoring from the cache fails.
 */
function installWithCache(workDir, exec, cacheBase, pm) {
  const sourceContent = (0, import_fs.readFileSync)(import_path2.default.join(workDir, pm.cacheSourceFile), "utf-8");
  const cacheKey = (0, import_crypto.createHash)("sha256").update(sourceContent).digest("hex").slice(0, 16);
  const cacheDir = import_path2.default.join(cacheBase, cacheKey);
  const cachedNodeModules = import_path2.default.join(cacheDir, "node_modules");
  const cachedYarnLock = import_path2.default.join(cacheDir, "yarn.lock");
  const targetNodeModules = import_path2.default.join(workDir, "node_modules");

  if ((0, import_fs.existsSync)(cachedNodeModules)) {
    console.log(`[environment] Restoring node_modules from cache (key: ${cacheKey})`);
    if (!(0, import_fs.existsSync)(targetNodeModules)) {
      cloneDirectory(cachedNodeModules, targetNodeModules);
    }
    if ((0, import_fs.existsSync)(cachedYarnLock)) {
      (0, import_fs.copyFileSync)(cachedYarnLock, import_path2.default.join(workDir, "yarn.lock"));
    }
    return;
  }

  console.log(`[environment] Running ${pm.cmd} ${pm.args.join(" ")} in ${workDir} (cache key: ${cacheKey})`);
  try {
    exec(pm.cmd, pm.args, { cwd: workDir, stdio: "inherit", timeout: 18e4, env: { ...process.env, NODE_ENV: "development" } });
  } catch (err) {
    console.error("[environment] Dependency installation failed:", err instanceof Error ? err.message : String(err));
    return;
  }

  console.log("[environment] Dependency installation complete \u2014 saving to cache");
  // Stage next to the final location so the rename stays on one filesystem.
  const stagingSuffix = `${process.pid}-${(0, import_crypto.randomBytes)(4).toString("hex")}`;
  const stagingDir = import_path2.default.join(cacheDir, `node_modules.tmp-${stagingSuffix}`);
  try {
    (0, import_fs.mkdirSync)(cacheDir, { recursive: true });
    const yarnLockPath = import_path2.default.join(workDir, "yarn.lock");
    if ((0, import_fs.existsSync)(yarnLockPath)) {
      (0, import_fs.copyFileSync)(yarnLockPath, cachedYarnLock);
    }
    cloneDirectory(targetNodeModules, stagingDir);
    // Publish in one step; if another run already published this key the rename
    // fails and this copy is discarded below.
    (0, import_fs.renameSync)(stagingDir, cachedNodeModules);
  } catch (err) {
    try {
      (0, import_fs.rmSync)(stagingDir, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup: a leftover staging directory is never read as a cache hit.
    }
    console.error("[environment] Failed to save to cache (installation still succeeded):", err instanceof Error ? err.message : String(err));
  }
}
|
|
518
|
+
/**
 * Installs the dependencies of the project in `workDir`, if it has a `package.json`.
 *
 * The package manager is chosen by `detectPackageManager`. When `cacheBase` is given the
 * work is handed to `installWithCache`; otherwise the install command is run directly and
 * any error thrown by `exec` is logged rather than rethrown.
 *
 * @param {string} workDir - Project directory.
 * @param {Function} [exec] - Called as `exec(cmd, args, options)`; defaults to
 *   `child_process.execFileSync`.
 * @param {string} [cacheBase] - Root of the node_modules cache; falsy disables caching.
 * @returns {Promise<void>} Resolves once the (synchronous) install attempt has finished.
 */
async function installDependencies(workDir, exec = import_child_process.execFileSync, cacheBase) {
  const manifestPath = import_path2.default.join(workDir, "package.json");
  if (import_fs.existsSync(manifestPath)) {
    const pm = detectPackageManager(workDir);
    if (cacheBase) {
      installWithCache(workDir, exec, cacheBase, pm);
    } else {
      const { cmd, args } = pm;
      console.log(`[environment] Running ${cmd} ${args.join(" ")} in ${workDir}`);
      try {
        // 18e4 ms = 3 minutes. NODE_ENV is forced to "development", presumably so that
        // devDependencies are installed too (TODO confirm).
        const childEnv = { ...process.env, NODE_ENV: "development" };
        exec(cmd, args, { cwd: workDir, stdio: "inherit", timeout: 18e4, env: childEnv });
        console.log("[environment] Dependency installation complete");
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error("[environment] Dependency installation failed:", reason);
      }
    }
  }
}
|
|
535
|
+
|
|
458
536
|
// src/run-scenario/environment.ts
|
|
459
537
|
async function fetchAndWriteTemplateFiles(template, workDir) {
|
|
460
538
|
let sourceFiles = [];
|
|
@@ -475,27 +553,27 @@ async function fetchAndWriteTemplateFiles(template, workDir) {
|
|
|
475
553
|
const content = ef.gitSource ? await (0, import_evalforge_github_client.fetchGitHubFile)(ef.gitSource, {
|
|
476
554
|
userAgent: "EvalForge-Evaluator"
|
|
477
555
|
}) : ef.content ?? "";
|
|
478
|
-
const dest =
|
|
479
|
-
if (!dest.startsWith(workDir +
|
|
556
|
+
const dest = import_path3.default.resolve(workDir, ef.path);
|
|
557
|
+
if (!dest.startsWith(workDir + import_path3.sep)) {
|
|
480
558
|
throw new Error(
|
|
481
559
|
`Extra file path escapes working directory: "${ef.path}"`
|
|
482
560
|
);
|
|
483
561
|
}
|
|
484
|
-
await (0, import_promises2.mkdir)(
|
|
562
|
+
await (0, import_promises2.mkdir)(import_path3.default.dirname(dest), { recursive: true });
|
|
485
563
|
await (0, import_promises2.writeFile)(dest, content, "utf8");
|
|
486
564
|
})
|
|
487
565
|
);
|
|
488
566
|
}
|
|
489
567
|
function writeWixEnvFile(workDir) {
|
|
490
|
-
const configPath =
|
|
491
|
-
if (!(0,
|
|
568
|
+
const configPath = import_path3.default.join(workDir, "wix.config.json");
|
|
569
|
+
if (!(0, import_fs2.existsSync)(configPath)) {
|
|
492
570
|
return;
|
|
493
571
|
}
|
|
494
572
|
try {
|
|
495
|
-
const config = JSON.parse((0,
|
|
573
|
+
const config = JSON.parse((0, import_fs2.readFileSync)(configPath, "utf-8"));
|
|
496
574
|
if (config.appId) {
|
|
497
|
-
(0,
|
|
498
|
-
|
|
575
|
+
(0, import_fs2.writeFileSync)(
|
|
576
|
+
import_path3.default.join(workDir, ".env"),
|
|
499
577
|
`WIX_CLIENT_ID=${config.appId}
|
|
500
578
|
`,
|
|
501
579
|
"utf-8"
|
|
@@ -507,34 +585,36 @@ function writeWixEnvFile(workDir) {
|
|
|
507
585
|
}
|
|
508
586
|
}
|
|
509
587
|
async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId, template) {
|
|
510
|
-
const baseDir = config.evaluationsDir ??
|
|
588
|
+
const baseDir = config.evaluationsDir ?? import_path3.default.join((0, import_os.tmpdir)(), "evalforge-evaluations");
|
|
589
|
+
const nodeModulesCacheDir = import_path3.default.join(baseDir, "_node_modules_cache");
|
|
511
590
|
if (template) {
|
|
512
591
|
if (!config.evaluationsDir) {
|
|
513
592
|
console.warn(
|
|
514
593
|
"Template specified but EVALUATIONS_DIR not set, using temp directory"
|
|
515
594
|
);
|
|
516
595
|
}
|
|
517
|
-
const workDir2 =
|
|
518
|
-
if ((0,
|
|
519
|
-
(0,
|
|
596
|
+
const workDir2 = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}`);
|
|
597
|
+
if ((0, import_fs2.existsSync)(workDir2)) {
|
|
598
|
+
(0, import_fs2.rmSync)(workDir2, { recursive: true });
|
|
520
599
|
}
|
|
521
|
-
(0,
|
|
600
|
+
(0, import_fs2.mkdirSync)(workDir2, { recursive: true });
|
|
522
601
|
await fetchAndWriteTemplateFiles(template, workDir2);
|
|
523
602
|
console.log(`Template files written to ${workDir2}`);
|
|
524
603
|
writeWixEnvFile(workDir2);
|
|
604
|
+
await installDependencies(workDir2, void 0, nodeModulesCacheDir);
|
|
525
605
|
return workDir2;
|
|
526
606
|
}
|
|
527
|
-
const workDir =
|
|
528
|
-
if ((0,
|
|
529
|
-
(0,
|
|
607
|
+
const workDir = import_path3.default.join(baseDir, `${evalRunId2}_${targetId}_${scenarioId}`);
|
|
608
|
+
if ((0, import_fs2.existsSync)(workDir)) {
|
|
609
|
+
(0, import_fs2.rmSync)(workDir, { recursive: true });
|
|
530
610
|
}
|
|
531
|
-
(0,
|
|
611
|
+
(0, import_fs2.mkdirSync)(workDir, { recursive: true });
|
|
532
612
|
console.log(`Empty working directory created at ${workDir}`);
|
|
533
613
|
return workDir;
|
|
534
614
|
}
|
|
535
615
|
|
|
536
616
|
// src/run-scenario/run-agent-with-context.ts
|
|
537
|
-
var
|
|
617
|
+
var import_crypto5 = require("crypto");
|
|
538
618
|
|
|
539
619
|
// src/run-scenario/agents/registry.ts
|
|
540
620
|
var AgentAdapterRegistry = class {
|
|
@@ -643,7 +723,7 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
|
643
723
|
|
|
644
724
|
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
645
725
|
var import_promises3 = require("fs/promises");
|
|
646
|
-
var
|
|
726
|
+
var import_path4 = require("path");
|
|
647
727
|
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
648
728
|
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
649
729
|
await Promise.all(
|
|
@@ -652,7 +732,7 @@ async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_g
|
|
|
652
732
|
}
|
|
653
733
|
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
654
734
|
const skillName = skill.name;
|
|
655
|
-
const skillDir = (0,
|
|
735
|
+
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
|
|
656
736
|
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
657
737
|
const version = skill.latestVersion;
|
|
658
738
|
if (version?.files && version.files.length > 0) {
|
|
@@ -692,18 +772,18 @@ function resolveTimeoutMs(maxTurns, maxDurationMs) {
|
|
|
692
772
|
}
|
|
693
773
|
|
|
694
774
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
695
|
-
var
|
|
775
|
+
var import_crypto2 = require("crypto");
|
|
696
776
|
|
|
697
777
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
698
778
|
var import_promises5 = require("fs/promises");
|
|
699
|
-
var
|
|
779
|
+
var import_path6 = require("path");
|
|
700
780
|
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
701
781
|
|
|
702
782
|
// src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
|
|
703
783
|
var import_promises4 = require("fs/promises");
|
|
704
|
-
var
|
|
784
|
+
var import_path5 = require("path");
|
|
705
785
|
var import_os2 = require("os");
|
|
706
|
-
var WIX_AUTH_FILE = (0,
|
|
786
|
+
var WIX_AUTH_FILE = (0, import_path5.join)((0, import_os2.homedir)(), ".wix", "auth", "api-key.json");
|
|
707
787
|
async function loadWixAuthPlaceholders(authFilePath = WIX_AUTH_FILE) {
|
|
708
788
|
try {
|
|
709
789
|
const content = await (0, import_promises4.readFile)(authFilePath, "utf-8");
|
|
@@ -762,14 +842,14 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
762
842
|
null,
|
|
763
843
|
2
|
|
764
844
|
);
|
|
765
|
-
const filePath = (0,
|
|
845
|
+
const filePath = (0, import_path6.join)(cwd, ".mcp.json");
|
|
766
846
|
await (0, import_promises5.writeFile)(filePath, content, "utf8");
|
|
767
847
|
console.log(`[MCP] Written to ${filePath}`);
|
|
768
848
|
}
|
|
769
849
|
|
|
770
850
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
771
851
|
var import_promises6 = require("fs/promises");
|
|
772
|
-
var
|
|
852
|
+
var import_path7 = require("path");
|
|
773
853
|
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
774
854
|
var AGENTS_DIR = ".claude/agents";
|
|
775
855
|
function toAgentFilename(name, index, nameCount) {
|
|
@@ -807,12 +887,12 @@ async function resolveSubAgentContent(agent, fetchFn) {
|
|
|
807
887
|
}
|
|
808
888
|
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
809
889
|
if (subAgents.length === 0) return;
|
|
810
|
-
const agentsDir = (0,
|
|
890
|
+
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
811
891
|
await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
|
|
812
892
|
const nameCount = /* @__PURE__ */ new Map();
|
|
813
893
|
for (const [i, agent] of subAgents.entries()) {
|
|
814
894
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
815
|
-
const filePath = (0,
|
|
895
|
+
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
816
896
|
const content = await resolveSubAgentContent(agent, fetchFn);
|
|
817
897
|
await (0, import_promises6.writeFile)(filePath, content, "utf8");
|
|
818
898
|
}
|
|
@@ -821,7 +901,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalf
|
|
|
821
901
|
|
|
822
902
|
// src/run-scenario/agents/claude-code/write-rules.ts
|
|
823
903
|
var import_promises7 = require("fs/promises");
|
|
824
|
-
var
|
|
904
|
+
var import_path8 = require("path");
|
|
825
905
|
var CURSOR_RULES_DIR = ".cursor/rules";
|
|
826
906
|
function toRuleFilename(name, index, nameCount) {
|
|
827
907
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `rule-${index}`;
|
|
@@ -854,9 +934,9 @@ function validateGenericDirectory(dir, cwd) {
|
|
|
854
934
|
`Generic rule directory may not contain "..", got: "${dir}"`
|
|
855
935
|
);
|
|
856
936
|
}
|
|
857
|
-
const normalizedCwd = cwd.endsWith(
|
|
858
|
-
const resolved = (0,
|
|
859
|
-
if (!resolved.startsWith(normalizedCwd +
|
|
937
|
+
const normalizedCwd = cwd.endsWith(import_path8.sep) ? cwd.slice(0, -1) : cwd;
|
|
938
|
+
const resolved = (0, import_path8.resolve)(normalizedCwd, trimmed);
|
|
939
|
+
if (!resolved.startsWith(normalizedCwd + import_path8.sep)) {
|
|
860
940
|
throw new Error(
|
|
861
941
|
`Generic rule directory escapes the working directory: "${dir}"`
|
|
862
942
|
);
|
|
@@ -870,20 +950,20 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
870
950
|
for (const [i, rule] of rules.entries()) {
|
|
871
951
|
switch (rule.ruleType) {
|
|
872
952
|
case "claude-md": {
|
|
873
|
-
await appendToFile((0,
|
|
953
|
+
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
|
|
874
954
|
break;
|
|
875
955
|
}
|
|
876
956
|
case "agents-md": {
|
|
877
|
-
await appendToFile((0,
|
|
957
|
+
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
|
|
878
958
|
break;
|
|
879
959
|
}
|
|
880
960
|
case "cursor-rule": {
|
|
881
961
|
if (!hasCursorRules) {
|
|
882
|
-
await (0, import_promises7.mkdir)((0,
|
|
962
|
+
await (0, import_promises7.mkdir)((0, import_path8.join)(cwd, CURSOR_RULES_DIR), { recursive: true });
|
|
883
963
|
hasCursorRules = true;
|
|
884
964
|
}
|
|
885
965
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
886
|
-
const filePath = (0,
|
|
966
|
+
const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
887
967
|
await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
|
|
888
968
|
break;
|
|
889
969
|
}
|
|
@@ -892,10 +972,10 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
892
972
|
rule.directory ?? ".opencode/rules",
|
|
893
973
|
cwd
|
|
894
974
|
);
|
|
895
|
-
const dirPath = (0,
|
|
975
|
+
const dirPath = (0, import_path8.join)(cwd, directory);
|
|
896
976
|
await (0, import_promises7.mkdir)(dirPath, { recursive: true });
|
|
897
977
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
898
|
-
await (0, import_promises7.writeFile)((0,
|
|
978
|
+
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
|
|
899
979
|
break;
|
|
900
980
|
}
|
|
901
981
|
default: {
|
|
@@ -1066,8 +1146,8 @@ function extractToolActionDescription(toolName, toolArgs) {
|
|
|
1066
1146
|
}
|
|
1067
1147
|
}
|
|
1068
1148
|
if (toolName === "LS" || toolName === "ls" || toolName === "ListFiles") {
|
|
1069
|
-
const
|
|
1070
|
-
return `Listing: ${String(
|
|
1149
|
+
const path3 = args.path || args.directory || ".";
|
|
1150
|
+
return `Listing: ${String(path3).slice(0, 50)}`;
|
|
1071
1151
|
}
|
|
1072
1152
|
if ((toolName === "Read" || toolName === "read" || toolName === "View") && (args.file_path || args.path || args.target_file)) {
|
|
1073
1153
|
const filePath = String(
|
|
@@ -1950,7 +2030,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1950
2030
|
const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
|
|
1951
2031
|
if (hasThinking && (hasText || toolCallCount > 0)) {
|
|
1952
2032
|
subSteps.push({
|
|
1953
|
-
id: (0,
|
|
2033
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
1954
2034
|
stepNumber: 0,
|
|
1955
2035
|
// renumbered below
|
|
1956
2036
|
turnIndex,
|
|
@@ -1980,7 +2060,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1980
2060
|
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
1981
2061
|
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
1982
2062
|
subSteps.push({
|
|
1983
|
-
id: (0,
|
|
2063
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
1984
2064
|
stepNumber: 0,
|
|
1985
2065
|
turnIndex,
|
|
1986
2066
|
type: import_evalforge_types4.LLMStepType.TOOL_USE,
|
|
@@ -2010,7 +2090,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2010
2090
|
}
|
|
2011
2091
|
if (hasText && toolCallCount > 0) {
|
|
2012
2092
|
subSteps.push({
|
|
2013
|
-
id: (0,
|
|
2093
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
2014
2094
|
stepNumber: 0,
|
|
2015
2095
|
turnIndex,
|
|
2016
2096
|
type: import_evalforge_types4.LLMStepType.COMPLETION,
|
|
@@ -2032,7 +2112,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2032
2112
|
if (subSteps.length === 0) {
|
|
2033
2113
|
const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
2034
2114
|
subSteps.push({
|
|
2035
|
-
id: (0,
|
|
2115
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
2036
2116
|
stepNumber: 0,
|
|
2037
2117
|
turnIndex,
|
|
2038
2118
|
type: stepType,
|
|
@@ -2090,7 +2170,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
2090
2170
|
stepTypeBreakdown
|
|
2091
2171
|
};
|
|
2092
2172
|
return {
|
|
2093
|
-
id: (0,
|
|
2173
|
+
id: (0, import_crypto2.randomUUID)(),
|
|
2094
2174
|
steps: traceSteps,
|
|
2095
2175
|
summary
|
|
2096
2176
|
};
|
|
@@ -2192,7 +2272,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
2192
2272
|
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
2193
2273
|
|
|
2194
2274
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2195
|
-
var
|
|
2275
|
+
var import_child_process2 = require("child_process");
|
|
2196
2276
|
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
2197
2277
|
|
|
2198
2278
|
// src/run-scenario/agents/opencode/types.ts
|
|
@@ -2206,7 +2286,7 @@ function tryParseJson(text) {
|
|
|
2206
2286
|
|
|
2207
2287
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
2208
2288
|
var import_promises8 = require("fs/promises");
|
|
2209
|
-
var
|
|
2289
|
+
var import_path9 = require("path");
|
|
2210
2290
|
var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
|
|
2211
2291
|
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
|
|
2212
2292
|
await Promise.all(
|
|
@@ -2215,7 +2295,7 @@ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_
|
|
|
2215
2295
|
}
|
|
2216
2296
|
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
2217
2297
|
const skillName = skill.name;
|
|
2218
|
-
const skillDir = (0,
|
|
2298
|
+
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
|
|
2219
2299
|
await (0, import_promises8.mkdir)(skillDir, { recursive: true });
|
|
2220
2300
|
const version = skill.latestVersion;
|
|
2221
2301
|
if (version?.files && version.files.length > 0) {
|
|
@@ -2248,7 +2328,7 @@ async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
|
2248
2328
|
|
|
2249
2329
|
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
2250
2330
|
var import_promises9 = require("fs/promises");
|
|
2251
|
-
var
|
|
2331
|
+
var import_path10 = require("path");
|
|
2252
2332
|
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
2253
2333
|
var AGENTS_DIR2 = ".opencode/agents";
|
|
2254
2334
|
function toAgentFilename2(name, index, nameCount) {
|
|
@@ -2286,12 +2366,12 @@ async function resolveSubAgentContent2(agent, fetchFn) {
|
|
|
2286
2366
|
}
|
|
2287
2367
|
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
|
|
2288
2368
|
if (subAgents.length === 0) return;
|
|
2289
|
-
const agentsDir = (0,
|
|
2369
|
+
const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
|
|
2290
2370
|
await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
|
|
2291
2371
|
const nameCount = /* @__PURE__ */ new Map();
|
|
2292
2372
|
for (const [i, agent] of subAgents.entries()) {
|
|
2293
2373
|
const filename = toAgentFilename2(agent.name, i, nameCount);
|
|
2294
|
-
const filePath = (0,
|
|
2374
|
+
const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
|
|
2295
2375
|
const content = await resolveSubAgentContent2(agent, fetchFn);
|
|
2296
2376
|
await (0, import_promises9.writeFile)(filePath, content, "utf8");
|
|
2297
2377
|
}
|
|
@@ -2454,7 +2534,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
2454
2534
|
|
|
2455
2535
|
// src/run-scenario/agents/opencode/build-trace.ts
|
|
2456
2536
|
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
2457
|
-
var
|
|
2537
|
+
var import_crypto3 = require("crypto");
|
|
2458
2538
|
function toCanonicalModelId(modelId) {
|
|
2459
2539
|
const slashIndex = modelId.indexOf("/");
|
|
2460
2540
|
return slashIndex > 0 ? modelId.slice(slashIndex + 1) : modelId;
|
|
@@ -2530,7 +2610,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2530
2610
|
const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
|
|
2531
2611
|
if (hasThinking && (hasText || toolCallCount > 0)) {
|
|
2532
2612
|
subSteps.push({
|
|
2533
|
-
id: (0,
|
|
2613
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2534
2614
|
stepNumber: 0,
|
|
2535
2615
|
turnIndex,
|
|
2536
2616
|
type: import_evalforge_types7.LLMStepType.THINKING,
|
|
@@ -2559,7 +2639,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2559
2639
|
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
2560
2640
|
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
2561
2641
|
subSteps.push({
|
|
2562
|
-
id: (0,
|
|
2642
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2563
2643
|
stepNumber: 0,
|
|
2564
2644
|
turnIndex,
|
|
2565
2645
|
type: import_evalforge_types7.LLMStepType.TOOL_USE,
|
|
@@ -2589,7 +2669,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2589
2669
|
}
|
|
2590
2670
|
if (hasText && toolCallCount > 0) {
|
|
2591
2671
|
subSteps.push({
|
|
2592
|
-
id: (0,
|
|
2672
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2593
2673
|
stepNumber: 0,
|
|
2594
2674
|
turnIndex,
|
|
2595
2675
|
type: import_evalforge_types7.LLMStepType.COMPLETION,
|
|
@@ -2611,7 +2691,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2611
2691
|
if (subSteps.length === 0) {
|
|
2612
2692
|
const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
|
|
2613
2693
|
subSteps.push({
|
|
2614
|
-
id: (0,
|
|
2694
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2615
2695
|
stepNumber: 0,
|
|
2616
2696
|
turnIndex,
|
|
2617
2697
|
type: stepType,
|
|
@@ -2680,7 +2760,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
2680
2760
|
stepTypeBreakdown
|
|
2681
2761
|
};
|
|
2682
2762
|
return {
|
|
2683
|
-
id: (0,
|
|
2763
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
2684
2764
|
steps: allSteps,
|
|
2685
2765
|
summary
|
|
2686
2766
|
};
|
|
@@ -2755,7 +2835,7 @@ function buildConversation2(timestampedEvents) {
|
|
|
2755
2835
|
|
|
2756
2836
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2757
2837
|
var import_promises10 = require("fs/promises");
|
|
2758
|
-
var
|
|
2838
|
+
var import_path11 = require("path");
|
|
2759
2839
|
var KILL_GRACE_PERIOD_MS = 5e3;
|
|
2760
2840
|
var IDLE_TIMEOUT_MS = 12e4;
|
|
2761
2841
|
var IDLE_CHECK_INTERVAL_MS = 15e3;
|
|
@@ -2780,14 +2860,14 @@ function extractToolAction(toolName, args) {
|
|
|
2780
2860
|
return `Using ${toolName}...`;
|
|
2781
2861
|
}
|
|
2782
2862
|
async function writePromptImages(cwd, images) {
|
|
2783
|
-
const imagesDir = (0,
|
|
2863
|
+
const imagesDir = (0, import_path11.join)(cwd, "prompt-images");
|
|
2784
2864
|
await (0, import_promises10.mkdir)(imagesDir, { recursive: true });
|
|
2785
2865
|
const filePaths = [];
|
|
2786
2866
|
for (let i = 0; i < images.length; i++) {
|
|
2787
2867
|
const img = images[i];
|
|
2788
2868
|
const ext = img.mediaType.split("/")[1] || "png";
|
|
2789
2869
|
const filename = `image-${i}.${ext}`;
|
|
2790
|
-
const filepath = (0,
|
|
2870
|
+
const filepath = (0, import_path11.join)(imagesDir, filename);
|
|
2791
2871
|
const buffer = Buffer.from(img.base64, "base64");
|
|
2792
2872
|
await (0, import_promises10.writeFile)(filepath, buffer);
|
|
2793
2873
|
filePaths.push(filepath);
|
|
@@ -2880,10 +2960,10 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
|
2880
2960
|
}
|
|
2881
2961
|
}
|
|
2882
2962
|
async function writeSystemPromptRule(cwd, systemPrompt) {
|
|
2883
|
-
const rulesDir = (0,
|
|
2963
|
+
const rulesDir = (0, import_path11.join)(cwd, ".opencode", "rules");
|
|
2884
2964
|
await (0, import_promises10.mkdir)(rulesDir, { recursive: true });
|
|
2885
2965
|
await (0, import_promises10.writeFile)(
|
|
2886
|
-
(0,
|
|
2966
|
+
(0, import_path11.join)(rulesDir, "evalforge-system-prompt.md"),
|
|
2887
2967
|
systemPrompt,
|
|
2888
2968
|
"utf-8"
|
|
2889
2969
|
);
|
|
@@ -2986,7 +3066,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
2986
3066
|
};
|
|
2987
3067
|
let child;
|
|
2988
3068
|
try {
|
|
2989
|
-
child = (0,
|
|
3069
|
+
child = (0, import_child_process2.spawn)("opencode", args, {
|
|
2990
3070
|
cwd,
|
|
2991
3071
|
env,
|
|
2992
3072
|
stdio: ["ignore", "pipe", "pipe"],
|
|
@@ -3471,7 +3551,7 @@ var import_anthropic = require("@ai-sdk/anthropic");
|
|
|
3471
3551
|
var import_google = require("@ai-sdk/google");
|
|
3472
3552
|
var import_openai = require("@ai-sdk/openai");
|
|
3473
3553
|
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
3474
|
-
var
|
|
3554
|
+
var import_crypto4 = require("crypto");
|
|
3475
3555
|
|
|
3476
3556
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
3477
3557
|
var import_mcp = require("@ai-sdk/mcp");
|
|
@@ -4087,7 +4167,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
4087
4167
|
const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
|
|
4088
4168
|
const toolResultError = findToolResultError(step);
|
|
4089
4169
|
return {
|
|
4090
|
-
id: (0,
|
|
4170
|
+
id: (0, import_crypto4.randomUUID)(),
|
|
4091
4171
|
stepNumber: i + 1,
|
|
4092
4172
|
turnIndex: i,
|
|
4093
4173
|
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
@@ -4111,7 +4191,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
4111
4191
|
total: totalUsage.totalTokens
|
|
4112
4192
|
};
|
|
4113
4193
|
return {
|
|
4114
|
-
id: (0,
|
|
4194
|
+
id: (0, import_crypto4.randomUUID)(),
|
|
4115
4195
|
steps: traceSteps,
|
|
4116
4196
|
summary: {
|
|
4117
4197
|
totalSteps: traceSteps.length,
|
|
@@ -4186,8 +4266,8 @@ var simpleAgentAdapter = new SimpleAgentAdapter();
|
|
|
4186
4266
|
defaultRegistry.register(simpleAgentAdapter);
|
|
4187
4267
|
|
|
4188
4268
|
// src/run-scenario/file-diff.ts
|
|
4189
|
-
var
|
|
4190
|
-
var
|
|
4269
|
+
var import_fs3 = require("fs");
|
|
4270
|
+
var import_path12 = require("path");
|
|
4191
4271
|
|
|
4192
4272
|
// ../../node_modules/diff/lib/index.mjs
|
|
4193
4273
|
function Diff() {
|
|
@@ -4291,11 +4371,11 @@ Diff.prototype = {
|
|
|
4291
4371
|
}
|
|
4292
4372
|
}
|
|
4293
4373
|
},
|
|
4294
|
-
addToPath: function addToPath(
|
|
4295
|
-
var last =
|
|
4374
|
+
addToPath: function addToPath(path3, added, removed, oldPosInc, options) {
|
|
4375
|
+
var last = path3.lastComponent;
|
|
4296
4376
|
if (last && !options.oneChangePerToken && last.added === added && last.removed === removed) {
|
|
4297
4377
|
return {
|
|
4298
|
-
oldPos:
|
|
4378
|
+
oldPos: path3.oldPos + oldPosInc,
|
|
4299
4379
|
lastComponent: {
|
|
4300
4380
|
count: last.count + 1,
|
|
4301
4381
|
added,
|
|
@@ -4305,7 +4385,7 @@ Diff.prototype = {
|
|
|
4305
4385
|
};
|
|
4306
4386
|
} else {
|
|
4307
4387
|
return {
|
|
4308
|
-
oldPos:
|
|
4388
|
+
oldPos: path3.oldPos + oldPosInc,
|
|
4309
4389
|
lastComponent: {
|
|
4310
4390
|
count: 1,
|
|
4311
4391
|
added,
|
|
@@ -4745,9 +4825,9 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
4745
4825
|
// src/run-scenario/file-diff.ts
|
|
4746
4826
|
function deriveInfrastructurePaths(prePrep, postPrep) {
|
|
4747
4827
|
const infraPaths = /* @__PURE__ */ new Set();
|
|
4748
|
-
for (const
|
|
4749
|
-
if (prePrep[
|
|
4750
|
-
infraPaths.add(
|
|
4828
|
+
for (const path3 of Object.keys(postPrep)) {
|
|
4829
|
+
if (prePrep[path3] === void 0 || prePrep[path3] !== postPrep[path3]) {
|
|
4830
|
+
infraPaths.add(path3);
|
|
4751
4831
|
}
|
|
4752
4832
|
}
|
|
4753
4833
|
return infraPaths;
|
|
@@ -4807,13 +4887,13 @@ function isBinaryFile(filename) {
|
|
|
4807
4887
|
function snapshotDirectory(dir, baseDir) {
|
|
4808
4888
|
const snapshot = {};
|
|
4809
4889
|
const base = baseDir || dir;
|
|
4810
|
-
if (!(0,
|
|
4890
|
+
if (!(0, import_fs3.existsSync)(dir)) {
|
|
4811
4891
|
return snapshot;
|
|
4812
4892
|
}
|
|
4813
|
-
const entries = (0,
|
|
4893
|
+
const entries = (0, import_fs3.readdirSync)(dir, { withFileTypes: true });
|
|
4814
4894
|
for (const entry of entries) {
|
|
4815
|
-
const fullPath = (0,
|
|
4816
|
-
const relativePath = (0,
|
|
4895
|
+
const fullPath = (0, import_path12.join)(dir, entry.name);
|
|
4896
|
+
const relativePath = (0, import_path12.relative)(base, fullPath);
|
|
4817
4897
|
if (shouldIgnore(entry.name)) {
|
|
4818
4898
|
continue;
|
|
4819
4899
|
}
|
|
@@ -4825,11 +4905,11 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
4825
4905
|
continue;
|
|
4826
4906
|
}
|
|
4827
4907
|
try {
|
|
4828
|
-
const stats = (0,
|
|
4908
|
+
const stats = (0, import_fs3.statSync)(fullPath);
|
|
4829
4909
|
if (stats.size > MAX_FILE_SIZE) {
|
|
4830
4910
|
continue;
|
|
4831
4911
|
}
|
|
4832
|
-
const content = (0,
|
|
4912
|
+
const content = (0, import_fs3.readFileSync)(fullPath, "utf-8");
|
|
4833
4913
|
snapshot[relativePath] = content;
|
|
4834
4914
|
} catch {
|
|
4835
4915
|
continue;
|
|
@@ -4858,19 +4938,19 @@ function generateDiffLines(before, after) {
|
|
|
4858
4938
|
function diffSnapshots(before, after, infrastructurePaths) {
|
|
4859
4939
|
const diffs = [];
|
|
4860
4940
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4861
|
-
for (const
|
|
4862
|
-
const beforeContent = before[
|
|
4863
|
-
const afterContent = after[
|
|
4864
|
-
if (before[
|
|
4941
|
+
for (const path3 of allPaths) {
|
|
4942
|
+
const beforeContent = before[path3] ?? "";
|
|
4943
|
+
const afterContent = after[path3] ?? "";
|
|
4944
|
+
if (before[path3] !== void 0 && beforeContent === afterContent) {
|
|
4865
4945
|
continue;
|
|
4866
4946
|
}
|
|
4867
4947
|
const diffLines2 = generateDiffLines(beforeContent, afterContent);
|
|
4868
4948
|
diffs.push({
|
|
4869
|
-
path:
|
|
4949
|
+
path: path3,
|
|
4870
4950
|
expected: beforeContent,
|
|
4871
4951
|
actual: afterContent,
|
|
4872
4952
|
diffLines: diffLines2,
|
|
4873
|
-
...infrastructurePaths?.has(
|
|
4953
|
+
...infrastructurePaths?.has(path3) && { isInfrastructure: true }
|
|
4874
4954
|
});
|
|
4875
4955
|
}
|
|
4876
4956
|
const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
|
|
@@ -4897,9 +4977,9 @@ function diffSnapshots(before, after, infrastructurePaths) {
|
|
|
4897
4977
|
function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
4898
4978
|
const files = [];
|
|
4899
4979
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4900
|
-
for (const
|
|
4901
|
-
const beforeContent = before[
|
|
4902
|
-
const afterContent = after[
|
|
4980
|
+
for (const path3 of allPaths) {
|
|
4981
|
+
const beforeContent = before[path3];
|
|
4982
|
+
const afterContent = after[path3];
|
|
4903
4983
|
if (afterContent === void 0) {
|
|
4904
4984
|
continue;
|
|
4905
4985
|
}
|
|
@@ -4912,10 +4992,10 @@ function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
|
4912
4992
|
status = "unchanged";
|
|
4913
4993
|
}
|
|
4914
4994
|
files.push({
|
|
4915
|
-
path:
|
|
4995
|
+
path: path3,
|
|
4916
4996
|
content: afterContent,
|
|
4917
4997
|
status,
|
|
4918
|
-
...infrastructurePaths?.has(
|
|
4998
|
+
...infrastructurePaths?.has(path3) && { isInfrastructure: true }
|
|
4919
4999
|
});
|
|
4920
5000
|
}
|
|
4921
5001
|
files.sort((a, b) => a.path.localeCompare(b.path));
|
|
@@ -4999,7 +5079,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4999
5079
|
}
|
|
5000
5080
|
} : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
|
|
5001
5081
|
return {
|
|
5002
|
-
id: (0,
|
|
5082
|
+
id: (0, import_crypto5.randomUUID)(),
|
|
5003
5083
|
targetId,
|
|
5004
5084
|
targetName,
|
|
5005
5085
|
scenarioId: scenario.id,
|
|
@@ -5020,6 +5100,24 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
5020
5100
|
// src/run-scenario/index.ts
|
|
5021
5101
|
async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
|
|
5022
5102
|
const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id;
|
|
5103
|
+
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
5104
|
+
if (template) {
|
|
5105
|
+
console.log(
|
|
5106
|
+
(0, import_evalforge_types13.formatTraceEventLine)({
|
|
5107
|
+
evalRunId: evalRunId2,
|
|
5108
|
+
scenarioId: scenario.id,
|
|
5109
|
+
scenarioName: scenario.name,
|
|
5110
|
+
targetId,
|
|
5111
|
+
targetName,
|
|
5112
|
+
stepNumber: 0,
|
|
5113
|
+
type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
|
|
5114
|
+
outputPreview: "Setting up environment (installing dependencies)...",
|
|
5115
|
+
elapsedMs: 0,
|
|
5116
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5117
|
+
isComplete: false
|
|
5118
|
+
})
|
|
5119
|
+
);
|
|
5120
|
+
}
|
|
5023
5121
|
const workDir = await prepareWorkingDirectory(
|
|
5024
5122
|
config,
|
|
5025
5123
|
evalRunId2,
|
|
@@ -5083,7 +5181,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
5083
5181
|
}
|
|
5084
5182
|
|
|
5085
5183
|
// src/evaluation-loop.ts
|
|
5086
|
-
var
|
|
5184
|
+
var import_crypto6 = require("crypto");
|
|
5087
5185
|
async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
5088
5186
|
const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1;
|
|
5089
5187
|
let completedExecutions = 0;
|
|
@@ -5109,7 +5207,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
5109
5207
|
`[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
|
|
5110
5208
|
);
|
|
5111
5209
|
const errorResult = {
|
|
5112
|
-
id: (0,
|
|
5210
|
+
id: (0, import_crypto6.randomUUID)(),
|
|
5113
5211
|
targetId,
|
|
5114
5212
|
targetName,
|
|
5115
5213
|
scenarioId: scenario.id,
|