@wix/evalforge-evaluator 0.184.0 → 0.186.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +240 -221
- package/build/index.js.map +4 -4
- package/build/index.mjs +155 -135
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +1 -7
- package/build/types/run-scenario/agents/claude-code/write-rules.d.ts +1 -7
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +9 -6
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +5 -4
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +8 -5
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +3 -3
- package/package.json +2 -2
- package/build/types/run-scenario/agents/shared/resolve-capability-content.d.ts +0 -42
package/build/index.js
CHANGED
|
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
|
|
|
5226
5226
|
});
|
|
5227
5227
|
|
|
5228
5228
|
// src/index.ts
|
|
5229
|
-
var
|
|
5229
|
+
var import_evalforge_types15 = require("@wix/evalforge-types");
|
|
5230
5230
|
|
|
5231
5231
|
// src/config.ts
|
|
5232
5232
|
function loadConfig() {
|
|
@@ -6816,21 +6816,37 @@ function createApiClient(serverUrl, options = "") {
|
|
|
6816
6816
|
// The legacy REST endpoint enriched the capability with its latest version
|
|
6817
6817
|
// server-side; ambassador's GetCapability returns the bare entity, so we
|
|
6818
6818
|
// compose it with GetLatestCapabilityVersion in parallel here.
|
|
6819
|
+
//
|
|
6820
|
+
// The latest-version fetch is BEST-EFFORT: a failure must not drop the whole
|
|
6821
|
+
// capability. Otherwise one broken snapshot fetch makes the capability (e.g.
|
|
6822
|
+
// an MCP) silently vanish from the run. Runs that pin a version still resolve
|
|
6823
|
+
// their content via getCapabilityVersion downstream.
|
|
6819
6824
|
async getCapability(projectId2, id) {
|
|
6820
|
-
const [
|
|
6825
|
+
const [capResult, versionResult] = await Promise.allSettled([
|
|
6821
6826
|
httpClient.request(getCapability({ projectId: projectId2, capabilityId: id })),
|
|
6822
6827
|
httpClient.request(
|
|
6823
6828
|
getLatestCapabilityVersion({ projectId: projectId2, capabilityId: id })
|
|
6824
6829
|
)
|
|
6825
6830
|
]);
|
|
6826
|
-
|
|
6831
|
+
if (capResult.status === "rejected") {
|
|
6832
|
+
throw capResult.reason;
|
|
6833
|
+
}
|
|
6834
|
+
const capability = capResult.value.data.capability;
|
|
6827
6835
|
if (!capability) {
|
|
6828
6836
|
throw new Error(`Capability ${id} not found in project ${projectId2}`);
|
|
6829
6837
|
}
|
|
6830
|
-
|
|
6831
|
-
|
|
6832
|
-
|
|
6833
|
-
|
|
6838
|
+
let latestVersion;
|
|
6839
|
+
if (versionResult.status === "fulfilled" && versionResult.value.data.capabilityVersion) {
|
|
6840
|
+
latestVersion = capabilityVersionFromProto(
|
|
6841
|
+
versionResult.value.data.capabilityVersion,
|
|
6842
|
+
projectId2
|
|
6843
|
+
);
|
|
6844
|
+
} else if (versionResult.status === "rejected") {
|
|
6845
|
+
const reason = versionResult.reason instanceof Error ? versionResult.reason.message : String(versionResult.reason);
|
|
6846
|
+
console.warn(
|
|
6847
|
+
`[Capabilities] getLatestCapabilityVersion(${id}) failed; loading capability without a snapshot (pinned versions still resolve): ${reason}`
|
|
6848
|
+
);
|
|
6849
|
+
}
|
|
6834
6850
|
return { ...capabilityFromProto(capability), latestVersion };
|
|
6835
6851
|
},
|
|
6836
6852
|
async getCapabilityVersion(projectId2, capabilityId, versionId) {
|
|
@@ -7115,7 +7131,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
7115
7131
|
}
|
|
7116
7132
|
|
|
7117
7133
|
// src/run-scenario/index.ts
|
|
7118
|
-
var
|
|
7134
|
+
var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
7119
7135
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
7120
7136
|
|
|
7121
7137
|
// src/run-scenario/environment.ts
|
|
@@ -7451,122 +7467,50 @@ function getAdapter(identifier) {
|
|
|
7451
7467
|
}
|
|
7452
7468
|
|
|
7453
7469
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
7454
|
-
var
|
|
7470
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
7455
7471
|
|
|
7456
7472
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7457
|
-
var
|
|
7473
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
7458
7474
|
|
|
7459
7475
|
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7460
7476
|
var import_promises3 = require("fs/promises");
|
|
7461
7477
|
var import_path4 = require("path");
|
|
7462
|
-
|
|
7463
|
-
// src/run-scenario/agents/shared/resolve-capability-content.ts
|
|
7464
7478
|
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
7465
|
-
|
|
7466
|
-
var USER_AGENT = "EvalForge-Evaluator";
|
|
7467
|
-
async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7468
|
-
const version = skill.latestVersion;
|
|
7469
|
-
if (version?.files && version.files.length > 0) {
|
|
7470
|
-
console.log(
|
|
7471
|
-
`[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
|
|
7472
|
-
);
|
|
7473
|
-
return version.files;
|
|
7474
|
-
}
|
|
7475
|
-
if (skill.source) {
|
|
7476
|
-
const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
|
|
7477
|
-
console.log(
|
|
7478
|
-
`[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
|
|
7479
|
-
);
|
|
7480
|
-
return files;
|
|
7481
|
-
}
|
|
7482
|
-
throw new Error(`Skill ${skill.name} has no files and no source configured`);
|
|
7483
|
-
}
|
|
7484
|
-
async function fetchSourceFile(label, noun, name, source, fetchFn) {
|
|
7485
|
-
try {
|
|
7486
|
-
const content = await fetchFn(source, { userAgent: USER_AGENT });
|
|
7487
|
-
console.log(
|
|
7488
|
-
`[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
|
|
7489
|
-
);
|
|
7490
|
-
return content;
|
|
7491
|
-
} catch (error) {
|
|
7492
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7493
|
-
console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
|
|
7494
|
-
throw new Error(
|
|
7495
|
-
`Failed to fetch ${noun} "${name}" from GitHub: ${message}`
|
|
7496
|
-
);
|
|
7497
|
-
}
|
|
7498
|
-
}
|
|
7499
|
-
async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7500
|
-
if (agent.source) {
|
|
7501
|
-
return fetchSourceFile(
|
|
7502
|
-
"SubAgents",
|
|
7503
|
-
"sub-agent",
|
|
7504
|
-
agent.name,
|
|
7505
|
-
agent.source,
|
|
7506
|
-
fetchFn
|
|
7507
|
-
);
|
|
7508
|
-
}
|
|
7509
|
-
if (!agent.subAgentMd) {
|
|
7510
|
-
console.warn(
|
|
7511
|
-
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7512
|
-
);
|
|
7513
|
-
}
|
|
7514
|
-
return agent.subAgentMd;
|
|
7515
|
-
}
|
|
7516
|
-
async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7517
|
-
if (!rule.source) {
|
|
7518
|
-
return rule.content;
|
|
7519
|
-
}
|
|
7520
|
-
return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
|
|
7521
|
-
}
|
|
7522
|
-
async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7523
|
-
if (!mcp.source) {
|
|
7524
|
-
return mcp.config;
|
|
7525
|
-
}
|
|
7526
|
-
const raw = await fetchSourceFile(
|
|
7527
|
-
"MCP",
|
|
7528
|
-
"MCP",
|
|
7529
|
-
mcp.name,
|
|
7530
|
-
mcp.source,
|
|
7531
|
-
fetchFn
|
|
7532
|
-
);
|
|
7533
|
-
let parsed;
|
|
7534
|
-
try {
|
|
7535
|
-
parsed = JSON.parse(raw);
|
|
7536
|
-
} catch (error) {
|
|
7537
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7538
|
-
throw new Error(
|
|
7539
|
-
`MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
|
|
7540
|
-
);
|
|
7541
|
-
}
|
|
7542
|
-
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
|
|
7543
|
-
throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
|
|
7544
|
-
}
|
|
7545
|
-
const obj = parsed;
|
|
7546
|
-
const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
|
|
7547
|
-
if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
|
|
7548
|
-
return servers;
|
|
7549
|
-
}
|
|
7550
|
-
return obj;
|
|
7551
|
-
}
|
|
7552
|
-
|
|
7553
|
-
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7554
|
-
async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
|
|
7479
|
+
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7555
7480
|
await Promise.all(
|
|
7556
7481
|
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
7557
7482
|
);
|
|
7558
7483
|
}
|
|
7559
|
-
async function writeSkillToFilesystem(cwd, skill, fetchFn) {
|
|
7560
|
-
const
|
|
7484
|
+
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7485
|
+
const skillName = skill.name;
|
|
7486
|
+
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
|
|
7561
7487
|
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
7562
|
-
|
|
7563
|
-
|
|
7564
|
-
await writeFilesToDirectory(skillDir, files);
|
|
7565
|
-
|
|
7566
|
-
|
|
7567
|
-
throw new Error(
|
|
7568
|
-
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
7488
|
+
const version = skill.latestVersion;
|
|
7489
|
+
if (version?.files && version.files.length > 0) {
|
|
7490
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
7491
|
+
console.log(
|
|
7492
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
7569
7493
|
);
|
|
7494
|
+
} else if (skill.source) {
|
|
7495
|
+
try {
|
|
7496
|
+
const files = await fetchFn(skill.source, {
|
|
7497
|
+
userAgent: "EvalForge-Evaluator"
|
|
7498
|
+
});
|
|
7499
|
+
await writeFilesToDirectory(skillDir, files);
|
|
7500
|
+
console.log(
|
|
7501
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
7502
|
+
);
|
|
7503
|
+
} catch (error) {
|
|
7504
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7505
|
+
console.error(
|
|
7506
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
7507
|
+
);
|
|
7508
|
+
throw new Error(
|
|
7509
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
7510
|
+
);
|
|
7511
|
+
}
|
|
7512
|
+
} else {
|
|
7513
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
7570
7514
|
}
|
|
7571
7515
|
}
|
|
7572
7516
|
|
|
@@ -7584,7 +7528,7 @@ var import_crypto2 = require("crypto");
|
|
|
7584
7528
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7585
7529
|
var import_promises5 = require("fs/promises");
|
|
7586
7530
|
var import_path6 = require("path");
|
|
7587
|
-
var
|
|
7531
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
7588
7532
|
|
|
7589
7533
|
// src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
|
|
7590
7534
|
var import_promises4 = require("fs/promises");
|
|
@@ -7629,11 +7573,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
|
|
|
7629
7573
|
}
|
|
7630
7574
|
|
|
7631
7575
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7632
|
-
async function writeMcpToFilesystem(cwd, mcps
|
|
7576
|
+
async function writeMcpToFilesystem(cwd, mcps) {
|
|
7633
7577
|
if (mcps.length === 0) return;
|
|
7634
7578
|
const mcpServers = {};
|
|
7635
7579
|
for (const mcp of mcps) {
|
|
7636
|
-
const config =
|
|
7580
|
+
const config = mcp.config;
|
|
7637
7581
|
for (const [key, value] of Object.entries(config)) {
|
|
7638
7582
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
7639
7583
|
throw new Error(
|
|
@@ -7645,7 +7589,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
|
|
|
7645
7589
|
}
|
|
7646
7590
|
const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
|
|
7647
7591
|
const content = JSON.stringify(
|
|
7648
|
-
{ [
|
|
7592
|
+
{ [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
|
|
7649
7593
|
null,
|
|
7650
7594
|
2
|
|
7651
7595
|
);
|
|
@@ -7657,6 +7601,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
|
|
|
7657
7601
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
7658
7602
|
var import_promises6 = require("fs/promises");
|
|
7659
7603
|
var import_path7 = require("path");
|
|
7604
|
+
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
7660
7605
|
var AGENTS_DIR = ".claude/agents";
|
|
7661
7606
|
function toAgentFilename(name, index, nameCount) {
|
|
7662
7607
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -7664,7 +7609,34 @@ function toAgentFilename(name, index, nameCount) {
|
|
|
7664
7609
|
nameCount.set(base, count + 1);
|
|
7665
7610
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
7666
7611
|
}
|
|
7667
|
-
async function
|
|
7612
|
+
async function resolveSubAgentContent(agent, fetchFn) {
|
|
7613
|
+
if (agent.source) {
|
|
7614
|
+
try {
|
|
7615
|
+
const content = await fetchFn(agent.source, {
|
|
7616
|
+
userAgent: "EvalForge-Evaluator"
|
|
7617
|
+
});
|
|
7618
|
+
console.log(
|
|
7619
|
+
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
7620
|
+
);
|
|
7621
|
+
return content;
|
|
7622
|
+
} catch (error) {
|
|
7623
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7624
|
+
console.error(
|
|
7625
|
+
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
7626
|
+
);
|
|
7627
|
+
throw new Error(
|
|
7628
|
+
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
7629
|
+
);
|
|
7630
|
+
}
|
|
7631
|
+
}
|
|
7632
|
+
if (!agent.subAgentMd) {
|
|
7633
|
+
console.warn(
|
|
7634
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7635
|
+
);
|
|
7636
|
+
}
|
|
7637
|
+
return agent.subAgentMd;
|
|
7638
|
+
}
|
|
7639
|
+
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
7668
7640
|
if (subAgents.length === 0) return;
|
|
7669
7641
|
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
7670
7642
|
await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
|
|
@@ -7672,7 +7644,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
|
|
|
7672
7644
|
for (const [i, agent] of subAgents.entries()) {
|
|
7673
7645
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
7674
7646
|
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
7675
|
-
const content = await
|
|
7647
|
+
const content = await resolveSubAgentContent(agent, fetchFn);
|
|
7676
7648
|
await (0, import_promises6.writeFile)(filePath, content, "utf8");
|
|
7677
7649
|
}
|
|
7678
7650
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -7722,19 +7694,18 @@ function validateGenericDirectory(dir, cwd) {
|
|
|
7722
7694
|
}
|
|
7723
7695
|
return trimmed;
|
|
7724
7696
|
}
|
|
7725
|
-
async function writeRulesToFilesystem(cwd, rules
|
|
7697
|
+
async function writeRulesToFilesystem(cwd, rules) {
|
|
7726
7698
|
if (rules.length === 0) return;
|
|
7727
7699
|
const nameCount = /* @__PURE__ */ new Map();
|
|
7728
7700
|
let hasCursorRules = false;
|
|
7729
7701
|
for (const [i, rule] of rules.entries()) {
|
|
7730
|
-
const content = await resolveRuleText(rule, fetchFn);
|
|
7731
7702
|
switch (rule.ruleType) {
|
|
7732
7703
|
case "claude-md": {
|
|
7733
|
-
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
|
|
7704
|
+
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
|
|
7734
7705
|
break;
|
|
7735
7706
|
}
|
|
7736
7707
|
case "agents-md": {
|
|
7737
|
-
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
|
|
7708
|
+
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
|
|
7738
7709
|
break;
|
|
7739
7710
|
}
|
|
7740
7711
|
case "cursor-rule": {
|
|
@@ -7744,7 +7715,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
|
|
|
7744
7715
|
}
|
|
7745
7716
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7746
7717
|
const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
7747
|
-
await (0, import_promises7.writeFile)(filePath, content, "utf8");
|
|
7718
|
+
await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
|
|
7748
7719
|
break;
|
|
7749
7720
|
}
|
|
7750
7721
|
case "generic": {
|
|
@@ -7755,7 +7726,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
|
|
|
7755
7726
|
const dirPath = (0, import_path8.join)(cwd, directory);
|
|
7756
7727
|
await (0, import_promises7.mkdir)(dirPath, { recursive: true });
|
|
7757
7728
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7758
|
-
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
|
|
7729
|
+
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
|
|
7759
7730
|
break;
|
|
7760
7731
|
}
|
|
7761
7732
|
default: {
|
|
@@ -7845,14 +7816,14 @@ function buildConversation(timestampedMessages) {
|
|
|
7845
7816
|
}
|
|
7846
7817
|
|
|
7847
7818
|
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7848
|
-
var
|
|
7819
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
7849
7820
|
function emitTraceEvent(event, pushEvent) {
|
|
7850
|
-
console.log(`${
|
|
7821
|
+
console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
7851
7822
|
pushEvent?.(event);
|
|
7852
7823
|
}
|
|
7853
7824
|
|
|
7854
7825
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7855
|
-
var DEFAULT_MODEL =
|
|
7826
|
+
var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
7856
7827
|
async function* buildPromptStream(triggerPrompt, images) {
|
|
7857
7828
|
yield {
|
|
7858
7829
|
type: "user",
|
|
@@ -7917,7 +7888,7 @@ function extractToolActionDescription(toolName, toolArgs) {
|
|
|
7917
7888
|
return `Using ${toolName}...`;
|
|
7918
7889
|
}
|
|
7919
7890
|
function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
7920
|
-
let type =
|
|
7891
|
+
let type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
|
|
7921
7892
|
let toolName;
|
|
7922
7893
|
let toolArgs;
|
|
7923
7894
|
let outputPreview;
|
|
@@ -7925,28 +7896,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
7925
7896
|
let thinking;
|
|
7926
7897
|
for (const block of message.message.content) {
|
|
7927
7898
|
if (block.type === "tool_use") {
|
|
7928
|
-
type =
|
|
7899
|
+
type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
|
|
7929
7900
|
toolName = block.name;
|
|
7930
7901
|
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
7931
7902
|
const input = block.input;
|
|
7932
7903
|
if (input.file_path || input.path || input.target_file) {
|
|
7933
7904
|
filePath = String(input.file_path || input.path || input.target_file);
|
|
7934
7905
|
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
7935
|
-
type =
|
|
7906
|
+
type = import_evalforge_types4.LiveTraceEventType.FILE_WRITE;
|
|
7936
7907
|
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
7937
|
-
type =
|
|
7908
|
+
type = import_evalforge_types4.LiveTraceEventType.FILE_READ;
|
|
7938
7909
|
}
|
|
7939
7910
|
}
|
|
7940
7911
|
} else if (block.type === "text") {
|
|
7941
7912
|
outputPreview = block.text.slice(0, 500);
|
|
7942
7913
|
if (!toolName) {
|
|
7943
|
-
type =
|
|
7914
|
+
type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
|
|
7944
7915
|
}
|
|
7945
7916
|
} else if (block.type === "thinking") {
|
|
7946
7917
|
const thinkingBlock = block;
|
|
7947
7918
|
thinking = thinkingBlock.thinking.slice(0, 500);
|
|
7948
7919
|
if (!outputPreview && !toolName) {
|
|
7949
|
-
type =
|
|
7920
|
+
type = import_evalforge_types4.LiveTraceEventType.THINKING;
|
|
7950
7921
|
}
|
|
7951
7922
|
}
|
|
7952
7923
|
}
|
|
@@ -8012,7 +7983,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8012
7983
|
}
|
|
8013
7984
|
return {
|
|
8014
7985
|
...baseEvent,
|
|
8015
|
-
type:
|
|
7986
|
+
type: import_evalforge_types4.LiveTraceEventType.TOOL_RESULT,
|
|
8016
7987
|
outputPreview: outputPreview || "(tool result)"
|
|
8017
7988
|
};
|
|
8018
7989
|
}
|
|
@@ -8020,7 +7991,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8020
7991
|
const sysMsg = message;
|
|
8021
7992
|
return {
|
|
8022
7993
|
...baseEvent,
|
|
8023
|
-
type:
|
|
7994
|
+
type: import_evalforge_types4.LiveTraceEventType.SYSTEM,
|
|
8024
7995
|
outputPreview: sysMsg.subtype || "system"
|
|
8025
7996
|
};
|
|
8026
7997
|
}
|
|
@@ -8029,7 +8000,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8029
8000
|
}
|
|
8030
8001
|
return {
|
|
8031
8002
|
...baseEvent,
|
|
8032
|
-
type:
|
|
8003
|
+
type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
|
|
8033
8004
|
outputPreview: `Message type: ${message.type}`
|
|
8034
8005
|
};
|
|
8035
8006
|
}
|
|
@@ -8131,7 +8102,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8131
8102
|
queryOptions.systemPrompt = {
|
|
8132
8103
|
type: "preset",
|
|
8133
8104
|
preset: "claude_code",
|
|
8134
|
-
append:
|
|
8105
|
+
append: import_evalforge_types4.DEFAULT_EVALUATOR_SYSTEM_PROMPT
|
|
8135
8106
|
};
|
|
8136
8107
|
}
|
|
8137
8108
|
if (options.temperature !== void 0) {
|
|
@@ -8166,7 +8137,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8166
8137
|
targetId: traceContext.targetId,
|
|
8167
8138
|
targetName: traceContext.targetName,
|
|
8168
8139
|
stepNumber: 0,
|
|
8169
|
-
type:
|
|
8140
|
+
type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
|
|
8170
8141
|
outputPreview: JSON.stringify({
|
|
8171
8142
|
event: "pre-sdk-execution",
|
|
8172
8143
|
model: queryOptions.model,
|
|
@@ -8230,7 +8201,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8230
8201
|
targetId: traceContext.targetId,
|
|
8231
8202
|
targetName: traceContext.targetName,
|
|
8232
8203
|
stepNumber: traceStepNumber,
|
|
8233
|
-
type:
|
|
8204
|
+
type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
|
|
8234
8205
|
outputPreview: progressMessage,
|
|
8235
8206
|
toolName: lastToolName,
|
|
8236
8207
|
filePath: lastFilePath,
|
|
@@ -8267,18 +8238,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8267
8238
|
if (traceEvent) {
|
|
8268
8239
|
lastToolName = traceEvent.toolName;
|
|
8269
8240
|
lastFilePath = traceEvent.filePath;
|
|
8270
|
-
if (traceEvent.type ===
|
|
8241
|
+
if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.THINKING) {
|
|
8271
8242
|
lastAction = "Thinking...";
|
|
8272
|
-
} else if (traceEvent.type ===
|
|
8243
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.TOOL_USE) {
|
|
8273
8244
|
lastAction = extractToolActionDescription(
|
|
8274
8245
|
traceEvent.toolName,
|
|
8275
8246
|
traceEvent.toolArgs
|
|
8276
8247
|
);
|
|
8277
|
-
} else if (traceEvent.type ===
|
|
8248
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_WRITE) {
|
|
8278
8249
|
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
8279
|
-
} else if (traceEvent.type ===
|
|
8250
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_READ) {
|
|
8280
8251
|
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
8281
|
-
} else if (traceEvent.type ===
|
|
8252
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.COMPLETION) {
|
|
8282
8253
|
lastAction = "Processing response...";
|
|
8283
8254
|
}
|
|
8284
8255
|
emitTraceEvent(traceEvent, traceContext.pushEvent);
|
|
@@ -8456,7 +8427,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8456
8427
|
targetId: traceContext.targetId,
|
|
8457
8428
|
targetName: traceContext.targetName,
|
|
8458
8429
|
stepNumber: traceStepNumber + 1,
|
|
8459
|
-
type:
|
|
8430
|
+
type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
|
|
8460
8431
|
outputPreview: JSON.stringify(
|
|
8461
8432
|
{
|
|
8462
8433
|
event: "sdk-execution-failed",
|
|
@@ -8490,7 +8461,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
8490
8461
|
targetId: traceContext.targetId,
|
|
8491
8462
|
targetName: traceContext.targetName,
|
|
8492
8463
|
stepNumber: traceStepNumber + 1,
|
|
8493
|
-
type:
|
|
8464
|
+
type: import_evalforge_types4.LiveTraceEventType.COMPLETION,
|
|
8494
8465
|
outputPreview: "Scenario execution completed",
|
|
8495
8466
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8496
8467
|
isComplete: true
|
|
@@ -8765,7 +8736,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8765
8736
|
stepNumber: 0,
|
|
8766
8737
|
// renumbered below
|
|
8767
8738
|
turnIndex,
|
|
8768
|
-
type:
|
|
8739
|
+
type: import_evalforge_types4.LLMStepType.THINKING,
|
|
8769
8740
|
model,
|
|
8770
8741
|
provider: "anthropic",
|
|
8771
8742
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8796,7 +8767,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8796
8767
|
id: (0, import_crypto2.randomUUID)(),
|
|
8797
8768
|
stepNumber: 0,
|
|
8798
8769
|
turnIndex,
|
|
8799
|
-
type:
|
|
8770
|
+
type: import_evalforge_types4.LLMStepType.TOOL_USE,
|
|
8800
8771
|
model,
|
|
8801
8772
|
provider: "anthropic",
|
|
8802
8773
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8826,7 +8797,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8826
8797
|
id: (0, import_crypto2.randomUUID)(),
|
|
8827
8798
|
stepNumber: 0,
|
|
8828
8799
|
turnIndex,
|
|
8829
|
-
type:
|
|
8800
|
+
type: import_evalforge_types4.LLMStepType.COMPLETION,
|
|
8830
8801
|
model,
|
|
8831
8802
|
provider: "anthropic",
|
|
8832
8803
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8843,7 +8814,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8843
8814
|
});
|
|
8844
8815
|
}
|
|
8845
8816
|
if (subSteps.length === 0) {
|
|
8846
|
-
const stepType = hasThinking && !hasText ?
|
|
8817
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
8847
8818
|
subSteps.push({
|
|
8848
8819
|
id: (0, import_crypto2.randomUUID)(),
|
|
8849
8820
|
stepNumber: 0,
|
|
@@ -8913,7 +8884,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8913
8884
|
var ClaudeCodeAdapter = class {
|
|
8914
8885
|
id = "claude-code";
|
|
8915
8886
|
name = "Claude Code";
|
|
8916
|
-
supportedCommands = [
|
|
8887
|
+
supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
|
|
8917
8888
|
/**
|
|
8918
8889
|
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
8919
8890
|
* before the baseline snapshot is taken.
|
|
@@ -8945,9 +8916,9 @@ var ClaudeCodeAdapter = class {
|
|
|
8945
8916
|
rules,
|
|
8946
8917
|
systemPrompt
|
|
8947
8918
|
} = context;
|
|
8948
|
-
const typed = config ?
|
|
8919
|
+
const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
8949
8920
|
const cfg = typed?.success ? typed.data : void 0;
|
|
8950
|
-
const schemaKeys = new Set(Object.keys(
|
|
8921
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
|
|
8951
8922
|
const extras = {};
|
|
8952
8923
|
if (config) {
|
|
8953
8924
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -9002,11 +8973,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
9002
8973
|
defaultRegistry.register(claudeCodeAdapter);
|
|
9003
8974
|
|
|
9004
8975
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
9005
|
-
var
|
|
8976
|
+
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
9006
8977
|
|
|
9007
8978
|
// src/run-scenario/agents/opencode/execute.ts
|
|
9008
8979
|
var import_child_process2 = require("child_process");
|
|
9009
|
-
var
|
|
8980
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
9010
8981
|
|
|
9011
8982
|
// src/run-scenario/agents/opencode/types.ts
|
|
9012
8983
|
function tryParseJson(text) {
|
|
@@ -9020,28 +8991,49 @@ function tryParseJson(text) {
|
|
|
9020
8991
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
9021
8992
|
var import_promises8 = require("fs/promises");
|
|
9022
8993
|
var import_path9 = require("path");
|
|
9023
|
-
|
|
8994
|
+
var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
|
|
8995
|
+
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
|
|
9024
8996
|
await Promise.all(
|
|
9025
8997
|
skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
|
|
9026
8998
|
);
|
|
9027
8999
|
}
|
|
9028
9000
|
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
9029
|
-
const
|
|
9001
|
+
const skillName = skill.name;
|
|
9002
|
+
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
|
|
9030
9003
|
await (0, import_promises8.mkdir)(skillDir, { recursive: true });
|
|
9031
|
-
|
|
9032
|
-
|
|
9033
|
-
await writeFilesToDirectory(skillDir, files);
|
|
9034
|
-
|
|
9035
|
-
|
|
9036
|
-
throw new Error(
|
|
9037
|
-
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
9004
|
+
const version = skill.latestVersion;
|
|
9005
|
+
if (version?.files && version.files.length > 0) {
|
|
9006
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
9007
|
+
console.log(
|
|
9008
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
9038
9009
|
);
|
|
9010
|
+
} else if (skill.source) {
|
|
9011
|
+
try {
|
|
9012
|
+
const files = await fetchFn(skill.source, {
|
|
9013
|
+
userAgent: "EvalForge-Evaluator"
|
|
9014
|
+
});
|
|
9015
|
+
await writeFilesToDirectory(skillDir, files);
|
|
9016
|
+
console.log(
|
|
9017
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
9018
|
+
);
|
|
9019
|
+
} catch (error) {
|
|
9020
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9021
|
+
console.error(
|
|
9022
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
9023
|
+
);
|
|
9024
|
+
throw new Error(
|
|
9025
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
9026
|
+
);
|
|
9027
|
+
}
|
|
9028
|
+
} else {
|
|
9029
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
9039
9030
|
}
|
|
9040
9031
|
}
|
|
9041
9032
|
|
|
9042
9033
|
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
9043
9034
|
var import_promises9 = require("fs/promises");
|
|
9044
9035
|
var import_path10 = require("path");
|
|
9036
|
+
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
9045
9037
|
var AGENTS_DIR2 = ".opencode/agents";
|
|
9046
9038
|
function toAgentFilename2(name, index, nameCount) {
|
|
9047
9039
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -9049,7 +9041,34 @@ function toAgentFilename2(name, index, nameCount) {
|
|
|
9049
9041
|
nameCount.set(base, count + 1);
|
|
9050
9042
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
9051
9043
|
}
|
|
9052
|
-
async function
|
|
9044
|
+
async function resolveSubAgentContent2(agent, fetchFn) {
|
|
9045
|
+
if (agent.source) {
|
|
9046
|
+
try {
|
|
9047
|
+
const content = await fetchFn(agent.source, {
|
|
9048
|
+
userAgent: "EvalForge-Evaluator"
|
|
9049
|
+
});
|
|
9050
|
+
console.log(
|
|
9051
|
+
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
9052
|
+
);
|
|
9053
|
+
return content;
|
|
9054
|
+
} catch (error) {
|
|
9055
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9056
|
+
console.error(
|
|
9057
|
+
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
9058
|
+
);
|
|
9059
|
+
throw new Error(
|
|
9060
|
+
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
9061
|
+
);
|
|
9062
|
+
}
|
|
9063
|
+
}
|
|
9064
|
+
if (!agent.subAgentMd) {
|
|
9065
|
+
console.warn(
|
|
9066
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
9067
|
+
);
|
|
9068
|
+
}
|
|
9069
|
+
return agent.subAgentMd;
|
|
9070
|
+
}
|
|
9071
|
+
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
|
|
9053
9072
|
if (subAgents.length === 0) return;
|
|
9054
9073
|
const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
|
|
9055
9074
|
await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
|
|
@@ -9057,7 +9076,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
|
|
|
9057
9076
|
for (const [i, agent] of subAgents.entries()) {
|
|
9058
9077
|
const filename = toAgentFilename2(agent.name, i, nameCount);
|
|
9059
9078
|
const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
|
|
9060
|
-
const content = await
|
|
9079
|
+
const content = await resolveSubAgentContent2(agent, fetchFn);
|
|
9061
9080
|
await (0, import_promises9.writeFile)(filePath, content, "utf8");
|
|
9062
9081
|
}
|
|
9063
9082
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -9065,8 +9084,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
|
|
|
9065
9084
|
|
|
9066
9085
|
// src/run-scenario/agents/opencode/config.ts
|
|
9067
9086
|
var import_os3 = require("os");
|
|
9068
|
-
var
|
|
9069
|
-
var DEFAULT_MODEL2 = `${
|
|
9087
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
9088
|
+
var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
9070
9089
|
var OPENCODE_MODEL_ALIASES = {
|
|
9071
9090
|
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
9072
9091
|
"claude-opus-4": "claude-opus-4-0"
|
|
@@ -9082,10 +9101,10 @@ function parseModel(model) {
|
|
|
9082
9101
|
};
|
|
9083
9102
|
}
|
|
9084
9103
|
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
9085
|
-
const isOpenAI =
|
|
9104
|
+
const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
9086
9105
|
model
|
|
9087
9106
|
);
|
|
9088
|
-
const isGemini =
|
|
9107
|
+
const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
9089
9108
|
model
|
|
9090
9109
|
);
|
|
9091
9110
|
if (isGemini) return { providerID: "google", modelID };
|
|
@@ -9154,7 +9173,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9154
9173
|
if (options.mcps && options.mcps.length > 0) {
|
|
9155
9174
|
const mcpServers = {};
|
|
9156
9175
|
for (const mcpEntity of options.mcps) {
|
|
9157
|
-
const entityConfig =
|
|
9176
|
+
const entityConfig = mcpEntity.config;
|
|
9158
9177
|
for (const [key, value] of Object.entries(entityConfig)) {
|
|
9159
9178
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
9160
9179
|
throw new Error(
|
|
@@ -9179,7 +9198,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9179
9198
|
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
9180
9199
|
agentOverrides.maxSteps = options.maxTurns;
|
|
9181
9200
|
}
|
|
9182
|
-
const parsed = options.config ?
|
|
9201
|
+
const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
9183
9202
|
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
9184
9203
|
const defaultPermission = {
|
|
9185
9204
|
"*": "allow"
|
|
@@ -9221,7 +9240,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9221
9240
|
}
|
|
9222
9241
|
|
|
9223
9242
|
// src/run-scenario/agents/opencode/build-trace.ts
|
|
9224
|
-
var
|
|
9243
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
9225
9244
|
var import_crypto3 = require("crypto");
|
|
9226
9245
|
function toCanonicalModelId(modelId) {
|
|
9227
9246
|
const slashIndex = modelId.indexOf("/");
|
|
@@ -9301,7 +9320,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9301
9320
|
id: (0, import_crypto3.randomUUID)(),
|
|
9302
9321
|
stepNumber: 0,
|
|
9303
9322
|
turnIndex,
|
|
9304
|
-
type:
|
|
9323
|
+
type: import_evalforge_types7.LLMStepType.THINKING,
|
|
9305
9324
|
model: stepModel,
|
|
9306
9325
|
provider: stepProvider,
|
|
9307
9326
|
startedAt,
|
|
@@ -9330,7 +9349,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9330
9349
|
id: (0, import_crypto3.randomUUID)(),
|
|
9331
9350
|
stepNumber: 0,
|
|
9332
9351
|
turnIndex,
|
|
9333
|
-
type:
|
|
9352
|
+
type: import_evalforge_types7.LLMStepType.TOOL_USE,
|
|
9334
9353
|
model: stepModel,
|
|
9335
9354
|
provider: stepProvider,
|
|
9336
9355
|
startedAt,
|
|
@@ -9360,7 +9379,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9360
9379
|
id: (0, import_crypto3.randomUUID)(),
|
|
9361
9380
|
stepNumber: 0,
|
|
9362
9381
|
turnIndex,
|
|
9363
|
-
type:
|
|
9382
|
+
type: import_evalforge_types7.LLMStepType.COMPLETION,
|
|
9364
9383
|
model: stepModel,
|
|
9365
9384
|
provider: stepProvider,
|
|
9366
9385
|
startedAt,
|
|
@@ -9377,7 +9396,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9377
9396
|
});
|
|
9378
9397
|
}
|
|
9379
9398
|
if (subSteps.length === 0) {
|
|
9380
|
-
const stepType = hasThinking && !hasText ?
|
|
9399
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
|
|
9381
9400
|
subSteps.push({
|
|
9382
9401
|
id: (0, import_crypto3.randomUUID)(),
|
|
9383
9402
|
stepNumber: 0,
|
|
@@ -9578,14 +9597,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9578
9597
|
const te = evt;
|
|
9579
9598
|
return {
|
|
9580
9599
|
...base,
|
|
9581
|
-
type:
|
|
9600
|
+
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
9582
9601
|
outputPreview: te.part.text.slice(0, 500)
|
|
9583
9602
|
};
|
|
9584
9603
|
}
|
|
9585
9604
|
case "reasoning":
|
|
9586
9605
|
return {
|
|
9587
9606
|
...base,
|
|
9588
|
-
type:
|
|
9607
|
+
type: import_evalforge_types8.LiveTraceEventType.THINKING,
|
|
9589
9608
|
thinking: evt.part.text.slice(0, 500)
|
|
9590
9609
|
};
|
|
9591
9610
|
case "tool_use": {
|
|
@@ -9593,15 +9612,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9593
9612
|
const toolName = tu.part.tool;
|
|
9594
9613
|
const args = tu.part.state.input;
|
|
9595
9614
|
const toolArgs = JSON.stringify(args).slice(0, 500);
|
|
9596
|
-
let type =
|
|
9615
|
+
let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
|
|
9597
9616
|
let filePath;
|
|
9598
9617
|
if (args) {
|
|
9599
9618
|
if (args.file_path || args.path || args.target_file) {
|
|
9600
9619
|
filePath = String(args.file_path || args.path || args.target_file);
|
|
9601
9620
|
if (/write|edit/i.test(toolName)) {
|
|
9602
|
-
type =
|
|
9621
|
+
type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
|
|
9603
9622
|
} else if (/read|view/i.test(toolName)) {
|
|
9604
|
-
type =
|
|
9623
|
+
type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
|
|
9605
9624
|
}
|
|
9606
9625
|
}
|
|
9607
9626
|
}
|
|
@@ -9610,7 +9629,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9610
9629
|
case "step_finish":
|
|
9611
9630
|
return {
|
|
9612
9631
|
...base,
|
|
9613
|
-
type:
|
|
9632
|
+
type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
|
|
9614
9633
|
outputPreview: "Step completed"
|
|
9615
9634
|
};
|
|
9616
9635
|
default:
|
|
@@ -9641,7 +9660,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
|
9641
9660
|
} else if (options.systemPrompt != null) {
|
|
9642
9661
|
systemPrompt = options.systemPrompt;
|
|
9643
9662
|
} else {
|
|
9644
|
-
systemPrompt =
|
|
9663
|
+
systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
|
|
9645
9664
|
}
|
|
9646
9665
|
if (systemPrompt) {
|
|
9647
9666
|
await writeSystemPromptRule(cwd, systemPrompt);
|
|
@@ -9833,7 +9852,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9833
9852
|
targetId: traceContext.targetId,
|
|
9834
9853
|
targetName: traceContext.targetName,
|
|
9835
9854
|
stepNumber: traceStepNumber,
|
|
9836
|
-
type:
|
|
9855
|
+
type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
|
|
9837
9856
|
outputPreview: progressMessage,
|
|
9838
9857
|
toolName: lastToolName,
|
|
9839
9858
|
filePath: lastFilePath,
|
|
@@ -9867,18 +9886,18 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9867
9886
|
if (traceEvt) {
|
|
9868
9887
|
lastToolName = traceEvt.toolName;
|
|
9869
9888
|
lastFilePath = traceEvt.filePath;
|
|
9870
|
-
if (traceEvt.type ===
|
|
9889
|
+
if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
|
|
9871
9890
|
lastAction = "Thinking...";
|
|
9872
|
-
} else if (traceEvt.type ===
|
|
9891
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
|
|
9873
9892
|
lastAction = extractToolAction(
|
|
9874
9893
|
traceEvt.toolName ?? "",
|
|
9875
9894
|
void 0
|
|
9876
9895
|
);
|
|
9877
|
-
} else if (traceEvt.type ===
|
|
9896
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
|
|
9878
9897
|
lastAction = `Writing: ${traceEvt.filePath || "file"}`;
|
|
9879
|
-
} else if (traceEvt.type ===
|
|
9898
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
|
|
9880
9899
|
lastAction = `Reading: ${traceEvt.filePath || "file"}`;
|
|
9881
|
-
} else if (traceEvt.type ===
|
|
9900
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
|
|
9882
9901
|
lastAction = "Processing response...";
|
|
9883
9902
|
}
|
|
9884
9903
|
emitTraceEvent(traceEvt, traceContext.pushEvent);
|
|
@@ -9960,7 +9979,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
9960
9979
|
targetId: traceContext.targetId,
|
|
9961
9980
|
targetName: traceContext.targetName,
|
|
9962
9981
|
stepNumber: 0,
|
|
9963
|
-
type:
|
|
9982
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
9964
9983
|
outputPreview: JSON.stringify({
|
|
9965
9984
|
event: "pre-cli-execution",
|
|
9966
9985
|
model: `${providerID}/${modelID}`,
|
|
@@ -10014,7 +10033,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10014
10033
|
targetId: traceContext.targetId,
|
|
10015
10034
|
targetName: traceContext.targetName,
|
|
10016
10035
|
stepNumber: traceStepNumber + 1,
|
|
10017
|
-
type:
|
|
10036
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
10018
10037
|
outputPreview: JSON.stringify({
|
|
10019
10038
|
event: "idle-timeout-retry",
|
|
10020
10039
|
attempt,
|
|
@@ -10058,7 +10077,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10058
10077
|
targetId: traceContext.targetId,
|
|
10059
10078
|
targetName: traceContext.targetName,
|
|
10060
10079
|
stepNumber: traceStepNumber + 1,
|
|
10061
|
-
type:
|
|
10080
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
10062
10081
|
outputPreview: JSON.stringify({
|
|
10063
10082
|
event: "cli-execution-failed",
|
|
10064
10083
|
error: lastAttemptResult.error?.message ?? "Unknown error",
|
|
@@ -10113,7 +10132,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10113
10132
|
targetId: traceContext.targetId,
|
|
10114
10133
|
targetName: traceContext.targetName,
|
|
10115
10134
|
stepNumber: traceStepNumber + 1,
|
|
10116
|
-
type:
|
|
10135
|
+
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
10117
10136
|
outputPreview: "Scenario execution completed",
|
|
10118
10137
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10119
10138
|
isComplete: true
|
|
@@ -10150,7 +10169,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10150
10169
|
var OpenCodeAdapter = class {
|
|
10151
10170
|
id = "opencode";
|
|
10152
10171
|
name = "OpenCode";
|
|
10153
|
-
supportedCommands = [
|
|
10172
|
+
supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
|
|
10154
10173
|
async prepareEnvironment(context) {
|
|
10155
10174
|
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
10156
10175
|
mcps: context.mcps,
|
|
@@ -10173,7 +10192,7 @@ var OpenCodeAdapter = class {
|
|
|
10173
10192
|
rules,
|
|
10174
10193
|
systemPrompt
|
|
10175
10194
|
} = context;
|
|
10176
|
-
const typed = config ?
|
|
10195
|
+
const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10177
10196
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10178
10197
|
const rawMaxTurns = cfg?.maxTurns;
|
|
10179
10198
|
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
@@ -10223,7 +10242,7 @@ var import_ai = require("ai");
|
|
|
10223
10242
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
10224
10243
|
var import_google = require("@ai-sdk/google");
|
|
10225
10244
|
var import_openai = require("@ai-sdk/openai");
|
|
10226
|
-
var
|
|
10245
|
+
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
10227
10246
|
var import_crypto4 = require("crypto");
|
|
10228
10247
|
|
|
10229
10248
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
@@ -10320,7 +10339,7 @@ function extractErrorText(content) {
|
|
|
10320
10339
|
}
|
|
10321
10340
|
|
|
10322
10341
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
10323
|
-
var
|
|
10342
|
+
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
10324
10343
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
10325
10344
|
var PROVIDER_GEMINI = "gemini";
|
|
10326
10345
|
var MODEL_PRICING = {
|
|
@@ -10389,7 +10408,7 @@ function extractGatewayCost(step, provider) {
|
|
|
10389
10408
|
}
|
|
10390
10409
|
}
|
|
10391
10410
|
function calculateFromPricing(modelId, tokenUsage) {
|
|
10392
|
-
const normalized = (0,
|
|
10411
|
+
const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
|
|
10393
10412
|
const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
|
|
10394
10413
|
if (!pricing) return 0;
|
|
10395
10414
|
return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
|
|
@@ -10482,7 +10501,7 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10482
10501
|
apiKey: "proxy-auth",
|
|
10483
10502
|
headers
|
|
10484
10503
|
});
|
|
10485
|
-
if ([...
|
|
10504
|
+
if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10486
10505
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10487
10506
|
)) {
|
|
10488
10507
|
return openai.responses(modelId);
|
|
@@ -10490,12 +10509,12 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10490
10509
|
return openai.chat(modelId);
|
|
10491
10510
|
}
|
|
10492
10511
|
function isClaudeModelId(modelId) {
|
|
10493
|
-
return
|
|
10512
|
+
return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
|
|
10494
10513
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10495
10514
|
);
|
|
10496
10515
|
}
|
|
10497
10516
|
function isGeminiModelId(modelId) {
|
|
10498
|
-
return
|
|
10517
|
+
return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
|
|
10499
10518
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10500
10519
|
);
|
|
10501
10520
|
}
|
|
@@ -10515,9 +10534,9 @@ async function executeWithAiSdk(context) {
|
|
|
10515
10534
|
mcps,
|
|
10516
10535
|
traceContext
|
|
10517
10536
|
} = context;
|
|
10518
|
-
const typed = config ?
|
|
10537
|
+
const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10519
10538
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10520
|
-
const schemaKeys = new Set(Object.keys(
|
|
10539
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
|
|
10521
10540
|
const configExtras = {};
|
|
10522
10541
|
if (config) {
|
|
10523
10542
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -10554,11 +10573,11 @@ async function executeWithAiSdk(context) {
|
|
|
10554
10573
|
}, SDK_TIMEOUT_MS);
|
|
10555
10574
|
try {
|
|
10556
10575
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
10557
|
-
const isResponsesAPI = [...
|
|
10576
|
+
const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10558
10577
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10559
10578
|
);
|
|
10560
10579
|
const isGemini = provider === PROVIDER_GEMINI2;
|
|
10561
|
-
const isGeminiThinking = isGemini &&
|
|
10580
|
+
const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
10562
10581
|
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
10563
10582
|
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
10564
10583
|
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
@@ -10637,7 +10656,7 @@ async function executeWithAiSdk(context) {
|
|
|
10637
10656
|
targetId: traceContext.targetId,
|
|
10638
10657
|
targetName: traceContext.targetName,
|
|
10639
10658
|
stepNumber: stepTimestamps.length,
|
|
10640
|
-
type: isToolStep ?
|
|
10659
|
+
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
10641
10660
|
toolName: firstToolCall?.toolName,
|
|
10642
10661
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
10643
10662
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -10842,7 +10861,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
10842
10861
|
id: (0, import_crypto4.randomUUID)(),
|
|
10843
10862
|
stepNumber: i + 1,
|
|
10844
10863
|
turnIndex: i,
|
|
10845
|
-
type: step.toolCalls.length > 0 ?
|
|
10864
|
+
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
10846
10865
|
model: modelId,
|
|
10847
10866
|
provider,
|
|
10848
10867
|
startedAt: new Date(stepStartedAt).toISOString(),
|
|
@@ -10892,7 +10911,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
10892
10911
|
targetId: traceContext.targetId,
|
|
10893
10912
|
targetName: traceContext.targetName,
|
|
10894
10913
|
stepNumber: 0,
|
|
10895
|
-
type:
|
|
10914
|
+
type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
|
|
10896
10915
|
outputPreview: "Starting Simple Agent execution...",
|
|
10897
10916
|
elapsedMs: Date.now() - startTime,
|
|
10898
10917
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -10910,7 +10929,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
10910
10929
|
targetId: traceContext.targetId,
|
|
10911
10930
|
targetName: traceContext.targetName,
|
|
10912
10931
|
stepNumber,
|
|
10913
|
-
type:
|
|
10932
|
+
type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
10914
10933
|
outputPreview: "Scenario execution completed",
|
|
10915
10934
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10916
10935
|
isComplete: true
|
|
@@ -11680,11 +11699,11 @@ function substituteVariables(prompt, variables) {
|
|
|
11680
11699
|
}
|
|
11681
11700
|
|
|
11682
11701
|
// src/run-scenario/run-agent-with-context.ts
|
|
11683
|
-
var
|
|
11684
|
-
var DEFAULT_AGENT_COMMAND =
|
|
11702
|
+
var import_evalforge_types12 = require("@wix/evalforge-types");
|
|
11703
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
|
|
11685
11704
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
|
|
11686
11705
|
const agent = evalData.agent ?? void 0;
|
|
11687
|
-
const isSDK = agent?.agentType ===
|
|
11706
|
+
const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
|
|
11688
11707
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
11689
11708
|
const adapter = getAdapter(identifier);
|
|
11690
11709
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -11769,14 +11788,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11769
11788
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
11770
11789
|
if (template) {
|
|
11771
11790
|
console.log(
|
|
11772
|
-
(0,
|
|
11791
|
+
(0, import_evalforge_types13.formatTraceEventLine)({
|
|
11773
11792
|
evalRunId: evalRunId2,
|
|
11774
11793
|
scenarioId: scenario.id,
|
|
11775
11794
|
scenarioName: scenario.name,
|
|
11776
11795
|
targetId,
|
|
11777
11796
|
targetName,
|
|
11778
11797
|
stepNumber: 0,
|
|
11779
|
-
type:
|
|
11798
|
+
type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
|
|
11780
11799
|
outputPreview: "Setting up environment (installing dependencies)...",
|
|
11781
11800
|
elapsedMs: 0,
|
|
11782
11801
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -11816,7 +11835,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11816
11835
|
})),
|
|
11817
11836
|
durationMs: partialResult.duration
|
|
11818
11837
|
};
|
|
11819
|
-
const defaultJudgeModel =
|
|
11838
|
+
const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
|
|
11820
11839
|
const assertionContext = {
|
|
11821
11840
|
workDir,
|
|
11822
11841
|
defaultJudgeModel,
|
|
@@ -11831,10 +11850,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11831
11850
|
assertionContext
|
|
11832
11851
|
) : [];
|
|
11833
11852
|
const passed = assertionResults.filter(
|
|
11834
|
-
(r) => r.status ===
|
|
11853
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
|
|
11835
11854
|
).length;
|
|
11836
11855
|
const failed = assertionResults.filter(
|
|
11837
|
-
(r) => r.status ===
|
|
11856
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
|
|
11838
11857
|
).length;
|
|
11839
11858
|
const total = assertionResults.length;
|
|
11840
11859
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -11910,7 +11929,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
11910
11929
|
}
|
|
11911
11930
|
|
|
11912
11931
|
// src/error-reporter.ts
|
|
11913
|
-
var
|
|
11932
|
+
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
11914
11933
|
function formatError(error, phase, context) {
|
|
11915
11934
|
const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
|
|
11916
11935
|
if (error instanceof Error) {
|
|
@@ -12153,7 +12172,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
12153
12172
|
totalExecutions
|
|
12154
12173
|
};
|
|
12155
12174
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
12156
|
-
const finalStatus = allFailed ?
|
|
12175
|
+
const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
|
|
12157
12176
|
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
12158
12177
|
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
12159
12178
|
) : void 0;
|
|
@@ -12207,7 +12226,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12207
12226
|
grpcAuthToken: config.grpcAuthToken
|
|
12208
12227
|
});
|
|
12209
12228
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12210
|
-
status:
|
|
12229
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
12211
12230
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12212
12231
|
jobError,
|
|
12213
12232
|
jobStatus: "FAILED"
|
|
@@ -12232,7 +12251,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12232
12251
|
grpcAuthToken
|
|
12233
12252
|
});
|
|
12234
12253
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12235
|
-
status:
|
|
12254
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
12236
12255
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12237
12256
|
jobError: `Config load failed, then: ${jobError}`,
|
|
12238
12257
|
jobStatus: "FAILED"
|