@wix/evalforge-evaluator 0.183.0 → 0.185.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +232 -224
- package/build/index.js.map +4 -4
- package/build/index.mjs +147 -138
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +23 -1
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +1 -7
- package/build/types/run-scenario/agents/claude-code/write-rules.d.ts +1 -7
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +9 -6
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +5 -4
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +8 -5
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +3 -3
- package/build/types/run-scenario/types.d.ts +2 -0
- package/package.json +2 -2
- package/build/types/run-scenario/agents/shared/resolve-capability-content.d.ts +0 -42
package/build/index.js
CHANGED
|
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
|
|
|
5226
5226
|
});
|
|
5227
5227
|
|
|
5228
5228
|
// src/index.ts
|
|
5229
|
-
var
|
|
5229
|
+
var import_evalforge_types15 = require("@wix/evalforge-types");
|
|
5230
5230
|
|
|
5231
5231
|
// src/config.ts
|
|
5232
5232
|
function loadConfig() {
|
|
@@ -7115,7 +7115,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
7115
7115
|
}
|
|
7116
7116
|
|
|
7117
7117
|
// src/run-scenario/index.ts
|
|
7118
|
-
var
|
|
7118
|
+
var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
7119
7119
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
7120
7120
|
|
|
7121
7121
|
// src/run-scenario/environment.ts
|
|
@@ -7451,122 +7451,50 @@ function getAdapter(identifier) {
|
|
|
7451
7451
|
}
|
|
7452
7452
|
|
|
7453
7453
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
7454
|
-
var
|
|
7454
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
7455
7455
|
|
|
7456
7456
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7457
|
-
var
|
|
7457
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
7458
7458
|
|
|
7459
7459
|
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7460
7460
|
var import_promises3 = require("fs/promises");
|
|
7461
7461
|
var import_path4 = require("path");
|
|
7462
|
-
|
|
7463
|
-
// src/run-scenario/agents/shared/resolve-capability-content.ts
|
|
7464
7462
|
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
7465
|
-
|
|
7466
|
-
var USER_AGENT = "EvalForge-Evaluator";
|
|
7467
|
-
async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7468
|
-
const version = skill.latestVersion;
|
|
7469
|
-
if (version?.files && version.files.length > 0) {
|
|
7470
|
-
console.log(
|
|
7471
|
-
`[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
|
|
7472
|
-
);
|
|
7473
|
-
return version.files;
|
|
7474
|
-
}
|
|
7475
|
-
if (skill.source) {
|
|
7476
|
-
const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
|
|
7477
|
-
console.log(
|
|
7478
|
-
`[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
|
|
7479
|
-
);
|
|
7480
|
-
return files;
|
|
7481
|
-
}
|
|
7482
|
-
throw new Error(`Skill ${skill.name} has no files and no source configured`);
|
|
7483
|
-
}
|
|
7484
|
-
async function fetchSourceFile(label, noun, name, source, fetchFn) {
|
|
7485
|
-
try {
|
|
7486
|
-
const content = await fetchFn(source, { userAgent: USER_AGENT });
|
|
7487
|
-
console.log(
|
|
7488
|
-
`[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
|
|
7489
|
-
);
|
|
7490
|
-
return content;
|
|
7491
|
-
} catch (error) {
|
|
7492
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7493
|
-
console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
|
|
7494
|
-
throw new Error(
|
|
7495
|
-
`Failed to fetch ${noun} "${name}" from GitHub: ${message}`
|
|
7496
|
-
);
|
|
7497
|
-
}
|
|
7498
|
-
}
|
|
7499
|
-
async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7500
|
-
if (agent.source) {
|
|
7501
|
-
return fetchSourceFile(
|
|
7502
|
-
"SubAgents",
|
|
7503
|
-
"sub-agent",
|
|
7504
|
-
agent.name,
|
|
7505
|
-
agent.source,
|
|
7506
|
-
fetchFn
|
|
7507
|
-
);
|
|
7508
|
-
}
|
|
7509
|
-
if (!agent.subAgentMd) {
|
|
7510
|
-
console.warn(
|
|
7511
|
-
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7512
|
-
);
|
|
7513
|
-
}
|
|
7514
|
-
return agent.subAgentMd;
|
|
7515
|
-
}
|
|
7516
|
-
async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7517
|
-
if (!rule.source) {
|
|
7518
|
-
return rule.content;
|
|
7519
|
-
}
|
|
7520
|
-
return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
|
|
7521
|
-
}
|
|
7522
|
-
async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7523
|
-
if (!mcp.source) {
|
|
7524
|
-
return mcp.config;
|
|
7525
|
-
}
|
|
7526
|
-
const raw = await fetchSourceFile(
|
|
7527
|
-
"MCP",
|
|
7528
|
-
"MCP",
|
|
7529
|
-
mcp.name,
|
|
7530
|
-
mcp.source,
|
|
7531
|
-
fetchFn
|
|
7532
|
-
);
|
|
7533
|
-
let parsed;
|
|
7534
|
-
try {
|
|
7535
|
-
parsed = JSON.parse(raw);
|
|
7536
|
-
} catch (error) {
|
|
7537
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7538
|
-
throw new Error(
|
|
7539
|
-
`MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
|
|
7540
|
-
);
|
|
7541
|
-
}
|
|
7542
|
-
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
|
|
7543
|
-
throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
|
|
7544
|
-
}
|
|
7545
|
-
const obj = parsed;
|
|
7546
|
-
const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
|
|
7547
|
-
if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
|
|
7548
|
-
return servers;
|
|
7549
|
-
}
|
|
7550
|
-
return obj;
|
|
7551
|
-
}
|
|
7552
|
-
|
|
7553
|
-
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7554
|
-
async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
|
|
7463
|
+
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7555
7464
|
await Promise.all(
|
|
7556
7465
|
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
7557
7466
|
);
|
|
7558
7467
|
}
|
|
7559
|
-
async function writeSkillToFilesystem(cwd, skill, fetchFn) {
|
|
7560
|
-
const
|
|
7468
|
+
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7469
|
+
const skillName = skill.name;
|
|
7470
|
+
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
|
|
7561
7471
|
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
7562
|
-
|
|
7563
|
-
|
|
7564
|
-
await writeFilesToDirectory(skillDir, files);
|
|
7565
|
-
|
|
7566
|
-
|
|
7567
|
-
throw new Error(
|
|
7568
|
-
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
7472
|
+
const version = skill.latestVersion;
|
|
7473
|
+
if (version?.files && version.files.length > 0) {
|
|
7474
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
7475
|
+
console.log(
|
|
7476
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
7569
7477
|
);
|
|
7478
|
+
} else if (skill.source) {
|
|
7479
|
+
try {
|
|
7480
|
+
const files = await fetchFn(skill.source, {
|
|
7481
|
+
userAgent: "EvalForge-Evaluator"
|
|
7482
|
+
});
|
|
7483
|
+
await writeFilesToDirectory(skillDir, files);
|
|
7484
|
+
console.log(
|
|
7485
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
7486
|
+
);
|
|
7487
|
+
} catch (error) {
|
|
7488
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7489
|
+
console.error(
|
|
7490
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
7491
|
+
);
|
|
7492
|
+
throw new Error(
|
|
7493
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
7494
|
+
);
|
|
7495
|
+
}
|
|
7496
|
+
} else {
|
|
7497
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
7570
7498
|
}
|
|
7571
7499
|
}
|
|
7572
7500
|
|
|
@@ -7584,7 +7512,7 @@ var import_crypto2 = require("crypto");
|
|
|
7584
7512
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7585
7513
|
var import_promises5 = require("fs/promises");
|
|
7586
7514
|
var import_path6 = require("path");
|
|
7587
|
-
var
|
|
7515
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
7588
7516
|
|
|
7589
7517
|
// src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
|
|
7590
7518
|
var import_promises4 = require("fs/promises");
|
|
@@ -7629,11 +7557,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
|
|
|
7629
7557
|
}
|
|
7630
7558
|
|
|
7631
7559
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7632
|
-
async function writeMcpToFilesystem(cwd, mcps
|
|
7560
|
+
async function writeMcpToFilesystem(cwd, mcps) {
|
|
7633
7561
|
if (mcps.length === 0) return;
|
|
7634
7562
|
const mcpServers = {};
|
|
7635
7563
|
for (const mcp of mcps) {
|
|
7636
|
-
const config =
|
|
7564
|
+
const config = mcp.config;
|
|
7637
7565
|
for (const [key, value] of Object.entries(config)) {
|
|
7638
7566
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
7639
7567
|
throw new Error(
|
|
@@ -7645,7 +7573,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
|
|
|
7645
7573
|
}
|
|
7646
7574
|
const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
|
|
7647
7575
|
const content = JSON.stringify(
|
|
7648
|
-
{ [
|
|
7576
|
+
{ [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
|
|
7649
7577
|
null,
|
|
7650
7578
|
2
|
|
7651
7579
|
);
|
|
@@ -7657,6 +7585,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
|
|
|
7657
7585
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
7658
7586
|
var import_promises6 = require("fs/promises");
|
|
7659
7587
|
var import_path7 = require("path");
|
|
7588
|
+
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
7660
7589
|
var AGENTS_DIR = ".claude/agents";
|
|
7661
7590
|
function toAgentFilename(name, index, nameCount) {
|
|
7662
7591
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -7664,7 +7593,34 @@ function toAgentFilename(name, index, nameCount) {
|
|
|
7664
7593
|
nameCount.set(base, count + 1);
|
|
7665
7594
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
7666
7595
|
}
|
|
7667
|
-
async function
|
|
7596
|
+
async function resolveSubAgentContent(agent, fetchFn) {
|
|
7597
|
+
if (agent.source) {
|
|
7598
|
+
try {
|
|
7599
|
+
const content = await fetchFn(agent.source, {
|
|
7600
|
+
userAgent: "EvalForge-Evaluator"
|
|
7601
|
+
});
|
|
7602
|
+
console.log(
|
|
7603
|
+
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
7604
|
+
);
|
|
7605
|
+
return content;
|
|
7606
|
+
} catch (error) {
|
|
7607
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7608
|
+
console.error(
|
|
7609
|
+
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
7610
|
+
);
|
|
7611
|
+
throw new Error(
|
|
7612
|
+
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
7613
|
+
);
|
|
7614
|
+
}
|
|
7615
|
+
}
|
|
7616
|
+
if (!agent.subAgentMd) {
|
|
7617
|
+
console.warn(
|
|
7618
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7619
|
+
);
|
|
7620
|
+
}
|
|
7621
|
+
return agent.subAgentMd;
|
|
7622
|
+
}
|
|
7623
|
+
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
7668
7624
|
if (subAgents.length === 0) return;
|
|
7669
7625
|
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
7670
7626
|
await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
|
|
@@ -7672,7 +7628,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
|
|
|
7672
7628
|
for (const [i, agent] of subAgents.entries()) {
|
|
7673
7629
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
7674
7630
|
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
7675
|
-
const content = await
|
|
7631
|
+
const content = await resolveSubAgentContent(agent, fetchFn);
|
|
7676
7632
|
await (0, import_promises6.writeFile)(filePath, content, "utf8");
|
|
7677
7633
|
}
|
|
7678
7634
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -7722,19 +7678,18 @@ function validateGenericDirectory(dir, cwd) {
|
|
|
7722
7678
|
}
|
|
7723
7679
|
return trimmed;
|
|
7724
7680
|
}
|
|
7725
|
-
async function writeRulesToFilesystem(cwd, rules
|
|
7681
|
+
async function writeRulesToFilesystem(cwd, rules) {
|
|
7726
7682
|
if (rules.length === 0) return;
|
|
7727
7683
|
const nameCount = /* @__PURE__ */ new Map();
|
|
7728
7684
|
let hasCursorRules = false;
|
|
7729
7685
|
for (const [i, rule] of rules.entries()) {
|
|
7730
|
-
const content = await resolveRuleText(rule, fetchFn);
|
|
7731
7686
|
switch (rule.ruleType) {
|
|
7732
7687
|
case "claude-md": {
|
|
7733
|
-
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
|
|
7688
|
+
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
|
|
7734
7689
|
break;
|
|
7735
7690
|
}
|
|
7736
7691
|
case "agents-md": {
|
|
7737
|
-
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
|
|
7692
|
+
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
|
|
7738
7693
|
break;
|
|
7739
7694
|
}
|
|
7740
7695
|
case "cursor-rule": {
|
|
@@ -7744,7 +7699,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
|
|
|
7744
7699
|
}
|
|
7745
7700
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7746
7701
|
const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
7747
|
-
await (0, import_promises7.writeFile)(filePath, content, "utf8");
|
|
7702
|
+
await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
|
|
7748
7703
|
break;
|
|
7749
7704
|
}
|
|
7750
7705
|
case "generic": {
|
|
@@ -7755,7 +7710,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
|
|
|
7755
7710
|
const dirPath = (0, import_path8.join)(cwd, directory);
|
|
7756
7711
|
await (0, import_promises7.mkdir)(dirPath, { recursive: true });
|
|
7757
7712
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7758
|
-
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
|
|
7713
|
+
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
|
|
7759
7714
|
break;
|
|
7760
7715
|
}
|
|
7761
7716
|
default: {
|
|
@@ -7845,14 +7800,14 @@ function buildConversation(timestampedMessages) {
|
|
|
7845
7800
|
}
|
|
7846
7801
|
|
|
7847
7802
|
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7848
|
-
var
|
|
7803
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
7849
7804
|
function emitTraceEvent(event, pushEvent) {
|
|
7850
|
-
console.log(`${
|
|
7805
|
+
console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
7851
7806
|
pushEvent?.(event);
|
|
7852
7807
|
}
|
|
7853
7808
|
|
|
7854
7809
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7855
|
-
var DEFAULT_MODEL =
|
|
7810
|
+
var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
7856
7811
|
async function* buildPromptStream(triggerPrompt, images) {
|
|
7857
7812
|
yield {
|
|
7858
7813
|
type: "user",
|
|
@@ -7917,7 +7872,7 @@ function extractToolActionDescription(toolName, toolArgs) {
|
|
|
7917
7872
|
return `Using ${toolName}...`;
|
|
7918
7873
|
}
|
|
7919
7874
|
function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
7920
|
-
let type =
|
|
7875
|
+
let type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
|
|
7921
7876
|
let toolName;
|
|
7922
7877
|
let toolArgs;
|
|
7923
7878
|
let outputPreview;
|
|
@@ -7925,28 +7880,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
7925
7880
|
let thinking;
|
|
7926
7881
|
for (const block of message.message.content) {
|
|
7927
7882
|
if (block.type === "tool_use") {
|
|
7928
|
-
type =
|
|
7883
|
+
type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
|
|
7929
7884
|
toolName = block.name;
|
|
7930
7885
|
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
7931
7886
|
const input = block.input;
|
|
7932
7887
|
if (input.file_path || input.path || input.target_file) {
|
|
7933
7888
|
filePath = String(input.file_path || input.path || input.target_file);
|
|
7934
7889
|
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
7935
|
-
type =
|
|
7890
|
+
type = import_evalforge_types4.LiveTraceEventType.FILE_WRITE;
|
|
7936
7891
|
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
7937
|
-
type =
|
|
7892
|
+
type = import_evalforge_types4.LiveTraceEventType.FILE_READ;
|
|
7938
7893
|
}
|
|
7939
7894
|
}
|
|
7940
7895
|
} else if (block.type === "text") {
|
|
7941
7896
|
outputPreview = block.text.slice(0, 500);
|
|
7942
7897
|
if (!toolName) {
|
|
7943
|
-
type =
|
|
7898
|
+
type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
|
|
7944
7899
|
}
|
|
7945
7900
|
} else if (block.type === "thinking") {
|
|
7946
7901
|
const thinkingBlock = block;
|
|
7947
7902
|
thinking = thinkingBlock.thinking.slice(0, 500);
|
|
7948
7903
|
if (!outputPreview && !toolName) {
|
|
7949
|
-
type =
|
|
7904
|
+
type = import_evalforge_types4.LiveTraceEventType.THINKING;
|
|
7950
7905
|
}
|
|
7951
7906
|
}
|
|
7952
7907
|
}
|
|
@@ -8012,7 +7967,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8012
7967
|
}
|
|
8013
7968
|
return {
|
|
8014
7969
|
...baseEvent,
|
|
8015
|
-
type:
|
|
7970
|
+
type: import_evalforge_types4.LiveTraceEventType.TOOL_RESULT,
|
|
8016
7971
|
outputPreview: outputPreview || "(tool result)"
|
|
8017
7972
|
};
|
|
8018
7973
|
}
|
|
@@ -8020,7 +7975,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8020
7975
|
const sysMsg = message;
|
|
8021
7976
|
return {
|
|
8022
7977
|
...baseEvent,
|
|
8023
|
-
type:
|
|
7978
|
+
type: import_evalforge_types4.LiveTraceEventType.SYSTEM,
|
|
8024
7979
|
outputPreview: sysMsg.subtype || "system"
|
|
8025
7980
|
};
|
|
8026
7981
|
}
|
|
@@ -8029,7 +7984,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8029
7984
|
}
|
|
8030
7985
|
return {
|
|
8031
7986
|
...baseEvent,
|
|
8032
|
-
type:
|
|
7987
|
+
type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
|
|
8033
7988
|
outputPreview: `Message type: ${message.type}`
|
|
8034
7989
|
};
|
|
8035
7990
|
}
|
|
@@ -8131,7 +8086,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8131
8086
|
queryOptions.systemPrompt = {
|
|
8132
8087
|
type: "preset",
|
|
8133
8088
|
preset: "claude_code",
|
|
8134
|
-
append:
|
|
8089
|
+
append: import_evalforge_types4.DEFAULT_EVALUATOR_SYSTEM_PROMPT
|
|
8135
8090
|
};
|
|
8136
8091
|
}
|
|
8137
8092
|
if (options.temperature !== void 0) {
|
|
@@ -8166,7 +8121,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8166
8121
|
targetId: traceContext.targetId,
|
|
8167
8122
|
targetName: traceContext.targetName,
|
|
8168
8123
|
stepNumber: 0,
|
|
8169
|
-
type:
|
|
8124
|
+
type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
|
|
8170
8125
|
outputPreview: JSON.stringify({
|
|
8171
8126
|
event: "pre-sdk-execution",
|
|
8172
8127
|
model: queryOptions.model,
|
|
@@ -8230,7 +8185,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8230
8185
|
targetId: traceContext.targetId,
|
|
8231
8186
|
targetName: traceContext.targetName,
|
|
8232
8187
|
stepNumber: traceStepNumber,
|
|
8233
|
-
type:
|
|
8188
|
+
type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
|
|
8234
8189
|
outputPreview: progressMessage,
|
|
8235
8190
|
toolName: lastToolName,
|
|
8236
8191
|
filePath: lastFilePath,
|
|
@@ -8267,18 +8222,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8267
8222
|
if (traceEvent) {
|
|
8268
8223
|
lastToolName = traceEvent.toolName;
|
|
8269
8224
|
lastFilePath = traceEvent.filePath;
|
|
8270
|
-
if (traceEvent.type ===
|
|
8225
|
+
if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.THINKING) {
|
|
8271
8226
|
lastAction = "Thinking...";
|
|
8272
|
-
} else if (traceEvent.type ===
|
|
8227
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.TOOL_USE) {
|
|
8273
8228
|
lastAction = extractToolActionDescription(
|
|
8274
8229
|
traceEvent.toolName,
|
|
8275
8230
|
traceEvent.toolArgs
|
|
8276
8231
|
);
|
|
8277
|
-
} else if (traceEvent.type ===
|
|
8232
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_WRITE) {
|
|
8278
8233
|
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
8279
|
-
} else if (traceEvent.type ===
|
|
8234
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_READ) {
|
|
8280
8235
|
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
8281
|
-
} else if (traceEvent.type ===
|
|
8236
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.COMPLETION) {
|
|
8282
8237
|
lastAction = "Processing response...";
|
|
8283
8238
|
}
|
|
8284
8239
|
emitTraceEvent(traceEvent, traceContext.pushEvent);
|
|
@@ -8456,7 +8411,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8456
8411
|
targetId: traceContext.targetId,
|
|
8457
8412
|
targetName: traceContext.targetName,
|
|
8458
8413
|
stepNumber: traceStepNumber + 1,
|
|
8459
|
-
type:
|
|
8414
|
+
type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
|
|
8460
8415
|
outputPreview: JSON.stringify(
|
|
8461
8416
|
{
|
|
8462
8417
|
event: "sdk-execution-failed",
|
|
@@ -8490,7 +8445,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
8490
8445
|
targetId: traceContext.targetId,
|
|
8491
8446
|
targetName: traceContext.targetName,
|
|
8492
8447
|
stepNumber: traceStepNumber + 1,
|
|
8493
|
-
type:
|
|
8448
|
+
type: import_evalforge_types4.LiveTraceEventType.COMPLETION,
|
|
8494
8449
|
outputPreview: "Scenario execution completed",
|
|
8495
8450
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8496
8451
|
isComplete: true
|
|
@@ -8670,9 +8625,12 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
8670
8625
|
if (!step.toolCalls) continue;
|
|
8671
8626
|
for (const tc of step.toolCalls) {
|
|
8672
8627
|
if (tc.toolUseId && toolResultErrors.has(tc.toolUseId)) {
|
|
8673
|
-
|
|
8674
|
-
|
|
8675
|
-
|
|
8628
|
+
tc.isError = true;
|
|
8629
|
+
tc.errorContent = toolResultErrors.get(tc.toolUseId);
|
|
8630
|
+
if (!step.hasToolError) {
|
|
8631
|
+
step.hasToolError = true;
|
|
8632
|
+
step.toolErrorContent = tc.errorContent;
|
|
8633
|
+
}
|
|
8676
8634
|
}
|
|
8677
8635
|
}
|
|
8678
8636
|
}
|
|
@@ -8762,7 +8720,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8762
8720
|
stepNumber: 0,
|
|
8763
8721
|
// renumbered below
|
|
8764
8722
|
turnIndex,
|
|
8765
|
-
type:
|
|
8723
|
+
type: import_evalforge_types4.LLMStepType.THINKING,
|
|
8766
8724
|
model,
|
|
8767
8725
|
provider: "anthropic",
|
|
8768
8726
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8776,8 +8734,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8776
8734
|
},
|
|
8777
8735
|
costUsd: stepCost / totalSubSteps,
|
|
8778
8736
|
outputPreview: step.thinking?.slice(0, 200),
|
|
8779
|
-
success:
|
|
8780
|
-
error:
|
|
8737
|
+
success: true,
|
|
8738
|
+
error: void 0
|
|
8781
8739
|
});
|
|
8782
8740
|
}
|
|
8783
8741
|
if (toolCallCount > 0) {
|
|
@@ -8787,11 +8745,13 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8787
8745
|
const toolBudgetSteps = toolSubSteps + textSubSteps;
|
|
8788
8746
|
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
8789
8747
|
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
8748
|
+
const toolSuccess = !tc.isError;
|
|
8749
|
+
const toolError = tc.isError ? tc.errorContent ?? "Tool call failed" : void 0;
|
|
8790
8750
|
subSteps.push({
|
|
8791
8751
|
id: (0, import_crypto2.randomUUID)(),
|
|
8792
8752
|
stepNumber: 0,
|
|
8793
8753
|
turnIndex,
|
|
8794
|
-
type:
|
|
8754
|
+
type: import_evalforge_types4.LLMStepType.TOOL_USE,
|
|
8795
8755
|
model,
|
|
8796
8756
|
provider: "anthropic",
|
|
8797
8757
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8811,8 +8771,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8811
8771
|
toolName: tc.toolName,
|
|
8812
8772
|
toolArguments: JSON.stringify(tc.args),
|
|
8813
8773
|
outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
|
|
8814
|
-
success:
|
|
8815
|
-
error:
|
|
8774
|
+
success: toolSuccess,
|
|
8775
|
+
error: toolError
|
|
8816
8776
|
});
|
|
8817
8777
|
}
|
|
8818
8778
|
}
|
|
@@ -8821,7 +8781,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8821
8781
|
id: (0, import_crypto2.randomUUID)(),
|
|
8822
8782
|
stepNumber: 0,
|
|
8823
8783
|
turnIndex,
|
|
8824
|
-
type:
|
|
8784
|
+
type: import_evalforge_types4.LLMStepType.COMPLETION,
|
|
8825
8785
|
model,
|
|
8826
8786
|
provider: "anthropic",
|
|
8827
8787
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8833,12 +8793,12 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8833
8793
|
},
|
|
8834
8794
|
costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
|
|
8835
8795
|
outputPreview: step.text?.slice(0, 200),
|
|
8836
|
-
success:
|
|
8837
|
-
error:
|
|
8796
|
+
success: true,
|
|
8797
|
+
error: void 0
|
|
8838
8798
|
});
|
|
8839
8799
|
}
|
|
8840
8800
|
if (subSteps.length === 0) {
|
|
8841
|
-
const stepType = hasThinking && !hasText ?
|
|
8801
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
8842
8802
|
subSteps.push({
|
|
8843
8803
|
id: (0, import_crypto2.randomUUID)(),
|
|
8844
8804
|
stepNumber: 0,
|
|
@@ -8908,7 +8868,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8908
8868
|
var ClaudeCodeAdapter = class {
|
|
8909
8869
|
id = "claude-code";
|
|
8910
8870
|
name = "Claude Code";
|
|
8911
|
-
supportedCommands = [
|
|
8871
|
+
supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
|
|
8912
8872
|
/**
|
|
8913
8873
|
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
8914
8874
|
* before the baseline snapshot is taken.
|
|
@@ -8940,9 +8900,9 @@ var ClaudeCodeAdapter = class {
|
|
|
8940
8900
|
rules,
|
|
8941
8901
|
systemPrompt
|
|
8942
8902
|
} = context;
|
|
8943
|
-
const typed = config ?
|
|
8903
|
+
const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
8944
8904
|
const cfg = typed?.success ? typed.data : void 0;
|
|
8945
|
-
const schemaKeys = new Set(Object.keys(
|
|
8905
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
|
|
8946
8906
|
const extras = {};
|
|
8947
8907
|
if (config) {
|
|
8948
8908
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -8997,11 +8957,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
8997
8957
|
defaultRegistry.register(claudeCodeAdapter);
|
|
8998
8958
|
|
|
8999
8959
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
9000
|
-
var
|
|
8960
|
+
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
9001
8961
|
|
|
9002
8962
|
// src/run-scenario/agents/opencode/execute.ts
|
|
9003
8963
|
var import_child_process2 = require("child_process");
|
|
9004
|
-
var
|
|
8964
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
9005
8965
|
|
|
9006
8966
|
// src/run-scenario/agents/opencode/types.ts
|
|
9007
8967
|
function tryParseJson(text) {
|
|
@@ -9015,28 +8975,49 @@ function tryParseJson(text) {
|
|
|
9015
8975
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
9016
8976
|
var import_promises8 = require("fs/promises");
|
|
9017
8977
|
var import_path9 = require("path");
|
|
9018
|
-
|
|
8978
|
+
var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
|
|
8979
|
+
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
|
|
9019
8980
|
await Promise.all(
|
|
9020
8981
|
skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
|
|
9021
8982
|
);
|
|
9022
8983
|
}
|
|
9023
8984
|
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
9024
|
-
const
|
|
8985
|
+
const skillName = skill.name;
|
|
8986
|
+
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
|
|
9025
8987
|
await (0, import_promises8.mkdir)(skillDir, { recursive: true });
|
|
9026
|
-
|
|
9027
|
-
|
|
9028
|
-
await writeFilesToDirectory(skillDir, files);
|
|
9029
|
-
|
|
9030
|
-
|
|
9031
|
-
throw new Error(
|
|
9032
|
-
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
8988
|
+
const version = skill.latestVersion;
|
|
8989
|
+
if (version?.files && version.files.length > 0) {
|
|
8990
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
8991
|
+
console.log(
|
|
8992
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
9033
8993
|
);
|
|
8994
|
+
} else if (skill.source) {
|
|
8995
|
+
try {
|
|
8996
|
+
const files = await fetchFn(skill.source, {
|
|
8997
|
+
userAgent: "EvalForge-Evaluator"
|
|
8998
|
+
});
|
|
8999
|
+
await writeFilesToDirectory(skillDir, files);
|
|
9000
|
+
console.log(
|
|
9001
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
9002
|
+
);
|
|
9003
|
+
} catch (error) {
|
|
9004
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9005
|
+
console.error(
|
|
9006
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
9007
|
+
);
|
|
9008
|
+
throw new Error(
|
|
9009
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
9010
|
+
);
|
|
9011
|
+
}
|
|
9012
|
+
} else {
|
|
9013
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
9034
9014
|
}
|
|
9035
9015
|
}
|
|
9036
9016
|
|
|
9037
9017
|
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
9038
9018
|
var import_promises9 = require("fs/promises");
|
|
9039
9019
|
var import_path10 = require("path");
|
|
9020
|
+
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
9040
9021
|
var AGENTS_DIR2 = ".opencode/agents";
|
|
9041
9022
|
function toAgentFilename2(name, index, nameCount) {
|
|
9042
9023
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -9044,7 +9025,34 @@ function toAgentFilename2(name, index, nameCount) {
|
|
|
9044
9025
|
nameCount.set(base, count + 1);
|
|
9045
9026
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
9046
9027
|
}
|
|
9047
|
-
async function
|
|
9028
|
+
async function resolveSubAgentContent2(agent, fetchFn) {
|
|
9029
|
+
if (agent.source) {
|
|
9030
|
+
try {
|
|
9031
|
+
const content = await fetchFn(agent.source, {
|
|
9032
|
+
userAgent: "EvalForge-Evaluator"
|
|
9033
|
+
});
|
|
9034
|
+
console.log(
|
|
9035
|
+
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
9036
|
+
);
|
|
9037
|
+
return content;
|
|
9038
|
+
} catch (error) {
|
|
9039
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9040
|
+
console.error(
|
|
9041
|
+
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
9042
|
+
);
|
|
9043
|
+
throw new Error(
|
|
9044
|
+
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
9045
|
+
);
|
|
9046
|
+
}
|
|
9047
|
+
}
|
|
9048
|
+
if (!agent.subAgentMd) {
|
|
9049
|
+
console.warn(
|
|
9050
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
9051
|
+
);
|
|
9052
|
+
}
|
|
9053
|
+
return agent.subAgentMd;
|
|
9054
|
+
}
|
|
9055
|
+
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
|
|
9048
9056
|
if (subAgents.length === 0) return;
|
|
9049
9057
|
const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
|
|
9050
9058
|
await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
|
|
@@ -9052,7 +9060,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
|
|
|
9052
9060
|
for (const [i, agent] of subAgents.entries()) {
|
|
9053
9061
|
const filename = toAgentFilename2(agent.name, i, nameCount);
|
|
9054
9062
|
const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
|
|
9055
|
-
const content = await
|
|
9063
|
+
const content = await resolveSubAgentContent2(agent, fetchFn);
|
|
9056
9064
|
await (0, import_promises9.writeFile)(filePath, content, "utf8");
|
|
9057
9065
|
}
|
|
9058
9066
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -9060,8 +9068,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
|
|
|
9060
9068
|
|
|
9061
9069
|
// src/run-scenario/agents/opencode/config.ts
|
|
9062
9070
|
var import_os3 = require("os");
|
|
9063
|
-
var
|
|
9064
|
-
var DEFAULT_MODEL2 = `${
|
|
9071
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
9072
|
+
var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
9065
9073
|
var OPENCODE_MODEL_ALIASES = {
|
|
9066
9074
|
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
9067
9075
|
"claude-opus-4": "claude-opus-4-0"
|
|
@@ -9077,10 +9085,10 @@ function parseModel(model) {
|
|
|
9077
9085
|
};
|
|
9078
9086
|
}
|
|
9079
9087
|
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
9080
|
-
const isOpenAI =
|
|
9088
|
+
const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
9081
9089
|
model
|
|
9082
9090
|
);
|
|
9083
|
-
const isGemini =
|
|
9091
|
+
const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
9084
9092
|
model
|
|
9085
9093
|
);
|
|
9086
9094
|
if (isGemini) return { providerID: "google", modelID };
|
|
@@ -9149,7 +9157,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9149
9157
|
if (options.mcps && options.mcps.length > 0) {
|
|
9150
9158
|
const mcpServers = {};
|
|
9151
9159
|
for (const mcpEntity of options.mcps) {
|
|
9152
|
-
const entityConfig =
|
|
9160
|
+
const entityConfig = mcpEntity.config;
|
|
9153
9161
|
for (const [key, value] of Object.entries(entityConfig)) {
|
|
9154
9162
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
9155
9163
|
throw new Error(
|
|
@@ -9174,7 +9182,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9174
9182
|
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
9175
9183
|
agentOverrides.maxSteps = options.maxTurns;
|
|
9176
9184
|
}
|
|
9177
|
-
const parsed = options.config ?
|
|
9185
|
+
const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
9178
9186
|
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
9179
9187
|
const defaultPermission = {
|
|
9180
9188
|
"*": "allow"
|
|
@@ -9216,7 +9224,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9216
9224
|
}
|
|
9217
9225
|
|
|
9218
9226
|
// src/run-scenario/agents/opencode/build-trace.ts
|
|
9219
|
-
var
|
|
9227
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
9220
9228
|
var import_crypto3 = require("crypto");
|
|
9221
9229
|
function toCanonicalModelId(modelId) {
|
|
9222
9230
|
const slashIndex = modelId.indexOf("/");
|
|
@@ -9296,7 +9304,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9296
9304
|
id: (0, import_crypto3.randomUUID)(),
|
|
9297
9305
|
stepNumber: 0,
|
|
9298
9306
|
turnIndex,
|
|
9299
|
-
type:
|
|
9307
|
+
type: import_evalforge_types7.LLMStepType.THINKING,
|
|
9300
9308
|
model: stepModel,
|
|
9301
9309
|
provider: stepProvider,
|
|
9302
9310
|
startedAt,
|
|
@@ -9325,7 +9333,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9325
9333
|
id: (0, import_crypto3.randomUUID)(),
|
|
9326
9334
|
stepNumber: 0,
|
|
9327
9335
|
turnIndex,
|
|
9328
|
-
type:
|
|
9336
|
+
type: import_evalforge_types7.LLMStepType.TOOL_USE,
|
|
9329
9337
|
model: stepModel,
|
|
9330
9338
|
provider: stepProvider,
|
|
9331
9339
|
startedAt,
|
|
@@ -9355,7 +9363,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9355
9363
|
id: (0, import_crypto3.randomUUID)(),
|
|
9356
9364
|
stepNumber: 0,
|
|
9357
9365
|
turnIndex,
|
|
9358
|
-
type:
|
|
9366
|
+
type: import_evalforge_types7.LLMStepType.COMPLETION,
|
|
9359
9367
|
model: stepModel,
|
|
9360
9368
|
provider: stepProvider,
|
|
9361
9369
|
startedAt,
|
|
@@ -9372,7 +9380,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9372
9380
|
});
|
|
9373
9381
|
}
|
|
9374
9382
|
if (subSteps.length === 0) {
|
|
9375
|
-
const stepType = hasThinking && !hasText ?
|
|
9383
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
|
|
9376
9384
|
subSteps.push({
|
|
9377
9385
|
id: (0, import_crypto3.randomUUID)(),
|
|
9378
9386
|
stepNumber: 0,
|
|
@@ -9573,14 +9581,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9573
9581
|
const te = evt;
|
|
9574
9582
|
return {
|
|
9575
9583
|
...base,
|
|
9576
|
-
type:
|
|
9584
|
+
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
9577
9585
|
outputPreview: te.part.text.slice(0, 500)
|
|
9578
9586
|
};
|
|
9579
9587
|
}
|
|
9580
9588
|
case "reasoning":
|
|
9581
9589
|
return {
|
|
9582
9590
|
...base,
|
|
9583
|
-
type:
|
|
9591
|
+
type: import_evalforge_types8.LiveTraceEventType.THINKING,
|
|
9584
9592
|
thinking: evt.part.text.slice(0, 500)
|
|
9585
9593
|
};
|
|
9586
9594
|
case "tool_use": {
|
|
@@ -9588,15 +9596,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9588
9596
|
const toolName = tu.part.tool;
|
|
9589
9597
|
const args = tu.part.state.input;
|
|
9590
9598
|
const toolArgs = JSON.stringify(args).slice(0, 500);
|
|
9591
|
-
let type =
|
|
9599
|
+
let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
|
|
9592
9600
|
let filePath;
|
|
9593
9601
|
if (args) {
|
|
9594
9602
|
if (args.file_path || args.path || args.target_file) {
|
|
9595
9603
|
filePath = String(args.file_path || args.path || args.target_file);
|
|
9596
9604
|
if (/write|edit/i.test(toolName)) {
|
|
9597
|
-
type =
|
|
9605
|
+
type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
|
|
9598
9606
|
} else if (/read|view/i.test(toolName)) {
|
|
9599
|
-
type =
|
|
9607
|
+
type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
|
|
9600
9608
|
}
|
|
9601
9609
|
}
|
|
9602
9610
|
}
|
|
@@ -9605,7 +9613,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9605
9613
|
case "step_finish":
|
|
9606
9614
|
return {
|
|
9607
9615
|
...base,
|
|
9608
|
-
type:
|
|
9616
|
+
type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
|
|
9609
9617
|
outputPreview: "Step completed"
|
|
9610
9618
|
};
|
|
9611
9619
|
default:
|
|
@@ -9636,7 +9644,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
|
9636
9644
|
} else if (options.systemPrompt != null) {
|
|
9637
9645
|
systemPrompt = options.systemPrompt;
|
|
9638
9646
|
} else {
|
|
9639
|
-
systemPrompt =
|
|
9647
|
+
systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
|
|
9640
9648
|
}
|
|
9641
9649
|
if (systemPrompt) {
|
|
9642
9650
|
await writeSystemPromptRule(cwd, systemPrompt);
|
|
@@ -9828,7 +9836,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9828
9836
|
targetId: traceContext.targetId,
|
|
9829
9837
|
targetName: traceContext.targetName,
|
|
9830
9838
|
stepNumber: traceStepNumber,
|
|
9831
|
-
type:
|
|
9839
|
+
type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
|
|
9832
9840
|
outputPreview: progressMessage,
|
|
9833
9841
|
toolName: lastToolName,
|
|
9834
9842
|
filePath: lastFilePath,
|
|
@@ -9862,18 +9870,18 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9862
9870
|
if (traceEvt) {
|
|
9863
9871
|
lastToolName = traceEvt.toolName;
|
|
9864
9872
|
lastFilePath = traceEvt.filePath;
|
|
9865
|
-
if (traceEvt.type ===
|
|
9873
|
+
if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
|
|
9866
9874
|
lastAction = "Thinking...";
|
|
9867
|
-
} else if (traceEvt.type ===
|
|
9875
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
|
|
9868
9876
|
lastAction = extractToolAction(
|
|
9869
9877
|
traceEvt.toolName ?? "",
|
|
9870
9878
|
void 0
|
|
9871
9879
|
);
|
|
9872
|
-
} else if (traceEvt.type ===
|
|
9880
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
|
|
9873
9881
|
lastAction = `Writing: ${traceEvt.filePath || "file"}`;
|
|
9874
|
-
} else if (traceEvt.type ===
|
|
9882
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
|
|
9875
9883
|
lastAction = `Reading: ${traceEvt.filePath || "file"}`;
|
|
9876
|
-
} else if (traceEvt.type ===
|
|
9884
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
|
|
9877
9885
|
lastAction = "Processing response...";
|
|
9878
9886
|
}
|
|
9879
9887
|
emitTraceEvent(traceEvt, traceContext.pushEvent);
|
|
@@ -9955,7 +9963,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
9955
9963
|
targetId: traceContext.targetId,
|
|
9956
9964
|
targetName: traceContext.targetName,
|
|
9957
9965
|
stepNumber: 0,
|
|
9958
|
-
type:
|
|
9966
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
9959
9967
|
outputPreview: JSON.stringify({
|
|
9960
9968
|
event: "pre-cli-execution",
|
|
9961
9969
|
model: `${providerID}/${modelID}`,
|
|
@@ -10009,7 +10017,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10009
10017
|
targetId: traceContext.targetId,
|
|
10010
10018
|
targetName: traceContext.targetName,
|
|
10011
10019
|
stepNumber: traceStepNumber + 1,
|
|
10012
|
-
type:
|
|
10020
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
10013
10021
|
outputPreview: JSON.stringify({
|
|
10014
10022
|
event: "idle-timeout-retry",
|
|
10015
10023
|
attempt,
|
|
@@ -10053,7 +10061,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10053
10061
|
targetId: traceContext.targetId,
|
|
10054
10062
|
targetName: traceContext.targetName,
|
|
10055
10063
|
stepNumber: traceStepNumber + 1,
|
|
10056
|
-
type:
|
|
10064
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
10057
10065
|
outputPreview: JSON.stringify({
|
|
10058
10066
|
event: "cli-execution-failed",
|
|
10059
10067
|
error: lastAttemptResult.error?.message ?? "Unknown error",
|
|
@@ -10108,7 +10116,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10108
10116
|
targetId: traceContext.targetId,
|
|
10109
10117
|
targetName: traceContext.targetName,
|
|
10110
10118
|
stepNumber: traceStepNumber + 1,
|
|
10111
|
-
type:
|
|
10119
|
+
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
10112
10120
|
outputPreview: "Scenario execution completed",
|
|
10113
10121
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10114
10122
|
isComplete: true
|
|
@@ -10145,7 +10153,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10145
10153
|
var OpenCodeAdapter = class {
|
|
10146
10154
|
id = "opencode";
|
|
10147
10155
|
name = "OpenCode";
|
|
10148
|
-
supportedCommands = [
|
|
10156
|
+
supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
|
|
10149
10157
|
async prepareEnvironment(context) {
|
|
10150
10158
|
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
10151
10159
|
mcps: context.mcps,
|
|
@@ -10168,7 +10176,7 @@ var OpenCodeAdapter = class {
|
|
|
10168
10176
|
rules,
|
|
10169
10177
|
systemPrompt
|
|
10170
10178
|
} = context;
|
|
10171
|
-
const typed = config ?
|
|
10179
|
+
const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10172
10180
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10173
10181
|
const rawMaxTurns = cfg?.maxTurns;
|
|
10174
10182
|
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
@@ -10218,7 +10226,7 @@ var import_ai = require("ai");
|
|
|
10218
10226
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
10219
10227
|
var import_google = require("@ai-sdk/google");
|
|
10220
10228
|
var import_openai = require("@ai-sdk/openai");
|
|
10221
|
-
var
|
|
10229
|
+
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
10222
10230
|
var import_crypto4 = require("crypto");
|
|
10223
10231
|
|
|
10224
10232
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
@@ -10315,7 +10323,7 @@ function extractErrorText(content) {
|
|
|
10315
10323
|
}
|
|
10316
10324
|
|
|
10317
10325
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
10318
|
-
var
|
|
10326
|
+
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
10319
10327
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
10320
10328
|
var PROVIDER_GEMINI = "gemini";
|
|
10321
10329
|
var MODEL_PRICING = {
|
|
@@ -10384,7 +10392,7 @@ function extractGatewayCost(step, provider) {
|
|
|
10384
10392
|
}
|
|
10385
10393
|
}
|
|
10386
10394
|
function calculateFromPricing(modelId, tokenUsage) {
|
|
10387
|
-
const normalized = (0,
|
|
10395
|
+
const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
|
|
10388
10396
|
const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
|
|
10389
10397
|
if (!pricing) return 0;
|
|
10390
10398
|
return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
|
|
@@ -10477,7 +10485,7 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10477
10485
|
apiKey: "proxy-auth",
|
|
10478
10486
|
headers
|
|
10479
10487
|
});
|
|
10480
|
-
if ([...
|
|
10488
|
+
if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10481
10489
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10482
10490
|
)) {
|
|
10483
10491
|
return openai.responses(modelId);
|
|
@@ -10485,12 +10493,12 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10485
10493
|
return openai.chat(modelId);
|
|
10486
10494
|
}
|
|
10487
10495
|
function isClaudeModelId(modelId) {
|
|
10488
|
-
return
|
|
10496
|
+
return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
|
|
10489
10497
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10490
10498
|
);
|
|
10491
10499
|
}
|
|
10492
10500
|
function isGeminiModelId(modelId) {
|
|
10493
|
-
return
|
|
10501
|
+
return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
|
|
10494
10502
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10495
10503
|
);
|
|
10496
10504
|
}
|
|
@@ -10510,9 +10518,9 @@ async function executeWithAiSdk(context) {
|
|
|
10510
10518
|
mcps,
|
|
10511
10519
|
traceContext
|
|
10512
10520
|
} = context;
|
|
10513
|
-
const typed = config ?
|
|
10521
|
+
const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10514
10522
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10515
|
-
const schemaKeys = new Set(Object.keys(
|
|
10523
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
|
|
10516
10524
|
const configExtras = {};
|
|
10517
10525
|
if (config) {
|
|
10518
10526
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -10549,11 +10557,11 @@ async function executeWithAiSdk(context) {
|
|
|
10549
10557
|
}, SDK_TIMEOUT_MS);
|
|
10550
10558
|
try {
|
|
10551
10559
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
10552
|
-
const isResponsesAPI = [...
|
|
10560
|
+
const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10553
10561
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10554
10562
|
);
|
|
10555
10563
|
const isGemini = provider === PROVIDER_GEMINI2;
|
|
10556
|
-
const isGeminiThinking = isGemini &&
|
|
10564
|
+
const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
10557
10565
|
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
10558
10566
|
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
10559
10567
|
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
@@ -10632,7 +10640,7 @@ async function executeWithAiSdk(context) {
|
|
|
10632
10640
|
targetId: traceContext.targetId,
|
|
10633
10641
|
targetName: traceContext.targetName,
|
|
10634
10642
|
stepNumber: stepTimestamps.length,
|
|
10635
|
-
type: isToolStep ?
|
|
10643
|
+
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
10636
10644
|
toolName: firstToolCall?.toolName,
|
|
10637
10645
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
10638
10646
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -10837,7 +10845,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
10837
10845
|
id: (0, import_crypto4.randomUUID)(),
|
|
10838
10846
|
stepNumber: i + 1,
|
|
10839
10847
|
turnIndex: i,
|
|
10840
|
-
type: step.toolCalls.length > 0 ?
|
|
10848
|
+
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
10841
10849
|
model: modelId,
|
|
10842
10850
|
provider,
|
|
10843
10851
|
startedAt: new Date(stepStartedAt).toISOString(),
|
|
@@ -10887,7 +10895,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
10887
10895
|
targetId: traceContext.targetId,
|
|
10888
10896
|
targetName: traceContext.targetName,
|
|
10889
10897
|
stepNumber: 0,
|
|
10890
|
-
type:
|
|
10898
|
+
type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
|
|
10891
10899
|
outputPreview: "Starting Simple Agent execution...",
|
|
10892
10900
|
elapsedMs: Date.now() - startTime,
|
|
10893
10901
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -10905,7 +10913,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
10905
10913
|
targetId: traceContext.targetId,
|
|
10906
10914
|
targetName: traceContext.targetName,
|
|
10907
10915
|
stepNumber,
|
|
10908
|
-
type:
|
|
10916
|
+
type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
10909
10917
|
outputPreview: "Scenario execution completed",
|
|
10910
10918
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10911
10919
|
isComplete: true
|
|
@@ -11675,11 +11683,11 @@ function substituteVariables(prompt, variables) {
|
|
|
11675
11683
|
}
|
|
11676
11684
|
|
|
11677
11685
|
// src/run-scenario/run-agent-with-context.ts
|
|
11678
|
-
var
|
|
11679
|
-
var DEFAULT_AGENT_COMMAND =
|
|
11686
|
+
var import_evalforge_types12 = require("@wix/evalforge-types");
|
|
11687
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
|
|
11680
11688
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
|
|
11681
11689
|
const agent = evalData.agent ?? void 0;
|
|
11682
|
-
const isSDK = agent?.agentType ===
|
|
11690
|
+
const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
|
|
11683
11691
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
11684
11692
|
const adapter = getAdapter(identifier);
|
|
11685
11693
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -11764,14 +11772,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11764
11772
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
11765
11773
|
if (template) {
|
|
11766
11774
|
console.log(
|
|
11767
|
-
(0,
|
|
11775
|
+
(0, import_evalforge_types13.formatTraceEventLine)({
|
|
11768
11776
|
evalRunId: evalRunId2,
|
|
11769
11777
|
scenarioId: scenario.id,
|
|
11770
11778
|
scenarioName: scenario.name,
|
|
11771
11779
|
targetId,
|
|
11772
11780
|
targetName,
|
|
11773
11781
|
stepNumber: 0,
|
|
11774
|
-
type:
|
|
11782
|
+
type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
|
|
11775
11783
|
outputPreview: "Setting up environment (installing dependencies)...",
|
|
11776
11784
|
elapsedMs: 0,
|
|
11777
11785
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -11811,7 +11819,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11811
11819
|
})),
|
|
11812
11820
|
durationMs: partialResult.duration
|
|
11813
11821
|
};
|
|
11814
|
-
const defaultJudgeModel =
|
|
11822
|
+
const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
|
|
11815
11823
|
const assertionContext = {
|
|
11816
11824
|
workDir,
|
|
11817
11825
|
defaultJudgeModel,
|
|
@@ -11826,10 +11834,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11826
11834
|
assertionContext
|
|
11827
11835
|
) : [];
|
|
11828
11836
|
const passed = assertionResults.filter(
|
|
11829
|
-
(r) => r.status ===
|
|
11837
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
|
|
11830
11838
|
).length;
|
|
11831
11839
|
const failed = assertionResults.filter(
|
|
11832
|
-
(r) => r.status ===
|
|
11840
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
|
|
11833
11841
|
).length;
|
|
11834
11842
|
const total = assertionResults.length;
|
|
11835
11843
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -11905,7 +11913,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
11905
11913
|
}
|
|
11906
11914
|
|
|
11907
11915
|
// src/error-reporter.ts
|
|
11908
|
-
var
|
|
11916
|
+
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
11909
11917
|
function formatError(error, phase, context) {
|
|
11910
11918
|
const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
|
|
11911
11919
|
if (error instanceof Error) {
|
|
@@ -12148,7 +12156,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
12148
12156
|
totalExecutions
|
|
12149
12157
|
};
|
|
12150
12158
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
12151
|
-
const finalStatus = allFailed ?
|
|
12159
|
+
const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
|
|
12152
12160
|
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
12153
12161
|
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
12154
12162
|
) : void 0;
|
|
@@ -12202,7 +12210,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12202
12210
|
grpcAuthToken: config.grpcAuthToken
|
|
12203
12211
|
});
|
|
12204
12212
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12205
|
-
status:
|
|
12213
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
12206
12214
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12207
12215
|
jobError,
|
|
12208
12216
|
jobStatus: "FAILED"
|
|
@@ -12227,7 +12235,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12227
12235
|
grpcAuthToken
|
|
12228
12236
|
});
|
|
12229
12237
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12230
|
-
status:
|
|
12238
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
12231
12239
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12232
12240
|
jobError: `Config load failed, then: ${jobError}`,
|
|
12233
12241
|
jobStatus: "FAILED"
|