@wix/evalforge-evaluator 0.182.0 → 0.184.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +229 -227
- package/build/index.js.map +4 -4
- package/build/index.mjs +143 -142
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +23 -1
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +7 -1
- package/build/types/run-scenario/agents/claude-code/write-rules.d.ts +7 -1
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +6 -9
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +4 -5
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +5 -8
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +3 -3
- package/build/types/run-scenario/agents/shared/resolve-capability-content.d.ts +42 -0
- package/build/types/run-scenario/types.d.ts +2 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
|
|
|
5226
5226
|
});
|
|
5227
5227
|
|
|
5228
5228
|
// src/index.ts
|
|
5229
|
-
var
|
|
5229
|
+
var import_evalforge_types16 = require("@wix/evalforge-types");
|
|
5230
5230
|
|
|
5231
5231
|
// src/config.ts
|
|
5232
5232
|
function loadConfig() {
|
|
@@ -7115,7 +7115,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
7115
7115
|
}
|
|
7116
7116
|
|
|
7117
7117
|
// src/run-scenario/index.ts
|
|
7118
|
-
var
|
|
7118
|
+
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
7119
7119
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
7120
7120
|
|
|
7121
7121
|
// src/run-scenario/environment.ts
|
|
@@ -7451,50 +7451,122 @@ function getAdapter(identifier) {
|
|
|
7451
7451
|
}
|
|
7452
7452
|
|
|
7453
7453
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
7454
|
-
var
|
|
7454
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
7455
7455
|
|
|
7456
7456
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7457
|
-
var
|
|
7457
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
7458
7458
|
|
|
7459
7459
|
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7460
7460
|
var import_promises3 = require("fs/promises");
|
|
7461
7461
|
var import_path4 = require("path");
|
|
7462
|
+
|
|
7463
|
+
// src/run-scenario/agents/shared/resolve-capability-content.ts
|
|
7462
7464
|
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
7463
|
-
|
|
7465
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
7466
|
+
var USER_AGENT = "EvalForge-Evaluator";
|
|
7467
|
+
async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7468
|
+
const version = skill.latestVersion;
|
|
7469
|
+
if (version?.files && version.files.length > 0) {
|
|
7470
|
+
console.log(
|
|
7471
|
+
`[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
|
|
7472
|
+
);
|
|
7473
|
+
return version.files;
|
|
7474
|
+
}
|
|
7475
|
+
if (skill.source) {
|
|
7476
|
+
const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
|
|
7477
|
+
console.log(
|
|
7478
|
+
`[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
|
|
7479
|
+
);
|
|
7480
|
+
return files;
|
|
7481
|
+
}
|
|
7482
|
+
throw new Error(`Skill ${skill.name} has no files and no source configured`);
|
|
7483
|
+
}
|
|
7484
|
+
async function fetchSourceFile(label, noun, name, source, fetchFn) {
|
|
7485
|
+
try {
|
|
7486
|
+
const content = await fetchFn(source, { userAgent: USER_AGENT });
|
|
7487
|
+
console.log(
|
|
7488
|
+
`[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
|
|
7489
|
+
);
|
|
7490
|
+
return content;
|
|
7491
|
+
} catch (error) {
|
|
7492
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7493
|
+
console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
|
|
7494
|
+
throw new Error(
|
|
7495
|
+
`Failed to fetch ${noun} "${name}" from GitHub: ${message}`
|
|
7496
|
+
);
|
|
7497
|
+
}
|
|
7498
|
+
}
|
|
7499
|
+
async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7500
|
+
if (agent.source) {
|
|
7501
|
+
return fetchSourceFile(
|
|
7502
|
+
"SubAgents",
|
|
7503
|
+
"sub-agent",
|
|
7504
|
+
agent.name,
|
|
7505
|
+
agent.source,
|
|
7506
|
+
fetchFn
|
|
7507
|
+
);
|
|
7508
|
+
}
|
|
7509
|
+
if (!agent.subAgentMd) {
|
|
7510
|
+
console.warn(
|
|
7511
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7512
|
+
);
|
|
7513
|
+
}
|
|
7514
|
+
return agent.subAgentMd;
|
|
7515
|
+
}
|
|
7516
|
+
async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7517
|
+
if (!rule.source) {
|
|
7518
|
+
return rule.content;
|
|
7519
|
+
}
|
|
7520
|
+
return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
|
|
7521
|
+
}
|
|
7522
|
+
async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7523
|
+
if (!mcp.source) {
|
|
7524
|
+
return mcp.config;
|
|
7525
|
+
}
|
|
7526
|
+
const raw = await fetchSourceFile(
|
|
7527
|
+
"MCP",
|
|
7528
|
+
"MCP",
|
|
7529
|
+
mcp.name,
|
|
7530
|
+
mcp.source,
|
|
7531
|
+
fetchFn
|
|
7532
|
+
);
|
|
7533
|
+
let parsed;
|
|
7534
|
+
try {
|
|
7535
|
+
parsed = JSON.parse(raw);
|
|
7536
|
+
} catch (error) {
|
|
7537
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7538
|
+
throw new Error(
|
|
7539
|
+
`MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
|
|
7540
|
+
);
|
|
7541
|
+
}
|
|
7542
|
+
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
|
|
7543
|
+
throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
|
|
7544
|
+
}
|
|
7545
|
+
const obj = parsed;
|
|
7546
|
+
const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
|
|
7547
|
+
if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
|
|
7548
|
+
return servers;
|
|
7549
|
+
}
|
|
7550
|
+
return obj;
|
|
7551
|
+
}
|
|
7552
|
+
|
|
7553
|
+
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7554
|
+
async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
|
|
7464
7555
|
await Promise.all(
|
|
7465
7556
|
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
7466
7557
|
);
|
|
7467
7558
|
}
|
|
7468
|
-
async function writeSkillToFilesystem(cwd, skill, fetchFn
|
|
7469
|
-
const
|
|
7470
|
-
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
|
|
7559
|
+
async function writeSkillToFilesystem(cwd, skill, fetchFn) {
|
|
7560
|
+
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skill.name);
|
|
7471
7561
|
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
7472
|
-
|
|
7473
|
-
|
|
7474
|
-
await writeFilesToDirectory(skillDir,
|
|
7475
|
-
|
|
7476
|
-
|
|
7562
|
+
try {
|
|
7563
|
+
const files = await resolveSkillFiles(skill, fetchFn);
|
|
7564
|
+
await writeFilesToDirectory(skillDir, files);
|
|
7565
|
+
} catch (error) {
|
|
7566
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7567
|
+
throw new Error(
|
|
7568
|
+
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
7477
7569
|
);
|
|
7478
|
-
} else if (skill.source) {
|
|
7479
|
-
try {
|
|
7480
|
-
const files = await fetchFn(skill.source, {
|
|
7481
|
-
userAgent: "EvalForge-Evaluator"
|
|
7482
|
-
});
|
|
7483
|
-
await writeFilesToDirectory(skillDir, files);
|
|
7484
|
-
console.log(
|
|
7485
|
-
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
7486
|
-
);
|
|
7487
|
-
} catch (error) {
|
|
7488
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7489
|
-
console.error(
|
|
7490
|
-
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
7491
|
-
);
|
|
7492
|
-
throw new Error(
|
|
7493
|
-
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
7494
|
-
);
|
|
7495
|
-
}
|
|
7496
|
-
} else {
|
|
7497
|
-
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
7498
7570
|
}
|
|
7499
7571
|
}
|
|
7500
7572
|
|
|
@@ -7512,7 +7584,7 @@ var import_crypto2 = require("crypto");
|
|
|
7512
7584
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7513
7585
|
var import_promises5 = require("fs/promises");
|
|
7514
7586
|
var import_path6 = require("path");
|
|
7515
|
-
var
|
|
7587
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
7516
7588
|
|
|
7517
7589
|
// src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
|
|
7518
7590
|
var import_promises4 = require("fs/promises");
|
|
@@ -7557,11 +7629,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
|
|
|
7557
7629
|
}
|
|
7558
7630
|
|
|
7559
7631
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7560
|
-
async function writeMcpToFilesystem(cwd, mcps) {
|
|
7632
|
+
async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
|
|
7561
7633
|
if (mcps.length === 0) return;
|
|
7562
7634
|
const mcpServers = {};
|
|
7563
7635
|
for (const mcp of mcps) {
|
|
7564
|
-
const config = mcp
|
|
7636
|
+
const config = await resolveMcpConfig(mcp, fetchFn);
|
|
7565
7637
|
for (const [key, value] of Object.entries(config)) {
|
|
7566
7638
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
7567
7639
|
throw new Error(
|
|
@@ -7573,7 +7645,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
7573
7645
|
}
|
|
7574
7646
|
const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
|
|
7575
7647
|
const content = JSON.stringify(
|
|
7576
|
-
{ [
|
|
7648
|
+
{ [import_evalforge_types3.MCP_SERVERS_JSON_KEY]: resolvedServers },
|
|
7577
7649
|
null,
|
|
7578
7650
|
2
|
|
7579
7651
|
);
|
|
@@ -7585,7 +7657,6 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
7585
7657
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
7586
7658
|
var import_promises6 = require("fs/promises");
|
|
7587
7659
|
var import_path7 = require("path");
|
|
7588
|
-
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
7589
7660
|
var AGENTS_DIR = ".claude/agents";
|
|
7590
7661
|
function toAgentFilename(name, index, nameCount) {
|
|
7591
7662
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -7593,34 +7664,7 @@ function toAgentFilename(name, index, nameCount) {
|
|
|
7593
7664
|
nameCount.set(base, count + 1);
|
|
7594
7665
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
7595
7666
|
}
|
|
7596
|
-
async function
|
|
7597
|
-
if (agent.source) {
|
|
7598
|
-
try {
|
|
7599
|
-
const content = await fetchFn(agent.source, {
|
|
7600
|
-
userAgent: "EvalForge-Evaluator"
|
|
7601
|
-
});
|
|
7602
|
-
console.log(
|
|
7603
|
-
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
7604
|
-
);
|
|
7605
|
-
return content;
|
|
7606
|
-
} catch (error) {
|
|
7607
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7608
|
-
console.error(
|
|
7609
|
-
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
7610
|
-
);
|
|
7611
|
-
throw new Error(
|
|
7612
|
-
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
7613
|
-
);
|
|
7614
|
-
}
|
|
7615
|
-
}
|
|
7616
|
-
if (!agent.subAgentMd) {
|
|
7617
|
-
console.warn(
|
|
7618
|
-
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7619
|
-
);
|
|
7620
|
-
}
|
|
7621
|
-
return agent.subAgentMd;
|
|
7622
|
-
}
|
|
7623
|
-
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
7667
|
+
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
|
|
7624
7668
|
if (subAgents.length === 0) return;
|
|
7625
7669
|
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
7626
7670
|
await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
|
|
@@ -7628,7 +7672,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalf
|
|
|
7628
7672
|
for (const [i, agent] of subAgents.entries()) {
|
|
7629
7673
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
7630
7674
|
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
7631
|
-
const content = await
|
|
7675
|
+
const content = await resolveSubAgentMd(agent, fetchFn);
|
|
7632
7676
|
await (0, import_promises6.writeFile)(filePath, content, "utf8");
|
|
7633
7677
|
}
|
|
7634
7678
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -7678,18 +7722,19 @@ function validateGenericDirectory(dir, cwd) {
|
|
|
7678
7722
|
}
|
|
7679
7723
|
return trimmed;
|
|
7680
7724
|
}
|
|
7681
|
-
async function writeRulesToFilesystem(cwd, rules) {
|
|
7725
|
+
async function writeRulesToFilesystem(cwd, rules, fetchFn) {
|
|
7682
7726
|
if (rules.length === 0) return;
|
|
7683
7727
|
const nameCount = /* @__PURE__ */ new Map();
|
|
7684
7728
|
let hasCursorRules = false;
|
|
7685
7729
|
for (const [i, rule] of rules.entries()) {
|
|
7730
|
+
const content = await resolveRuleText(rule, fetchFn);
|
|
7686
7731
|
switch (rule.ruleType) {
|
|
7687
7732
|
case "claude-md": {
|
|
7688
|
-
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"),
|
|
7733
|
+
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
|
|
7689
7734
|
break;
|
|
7690
7735
|
}
|
|
7691
7736
|
case "agents-md": {
|
|
7692
|
-
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"),
|
|
7737
|
+
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
|
|
7693
7738
|
break;
|
|
7694
7739
|
}
|
|
7695
7740
|
case "cursor-rule": {
|
|
@@ -7699,7 +7744,7 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
7699
7744
|
}
|
|
7700
7745
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7701
7746
|
const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
7702
|
-
await (0, import_promises7.writeFile)(filePath,
|
|
7747
|
+
await (0, import_promises7.writeFile)(filePath, content, "utf8");
|
|
7703
7748
|
break;
|
|
7704
7749
|
}
|
|
7705
7750
|
case "generic": {
|
|
@@ -7710,7 +7755,7 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
7710
7755
|
const dirPath = (0, import_path8.join)(cwd, directory);
|
|
7711
7756
|
await (0, import_promises7.mkdir)(dirPath, { recursive: true });
|
|
7712
7757
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7713
|
-
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`),
|
|
7758
|
+
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
|
|
7714
7759
|
break;
|
|
7715
7760
|
}
|
|
7716
7761
|
default: {
|
|
@@ -7800,14 +7845,14 @@ function buildConversation(timestampedMessages) {
|
|
|
7800
7845
|
}
|
|
7801
7846
|
|
|
7802
7847
|
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7803
|
-
var
|
|
7848
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
7804
7849
|
function emitTraceEvent(event, pushEvent) {
|
|
7805
|
-
console.log(`${
|
|
7850
|
+
console.log(`${import_evalforge_types4.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
7806
7851
|
pushEvent?.(event);
|
|
7807
7852
|
}
|
|
7808
7853
|
|
|
7809
7854
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7810
|
-
var DEFAULT_MODEL =
|
|
7855
|
+
var DEFAULT_MODEL = import_evalforge_types5.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
7811
7856
|
async function* buildPromptStream(triggerPrompt, images) {
|
|
7812
7857
|
yield {
|
|
7813
7858
|
type: "user",
|
|
@@ -7872,7 +7917,7 @@ function extractToolActionDescription(toolName, toolArgs) {
|
|
|
7872
7917
|
return `Using ${toolName}...`;
|
|
7873
7918
|
}
|
|
7874
7919
|
function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
7875
|
-
let type =
|
|
7920
|
+
let type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
|
|
7876
7921
|
let toolName;
|
|
7877
7922
|
let toolArgs;
|
|
7878
7923
|
let outputPreview;
|
|
@@ -7880,28 +7925,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
7880
7925
|
let thinking;
|
|
7881
7926
|
for (const block of message.message.content) {
|
|
7882
7927
|
if (block.type === "tool_use") {
|
|
7883
|
-
type =
|
|
7928
|
+
type = import_evalforge_types5.LiveTraceEventType.TOOL_USE;
|
|
7884
7929
|
toolName = block.name;
|
|
7885
7930
|
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
7886
7931
|
const input = block.input;
|
|
7887
7932
|
if (input.file_path || input.path || input.target_file) {
|
|
7888
7933
|
filePath = String(input.file_path || input.path || input.target_file);
|
|
7889
7934
|
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
7890
|
-
type =
|
|
7935
|
+
type = import_evalforge_types5.LiveTraceEventType.FILE_WRITE;
|
|
7891
7936
|
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
7892
|
-
type =
|
|
7937
|
+
type = import_evalforge_types5.LiveTraceEventType.FILE_READ;
|
|
7893
7938
|
}
|
|
7894
7939
|
}
|
|
7895
7940
|
} else if (block.type === "text") {
|
|
7896
7941
|
outputPreview = block.text.slice(0, 500);
|
|
7897
7942
|
if (!toolName) {
|
|
7898
|
-
type =
|
|
7943
|
+
type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
|
|
7899
7944
|
}
|
|
7900
7945
|
} else if (block.type === "thinking") {
|
|
7901
7946
|
const thinkingBlock = block;
|
|
7902
7947
|
thinking = thinkingBlock.thinking.slice(0, 500);
|
|
7903
7948
|
if (!outputPreview && !toolName) {
|
|
7904
|
-
type =
|
|
7949
|
+
type = import_evalforge_types5.LiveTraceEventType.THINKING;
|
|
7905
7950
|
}
|
|
7906
7951
|
}
|
|
7907
7952
|
}
|
|
@@ -7967,7 +8012,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
7967
8012
|
}
|
|
7968
8013
|
return {
|
|
7969
8014
|
...baseEvent,
|
|
7970
|
-
type:
|
|
8015
|
+
type: import_evalforge_types5.LiveTraceEventType.TOOL_RESULT,
|
|
7971
8016
|
outputPreview: outputPreview || "(tool result)"
|
|
7972
8017
|
};
|
|
7973
8018
|
}
|
|
@@ -7975,7 +8020,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
7975
8020
|
const sysMsg = message;
|
|
7976
8021
|
return {
|
|
7977
8022
|
...baseEvent,
|
|
7978
|
-
type:
|
|
8023
|
+
type: import_evalforge_types5.LiveTraceEventType.SYSTEM,
|
|
7979
8024
|
outputPreview: sysMsg.subtype || "system"
|
|
7980
8025
|
};
|
|
7981
8026
|
}
|
|
@@ -7984,7 +8029,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
7984
8029
|
}
|
|
7985
8030
|
return {
|
|
7986
8031
|
...baseEvent,
|
|
7987
|
-
type:
|
|
8032
|
+
type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
|
|
7988
8033
|
outputPreview: `Message type: ${message.type}`
|
|
7989
8034
|
};
|
|
7990
8035
|
}
|
|
@@ -8086,7 +8131,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8086
8131
|
queryOptions.systemPrompt = {
|
|
8087
8132
|
type: "preset",
|
|
8088
8133
|
preset: "claude_code",
|
|
8089
|
-
append:
|
|
8134
|
+
append: import_evalforge_types5.DEFAULT_EVALUATOR_SYSTEM_PROMPT
|
|
8090
8135
|
};
|
|
8091
8136
|
}
|
|
8092
8137
|
if (options.temperature !== void 0) {
|
|
@@ -8121,7 +8166,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8121
8166
|
targetId: traceContext.targetId,
|
|
8122
8167
|
targetName: traceContext.targetName,
|
|
8123
8168
|
stepNumber: 0,
|
|
8124
|
-
type:
|
|
8169
|
+
type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
|
|
8125
8170
|
outputPreview: JSON.stringify({
|
|
8126
8171
|
event: "pre-sdk-execution",
|
|
8127
8172
|
model: queryOptions.model,
|
|
@@ -8185,7 +8230,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8185
8230
|
targetId: traceContext.targetId,
|
|
8186
8231
|
targetName: traceContext.targetName,
|
|
8187
8232
|
stepNumber: traceStepNumber,
|
|
8188
|
-
type:
|
|
8233
|
+
type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
|
|
8189
8234
|
outputPreview: progressMessage,
|
|
8190
8235
|
toolName: lastToolName,
|
|
8191
8236
|
filePath: lastFilePath,
|
|
@@ -8222,18 +8267,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8222
8267
|
if (traceEvent) {
|
|
8223
8268
|
lastToolName = traceEvent.toolName;
|
|
8224
8269
|
lastFilePath = traceEvent.filePath;
|
|
8225
|
-
if (traceEvent.type ===
|
|
8270
|
+
if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.THINKING) {
|
|
8226
8271
|
lastAction = "Thinking...";
|
|
8227
|
-
} else if (traceEvent.type ===
|
|
8272
|
+
} else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.TOOL_USE) {
|
|
8228
8273
|
lastAction = extractToolActionDescription(
|
|
8229
8274
|
traceEvent.toolName,
|
|
8230
8275
|
traceEvent.toolArgs
|
|
8231
8276
|
);
|
|
8232
|
-
} else if (traceEvent.type ===
|
|
8277
|
+
} else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_WRITE) {
|
|
8233
8278
|
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
8234
|
-
} else if (traceEvent.type ===
|
|
8279
|
+
} else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_READ) {
|
|
8235
8280
|
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
8236
|
-
} else if (traceEvent.type ===
|
|
8281
|
+
} else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.COMPLETION) {
|
|
8237
8282
|
lastAction = "Processing response...";
|
|
8238
8283
|
}
|
|
8239
8284
|
emitTraceEvent(traceEvent, traceContext.pushEvent);
|
|
@@ -8411,7 +8456,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8411
8456
|
targetId: traceContext.targetId,
|
|
8412
8457
|
targetName: traceContext.targetName,
|
|
8413
8458
|
stepNumber: traceStepNumber + 1,
|
|
8414
|
-
type:
|
|
8459
|
+
type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
|
|
8415
8460
|
outputPreview: JSON.stringify(
|
|
8416
8461
|
{
|
|
8417
8462
|
event: "sdk-execution-failed",
|
|
@@ -8445,7 +8490,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
8445
8490
|
targetId: traceContext.targetId,
|
|
8446
8491
|
targetName: traceContext.targetName,
|
|
8447
8492
|
stepNumber: traceStepNumber + 1,
|
|
8448
|
-
type:
|
|
8493
|
+
type: import_evalforge_types5.LiveTraceEventType.COMPLETION,
|
|
8449
8494
|
outputPreview: "Scenario execution completed",
|
|
8450
8495
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8451
8496
|
isComplete: true
|
|
@@ -8625,9 +8670,12 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
8625
8670
|
if (!step.toolCalls) continue;
|
|
8626
8671
|
for (const tc of step.toolCalls) {
|
|
8627
8672
|
if (tc.toolUseId && toolResultErrors.has(tc.toolUseId)) {
|
|
8628
|
-
|
|
8629
|
-
|
|
8630
|
-
|
|
8673
|
+
tc.isError = true;
|
|
8674
|
+
tc.errorContent = toolResultErrors.get(tc.toolUseId);
|
|
8675
|
+
if (!step.hasToolError) {
|
|
8676
|
+
step.hasToolError = true;
|
|
8677
|
+
step.toolErrorContent = tc.errorContent;
|
|
8678
|
+
}
|
|
8631
8679
|
}
|
|
8632
8680
|
}
|
|
8633
8681
|
}
|
|
@@ -8717,7 +8765,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8717
8765
|
stepNumber: 0,
|
|
8718
8766
|
// renumbered below
|
|
8719
8767
|
turnIndex,
|
|
8720
|
-
type:
|
|
8768
|
+
type: import_evalforge_types5.LLMStepType.THINKING,
|
|
8721
8769
|
model,
|
|
8722
8770
|
provider: "anthropic",
|
|
8723
8771
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8731,8 +8779,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8731
8779
|
},
|
|
8732
8780
|
costUsd: stepCost / totalSubSteps,
|
|
8733
8781
|
outputPreview: step.thinking?.slice(0, 200),
|
|
8734
|
-
success:
|
|
8735
|
-
error:
|
|
8782
|
+
success: true,
|
|
8783
|
+
error: void 0
|
|
8736
8784
|
});
|
|
8737
8785
|
}
|
|
8738
8786
|
if (toolCallCount > 0) {
|
|
@@ -8742,11 +8790,13 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8742
8790
|
const toolBudgetSteps = toolSubSteps + textSubSteps;
|
|
8743
8791
|
const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
|
|
8744
8792
|
const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
|
|
8793
|
+
const toolSuccess = !tc.isError;
|
|
8794
|
+
const toolError = tc.isError ? tc.errorContent ?? "Tool call failed" : void 0;
|
|
8745
8795
|
subSteps.push({
|
|
8746
8796
|
id: (0, import_crypto2.randomUUID)(),
|
|
8747
8797
|
stepNumber: 0,
|
|
8748
8798
|
turnIndex,
|
|
8749
|
-
type:
|
|
8799
|
+
type: import_evalforge_types5.LLMStepType.TOOL_USE,
|
|
8750
8800
|
model,
|
|
8751
8801
|
provider: "anthropic",
|
|
8752
8802
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8766,8 +8816,8 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8766
8816
|
toolName: tc.toolName,
|
|
8767
8817
|
toolArguments: JSON.stringify(tc.args),
|
|
8768
8818
|
outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
|
|
8769
|
-
success:
|
|
8770
|
-
error:
|
|
8819
|
+
success: toolSuccess,
|
|
8820
|
+
error: toolError
|
|
8771
8821
|
});
|
|
8772
8822
|
}
|
|
8773
8823
|
}
|
|
@@ -8776,7 +8826,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8776
8826
|
id: (0, import_crypto2.randomUUID)(),
|
|
8777
8827
|
stepNumber: 0,
|
|
8778
8828
|
turnIndex,
|
|
8779
|
-
type:
|
|
8829
|
+
type: import_evalforge_types5.LLMStepType.COMPLETION,
|
|
8780
8830
|
model,
|
|
8781
8831
|
provider: "anthropic",
|
|
8782
8832
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8788,12 +8838,12 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8788
8838
|
},
|
|
8789
8839
|
costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
|
|
8790
8840
|
outputPreview: step.text?.slice(0, 200),
|
|
8791
|
-
success:
|
|
8792
|
-
error:
|
|
8841
|
+
success: true,
|
|
8842
|
+
error: void 0
|
|
8793
8843
|
});
|
|
8794
8844
|
}
|
|
8795
8845
|
if (subSteps.length === 0) {
|
|
8796
|
-
const stepType = hasThinking && !hasText ?
|
|
8846
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types5.LLMStepType.THINKING : import_evalforge_types5.LLMStepType.COMPLETION;
|
|
8797
8847
|
subSteps.push({
|
|
8798
8848
|
id: (0, import_crypto2.randomUUID)(),
|
|
8799
8849
|
stepNumber: 0,
|
|
@@ -8863,7 +8913,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8863
8913
|
var ClaudeCodeAdapter = class {
|
|
8864
8914
|
id = "claude-code";
|
|
8865
8915
|
name = "Claude Code";
|
|
8866
|
-
supportedCommands = [
|
|
8916
|
+
supportedCommands = [import_evalforge_types6.AgentRunCommand.CLAUDE];
|
|
8867
8917
|
/**
|
|
8868
8918
|
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
8869
8919
|
* before the baseline snapshot is taken.
|
|
@@ -8895,9 +8945,9 @@ var ClaudeCodeAdapter = class {
|
|
|
8895
8945
|
rules,
|
|
8896
8946
|
systemPrompt
|
|
8897
8947
|
} = context;
|
|
8898
|
-
const typed = config ?
|
|
8948
|
+
const typed = config ? import_evalforge_types6.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
8899
8949
|
const cfg = typed?.success ? typed.data : void 0;
|
|
8900
|
-
const schemaKeys = new Set(Object.keys(
|
|
8950
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types6.ClaudeCodeConfigSchema.shape));
|
|
8901
8951
|
const extras = {};
|
|
8902
8952
|
if (config) {
|
|
8903
8953
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -8952,11 +9002,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
8952
9002
|
defaultRegistry.register(claudeCodeAdapter);
|
|
8953
9003
|
|
|
8954
9004
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
8955
|
-
var
|
|
9005
|
+
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
8956
9006
|
|
|
8957
9007
|
// src/run-scenario/agents/opencode/execute.ts
|
|
8958
9008
|
var import_child_process2 = require("child_process");
|
|
8959
|
-
var
|
|
9009
|
+
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
8960
9010
|
|
|
8961
9011
|
// src/run-scenario/agents/opencode/types.ts
|
|
8962
9012
|
function tryParseJson(text) {
|
|
@@ -8970,49 +9020,28 @@ function tryParseJson(text) {
|
|
|
8970
9020
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
8971
9021
|
var import_promises8 = require("fs/promises");
|
|
8972
9022
|
var import_path9 = require("path");
|
|
8973
|
-
|
|
8974
|
-
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
|
|
9023
|
+
async function writeSkillsToFilesystem2(cwd, skills, fetchFn) {
|
|
8975
9024
|
await Promise.all(
|
|
8976
9025
|
skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
|
|
8977
9026
|
);
|
|
8978
9027
|
}
|
|
8979
9028
|
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
8980
|
-
const
|
|
8981
|
-
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
|
|
9029
|
+
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skill.name);
|
|
8982
9030
|
await (0, import_promises8.mkdir)(skillDir, { recursive: true });
|
|
8983
|
-
|
|
8984
|
-
|
|
8985
|
-
await writeFilesToDirectory(skillDir,
|
|
8986
|
-
|
|
8987
|
-
|
|
9031
|
+
try {
|
|
9032
|
+
const files = await resolveSkillFiles(skill, fetchFn);
|
|
9033
|
+
await writeFilesToDirectory(skillDir, files);
|
|
9034
|
+
} catch (error) {
|
|
9035
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9036
|
+
throw new Error(
|
|
9037
|
+
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
8988
9038
|
);
|
|
8989
|
-
} else if (skill.source) {
|
|
8990
|
-
try {
|
|
8991
|
-
const files = await fetchFn(skill.source, {
|
|
8992
|
-
userAgent: "EvalForge-Evaluator"
|
|
8993
|
-
});
|
|
8994
|
-
await writeFilesToDirectory(skillDir, files);
|
|
8995
|
-
console.log(
|
|
8996
|
-
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
8997
|
-
);
|
|
8998
|
-
} catch (error) {
|
|
8999
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9000
|
-
console.error(
|
|
9001
|
-
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
9002
|
-
);
|
|
9003
|
-
throw new Error(
|
|
9004
|
-
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
9005
|
-
);
|
|
9006
|
-
}
|
|
9007
|
-
} else {
|
|
9008
|
-
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
9009
9039
|
}
|
|
9010
9040
|
}
|
|
9011
9041
|
|
|
9012
9042
|
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
9013
9043
|
var import_promises9 = require("fs/promises");
|
|
9014
9044
|
var import_path10 = require("path");
|
|
9015
|
-
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
9016
9045
|
var AGENTS_DIR2 = ".opencode/agents";
|
|
9017
9046
|
function toAgentFilename2(name, index, nameCount) {
|
|
9018
9047
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -9020,34 +9049,7 @@ function toAgentFilename2(name, index, nameCount) {
|
|
|
9020
9049
|
nameCount.set(base, count + 1);
|
|
9021
9050
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
9022
9051
|
}
|
|
9023
|
-
async function
|
|
9024
|
-
if (agent.source) {
|
|
9025
|
-
try {
|
|
9026
|
-
const content = await fetchFn(agent.source, {
|
|
9027
|
-
userAgent: "EvalForge-Evaluator"
|
|
9028
|
-
});
|
|
9029
|
-
console.log(
|
|
9030
|
-
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
9031
|
-
);
|
|
9032
|
-
return content;
|
|
9033
|
-
} catch (error) {
|
|
9034
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9035
|
-
console.error(
|
|
9036
|
-
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
9037
|
-
);
|
|
9038
|
-
throw new Error(
|
|
9039
|
-
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
9040
|
-
);
|
|
9041
|
-
}
|
|
9042
|
-
}
|
|
9043
|
-
if (!agent.subAgentMd) {
|
|
9044
|
-
console.warn(
|
|
9045
|
-
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
9046
|
-
);
|
|
9047
|
-
}
|
|
9048
|
-
return agent.subAgentMd;
|
|
9049
|
-
}
|
|
9050
|
-
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
|
|
9052
|
+
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
|
|
9051
9053
|
if (subAgents.length === 0) return;
|
|
9052
9054
|
const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
|
|
9053
9055
|
await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
|
|
@@ -9055,7 +9057,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
|
|
|
9055
9057
|
for (const [i, agent] of subAgents.entries()) {
|
|
9056
9058
|
const filename = toAgentFilename2(agent.name, i, nameCount);
|
|
9057
9059
|
const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
|
|
9058
|
-
const content = await
|
|
9060
|
+
const content = await resolveSubAgentMd(agent, fetchFn);
|
|
9059
9061
|
await (0, import_promises9.writeFile)(filePath, content, "utf8");
|
|
9060
9062
|
}
|
|
9061
9063
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -9063,8 +9065,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
|
|
|
9063
9065
|
|
|
9064
9066
|
// src/run-scenario/agents/opencode/config.ts
|
|
9065
9067
|
var import_os3 = require("os");
|
|
9066
|
-
var
|
|
9067
|
-
var DEFAULT_MODEL2 = `${
|
|
9068
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
9069
|
+
var DEFAULT_MODEL2 = `${import_evalforge_types7.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
9068
9070
|
var OPENCODE_MODEL_ALIASES = {
|
|
9069
9071
|
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
9070
9072
|
"claude-opus-4": "claude-opus-4-0"
|
|
@@ -9080,10 +9082,10 @@ function parseModel(model) {
|
|
|
9080
9082
|
};
|
|
9081
9083
|
}
|
|
9082
9084
|
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
9083
|
-
const isOpenAI =
|
|
9085
|
+
const isOpenAI = import_evalforge_types7.AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
9084
9086
|
model
|
|
9085
9087
|
);
|
|
9086
|
-
const isGemini =
|
|
9088
|
+
const isGemini = import_evalforge_types7.AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
9087
9089
|
model
|
|
9088
9090
|
);
|
|
9089
9091
|
if (isGemini) return { providerID: "google", modelID };
|
|
@@ -9152,7 +9154,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9152
9154
|
if (options.mcps && options.mcps.length > 0) {
|
|
9153
9155
|
const mcpServers = {};
|
|
9154
9156
|
for (const mcpEntity of options.mcps) {
|
|
9155
|
-
const entityConfig = mcpEntity
|
|
9157
|
+
const entityConfig = await resolveMcpConfig(mcpEntity);
|
|
9156
9158
|
for (const [key, value] of Object.entries(entityConfig)) {
|
|
9157
9159
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
9158
9160
|
throw new Error(
|
|
@@ -9177,7 +9179,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9177
9179
|
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
9178
9180
|
agentOverrides.maxSteps = options.maxTurns;
|
|
9179
9181
|
}
|
|
9180
|
-
const parsed = options.config ?
|
|
9182
|
+
const parsed = options.config ? import_evalforge_types7.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
9181
9183
|
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
9182
9184
|
const defaultPermission = {
|
|
9183
9185
|
"*": "allow"
|
|
@@ -9219,7 +9221,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9219
9221
|
}
|
|
9220
9222
|
|
|
9221
9223
|
// src/run-scenario/agents/opencode/build-trace.ts
|
|
9222
|
-
var
|
|
9224
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
9223
9225
|
var import_crypto3 = require("crypto");
|
|
9224
9226
|
function toCanonicalModelId(modelId) {
|
|
9225
9227
|
const slashIndex = modelId.indexOf("/");
|
|
@@ -9299,7 +9301,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9299
9301
|
id: (0, import_crypto3.randomUUID)(),
|
|
9300
9302
|
stepNumber: 0,
|
|
9301
9303
|
turnIndex,
|
|
9302
|
-
type:
|
|
9304
|
+
type: import_evalforge_types8.LLMStepType.THINKING,
|
|
9303
9305
|
model: stepModel,
|
|
9304
9306
|
provider: stepProvider,
|
|
9305
9307
|
startedAt,
|
|
@@ -9328,7 +9330,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9328
9330
|
id: (0, import_crypto3.randomUUID)(),
|
|
9329
9331
|
stepNumber: 0,
|
|
9330
9332
|
turnIndex,
|
|
9331
|
-
type:
|
|
9333
|
+
type: import_evalforge_types8.LLMStepType.TOOL_USE,
|
|
9332
9334
|
model: stepModel,
|
|
9333
9335
|
provider: stepProvider,
|
|
9334
9336
|
startedAt,
|
|
@@ -9358,7 +9360,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9358
9360
|
id: (0, import_crypto3.randomUUID)(),
|
|
9359
9361
|
stepNumber: 0,
|
|
9360
9362
|
turnIndex,
|
|
9361
|
-
type:
|
|
9363
|
+
type: import_evalforge_types8.LLMStepType.COMPLETION,
|
|
9362
9364
|
model: stepModel,
|
|
9363
9365
|
provider: stepProvider,
|
|
9364
9366
|
startedAt,
|
|
@@ -9375,7 +9377,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9375
9377
|
});
|
|
9376
9378
|
}
|
|
9377
9379
|
if (subSteps.length === 0) {
|
|
9378
|
-
const stepType = hasThinking && !hasText ?
|
|
9380
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types8.LLMStepType.THINKING : import_evalforge_types8.LLMStepType.COMPLETION;
|
|
9379
9381
|
subSteps.push({
|
|
9380
9382
|
id: (0, import_crypto3.randomUUID)(),
|
|
9381
9383
|
stepNumber: 0,
|
|
@@ -9576,14 +9578,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9576
9578
|
const te = evt;
|
|
9577
9579
|
return {
|
|
9578
9580
|
...base,
|
|
9579
|
-
type:
|
|
9581
|
+
type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
|
|
9580
9582
|
outputPreview: te.part.text.slice(0, 500)
|
|
9581
9583
|
};
|
|
9582
9584
|
}
|
|
9583
9585
|
case "reasoning":
|
|
9584
9586
|
return {
|
|
9585
9587
|
...base,
|
|
9586
|
-
type:
|
|
9588
|
+
type: import_evalforge_types9.LiveTraceEventType.THINKING,
|
|
9587
9589
|
thinking: evt.part.text.slice(0, 500)
|
|
9588
9590
|
};
|
|
9589
9591
|
case "tool_use": {
|
|
@@ -9591,15 +9593,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9591
9593
|
const toolName = tu.part.tool;
|
|
9592
9594
|
const args = tu.part.state.input;
|
|
9593
9595
|
const toolArgs = JSON.stringify(args).slice(0, 500);
|
|
9594
|
-
let type =
|
|
9596
|
+
let type = import_evalforge_types9.LiveTraceEventType.TOOL_USE;
|
|
9595
9597
|
let filePath;
|
|
9596
9598
|
if (args) {
|
|
9597
9599
|
if (args.file_path || args.path || args.target_file) {
|
|
9598
9600
|
filePath = String(args.file_path || args.path || args.target_file);
|
|
9599
9601
|
if (/write|edit/i.test(toolName)) {
|
|
9600
|
-
type =
|
|
9602
|
+
type = import_evalforge_types9.LiveTraceEventType.FILE_WRITE;
|
|
9601
9603
|
} else if (/read|view/i.test(toolName)) {
|
|
9602
|
-
type =
|
|
9604
|
+
type = import_evalforge_types9.LiveTraceEventType.FILE_READ;
|
|
9603
9605
|
}
|
|
9604
9606
|
}
|
|
9605
9607
|
}
|
|
@@ -9608,7 +9610,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9608
9610
|
case "step_finish":
|
|
9609
9611
|
return {
|
|
9610
9612
|
...base,
|
|
9611
|
-
type:
|
|
9613
|
+
type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
|
|
9612
9614
|
outputPreview: "Step completed"
|
|
9613
9615
|
};
|
|
9614
9616
|
default:
|
|
@@ -9639,7 +9641,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
|
9639
9641
|
} else if (options.systemPrompt != null) {
|
|
9640
9642
|
systemPrompt = options.systemPrompt;
|
|
9641
9643
|
} else {
|
|
9642
|
-
systemPrompt =
|
|
9644
|
+
systemPrompt = import_evalforge_types9.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
|
|
9643
9645
|
}
|
|
9644
9646
|
if (systemPrompt) {
|
|
9645
9647
|
await writeSystemPromptRule(cwd, systemPrompt);
|
|
@@ -9831,7 +9833,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9831
9833
|
targetId: traceContext.targetId,
|
|
9832
9834
|
targetName: traceContext.targetName,
|
|
9833
9835
|
stepNumber: traceStepNumber,
|
|
9834
|
-
type:
|
|
9836
|
+
type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
|
|
9835
9837
|
outputPreview: progressMessage,
|
|
9836
9838
|
toolName: lastToolName,
|
|
9837
9839
|
filePath: lastFilePath,
|
|
@@ -9865,18 +9867,18 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9865
9867
|
if (traceEvt) {
|
|
9866
9868
|
lastToolName = traceEvt.toolName;
|
|
9867
9869
|
lastFilePath = traceEvt.filePath;
|
|
9868
|
-
if (traceEvt.type ===
|
|
9870
|
+
if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.THINKING) {
|
|
9869
9871
|
lastAction = "Thinking...";
|
|
9870
|
-
} else if (traceEvt.type ===
|
|
9872
|
+
} else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.TOOL_USE) {
|
|
9871
9873
|
lastAction = extractToolAction(
|
|
9872
9874
|
traceEvt.toolName ?? "",
|
|
9873
9875
|
void 0
|
|
9874
9876
|
);
|
|
9875
|
-
} else if (traceEvt.type ===
|
|
9877
|
+
} else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_WRITE) {
|
|
9876
9878
|
lastAction = `Writing: ${traceEvt.filePath || "file"}`;
|
|
9877
|
-
} else if (traceEvt.type ===
|
|
9879
|
+
} else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_READ) {
|
|
9878
9880
|
lastAction = `Reading: ${traceEvt.filePath || "file"}`;
|
|
9879
|
-
} else if (traceEvt.type ===
|
|
9881
|
+
} else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.COMPLETION) {
|
|
9880
9882
|
lastAction = "Processing response...";
|
|
9881
9883
|
}
|
|
9882
9884
|
emitTraceEvent(traceEvt, traceContext.pushEvent);
|
|
@@ -9958,7 +9960,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
9958
9960
|
targetId: traceContext.targetId,
|
|
9959
9961
|
targetName: traceContext.targetName,
|
|
9960
9962
|
stepNumber: 0,
|
|
9961
|
-
type:
|
|
9963
|
+
type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
|
|
9962
9964
|
outputPreview: JSON.stringify({
|
|
9963
9965
|
event: "pre-cli-execution",
|
|
9964
9966
|
model: `${providerID}/${modelID}`,
|
|
@@ -10012,7 +10014,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10012
10014
|
targetId: traceContext.targetId,
|
|
10013
10015
|
targetName: traceContext.targetName,
|
|
10014
10016
|
stepNumber: traceStepNumber + 1,
|
|
10015
|
-
type:
|
|
10017
|
+
type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
|
|
10016
10018
|
outputPreview: JSON.stringify({
|
|
10017
10019
|
event: "idle-timeout-retry",
|
|
10018
10020
|
attempt,
|
|
@@ -10056,7 +10058,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10056
10058
|
targetId: traceContext.targetId,
|
|
10057
10059
|
targetName: traceContext.targetName,
|
|
10058
10060
|
stepNumber: traceStepNumber + 1,
|
|
10059
|
-
type:
|
|
10061
|
+
type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
|
|
10060
10062
|
outputPreview: JSON.stringify({
|
|
10061
10063
|
event: "cli-execution-failed",
|
|
10062
10064
|
error: lastAttemptResult.error?.message ?? "Unknown error",
|
|
@@ -10111,7 +10113,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10111
10113
|
targetId: traceContext.targetId,
|
|
10112
10114
|
targetName: traceContext.targetName,
|
|
10113
10115
|
stepNumber: traceStepNumber + 1,
|
|
10114
|
-
type:
|
|
10116
|
+
type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
|
|
10115
10117
|
outputPreview: "Scenario execution completed",
|
|
10116
10118
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10117
10119
|
isComplete: true
|
|
@@ -10148,7 +10150,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10148
10150
|
var OpenCodeAdapter = class {
|
|
10149
10151
|
id = "opencode";
|
|
10150
10152
|
name = "OpenCode";
|
|
10151
|
-
supportedCommands = [
|
|
10153
|
+
supportedCommands = [import_evalforge_types10.AgentRunCommand.OPENCODE];
|
|
10152
10154
|
async prepareEnvironment(context) {
|
|
10153
10155
|
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
10154
10156
|
mcps: context.mcps,
|
|
@@ -10171,7 +10173,7 @@ var OpenCodeAdapter = class {
|
|
|
10171
10173
|
rules,
|
|
10172
10174
|
systemPrompt
|
|
10173
10175
|
} = context;
|
|
10174
|
-
const typed = config ?
|
|
10176
|
+
const typed = config ? import_evalforge_types10.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10175
10177
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10176
10178
|
const rawMaxTurns = cfg?.maxTurns;
|
|
10177
10179
|
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
@@ -10221,7 +10223,7 @@ var import_ai = require("ai");
|
|
|
10221
10223
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
10222
10224
|
var import_google = require("@ai-sdk/google");
|
|
10223
10225
|
var import_openai = require("@ai-sdk/openai");
|
|
10224
|
-
var
|
|
10226
|
+
var import_evalforge_types12 = require("@wix/evalforge-types");
|
|
10225
10227
|
var import_crypto4 = require("crypto");
|
|
10226
10228
|
|
|
10227
10229
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
@@ -10318,7 +10320,7 @@ function extractErrorText(content) {
|
|
|
10318
10320
|
}
|
|
10319
10321
|
|
|
10320
10322
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
10321
|
-
var
|
|
10323
|
+
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
10322
10324
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
10323
10325
|
var PROVIDER_GEMINI = "gemini";
|
|
10324
10326
|
var MODEL_PRICING = {
|
|
@@ -10387,7 +10389,7 @@ function extractGatewayCost(step, provider) {
|
|
|
10387
10389
|
}
|
|
10388
10390
|
}
|
|
10389
10391
|
function calculateFromPricing(modelId, tokenUsage) {
|
|
10390
|
-
const normalized = (0,
|
|
10392
|
+
const normalized = (0, import_evalforge_types11.normalizeModelId)(modelId);
|
|
10391
10393
|
const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
|
|
10392
10394
|
if (!pricing) return 0;
|
|
10393
10395
|
return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
|
|
@@ -10480,7 +10482,7 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10480
10482
|
apiKey: "proxy-auth",
|
|
10481
10483
|
headers
|
|
10482
10484
|
});
|
|
10483
|
-
if ([...
|
|
10485
|
+
if ([...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10484
10486
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10485
10487
|
)) {
|
|
10486
10488
|
return openai.responses(modelId);
|
|
@@ -10488,12 +10490,12 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10488
10490
|
return openai.chat(modelId);
|
|
10489
10491
|
}
|
|
10490
10492
|
function isClaudeModelId(modelId) {
|
|
10491
|
-
return
|
|
10493
|
+
return import_evalforge_types12.AVAILABLE_CLAUDE_MODEL_IDS.some(
|
|
10492
10494
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10493
10495
|
);
|
|
10494
10496
|
}
|
|
10495
10497
|
function isGeminiModelId(modelId) {
|
|
10496
|
-
return
|
|
10498
|
+
return import_evalforge_types12.AVAILABLE_GEMINI_MODEL_IDS.some(
|
|
10497
10499
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10498
10500
|
);
|
|
10499
10501
|
}
|
|
@@ -10513,9 +10515,9 @@ async function executeWithAiSdk(context) {
|
|
|
10513
10515
|
mcps,
|
|
10514
10516
|
traceContext
|
|
10515
10517
|
} = context;
|
|
10516
|
-
const typed = config ?
|
|
10518
|
+
const typed = config ? import_evalforge_types12.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10517
10519
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10518
|
-
const schemaKeys = new Set(Object.keys(
|
|
10520
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types12.SimpleAgentConfigSchema.shape));
|
|
10519
10521
|
const configExtras = {};
|
|
10520
10522
|
if (config) {
|
|
10521
10523
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -10552,11 +10554,11 @@ async function executeWithAiSdk(context) {
|
|
|
10552
10554
|
}, SDK_TIMEOUT_MS);
|
|
10553
10555
|
try {
|
|
10554
10556
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
10555
|
-
const isResponsesAPI = [...
|
|
10557
|
+
const isResponsesAPI = [...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10556
10558
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10557
10559
|
);
|
|
10558
10560
|
const isGemini = provider === PROVIDER_GEMINI2;
|
|
10559
|
-
const isGeminiThinking = isGemini &&
|
|
10561
|
+
const isGeminiThinking = isGemini && import_evalforge_types12.GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
10560
10562
|
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
10561
10563
|
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
10562
10564
|
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
@@ -10635,7 +10637,7 @@ async function executeWithAiSdk(context) {
|
|
|
10635
10637
|
targetId: traceContext.targetId,
|
|
10636
10638
|
targetName: traceContext.targetName,
|
|
10637
10639
|
stepNumber: stepTimestamps.length,
|
|
10638
|
-
type: isToolStep ?
|
|
10640
|
+
type: isToolStep ? import_evalforge_types12.LiveTraceEventType.TOOL_USE : import_evalforge_types12.LiveTraceEventType.COMPLETION,
|
|
10639
10641
|
toolName: firstToolCall?.toolName,
|
|
10640
10642
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
10641
10643
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -10840,7 +10842,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
10840
10842
|
id: (0, import_crypto4.randomUUID)(),
|
|
10841
10843
|
stepNumber: i + 1,
|
|
10842
10844
|
turnIndex: i,
|
|
10843
|
-
type: step.toolCalls.length > 0 ?
|
|
10845
|
+
type: step.toolCalls.length > 0 ? import_evalforge_types12.LLMStepType.TOOL_USE : import_evalforge_types12.LLMStepType.COMPLETION,
|
|
10844
10846
|
model: modelId,
|
|
10845
10847
|
provider,
|
|
10846
10848
|
startedAt: new Date(stepStartedAt).toISOString(),
|
|
@@ -10890,7 +10892,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
10890
10892
|
targetId: traceContext.targetId,
|
|
10891
10893
|
targetName: traceContext.targetName,
|
|
10892
10894
|
stepNumber: 0,
|
|
10893
|
-
type:
|
|
10895
|
+
type: import_evalforge_types12.LiveTraceEventType.PROGRESS,
|
|
10894
10896
|
outputPreview: "Starting Simple Agent execution...",
|
|
10895
10897
|
elapsedMs: Date.now() - startTime,
|
|
10896
10898
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -10908,7 +10910,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
10908
10910
|
targetId: traceContext.targetId,
|
|
10909
10911
|
targetName: traceContext.targetName,
|
|
10910
10912
|
stepNumber,
|
|
10911
|
-
type:
|
|
10913
|
+
type: import_evalforge_types12.LiveTraceEventType.COMPLETION,
|
|
10912
10914
|
outputPreview: "Scenario execution completed",
|
|
10913
10915
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10914
10916
|
isComplete: true
|
|
@@ -11678,11 +11680,11 @@ function substituteVariables(prompt, variables) {
|
|
|
11678
11680
|
}
|
|
11679
11681
|
|
|
11680
11682
|
// src/run-scenario/run-agent-with-context.ts
|
|
11681
|
-
var
|
|
11682
|
-
var DEFAULT_AGENT_COMMAND =
|
|
11683
|
+
var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
11684
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types13.AgentRunCommand.CLAUDE;
|
|
11683
11685
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
|
|
11684
11686
|
const agent = evalData.agent ?? void 0;
|
|
11685
|
-
const isSDK = agent?.agentType ===
|
|
11687
|
+
const isSDK = agent?.agentType === import_evalforge_types13.AgentType.SDK;
|
|
11686
11688
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
11687
11689
|
const adapter = getAdapter(identifier);
|
|
11688
11690
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -11767,14 +11769,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11767
11769
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
11768
11770
|
if (template) {
|
|
11769
11771
|
console.log(
|
|
11770
|
-
(0,
|
|
11772
|
+
(0, import_evalforge_types14.formatTraceEventLine)({
|
|
11771
11773
|
evalRunId: evalRunId2,
|
|
11772
11774
|
scenarioId: scenario.id,
|
|
11773
11775
|
scenarioName: scenario.name,
|
|
11774
11776
|
targetId,
|
|
11775
11777
|
targetName,
|
|
11776
11778
|
stepNumber: 0,
|
|
11777
|
-
type:
|
|
11779
|
+
type: import_evalforge_types14.LiveTraceEventType.PROGRESS,
|
|
11778
11780
|
outputPreview: "Setting up environment (installing dependencies)...",
|
|
11779
11781
|
elapsedMs: 0,
|
|
11780
11782
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -11814,7 +11816,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11814
11816
|
})),
|
|
11815
11817
|
durationMs: partialResult.duration
|
|
11816
11818
|
};
|
|
11817
|
-
const defaultJudgeModel =
|
|
11819
|
+
const defaultJudgeModel = import_evalforge_types14.DEFAULT_JUDGE_MODEL;
|
|
11818
11820
|
const assertionContext = {
|
|
11819
11821
|
workDir,
|
|
11820
11822
|
defaultJudgeModel,
|
|
@@ -11829,10 +11831,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11829
11831
|
assertionContext
|
|
11830
11832
|
) : [];
|
|
11831
11833
|
const passed = assertionResults.filter(
|
|
11832
|
-
(r) => r.status ===
|
|
11834
|
+
(r) => r.status === import_evalforge_types14.AssertionResultStatus.PASSED
|
|
11833
11835
|
).length;
|
|
11834
11836
|
const failed = assertionResults.filter(
|
|
11835
|
-
(r) => r.status ===
|
|
11837
|
+
(r) => r.status === import_evalforge_types14.AssertionResultStatus.FAILED
|
|
11836
11838
|
).length;
|
|
11837
11839
|
const total = assertionResults.length;
|
|
11838
11840
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -11908,7 +11910,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
11908
11910
|
}
|
|
11909
11911
|
|
|
11910
11912
|
// src/error-reporter.ts
|
|
11911
|
-
var
|
|
11913
|
+
var import_evalforge_types15 = require("@wix/evalforge-types");
|
|
11912
11914
|
function formatError(error, phase, context) {
|
|
11913
11915
|
const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
|
|
11914
11916
|
if (error instanceof Error) {
|
|
@@ -12151,7 +12153,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
12151
12153
|
totalExecutions
|
|
12152
12154
|
};
|
|
12153
12155
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
12154
|
-
const finalStatus = allFailed ?
|
|
12156
|
+
const finalStatus = allFailed ? import_evalforge_types16.EvalStatus.FAILED : import_evalforge_types16.EvalStatus.COMPLETED;
|
|
12155
12157
|
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
12156
12158
|
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
12157
12159
|
) : void 0;
|
|
@@ -12205,7 +12207,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12205
12207
|
grpcAuthToken: config.grpcAuthToken
|
|
12206
12208
|
});
|
|
12207
12209
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12208
|
-
status:
|
|
12210
|
+
status: import_evalforge_types16.EvalStatus.FAILED,
|
|
12209
12211
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12210
12212
|
jobError,
|
|
12211
12213
|
jobStatus: "FAILED"
|
|
@@ -12230,7 +12232,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12230
12232
|
grpcAuthToken
|
|
12231
12233
|
});
|
|
12232
12234
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12233
|
-
status:
|
|
12235
|
+
status: import_evalforge_types16.EvalStatus.FAILED,
|
|
12234
12236
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12235
12237
|
jobError: `Config load failed, then: ${jobError}`,
|
|
12236
12238
|
jobStatus: "FAILED"
|