@wix/evalforge-evaluator 0.181.0 → 0.183.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +215 -218
- package/build/index.js.map +4 -4
- package/build/index.mjs +129 -133
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +7 -1
- package/build/types/run-scenario/agents/claude-code/write-rules.d.ts +7 -1
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +6 -9
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +4 -5
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +5 -8
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +3 -3
- package/build/types/run-scenario/agents/shared/resolve-capability-content.d.ts +42 -0
- package/package.json +3 -3
package/build/index.js
CHANGED
|
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
|
|
|
5226
5226
|
});
|
|
5227
5227
|
|
|
5228
5228
|
// src/index.ts
|
|
5229
|
-
var
|
|
5229
|
+
var import_evalforge_types16 = require("@wix/evalforge-types");
|
|
5230
5230
|
|
|
5231
5231
|
// src/config.ts
|
|
5232
5232
|
function loadConfig() {
|
|
@@ -7115,7 +7115,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
7115
7115
|
}
|
|
7116
7116
|
|
|
7117
7117
|
// src/run-scenario/index.ts
|
|
7118
|
-
var
|
|
7118
|
+
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
7119
7119
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
7120
7120
|
|
|
7121
7121
|
// src/run-scenario/environment.ts
|
|
@@ -7451,50 +7451,122 @@ function getAdapter(identifier) {
|
|
|
7451
7451
|
}
|
|
7452
7452
|
|
|
7453
7453
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
7454
|
-
var
|
|
7454
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
7455
7455
|
|
|
7456
7456
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7457
|
-
var
|
|
7457
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
7458
7458
|
|
|
7459
7459
|
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7460
7460
|
var import_promises3 = require("fs/promises");
|
|
7461
7461
|
var import_path4 = require("path");
|
|
7462
|
+
|
|
7463
|
+
// src/run-scenario/agents/shared/resolve-capability-content.ts
|
|
7462
7464
|
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
7463
|
-
|
|
7465
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
7466
|
+
var USER_AGENT = "EvalForge-Evaluator";
|
|
7467
|
+
/**
 * Resolves the list of files that make up a skill.
 *
 * Files attached to `skill.latestVersion` win when that list is non-empty;
 * otherwise the files are fetched from `skill.source` through `fetchFn`.
 *
 * @param skill - Skill entity; reads `name`, `latestVersion.files` and `source`.
 * @param fetchFn - Fetcher invoked as `fetchFn(source, { userAgent })`;
 *   defaults to `fetchGitHubFolder` from `@wix/evalforge-github-client`.
 * @returns The snapshot files, or whatever `fetchFn` resolved with.
 * @throws {Error} When the skill has neither snapshot files nor a source.
 */
async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
  const snapshotFiles = skill.latestVersion?.files;
  if (snapshotFiles && snapshotFiles.length > 0) {
    console.log(
      `[Skill] ${skill.name}: using ${snapshotFiles.length} file(s) from snapshot`
    );
    return snapshotFiles;
  }

  // Nothing in the snapshot and nowhere to fetch from: nothing can be written.
  if (!skill.source) {
    throw new Error(`Skill ${skill.name} has no files and no source configured`);
  }

  const fetchedFiles = await fetchFn(skill.source, { userAgent: USER_AGENT });
  console.log(
    `[Skill] ${skill.name}: fetched ${fetchedFiles.length} file(s) from GitHub (live)`
  );
  return fetchedFiles;
}
|
|
7484
|
+
/**
 * Fetches a single capability file from its source and logs the outcome.
 *
 * @param label - Log prefix placed in square brackets (e.g. "Rules").
 * @param noun - Human-readable capability kind used in the thrown message.
 * @param name - Capability name used in log and error messages.
 * @param source - Source descriptor; `owner`, `repo`, `path` and `ref` are
 *   read for the success log line.
 * @param fetchFn - Fetcher invoked as `fetchFn(source, { userAgent })`.
 * @returns Whatever `fetchFn` resolved with.
 * @throws {Error} Wrapping any failure; the original error is kept on `cause`.
 */
async function fetchSourceFile(label, noun, name, source, fetchFn) {
  try {
    const content = await fetchFn(source, { userAgent: USER_AGENT });
    console.log(
      `[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
    );
    return content;
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
    // Keep the original error (and its stack) reachable for callers and logs.
    throw new Error(
      `Failed to fetch ${noun} "${name}" from GitHub: ${message}`,
      { cause: error }
    );
  }
}
|
|
7499
|
+
/**
 * Resolves the markdown body for a sub-agent.
 *
 * When `agent.source` is set the content is fetched through
 * `fetchSourceFile`; otherwise the inline `agent.subAgentMd` is used.
 *
 * @param agent - Sub-agent entity; reads `name`, `source` and `subAgentMd`.
 * @param fetchFn - Fetcher forwarded to `fetchSourceFile`; defaults to
 *   `fetchGitHubFile` from `@wix/evalforge-github-client`.
 * @returns The fetched content, the inline markdown, or `""` when the inline
 *   markdown is missing or empty.
 * @throws {Error} Propagated from `fetchSourceFile` when the fetch fails.
 */
async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
  if (agent.source) {
    return fetchSourceFile(
      "SubAgents",
      "sub-agent",
      agent.name,
      agent.source,
      fetchFn
    );
  }
  if (!agent.subAgentMd) {
    console.warn(
      `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
    );
    // Callers hand the result straight to writeFile; return a real empty
    // string so a missing value yields the blank file the warning promises.
    return "";
  }
  return agent.subAgentMd;
}
|
|
7516
|
+
/**
 * Resolves the text of a rule.
 *
 * @param rule - Rule entity; reads `name`, `source` and `content`.
 * @param fetchFn - Fetcher forwarded to `fetchSourceFile`; defaults to
 *   `fetchGitHubFile` from `@wix/evalforge-github-client`.
 * @returns `rule.content` when the rule has no source, otherwise the content
 *   fetched through `fetchSourceFile`.
 */
async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
  const { source } = rule;
  return source
    ? fetchSourceFile("Rules", "rule", rule.name, source, fetchFn)
    : rule.content;
}
|
|
7522
|
+
/**
 * Resolves the server configuration object for an MCP entity.
 *
 * Without a source, the inline `mcp.config` is returned untouched. With a
 * source, the file is fetched through `fetchSourceFile`, parsed as JSON and
 * validated to be a plain object. If that object has a plain-object value
 * under `MCP_SERVERS_JSON_KEY`, that inner object is returned; otherwise the
 * parsed object itself is returned.
 *
 * @param mcp - MCP entity; reads `name`, `source` and `config`.
 * @param fetchFn - Fetcher forwarded to `fetchSourceFile`; defaults to
 *   `fetchGitHubFile` from `@wix/evalforge-github-client`.
 * @returns The resolved configuration object.
 * @throws {Error} When the fetched text is not valid JSON (the parse error is
 *   kept on `cause`) or is not a JSON object.
 */
async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
  if (!mcp.source) {
    return mcp.config;
  }
  const raw = await fetchSourceFile(
    "MCP",
    "MCP",
    mcp.name,
    mcp.source,
    fetchFn
  );

  // Plain JSON object: not null, not an array.
  const isPlainObject = (value) => typeof value === "object" && value !== null && !Array.isArray(value);

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    // Preserve the original parse error for debugging.
    throw new Error(
      `MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`,
      { cause: error }
    );
  }
  if (!isPlainObject(parsed)) {
    throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
  }

  // Accept both a wrapped document ({ <key>: {...servers} }) and a bare map.
  const servers = parsed[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
  return isPlainObject(servers) ? servers : parsed;
}
|
|
7552
|
+
|
|
7553
|
+
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7554
|
+
/**
 * Writes every skill to the filesystem concurrently.
 *
 * @param cwd - Directory passed through to `writeSkillToFilesystem`.
 * @param skills - Skills to write; one `writeSkillToFilesystem` call each.
 * @param fetchFn - Fetcher passed through to `writeSkillToFilesystem`.
 * @returns Resolves once all writes have finished; rejects if any write fails.
 */
async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
  // Start all writes first, then wait for the whole batch.
  const pendingWrites = skills.map(
    (skill) => writeSkillToFilesystem(cwd, skill, fetchFn)
  );
  await Promise.all(pendingWrites);
}
|
|
7468
|
-
async function writeSkillToFilesystem(cwd, skill, fetchFn
|
|
7469
|
-
const
|
|
7470
|
-
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
|
|
7559
|
+
async function writeSkillToFilesystem(cwd, skill, fetchFn) {
|
|
7560
|
+
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skill.name);
|
|
7471
7561
|
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
7472
|
-
|
|
7473
|
-
|
|
7474
|
-
await writeFilesToDirectory(skillDir,
|
|
7475
|
-
|
|
7476
|
-
|
|
7562
|
+
try {
|
|
7563
|
+
const files = await resolveSkillFiles(skill, fetchFn);
|
|
7564
|
+
await writeFilesToDirectory(skillDir, files);
|
|
7565
|
+
} catch (error) {
|
|
7566
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7567
|
+
throw new Error(
|
|
7568
|
+
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
7477
7569
|
);
|
|
7478
|
-
} else if (skill.source) {
|
|
7479
|
-
try {
|
|
7480
|
-
const files = await fetchFn(skill.source, {
|
|
7481
|
-
userAgent: "EvalForge-Evaluator"
|
|
7482
|
-
});
|
|
7483
|
-
await writeFilesToDirectory(skillDir, files);
|
|
7484
|
-
console.log(
|
|
7485
|
-
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
7486
|
-
);
|
|
7487
|
-
} catch (error) {
|
|
7488
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7489
|
-
console.error(
|
|
7490
|
-
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
7491
|
-
);
|
|
7492
|
-
throw new Error(
|
|
7493
|
-
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
7494
|
-
);
|
|
7495
|
-
}
|
|
7496
|
-
} else {
|
|
7497
|
-
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
7498
7570
|
}
|
|
7499
7571
|
}
|
|
7500
7572
|
|
|
@@ -7512,7 +7584,7 @@ var import_crypto2 = require("crypto");
|
|
|
7512
7584
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7513
7585
|
var import_promises5 = require("fs/promises");
|
|
7514
7586
|
var import_path6 = require("path");
|
|
7515
|
-
var
|
|
7587
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
7516
7588
|
|
|
7517
7589
|
// src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
|
|
7518
7590
|
var import_promises4 = require("fs/promises");
|
|
@@ -7557,11 +7629,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
|
|
|
7557
7629
|
}
|
|
7558
7630
|
|
|
7559
7631
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7560
|
-
async function writeMcpToFilesystem(cwd, mcps) {
|
|
7632
|
+
async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
|
|
7561
7633
|
if (mcps.length === 0) return;
|
|
7562
7634
|
const mcpServers = {};
|
|
7563
7635
|
for (const mcp of mcps) {
|
|
7564
|
-
const config = mcp
|
|
7636
|
+
const config = await resolveMcpConfig(mcp, fetchFn);
|
|
7565
7637
|
for (const [key, value] of Object.entries(config)) {
|
|
7566
7638
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
7567
7639
|
throw new Error(
|
|
@@ -7573,7 +7645,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
7573
7645
|
}
|
|
7574
7646
|
const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
|
|
7575
7647
|
const content = JSON.stringify(
|
|
7576
|
-
{ [
|
|
7648
|
+
{ [import_evalforge_types3.MCP_SERVERS_JSON_KEY]: resolvedServers },
|
|
7577
7649
|
null,
|
|
7578
7650
|
2
|
|
7579
7651
|
);
|
|
@@ -7585,7 +7657,6 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
7585
7657
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
7586
7658
|
var import_promises6 = require("fs/promises");
|
|
7587
7659
|
var import_path7 = require("path");
|
|
7588
|
-
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
7589
7660
|
var AGENTS_DIR = ".claude/agents";
|
|
7590
7661
|
function toAgentFilename(name, index, nameCount) {
|
|
7591
7662
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -7593,34 +7664,7 @@ function toAgentFilename(name, index, nameCount) {
|
|
|
7593
7664
|
nameCount.set(base, count + 1);
|
|
7594
7665
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
7595
7666
|
}
|
|
7596
|
-
async function
|
|
7597
|
-
if (agent.source) {
|
|
7598
|
-
try {
|
|
7599
|
-
const content = await fetchFn(agent.source, {
|
|
7600
|
-
userAgent: "EvalForge-Evaluator"
|
|
7601
|
-
});
|
|
7602
|
-
console.log(
|
|
7603
|
-
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
7604
|
-
);
|
|
7605
|
-
return content;
|
|
7606
|
-
} catch (error) {
|
|
7607
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7608
|
-
console.error(
|
|
7609
|
-
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
7610
|
-
);
|
|
7611
|
-
throw new Error(
|
|
7612
|
-
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
7613
|
-
);
|
|
7614
|
-
}
|
|
7615
|
-
}
|
|
7616
|
-
if (!agent.subAgentMd) {
|
|
7617
|
-
console.warn(
|
|
7618
|
-
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7619
|
-
);
|
|
7620
|
-
}
|
|
7621
|
-
return agent.subAgentMd;
|
|
7622
|
-
}
|
|
7623
|
-
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
7667
|
+
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
|
|
7624
7668
|
if (subAgents.length === 0) return;
|
|
7625
7669
|
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
7626
7670
|
await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
|
|
@@ -7628,7 +7672,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalf
|
|
|
7628
7672
|
for (const [i, agent] of subAgents.entries()) {
|
|
7629
7673
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
7630
7674
|
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
7631
|
-
const content = await
|
|
7675
|
+
const content = await resolveSubAgentMd(agent, fetchFn);
|
|
7632
7676
|
await (0, import_promises6.writeFile)(filePath, content, "utf8");
|
|
7633
7677
|
}
|
|
7634
7678
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -7678,18 +7722,19 @@ function validateGenericDirectory(dir, cwd) {
|
|
|
7678
7722
|
}
|
|
7679
7723
|
return trimmed;
|
|
7680
7724
|
}
|
|
7681
|
-
async function writeRulesToFilesystem(cwd, rules) {
|
|
7725
|
+
async function writeRulesToFilesystem(cwd, rules, fetchFn) {
|
|
7682
7726
|
if (rules.length === 0) return;
|
|
7683
7727
|
const nameCount = /* @__PURE__ */ new Map();
|
|
7684
7728
|
let hasCursorRules = false;
|
|
7685
7729
|
for (const [i, rule] of rules.entries()) {
|
|
7730
|
+
const content = await resolveRuleText(rule, fetchFn);
|
|
7686
7731
|
switch (rule.ruleType) {
|
|
7687
7732
|
case "claude-md": {
|
|
7688
|
-
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"),
|
|
7733
|
+
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
|
|
7689
7734
|
break;
|
|
7690
7735
|
}
|
|
7691
7736
|
case "agents-md": {
|
|
7692
|
-
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"),
|
|
7737
|
+
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
|
|
7693
7738
|
break;
|
|
7694
7739
|
}
|
|
7695
7740
|
case "cursor-rule": {
|
|
@@ -7699,7 +7744,7 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
7699
7744
|
}
|
|
7700
7745
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7701
7746
|
const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
7702
|
-
await (0, import_promises7.writeFile)(filePath,
|
|
7747
|
+
await (0, import_promises7.writeFile)(filePath, content, "utf8");
|
|
7703
7748
|
break;
|
|
7704
7749
|
}
|
|
7705
7750
|
case "generic": {
|
|
@@ -7710,7 +7755,7 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
7710
7755
|
const dirPath = (0, import_path8.join)(cwd, directory);
|
|
7711
7756
|
await (0, import_promises7.mkdir)(dirPath, { recursive: true });
|
|
7712
7757
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7713
|
-
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`),
|
|
7758
|
+
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
|
|
7714
7759
|
break;
|
|
7715
7760
|
}
|
|
7716
7761
|
default: {
|
|
@@ -7800,14 +7845,14 @@ function buildConversation(timestampedMessages) {
|
|
|
7800
7845
|
}
|
|
7801
7846
|
|
|
7802
7847
|
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7803
|
-
var
|
|
7848
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
7804
7849
|
function emitTraceEvent(event, pushEvent) {
|
|
7805
|
-
console.log(`${
|
|
7850
|
+
console.log(`${import_evalforge_types4.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
7806
7851
|
pushEvent?.(event);
|
|
7807
7852
|
}
|
|
7808
7853
|
|
|
7809
7854
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7810
|
-
var DEFAULT_MODEL =
|
|
7855
|
+
var DEFAULT_MODEL = import_evalforge_types5.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
7811
7856
|
async function* buildPromptStream(triggerPrompt, images) {
|
|
7812
7857
|
yield {
|
|
7813
7858
|
type: "user",
|
|
@@ -7872,7 +7917,7 @@ function extractToolActionDescription(toolName, toolArgs) {
|
|
|
7872
7917
|
return `Using ${toolName}...`;
|
|
7873
7918
|
}
|
|
7874
7919
|
function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
7875
|
-
let type =
|
|
7920
|
+
let type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
|
|
7876
7921
|
let toolName;
|
|
7877
7922
|
let toolArgs;
|
|
7878
7923
|
let outputPreview;
|
|
@@ -7880,28 +7925,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
7880
7925
|
let thinking;
|
|
7881
7926
|
for (const block of message.message.content) {
|
|
7882
7927
|
if (block.type === "tool_use") {
|
|
7883
|
-
type =
|
|
7928
|
+
type = import_evalforge_types5.LiveTraceEventType.TOOL_USE;
|
|
7884
7929
|
toolName = block.name;
|
|
7885
7930
|
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
7886
7931
|
const input = block.input;
|
|
7887
7932
|
if (input.file_path || input.path || input.target_file) {
|
|
7888
7933
|
filePath = String(input.file_path || input.path || input.target_file);
|
|
7889
7934
|
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
7890
|
-
type =
|
|
7935
|
+
type = import_evalforge_types5.LiveTraceEventType.FILE_WRITE;
|
|
7891
7936
|
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
7892
|
-
type =
|
|
7937
|
+
type = import_evalforge_types5.LiveTraceEventType.FILE_READ;
|
|
7893
7938
|
}
|
|
7894
7939
|
}
|
|
7895
7940
|
} else if (block.type === "text") {
|
|
7896
7941
|
outputPreview = block.text.slice(0, 500);
|
|
7897
7942
|
if (!toolName) {
|
|
7898
|
-
type =
|
|
7943
|
+
type = import_evalforge_types5.LiveTraceEventType.COMPLETION;
|
|
7899
7944
|
}
|
|
7900
7945
|
} else if (block.type === "thinking") {
|
|
7901
7946
|
const thinkingBlock = block;
|
|
7902
7947
|
thinking = thinkingBlock.thinking.slice(0, 500);
|
|
7903
7948
|
if (!outputPreview && !toolName) {
|
|
7904
|
-
type =
|
|
7949
|
+
type = import_evalforge_types5.LiveTraceEventType.THINKING;
|
|
7905
7950
|
}
|
|
7906
7951
|
}
|
|
7907
7952
|
}
|
|
@@ -7967,7 +8012,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
7967
8012
|
}
|
|
7968
8013
|
return {
|
|
7969
8014
|
...baseEvent,
|
|
7970
|
-
type:
|
|
8015
|
+
type: import_evalforge_types5.LiveTraceEventType.TOOL_RESULT,
|
|
7971
8016
|
outputPreview: outputPreview || "(tool result)"
|
|
7972
8017
|
};
|
|
7973
8018
|
}
|
|
@@ -7975,7 +8020,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
7975
8020
|
const sysMsg = message;
|
|
7976
8021
|
return {
|
|
7977
8022
|
...baseEvent,
|
|
7978
|
-
type:
|
|
8023
|
+
type: import_evalforge_types5.LiveTraceEventType.SYSTEM,
|
|
7979
8024
|
outputPreview: sysMsg.subtype || "system"
|
|
7980
8025
|
};
|
|
7981
8026
|
}
|
|
@@ -7984,7 +8029,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
7984
8029
|
}
|
|
7985
8030
|
return {
|
|
7986
8031
|
...baseEvent,
|
|
7987
|
-
type:
|
|
8032
|
+
type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
|
|
7988
8033
|
outputPreview: `Message type: ${message.type}`
|
|
7989
8034
|
};
|
|
7990
8035
|
}
|
|
@@ -8086,7 +8131,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8086
8131
|
queryOptions.systemPrompt = {
|
|
8087
8132
|
type: "preset",
|
|
8088
8133
|
preset: "claude_code",
|
|
8089
|
-
append:
|
|
8134
|
+
append: import_evalforge_types5.DEFAULT_EVALUATOR_SYSTEM_PROMPT
|
|
8090
8135
|
};
|
|
8091
8136
|
}
|
|
8092
8137
|
if (options.temperature !== void 0) {
|
|
@@ -8121,7 +8166,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8121
8166
|
targetId: traceContext.targetId,
|
|
8122
8167
|
targetName: traceContext.targetName,
|
|
8123
8168
|
stepNumber: 0,
|
|
8124
|
-
type:
|
|
8169
|
+
type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
|
|
8125
8170
|
outputPreview: JSON.stringify({
|
|
8126
8171
|
event: "pre-sdk-execution",
|
|
8127
8172
|
model: queryOptions.model,
|
|
@@ -8185,7 +8230,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8185
8230
|
targetId: traceContext.targetId,
|
|
8186
8231
|
targetName: traceContext.targetName,
|
|
8187
8232
|
stepNumber: traceStepNumber,
|
|
8188
|
-
type:
|
|
8233
|
+
type: import_evalforge_types5.LiveTraceEventType.PROGRESS,
|
|
8189
8234
|
outputPreview: progressMessage,
|
|
8190
8235
|
toolName: lastToolName,
|
|
8191
8236
|
filePath: lastFilePath,
|
|
@@ -8222,18 +8267,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8222
8267
|
if (traceEvent) {
|
|
8223
8268
|
lastToolName = traceEvent.toolName;
|
|
8224
8269
|
lastFilePath = traceEvent.filePath;
|
|
8225
|
-
if (traceEvent.type ===
|
|
8270
|
+
if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.THINKING) {
|
|
8226
8271
|
lastAction = "Thinking...";
|
|
8227
|
-
} else if (traceEvent.type ===
|
|
8272
|
+
} else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.TOOL_USE) {
|
|
8228
8273
|
lastAction = extractToolActionDescription(
|
|
8229
8274
|
traceEvent.toolName,
|
|
8230
8275
|
traceEvent.toolArgs
|
|
8231
8276
|
);
|
|
8232
|
-
} else if (traceEvent.type ===
|
|
8277
|
+
} else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_WRITE) {
|
|
8233
8278
|
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
8234
|
-
} else if (traceEvent.type ===
|
|
8279
|
+
} else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.FILE_READ) {
|
|
8235
8280
|
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
8236
|
-
} else if (traceEvent.type ===
|
|
8281
|
+
} else if (traceEvent.type === import_evalforge_types5.LiveTraceEventType.COMPLETION) {
|
|
8237
8282
|
lastAction = "Processing response...";
|
|
8238
8283
|
}
|
|
8239
8284
|
emitTraceEvent(traceEvent, traceContext.pushEvent);
|
|
@@ -8411,7 +8456,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8411
8456
|
targetId: traceContext.targetId,
|
|
8412
8457
|
targetName: traceContext.targetName,
|
|
8413
8458
|
stepNumber: traceStepNumber + 1,
|
|
8414
|
-
type:
|
|
8459
|
+
type: import_evalforge_types5.LiveTraceEventType.DIAGNOSTIC,
|
|
8415
8460
|
outputPreview: JSON.stringify(
|
|
8416
8461
|
{
|
|
8417
8462
|
event: "sdk-execution-failed",
|
|
@@ -8445,7 +8490,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
8445
8490
|
targetId: traceContext.targetId,
|
|
8446
8491
|
targetName: traceContext.targetName,
|
|
8447
8492
|
stepNumber: traceStepNumber + 1,
|
|
8448
|
-
type:
|
|
8493
|
+
type: import_evalforge_types5.LiveTraceEventType.COMPLETION,
|
|
8449
8494
|
outputPreview: "Scenario execution completed",
|
|
8450
8495
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8451
8496
|
isComplete: true
|
|
@@ -8717,7 +8762,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8717
8762
|
stepNumber: 0,
|
|
8718
8763
|
// renumbered below
|
|
8719
8764
|
turnIndex,
|
|
8720
|
-
type:
|
|
8765
|
+
type: import_evalforge_types5.LLMStepType.THINKING,
|
|
8721
8766
|
model,
|
|
8722
8767
|
provider: "anthropic",
|
|
8723
8768
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8746,7 +8791,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8746
8791
|
id: (0, import_crypto2.randomUUID)(),
|
|
8747
8792
|
stepNumber: 0,
|
|
8748
8793
|
turnIndex,
|
|
8749
|
-
type:
|
|
8794
|
+
type: import_evalforge_types5.LLMStepType.TOOL_USE,
|
|
8750
8795
|
model,
|
|
8751
8796
|
provider: "anthropic",
|
|
8752
8797
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8776,7 +8821,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8776
8821
|
id: (0, import_crypto2.randomUUID)(),
|
|
8777
8822
|
stepNumber: 0,
|
|
8778
8823
|
turnIndex,
|
|
8779
|
-
type:
|
|
8824
|
+
type: import_evalforge_types5.LLMStepType.COMPLETION,
|
|
8780
8825
|
model,
|
|
8781
8826
|
provider: "anthropic",
|
|
8782
8827
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8793,7 +8838,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8793
8838
|
});
|
|
8794
8839
|
}
|
|
8795
8840
|
if (subSteps.length === 0) {
|
|
8796
|
-
const stepType = hasThinking && !hasText ?
|
|
8841
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types5.LLMStepType.THINKING : import_evalforge_types5.LLMStepType.COMPLETION;
|
|
8797
8842
|
subSteps.push({
|
|
8798
8843
|
id: (0, import_crypto2.randomUUID)(),
|
|
8799
8844
|
stepNumber: 0,
|
|
@@ -8863,7 +8908,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8863
8908
|
var ClaudeCodeAdapter = class {
|
|
8864
8909
|
id = "claude-code";
|
|
8865
8910
|
name = "Claude Code";
|
|
8866
|
-
supportedCommands = [
|
|
8911
|
+
supportedCommands = [import_evalforge_types6.AgentRunCommand.CLAUDE];
|
|
8867
8912
|
/**
|
|
8868
8913
|
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
8869
8914
|
* before the baseline snapshot is taken.
|
|
@@ -8895,9 +8940,9 @@ var ClaudeCodeAdapter = class {
|
|
|
8895
8940
|
rules,
|
|
8896
8941
|
systemPrompt
|
|
8897
8942
|
} = context;
|
|
8898
|
-
const typed = config ?
|
|
8943
|
+
const typed = config ? import_evalforge_types6.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
8899
8944
|
const cfg = typed?.success ? typed.data : void 0;
|
|
8900
|
-
const schemaKeys = new Set(Object.keys(
|
|
8945
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types6.ClaudeCodeConfigSchema.shape));
|
|
8901
8946
|
const extras = {};
|
|
8902
8947
|
if (config) {
|
|
8903
8948
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -8952,11 +8997,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
8952
8997
|
defaultRegistry.register(claudeCodeAdapter);
|
|
8953
8998
|
|
|
8954
8999
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
8955
|
-
var
|
|
9000
|
+
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
8956
9001
|
|
|
8957
9002
|
// src/run-scenario/agents/opencode/execute.ts
|
|
8958
9003
|
var import_child_process2 = require("child_process");
|
|
8959
|
-
var
|
|
9004
|
+
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
8960
9005
|
|
|
8961
9006
|
// src/run-scenario/agents/opencode/types.ts
|
|
8962
9007
|
function tryParseJson(text) {
|
|
@@ -8970,49 +9015,28 @@ function tryParseJson(text) {
|
|
|
8970
9015
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
8971
9016
|
var import_promises8 = require("fs/promises");
|
|
8972
9017
|
var import_path9 = require("path");
|
|
8973
|
-
|
|
8974
|
-
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
|
|
9018
|
+
/**
 * Writes every skill to the filesystem concurrently (OpenCode variant).
 *
 * @param cwd - Directory passed through to `writeSkillToFilesystem2`.
 * @param skills - Skills to write; one `writeSkillToFilesystem2` call each.
 * @param fetchFn - Fetcher passed through to `writeSkillToFilesystem2`.
 * @returns Resolves once all writes have finished; rejects if any write fails.
 */
async function writeSkillsToFilesystem2(cwd, skills, fetchFn) {
  // Start all writes first, then wait for the whole batch.
  const pendingWrites = skills.map(
    (skill) => writeSkillToFilesystem2(cwd, skill, fetchFn)
  );
  await Promise.all(pendingWrites);
}
|
|
8979
9023
|
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
8980
|
-
const
|
|
8981
|
-
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
|
|
9024
|
+
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skill.name);
|
|
8982
9025
|
await (0, import_promises8.mkdir)(skillDir, { recursive: true });
|
|
8983
|
-
|
|
8984
|
-
|
|
8985
|
-
await writeFilesToDirectory(skillDir,
|
|
8986
|
-
|
|
8987
|
-
|
|
9026
|
+
try {
|
|
9027
|
+
const files = await resolveSkillFiles(skill, fetchFn);
|
|
9028
|
+
await writeFilesToDirectory(skillDir, files);
|
|
9029
|
+
} catch (error) {
|
|
9030
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9031
|
+
throw new Error(
|
|
9032
|
+
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
8988
9033
|
);
|
|
8989
|
-
} else if (skill.source) {
|
|
8990
|
-
try {
|
|
8991
|
-
const files = await fetchFn(skill.source, {
|
|
8992
|
-
userAgent: "EvalForge-Evaluator"
|
|
8993
|
-
});
|
|
8994
|
-
await writeFilesToDirectory(skillDir, files);
|
|
8995
|
-
console.log(
|
|
8996
|
-
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
8997
|
-
);
|
|
8998
|
-
} catch (error) {
|
|
8999
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9000
|
-
console.error(
|
|
9001
|
-
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
9002
|
-
);
|
|
9003
|
-
throw new Error(
|
|
9004
|
-
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
9005
|
-
);
|
|
9006
|
-
}
|
|
9007
|
-
} else {
|
|
9008
|
-
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
9009
9034
|
}
|
|
9010
9035
|
}
|
|
9011
9036
|
|
|
9012
9037
|
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
9013
9038
|
var import_promises9 = require("fs/promises");
|
|
9014
9039
|
var import_path10 = require("path");
|
|
9015
|
-
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
9016
9040
|
var AGENTS_DIR2 = ".opencode/agents";
|
|
9017
9041
|
function toAgentFilename2(name, index, nameCount) {
|
|
9018
9042
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -9020,34 +9044,7 @@ function toAgentFilename2(name, index, nameCount) {
|
|
|
9020
9044
|
nameCount.set(base, count + 1);
|
|
9021
9045
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
9022
9046
|
}
|
|
9023
|
-
async function
|
|
9024
|
-
if (agent.source) {
|
|
9025
|
-
try {
|
|
9026
|
-
const content = await fetchFn(agent.source, {
|
|
9027
|
-
userAgent: "EvalForge-Evaluator"
|
|
9028
|
-
});
|
|
9029
|
-
console.log(
|
|
9030
|
-
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
9031
|
-
);
|
|
9032
|
-
return content;
|
|
9033
|
-
} catch (error) {
|
|
9034
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9035
|
-
console.error(
|
|
9036
|
-
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
9037
|
-
);
|
|
9038
|
-
throw new Error(
|
|
9039
|
-
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
9040
|
-
);
|
|
9041
|
-
}
|
|
9042
|
-
}
|
|
9043
|
-
if (!agent.subAgentMd) {
|
|
9044
|
-
console.warn(
|
|
9045
|
-
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
9046
|
-
);
|
|
9047
|
-
}
|
|
9048
|
-
return agent.subAgentMd;
|
|
9049
|
-
}
|
|
9050
|
-
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
|
|
9047
|
+
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
|
|
9051
9048
|
if (subAgents.length === 0) return;
|
|
9052
9049
|
const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
|
|
9053
9050
|
await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
|
|
@@ -9055,7 +9052,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
|
|
|
9055
9052
|
for (const [i, agent] of subAgents.entries()) {
|
|
9056
9053
|
const filename = toAgentFilename2(agent.name, i, nameCount);
|
|
9057
9054
|
const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
|
|
9058
|
-
const content = await
|
|
9055
|
+
const content = await resolveSubAgentMd(agent, fetchFn);
|
|
9059
9056
|
await (0, import_promises9.writeFile)(filePath, content, "utf8");
|
|
9060
9057
|
}
|
|
9061
9058
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -9063,8 +9060,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
|
|
|
9063
9060
|
|
|
9064
9061
|
// src/run-scenario/agents/opencode/config.ts
|
|
9065
9062
|
var import_os3 = require("os");
|
|
9066
|
-
var
|
|
9067
|
-
var DEFAULT_MODEL2 = `${
|
|
9063
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
9064
|
+
var DEFAULT_MODEL2 = `${import_evalforge_types7.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
9068
9065
|
var OPENCODE_MODEL_ALIASES = {
|
|
9069
9066
|
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
9070
9067
|
"claude-opus-4": "claude-opus-4-0"
|
|
@@ -9080,10 +9077,10 @@ function parseModel(model) {
|
|
|
9080
9077
|
};
|
|
9081
9078
|
}
|
|
9082
9079
|
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
9083
|
-
const isOpenAI =
|
|
9080
|
+
const isOpenAI = import_evalforge_types7.AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
9084
9081
|
model
|
|
9085
9082
|
);
|
|
9086
|
-
const isGemini =
|
|
9083
|
+
const isGemini = import_evalforge_types7.AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
9087
9084
|
model
|
|
9088
9085
|
);
|
|
9089
9086
|
if (isGemini) return { providerID: "google", modelID };
|
|
@@ -9152,7 +9149,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9152
9149
|
if (options.mcps && options.mcps.length > 0) {
|
|
9153
9150
|
const mcpServers = {};
|
|
9154
9151
|
for (const mcpEntity of options.mcps) {
|
|
9155
|
-
const entityConfig = mcpEntity
|
|
9152
|
+
const entityConfig = await resolveMcpConfig(mcpEntity);
|
|
9156
9153
|
for (const [key, value] of Object.entries(entityConfig)) {
|
|
9157
9154
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
9158
9155
|
throw new Error(
|
|
@@ -9177,7 +9174,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9177
9174
|
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
9178
9175
|
agentOverrides.maxSteps = options.maxTurns;
|
|
9179
9176
|
}
|
|
9180
|
-
const parsed = options.config ?
|
|
9177
|
+
const parsed = options.config ? import_evalforge_types7.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
9181
9178
|
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
9182
9179
|
const defaultPermission = {
|
|
9183
9180
|
"*": "allow"
|
|
@@ -9219,7 +9216,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9219
9216
|
}
|
|
9220
9217
|
|
|
9221
9218
|
// src/run-scenario/agents/opencode/build-trace.ts
|
|
9222
|
-
var
|
|
9219
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
9223
9220
|
var import_crypto3 = require("crypto");
|
|
9224
9221
|
function toCanonicalModelId(modelId) {
|
|
9225
9222
|
const slashIndex = modelId.indexOf("/");
|
|
@@ -9299,7 +9296,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9299
9296
|
id: (0, import_crypto3.randomUUID)(),
|
|
9300
9297
|
stepNumber: 0,
|
|
9301
9298
|
turnIndex,
|
|
9302
|
-
type:
|
|
9299
|
+
type: import_evalforge_types8.LLMStepType.THINKING,
|
|
9303
9300
|
model: stepModel,
|
|
9304
9301
|
provider: stepProvider,
|
|
9305
9302
|
startedAt,
|
|
@@ -9328,7 +9325,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9328
9325
|
id: (0, import_crypto3.randomUUID)(),
|
|
9329
9326
|
stepNumber: 0,
|
|
9330
9327
|
turnIndex,
|
|
9331
|
-
type:
|
|
9328
|
+
type: import_evalforge_types8.LLMStepType.TOOL_USE,
|
|
9332
9329
|
model: stepModel,
|
|
9333
9330
|
provider: stepProvider,
|
|
9334
9331
|
startedAt,
|
|
@@ -9358,7 +9355,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9358
9355
|
id: (0, import_crypto3.randomUUID)(),
|
|
9359
9356
|
stepNumber: 0,
|
|
9360
9357
|
turnIndex,
|
|
9361
|
-
type:
|
|
9358
|
+
type: import_evalforge_types8.LLMStepType.COMPLETION,
|
|
9362
9359
|
model: stepModel,
|
|
9363
9360
|
provider: stepProvider,
|
|
9364
9361
|
startedAt,
|
|
@@ -9375,7 +9372,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9375
9372
|
});
|
|
9376
9373
|
}
|
|
9377
9374
|
if (subSteps.length === 0) {
|
|
9378
|
-
const stepType = hasThinking && !hasText ?
|
|
9375
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types8.LLMStepType.THINKING : import_evalforge_types8.LLMStepType.COMPLETION;
|
|
9379
9376
|
subSteps.push({
|
|
9380
9377
|
id: (0, import_crypto3.randomUUID)(),
|
|
9381
9378
|
stepNumber: 0,
|
|
@@ -9576,14 +9573,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9576
9573
|
const te = evt;
|
|
9577
9574
|
return {
|
|
9578
9575
|
...base,
|
|
9579
|
-
type:
|
|
9576
|
+
type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
|
|
9580
9577
|
outputPreview: te.part.text.slice(0, 500)
|
|
9581
9578
|
};
|
|
9582
9579
|
}
|
|
9583
9580
|
case "reasoning":
|
|
9584
9581
|
return {
|
|
9585
9582
|
...base,
|
|
9586
|
-
type:
|
|
9583
|
+
type: import_evalforge_types9.LiveTraceEventType.THINKING,
|
|
9587
9584
|
thinking: evt.part.text.slice(0, 500)
|
|
9588
9585
|
};
|
|
9589
9586
|
case "tool_use": {
|
|
@@ -9591,15 +9588,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9591
9588
|
const toolName = tu.part.tool;
|
|
9592
9589
|
const args = tu.part.state.input;
|
|
9593
9590
|
const toolArgs = JSON.stringify(args).slice(0, 500);
|
|
9594
|
-
let type =
|
|
9591
|
+
let type = import_evalforge_types9.LiveTraceEventType.TOOL_USE;
|
|
9595
9592
|
let filePath;
|
|
9596
9593
|
if (args) {
|
|
9597
9594
|
if (args.file_path || args.path || args.target_file) {
|
|
9598
9595
|
filePath = String(args.file_path || args.path || args.target_file);
|
|
9599
9596
|
if (/write|edit/i.test(toolName)) {
|
|
9600
|
-
type =
|
|
9597
|
+
type = import_evalforge_types9.LiveTraceEventType.FILE_WRITE;
|
|
9601
9598
|
} else if (/read|view/i.test(toolName)) {
|
|
9602
|
-
type =
|
|
9599
|
+
type = import_evalforge_types9.LiveTraceEventType.FILE_READ;
|
|
9603
9600
|
}
|
|
9604
9601
|
}
|
|
9605
9602
|
}
|
|
@@ -9608,7 +9605,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9608
9605
|
case "step_finish":
|
|
9609
9606
|
return {
|
|
9610
9607
|
...base,
|
|
9611
|
-
type:
|
|
9608
|
+
type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
|
|
9612
9609
|
outputPreview: "Step completed"
|
|
9613
9610
|
};
|
|
9614
9611
|
default:
|
|
@@ -9639,7 +9636,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
|
9639
9636
|
} else if (options.systemPrompt != null) {
|
|
9640
9637
|
systemPrompt = options.systemPrompt;
|
|
9641
9638
|
} else {
|
|
9642
|
-
systemPrompt =
|
|
9639
|
+
systemPrompt = import_evalforge_types9.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
|
|
9643
9640
|
}
|
|
9644
9641
|
if (systemPrompt) {
|
|
9645
9642
|
await writeSystemPromptRule(cwd, systemPrompt);
|
|
@@ -9831,7 +9828,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9831
9828
|
targetId: traceContext.targetId,
|
|
9832
9829
|
targetName: traceContext.targetName,
|
|
9833
9830
|
stepNumber: traceStepNumber,
|
|
9834
|
-
type:
|
|
9831
|
+
type: import_evalforge_types9.LiveTraceEventType.PROGRESS,
|
|
9835
9832
|
outputPreview: progressMessage,
|
|
9836
9833
|
toolName: lastToolName,
|
|
9837
9834
|
filePath: lastFilePath,
|
|
@@ -9865,18 +9862,18 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9865
9862
|
if (traceEvt) {
|
|
9866
9863
|
lastToolName = traceEvt.toolName;
|
|
9867
9864
|
lastFilePath = traceEvt.filePath;
|
|
9868
|
-
if (traceEvt.type ===
|
|
9865
|
+
if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.THINKING) {
|
|
9869
9866
|
lastAction = "Thinking...";
|
|
9870
|
-
} else if (traceEvt.type ===
|
|
9867
|
+
} else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.TOOL_USE) {
|
|
9871
9868
|
lastAction = extractToolAction(
|
|
9872
9869
|
traceEvt.toolName ?? "",
|
|
9873
9870
|
void 0
|
|
9874
9871
|
);
|
|
9875
|
-
} else if (traceEvt.type ===
|
|
9872
|
+
} else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_WRITE) {
|
|
9876
9873
|
lastAction = `Writing: ${traceEvt.filePath || "file"}`;
|
|
9877
|
-
} else if (traceEvt.type ===
|
|
9874
|
+
} else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.FILE_READ) {
|
|
9878
9875
|
lastAction = `Reading: ${traceEvt.filePath || "file"}`;
|
|
9879
|
-
} else if (traceEvt.type ===
|
|
9876
|
+
} else if (traceEvt.type === import_evalforge_types9.LiveTraceEventType.COMPLETION) {
|
|
9880
9877
|
lastAction = "Processing response...";
|
|
9881
9878
|
}
|
|
9882
9879
|
emitTraceEvent(traceEvt, traceContext.pushEvent);
|
|
@@ -9958,7 +9955,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
9958
9955
|
targetId: traceContext.targetId,
|
|
9959
9956
|
targetName: traceContext.targetName,
|
|
9960
9957
|
stepNumber: 0,
|
|
9961
|
-
type:
|
|
9958
|
+
type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
|
|
9962
9959
|
outputPreview: JSON.stringify({
|
|
9963
9960
|
event: "pre-cli-execution",
|
|
9964
9961
|
model: `${providerID}/${modelID}`,
|
|
@@ -10012,7 +10009,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10012
10009
|
targetId: traceContext.targetId,
|
|
10013
10010
|
targetName: traceContext.targetName,
|
|
10014
10011
|
stepNumber: traceStepNumber + 1,
|
|
10015
|
-
type:
|
|
10012
|
+
type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
|
|
10016
10013
|
outputPreview: JSON.stringify({
|
|
10017
10014
|
event: "idle-timeout-retry",
|
|
10018
10015
|
attempt,
|
|
@@ -10056,7 +10053,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10056
10053
|
targetId: traceContext.targetId,
|
|
10057
10054
|
targetName: traceContext.targetName,
|
|
10058
10055
|
stepNumber: traceStepNumber + 1,
|
|
10059
|
-
type:
|
|
10056
|
+
type: import_evalforge_types9.LiveTraceEventType.DIAGNOSTIC,
|
|
10060
10057
|
outputPreview: JSON.stringify({
|
|
10061
10058
|
event: "cli-execution-failed",
|
|
10062
10059
|
error: lastAttemptResult.error?.message ?? "Unknown error",
|
|
@@ -10111,7 +10108,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10111
10108
|
targetId: traceContext.targetId,
|
|
10112
10109
|
targetName: traceContext.targetName,
|
|
10113
10110
|
stepNumber: traceStepNumber + 1,
|
|
10114
|
-
type:
|
|
10111
|
+
type: import_evalforge_types9.LiveTraceEventType.COMPLETION,
|
|
10115
10112
|
outputPreview: "Scenario execution completed",
|
|
10116
10113
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10117
10114
|
isComplete: true
|
|
@@ -10148,7 +10145,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10148
10145
|
var OpenCodeAdapter = class {
|
|
10149
10146
|
id = "opencode";
|
|
10150
10147
|
name = "OpenCode";
|
|
10151
|
-
supportedCommands = [
|
|
10148
|
+
supportedCommands = [import_evalforge_types10.AgentRunCommand.OPENCODE];
|
|
10152
10149
|
async prepareEnvironment(context) {
|
|
10153
10150
|
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
10154
10151
|
mcps: context.mcps,
|
|
@@ -10171,7 +10168,7 @@ var OpenCodeAdapter = class {
|
|
|
10171
10168
|
rules,
|
|
10172
10169
|
systemPrompt
|
|
10173
10170
|
} = context;
|
|
10174
|
-
const typed = config ?
|
|
10171
|
+
const typed = config ? import_evalforge_types10.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10175
10172
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10176
10173
|
const rawMaxTurns = cfg?.maxTurns;
|
|
10177
10174
|
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
@@ -10221,7 +10218,7 @@ var import_ai = require("ai");
|
|
|
10221
10218
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
10222
10219
|
var import_google = require("@ai-sdk/google");
|
|
10223
10220
|
var import_openai = require("@ai-sdk/openai");
|
|
10224
|
-
var
|
|
10221
|
+
var import_evalforge_types12 = require("@wix/evalforge-types");
|
|
10225
10222
|
var import_crypto4 = require("crypto");
|
|
10226
10223
|
|
|
10227
10224
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
@@ -10318,7 +10315,7 @@ function extractErrorText(content) {
|
|
|
10318
10315
|
}
|
|
10319
10316
|
|
|
10320
10317
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
10321
|
-
var
|
|
10318
|
+
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
10322
10319
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
10323
10320
|
var PROVIDER_GEMINI = "gemini";
|
|
10324
10321
|
var MODEL_PRICING = {
|
|
@@ -10387,7 +10384,7 @@ function extractGatewayCost(step, provider) {
|
|
|
10387
10384
|
}
|
|
10388
10385
|
}
|
|
10389
10386
|
function calculateFromPricing(modelId, tokenUsage) {
|
|
10390
|
-
const normalized = (0,
|
|
10387
|
+
const normalized = (0, import_evalforge_types11.normalizeModelId)(modelId);
|
|
10391
10388
|
const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
|
|
10392
10389
|
if (!pricing) return 0;
|
|
10393
10390
|
return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
|
|
@@ -10480,7 +10477,7 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10480
10477
|
apiKey: "proxy-auth",
|
|
10481
10478
|
headers
|
|
10482
10479
|
});
|
|
10483
|
-
if ([...
|
|
10480
|
+
if ([...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10484
10481
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10485
10482
|
)) {
|
|
10486
10483
|
return openai.responses(modelId);
|
|
@@ -10488,12 +10485,12 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10488
10485
|
return openai.chat(modelId);
|
|
10489
10486
|
}
|
|
10490
10487
|
function isClaudeModelId(modelId) {
|
|
10491
|
-
return
|
|
10488
|
+
return import_evalforge_types12.AVAILABLE_CLAUDE_MODEL_IDS.some(
|
|
10492
10489
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10493
10490
|
);
|
|
10494
10491
|
}
|
|
10495
10492
|
function isGeminiModelId(modelId) {
|
|
10496
|
-
return
|
|
10493
|
+
return import_evalforge_types12.AVAILABLE_GEMINI_MODEL_IDS.some(
|
|
10497
10494
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10498
10495
|
);
|
|
10499
10496
|
}
|
|
@@ -10513,9 +10510,9 @@ async function executeWithAiSdk(context) {
|
|
|
10513
10510
|
mcps,
|
|
10514
10511
|
traceContext
|
|
10515
10512
|
} = context;
|
|
10516
|
-
const typed = config ?
|
|
10513
|
+
const typed = config ? import_evalforge_types12.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10517
10514
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10518
|
-
const schemaKeys = new Set(Object.keys(
|
|
10515
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types12.SimpleAgentConfigSchema.shape));
|
|
10519
10516
|
const configExtras = {};
|
|
10520
10517
|
if (config) {
|
|
10521
10518
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -10552,11 +10549,11 @@ async function executeWithAiSdk(context) {
|
|
|
10552
10549
|
}, SDK_TIMEOUT_MS);
|
|
10553
10550
|
try {
|
|
10554
10551
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
10555
|
-
const isResponsesAPI = [...
|
|
10552
|
+
const isResponsesAPI = [...import_evalforge_types12.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10556
10553
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10557
10554
|
);
|
|
10558
10555
|
const isGemini = provider === PROVIDER_GEMINI2;
|
|
10559
|
-
const isGeminiThinking = isGemini &&
|
|
10556
|
+
const isGeminiThinking = isGemini && import_evalforge_types12.GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
10560
10557
|
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
10561
10558
|
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
10562
10559
|
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
@@ -10635,7 +10632,7 @@ async function executeWithAiSdk(context) {
|
|
|
10635
10632
|
targetId: traceContext.targetId,
|
|
10636
10633
|
targetName: traceContext.targetName,
|
|
10637
10634
|
stepNumber: stepTimestamps.length,
|
|
10638
|
-
type: isToolStep ?
|
|
10635
|
+
type: isToolStep ? import_evalforge_types12.LiveTraceEventType.TOOL_USE : import_evalforge_types12.LiveTraceEventType.COMPLETION,
|
|
10639
10636
|
toolName: firstToolCall?.toolName,
|
|
10640
10637
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
10641
10638
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -10840,7 +10837,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
10840
10837
|
id: (0, import_crypto4.randomUUID)(),
|
|
10841
10838
|
stepNumber: i + 1,
|
|
10842
10839
|
turnIndex: i,
|
|
10843
|
-
type: step.toolCalls.length > 0 ?
|
|
10840
|
+
type: step.toolCalls.length > 0 ? import_evalforge_types12.LLMStepType.TOOL_USE : import_evalforge_types12.LLMStepType.COMPLETION,
|
|
10844
10841
|
model: modelId,
|
|
10845
10842
|
provider,
|
|
10846
10843
|
startedAt: new Date(stepStartedAt).toISOString(),
|
|
@@ -10890,7 +10887,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
10890
10887
|
targetId: traceContext.targetId,
|
|
10891
10888
|
targetName: traceContext.targetName,
|
|
10892
10889
|
stepNumber: 0,
|
|
10893
|
-
type:
|
|
10890
|
+
type: import_evalforge_types12.LiveTraceEventType.PROGRESS,
|
|
10894
10891
|
outputPreview: "Starting Simple Agent execution...",
|
|
10895
10892
|
elapsedMs: Date.now() - startTime,
|
|
10896
10893
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -10908,7 +10905,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
10908
10905
|
targetId: traceContext.targetId,
|
|
10909
10906
|
targetName: traceContext.targetName,
|
|
10910
10907
|
stepNumber,
|
|
10911
|
-
type:
|
|
10908
|
+
type: import_evalforge_types12.LiveTraceEventType.COMPLETION,
|
|
10912
10909
|
outputPreview: "Scenario execution completed",
|
|
10913
10910
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10914
10911
|
isComplete: true
|
|
@@ -11678,11 +11675,11 @@ function substituteVariables(prompt, variables) {
|
|
|
11678
11675
|
}
|
|
11679
11676
|
|
|
11680
11677
|
// src/run-scenario/run-agent-with-context.ts
|
|
11681
|
-
var
|
|
11682
|
-
var DEFAULT_AGENT_COMMAND =
|
|
11678
|
+
var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
11679
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types13.AgentRunCommand.CLAUDE;
|
|
11683
11680
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
|
|
11684
11681
|
const agent = evalData.agent ?? void 0;
|
|
11685
|
-
const isSDK = agent?.agentType ===
|
|
11682
|
+
const isSDK = agent?.agentType === import_evalforge_types13.AgentType.SDK;
|
|
11686
11683
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
11687
11684
|
const adapter = getAdapter(identifier);
|
|
11688
11685
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -11767,14 +11764,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11767
11764
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
11768
11765
|
if (template) {
|
|
11769
11766
|
console.log(
|
|
11770
|
-
(0,
|
|
11767
|
+
(0, import_evalforge_types14.formatTraceEventLine)({
|
|
11771
11768
|
evalRunId: evalRunId2,
|
|
11772
11769
|
scenarioId: scenario.id,
|
|
11773
11770
|
scenarioName: scenario.name,
|
|
11774
11771
|
targetId,
|
|
11775
11772
|
targetName,
|
|
11776
11773
|
stepNumber: 0,
|
|
11777
|
-
type:
|
|
11774
|
+
type: import_evalforge_types14.LiveTraceEventType.PROGRESS,
|
|
11778
11775
|
outputPreview: "Setting up environment (installing dependencies)...",
|
|
11779
11776
|
elapsedMs: 0,
|
|
11780
11777
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -11814,7 +11811,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11814
11811
|
})),
|
|
11815
11812
|
durationMs: partialResult.duration
|
|
11816
11813
|
};
|
|
11817
|
-
const defaultJudgeModel =
|
|
11814
|
+
const defaultJudgeModel = import_evalforge_types14.DEFAULT_JUDGE_MODEL;
|
|
11818
11815
|
const assertionContext = {
|
|
11819
11816
|
workDir,
|
|
11820
11817
|
defaultJudgeModel,
|
|
@@ -11829,10 +11826,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11829
11826
|
assertionContext
|
|
11830
11827
|
) : [];
|
|
11831
11828
|
const passed = assertionResults.filter(
|
|
11832
|
-
(r) => r.status ===
|
|
11829
|
+
(r) => r.status === import_evalforge_types14.AssertionResultStatus.PASSED
|
|
11833
11830
|
).length;
|
|
11834
11831
|
const failed = assertionResults.filter(
|
|
11835
|
-
(r) => r.status ===
|
|
11832
|
+
(r) => r.status === import_evalforge_types14.AssertionResultStatus.FAILED
|
|
11836
11833
|
).length;
|
|
11837
11834
|
const total = assertionResults.length;
|
|
11838
11835
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -11908,7 +11905,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
11908
11905
|
}
|
|
11909
11906
|
|
|
11910
11907
|
// src/error-reporter.ts
|
|
11911
|
-
var
|
|
11908
|
+
var import_evalforge_types15 = require("@wix/evalforge-types");
|
|
11912
11909
|
function formatError(error, phase, context) {
|
|
11913
11910
|
const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
|
|
11914
11911
|
if (error instanceof Error) {
|
|
@@ -12151,7 +12148,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
12151
12148
|
totalExecutions
|
|
12152
12149
|
};
|
|
12153
12150
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
12154
|
-
const finalStatus = allFailed ?
|
|
12151
|
+
const finalStatus = allFailed ? import_evalforge_types16.EvalStatus.FAILED : import_evalforge_types16.EvalStatus.COMPLETED;
|
|
12155
12152
|
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
12156
12153
|
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
12157
12154
|
) : void 0;
|
|
@@ -12205,7 +12202,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12205
12202
|
grpcAuthToken: config.grpcAuthToken
|
|
12206
12203
|
});
|
|
12207
12204
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12208
|
-
status:
|
|
12205
|
+
status: import_evalforge_types16.EvalStatus.FAILED,
|
|
12209
12206
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12210
12207
|
jobError,
|
|
12211
12208
|
jobStatus: "FAILED"
|
|
@@ -12230,7 +12227,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12230
12227
|
grpcAuthToken
|
|
12231
12228
|
});
|
|
12232
12229
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12233
|
-
status:
|
|
12230
|
+
status: import_evalforge_types16.EvalStatus.FAILED,
|
|
12234
12231
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12235
12232
|
jobError: `Config load failed, then: ${jobError}`,
|
|
12236
12233
|
jobStatus: "FAILED"
|