@wix/evalforge-evaluator 0.184.0 → 0.185.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +218 -215
- package/build/index.js.map +4 -4
- package/build/index.mjs +133 -129
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +1 -7
- package/build/types/run-scenario/agents/claude-code/write-rules.d.ts +1 -7
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +9 -6
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +5 -4
- package/build/types/run-scenario/agents/opencode/write-skills.d.ts +8 -5
- package/build/types/run-scenario/agents/opencode/write-sub-agents.d.ts +3 -3
- package/package.json +2 -2
- package/build/types/run-scenario/agents/shared/resolve-capability-content.d.ts +0 -42
package/build/index.js
CHANGED
|
@@ -5226,7 +5226,7 @@ var require_index_node = __commonJS({
|
|
|
5226
5226
|
});
|
|
5227
5227
|
|
|
5228
5228
|
// src/index.ts
|
|
5229
|
-
var
|
|
5229
|
+
var import_evalforge_types15 = require("@wix/evalforge-types");
|
|
5230
5230
|
|
|
5231
5231
|
// src/config.ts
|
|
5232
5232
|
function loadConfig() {
|
|
@@ -7115,7 +7115,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
7115
7115
|
}
|
|
7116
7116
|
|
|
7117
7117
|
// src/run-scenario/index.ts
|
|
7118
|
-
var
|
|
7118
|
+
var import_evalforge_types13 = require("@wix/evalforge-types");
|
|
7119
7119
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
7120
7120
|
|
|
7121
7121
|
// src/run-scenario/environment.ts
|
|
@@ -7451,122 +7451,50 @@ function getAdapter(identifier) {
|
|
|
7451
7451
|
}
|
|
7452
7452
|
|
|
7453
7453
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
7454
|
-
var
|
|
7454
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
7455
7455
|
|
|
7456
7456
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7457
|
-
var
|
|
7457
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
7458
7458
|
|
|
7459
7459
|
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7460
7460
|
var import_promises3 = require("fs/promises");
|
|
7461
7461
|
var import_path4 = require("path");
|
|
7462
|
-
|
|
7463
|
-
// src/run-scenario/agents/shared/resolve-capability-content.ts
|
|
7464
7462
|
var import_evalforge_github_client2 = require("@wix/evalforge-github-client");
|
|
7465
|
-
|
|
7466
|
-
var USER_AGENT = "EvalForge-Evaluator";
|
|
7467
|
-
async function resolveSkillFiles(skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7468
|
-
const version = skill.latestVersion;
|
|
7469
|
-
if (version?.files && version.files.length > 0) {
|
|
7470
|
-
console.log(
|
|
7471
|
-
`[Skill] ${skill.name}: using ${version.files.length} file(s) from snapshot`
|
|
7472
|
-
);
|
|
7473
|
-
return version.files;
|
|
7474
|
-
}
|
|
7475
|
-
if (skill.source) {
|
|
7476
|
-
const files = await fetchFn(skill.source, { userAgent: USER_AGENT });
|
|
7477
|
-
console.log(
|
|
7478
|
-
`[Skill] ${skill.name}: fetched ${files.length} file(s) from GitHub (live)`
|
|
7479
|
-
);
|
|
7480
|
-
return files;
|
|
7481
|
-
}
|
|
7482
|
-
throw new Error(`Skill ${skill.name} has no files and no source configured`);
|
|
7483
|
-
}
|
|
7484
|
-
async function fetchSourceFile(label, noun, name, source, fetchFn) {
|
|
7485
|
-
try {
|
|
7486
|
-
const content = await fetchFn(source, { userAgent: USER_AGENT });
|
|
7487
|
-
console.log(
|
|
7488
|
-
`[${label}] Fetched "${name}" from ${source.owner}/${source.repo}/${source.path}@${source.ref}`
|
|
7489
|
-
);
|
|
7490
|
-
return content;
|
|
7491
|
-
} catch (error) {
|
|
7492
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7493
|
-
console.error(`[${label}] "${name}": GitHub fetch failed: ${message}`);
|
|
7494
|
-
throw new Error(
|
|
7495
|
-
`Failed to fetch ${noun} "${name}" from GitHub: ${message}`
|
|
7496
|
-
);
|
|
7497
|
-
}
|
|
7498
|
-
}
|
|
7499
|
-
async function resolveSubAgentMd(agent, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7500
|
-
if (agent.source) {
|
|
7501
|
-
return fetchSourceFile(
|
|
7502
|
-
"SubAgents",
|
|
7503
|
-
"sub-agent",
|
|
7504
|
-
agent.name,
|
|
7505
|
-
agent.source,
|
|
7506
|
-
fetchFn
|
|
7507
|
-
);
|
|
7508
|
-
}
|
|
7509
|
-
if (!agent.subAgentMd) {
|
|
7510
|
-
console.warn(
|
|
7511
|
-
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7512
|
-
);
|
|
7513
|
-
}
|
|
7514
|
-
return agent.subAgentMd;
|
|
7515
|
-
}
|
|
7516
|
-
async function resolveRuleText(rule, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7517
|
-
if (!rule.source) {
|
|
7518
|
-
return rule.content;
|
|
7519
|
-
}
|
|
7520
|
-
return fetchSourceFile("Rules", "rule", rule.name, rule.source, fetchFn);
|
|
7521
|
-
}
|
|
7522
|
-
async function resolveMcpConfig(mcp, fetchFn = import_evalforge_github_client2.fetchGitHubFile) {
|
|
7523
|
-
if (!mcp.source) {
|
|
7524
|
-
return mcp.config;
|
|
7525
|
-
}
|
|
7526
|
-
const raw = await fetchSourceFile(
|
|
7527
|
-
"MCP",
|
|
7528
|
-
"MCP",
|
|
7529
|
-
mcp.name,
|
|
7530
|
-
mcp.source,
|
|
7531
|
-
fetchFn
|
|
7532
|
-
);
|
|
7533
|
-
let parsed;
|
|
7534
|
-
try {
|
|
7535
|
-
parsed = JSON.parse(raw);
|
|
7536
|
-
} catch (error) {
|
|
7537
|
-
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7538
|
-
throw new Error(
|
|
7539
|
-
`MCP "${mcp.name}" GitHub source is not valid JSON: ${message}`
|
|
7540
|
-
);
|
|
7541
|
-
}
|
|
7542
|
-
if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
|
|
7543
|
-
throw new Error(`MCP "${mcp.name}" GitHub source must be a JSON object`);
|
|
7544
|
-
}
|
|
7545
|
-
const obj = parsed;
|
|
7546
|
-
const servers = obj[import_evalforge_types2.MCP_SERVERS_JSON_KEY];
|
|
7547
|
-
if (typeof servers === "object" && servers !== null && !Array.isArray(servers)) {
|
|
7548
|
-
return servers;
|
|
7549
|
-
}
|
|
7550
|
-
return obj;
|
|
7551
|
-
}
|
|
7552
|
-
|
|
7553
|
-
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
7554
|
-
async function writeSkillsToFilesystem(cwd, skills, fetchFn) {
|
|
7463
|
+
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7555
7464
|
await Promise.all(
|
|
7556
7465
|
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
7557
7466
|
);
|
|
7558
7467
|
}
|
|
7559
|
-
async function writeSkillToFilesystem(cwd, skill, fetchFn) {
|
|
7560
|
-
const
|
|
7468
|
+
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client2.fetchGitHubFolder) {
|
|
7469
|
+
const skillName = skill.name;
|
|
7470
|
+
const skillDir = (0, import_path4.join)(cwd, ".claude", "skills", skillName);
|
|
7561
7471
|
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
7562
|
-
|
|
7563
|
-
|
|
7564
|
-
await writeFilesToDirectory(skillDir, files);
|
|
7565
|
-
|
|
7566
|
-
|
|
7567
|
-
throw new Error(
|
|
7568
|
-
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
7472
|
+
const version = skill.latestVersion;
|
|
7473
|
+
if (version?.files && version.files.length > 0) {
|
|
7474
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
7475
|
+
console.log(
|
|
7476
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
7569
7477
|
);
|
|
7478
|
+
} else if (skill.source) {
|
|
7479
|
+
try {
|
|
7480
|
+
const files = await fetchFn(skill.source, {
|
|
7481
|
+
userAgent: "EvalForge-Evaluator"
|
|
7482
|
+
});
|
|
7483
|
+
await writeFilesToDirectory(skillDir, files);
|
|
7484
|
+
console.log(
|
|
7485
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
7486
|
+
);
|
|
7487
|
+
} catch (error) {
|
|
7488
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7489
|
+
console.error(
|
|
7490
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
7491
|
+
);
|
|
7492
|
+
throw new Error(
|
|
7493
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
7494
|
+
);
|
|
7495
|
+
}
|
|
7496
|
+
} else {
|
|
7497
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
7570
7498
|
}
|
|
7571
7499
|
}
|
|
7572
7500
|
|
|
@@ -7584,7 +7512,7 @@ var import_crypto2 = require("crypto");
|
|
|
7584
7512
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7585
7513
|
var import_promises5 = require("fs/promises");
|
|
7586
7514
|
var import_path6 = require("path");
|
|
7587
|
-
var
|
|
7515
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
7588
7516
|
|
|
7589
7517
|
// src/run-scenario/agents/shared/resolve-mcp-placeholders.ts
|
|
7590
7518
|
var import_promises4 = require("fs/promises");
|
|
@@ -7629,11 +7557,11 @@ async function resolveMcpPlaceholders(mcpServers, options = {}) {
|
|
|
7629
7557
|
}
|
|
7630
7558
|
|
|
7631
7559
|
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
7632
|
-
async function writeMcpToFilesystem(cwd, mcps
|
|
7560
|
+
async function writeMcpToFilesystem(cwd, mcps) {
|
|
7633
7561
|
if (mcps.length === 0) return;
|
|
7634
7562
|
const mcpServers = {};
|
|
7635
7563
|
for (const mcp of mcps) {
|
|
7636
|
-
const config =
|
|
7564
|
+
const config = mcp.config;
|
|
7637
7565
|
for (const [key, value] of Object.entries(config)) {
|
|
7638
7566
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
7639
7567
|
throw new Error(
|
|
@@ -7645,7 +7573,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
|
|
|
7645
7573
|
}
|
|
7646
7574
|
const resolvedServers = await resolveMcpPlaceholders(mcpServers, { cwd });
|
|
7647
7575
|
const content = JSON.stringify(
|
|
7648
|
-
{ [
|
|
7576
|
+
{ [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: resolvedServers },
|
|
7649
7577
|
null,
|
|
7650
7578
|
2
|
|
7651
7579
|
);
|
|
@@ -7657,6 +7585,7 @@ async function writeMcpToFilesystem(cwd, mcps, fetchFn) {
|
|
|
7657
7585
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
7658
7586
|
var import_promises6 = require("fs/promises");
|
|
7659
7587
|
var import_path7 = require("path");
|
|
7588
|
+
var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
|
|
7660
7589
|
var AGENTS_DIR = ".claude/agents";
|
|
7661
7590
|
function toAgentFilename(name, index, nameCount) {
|
|
7662
7591
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -7664,7 +7593,34 @@ function toAgentFilename(name, index, nameCount) {
|
|
|
7664
7593
|
nameCount.set(base, count + 1);
|
|
7665
7594
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
7666
7595
|
}
|
|
7667
|
-
async function
|
|
7596
|
+
async function resolveSubAgentContent(agent, fetchFn) {
|
|
7597
|
+
if (agent.source) {
|
|
7598
|
+
try {
|
|
7599
|
+
const content = await fetchFn(agent.source, {
|
|
7600
|
+
userAgent: "EvalForge-Evaluator"
|
|
7601
|
+
});
|
|
7602
|
+
console.log(
|
|
7603
|
+
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
7604
|
+
);
|
|
7605
|
+
return content;
|
|
7606
|
+
} catch (error) {
|
|
7607
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
7608
|
+
console.error(
|
|
7609
|
+
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
7610
|
+
);
|
|
7611
|
+
throw new Error(
|
|
7612
|
+
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
7613
|
+
);
|
|
7614
|
+
}
|
|
7615
|
+
}
|
|
7616
|
+
if (!agent.subAgentMd) {
|
|
7617
|
+
console.warn(
|
|
7618
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
7619
|
+
);
|
|
7620
|
+
}
|
|
7621
|
+
return agent.subAgentMd;
|
|
7622
|
+
}
|
|
7623
|
+
async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
|
|
7668
7624
|
if (subAgents.length === 0) return;
|
|
7669
7625
|
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
7670
7626
|
await (0, import_promises6.mkdir)(agentsDir, { recursive: true });
|
|
@@ -7672,7 +7628,7 @@ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn) {
|
|
|
7672
7628
|
for (const [i, agent] of subAgents.entries()) {
|
|
7673
7629
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
7674
7630
|
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
7675
|
-
const content = await
|
|
7631
|
+
const content = await resolveSubAgentContent(agent, fetchFn);
|
|
7676
7632
|
await (0, import_promises6.writeFile)(filePath, content, "utf8");
|
|
7677
7633
|
}
|
|
7678
7634
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -7722,19 +7678,18 @@ function validateGenericDirectory(dir, cwd) {
|
|
|
7722
7678
|
}
|
|
7723
7679
|
return trimmed;
|
|
7724
7680
|
}
|
|
7725
|
-
async function writeRulesToFilesystem(cwd, rules
|
|
7681
|
+
async function writeRulesToFilesystem(cwd, rules) {
|
|
7726
7682
|
if (rules.length === 0) return;
|
|
7727
7683
|
const nameCount = /* @__PURE__ */ new Map();
|
|
7728
7684
|
let hasCursorRules = false;
|
|
7729
7685
|
for (const [i, rule] of rules.entries()) {
|
|
7730
|
-
const content = await resolveRuleText(rule, fetchFn);
|
|
7731
7686
|
switch (rule.ruleType) {
|
|
7732
7687
|
case "claude-md": {
|
|
7733
|
-
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), content);
|
|
7688
|
+
await appendToFile((0, import_path8.join)(cwd, "CLAUDE.md"), rule.content);
|
|
7734
7689
|
break;
|
|
7735
7690
|
}
|
|
7736
7691
|
case "agents-md": {
|
|
7737
|
-
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), content);
|
|
7692
|
+
await appendToFile((0, import_path8.join)(cwd, "AGENTS.md"), rule.content);
|
|
7738
7693
|
break;
|
|
7739
7694
|
}
|
|
7740
7695
|
case "cursor-rule": {
|
|
@@ -7744,7 +7699,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
|
|
|
7744
7699
|
}
|
|
7745
7700
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7746
7701
|
const filePath = (0, import_path8.join)(cwd, CURSOR_RULES_DIR, `${filename}.md`);
|
|
7747
|
-
await (0, import_promises7.writeFile)(filePath, content, "utf8");
|
|
7702
|
+
await (0, import_promises7.writeFile)(filePath, rule.content, "utf8");
|
|
7748
7703
|
break;
|
|
7749
7704
|
}
|
|
7750
7705
|
case "generic": {
|
|
@@ -7755,7 +7710,7 @@ async function writeRulesToFilesystem(cwd, rules, fetchFn) {
|
|
|
7755
7710
|
const dirPath = (0, import_path8.join)(cwd, directory);
|
|
7756
7711
|
await (0, import_promises7.mkdir)(dirPath, { recursive: true });
|
|
7757
7712
|
const filename = toRuleFilename(rule.name, i, nameCount);
|
|
7758
|
-
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), content, "utf8");
|
|
7713
|
+
await (0, import_promises7.writeFile)((0, import_path8.join)(dirPath, `${filename}.md`), rule.content, "utf8");
|
|
7759
7714
|
break;
|
|
7760
7715
|
}
|
|
7761
7716
|
default: {
|
|
@@ -7845,14 +7800,14 @@ function buildConversation(timestampedMessages) {
|
|
|
7845
7800
|
}
|
|
7846
7801
|
|
|
7847
7802
|
// src/run-scenario/agents/shared/trace-emit.ts
|
|
7848
|
-
var
|
|
7803
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
7849
7804
|
function emitTraceEvent(event, pushEvent) {
|
|
7850
|
-
console.log(`${
|
|
7805
|
+
console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
7851
7806
|
pushEvent?.(event);
|
|
7852
7807
|
}
|
|
7853
7808
|
|
|
7854
7809
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
7855
|
-
var DEFAULT_MODEL =
|
|
7810
|
+
var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
7856
7811
|
async function* buildPromptStream(triggerPrompt, images) {
|
|
7857
7812
|
yield {
|
|
7858
7813
|
type: "user",
|
|
@@ -7917,7 +7872,7 @@ function extractToolActionDescription(toolName, toolArgs) {
|
|
|
7917
7872
|
return `Using ${toolName}...`;
|
|
7918
7873
|
}
|
|
7919
7874
|
function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
7920
|
-
let type =
|
|
7875
|
+
let type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
|
|
7921
7876
|
let toolName;
|
|
7922
7877
|
let toolArgs;
|
|
7923
7878
|
let outputPreview;
|
|
@@ -7925,28 +7880,28 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
7925
7880
|
let thinking;
|
|
7926
7881
|
for (const block of message.message.content) {
|
|
7927
7882
|
if (block.type === "tool_use") {
|
|
7928
|
-
type =
|
|
7883
|
+
type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
|
|
7929
7884
|
toolName = block.name;
|
|
7930
7885
|
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
7931
7886
|
const input = block.input;
|
|
7932
7887
|
if (input.file_path || input.path || input.target_file) {
|
|
7933
7888
|
filePath = String(input.file_path || input.path || input.target_file);
|
|
7934
7889
|
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
7935
|
-
type =
|
|
7890
|
+
type = import_evalforge_types4.LiveTraceEventType.FILE_WRITE;
|
|
7936
7891
|
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
7937
|
-
type =
|
|
7892
|
+
type = import_evalforge_types4.LiveTraceEventType.FILE_READ;
|
|
7938
7893
|
}
|
|
7939
7894
|
}
|
|
7940
7895
|
} else if (block.type === "text") {
|
|
7941
7896
|
outputPreview = block.text.slice(0, 500);
|
|
7942
7897
|
if (!toolName) {
|
|
7943
|
-
type =
|
|
7898
|
+
type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
|
|
7944
7899
|
}
|
|
7945
7900
|
} else if (block.type === "thinking") {
|
|
7946
7901
|
const thinkingBlock = block;
|
|
7947
7902
|
thinking = thinkingBlock.thinking.slice(0, 500);
|
|
7948
7903
|
if (!outputPreview && !toolName) {
|
|
7949
|
-
type =
|
|
7904
|
+
type = import_evalforge_types4.LiveTraceEventType.THINKING;
|
|
7950
7905
|
}
|
|
7951
7906
|
}
|
|
7952
7907
|
}
|
|
@@ -8012,7 +7967,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8012
7967
|
}
|
|
8013
7968
|
return {
|
|
8014
7969
|
...baseEvent,
|
|
8015
|
-
type:
|
|
7970
|
+
type: import_evalforge_types4.LiveTraceEventType.TOOL_RESULT,
|
|
8016
7971
|
outputPreview: outputPreview || "(tool result)"
|
|
8017
7972
|
};
|
|
8018
7973
|
}
|
|
@@ -8020,7 +7975,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8020
7975
|
const sysMsg = message;
|
|
8021
7976
|
return {
|
|
8022
7977
|
...baseEvent,
|
|
8023
|
-
type:
|
|
7978
|
+
type: import_evalforge_types4.LiveTraceEventType.SYSTEM,
|
|
8024
7979
|
outputPreview: sysMsg.subtype || "system"
|
|
8025
7980
|
};
|
|
8026
7981
|
}
|
|
@@ -8029,7 +7984,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
8029
7984
|
}
|
|
8030
7985
|
return {
|
|
8031
7986
|
...baseEvent,
|
|
8032
|
-
type:
|
|
7987
|
+
type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
|
|
8033
7988
|
outputPreview: `Message type: ${message.type}`
|
|
8034
7989
|
};
|
|
8035
7990
|
}
|
|
@@ -8131,7 +8086,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8131
8086
|
queryOptions.systemPrompt = {
|
|
8132
8087
|
type: "preset",
|
|
8133
8088
|
preset: "claude_code",
|
|
8134
|
-
append:
|
|
8089
|
+
append: import_evalforge_types4.DEFAULT_EVALUATOR_SYSTEM_PROMPT
|
|
8135
8090
|
};
|
|
8136
8091
|
}
|
|
8137
8092
|
if (options.temperature !== void 0) {
|
|
@@ -8166,7 +8121,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8166
8121
|
targetId: traceContext.targetId,
|
|
8167
8122
|
targetName: traceContext.targetName,
|
|
8168
8123
|
stepNumber: 0,
|
|
8169
|
-
type:
|
|
8124
|
+
type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
|
|
8170
8125
|
outputPreview: JSON.stringify({
|
|
8171
8126
|
event: "pre-sdk-execution",
|
|
8172
8127
|
model: queryOptions.model,
|
|
@@ -8230,7 +8185,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8230
8185
|
targetId: traceContext.targetId,
|
|
8231
8186
|
targetName: traceContext.targetName,
|
|
8232
8187
|
stepNumber: traceStepNumber,
|
|
8233
|
-
type:
|
|
8188
|
+
type: import_evalforge_types4.LiveTraceEventType.PROGRESS,
|
|
8234
8189
|
outputPreview: progressMessage,
|
|
8235
8190
|
toolName: lastToolName,
|
|
8236
8191
|
filePath: lastFilePath,
|
|
@@ -8267,18 +8222,18 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8267
8222
|
if (traceEvent) {
|
|
8268
8223
|
lastToolName = traceEvent.toolName;
|
|
8269
8224
|
lastFilePath = traceEvent.filePath;
|
|
8270
|
-
if (traceEvent.type ===
|
|
8225
|
+
if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.THINKING) {
|
|
8271
8226
|
lastAction = "Thinking...";
|
|
8272
|
-
} else if (traceEvent.type ===
|
|
8227
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.TOOL_USE) {
|
|
8273
8228
|
lastAction = extractToolActionDescription(
|
|
8274
8229
|
traceEvent.toolName,
|
|
8275
8230
|
traceEvent.toolArgs
|
|
8276
8231
|
);
|
|
8277
|
-
} else if (traceEvent.type ===
|
|
8232
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_WRITE) {
|
|
8278
8233
|
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
8279
|
-
} else if (traceEvent.type ===
|
|
8234
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.FILE_READ) {
|
|
8280
8235
|
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
8281
|
-
} else if (traceEvent.type ===
|
|
8236
|
+
} else if (traceEvent.type === import_evalforge_types4.LiveTraceEventType.COMPLETION) {
|
|
8282
8237
|
lastAction = "Processing response...";
|
|
8283
8238
|
}
|
|
8284
8239
|
emitTraceEvent(traceEvent, traceContext.pushEvent);
|
|
@@ -8456,7 +8411,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
8456
8411
|
targetId: traceContext.targetId,
|
|
8457
8412
|
targetName: traceContext.targetName,
|
|
8458
8413
|
stepNumber: traceStepNumber + 1,
|
|
8459
|
-
type:
|
|
8414
|
+
type: import_evalforge_types4.LiveTraceEventType.DIAGNOSTIC,
|
|
8460
8415
|
outputPreview: JSON.stringify(
|
|
8461
8416
|
{
|
|
8462
8417
|
event: "sdk-execution-failed",
|
|
@@ -8490,7 +8445,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
8490
8445
|
targetId: traceContext.targetId,
|
|
8491
8446
|
targetName: traceContext.targetName,
|
|
8492
8447
|
stepNumber: traceStepNumber + 1,
|
|
8493
|
-
type:
|
|
8448
|
+
type: import_evalforge_types4.LiveTraceEventType.COMPLETION,
|
|
8494
8449
|
outputPreview: "Scenario execution completed",
|
|
8495
8450
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8496
8451
|
isComplete: true
|
|
@@ -8765,7 +8720,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8765
8720
|
stepNumber: 0,
|
|
8766
8721
|
// renumbered below
|
|
8767
8722
|
turnIndex,
|
|
8768
|
-
type:
|
|
8723
|
+
type: import_evalforge_types4.LLMStepType.THINKING,
|
|
8769
8724
|
model,
|
|
8770
8725
|
provider: "anthropic",
|
|
8771
8726
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8796,7 +8751,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8796
8751
|
id: (0, import_crypto2.randomUUID)(),
|
|
8797
8752
|
stepNumber: 0,
|
|
8798
8753
|
turnIndex,
|
|
8799
|
-
type:
|
|
8754
|
+
type: import_evalforge_types4.LLMStepType.TOOL_USE,
|
|
8800
8755
|
model,
|
|
8801
8756
|
provider: "anthropic",
|
|
8802
8757
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8826,7 +8781,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8826
8781
|
id: (0, import_crypto2.randomUUID)(),
|
|
8827
8782
|
stepNumber: 0,
|
|
8828
8783
|
turnIndex,
|
|
8829
|
-
type:
|
|
8784
|
+
type: import_evalforge_types4.LLMStepType.COMPLETION,
|
|
8830
8785
|
model,
|
|
8831
8786
|
provider: "anthropic",
|
|
8832
8787
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -8843,7 +8798,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8843
8798
|
});
|
|
8844
8799
|
}
|
|
8845
8800
|
if (subSteps.length === 0) {
|
|
8846
|
-
const stepType = hasThinking && !hasText ?
|
|
8801
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
|
|
8847
8802
|
subSteps.push({
|
|
8848
8803
|
id: (0, import_crypto2.randomUUID)(),
|
|
8849
8804
|
stepNumber: 0,
|
|
@@ -8913,7 +8868,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
8913
8868
|
var ClaudeCodeAdapter = class {
|
|
8914
8869
|
id = "claude-code";
|
|
8915
8870
|
name = "Claude Code";
|
|
8916
|
-
supportedCommands = [
|
|
8871
|
+
supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
|
|
8917
8872
|
/**
|
|
8918
8873
|
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
8919
8874
|
* before the baseline snapshot is taken.
|
|
@@ -8945,9 +8900,9 @@ var ClaudeCodeAdapter = class {
|
|
|
8945
8900
|
rules,
|
|
8946
8901
|
systemPrompt
|
|
8947
8902
|
} = context;
|
|
8948
|
-
const typed = config ?
|
|
8903
|
+
const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
8949
8904
|
const cfg = typed?.success ? typed.data : void 0;
|
|
8950
|
-
const schemaKeys = new Set(Object.keys(
|
|
8905
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
|
|
8951
8906
|
const extras = {};
|
|
8952
8907
|
if (config) {
|
|
8953
8908
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -9002,11 +8957,11 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
9002
8957
|
defaultRegistry.register(claudeCodeAdapter);
|
|
9003
8958
|
|
|
9004
8959
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
9005
|
-
var
|
|
8960
|
+
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
9006
8961
|
|
|
9007
8962
|
// src/run-scenario/agents/opencode/execute.ts
|
|
9008
8963
|
var import_child_process2 = require("child_process");
|
|
9009
|
-
var
|
|
8964
|
+
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
9010
8965
|
|
|
9011
8966
|
// src/run-scenario/agents/opencode/types.ts
|
|
9012
8967
|
function tryParseJson(text) {
|
|
@@ -9020,28 +8975,49 @@ function tryParseJson(text) {
|
|
|
9020
8975
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
9021
8976
|
var import_promises8 = require("fs/promises");
|
|
9022
8977
|
var import_path9 = require("path");
|
|
9023
|
-
|
|
8978
|
+
var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
|
|
8979
|
+
async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
|
|
9024
8980
|
await Promise.all(
|
|
9025
8981
|
skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
|
|
9026
8982
|
);
|
|
9027
8983
|
}
|
|
9028
8984
|
async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
|
|
9029
|
-
const
|
|
8985
|
+
const skillName = skill.name;
|
|
8986
|
+
const skillDir = (0, import_path9.join)(cwd, ".opencode", "skills", skillName);
|
|
9030
8987
|
await (0, import_promises8.mkdir)(skillDir, { recursive: true });
|
|
9031
|
-
|
|
9032
|
-
|
|
9033
|
-
await writeFilesToDirectory(skillDir, files);
|
|
9034
|
-
|
|
9035
|
-
|
|
9036
|
-
throw new Error(
|
|
9037
|
-
`Failed to write skill ${skill.name} to filesystem: ${message}`
|
|
8988
|
+
const version = skill.latestVersion;
|
|
8989
|
+
if (version?.files && version.files.length > 0) {
|
|
8990
|
+
await writeFilesToDirectory(skillDir, version.files);
|
|
8991
|
+
console.log(
|
|
8992
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
9038
8993
|
);
|
|
8994
|
+
} else if (skill.source) {
|
|
8995
|
+
try {
|
|
8996
|
+
const files = await fetchFn(skill.source, {
|
|
8997
|
+
userAgent: "EvalForge-Evaluator"
|
|
8998
|
+
});
|
|
8999
|
+
await writeFilesToDirectory(skillDir, files);
|
|
9000
|
+
console.log(
|
|
9001
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
9002
|
+
);
|
|
9003
|
+
} catch (error) {
|
|
9004
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9005
|
+
console.error(
|
|
9006
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
9007
|
+
);
|
|
9008
|
+
throw new Error(
|
|
9009
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
9010
|
+
);
|
|
9011
|
+
}
|
|
9012
|
+
} else {
|
|
9013
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
9039
9014
|
}
|
|
9040
9015
|
}
|
|
9041
9016
|
|
|
9042
9017
|
// src/run-scenario/agents/opencode/write-sub-agents.ts
|
|
9043
9018
|
var import_promises9 = require("fs/promises");
|
|
9044
9019
|
var import_path10 = require("path");
|
|
9020
|
+
var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
|
|
9045
9021
|
var AGENTS_DIR2 = ".opencode/agents";
|
|
9046
9022
|
function toAgentFilename2(name, index, nameCount) {
|
|
9047
9023
|
const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -9049,7 +9025,34 @@ function toAgentFilename2(name, index, nameCount) {
|
|
|
9049
9025
|
nameCount.set(base, count + 1);
|
|
9050
9026
|
return count === 0 ? base : `${base}-${count + 1}`;
|
|
9051
9027
|
}
|
|
9052
|
-
async function
|
|
9028
|
+
async function resolveSubAgentContent2(agent, fetchFn) {
|
|
9029
|
+
if (agent.source) {
|
|
9030
|
+
try {
|
|
9031
|
+
const content = await fetchFn(agent.source, {
|
|
9032
|
+
userAgent: "EvalForge-Evaluator"
|
|
9033
|
+
});
|
|
9034
|
+
console.log(
|
|
9035
|
+
`[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
|
|
9036
|
+
);
|
|
9037
|
+
return content;
|
|
9038
|
+
} catch (error) {
|
|
9039
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
9040
|
+
console.error(
|
|
9041
|
+
`[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
|
|
9042
|
+
);
|
|
9043
|
+
throw new Error(
|
|
9044
|
+
`Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
|
|
9045
|
+
);
|
|
9046
|
+
}
|
|
9047
|
+
}
|
|
9048
|
+
if (!agent.subAgentMd) {
|
|
9049
|
+
console.warn(
|
|
9050
|
+
`[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
|
|
9051
|
+
);
|
|
9052
|
+
}
|
|
9053
|
+
return agent.subAgentMd;
|
|
9054
|
+
}
|
|
9055
|
+
async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
|
|
9053
9056
|
if (subAgents.length === 0) return;
|
|
9054
9057
|
const agentsDir = (0, import_path10.join)(cwd, AGENTS_DIR2);
|
|
9055
9058
|
await (0, import_promises9.mkdir)(agentsDir, { recursive: true });
|
|
@@ -9057,7 +9060,7 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
|
|
|
9057
9060
|
for (const [i, agent] of subAgents.entries()) {
|
|
9058
9061
|
const filename = toAgentFilename2(agent.name, i, nameCount);
|
|
9059
9062
|
const filePath = (0, import_path10.join)(agentsDir, `${filename}.md`);
|
|
9060
|
-
const content = await
|
|
9063
|
+
const content = await resolveSubAgentContent2(agent, fetchFn);
|
|
9061
9064
|
await (0, import_promises9.writeFile)(filePath, content, "utf8");
|
|
9062
9065
|
}
|
|
9063
9066
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
@@ -9065,8 +9068,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn) {
|
|
|
9065
9068
|
|
|
9066
9069
|
// src/run-scenario/agents/opencode/config.ts
|
|
9067
9070
|
var import_os3 = require("os");
|
|
9068
|
-
var
|
|
9069
|
-
var DEFAULT_MODEL2 = `${
|
|
9071
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
9072
|
+
var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
9070
9073
|
var OPENCODE_MODEL_ALIASES = {
|
|
9071
9074
|
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
9072
9075
|
"claude-opus-4": "claude-opus-4-0"
|
|
@@ -9082,10 +9085,10 @@ function parseModel(model) {
|
|
|
9082
9085
|
};
|
|
9083
9086
|
}
|
|
9084
9087
|
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
9085
|
-
const isOpenAI =
|
|
9088
|
+
const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
9086
9089
|
model
|
|
9087
9090
|
);
|
|
9088
|
-
const isGemini =
|
|
9091
|
+
const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
9089
9092
|
model
|
|
9090
9093
|
);
|
|
9091
9094
|
if (isGemini) return { providerID: "google", modelID };
|
|
@@ -9154,7 +9157,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9154
9157
|
if (options.mcps && options.mcps.length > 0) {
|
|
9155
9158
|
const mcpServers = {};
|
|
9156
9159
|
for (const mcpEntity of options.mcps) {
|
|
9157
|
-
const entityConfig =
|
|
9160
|
+
const entityConfig = mcpEntity.config;
|
|
9158
9161
|
for (const [key, value] of Object.entries(entityConfig)) {
|
|
9159
9162
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
9160
9163
|
throw new Error(
|
|
@@ -9179,7 +9182,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9179
9182
|
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
9180
9183
|
agentOverrides.maxSteps = options.maxTurns;
|
|
9181
9184
|
}
|
|
9182
|
-
const parsed = options.config ?
|
|
9185
|
+
const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
9183
9186
|
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
9184
9187
|
const defaultPermission = {
|
|
9185
9188
|
"*": "allow"
|
|
@@ -9221,7 +9224,7 @@ async function buildOpenCodeEnv(options) {
|
|
|
9221
9224
|
}
|
|
9222
9225
|
|
|
9223
9226
|
// src/run-scenario/agents/opencode/build-trace.ts
|
|
9224
|
-
var
|
|
9227
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
9225
9228
|
var import_crypto3 = require("crypto");
|
|
9226
9229
|
function toCanonicalModelId(modelId) {
|
|
9227
9230
|
const slashIndex = modelId.indexOf("/");
|
|
@@ -9301,7 +9304,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9301
9304
|
id: (0, import_crypto3.randomUUID)(),
|
|
9302
9305
|
stepNumber: 0,
|
|
9303
9306
|
turnIndex,
|
|
9304
|
-
type:
|
|
9307
|
+
type: import_evalforge_types7.LLMStepType.THINKING,
|
|
9305
9308
|
model: stepModel,
|
|
9306
9309
|
provider: stepProvider,
|
|
9307
9310
|
startedAt,
|
|
@@ -9330,7 +9333,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9330
9333
|
id: (0, import_crypto3.randomUUID)(),
|
|
9331
9334
|
stepNumber: 0,
|
|
9332
9335
|
turnIndex,
|
|
9333
|
-
type:
|
|
9336
|
+
type: import_evalforge_types7.LLMStepType.TOOL_USE,
|
|
9334
9337
|
model: stepModel,
|
|
9335
9338
|
provider: stepProvider,
|
|
9336
9339
|
startedAt,
|
|
@@ -9360,7 +9363,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9360
9363
|
id: (0, import_crypto3.randomUUID)(),
|
|
9361
9364
|
stepNumber: 0,
|
|
9362
9365
|
turnIndex,
|
|
9363
|
-
type:
|
|
9366
|
+
type: import_evalforge_types7.LLMStepType.COMPLETION,
|
|
9364
9367
|
model: stepModel,
|
|
9365
9368
|
provider: stepProvider,
|
|
9366
9369
|
startedAt,
|
|
@@ -9377,7 +9380,7 @@ function buildLLMTrace(timestampedEvents, totalDurationMs, model, provider, exec
|
|
|
9377
9380
|
});
|
|
9378
9381
|
}
|
|
9379
9382
|
if (subSteps.length === 0) {
|
|
9380
|
-
const stepType = hasThinking && !hasText ?
|
|
9383
|
+
const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
|
|
9381
9384
|
subSteps.push({
|
|
9382
9385
|
id: (0, import_crypto3.randomUUID)(),
|
|
9383
9386
|
stepNumber: 0,
|
|
@@ -9578,14 +9581,14 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9578
9581
|
const te = evt;
|
|
9579
9582
|
return {
|
|
9580
9583
|
...base,
|
|
9581
|
-
type:
|
|
9584
|
+
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
9582
9585
|
outputPreview: te.part.text.slice(0, 500)
|
|
9583
9586
|
};
|
|
9584
9587
|
}
|
|
9585
9588
|
case "reasoning":
|
|
9586
9589
|
return {
|
|
9587
9590
|
...base,
|
|
9588
|
-
type:
|
|
9591
|
+
type: import_evalforge_types8.LiveTraceEventType.THINKING,
|
|
9589
9592
|
thinking: evt.part.text.slice(0, 500)
|
|
9590
9593
|
};
|
|
9591
9594
|
case "tool_use": {
|
|
@@ -9593,15 +9596,15 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9593
9596
|
const toolName = tu.part.tool;
|
|
9594
9597
|
const args = tu.part.state.input;
|
|
9595
9598
|
const toolArgs = JSON.stringify(args).slice(0, 500);
|
|
9596
|
-
let type =
|
|
9599
|
+
let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
|
|
9597
9600
|
let filePath;
|
|
9598
9601
|
if (args) {
|
|
9599
9602
|
if (args.file_path || args.path || args.target_file) {
|
|
9600
9603
|
filePath = String(args.file_path || args.path || args.target_file);
|
|
9601
9604
|
if (/write|edit/i.test(toolName)) {
|
|
9602
|
-
type =
|
|
9605
|
+
type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
|
|
9603
9606
|
} else if (/read|view/i.test(toolName)) {
|
|
9604
|
-
type =
|
|
9607
|
+
type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
|
|
9605
9608
|
}
|
|
9606
9609
|
}
|
|
9607
9610
|
}
|
|
@@ -9610,7 +9613,7 @@ function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
|
9610
9613
|
case "step_finish":
|
|
9611
9614
|
return {
|
|
9612
9615
|
...base,
|
|
9613
|
-
type:
|
|
9616
|
+
type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
|
|
9614
9617
|
outputPreview: "Step completed"
|
|
9615
9618
|
};
|
|
9616
9619
|
default:
|
|
@@ -9641,7 +9644,7 @@ async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
|
9641
9644
|
} else if (options.systemPrompt != null) {
|
|
9642
9645
|
systemPrompt = options.systemPrompt;
|
|
9643
9646
|
} else {
|
|
9644
|
-
systemPrompt =
|
|
9647
|
+
systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
|
|
9645
9648
|
}
|
|
9646
9649
|
if (systemPrompt) {
|
|
9647
9650
|
await writeSystemPromptRule(cwd, systemPrompt);
|
|
@@ -9833,7 +9836,7 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9833
9836
|
targetId: traceContext.targetId,
|
|
9834
9837
|
targetName: traceContext.targetName,
|
|
9835
9838
|
stepNumber: traceStepNumber,
|
|
9836
|
-
type:
|
|
9839
|
+
type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
|
|
9837
9840
|
outputPreview: progressMessage,
|
|
9838
9841
|
toolName: lastToolName,
|
|
9839
9842
|
filePath: lastFilePath,
|
|
@@ -9867,18 +9870,18 @@ function spawnOpenCodeProcess(opts) {
|
|
|
9867
9870
|
if (traceEvt) {
|
|
9868
9871
|
lastToolName = traceEvt.toolName;
|
|
9869
9872
|
lastFilePath = traceEvt.filePath;
|
|
9870
|
-
if (traceEvt.type ===
|
|
9873
|
+
if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
|
|
9871
9874
|
lastAction = "Thinking...";
|
|
9872
|
-
} else if (traceEvt.type ===
|
|
9875
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
|
|
9873
9876
|
lastAction = extractToolAction(
|
|
9874
9877
|
traceEvt.toolName ?? "",
|
|
9875
9878
|
void 0
|
|
9876
9879
|
);
|
|
9877
|
-
} else if (traceEvt.type ===
|
|
9880
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
|
|
9878
9881
|
lastAction = `Writing: ${traceEvt.filePath || "file"}`;
|
|
9879
|
-
} else if (traceEvt.type ===
|
|
9882
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
|
|
9880
9883
|
lastAction = `Reading: ${traceEvt.filePath || "file"}`;
|
|
9881
|
-
} else if (traceEvt.type ===
|
|
9884
|
+
} else if (traceEvt.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
|
|
9882
9885
|
lastAction = "Processing response...";
|
|
9883
9886
|
}
|
|
9884
9887
|
emitTraceEvent(traceEvt, traceContext.pushEvent);
|
|
@@ -9960,7 +9963,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
9960
9963
|
targetId: traceContext.targetId,
|
|
9961
9964
|
targetName: traceContext.targetName,
|
|
9962
9965
|
stepNumber: 0,
|
|
9963
|
-
type:
|
|
9966
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
9964
9967
|
outputPreview: JSON.stringify({
|
|
9965
9968
|
event: "pre-cli-execution",
|
|
9966
9969
|
model: `${providerID}/${modelID}`,
|
|
@@ -10014,7 +10017,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10014
10017
|
targetId: traceContext.targetId,
|
|
10015
10018
|
targetName: traceContext.targetName,
|
|
10016
10019
|
stepNumber: traceStepNumber + 1,
|
|
10017
|
-
type:
|
|
10020
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
10018
10021
|
outputPreview: JSON.stringify({
|
|
10019
10022
|
event: "idle-timeout-retry",
|
|
10020
10023
|
attempt,
|
|
@@ -10058,7 +10061,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
10058
10061
|
targetId: traceContext.targetId,
|
|
10059
10062
|
targetName: traceContext.targetName,
|
|
10060
10063
|
stepNumber: traceStepNumber + 1,
|
|
10061
|
-
type:
|
|
10064
|
+
type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
|
|
10062
10065
|
outputPreview: JSON.stringify({
|
|
10063
10066
|
event: "cli-execution-failed",
|
|
10064
10067
|
error: lastAttemptResult.error?.message ?? "Unknown error",
|
|
@@ -10113,7 +10116,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10113
10116
|
targetId: traceContext.targetId,
|
|
10114
10117
|
targetName: traceContext.targetName,
|
|
10115
10118
|
stepNumber: traceStepNumber + 1,
|
|
10116
|
-
type:
|
|
10119
|
+
type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
|
|
10117
10120
|
outputPreview: "Scenario execution completed",
|
|
10118
10121
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10119
10122
|
isComplete: true
|
|
@@ -10150,7 +10153,7 @@ Attempt: ${attempt}, Events: ${accumulatedEvents.length}`
|
|
|
10150
10153
|
var OpenCodeAdapter = class {
|
|
10151
10154
|
id = "opencode";
|
|
10152
10155
|
name = "OpenCode";
|
|
10153
|
-
supportedCommands = [
|
|
10156
|
+
supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
|
|
10154
10157
|
async prepareEnvironment(context) {
|
|
10155
10158
|
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
10156
10159
|
mcps: context.mcps,
|
|
@@ -10173,7 +10176,7 @@ var OpenCodeAdapter = class {
|
|
|
10173
10176
|
rules,
|
|
10174
10177
|
systemPrompt
|
|
10175
10178
|
} = context;
|
|
10176
|
-
const typed = config ?
|
|
10179
|
+
const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10177
10180
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10178
10181
|
const rawMaxTurns = cfg?.maxTurns;
|
|
10179
10182
|
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
@@ -10223,7 +10226,7 @@ var import_ai = require("ai");
|
|
|
10223
10226
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
10224
10227
|
var import_google = require("@ai-sdk/google");
|
|
10225
10228
|
var import_openai = require("@ai-sdk/openai");
|
|
10226
|
-
var
|
|
10229
|
+
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
10227
10230
|
var import_crypto4 = require("crypto");
|
|
10228
10231
|
|
|
10229
10232
|
// src/run-scenario/agents/simple-agent/mcp-tools.ts
|
|
@@ -10320,7 +10323,7 @@ function extractErrorText(content) {
|
|
|
10320
10323
|
}
|
|
10321
10324
|
|
|
10322
10325
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
10323
|
-
var
|
|
10326
|
+
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
10324
10327
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
10325
10328
|
var PROVIDER_GEMINI = "gemini";
|
|
10326
10329
|
var MODEL_PRICING = {
|
|
@@ -10389,7 +10392,7 @@ function extractGatewayCost(step, provider) {
|
|
|
10389
10392
|
}
|
|
10390
10393
|
}
|
|
10391
10394
|
function calculateFromPricing(modelId, tokenUsage) {
|
|
10392
|
-
const normalized = (0,
|
|
10395
|
+
const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
|
|
10393
10396
|
const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
|
|
10394
10397
|
if (!pricing) return 0;
|
|
10395
10398
|
return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
|
|
@@ -10482,7 +10485,7 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10482
10485
|
apiKey: "proxy-auth",
|
|
10483
10486
|
headers
|
|
10484
10487
|
});
|
|
10485
|
-
if ([...
|
|
10488
|
+
if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10486
10489
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10487
10490
|
)) {
|
|
10488
10491
|
return openai.responses(modelId);
|
|
@@ -10490,12 +10493,12 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
10490
10493
|
return openai.chat(modelId);
|
|
10491
10494
|
}
|
|
10492
10495
|
function isClaudeModelId(modelId) {
|
|
10493
|
-
return
|
|
10496
|
+
return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
|
|
10494
10497
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10495
10498
|
);
|
|
10496
10499
|
}
|
|
10497
10500
|
function isGeminiModelId(modelId) {
|
|
10498
|
-
return
|
|
10501
|
+
return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
|
|
10499
10502
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10500
10503
|
);
|
|
10501
10504
|
}
|
|
@@ -10515,9 +10518,9 @@ async function executeWithAiSdk(context) {
|
|
|
10515
10518
|
mcps,
|
|
10516
10519
|
traceContext
|
|
10517
10520
|
} = context;
|
|
10518
|
-
const typed = config ?
|
|
10521
|
+
const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
10519
10522
|
const cfg = typed?.success ? typed.data : void 0;
|
|
10520
|
-
const schemaKeys = new Set(Object.keys(
|
|
10523
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
|
|
10521
10524
|
const configExtras = {};
|
|
10522
10525
|
if (config) {
|
|
10523
10526
|
for (const [key, value] of Object.entries(config)) {
|
|
@@ -10554,11 +10557,11 @@ async function executeWithAiSdk(context) {
|
|
|
10554
10557
|
}, SDK_TIMEOUT_MS);
|
|
10555
10558
|
try {
|
|
10556
10559
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
10557
|
-
const isResponsesAPI = [...
|
|
10560
|
+
const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
10558
10561
|
(id) => modelId === id || modelId.startsWith(id)
|
|
10559
10562
|
);
|
|
10560
10563
|
const isGemini = provider === PROVIDER_GEMINI2;
|
|
10561
|
-
const isGeminiThinking = isGemini &&
|
|
10564
|
+
const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
10562
10565
|
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
10563
10566
|
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
10564
10567
|
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
@@ -10637,7 +10640,7 @@ async function executeWithAiSdk(context) {
|
|
|
10637
10640
|
targetId: traceContext.targetId,
|
|
10638
10641
|
targetName: traceContext.targetName,
|
|
10639
10642
|
stepNumber: stepTimestamps.length,
|
|
10640
|
-
type: isToolStep ?
|
|
10643
|
+
type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
10641
10644
|
toolName: firstToolCall?.toolName,
|
|
10642
10645
|
toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
|
|
10643
10646
|
outputPreview: step.text?.slice(0, 500),
|
|
@@ -10842,7 +10845,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
|
|
|
10842
10845
|
id: (0, import_crypto4.randomUUID)(),
|
|
10843
10846
|
stepNumber: i + 1,
|
|
10844
10847
|
turnIndex: i,
|
|
10845
|
-
type: step.toolCalls.length > 0 ?
|
|
10848
|
+
type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
|
|
10846
10849
|
model: modelId,
|
|
10847
10850
|
provider,
|
|
10848
10851
|
startedAt: new Date(stepStartedAt).toISOString(),
|
|
@@ -10892,7 +10895,7 @@ function emitStartEvent(traceContext, startTime) {
|
|
|
10892
10895
|
targetId: traceContext.targetId,
|
|
10893
10896
|
targetName: traceContext.targetName,
|
|
10894
10897
|
stepNumber: 0,
|
|
10895
|
-
type:
|
|
10898
|
+
type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
|
|
10896
10899
|
outputPreview: "Starting Simple Agent execution...",
|
|
10897
10900
|
elapsedMs: Date.now() - startTime,
|
|
10898
10901
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -10910,7 +10913,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
|
|
|
10910
10913
|
targetId: traceContext.targetId,
|
|
10911
10914
|
targetName: traceContext.targetName,
|
|
10912
10915
|
stepNumber,
|
|
10913
|
-
type:
|
|
10916
|
+
type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
|
|
10914
10917
|
outputPreview: "Scenario execution completed",
|
|
10915
10918
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10916
10919
|
isComplete: true
|
|
@@ -11680,11 +11683,11 @@ function substituteVariables(prompt, variables) {
|
|
|
11680
11683
|
}
|
|
11681
11684
|
|
|
11682
11685
|
// src/run-scenario/run-agent-with-context.ts
|
|
11683
|
-
var
|
|
11684
|
-
var DEFAULT_AGENT_COMMAND =
|
|
11686
|
+
var import_evalforge_types12 = require("@wix/evalforge-types");
|
|
11687
|
+
var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
|
|
11685
11688
|
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir, pushEvent) {
|
|
11686
11689
|
const agent = evalData.agent ?? void 0;
|
|
11687
|
-
const isSDK = agent?.agentType ===
|
|
11690
|
+
const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
|
|
11688
11691
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
11689
11692
|
const adapter = getAdapter(identifier);
|
|
11690
11693
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -11769,14 +11772,14 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11769
11772
|
const targetName = evalData.presetName ?? evalData.agent?.name ?? "";
|
|
11770
11773
|
if (template) {
|
|
11771
11774
|
console.log(
|
|
11772
|
-
(0,
|
|
11775
|
+
(0, import_evalforge_types13.formatTraceEventLine)({
|
|
11773
11776
|
evalRunId: evalRunId2,
|
|
11774
11777
|
scenarioId: scenario.id,
|
|
11775
11778
|
scenarioName: scenario.name,
|
|
11776
11779
|
targetId,
|
|
11777
11780
|
targetName,
|
|
11778
11781
|
stepNumber: 0,
|
|
11779
|
-
type:
|
|
11782
|
+
type: import_evalforge_types13.LiveTraceEventType.PROGRESS,
|
|
11780
11783
|
outputPreview: "Setting up environment (installing dependencies)...",
|
|
11781
11784
|
elapsedMs: 0,
|
|
11782
11785
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
@@ -11816,7 +11819,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11816
11819
|
})),
|
|
11817
11820
|
durationMs: partialResult.duration
|
|
11818
11821
|
};
|
|
11819
|
-
const defaultJudgeModel =
|
|
11822
|
+
const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
|
|
11820
11823
|
const assertionContext = {
|
|
11821
11824
|
workDir,
|
|
11822
11825
|
defaultJudgeModel,
|
|
@@ -11831,10 +11834,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
11831
11834
|
assertionContext
|
|
11832
11835
|
) : [];
|
|
11833
11836
|
const passed = assertionResults.filter(
|
|
11834
|
-
(r) => r.status ===
|
|
11837
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
|
|
11835
11838
|
).length;
|
|
11836
11839
|
const failed = assertionResults.filter(
|
|
11837
|
-
(r) => r.status ===
|
|
11840
|
+
(r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
|
|
11838
11841
|
).length;
|
|
11839
11842
|
const total = assertionResults.length;
|
|
11840
11843
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -11910,7 +11913,7 @@ async function runEvaluationLoop(scenarioItems, evalData, callbacks) {
|
|
|
11910
11913
|
}
|
|
11911
11914
|
|
|
11912
11915
|
// src/error-reporter.ts
|
|
11913
|
-
var
|
|
11916
|
+
var import_evalforge_types14 = require("@wix/evalforge-types");
|
|
11914
11917
|
function formatError(error, phase, context) {
|
|
11915
11918
|
const timestamp2 = (/* @__PURE__ */ new Date()).toISOString();
|
|
11916
11919
|
if (error instanceof Error) {
|
|
@@ -12153,7 +12156,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
12153
12156
|
totalExecutions
|
|
12154
12157
|
};
|
|
12155
12158
|
const allFailed = totalExecutions > 0 && erroredExecutions === totalExecutions;
|
|
12156
|
-
const finalStatus = allFailed ?
|
|
12159
|
+
const finalStatus = allFailed ? import_evalforge_types15.EvalStatus.FAILED : import_evalforge_types15.EvalStatus.COMPLETED;
|
|
12157
12160
|
const jobErrorOnAllFailed = allFailed ? truncateForJobError(
|
|
12158
12161
|
firstErrorMessage ?? `All ${totalExecutions} executions errored without an error message`
|
|
12159
12162
|
) : void 0;
|
|
@@ -12207,7 +12210,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12207
12210
|
grpcAuthToken: config.grpcAuthToken
|
|
12208
12211
|
});
|
|
12209
12212
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12210
|
-
status:
|
|
12213
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
12211
12214
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12212
12215
|
jobError,
|
|
12213
12216
|
jobStatus: "FAILED"
|
|
@@ -12232,7 +12235,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
12232
12235
|
grpcAuthToken
|
|
12233
12236
|
});
|
|
12234
12237
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
12235
|
-
status:
|
|
12238
|
+
status: import_evalforge_types15.EvalStatus.FAILED,
|
|
12236
12239
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
12237
12240
|
jobError: `Config load failed, then: ${jobError}`,
|
|
12238
12241
|
jobStatus: "FAILED"
|