@wix/evalforge-evaluator 0.56.0 → 0.58.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +142 -57
- package/build/index.js.map +4 -4
- package/build/index.mjs +112 -27
- package/build/index.mjs.map +4 -4
- package/build/types/api-client.d.ts +3 -1
- package/build/types/fetch-evaluation-data.d.ts +4 -2
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +1 -1
- package/build/types/run-scenario/agents/claude-code/index.d.ts +1 -1
- package/build/types/run-scenario/agents/claude-code/types.d.ts +5 -11
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +12 -0
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +12 -0
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +4 -6
- package/package.json +3 -3
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -166,6 +166,12 @@ function createApiClient(serverUrl, options = "") {
|
|
|
166
166
|
getTemplate(projectId2, id) {
|
|
167
167
|
return fetchJson(`/projects/${projectId2}/templates/${id}`);
|
|
168
168
|
},
|
|
169
|
+
getMcp(projectId2, id) {
|
|
170
|
+
return fetchJson(`/projects/${projectId2}/mcps/${id}`);
|
|
171
|
+
},
|
|
172
|
+
getSubAgent(projectId2, id) {
|
|
173
|
+
return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
|
|
174
|
+
},
|
|
169
175
|
getAssertion(projectId2, id) {
|
|
170
176
|
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
171
177
|
},
|
|
@@ -296,6 +302,18 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
296
302
|
);
|
|
297
303
|
}
|
|
298
304
|
}
|
|
305
|
+
let mcps = [];
|
|
306
|
+
if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
|
|
307
|
+
mcps = await Promise.all(
|
|
308
|
+
evalRun.mcpIds.map((id) => api.getMcp(projectId2, id))
|
|
309
|
+
);
|
|
310
|
+
}
|
|
311
|
+
let subAgents = [];
|
|
312
|
+
if (evalRun.subAgentIds && evalRun.subAgentIds.length > 0) {
|
|
313
|
+
subAgents = await Promise.all(
|
|
314
|
+
evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
|
|
315
|
+
);
|
|
316
|
+
}
|
|
299
317
|
const templateIds = [
|
|
300
318
|
...new Set(
|
|
301
319
|
scenarios.map((s) => s.templateId).filter((id) => !!id)
|
|
@@ -345,12 +363,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
345
363
|
skills,
|
|
346
364
|
skillsGroup,
|
|
347
365
|
skillsGroupName,
|
|
366
|
+
mcps,
|
|
367
|
+
subAgents,
|
|
348
368
|
scenarioItems
|
|
349
369
|
};
|
|
350
370
|
}
|
|
351
371
|
|
|
352
372
|
// src/run-scenario/index.ts
|
|
353
|
-
var
|
|
373
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
354
374
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
355
375
|
|
|
356
376
|
// src/run-scenario/environment.ts
|
|
@@ -6173,9 +6193,10 @@ function cleanAppleDoubleFiles(dir) {
|
|
|
6173
6193
|
}
|
|
6174
6194
|
}
|
|
6175
6195
|
async function downloadAndExtractTemplate(template, workDir) {
|
|
6176
|
-
if (
|
|
6177
|
-
(0, import_fs5.
|
|
6196
|
+
if ((0, import_fs5.existsSync)(workDir)) {
|
|
6197
|
+
(0, import_fs5.rmSync)(workDir, { recursive: true });
|
|
6178
6198
|
}
|
|
6199
|
+
(0, import_fs5.mkdirSync)(workDir, { recursive: true });
|
|
6179
6200
|
const response = await fetch(template.downloadUrl);
|
|
6180
6201
|
if (!response.ok) {
|
|
6181
6202
|
throw new Error(
|
|
@@ -6339,16 +6360,61 @@ function getAdapter(runCommand) {
|
|
|
6339
6360
|
}
|
|
6340
6361
|
|
|
6341
6362
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
6342
|
-
var
|
|
6363
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
6343
6364
|
|
|
6344
6365
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
6345
|
-
var
|
|
6366
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
6346
6367
|
var import_crypto = require("crypto");
|
|
6368
|
+
var import_promises5 = require("fs/promises");
|
|
6369
|
+
var import_path7 = require("path");
|
|
6370
|
+
|
|
6371
|
+
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
6347
6372
|
var import_promises3 = require("fs/promises");
|
|
6348
6373
|
var import_path5 = require("path");
|
|
6374
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
6375
|
+
/**
 * Serialize the given MCP definitions into a single `.mcp.json` file in `cwd`.
 *
 * Each entry's `config` is stored under its `name`; the resulting map is
 * wrapped under the key exported as `MCP_SERVERS_JSON_KEY` and written as
 * 2-space-indented JSON. When two entries share a name, the later one wins.
 *
 * @param {string} cwd - Directory that receives the `.mcp.json` file.
 * @param {Array<{name: string, config: unknown}>} mcps - MCP entries to write.
 * @returns {Promise<void>} Resolves after the file is written; resolves
 *   immediately without touching the filesystem when `mcps` is empty.
 */
async function writeMcpToFilesystem(cwd, mcps) {
  if (mcps.length === 0) {
    return;
  }
  const serversByName = {};
  for (const { name, config } of mcps) {
    serversByName[name] = config;
  }
  const payload = {
    [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: serversByName
  };
  const serialized = JSON.stringify(payload, null, 2);
  const target = (0, import_path5.join)(cwd, ".mcp.json");
  await (0, import_promises3.writeFile)(target, serialized, "utf8");
  console.log(`[MCP] Written to ${target}`);
}
|
|
6390
|
+
|
|
6391
|
+
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
6392
|
+
var import_promises4 = require("fs/promises");
|
|
6393
|
+
var import_path6 = require("path");
|
|
6394
|
+
var AGENTS_DIR = ".claude/agents";
|
|
6395
|
+
/**
 * Derive a unique, filesystem-friendly filename stem (no extension) for a
 * sub-agent.
 *
 * The name is lower-cased, whitespace runs become "-", every character outside
 * `[a-z0-9-]` is dropped and leading/trailing dashes are trimmed. If nothing
 * survives, `sub-agent-${index}` is used. The first use of a stem is returned
 * as-is; repeats get a numeric suffix starting at `-2`.
 *
 * @param {string} name2 - Display name of the sub-agent (may be empty or nullish).
 * @param {number} index - Position of the agent in its list; only used for the
 *   fallback name.
 * @param {Map<string, number>} nameCount - Bookkeeping shared across one batch
 *   of calls. Mutated: it tracks, per stem, the highest suffix issued so far and
 *   also contains every filename already handed out.
 * @returns {string} A stem that has not been returned before for this
 *   `nameCount`.
 */
function toAgentFilename(name2, index, nameCount) {
  const slug = (name2 || "")
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "")
    .replace(/^-+|-+$/g, "");
  const base = slug || `sub-agent-${index}`;

  const used = nameCount.get(base) ?? 0;
  if (used === 0) {
    nameCount.set(base, 1);
    return base;
  }

  // A suffixed candidate such as "foo-2" may already belong to an agent whose
  // own name slugifies to "foo-2"; skip ahead until the candidate is free so
  // two agents never end up writing to the same file.
  let suffix = used + 1;
  while (nameCount.has(`${base}-${suffix}`)) {
    suffix += 1;
  }
  const filename = `${base}-${suffix}`;
  nameCount.set(base, suffix);
  // Register the issued name too, so a later agent literally named like it
  // is treated as a repeat instead of reusing the same stem.
  nameCount.set(filename, 1);
  return filename;
}
|
|
6401
|
+
/**
 * Write one Markdown file per sub-agent into `<cwd>/.claude/agents`.
 *
 * The directory is created recursively if needed. Filenames come from
 * `toAgentFilename`, sharing a single name-tracking map across the batch, and
 * files are written one after another in list order.
 *
 * @param {string} cwd - Working directory under which the agents directory lives.
 * @param {Array<{name: string, subAgentMd: string}>} subAgents - Agents to write;
 *   `subAgentMd` is written verbatim as the file content.
 * @returns {Promise<void>} Resolves after all files are written; resolves
 *   immediately without touching the filesystem when `subAgents` is empty.
 */
async function writeSubAgentsToFilesystem(cwd, subAgents) {
  if (subAgents.length === 0) {
    return;
  }
  const targetDir = (0, import_path6.join)(cwd, AGENTS_DIR);
  await (0, import_promises4.mkdir)(targetDir, { recursive: true });
  const usedNames = /* @__PURE__ */ new Map();
  let position = 0;
  for (const subAgent of subAgents) {
    const stem = toAgentFilename(subAgent.name, position, usedNames);
    position += 1;
    const target = (0, import_path6.join)(targetDir, `${stem}.md`);
    await (0, import_promises4.writeFile)(target, subAgent.subAgentMd, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
|
|
6413
|
+
|
|
6414
|
+
// src/run-scenario/agents/claude-code/execute.ts
|
|
6349
6415
|
var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
|
|
6350
6416
|
function calculateStepCost(inputTokens, outputTokens, modelName) {
|
|
6351
|
-
const model =
|
|
6417
|
+
const model = import_evalforge_types3.AVAILABLE_MODELS.find(
|
|
6352
6418
|
(m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
|
|
6353
6419
|
modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
|
|
6354
6420
|
);
|
|
@@ -6362,7 +6428,7 @@ function calculateStepCost(inputTokens, outputTokens, modelName) {
|
|
|
6362
6428
|
return inputCost + outputCost;
|
|
6363
6429
|
}
|
|
6364
6430
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
6365
|
-
console.log(`${
|
|
6431
|
+
console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
6366
6432
|
if (tracePushUrl) {
|
|
6367
6433
|
pushTraceEvent(tracePushUrl, event, routeHeader, authToken).catch((err) => {
|
|
6368
6434
|
console.error("[Trace Push] Failed to push trace event:", err);
|
|
@@ -6439,23 +6505,23 @@ async function pushTraceEvent(url, event, routeHeader, authToken) {
|
|
|
6439
6505
|
}
|
|
6440
6506
|
}
|
|
6441
6507
|
function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
6442
|
-
let type =
|
|
6508
|
+
let type = import_evalforge_types3.LiveTraceEventType.COMPLETION;
|
|
6443
6509
|
let toolName;
|
|
6444
6510
|
let toolArgs;
|
|
6445
6511
|
let outputPreview;
|
|
6446
6512
|
let filePath;
|
|
6447
6513
|
for (const block of message.message.content) {
|
|
6448
6514
|
if (block.type === "tool_use") {
|
|
6449
|
-
type =
|
|
6515
|
+
type = import_evalforge_types3.LiveTraceEventType.TOOL_USE;
|
|
6450
6516
|
toolName = block.name;
|
|
6451
6517
|
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
6452
6518
|
const input = block.input;
|
|
6453
6519
|
if (input.file_path || input.path || input.target_file) {
|
|
6454
6520
|
filePath = String(input.file_path || input.path || input.target_file);
|
|
6455
6521
|
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
6456
|
-
type =
|
|
6522
|
+
type = import_evalforge_types3.LiveTraceEventType.FILE_WRITE;
|
|
6457
6523
|
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
6458
|
-
type =
|
|
6524
|
+
type = import_evalforge_types3.LiveTraceEventType.FILE_READ;
|
|
6459
6525
|
}
|
|
6460
6526
|
}
|
|
6461
6527
|
} else if (block.type === "text") {
|
|
@@ -6513,7 +6579,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6513
6579
|
}
|
|
6514
6580
|
return {
|
|
6515
6581
|
...baseEvent,
|
|
6516
|
-
type:
|
|
6582
|
+
type: import_evalforge_types3.LiveTraceEventType.USER,
|
|
6517
6583
|
outputPreview: outputPreview || "(tool result)"
|
|
6518
6584
|
};
|
|
6519
6585
|
}
|
|
@@ -6521,7 +6587,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6521
6587
|
const sysMsg = message;
|
|
6522
6588
|
return {
|
|
6523
6589
|
...baseEvent,
|
|
6524
|
-
type:
|
|
6590
|
+
type: import_evalforge_types3.LiveTraceEventType.SYSTEM,
|
|
6525
6591
|
outputPreview: sysMsg.subtype || "system"
|
|
6526
6592
|
};
|
|
6527
6593
|
}
|
|
@@ -6530,7 +6596,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6530
6596
|
}
|
|
6531
6597
|
return {
|
|
6532
6598
|
...baseEvent,
|
|
6533
|
-
type:
|
|
6599
|
+
type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
|
|
6534
6600
|
outputPreview: `Message type: ${message.type}`
|
|
6535
6601
|
};
|
|
6536
6602
|
}
|
|
@@ -6573,6 +6639,12 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6573
6639
|
}
|
|
6574
6640
|
const startTime = /* @__PURE__ */ new Date();
|
|
6575
6641
|
const allMessages = [];
|
|
6642
|
+
if (options.mcps && options.mcps.length > 0) {
|
|
6643
|
+
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
6644
|
+
}
|
|
6645
|
+
if (options.subAgents && options.subAgents.length > 0) {
|
|
6646
|
+
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
6647
|
+
}
|
|
6576
6648
|
console.error(
|
|
6577
6649
|
"[DEBUG-H4] writeSkillsToFilesystem START",
|
|
6578
6650
|
JSON.stringify({
|
|
@@ -6663,15 +6735,24 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6663
6735
|
const canUseTool = async () => {
|
|
6664
6736
|
return { behavior: "allow" };
|
|
6665
6737
|
};
|
|
6738
|
+
const baseAllowedTools = [
|
|
6739
|
+
"Skill",
|
|
6740
|
+
"Read",
|
|
6741
|
+
"Write",
|
|
6742
|
+
"Edit",
|
|
6743
|
+
"Bash",
|
|
6744
|
+
"Glob",
|
|
6745
|
+
"Grep"
|
|
6746
|
+
];
|
|
6747
|
+
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
6666
6748
|
const queryOptions = {
|
|
6667
6749
|
env: sdkEnv,
|
|
6668
6750
|
cwd: options.cwd,
|
|
6669
6751
|
settingSources: ["project"],
|
|
6670
|
-
allowedTools
|
|
6752
|
+
allowedTools,
|
|
6671
6753
|
model: options.model || DEFAULT_MODEL,
|
|
6672
6754
|
maxTurns,
|
|
6673
6755
|
maxThinkingTokens: options.maxThinkingTokens,
|
|
6674
|
-
mcpServers: options.mcpServers,
|
|
6675
6756
|
// Use 'default' permission mode with custom canUseTool handler
|
|
6676
6757
|
// instead of 'bypassPermissions' which fails on root
|
|
6677
6758
|
permissionMode: "default",
|
|
@@ -6699,10 +6780,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6699
6780
|
);
|
|
6700
6781
|
console.log("[SDK-DEBUG] settingSources:", queryOptions.settingSources);
|
|
6701
6782
|
console.log("[SDK-DEBUG] allowedTools:", queryOptions.allowedTools);
|
|
6702
|
-
console.log(
|
|
6703
|
-
"[SDK-DEBUG] mcpServers:",
|
|
6704
|
-
queryOptions.mcpServers ? Object.keys(queryOptions.mcpServers) : "none"
|
|
6705
|
-
);
|
|
6706
6783
|
console.log("[SDK-DEBUG] Calling SDK query()...");
|
|
6707
6784
|
if (traceContext) {
|
|
6708
6785
|
const preExecEvent = {
|
|
@@ -6712,7 +6789,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6712
6789
|
targetId: traceContext.targetId,
|
|
6713
6790
|
targetName: traceContext.targetName,
|
|
6714
6791
|
stepNumber: 0,
|
|
6715
|
-
type:
|
|
6792
|
+
type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
|
|
6716
6793
|
outputPreview: JSON.stringify({
|
|
6717
6794
|
event: "pre-sdk-execution",
|
|
6718
6795
|
model: queryOptions.model,
|
|
@@ -6781,7 +6858,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6781
6858
|
targetId: traceContext.targetId,
|
|
6782
6859
|
targetName: traceContext.targetName,
|
|
6783
6860
|
stepNumber: traceStepNumber,
|
|
6784
|
-
type:
|
|
6861
|
+
type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
|
|
6785
6862
|
outputPreview: progressMessage,
|
|
6786
6863
|
toolName: lastToolName,
|
|
6787
6864
|
filePath: lastFilePath,
|
|
@@ -6838,18 +6915,18 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
6838
6915
|
if (traceEvent) {
|
|
6839
6916
|
lastToolName = traceEvent.toolName;
|
|
6840
6917
|
lastFilePath = traceEvent.filePath;
|
|
6841
|
-
if (traceEvent.type ===
|
|
6918
|
+
if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.THINKING) {
|
|
6842
6919
|
lastAction = "Thinking...";
|
|
6843
|
-
} else if (traceEvent.type ===
|
|
6920
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.TOOL_USE) {
|
|
6844
6921
|
lastAction = extractToolActionDescription(
|
|
6845
6922
|
traceEvent.toolName,
|
|
6846
6923
|
traceEvent.toolArgs
|
|
6847
6924
|
);
|
|
6848
|
-
} else if (traceEvent.type ===
|
|
6925
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_WRITE) {
|
|
6849
6926
|
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
6850
|
-
} else if (traceEvent.type ===
|
|
6927
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_READ) {
|
|
6851
6928
|
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
6852
|
-
} else if (traceEvent.type ===
|
|
6929
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.COMPLETION) {
|
|
6853
6930
|
lastAction = "Processing response...";
|
|
6854
6931
|
}
|
|
6855
6932
|
emitTraceEvent(
|
|
@@ -7032,7 +7109,7 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
7032
7109
|
targetId: traceContext.targetId,
|
|
7033
7110
|
targetName: traceContext.targetName,
|
|
7034
7111
|
stepNumber: traceStepNumber + 1,
|
|
7035
|
-
type:
|
|
7112
|
+
type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
|
|
7036
7113
|
outputPreview: JSON.stringify(
|
|
7037
7114
|
{
|
|
7038
7115
|
event: "sdk-execution-failed",
|
|
@@ -7071,7 +7148,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7071
7148
|
targetId: traceContext.targetId,
|
|
7072
7149
|
targetName: traceContext.targetName,
|
|
7073
7150
|
stepNumber: traceStepNumber + 1,
|
|
7074
|
-
type:
|
|
7151
|
+
type: import_evalforge_types3.LiveTraceEventType.COMPLETION,
|
|
7075
7152
|
outputPreview: "Scenario execution completed",
|
|
7076
7153
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7077
7154
|
isComplete: true
|
|
@@ -7113,10 +7190,10 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7113
7190
|
async function writeSkillsToFilesystem(cwd, skills) {
|
|
7114
7191
|
for (const skill of skills) {
|
|
7115
7192
|
const skillName = skill.name;
|
|
7116
|
-
const skillDir = (0,
|
|
7117
|
-
await (0,
|
|
7118
|
-
const skillPath = (0,
|
|
7119
|
-
await (0,
|
|
7193
|
+
const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
|
|
7194
|
+
await (0, import_promises5.mkdir)(skillDir, { recursive: true });
|
|
7195
|
+
const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
|
|
7196
|
+
await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
|
|
7120
7197
|
console.log(`[Skill] Written to ${skillPath}`);
|
|
7121
7198
|
}
|
|
7122
7199
|
}
|
|
@@ -7249,7 +7326,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
7249
7326
|
return {
|
|
7250
7327
|
id: (0, import_crypto.randomUUID)(),
|
|
7251
7328
|
stepNumber: index + 1,
|
|
7252
|
-
type: step.toolCalls?.length ?
|
|
7329
|
+
type: step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : import_evalforge_types3.LLMStepType.COMPLETION,
|
|
7253
7330
|
model,
|
|
7254
7331
|
provider: "anthropic",
|
|
7255
7332
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -7320,9 +7397,11 @@ var ClaudeCodeAdapter = class {
|
|
|
7320
7397
|
modelConfig,
|
|
7321
7398
|
aiGatewayUrl,
|
|
7322
7399
|
aiGatewayHeaders,
|
|
7323
|
-
traceContext
|
|
7400
|
+
traceContext,
|
|
7401
|
+
mcps,
|
|
7402
|
+
subAgents
|
|
7324
7403
|
} = context;
|
|
7325
|
-
const modelForSdk = modelConfig?.model ?
|
|
7404
|
+
const modelForSdk = modelConfig?.model ? import_evalforge_types4.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
|
|
7326
7405
|
const options = {
|
|
7327
7406
|
cwd,
|
|
7328
7407
|
model: modelForSdk,
|
|
@@ -7330,7 +7409,9 @@ var ClaudeCodeAdapter = class {
|
|
|
7330
7409
|
maxTokens: modelConfig?.maxTokens,
|
|
7331
7410
|
aiGatewayUrl,
|
|
7332
7411
|
aiGatewayHeaders,
|
|
7333
|
-
traceContext
|
|
7412
|
+
traceContext,
|
|
7413
|
+
mcps,
|
|
7414
|
+
subAgents
|
|
7334
7415
|
};
|
|
7335
7416
|
const { result, llmTrace } = await executeWithClaudeCode(
|
|
7336
7417
|
skills,
|
|
@@ -7357,7 +7438,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
7357
7438
|
|
|
7358
7439
|
// src/run-scenario/file-diff.ts
|
|
7359
7440
|
var import_fs6 = require("fs");
|
|
7360
|
-
var
|
|
7441
|
+
var import_path8 = require("path");
|
|
7361
7442
|
|
|
7362
7443
|
// ../../node_modules/diff/lib/index.mjs
|
|
7363
7444
|
function Diff() {
|
|
@@ -7533,7 +7614,7 @@ Diff.prototype = {
|
|
|
7533
7614
|
tokenize: function tokenize(value) {
|
|
7534
7615
|
return Array.from(value);
|
|
7535
7616
|
},
|
|
7536
|
-
join: function
|
|
7617
|
+
join: function join5(chars) {
|
|
7537
7618
|
return chars.join("");
|
|
7538
7619
|
},
|
|
7539
7620
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -7973,8 +8054,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
7973
8054
|
}
|
|
7974
8055
|
const entries = (0, import_fs6.readdirSync)(dir, { withFileTypes: true });
|
|
7975
8056
|
for (const entry of entries) {
|
|
7976
|
-
const fullPath = (0,
|
|
7977
|
-
const relativePath = (0,
|
|
8057
|
+
const fullPath = (0, import_path8.join)(dir, entry.name);
|
|
8058
|
+
const relativePath = (0, import_path8.relative)(base, fullPath);
|
|
7978
8059
|
if (shouldIgnore(entry.name)) {
|
|
7979
8060
|
continue;
|
|
7980
8061
|
}
|
|
@@ -8083,13 +8164,18 @@ function extractTemplateFiles(before, after) {
|
|
|
8083
8164
|
|
|
8084
8165
|
// src/run-scenario/run-agent-with-context.ts
|
|
8085
8166
|
var DEFAULT_AGENT_COMMAND = "claude";
|
|
8086
|
-
async function runAgentWithContext(config, evalRunId2, scenario,
|
|
8167
|
+
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
8168
|
+
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
8169
|
+
if (!skillsGroupId) {
|
|
8170
|
+
throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
|
|
8171
|
+
}
|
|
8172
|
+
const agent = evalData.codeAgent ?? void 0;
|
|
8087
8173
|
const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
8088
8174
|
const adapter = getAdapter(runCommand);
|
|
8089
8175
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
8090
8176
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
8091
8177
|
const executionContext = {
|
|
8092
|
-
skills,
|
|
8178
|
+
skills: evalData.skills,
|
|
8093
8179
|
scenario,
|
|
8094
8180
|
cwd: workDir || process.cwd(),
|
|
8095
8181
|
modelConfig: agent?.modelConfig,
|
|
@@ -8100,11 +8186,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
|
|
|
8100
8186
|
scenarioId: scenario.id,
|
|
8101
8187
|
scenarioName: scenario.name,
|
|
8102
8188
|
targetId: skillsGroupId,
|
|
8103
|
-
targetName: skillsGroupName,
|
|
8189
|
+
targetName: evalData.skillsGroupName,
|
|
8104
8190
|
tracePushUrl: config.tracePushUrl,
|
|
8105
8191
|
routeHeader: config.routeHeader,
|
|
8106
8192
|
authToken: config.authToken
|
|
8107
|
-
}
|
|
8193
|
+
},
|
|
8194
|
+
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
8195
|
+
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
|
|
8108
8196
|
};
|
|
8109
8197
|
const result = await adapter.execute(executionContext);
|
|
8110
8198
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -8114,7 +8202,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
|
|
|
8114
8202
|
return {
|
|
8115
8203
|
id: (0, import_crypto2.randomUUID)(),
|
|
8116
8204
|
targetId: skillsGroupId,
|
|
8117
|
-
targetName: skillsGroupName,
|
|
8205
|
+
targetName: evalData.skillsGroupName,
|
|
8118
8206
|
scenarioId: scenario.id,
|
|
8119
8207
|
scenarioName: scenario.name,
|
|
8120
8208
|
modelConfig: agent?.modelConfig,
|
|
@@ -8142,10 +8230,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8142
8230
|
config,
|
|
8143
8231
|
evalRunId2,
|
|
8144
8232
|
scenario,
|
|
8145
|
-
evalData
|
|
8146
|
-
skillsGroupId,
|
|
8147
|
-
evalData.skillsGroupName,
|
|
8148
|
-
evalData.codeAgent ?? void 0,
|
|
8233
|
+
evalData,
|
|
8149
8234
|
workDir
|
|
8150
8235
|
);
|
|
8151
8236
|
const inlineAssertions = scenario.assertions ?? [];
|
|
@@ -8177,10 +8262,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8177
8262
|
assertionContext
|
|
8178
8263
|
) : [];
|
|
8179
8264
|
const passed = assertionResults.filter(
|
|
8180
|
-
(r) => r.status ===
|
|
8265
|
+
(r) => r.status === import_evalforge_types5.AssertionResultStatus.PASSED
|
|
8181
8266
|
).length;
|
|
8182
8267
|
const failed = assertionResults.filter(
|
|
8183
|
-
(r) => r.status ===
|
|
8268
|
+
(r) => r.status === import_evalforge_types5.AssertionResultStatus.FAILED
|
|
8184
8269
|
).length;
|
|
8185
8270
|
const total = assertionResults.length;
|
|
8186
8271
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -8194,7 +8279,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8194
8279
|
}
|
|
8195
8280
|
|
|
8196
8281
|
// src/error-reporter.ts
|
|
8197
|
-
var
|
|
8282
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
8198
8283
|
function formatError(error, phase, context) {
|
|
8199
8284
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
8200
8285
|
if (error instanceof Error) {
|
|
@@ -8443,7 +8528,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
8443
8528
|
};
|
|
8444
8529
|
try {
|
|
8445
8530
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
8446
|
-
status:
|
|
8531
|
+
status: import_evalforge_types7.EvalStatus.COMPLETED,
|
|
8447
8532
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
8448
8533
|
});
|
|
8449
8534
|
} catch (updateErr) {
|
|
@@ -8484,7 +8569,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
8484
8569
|
authToken: config.authToken
|
|
8485
8570
|
});
|
|
8486
8571
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
8487
|
-
status:
|
|
8572
|
+
status: import_evalforge_types7.EvalStatus.FAILED,
|
|
8488
8573
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8489
8574
|
jobError,
|
|
8490
8575
|
jobStatus: "FAILED"
|
|
@@ -8507,7 +8592,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
8507
8592
|
authToken
|
|
8508
8593
|
});
|
|
8509
8594
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
8510
|
-
status:
|
|
8595
|
+
status: import_evalforge_types7.EvalStatus.FAILED,
|
|
8511
8596
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8512
8597
|
jobError: `Config load failed, then: ${jobError}`,
|
|
8513
8598
|
jobStatus: "FAILED"
|