@wix/evalforge-evaluator 0.57.0 → 0.58.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +139 -55
- package/build/index.js.map +4 -4
- package/build/index.mjs +109 -25
- package/build/index.mjs.map +4 -4
- package/build/types/api-client.d.ts +3 -1
- package/build/types/fetch-evaluation-data.d.ts +4 -2
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +1 -1
- package/build/types/run-scenario/agents/claude-code/index.d.ts +1 -1
- package/build/types/run-scenario/agents/claude-code/types.d.ts +5 -11
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +12 -0
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +12 -0
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +4 -6
- package/package.json +3 -3
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -166,6 +166,12 @@ function createApiClient(serverUrl, options = "") {
|
|
|
166
166
|
getTemplate(projectId2, id) {
|
|
167
167
|
return fetchJson(`/projects/${projectId2}/templates/${id}`);
|
|
168
168
|
},
|
|
169
|
+
getMcp(projectId2, id) {
|
|
170
|
+
return fetchJson(`/projects/${projectId2}/mcps/${id}`);
|
|
171
|
+
},
|
|
172
|
+
getSubAgent(projectId2, id) {
|
|
173
|
+
return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
|
|
174
|
+
},
|
|
169
175
|
getAssertion(projectId2, id) {
|
|
170
176
|
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
171
177
|
},
|
|
@@ -296,6 +302,18 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
296
302
|
);
|
|
297
303
|
}
|
|
298
304
|
}
|
|
305
|
+
let mcps = [];
|
|
306
|
+
if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
|
|
307
|
+
mcps = await Promise.all(
|
|
308
|
+
evalRun.mcpIds.map((id) => api.getMcp(projectId2, id))
|
|
309
|
+
);
|
|
310
|
+
}
|
|
311
|
+
let subAgents = [];
|
|
312
|
+
if (evalRun.subAgentIds && evalRun.subAgentIds.length > 0) {
|
|
313
|
+
subAgents = await Promise.all(
|
|
314
|
+
evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
|
|
315
|
+
);
|
|
316
|
+
}
|
|
299
317
|
const templateIds = [
|
|
300
318
|
...new Set(
|
|
301
319
|
scenarios.map((s) => s.templateId).filter((id) => !!id)
|
|
@@ -345,12 +363,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
345
363
|
skills,
|
|
346
364
|
skillsGroup,
|
|
347
365
|
skillsGroupName,
|
|
366
|
+
mcps,
|
|
367
|
+
subAgents,
|
|
348
368
|
scenarioItems
|
|
349
369
|
};
|
|
350
370
|
}
|
|
351
371
|
|
|
352
372
|
// src/run-scenario/index.ts
|
|
353
|
-
var
|
|
373
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
354
374
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
355
375
|
|
|
356
376
|
// src/run-scenario/environment.ts
|
|
@@ -6340,16 +6360,61 @@ function getAdapter(runCommand) {
|
|
|
6340
6360
|
}
|
|
6341
6361
|
|
|
6342
6362
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
6343
|
-
var
|
|
6363
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
6344
6364
|
|
|
6345
6365
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
6346
|
-
var
|
|
6366
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
6347
6367
|
var import_crypto = require("crypto");
|
|
6368
|
+
var import_promises5 = require("fs/promises");
|
|
6369
|
+
var import_path7 = require("path");
|
|
6370
|
+
|
|
6371
|
+
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
6348
6372
|
var import_promises3 = require("fs/promises");
|
|
6349
6373
|
var import_path5 = require("path");
|
|
6374
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
6375
|
+
/**
 * Writes the given MCP entries to `<cwd>/.mcp.json`.
 *
 * The file content is a JSON object with a single top-level key
 * (`MCP_SERVERS_JSON_KEY` from `@wix/evalforge-types`) whose value maps each
 * entry's `name` to its `config`. Nothing is written when `mcps` is empty.
 * When two entries share a name, the later one wins.
 *
 * @param {string} cwd - Directory in which `.mcp.json` is created.
 * @param {Array<{name: string, config: unknown}>} mcps - Entries to serialize.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function writeMcpToFilesystem(cwd, mcps) {
  if (mcps.length === 0) return;
  // Object.fromEntries defines own data properties, so an entry named
  // "__proto__" is kept as a regular key. Assigning `obj[name] = config` on a
  // plain object would hit the prototype setter and drop it from the JSON.
  const mcpServers = Object.fromEntries(
    mcps.map((mcp) => [mcp.name, mcp.config])
  );
  const content = JSON.stringify(
    { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: mcpServers },
    null,
    2
  );
  const filePath = (0, import_path5.join)(cwd, ".mcp.json");
  await (0, import_promises3.writeFile)(filePath, content, "utf8");
  console.log(`[MCP] Written to ${filePath}`);
}
|
|
6390
|
+
|
|
6391
|
+
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
6392
|
+
var import_promises4 = require("fs/promises");
|
|
6393
|
+
var import_path6 = require("path");
|
|
6394
|
+
var AGENTS_DIR = ".claude/agents";
|
|
6395
|
+
/**
 * Derives a unique, filesystem-friendly file stem for a sub-agent.
 *
 * The name is lower-cased, whitespace runs become "-", every character outside
 * `[a-z0-9-]` is removed and leading/trailing dashes are trimmed. If nothing
 * is left, `sub-agent-<index>` is used instead. Repeated stems get a numeric
 * suffix (`-2`, `-3`, ...).
 *
 * @param {string | null | undefined} name2 - Display name of the sub-agent.
 * @param {number} index - Position of the sub-agent, used for the fallback stem.
 * @param {Map<string, number>} nameCount - Bookkeeping shared across calls;
 *   mutated to record every stem handed out so far.
 * @returns {string} A stem not returned by any earlier call using the same map.
 */
function toAgentFilename(name2, index, nameCount) {
  const base = (name2 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
  let count = nameCount.get(base) ?? 0;
  let candidate = base;
  if (count > 0) {
    candidate = `${base}-${count + 1}`;
    // A suffixed stem may already belong to another agent (e.g. one that is
    // literally named "foo 2"); keep counting until the stem is free.
    while (nameCount.has(candidate)) {
      count += 1;
      candidate = `${base}-${count + 1}`;
    }
  }
  nameCount.set(base, count + 1);
  if (candidate !== base) {
    // Reserve the generated stem so a later agent whose own slug equals it
    // is de-duplicated instead of overwriting this file.
    nameCount.set(candidate, 1);
  }
  return candidate;
}
|
|
6401
|
+
/**
 * Writes one markdown file per sub-agent into `<cwd>/.claude/agents`.
 *
 * The directory is created (recursively) when needed. Each file is named
 * `<stem>.md`, with the stem produced by `toAgentFilename`, and contains the
 * agent's `subAgentMd`. Files are written one after another in input order.
 * Nothing happens for an empty list.
 *
 * @param {string} cwd - Directory under which the agents folder is placed.
 * @param {Array<{name: string, subAgentMd: string}>} subAgents - Agents to write.
 * @returns {Promise<void>} Resolves after the last file has been written.
 */
async function writeSubAgentsToFilesystem(cwd, subAgents) {
  if (subAgents.length === 0) {
    return;
  }
  const targetDir = (0, import_path6.join)(cwd, AGENTS_DIR);
  await (0, import_promises4.mkdir)(targetDir, { recursive: true });
  // Shared across iterations so repeated names receive distinct stems.
  const usedStems = /* @__PURE__ */ new Map();
  let position = 0;
  for (const subAgent of subAgents) {
    const stem = toAgentFilename(subAgent.name, position, usedStems);
    position += 1;
    const destination = (0, import_path6.join)(targetDir, `${stem}.md`);
    await (0, import_promises4.writeFile)(destination, subAgent.subAgentMd, "utf8");
  }
  console.log(`[SubAgents] Written to ${targetDir}`);
}
|
|
6413
|
+
|
|
6414
|
+
// src/run-scenario/agents/claude-code/execute.ts
|
|
6350
6415
|
var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
|
|
6351
6416
|
function calculateStepCost(inputTokens, outputTokens, modelName) {
|
|
6352
|
-
const model =
|
|
6417
|
+
const model = import_evalforge_types3.AVAILABLE_MODELS.find(
|
|
6353
6418
|
(m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
|
|
6354
6419
|
modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
|
|
6355
6420
|
);
|
|
@@ -6363,7 +6428,7 @@ function calculateStepCost(inputTokens, outputTokens, modelName) {
|
|
|
6363
6428
|
return inputCost + outputCost;
|
|
6364
6429
|
}
|
|
6365
6430
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
6366
|
-
console.log(`${
|
|
6431
|
+
console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
6367
6432
|
if (tracePushUrl) {
|
|
6368
6433
|
pushTraceEvent(tracePushUrl, event, routeHeader, authToken).catch((err) => {
|
|
6369
6434
|
console.error("[Trace Push] Failed to push trace event:", err);
|
|
@@ -6440,23 +6505,23 @@ async function pushTraceEvent(url, event, routeHeader, authToken) {
|
|
|
6440
6505
|
}
|
|
6441
6506
|
}
|
|
6442
6507
|
function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
6443
|
-
let type =
|
|
6508
|
+
let type = import_evalforge_types3.LiveTraceEventType.COMPLETION;
|
|
6444
6509
|
let toolName;
|
|
6445
6510
|
let toolArgs;
|
|
6446
6511
|
let outputPreview;
|
|
6447
6512
|
let filePath;
|
|
6448
6513
|
for (const block of message.message.content) {
|
|
6449
6514
|
if (block.type === "tool_use") {
|
|
6450
|
-
type =
|
|
6515
|
+
type = import_evalforge_types3.LiveTraceEventType.TOOL_USE;
|
|
6451
6516
|
toolName = block.name;
|
|
6452
6517
|
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
6453
6518
|
const input = block.input;
|
|
6454
6519
|
if (input.file_path || input.path || input.target_file) {
|
|
6455
6520
|
filePath = String(input.file_path || input.path || input.target_file);
|
|
6456
6521
|
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
6457
|
-
type =
|
|
6522
|
+
type = import_evalforge_types3.LiveTraceEventType.FILE_WRITE;
|
|
6458
6523
|
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
6459
|
-
type =
|
|
6524
|
+
type = import_evalforge_types3.LiveTraceEventType.FILE_READ;
|
|
6460
6525
|
}
|
|
6461
6526
|
}
|
|
6462
6527
|
} else if (block.type === "text") {
|
|
@@ -6514,7 +6579,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6514
6579
|
}
|
|
6515
6580
|
return {
|
|
6516
6581
|
...baseEvent,
|
|
6517
|
-
type:
|
|
6582
|
+
type: import_evalforge_types3.LiveTraceEventType.USER,
|
|
6518
6583
|
outputPreview: outputPreview || "(tool result)"
|
|
6519
6584
|
};
|
|
6520
6585
|
}
|
|
@@ -6522,7 +6587,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6522
6587
|
const sysMsg = message;
|
|
6523
6588
|
return {
|
|
6524
6589
|
...baseEvent,
|
|
6525
|
-
type:
|
|
6590
|
+
type: import_evalforge_types3.LiveTraceEventType.SYSTEM,
|
|
6526
6591
|
outputPreview: sysMsg.subtype || "system"
|
|
6527
6592
|
};
|
|
6528
6593
|
}
|
|
@@ -6531,7 +6596,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6531
6596
|
}
|
|
6532
6597
|
return {
|
|
6533
6598
|
...baseEvent,
|
|
6534
|
-
type:
|
|
6599
|
+
type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
|
|
6535
6600
|
outputPreview: `Message type: ${message.type}`
|
|
6536
6601
|
};
|
|
6537
6602
|
}
|
|
@@ -6574,6 +6639,12 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6574
6639
|
}
|
|
6575
6640
|
const startTime = /* @__PURE__ */ new Date();
|
|
6576
6641
|
const allMessages = [];
|
|
6642
|
+
if (options.mcps && options.mcps.length > 0) {
|
|
6643
|
+
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
6644
|
+
}
|
|
6645
|
+
if (options.subAgents && options.subAgents.length > 0) {
|
|
6646
|
+
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
6647
|
+
}
|
|
6577
6648
|
console.error(
|
|
6578
6649
|
"[DEBUG-H4] writeSkillsToFilesystem START",
|
|
6579
6650
|
JSON.stringify({
|
|
@@ -6664,15 +6735,24 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6664
6735
|
const canUseTool = async () => {
|
|
6665
6736
|
return { behavior: "allow" };
|
|
6666
6737
|
};
|
|
6738
|
+
const baseAllowedTools = [
|
|
6739
|
+
"Skill",
|
|
6740
|
+
"Read",
|
|
6741
|
+
"Write",
|
|
6742
|
+
"Edit",
|
|
6743
|
+
"Bash",
|
|
6744
|
+
"Glob",
|
|
6745
|
+
"Grep"
|
|
6746
|
+
];
|
|
6747
|
+
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
6667
6748
|
const queryOptions = {
|
|
6668
6749
|
env: sdkEnv,
|
|
6669
6750
|
cwd: options.cwd,
|
|
6670
6751
|
settingSources: ["project"],
|
|
6671
|
-
allowedTools
|
|
6752
|
+
allowedTools,
|
|
6672
6753
|
model: options.model || DEFAULT_MODEL,
|
|
6673
6754
|
maxTurns,
|
|
6674
6755
|
maxThinkingTokens: options.maxThinkingTokens,
|
|
6675
|
-
mcpServers: options.mcpServers,
|
|
6676
6756
|
// Use 'default' permission mode with custom canUseTool handler
|
|
6677
6757
|
// instead of 'bypassPermissions' which fails on root
|
|
6678
6758
|
permissionMode: "default",
|
|
@@ -6700,10 +6780,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6700
6780
|
);
|
|
6701
6781
|
console.log("[SDK-DEBUG] settingSources:", queryOptions.settingSources);
|
|
6702
6782
|
console.log("[SDK-DEBUG] allowedTools:", queryOptions.allowedTools);
|
|
6703
|
-
console.log(
|
|
6704
|
-
"[SDK-DEBUG] mcpServers:",
|
|
6705
|
-
queryOptions.mcpServers ? Object.keys(queryOptions.mcpServers) : "none"
|
|
6706
|
-
);
|
|
6707
6783
|
console.log("[SDK-DEBUG] Calling SDK query()...");
|
|
6708
6784
|
if (traceContext) {
|
|
6709
6785
|
const preExecEvent = {
|
|
@@ -6713,7 +6789,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6713
6789
|
targetId: traceContext.targetId,
|
|
6714
6790
|
targetName: traceContext.targetName,
|
|
6715
6791
|
stepNumber: 0,
|
|
6716
|
-
type:
|
|
6792
|
+
type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
|
|
6717
6793
|
outputPreview: JSON.stringify({
|
|
6718
6794
|
event: "pre-sdk-execution",
|
|
6719
6795
|
model: queryOptions.model,
|
|
@@ -6782,7 +6858,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6782
6858
|
targetId: traceContext.targetId,
|
|
6783
6859
|
targetName: traceContext.targetName,
|
|
6784
6860
|
stepNumber: traceStepNumber,
|
|
6785
|
-
type:
|
|
6861
|
+
type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
|
|
6786
6862
|
outputPreview: progressMessage,
|
|
6787
6863
|
toolName: lastToolName,
|
|
6788
6864
|
filePath: lastFilePath,
|
|
@@ -6839,18 +6915,18 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
6839
6915
|
if (traceEvent) {
|
|
6840
6916
|
lastToolName = traceEvent.toolName;
|
|
6841
6917
|
lastFilePath = traceEvent.filePath;
|
|
6842
|
-
if (traceEvent.type ===
|
|
6918
|
+
if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.THINKING) {
|
|
6843
6919
|
lastAction = "Thinking...";
|
|
6844
|
-
} else if (traceEvent.type ===
|
|
6920
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.TOOL_USE) {
|
|
6845
6921
|
lastAction = extractToolActionDescription(
|
|
6846
6922
|
traceEvent.toolName,
|
|
6847
6923
|
traceEvent.toolArgs
|
|
6848
6924
|
);
|
|
6849
|
-
} else if (traceEvent.type ===
|
|
6925
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_WRITE) {
|
|
6850
6926
|
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
6851
|
-
} else if (traceEvent.type ===
|
|
6927
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_READ) {
|
|
6852
6928
|
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
6853
|
-
} else if (traceEvent.type ===
|
|
6929
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.COMPLETION) {
|
|
6854
6930
|
lastAction = "Processing response...";
|
|
6855
6931
|
}
|
|
6856
6932
|
emitTraceEvent(
|
|
@@ -7033,7 +7109,7 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
7033
7109
|
targetId: traceContext.targetId,
|
|
7034
7110
|
targetName: traceContext.targetName,
|
|
7035
7111
|
stepNumber: traceStepNumber + 1,
|
|
7036
|
-
type:
|
|
7112
|
+
type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
|
|
7037
7113
|
outputPreview: JSON.stringify(
|
|
7038
7114
|
{
|
|
7039
7115
|
event: "sdk-execution-failed",
|
|
@@ -7072,7 +7148,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7072
7148
|
targetId: traceContext.targetId,
|
|
7073
7149
|
targetName: traceContext.targetName,
|
|
7074
7150
|
stepNumber: traceStepNumber + 1,
|
|
7075
|
-
type:
|
|
7151
|
+
type: import_evalforge_types3.LiveTraceEventType.COMPLETION,
|
|
7076
7152
|
outputPreview: "Scenario execution completed",
|
|
7077
7153
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7078
7154
|
isComplete: true
|
|
@@ -7114,10 +7190,10 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7114
7190
|
/**
 * Writes each skill's markdown to `<cwd>/.claude/skills/<skill name>/SKILL.md`.
 *
 * The per-skill directory is created (recursively) before its file is written.
 * Skills are processed sequentially in input order, and a log line is printed
 * after each file.
 *
 * @param {string} cwd - Directory under which `.claude/skills` is placed.
 * @param {Array<{name: string, skillMd: string}>} skills - Skills to write.
 * @returns {Promise<void>} Resolves after the last skill has been written.
 */
async function writeSkillsToFilesystem(cwd, skills) {
  for (const entry of skills) {
    const targetDir = (0, import_path7.join)(cwd, ".claude", "skills", entry.name);
    await (0, import_promises5.mkdir)(targetDir, { recursive: true });
    const markdownPath = (0, import_path7.join)(targetDir, "SKILL.md");
    await (0, import_promises5.writeFile)(markdownPath, entry.skillMd, "utf-8");
    console.log(`[Skill] Written to ${markdownPath}`);
  }
}
|
|
@@ -7250,7 +7326,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
7250
7326
|
return {
|
|
7251
7327
|
id: (0, import_crypto.randomUUID)(),
|
|
7252
7328
|
stepNumber: index + 1,
|
|
7253
|
-
type: step.toolCalls?.length ?
|
|
7329
|
+
type: step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : import_evalforge_types3.LLMStepType.COMPLETION,
|
|
7254
7330
|
model,
|
|
7255
7331
|
provider: "anthropic",
|
|
7256
7332
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -7321,9 +7397,11 @@ var ClaudeCodeAdapter = class {
|
|
|
7321
7397
|
modelConfig,
|
|
7322
7398
|
aiGatewayUrl,
|
|
7323
7399
|
aiGatewayHeaders,
|
|
7324
|
-
traceContext
|
|
7400
|
+
traceContext,
|
|
7401
|
+
mcps,
|
|
7402
|
+
subAgents
|
|
7325
7403
|
} = context;
|
|
7326
|
-
const modelForSdk = modelConfig?.model ?
|
|
7404
|
+
const modelForSdk = modelConfig?.model ? import_evalforge_types4.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
|
|
7327
7405
|
const options = {
|
|
7328
7406
|
cwd,
|
|
7329
7407
|
model: modelForSdk,
|
|
@@ -7331,7 +7409,9 @@ var ClaudeCodeAdapter = class {
|
|
|
7331
7409
|
maxTokens: modelConfig?.maxTokens,
|
|
7332
7410
|
aiGatewayUrl,
|
|
7333
7411
|
aiGatewayHeaders,
|
|
7334
|
-
traceContext
|
|
7412
|
+
traceContext,
|
|
7413
|
+
mcps,
|
|
7414
|
+
subAgents
|
|
7335
7415
|
};
|
|
7336
7416
|
const { result, llmTrace } = await executeWithClaudeCode(
|
|
7337
7417
|
skills,
|
|
@@ -7358,7 +7438,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
7358
7438
|
|
|
7359
7439
|
// src/run-scenario/file-diff.ts
|
|
7360
7440
|
var import_fs6 = require("fs");
|
|
7361
|
-
var
|
|
7441
|
+
var import_path8 = require("path");
|
|
7362
7442
|
|
|
7363
7443
|
// ../../node_modules/diff/lib/index.mjs
|
|
7364
7444
|
function Diff() {
|
|
@@ -7534,7 +7614,7 @@ Diff.prototype = {
|
|
|
7534
7614
|
tokenize: function tokenize(value) {
|
|
7535
7615
|
return Array.from(value);
|
|
7536
7616
|
},
|
|
7537
|
-
join: function
|
|
7617
|
+
join: function join5(chars) {
|
|
7538
7618
|
return chars.join("");
|
|
7539
7619
|
},
|
|
7540
7620
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -7974,8 +8054,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
7974
8054
|
}
|
|
7975
8055
|
const entries = (0, import_fs6.readdirSync)(dir, { withFileTypes: true });
|
|
7976
8056
|
for (const entry of entries) {
|
|
7977
|
-
const fullPath = (0,
|
|
7978
|
-
const relativePath = (0,
|
|
8057
|
+
const fullPath = (0, import_path8.join)(dir, entry.name);
|
|
8058
|
+
const relativePath = (0, import_path8.relative)(base, fullPath);
|
|
7979
8059
|
if (shouldIgnore(entry.name)) {
|
|
7980
8060
|
continue;
|
|
7981
8061
|
}
|
|
@@ -8084,13 +8164,18 @@ function extractTemplateFiles(before, after) {
|
|
|
8084
8164
|
|
|
8085
8165
|
// src/run-scenario/run-agent-with-context.ts
|
|
8086
8166
|
var DEFAULT_AGENT_COMMAND = "claude";
|
|
8087
|
-
async function runAgentWithContext(config, evalRunId2, scenario,
|
|
8167
|
+
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
8168
|
+
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
8169
|
+
if (!skillsGroupId) {
|
|
8170
|
+
throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
|
|
8171
|
+
}
|
|
8172
|
+
const agent = evalData.codeAgent ?? void 0;
|
|
8088
8173
|
const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
8089
8174
|
const adapter = getAdapter(runCommand);
|
|
8090
8175
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
8091
8176
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
8092
8177
|
const executionContext = {
|
|
8093
|
-
skills,
|
|
8178
|
+
skills: evalData.skills,
|
|
8094
8179
|
scenario,
|
|
8095
8180
|
cwd: workDir || process.cwd(),
|
|
8096
8181
|
modelConfig: agent?.modelConfig,
|
|
@@ -8101,11 +8186,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
|
|
|
8101
8186
|
scenarioId: scenario.id,
|
|
8102
8187
|
scenarioName: scenario.name,
|
|
8103
8188
|
targetId: skillsGroupId,
|
|
8104
|
-
targetName: skillsGroupName,
|
|
8189
|
+
targetName: evalData.skillsGroupName,
|
|
8105
8190
|
tracePushUrl: config.tracePushUrl,
|
|
8106
8191
|
routeHeader: config.routeHeader,
|
|
8107
8192
|
authToken: config.authToken
|
|
8108
|
-
}
|
|
8193
|
+
},
|
|
8194
|
+
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
8195
|
+
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
|
|
8109
8196
|
};
|
|
8110
8197
|
const result = await adapter.execute(executionContext);
|
|
8111
8198
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -8115,7 +8202,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
|
|
|
8115
8202
|
return {
|
|
8116
8203
|
id: (0, import_crypto2.randomUUID)(),
|
|
8117
8204
|
targetId: skillsGroupId,
|
|
8118
|
-
targetName: skillsGroupName,
|
|
8205
|
+
targetName: evalData.skillsGroupName,
|
|
8119
8206
|
scenarioId: scenario.id,
|
|
8120
8207
|
scenarioName: scenario.name,
|
|
8121
8208
|
modelConfig: agent?.modelConfig,
|
|
@@ -8143,10 +8230,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8143
8230
|
config,
|
|
8144
8231
|
evalRunId2,
|
|
8145
8232
|
scenario,
|
|
8146
|
-
evalData
|
|
8147
|
-
skillsGroupId,
|
|
8148
|
-
evalData.skillsGroupName,
|
|
8149
|
-
evalData.codeAgent ?? void 0,
|
|
8233
|
+
evalData,
|
|
8150
8234
|
workDir
|
|
8151
8235
|
);
|
|
8152
8236
|
const inlineAssertions = scenario.assertions ?? [];
|
|
@@ -8178,10 +8262,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8178
8262
|
assertionContext
|
|
8179
8263
|
) : [];
|
|
8180
8264
|
const passed = assertionResults.filter(
|
|
8181
|
-
(r) => r.status ===
|
|
8265
|
+
(r) => r.status === import_evalforge_types5.AssertionResultStatus.PASSED
|
|
8182
8266
|
).length;
|
|
8183
8267
|
const failed = assertionResults.filter(
|
|
8184
|
-
(r) => r.status ===
|
|
8268
|
+
(r) => r.status === import_evalforge_types5.AssertionResultStatus.FAILED
|
|
8185
8269
|
).length;
|
|
8186
8270
|
const total = assertionResults.length;
|
|
8187
8271
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -8195,7 +8279,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8195
8279
|
}
|
|
8196
8280
|
|
|
8197
8281
|
// src/error-reporter.ts
|
|
8198
|
-
var
|
|
8282
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
8199
8283
|
function formatError(error, phase, context) {
|
|
8200
8284
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
8201
8285
|
if (error instanceof Error) {
|
|
@@ -8444,7 +8528,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
8444
8528
|
};
|
|
8445
8529
|
try {
|
|
8446
8530
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
8447
|
-
status:
|
|
8531
|
+
status: import_evalforge_types7.EvalStatus.COMPLETED,
|
|
8448
8532
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
8449
8533
|
});
|
|
8450
8534
|
} catch (updateErr) {
|
|
@@ -8485,7 +8569,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
8485
8569
|
authToken: config.authToken
|
|
8486
8570
|
});
|
|
8487
8571
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
8488
|
-
status:
|
|
8572
|
+
status: import_evalforge_types7.EvalStatus.FAILED,
|
|
8489
8573
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8490
8574
|
jobError,
|
|
8491
8575
|
jobStatus: "FAILED"
|
|
@@ -8508,7 +8592,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
8508
8592
|
authToken
|
|
8509
8593
|
});
|
|
8510
8594
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
8511
|
-
status:
|
|
8595
|
+
status: import_evalforge_types7.EvalStatus.FAILED,
|
|
8512
8596
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8513
8597
|
jobError: `Config load failed, then: ${jobError}`,
|
|
8514
8598
|
jobStatus: "FAILED"
|