@wix/evalforge-evaluator 0.57.0 → 0.59.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +165 -88
- package/build/index.js.map +4 -4
- package/build/index.mjs +135 -58
- package/build/index.mjs.map +4 -4
- package/build/types/api-client.d.ts +3 -1
- package/build/types/fetch-evaluation-data.d.ts +24 -2
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +1 -1
- package/build/types/run-scenario/agents/claude-code/index.d.ts +1 -1
- package/build/types/run-scenario/agents/claude-code/types.d.ts +5 -11
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +12 -0
- package/build/types/run-scenario/agents/claude-code/write-sub-agents.d.ts +12 -0
- package/build/types/run-scenario/index.d.ts +1 -1
- package/build/types/run-scenario/run-agent-with-context.d.ts +4 -6
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/index.ts
|
|
27
|
-
var
|
|
27
|
+
var import_evalforge_types7 = require("@wix/evalforge-types");
|
|
28
28
|
|
|
29
29
|
// src/config.ts
|
|
30
30
|
function loadConfig() {
|
|
@@ -166,6 +166,12 @@ function createApiClient(serverUrl, options = "") {
|
|
|
166
166
|
getTemplate(projectId2, id) {
|
|
167
167
|
return fetchJson(`/projects/${projectId2}/templates/${id}`);
|
|
168
168
|
},
|
|
169
|
+
getMcp(projectId2, id) {
|
|
170
|
+
return fetchJson(`/projects/${projectId2}/mcps/${id}`);
|
|
171
|
+
},
|
|
172
|
+
getSubAgent(projectId2, id) {
|
|
173
|
+
return fetchJson(`/projects/${projectId2}/sub-agents/${id}`);
|
|
174
|
+
},
|
|
169
175
|
getAssertion(projectId2, id) {
|
|
170
176
|
return fetchJson(`/projects/${projectId2}/assertions/${id}`);
|
|
171
177
|
},
|
|
@@ -188,6 +194,16 @@ function createApiClient(serverUrl, options = "") {
|
|
|
188
194
|
|
|
189
195
|
// src/fetch-evaluation-data.ts
|
|
190
196
|
var import_evalforge_types = require("@wix/evalforge-types");
|
|
197
|
+
/**
 * Converts a `skillNames` assertion parameter into a list of skill names.
 *
 * The parameter is expected to be a JSON-encoded array (for example
 * `'["skill-a","skill-b"]'`). Every element is coerced with `String`.
 *
 * @param {unknown} value - Raw parameter value.
 * @returns {string[]} The decoded names, or an empty array when `value` is
 *   not a string, is not valid JSON, or does not decode to an array.
 */
function parseSkillNamesFromParams(value) {
  if (typeof value !== "string") {
    return [];
  }
  let parsed;
  try {
    parsed = JSON.parse(value);
  } catch (err) {
    // Malformed JSON is treated like every other unusable input (empty list)
    // instead of throwing a SyntaxError out of assertion resolution.
    console.warn(
      `[Assertions] Ignoring skillNames param that is not valid JSON: ${err instanceof Error ? err.message : String(err)}`
    );
    return [];
  }
  return Array.isArray(parsed) ? parsed.map(String) : [];
}
|
|
191
207
|
function applyParamsToAssertion(assertion, params) {
|
|
192
208
|
if (!params || Object.keys(params).length === 0) {
|
|
193
209
|
return assertion;
|
|
@@ -209,6 +225,12 @@ function applyParamsToAssertion(assertion, params) {
|
|
|
209
225
|
}
|
|
210
226
|
return { ...assertion, prompt, systemPrompt };
|
|
211
227
|
}
|
|
228
|
+
if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
|
|
229
|
+
return {
|
|
230
|
+
...assertion,
|
|
231
|
+
skillNames: parseSkillNamesFromParams(params.skillNames)
|
|
232
|
+
};
|
|
233
|
+
}
|
|
212
234
|
return { ...assertion, ...params };
|
|
213
235
|
}
|
|
214
236
|
function resolveSystemAssertion(assertionId, params) {
|
|
@@ -218,7 +240,7 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
218
240
|
case "skill_was_called":
|
|
219
241
|
baseAssertion = {
|
|
220
242
|
type: "skill_was_called",
|
|
221
|
-
|
|
243
|
+
skillNames: parseSkillNamesFromParams(params?.skillNames)
|
|
222
244
|
};
|
|
223
245
|
break;
|
|
224
246
|
case "build_passed":
|
|
@@ -243,38 +265,15 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
243
265
|
}
|
|
244
266
|
function customAssertionToAssertion(ca, params) {
|
|
245
267
|
const config = ca.config;
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
baseAssertion = {
|
|
256
|
-
type: "build_passed",
|
|
257
|
-
command: config?.command,
|
|
258
|
-
expectedExitCode: config?.expectedExitCode
|
|
259
|
-
};
|
|
260
|
-
break;
|
|
261
|
-
case "llm_judge":
|
|
262
|
-
baseAssertion = {
|
|
263
|
-
type: "llm_judge",
|
|
264
|
-
prompt: config?.prompt ?? "",
|
|
265
|
-
systemPrompt: config?.systemPrompt,
|
|
266
|
-
minScore: config?.minScore,
|
|
267
|
-
model: config?.model,
|
|
268
|
-
maxTokens: config?.maxTokens,
|
|
269
|
-
temperature: config?.temperature
|
|
270
|
-
};
|
|
271
|
-
break;
|
|
272
|
-
default:
|
|
273
|
-
baseAssertion = {
|
|
274
|
-
type: "llm_judge",
|
|
275
|
-
prompt: ""
|
|
276
|
-
};
|
|
277
|
-
}
|
|
268
|
+
const baseAssertion = {
|
|
269
|
+
type: "llm_judge",
|
|
270
|
+
prompt: config?.prompt ?? "",
|
|
271
|
+
systemPrompt: config?.systemPrompt,
|
|
272
|
+
minScore: config?.minScore,
|
|
273
|
+
model: config?.model,
|
|
274
|
+
maxTokens: config?.maxTokens,
|
|
275
|
+
temperature: config?.temperature
|
|
276
|
+
};
|
|
278
277
|
return applyParamsToAssertion(baseAssertion, params);
|
|
279
278
|
}
|
|
280
279
|
async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
@@ -296,6 +295,18 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
296
295
|
);
|
|
297
296
|
}
|
|
298
297
|
}
|
|
298
|
+
let mcps = [];
|
|
299
|
+
if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
|
|
300
|
+
mcps = await Promise.all(
|
|
301
|
+
evalRun.mcpIds.map((id) => api.getMcp(projectId2, id))
|
|
302
|
+
);
|
|
303
|
+
}
|
|
304
|
+
let subAgents = [];
|
|
305
|
+
if (evalRun.subAgentIds && evalRun.subAgentIds.length > 0) {
|
|
306
|
+
subAgents = await Promise.all(
|
|
307
|
+
evalRun.subAgentIds.map((id) => api.getSubAgent(projectId2, id))
|
|
308
|
+
);
|
|
309
|
+
}
|
|
299
310
|
const templateIds = [
|
|
300
311
|
...new Set(
|
|
301
312
|
scenarios.map((s) => s.templateId).filter((id) => !!id)
|
|
@@ -345,12 +356,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
345
356
|
skills,
|
|
346
357
|
skillsGroup,
|
|
347
358
|
skillsGroupName,
|
|
359
|
+
mcps,
|
|
360
|
+
subAgents,
|
|
348
361
|
scenarioItems
|
|
349
362
|
};
|
|
350
363
|
}
|
|
351
364
|
|
|
352
365
|
// src/run-scenario/index.ts
|
|
353
|
-
var
|
|
366
|
+
var import_evalforge_types5 = require("@wix/evalforge-types");
|
|
354
367
|
var import_eval_assertions = require("@wix/eval-assertions");
|
|
355
368
|
|
|
356
369
|
// src/run-scenario/environment.ts
|
|
@@ -6340,16 +6353,61 @@ function getAdapter(runCommand) {
|
|
|
6340
6353
|
}
|
|
6341
6354
|
|
|
6342
6355
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
6343
|
-
var
|
|
6356
|
+
var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
6344
6357
|
|
|
6345
6358
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
6346
|
-
var
|
|
6359
|
+
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
6347
6360
|
var import_crypto = require("crypto");
|
|
6361
|
+
var import_promises5 = require("fs/promises");
|
|
6362
|
+
var import_path7 = require("path");
|
|
6363
|
+
|
|
6364
|
+
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
6348
6365
|
var import_promises3 = require("fs/promises");
|
|
6349
6366
|
var import_path5 = require("path");
|
|
6367
|
+
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
6368
|
+
/**
 * Writes the given MCP definitions to `<cwd>/.mcp.json`.
 *
 * The file holds a single object keyed by `MCP_SERVERS_JSON_KEY`, whose value
 * maps each entry's `name` to its `config`. Nothing is written when `mcps` is
 * empty. When two entries share a name, the later one wins.
 *
 * @param {string} cwd - Directory in which `.mcp.json` is created.
 * @param {Array<{name: string, config: unknown}>} mcps - MCP entries to write.
 * @returns {Promise<void>}
 */
async function writeMcpToFilesystem(cwd, mcps) {
  if (mcps.length === 0) return;
  // Object.fromEntries always defines own properties, so a name such as
  // "__proto__" is serialized instead of silently replacing the prototype.
  const mcpServers = Object.fromEntries(
    mcps.map((mcp) => [mcp.name, mcp.config])
  );
  const content = JSON.stringify(
    { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: mcpServers },
    null,
    2
  );
  const filePath = (0, import_path5.join)(cwd, ".mcp.json");
  await (0, import_promises3.writeFile)(filePath, content, "utf8");
  console.log(`[MCP] Written to ${filePath}`);
}
|
|
6383
|
+
|
|
6384
|
+
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
6385
|
+
var import_promises4 = require("fs/promises");
|
|
6386
|
+
var import_path6 = require("path");
|
|
6387
|
+
var AGENTS_DIR = ".claude/agents";
|
|
6388
|
+
/**
 * Derives a unique, filesystem-friendly base filename for a sub-agent.
 *
 * The name is lower-cased, whitespace runs become `-`, every character outside
 * `[a-z0-9-]` is dropped and leading/trailing dashes are trimmed. If nothing
 * remains, `sub-agent-<index>` is used. The first use of a slug returns it
 * unchanged; repeats get a numeric suffix (`-2`, `-3`, ...).
 *
 * @param {string} name2 - Display name of the sub-agent (may be empty).
 * @param {number} index - Position of the agent, used for the fallback name.
 * @param {Map<string, number>} nameCount - Bookkeeping shared across calls;
 *   mutated to record both slugs and the suffixed names already handed out.
 * @returns {string} Filename without extension, unique per `nameCount`.
 */
function toAgentFilename(name2, index, nameCount) {
  const slug = (name2 || "")
    .toLowerCase()
    .replace(/\s+/g, "-")
    .replace(/[^a-z0-9-]/g, "")
    .replace(/^-+|-+$/g, "");
  const base = slug || `sub-agent-${index}`;
  let count = nameCount.get(base) ?? 0;
  if (count === 0) {
    nameCount.set(base, 1);
    return base;
  }
  let candidate = `${base}-${count + 1}`;
  // A suffixed name may equal another agent's own slug (e.g. "a", "a", "a-2");
  // skip candidates that were already issued so no file is overwritten.
  while (nameCount.has(candidate)) {
    count += 1;
    candidate = `${base}-${count + 1}`;
  }
  nameCount.set(base, count + 1);
  // Reserve the issued name so a later agent with this exact slug is suffixed.
  nameCount.set(candidate, 1);
  return candidate;
}
|
|
6394
|
+
/**
 * Writes one markdown file per sub-agent into `<cwd>/.claude/agents`.
 *
 * The directory is created if missing. Files are written one after another,
 * each named `<filename>.md` where the filename comes from `toAgentFilename`,
 * and each containing the agent's `subAgentMd` text. Nothing happens when
 * `subAgents` is empty.
 *
 * @param {string} cwd - Working directory that receives the agents folder.
 * @param {Array<{name: string, subAgentMd: string}>} subAgents - Agents to write.
 * @returns {Promise<void>}
 */
async function writeSubAgentsToFilesystem(cwd, subAgents) {
  if (subAgents.length === 0) return;
  const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
  await (0, import_promises4.mkdir)(agentsDir, { recursive: true });
  // Shared across iterations so repeated agent names get distinct filenames.
  const usedNames = /* @__PURE__ */ new Map();
  let position = 0;
  for (const agent of subAgents) {
    const targetPath = (0, import_path6.join)(
      agentsDir,
      `${toAgentFilename(agent.name, position, usedNames)}.md`
    );
    await (0, import_promises4.writeFile)(targetPath, agent.subAgentMd, "utf8");
    position += 1;
  }
  console.log(`[SubAgents] Written to ${agentsDir}`);
}
|
|
6406
|
+
|
|
6407
|
+
// src/run-scenario/agents/claude-code/execute.ts
|
|
6350
6408
|
var DEFAULT_MODEL = "claude-3-5-sonnet-latest";
|
|
6351
6409
|
function calculateStepCost(inputTokens, outputTokens, modelName) {
|
|
6352
|
-
const model =
|
|
6410
|
+
const model = import_evalforge_types3.AVAILABLE_MODELS.find(
|
|
6353
6411
|
(m) => m.name === modelName || m.providerModelId === modelName || // Handle model aliases like "claude-3-5-sonnet-latest" -> "claude-3-5-sonnet-20241022"
|
|
6354
6412
|
modelName.includes("claude-3-5-sonnet") ? m.providerModelId.includes("claude-3-5-sonnet") : modelName.includes("claude-4-sonnet") ? m.providerModelId.includes("claude-4-sonnet") : modelName.includes("claude-4-opus") ? m.providerModelId.includes("claude-4-opus") : false
|
|
6355
6413
|
);
|
|
@@ -6363,7 +6421,7 @@ function calculateStepCost(inputTokens, outputTokens, modelName) {
|
|
|
6363
6421
|
return inputCost + outputCost;
|
|
6364
6422
|
}
|
|
6365
6423
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
6366
|
-
console.log(`${
|
|
6424
|
+
console.log(`${import_evalforge_types3.TRACE_EVENT_PREFIX}${JSON.stringify(event)}`);
|
|
6367
6425
|
if (tracePushUrl) {
|
|
6368
6426
|
pushTraceEvent(tracePushUrl, event, routeHeader, authToken).catch((err) => {
|
|
6369
6427
|
console.error("[Trace Push] Failed to push trace event:", err);
|
|
@@ -6440,23 +6498,23 @@ async function pushTraceEvent(url, event, routeHeader, authToken) {
|
|
|
6440
6498
|
}
|
|
6441
6499
|
}
|
|
6442
6500
|
function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
6443
|
-
let type =
|
|
6501
|
+
let type = import_evalforge_types3.LiveTraceEventType.COMPLETION;
|
|
6444
6502
|
let toolName;
|
|
6445
6503
|
let toolArgs;
|
|
6446
6504
|
let outputPreview;
|
|
6447
6505
|
let filePath;
|
|
6448
6506
|
for (const block of message.message.content) {
|
|
6449
6507
|
if (block.type === "tool_use") {
|
|
6450
|
-
type =
|
|
6508
|
+
type = import_evalforge_types3.LiveTraceEventType.TOOL_USE;
|
|
6451
6509
|
toolName = block.name;
|
|
6452
6510
|
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
6453
6511
|
const input = block.input;
|
|
6454
6512
|
if (input.file_path || input.path || input.target_file) {
|
|
6455
6513
|
filePath = String(input.file_path || input.path || input.target_file);
|
|
6456
6514
|
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
6457
|
-
type =
|
|
6515
|
+
type = import_evalforge_types3.LiveTraceEventType.FILE_WRITE;
|
|
6458
6516
|
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
6459
|
-
type =
|
|
6517
|
+
type = import_evalforge_types3.LiveTraceEventType.FILE_READ;
|
|
6460
6518
|
}
|
|
6461
6519
|
}
|
|
6462
6520
|
} else if (block.type === "text") {
|
|
@@ -6514,7 +6572,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6514
6572
|
}
|
|
6515
6573
|
return {
|
|
6516
6574
|
...baseEvent,
|
|
6517
|
-
type:
|
|
6575
|
+
type: import_evalforge_types3.LiveTraceEventType.USER,
|
|
6518
6576
|
outputPreview: outputPreview || "(tool result)"
|
|
6519
6577
|
};
|
|
6520
6578
|
}
|
|
@@ -6522,7 +6580,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6522
6580
|
const sysMsg = message;
|
|
6523
6581
|
return {
|
|
6524
6582
|
...baseEvent,
|
|
6525
|
-
type:
|
|
6583
|
+
type: import_evalforge_types3.LiveTraceEventType.SYSTEM,
|
|
6526
6584
|
outputPreview: sysMsg.subtype || "system"
|
|
6527
6585
|
};
|
|
6528
6586
|
}
|
|
@@ -6531,7 +6589,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
6531
6589
|
}
|
|
6532
6590
|
return {
|
|
6533
6591
|
...baseEvent,
|
|
6534
|
-
type:
|
|
6592
|
+
type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
|
|
6535
6593
|
outputPreview: `Message type: ${message.type}`
|
|
6536
6594
|
};
|
|
6537
6595
|
}
|
|
@@ -6574,6 +6632,12 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6574
6632
|
}
|
|
6575
6633
|
const startTime = /* @__PURE__ */ new Date();
|
|
6576
6634
|
const allMessages = [];
|
|
6635
|
+
if (options.mcps && options.mcps.length > 0) {
|
|
6636
|
+
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
6637
|
+
}
|
|
6638
|
+
if (options.subAgents && options.subAgents.length > 0) {
|
|
6639
|
+
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
6640
|
+
}
|
|
6577
6641
|
console.error(
|
|
6578
6642
|
"[DEBUG-H4] writeSkillsToFilesystem START",
|
|
6579
6643
|
JSON.stringify({
|
|
@@ -6664,15 +6728,24 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6664
6728
|
const canUseTool = async () => {
|
|
6665
6729
|
return { behavior: "allow" };
|
|
6666
6730
|
};
|
|
6731
|
+
const baseAllowedTools = [
|
|
6732
|
+
"Skill",
|
|
6733
|
+
"Read",
|
|
6734
|
+
"Write",
|
|
6735
|
+
"Edit",
|
|
6736
|
+
"Bash",
|
|
6737
|
+
"Glob",
|
|
6738
|
+
"Grep"
|
|
6739
|
+
];
|
|
6740
|
+
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
6667
6741
|
const queryOptions = {
|
|
6668
6742
|
env: sdkEnv,
|
|
6669
6743
|
cwd: options.cwd,
|
|
6670
6744
|
settingSources: ["project"],
|
|
6671
|
-
allowedTools
|
|
6745
|
+
allowedTools,
|
|
6672
6746
|
model: options.model || DEFAULT_MODEL,
|
|
6673
6747
|
maxTurns,
|
|
6674
6748
|
maxThinkingTokens: options.maxThinkingTokens,
|
|
6675
|
-
mcpServers: options.mcpServers,
|
|
6676
6749
|
// Use 'default' permission mode with custom canUseTool handler
|
|
6677
6750
|
// instead of 'bypassPermissions' which fails on root
|
|
6678
6751
|
permissionMode: "default",
|
|
@@ -6700,10 +6773,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6700
6773
|
);
|
|
6701
6774
|
console.log("[SDK-DEBUG] settingSources:", queryOptions.settingSources);
|
|
6702
6775
|
console.log("[SDK-DEBUG] allowedTools:", queryOptions.allowedTools);
|
|
6703
|
-
console.log(
|
|
6704
|
-
"[SDK-DEBUG] mcpServers:",
|
|
6705
|
-
queryOptions.mcpServers ? Object.keys(queryOptions.mcpServers) : "none"
|
|
6706
|
-
);
|
|
6707
6776
|
console.log("[SDK-DEBUG] Calling SDK query()...");
|
|
6708
6777
|
if (traceContext) {
|
|
6709
6778
|
const preExecEvent = {
|
|
@@ -6713,7 +6782,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6713
6782
|
targetId: traceContext.targetId,
|
|
6714
6783
|
targetName: traceContext.targetName,
|
|
6715
6784
|
stepNumber: 0,
|
|
6716
|
-
type:
|
|
6785
|
+
type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
|
|
6717
6786
|
outputPreview: JSON.stringify({
|
|
6718
6787
|
event: "pre-sdk-execution",
|
|
6719
6788
|
model: queryOptions.model,
|
|
@@ -6782,7 +6851,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6782
6851
|
targetId: traceContext.targetId,
|
|
6783
6852
|
targetName: traceContext.targetName,
|
|
6784
6853
|
stepNumber: traceStepNumber,
|
|
6785
|
-
type:
|
|
6854
|
+
type: import_evalforge_types3.LiveTraceEventType.PROGRESS,
|
|
6786
6855
|
outputPreview: progressMessage,
|
|
6787
6856
|
toolName: lastToolName,
|
|
6788
6857
|
filePath: lastFilePath,
|
|
@@ -6839,18 +6908,18 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
6839
6908
|
if (traceEvent) {
|
|
6840
6909
|
lastToolName = traceEvent.toolName;
|
|
6841
6910
|
lastFilePath = traceEvent.filePath;
|
|
6842
|
-
if (traceEvent.type ===
|
|
6911
|
+
if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.THINKING) {
|
|
6843
6912
|
lastAction = "Thinking...";
|
|
6844
|
-
} else if (traceEvent.type ===
|
|
6913
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.TOOL_USE) {
|
|
6845
6914
|
lastAction = extractToolActionDescription(
|
|
6846
6915
|
traceEvent.toolName,
|
|
6847
6916
|
traceEvent.toolArgs
|
|
6848
6917
|
);
|
|
6849
|
-
} else if (traceEvent.type ===
|
|
6918
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_WRITE) {
|
|
6850
6919
|
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
6851
|
-
} else if (traceEvent.type ===
|
|
6920
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.FILE_READ) {
|
|
6852
6921
|
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
6853
|
-
} else if (traceEvent.type ===
|
|
6922
|
+
} else if (traceEvent.type === import_evalforge_types3.LiveTraceEventType.COMPLETION) {
|
|
6854
6923
|
lastAction = "Processing response...";
|
|
6855
6924
|
}
|
|
6856
6925
|
emitTraceEvent(
|
|
@@ -7033,7 +7102,7 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
|
7033
7102
|
targetId: traceContext.targetId,
|
|
7034
7103
|
targetName: traceContext.targetName,
|
|
7035
7104
|
stepNumber: traceStepNumber + 1,
|
|
7036
|
-
type:
|
|
7105
|
+
type: import_evalforge_types3.LiveTraceEventType.DIAGNOSTIC,
|
|
7037
7106
|
outputPreview: JSON.stringify(
|
|
7038
7107
|
{
|
|
7039
7108
|
event: "sdk-execution-failed",
|
|
@@ -7072,7 +7141,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7072
7141
|
targetId: traceContext.targetId,
|
|
7073
7142
|
targetName: traceContext.targetName,
|
|
7074
7143
|
stepNumber: traceStepNumber + 1,
|
|
7075
|
-
type:
|
|
7144
|
+
type: import_evalforge_types3.LiveTraceEventType.COMPLETION,
|
|
7076
7145
|
outputPreview: "Scenario execution completed",
|
|
7077
7146
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7078
7147
|
isComplete: true
|
|
@@ -7114,10 +7183,10 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7114
7183
|
async function writeSkillsToFilesystem(cwd, skills) {
|
|
7115
7184
|
for (const skill of skills) {
|
|
7116
7185
|
const skillName = skill.name;
|
|
7117
|
-
const skillDir = (0,
|
|
7118
|
-
await (0,
|
|
7119
|
-
const skillPath = (0,
|
|
7120
|
-
await (0,
|
|
7186
|
+
const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
|
|
7187
|
+
await (0, import_promises5.mkdir)(skillDir, { recursive: true });
|
|
7188
|
+
const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
|
|
7189
|
+
await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
|
|
7121
7190
|
console.log(`[Skill] Written to ${skillPath}`);
|
|
7122
7191
|
}
|
|
7123
7192
|
}
|
|
@@ -7250,7 +7319,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
7250
7319
|
return {
|
|
7251
7320
|
id: (0, import_crypto.randomUUID)(),
|
|
7252
7321
|
stepNumber: index + 1,
|
|
7253
|
-
type: step.toolCalls?.length ?
|
|
7322
|
+
type: step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : import_evalforge_types3.LLMStepType.COMPLETION,
|
|
7254
7323
|
model,
|
|
7255
7324
|
provider: "anthropic",
|
|
7256
7325
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -7321,9 +7390,11 @@ var ClaudeCodeAdapter = class {
|
|
|
7321
7390
|
modelConfig,
|
|
7322
7391
|
aiGatewayUrl,
|
|
7323
7392
|
aiGatewayHeaders,
|
|
7324
|
-
traceContext
|
|
7393
|
+
traceContext,
|
|
7394
|
+
mcps,
|
|
7395
|
+
subAgents
|
|
7325
7396
|
} = context;
|
|
7326
|
-
const modelForSdk = modelConfig?.model ?
|
|
7397
|
+
const modelForSdk = modelConfig?.model ? import_evalforge_types4.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
|
|
7327
7398
|
const options = {
|
|
7328
7399
|
cwd,
|
|
7329
7400
|
model: modelForSdk,
|
|
@@ -7331,7 +7402,9 @@ var ClaudeCodeAdapter = class {
|
|
|
7331
7402
|
maxTokens: modelConfig?.maxTokens,
|
|
7332
7403
|
aiGatewayUrl,
|
|
7333
7404
|
aiGatewayHeaders,
|
|
7334
|
-
traceContext
|
|
7405
|
+
traceContext,
|
|
7406
|
+
mcps,
|
|
7407
|
+
subAgents
|
|
7335
7408
|
};
|
|
7336
7409
|
const { result, llmTrace } = await executeWithClaudeCode(
|
|
7337
7410
|
skills,
|
|
@@ -7358,7 +7431,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
7358
7431
|
|
|
7359
7432
|
// src/run-scenario/file-diff.ts
|
|
7360
7433
|
var import_fs6 = require("fs");
|
|
7361
|
-
var
|
|
7434
|
+
var import_path8 = require("path");
|
|
7362
7435
|
|
|
7363
7436
|
// ../../node_modules/diff/lib/index.mjs
|
|
7364
7437
|
function Diff() {
|
|
@@ -7534,7 +7607,7 @@ Diff.prototype = {
|
|
|
7534
7607
|
tokenize: function tokenize(value) {
|
|
7535
7608
|
return Array.from(value);
|
|
7536
7609
|
},
|
|
7537
|
-
join: function
|
|
7610
|
+
join: function join5(chars) {
|
|
7538
7611
|
return chars.join("");
|
|
7539
7612
|
},
|
|
7540
7613
|
postProcess: function postProcess(changeObjects) {
|
|
@@ -7974,8 +8047,8 @@ function snapshotDirectory(dir, baseDir) {
|
|
|
7974
8047
|
}
|
|
7975
8048
|
const entries = (0, import_fs6.readdirSync)(dir, { withFileTypes: true });
|
|
7976
8049
|
for (const entry of entries) {
|
|
7977
|
-
const fullPath = (0,
|
|
7978
|
-
const relativePath = (0,
|
|
8050
|
+
const fullPath = (0, import_path8.join)(dir, entry.name);
|
|
8051
|
+
const relativePath = (0, import_path8.relative)(base, fullPath);
|
|
7979
8052
|
if (shouldIgnore(entry.name)) {
|
|
7980
8053
|
continue;
|
|
7981
8054
|
}
|
|
@@ -8084,13 +8157,18 @@ function extractTemplateFiles(before, after) {
|
|
|
8084
8157
|
|
|
8085
8158
|
// src/run-scenario/run-agent-with-context.ts
|
|
8086
8159
|
var DEFAULT_AGENT_COMMAND = "claude";
|
|
8087
|
-
async function runAgentWithContext(config, evalRunId2, scenario,
|
|
8160
|
+
async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
|
|
8161
|
+
const skillsGroupId = evalData.evalRun.skillsGroupId;
|
|
8162
|
+
if (!skillsGroupId) {
|
|
8163
|
+
throw new Error(`Eval run ${evalData.evalRun.id} has no skillsGroupId`);
|
|
8164
|
+
}
|
|
8165
|
+
const agent = evalData.codeAgent ?? void 0;
|
|
8088
8166
|
const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
8089
8167
|
const adapter = getAdapter(runCommand);
|
|
8090
8168
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
8091
8169
|
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
8092
8170
|
const executionContext = {
|
|
8093
|
-
skills,
|
|
8171
|
+
skills: evalData.skills,
|
|
8094
8172
|
scenario,
|
|
8095
8173
|
cwd: workDir || process.cwd(),
|
|
8096
8174
|
modelConfig: agent?.modelConfig,
|
|
@@ -8101,11 +8179,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
|
|
|
8101
8179
|
scenarioId: scenario.id,
|
|
8102
8180
|
scenarioName: scenario.name,
|
|
8103
8181
|
targetId: skillsGroupId,
|
|
8104
|
-
targetName: skillsGroupName,
|
|
8182
|
+
targetName: evalData.skillsGroupName,
|
|
8105
8183
|
tracePushUrl: config.tracePushUrl,
|
|
8106
8184
|
routeHeader: config.routeHeader,
|
|
8107
8185
|
authToken: config.authToken
|
|
8108
|
-
}
|
|
8186
|
+
},
|
|
8187
|
+
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
8188
|
+
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
|
|
8109
8189
|
};
|
|
8110
8190
|
const result = await adapter.execute(executionContext);
|
|
8111
8191
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -8115,7 +8195,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsG
|
|
|
8115
8195
|
return {
|
|
8116
8196
|
id: (0, import_crypto2.randomUUID)(),
|
|
8117
8197
|
targetId: skillsGroupId,
|
|
8118
|
-
targetName: skillsGroupName,
|
|
8198
|
+
targetName: evalData.skillsGroupName,
|
|
8119
8199
|
scenarioId: scenario.id,
|
|
8120
8200
|
scenarioName: scenario.name,
|
|
8121
8201
|
modelConfig: agent?.modelConfig,
|
|
@@ -8143,10 +8223,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8143
8223
|
config,
|
|
8144
8224
|
evalRunId2,
|
|
8145
8225
|
scenario,
|
|
8146
|
-
evalData
|
|
8147
|
-
skillsGroupId,
|
|
8148
|
-
evalData.skillsGroupName,
|
|
8149
|
-
evalData.codeAgent ?? void 0,
|
|
8226
|
+
evalData,
|
|
8150
8227
|
workDir
|
|
8151
8228
|
);
|
|
8152
8229
|
const inlineAssertions = scenario.assertions ?? [];
|
|
@@ -8178,10 +8255,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8178
8255
|
assertionContext
|
|
8179
8256
|
) : [];
|
|
8180
8257
|
const passed = assertionResults.filter(
|
|
8181
|
-
(r) => r.status ===
|
|
8258
|
+
(r) => r.status === import_evalforge_types5.AssertionResultStatus.PASSED
|
|
8182
8259
|
).length;
|
|
8183
8260
|
const failed = assertionResults.filter(
|
|
8184
|
-
(r) => r.status ===
|
|
8261
|
+
(r) => r.status === import_evalforge_types5.AssertionResultStatus.FAILED
|
|
8185
8262
|
).length;
|
|
8186
8263
|
const total = assertionResults.length;
|
|
8187
8264
|
const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
|
|
@@ -8195,7 +8272,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
|
|
|
8195
8272
|
}
|
|
8196
8273
|
|
|
8197
8274
|
// src/error-reporter.ts
|
|
8198
|
-
var
|
|
8275
|
+
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
8199
8276
|
function formatError(error, phase, context) {
|
|
8200
8277
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
8201
8278
|
if (error instanceof Error) {
|
|
@@ -8444,7 +8521,7 @@ async function runEvaluation(projectId2, evalRunId2) {
|
|
|
8444
8521
|
};
|
|
8445
8522
|
try {
|
|
8446
8523
|
await api.updateEvalRun(projectId2, evalRunId2, {
|
|
8447
|
-
status:
|
|
8524
|
+
status: import_evalforge_types7.EvalStatus.COMPLETED,
|
|
8448
8525
|
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
8449
8526
|
});
|
|
8450
8527
|
} catch (updateErr) {
|
|
@@ -8485,7 +8562,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
8485
8562
|
authToken: config.authToken
|
|
8486
8563
|
});
|
|
8487
8564
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
8488
|
-
status:
|
|
8565
|
+
status: import_evalforge_types7.EvalStatus.FAILED,
|
|
8489
8566
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8490
8567
|
jobError,
|
|
8491
8568
|
jobStatus: "FAILED"
|
|
@@ -8508,7 +8585,7 @@ runEvaluation(projectId, evalRunId).then(() => {
|
|
|
8508
8585
|
authToken
|
|
8509
8586
|
});
|
|
8510
8587
|
await api.updateEvalRun(projectId, evalRunId, {
|
|
8511
|
-
status:
|
|
8588
|
+
status: import_evalforge_types7.EvalStatus.FAILED,
|
|
8512
8589
|
completedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8513
8590
|
jobError: `Config load failed, then: ${jobError}`,
|
|
8514
8591
|
jobStatus: "FAILED"
|