@wix/evalforge-evaluator 0.115.0 → 0.117.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +109 -48
- package/build/index.js.map +2 -2
- package/build/index.mjs +109 -48
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +5 -0
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +6 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +8 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +1 -0
- package/build/types/run-scenario/file-diff.d.ts +10 -2
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -1199,6 +1199,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1199
1199
|
outputPreview: `Message type: ${message.type}`
|
|
1200
1200
|
};
|
|
1201
1201
|
}
|
|
1202
|
+
/**
 * Write the Claude Code infrastructure files into a working directory.
 *
 * Steps, in order:
 *   1. create `<cwd>/.claude` (recursively),
 *   2. seed `<cwd>/.claude/settings.json` with `{}` unless the file already exists,
 *   3. write MCPs, sub-agents and rules - each only when a non-empty list is supplied,
 *   4. write the skills.
 *
 * @param {string} cwd - Directory that receives the files.
 * @param {*} skills - Forwarded unchanged to `writeSkillsToFilesystem`.
 * @param {{ mcps?: Array, subAgents?: Array, rules?: Array }} options - Optional
 *   infrastructure lists; a missing or empty list skips the matching step.
 * @returns {Promise<void>}
 * @throws {Error} "Failed to write skills to filesystem: ..." when the skills
 *   writer fails; the original error is attached as `cause`.
 */
async function prepareClaudeCodeEnvironment(cwd, skills, options) {
  const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
  const claudeDir = `${cwd}/.claude`;
  await mkdirAsync(claudeDir, { recursive: true });

  // "wx" fails when the file exists, so a pre-existing settings.json is never
  // overwritten. Seeding the default stays best-effort, but only the expected
  // EEXIST is silent; anything else is reported instead of being swallowed.
  try {
    await writeFile6(`${claudeDir}/settings.json`, "{}", { flag: "wx" });
  } catch (settingsError) {
    if (settingsError?.code !== "EEXIST") {
      console.warn(
        "[prepareClaudeCodeEnvironment] Could not create default settings.json:",
        settingsError instanceof Error ? settingsError.message : String(settingsError)
      );
    }
  }

  if (options.mcps && options.mcps.length > 0) {
    await writeMcpToFilesystem(cwd, options.mcps);
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem(cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(cwd, options.rules);
  }

  try {
    await writeSkillsToFilesystem(cwd, skills);
  } catch (writeError) {
    // Keep the original error reachable through `cause` so callers that walk
    // the cause chain can still see the underlying failure.
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
}
|
|
1202
1227
|
async function executeWithClaudeCode(skills, scenario, options) {
|
|
1203
1228
|
const skillNames = skills.map((s) => s.name).join(", ");
|
|
1204
1229
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
@@ -1222,29 +1247,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1222
1247
|
}
|
|
1223
1248
|
const startTime = /* @__PURE__ */ new Date();
|
|
1224
1249
|
const allMessages = [];
|
|
1225
|
-
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1226
|
-
const claudeDir = `${options.cwd}/.claude`;
|
|
1227
|
-
await mkdirAsync(claudeDir, { recursive: true });
|
|
1228
|
-
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1229
|
-
flag: "wx"
|
|
1230
|
-
}).catch(() => {
|
|
1231
|
-
});
|
|
1232
|
-
if (options.mcps && options.mcps.length > 0) {
|
|
1233
|
-
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
1234
|
-
}
|
|
1235
|
-
if (options.subAgents && options.subAgents.length > 0) {
|
|
1236
|
-
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
1237
|
-
}
|
|
1238
|
-
if (options.rules && options.rules.length > 0) {
|
|
1239
|
-
await writeRulesToFilesystem(options.cwd, options.rules);
|
|
1240
|
-
}
|
|
1241
|
-
try {
|
|
1242
|
-
await writeSkillsToFilesystem(options.cwd, skills);
|
|
1243
|
-
} catch (writeError) {
|
|
1244
|
-
throw new Error(
|
|
1245
|
-
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
1246
|
-
);
|
|
1247
|
-
}
|
|
1248
1250
|
const sdkEnv = buildSdkEnvironment(options);
|
|
1249
1251
|
let traceStepNumber = 0;
|
|
1250
1252
|
const traceContext = options.traceContext;
|
|
@@ -2086,6 +2088,17 @@ var ClaudeCodeAdapter = class {
|
|
|
2086
2088
|
id = "claude-code";
|
|
2087
2089
|
name = "Claude Code";
|
|
2088
2090
|
supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
|
|
2091
|
+
/**
|
|
2092
|
+
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
2093
|
+
* before the baseline snapshot is taken.
|
|
2094
|
+
*/
|
|
2095
|
+
async prepareEnvironment(context) {
|
|
2096
|
+
await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
|
|
2097
|
+
mcps: context.mcps,
|
|
2098
|
+
subAgents: context.subAgents,
|
|
2099
|
+
rules: context.rules
|
|
2100
|
+
});
|
|
2101
|
+
}
|
|
2089
2102
|
/**
|
|
2090
2103
|
* Execute a skill using the Claude Code SDK.
|
|
2091
2104
|
*
|
|
@@ -2736,37 +2749,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
|
|
|
2736
2749
|
return null;
|
|
2737
2750
|
}
|
|
2738
2751
|
}
|
|
2739
|
-
async function
|
|
2740
|
-
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2741
|
-
console.log("[executeWithOpenCode] Starting execution", {
|
|
2742
|
-
skillCount: skills.length,
|
|
2743
|
-
skillNames,
|
|
2744
|
-
scenarioId: scenario.id,
|
|
2745
|
-
scenarioName: scenario.name,
|
|
2746
|
-
cwd: options.cwd,
|
|
2747
|
-
aiGatewayUrl: options.aiGatewayUrl,
|
|
2748
|
-
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2749
|
-
model: options.model
|
|
2750
|
-
});
|
|
2751
|
-
const startTime = /* @__PURE__ */ new Date();
|
|
2752
|
+
/**
 * Write the OpenCode infrastructure files into a working directory.
 *
 * MCPs are only logged here (nothing is written for them in this function).
 * Sub-agents and rules are written when a non-empty list is supplied, and the
 * skills are written last.
 *
 * @param {string} cwd - Directory that receives the files.
 * @param {*} skills - Forwarded unchanged to `writeSkillsToFilesystem2`.
 * @param {{ mcps?: Array, subAgents?: Array, rules?: Array }} options - Optional
 *   infrastructure lists; a missing or empty list skips the matching step.
 * @returns {Promise<void>}
 * @throws {Error} "Failed to write skills to filesystem: ..." when the skills
 *   writer fails; the original error is attached as `cause`.
 */
async function prepareOpenCodeEnvironment(cwd, skills, options) {
  if (options.mcps && options.mcps.length > 0) {
    console.log(
      `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
    );
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem2(cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem2(cwd, skills);
  } catch (writeError) {
    // Keep the original error reachable through `cause` so code that walks the
    // cause chain can still report the underlying failure.
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
}
|
|
2772
|
+
async function executeWithOpenCode(skills, scenario, options) {
|
|
2773
|
+
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2774
|
+
console.log("[executeWithOpenCode] Starting execution", {
|
|
2775
|
+
skillCount: skills.length,
|
|
2776
|
+
skillNames,
|
|
2777
|
+
scenarioId: scenario.id,
|
|
2778
|
+
scenarioName: scenario.name,
|
|
2779
|
+
cwd: options.cwd,
|
|
2780
|
+
aiGatewayUrl: options.aiGatewayUrl,
|
|
2781
|
+
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2782
|
+
model: options.model
|
|
2783
|
+
});
|
|
2784
|
+
const startTime = /* @__PURE__ */ new Date();
|
|
2770
2785
|
const maxTurns = options.maxTurns ?? 10;
|
|
2771
2786
|
const { config, providerID, modelID } = await buildOpenCodeConfig({
|
|
2772
2787
|
model: options.model,
|
|
@@ -3097,9 +3112,24 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3097
3112
|
const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
|
|
3098
3113
|
const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
|
|
3099
3114
|
const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
|
|
3115
|
+
const causeDetails = [];
|
|
3116
|
+
let current = sdkError;
|
|
3117
|
+
while (current instanceof Error && current.cause) {
|
|
3118
|
+
current = current.cause;
|
|
3119
|
+
if (current instanceof Error) {
|
|
3120
|
+
causeDetails.push(`${current.name}: ${current.message}`);
|
|
3121
|
+
} else {
|
|
3122
|
+
causeDetails.push(String(current));
|
|
3123
|
+
}
|
|
3124
|
+
}
|
|
3125
|
+
const causeChain = causeDetails.length > 0 ? `
|
|
3126
|
+
Cause chain: ${causeDetails.join(" -> ")}` : "";
|
|
3100
3127
|
console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
|
|
3101
3128
|
console.error("[SDK-ERROR] Error name:", errorName);
|
|
3102
3129
|
console.error("[SDK-ERROR] Error message:", errorMessage);
|
|
3130
|
+
if (causeDetails.length > 0) {
|
|
3131
|
+
console.error("[SDK-ERROR] Cause chain:", causeDetails.join(" -> "));
|
|
3132
|
+
}
|
|
3103
3133
|
if (errorStack) {
|
|
3104
3134
|
console.error("[SDK-ERROR] Stack:", errorStack);
|
|
3105
3135
|
}
|
|
@@ -3116,7 +3146,10 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3116
3146
|
outputPreview: JSON.stringify({
|
|
3117
3147
|
event: "sdk-execution-failed",
|
|
3118
3148
|
error: errorMessage,
|
|
3119
|
-
errorName
|
|
3149
|
+
errorName,
|
|
3150
|
+
...causeDetails.length > 0 && {
|
|
3151
|
+
causeChain: causeDetails.join(" -> ")
|
|
3152
|
+
}
|
|
3120
3153
|
}).slice(0, 2e3),
|
|
3121
3154
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3122
3155
|
isComplete: true
|
|
@@ -3127,7 +3160,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3127
3160
|
);
|
|
3128
3161
|
}
|
|
3129
3162
|
throw new Error(
|
|
3130
|
-
`OpenCode SDK execution failed: ${errorMessage}` + (errorStack ? `
|
|
3163
|
+
`OpenCode SDK execution failed: ${errorMessage}` + causeChain + (errorStack ? `
|
|
3131
3164
|
Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
3132
3165
|
);
|
|
3133
3166
|
} finally {
|
|
@@ -3146,6 +3179,13 @@ var OpenCodeAdapter = class {
|
|
|
3146
3179
|
id = "opencode";
|
|
3147
3180
|
name = "OpenCode";
|
|
3148
3181
|
supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
|
|
3182
|
+
async prepareEnvironment(context) {
|
|
3183
|
+
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
3184
|
+
mcps: context.mcps,
|
|
3185
|
+
subAgents: context.subAgents,
|
|
3186
|
+
rules: context.rules
|
|
3187
|
+
});
|
|
3188
|
+
}
|
|
3149
3189
|
async execute(context) {
|
|
3150
3190
|
const {
|
|
3151
3191
|
skills,
|
|
@@ -4273,6 +4313,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
4273
4313
|
};
|
|
4274
4314
|
|
|
4275
4315
|
// src/run-scenario/file-diff.ts
|
|
4316
|
+
/**
 * Work out which paths differ between two directory snapshots.
 *
 * A path counts when it is a key of `postPrep` and its value in `prePrep` is
 * either undefined or not strictly equal to the `postPrep` value. Paths that
 * exist only in `prePrep` are never reported.
 *
 * @param {Object} prePrep - Snapshot taken first, keyed by path.
 * @param {Object} postPrep - Snapshot taken second, keyed by path.
 * @returns {Set<string>} Matching paths, in `postPrep` key order.
 */
function deriveInfrastructurePaths(prePrep, postPrep) {
  const touchedPaths = Object.keys(postPrep).filter((filePath) => {
    const earlierContent = prePrep[filePath];
    return earlierContent === void 0 || earlierContent !== postPrep[filePath];
  });
  return new Set(touchedPaths);
}
|
|
4276
4325
|
var IGNORED_PATTERNS = [
|
|
4277
4326
|
"node_modules",
|
|
4278
4327
|
".git",
|
|
@@ -4376,7 +4425,7 @@ function generateDiffLines(before, after) {
|
|
|
4376
4425
|
}
|
|
4377
4426
|
return result;
|
|
4378
4427
|
}
|
|
4379
|
-
function diffSnapshots(before, after) {
|
|
4428
|
+
function diffSnapshots(before, after, infrastructurePaths) {
|
|
4380
4429
|
const diffs = [];
|
|
4381
4430
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4382
4431
|
for (const path2 of allPaths) {
|
|
@@ -4390,7 +4439,8 @@ function diffSnapshots(before, after) {
|
|
|
4390
4439
|
path: path2,
|
|
4391
4440
|
expected: beforeContent,
|
|
4392
4441
|
actual: afterContent,
|
|
4393
|
-
diffLines: diffLines2
|
|
4442
|
+
diffLines: diffLines2,
|
|
4443
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4394
4444
|
});
|
|
4395
4445
|
}
|
|
4396
4446
|
const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
|
|
@@ -4414,7 +4464,7 @@ function diffSnapshots(before, after) {
|
|
|
4414
4464
|
result.sort((a, b) => a.path.localeCompare(b.path));
|
|
4415
4465
|
return result;
|
|
4416
4466
|
}
|
|
4417
|
-
function extractTemplateFiles(before, after) {
|
|
4467
|
+
function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
4418
4468
|
const files = [];
|
|
4419
4469
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4420
4470
|
for (const path2 of allPaths) {
|
|
@@ -4434,7 +4484,8 @@ function extractTemplateFiles(before, after) {
|
|
|
4434
4484
|
files.push({
|
|
4435
4485
|
path: path2,
|
|
4436
4486
|
content: afterContent,
|
|
4437
|
-
status
|
|
4487
|
+
status,
|
|
4488
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4438
4489
|
});
|
|
4439
4490
|
}
|
|
4440
4491
|
files.sort((a, b) => a.path.localeCompare(b.path));
|
|
@@ -4450,7 +4501,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4450
4501
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
4451
4502
|
const adapter = getAdapter(identifier);
|
|
4452
4503
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4453
|
-
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4454
4504
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4455
4505
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4456
4506
|
const executionContext = {
|
|
@@ -4475,11 +4525,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4475
4525
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
4476
4526
|
systemPrompt: agent?.systemPrompt
|
|
4477
4527
|
};
|
|
4528
|
+
const hasPrepare = !!adapter.prepareEnvironment;
|
|
4529
|
+
const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
|
|
4530
|
+
if (hasPrepare) {
|
|
4531
|
+
await adapter.prepareEnvironment(executionContext);
|
|
4532
|
+
}
|
|
4533
|
+
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4534
|
+
const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
|
|
4478
4535
|
const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
|
|
4479
4536
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4480
4537
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4481
|
-
const fileDiffs = diffSnapshots(
|
|
4482
|
-
|
|
4538
|
+
const fileDiffs = diffSnapshots(
|
|
4539
|
+
beforeSnapshot,
|
|
4540
|
+
afterSnapshot,
|
|
4541
|
+
infrastructurePaths
|
|
4542
|
+
);
|
|
4543
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4483
4544
|
return {
|
|
4484
4545
|
id: (0, import_crypto4.randomUUID)(),
|
|
4485
4546
|
targetId,
|