@wix/evalforge-evaluator 0.115.0 → 0.116.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +89 -46
- package/build/index.js.map +2 -2
- package/build/index.mjs +89 -46
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +5 -0
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +6 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +8 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +1 -0
- package/build/types/run-scenario/file-diff.d.ts +10 -2
- package/package.json +5 -5
package/build/index.mjs
CHANGED
|
@@ -1191,6 +1191,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1191
1191
|
outputPreview: `Message type: ${message.type}`
|
|
1192
1192
|
};
|
|
1193
1193
|
}
|
|
1194
|
+
async function prepareClaudeCodeEnvironment(cwd, skills, options) {
|
|
1195
|
+
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1196
|
+
const claudeDir = `${cwd}/.claude`;
|
|
1197
|
+
await mkdirAsync(claudeDir, { recursive: true });
|
|
1198
|
+
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1199
|
+
flag: "wx"
|
|
1200
|
+
}).catch(() => {
|
|
1201
|
+
});
|
|
1202
|
+
if (options.mcps && options.mcps.length > 0) {
|
|
1203
|
+
await writeMcpToFilesystem(cwd, options.mcps);
|
|
1204
|
+
}
|
|
1205
|
+
if (options.subAgents && options.subAgents.length > 0) {
|
|
1206
|
+
await writeSubAgentsToFilesystem(cwd, options.subAgents);
|
|
1207
|
+
}
|
|
1208
|
+
if (options.rules && options.rules.length > 0) {
|
|
1209
|
+
await writeRulesToFilesystem(cwd, options.rules);
|
|
1210
|
+
}
|
|
1211
|
+
try {
|
|
1212
|
+
await writeSkillsToFilesystem(cwd, skills);
|
|
1213
|
+
} catch (writeError) {
|
|
1214
|
+
throw new Error(
|
|
1215
|
+
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
1216
|
+
);
|
|
1217
|
+
}
|
|
1218
|
+
}
|
|
1194
1219
|
async function executeWithClaudeCode(skills, scenario, options) {
|
|
1195
1220
|
const skillNames = skills.map((s) => s.name).join(", ");
|
|
1196
1221
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
@@ -1214,29 +1239,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1214
1239
|
}
|
|
1215
1240
|
const startTime = /* @__PURE__ */ new Date();
|
|
1216
1241
|
const allMessages = [];
|
|
1217
|
-
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1218
|
-
const claudeDir = `${options.cwd}/.claude`;
|
|
1219
|
-
await mkdirAsync(claudeDir, { recursive: true });
|
|
1220
|
-
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1221
|
-
flag: "wx"
|
|
1222
|
-
}).catch(() => {
|
|
1223
|
-
});
|
|
1224
|
-
if (options.mcps && options.mcps.length > 0) {
|
|
1225
|
-
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
1226
|
-
}
|
|
1227
|
-
if (options.subAgents && options.subAgents.length > 0) {
|
|
1228
|
-
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
1229
|
-
}
|
|
1230
|
-
if (options.rules && options.rules.length > 0) {
|
|
1231
|
-
await writeRulesToFilesystem(options.cwd, options.rules);
|
|
1232
|
-
}
|
|
1233
|
-
try {
|
|
1234
|
-
await writeSkillsToFilesystem(options.cwd, skills);
|
|
1235
|
-
} catch (writeError) {
|
|
1236
|
-
throw new Error(
|
|
1237
|
-
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
1238
|
-
);
|
|
1239
|
-
}
|
|
1240
1242
|
const sdkEnv = buildSdkEnvironment(options);
|
|
1241
1243
|
let traceStepNumber = 0;
|
|
1242
1244
|
const traceContext = options.traceContext;
|
|
@@ -2078,6 +2080,17 @@ var ClaudeCodeAdapter = class {
|
|
|
2078
2080
|
id = "claude-code";
|
|
2079
2081
|
name = "Claude Code";
|
|
2080
2082
|
supportedCommands = [AgentRunCommand.CLAUDE];
|
|
2083
|
+
/**
|
|
2084
|
+
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
2085
|
+
* before the baseline snapshot is taken.
|
|
2086
|
+
*/
|
|
2087
|
+
async prepareEnvironment(context) {
|
|
2088
|
+
await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
|
|
2089
|
+
mcps: context.mcps,
|
|
2090
|
+
subAgents: context.subAgents,
|
|
2091
|
+
rules: context.rules
|
|
2092
|
+
});
|
|
2093
|
+
}
|
|
2081
2094
|
/**
|
|
2082
2095
|
* Execute a skill using the Claude Code SDK.
|
|
2083
2096
|
*
|
|
@@ -2737,37 +2750,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
|
|
|
2737
2750
|
return null;
|
|
2738
2751
|
}
|
|
2739
2752
|
}
|
|
2740
|
-
async function executeWithOpenCode(skills, scenario, options) {
|
|
2741
|
-
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2742
|
-
console.log("[executeWithOpenCode] Starting execution", {
|
|
2743
|
-
skillCount: skills.length,
|
|
2744
|
-
skillNames,
|
|
2745
|
-
scenarioId: scenario.id,
|
|
2746
|
-
scenarioName: scenario.name,
|
|
2747
|
-
cwd: options.cwd,
|
|
2748
|
-
aiGatewayUrl: options.aiGatewayUrl,
|
|
2749
|
-
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2750
|
-
model: options.model
|
|
2751
|
-
});
|
|
2752
|
-
const startTime = /* @__PURE__ */ new Date();
|
|
2753
|
+
async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
2753
2754
|
if (options.mcps && options.mcps.length > 0) {
|
|
2754
2755
|
console.log(
|
|
2755
2756
|
`[MCP] ${options.mcps.length} MCP(s) will be configured inline`
|
|
2756
2757
|
);
|
|
2757
2758
|
}
|
|
2758
2759
|
if (options.subAgents && options.subAgents.length > 0) {
|
|
2759
|
-
await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
|
|
2760
|
+
await writeSubAgentsToFilesystem2(cwd, options.subAgents);
|
|
2760
2761
|
}
|
|
2761
2762
|
if (options.rules && options.rules.length > 0) {
|
|
2762
|
-
await writeRulesToFilesystem(options.cwd, options.rules);
|
|
2763
|
+
await writeRulesToFilesystem(cwd, options.rules);
|
|
2763
2764
|
}
|
|
2764
2765
|
try {
|
|
2765
|
-
await writeSkillsToFilesystem2(options.cwd, skills);
|
|
2766
|
+
await writeSkillsToFilesystem2(cwd, skills);
|
|
2766
2767
|
} catch (writeError) {
|
|
2767
2768
|
throw new Error(
|
|
2768
2769
|
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
2769
2770
|
);
|
|
2770
2771
|
}
|
|
2772
|
+
}
|
|
2773
|
+
async function executeWithOpenCode(skills, scenario, options) {
|
|
2774
|
+
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2775
|
+
console.log("[executeWithOpenCode] Starting execution", {
|
|
2776
|
+
skillCount: skills.length,
|
|
2777
|
+
skillNames,
|
|
2778
|
+
scenarioId: scenario.id,
|
|
2779
|
+
scenarioName: scenario.name,
|
|
2780
|
+
cwd: options.cwd,
|
|
2781
|
+
aiGatewayUrl: options.aiGatewayUrl,
|
|
2782
|
+
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2783
|
+
model: options.model
|
|
2784
|
+
});
|
|
2785
|
+
const startTime = /* @__PURE__ */ new Date();
|
|
2771
2786
|
const maxTurns = options.maxTurns ?? 10;
|
|
2772
2787
|
const { config, providerID, modelID } = await buildOpenCodeConfig({
|
|
2773
2788
|
model: options.model,
|
|
@@ -3147,6 +3162,13 @@ var OpenCodeAdapter = class {
|
|
|
3147
3162
|
id = "opencode";
|
|
3148
3163
|
name = "OpenCode";
|
|
3149
3164
|
supportedCommands = [AgentRunCommand2.OPENCODE];
|
|
3165
|
+
async prepareEnvironment(context) {
|
|
3166
|
+
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
3167
|
+
mcps: context.mcps,
|
|
3168
|
+
subAgents: context.subAgents,
|
|
3169
|
+
rules: context.rules
|
|
3170
|
+
});
|
|
3171
|
+
}
|
|
3150
3172
|
async execute(context) {
|
|
3151
3173
|
const {
|
|
3152
3174
|
skills,
|
|
@@ -4282,6 +4304,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
4282
4304
|
};
|
|
4283
4305
|
|
|
4284
4306
|
// src/run-scenario/file-diff.ts
|
|
4307
|
+
function deriveInfrastructurePaths(prePrep, postPrep) {
|
|
4308
|
+
const infraPaths = /* @__PURE__ */ new Set();
|
|
4309
|
+
for (const path2 of Object.keys(postPrep)) {
|
|
4310
|
+
if (prePrep[path2] === void 0 || prePrep[path2] !== postPrep[path2]) {
|
|
4311
|
+
infraPaths.add(path2);
|
|
4312
|
+
}
|
|
4313
|
+
}
|
|
4314
|
+
return infraPaths;
|
|
4315
|
+
}
|
|
4285
4316
|
var IGNORED_PATTERNS = [
|
|
4286
4317
|
"node_modules",
|
|
4287
4318
|
".git",
|
|
@@ -4385,7 +4416,7 @@ function generateDiffLines(before, after) {
|
|
|
4385
4416
|
}
|
|
4386
4417
|
return result;
|
|
4387
4418
|
}
|
|
4388
|
-
function diffSnapshots(before, after) {
|
|
4419
|
+
function diffSnapshots(before, after, infrastructurePaths) {
|
|
4389
4420
|
const diffs = [];
|
|
4390
4421
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4391
4422
|
for (const path2 of allPaths) {
|
|
@@ -4399,7 +4430,8 @@ function diffSnapshots(before, after) {
|
|
|
4399
4430
|
path: path2,
|
|
4400
4431
|
expected: beforeContent,
|
|
4401
4432
|
actual: afterContent,
|
|
4402
|
-
diffLines: diffLines2
|
|
4433
|
+
diffLines: diffLines2,
|
|
4434
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4403
4435
|
});
|
|
4404
4436
|
}
|
|
4405
4437
|
const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
|
|
@@ -4423,7 +4455,7 @@ function diffSnapshots(before, after) {
|
|
|
4423
4455
|
result.sort((a, b) => a.path.localeCompare(b.path));
|
|
4424
4456
|
return result;
|
|
4425
4457
|
}
|
|
4426
|
-
function extractTemplateFiles(before, after) {
|
|
4458
|
+
function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
4427
4459
|
const files = [];
|
|
4428
4460
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4429
4461
|
for (const path2 of allPaths) {
|
|
@@ -4443,7 +4475,8 @@ function extractTemplateFiles(before, after) {
|
|
|
4443
4475
|
files.push({
|
|
4444
4476
|
path: path2,
|
|
4445
4477
|
content: afterContent,
|
|
4446
|
-
status
|
|
4478
|
+
status,
|
|
4479
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4447
4480
|
});
|
|
4448
4481
|
}
|
|
4449
4482
|
files.sort((a, b) => a.path.localeCompare(b.path));
|
|
@@ -4459,7 +4492,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4459
4492
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
4460
4493
|
const adapter = getAdapter(identifier);
|
|
4461
4494
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4462
|
-
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4463
4495
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4464
4496
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4465
4497
|
const executionContext = {
|
|
@@ -4484,11 +4516,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4484
4516
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
4485
4517
|
systemPrompt: agent?.systemPrompt
|
|
4486
4518
|
};
|
|
4519
|
+
const hasPrepare = !!adapter.prepareEnvironment;
|
|
4520
|
+
const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
|
|
4521
|
+
if (hasPrepare) {
|
|
4522
|
+
await adapter.prepareEnvironment(executionContext);
|
|
4523
|
+
}
|
|
4524
|
+
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4525
|
+
const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
|
|
4487
4526
|
const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
|
|
4488
4527
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4489
4528
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4490
|
-
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
4491
|
-
|
|
4529
|
+
const fileDiffs = diffSnapshots(
|
|
4530
|
+
beforeSnapshot,
|
|
4531
|
+
afterSnapshot,
|
|
4532
|
+
infrastructurePaths
|
|
4533
|
+
);
|
|
4534
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4492
4535
|
return {
|
|
4493
4536
|
id: randomUUID4(),
|
|
4494
4537
|
targetId,
|