@wix/evalforge-evaluator 0.115.0 → 0.116.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +89 -46
- package/build/index.js.map +2 -2
- package/build/index.mjs +89 -46
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +5 -0
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +6 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +8 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +1 -0
- package/build/types/run-scenario/file-diff.d.ts +10 -2
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -1199,6 +1199,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1199
1199
|
outputPreview: `Message type: ${message.type}`
|
|
1200
1200
|
};
|
|
1201
1201
|
}
|
|
1202
|
+
async function prepareClaudeCodeEnvironment(cwd, skills, options) {
|
|
1203
|
+
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1204
|
+
const claudeDir = `${cwd}/.claude`;
|
|
1205
|
+
await mkdirAsync(claudeDir, { recursive: true });
|
|
1206
|
+
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1207
|
+
flag: "wx"
|
|
1208
|
+
}).catch(() => {
|
|
1209
|
+
});
|
|
1210
|
+
if (options.mcps && options.mcps.length > 0) {
|
|
1211
|
+
await writeMcpToFilesystem(cwd, options.mcps);
|
|
1212
|
+
}
|
|
1213
|
+
if (options.subAgents && options.subAgents.length > 0) {
|
|
1214
|
+
await writeSubAgentsToFilesystem(cwd, options.subAgents);
|
|
1215
|
+
}
|
|
1216
|
+
if (options.rules && options.rules.length > 0) {
|
|
1217
|
+
await writeRulesToFilesystem(cwd, options.rules);
|
|
1218
|
+
}
|
|
1219
|
+
try {
|
|
1220
|
+
await writeSkillsToFilesystem(cwd, skills);
|
|
1221
|
+
} catch (writeError) {
|
|
1222
|
+
throw new Error(
|
|
1223
|
+
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
1224
|
+
);
|
|
1225
|
+
}
|
|
1226
|
+
}
|
|
1202
1227
|
async function executeWithClaudeCode(skills, scenario, options) {
|
|
1203
1228
|
const skillNames = skills.map((s) => s.name).join(", ");
|
|
1204
1229
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
@@ -1222,29 +1247,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1222
1247
|
}
|
|
1223
1248
|
const startTime = /* @__PURE__ */ new Date();
|
|
1224
1249
|
const allMessages = [];
|
|
1225
|
-
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1226
|
-
const claudeDir = `${options.cwd}/.claude`;
|
|
1227
|
-
await mkdirAsync(claudeDir, { recursive: true });
|
|
1228
|
-
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1229
|
-
flag: "wx"
|
|
1230
|
-
}).catch(() => {
|
|
1231
|
-
});
|
|
1232
|
-
if (options.mcps && options.mcps.length > 0) {
|
|
1233
|
-
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
1234
|
-
}
|
|
1235
|
-
if (options.subAgents && options.subAgents.length > 0) {
|
|
1236
|
-
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
1237
|
-
}
|
|
1238
|
-
if (options.rules && options.rules.length > 0) {
|
|
1239
|
-
await writeRulesToFilesystem(options.cwd, options.rules);
|
|
1240
|
-
}
|
|
1241
|
-
try {
|
|
1242
|
-
await writeSkillsToFilesystem(options.cwd, skills);
|
|
1243
|
-
} catch (writeError) {
|
|
1244
|
-
throw new Error(
|
|
1245
|
-
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
1246
|
-
);
|
|
1247
|
-
}
|
|
1248
1250
|
const sdkEnv = buildSdkEnvironment(options);
|
|
1249
1251
|
let traceStepNumber = 0;
|
|
1250
1252
|
const traceContext = options.traceContext;
|
|
@@ -2086,6 +2088,17 @@ var ClaudeCodeAdapter = class {
|
|
|
2086
2088
|
id = "claude-code";
|
|
2087
2089
|
name = "Claude Code";
|
|
2088
2090
|
supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
|
|
2091
|
+
/**
|
|
2092
|
+
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
2093
|
+
* before the baseline snapshot is taken.
|
|
2094
|
+
*/
|
|
2095
|
+
async prepareEnvironment(context) {
|
|
2096
|
+
await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
|
|
2097
|
+
mcps: context.mcps,
|
|
2098
|
+
subAgents: context.subAgents,
|
|
2099
|
+
rules: context.rules
|
|
2100
|
+
});
|
|
2101
|
+
}
|
|
2089
2102
|
/**
|
|
2090
2103
|
* Execute a skill using the Claude Code SDK.
|
|
2091
2104
|
*
|
|
@@ -2736,37 +2749,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
|
|
|
2736
2749
|
return null;
|
|
2737
2750
|
}
|
|
2738
2751
|
}
|
|
2739
|
-
async function
|
|
2740
|
-
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2741
|
-
console.log("[executeWithOpenCode] Starting execution", {
|
|
2742
|
-
skillCount: skills.length,
|
|
2743
|
-
skillNames,
|
|
2744
|
-
scenarioId: scenario.id,
|
|
2745
|
-
scenarioName: scenario.name,
|
|
2746
|
-
cwd: options.cwd,
|
|
2747
|
-
aiGatewayUrl: options.aiGatewayUrl,
|
|
2748
|
-
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2749
|
-
model: options.model
|
|
2750
|
-
});
|
|
2751
|
-
const startTime = /* @__PURE__ */ new Date();
|
|
2752
|
+
async function prepareOpenCodeEnvironment(cwd, skills, options) {
|
|
2752
2753
|
if (options.mcps && options.mcps.length > 0) {
|
|
2753
2754
|
console.log(
|
|
2754
2755
|
`[MCP] ${options.mcps.length} MCP(s) will be configured inline`
|
|
2755
2756
|
);
|
|
2756
2757
|
}
|
|
2757
2758
|
if (options.subAgents && options.subAgents.length > 0) {
|
|
2758
|
-
await writeSubAgentsToFilesystem2(
|
|
2759
|
+
await writeSubAgentsToFilesystem2(cwd, options.subAgents);
|
|
2759
2760
|
}
|
|
2760
2761
|
if (options.rules && options.rules.length > 0) {
|
|
2761
|
-
await writeRulesToFilesystem(
|
|
2762
|
+
await writeRulesToFilesystem(cwd, options.rules);
|
|
2762
2763
|
}
|
|
2763
2764
|
try {
|
|
2764
|
-
await writeSkillsToFilesystem2(
|
|
2765
|
+
await writeSkillsToFilesystem2(cwd, skills);
|
|
2765
2766
|
} catch (writeError) {
|
|
2766
2767
|
throw new Error(
|
|
2767
2768
|
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
2768
2769
|
);
|
|
2769
2770
|
}
|
|
2771
|
+
}
|
|
2772
|
+
async function executeWithOpenCode(skills, scenario, options) {
|
|
2773
|
+
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2774
|
+
console.log("[executeWithOpenCode] Starting execution", {
|
|
2775
|
+
skillCount: skills.length,
|
|
2776
|
+
skillNames,
|
|
2777
|
+
scenarioId: scenario.id,
|
|
2778
|
+
scenarioName: scenario.name,
|
|
2779
|
+
cwd: options.cwd,
|
|
2780
|
+
aiGatewayUrl: options.aiGatewayUrl,
|
|
2781
|
+
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2782
|
+
model: options.model
|
|
2783
|
+
});
|
|
2784
|
+
const startTime = /* @__PURE__ */ new Date();
|
|
2770
2785
|
const maxTurns = options.maxTurns ?? 10;
|
|
2771
2786
|
const { config, providerID, modelID } = await buildOpenCodeConfig({
|
|
2772
2787
|
model: options.model,
|
|
@@ -3146,6 +3161,13 @@ var OpenCodeAdapter = class {
|
|
|
3146
3161
|
id = "opencode";
|
|
3147
3162
|
name = "OpenCode";
|
|
3148
3163
|
supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
|
|
3164
|
+
async prepareEnvironment(context) {
|
|
3165
|
+
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
3166
|
+
mcps: context.mcps,
|
|
3167
|
+
subAgents: context.subAgents,
|
|
3168
|
+
rules: context.rules
|
|
3169
|
+
});
|
|
3170
|
+
}
|
|
3149
3171
|
async execute(context) {
|
|
3150
3172
|
const {
|
|
3151
3173
|
skills,
|
|
@@ -4273,6 +4295,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
4273
4295
|
};
|
|
4274
4296
|
|
|
4275
4297
|
// src/run-scenario/file-diff.ts
|
|
4298
|
+
function deriveInfrastructurePaths(prePrep, postPrep) {
|
|
4299
|
+
const infraPaths = /* @__PURE__ */ new Set();
|
|
4300
|
+
for (const path2 of Object.keys(postPrep)) {
|
|
4301
|
+
if (prePrep[path2] === void 0 || prePrep[path2] !== postPrep[path2]) {
|
|
4302
|
+
infraPaths.add(path2);
|
|
4303
|
+
}
|
|
4304
|
+
}
|
|
4305
|
+
return infraPaths;
|
|
4306
|
+
}
|
|
4276
4307
|
var IGNORED_PATTERNS = [
|
|
4277
4308
|
"node_modules",
|
|
4278
4309
|
".git",
|
|
@@ -4376,7 +4407,7 @@ function generateDiffLines(before, after) {
|
|
|
4376
4407
|
}
|
|
4377
4408
|
return result;
|
|
4378
4409
|
}
|
|
4379
|
-
function diffSnapshots(before, after) {
|
|
4410
|
+
function diffSnapshots(before, after, infrastructurePaths) {
|
|
4380
4411
|
const diffs = [];
|
|
4381
4412
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4382
4413
|
for (const path2 of allPaths) {
|
|
@@ -4390,7 +4421,8 @@ function diffSnapshots(before, after) {
|
|
|
4390
4421
|
path: path2,
|
|
4391
4422
|
expected: beforeContent,
|
|
4392
4423
|
actual: afterContent,
|
|
4393
|
-
diffLines: diffLines2
|
|
4424
|
+
diffLines: diffLines2,
|
|
4425
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4394
4426
|
});
|
|
4395
4427
|
}
|
|
4396
4428
|
const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
|
|
@@ -4414,7 +4446,7 @@ function diffSnapshots(before, after) {
|
|
|
4414
4446
|
result.sort((a, b) => a.path.localeCompare(b.path));
|
|
4415
4447
|
return result;
|
|
4416
4448
|
}
|
|
4417
|
-
function extractTemplateFiles(before, after) {
|
|
4449
|
+
function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
4418
4450
|
const files = [];
|
|
4419
4451
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4420
4452
|
for (const path2 of allPaths) {
|
|
@@ -4434,7 +4466,8 @@ function extractTemplateFiles(before, after) {
|
|
|
4434
4466
|
files.push({
|
|
4435
4467
|
path: path2,
|
|
4436
4468
|
content: afterContent,
|
|
4437
|
-
status
|
|
4469
|
+
status,
|
|
4470
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4438
4471
|
});
|
|
4439
4472
|
}
|
|
4440
4473
|
files.sort((a, b) => a.path.localeCompare(b.path));
|
|
@@ -4450,7 +4483,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4450
4483
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
4451
4484
|
const adapter = getAdapter(identifier);
|
|
4452
4485
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4453
|
-
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4454
4486
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4455
4487
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4456
4488
|
const executionContext = {
|
|
@@ -4475,11 +4507,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4475
4507
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
4476
4508
|
systemPrompt: agent?.systemPrompt
|
|
4477
4509
|
};
|
|
4510
|
+
const hasPrepare = !!adapter.prepareEnvironment;
|
|
4511
|
+
const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
|
|
4512
|
+
if (hasPrepare) {
|
|
4513
|
+
await adapter.prepareEnvironment(executionContext);
|
|
4514
|
+
}
|
|
4515
|
+
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4516
|
+
const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
|
|
4478
4517
|
const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
|
|
4479
4518
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4480
4519
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4481
|
-
const fileDiffs = diffSnapshots(
|
|
4482
|
-
|
|
4520
|
+
const fileDiffs = diffSnapshots(
|
|
4521
|
+
beforeSnapshot,
|
|
4522
|
+
afterSnapshot,
|
|
4523
|
+
infrastructurePaths
|
|
4524
|
+
);
|
|
4525
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4483
4526
|
return {
|
|
4484
4527
|
id: (0, import_crypto4.randomUUID)(),
|
|
4485
4528
|
targetId,
|