@wix/evalforge-evaluator 0.114.0 → 0.116.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +98 -46
- package/build/index.js.map +3 -3
- package/build/index.mjs +98 -46
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +5 -0
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +6 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +8 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +1 -0
- package/build/types/run-scenario/file-diff.d.ts +10 -2
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -1199,6 +1199,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1199
1199
|
outputPreview: `Message type: ${message.type}`
|
|
1200
1200
|
};
|
|
1201
1201
|
}
|
|
1202
|
+
/**
 * Write the Claude Code infrastructure files into the working directory.
 *
 * Ensures `<cwd>/.claude/settings.json` exists (created as `{}` only when it
 * is missing), writes MCPs, sub-agents and rules when the matching option is a
 * non-empty array, and finally writes the skills.
 *
 * @param {string} cwd - Directory the files are written under.
 * @param {Array} skills - Skills handed to `writeSkillsToFilesystem`.
 * @param {object} [options] - Optional `mcps`, `subAgents` and `rules` arrays.
 * @returns {Promise<void>}
 * @throws {Error} "Failed to write skills to filesystem: …" when writing the
 *   skills fails; the underlying error is attached as `cause`.
 */
async function prepareClaudeCodeEnvironment(cwd, skills, options) {
  const { mkdir, writeFile } = await import("fs/promises");
  // Tolerate a missing options bag instead of throwing a TypeError.
  const { mcps, subAgents, rules } = options ?? {};

  const claudeDir = `${cwd}/.claude`;
  await mkdir(claudeDir, { recursive: true });
  try {
    // "wx" refuses to overwrite, so an existing settings file is left as-is.
    await writeFile(`${claudeDir}/settings.json`, "{}", { flag: "wx" });
  } catch (settingsError) {
    // EEXIST is the expected outcome when the file is already there. Any other
    // failure stays non-fatal (as before) but is no longer silent.
    if (settingsError?.code !== "EEXIST") {
      console.warn(
        `[prepareClaudeCodeEnvironment] Could not create ${claudeDir}/settings.json: ${settingsError instanceof Error ? settingsError.message : String(settingsError)}`
      );
    }
  }

  if (mcps && mcps.length > 0) {
    await writeMcpToFilesystem(cwd, mcps);
  }
  if (subAgents && subAgents.length > 0) {
    await writeSubAgentsToFilesystem(cwd, subAgents);
  }
  if (rules && rules.length > 0) {
    await writeRulesToFilesystem(cwd, rules);
  }

  try {
    await writeSkillsToFilesystem(cwd, skills);
  } catch (writeError) {
    // Keep the original error reachable for debugging via `cause`.
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
}
|
|
1202
1227
|
async function executeWithClaudeCode(skills, scenario, options) {
|
|
1203
1228
|
const skillNames = skills.map((s) => s.name).join(", ");
|
|
1204
1229
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
@@ -1222,29 +1247,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1222
1247
|
}
|
|
1223
1248
|
const startTime = /* @__PURE__ */ new Date();
|
|
1224
1249
|
const allMessages = [];
|
|
1225
|
-
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1226
|
-
const claudeDir = `${options.cwd}/.claude`;
|
|
1227
|
-
await mkdirAsync(claudeDir, { recursive: true });
|
|
1228
|
-
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1229
|
-
flag: "wx"
|
|
1230
|
-
}).catch(() => {
|
|
1231
|
-
});
|
|
1232
|
-
if (options.mcps && options.mcps.length > 0) {
|
|
1233
|
-
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
1234
|
-
}
|
|
1235
|
-
if (options.subAgents && options.subAgents.length > 0) {
|
|
1236
|
-
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
1237
|
-
}
|
|
1238
|
-
if (options.rules && options.rules.length > 0) {
|
|
1239
|
-
await writeRulesToFilesystem(options.cwd, options.rules);
|
|
1240
|
-
}
|
|
1241
|
-
try {
|
|
1242
|
-
await writeSkillsToFilesystem(options.cwd, skills);
|
|
1243
|
-
} catch (writeError) {
|
|
1244
|
-
throw new Error(
|
|
1245
|
-
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
1246
|
-
);
|
|
1247
|
-
}
|
|
1248
1250
|
const sdkEnv = buildSdkEnvironment(options);
|
|
1249
1251
|
let traceStepNumber = 0;
|
|
1250
1252
|
const traceContext = options.traceContext;
|
|
@@ -2086,6 +2088,17 @@ var ClaudeCodeAdapter = class {
|
|
|
2086
2088
|
id = "claude-code";
|
|
2087
2089
|
name = "Claude Code";
|
|
2088
2090
|
supportedCommands = [import_evalforge_types5.AgentRunCommand.CLAUDE];
|
|
2091
|
+
/**
|
|
2092
|
+
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
2093
|
+
* before the baseline snapshot is taken.
|
|
2094
|
+
*/
|
|
2095
|
+
async prepareEnvironment(context) {
|
|
2096
|
+
await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
|
|
2097
|
+
mcps: context.mcps,
|
|
2098
|
+
subAgents: context.subAgents,
|
|
2099
|
+
rules: context.rules
|
|
2100
|
+
});
|
|
2101
|
+
}
|
|
2089
2102
|
/**
|
|
2090
2103
|
* Execute a skill using the Claude Code SDK.
|
|
2091
2104
|
*
|
|
@@ -2149,6 +2162,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
2149
2162
|
var import_evalforge_types9 = require("@wix/evalforge-types");
|
|
2150
2163
|
|
|
2151
2164
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2165
|
+
var import_os3 = require("os");
|
|
2152
2166
|
var import_evalforge_types8 = require("@wix/evalforge-types");
|
|
2153
2167
|
|
|
2154
2168
|
// src/run-scenario/agents/opencode/write-skills.ts
|
|
@@ -2651,6 +2665,13 @@ function buildConversation2(messages) {
|
|
|
2651
2665
|
|
|
2652
2666
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2653
2667
|
var DEFAULT_MODEL3 = `anthropic/${import_evalforge_types8.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
2668
|
+
/**
 * Make sure `<home>/.opencode/bin` is listed in `process.env.PATH`,
 * prepending it when it is missing.
 *
 * PATH is compared entry by entry (ignoring trailing slashes) rather than by
 * substring, so a look-alike such as `<home>/.opencode/bin-old` does not count
 * as present. The platform's own PATH separator is used, and an empty PATH
 * does not gain a trailing separator (an empty PATH entry is treated as the
 * current directory on POSIX systems).
 *
 * @returns {void}
 */
function ensureOpenCodeInPath() {
  const opencodeBin = `${(0, import_os3.homedir)()}/.opencode/bin`;
  const delimiter = process.platform === "win32" ? ";" : ":";
  const currentPath = process.env.PATH || "";
  const withoutTrailingSlashes = (entry) => entry.replace(/[\\/]+$/, "");
  const alreadyListed = currentPath
    .split(delimiter)
    .some((entry) => withoutTrailingSlashes(entry) === opencodeBin);
  if (alreadyListed) {
    return;
  }
  process.env.PATH = currentPath === "" ? opencodeBin : `${opencodeBin}${delimiter}${currentPath}`;
}
|
|
2654
2675
|
function extractToolAction(toolName, args) {
|
|
2655
2676
|
if (!toolName) return "Using tool...";
|
|
2656
2677
|
const a = args;
|
|
@@ -2728,37 +2749,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
|
|
|
2728
2749
|
return null;
|
|
2729
2750
|
}
|
|
2730
2751
|
}
|
|
2731
|
-
async function
|
|
2732
|
-
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2733
|
-
console.log("[executeWithOpenCode] Starting execution", {
|
|
2734
|
-
skillCount: skills.length,
|
|
2735
|
-
skillNames,
|
|
2736
|
-
scenarioId: scenario.id,
|
|
2737
|
-
scenarioName: scenario.name,
|
|
2738
|
-
cwd: options.cwd,
|
|
2739
|
-
aiGatewayUrl: options.aiGatewayUrl,
|
|
2740
|
-
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2741
|
-
model: options.model
|
|
2742
|
-
});
|
|
2743
|
-
const startTime = /* @__PURE__ */ new Date();
|
|
2752
|
+
/**
 * Write the OpenCode infrastructure files into the working directory.
 *
 * MCPs are only logged here (the log message states they are configured
 * inline); sub-agents and rules are written when the matching option is a
 * non-empty array, and the skills are always written last.
 *
 * @param {string} cwd - Directory the files are written under.
 * @param {Array} skills - Skills handed to `writeSkillsToFilesystem2`.
 * @param {object} [options] - Optional `mcps`, `subAgents` and `rules` arrays.
 * @returns {Promise<void>}
 * @throws {Error} "Failed to write skills to filesystem: …" when writing the
 *   skills fails; the underlying error is attached as `cause`.
 */
async function prepareOpenCodeEnvironment(cwd, skills, options) {
  // Tolerate a missing options bag instead of throwing a TypeError.
  const { mcps, subAgents, rules } = options ?? {};

  if (mcps && mcps.length > 0) {
    console.log(
      `[MCP] ${mcps.length} MCP(s) will be configured inline`
    );
  }
  if (subAgents && subAgents.length > 0) {
    await writeSubAgentsToFilesystem2(cwd, subAgents);
  }
  if (rules && rules.length > 0) {
    await writeRulesToFilesystem(cwd, rules);
  }

  try {
    await writeSkillsToFilesystem2(cwd, skills);
  } catch (writeError) {
    // Keep the original error reachable for debugging via `cause`.
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
}
|
|
2772
|
+
async function executeWithOpenCode(skills, scenario, options) {
|
|
2773
|
+
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2774
|
+
console.log("[executeWithOpenCode] Starting execution", {
|
|
2775
|
+
skillCount: skills.length,
|
|
2776
|
+
skillNames,
|
|
2777
|
+
scenarioId: scenario.id,
|
|
2778
|
+
scenarioName: scenario.name,
|
|
2779
|
+
cwd: options.cwd,
|
|
2780
|
+
aiGatewayUrl: options.aiGatewayUrl,
|
|
2781
|
+
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2782
|
+
model: options.model
|
|
2783
|
+
});
|
|
2784
|
+
const startTime = /* @__PURE__ */ new Date();
|
|
2762
2785
|
const maxTurns = options.maxTurns ?? 10;
|
|
2763
2786
|
const { config, providerID, modelID } = await buildOpenCodeConfig({
|
|
2764
2787
|
model: options.model,
|
|
@@ -2806,6 +2829,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
2806
2829
|
}
|
|
2807
2830
|
let server;
|
|
2808
2831
|
try {
|
|
2832
|
+
ensureOpenCodeInPath();
|
|
2809
2833
|
console.log("[SDK-DEBUG] Starting OpenCode server...");
|
|
2810
2834
|
server = await createOpencodeServer({
|
|
2811
2835
|
config,
|
|
@@ -3137,6 +3161,13 @@ var OpenCodeAdapter = class {
|
|
|
3137
3161
|
id = "opencode";
|
|
3138
3162
|
name = "OpenCode";
|
|
3139
3163
|
supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
|
|
3164
|
+
async prepareEnvironment(context) {
|
|
3165
|
+
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
3166
|
+
mcps: context.mcps,
|
|
3167
|
+
subAgents: context.subAgents,
|
|
3168
|
+
rules: context.rules
|
|
3169
|
+
});
|
|
3170
|
+
}
|
|
3140
3171
|
async execute(context) {
|
|
3141
3172
|
const {
|
|
3142
3173
|
skills,
|
|
@@ -4264,6 +4295,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
4264
4295
|
};
|
|
4265
4296
|
|
|
4266
4297
|
// src/run-scenario/file-diff.ts
|
|
4298
|
+
/**
 * Collect the paths that the preparation step created or modified.
 *
 * A path from `postPrep` is included when it has no value in `prePrep` or
 * when its value differs between the two snapshots. Paths present only in
 * `prePrep` are not reported.
 *
 * @param {Object<string, string>} prePrep - Snapshot (path → content) taken before preparation.
 * @param {Object<string, string>} postPrep - Snapshot (path → content) taken after preparation.
 * @returns {Set<string>} Paths added or changed by preparation.
 */
function deriveInfrastructurePaths(prePrep, postPrep) {
  const touchedByPrep = Object.keys(postPrep).filter((candidate) => {
    const previous = prePrep[candidate];
    return previous === void 0 || previous !== postPrep[candidate];
  });
  return new Set(touchedByPrep);
}
|
|
4267
4307
|
var IGNORED_PATTERNS = [
|
|
4268
4308
|
"node_modules",
|
|
4269
4309
|
".git",
|
|
@@ -4367,7 +4407,7 @@ function generateDiffLines(before, after) {
|
|
|
4367
4407
|
}
|
|
4368
4408
|
return result;
|
|
4369
4409
|
}
|
|
4370
|
-
function diffSnapshots(before, after) {
|
|
4410
|
+
function diffSnapshots(before, after, infrastructurePaths) {
|
|
4371
4411
|
const diffs = [];
|
|
4372
4412
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4373
4413
|
for (const path2 of allPaths) {
|
|
@@ -4381,7 +4421,8 @@ function diffSnapshots(before, after) {
|
|
|
4381
4421
|
path: path2,
|
|
4382
4422
|
expected: beforeContent,
|
|
4383
4423
|
actual: afterContent,
|
|
4384
|
-
diffLines: diffLines2
|
|
4424
|
+
diffLines: diffLines2,
|
|
4425
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4385
4426
|
});
|
|
4386
4427
|
}
|
|
4387
4428
|
const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
|
|
@@ -4405,7 +4446,7 @@ function diffSnapshots(before, after) {
|
|
|
4405
4446
|
result.sort((a, b) => a.path.localeCompare(b.path));
|
|
4406
4447
|
return result;
|
|
4407
4448
|
}
|
|
4408
|
-
function extractTemplateFiles(before, after) {
|
|
4449
|
+
function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
4409
4450
|
const files = [];
|
|
4410
4451
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4411
4452
|
for (const path2 of allPaths) {
|
|
@@ -4425,7 +4466,8 @@ function extractTemplateFiles(before, after) {
|
|
|
4425
4466
|
files.push({
|
|
4426
4467
|
path: path2,
|
|
4427
4468
|
content: afterContent,
|
|
4428
|
-
status
|
|
4469
|
+
status,
|
|
4470
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4429
4471
|
});
|
|
4430
4472
|
}
|
|
4431
4473
|
files.sort((a, b) => a.path.localeCompare(b.path));
|
|
@@ -4441,7 +4483,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4441
4483
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
4442
4484
|
const adapter = getAdapter(identifier);
|
|
4443
4485
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4444
|
-
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4445
4486
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4446
4487
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4447
4488
|
const executionContext = {
|
|
@@ -4466,11 +4507,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4466
4507
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
4467
4508
|
systemPrompt: agent?.systemPrompt
|
|
4468
4509
|
};
|
|
4510
|
+
const hasPrepare = !!adapter.prepareEnvironment;
|
|
4511
|
+
const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
|
|
4512
|
+
if (hasPrepare) {
|
|
4513
|
+
await adapter.prepareEnvironment(executionContext);
|
|
4514
|
+
}
|
|
4515
|
+
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4516
|
+
const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
|
|
4469
4517
|
const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
|
|
4470
4518
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4471
4519
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4472
|
-
const fileDiffs = diffSnapshots(
|
|
4473
|
-
|
|
4520
|
+
const fileDiffs = diffSnapshots(
|
|
4521
|
+
beforeSnapshot,
|
|
4522
|
+
afterSnapshot,
|
|
4523
|
+
infrastructurePaths
|
|
4524
|
+
);
|
|
4525
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4474
4526
|
return {
|
|
4475
4527
|
id: (0, import_crypto4.randomUUID)(),
|
|
4476
4528
|
targetId,
|