@wix/evalforge-evaluator 0.114.0 → 0.116.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +98 -46
- package/build/index.js.map +3 -3
- package/build/index.mjs +98 -46
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/claude-code-adapter.d.ts +5 -0
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +6 -0
- package/build/types/run-scenario/agents/opencode/execute.d.ts +8 -0
- package/build/types/run-scenario/agents/opencode/opencode-adapter.d.ts +1 -0
- package/build/types/run-scenario/file-diff.d.ts +10 -2
- package/package.json +5 -5
package/build/index.mjs
CHANGED
|
@@ -1191,6 +1191,31 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1191
1191
|
outputPreview: `Message type: ${message.type}`
|
|
1192
1192
|
};
|
|
1193
1193
|
}
|
|
1194
|
+
/**
 * Writes the Claude Code infrastructure files into the working directory.
 *
 * Creates `<cwd>/.claude`, seeds an empty `settings.json` when none exists,
 * then writes MCPs, sub-agents and rules (each only when a non-empty list is
 * supplied) and finally the skills.
 *
 * @param {string} cwd - Working directory the files are written under.
 * @param {Array} skills - Skills forwarded to `writeSkillsToFilesystem`.
 * @param {{ mcps?: Array, subAgents?: Array, rules?: Array }} options -
 *   Optional infrastructure lists; empty or missing lists are skipped.
 * @returns {Promise<void>}
 * @throws {Error} "Failed to write skills to filesystem: …" when the skills
 *   cannot be written; the underlying error is attached as `cause`.
 */
async function prepareClaudeCodeEnvironment(cwd, skills, options) {
  const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
  const claudeDir = `${cwd}/.claude`;
  await mkdirAsync(claudeDir, { recursive: true });
  try {
    // "wx" fails when the file already exists, so an existing settings.json
    // is never overwritten.
    await writeFile6(`${claudeDir}/settings.json`, "{}", {
      flag: "wx"
    });
  } catch (settingsError) {
    // EEXIST is the expected "already there" case. Anything else (permissions,
    // full disk, ...) is still treated as best-effort, but is surfaced instead
    // of being silently discarded.
    if (settingsError?.code !== "EEXIST") {
      console.warn(
        `[prepareClaudeCodeEnvironment] Could not create ${claudeDir}/settings.json: ${settingsError instanceof Error ? settingsError.message : String(settingsError)}`
      );
    }
  }
  if (options.mcps && options.mcps.length > 0) {
    await writeMcpToFilesystem(cwd, options.mcps);
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem(cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem(cwd, skills);
  } catch (writeError) {
    // Keep the original error reachable through `cause` so its stack is not lost.
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
}
|
|
1194
1219
|
async function executeWithClaudeCode(skills, scenario, options) {
|
|
1195
1220
|
const skillNames = skills.map((s) => s.name).join(", ");
|
|
1196
1221
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
@@ -1214,29 +1239,6 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1214
1239
|
}
|
|
1215
1240
|
const startTime = /* @__PURE__ */ new Date();
|
|
1216
1241
|
const allMessages = [];
|
|
1217
|
-
const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
|
|
1218
|
-
const claudeDir = `${options.cwd}/.claude`;
|
|
1219
|
-
await mkdirAsync(claudeDir, { recursive: true });
|
|
1220
|
-
await writeFile6(`${claudeDir}/settings.json`, "{}", {
|
|
1221
|
-
flag: "wx"
|
|
1222
|
-
}).catch(() => {
|
|
1223
|
-
});
|
|
1224
|
-
if (options.mcps && options.mcps.length > 0) {
|
|
1225
|
-
await writeMcpToFilesystem(options.cwd, options.mcps);
|
|
1226
|
-
}
|
|
1227
|
-
if (options.subAgents && options.subAgents.length > 0) {
|
|
1228
|
-
await writeSubAgentsToFilesystem(options.cwd, options.subAgents);
|
|
1229
|
-
}
|
|
1230
|
-
if (options.rules && options.rules.length > 0) {
|
|
1231
|
-
await writeRulesToFilesystem(options.cwd, options.rules);
|
|
1232
|
-
}
|
|
1233
|
-
try {
|
|
1234
|
-
await writeSkillsToFilesystem(options.cwd, skills);
|
|
1235
|
-
} catch (writeError) {
|
|
1236
|
-
throw new Error(
|
|
1237
|
-
`Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
|
|
1238
|
-
);
|
|
1239
|
-
}
|
|
1240
1242
|
const sdkEnv = buildSdkEnvironment(options);
|
|
1241
1243
|
let traceStepNumber = 0;
|
|
1242
1244
|
const traceContext = options.traceContext;
|
|
@@ -2078,6 +2080,17 @@ var ClaudeCodeAdapter = class {
|
|
|
2078
2080
|
id = "claude-code";
|
|
2079
2081
|
name = "Claude Code";
|
|
2080
2082
|
supportedCommands = [AgentRunCommand.CLAUDE];
|
|
2083
|
+
/**
|
|
2084
|
+
* Write infrastructure files (settings, MCPs, sub-agents, rules, skills)
|
|
2085
|
+
* before the baseline snapshot is taken.
|
|
2086
|
+
*/
|
|
2087
|
+
async prepareEnvironment(context) {
|
|
2088
|
+
await prepareClaudeCodeEnvironment(context.cwd, context.skills, {
|
|
2089
|
+
mcps: context.mcps,
|
|
2090
|
+
subAgents: context.subAgents,
|
|
2091
|
+
rules: context.rules
|
|
2092
|
+
});
|
|
2093
|
+
}
|
|
2081
2094
|
/**
|
|
2082
2095
|
* Execute a skill using the Claude Code SDK.
|
|
2083
2096
|
*
|
|
@@ -2141,6 +2154,7 @@ defaultRegistry.register(claudeCodeAdapter);
|
|
|
2141
2154
|
import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
|
|
2142
2155
|
|
|
2143
2156
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2157
|
+
import { homedir as homedir2 } from "os";
|
|
2144
2158
|
import {
|
|
2145
2159
|
ClaudeModel as ClaudeModel3,
|
|
2146
2160
|
DEFAULT_EVALUATOR_SYSTEM_PROMPT as DEFAULT_EVALUATOR_SYSTEM_PROMPT2,
|
|
@@ -2652,6 +2666,13 @@ function buildConversation2(messages) {
|
|
|
2652
2666
|
|
|
2653
2667
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2654
2668
|
var DEFAULT_MODEL3 = `anthropic/${ClaudeModel3.CLAUDE_4_5_SONNET_1_0}`;
|
|
2669
|
+
/**
 * Ensures `<home>/.opencode/bin` is an entry of `process.env.PATH`,
 * prepending it when it is missing. Calling it repeatedly is a no-op once
 * the entry is present.
 *
 * The check compares whole PATH entries rather than substrings, so a
 * look-alike entry such as `<home>/.opencode/bin-old` does not count as the
 * directory already being present.
 */
function ensureOpenCodeInPath() {
  const opencodeBin = `${homedir2()}/.opencode/bin`;
  // PATH entries are separated by ";" on Windows and ":" everywhere else.
  const delimiter = process.platform === "win32" ? ";" : ":";
  const currentPath = process.env.PATH || "";
  if (currentPath.split(delimiter).includes(opencodeBin)) {
    return;
  }
  // Avoid a trailing delimiter when PATH is empty: an empty PATH entry is
  // interpreted as the current directory on POSIX systems.
  process.env.PATH = currentPath === "" ? opencodeBin : `${opencodeBin}${delimiter}${currentPath}`;
}
|
|
2655
2676
|
function extractToolAction(toolName, args) {
|
|
2656
2677
|
if (!toolName) return "Using tool...";
|
|
2657
2678
|
const a = args;
|
|
@@ -2729,37 +2750,39 @@ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
|
|
|
2729
2750
|
return null;
|
|
2730
2751
|
}
|
|
2731
2752
|
}
|
|
2732
|
-
async function
|
|
2733
|
-
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2734
|
-
console.log("[executeWithOpenCode] Starting execution", {
|
|
2735
|
-
skillCount: skills.length,
|
|
2736
|
-
skillNames,
|
|
2737
|
-
scenarioId: scenario.id,
|
|
2738
|
-
scenarioName: scenario.name,
|
|
2739
|
-
cwd: options.cwd,
|
|
2740
|
-
aiGatewayUrl: options.aiGatewayUrl,
|
|
2741
|
-
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2742
|
-
model: options.model
|
|
2743
|
-
});
|
|
2744
|
-
const startTime = /* @__PURE__ */ new Date();
|
|
2753
|
+
/**
 * Writes the OpenCode infrastructure files into the working directory.
 *
 * MCPs are only logged here (the log message states they are configured
 * inline); sub-agents and rules are written when a non-empty list is
 * supplied, and the skills are always written last.
 *
 * @param {string} cwd - Working directory the files are written under.
 * @param {Array} skills - Skills forwarded to `writeSkillsToFilesystem2`.
 * @param {{ mcps?: Array, subAgents?: Array, rules?: Array }} options -
 *   Optional infrastructure lists; empty or missing lists are skipped.
 * @returns {Promise<void>}
 * @throws {Error} "Failed to write skills to filesystem: …" when the skills
 *   cannot be written; the underlying error is attached as `cause`.
 */
async function prepareOpenCodeEnvironment(cwd, skills, options) {
  if (options.mcps && options.mcps.length > 0) {
    console.log(
      `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
    );
  }
  if (options.subAgents && options.subAgents.length > 0) {
    await writeSubAgentsToFilesystem2(cwd, options.subAgents);
  }
  if (options.rules && options.rules.length > 0) {
    await writeRulesToFilesystem(cwd, options.rules);
  }
  try {
    await writeSkillsToFilesystem2(cwd, skills);
  } catch (writeError) {
    // Keep the original error reachable through `cause` so its stack is not lost.
    throw new Error(
      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`,
      { cause: writeError }
    );
  }
}
|
|
2773
|
+
async function executeWithOpenCode(skills, scenario, options) {
|
|
2774
|
+
const skillNames = skills.map((s) => s.name).join(", ");
|
|
2775
|
+
console.log("[executeWithOpenCode] Starting execution", {
|
|
2776
|
+
skillCount: skills.length,
|
|
2777
|
+
skillNames,
|
|
2778
|
+
scenarioId: scenario.id,
|
|
2779
|
+
scenarioName: scenario.name,
|
|
2780
|
+
cwd: options.cwd,
|
|
2781
|
+
aiGatewayUrl: options.aiGatewayUrl,
|
|
2782
|
+
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
2783
|
+
model: options.model
|
|
2784
|
+
});
|
|
2785
|
+
const startTime = /* @__PURE__ */ new Date();
|
|
2763
2786
|
const maxTurns = options.maxTurns ?? 10;
|
|
2764
2787
|
const { config, providerID, modelID } = await buildOpenCodeConfig({
|
|
2765
2788
|
model: options.model,
|
|
@@ -2807,6 +2830,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
2807
2830
|
}
|
|
2808
2831
|
let server;
|
|
2809
2832
|
try {
|
|
2833
|
+
ensureOpenCodeInPath();
|
|
2810
2834
|
console.log("[SDK-DEBUG] Starting OpenCode server...");
|
|
2811
2835
|
server = await createOpencodeServer({
|
|
2812
2836
|
config,
|
|
@@ -3138,6 +3162,13 @@ var OpenCodeAdapter = class {
|
|
|
3138
3162
|
id = "opencode";
|
|
3139
3163
|
name = "OpenCode";
|
|
3140
3164
|
supportedCommands = [AgentRunCommand2.OPENCODE];
|
|
3165
|
+
async prepareEnvironment(context) {
|
|
3166
|
+
await prepareOpenCodeEnvironment(context.cwd, context.skills, {
|
|
3167
|
+
mcps: context.mcps,
|
|
3168
|
+
subAgents: context.subAgents,
|
|
3169
|
+
rules: context.rules
|
|
3170
|
+
});
|
|
3171
|
+
}
|
|
3141
3172
|
async execute(context) {
|
|
3142
3173
|
const {
|
|
3143
3174
|
skills,
|
|
@@ -4273,6 +4304,15 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
|
|
|
4273
4304
|
};
|
|
4274
4305
|
|
|
4275
4306
|
// src/run-scenario/file-diff.ts
|
|
4307
|
+
/**
 * Collects the paths that the preparation step created or modified.
 *
 * A path from `postPrep` is included when it has no entry in `prePrep` or
 * when its content differs between the two snapshots. Paths that exist only
 * in `prePrep` are not reported.
 *
 * @param {Object<string, string>} prePrep - Snapshot (path -> content) taken before preparation.
 * @param {Object<string, string>} postPrep - Snapshot (path -> content) taken after preparation.
 * @returns {Set<string>} Paths added or changed, in `postPrep` key order.
 */
function deriveInfrastructurePaths(prePrep, postPrep) {
  const touchedPaths = Object.keys(postPrep).filter((filePath) => {
    const previousContent = prePrep[filePath];
    return previousContent === void 0 || previousContent !== postPrep[filePath];
  });
  return new Set(touchedPaths);
}
|
|
4276
4316
|
var IGNORED_PATTERNS = [
|
|
4277
4317
|
"node_modules",
|
|
4278
4318
|
".git",
|
|
@@ -4376,7 +4416,7 @@ function generateDiffLines(before, after) {
|
|
|
4376
4416
|
}
|
|
4377
4417
|
return result;
|
|
4378
4418
|
}
|
|
4379
|
-
function diffSnapshots(before, after) {
|
|
4419
|
+
function diffSnapshots(before, after, infrastructurePaths) {
|
|
4380
4420
|
const diffs = [];
|
|
4381
4421
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4382
4422
|
for (const path2 of allPaths) {
|
|
@@ -4390,7 +4430,8 @@ function diffSnapshots(before, after) {
|
|
|
4390
4430
|
path: path2,
|
|
4391
4431
|
expected: beforeContent,
|
|
4392
4432
|
actual: afterContent,
|
|
4393
|
-
diffLines: diffLines2
|
|
4433
|
+
diffLines: diffLines2,
|
|
4434
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4394
4435
|
});
|
|
4395
4436
|
}
|
|
4396
4437
|
const deletedPaths = [...allPaths].filter((p) => after[p] === void 0);
|
|
@@ -4414,7 +4455,7 @@ function diffSnapshots(before, after) {
|
|
|
4414
4455
|
result.sort((a, b) => a.path.localeCompare(b.path));
|
|
4415
4456
|
return result;
|
|
4416
4457
|
}
|
|
4417
|
-
function extractTemplateFiles(before, after) {
|
|
4458
|
+
function extractTemplateFiles(before, after, infrastructurePaths) {
|
|
4418
4459
|
const files = [];
|
|
4419
4460
|
const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
|
|
4420
4461
|
for (const path2 of allPaths) {
|
|
@@ -4434,7 +4475,8 @@ function extractTemplateFiles(before, after) {
|
|
|
4434
4475
|
files.push({
|
|
4435
4476
|
path: path2,
|
|
4436
4477
|
content: afterContent,
|
|
4437
|
-
status
|
|
4478
|
+
status,
|
|
4479
|
+
...infrastructurePaths?.has(path2) && { isInfrastructure: true }
|
|
4438
4480
|
});
|
|
4439
4481
|
}
|
|
4440
4482
|
files.sort((a, b) => a.path.localeCompare(b.path));
|
|
@@ -4450,7 +4492,6 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4450
4492
|
const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
|
|
4451
4493
|
const adapter = getAdapter(identifier);
|
|
4452
4494
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4453
|
-
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4454
4495
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4455
4496
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4456
4497
|
const executionContext = {
|
|
@@ -4475,11 +4516,22 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4475
4516
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
4476
4517
|
systemPrompt: agent?.systemPrompt
|
|
4477
4518
|
};
|
|
4519
|
+
const hasPrepare = !!adapter.prepareEnvironment;
|
|
4520
|
+
const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
|
|
4521
|
+
if (hasPrepare) {
|
|
4522
|
+
await adapter.prepareEnvironment(executionContext);
|
|
4523
|
+
}
|
|
4524
|
+
const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4525
|
+
const infrastructurePaths = hasPrepare ? deriveInfrastructurePaths(prePrepSnapshot, beforeSnapshot) : /* @__PURE__ */ new Set();
|
|
4478
4526
|
const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
|
|
4479
4527
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4480
4528
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
4481
|
-
const fileDiffs = diffSnapshots(
|
|
4482
|
-
|
|
4529
|
+
const fileDiffs = diffSnapshots(
|
|
4530
|
+
beforeSnapshot,
|
|
4531
|
+
afterSnapshot,
|
|
4532
|
+
infrastructurePaths
|
|
4533
|
+
);
|
|
4534
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4483
4535
|
return {
|
|
4484
4536
|
id: randomUUID4(),
|
|
4485
4537
|
targetId,
|