@wix/evalforge-evaluator 0.59.0 → 0.61.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +126 -42
- package/build/index.js.map +4 -4
- package/build/index.mjs +126 -42
- package/build/index.mjs.map +4 -4
- package/build/types/api-client.d.ts +4 -2
- package/build/types/fetch-evaluation-data.d.ts +2 -2
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +2 -2
- package/build/types/run-scenario/agents/claude-code/write-mcp.d.ts +4 -3
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +21 -0
- package/package.json +6 -5
package/build/index.js
CHANGED
|
@@ -160,6 +160,16 @@ function createApiClient(serverUrl, options = "") {
|
|
|
160
160
|
getSkill(projectId2, id) {
|
|
161
161
|
return fetchJson(`/projects/${projectId2}/skills/${id}`);
|
|
162
162
|
},
|
|
163
|
+
getSkillVersion(projectId2, skillId, versionId) {
|
|
164
|
+
return fetchJson(
|
|
165
|
+
`/projects/${projectId2}/skills/${skillId}/versions/${versionId}`
|
|
166
|
+
);
|
|
167
|
+
},
|
|
168
|
+
getLatestSkillVersion(projectId2, skillId) {
|
|
169
|
+
return fetchJson(
|
|
170
|
+
`/projects/${projectId2}/skills/${skillId}/versions/latest`
|
|
171
|
+
);
|
|
172
|
+
},
|
|
163
173
|
getAgent(projectId2, id) {
|
|
164
174
|
return fetchJson(`/projects/${projectId2}/agents/${id}`);
|
|
165
175
|
},
|
|
@@ -294,6 +304,29 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
294
304
|
skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
|
|
295
305
|
);
|
|
296
306
|
}
|
|
307
|
+
if (evalRun.skillVersions && Object.keys(evalRun.skillVersions).length > 0) {
|
|
308
|
+
skills = await Promise.all(
|
|
309
|
+
skills.map(async (skill) => {
|
|
310
|
+
const versionId = evalRun.skillVersions?.[skill.id];
|
|
311
|
+
if (versionId) {
|
|
312
|
+
const version = await api.getSkillVersion(
|
|
313
|
+
projectId2,
|
|
314
|
+
skill.id,
|
|
315
|
+
versionId
|
|
316
|
+
);
|
|
317
|
+
return { ...skill, latestVersion: version };
|
|
318
|
+
}
|
|
319
|
+
return skill;
|
|
320
|
+
})
|
|
321
|
+
);
|
|
322
|
+
}
|
|
323
|
+
skills = skills.map((skill) => {
|
|
324
|
+
const hasPinnedVersion = evalRun.skillVersions?.[skill.id];
|
|
325
|
+
if (!hasPinnedVersion && skill.source) {
|
|
326
|
+
return { ...skill, latestVersion: void 0 };
|
|
327
|
+
}
|
|
328
|
+
return skill;
|
|
329
|
+
});
|
|
297
330
|
}
|
|
298
331
|
let mcps = [];
|
|
299
332
|
if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
|
|
@@ -1106,10 +1139,10 @@ var Minipass = class extends import_node_events.EventEmitter {
|
|
|
1106
1139
|
* Return a void Promise that resolves once the stream ends.
|
|
1107
1140
|
*/
|
|
1108
1141
|
async promise() {
|
|
1109
|
-
return new Promise((
|
|
1142
|
+
return new Promise((resolve2, reject) => {
|
|
1110
1143
|
this.on(DESTROYED, () => reject(new Error("stream destroyed")));
|
|
1111
1144
|
this.on("error", (er) => reject(er));
|
|
1112
|
-
this.on("end", () =>
|
|
1145
|
+
this.on("end", () => resolve2());
|
|
1113
1146
|
});
|
|
1114
1147
|
}
|
|
1115
1148
|
/**
|
|
@@ -1133,7 +1166,7 @@ var Minipass = class extends import_node_events.EventEmitter {
|
|
|
1133
1166
|
return Promise.resolve({ done: false, value: res });
|
|
1134
1167
|
if (this[EOF])
|
|
1135
1168
|
return stop();
|
|
1136
|
-
let
|
|
1169
|
+
let resolve2;
|
|
1137
1170
|
let reject;
|
|
1138
1171
|
const onerr = (er) => {
|
|
1139
1172
|
this.off("data", ondata);
|
|
@@ -1147,19 +1180,19 @@ var Minipass = class extends import_node_events.EventEmitter {
|
|
|
1147
1180
|
this.off("end", onend);
|
|
1148
1181
|
this.off(DESTROYED, ondestroy);
|
|
1149
1182
|
this.pause();
|
|
1150
|
-
|
|
1183
|
+
resolve2({ value, done: !!this[EOF] });
|
|
1151
1184
|
};
|
|
1152
1185
|
const onend = () => {
|
|
1153
1186
|
this.off("error", onerr);
|
|
1154
1187
|
this.off("data", ondata);
|
|
1155
1188
|
this.off(DESTROYED, ondestroy);
|
|
1156
1189
|
stop();
|
|
1157
|
-
|
|
1190
|
+
resolve2({ done: true, value: void 0 });
|
|
1158
1191
|
};
|
|
1159
1192
|
const ondestroy = () => onerr(new Error("stream destroyed"));
|
|
1160
1193
|
return new Promise((res2, rej) => {
|
|
1161
1194
|
reject = rej;
|
|
1162
|
-
|
|
1195
|
+
resolve2 = res2;
|
|
1163
1196
|
this.once(DESTROYED, ondestroy);
|
|
1164
1197
|
this.once("error", onerr);
|
|
1165
1198
|
this.once("end", onend);
|
|
@@ -3287,9 +3320,9 @@ var listFile = (opt, _files) => {
|
|
|
3287
3320
|
const parse4 = new Parser(opt);
|
|
3288
3321
|
const readSize = opt.maxReadSize || 16 * 1024 * 1024;
|
|
3289
3322
|
const file = opt.file;
|
|
3290
|
-
const p = new Promise((
|
|
3323
|
+
const p = new Promise((resolve2, reject) => {
|
|
3291
3324
|
parse4.on("error", reject);
|
|
3292
|
-
parse4.on("end",
|
|
3325
|
+
parse4.on("end", resolve2);
|
|
3293
3326
|
import_node_fs.default.stat(file, (er, stat) => {
|
|
3294
3327
|
if (er) {
|
|
3295
3328
|
reject(er);
|
|
@@ -5930,9 +5963,9 @@ var extractFile = (opt, _) => {
|
|
|
5930
5963
|
const u = new Unpack(opt);
|
|
5931
5964
|
const readSize = opt.maxReadSize || 16 * 1024 * 1024;
|
|
5932
5965
|
const file = opt.file;
|
|
5933
|
-
const p = new Promise((
|
|
5966
|
+
const p = new Promise((resolve2, reject) => {
|
|
5934
5967
|
u.on("error", reject);
|
|
5935
|
-
u.on("close",
|
|
5968
|
+
u.on("close", resolve2);
|
|
5936
5969
|
import_node_fs5.default.stat(file, (er, stat) => {
|
|
5937
5970
|
if (er) {
|
|
5938
5971
|
reject(er);
|
|
@@ -6066,7 +6099,7 @@ var replaceAsync = (opt, files) => {
|
|
|
6066
6099
|
};
|
|
6067
6100
|
import_node_fs6.default.read(fd, headBuf, 0, 512, position, onread);
|
|
6068
6101
|
};
|
|
6069
|
-
const promise = new Promise((
|
|
6102
|
+
const promise = new Promise((resolve2, reject) => {
|
|
6070
6103
|
p.on("error", reject);
|
|
6071
6104
|
let flag = "r+";
|
|
6072
6105
|
const onopen = (er, fd) => {
|
|
@@ -6091,7 +6124,7 @@ var replaceAsync = (opt, files) => {
|
|
|
6091
6124
|
});
|
|
6092
6125
|
p.pipe(stream);
|
|
6093
6126
|
stream.on("error", reject);
|
|
6094
|
-
stream.on("close",
|
|
6127
|
+
stream.on("close", resolve2);
|
|
6095
6128
|
addFilesAsync2(p, files);
|
|
6096
6129
|
});
|
|
6097
6130
|
});
|
|
@@ -6357,33 +6390,96 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
|
6357
6390
|
|
|
6358
6391
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
6359
6392
|
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
6360
|
-
var import_crypto = require("crypto");
|
|
6361
|
-
var import_promises5 = require("fs/promises");
|
|
6362
|
-
var import_path7 = require("path");
|
|
6363
6393
|
|
|
6364
|
-
// src/run-scenario/agents/claude-code/write-
|
|
6394
|
+
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
6365
6395
|
var import_promises3 = require("fs/promises");
|
|
6366
6396
|
var import_path5 = require("path");
|
|
6397
|
+
var import_evalforge_github_client = require("@wix/evalforge-github-client");
|
|
6398
|
+
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client.fetchSkillFolderRaw) {
|
|
6399
|
+
await Promise.all(
|
|
6400
|
+
skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
|
|
6401
|
+
);
|
|
6402
|
+
}
|
|
6403
|
+
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client.fetchSkillFolderRaw) {
|
|
6404
|
+
const skillName = skill.name;
|
|
6405
|
+
const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
|
|
6406
|
+
await (0, import_promises3.mkdir)(skillDir, { recursive: true });
|
|
6407
|
+
const version = skill.latestVersion;
|
|
6408
|
+
if (version?.files && version.files.length > 0) {
|
|
6409
|
+
await writeSkillFiles(skillDir, version.files);
|
|
6410
|
+
console.log(
|
|
6411
|
+
`[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
|
|
6412
|
+
);
|
|
6413
|
+
} else if (skill.source) {
|
|
6414
|
+
try {
|
|
6415
|
+
const files = await fetchFn(skill.source, {
|
|
6416
|
+
userAgent: "EvalForge-Evaluator"
|
|
6417
|
+
});
|
|
6418
|
+
await writeSkillFiles(skillDir, files);
|
|
6419
|
+
console.log(
|
|
6420
|
+
`[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
|
|
6421
|
+
);
|
|
6422
|
+
} catch (error) {
|
|
6423
|
+
const message = error instanceof Error ? error.message : "Unknown error";
|
|
6424
|
+
console.error(
|
|
6425
|
+
`[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
|
|
6426
|
+
);
|
|
6427
|
+
throw new Error(
|
|
6428
|
+
`Failed to write skill ${skillName} to filesystem: ${message}`
|
|
6429
|
+
);
|
|
6430
|
+
}
|
|
6431
|
+
} else {
|
|
6432
|
+
throw new Error(`Skill ${skillName} has no files and no source configured`);
|
|
6433
|
+
}
|
|
6434
|
+
}
|
|
6435
|
+
async function writeSkillFiles(skillDir, files) {
|
|
6436
|
+
const resolvedBase = (0, import_path5.resolve)(skillDir);
|
|
6437
|
+
for (const file of files) {
|
|
6438
|
+
const filePath = (0, import_path5.resolve)(skillDir, file.path);
|
|
6439
|
+
if (!filePath.startsWith(resolvedBase + import_path5.sep) && filePath !== resolvedBase) {
|
|
6440
|
+
throw new Error(
|
|
6441
|
+
`Path traversal detected in skill file: "${file.path}" resolves outside skill directory`
|
|
6442
|
+
);
|
|
6443
|
+
}
|
|
6444
|
+
await (0, import_promises3.mkdir)((0, import_path5.dirname)(filePath), { recursive: true });
|
|
6445
|
+
await (0, import_promises3.writeFile)(filePath, file.content, "utf-8");
|
|
6446
|
+
}
|
|
6447
|
+
}
|
|
6448
|
+
|
|
6449
|
+
// src/run-scenario/agents/claude-code/execute.ts
|
|
6450
|
+
var import_crypto = require("crypto");
|
|
6451
|
+
|
|
6452
|
+
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
6453
|
+
var import_promises4 = require("fs/promises");
|
|
6454
|
+
var import_path6 = require("path");
|
|
6367
6455
|
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
6368
6456
|
async function writeMcpToFilesystem(cwd, mcps) {
|
|
6369
6457
|
if (mcps.length === 0) return;
|
|
6370
6458
|
const mcpServers = {};
|
|
6371
6459
|
for (const mcp of mcps) {
|
|
6372
|
-
|
|
6460
|
+
const config = mcp.config;
|
|
6461
|
+
for (const [key, value] of Object.entries(config)) {
|
|
6462
|
+
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
6463
|
+
throw new Error(
|
|
6464
|
+
`MCP "${mcp.name}" has invalid config: value for key "${key}" must be an object (got ${typeof value}). Config must use keyed format, e.g. { "server-name": { "command": "npx", ... } }`
|
|
6465
|
+
);
|
|
6466
|
+
}
|
|
6467
|
+
mcpServers[key] = value;
|
|
6468
|
+
}
|
|
6373
6469
|
}
|
|
6374
6470
|
const content = JSON.stringify(
|
|
6375
6471
|
{ [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: mcpServers },
|
|
6376
6472
|
null,
|
|
6377
6473
|
2
|
|
6378
6474
|
);
|
|
6379
|
-
const filePath = (0,
|
|
6380
|
-
await (0,
|
|
6475
|
+
const filePath = (0, import_path6.join)(cwd, ".mcp.json");
|
|
6476
|
+
await (0, import_promises4.writeFile)(filePath, content, "utf8");
|
|
6381
6477
|
console.log(`[MCP] Written to ${filePath}`);
|
|
6382
6478
|
}
|
|
6383
6479
|
|
|
6384
6480
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
6385
|
-
var
|
|
6386
|
-
var
|
|
6481
|
+
var import_promises5 = require("fs/promises");
|
|
6482
|
+
var import_path7 = require("path");
|
|
6387
6483
|
var AGENTS_DIR = ".claude/agents";
|
|
6388
6484
|
function toAgentFilename(name2, index, nameCount) {
|
|
6389
6485
|
const base = (name2 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -6393,13 +6489,13 @@ function toAgentFilename(name2, index, nameCount) {
|
|
|
6393
6489
|
}
|
|
6394
6490
|
async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
6395
6491
|
if (subAgents.length === 0) return;
|
|
6396
|
-
const agentsDir = (0,
|
|
6397
|
-
await (0,
|
|
6492
|
+
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
6493
|
+
await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
|
|
6398
6494
|
const nameCount = /* @__PURE__ */ new Map();
|
|
6399
6495
|
for (const [i, agent] of subAgents.entries()) {
|
|
6400
6496
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
6401
|
-
const filePath = (0,
|
|
6402
|
-
await (0,
|
|
6497
|
+
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
6498
|
+
await (0, import_promises5.writeFile)(filePath, agent.subAgentMd, "utf8");
|
|
6403
6499
|
}
|
|
6404
6500
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
6405
6501
|
}
|
|
@@ -6725,9 +6821,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
6725
6821
|
})
|
|
6726
6822
|
);
|
|
6727
6823
|
let messageCount = 0;
|
|
6728
|
-
const canUseTool = async () => {
|
|
6729
|
-
return { behavior: "allow" };
|
|
6730
|
-
};
|
|
6824
|
+
const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
|
|
6731
6825
|
const baseAllowedTools = [
|
|
6732
6826
|
"Skill",
|
|
6733
6827
|
"Read",
|
|
@@ -7180,16 +7274,6 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7180
7274
|
llmTrace
|
|
7181
7275
|
};
|
|
7182
7276
|
}
|
|
7183
|
-
async function writeSkillsToFilesystem(cwd, skills) {
|
|
7184
|
-
for (const skill of skills) {
|
|
7185
|
-
const skillName = skill.name;
|
|
7186
|
-
const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
|
|
7187
|
-
await (0, import_promises5.mkdir)(skillDir, { recursive: true });
|
|
7188
|
-
const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
|
|
7189
|
-
await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
|
|
7190
|
-
console.log(`[Skill] Written to ${skillPath}`);
|
|
7191
|
-
}
|
|
7192
|
-
}
|
|
7193
7277
|
function buildSdkEnvironment(options) {
|
|
7194
7278
|
const env = { ...process.env };
|
|
7195
7279
|
const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
|
|
@@ -8187,7 +8271,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
8187
8271
|
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
8188
8272
|
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
|
|
8189
8273
|
};
|
|
8190
|
-
const
|
|
8274
|
+
const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
|
|
8191
8275
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
8192
8276
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
8193
8277
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
@@ -8199,13 +8283,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
8199
8283
|
scenarioId: scenario.id,
|
|
8200
8284
|
scenarioName: scenario.name,
|
|
8201
8285
|
modelConfig: agent?.modelConfig,
|
|
8202
|
-
duration:
|
|
8203
|
-
outputText
|
|
8286
|
+
duration: durationMs,
|
|
8287
|
+
outputText,
|
|
8204
8288
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|
|
8205
8289
|
templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
|
|
8206
8290
|
startedAt,
|
|
8207
8291
|
completedAt,
|
|
8208
|
-
llmTrace
|
|
8292
|
+
llmTrace
|
|
8209
8293
|
};
|
|
8210
8294
|
}
|
|
8211
8295
|
|