@wix/evalforge-evaluator 0.60.0 → 0.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +116 -38
- package/build/index.js.map +4 -4
- package/build/index.mjs +116 -38
- package/build/index.mjs.map +4 -4
- package/build/types/api-client.d.ts +4 -2
- package/build/types/fetch-evaluation-data.d.ts +2 -2
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +2 -2
- package/build/types/run-scenario/agents/claude-code/write-skills.d.ts +21 -0
- package/package.json +6 -5
package/build/index.js
CHANGED
|
@@ -160,6 +160,16 @@ function createApiClient(serverUrl, options = "") {
|
|
|
160
160
|
getSkill(projectId2, id) {
|
|
161
161
|
return fetchJson(`/projects/${projectId2}/skills/${id}`);
|
|
162
162
|
},
|
|
163
|
+
getSkillVersion(projectId2, skillId, versionId) {
|
|
164
|
+
return fetchJson(
|
|
165
|
+
`/projects/${projectId2}/skills/${skillId}/versions/${versionId}`
|
|
166
|
+
);
|
|
167
|
+
},
|
|
168
|
+
getLatestSkillVersion(projectId2, skillId) {
|
|
169
|
+
return fetchJson(
|
|
170
|
+
`/projects/${projectId2}/skills/${skillId}/versions/latest`
|
|
171
|
+
);
|
|
172
|
+
},
|
|
163
173
|
getAgent(projectId2, id) {
|
|
164
174
|
return fetchJson(`/projects/${projectId2}/agents/${id}`);
|
|
165
175
|
},
|
|
@@ -294,6 +304,29 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
294
304
|
skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
|
|
295
305
|
);
|
|
296
306
|
}
|
|
307
|
+
if (evalRun.skillVersions && Object.keys(evalRun.skillVersions).length > 0) {
|
|
308
|
+
skills = await Promise.all(
|
|
309
|
+
skills.map(async (skill) => {
|
|
310
|
+
const versionId = evalRun.skillVersions?.[skill.id];
|
|
311
|
+
if (versionId) {
|
|
312
|
+
const version = await api.getSkillVersion(
|
|
313
|
+
projectId2,
|
|
314
|
+
skill.id,
|
|
315
|
+
versionId
|
|
316
|
+
);
|
|
317
|
+
return { ...skill, latestVersion: version };
|
|
318
|
+
}
|
|
319
|
+
return skill;
|
|
320
|
+
})
|
|
321
|
+
);
|
|
322
|
+
}
|
|
323
|
+
skills = skills.map((skill) => {
|
|
324
|
+
const hasPinnedVersion = evalRun.skillVersions?.[skill.id];
|
|
325
|
+
if (!hasPinnedVersion && skill.source) {
|
|
326
|
+
return { ...skill, latestVersion: void 0 };
|
|
327
|
+
}
|
|
328
|
+
return skill;
|
|
329
|
+
});
|
|
297
330
|
}
|
|
298
331
|
let mcps = [];
|
|
299
332
|
if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
|
|
@@ -1106,10 +1139,10 @@ var Minipass = class extends import_node_events.EventEmitter {
|
|
|
1106
1139
|
* Return a void Promise that resolves once the stream ends.
|
|
1107
1140
|
*/
|
|
1108
1141
|
async promise() {
|
|
1109
|
-
return new Promise((
|
|
1142
|
+
return new Promise((resolve2, reject) => {
|
|
1110
1143
|
this.on(DESTROYED, () => reject(new Error("stream destroyed")));
|
|
1111
1144
|
this.on("error", (er) => reject(er));
|
|
1112
|
-
this.on("end", () =>
|
|
1145
|
+
this.on("end", () => resolve2());
|
|
1113
1146
|
});
|
|
1114
1147
|
}
|
|
1115
1148
|
/**
|
|
@@ -1133,7 +1166,7 @@ var Minipass = class extends import_node_events.EventEmitter {
|
|
|
1133
1166
|
return Promise.resolve({ done: false, value: res });
|
|
1134
1167
|
if (this[EOF])
|
|
1135
1168
|
return stop();
|
|
1136
|
-
let
|
|
1169
|
+
let resolve2;
|
|
1137
1170
|
let reject;
|
|
1138
1171
|
const onerr = (er) => {
|
|
1139
1172
|
this.off("data", ondata);
|
|
@@ -1147,19 +1180,19 @@ var Minipass = class extends import_node_events.EventEmitter {
|
|
|
1147
1180
|
this.off("end", onend);
|
|
1148
1181
|
this.off(DESTROYED, ondestroy);
|
|
1149
1182
|
this.pause();
|
|
1150
|
-
|
|
1183
|
+
resolve2({ value, done: !!this[EOF] });
|
|
1151
1184
|
};
|
|
1152
1185
|
const onend = () => {
|
|
1153
1186
|
this.off("error", onerr);
|
|
1154
1187
|
this.off("data", ondata);
|
|
1155
1188
|
this.off(DESTROYED, ondestroy);
|
|
1156
1189
|
stop();
|
|
1157
|
-
|
|
1190
|
+
resolve2({ done: true, value: void 0 });
|
|
1158
1191
|
};
|
|
1159
1192
|
const ondestroy = () => onerr(new Error("stream destroyed"));
|
|
1160
1193
|
return new Promise((res2, rej) => {
|
|
1161
1194
|
reject = rej;
|
|
1162
|
-
|
|
1195
|
+
resolve2 = res2;
|
|
1163
1196
|
this.once(DESTROYED, ondestroy);
|
|
1164
1197
|
this.once("error", onerr);
|
|
1165
1198
|
this.once("end", onend);
|
|
@@ -3287,9 +3320,9 @@ var listFile = (opt, _files) => {
|
|
|
3287
3320
|
const parse4 = new Parser(opt);
|
|
3288
3321
|
const readSize = opt.maxReadSize || 16 * 1024 * 1024;
|
|
3289
3322
|
const file = opt.file;
|
|
3290
|
-
const p = new Promise((
|
|
3323
|
+
const p = new Promise((resolve2, reject) => {
|
|
3291
3324
|
parse4.on("error", reject);
|
|
3292
|
-
parse4.on("end",
|
|
3325
|
+
parse4.on("end", resolve2);
|
|
3293
3326
|
import_node_fs.default.stat(file, (er, stat) => {
|
|
3294
3327
|
if (er) {
|
|
3295
3328
|
reject(er);
|
|
@@ -5930,9 +5963,9 @@ var extractFile = (opt, _) => {
|
|
|
5930
5963
|
const u = new Unpack(opt);
|
|
5931
5964
|
const readSize = opt.maxReadSize || 16 * 1024 * 1024;
|
|
5932
5965
|
const file = opt.file;
|
|
5933
|
-
const p = new Promise((
|
|
5966
|
+
const p = new Promise((resolve2, reject) => {
|
|
5934
5967
|
u.on("error", reject);
|
|
5935
|
-
u.on("close",
|
|
5968
|
+
u.on("close", resolve2);
|
|
5936
5969
|
import_node_fs5.default.stat(file, (er, stat) => {
|
|
5937
5970
|
if (er) {
|
|
5938
5971
|
reject(er);
|
|
@@ -6066,7 +6099,7 @@ var replaceAsync = (opt, files) => {
|
|
|
6066
6099
|
};
|
|
6067
6100
|
import_node_fs6.default.read(fd, headBuf, 0, 512, position, onread);
|
|
6068
6101
|
};
|
|
6069
|
-
const promise = new Promise((
|
|
6102
|
+
const promise = new Promise((resolve2, reject) => {
|
|
6070
6103
|
p.on("error", reject);
|
|
6071
6104
|
let flag = "r+";
|
|
6072
6105
|
const onopen = (er, fd) => {
|
|
@@ -6091,7 +6124,7 @@ var replaceAsync = (opt, files) => {
|
|
|
6091
6124
|
});
|
|
6092
6125
|
p.pipe(stream);
|
|
6093
6126
|
stream.on("error", reject);
|
|
6094
|
-
stream.on("close",
|
|
6127
|
+
stream.on("close", resolve2);
|
|
6095
6128
|
addFilesAsync2(p, files);
|
|
6096
6129
|
});
|
|
6097
6130
|
});
|
|
@@ -6357,13 +6390,68 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
|
|
|
6357
6390
|
|
|
6358
6391
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
6359
6392
|
var import_evalforge_types3 = require("@wix/evalforge-types");
|
|
6360
|
-
var import_crypto = require("crypto");
|
|
6361
|
-
var import_promises5 = require("fs/promises");
|
|
6362
|
-
var import_path7 = require("path");
|
|
6363
6393
|
|
|
6364
|
-
// src/run-scenario/agents/claude-code/write-
|
|
6394
|
+
// src/run-scenario/agents/claude-code/write-skills.ts
|
|
6365
6395
|
var import_promises3 = require("fs/promises");
|
|
6366
6396
|
var import_path5 = require("path");
|
|
6397
|
+
var import_evalforge_github_client = require("@wix/evalforge-github-client");
|
|
6398
|
+
/**
 * Writes every skill in `skills` beneath `cwd` by delegating each one to
 * `writeSkillToFilesystem`. All writes are started together and awaited as a
 * group, so the returned promise rejects as soon as any single skill fails.
 *
 * @param {string} cwd - Directory handed through to `writeSkillToFilesystem`.
 * @param {Array<object>} skills - Skill records to write.
 * @param {Function} [fetchFn] - Fetcher forwarded unchanged to each per-skill write.
 * @returns {Promise<void>}
 */
async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client.fetchSkillFolderRaw) {
  const writeOne = (skill) => writeSkillToFilesystem(cwd, skill, fetchFn);
  const pendingWrites = skills.map(writeOne);
  await Promise.all(pendingWrites);
}
|
|
6403
|
+
/**
 * Materializes a single skill under `<cwd>/.claude/skills/<skill.name>`.
 *
 * Where the files come from, in priority order:
 *   1. `skill.latestVersion.files` when it is a non-empty list (logged as "from snapshot");
 *   2. otherwise `fetchFn(skill.source, { userAgent })` when `skill.source` is set
 *      (logged as "from GitHub (live)").
 *
 * @param {string} cwd - Directory that receives the `.claude/skills` tree.
 * @param {object} skill - Skill record; `name`, `latestVersion` and `source` are read.
 * @param {Function} [fetchFn] - Called as `fetchFn(source, { userAgent: "EvalForge-Evaluator" })`
 *   and expected to resolve to a list of `{ path, content }` entries.
 * @returns {Promise<void>}
 * @throws {Error} If `skill.name` does not resolve to a directory strictly inside
 *   `<cwd>/.claude/skills`, if the live fetch/write fails (the original error is
 *   attached as `cause`), or if the skill has neither files nor a source.
 */
async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client.fetchSkillFolderRaw) {
  const skillName = skill.name;
  const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
  // The name is used as a path segment, so a value such as "../../x" would
  // otherwise move the target directory outside the skills tree. Check the
  // resolved location before touching the filesystem.
  const skillsRoot = (0, import_path5.resolve)(cwd, ".claude", "skills");
  const resolvedSkillDir = (0, import_path5.resolve)(skillDir);
  if (!resolvedSkillDir.startsWith(skillsRoot + import_path5.sep)) {
    throw new Error(
      `Invalid skill name "${skillName}": resolves outside the skills directory`
    );
  }
  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
  const version = skill.latestVersion;
  if (version?.files && version.files.length > 0) {
    await writeSkillFiles(skillDir, version.files);
    console.log(
      `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
    );
  } else if (skill.source) {
    try {
      const files = await fetchFn(skill.source, {
        userAgent: "EvalForge-Evaluator"
      });
      await writeSkillFiles(skillDir, files);
      console.log(
        `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
      );
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      console.error(
        `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
      );
      // Keep the underlying error reachable (stack, code) for whoever handles the wrapper.
      throw new Error(
        `Failed to write skill ${skillName} to filesystem: ${message}`,
        { cause: error }
      );
    }
  } else {
    throw new Error(`Skill ${skillName} has no files and no source configured`);
  }
}
|
|
6435
|
+
/**
 * Writes a list of `{ path, content }` entries below `skillDir`, creating
 * intermediate directories as needed. Content is written as UTF-8.
 *
 * Every path is validated before anything is written, so a bad entry late in
 * the list cannot leave a half-written skill on disk.
 *
 * @param {string} skillDir - Directory the files are written into.
 * @param {Iterable<{path: string, content: string}>} files - Entries to write;
 *   each `path` is resolved relative to `skillDir`.
 * @returns {Promise<void>}
 * @throws {Error} If an entry resolves outside `skillDir` or to `skillDir` itself.
 */
async function writeSkillFiles(skillDir, files) {
  const resolvedBase = (0, import_path5.resolve)(skillDir);
  // A resolved filesystem root already ends with the separator; appending a
  // second one would make the prefix match nothing.
  const basePrefix = resolvedBase.endsWith(import_path5.sep) ? resolvedBase : resolvedBase + import_path5.sep;
  const targets = [];
  for (const file of files) {
    const filePath = (0, import_path5.resolve)(skillDir, file.path);
    if (filePath === resolvedBase) {
      // Such a path stays inside the tree but names a directory, not a file.
      throw new Error(
        `Invalid skill file path: "${file.path}" resolves to the skill directory itself`
      );
    }
    if (!filePath.startsWith(basePrefix)) {
      throw new Error(
        `Path traversal detected in skill file: "${file.path}" resolves outside skill directory`
      );
    }
    targets.push({ filePath, content: file.content });
  }
  for (const { filePath, content } of targets) {
    await (0, import_promises3.mkdir)((0, import_path5.dirname)(filePath), { recursive: true });
    await (0, import_promises3.writeFile)(filePath, content, "utf-8");
  }
}
|
|
6448
|
+
|
|
6449
|
+
// src/run-scenario/agents/claude-code/execute.ts
|
|
6450
|
+
var import_crypto = require("crypto");
|
|
6451
|
+
|
|
6452
|
+
// src/run-scenario/agents/claude-code/write-mcp.ts
|
|
6453
|
+
var import_promises4 = require("fs/promises");
|
|
6454
|
+
var import_path6 = require("path");
|
|
6367
6455
|
var import_evalforge_types2 = require("@wix/evalforge-types");
|
|
6368
6456
|
async function writeMcpToFilesystem(cwd, mcps) {
|
|
6369
6457
|
if (mcps.length === 0) return;
|
|
@@ -6384,14 +6472,14 @@ async function writeMcpToFilesystem(cwd, mcps) {
|
|
|
6384
6472
|
null,
|
|
6385
6473
|
2
|
|
6386
6474
|
);
|
|
6387
|
-
const filePath = (0,
|
|
6388
|
-
await (0,
|
|
6475
|
+
const filePath = (0, import_path6.join)(cwd, ".mcp.json");
|
|
6476
|
+
await (0, import_promises4.writeFile)(filePath, content, "utf8");
|
|
6389
6477
|
console.log(`[MCP] Written to ${filePath}`);
|
|
6390
6478
|
}
|
|
6391
6479
|
|
|
6392
6480
|
// src/run-scenario/agents/claude-code/write-sub-agents.ts
|
|
6393
|
-
var
|
|
6394
|
-
var
|
|
6481
|
+
var import_promises5 = require("fs/promises");
|
|
6482
|
+
var import_path7 = require("path");
|
|
6395
6483
|
var AGENTS_DIR = ".claude/agents";
|
|
6396
6484
|
function toAgentFilename(name2, index, nameCount) {
|
|
6397
6485
|
const base = (name2 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
|
|
@@ -6401,13 +6489,13 @@ function toAgentFilename(name2, index, nameCount) {
|
|
|
6401
6489
|
}
|
|
6402
6490
|
async function writeSubAgentsToFilesystem(cwd, subAgents) {
|
|
6403
6491
|
if (subAgents.length === 0) return;
|
|
6404
|
-
const agentsDir = (0,
|
|
6405
|
-
await (0,
|
|
6492
|
+
const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
|
|
6493
|
+
await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
|
|
6406
6494
|
const nameCount = /* @__PURE__ */ new Map();
|
|
6407
6495
|
for (const [i, agent] of subAgents.entries()) {
|
|
6408
6496
|
const filename = toAgentFilename(agent.name, i, nameCount);
|
|
6409
|
-
const filePath = (0,
|
|
6410
|
-
await (0,
|
|
6497
|
+
const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
|
|
6498
|
+
await (0, import_promises5.writeFile)(filePath, agent.subAgentMd, "utf8");
|
|
6411
6499
|
}
|
|
6412
6500
|
console.log(`[SubAgents] Written to ${agentsDir}`);
|
|
6413
6501
|
}
|
|
@@ -7186,16 +7274,6 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
7186
7274
|
llmTrace
|
|
7187
7275
|
};
|
|
7188
7276
|
}
|
|
7189
|
-
async function writeSkillsToFilesystem(cwd, skills) {
|
|
7190
|
-
for (const skill of skills) {
|
|
7191
|
-
const skillName = skill.name;
|
|
7192
|
-
const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
|
|
7193
|
-
await (0, import_promises5.mkdir)(skillDir, { recursive: true });
|
|
7194
|
-
const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
|
|
7195
|
-
await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
|
|
7196
|
-
console.log(`[Skill] Written to ${skillPath}`);
|
|
7197
|
-
}
|
|
7198
|
-
}
|
|
7199
7277
|
function buildSdkEnvironment(options) {
|
|
7200
7278
|
const env = { ...process.env };
|
|
7201
7279
|
const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
|
|
@@ -8193,7 +8271,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
8193
8271
|
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
8194
8272
|
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
|
|
8195
8273
|
};
|
|
8196
|
-
const
|
|
8274
|
+
const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
|
|
8197
8275
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
8198
8276
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
8199
8277
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
@@ -8205,13 +8283,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
8205
8283
|
scenarioId: scenario.id,
|
|
8206
8284
|
scenarioName: scenario.name,
|
|
8207
8285
|
modelConfig: agent?.modelConfig,
|
|
8208
|
-
duration:
|
|
8209
|
-
outputText
|
|
8286
|
+
duration: durationMs,
|
|
8287
|
+
outputText,
|
|
8210
8288
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|
|
8211
8289
|
templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
|
|
8212
8290
|
startedAt,
|
|
8213
8291
|
completedAt,
|
|
8214
|
-
llmTrace
|
|
8292
|
+
llmTrace
|
|
8215
8293
|
};
|
|
8216
8294
|
}
|
|
8217
8295
|
|