@wix/evalforge-evaluator 0.59.0 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -160,6 +160,16 @@ function createApiClient(serverUrl, options = "") {
160
160
  getSkill(projectId2, id) {
161
161
  return fetchJson(`/projects/${projectId2}/skills/${id}`);
162
162
  },
163
+ getSkillVersion(projectId2, skillId, versionId) {
164
+ return fetchJson(
165
+ `/projects/${projectId2}/skills/${skillId}/versions/${versionId}`
166
+ );
167
+ },
168
+ getLatestSkillVersion(projectId2, skillId) {
169
+ return fetchJson(
170
+ `/projects/${projectId2}/skills/${skillId}/versions/latest`
171
+ );
172
+ },
163
173
  getAgent(projectId2, id) {
164
174
  return fetchJson(`/projects/${projectId2}/agents/${id}`);
165
175
  },
@@ -294,6 +304,29 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
294
304
  skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
295
305
  );
296
306
  }
307
+ if (evalRun.skillVersions && Object.keys(evalRun.skillVersions).length > 0) {
308
+ skills = await Promise.all(
309
+ skills.map(async (skill) => {
310
+ const versionId = evalRun.skillVersions?.[skill.id];
311
+ if (versionId) {
312
+ const version = await api.getSkillVersion(
313
+ projectId2,
314
+ skill.id,
315
+ versionId
316
+ );
317
+ return { ...skill, latestVersion: version };
318
+ }
319
+ return skill;
320
+ })
321
+ );
322
+ }
323
+ skills = skills.map((skill) => {
324
+ const hasPinnedVersion = evalRun.skillVersions?.[skill.id];
325
+ if (!hasPinnedVersion && skill.source) {
326
+ return { ...skill, latestVersion: void 0 };
327
+ }
328
+ return skill;
329
+ });
297
330
  }
298
331
  let mcps = [];
299
332
  if (evalRun.mcpIds && evalRun.mcpIds.length > 0) {
@@ -1106,10 +1139,10 @@ var Minipass = class extends import_node_events.EventEmitter {
1106
1139
  * Return a void Promise that resolves once the stream ends.
1107
1140
  */
1108
1141
  async promise() {
1109
- return new Promise((resolve, reject) => {
1142
+ return new Promise((resolve2, reject) => {
1110
1143
  this.on(DESTROYED, () => reject(new Error("stream destroyed")));
1111
1144
  this.on("error", (er) => reject(er));
1112
- this.on("end", () => resolve());
1145
+ this.on("end", () => resolve2());
1113
1146
  });
1114
1147
  }
1115
1148
  /**
@@ -1133,7 +1166,7 @@ var Minipass = class extends import_node_events.EventEmitter {
1133
1166
  return Promise.resolve({ done: false, value: res });
1134
1167
  if (this[EOF])
1135
1168
  return stop();
1136
- let resolve;
1169
+ let resolve2;
1137
1170
  let reject;
1138
1171
  const onerr = (er) => {
1139
1172
  this.off("data", ondata);
@@ -1147,19 +1180,19 @@ var Minipass = class extends import_node_events.EventEmitter {
1147
1180
  this.off("end", onend);
1148
1181
  this.off(DESTROYED, ondestroy);
1149
1182
  this.pause();
1150
- resolve({ value, done: !!this[EOF] });
1183
+ resolve2({ value, done: !!this[EOF] });
1151
1184
  };
1152
1185
  const onend = () => {
1153
1186
  this.off("error", onerr);
1154
1187
  this.off("data", ondata);
1155
1188
  this.off(DESTROYED, ondestroy);
1156
1189
  stop();
1157
- resolve({ done: true, value: void 0 });
1190
+ resolve2({ done: true, value: void 0 });
1158
1191
  };
1159
1192
  const ondestroy = () => onerr(new Error("stream destroyed"));
1160
1193
  return new Promise((res2, rej) => {
1161
1194
  reject = rej;
1162
- resolve = res2;
1195
+ resolve2 = res2;
1163
1196
  this.once(DESTROYED, ondestroy);
1164
1197
  this.once("error", onerr);
1165
1198
  this.once("end", onend);
@@ -3287,9 +3320,9 @@ var listFile = (opt, _files) => {
3287
3320
  const parse4 = new Parser(opt);
3288
3321
  const readSize = opt.maxReadSize || 16 * 1024 * 1024;
3289
3322
  const file = opt.file;
3290
- const p = new Promise((resolve, reject) => {
3323
+ const p = new Promise((resolve2, reject) => {
3291
3324
  parse4.on("error", reject);
3292
- parse4.on("end", resolve);
3325
+ parse4.on("end", resolve2);
3293
3326
  import_node_fs.default.stat(file, (er, stat) => {
3294
3327
  if (er) {
3295
3328
  reject(er);
@@ -5930,9 +5963,9 @@ var extractFile = (opt, _) => {
5930
5963
  const u = new Unpack(opt);
5931
5964
  const readSize = opt.maxReadSize || 16 * 1024 * 1024;
5932
5965
  const file = opt.file;
5933
- const p = new Promise((resolve, reject) => {
5966
+ const p = new Promise((resolve2, reject) => {
5934
5967
  u.on("error", reject);
5935
- u.on("close", resolve);
5968
+ u.on("close", resolve2);
5936
5969
  import_node_fs5.default.stat(file, (er, stat) => {
5937
5970
  if (er) {
5938
5971
  reject(er);
@@ -6066,7 +6099,7 @@ var replaceAsync = (opt, files) => {
6066
6099
  };
6067
6100
  import_node_fs6.default.read(fd, headBuf, 0, 512, position, onread);
6068
6101
  };
6069
- const promise = new Promise((resolve, reject) => {
6102
+ const promise = new Promise((resolve2, reject) => {
6070
6103
  p.on("error", reject);
6071
6104
  let flag = "r+";
6072
6105
  const onopen = (er, fd) => {
@@ -6091,7 +6124,7 @@ var replaceAsync = (opt, files) => {
6091
6124
  });
6092
6125
  p.pipe(stream);
6093
6126
  stream.on("error", reject);
6094
- stream.on("close", resolve);
6127
+ stream.on("close", resolve2);
6095
6128
  addFilesAsync2(p, files);
6096
6129
  });
6097
6130
  });
@@ -6357,33 +6390,96 @@ var import_evalforge_types4 = require("@wix/evalforge-types");
6357
6390
 
6358
6391
  // src/run-scenario/agents/claude-code/execute.ts
6359
6392
  var import_evalforge_types3 = require("@wix/evalforge-types");
6360
- var import_crypto = require("crypto");
6361
- var import_promises5 = require("fs/promises");
6362
- var import_path7 = require("path");
6363
6393
 
6364
- // src/run-scenario/agents/claude-code/write-mcp.ts
6394
+ // src/run-scenario/agents/claude-code/write-skills.ts
6365
6395
  var import_promises3 = require("fs/promises");
6366
6396
  var import_path5 = require("path");
6397
+ var import_evalforge_github_client = require("@wix/evalforge-github-client");
6398
+ async function writeSkillsToFilesystem(cwd, skills, fetchFn = import_evalforge_github_client.fetchSkillFolderRaw) {
6399
+ await Promise.all(
6400
+ skills.map((skill) => writeSkillToFilesystem(cwd, skill, fetchFn))
6401
+ );
6402
+ }
6403
+ async function writeSkillToFilesystem(cwd, skill, fetchFn = import_evalforge_github_client.fetchSkillFolderRaw) {
6404
+ const skillName = skill.name;
6405
+ const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
6406
+ await (0, import_promises3.mkdir)(skillDir, { recursive: true });
6407
+ const version = skill.latestVersion;
6408
+ if (version?.files && version.files.length > 0) {
6409
+ await writeSkillFiles(skillDir, version.files);
6410
+ console.log(
6411
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
6412
+ );
6413
+ } else if (skill.source) {
6414
+ try {
6415
+ const files = await fetchFn(skill.source, {
6416
+ userAgent: "EvalForge-Evaluator"
6417
+ });
6418
+ await writeSkillFiles(skillDir, files);
6419
+ console.log(
6420
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
6421
+ );
6422
+ } catch (error) {
6423
+ const message = error instanceof Error ? error.message : "Unknown error";
6424
+ console.error(
6425
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
6426
+ );
6427
+ throw new Error(
6428
+ `Failed to write skill ${skillName} to filesystem: ${message}`
6429
+ );
6430
+ }
6431
+ } else {
6432
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
6433
+ }
6434
+ }
6435
+ async function writeSkillFiles(skillDir, files) {
6436
+ const resolvedBase = (0, import_path5.resolve)(skillDir);
6437
+ for (const file of files) {
6438
+ const filePath = (0, import_path5.resolve)(skillDir, file.path);
6439
+ if (!filePath.startsWith(resolvedBase + import_path5.sep) && filePath !== resolvedBase) {
6440
+ throw new Error(
6441
+ `Path traversal detected in skill file: "${file.path}" resolves outside skill directory`
6442
+ );
6443
+ }
6444
+ await (0, import_promises3.mkdir)((0, import_path5.dirname)(filePath), { recursive: true });
6445
+ await (0, import_promises3.writeFile)(filePath, file.content, "utf-8");
6446
+ }
6447
+ }
6448
+
6449
+ // src/run-scenario/agents/claude-code/execute.ts
6450
+ var import_crypto = require("crypto");
6451
+
6452
+ // src/run-scenario/agents/claude-code/write-mcp.ts
6453
+ var import_promises4 = require("fs/promises");
6454
+ var import_path6 = require("path");
6367
6455
  var import_evalforge_types2 = require("@wix/evalforge-types");
6368
6456
  async function writeMcpToFilesystem(cwd, mcps) {
6369
6457
  if (mcps.length === 0) return;
6370
6458
  const mcpServers = {};
6371
6459
  for (const mcp of mcps) {
6372
- mcpServers[mcp.name] = mcp.config;
6460
+ const config = mcp.config;
6461
+ for (const [key, value] of Object.entries(config)) {
6462
+ if (typeof value !== "object" || value === null || Array.isArray(value)) {
6463
+ throw new Error(
6464
+ `MCP "${mcp.name}" has invalid config: value for key "${key}" must be an object (got ${typeof value}). Config must use keyed format, e.g. { "server-name": { "command": "npx", ... } }`
6465
+ );
6466
+ }
6467
+ mcpServers[key] = value;
6468
+ }
6373
6469
  }
6374
6470
  const content = JSON.stringify(
6375
6471
  { [import_evalforge_types2.MCP_SERVERS_JSON_KEY]: mcpServers },
6376
6472
  null,
6377
6473
  2
6378
6474
  );
6379
- const filePath = (0, import_path5.join)(cwd, ".mcp.json");
6380
- await (0, import_promises3.writeFile)(filePath, content, "utf8");
6475
+ const filePath = (0, import_path6.join)(cwd, ".mcp.json");
6476
+ await (0, import_promises4.writeFile)(filePath, content, "utf8");
6381
6477
  console.log(`[MCP] Written to ${filePath}`);
6382
6478
  }
6383
6479
 
6384
6480
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
6385
- var import_promises4 = require("fs/promises");
6386
- var import_path6 = require("path");
6481
+ var import_promises5 = require("fs/promises");
6482
+ var import_path7 = require("path");
6387
6483
  var AGENTS_DIR = ".claude/agents";
6388
6484
  function toAgentFilename(name2, index, nameCount) {
6389
6485
  const base = (name2 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -6393,13 +6489,13 @@ function toAgentFilename(name2, index, nameCount) {
6393
6489
  }
6394
6490
  async function writeSubAgentsToFilesystem(cwd, subAgents) {
6395
6491
  if (subAgents.length === 0) return;
6396
- const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
6397
- await (0, import_promises4.mkdir)(agentsDir, { recursive: true });
6492
+ const agentsDir = (0, import_path7.join)(cwd, AGENTS_DIR);
6493
+ await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
6398
6494
  const nameCount = /* @__PURE__ */ new Map();
6399
6495
  for (const [i, agent] of subAgents.entries()) {
6400
6496
  const filename = toAgentFilename(agent.name, i, nameCount);
6401
- const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
6402
- await (0, import_promises4.writeFile)(filePath, agent.subAgentMd, "utf8");
6497
+ const filePath = (0, import_path7.join)(agentsDir, `${filename}.md`);
6498
+ await (0, import_promises5.writeFile)(filePath, agent.subAgentMd, "utf8");
6403
6499
  }
6404
6500
  console.log(`[SubAgents] Written to ${agentsDir}`);
6405
6501
  }
@@ -6725,9 +6821,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
6725
6821
  })
6726
6822
  );
6727
6823
  let messageCount = 0;
6728
- const canUseTool = async () => {
6729
- return { behavior: "allow" };
6730
- };
6824
+ const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
6731
6825
  const baseAllowedTools = [
6732
6826
  "Skill",
6733
6827
  "Read",
@@ -7180,16 +7274,6 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7180
7274
  llmTrace
7181
7275
  };
7182
7276
  }
7183
- async function writeSkillsToFilesystem(cwd, skills) {
7184
- for (const skill of skills) {
7185
- const skillName = skill.name;
7186
- const skillDir = (0, import_path7.join)(cwd, ".claude", "skills", skillName);
7187
- await (0, import_promises5.mkdir)(skillDir, { recursive: true });
7188
- const skillPath = (0, import_path7.join)(skillDir, "SKILL.md");
7189
- await (0, import_promises5.writeFile)(skillPath, skill.skillMd, "utf-8");
7190
- console.log(`[Skill] Written to ${skillPath}`);
7191
- }
7192
- }
7193
7277
  function buildSdkEnvironment(options) {
7194
7278
  const env = { ...process.env };
7195
7279
  const placeholderApiKey = "sk-ant-api03-placeholder-auth-handled-by-gateway-000000000000000000000000";
@@ -8187,7 +8271,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
8187
8271
  mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
8188
8272
  subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0
8189
8273
  };
8190
- const result = await adapter.execute(executionContext);
8274
+ const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
8191
8275
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
8192
8276
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
8193
8277
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
@@ -8199,13 +8283,13 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
8199
8283
  scenarioId: scenario.id,
8200
8284
  scenarioName: scenario.name,
8201
8285
  modelConfig: agent?.modelConfig,
8202
- duration: result.durationMs,
8203
- outputText: result.outputText,
8286
+ duration: durationMs,
8287
+ outputText,
8204
8288
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
8205
8289
  templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
8206
8290
  startedAt,
8207
8291
  completedAt,
8208
- llmTrace: result.llmTrace
8292
+ llmTrace
8209
8293
  };
8210
8294
  }
8211
8295