@wix/evalforge-evaluator 0.55.0 → 0.57.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -287,11 +287,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
287
287
  codeAgent = await api.getAgent(projectId2, evalRun.agentId);
288
288
  }
289
289
  let skills = [];
290
+ let skillsGroup = null;
290
291
  if (evalRun.skillsGroupId) {
291
- const skillsGroup = await api.getSkillsGroup(
292
- projectId2,
293
- evalRun.skillsGroupId
294
- );
292
+ skillsGroup = await api.getSkillsGroup(projectId2, evalRun.skillsGroupId);
295
293
  if (skillsGroup.skillIds.length > 0) {
296
294
  skills = await Promise.all(
297
295
  skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
@@ -340,10 +338,13 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
340
338
  resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
341
339
  };
342
340
  });
341
+ const skillsGroupName = skillsGroup?.name ?? "";
343
342
  return {
344
343
  evalRun,
345
344
  codeAgent,
346
345
  skills,
346
+ skillsGroup,
347
+ skillsGroupName,
347
348
  scenarioItems
348
349
  };
349
350
  }
@@ -6172,9 +6173,10 @@ function cleanAppleDoubleFiles(dir) {
6172
6173
  }
6173
6174
  }
6174
6175
  async function downloadAndExtractTemplate(template, workDir) {
6175
- if (!(0, import_fs5.existsSync)(workDir)) {
6176
- (0, import_fs5.mkdirSync)(workDir, { recursive: true });
6176
+ if ((0, import_fs5.existsSync)(workDir)) {
6177
+ (0, import_fs5.rmSync)(workDir, { recursive: true });
6177
6178
  }
6179
+ (0, import_fs5.mkdirSync)(workDir, { recursive: true });
6178
6180
  const response = await fetch(template.downloadUrl);
6179
6181
  if (!response.ok) {
6180
6182
  throw new Error(
@@ -6224,7 +6226,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
6224
6226
  return workDir;
6225
6227
  }
6226
6228
 
6227
- // src/run-scenario/callSkill.ts
6229
+ // src/run-scenario/run-agent-with-context.ts
6228
6230
  var import_crypto2 = require("crypto");
6229
6231
 
6230
6232
  // src/run-scenario/agents/registry.ts
@@ -6533,10 +6535,11 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6533
6535
  outputPreview: `Message type: ${message.type}`
6534
6536
  };
6535
6537
  }
6536
- async function executeWithClaudeCode(skill, scenario, options) {
6538
+ async function executeWithClaudeCode(skills, scenario, options) {
6539
+ const skillNames = skills.map((s) => s.name).join(", ");
6537
6540
  console.log("[executeWithClaudeCode] Starting execution", {
6538
- skillId: skill.id,
6539
- skillName: skill.name,
6541
+ skillCount: skills.length,
6542
+ skillNames,
6540
6543
  scenarioId: scenario.id,
6541
6544
  scenarioName: scenario.name,
6542
6545
  cwd: options.cwd,
@@ -6572,22 +6575,23 @@ async function executeWithClaudeCode(skill, scenario, options) {
6572
6575
  const startTime = /* @__PURE__ */ new Date();
6573
6576
  const allMessages = [];
6574
6577
  console.error(
6575
- "[DEBUG-H4] writeSkillToFilesystem START",
6578
+ "[DEBUG-H4] writeSkillsToFilesystem START",
6576
6579
  JSON.stringify({
6577
6580
  cwd: options.cwd,
6578
- skillName: skill.name,
6581
+ skillCount: skills.length,
6582
+ skillNames: skills.map((s) => s.name),
6579
6583
  timestamp: Date.now()
6580
6584
  })
6581
6585
  );
6582
6586
  try {
6583
- await writeSkillToFilesystem(options.cwd, skill);
6587
+ await writeSkillsToFilesystem(options.cwd, skills);
6584
6588
  console.error(
6585
- "[DEBUG-H4] writeSkillToFilesystem SUCCESS",
6589
+ "[DEBUG-H4] writeSkillsToFilesystem SUCCESS",
6586
6590
  JSON.stringify({ timestamp: Date.now() })
6587
6591
  );
6588
6592
  } catch (writeError) {
6589
6593
  console.error(
6590
- "[DEBUG-H4] writeSkillToFilesystem FAILED",
6594
+ "[DEBUG-H4] writeSkillsToFilesystem FAILED",
6591
6595
  JSON.stringify({
6592
6596
  error: writeError instanceof Error ? writeError.message : String(writeError),
6593
6597
  stack: writeError instanceof Error ? writeError.stack : void 0,
@@ -6595,7 +6599,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
6595
6599
  })
6596
6600
  );
6597
6601
  throw new Error(
6598
- `Failed to write skill to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
6602
+ `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
6599
6603
  );
6600
6604
  }
6601
6605
  const sdkEnv = buildSdkEnvironment(options);
@@ -6631,7 +6635,10 @@ async function executeWithClaudeCode(skill, scenario, options) {
6631
6635
  }
6632
6636
  console.log("[SDK-DEBUG] PATH available:", !!sdkEnv.PATH);
6633
6637
  console.log("[SDK-DEBUG] HOME:", sdkEnv.HOME || "NOT SET");
6634
- console.log("[SDK-DEBUG] Skill:", skill.id, "-", skill.name);
6638
+ console.log(
6639
+ "[SDK-DEBUG] Skills:",
6640
+ skills.map((s) => `${s.id} - ${s.name}`).join(", ")
6641
+ );
6635
6642
  console.log("[SDK-DEBUG] Scenario:", scenario.id, "-", scenario.name);
6636
6643
  console.log(
6637
6644
  "[SDK-DEBUG] Prompt preview:",
@@ -6741,7 +6748,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
6741
6748
  timedOut = true;
6742
6749
  reject(
6743
6750
  new Error(
6744
- `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6751
+ `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6745
6752
  )
6746
6753
  );
6747
6754
  }, SDK_TIMEOUT_MS);
@@ -6949,8 +6956,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
6949
6956
  }
6950
6957
  }
6951
6958
  console.error("[SDK-ERROR] Execution context:");
6952
- console.error("[SDK-ERROR] skillId:", skill.id);
6953
- console.error("[SDK-ERROR] skillName:", skill.name);
6959
+ console.error("[SDK-ERROR] skillCount:", skills.length);
6960
+ console.error("[SDK-ERROR] skillNames:", skillNames);
6954
6961
  console.error("[SDK-ERROR] scenarioId:", scenario.id);
6955
6962
  console.error("[SDK-ERROR] scenarioName:", scenario.name);
6956
6963
  console.error("[SDK-ERROR] cwd:", options.cwd);
@@ -7010,7 +7017,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
7010
7017
  messageCount,
7011
7018
  errorName,
7012
7019
  errorMessage,
7013
- skillId: skill.id,
7020
+ skillCount: skills.length,
7021
+ skillNames,
7014
7022
  scenarioId: scenario.id,
7015
7023
  model: options.model || DEFAULT_MODEL,
7016
7024
  sdkEnv: sdkEnvDebug,
@@ -7103,13 +7111,15 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7103
7111
  llmTrace
7104
7112
  };
7105
7113
  }
7106
- async function writeSkillToFilesystem(cwd, skill) {
7107
- const skillName = skill.name;
7108
- const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
7109
- await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7110
- const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
7111
- await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
7112
- console.log(`[Skill] Written to ${skillPath}`);
7114
+ async function writeSkillsToFilesystem(cwd, skills) {
7115
+ for (const skill of skills) {
7116
+ const skillName = skill.name;
7117
+ const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
7118
+ await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7119
+ const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
7120
+ await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
7121
+ console.log(`[Skill] Written to ${skillPath}`);
7122
+ }
7113
7123
  }
7114
7124
  function buildSdkEnvironment(options) {
7115
7125
  const env = { ...process.env };
@@ -7305,7 +7315,7 @@ var ClaudeCodeAdapter = class {
7305
7315
  */
7306
7316
  async execute(context) {
7307
7317
  const {
7308
- skill,
7318
+ skills,
7309
7319
  scenario,
7310
7320
  cwd,
7311
7321
  modelConfig,
@@ -7316,7 +7326,6 @@ var ClaudeCodeAdapter = class {
7316
7326
  const modelForSdk = modelConfig?.model ? import_evalforge_types3.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7317
7327
  const options = {
7318
7328
  cwd,
7319
- systemPrompt: skill.skillMd,
7320
7329
  model: modelForSdk,
7321
7330
  temperature: modelConfig?.temperature,
7322
7331
  maxTokens: modelConfig?.maxTokens,
@@ -7325,7 +7334,7 @@ var ClaudeCodeAdapter = class {
7325
7334
  traceContext
7326
7335
  };
7327
7336
  const { result, llmTrace } = await executeWithClaudeCode(
7328
- skill,
7337
+ skills,
7329
7338
  scenario,
7330
7339
  options
7331
7340
  );
@@ -7908,7 +7917,6 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
7908
7917
  var IGNORED_PATTERNS = [
7909
7918
  "node_modules",
7910
7919
  ".git",
7911
- ".claude",
7912
7920
  ".cursor",
7913
7921
  "dist",
7914
7922
  "build",
@@ -8074,15 +8082,15 @@ function extractTemplateFiles(before, after) {
8074
8082
  return files;
8075
8083
  }
8076
8084
 
8077
- // src/run-scenario/callSkill.ts
8085
+ // src/run-scenario/run-agent-with-context.ts
8078
8086
  var DEFAULT_AGENT_COMMAND = "claude";
8079
- async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8087
+ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
8080
8088
  const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
8081
8089
  const adapter = getAdapter(runCommand);
8082
8090
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
8083
8091
  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
8084
8092
  const executionContext = {
8085
- skill,
8093
+ skills,
8086
8094
  scenario,
8087
8095
  cwd: workDir || process.cwd(),
8088
8096
  modelConfig: agent?.modelConfig,
@@ -8092,8 +8100,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8092
8100
  evalRunId: evalRunId2,
8093
8101
  scenarioId: scenario.id,
8094
8102
  scenarioName: scenario.name,
8095
- targetId: skill.id,
8096
- targetName: skill.name,
8103
+ targetId: skillsGroupId,
8104
+ targetName: skillsGroupName,
8097
8105
  tracePushUrl: config.tracePushUrl,
8098
8106
  routeHeader: config.routeHeader,
8099
8107
  authToken: config.authToken
@@ -8106,8 +8114,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8106
8114
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
8107
8115
  return {
8108
8116
  id: (0, import_crypto2.randomUUID)(),
8109
- targetId: skill.id,
8110
- targetName: skill.name,
8117
+ targetId: skillsGroupId,
8118
+ targetName: skillsGroupName,
8111
8119
  scenarioId: scenario.id,
8112
8120
  scenarioName: scenario.name,
8113
8121
  modelConfig: agent?.modelConfig,
@@ -8121,45 +8129,26 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8121
8129
  };
8122
8130
  }
8123
8131
 
8124
- // src/run-scenario/callAgent.ts
8125
- async function callAgent(config, scenario, agent, workDir) {
8126
- throw new Error("Agent execution not yet implemented");
8127
- }
8128
-
8129
8132
  // src/run-scenario/index.ts
8130
- function getTargetId(target) {
8131
- switch (target.type) {
8132
- case "skill":
8133
- return target.skill.id;
8134
- case "agent":
8135
- return target.agent.id;
8136
- }
8137
- }
8138
- async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
8139
- const targetId = getTargetId(target);
8133
+ async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
8134
+ const skillsGroupId = evalData.evalRun.skillsGroupId;
8140
8135
  const workDir = await prepareWorkingDirectory(
8141
8136
  config,
8142
8137
  evalRunId2,
8143
- targetId,
8138
+ skillsGroupId,
8144
8139
  scenario.id,
8145
8140
  template
8146
8141
  );
8147
- let partialResult;
8148
- switch (target.type) {
8149
- case "skill":
8150
- partialResult = await callSkill(
8151
- config,
8152
- evalRunId2,
8153
- scenario,
8154
- target.skill,
8155
- target.agent,
8156
- workDir
8157
- );
8158
- break;
8159
- case "agent":
8160
- partialResult = await callAgent(config, scenario, target.agent, workDir);
8161
- break;
8162
- }
8142
+ const partialResult = await runAgentWithContext(
8143
+ config,
8144
+ evalRunId2,
8145
+ scenario,
8146
+ evalData.skills,
8147
+ skillsGroupId,
8148
+ evalData.skillsGroupName,
8149
+ evalData.codeAgent ?? void 0,
8150
+ workDir
8151
+ );
8163
8152
  const inlineAssertions = scenario.assertions ?? [];
8164
8153
  const assertions = [
8165
8154
  ...inlineAssertions,
@@ -8390,60 +8379,60 @@ async function runEvaluation(projectId2, evalRunId2) {
8390
8379
  );
8391
8380
  }
8392
8381
  let completedScenarios = 0;
8393
- const totalScenarios = scenarioItems.length * skills.length;
8382
+ const totalScenarios = scenarioItems.length;
8394
8383
  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
8395
- for (const skill of skills) {
8396
- state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
8384
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
8385
+ state.currentContext = {
8386
+ projectId: projectId2,
8387
+ evalRunId: evalRunId2,
8388
+ scenarioId: scenario.id,
8389
+ scenarioName: scenario.name,
8390
+ skillsGroupId: evalData.evalRun.skillsGroupId,
8391
+ skillsGroupName: evalData.skillsGroupName,
8392
+ agentId: codeAgent?.id,
8393
+ agentName: codeAgent?.name,
8394
+ progress: `${completedScenarios + 1}/${totalScenarios}`
8395
+ };
8396
+ const skillNames = evalData.skills.map((s) => s.name).join(", ");
8397
+ console.log(
8398
+ "[Evaluator] Running scenario with skills group:",
8399
+ evalData.skillsGroupName,
8400
+ skillNames ? `(${skillNames})` : "",
8401
+ codeAgent ? `with agent: ${codeAgent.name}` : "",
8402
+ `(${completedScenarios + 1}/${totalScenarios})`
8403
+ );
8404
+ try {
8405
+ const result = await runScenario(
8406
+ config,
8407
+ evalRunId2,
8408
+ scenario,
8409
+ evalData,
8410
+ template,
8411
+ resolvedAssertions
8412
+ );
8413
+ console.log("[Evaluator] Scenario completed, adding result");
8414
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
8397
8415
  state.currentContext = {
8398
- projectId: projectId2,
8399
- evalRunId: evalRunId2,
8400
- scenarioId: scenario.id,
8401
- scenarioName: scenario.name,
8402
- skillId: skill.id,
8403
- skillName: skill.name,
8404
- agentId: codeAgent?.id,
8405
- agentName: codeAgent?.name,
8406
- progress: `${completedScenarios + 1}/${totalScenarios}`
8416
+ ...state.currentContext,
8417
+ resultId: result.id
8407
8418
  };
8408
- console.log(
8409
- "[Evaluator] Running skill:",
8410
- skill.name,
8411
- codeAgent ? `with agent: ${codeAgent.name}` : "",
8412
- `(${completedScenarios + 1}/${totalScenarios})`
8419
+ await api.addResult(projectId2, evalRunId2, result);
8420
+ completedScenarios++;
8421
+ } catch (err) {
8422
+ const errorMsg = err instanceof Error ? err.message : String(err);
8423
+ const errorStack = err instanceof Error ? err.stack : void 0;
8424
+ console.error(
8425
+ "[Evaluator] Failed to run scenario with skills group:",
8426
+ evalData.skillsGroupName,
8427
+ "Error:",
8428
+ errorMsg
8413
8429
  );
8414
- try {
8415
- const result = await runScenario(
8416
- config,
8417
- evalRunId2,
8418
- scenario,
8419
- { type: "skill", skill, agent: codeAgent ?? void 0 },
8420
- template,
8421
- resolvedAssertions
8422
- );
8423
- console.log("[Evaluator] Skill completed, adding result");
8424
- state.currentPhase = ExecutionPhase.ADD_RESULT;
8425
- state.currentContext = {
8426
- ...state.currentContext,
8427
- resultId: result.id
8428
- };
8429
- await api.addResult(projectId2, evalRunId2, result);
8430
- completedScenarios++;
8431
- } catch (err) {
8432
- const errorMsg = err instanceof Error ? err.message : String(err);
8433
- const errorStack = err instanceof Error ? err.stack : void 0;
8434
- console.error(
8435
- "[Evaluator] Failed to run skill:",
8436
- skill.name,
8437
- "Error:",
8438
- errorMsg
8439
- );
8440
- if (errorStack) {
8441
- console.error("[Evaluator] Stack trace:", errorStack);
8442
- }
8443
- throw new Error(
8444
- `[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
8445
- );
8430
+ if (errorStack) {
8431
+ console.error("[Evaluator] Stack trace:", errorStack);
8446
8432
  }
8433
+ throw new Error(
8434
+ `[${state.currentPhase}] Failed to execute skills group "${evalData.skillsGroupName}" on scenario "${scenario.name}": ${errorMsg}`
8435
+ );
8447
8436
  }
8448
8437
  }
8449
8438
  state.currentPhase = ExecutionPhase.UPDATE_STATUS;