@wix/evalforge-evaluator 0.54.0 → 0.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -287,11 +287,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
287
287
  codeAgent = await api.getAgent(projectId2, evalRun.agentId);
288
288
  }
289
289
  let skills = [];
290
+ let skillsGroup = null;
290
291
  if (evalRun.skillsGroupId) {
291
- const skillsGroup = await api.getSkillsGroup(
292
- projectId2,
293
- evalRun.skillsGroupId
294
- );
292
+ skillsGroup = await api.getSkillsGroup(projectId2, evalRun.skillsGroupId);
295
293
  if (skillsGroup.skillIds.length > 0) {
296
294
  skills = await Promise.all(
297
295
  skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
@@ -340,10 +338,13 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
340
338
  resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
341
339
  };
342
340
  });
341
+ const skillsGroupName = skillsGroup?.name ?? "";
343
342
  return {
344
343
  evalRun,
345
344
  codeAgent,
346
345
  skills,
346
+ skillsGroup,
347
+ skillsGroupName,
347
348
  scenarioItems
348
349
  };
349
350
  }
@@ -6224,7 +6225,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
6224
6225
  return workDir;
6225
6226
  }
6226
6227
 
6227
- // src/run-scenario/callSkill.ts
6228
+ // src/run-scenario/run-agent-with-context.ts
6228
6229
  var import_crypto2 = require("crypto");
6229
6230
 
6230
6231
  // src/run-scenario/agents/registry.ts
@@ -6533,10 +6534,11 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6533
6534
  outputPreview: `Message type: ${message.type}`
6534
6535
  };
6535
6536
  }
6536
- async function executeWithClaudeCode(skill, scenario, options) {
6537
+ async function executeWithClaudeCode(skills, scenario, options) {
6538
+ const skillNames = skills.map((s) => s.name).join(", ");
6537
6539
  console.log("[executeWithClaudeCode] Starting execution", {
6538
- skillId: skill.id,
6539
- skillName: skill.name,
6540
+ skillCount: skills.length,
6541
+ skillNames,
6540
6542
  scenarioId: scenario.id,
6541
6543
  scenarioName: scenario.name,
6542
6544
  cwd: options.cwd,
@@ -6572,22 +6574,23 @@ async function executeWithClaudeCode(skill, scenario, options) {
6572
6574
  const startTime = /* @__PURE__ */ new Date();
6573
6575
  const allMessages = [];
6574
6576
  console.error(
6575
- "[DEBUG-H4] writeSkillToFilesystem START",
6577
+ "[DEBUG-H4] writeSkillsToFilesystem START",
6576
6578
  JSON.stringify({
6577
6579
  cwd: options.cwd,
6578
- skillName: skill.name,
6580
+ skillCount: skills.length,
6581
+ skillNames: skills.map((s) => s.name),
6579
6582
  timestamp: Date.now()
6580
6583
  })
6581
6584
  );
6582
6585
  try {
6583
- await writeSkillToFilesystem(options.cwd, skill);
6586
+ await writeSkillsToFilesystem(options.cwd, skills);
6584
6587
  console.error(
6585
- "[DEBUG-H4] writeSkillToFilesystem SUCCESS",
6588
+ "[DEBUG-H4] writeSkillsToFilesystem SUCCESS",
6586
6589
  JSON.stringify({ timestamp: Date.now() })
6587
6590
  );
6588
6591
  } catch (writeError) {
6589
6592
  console.error(
6590
- "[DEBUG-H4] writeSkillToFilesystem FAILED",
6593
+ "[DEBUG-H4] writeSkillsToFilesystem FAILED",
6591
6594
  JSON.stringify({
6592
6595
  error: writeError instanceof Error ? writeError.message : String(writeError),
6593
6596
  stack: writeError instanceof Error ? writeError.stack : void 0,
@@ -6595,7 +6598,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
6595
6598
  })
6596
6599
  );
6597
6600
  throw new Error(
6598
- `Failed to write skill to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
6601
+ `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
6599
6602
  );
6600
6603
  }
6601
6604
  const sdkEnv = buildSdkEnvironment(options);
@@ -6631,7 +6634,10 @@ async function executeWithClaudeCode(skill, scenario, options) {
6631
6634
  }
6632
6635
  console.log("[SDK-DEBUG] PATH available:", !!sdkEnv.PATH);
6633
6636
  console.log("[SDK-DEBUG] HOME:", sdkEnv.HOME || "NOT SET");
6634
- console.log("[SDK-DEBUG] Skill:", skill.id, "-", skill.name);
6637
+ console.log(
6638
+ "[SDK-DEBUG] Skills:",
6639
+ skills.map((s) => `${s.id} - ${s.name}`).join(", ")
6640
+ );
6635
6641
  console.log("[SDK-DEBUG] Scenario:", scenario.id, "-", scenario.name);
6636
6642
  console.log(
6637
6643
  "[SDK-DEBUG] Prompt preview:",
@@ -6741,7 +6747,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
6741
6747
  timedOut = true;
6742
6748
  reject(
6743
6749
  new Error(
6744
- `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6750
+ `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6745
6751
  )
6746
6752
  );
6747
6753
  }, SDK_TIMEOUT_MS);
@@ -6949,8 +6955,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
6949
6955
  }
6950
6956
  }
6951
6957
  console.error("[SDK-ERROR] Execution context:");
6952
- console.error("[SDK-ERROR] skillId:", skill.id);
6953
- console.error("[SDK-ERROR] skillName:", skill.name);
6958
+ console.error("[SDK-ERROR] skillCount:", skills.length);
6959
+ console.error("[SDK-ERROR] skillNames:", skillNames);
6954
6960
  console.error("[SDK-ERROR] scenarioId:", scenario.id);
6955
6961
  console.error("[SDK-ERROR] scenarioName:", scenario.name);
6956
6962
  console.error("[SDK-ERROR] cwd:", options.cwd);
@@ -7010,7 +7016,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
7010
7016
  messageCount,
7011
7017
  errorName,
7012
7018
  errorMessage,
7013
- skillId: skill.id,
7019
+ skillCount: skills.length,
7020
+ skillNames,
7014
7021
  scenarioId: scenario.id,
7015
7022
  model: options.model || DEFAULT_MODEL,
7016
7023
  sdkEnv: sdkEnvDebug,
@@ -7103,13 +7110,15 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7103
7110
  llmTrace
7104
7111
  };
7105
7112
  }
7106
- async function writeSkillToFilesystem(cwd, skill) {
7107
- const skillName = skill.name;
7108
- const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
7109
- await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7110
- const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
7111
- await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
7112
- console.log(`[Skill] Written to ${skillPath}`);
7113
+ async function writeSkillsToFilesystem(cwd, skills) {
7114
+ for (const skill of skills) {
7115
+ const skillName = skill.name;
7116
+ const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
7117
+ await (0, import_promises3.mkdir)(skillDir, { recursive: true });
7118
+ const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
7119
+ await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
7120
+ console.log(`[Skill] Written to ${skillPath}`);
7121
+ }
7113
7122
  }
7114
7123
  function buildSdkEnvironment(options) {
7115
7124
  const env = { ...process.env };
@@ -7305,7 +7314,7 @@ var ClaudeCodeAdapter = class {
7305
7314
  */
7306
7315
  async execute(context) {
7307
7316
  const {
7308
- skill,
7317
+ skills,
7309
7318
  scenario,
7310
7319
  cwd,
7311
7320
  modelConfig,
@@ -7316,7 +7325,6 @@ var ClaudeCodeAdapter = class {
7316
7325
  const modelForSdk = modelConfig?.model ? import_evalforge_types3.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7317
7326
  const options = {
7318
7327
  cwd,
7319
- systemPrompt: skill.skillMd,
7320
7328
  model: modelForSdk,
7321
7329
  temperature: modelConfig?.temperature,
7322
7330
  maxTokens: modelConfig?.maxTokens,
@@ -7325,7 +7333,7 @@ var ClaudeCodeAdapter = class {
7325
7333
  traceContext
7326
7334
  };
7327
7335
  const { result, llmTrace } = await executeWithClaudeCode(
7328
- skill,
7336
+ skills,
7329
7337
  scenario,
7330
7338
  options
7331
7339
  );
@@ -7908,7 +7916,6 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
7908
7916
  var IGNORED_PATTERNS = [
7909
7917
  "node_modules",
7910
7918
  ".git",
7911
- ".claude",
7912
7919
  ".cursor",
7913
7920
  "dist",
7914
7921
  "build",
@@ -8074,15 +8081,15 @@ function extractTemplateFiles(before, after) {
8074
8081
  return files;
8075
8082
  }
8076
8083
 
8077
- // src/run-scenario/callSkill.ts
8084
+ // src/run-scenario/run-agent-with-context.ts
8078
8085
  var DEFAULT_AGENT_COMMAND = "claude";
8079
- async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8086
+ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
8080
8087
  const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
8081
8088
  const adapter = getAdapter(runCommand);
8082
8089
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
8083
8090
  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
8084
8091
  const executionContext = {
8085
- skill,
8092
+ skills,
8086
8093
  scenario,
8087
8094
  cwd: workDir || process.cwd(),
8088
8095
  modelConfig: agent?.modelConfig,
@@ -8092,8 +8099,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8092
8099
  evalRunId: evalRunId2,
8093
8100
  scenarioId: scenario.id,
8094
8101
  scenarioName: scenario.name,
8095
- targetId: skill.id,
8096
- targetName: skill.name,
8102
+ targetId: skillsGroupId,
8103
+ targetName: skillsGroupName,
8097
8104
  tracePushUrl: config.tracePushUrl,
8098
8105
  routeHeader: config.routeHeader,
8099
8106
  authToken: config.authToken
@@ -8106,8 +8113,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8106
8113
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
8107
8114
  return {
8108
8115
  id: (0, import_crypto2.randomUUID)(),
8109
- targetId: skill.id,
8110
- targetName: skill.name,
8116
+ targetId: skillsGroupId,
8117
+ targetName: skillsGroupName,
8111
8118
  scenarioId: scenario.id,
8112
8119
  scenarioName: scenario.name,
8113
8120
  modelConfig: agent?.modelConfig,
@@ -8121,45 +8128,26 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8121
8128
  };
8122
8129
  }
8123
8130
 
8124
- // src/run-scenario/callAgent.ts
8125
- async function callAgent(config, scenario, agent, workDir) {
8126
- throw new Error("Agent execution not yet implemented");
8127
- }
8128
-
8129
8131
  // src/run-scenario/index.ts
8130
- function getTargetId(target) {
8131
- switch (target.type) {
8132
- case "skill":
8133
- return target.skill.id;
8134
- case "agent":
8135
- return target.agent.id;
8136
- }
8137
- }
8138
- async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
8139
- const targetId = getTargetId(target);
8132
+ async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
8133
+ const skillsGroupId = evalData.evalRun.skillsGroupId;
8140
8134
  const workDir = await prepareWorkingDirectory(
8141
8135
  config,
8142
8136
  evalRunId2,
8143
- targetId,
8137
+ skillsGroupId,
8144
8138
  scenario.id,
8145
8139
  template
8146
8140
  );
8147
- let partialResult;
8148
- switch (target.type) {
8149
- case "skill":
8150
- partialResult = await callSkill(
8151
- config,
8152
- evalRunId2,
8153
- scenario,
8154
- target.skill,
8155
- target.agent,
8156
- workDir
8157
- );
8158
- break;
8159
- case "agent":
8160
- partialResult = await callAgent(config, scenario, target.agent, workDir);
8161
- break;
8162
- }
8141
+ const partialResult = await runAgentWithContext(
8142
+ config,
8143
+ evalRunId2,
8144
+ scenario,
8145
+ evalData.skills,
8146
+ skillsGroupId,
8147
+ evalData.skillsGroupName,
8148
+ evalData.codeAgent ?? void 0,
8149
+ workDir
8150
+ );
8163
8151
  const inlineAssertions = scenario.assertions ?? [];
8164
8152
  const assertions = [
8165
8153
  ...inlineAssertions,
@@ -8390,60 +8378,60 @@ async function runEvaluation(projectId2, evalRunId2) {
8390
8378
  );
8391
8379
  }
8392
8380
  let completedScenarios = 0;
8393
- const totalScenarios = scenarioItems.length * skills.length;
8381
+ const totalScenarios = scenarioItems.length;
8394
8382
  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
8395
- for (const skill of skills) {
8396
- state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
8383
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
8384
+ state.currentContext = {
8385
+ projectId: projectId2,
8386
+ evalRunId: evalRunId2,
8387
+ scenarioId: scenario.id,
8388
+ scenarioName: scenario.name,
8389
+ skillsGroupId: evalData.evalRun.skillsGroupId,
8390
+ skillsGroupName: evalData.skillsGroupName,
8391
+ agentId: codeAgent?.id,
8392
+ agentName: codeAgent?.name,
8393
+ progress: `${completedScenarios + 1}/${totalScenarios}`
8394
+ };
8395
+ const skillNames = evalData.skills.map((s) => s.name).join(", ");
8396
+ console.log(
8397
+ "[Evaluator] Running scenario with skills group:",
8398
+ evalData.skillsGroupName,
8399
+ skillNames ? `(${skillNames})` : "",
8400
+ codeAgent ? `with agent: ${codeAgent.name}` : "",
8401
+ `(${completedScenarios + 1}/${totalScenarios})`
8402
+ );
8403
+ try {
8404
+ const result = await runScenario(
8405
+ config,
8406
+ evalRunId2,
8407
+ scenario,
8408
+ evalData,
8409
+ template,
8410
+ resolvedAssertions
8411
+ );
8412
+ console.log("[Evaluator] Scenario completed, adding result");
8413
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
8397
8414
  state.currentContext = {
8398
- projectId: projectId2,
8399
- evalRunId: evalRunId2,
8400
- scenarioId: scenario.id,
8401
- scenarioName: scenario.name,
8402
- skillId: skill.id,
8403
- skillName: skill.name,
8404
- agentId: codeAgent?.id,
8405
- agentName: codeAgent?.name,
8406
- progress: `${completedScenarios + 1}/${totalScenarios}`
8415
+ ...state.currentContext,
8416
+ resultId: result.id
8407
8417
  };
8408
- console.log(
8409
- "[Evaluator] Running skill:",
8410
- skill.name,
8411
- codeAgent ? `with agent: ${codeAgent.name}` : "",
8412
- `(${completedScenarios + 1}/${totalScenarios})`
8418
+ await api.addResult(projectId2, evalRunId2, result);
8419
+ completedScenarios++;
8420
+ } catch (err) {
8421
+ const errorMsg = err instanceof Error ? err.message : String(err);
8422
+ const errorStack = err instanceof Error ? err.stack : void 0;
8423
+ console.error(
8424
+ "[Evaluator] Failed to run scenario with skills group:",
8425
+ evalData.skillsGroupName,
8426
+ "Error:",
8427
+ errorMsg
8413
8428
  );
8414
- try {
8415
- const result = await runScenario(
8416
- config,
8417
- evalRunId2,
8418
- scenario,
8419
- { type: "skill", skill, agent: codeAgent ?? void 0 },
8420
- template,
8421
- resolvedAssertions
8422
- );
8423
- console.log("[Evaluator] Skill completed, adding result");
8424
- state.currentPhase = ExecutionPhase.ADD_RESULT;
8425
- state.currentContext = {
8426
- ...state.currentContext,
8427
- resultId: result.id
8428
- };
8429
- await api.addResult(projectId2, evalRunId2, result);
8430
- completedScenarios++;
8431
- } catch (err) {
8432
- const errorMsg = err instanceof Error ? err.message : String(err);
8433
- const errorStack = err instanceof Error ? err.stack : void 0;
8434
- console.error(
8435
- "[Evaluator] Failed to run skill:",
8436
- skill.name,
8437
- "Error:",
8438
- errorMsg
8439
- );
8440
- if (errorStack) {
8441
- console.error("[Evaluator] Stack trace:", errorStack);
8442
- }
8443
- throw new Error(
8444
- `[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
8445
- );
8429
+ if (errorStack) {
8430
+ console.error("[Evaluator] Stack trace:", errorStack);
8446
8431
  }
8432
+ throw new Error(
8433
+ `[${state.currentPhase}] Failed to execute skills group "${evalData.skillsGroupName}" on scenario "${scenario.name}": ${errorMsg}`
8434
+ );
8447
8435
  }
8448
8436
  }
8449
8437
  state.currentPhase = ExecutionPhase.UPDATE_STATUS;