@wix/evalforge-evaluator 0.55.0 → 0.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -267,11 +267,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
267
267
  codeAgent = await api.getAgent(projectId2, evalRun.agentId);
268
268
  }
269
269
  let skills = [];
270
+ let skillsGroup = null;
270
271
  if (evalRun.skillsGroupId) {
271
- const skillsGroup = await api.getSkillsGroup(
272
- projectId2,
273
- evalRun.skillsGroupId
274
- );
272
+ skillsGroup = await api.getSkillsGroup(projectId2, evalRun.skillsGroupId);
275
273
  if (skillsGroup.skillIds.length > 0) {
276
274
  skills = await Promise.all(
277
275
  skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
@@ -320,10 +318,13 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
320
318
  resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
321
319
  };
322
320
  });
321
+ const skillsGroupName = skillsGroup?.name ?? "";
323
322
  return {
324
323
  evalRun,
325
324
  codeAgent,
326
325
  skills,
326
+ skillsGroup,
327
+ skillsGroupName,
327
328
  scenarioItems
328
329
  };
329
330
  }
@@ -6206,7 +6207,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
6206
6207
  return workDir;
6207
6208
  }
6208
6209
 
6209
- // src/run-scenario/callSkill.ts
6210
+ // src/run-scenario/run-agent-with-context.ts
6210
6211
  import { randomUUID as randomUUID2 } from "crypto";
6211
6212
 
6212
6213
  // src/run-scenario/agents/registry.ts
@@ -6520,10 +6521,11 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
6520
6521
  outputPreview: `Message type: ${message.type}`
6521
6522
  };
6522
6523
  }
6523
- async function executeWithClaudeCode(skill, scenario, options) {
6524
+ async function executeWithClaudeCode(skills, scenario, options) {
6525
+ const skillNames = skills.map((s) => s.name).join(", ");
6524
6526
  console.log("[executeWithClaudeCode] Starting execution", {
6525
- skillId: skill.id,
6526
- skillName: skill.name,
6527
+ skillCount: skills.length,
6528
+ skillNames,
6527
6529
  scenarioId: scenario.id,
6528
6530
  scenarioName: scenario.name,
6529
6531
  cwd: options.cwd,
@@ -6559,22 +6561,23 @@ async function executeWithClaudeCode(skill, scenario, options) {
6559
6561
  const startTime = /* @__PURE__ */ new Date();
6560
6562
  const allMessages = [];
6561
6563
  console.error(
6562
- "[DEBUG-H4] writeSkillToFilesystem START",
6564
+ "[DEBUG-H4] writeSkillsToFilesystem START",
6563
6565
  JSON.stringify({
6564
6566
  cwd: options.cwd,
6565
- skillName: skill.name,
6567
+ skillCount: skills.length,
6568
+ skillNames: skills.map((s) => s.name),
6566
6569
  timestamp: Date.now()
6567
6570
  })
6568
6571
  );
6569
6572
  try {
6570
- await writeSkillToFilesystem(options.cwd, skill);
6573
+ await writeSkillsToFilesystem(options.cwd, skills);
6571
6574
  console.error(
6572
- "[DEBUG-H4] writeSkillToFilesystem SUCCESS",
6575
+ "[DEBUG-H4] writeSkillsToFilesystem SUCCESS",
6573
6576
  JSON.stringify({ timestamp: Date.now() })
6574
6577
  );
6575
6578
  } catch (writeError) {
6576
6579
  console.error(
6577
- "[DEBUG-H4] writeSkillToFilesystem FAILED",
6580
+ "[DEBUG-H4] writeSkillsToFilesystem FAILED",
6578
6581
  JSON.stringify({
6579
6582
  error: writeError instanceof Error ? writeError.message : String(writeError),
6580
6583
  stack: writeError instanceof Error ? writeError.stack : void 0,
@@ -6582,7 +6585,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
6582
6585
  })
6583
6586
  );
6584
6587
  throw new Error(
6585
- `Failed to write skill to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
6588
+ `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
6586
6589
  );
6587
6590
  }
6588
6591
  const sdkEnv = buildSdkEnvironment(options);
@@ -6618,7 +6621,10 @@ async function executeWithClaudeCode(skill, scenario, options) {
6618
6621
  }
6619
6622
  console.log("[SDK-DEBUG] PATH available:", !!sdkEnv.PATH);
6620
6623
  console.log("[SDK-DEBUG] HOME:", sdkEnv.HOME || "NOT SET");
6621
- console.log("[SDK-DEBUG] Skill:", skill.id, "-", skill.name);
6624
+ console.log(
6625
+ "[SDK-DEBUG] Skills:",
6626
+ skills.map((s) => `${s.id} - ${s.name}`).join(", ")
6627
+ );
6622
6628
  console.log("[SDK-DEBUG] Scenario:", scenario.id, "-", scenario.name);
6623
6629
  console.log(
6624
6630
  "[SDK-DEBUG] Prompt preview:",
@@ -6728,7 +6734,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
6728
6734
  timedOut = true;
6729
6735
  reject(
6730
6736
  new Error(
6731
- `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6737
+ `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
6732
6738
  )
6733
6739
  );
6734
6740
  }, SDK_TIMEOUT_MS);
@@ -6936,8 +6942,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
6936
6942
  }
6937
6943
  }
6938
6944
  console.error("[SDK-ERROR] Execution context:");
6939
- console.error("[SDK-ERROR] skillId:", skill.id);
6940
- console.error("[SDK-ERROR] skillName:", skill.name);
6945
+ console.error("[SDK-ERROR] skillCount:", skills.length);
6946
+ console.error("[SDK-ERROR] skillNames:", skillNames);
6941
6947
  console.error("[SDK-ERROR] scenarioId:", scenario.id);
6942
6948
  console.error("[SDK-ERROR] scenarioName:", scenario.name);
6943
6949
  console.error("[SDK-ERROR] cwd:", options.cwd);
@@ -6997,7 +7003,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
6997
7003
  messageCount,
6998
7004
  errorName,
6999
7005
  errorMessage,
7000
- skillId: skill.id,
7006
+ skillCount: skills.length,
7007
+ skillNames,
7001
7008
  scenarioId: scenario.id,
7002
7009
  model: options.model || DEFAULT_MODEL,
7003
7010
  sdkEnv: sdkEnvDebug,
@@ -7090,13 +7097,15 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
7090
7097
  llmTrace
7091
7098
  };
7092
7099
  }
7093
- async function writeSkillToFilesystem(cwd, skill) {
7094
- const skillName = skill.name;
7095
- const skillDir = join2(cwd, ".claude", "skills", skillName);
7096
- await mkdir2(skillDir, { recursive: true });
7097
- const skillPath = join2(skillDir, "SKILL.md");
7098
- await writeFile(skillPath, skill.skillMd, "utf-8");
7099
- console.log(`[Skill] Written to ${skillPath}`);
7100
+ async function writeSkillsToFilesystem(cwd, skills) {
7101
+ for (const skill of skills) {
7102
+ const skillName = skill.name;
7103
+ const skillDir = join2(cwd, ".claude", "skills", skillName);
7104
+ await mkdir2(skillDir, { recursive: true });
7105
+ const skillPath = join2(skillDir, "SKILL.md");
7106
+ await writeFile(skillPath, skill.skillMd, "utf-8");
7107
+ console.log(`[Skill] Written to ${skillPath}`);
7108
+ }
7100
7109
  }
7101
7110
  function buildSdkEnvironment(options) {
7102
7111
  const env = { ...process.env };
@@ -7292,7 +7301,7 @@ var ClaudeCodeAdapter = class {
7292
7301
  */
7293
7302
  async execute(context) {
7294
7303
  const {
7295
- skill,
7304
+ skills,
7296
7305
  scenario,
7297
7306
  cwd,
7298
7307
  modelConfig,
@@ -7303,7 +7312,6 @@ var ClaudeCodeAdapter = class {
7303
7312
  const modelForSdk = modelConfig?.model ? AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
7304
7313
  const options = {
7305
7314
  cwd,
7306
- systemPrompt: skill.skillMd,
7307
7315
  model: modelForSdk,
7308
7316
  temperature: modelConfig?.temperature,
7309
7317
  maxTokens: modelConfig?.maxTokens,
@@ -7312,7 +7320,7 @@ var ClaudeCodeAdapter = class {
7312
7320
  traceContext
7313
7321
  };
7314
7322
  const { result, llmTrace } = await executeWithClaudeCode(
7315
- skill,
7323
+ skills,
7316
7324
  scenario,
7317
7325
  options
7318
7326
  );
@@ -7895,7 +7903,6 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
7895
7903
  var IGNORED_PATTERNS = [
7896
7904
  "node_modules",
7897
7905
  ".git",
7898
- ".claude",
7899
7906
  ".cursor",
7900
7907
  "dist",
7901
7908
  "build",
@@ -8061,15 +8068,15 @@ function extractTemplateFiles(before, after) {
8061
8068
  return files;
8062
8069
  }
8063
8070
 
8064
- // src/run-scenario/callSkill.ts
8071
+ // src/run-scenario/run-agent-with-context.ts
8065
8072
  var DEFAULT_AGENT_COMMAND = "claude";
8066
- async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8073
+ async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
8067
8074
  const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
8068
8075
  const adapter = getAdapter(runCommand);
8069
8076
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
8070
8077
  const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
8071
8078
  const executionContext = {
8072
- skill,
8079
+ skills,
8073
8080
  scenario,
8074
8081
  cwd: workDir || process.cwd(),
8075
8082
  modelConfig: agent?.modelConfig,
@@ -8079,8 +8086,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8079
8086
  evalRunId: evalRunId2,
8080
8087
  scenarioId: scenario.id,
8081
8088
  scenarioName: scenario.name,
8082
- targetId: skill.id,
8083
- targetName: skill.name,
8089
+ targetId: skillsGroupId,
8090
+ targetName: skillsGroupName,
8084
8091
  tracePushUrl: config.tracePushUrl,
8085
8092
  routeHeader: config.routeHeader,
8086
8093
  authToken: config.authToken
@@ -8093,8 +8100,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8093
8100
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
8094
8101
  return {
8095
8102
  id: randomUUID2(),
8096
- targetId: skill.id,
8097
- targetName: skill.name,
8103
+ targetId: skillsGroupId,
8104
+ targetName: skillsGroupName,
8098
8105
  scenarioId: scenario.id,
8099
8106
  scenarioName: scenario.name,
8100
8107
  modelConfig: agent?.modelConfig,
@@ -8108,45 +8115,26 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
8108
8115
  };
8109
8116
  }
8110
8117
 
8111
- // src/run-scenario/callAgent.ts
8112
- async function callAgent(config, scenario, agent, workDir) {
8113
- throw new Error("Agent execution not yet implemented");
8114
- }
8115
-
8116
8118
  // src/run-scenario/index.ts
8117
- function getTargetId(target) {
8118
- switch (target.type) {
8119
- case "skill":
8120
- return target.skill.id;
8121
- case "agent":
8122
- return target.agent.id;
8123
- }
8124
- }
8125
- async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
8126
- const targetId = getTargetId(target);
8119
+ async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
8120
+ const skillsGroupId = evalData.evalRun.skillsGroupId;
8127
8121
  const workDir = await prepareWorkingDirectory(
8128
8122
  config,
8129
8123
  evalRunId2,
8130
- targetId,
8124
+ skillsGroupId,
8131
8125
  scenario.id,
8132
8126
  template
8133
8127
  );
8134
- let partialResult;
8135
- switch (target.type) {
8136
- case "skill":
8137
- partialResult = await callSkill(
8138
- config,
8139
- evalRunId2,
8140
- scenario,
8141
- target.skill,
8142
- target.agent,
8143
- workDir
8144
- );
8145
- break;
8146
- case "agent":
8147
- partialResult = await callAgent(config, scenario, target.agent, workDir);
8148
- break;
8149
- }
8128
+ const partialResult = await runAgentWithContext(
8129
+ config,
8130
+ evalRunId2,
8131
+ scenario,
8132
+ evalData.skills,
8133
+ skillsGroupId,
8134
+ evalData.skillsGroupName,
8135
+ evalData.codeAgent ?? void 0,
8136
+ workDir
8137
+ );
8150
8138
  const inlineAssertions = scenario.assertions ?? [];
8151
8139
  const assertions = [
8152
8140
  ...inlineAssertions,
@@ -8377,60 +8365,60 @@ async function runEvaluation(projectId2, evalRunId2) {
8377
8365
  );
8378
8366
  }
8379
8367
  let completedScenarios = 0;
8380
- const totalScenarios = scenarioItems.length * skills.length;
8368
+ const totalScenarios = scenarioItems.length;
8381
8369
  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
8382
- for (const skill of skills) {
8383
- state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
8370
+ state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
8371
+ state.currentContext = {
8372
+ projectId: projectId2,
8373
+ evalRunId: evalRunId2,
8374
+ scenarioId: scenario.id,
8375
+ scenarioName: scenario.name,
8376
+ skillsGroupId: evalData.evalRun.skillsGroupId,
8377
+ skillsGroupName: evalData.skillsGroupName,
8378
+ agentId: codeAgent?.id,
8379
+ agentName: codeAgent?.name,
8380
+ progress: `${completedScenarios + 1}/${totalScenarios}`
8381
+ };
8382
+ const skillNames = evalData.skills.map((s) => s.name).join(", ");
8383
+ console.log(
8384
+ "[Evaluator] Running scenario with skills group:",
8385
+ evalData.skillsGroupName,
8386
+ skillNames ? `(${skillNames})` : "",
8387
+ codeAgent ? `with agent: ${codeAgent.name}` : "",
8388
+ `(${completedScenarios + 1}/${totalScenarios})`
8389
+ );
8390
+ try {
8391
+ const result = await runScenario(
8392
+ config,
8393
+ evalRunId2,
8394
+ scenario,
8395
+ evalData,
8396
+ template,
8397
+ resolvedAssertions
8398
+ );
8399
+ console.log("[Evaluator] Scenario completed, adding result");
8400
+ state.currentPhase = ExecutionPhase.ADD_RESULT;
8384
8401
  state.currentContext = {
8385
- projectId: projectId2,
8386
- evalRunId: evalRunId2,
8387
- scenarioId: scenario.id,
8388
- scenarioName: scenario.name,
8389
- skillId: skill.id,
8390
- skillName: skill.name,
8391
- agentId: codeAgent?.id,
8392
- agentName: codeAgent?.name,
8393
- progress: `${completedScenarios + 1}/${totalScenarios}`
8402
+ ...state.currentContext,
8403
+ resultId: result.id
8394
8404
  };
8395
- console.log(
8396
- "[Evaluator] Running skill:",
8397
- skill.name,
8398
- codeAgent ? `with agent: ${codeAgent.name}` : "",
8399
- `(${completedScenarios + 1}/${totalScenarios})`
8405
+ await api.addResult(projectId2, evalRunId2, result);
8406
+ completedScenarios++;
8407
+ } catch (err) {
8408
+ const errorMsg = err instanceof Error ? err.message : String(err);
8409
+ const errorStack = err instanceof Error ? err.stack : void 0;
8410
+ console.error(
8411
+ "[Evaluator] Failed to run scenario with skills group:",
8412
+ evalData.skillsGroupName,
8413
+ "Error:",
8414
+ errorMsg
8400
8415
  );
8401
- try {
8402
- const result = await runScenario(
8403
- config,
8404
- evalRunId2,
8405
- scenario,
8406
- { type: "skill", skill, agent: codeAgent ?? void 0 },
8407
- template,
8408
- resolvedAssertions
8409
- );
8410
- console.log("[Evaluator] Skill completed, adding result");
8411
- state.currentPhase = ExecutionPhase.ADD_RESULT;
8412
- state.currentContext = {
8413
- ...state.currentContext,
8414
- resultId: result.id
8415
- };
8416
- await api.addResult(projectId2, evalRunId2, result);
8417
- completedScenarios++;
8418
- } catch (err) {
8419
- const errorMsg = err instanceof Error ? err.message : String(err);
8420
- const errorStack = err instanceof Error ? err.stack : void 0;
8421
- console.error(
8422
- "[Evaluator] Failed to run skill:",
8423
- skill.name,
8424
- "Error:",
8425
- errorMsg
8426
- );
8427
- if (errorStack) {
8428
- console.error("[Evaluator] Stack trace:", errorStack);
8429
- }
8430
- throw new Error(
8431
- `[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
8432
- );
8416
+ if (errorStack) {
8417
+ console.error("[Evaluator] Stack trace:", errorStack);
8433
8418
  }
8419
+ throw new Error(
8420
+ `[${state.currentPhase}] Failed to execute skills group "${evalData.skillsGroupName}" on scenario "${scenario.name}": ${errorMsg}`
8421
+ );
8434
8422
  }
8435
8423
  }
8436
8424
  state.currentPhase = ExecutionPhase.UPDATE_STATUS;