npm - @wix/evalforge-evaluator - Versions diffs - 0.55.0 → 0.57.0 - Mend

@wix/evalforge-evaluator 0.55.0 → 0.57.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/build/index.js +109 -120
package/build/index.js.map +3 -3
package/build/index.mjs +109 -120
package/build/index.mjs.map +3 -3
package/build/types/fetch-evaluation-data.d.ts +5 -2
package/build/types/run-scenario/agents/claude-code/execute.d.ts +7 -3
package/build/types/run-scenario/agents/claude-code/types.d.ts +0 -2
package/build/types/run-scenario/index.d.ts +5 -5
package/build/types/run-scenario/run-agent-with-context.d.ts +21 -0
package/build/types/run-scenario/types.d.ts +1 -13
package/package.json +3 -3
package/build/types/run-scenario/callAgent.d.ts +0 -13
package/build/types/run-scenario/callSkill.d.ts +0 -18

package/build/index.js CHANGED

@@ -287,11 +287,9 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
     codeAgent = await api.getAgent(projectId2, evalRun.agentId);
   }
   let skills = [];
+  let skillsGroup = null;
   if (evalRun.skillsGroupId) {
-    const skillsGroup = await api.getSkillsGroup(
-      projectId2,
-      evalRun.skillsGroupId
-    );
+    skillsGroup = await api.getSkillsGroup(projectId2, evalRun.skillsGroupId);
     if (skillsGroup.skillIds.length > 0) {
       skills = await Promise.all(
         skillsGroup.skillIds.map((id) => api.getSkill(projectId2, id))
@@ -340,10 +338,13 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
       resolvedAssertions: resolvedAssertions.length > 0 ? resolvedAssertions : void 0
     };
   });
+  const skillsGroupName = skillsGroup?.name ?? "";
   return {
     evalRun,
     codeAgent,
     skills,
+    skillsGroup,
+    skillsGroupName,
     scenarioItems
   };
 }
@@ -6172,9 +6173,10 @@ function cleanAppleDoubleFiles(dir) {
   }
 }
 async function downloadAndExtractTemplate(template, workDir) {
-  if (!(0, import_fs5.existsSync)(workDir)) {
-    (0, import_fs5.mkdirSync)(workDir, { recursive: true });
+  if ((0, import_fs5.existsSync)(workDir)) {
+    (0, import_fs5.rmSync)(workDir, { recursive: true });
   }
+  (0, import_fs5.mkdirSync)(workDir, { recursive: true });
   const response = await fetch(template.downloadUrl);
   if (!response.ok) {
     throw new Error(
@@ -6224,7 +6226,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
   return workDir;
 }
-// src/run-scenario/callSkill.ts
+// src/run-scenario/run-agent-with-context.ts
 var import_crypto2 = require("crypto");
 // src/run-scenario/agents/registry.ts
@@ -6533,10 +6535,11 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
     outputPreview: `Message type: ${message.type}`
   };
 }
-async function executeWithClaudeCode(skill, scenario, options) {
+async function executeWithClaudeCode(skills, scenario, options) {
+  const skillNames = skills.map((s) => s.name).join(", ");
   console.log("[executeWithClaudeCode] Starting execution", {
-    skillId: skill.id,
-    skillName: skill.name,
+    skillCount: skills.length,
+    skillNames,
     scenarioId: scenario.id,
     scenarioName: scenario.name,
     cwd: options.cwd,
@@ -6572,22 +6575,23 @@ async function executeWithClaudeCode(skill, scenario, options) {
   const startTime = /* @__PURE__ */ new Date();
   const allMessages = [];
   console.error(
-    "[DEBUG-H4] writeSkillToFilesystem START",
+    "[DEBUG-H4] writeSkillsToFilesystem START",
     JSON.stringify({
       cwd: options.cwd,
-      skillName: skill.name,
+      skillCount: skills.length,
+      skillNames: skills.map((s) => s.name),
       timestamp: Date.now()
     })
   );
   try {
-    await writeSkillToFilesystem(options.cwd, skill);
+    await writeSkillsToFilesystem(options.cwd, skills);
     console.error(
-      "[DEBUG-H4] writeSkillToFilesystem SUCCESS",
+      "[DEBUG-H4] writeSkillsToFilesystem SUCCESS",
       JSON.stringify({ timestamp: Date.now() })
     );
   } catch (writeError) {
     console.error(
-      "[DEBUG-H4] writeSkillToFilesystem FAILED",
+      "[DEBUG-H4] writeSkillsToFilesystem FAILED",
       JSON.stringify({
         error: writeError instanceof Error ? writeError.message : String(writeError),
         stack: writeError instanceof Error ? writeError.stack : void 0,
@@ -6595,7 +6599,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
       })
     );
     throw new Error(
-      `Failed to write skill to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
+      `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
     );
   }
   const sdkEnv = buildSdkEnvironment(options);
@@ -6631,7 +6635,10 @@ async function executeWithClaudeCode(skill, scenario, options) {
   }
   console.log("[SDK-DEBUG]   PATH available:", !!sdkEnv.PATH);
   console.log("[SDK-DEBUG]   HOME:", sdkEnv.HOME || "NOT SET");
-  console.log("[SDK-DEBUG] Skill:", skill.id, "-", skill.name);
+  console.log(
+    "[SDK-DEBUG] Skills:",
+    skills.map((s) => `${s.id} - ${s.name}`).join(", ")
+  );
   console.log("[SDK-DEBUG] Scenario:", scenario.id, "-", scenario.name);
   console.log(
     "[SDK-DEBUG] Prompt preview:",
@@ -6741,7 +6748,7 @@ async function executeWithClaudeCode(skill, scenario, options) {
         timedOut = true;
         reject(
           new Error(
-            `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skill: ${skill.name}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
+            `SDK execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, Messages received: ${messageCount}, MaxTurns: ${maxTurns}`
           )
         );
       }, SDK_TIMEOUT_MS);
@@ -6949,8 +6956,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
       }
     }
     console.error("[SDK-ERROR] Execution context:");
-    console.error("[SDK-ERROR]   skillId:", skill.id);
-    console.error("[SDK-ERROR]   skillName:", skill.name);
+    console.error("[SDK-ERROR]   skillCount:", skills.length);
+    console.error("[SDK-ERROR]   skillNames:", skillNames);
     console.error("[SDK-ERROR]   scenarioId:", scenario.id);
     console.error("[SDK-ERROR]   scenarioName:", scenario.name);
     console.error("[SDK-ERROR]   cwd:", options.cwd);
@@ -7010,7 +7017,8 @@ IMPORTANT: This is an automated evaluation run. Follow these guidelines:
       messageCount,
       errorName,
       errorMessage,
-      skillId: skill.id,
+      skillCount: skills.length,
+      skillNames,
       scenarioId: scenario.id,
       model: options.model || DEFAULT_MODEL,
       sdkEnv: sdkEnvDebug,
@@ -7103,13 +7111,15 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
     llmTrace
   };
 }
-async function writeSkillToFilesystem(cwd, skill) {
-  const skillName = skill.name;
-  const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
-  await (0, import_promises3.mkdir)(skillDir, { recursive: true });
-  const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
-  await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
-  console.log(`[Skill] Written to ${skillPath}`);
+async function writeSkillsToFilesystem(cwd, skills) {
+  for (const skill of skills) {
+    const skillName = skill.name;
+    const skillDir = (0, import_path5.join)(cwd, ".claude", "skills", skillName);
+    await (0, import_promises3.mkdir)(skillDir, { recursive: true });
+    const skillPath = (0, import_path5.join)(skillDir, "SKILL.md");
+    await (0, import_promises3.writeFile)(skillPath, skill.skillMd, "utf-8");
+    console.log(`[Skill] Written to ${skillPath}`);
+  }
 }
 function buildSdkEnvironment(options) {
   const env = { ...process.env };
@@ -7305,7 +7315,7 @@ var ClaudeCodeAdapter = class {
    */
   async execute(context) {
     const {
-      skill,
+      skills,
       scenario,
       cwd,
       modelConfig,
@@ -7316,7 +7326,6 @@ var ClaudeCodeAdapter = class {
     const modelForSdk = modelConfig?.model ? import_evalforge_types3.AVAILABLE_MODELS_MAP[modelConfig.model]?.providerModelId ?? modelConfig.model : void 0;
     const options = {
       cwd,
-      systemPrompt: skill.skillMd,
       model: modelForSdk,
       temperature: modelConfig?.temperature,
       maxTokens: modelConfig?.maxTokens,
@@ -7325,7 +7334,7 @@ var ClaudeCodeAdapter = class {
       traceContext
     };
     const { result, llmTrace } = await executeWithClaudeCode(
-      skill,
+      skills,
       scenario,
       options
     );
@@ -7908,7 +7917,6 @@ arrayDiff.join = arrayDiff.removeEmpty = function(value) {
 var IGNORED_PATTERNS = [
   "node_modules",
   ".git",
-  ".claude",
   ".cursor",
   "dist",
   "build",
@@ -8074,15 +8082,15 @@ function extractTemplateFiles(before, after) {
   return files;
 }
-// src/run-scenario/callSkill.ts
+// src/run-scenario/run-agent-with-context.ts
 var DEFAULT_AGENT_COMMAND = "claude";
-async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
+async function runAgentWithContext(config, evalRunId2, scenario, skills, skillsGroupId, skillsGroupName, agent, workDir) {
   const runCommand = agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
   const adapter = getAdapter(runCommand);
   const startedAt = (/* @__PURE__ */ new Date()).toISOString();
   const beforeSnapshot = workDir ? snapshotDirectory(workDir) : {};
   const executionContext = {
-    skill,
+    skills,
     scenario,
     cwd: workDir || process.cwd(),
     modelConfig: agent?.modelConfig,
@@ -8092,8 +8100,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
       evalRunId: evalRunId2,
       scenarioId: scenario.id,
       scenarioName: scenario.name,
-      targetId: skill.id,
-      targetName: skill.name,
+      targetId: skillsGroupId,
+      targetName: skillsGroupName,
       tracePushUrl: config.tracePushUrl,
       routeHeader: config.routeHeader,
       authToken: config.authToken
@@ -8106,8 +8114,8 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
   const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
   return {
     id: (0, import_crypto2.randomUUID)(),
-    targetId: skill.id,
-    targetName: skill.name,
+    targetId: skillsGroupId,
+    targetName: skillsGroupName,
     scenarioId: scenario.id,
     scenarioName: scenario.name,
     modelConfig: agent?.modelConfig,
@@ -8121,45 +8129,26 @@ async function callSkill(config, evalRunId2, scenario, skill, agent, workDir) {
   };
 }
-// src/run-scenario/callAgent.ts
-async function callAgent(config, scenario, agent, workDir) {
-  throw new Error("Agent execution not yet implemented");
-}
 // src/run-scenario/index.ts
-function getTargetId(target) {
-  switch (target.type) {
-    case "skill":
-      return target.skill.id;
-    case "agent":
-      return target.agent.id;
-  }
-}
-async function runScenario(config, evalRunId2, scenario, target, template, resolvedAssertions) {
-  const targetId = getTargetId(target);
+async function runScenario(config, evalRunId2, scenario, evalData, template, resolvedAssertions) {
+  const skillsGroupId = evalData.evalRun.skillsGroupId;
   const workDir = await prepareWorkingDirectory(
     config,
     evalRunId2,
-    targetId,
+    skillsGroupId,
     scenario.id,
     template
   );
-  let partialResult;
-  switch (target.type) {
-    case "skill":
-      partialResult = await callSkill(
-        config,
-        evalRunId2,
-        scenario,
-        target.skill,
-        target.agent,
-        workDir
-      );
-      break;
-    case "agent":
-      partialResult = await callAgent(config, scenario, target.agent, workDir);
-      break;
-  }
+  const partialResult = await runAgentWithContext(
+    config,
+    evalRunId2,
+    scenario,
+    evalData.skills,
+    skillsGroupId,
+    evalData.skillsGroupName,
+    evalData.codeAgent ?? void 0,
+    workDir
+  );
   const inlineAssertions = scenario.assertions ?? [];
   const assertions = [
     ...inlineAssertions,
@@ -8390,60 +8379,60 @@ async function runEvaluation(projectId2, evalRunId2) {
     );
   }
   let completedScenarios = 0;
-  const totalScenarios = scenarioItems.length * skills.length;
+  const totalScenarios = scenarioItems.length;
   for (const { scenario, template, resolvedAssertions } of scenarioItems) {
-    for (const skill of skills) {
-      state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
+    state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
+    state.currentContext = {
+      projectId: projectId2,
+      evalRunId: evalRunId2,
+      scenarioId: scenario.id,
+      scenarioName: scenario.name,
+      skillsGroupId: evalData.evalRun.skillsGroupId,
+      skillsGroupName: evalData.skillsGroupName,
+      agentId: codeAgent?.id,
+      agentName: codeAgent?.name,
+      progress: `${completedScenarios + 1}/${totalScenarios}`
+    };
+    const skillNames = evalData.skills.map((s) => s.name).join(", ");
+    console.log(
+      "[Evaluator] Running scenario with skills group:",
+      evalData.skillsGroupName,
+      skillNames ? `(${skillNames})` : "",
+      codeAgent ? `with agent: ${codeAgent.name}` : "",
+      `(${completedScenarios + 1}/${totalScenarios})`
+    );
+    try {
+      const result = await runScenario(
+        config,
+        evalRunId2,
+        scenario,
+        evalData,
+        template,
+        resolvedAssertions
+      );
+      console.log("[Evaluator] Scenario completed, adding result");
+      state.currentPhase = ExecutionPhase.ADD_RESULT;
       state.currentContext = {
-        projectId: projectId2,
-        evalRunId: evalRunId2,
-        scenarioId: scenario.id,
-        scenarioName: scenario.name,
-        skillId: skill.id,
-        skillName: skill.name,
-        agentId: codeAgent?.id,
-        agentName: codeAgent?.name,
-        progress: `${completedScenarios + 1}/${totalScenarios}`
+        ...state.currentContext,
+        resultId: result.id
       };
-      console.log(
-        "[Evaluator] Running skill:",
-        skill.name,
-        codeAgent ? `with agent: ${codeAgent.name}` : "",
-        `(${completedScenarios + 1}/${totalScenarios})`
+      await api.addResult(projectId2, evalRunId2, result);
+      completedScenarios++;
+    } catch (err) {
+      const errorMsg = err instanceof Error ? err.message : String(err);
+      const errorStack = err instanceof Error ? err.stack : void 0;
+      console.error(
+        "[Evaluator] Failed to run scenario with skills group:",
+        evalData.skillsGroupName,
+        "Error:",
+        errorMsg
       );
-      try {
-        const result = await runScenario(
-          config,
-          evalRunId2,
-          scenario,
-          { type: "skill", skill, agent: codeAgent ?? void 0 },
-          template,
-          resolvedAssertions
-        );
-        console.log("[Evaluator] Skill completed, adding result");
-        state.currentPhase = ExecutionPhase.ADD_RESULT;
-        state.currentContext = {
-          ...state.currentContext,
-          resultId: result.id
-        };
-        await api.addResult(projectId2, evalRunId2, result);
-        completedScenarios++;
-      } catch (err) {
-        const errorMsg = err instanceof Error ? err.message : String(err);
-        const errorStack = err instanceof Error ? err.stack : void 0;
-        console.error(
-          "[Evaluator] Failed to run skill:",
-          skill.name,
-          "Error:",
-          errorMsg
-        );
-        if (errorStack) {
-          console.error("[Evaluator] Stack trace:", errorStack);
-        }
-        throw new Error(
-          `[${state.currentPhase}] Failed to execute skill "${skill.name}" on scenario "${scenario.name}": ${errorMsg}`
-        );
+      if (errorStack) {
+        console.error("[Evaluator] Stack trace:", errorStack);
       }
+      throw new Error(
+        `[${state.currentPhase}] Failed to execute skills group "${evalData.skillsGroupName}" on scenario "${scenario.name}": ${errorMsg}`
+      );
     }
   }
   state.currentPhase = ExecutionPhase.UPDATE_STATUS;