@wix/evalforge-evaluator 0.111.0 → 0.113.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -24,7 +24,7 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/index.ts
27
- var import_evalforge_types10 = require("@wix/evalforge-types");
27
+ var import_evalforge_types15 = require("@wix/evalforge-types");
28
28
 
29
29
  // src/config.ts
30
30
  function loadConfig() {
@@ -509,7 +509,7 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
509
509
  }
510
510
 
511
511
  // src/run-scenario/index.ts
512
- var import_evalforge_types8 = require("@wix/evalforge-types");
512
+ var import_evalforge_types13 = require("@wix/evalforge-types");
513
513
  var import_eval_assertions = require("@wix/eval-assertions");
514
514
 
515
515
  // src/run-scenario/environment.ts
@@ -596,7 +596,7 @@ async function prepareWorkingDirectory(config, evalRunId2, targetId, scenarioId,
596
596
  }
597
597
 
598
598
  // src/run-scenario/run-agent-with-context.ts
599
- var import_crypto3 = require("crypto");
599
+ var import_crypto4 = require("crypto");
600
600
 
601
601
  // src/run-scenario/agents/registry.ts
602
602
  var AgentAdapterRegistry = class {
@@ -1222,10 +1222,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
1222
1222
  }
1223
1223
  const startTime = /* @__PURE__ */ new Date();
1224
1224
  const allMessages = [];
1225
- const { mkdir: mkdirAsync, writeFile: writeFile5 } = await import("fs/promises");
1225
+ const { mkdir: mkdirAsync, writeFile: writeFile6 } = await import("fs/promises");
1226
1226
  const claudeDir = `${options.cwd}/.claude`;
1227
1227
  await mkdirAsync(claudeDir, { recursive: true });
1228
- await writeFile5(`${claudeDir}/settings.json`, "{}", {
1228
+ await writeFile6(`${claudeDir}/settings.json`, "{}", {
1229
1229
  flag: "wx"
1230
1230
  }).catch(() => {
1231
1231
  });
@@ -1261,7 +1261,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
1261
1261
  "Edit",
1262
1262
  "Bash",
1263
1263
  "Glob",
1264
- "Grep"
1264
+ "Grep",
1265
+ "Agent",
1266
+ "WebFetch",
1267
+ "WebSearch"
1265
1268
  ];
1266
1269
  const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
1267
1270
  const queryOptions = {
@@ -1896,13 +1899,15 @@ function extractTotalUsage(result) {
1896
1899
  }
1897
1900
  function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1898
1901
  const totalCost = usage.costUsd ?? 0;
1899
- const totalStepInputTokens = steps.reduce(
1900
- (sum, s) => sum + s.usage.inputTokens,
1902
+ const effectiveInput = (s) => s.usage.inputTokens + (s.usage.cacheReadTokens ?? 0) + (s.usage.cacheWriteTokens ?? 0);
1903
+ const totalStepEffectiveInput = steps.reduce(
1904
+ (sum, s) => sum + effectiveInput(s),
1901
1905
  0
1902
1906
  );
1903
1907
  const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
1904
- const inputTokensDuplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
1905
- const traceSteps = steps.map((step, index) => {
1908
+ const authoritativeEffectiveInput = usage.inputTokens + (usage.cacheReadTokens ?? 0) + (usage.cacheWriteTokens ?? 0);
1909
+ const inputTokensDuplicated = authoritativeEffectiveInput > 0 && totalStepEffectiveInput > authoritativeEffectiveInput * 1.2;
1910
+ const traceSteps = steps.flatMap((step, turnIndex) => {
1906
1911
  let stepPromptTokens;
1907
1912
  let stepOutputTokens;
1908
1913
  let proportion;
@@ -1911,34 +1916,128 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1911
1916
  stepPromptTokens = Math.round(usage.inputTokens * proportion);
1912
1917
  stepOutputTokens = Math.round(usage.outputTokens * proportion);
1913
1918
  } else {
1914
- proportion = totalStepInputTokens > 0 ? step.usage.inputTokens / totalStepInputTokens : 0;
1915
- stepPromptTokens = step.usage.inputTokens;
1919
+ const stepEffective = effectiveInput(step);
1920
+ proportion = totalStepEffectiveInput > 0 ? stepEffective / totalStepEffectiveInput : 0;
1921
+ stepPromptTokens = Math.round(usage.inputTokens * proportion);
1916
1922
  stepOutputTokens = Math.round(usage.outputTokens * proportion);
1917
1923
  }
1918
- const stepTotalTokens = stepPromptTokens + stepOutputTokens;
1919
1924
  const costProportion = proportion;
1920
- const stepType = step.toolCalls?.length ? import_evalforge_types4.LLMStepType.TOOL_USE : step.thinking && !step.text ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
1921
- return {
1922
- id: (0, import_crypto.randomUUID)(),
1923
- stepNumber: index + 1,
1924
- type: stepType,
1925
- model,
1926
- provider: "anthropic",
1927
- startedAt: step.startedAt.toISOString(),
1928
- durationMs: step.durationMs,
1929
- tokenUsage: {
1930
- prompt: stepPromptTokens,
1931
- completion: stepOutputTokens,
1932
- total: stepTotalTokens
1933
- },
1934
- costUsd: totalCost * costProportion,
1935
- toolName: step.toolCalls?.[0]?.toolName,
1936
- toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
1937
- outputPreview: (step.text || step.thinking)?.slice(0, 200),
1938
- success: step.finishReason !== "error" && !step.hasToolError,
1939
- error: step.finishReason === "error" ? "Generation failed" : step.hasToolError ? step.toolErrorContent ?? "Tool call failed" : void 0
1940
- };
1941
- });
1925
+ const toolCallCount = step.toolCalls?.length ?? 0;
1926
+ const isSuccess = step.finishReason !== "error" && !step.hasToolError;
1927
+ const errorMsg = step.hasToolError ? step.toolErrorContent ?? "Tool call failed" : step.finishReason === "error" ? "Generation failed" : void 0;
1928
+ const subSteps = [];
1929
+ const stepCost = totalCost * costProportion;
1930
+ const hasThinking = !!step.thinking;
1931
+ const hasText = !!step.text;
1932
+ const thinkingSubSteps = hasThinking ? 1 : 0;
1933
+ const toolSubSteps = toolCallCount > 0 ? toolCallCount : 0;
1934
+ const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
1935
+ const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
1936
+ if (hasThinking && (hasText || toolCallCount > 0)) {
1937
+ subSteps.push({
1938
+ id: (0, import_crypto.randomUUID)(),
1939
+ stepNumber: 0,
1940
+ // renumbered below
1941
+ turnIndex,
1942
+ type: import_evalforge_types4.LLMStepType.THINKING,
1943
+ model,
1944
+ provider: "anthropic",
1945
+ startedAt: step.startedAt.toISOString(),
1946
+ durationMs: Math.round(step.durationMs / totalSubSteps),
1947
+ tokenUsage: {
1948
+ prompt: Math.round(stepPromptTokens / totalSubSteps),
1949
+ completion: Math.round(stepOutputTokens / totalSubSteps),
1950
+ total: Math.round(
1951
+ (stepPromptTokens + stepOutputTokens) / totalSubSteps
1952
+ )
1953
+ },
1954
+ costUsd: stepCost / totalSubSteps,
1955
+ outputPreview: step.thinking?.slice(0, 200),
1956
+ success: isSuccess,
1957
+ error: errorMsg
1958
+ });
1959
+ }
1960
+ if (toolCallCount > 0) {
1961
+ for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
1962
+ const tc = step.toolCalls[tcIdx];
1963
+ const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
1964
+ const toolBudgetSteps = toolSubSteps + textSubSteps;
1965
+ const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
1966
+ const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
1967
+ subSteps.push({
1968
+ id: (0, import_crypto.randomUUID)(),
1969
+ stepNumber: 0,
1970
+ turnIndex,
1971
+ type: import_evalforge_types4.LLMStepType.TOOL_USE,
1972
+ model,
1973
+ provider: "anthropic",
1974
+ startedAt: step.startedAt.toISOString(),
1975
+ durationMs: isLast ? step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(step.durationMs * remainingFraction * toolFraction),
1976
+ tokenUsage: {
1977
+ prompt: Math.round(
1978
+ stepPromptTokens * remainingFraction * toolFraction
1979
+ ),
1980
+ completion: Math.round(
1981
+ stepOutputTokens * remainingFraction * toolFraction
1982
+ ),
1983
+ total: Math.round(
1984
+ (stepPromptTokens + stepOutputTokens) * remainingFraction * toolFraction
1985
+ )
1986
+ },
1987
+ costUsd: stepCost * remainingFraction * toolFraction,
1988
+ toolName: tc.toolName,
1989
+ toolArguments: JSON.stringify(tc.args),
1990
+ outputPreview: tcIdx === 0 && !hasText ? (step.text || step.thinking)?.slice(0, 200) : void 0,
1991
+ success: isSuccess,
1992
+ error: errorMsg
1993
+ });
1994
+ }
1995
+ }
1996
+ if (hasText && toolCallCount > 0) {
1997
+ subSteps.push({
1998
+ id: (0, import_crypto.randomUUID)(),
1999
+ stepNumber: 0,
2000
+ turnIndex,
2001
+ type: import_evalforge_types4.LLMStepType.COMPLETION,
2002
+ model,
2003
+ provider: "anthropic",
2004
+ startedAt: step.startedAt.toISOString(),
2005
+ durationMs: step.durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
2006
+ tokenUsage: {
2007
+ prompt: stepPromptTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
2008
+ completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
2009
+ total: stepPromptTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
2010
+ },
2011
+ costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
2012
+ outputPreview: step.text?.slice(0, 200),
2013
+ success: isSuccess,
2014
+ error: errorMsg
2015
+ });
2016
+ }
2017
+ if (subSteps.length === 0) {
2018
+ const stepType = hasThinking && !hasText ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
2019
+ subSteps.push({
2020
+ id: (0, import_crypto.randomUUID)(),
2021
+ stepNumber: 0,
2022
+ turnIndex,
2023
+ type: stepType,
2024
+ model,
2025
+ provider: "anthropic",
2026
+ startedAt: step.startedAt.toISOString(),
2027
+ durationMs: step.durationMs,
2028
+ tokenUsage: {
2029
+ prompt: stepPromptTokens,
2030
+ completion: stepOutputTokens,
2031
+ total: stepPromptTokens + stepOutputTokens
2032
+ },
2033
+ costUsd: stepCost,
2034
+ outputPreview: (step.text || step.thinking)?.slice(0, 200),
2035
+ success: isSuccess,
2036
+ error: errorMsg
2037
+ });
2038
+ }
2039
+ return subSteps;
2040
+ }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
1942
2041
  const finalTokens = {
1943
2042
  prompt: usage.inputTokens,
1944
2043
  completion: usage.outputTokens,
@@ -1960,6 +2059,7 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1960
2059
  }
1961
2060
  const summary = {
1962
2061
  totalSteps: traceSteps.length,
2062
+ totalTurns: steps.length,
1963
2063
  totalDurationMs,
1964
2064
  totalTokens: finalTokens,
1965
2065
  totalCostUsd: totalCost,
@@ -2045,12 +2145,1055 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
2045
2145
  // src/run-scenario/agents/claude-code/index.ts
2046
2146
  defaultRegistry.register(claudeCodeAdapter);
2047
2147
 
2148
+ // src/run-scenario/agents/opencode/opencode-adapter.ts
2149
+ var import_evalforge_types9 = require("@wix/evalforge-types");
2150
+
2151
+ // src/run-scenario/agents/opencode/execute.ts
2152
+ var import_evalforge_types8 = require("@wix/evalforge-types");
2153
+
2154
+ // src/run-scenario/agents/opencode/write-skills.ts
2155
+ var import_promises7 = require("fs/promises");
2156
+ var import_path8 = require("path");
2157
+ var import_evalforge_github_client4 = require("@wix/evalforge-github-client");
2158
+ async function writeSkillsToFilesystem2(cwd, skills, fetchFn = import_evalforge_github_client4.fetchGitHubFolder) {
2159
+ await Promise.all(
2160
+ skills.map((skill) => writeSkillToFilesystem2(cwd, skill, fetchFn))
2161
+ );
2162
+ }
2163
+ async function writeSkillToFilesystem2(cwd, skill, fetchFn) {
2164
+ const skillName = skill.name;
2165
+ const skillDir = (0, import_path8.join)(cwd, ".opencode", "skills", skillName);
2166
+ await (0, import_promises7.mkdir)(skillDir, { recursive: true });
2167
+ const version = skill.latestVersion;
2168
+ if (version?.files && version.files.length > 0) {
2169
+ await writeFilesToDirectory(skillDir, version.files);
2170
+ console.log(
2171
+ `[Skill] ${skillName}: wrote ${version.files.length} file(s) from snapshot`
2172
+ );
2173
+ } else if (skill.source) {
2174
+ try {
2175
+ const files = await fetchFn(skill.source, {
2176
+ userAgent: "EvalForge-Evaluator"
2177
+ });
2178
+ await writeFilesToDirectory(skillDir, files);
2179
+ console.log(
2180
+ `[Skill] ${skillName}: wrote ${files.length} file(s) from GitHub (live)`
2181
+ );
2182
+ } catch (error) {
2183
+ const message = error instanceof Error ? error.message : "Unknown error";
2184
+ console.error(
2185
+ `[Skill] ${skillName}: GitHub fetch failed: ${message}, no files to fall back to`
2186
+ );
2187
+ throw new Error(
2188
+ `Failed to write skill ${skillName} to filesystem: ${message}`
2189
+ );
2190
+ }
2191
+ } else {
2192
+ throw new Error(`Skill ${skillName} has no files and no source configured`);
2193
+ }
2194
+ }
2195
+
2196
+ // src/run-scenario/agents/opencode/write-sub-agents.ts
2197
+ var import_promises8 = require("fs/promises");
2198
+ var import_path9 = require("path");
2199
+ var import_evalforge_github_client5 = require("@wix/evalforge-github-client");
2200
+ var AGENTS_DIR2 = ".opencode/agents";
2201
+ function toAgentFilename2(name, index, nameCount) {
2202
+ const base = (name || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
2203
+ const count = nameCount.get(base) ?? 0;
2204
+ nameCount.set(base, count + 1);
2205
+ return count === 0 ? base : `${base}-${count + 1}`;
2206
+ }
2207
+ async function resolveSubAgentContent2(agent, fetchFn) {
2208
+ if (agent.source) {
2209
+ try {
2210
+ const content = await fetchFn(agent.source, {
2211
+ userAgent: "EvalForge-Evaluator"
2212
+ });
2213
+ console.log(
2214
+ `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
2215
+ );
2216
+ return content;
2217
+ } catch (error) {
2218
+ const message = error instanceof Error ? error.message : "Unknown error";
2219
+ console.error(
2220
+ `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
2221
+ );
2222
+ throw new Error(
2223
+ `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`
2224
+ );
2225
+ }
2226
+ }
2227
+ if (!agent.subAgentMd) {
2228
+ console.warn(
2229
+ `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
2230
+ );
2231
+ }
2232
+ return agent.subAgentMd;
2233
+ }
2234
+ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_evalforge_github_client5.fetchGitHubFile) {
2235
+ if (subAgents.length === 0) return;
2236
+ const agentsDir = (0, import_path9.join)(cwd, AGENTS_DIR2);
2237
+ await (0, import_promises8.mkdir)(agentsDir, { recursive: true });
2238
+ const nameCount = /* @__PURE__ */ new Map();
2239
+ for (const [i, agent] of subAgents.entries()) {
2240
+ const filename = toAgentFilename2(agent.name, i, nameCount);
2241
+ const filePath = (0, import_path9.join)(agentsDir, `${filename}.md`);
2242
+ const content = await resolveSubAgentContent2(agent, fetchFn);
2243
+ await (0, import_promises8.writeFile)(filePath, content, "utf8");
2244
+ }
2245
+ console.log(`[SubAgents] Written to ${agentsDir}`);
2246
+ }
2247
+
2248
+ // src/run-scenario/agents/opencode/config.ts
2249
+ var import_evalforge_types6 = require("@wix/evalforge-types");
2250
+ var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
2251
+ function parseModel(model) {
2252
+ const slashIndex = model.indexOf("/");
2253
+ if (slashIndex > 0) {
2254
+ return {
2255
+ providerID: model.slice(0, slashIndex),
2256
+ modelID: model.slice(slashIndex + 1)
2257
+ };
2258
+ }
2259
+ const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
2260
+ model
2261
+ );
2262
+ return { providerID: isOpenAI ? "openai" : "anthropic", modelID: model };
2263
+ }
2264
+ function toOpenCodeMcpConfig(servers) {
2265
+ const result = {};
2266
+ for (const [name, entry] of Object.entries(servers)) {
2267
+ if (entry.type === "local" || entry.type === "remote") {
2268
+ result[name] = entry;
2269
+ continue;
2270
+ }
2271
+ if (entry.url && typeof entry.url === "string") {
2272
+ result[name] = {
2273
+ type: "remote",
2274
+ url: entry.url,
2275
+ ...entry.headers ? { headers: entry.headers } : {},
2276
+ ...typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {}
2277
+ };
2278
+ continue;
2279
+ }
2280
+ if (entry.command && typeof entry.command === "string") {
2281
+ const commandArray = [
2282
+ entry.command,
2283
+ ...entry.args || []
2284
+ ];
2285
+ result[name] = {
2286
+ type: "local",
2287
+ command: commandArray,
2288
+ ...entry.env ? { environment: entry.env } : {},
2289
+ ...typeof entry.enabled === "boolean" ? { enabled: entry.enabled } : {}
2290
+ };
2291
+ continue;
2292
+ }
2293
+ console.warn(
2294
+ `[MCP] Server "${name}" has unrecognized format, passing through as-is:`,
2295
+ JSON.stringify(entry)
2296
+ );
2297
+ result[name] = entry;
2298
+ }
2299
+ return result;
2300
+ }
2301
+ async function buildOpenCodeConfig(options) {
2302
+ const modelStr = options.model || DEFAULT_MODEL2;
2303
+ const { providerID, modelID } = parseModel(modelStr);
2304
+ const provider = {};
2305
+ if (options.aiGatewayUrl) {
2306
+ const providerOptions = {
2307
+ baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
2308
+ apiKey: "sk-placeholder-auth-handled-by-gateway"
2309
+ };
2310
+ if (options.aiGatewayHeaders) {
2311
+ providerOptions.headers = { ...options.aiGatewayHeaders };
2312
+ }
2313
+ provider[providerID] = {
2314
+ options: providerOptions
2315
+ };
2316
+ }
2317
+ let mcp;
2318
+ if (options.mcps && options.mcps.length > 0) {
2319
+ const mcpServers = {};
2320
+ for (const mcpEntity of options.mcps) {
2321
+ const entityConfig = mcpEntity.config;
2322
+ for (const [key, value] of Object.entries(entityConfig)) {
2323
+ if (typeof value !== "object" || value === null || Array.isArray(value)) {
2324
+ throw new Error(
2325
+ `MCP "${mcpEntity.name}" has invalid config: value for key "${key}" must be an object (got ${typeof value}).`
2326
+ );
2327
+ }
2328
+ mcpServers[key] = value;
2329
+ }
2330
+ }
2331
+ const resolved = await resolveMcpPlaceholders(mcpServers, {
2332
+ cwd: options.cwd
2333
+ });
2334
+ mcp = toOpenCodeMcpConfig(resolved);
2335
+ }
2336
+ const agentOverrides = {};
2337
+ if (options.temperature != null) {
2338
+ agentOverrides.temperature = options.temperature;
2339
+ }
2340
+ if (options.maxTurns != null) {
2341
+ agentOverrides.maxSteps = options.maxTurns;
2342
+ }
2343
+ const config = {
2344
+ model: `${providerID}/${modelID}`,
2345
+ provider,
2346
+ ...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
2347
+ permission: {
2348
+ edit: "allow",
2349
+ bash: "allow",
2350
+ webfetch: "allow",
2351
+ doom_loop: "allow",
2352
+ external_directory: "allow"
2353
+ },
2354
+ ...mcp ? { mcp } : {}
2355
+ };
2356
+ return { config, providerID, modelID };
2357
+ }
2358
+
2359
+ // src/run-scenario/agents/opencode/build-trace.ts
2360
+ var import_evalforge_types7 = require("@wix/evalforge-types");
2361
+ var import_crypto2 = require("crypto");
2362
+ function buildLLMTrace(messages, totalDurationMs, model, provider) {
2363
+ const assistantMessages = messages.filter(
2364
+ (m) => m.info.role === "assistant"
2365
+ );
2366
+ const allSteps = assistantMessages.flatMap((msg, turnIndex) => {
2367
+ const { info, parts } = msg;
2368
+ let text = "";
2369
+ let thinking = "";
2370
+ const toolCalls = [];
2371
+ let stepInputTokens = 0;
2372
+ let stepOutputTokens = 0;
2373
+ let stepCost = 0;
2374
+ let finishReason = "unknown";
2375
+ for (const part of parts) {
2376
+ switch (part.type) {
2377
+ case "text": {
2378
+ const textPart = part;
2379
+ text += textPart.text;
2380
+ break;
2381
+ }
2382
+ case "reasoning": {
2383
+ const reasoningPart = part;
2384
+ thinking += reasoningPart.text;
2385
+ break;
2386
+ }
2387
+ case "tool": {
2388
+ const toolPart = part;
2389
+ toolCalls.push({
2390
+ toolName: toolPart.tool,
2391
+ args: toolPart.state.input
2392
+ });
2393
+ break;
2394
+ }
2395
+ case "step-finish": {
2396
+ const sf = part;
2397
+ stepInputTokens += sf.tokens.input;
2398
+ stepOutputTokens += sf.tokens.output;
2399
+ stepCost += sf.cost;
2400
+ finishReason = sf.reason;
2401
+ break;
2402
+ }
2403
+ }
2404
+ }
2405
+ if (stepInputTokens === 0 && stepOutputTokens === 0) {
2406
+ stepInputTokens = info.tokens.input;
2407
+ stepOutputTokens = info.tokens.output;
2408
+ stepCost = info.cost;
2409
+ }
2410
+ const startedAt = new Date(info.time.created).toISOString();
2411
+ const completedAt = info.time.completed ? info.time.completed : turnIndex + 1 < assistantMessages.length ? assistantMessages[turnIndex + 1].info.time.created : info.time.created + totalDurationMs;
2412
+ const durationMs = Math.max(0, completedAt - info.time.created);
2413
+ const isSuccess = finishReason !== "error";
2414
+ const errorMsg = finishReason === "error" ? "Generation failed" : void 0;
2415
+ const stepModel = info.modelID || model;
2416
+ const stepProvider = info.providerID || provider;
2417
+ const toolCallCount = toolCalls.length;
2418
+ const hasThinking = !!thinking;
2419
+ const hasText = !!text;
2420
+ const subSteps = [];
2421
+ const thinkingSubSteps = hasThinking && (hasText || toolCallCount > 0) ? 1 : 0;
2422
+ const toolSubSteps = toolCallCount;
2423
+ const textSubSteps = hasText && toolCallCount > 0 ? 1 : 0;
2424
+ const totalSubSteps = thinkingSubSteps + toolSubSteps + textSubSteps || 1;
2425
+ if (hasThinking && (hasText || toolCallCount > 0)) {
2426
+ subSteps.push({
2427
+ id: (0, import_crypto2.randomUUID)(),
2428
+ stepNumber: 0,
2429
+ // renumbered below
2430
+ turnIndex,
2431
+ type: import_evalforge_types7.LLMStepType.THINKING,
2432
+ model: stepModel,
2433
+ provider: stepProvider,
2434
+ startedAt,
2435
+ durationMs: Math.round(durationMs / totalSubSteps),
2436
+ tokenUsage: {
2437
+ prompt: Math.round(stepInputTokens / totalSubSteps),
2438
+ completion: Math.round(stepOutputTokens / totalSubSteps),
2439
+ total: Math.round(
2440
+ (stepInputTokens + stepOutputTokens) / totalSubSteps
2441
+ )
2442
+ },
2443
+ costUsd: stepCost / totalSubSteps,
2444
+ outputPreview: thinking.slice(0, 200),
2445
+ success: isSuccess,
2446
+ error: errorMsg
2447
+ });
2448
+ }
2449
+ if (toolCallCount > 0) {
2450
+ for (let tcIdx = 0; tcIdx < toolCallCount; tcIdx++) {
2451
+ const tc = toolCalls[tcIdx];
2452
+ const isLast = tcIdx === toolCallCount - 1 && textSubSteps === 0;
2453
+ const toolBudgetSteps = toolSubSteps + textSubSteps;
2454
+ const toolFraction = toolBudgetSteps > 0 ? 1 / toolBudgetSteps : 1;
2455
+ const remainingFraction = (totalSubSteps - thinkingSubSteps) / totalSubSteps;
2456
+ subSteps.push({
2457
+ id: (0, import_crypto2.randomUUID)(),
2458
+ stepNumber: 0,
2459
+ turnIndex,
2460
+ type: import_evalforge_types7.LLMStepType.TOOL_USE,
2461
+ model: stepModel,
2462
+ provider: stepProvider,
2463
+ startedAt,
2464
+ durationMs: isLast ? durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0) : Math.round(durationMs * remainingFraction * toolFraction),
2465
+ tokenUsage: {
2466
+ prompt: Math.round(
2467
+ stepInputTokens * remainingFraction * toolFraction
2468
+ ),
2469
+ completion: Math.round(
2470
+ stepOutputTokens * remainingFraction * toolFraction
2471
+ ),
2472
+ total: Math.round(
2473
+ (stepInputTokens + stepOutputTokens) * remainingFraction * toolFraction
2474
+ )
2475
+ },
2476
+ costUsd: stepCost * remainingFraction * toolFraction,
2477
+ toolName: tc.toolName,
2478
+ toolArguments: JSON.stringify(tc.args),
2479
+ outputPreview: tcIdx === 0 && !hasText ? (text || thinking)?.slice(0, 200) : void 0,
2480
+ success: isSuccess,
2481
+ error: errorMsg
2482
+ });
2483
+ }
2484
+ }
2485
+ if (hasText && toolCallCount > 0) {
2486
+ subSteps.push({
2487
+ id: (0, import_crypto2.randomUUID)(),
2488
+ stepNumber: 0,
2489
+ turnIndex,
2490
+ type: import_evalforge_types7.LLMStepType.COMPLETION,
2491
+ model: stepModel,
2492
+ provider: stepProvider,
2493
+ startedAt,
2494
+ durationMs: durationMs - subSteps.reduce((s, ss) => s + ss.durationMs, 0),
2495
+ tokenUsage: {
2496
+ prompt: stepInputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.prompt, 0),
2497
+ completion: stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.completion, 0),
2498
+ total: stepInputTokens + stepOutputTokens - subSteps.reduce((s, ss) => s + ss.tokenUsage.total, 0)
2499
+ },
2500
+ costUsd: stepCost - subSteps.reduce((s, ss) => s + ss.costUsd, 0),
2501
+ outputPreview: text.slice(0, 200),
2502
+ success: isSuccess,
2503
+ error: errorMsg
2504
+ });
2505
+ }
2506
+ if (subSteps.length === 0) {
2507
+ const stepType = hasThinking && !hasText ? import_evalforge_types7.LLMStepType.THINKING : import_evalforge_types7.LLMStepType.COMPLETION;
2508
+ subSteps.push({
2509
+ id: (0, import_crypto2.randomUUID)(),
2510
+ stepNumber: 0,
2511
+ turnIndex,
2512
+ type: stepType,
2513
+ model: stepModel,
2514
+ provider: stepProvider,
2515
+ startedAt,
2516
+ durationMs,
2517
+ tokenUsage: {
2518
+ prompt: stepInputTokens,
2519
+ completion: stepOutputTokens,
2520
+ total: stepInputTokens + stepOutputTokens
2521
+ },
2522
+ costUsd: stepCost,
2523
+ outputPreview: (text || thinking)?.slice(0, 200),
2524
+ success: isSuccess,
2525
+ error: errorMsg
2526
+ });
2527
+ }
2528
+ return subSteps;
2529
+ }).map((s, i) => ({ ...s, stepNumber: i + 1 }));
2530
+ const totalTokens = buildTotalTokens(assistantMessages);
2531
+ const totalCost = assistantMessages.reduce((sum, m) => {
2532
+ const aMsg = m.info;
2533
+ return sum + aMsg.cost;
2534
+ }, 0);
2535
+ const stepTypeBreakdown = {};
2536
+ for (const step of allSteps) {
2537
+ const entry = stepTypeBreakdown[step.type] ?? {
2538
+ count: 0,
2539
+ durationMs: 0,
2540
+ tokens: 0,
2541
+ costUsd: 0
2542
+ };
2543
+ entry.count += 1;
2544
+ entry.durationMs += step.durationMs;
2545
+ entry.tokens += step.tokenUsage.total;
2546
+ entry.costUsd += step.costUsd;
2547
+ stepTypeBreakdown[step.type] = entry;
2548
+ }
2549
+ const modelUsed = allSteps[0]?.model || model;
2550
+ const summary = {
2551
+ totalSteps: allSteps.length,
2552
+ totalTurns: assistantMessages.length,
2553
+ totalDurationMs,
2554
+ totalTokens,
2555
+ totalCostUsd: totalCost,
2556
+ modelBreakdown: {
2557
+ [modelUsed]: {
2558
+ count: allSteps.length,
2559
+ durationMs: totalDurationMs,
2560
+ tokens: totalTokens.total,
2561
+ costUsd: totalCost
2562
+ }
2563
+ },
2564
+ modelsUsed: [modelUsed],
2565
+ stepTypeBreakdown
2566
+ };
2567
+ return {
2568
+ id: (0, import_crypto2.randomUUID)(),
2569
+ steps: allSteps,
2570
+ summary
2571
+ };
2572
+ }
2573
+ function buildTotalTokens(assistantMessages) {
2574
+ let prompt = 0;
2575
+ let completion = 0;
2576
+ for (const { info } of assistantMessages) {
2577
+ prompt += info.tokens.input;
2578
+ completion += info.tokens.output;
2579
+ }
2580
+ return { prompt, completion, total: prompt + completion };
2581
+ }
2582
+
2583
+ // src/run-scenario/agents/opencode/build-conversation.ts
2584
+ function buildConversation2(messages) {
2585
+ const result = [];
2586
+ for (const { info, parts } of messages) {
2587
+ const timestamp = new Date(info.time.created).toISOString();
2588
+ if (info.role === "assistant") {
2589
+ const content = [];
2590
+ for (const part of parts) {
2591
+ switch (part.type) {
2592
+ case "text": {
2593
+ const textPart = part;
2594
+ content.push({ type: "text", text: textPart.text });
2595
+ break;
2596
+ }
2597
+ case "reasoning": {
2598
+ const reasoningPart = part;
2599
+ content.push({ type: "thinking", thinking: reasoningPart.text });
2600
+ break;
2601
+ }
2602
+ case "tool": {
2603
+ const toolPart = part;
2604
+ content.push({
2605
+ type: "tool_use",
2606
+ toolName: toolPart.tool,
2607
+ toolId: toolPart.callID,
2608
+ input: toolPart.state.input
2609
+ });
2610
+ break;
2611
+ }
2612
+ }
2613
+ }
2614
+ if (content.length > 0) {
2615
+ result.push({ role: "assistant", content, timestamp });
2616
+ }
2617
+ } else if (info.role === "user") {
2618
+ const content = [];
2619
+ for (const part of parts) {
2620
+ if (part.type === "text") {
2621
+ const textPart = part;
2622
+ content.push({ type: "text", text: textPart.text });
2623
+ } else if (part.type === "tool") {
2624
+ const toolPart = part;
2625
+ const state = toolPart.state;
2626
+ if (state.status === "completed") {
2627
+ const completed = state;
2628
+ content.push({
2629
+ type: "tool_result",
2630
+ toolUseId: toolPart.callID,
2631
+ content: completed.output
2632
+ });
2633
+ } else if (state.status === "error") {
2634
+ const errState = state;
2635
+ content.push({
2636
+ type: "tool_result",
2637
+ toolUseId: toolPart.callID,
2638
+ content: errState.error,
2639
+ isError: true
2640
+ });
2641
+ }
2642
+ }
2643
+ }
2644
+ if (content.length > 0) {
2645
+ result.push({ role: "user", content, timestamp });
2646
+ }
2647
+ }
2648
+ }
2649
+ return result;
2650
+ }
2651
+
2652
+ // src/run-scenario/agents/opencode/execute.ts
2653
+ var DEFAULT_MODEL3 = `anthropic/${import_evalforge_types8.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
2654
+ function extractToolAction(toolName, args) {
2655
+ if (!toolName) return "Using tool...";
2656
+ const a = args;
2657
+ if ((toolName === "Task" || toolName === "dispatch_agent") && a?.description) {
2658
+ const desc = String(a.description).slice(0, 55);
2659
+ return `Task: ${desc}${String(a.description).length > 55 ? "..." : ""}`;
2660
+ }
2661
+ if ((toolName === "Bash" || toolName === "bash" || toolName === "execute") && a?.command) {
2662
+ const cmd = String(a.command).slice(0, 50);
2663
+ return `Running: ${cmd}${String(a.command).length > 50 ? "..." : ""}`;
2664
+ }
2665
+ if (a?.file_path || a?.path || a?.target_file) {
2666
+ const filePath = String(a.file_path || a.path || a.target_file).slice(
2667
+ 0,
2668
+ 50
2669
+ );
2670
+ if (/write|edit/i.test(toolName)) return `Writing: ${filePath}`;
2671
+ if (/read|view/i.test(toolName)) return `Reading: ${filePath}`;
2672
+ }
2673
+ return `Using ${toolName}...`;
2674
+ }
2675
+ function createTraceEventFromPart(part, context, stepNumber, isComplete) {
2676
+ const base = {
2677
+ evalRunId: context.evalRunId,
2678
+ scenarioId: context.scenarioId,
2679
+ scenarioName: context.scenarioName,
2680
+ targetId: context.targetId,
2681
+ targetName: context.targetName,
2682
+ stepNumber,
2683
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2684
+ isComplete
2685
+ };
2686
+ switch (part.type) {
2687
+ case "text": {
2688
+ const textPart = part;
2689
+ return {
2690
+ ...base,
2691
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
2692
+ outputPreview: textPart.text.slice(0, 500)
2693
+ };
2694
+ }
2695
+ case "reasoning": {
2696
+ const reasoningPart = part;
2697
+ return {
2698
+ ...base,
2699
+ type: import_evalforge_types8.LiveTraceEventType.THINKING,
2700
+ thinking: reasoningPart.text.slice(0, 500)
2701
+ };
2702
+ }
2703
+ case "tool": {
2704
+ const toolPart = part;
2705
+ const toolName = toolPart.tool;
2706
+ const args = toolPart.state.input;
2707
+ const toolArgs = JSON.stringify(args).slice(0, 500);
2708
+ let type = import_evalforge_types8.LiveTraceEventType.TOOL_USE;
2709
+ let filePath;
2710
+ const a = args;
2711
+ if (a.file_path || a.path || a.target_file) {
2712
+ filePath = String(a.file_path || a.path || a.target_file);
2713
+ if (/write|edit/i.test(toolName)) {
2714
+ type = import_evalforge_types8.LiveTraceEventType.FILE_WRITE;
2715
+ } else if (/read|view/i.test(toolName)) {
2716
+ type = import_evalforge_types8.LiveTraceEventType.FILE_READ;
2717
+ }
2718
+ }
2719
+ return { ...base, type, toolName, toolArgs, filePath };
2720
+ }
2721
+ case "step-finish":
2722
+ return {
2723
+ ...base,
2724
+ type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
2725
+ outputPreview: "Step completed"
2726
+ };
2727
+ default:
2728
+ return null;
2729
+ }
2730
+ }
2731
+ async function executeWithOpenCode(skills, scenario, options) {
2732
+ const skillNames = skills.map((s) => s.name).join(", ");
2733
+ console.log("[executeWithOpenCode] Starting execution", {
2734
+ skillCount: skills.length,
2735
+ skillNames,
2736
+ scenarioId: scenario.id,
2737
+ scenarioName: scenario.name,
2738
+ cwd: options.cwd,
2739
+ aiGatewayUrl: options.aiGatewayUrl,
2740
+ hasAiGatewayHeaders: !!options.aiGatewayHeaders,
2741
+ model: options.model
2742
+ });
2743
+ const startTime = /* @__PURE__ */ new Date();
2744
+ if (options.mcps && options.mcps.length > 0) {
2745
+ console.log(
2746
+ `[MCP] ${options.mcps.length} MCP(s) will be configured inline`
2747
+ );
2748
+ }
2749
+ if (options.subAgents && options.subAgents.length > 0) {
2750
+ await writeSubAgentsToFilesystem2(options.cwd, options.subAgents);
2751
+ }
2752
+ if (options.rules && options.rules.length > 0) {
2753
+ await writeRulesToFilesystem(options.cwd, options.rules);
2754
+ }
2755
+ try {
2756
+ await writeSkillsToFilesystem2(options.cwd, skills);
2757
+ } catch (writeError) {
2758
+ throw new Error(
2759
+ `Failed to write skills to filesystem: ${writeError instanceof Error ? writeError.message : String(writeError)}`
2760
+ );
2761
+ }
2762
+ const maxTurns = options.maxTurns ?? 10;
2763
+ const { config, providerID, modelID } = await buildOpenCodeConfig({
2764
+ model: options.model,
2765
+ temperature: options.temperature,
2766
+ maxTurns,
2767
+ aiGatewayUrl: options.aiGatewayUrl,
2768
+ aiGatewayHeaders: options.aiGatewayHeaders,
2769
+ mcps: options.mcps,
2770
+ cwd: options.cwd
2771
+ });
2772
+ const { createOpencodeServer, createOpencodeClient } = await import("@opencode-ai/sdk");
2773
+ const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
2774
+ const abortController = new AbortController();
2775
+ let timeoutHandle;
2776
+ let heartbeatHandle;
2777
+ let timedOut = false;
2778
+ const traceContext = options.traceContext;
2779
+ let traceStepNumber = 0;
2780
+ let lastAction = "Starting...";
2781
+ let lastToolName;
2782
+ let lastFilePath;
2783
+ if (traceContext) {
2784
+ emitTraceEvent(
2785
+ {
2786
+ evalRunId: traceContext.evalRunId,
2787
+ scenarioId: traceContext.scenarioId,
2788
+ scenarioName: traceContext.scenarioName,
2789
+ targetId: traceContext.targetId,
2790
+ targetName: traceContext.targetName,
2791
+ stepNumber: 0,
2792
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2793
+ outputPreview: JSON.stringify({
2794
+ event: "pre-sdk-execution",
2795
+ model: `${providerID}/${modelID}`,
2796
+ maxTurns,
2797
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
2798
+ }),
2799
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2800
+ isComplete: false
2801
+ },
2802
+ traceContext.tracePushUrl,
2803
+ traceContext.routeHeader,
2804
+ traceContext.authToken
2805
+ );
2806
+ }
2807
+ let server;
2808
+ try {
2809
+ console.log("[SDK-DEBUG] Starting OpenCode server...");
2810
+ server = await createOpencodeServer({
2811
+ config,
2812
+ signal: abortController.signal,
2813
+ timeout: 3e4
2814
+ });
2815
+ console.log(`[SDK-DEBUG] Server started at ${server.url}`);
2816
+ const client = createOpencodeClient({
2817
+ baseUrl: server.url,
2818
+ directory: options.cwd
2819
+ });
2820
+ const session = await client.session.create({
2821
+ body: { title: `eval-${scenario.name}` }
2822
+ });
2823
+ if (!session.data) {
2824
+ const errorDetail = "error" in session ? JSON.stringify(session.error) : "unknown";
2825
+ throw new Error(
2826
+ `OpenCode session.create() failed: ${errorDetail} (HTTP ${session.response?.status})`
2827
+ );
2828
+ }
2829
+ const sessionId = session.data.id;
2830
+ console.log(`[SDK-DEBUG] Session created: ${sessionId}`);
2831
+ let eventStreamAbort;
2832
+ if (traceContext) {
2833
+ eventStreamAbort = new AbortController();
2834
+ const executionStartTime = Date.now();
2835
+ (async () => {
2836
+ try {
2837
+ const events = await client.event.subscribe();
2838
+ for await (const event of events.stream) {
2839
+ if (eventStreamAbort.signal.aborted) break;
2840
+ const evt = event;
2841
+ if (evt.type === "message.part.updated") {
2842
+ const { part } = evt.properties;
2843
+ traceStepNumber++;
2844
+ const traceEvent = createTraceEventFromPart(
2845
+ part,
2846
+ traceContext,
2847
+ traceStepNumber,
2848
+ false
2849
+ );
2850
+ if (traceEvent) {
2851
+ lastToolName = traceEvent.toolName;
2852
+ lastFilePath = traceEvent.filePath;
2853
+ if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.THINKING) {
2854
+ lastAction = "Thinking...";
2855
+ } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.TOOL_USE) {
2856
+ lastAction = extractToolAction(
2857
+ traceEvent.toolName ?? "",
2858
+ void 0
2859
+ );
2860
+ } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_WRITE) {
2861
+ lastAction = `Writing: ${traceEvent.filePath || "file"}`;
2862
+ } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.FILE_READ) {
2863
+ lastAction = `Reading: ${traceEvent.filePath || "file"}`;
2864
+ } else if (traceEvent.type === import_evalforge_types8.LiveTraceEventType.COMPLETION) {
2865
+ lastAction = "Processing response...";
2866
+ }
2867
+ emitTraceEvent(
2868
+ traceEvent,
2869
+ traceContext.tracePushUrl,
2870
+ traceContext.routeHeader,
2871
+ traceContext.authToken
2872
+ );
2873
+ }
2874
+ } else if (evt.type === "session.error") {
2875
+ const props = evt.properties;
2876
+ traceStepNumber++;
2877
+ emitTraceEvent(
2878
+ {
2879
+ evalRunId: traceContext.evalRunId,
2880
+ scenarioId: traceContext.scenarioId,
2881
+ scenarioName: traceContext.scenarioName,
2882
+ targetId: traceContext.targetId,
2883
+ targetName: traceContext.targetName,
2884
+ stepNumber: traceStepNumber,
2885
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
2886
+ outputPreview: `Session error: ${JSON.stringify(props.error)}`.slice(
2887
+ 0,
2888
+ 500
2889
+ ),
2890
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2891
+ isComplete: false
2892
+ },
2893
+ traceContext.tracePushUrl,
2894
+ traceContext.routeHeader,
2895
+ traceContext.authToken
2896
+ );
2897
+ }
2898
+ }
2899
+ } catch {
2900
+ }
2901
+ })();
2902
+ let lastReportedAction = "";
2903
+ let sameActionCount = 0;
2904
+ heartbeatHandle = setInterval(() => {
2905
+ const elapsedMs = Date.now() - executionStartTime;
2906
+ let progressMessage = lastAction;
2907
+ if (lastAction === lastReportedAction) {
2908
+ sameActionCount++;
2909
+ } else {
2910
+ sameActionCount = 1;
2911
+ lastReportedAction = lastAction;
2912
+ }
2913
+ const isTaskTool = lastToolName === "Task" || lastToolName === "dispatch_agent";
2914
+ if (isTaskTool && sameActionCount > 1) {
2915
+ progressMessage = `Waiting for ${lastAction}`;
2916
+ } else if (lastToolName && lastFilePath) {
2917
+ progressMessage = `${lastToolName}: ${lastFilePath}`;
2918
+ } else if (lastToolName && !isTaskTool) {
2919
+ progressMessage = `Using ${lastToolName}...`;
2920
+ }
2921
+ const elapsedSec = Math.round(elapsedMs / 1e3);
2922
+ progressMessage += ` (${elapsedSec}s, step ${traceStepNumber})`;
2923
+ emitTraceEvent(
2924
+ {
2925
+ evalRunId: traceContext.evalRunId,
2926
+ scenarioId: traceContext.scenarioId,
2927
+ scenarioName: traceContext.scenarioName,
2928
+ targetId: traceContext.targetId,
2929
+ targetName: traceContext.targetName,
2930
+ stepNumber: traceStepNumber,
2931
+ type: import_evalforge_types8.LiveTraceEventType.PROGRESS,
2932
+ outputPreview: progressMessage,
2933
+ toolName: lastToolName,
2934
+ filePath: lastFilePath,
2935
+ elapsedMs,
2936
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2937
+ isComplete: false
2938
+ },
2939
+ traceContext.tracePushUrl,
2940
+ traceContext.routeHeader,
2941
+ traceContext.authToken
2942
+ );
2943
+ }, 1e4);
2944
+ }
2945
+ const promptPromise = (async () => {
2946
+ let systemPrompt;
2947
+ if (options.systemPrompt === null || options.systemPrompt === "") {
2948
+ } else if (options.systemPrompt != null) {
2949
+ systemPrompt = options.systemPrompt;
2950
+ } else {
2951
+ systemPrompt = import_evalforge_types8.DEFAULT_EVALUATOR_SYSTEM_PROMPT;
2952
+ }
2953
+ console.log("[SDK-DEBUG] Sending prompt...");
2954
+ const result = await client.session.prompt({
2955
+ path: { id: sessionId },
2956
+ body: {
2957
+ model: { providerID, modelID },
2958
+ ...systemPrompt ? { system: systemPrompt } : {},
2959
+ parts: [{ type: "text", text: scenario.triggerPrompt }]
2960
+ }
2961
+ });
2962
+ return result;
2963
+ })();
2964
+ const timeoutPromise = new Promise((_, reject) => {
2965
+ timeoutHandle = setTimeout(() => {
2966
+ timedOut = true;
2967
+ client.session.abort({ path: { id: sessionId } }).catch(() => {
2968
+ });
2969
+ reject(
2970
+ new Error(
2971
+ `OpenCode execution timed out after ${SDK_TIMEOUT_MS}ms. Skills: ${skillNames}, Scenario: ${scenario.name}, MaxTurns: ${maxTurns}`
2972
+ )
2973
+ );
2974
+ }, SDK_TIMEOUT_MS);
2975
+ });
2976
+ const promptResult = await Promise.race([promptPromise, timeoutPromise]);
2977
+ if (timeoutHandle) clearTimeout(timeoutHandle);
2978
+ if (heartbeatHandle) clearInterval(heartbeatHandle);
2979
+ if (eventStreamAbort) eventStreamAbort.abort();
2980
+ if ("error" in promptResult && promptResult.error) {
2981
+ const errPayload = promptResult.error;
2982
+ throw new Error(
2983
+ `Agent prompt failed: ${errPayload.name ?? "UnknownError"} - ${JSON.stringify(errPayload.data ?? errPayload)}`
2984
+ );
2985
+ }
2986
+ console.log("[executeWithOpenCode] Prompt completed, fetching messages...");
2987
+ const messagesResponse = await client.session.messages({
2988
+ path: { id: sessionId }
2989
+ });
2990
+ const allMessages = messagesResponse.data ?? [];
2991
+ console.log(
2992
+ `[executeWithOpenCode] Got ${allMessages.length} message(s) from history`
2993
+ );
2994
+ if (traceContext) {
2995
+ emitTraceEvent(
2996
+ {
2997
+ evalRunId: traceContext.evalRunId,
2998
+ scenarioId: traceContext.scenarioId,
2999
+ scenarioName: traceContext.scenarioName,
3000
+ targetId: traceContext.targetId,
3001
+ targetName: traceContext.targetName,
3002
+ stepNumber: traceStepNumber + 1,
3003
+ type: import_evalforge_types8.LiveTraceEventType.COMPLETION,
3004
+ outputPreview: "Scenario execution completed",
3005
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3006
+ isComplete: true
3007
+ },
3008
+ traceContext.tracePushUrl,
3009
+ traceContext.routeHeader,
3010
+ traceContext.authToken
3011
+ );
3012
+ }
3013
+ const endTime = /* @__PURE__ */ new Date();
3014
+ const totalDurationMs = endTime.getTime() - startTime.getTime();
3015
+ const resultData = promptResult.data;
3016
+ const lastAssistantInfo = resultData?.info;
3017
+ if (lastAssistantInfo?.error) {
3018
+ const err = lastAssistantInfo.error;
3019
+ throw new Error(
3020
+ `Agent execution failed: ${err.name} - ${JSON.stringify(err.data)}`
3021
+ );
3022
+ }
3023
+ let outputText = "";
3024
+ if (resultData?.parts) {
3025
+ for (const part of resultData.parts) {
3026
+ if (part.type === "text") {
3027
+ outputText += part.text;
3028
+ }
3029
+ }
3030
+ }
3031
+ if (!outputText && allMessages.length > 0) {
3032
+ for (let i = allMessages.length - 1; i >= 0; i--) {
3033
+ const msg = allMessages[i];
3034
+ if (msg.info.role === "assistant") {
3035
+ const assistantInfo = msg.info;
3036
+ if (assistantInfo.error) {
3037
+ throw new Error(
3038
+ `Agent execution failed: ${assistantInfo.error.name} - ${JSON.stringify(assistantInfo.error.data)}`
3039
+ );
3040
+ }
3041
+ for (const part of msg.parts) {
3042
+ if (part.type === "text") {
3043
+ outputText += part.text;
3044
+ }
3045
+ }
3046
+ if (outputText) break;
3047
+ }
3048
+ }
3049
+ }
3050
+ if (!outputText) {
3051
+ const hasAssistant = allMessages.some((m) => m.info.role === "assistant");
3052
+ if (!hasAssistant) {
3053
+ throw new Error(
3054
+ `Agent produced no response: no assistant messages in session history. Model: ${providerID}/${modelID}, Messages: ${allMessages.length}`
3055
+ );
3056
+ }
3057
+ }
3058
+ const usage = lastAssistantInfo ? {
3059
+ inputTokens: lastAssistantInfo.tokens.input,
3060
+ outputTokens: lastAssistantInfo.tokens.output,
3061
+ totalTokens: lastAssistantInfo.tokens.input + lastAssistantInfo.tokens.output
3062
+ } : { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
3063
+ const costUsd = lastAssistantInfo?.cost;
3064
+ const modelStr = options.model || DEFAULT_MODEL3;
3065
+ const llmTrace = buildLLMTrace(
3066
+ allMessages,
3067
+ totalDurationMs,
3068
+ modelStr,
3069
+ providerID
3070
+ );
3071
+ const conversation = buildConversation2(allMessages);
3072
+ return {
3073
+ result: {
3074
+ outputText,
3075
+ durationMs: totalDurationMs,
3076
+ usage,
3077
+ costUsd
3078
+ },
3079
+ llmTrace,
3080
+ conversation
3081
+ };
3082
+ } catch (sdkError) {
3083
+ if (timeoutHandle) clearTimeout(timeoutHandle);
3084
+ if (heartbeatHandle) clearInterval(heartbeatHandle);
3085
+ if (timedOut) {
3086
+ console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
3087
+ }
3088
+ const errorMessage = sdkError instanceof Error ? sdkError.message : String(sdkError);
3089
+ const errorStack = sdkError instanceof Error ? sdkError.stack : void 0;
3090
+ const errorName = sdkError instanceof Error ? sdkError.name : "Unknown";
3091
+ console.error("[SDK-ERROR] ====== OPENCODE SDK EXECUTION FAILED ======");
3092
+ console.error("[SDK-ERROR] Error name:", errorName);
3093
+ console.error("[SDK-ERROR] Error message:", errorMessage);
3094
+ if (errorStack) {
3095
+ console.error("[SDK-ERROR] Stack:", errorStack);
3096
+ }
3097
+ if (traceContext) {
3098
+ emitTraceEvent(
3099
+ {
3100
+ evalRunId: traceContext.evalRunId,
3101
+ scenarioId: traceContext.scenarioId,
3102
+ scenarioName: traceContext.scenarioName,
3103
+ targetId: traceContext.targetId,
3104
+ targetName: traceContext.targetName,
3105
+ stepNumber: traceStepNumber + 1,
3106
+ type: import_evalforge_types8.LiveTraceEventType.DIAGNOSTIC,
3107
+ outputPreview: JSON.stringify({
3108
+ event: "sdk-execution-failed",
3109
+ error: errorMessage,
3110
+ errorName
3111
+ }).slice(0, 2e3),
3112
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3113
+ isComplete: true
3114
+ },
3115
+ traceContext.tracePushUrl,
3116
+ traceContext.routeHeader,
3117
+ traceContext.authToken
3118
+ );
3119
+ }
3120
+ throw new Error(
3121
+ `OpenCode SDK execution failed: ${errorMessage}` + (errorStack ? `
3122
+ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
3123
+ );
3124
+ } finally {
3125
+ if (server) {
3126
+ try {
3127
+ server.close();
3128
+ console.log("[SDK-DEBUG] OpenCode server closed");
3129
+ } catch {
3130
+ }
3131
+ }
3132
+ }
3133
+ }
3134
+
3135
+ // src/run-scenario/agents/opencode/opencode-adapter.ts
3136
+ var OpenCodeAdapter = class {
3137
+ id = "opencode";
3138
+ name = "OpenCode";
3139
+ supportedCommands = [import_evalforge_types9.AgentRunCommand.OPENCODE];
3140
+ async execute(context) {
3141
+ const {
3142
+ skills,
3143
+ scenario,
3144
+ cwd,
3145
+ modelConfig,
3146
+ aiGatewayUrl,
3147
+ aiGatewayHeaders,
3148
+ traceContext,
3149
+ mcps,
3150
+ subAgents,
3151
+ rules,
3152
+ systemPrompt
3153
+ } = context;
3154
+ const options = {
3155
+ cwd,
3156
+ model: modelConfig?.model,
3157
+ temperature: modelConfig?.temperature,
3158
+ maxTurns: modelConfig?.maxTurns,
3159
+ aiGatewayUrl,
3160
+ aiGatewayHeaders,
3161
+ traceContext,
3162
+ mcps,
3163
+ subAgents,
3164
+ rules,
3165
+ systemPrompt
3166
+ };
3167
+ const { result, llmTrace, conversation } = await executeWithOpenCode(
3168
+ skills,
3169
+ scenario,
3170
+ options
3171
+ );
3172
+ return {
3173
+ outputText: result.outputText,
3174
+ durationMs: result.durationMs,
3175
+ usage: {
3176
+ inputTokens: result.usage.inputTokens,
3177
+ outputTokens: result.usage.outputTokens,
3178
+ totalTokens: result.usage.totalTokens
3179
+ },
3180
+ costUsd: result.costUsd,
3181
+ llmTrace,
3182
+ conversation
3183
+ };
3184
+ }
3185
+ };
3186
+ var openCodeAdapter = new OpenCodeAdapter();
3187
+
3188
+ // src/run-scenario/agents/opencode/index.ts
3189
+ defaultRegistry.register(openCodeAdapter);
3190
+
2048
3191
  // src/run-scenario/agents/simple-agent/execute.ts
2049
3192
  var import_ai = require("ai");
2050
3193
  var import_anthropic = require("@ai-sdk/anthropic");
2051
3194
  var import_openai = require("@ai-sdk/openai");
2052
- var import_evalforge_types6 = require("@wix/evalforge-types");
2053
- var import_crypto2 = require("crypto");
3195
+ var import_evalforge_types11 = require("@wix/evalforge-types");
3196
+ var import_crypto3 = require("crypto");
2054
3197
 
2055
3198
  // src/run-scenario/agents/simple-agent/mcp-tools.ts
2056
3199
  var import_mcp = require("@ai-sdk/mcp");
@@ -2145,48 +3288,35 @@ function extractErrorText(content) {
2145
3288
  }
2146
3289
 
2147
3290
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
3291
+ var import_evalforge_types10 = require("@wix/evalforge-types");
2148
3292
  var PROVIDER_ANTHROPIC = "anthropic";
2149
3293
  var MODEL_PRICING = {
3294
+ // Anthropic — Claude 4.6
3295
+ "claude-sonnet-4-6": { input: 3, output: 15 },
3296
+ "claude-opus-4-6": { input: 15, output: 75 },
2150
3297
  // Anthropic — Claude 4.5
2151
- CLAUDE_4_5_OPUS_1_0: { input: 5, output: 25 },
2152
- CLAUDE_4_5_SONNET_1_0: { input: 3, output: 15 },
2153
- CLAUDE_4_5_HAIKU_1_0: { input: 1, output: 5 },
2154
- // Anthropic — Claude 4 / 4.1
2155
- CLAUDE_4_1_OPUS_1_0: { input: 15, output: 75 },
2156
- CLAUDE_4_OPUS_1_0: { input: 15, output: 75 },
2157
- CLAUDE_4_SONNET_1_0: { input: 3, output: 15 },
2158
- // Anthropic — Claude 3.x
2159
- CLAUDE_3_5_SONNET_2_0: { input: 3, output: 15 },
2160
- CLAUDE_3_5_SONNET_1_0: { input: 3, output: 15 },
2161
- CLAUDE_3_HAIKU_1_0: { input: 0.25, output: 1.25 },
3298
+ "claude-opus-4-5": { input: 5, output: 25 },
3299
+ "claude-sonnet-4-5": { input: 3, output: 15 },
3300
+ "claude-haiku-4-5": { input: 1, output: 5 },
3301
+ // Anthropic — Claude 4
3302
+ "claude-opus-4": { input: 15, output: 75 },
3303
+ "claude-sonnet-4": { input: 3, output: 15 },
2162
3304
  // OpenAI — GPT-5
2163
- GPT_5_2_2025_12_11: { input: 1.75, output: 14 },
2164
- GPT_5_2025_08_07: { input: 1.25, output: 10 },
2165
- GPT_5_MINI_2025_08_07: { input: 0.25, output: 2 },
2166
- GPT_5_NANO_2025_08_07: { input: 0.05, output: 0.4 },
3305
+ "gpt-5": { input: 1.25, output: 10 },
3306
+ "gpt-5-mini": { input: 0.25, output: 2 },
3307
+ "gpt-5-nano": { input: 0.05, output: 0.4 },
2167
3308
  // OpenAI — GPT-4.1
2168
- GPT_4_1_2025_04_14: { input: 2, output: 8 },
2169
- GPT_4_1_MINI_2025_04_14: { input: 0.4, output: 1.6 },
2170
- GPT_4_1_NANO_2025_04_14: { input: 0.1, output: 0.4 },
3309
+ "gpt-4.1": { input: 2, output: 8 },
3310
+ "gpt-4.1-mini": { input: 0.4, output: 1.6 },
3311
+ "gpt-4.1-nano": { input: 0.1, output: 0.4 },
2171
3312
  // OpenAI — GPT-4o
2172
- GPT_4O_2024_05_13: { input: 2.5, output: 10 },
2173
- GPT_4O_2024_08_06: { input: 2.5, output: 10 },
2174
- GPT_4O_2024_11_20: { input: 2.5, output: 10 },
2175
- GPT_4O_MINI_2024_07_18: { input: 0.15, output: 0.6 },
3313
+ "gpt-4o": { input: 2.5, output: 10 },
3314
+ "gpt-4o-mini": { input: 0.15, output: 0.6 },
2176
3315
  // OpenAI — Reasoning
2177
- O3_2025_04_16: { input: 2, output: 8 },
2178
- O4_MINI_2025_04_16: { input: 1.1, output: 4.4 },
2179
- O3_MINI_2025_01_31: { input: 1.1, output: 4.4 },
2180
- O1_2024_12_17: { input: 15, output: 60 },
2181
- O1_MINI: { input: 1.1, output: 4.4 },
2182
- O1_MINI_2024_09_12: { input: 1.1, output: 4.4 },
2183
- O1_PREVIEW: { input: 15, output: 60 },
2184
- O1_PREVIEW_2024_09_12: { input: 15, output: 60 },
2185
- // OpenAI — Legacy
2186
- GPT_4_TURBO_2024_04_09: { input: 10, output: 30 },
2187
- GPT_4_1106_PREVIEW: { input: 10, output: 30 },
2188
- GPT_3_5_TURBO: { input: 0.5, output: 1.5 },
2189
- GPT_3_5_TURBO_0125: { input: 0.5, output: 1.5 }
3316
+ o3: { input: 2, output: 8 },
3317
+ "o4-mini": { input: 1.1, output: 4.4 },
3318
+ "o3-mini": { input: 1.1, output: 4.4 },
3319
+ o1: { input: 15, output: 60 }
2190
3320
  };
2191
3321
  function extractGatewayCost(step, provider) {
2192
3322
  try {
@@ -2205,7 +3335,8 @@ function extractGatewayCost(step, provider) {
2205
3335
  }
2206
3336
  }
2207
3337
  function calculateFromPricing(modelId, tokenUsage) {
2208
- const pricing = MODEL_PRICING[modelId];
3338
+ const normalized = (0, import_evalforge_types10.normalizeModelId)(modelId);
3339
+ const pricing = MODEL_PRICING[normalized] ?? MODEL_PRICING[Object.keys(MODEL_PRICING).find((key) => normalized.startsWith(key)) ?? ""];
2209
3340
  if (!pricing) return 0;
2210
3341
  return tokenUsage.prompt / 1e6 * pricing.input + tokenUsage.completion / 1e6 * pricing.output;
2211
3342
  }
@@ -2214,7 +3345,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
2214
3345
  }
2215
3346
 
2216
3347
  // src/run-scenario/agents/simple-agent/build-conversation.ts
2217
- function buildConversation2(triggerPrompt, steps, executionStartMs) {
3348
+ function buildConversation3(triggerPrompt, steps, executionStartMs) {
2218
3349
  const messages = [];
2219
3350
  messages.push({
2220
3351
  role: "user",
@@ -2280,9 +3411,7 @@ var PROVIDER_ANTHROPIC2 = "anthropic";
2280
3411
  var PROVIDER_OPENAI = "openai";
2281
3412
  var DEFAULT_MAX_TOOL_STEPS = 25;
2282
3413
  function createModel(modelId, baseUrl, headers) {
2283
- const isClaudeModel = import_evalforge_types6.AVAILABLE_CLAUDE_MODEL_IDS.includes(
2284
- modelId
2285
- );
3414
+ const isClaudeModel = isClaudeModelId(modelId);
2286
3415
  if (isClaudeModel) {
2287
3416
  const anthropic = (0, import_anthropic.createAnthropic)({
2288
3417
  baseURL: `${baseUrl}/proxy/anthropic`,
@@ -2296,13 +3425,17 @@ function createModel(modelId, baseUrl, headers) {
2296
3425
  apiKey: "proxy-auth",
2297
3426
  headers
2298
3427
  });
2299
- if (import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelId)) {
3428
+ if ([...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
3429
+ (id) => modelId === id || modelId.startsWith(id)
3430
+ )) {
2300
3431
  return openai.responses(modelId);
2301
3432
  }
2302
3433
  return openai.chat(modelId);
2303
3434
  }
2304
3435
  function isClaudeModelId(modelId) {
2305
- return import_evalforge_types6.AVAILABLE_CLAUDE_MODEL_IDS.includes(modelId);
3436
+ return import_evalforge_types11.AVAILABLE_CLAUDE_MODEL_IDS.some(
3437
+ (id) => modelId === id || modelId.startsWith(id)
3438
+ );
2306
3439
  }
2307
3440
  function extractSkillContent(files) {
2308
3441
  if (!files || files.length === 0) return void 0;
@@ -2336,7 +3469,9 @@ async function executeWithAiSdk(context) {
2336
3469
  }
2337
3470
  try {
2338
3471
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
2339
- const isResponsesAPI = import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelConfig.model);
3472
+ const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
3473
+ (id) => modelConfig.model === id || modelConfig.model.startsWith(id)
3474
+ );
2340
3475
  const supportsThinking = isAnthropic || isResponsesAPI;
2341
3476
  const providerOpts = {
2342
3477
  ...isAnthropic && {
@@ -2370,7 +3505,7 @@ async function executeWithAiSdk(context) {
2370
3505
  outputTokens: result.usage.outputTokens ?? 0,
2371
3506
  totalTokens: result.usage.totalTokens ?? 0
2372
3507
  };
2373
- const llmTrace = buildLLMTrace(
3508
+ const llmTrace = buildLLMTrace2(
2374
3509
  result.steps,
2375
3510
  durationMs,
2376
3511
  usage,
@@ -2382,7 +3517,7 @@ async function executeWithAiSdk(context) {
2382
3517
  emitStepEvents(traceContext, result.steps, startTime);
2383
3518
  emitCompletionEvent(traceContext, result.steps.length + 1);
2384
3519
  }
2385
- const conversation = buildConversation2(
3520
+ const conversation = buildConversation3(
2386
3521
  scenario.triggerPrompt,
2387
3522
  result.steps,
2388
3523
  startTime
@@ -2426,7 +3561,7 @@ function findToolResultError(step) {
2426
3561
  }
2427
3562
  return null;
2428
3563
  }
2429
- function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
3564
+ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
2430
3565
  const totalStepTokens = steps.reduce(
2431
3566
  (sum, s) => sum + (s.usage.totalTokens ?? 0),
2432
3567
  0
@@ -2444,9 +3579,10 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
2444
3579
  const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
2445
3580
  const toolResultError = findToolResultError(step);
2446
3581
  return {
2447
- id: (0, import_crypto2.randomUUID)(),
3582
+ id: (0, import_crypto3.randomUUID)(),
2448
3583
  stepNumber: i + 1,
2449
- type: step.toolCalls.length > 0 ? import_evalforge_types6.LLMStepType.TOOL_USE : import_evalforge_types6.LLMStepType.COMPLETION,
3584
+ turnIndex: i,
3585
+ type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
2450
3586
  model: modelId,
2451
3587
  provider,
2452
3588
  startedAt: new Date(
@@ -2469,10 +3605,11 @@ function buildLLMTrace(steps, totalDurationMs, totalUsage, modelId, provider, ex
2469
3605
  total: totalUsage.totalTokens
2470
3606
  };
2471
3607
  return {
2472
- id: (0, import_crypto2.randomUUID)(),
3608
+ id: (0, import_crypto3.randomUUID)(),
2473
3609
  steps: traceSteps,
2474
3610
  summary: {
2475
3611
  totalSteps: traceSteps.length,
3612
+ totalTurns: traceSteps.length,
2476
3613
  totalDurationMs,
2477
3614
  totalTokens: finalTokens,
2478
3615
  totalCostUsd,
@@ -2497,7 +3634,7 @@ function emitStartEvent(traceContext, startTime) {
2497
3634
  targetId: traceContext.targetId,
2498
3635
  targetName: traceContext.targetName,
2499
3636
  stepNumber: 0,
2500
- type: import_evalforge_types6.LiveTraceEventType.PROGRESS,
3637
+ type: import_evalforge_types11.LiveTraceEventType.PROGRESS,
2501
3638
  outputPreview: "Starting Simple Agent execution...",
2502
3639
  elapsedMs: Date.now() - startTime,
2503
3640
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
@@ -2521,7 +3658,7 @@ function emitStepEvents(traceContext, steps, startTime) {
2521
3658
  targetId: traceContext.targetId,
2522
3659
  targetName: traceContext.targetName,
2523
3660
  stepNumber: i + 1,
2524
- type: isToolStep ? import_evalforge_types6.LiveTraceEventType.TOOL_USE : import_evalforge_types6.LiveTraceEventType.COMPLETION,
3661
+ type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
2525
3662
  toolName: firstToolCall?.toolName,
2526
3663
  toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
2527
3664
  outputPreview: step.text?.slice(0, 500),
@@ -2544,7 +3681,7 @@ function emitCompletionEvent(traceContext, stepNumber) {
2544
3681
  targetId: traceContext.targetId,
2545
3682
  targetName: traceContext.targetName,
2546
3683
  stepNumber,
2547
- type: import_evalforge_types6.LiveTraceEventType.COMPLETION,
3684
+ type: import_evalforge_types11.LiveTraceEventType.COMPLETION,
2548
3685
  outputPreview: "Scenario execution completed",
2549
3686
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2550
3687
  isComplete: true
@@ -2571,7 +3708,7 @@ defaultRegistry.register(simpleAgentAdapter);
2571
3708
 
2572
3709
  // src/run-scenario/file-diff.ts
2573
3710
  var import_fs2 = require("fs");
2574
- var import_path8 = require("path");
3711
+ var import_path10 = require("path");
2575
3712
 
2576
3713
  // ../../node_modules/diff/lib/index.mjs
2577
3714
  function Diff() {
@@ -2747,7 +3884,7 @@ Diff.prototype = {
2747
3884
  tokenize: function tokenize(value) {
2748
3885
  return Array.from(value);
2749
3886
  },
2750
- join: function join6(chars) {
3887
+ join: function join8(chars) {
2751
3888
  return chars.join("");
2752
3889
  },
2753
3890
  postProcess: function postProcess(changeObjects) {
@@ -3187,8 +4324,8 @@ function snapshotDirectory(dir, baseDir) {
3187
4324
  }
3188
4325
  const entries = (0, import_fs2.readdirSync)(dir, { withFileTypes: true });
3189
4326
  for (const entry of entries) {
3190
- const fullPath = (0, import_path8.join)(dir, entry.name);
3191
- const relativePath = (0, import_path8.relative)(base, fullPath);
4327
+ const fullPath = (0, import_path10.join)(dir, entry.name);
4328
+ const relativePath = (0, import_path10.relative)(base, fullPath);
3192
4329
  if (shouldIgnore(entry.name)) {
3193
4330
  continue;
3194
4331
  }
@@ -3296,17 +4433,11 @@ function extractTemplateFiles(before, after) {
3296
4433
  }
3297
4434
 
3298
4435
  // src/run-scenario/run-agent-with-context.ts
3299
- var import_evalforge_types7 = require("@wix/evalforge-types");
3300
- var DEFAULT_AGENT_COMMAND = import_evalforge_types7.AgentRunCommand.CLAUDE;
4436
+ var import_evalforge_types12 = require("@wix/evalforge-types");
4437
+ var DEFAULT_AGENT_COMMAND = import_evalforge_types12.AgentRunCommand.CLAUDE;
3301
4438
  async function runAgentWithContext(config, evalRunId2, scenario, evalData, workDir) {
3302
- const hasEntities = evalData.skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || (evalData.rules?.length ?? 0) > 0;
3303
- if (!hasEntities) {
3304
- throw new Error(
3305
- `Eval run ${evalRunId2} has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
3306
- );
3307
- }
3308
4439
  const agent = evalData.agent ?? void 0;
3309
- const isSDK = agent?.agentType === import_evalforge_types7.AgentType.SDK;
4440
+ const isSDK = agent?.agentType === import_evalforge_types12.AgentType.SDK;
3310
4441
  const identifier = isSDK ? simpleAgentAdapter.id : agent?.runCommand ?? DEFAULT_AGENT_COMMAND;
3311
4442
  const adapter = getAdapter(identifier);
3312
4443
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -3341,7 +4472,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
3341
4472
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
3342
4473
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
3343
4474
  return {
3344
- id: (0, import_crypto3.randomUUID)(),
4475
+ id: (0, import_crypto4.randomUUID)(),
3345
4476
  targetId,
3346
4477
  targetName,
3347
4478
  scenarioId: scenario.id,
@@ -3392,7 +4523,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
3392
4523
  })),
3393
4524
  durationMs: partialResult.duration
3394
4525
  };
3395
- const defaultJudgeModel = import_evalforge_types8.DEFAULT_JUDGE_MODEL;
4526
+ const defaultJudgeModel = import_evalforge_types13.DEFAULT_JUDGE_MODEL;
3396
4527
  const assertionContext = {
3397
4528
  workDir,
3398
4529
  defaultJudgeModel,
@@ -3407,10 +4538,10 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
3407
4538
  assertionContext
3408
4539
  ) : [];
3409
4540
  const passed = assertionResults.filter(
3410
- (r) => r.status === import_evalforge_types8.AssertionResultStatus.PASSED
4541
+ (r) => r.status === import_evalforge_types13.AssertionResultStatus.PASSED
3411
4542
  ).length;
3412
4543
  const failed = assertionResults.filter(
3413
- (r) => r.status === import_evalforge_types8.AssertionResultStatus.FAILED
4544
+ (r) => r.status === import_evalforge_types13.AssertionResultStatus.FAILED
3414
4545
  ).length;
3415
4546
  const total = assertionResults.length;
3416
4547
  const passRate = total > 0 ? Math.round(passed / total * 100) : 100;
@@ -3424,7 +4555,7 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
3424
4555
  }
3425
4556
 
3426
4557
  // src/error-reporter.ts
3427
- var import_evalforge_types9 = require("@wix/evalforge-types");
4558
+ var import_evalforge_types14 = require("@wix/evalforge-types");
3428
4559
  function formatError(error, phase, context) {
3429
4560
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
3430
4561
  if (error instanceof Error) {
@@ -3598,13 +4729,7 @@ async function runEvaluation(projectId2, evalRunId2) {
3598
4729
  presetId: evalData.evalRun.presetId,
3599
4730
  skillIds: evalData.evalRun.skillIds
3600
4731
  };
3601
- const hasEntities = skills.length > 0 || evalData.mcps.length > 0 || evalData.subAgents.length > 0 || evalData.rules.length > 0;
3602
- if (scenarioItems.length > 0 && !hasEntities) {
3603
- throw new Error(
3604
- `[${ExecutionPhase.VALIDATION}] Eval run has no entities configured: at least one skill, MCP, sub-agent, or rule is required.`
3605
- );
3606
- }
3607
- if (scenarioItems.length > 0 && hasEntities && !agent) {
4732
+ if (scenarioItems.length > 0 && !agent) {
3608
4733
  throw new Error(
3609
4734
  `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
3610
4735
  );
@@ -3675,7 +4800,7 @@ async function runEvaluation(projectId2, evalRunId2) {
3675
4800
  };
3676
4801
  try {
3677
4802
  await api.updateEvalRun(projectId2, evalRunId2, {
3678
- status: import_evalforge_types10.EvalStatus.COMPLETED,
4803
+ status: import_evalforge_types15.EvalStatus.COMPLETED,
3679
4804
  completedAt: (/* @__PURE__ */ new Date()).toISOString()
3680
4805
  });
3681
4806
  } catch (updateErr) {
@@ -3716,7 +4841,7 @@ runEvaluation(projectId, evalRunId).then(() => {
3716
4841
  authToken: config.authToken
3717
4842
  });
3718
4843
  await api.updateEvalRun(projectId, evalRunId, {
3719
- status: import_evalforge_types10.EvalStatus.FAILED,
4844
+ status: import_evalforge_types15.EvalStatus.FAILED,
3720
4845
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
3721
4846
  jobError,
3722
4847
  jobStatus: "FAILED"
@@ -3739,7 +4864,7 @@ runEvaluation(projectId, evalRunId).then(() => {
3739
4864
  authToken
3740
4865
  });
3741
4866
  await api.updateEvalRun(projectId, evalRunId, {
3742
- status: import_evalforge_types10.EvalStatus.FAILED,
4867
+ status: import_evalforge_types15.EvalStatus.FAILED,
3743
4868
  completedAt: (/* @__PURE__ */ new Date()).toISOString(),
3744
4869
  jobError: `Config load failed, then: ${jobError}`,
3745
4870
  jobStatus: "FAILED"