npm - agent-duelist - Versions diffs - 0.1.0 → 0.2.0 - Mend

agent-duelist 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/dist/index.cjs CHANGED Viewed

@@ -32,13 +32,20 @@ var index_exports = {};
 __export(index_exports, {
   anthropic: () => anthropic,
   azureOpenai: () => azureOpenai,
+  compareResults: () => compareResults,
+  computeStats: () => computeStats,
   consoleReporter: () => consoleReporter,
   defineArena: () => defineArena,
+  detectGitHubContext: () => detectGitHubContext,
   gemini: () => gemini,
   jsonReporter: () => jsonReporter,
+  loadBaseline: () => loadBaseline,
+  markdownReporter: () => markdownReporter,
   openai: () => openai,
   openaiCompatible: () => openaiCompatible,
-  registerPricing: () => registerPricing
+  registerPricing: () => registerPricing,
+  saveBaseline: () => saveBaseline,
+  upsertPrComment: () => upsertPrComment
 });
 module.exports = __toCommonJS(index_exports);
@@ -1479,7 +1486,142 @@ function jaccardSimilarity(a, b) {
 }
 // src/scorers/llm-judge.ts
+var import_openai2 = __toESM(require("openai"), 1);
+// src/providers/openai.ts
 var import_openai = __toESM(require("openai"), 1);
+var import_zod_to_json_schema = require("zod-to-json-schema");
+var REQUEST_TIMEOUT_MS = 6e4;
/**
 * Builds a provider that talks to OpenAI through the `openai` SDK client.
 *
 * @param {string} model - Model name; used as the request model and to form the id `openai/<model>`.
 * @param {object} [options] - Optional overrides: `apiKey`, `baseURL`, `timeoutMs`.
 * @returns {object} The provider object produced by `makeProvider`.
 */
function openai(model, options) {
  const { apiKey, baseURL, timeoutMs } = options ?? {};
  const clientConfig = {
    // An explicit key wins; otherwise fall back to the OPENAI_API_KEY environment variable.
    apiKey: apiKey ?? process.env.OPENAI_API_KEY,
    baseURL,
    timeout: timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const sdkClient = new import_openai.default(clientConfig);
  return makeProvider(`openai/${model}`, "OpenAI", model, sdkClient, model);
}
/**
 * Builds a provider for an endpoint reached through the `openai` SDK client
 * with a caller-supplied `baseURL`.
 *
 * The API key is `options.apiKey`, else the environment variable named by
 * `options.apiKeyEnv`, else the literal placeholder "no-key". When
 * `options.free` is truthy, zero per-token pricing is registered under
 * `options.id`.
 *
 * @param {object} options - `id`, `name`, `model`, `baseURL`, and optional
 *   `apiKey`, `apiKeyEnv`, `timeoutMs`, `free`, `stripThinking`.
 * @returns {object} The provider object produced by `makeProvider`.
 */
function openaiCompatible(options) {
  const keyFromEnv = options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0;
  const apiKey = options.apiKey ?? keyFromEnv ?? "no-key";
  const sdkClient = new import_openai.default({
    apiKey,
    baseURL: options.baseURL,
    timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
  });
  if (options.free) {
    const zeroCost = { inputPerToken: 0, outputPerToken: 0 };
    registerPricing(options.id, zeroCost);
  }
  const { id, name, model, stripThinking } = options;
  return makeProvider(id, name, model, sdkClient, model, stripThinking);
}
/**
 * Builds a provider backed by the SDK's `AzureOpenAI` client.
 *
 * Each setting comes from `options` first and the matching AZURE_OPENAI_*
 * environment variable second; the API version finally defaults to
 * "2024-12-01-preview". The deployment defaults to the model name and is what
 * gets sent as the request model.
 *
 * @param {string} model - Model name; also forms the id `azure/<model>`.
 * @param {object} [options] - Optional `deployment`, `apiKey`, `endpoint`, `apiVersion`, `timeoutMs`.
 * @returns {object} The provider object produced by `makeProvider`.
 */
function azureOpenai(model, options) {
  const env = process.env;
  const deployment = options?.deployment ?? model;
  const clientConfig = {
    apiKey: options?.apiKey ?? env.AZURE_OPENAI_API_KEY,
    endpoint: options?.endpoint ?? env.AZURE_OPENAI_ENDPOINT,
    apiVersion: options?.apiVersion ?? env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
    deployment,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const sdkClient = new import_openai.AzureOpenAI(clientConfig);
  return makeProvider(`azure/${model}`, "Azure OpenAI", model, sdkClient, deployment);
}
/**
 * Creates a provider object whose `run` method performs one chat-completion
 * request, plus one follow-up request when the model asks for tool calls.
 *
 * @param {string} id - Identifier exposed on the returned provider.
 * @param {string} name - Display name exposed on the returned provider.
 * @param {string} model - Model name exposed on the returned provider.
 * @param {object} client - Object exposing `chat.completions.create(params, options)`.
 * @param {string} requestModel - Value sent as `model` in every request.
 * @param {boolean} [stripThinking] - When truthy, every `<think>…</think>`
 *   block is removed from the final message text before it is returned/parsed.
 * @returns {{id: string, name: string, model: string, run: Function}}
 */
function makeProvider(id, name, model, client, requestModel, stripThinking) {
  return {
    id,
    name,
    model,
    /**
     * Runs a single prompt.
     *
     * @param {object} input - `prompt`, plus optional `schema`, `tools`, `signal`.
     * @returns {Promise<object>} `{ output, usage, latencyMs, raw, toolCalls }`.
     *   `output` is the parsed JSON when `input.schema` is set and the text
     *   parses, otherwise the text itself. Token counts that sum to 0 and an
     *   empty tool-call list are reported as `undefined`.
     */
    async run(input) {
      const start = Date.now();
      const params = {
        model: requestModel,
        messages: [{ role: "user", content: input.prompt }]
      };
      if (input.schema) {
        // JSON mode: request a JSON object and prepend a system instruction.
        params.response_format = { type: "json_object" };
        params.messages = [
          { role: "system", content: "Respond with valid JSON matching the requested schema." },
          ...params.messages
        ];
      }
      if (input.tools?.length) {
        params.tools = input.tools.map(toolDefToOpenAI);
        params.tool_choice = "auto";
      }
      const response = await client.chat.completions.create(params, { signal: input.signal });
      let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
      let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
      const choice = response.choices[0];
      const toolCallsRaw = choice?.message?.tool_calls;
      const collectedToolCalls = [];
      let finalResponse = response;
      if (toolCallsRaw?.length && input.tools?.length) {
        // Replay the conversation so far, then append one "tool" message per call.
        const toolMessages = [
          ...params.messages,
          choice.message
        ];
        for (const tc of toolCallsRaw) {
          const toolDef = input.tools.find((t) => t.name === tc.function.name);
          let args;
          try {
            args = JSON.parse(tc.function.arguments);
          } catch {
            // Arguments that are not valid JSON are passed through as the raw string.
            args = tc.function.arguments;
          }
          let result;
          if (toolDef?.handler) {
            result = await toolDef.handler(args);
          }
          collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
          toolMessages.push({
            role: "tool",
            tool_call_id: tc.id,
            // A missing handler or an undefined result is sent to the model as "{}".
            content: JSON.stringify(result ?? {})
          });
        }
        const followUp = await client.chat.completions.create({
          model: requestModel,
          messages: toolMessages
        }, { signal: input.signal });
        // Usage is accumulated across both requests.
        totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
        totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
        finalResponse = followUp;
      }
      const latencyMs = Date.now() - start;
      const finalChoice = finalResponse.choices[0];
      let rawContent = finalChoice?.message?.content ?? "";
      if (stripThinking) {
        // Global flag: remove every <think> block, not just the first one, so a
        // later block cannot leak into the output or break JSON parsing below.
        rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/g, "");
      }
      let output = rawContent;
      if (input.schema) {
        try {
          output = JSON.parse(rawContent);
        } catch {
          // Best effort: text that is not valid JSON is returned unchanged as a string.
        }
      }
      return {
        output,
        usage: {
          promptTokens: totalPromptTokens || void 0,
          completionTokens: totalCompletionTokens || void 0
        },
        latencyMs,
        raw: finalResponse,
        toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
      };
    }
  };
}
/**
 * Converts a tool definition into the `{ type: "function", function: {...} }`
 * shape placed in the request's `tools` array.
 *
 * @param {object} tool - Object with `name`, `description`, and `parameters`;
 *   `parameters` is handed to `zodToJsonSchema` with the "openAi" target.
 * @returns {object} The converted tool entry.
 */
function toolDefToOpenAI(tool) {
  const { name, description, parameters } = tool;
  const jsonSchema = (0, import_zod_to_json_schema.zodToJsonSchema)(parameters, { target: "openAi" });
  return {
    type: "function",
    function: { name, description, parameters: jsonSchema }
  };
}
+// src/scorers/llm-judge.ts
 var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
 Criteria:
@@ -1495,40 +1637,42 @@ conciseness: <number>
 Task: {task}
 Expected: {expected}
 Actual: {actual}`;
-function resolveJudgeClient(configModel) {
-  const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-4o-mini";
+function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
+  const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";
   if (model.startsWith("gemini") && process.env.GOOGLE_API_KEY) {
     return {
-      client: new import_openai.default({
+      client: new import_openai2.default({
         apiKey: process.env.GOOGLE_API_KEY,
-        baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
+        baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
+        timeout: timeoutMs
       }),
       model
     };
   }
   if (!process.env.OPENAI_API_KEY && process.env.AZURE_OPENAI_API_KEY) {
     return {
-      client: new import_openai.AzureOpenAI({
+      client: new import_openai2.AzureOpenAI({
         apiKey: process.env.AZURE_OPENAI_API_KEY,
         endpoint: process.env.AZURE_OPENAI_ENDPOINT,
         apiVersion: process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
-        deployment: model
+        deployment: model,
+        timeout: timeoutMs
       }),
       model
     };
   }
   const apiKey = process.env.OPENAI_API_KEY;
   if (!apiKey) return void 0;
-  return { client: new import_openai.default({ apiKey }), model };
+  return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
 }
-function createLlmJudgeScorer(judgeModel) {
+function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
   let cached = void 0;
   return async ({ task, result }) => {
     if (task.expected === void 0) {
       return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
     }
     if (cached === void 0) {
-      cached = resolveJudgeClient(judgeModel) ?? null;
+      cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
     }
     if (!cached) {
       return {
@@ -1601,10 +1745,10 @@ var staticScorers = {
   "fuzzy-similarity": fuzzySimilarityScorer,
   "tool-usage": toolUsageScorer
 };
-function resolveScorers(names, judgeModel) {
+function resolveScorers(names, judgeModel, timeoutMs) {
   return names.map((name) => {
     if (name === "llm-judge-correctness") {
-      return createLlmJudgeScorer(judgeModel);
+      return createLlmJudgeScorer(judgeModel, timeoutMs);
     }
     const scorer = staticScorers[name];
     if (!scorer) {
@@ -1615,19 +1759,41 @@ function resolveScorers(names, judgeModel) {
 }
 // src/runner.ts
+var DEFAULT_TIMEOUT_MS = 6e4;
/**
 * Runs `run(signal)` with a deadline.
 *
 * If `run` has not settled after `ms` milliseconds, the signal passed to it is
 * aborted and the returned promise rejects with
 * `Error("Request timed out after <ms>ms")`. Otherwise the returned promise
 * settles with the outcome of `run`.
 *
 * The timer is cleared on every exit path, including when `run` throws
 * synchronously, so a failed call never leaves a pending timeout behind.
 *
 * @param {(signal: AbortSignal) => any} run - Work to perform; may return a promise or a plain value.
 * @param {number} ms - Deadline in milliseconds.
 * @returns {Promise<any>} Resolves/rejects with the outcome of `run`, or rejects on timeout.
 */
function withTimeout(run, ms) {
  return new Promise((resolve, reject) => {
    const controller = new AbortController();
    const timer = setTimeout(() => {
      controller.abort();
      reject(new Error(`Request timed out after ${ms}ms`));
    }, ms);
    // Wrap a settle callback so the timer is always cleared before settling.
    const settleWith = (finish) => (value) => {
      clearTimeout(timer);
      finish(value);
    };
    try {
      // Promise.resolve accepts both promise and plain return values.
      Promise.resolve(run(controller.signal)).then(settleWith(resolve), settleWith(reject));
    } catch (err) {
      // `run` threw before returning anything: clear the timer and reject now.
      settleWith(reject)(err);
    }
  });
}
 async function runBenchmarks(options) {
   const { providers, tasks, scorers, runs, onResult } = options;
+  const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
   const results = [];
   for (const task of tasks) {
     for (const provider of providers) {
       for (let run = 1; run <= runs; run++) {
         let result;
         try {
-          const taskResult = await provider.run({
+          const taskResult = await withTimeout((signal) => provider.run({
             prompt: task.prompt,
             schema: task.schema,
-            tools: task.tools
-          });
+            tools: task.tools,
+            signal
+          }), timeout);
           const scores = await Promise.all(
             scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
           );
@@ -1670,20 +1836,162 @@ var green = "\x1B[32m";
 var red = "\x1B[31m";
 var yellow = "\x1B[33m";
 var cyan = "\x1B[36m";
+var brightGreen = "\x1B[92m";
+var brightWhite = "\x1B[97m";
 function bold(s) {
   return `${boldCode}${s}${reset}`;
 }
 function dim(s) {
   return `${dimCode}${s}${reset}`;
 }
-function colorScore(value) {
-  const pct = Math.round(value * 100);
-  const str = `${pct}%`;
-  if (value >= 0.8) return `${green}${str}${reset}`;
-  if (value >= 0.5) return `${yellow}${str}${reset}`;
-  return `${red}${str}${reset}`;
/**
 * Removes escape sequences of the form ESC `[` digits/semicolons `m` from a string.
 *
 * @param {string} s - Text that may contain such sequences.
 * @returns {string} The text with every matching sequence removed.
 */
function stripAnsi(s) {
  const colorSequence = /\x1b\[[0-9;]*m/g;
  return s.replace(colorSequence, "");
}
/**
 * Estimates how many terminal columns a string occupies.
 *
 * Escape sequences are removed with `stripAnsi` first. Each remaining code
 * point counts as two columns when it is at or above U+1F000 (126976) or lies
 * in U+2600–U+27BF (9728–10175); every other code point counts as one.
 *
 * @param {string} s - Text to measure.
 * @returns {number} Estimated column count.
 */
function displayWidth(s) {
  // Array.from splits by code point, so surrogate pairs are counted once.
  return Array.from(stripAnsi(s)).reduce((columns, glyph) => {
    const codePoint = glyph.codePointAt(0) ?? 0;
    const isWide = codePoint >= 126976 || (codePoint >= 9728 && codePoint <= 10175);
    return columns + (isWide ? 2 : 1);
  }, 0);
}
/**
 * Pads `str` with spaces until its display width reaches `targetWidth`.
 *
 * @param {string} str - Cell content.
 * @param {number} targetWidth - Desired width in columns; wider content is left untouched.
 * @param {string} align - "right" puts the spaces before the text; anything else puts them after.
 * @returns {string} The padded cell.
 */
function padCell(str, targetWidth, align) {
  const missing = Math.max(0, targetWidth - displayWidth(str));
  const gap = " ".repeat(missing);
  return align === "right" ? gap + str : str + gap;
}
/**
 * Builds the two halves of a fixed-width bar: a filled part and an empty track.
 *
 * `ratio` is clamped to [0, 1]; a ratio that is NaN (or not a number at all)
 * is treated as 0 so the bar keeps its full width instead of collapsing.
 * A width that is not a positive finite number yields two empty strings
 * rather than a RangeError from `String.prototype.repeat`.
 *
 * @param {number} ratio - Fraction of the bar to fill.
 * @param {number} [width=8] - Total bar length in characters.
 * @returns {{fill: string, track: string}} Filled segment and remaining track.
 */
function sparkBar(ratio, width = 8) {
  const numericWidth = Number(width);
  const size = Number.isFinite(numericWidth) && numericWidth > 0 ? numericWidth : 0;
  const clamped = Math.max(0, Math.min(1, ratio));
  // Math.min/Math.max propagate NaN, so check after clamping.
  const share = Number.isNaN(clamped) ? 0 : clamped;
  const fillLen = Math.round(share * size);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(size - fillLen);
  return { fill, track };
}
/**
 * Draws one horizontal border line of the table, dimmed.
 *
 * "bottom" and "merge" produce an unbroken rule across the whole inner width;
 * "top" and every other value produce one segment per column joined by a
 * junction character.
 *
 * @param {number[]} widths - Content width of each column (each gets 2 extra cells of padding).
 * @param {string} position - "top", "bottom", "merge", or anything else for a header-style separator.
 * @returns {string} The border line.
 */
function drawTableLine(widths, position) {
  const rule = "\u2500";
  // Inner width: every column plus its 2 padding cells, plus one separator between columns.
  const innerWidth = () => widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
  const segmented = (junction) => widths.map((w) => rule.repeat(w + 2)).join(junction);
  switch (position) {
    case "bottom":
      return dim(`\u2514${rule.repeat(innerWidth())}\u2518`);
    case "merge":
      return dim(`\u251C${rule.repeat(innerWidth())}\u2524`);
    case "top":
      return dim(`\u250C${segmented("\u252C")}\u2510`);
    default:
      return dim(`\u251C${segmented("\u253C")}\u2524`);
  }
}
/**
 * Draws one table row: each cell padded to its column width with one space on
 * either side, separated and enclosed by dimmed vertical bars.
 *
 * @param {string[]} cells - Cell contents, one per column.
 * @param {number[]} widths - Column widths, indexed like `cells`.
 * @param {string[]} aligns - Column alignments, indexed like `cells`.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const bar = dim("\u2502");
  const padded = cells.map((cell, column) => {
    return " " + padCell(cell, widths[column], aligns[column]) + " ";
  });
  return bar + padded.join(bar) + bar;
}
/**
 * Draws a row whose single cell spans every column of the table.
 *
 * The content is preceded by one space and followed by enough spaces to reach
 * the table's inner width; content wider than that is not truncated.
 *
 * @param {string} content - Text to show.
 * @param {number[]} widths - Column widths of the table.
 * @returns {string} The rendered row.
 */
function drawSpanRow(content, widths) {
  const edge = dim("\u2502");
  const innerWidth = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
  // The extra -1 accounts for the leading space before the content.
  const trailing = Math.max(0, innerWidth - displayWidth(content) - 1);
  return edge + " " + content + " ".repeat(trailing) + edge;
}
/**
 * Collects, per table column, each provider's value plus the best and worst
 * value among providers that did not fail entirely.
 *
 * Keys are added in this order: "latency" (if requested), then "cost" and
 * "tokens" (if "cost" is requested), then every other scorer name in the order
 * given. For latency, cost and tokens the smallest value is `best`; for all
 * other scorers the largest value is `best`.
 *
 * @param {object[]} providerData - Entries with `providerId`, `allErrors`,
 *   `latencyMs`, `avgDetails.costUsd`, `avgDetails.totalTokens`, `avgScores`.
 * @param {string[]} scorerNames - Scorer names present in the results.
 * @returns {Map<string, {values: Map<string, number|undefined>, best: number|undefined, worst: number|undefined}>}
 */
function computeColumnStats(providerData, scorerNames) {
  const stats = /* @__PURE__ */ new Map();
  const valid = providerData.filter((p) => !p.allErrors);

  // One column summary: per-provider values plus best/worst over valid providers.
  const summarize = (pick, lowerIsBetter) => {
    const values = /* @__PURE__ */ new Map();
    for (const p of providerData) {
      // Providers whose runs all failed are listed with no value.
      values.set(p.providerId, p.allErrors ? void 0 : pick(p));
    }
    const nums = valid.map((p) => pick(p)).filter((v) => v !== void 0);
    if (nums.length === 0) {
      return { values, best: void 0, worst: void 0 };
    }
    const lowest = Math.min(...nums);
    const highest = Math.max(...nums);
    return lowerIsBetter
      ? { values, best: lowest, worst: highest }
      : { values, best: highest, worst: lowest };
  };

  if (scorerNames.includes("latency")) {
    stats.set("latency", summarize((p) => p.latencyMs, true));
  }
  if (scorerNames.includes("cost")) {
    stats.set("cost", summarize((p) => p.avgDetails.costUsd, true));
    stats.set("tokens", summarize((p) => p.avgDetails.totalTokens, true));
  }
  for (const name of scorerNames) {
    if (name === "latency" || name === "cost") continue;
    stats.set(name, summarize((p) => p.avgScores[name], false));
  }
  return stats;
}
/**
 * Colors a cell's text according to how its value ranks within the column.
 *
 * An undefined value renders as a dimmed em dash. With fewer than two
 * providers, or when the column has no best/worst or they are equal, the text
 * is returned uncolored. Otherwise the best value is bright green and bold,
 * the worst is red, and everything in between is yellow.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number|undefined} value - Numeric value behind the text.
 * @param {{best: number|undefined, worst: number|undefined}} colStats - Column summary.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} The text, possibly wrapped in color codes.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) return dim("\u2014");
  if (providerCount < 2) return text;
  const { best, worst } = colStats;
  const unranked = best === void 0 || worst === void 0 || best === worst;
  if (unranked) return text;
  switch (value) {
    case best:
      return `${brightGreen}${boldCode}${text}${reset}`;
    case worst:
      return `${red}${text}${reset}`;
    default:
      return `${yellow}${text}${reset}`;
  }
}
-function consoleReporter(results) {
/**
 * Assigns a medal string to each provider based on how many columns it wins.
 *
 * A provider earns one win per column in which its value equals that column's
 * `best`. Providers are ordered by wins (descending), ties broken by id; tied
 * providers share a rank, and the next distinct count takes the rank of its
 * position. The first three ranks map to the gold, silver and bronze emoji;
 * later ranks get an empty string. With fewer than two providers, or when
 * nobody wins any column, every provider gets an empty string.
 *
 * @param {Map<string, {values: Map<string, number|undefined>, best: number|undefined}>} columnStats
 * @param {string[]} providerIds - Providers to rank.
 * @returns {Map<string, string>} Medal (or "") per provider id.
 */
function computeMedals(columnStats, providerIds) {
  const withoutMedals = () => {
    const blank = /* @__PURE__ */ new Map();
    for (const id of providerIds) blank.set(id, "");
    return blank;
  };
  if (providerIds.length < 2) return withoutMedals();

  const tally = /* @__PURE__ */ new Map();
  for (const id of providerIds) tally.set(id, 0);
  for (const [, column] of columnStats) {
    if (column.best === void 0) continue;
    for (const [providerId, value] of column.values) {
      const isWinner = value !== void 0 && value === column.best;
      if (isWinner) tally.set(providerId, (tally.get(providerId) ?? 0) + 1);
    }
  }

  let totalWins = 0;
  for (const count of tally.values()) totalWins += count;
  if (totalWins === 0) return withoutMedals();

  const ranked = [...tally.entries()].sort(
    ([idA, winsA], [idB, winsB]) => winsB - winsA || idA.localeCompare(idB)
  );
  const podium = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
  const medals = /* @__PURE__ */ new Map();
  let rank = 0;
  ranked.forEach(([id, count], position) => {
    // Only a strictly lower win count moves the rank forward; ties keep it.
    if (position > 0 && count < ranked[position - 1][1]) rank = position;
    medals.set(id, rank < podium.length ? podium[rank] : "");
  });
  return medals;
}
+function consoleReporter(results, options) {
+  const showSparklines = options?.sparklines ?? true;
   if (results.length === 0) {
     console.log("\nNo results to display.\n");
     return;
@@ -1693,78 +2001,155 @@ function consoleReporter(results) {
   const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
   const hasCost = scorerNames.includes("cost");
   const hasErrors = results.some((r) => r.error);
+  const multi = providers.length >= 2;
   const runsPerCell = Math.max(...results.map((r) => r.run));
-  const runLabel = runsPerCell > 1 ? ` (${runsPerCell} runs each)` : "";
+  const runLabel = runsPerCell > 1 ? `  ${dim(`(${runsPerCell} runs each)`)}` : "";
   console.log("");
-  console.log(`  ${bold(`\u2B21 Agent Duelist Results${runLabel}`)}`);
-  console.log(`  ${dim("\u2500".repeat(70))}`);
+  console.log(`  ${brightWhite}${boldCode}\u2B21  Agent Duelist${reset}${runLabel}`);
+  console.log(`  ${dim("\u2501".repeat(72))}`);
   console.log("");
   for (const task of tasks) {
     console.log(`  ${bold(`Task: ${task}`)}`);
-    const cols = [{ label: "Provider", width: 22, align: "left" }];
-    for (const name of scorerNames) {
-      if (name === "latency") cols.push({ label: "Latency", width: 10, align: "right" });
-      else if (name === "cost") {
-        cols.push({ label: "Cost", width: 12, align: "right" });
-        cols.push({ label: "Tokens", width: 9, align: "right" });
-      } else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
-      else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
-      else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
-      else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
-      else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
-      else cols.push({ label: name, width: 10, align: "right" });
-    }
-    if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
-    const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
-    console.log(`  ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
-    console.log(`  ${dim("\u2500".repeat(totalWidth))}`);
-    for (const provider of providers) {
-      const taskResults = results.filter(
-        (r) => r.taskName === task && r.providerId === provider
-      );
+    console.log("");
+    const providerData = providers.map((providerId) => {
+      const taskResults = results.filter((r) => r.taskName === task && r.providerId === providerId);
       const errorResults2 = taskResults.filter((r) => r.error);
       const successResults = taskResults.filter((r) => !r.error);
-      if (successResults.length === 0 && errorResults2.length > 0) {
-        const cells2 = [pad(provider, 24, "left")];
-        for (const name of scorerNames) {
-          if (name === "cost") {
-            cells2.push(pad("\u2014", 14, "right"));
-            cells2.push(pad("\u2014", 11, "right"));
-          } else cells2.push(pad("\u2014", cols.find((c) => c.label !== "Provider").width + 2, "right"));
-        }
-        if (hasErrors) cells2.push(`  ${red}FAIL${reset}`);
-        console.log(`  ${cells2.join("")}`);
-        continue;
+      if (successResults.length === 0) {
+        return {
+          providerId,
+          avgScores: {},
+          avgDetails: { costUsd: void 0, totalTokens: void 0 },
+          latencyMs: void 0,
+          allErrors: errorResults2.length > 0,
+          errorCount: errorResults2.length
+        };
+      }
+      return {
+        providerId,
+        avgScores: averageScores(successResults),
+        avgDetails: averageDetails(successResults),
+        latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
+        allErrors: false,
+        errorCount: errorResults2.length
+      };
+    });
+    const columnStats = computeColumnStats(providerData, scorerNames);
+    const medals = computeMedals(columnStats, providers);
+    const maxProviderLen = Math.max(...providers.map((id) => id.length));
+    const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
+    const cols = [
+      { label: "Provider", width: providerWidth, align: "left" }
+    ];
+    for (const name of scorerNames) {
+      if (name === "latency") {
+        cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
+      } else if (name === "cost") {
+        cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
+        cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
+      } else {
+        const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
+        cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
       }
-      const avgScores = averageScores(successResults);
-      const avgDetails = averageDetails(successResults);
-      const latencyMs = average(successResults.map((r) => r.raw.latencyMs));
-      const cells = [pad(provider, 24, "left")];
-      for (const name of scorerNames) {
-        if (name === "latency") {
-          cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
-        } else if (name === "cost") {
-          cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
-          cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
-        } else {
-          const val = avgScores[name];
-          if (val === void 0) cells.push(pad("\u2014", 10, "right"));
-          else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
+    }
+    if (hasErrors) {
+      cols.push({ label: "Status", width: 8, align: "left" });
+    }
+    const widths = cols.map((c) => c.width);
+    const aligns = cols.map((c) => c.align);
+    console.log(`  ${drawTableLine(widths, "top")}`);
+    const headerCells = cols.map((c) => bold(c.label));
+    console.log(`  ${drawTableRow(headerCells, widths, aligns)}`);
+    console.log(`  ${drawTableLine(widths, "header")}`);
+    for (const pd of providerData) {
+      const medal = medals.get(pd.providerId) ?? "";
+      const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
+      const cells = [providerCell];
+      if (pd.allErrors) {
+        for (const col of cols.slice(1)) {
+          if (col.label === "Status") {
+            cells.push(`${red}FAIL${reset}`);
+          } else {
+            cells.push(dim("\u2014"));
+          }
+        }
+      } else {
+        for (const col of cols.slice(1)) {
+          if (col.label === "Status") {
+            cells.push(
+              pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
+            );
+            continue;
+          }
+          const statsKey = col.statsKey;
+          const colStats = columnStats.get(statsKey);
+          if (statsKey === "latency") {
+            const ms = pd.latencyMs;
+            if (ms === void 0) {
+              cells.push(dim("\u2014"));
+            } else {
+              const text = `${Math.round(ms)}ms`;
+              cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
+            }
+          } else if (statsKey === "cost") {
+            const cost = pd.avgDetails.costUsd;
+            if (cost === void 0) {
+              cells.push(dim("\u2014"));
+            } else {
+              const text = formatCost(cost);
+              cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
+            }
+          } else if (statsKey === "tokens") {
+            const tokens = pd.avgDetails.totalTokens;
+            if (tokens === void 0) {
+              cells.push(dim("\u2014"));
+            } else {
+              const text = `${tokens}`;
+              cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
+            }
+          } else {
+            const val = pd.avgScores[statsKey];
+            if (val === void 0) {
+              cells.push(dim("\u2014"));
+            } else {
+              const pctStr = `${Math.round(val * 100)}%`.padStart(4);
+              let coloredPct;
+              if (multi && colStats) {
+                coloredPct = colorByRank(pctStr, val, colStats, providers.length);
+              } else {
+                if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
+                else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
+                else coloredPct = `${red}${pctStr}${reset}`;
+              }
+              if (showSparklines) {
+                const { fill, track } = sparkBar(val);
+                const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
+                cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
+              } else {
+                cells.push(coloredPct);
+              }
+            }
+          }
         }
       }
-      if (hasErrors) {
-        const failCount = errorResults2.length;
-        cells.push(failCount > 0 ? `  ${yellow}${failCount} err${reset}` : `  ${green}OK${reset}`);
+      console.log(`  ${drawTableRow(cells, widths, aligns)}`);
+    }
+    if (multi && providerData.some((p) => !p.allErrors)) {
+      const winnerId = [...medals.entries()].find(([, m]) => m === "\u{1F947}")?.[0];
+      if (winnerId) {
+        console.log(`  ${drawTableLine(widths, "merge")}`);
+        const winnerText = `${brightGreen}${boldCode}\u{1F3C6}  Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
+        console.log(`  ${drawSpanRow(winnerText, widths)}`);
       }
-      console.log(`  ${cells.join("")}`);
     }
+    console.log(`  ${drawTableLine(widths, "bottom")}`);
     console.log("");
   }
   printSummary(results, providers);
   const errorResults = results.filter((r) => r.error);
   if (errorResults.length > 0) {
     console.log(`  ${bold("Errors")}`);
-    console.log(`  ${dim("\u2500".repeat(70))}`);
+    console.log(`  ${dim("\u2501".repeat(72))}`);
     const seen = /* @__PURE__ */ new Set();
     for (const r of errorResults) {
       const key = `${r.providerId}::${r.error}`;
@@ -1772,7 +2157,7 @@ function consoleReporter(results) {
       seen.add(key);
       const count = errorResults.filter((e) => e.providerId === r.providerId && e.error === r.error).length;
       const suffix = count > 1 ? ` (\xD7${count})` : "";
-      console.log(`  ${red}\u2717${reset} ${r.providerId}: ${r.error}${suffix}`);
+      console.log(`  ${red}\u2716${reset} ${r.providerId}: ${r.error}${suffix}`);
       const hint = apiKeyHint(r.providerId, r.error ?? "");
       if (hint) console.log(`    ${dim(hint)}`);
     }
@@ -1786,15 +2171,20 @@ function consoleReporter(results) {
 function printSummary(results, providers) {
   const successResults = results.filter((r) => !r.error);
   if (successResults.length === 0) return;
-  console.log(`  ${dim("\u2500".repeat(70))}`);
   console.log(`  ${bold("Summary")}`);
+  console.log(`  ${dim("\u2501".repeat(72))}`);
   console.log("");
   const single = providers.length === 1;
   const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
   const byCorrectness = rankProviders(successResults, providers, correctnessKey);
   if (byCorrectness) {
-    const label = single ? "Avg correctness" : `Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}`;
-    console.log(`  ${cyan}\u25C6${reset} ${label} (avg ${colorScore(byCorrectness.avg)})`);
+    const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
+    const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
+    if (single) {
+      console.log(`  ${medal} Avg correctness:  ${brightGreen}${boldCode}${pctStr}${reset}`);
+    } else {
+      console.log(`  ${medal} Most correct:  ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}  ${brightGreen}${boldCode}${pctStr}${reset}`);
+    }
   }
   const byLatency = providers.map((id) => {
     const runs = successResults.filter((r) => r.providerId === id);
@@ -1802,8 +2192,13 @@ function printSummary(results, providers) {
     return { id, avg: avg ?? Infinity };
   }).sort((a, b) => a.avg - b.avg)[0];
   if (byLatency && byLatency.avg !== Infinity) {
-    const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
-    console.log(`  ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
+    const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
+    const msStr = `${Math.round(byLatency.avg)}ms`;
+    if (single) {
+      console.log(`  ${medal} Avg latency:     ${brightGreen}${boldCode}${msStr}${reset}`);
+    } else {
+      console.log(`  ${medal} Fastest:       ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}  ${brightGreen}${boldCode}${msStr}${reset}`);
+    }
   }
   const byCost = providers.map((id) => {
     const runs = successResults.filter((r) => r.providerId === id);
@@ -1815,8 +2210,32 @@ function printSummary(results, providers) {
     return { id, avg };
   }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
   if (byCost?.avg !== void 0) {
-    const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
-    console.log(`  ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
+    const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
+    const costStr = formatCost(byCost.avg);
+    if (single) {
+      console.log(`  ${medal} Avg cost:        ${brightGreen}${boldCode}${costStr}${reset}`);
+    } else {
+      console.log(`  ${medal} Cheapest:      ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}  ${brightGreen}${boldCode}${costStr}${reset}`);
+    }
+  }
+  if (!single) {
+    const wins = /* @__PURE__ */ new Map();
+    for (const id of providers) wins.set(id, 0);
+    if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
+    if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
+    if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
+    const maxWins = Math.max(...wins.values());
+    if (maxWins > 0) {
+      const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
+      console.log("");
+      if (topProviders.length === 1) {
+        const [winnerId, winCount] = topProviders[0];
+        console.log(`  \u{1F3C6} Overall:      ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))}  ${dim(`(${winCount}/3 categories)`)}`);
+      } else {
+        const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
+        console.log(`  \u{1F3C6} Overall:      ${names}  ${dim(`(tied at ${maxWins}/3)`)}`);
+      }
+    }
   }
   console.log("");
 }
@@ -1878,14 +2297,6 @@ function formatCost(usd) {
   const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
   return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
 }
-function pad(str, width, align) {
-  if (align === "right") return str.padStart(width);
-  return str.padEnd(width);
-}
-function colorLen(str) {
-  const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
-  return str.length - stripped.length;
-}
 function apiKeyHint(providerId, error) {
   const lower = error.toLowerCase();
   const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
@@ -1989,7 +2400,7 @@ function defineArena(config) {
     throw new Error("At least one task is required");
   }
   const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
-  const scorerFns = resolveScorers(scorerNames, config.judgeModel);
+  const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
   const runs = config.runs ?? 1;
   return {
     config,
@@ -1999,141 +2410,13 @@ function defineArena(config) {
         tasks: config.tasks,
         scorers: scorerFns,
         runs,
+        timeout: config.timeout,
         onResult: options?.onResult
       });
     }
   };
 }
-// src/providers/openai.ts
-var import_openai2 = __toESM(require("openai"), 1);
-var import_zod_to_json_schema = require("zod-to-json-schema");
-function openai(model, options) {
-  const client = new import_openai2.default({
-    apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
-    baseURL: options?.baseURL
-  });
-  return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
-}
-function openaiCompatible(options) {
-  const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
-  const client = new import_openai2.default({
-    apiKey,
-    baseURL: options.baseURL
-  });
-  if (options.free) {
-    registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
-  }
-  return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
-}
-function azureOpenai(model, options) {
-  const deployment = options?.deployment ?? model;
-  const client = new import_openai2.AzureOpenAI({
-    apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
-    endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
-    apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
-    deployment
-  });
-  return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
-}
-function makeProvider(id, name, model, client, requestModel, stripThinking) {
-  return {
-    id,
-    name,
-    model,
-    async run(input) {
-      const start = Date.now();
-      const params = {
-        model: requestModel,
-        messages: [{ role: "user", content: input.prompt }]
-      };
-      if (input.schema) {
-        params.response_format = { type: "json_object" };
-        params.messages = [
-          { role: "system", content: "Respond with valid JSON matching the requested schema." },
-          ...params.messages
-        ];
-      }
-      if (input.tools?.length) {
-        params.tools = input.tools.map(toolDefToOpenAI);
-        params.tool_choice = "auto";
-      }
-      const response = await client.chat.completions.create(params);
-      let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
-      let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
-      const choice = response.choices[0];
-      const toolCallsRaw = choice?.message?.tool_calls;
-      const collectedToolCalls = [];
-      let finalResponse = response;
-      if (toolCallsRaw?.length && input.tools?.length) {
-        const toolMessages = [
-          ...params.messages,
-          choice.message
-        ];
-        for (const tc of toolCallsRaw) {
-          const toolDef = input.tools.find((t) => t.name === tc.function.name);
-          let args;
-          try {
-            args = JSON.parse(tc.function.arguments);
-          } catch {
-            args = tc.function.arguments;
-          }
-          let result;
-          if (toolDef?.handler) {
-            result = await toolDef.handler(args);
-          }
-          collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
-          toolMessages.push({
-            role: "tool",
-            tool_call_id: tc.id,
-            content: JSON.stringify(result ?? {})
-          });
-        }
-        const followUp = await client.chat.completions.create({
-          model: requestModel,
-          messages: toolMessages
-        });
-        totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
-        totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
-        finalResponse = followUp;
-      }
-      const latencyMs = Date.now() - start;
-      const finalChoice = finalResponse.choices[0];
-      let rawContent = finalChoice?.message?.content ?? "";
-      if (stripThinking) {
-        rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
-      }
-      let output = rawContent;
-      if (input.schema) {
-        try {
-          output = JSON.parse(rawContent);
-        } catch {
-        }
-      }
-      return {
-        output,
-        usage: {
-          promptTokens: totalPromptTokens || void 0,
-          completionTokens: totalCompletionTokens || void 0
-        },
-        latencyMs,
-        raw: finalResponse,
-        toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
-      };
-    }
-  };
-}
-function toolDefToOpenAI(tool) {
-  return {
-    type: "function",
-    function: {
-      name: tool.name,
-      description: tool.description,
-      parameters: (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" })
-    }
-  };
-}
 // src/providers/anthropic.ts
 var import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
 function anthropic(model, options) {
@@ -2153,7 +2436,7 @@ function anthropic(model, options) {
         max_tokens: maxTokens,
         system: systemMessage,
         messages: [{ role: "user", content: input.prompt }]
-      });
+      }, { signal: input.signal });
       const latencyMs = Date.now() - start;
       const textBlock = response.content.find((b) => b.type === "text");
       const rawContent = textBlock?.type === "text" ? textBlock.text : "";
@@ -2178,7 +2461,7 @@ function anthropic(model, options) {
 }
 // src/providers/gemini.ts
-var import_openai3 = __toESM(require("openai"), 1);
+var import_openai4 = __toESM(require("openai"), 1);
 function gemini(model, options) {
   const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
   if (!apiKey) {
@@ -2186,22 +2469,409 @@ function gemini(model, options) {
       `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
     );
   }
-  const client = new import_openai3.default({
+  const client = new import_openai4.default({
     apiKey,
-    baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
+    baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
+    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
   });
   return makeProvider(`google/${model}`, "Google AI", model, client, model);
 }
+// src/reporter/markdown.ts
+var COMMENT_MARKER = "<!-- duelist-ci-report -->";
+function markdownReporter(report, _current) {
+  const lines = [COMMENT_MARKER, ""];
+  const status = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
+  lines.push(`## \u2B21 Agent Duelist CI \u2014 ${status}`);
+  lines.push("");
+  if (report.comparisons.length > 0) {
+    lines.push(markdownComparisonTable(report.comparisons));
+    lines.push("");
+  }
+  if (report.cost.totalUsd > 0 || report.cost.budget !== void 0) {
+    lines.push(markdownCostSummary(report.cost));
+    lines.push("");
+  }
+  if (report.flakyResults.length > 0) {
+    lines.push("### \u26A0\uFE0F Flaky Results");
+    lines.push("");
+    lines.push("These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.");
+    lines.push("");
+    for (const f of report.flakyResults) {
+      lines.push(`- **${f.providerId}** \xD7 ${f.taskName} \u2192 ${f.scorerName} (CV = ${f.current.cv.toFixed(2)})`);
+    }
+    lines.push("");
+  }
+  if (report.failureReasons.length > 0) {
+    lines.push("### Failure Reasons");
+    lines.push("");
+    for (const reason of report.failureReasons) {
+      lines.push(`- ${reason}`);
+    }
+    lines.push("");
+  }
+  lines.push("---");
+  lines.push("*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
+  return lines.join("\n");
+}
+function markdownComparisonTable(comparisons) {
+  const lines = [];
+  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
+  lines.push("|----------|------|--------|----------|---------|-------|--------|");
+  for (const c of comparisons) {
+    const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
+    const currentStr = formatStats(c.current);
+    const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
+    const status = statusIndicator(c);
+    lines.push(`| ${c.providerId} | ${c.taskName} | ${c.scorerName} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
+  }
+  return lines.join("\n");
+}
+function markdownCostSummary(cost) {
+  const lines = [];
+  lines.push("### \u{1F4B0} Cost Summary");
+  lines.push("");
+  lines.push(`**Total:** $${cost.totalUsd.toFixed(4)}`);
+  if (cost.budget !== void 0) {
+    const pct = cost.budget > 0 ? (cost.totalUsd / cost.budget * 100).toFixed(0) : "\u221E";
+    const indicator = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
+    lines.push(`**Budget:** $${cost.budget.toFixed(2)} (${pct}% used) ${indicator}`);
+  }
+  if (cost.perProvider.size > 1) {
+    lines.push("");
+    lines.push("| Provider | Cost |");
+    lines.push("|----------|------|");
+    for (const [provider, usd] of cost.perProvider) {
+      lines.push(`| ${provider} | $${usd.toFixed(4)} |`);
+    }
+  }
+  return lines.join("\n");
+}
+function formatStats(stats) {
+  if (stats.n > 1) {
+    const margin = (stats.ci95Upper - stats.ci95Lower) / 2;
+    return `${stats.mean.toFixed(3)} \xB1 ${margin.toFixed(3)}`;
+  }
+  return stats.mean.toFixed(3);
+}
+function formatDelta(delta) {
+  const sign = delta >= 0 ? "+" : "";
+  return `${sign}${delta.toFixed(3)}`;
+}
+function statusIndicator(c) {
+  if (c.regressed) return "\u{1F534} regressed";
+  if (c.improved) return "\u{1F7E2} improved";
+  if (c.baseline === null) return "\u{1F195} new";
+  return "\u26AA unchanged";
+}
+// src/ci.ts
+var import_node_fs = require("fs");
+var import_node_path = require("path");
+var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
+var FLAKY_CV_THRESHOLD = 0.3;
+var T_CRITICAL_95 = {
+  1: 12.706,
+  2: 4.303,
+  3: 3.182,
+  4: 2.776,
+  5: 2.571,
+  6: 2.447,
+  7: 2.365,
+  8: 2.306,
+  9: 2.262,
+  10: 2.228,
+  15: 2.131,
+  20: 2.086,
+  25: 2.06,
+  30: 2.042
+};
+function tCritical(df) {
+  if (df <= 0) return 1.96;
+  if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
+  const keys = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
+  if (df > keys[keys.length - 1]) return 1.96;
+  for (let i = 0; i < keys.length - 1; i++) {
+    if (df > keys[i] && df < keys[i + 1]) {
+      const low = keys[i], high = keys[i + 1];
+      const ratio = (df - low) / (high - low);
+      return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
+    }
+  }
+  return 1.96;
+}
+function computeScorerStats(samples) {
+  const n = samples.length;
+  if (n === 0) {
+    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
+  }
+  const mean = samples.reduce((a, b) => a + b, 0) / n;
+  if (n === 1) {
+    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
+  }
+  const variance = samples.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (n - 1);
+  const stddev = Math.sqrt(variance);
+  const cv = mean !== 0 ? stddev / Math.abs(mean) : 0;
+  const se = stddev / Math.sqrt(n);
+  const t = tCritical(n - 1);
+  return {
+    mean,
+    stddev,
+    cv,
+    n,
+    ci95Lower: mean - t * se,
+    ci95Upper: mean + t * se
+  };
+}
+function groupKey(providerId, taskName, scorerName) {
+  return `${providerId}::${taskName}::${scorerName}`;
+}
+function computeStats(results) {
+  const grouped = /* @__PURE__ */ new Map();
+  for (const r of results) {
+    if (r.error) continue;
+    for (const score of r.scores) {
+      if (score.value < 0) continue;
+      const key = groupKey(r.providerId, r.taskName, score.name);
+      if (!grouped.has(key)) grouped.set(key, []);
+      grouped.get(key).push(score.value);
+    }
+  }
+  const stats = /* @__PURE__ */ new Map();
+  for (const [key, samples] of grouped) {
+    stats.set(key, computeScorerStats(samples));
+  }
+  return stats;
+}
+function computeCostSummary(results, budget) {
+  let totalUsd = 0;
+  const perProvider = /* @__PURE__ */ new Map();
+  for (const r of results) {
+    if (r.error) continue;
+    const costScore = r.scores.find((s) => s.name === "cost");
+    if (!costScore || costScore.value < 0) continue;
+    const details = costScore.details;
+    const usd = details?.estimatedUsd ?? 0;
+    if (usd <= 0) continue;
+    totalUsd += usd;
+    perProvider.set(r.providerId, (perProvider.get(r.providerId) ?? 0) + usd);
+  }
+  return {
+    totalUsd,
+    perProvider,
+    budget,
+    overBudget: budget !== void 0 && totalUsd > budget
+  };
+}
+function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
+  const comparisons = [];
+  const failureReasons = [];
+  for (const [key, current] of currentStats) {
+    const [providerId, taskName, scorerName] = key.split("::");
+    const baseline = baselineStats?.get(key) ?? null;
+    let delta = null;
+    let regressed = false;
+    let improved = false;
+    if (baseline) {
+      delta = current.mean - baseline.mean;
+      const threshold = thresholds.get(scorerName);
+      if (threshold !== void 0) {
+        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
+        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
+        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
+      }
+    }
+    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
+    comparisons.push({
+      providerId,
+      taskName,
+      scorerName,
+      baseline,
+      current,
+      delta,
+      regressed,
+      improved,
+      flaky
+    });
+  }
+  const cost = computeCostSummary(currentResults ?? [], budget);
+  const regressions = comparisons.filter((c) => c.regressed);
+  if (regressions.length > 0) {
+    for (const r of regressions) {
+      failureReasons.push(
+        `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta2(r.delta)}`
+      );
+    }
+  }
+  if (cost.overBudget) {
+    failureReasons.push(
+      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
+    );
+  }
+  const flakyResults = comparisons.filter((c) => c.flaky);
+  const failed = failureReasons.length > 0;
+  return { comparisons, cost, failed, flakyResults, failureReasons };
+}
+function detectRegression(baseline, current, threshold, lowerIsBetter) {
+  if (baseline.n === 1 && current.n === 1) {
+    const delta = current.mean - baseline.mean;
+    if (lowerIsBetter) return delta > threshold;
+    return delta < -threshold;
+  }
+  if (lowerIsBetter) {
+    return current.ci95Lower - baseline.ci95Upper > threshold;
+  }
+  return baseline.ci95Upper - current.ci95Lower > threshold && current.mean < baseline.mean;
+}
+function detectImprovement(baseline, current, threshold, lowerIsBetter) {
+  if (baseline.n === 1 && current.n === 1) {
+    const delta = current.mean - baseline.mean;
+    if (lowerIsBetter) return delta < -threshold;
+    return delta > threshold;
+  }
+  if (lowerIsBetter) {
+    return baseline.ci95Lower - current.ci95Upper > threshold;
+  }
+  return current.ci95Lower - baseline.ci95Upper > threshold;
+}
+function formatDelta2(delta) {
+  const sign = delta >= 0 ? "+" : "";
+  return `${sign}${delta.toFixed(4)}`;
+}
+function loadBaseline(path) {
+  try {
+    const raw = (0, import_node_fs.readFileSync)(path, "utf-8");
+    const data = JSON.parse(raw);
+    const results = data.results ?? data;
+    if (!Array.isArray(results)) return null;
+    return {
+      timestamp: data.timestamp ?? "unknown",
+      results
+    };
+  } catch {
+    return null;
+  }
+}
+function saveBaseline(path, results) {
+  (0, import_node_fs.mkdirSync)((0, import_node_path.dirname)(path), { recursive: true });
+  const data = {
+    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+    results
+  };
+  (0, import_node_fs.writeFileSync)(path, JSON.stringify(data, null, 2));
+}
+// src/github.ts
+var import_node_fs2 = require("fs");
+function detectGitHubContext() {
+  const token = process.env.GITHUB_TOKEN;
+  const repository = process.env.GITHUB_REPOSITORY;
+  const eventPath = process.env.GITHUB_EVENT_PATH;
+  if (!token || !repository) return null;
+  const [owner, repo] = repository.split("/");
+  if (!owner || !repo) return null;
+  let prNumber;
+  if (eventPath) {
+    try {
+      const event = JSON.parse((0, import_node_fs2.readFileSync)(eventPath, "utf-8"));
+      if (event.pull_request && typeof event.pull_request === "object") {
+        const pr = event.pull_request;
+        prNumber = pr.number;
+      }
+      if (!prNumber && event.issue && typeof event.issue === "object") {
+        const issue = event.issue;
+        if (issue.pull_request) {
+          prNumber = issue.number;
+        }
+      }
+    } catch {
+    }
+  }
+  if (!prNumber && process.env.DUELIST_PR_NUMBER) {
+    prNumber = parseInt(process.env.DUELIST_PR_NUMBER, 10);
+  }
+  if (!prNumber) return null;
+  return { token, owner, repo, prNumber };
+}
+var API_BASE = "https://api.github.com";
+async function findExistingComment(ctx, marker) {
+  let page = 1;
+  const perPage = 50;
+  while (true) {
+    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
+    const res = await fetch(url, {
+      headers: {
+        Authorization: `Bearer ${ctx.token}`,
+        Accept: "application/vnd.github+json",
+        "X-GitHub-Api-Version": "2022-11-28"
+      }
+    });
+    if (!res.ok) return null;
+    const comments = await res.json();
+    if (comments.length === 0) break;
+    for (const comment of comments) {
+      if (comment.body?.includes(marker)) {
+        return comment.id;
+      }
+    }
+    if (comments.length < perPage) break;
+    page++;
+  }
+  return null;
+}
+async function upsertPrComment(ctx, body, marker) {
+  const existingId = await findExistingComment(ctx, marker);
+  if (existingId) {
+    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${existingId}`;
+    const res = await fetch(url, {
+      method: "PATCH",
+      headers: {
+        Authorization: `Bearer ${ctx.token}`,
+        Accept: "application/vnd.github+json",
+        "Content-Type": "application/json",
+        "X-GitHub-Api-Version": "2022-11-28"
+      },
+      body: JSON.stringify({ body })
+    });
+    if (!res.ok) {
+      const text = await res.text();
+      console.warn(`Failed to update PR comment: ${res.status} ${text}`);
+    }
+  } else {
+    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;
+    const res = await fetch(url, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${ctx.token}`,
+        Accept: "application/vnd.github+json",
+        "Content-Type": "application/json",
+        "X-GitHub-Api-Version": "2022-11-28"
+      },
+      body: JSON.stringify({ body })
+    });
+    if (!res.ok) {
+      const text = await res.text();
+      console.warn(`Failed to create PR comment: ${res.status} ${text}`);
+    }
+  }
+}
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   anthropic,
   azureOpenai,
+  compareResults,
+  computeStats,
   consoleReporter,
   defineArena,
+  detectGitHubContext,
   gemini,
   jsonReporter,
+  loadBaseline,
+  markdownReporter,
   openai,
   openaiCompatible,
-  registerPricing
+  registerPricing,
+  saveBaseline,
+  upsertPrComment
 });
 //# sourceMappingURL=index.cjs.map