npm - agent-duelist - Versions diffs - 0.2.1 → 0.3.0 - Mend

agent-duelist 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/index.cjs CHANGED Viewed

@@ -40,7 +40,9 @@ __export(index_exports, {
   gemini: () => gemini,
   htmlReporter: () => htmlReporter,
   jsonReporter: () => jsonReporter,
+  listPacks: () => listPacks,
   loadBaseline: () => loadBaseline,
+  loadPack: () => loadPack,
   markdownReporter: () => markdownReporter,
   openai: () => openai,
   openaiCompatible: () => openaiCompatible,
@@ -1401,33 +1403,42 @@ var correctnessScorer = ({ task, result }) => {
   if (task.expected === void 0) {
     return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
   }
-  const match = deepEqual(task.expected, result.output);
+  const actual = normalizeOutput(task.expected, result.output);
+  const match = deepEqual(task.expected, actual);
   return {
     name: "correctness",
     value: match ? 1 : 0,
     details: { expected: task.expected, actual: result.output }
   };
 };
-function deepEqual(a, b) {
-  if (a === b) return true;
-  if (typeof a === "string" && typeof b === "string") {
-    return a.trim().toLowerCase() === b.trim().toLowerCase();
-  }
-  if (typeof a !== typeof b) return false;
-  if (a === null || b === null) return a === b;
-  if (Array.isArray(a) && Array.isArray(b)) {
-    if (a.length !== b.length) return false;
-    return a.every((val, i) => deepEqual(val, b[i]));
-  }
-  if (typeof a === "object" && typeof b === "object") {
-    const objA = a;
-    const objB = b;
-    const keysA = Object.keys(objA);
-    const keysB = Object.keys(objB);
-    if (keysA.length !== keysB.length) return false;
-    return keysA.every((key) => key in objB && deepEqual(objA[key], objB[key]));
-  }
-  return a === b;
+function normalizeOutput(expected, actual) {
+  if (Array.isArray(expected) && !Array.isArray(actual) && typeof actual === "object" && actual !== null) {
+    const entries = Object.entries(actual);
+    const arrayEntries = entries.filter(([, v]) => Array.isArray(v));
+    if (arrayEntries.length === 1) {
+      return arrayEntries[0][1];
+    }
+  }
+  return actual;
+}
+function deepEqual(expected, actual) {
+  if (expected === actual) return true;
+  if (typeof expected === "string" && typeof actual === "string") {
+    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
+  }
+  if (typeof expected !== typeof actual) return false;
+  if (expected === null || actual === null) return expected === actual;
+  if (Array.isArray(expected) && Array.isArray(actual)) {
+    if (expected.length !== actual.length) return false;
+    return expected.every((val, i) => deepEqual(val, actual[i]));
+  }
+  if (typeof expected === "object" && typeof actual === "object") {
+    const objExpected = expected;
+    const objActual = actual;
+    const keysExpected = Object.keys(objExpected);
+    return keysExpected.every((key) => key in objActual && deepEqual(objExpected[key], objActual[key]));
+  }
+  return expected === actual;
 }
 // src/scorers/schema-correctness.ts
@@ -1447,7 +1458,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
       };
     }
   }
-  const parsed = task.schema.safeParse(data);
+  let parsed = task.schema.safeParse(data);
+  if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
+    const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
+    if (arrayEntries.length === 1) {
+      const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
+      if (unwrapped.success) parsed = unwrapped;
+    }
+  }
   return {
     name: "schema-correctness",
     value: parsed.success ? 1 : 0,
@@ -1493,18 +1511,36 @@ var import_openai2 = __toESM(require("openai"), 1);
 // src/providers/openai.ts
 var import_openai = __toESM(require("openai"), 1);
-var import_zod_to_json_schema = require("zod-to-json-schema");
+var import_zod_to_json_schema2 = require("zod-to-json-schema");
 // src/providers/shared.ts
-var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
+var import_zod_to_json_schema = require("zod-to-json-schema");
+function buildSchemaSystemMessage(schema) {
+  if (!schema) return "Respond with valid JSON.";
+  const jsonSchema = (0, import_zod_to_json_schema.zodToJsonSchema)(schema, { target: "openAi" });
+  return [
+    "Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.",
+    "",
+    "Your output must conform to this JSON Schema:",
+    JSON.stringify(jsonSchema, null, 2),
+    "",
+    "IMPORTANT: Output the actual data values, NOT the schema definition itself.",
+    'Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.'
+  ].join("\n");
+}
 function parseSchemaOutput(rawContent, hasSchema) {
   if (!hasSchema) return rawContent;
+  const cleaned = stripCodeFences(rawContent);
   try {
-    return JSON.parse(rawContent);
+    return JSON.parse(cleaned);
   } catch {
     return rawContent;
   }
 }
+function stripCodeFences(content) {
+  const match = content.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/m);
+  return match ? match[1] : content;
+}
 // src/providers/openai.ts
 var REQUEST_TIMEOUT_MS = 6e4;
@@ -1553,7 +1589,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
       if (input.schema) {
         params.response_format = { type: "json_object" };
         params.messages = [
-          { role: "system", content: SCHEMA_SYSTEM_MESSAGE },
+          { role: "system", content: buildSchemaSystemMessage(input.schema) },
           ...params.messages
         ];
       }
@@ -1561,7 +1597,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
         params.tools = input.tools.map(toolDefToOpenAI);
         params.tool_choice = "auto";
       }
-      const response = await client.chat.completions.create(params, { signal: input.signal });
+      const reqOpts = { signal: input.signal };
+      if (input.timeout) reqOpts.timeout = input.timeout;
+      const response = await client.chat.completions.create(params, reqOpts);
       let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
       let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
       const choice = response.choices[0];
@@ -1595,7 +1633,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
         const followUp = await client.chat.completions.create({
           model: requestModel,
           messages: toolMessages
-        }, { signal: input.signal });
+        }, reqOpts);
         totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
         totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
         finalResponse = followUp;
@@ -1640,7 +1678,7 @@ function toolDefToOpenAI(tool) {
     function: {
       name: tool.name,
       description: tool.description,
-      parameters: (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" })
+      parameters: (0, import_zod_to_json_schema2.zodToJsonSchema)(tool.parameters, { target: "openAi" })
     }
   };
 }
@@ -1689,8 +1727,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
   if (!apiKey) return void 0;
   return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
 }
+function isTemperatureError(err) {
+  const msg = err instanceof Error ? err.message : String(err);
+  const lower = msg.toLowerCase();
+  return lower.includes("temperature") && (lower.includes("not supported") || lower.includes("is not allowed") || lower.includes("unsupported") || lower.includes("invalid"));
+}
 function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
   let cached = void 0;
+  let useTemperature = true;
   return async ({ task, result }) => {
     if (task.expected === void 0) {
       return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
@@ -1707,35 +1751,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
     }
     const { client, model } = cached;
     const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
+    const messages = [{ role: "user", content: prompt }];
     try {
-      const response = await client.chat.completions.create({
-        model,
-        messages: [{ role: "user", content: prompt }],
-        max_completion_tokens: 2048
-      });
-      const content = response.choices[0]?.message?.content?.trim() ?? "";
-      const parsed = {};
-      for (const line of content.split("\n")) {
-        const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
-        if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
-      }
-      const accuracy = parsed.accuracy;
-      const completeness = parsed.completeness;
-      const conciseness = parsed.conciseness;
-      if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
-        return {
-          name: "llm-judge-correctness",
-          value: -1,
-          details: { reason: `judge returned unparseable scores: "${content}"`, model }
-        };
-      }
-      const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
-      return {
-        name: "llm-judge-correctness",
-        value: composite,
-        details: { model, accuracy, completeness, conciseness }
-      };
+      const response = await callJudge(client, model, messages, useTemperature);
+      return parseJudgeResponse(response, model);
     } catch (err) {
+      if (useTemperature && isTemperatureError(err)) {
+        useTemperature = false;
+        try {
+          const response = await callJudge(client, model, messages, false);
+          return parseJudgeResponse(response, model);
+        } catch (retryErr) {
+          return {
+            name: "llm-judge-correctness",
+            value: -1,
+            details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
+          };
+        }
+      }
       return {
         name: "llm-judge-correctness",
         value: -1,
@@ -1744,6 +1777,38 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
     }
   };
 }
+async function callJudge(client, model, messages, withTemperature) {
+  return client.chat.completions.create({
+    model,
+    messages,
+    max_completion_tokens: 2048,
+    ...withTemperature ? { temperature: 0 } : {}
+  });
+}
+function parseJudgeResponse(response, model) {
+  const content = response.choices[0]?.message?.content?.trim() ?? "";
+  const parsed = {};
+  for (const line of content.split("\n")) {
+    const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
+    if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
+  }
+  const accuracy = parsed.accuracy;
+  const completeness = parsed.completeness;
+  const conciseness = parsed.conciseness;
+  if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
+    return {
+      name: "llm-judge-correctness",
+      value: -1,
+      details: { reason: `judge returned unparseable scores: "${content}"`, model }
+    };
+  }
+  const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
+  return {
+    name: "llm-judge-correctness",
+    value: composite,
+    details: { model, accuracy, completeness, conciseness }
+  };
+}
 // src/scorers/tool-usage.ts
 var toolUsageScorer = ({ task, result }) => {
@@ -1816,7 +1881,8 @@ async function runBenchmarks(options) {
               prompt: task.prompt,
               schema: task.schema,
               tools: task.tools,
-              signal
+              signal,
+              timeout
             }), timeout);
             const scores = await Promise.all(
               scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
@@ -2025,37 +2091,76 @@ function computeColumnStats(providerData, scorerNames) {
   }
   return stats;
 }
+var QUALITY_SCORERS = /* @__PURE__ */ new Set([
+  "correctness",
+  "schema-correctness",
+  "fuzzy-similarity",
+  "llm-judge-correctness",
+  "tool-usage"
+]);
+function passesQualityGate(providerId, columnStats) {
+  const qualityColumns = [...columnStats.keys()].filter((k) => QUALITY_SCORERS.has(k));
+  if (qualityColumns.length === 0) return true;
+  return qualityColumns.some((col) => {
+    const val = columnStats.get(col)?.values.get(providerId);
+    return val !== void 0 && val > 0;
+  });
+}
 function computeMedals(columnStats, providerIds) {
   const medals = /* @__PURE__ */ new Map();
   if (providerIds.length < 2) {
     for (const id of providerIds) medals.set(id, "none");
     return medals;
   }
-  const wins = /* @__PURE__ */ new Map();
-  for (const id of providerIds) wins.set(id, 0);
-  for (const [, colStats] of columnStats) {
+  const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
+  const qualityWins = /* @__PURE__ */ new Map();
+  const efficiencyWins = /* @__PURE__ */ new Map();
+  for (const id of providerIds) {
+    qualityWins.set(id, 0);
+    efficiencyWins.set(id, 0);
+  }
+  for (const [colName, colStats] of columnStats) {
     if (colStats.best === void 0) continue;
     const bestProviders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === colStats.best);
     if (bestProviders.length === 1) {
-      wins.set(bestProviders[0][0], (wins.get(bestProviders[0][0]) ?? 0) + 1);
+      const winnerId = bestProviders[0][0];
+      if (QUALITY_SCORERS.has(colName)) {
+        qualityWins.set(winnerId, (qualityWins.get(winnerId) ?? 0) + 1);
+      } else {
+        efficiencyWins.set(winnerId, (efficiencyWins.get(winnerId) ?? 0) + 1);
+      }
     }
   }
-  const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
+  const totalWins = [...qualityWins.values()].reduce((a, b) => a + b, 0) + [...efficiencyWins.values()].reduce((a, b) => a + b, 0);
   if (totalWins === 0) {
     for (const id of providerIds) medals.set(id, "none");
     return medals;
   }
-  const sorted = [...wins.entries()].sort(
-    (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
-  );
+  const eligibleSorted = providerIds.filter((id) => eligible.has(id)).sort((a, b) => {
+    const qDiff = (qualityWins.get(b) ?? 0) - (qualityWins.get(a) ?? 0);
+    if (qDiff !== 0) return qDiff;
+    const eDiff = (efficiencyWins.get(b) ?? 0) - (efficiencyWins.get(a) ?? 0);
+    if (eDiff !== 0) return eDiff;
+    return a.localeCompare(b);
+  });
   const medalList = ["gold", "silver", "bronze"];
   let rank = 0;
-  for (let i = 0; i < sorted.length; i++) {
-    if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
-      rank = i;
+  for (let i = 0; i < eligibleSorted.length; i++) {
+    if (i > 0) {
+      const prevQ = qualityWins.get(eligibleSorted[i - 1]) ?? 0;
+      const currQ = qualityWins.get(eligibleSorted[i]) ?? 0;
+      if (currQ < prevQ) {
+        rank = i;
+      } else if (currQ === prevQ) {
+        const prevE = efficiencyWins.get(eligibleSorted[i - 1]) ?? 0;
+        const currE = efficiencyWins.get(eligibleSorted[i]) ?? 0;
+        if (currE < prevE) rank = i;
+      }
     }
-    const hasWins = sorted[i][1] > 0;
-    medals.set(sorted[i][0], hasWins && rank < medalList.length ? medalList[rank] : "none");
+    medals.set(eligibleSorted[i], rank < medalList.length ? medalList[rank] : "none");
+  }
+  for (const id of providerIds) {
+    if (!eligible.has(id)) medals.set(id, "none");
   }
   return medals;
 }
@@ -2452,24 +2557,10 @@ function printSummary(results, providers, byProvider) {
       console.log(`  ${medal} Cheapest:      ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}  ${brightGreen}${boldCode}${costStr}${reset}`);
     }
   }
-  if (!single) {
-    const wins = /* @__PURE__ */ new Map();
-    for (const id of providers) wins.set(id, 0);
-    if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
-    if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
-    if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
-    const maxWins = Math.max(...wins.values());
-    if (maxWins > 0) {
-      const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
-      console.log("");
-      if (topProviders.length === 1) {
-        const [winnerId, winCount] = topProviders[0];
-        console.log(`  \u{1F3C6} Overall:      ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))}  ${dim(`(${winCount}/3 categories)`)}`);
-      } else {
-        const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
-        console.log(`  \u{1F3C6} Overall:      ${names}  ${dim(`(tied at ${maxWins}/3)`)}`);
-      }
-    }
+  if (!single && byCorrectness && byCorrectness.avg > 0) {
+    console.log("");
+    const pct = `${Math.round(byCorrectness.avg * 100)}%`;
+    console.log(`  \u{1F3C6} Overall:      ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))}  ${dim(`(${pct} avg correctness)`)}`);
   }
   console.log("");
 }
@@ -2503,15 +2594,15 @@ function defineArena(config) {
   if (config.providers.length === 0) {
     throw new Error("At least one provider is required");
   }
-  if (config.tasks.length === 0) {
-    throw new Error("At least one task is required");
-  }
   const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
   const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
   const runs = config.runs ?? 1;
   return {
     config,
     async run(options) {
+      if (config.tasks.length === 0) {
+        throw new Error("At least one task is required");
+      }
       return runBenchmarks({
         providers: config.providers,
         tasks: config.tasks,
@@ -2537,13 +2628,15 @@ function anthropic(model, options) {
     model,
     async run(input) {
       const start = Date.now();
-      const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
+      const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
+      const reqOpts = { signal: input.signal };
+      if (input.timeout) reqOpts.timeout = input.timeout;
       const response = await client.messages.create({
         model,
         max_tokens: maxTokens,
         system: systemMessage,
         messages: [{ role: "user", content: input.prompt }]
-      }, { signal: input.signal });
+      }, reqOpts);
       const latencyMs = Date.now() - start;
       const textBlock = response.content.find((b) => b.type === "text");
       const rawContent = textBlock?.type === "text" ? textBlock.text : "";
@@ -2688,17 +2781,8 @@ function htmlReporter(results) {
     return { id, avg };
   }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
   let overallWinner;
-  if (multi) {
-    const wins = /* @__PURE__ */ new Map();
-    for (const id of providers) wins.set(id, 0);
-    if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
-    if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
-    if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
-    const maxWins = Math.max(...wins.values());
-    if (maxWins > 0) {
-      const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
-      if (tops.length === 1) overallWinner = tops[0][0];
-    }
+  if (multi && byCorrectness && byCorrectness.avg > 0) {
+    overallWinner = byCorrectness.id;
   }
   const errorResults = results.filter((r) => r.error);
   const deduped = dedupeErrors(errorResults);
@@ -3219,7 +3303,7 @@ function renderErrors(errors) {
     </div>`;
   }).join("\n");
   return `<section class="errors-section">
-  <h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'block'">Errors</h2>
+  <h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
   <div class="errors-list">
     ${items}
   </div>
@@ -3283,6 +3367,149 @@ function renderScript(taskCount) {
 </script>`;
 }
+// src/packs/structured-output.ts
+var import_zod = require("zod");
+var structuredOutputPack = {
+  name: "structured-output",
+  label: "Structured Output",
+  description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
+  tasks: [
+    {
+      name: "so:flat-entity",
+      prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
+      expected: {
+        name: "Maria Garcia",
+        age: 34,
+        role: "software architect",
+        city: "Barcelona",
+        country: "Spain",
+        employeeId: "EMP-2847"
+      },
+      schema: import_zod.z.object({
+        name: import_zod.z.string(),
+        age: import_zod.z.number(),
+        role: import_zod.z.string(),
+        city: import_zod.z.string(),
+        country: import_zod.z.string(),
+        employeeId: import_zod.z.string()
+      })
+    },
+    {
+      name: "so:nested-object",
+      prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
+      expected: {
+        recipient: { company: "Acme Corp", contact: "John Lee", floor: "4th Floor" },
+        address: { street: "742 Evergreen Terrace", city: "Springfield", state: "IL", zip: "62704", country: "USA" },
+        order: { id: "ORD-9912", itemCount: 3, weightKg: 2.4, shippingMethod: "express" }
+      },
+      schema: import_zod.z.object({
+        recipient: import_zod.z.object({ company: import_zod.z.string(), contact: import_zod.z.string(), floor: import_zod.z.string() }),
+        address: import_zod.z.object({
+          street: import_zod.z.string(),
+          city: import_zod.z.string(),
+          state: import_zod.z.string(),
+          zip: import_zod.z.string(),
+          country: import_zod.z.string()
+        }),
+        order: import_zod.z.object({
+          id: import_zod.z.string(),
+          itemCount: import_zod.z.number(),
+          weightKg: import_zod.z.number(),
+          shippingMethod: import_zod.z.enum(["standard", "express", "overnight"])
+        })
+      })
+    },
+    {
+      name: "so:array-of-objects",
+      prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
+      expected: [
+        { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
+        { name: "ComfortMax Chair", price: 199, category: "Furniture" },
+        { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
+        { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
+      ],
+      schema: import_zod.z.array(import_zod.z.object({ name: import_zod.z.string(), price: import_zod.z.number(), category: import_zod.z.string() }))
+    },
+    {
+      name: "so:empty-arrays",
+      prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
+      expected: { errors: [], warnings: [], status: "healthy", uptimePercent: 99.97 },
+      schema: import_zod.z.object({
+        errors: import_zod.z.array(import_zod.z.object({ code: import_zod.z.string(), severity: import_zod.z.string() })),
+        warnings: import_zod.z.array(import_zod.z.string()),
+        status: import_zod.z.enum(["healthy", "degraded", "down"]),
+        uptimePercent: import_zod.z.number()
+      })
+    },
+    {
+      name: "so:enum-classification",
+      prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
+      expected: [
+        { id: "A", priority: "high", category: "billing" },
+        { id: "B", priority: "high", category: "technical" },
+        { id: "C", priority: "low", category: "account" },
+        { id: "D", priority: "critical", category: "technical" }
+      ],
+      schema: import_zod.z.array(
+        import_zod.z.object({
+          id: import_zod.z.string(),
+          priority: import_zod.z.enum(["low", "medium", "high", "critical"]),
+          category: import_zod.z.enum(["billing", "technical", "account", "general"])
+        })
+      )
+    },
+    {
+      name: "so:adversarial-input",
+      prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.
+User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
+Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
+      expected: {
+        product: "headphones",
+        price: 59.99,
+        rating: 5,
+        maxRating: 5,
+        features: ["noise-cancelling"],
+        recommended: true,
+        purchaseDate: "2026-01-15"
+      },
+      schema: import_zod.z.object({
+        product: import_zod.z.string(),
+        price: import_zod.z.number(),
+        rating: import_zod.z.number(),
+        maxRating: import_zod.z.number(),
+        features: import_zod.z.array(import_zod.z.string()),
+        recommended: import_zod.z.boolean(),
+        purchaseDate: import_zod.z.string()
+      })
+    }
+  ],
+  scorers: ["correctness", "schema-correctness", "latency", "cost"]
+};
+// src/packs/index.ts
+var registry = /* @__PURE__ */ new Map();
+function register(pack) {
+  registry.set(pack.name, pack);
+}
+register(structuredOutputPack);
+function loadPack(name) {
+  const pack = registry.get(name);
+  if (!pack) {
+    const available = [...registry.keys()].join(", ");
+    throw new Error(`Unknown pack "${name}". Available packs: ${available}`);
+  }
+  return pack;
+}
+function listPacks() {
+  return [...registry.values()].map((p) => ({
+    name: p.name,
+    label: p.label,
+    description: p.description,
+    taskCount: p.tasks.length
+  }));
+}
 // src/ci.ts
 var import_node_fs = require("fs");
 var import_node_path = require("path");
@@ -3574,7 +3801,9 @@ async function upsertPrComment(ctx, body, marker) {
   gemini,
   htmlReporter,
   jsonReporter,
+  listPacks,
   loadBaseline,
+  loadPack,
   markdownReporter,
   openai,
   openaiCompatible,