npm - @archal/cli - Versions diffs - 0.7.10 → 0.7.11 - Mend

@archal/cli 0.7.10 → 0.7.11

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.js +506 -124
package/harnesses/_lib/providers.mjs +29 -7
package/harnesses/hardened/agent.mjs +17 -4
package/harnesses/naive/agent.mjs +13 -1
package/harnesses/react/agent.mjs +34 -8
package/harnesses/zero-shot/agent.mjs +13 -1
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -1114,6 +1114,8 @@ var HTTP_RETRYABLE_STATUS_CODES = /* @__PURE__ */ new Set([408, 425, 429, 500, 5
 var HTTP_PUSH_TIMEOUT_MS = 2e4;
 var HTTP_PUSH_MAX_RETRIES = 6;
 var HTTP_PUSH_BACKOFF_MS = [1e3, 2e3, 3e3, 5e3, 5e3, 5e3];
+var HTTP_PUSH_WARMUP_RETRIES = 6;
+var HTTP_PUSH_WARMUP_BACKOFF_MS = [1500, 2500, 3500, 5e3, 6e3, 7e3];
 function resolveRetryDelay(backoffMs, attempt, fallbackMs) {
   const indexed = backoffMs[attempt];
   if (typeof indexed === "number" && Number.isFinite(indexed) && indexed >= 0) {
@@ -1164,6 +1166,10 @@ async function fetchWithRetry(url, options, retryOptions) {
 function twinBasePath(url) {
   return url.replace(/\/(mcp|api)\/?$/, "");
 }
+function isTwinWorkerWarmupResponse(status, body) {
+  if (status !== 503) return false;
+  return /twin worker endpoint not available|session is busy|retry shortly/i.test(body);
+}
 async function collectStateFromHttp(twinUrls, bearerToken, adminAuth) {
   const state = {};
   const failures = [];
@@ -1208,25 +1214,44 @@ async function pushStateToCloud(twinUrls, seedSelections, bearerToken, adminAuth
     }
     const url = `${twinBasePath(baseUrl)}/state`;
     debug(`Pushing dynamic seed to ${sel.twinName}`, { url });
-    const response = await fetchWithRetry(
-      url,
-      {
-        method: "PUT",
-        headers,
-        body: JSON.stringify(sel.seedData)
-      },
-      {
-        retries: HTTP_PUSH_MAX_RETRIES,
-        timeoutMs: HTTP_PUSH_TIMEOUT_MS,
-        backoffMs: HTTP_PUSH_BACKOFF_MS
+    const payload = JSON.stringify(sel.seedData);
+    let pushed = false;
+    for (let warmupAttempt = 0; warmupAttempt <= HTTP_PUSH_WARMUP_RETRIES; warmupAttempt++) {
+      const response = await fetchWithRetry(
+        url,
+        {
+          method: "PUT",
+          headers,
+          body: payload
+        },
+        {
+          retries: HTTP_PUSH_MAX_RETRIES,
+          timeoutMs: HTTP_PUSH_TIMEOUT_MS,
+          backoffMs: HTTP_PUSH_BACKOFF_MS
+        }
+      );
+      if (response.ok) {
+        pushed = true;
+        break;
       }
-    );
-    if (!response.ok) {
       const text = await response.text().catch(() => "");
+      const isWarmup = isTwinWorkerWarmupResponse(response.status, text);
+      if (isWarmup && warmupAttempt < HTTP_PUSH_WARMUP_RETRIES) {
+        const delay = resolveRetryDelay(HTTP_PUSH_WARMUP_BACKOFF_MS, warmupAttempt, 5e3);
+        warn(
+          `Twin "${sel.twinName}" not ready for state push (HTTP 503), retrying in ${delay}ms`,
+          { attempt: `${warmupAttempt + 1}/${HTTP_PUSH_WARMUP_RETRIES + 1}` }
+        );
+        await new Promise((resolve12) => setTimeout(resolve12, delay));
+        continue;
+      }
       throw new Error(
         `Failed to push dynamic seed to twin "${sel.twinName}": HTTP ${response.status}${text ? ` (${text})` : ""}`
       );
     }
+    if (!pushed) {
+      throw new Error(`Failed to push dynamic seed to twin "${sel.twinName}": worker warmup did not complete in time`);
+    }
     debug(`Pushed dynamic seed to ${sel.twinName} successfully`);
   }
 }
@@ -3202,6 +3227,47 @@ async function callAnthropic(options) {
   if (!textBlock?.text) throw new Error("Anthropic returned no text content");
   return textBlock.text;
 }
+function extractOpenAiTextContent(data) {
+  const message = data.choices?.[0]?.message;
+  if (!message) return null;
+  if (typeof message.content === "string") {
+    const trimmed = message.content.trim();
+    return trimmed.length > 0 ? trimmed : null;
+  }
+  if (Array.isArray(message.content)) {
+    const textSegments = [];
+    for (const part of message.content) {
+      if (typeof part === "string") {
+        const trimmed = part.trim();
+        if (trimmed.length > 0) textSegments.push(trimmed);
+        continue;
+      }
+      if (!part || typeof part !== "object") continue;
+      const partText = part.text;
+      if (typeof partText === "string") {
+        const trimmed = partText.trim();
+        if (trimmed.length > 0) textSegments.push(trimmed);
+        continue;
+      }
+      if (partText && typeof partText === "object" && typeof partText.value === "string") {
+        const trimmed = partText.value.trim();
+        if (trimmed.length > 0) textSegments.push(trimmed);
+        continue;
+      }
+      if (typeof part.value === "string") {
+        const trimmed = part.value.trim();
+        if (trimmed.length > 0) textSegments.push(trimmed);
+      }
+    }
+    if (textSegments.length > 0) {
+      return textSegments.join("\n");
+    }
+  }
+  if (typeof message.refusal === "string" && message.refusal.trim().length > 0) {
+    return message.refusal.trim();
+  }
+  return null;
+}
 function usesMaxCompletionTokens(model) {
   return model.startsWith("gpt-5") || model.startsWith("o1-") || model.startsWith("o2-") || model.startsWith("o3-") || model.startsWith("o4-");
 }
@@ -3229,7 +3295,7 @@ async function callOpenAi(options) {
     throw new LlmApiError("OpenAI", response.status, errorText.slice(0, 200));
   }
   const data = await response.json();
-  const content = data.choices?.[0]?.message?.content;
+  const content = extractOpenAiTextContent(data);
   if (!content) throw new Error("OpenAI returned no content");
   return content;
 }
@@ -3263,7 +3329,7 @@ async function callOpenAiCompatible(options) {
     throw new LlmApiError(`OpenAI-compatible (${options.baseUrl})`, response.status, errorText.slice(0, 200));
   }
   const data = await response.json();
-  const content = data.choices?.[0]?.message?.content;
+  const content = extractOpenAiTextContent(data);
   if (!content) throw new Error("OpenAI-compatible API returned no content");
   return content;
 }
@@ -3288,13 +3354,15 @@ ${CYAN}${BOLD}archal${RESET} ${DIM}|${RESET} ${scenarioTitle}
 `);
   }
 }
-function printRunProgress(runIndex, totalRuns, score, error2) {
+function printRunProgress(runIndex, totalRuns, score, error2, outcome) {
   const { quiet } = getLoggerOptions();
   if (quiet || activeOutputFormat !== "terminal") return;
   const dots = ".".repeat(Math.max(1, 20 - String(runIndex + 1).length - String(totalRuns).length));
   if (error2) {
     const shortError = error2.length > MAX_ERROR_PREVIEW_CHARS ? error2.slice(0, MAX_ERROR_PREVIEW_CHARS - 1) + "\u2026" : error2;
-    process.stderr.write(`  run ${runIndex + 1}/${totalRuns} ${DIM}${dots}${RESET} ${RED}ERROR${RESET} ${DIM}(${shortError})${RESET}
+    const inconclusive = outcome === "inconclusive_infrastructure" || outcome === "inconclusive_seed";
+    const label = inconclusive ? `${YELLOW}INCONCLUSIVE${RESET}` : `${RED}ERROR${RESET}`;
+    process.stderr.write(`  run ${runIndex + 1}/${totalRuns} ${DIM}${dots}${RESET} ${label} ${DIM}(${shortError})${RESET}
 `);
     return;
   }
@@ -5874,6 +5942,17 @@ function buildFailureAnalysisPrompt(input) {
   );
   sections.push(`## Passed Criteria (${input.passedCriteria.length})`);
   sections.push(input.passedCriteria.map((c) => `- ${sanitizeForPrompt(c.description, 300)}`).join("\n"));
+  if (input.agentError || input.agentLog) {
+    sections.push(`## Agent Execution Context`);
+    if (input.agentError) {
+      sections.push(`Error: ${sanitizeForPrompt(input.agentError, 300)}`);
+    }
+    if (input.agentLog) {
+      const logTail = input.agentLog.length > 800 ? input.agentLog.slice(-800) : input.agentLog;
+      sections.push(`Agent log (tail):
+${sanitizeForPrompt(logTail, 800)}`);
+    }
+  }
   sections.push(`## Agent Trace (${input.trace.length} tool calls)`);
   sections.push(
     input.trace.length === 0 ? "(Agent made no tool calls - likely crashed or timed out)" : JSON.stringify(traceFormatted, null, 2)
@@ -6617,7 +6696,7 @@ function resolveTelemetryEndpointFromEnv() {
   if (!fallbackBaseUrl) {
     return null;
   }
-  return `${fallbackBaseUrl}/api/traces`;
+  return `${fallbackBaseUrl}/v1/traces`;
 }
 function resolveIngestToken() {
   return process.env["ARCHAL_TELEMETRY_TOKEN"]?.trim();
@@ -6766,8 +6845,26 @@ function isTelemetryEnabled() {
   if (consent !== "pending") return consent === "granted";
   return loadConfig().telemetry;
 }
-function buildStructuredRunError(runIndex, error2) {
+function buildStructuredRunError(runIndex, error2, outcome) {
   const message = error2.trim();
+  if (outcome === "inconclusive_seed") {
+    return {
+      runIndex,
+      message,
+      category: "seed_setup",
+      code: "SEED_SETUP_ERROR",
+      retryable: true
+    };
+  }
+  if (outcome === "inconclusive_infrastructure") {
+    return {
+      runIndex,
+      message,
+      category: "infrastructure",
+      code: "INFRASTRUCTURE_ERROR",
+      retryable: true
+    };
+  }
   if (message.startsWith("Agent not found:")) {
     return {
       runIndex,
@@ -7009,7 +7106,7 @@ function buildMetadata(report, totalEntries) {
     },
     agentInternals: {
       runDurationsMs: report.runs.map((run) => run.durationMs),
-      runErrors: report.runs.filter((run) => typeof run.error === "string" && run.error.length > 0).map((run) => buildStructuredRunError(run.runIndex, run.error)),
+      runErrors: report.runs.filter((run) => typeof run.error === "string" && run.error.length > 0).map((run) => buildStructuredRunError(run.runIndex, run.error, run.outcome)),
       evaluationCounts: { pass: passCount, partial: partialCount, fail: failCount },
       runSummaries: report.runs.map((run) => ({
         runIndex: run.runIndex,
@@ -7184,6 +7281,7 @@ async function uploadIfEnabled(traceId, report) {
 }
 // src/runner/dynamic-seed-generator.ts
+import { createHash as createHash4 } from "crypto";
 import { z as z4 } from "zod";
 // src/runner/seed-schemas/seed-schema-inference.ts
@@ -8551,14 +8649,17 @@ function autoFillMissingFKs(seed, twinName) {
     if (!sourceEntities || !targetEntities || targetEntities.length === 0) continue;
     const targetValues = targetEntities.map((e) => e[rule.targetField]).filter((v) => v !== void 0 && v !== null);
     if (targetValues.length === 0) continue;
+    const validTargetSet = new Set(targetValues.map(String));
     let fillIndex = 0;
     for (const entity of sourceEntities) {
       const e = entity;
-      if (e[rule.sourceField] === void 0 || e[rule.sourceField] === null) {
+      const currentValue = e[rule.sourceField];
+      const needsFill = currentValue === void 0 || currentValue === null || !validTargetSet.has(String(currentValue));
+      if (needsFill) {
         const fillValue = targetValues[fillIndex % targetValues.length];
         fillIndex++;
         debug(
-          `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})`
+          `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})` + (currentValue != null ? ` (was ${String(currentValue)} \u2014 not in targets)` : "")
         );
         e[rule.sourceField] = fillValue;
       }
@@ -8652,6 +8753,7 @@ var KIND_COLLECTION_HINTS = {
   channel: ["channels"],
   user: ["users"],
   ticket: ["issues"],
+  project: ["projects"],
   table: ["tables"],
   site: ["sites", "domains"],
   file: ["files"],
@@ -8661,6 +8763,9 @@ var KIND_COLLECTION_HINTS = {
 var ENTITY_KEY_ALIASES = {
   "repo.owner": ["ownerLogin", "owner_login", "login", "owner.login", "owner.name"],
   "issue.key": ["identifier"],
+  "project.key": ["key", "projectKey"],
+  "ticket.key": ["identifier", "key"],
+  "stripe_entity.id": ["id", "charge", "chargeId", "paymentIntentId", "invoiceId", "customerId", "disputeId"],
   "email.address": ["email", "from", "to", "cc", "bcc"],
   "file.name": ["title", "fileName", "filename", "subject", "summary"]
 };
@@ -8816,10 +8921,28 @@ function validateSeedCoverage(intent, mergedSeed) {
   const entityIssues = [];
   const quoteErrors = [];
   const quoteWarnings = [];
-  const CORE_ENTITY_KEYS = /* @__PURE__ */ new Set(["owner", "name", "fullName", "channel_name", "key", "identifier", "number"]);
+  const CORE_ENTITY_KEYS = /* @__PURE__ */ new Set(["owner", "name", "fullName", "channel_name", "key", "identifier", "number", "id"]);
+  const CONTRACT_REQUIRED_KINDS = /* @__PURE__ */ new Set([
+    "repo",
+    "pullRequest",
+    "issue",
+    "channel",
+    "user",
+    "ticket",
+    "project",
+    "table"
+  ]);
   const entityWarnings = [];
   for (const entity of intent.entities) {
     if (typeof entity.value === "boolean") continue;
+    const candidateCollections = toCollectionCandidates(mergedSeed, entity.kind, entity.value);
+    if (CONTRACT_REQUIRED_KINDS.has(entity.kind) && candidateCollections.length === 0) {
+      entityIssues.push({
+        type: "missing_entity",
+        message: `Scenario entity contract mismatch: no collections match ${entity.kind}.${entity.key}=${String(entity.value)}`
+      });
+      continue;
+    }
     if (!valueExistsInCollections(mergedSeed, entity.kind, entity.key, entity.value)) {
       const issue = {
         type: "missing_entity",
@@ -8934,7 +9057,8 @@ var NON_SUBJECT_STARTS = /* @__PURE__ */ new Set([
   "under",
   "after",
   "before",
-  "during"
+  "during",
+  "as"
 ]);
 function isReasonableCountSubject(subject, expected) {
   if (expected > MAX_REASONABLE_COUNT) return false;
@@ -8949,40 +9073,92 @@ function appearsToBeClockSuffix(text, numberStart) {
   const prefix = text.slice(Math.max(0, numberStart - 3), numberStart);
   return /^\d{1,2}:$/.test(prefix);
 }
+function isDecimalFragment(text, matchIndex) {
+  if (matchIndex <= 0) return false;
+  const charBefore = text[matchIndex - 1];
+  if (charBefore === ".") {
+    return matchIndex >= 2 && /\d/.test(text[matchIndex - 2]);
+  }
+  return false;
+}
+function resolveSubjectWithKey(subject, flat) {
+  const candidates = buildSubjectCandidates2(subject);
+  for (const candidate of candidates) {
+    const normalized = candidate.replace(/\s+/g, "").toLowerCase();
+    for (const [key, value] of Object.entries(flat)) {
+      const normalizedKey = key.replace(/\s+/g, "").toLowerCase();
+      if ((normalizedKey === normalized || normalizedKey === normalized + "s") && Array.isArray(value)) {
+        return { items: value, key };
+      }
+    }
+  }
+  const items = resolveSubjectInState(subject, flat);
+  return items ? { items, key: "" } : null;
+}
+function buildSubjectCandidates2(subject) {
+  const candidates = [subject];
+  if (subject.endsWith("s") && subject.length > 3) {
+    candidates.push(subject.slice(0, -1));
+  } else {
+    candidates.push(subject + "s");
+  }
+  const words = subject.split(/\s+/);
+  if (words.length > 1) {
+    candidates.push(words[0]);
+    candidates.push(words[words.length - 1]);
+  }
+  return candidates;
+}
 function verifySeedCounts(setupText, seedState) {
   const mismatches = [];
   const flat = flattenTwinState(seedState);
   const countPattern = /\b(\d+)\s+([\w\s]+?)(?:\s+(?:that|which|are|with|in|labeled|assigned)\b)/gi;
   for (const match of setupText.matchAll(countPattern)) {
+    if (isDecimalFragment(setupText, match.index)) continue;
     const expected = parseInt(match[1], 10);
     const subject = match[2].trim();
     if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
     if (!subject || expected <= 0) continue;
     if (!isReasonableCountSubject(subject, expected)) continue;
-    const resolved = resolveSubjectInState(subject, flat);
-    if (resolved && resolved.length !== expected) {
-      mismatches.push({ subject, expected, actual: resolved.length });
+    const resolved = resolveSubjectWithKey(subject, flat);
+    if (resolved && resolved.items.length !== expected) {
+      mismatches.push({ subject, expected, actual: resolved.items.length, collectionKey: resolved.key || void 0 });
     }
   }
   const simplePattern = /\b(\d+)\s+([\w\s]+?)(?:[.,;:)]|$)/gm;
   const seenSubjects = new Set(mismatches.map((m) => m.subject.toLowerCase()));
   for (const match of setupText.matchAll(simplePattern)) {
+    if (isDecimalFragment(setupText, match.index)) continue;
     const expected = parseInt(match[1], 10);
     const subject = match[2].trim();
     if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
     if (!subject || expected <= 0 || seenSubjects.has(subject.toLowerCase())) continue;
     if (!isReasonableCountSubject(subject, expected)) continue;
-    const resolved = resolveSubjectInState(subject, flat);
-    if (resolved && resolved.length !== expected) {
-      mismatches.push({ subject, expected, actual: resolved.length });
+    const resolved = resolveSubjectWithKey(subject, flat);
+    if (resolved && resolved.items.length !== expected) {
+      mismatches.push({ subject, expected, actual: resolved.items.length, collectionKey: resolved.key || void 0 });
       seenSubjects.add(subject.toLowerCase());
     }
   }
   return mismatches;
 }
+function trimSeedToExpectedCounts(seed, mismatches) {
+  let totalTrimmed = 0;
+  for (const m of mismatches) {
+    if (m.actual <= m.expected) continue;
+    if (!m.collectionKey || !seed[m.collectionKey]) continue;
+    const collection = seed[m.collectionKey];
+    if (collection.length > m.expected) {
+      const trimmed = collection.length - m.expected;
+      seed[m.collectionKey] = collection.slice(0, m.expected);
+      totalTrimmed += trimmed;
+    }
+  }
+  return totalTrimmed;
+}
 // src/runner/seed-cache.ts
-var CACHE_VERSION = 3;
+var CACHE_VERSION = 4;
 var NEGATIVE_CACHE_VERSION = 2;
 var NEGATIVE_PREFIX = "neg-";
 var CACHE_DIR = join7(homedir2(), ".archal", "seed-cache");
@@ -9234,7 +9410,7 @@ ${setupText}
 Extract the seed blueprint as JSON.`;
   try {
     const provider = detectProvider(config.model);
-    const apiKey = resolveProviderApiKey(config.apiKey, provider);
+    const apiKey = config.providerMode === "archal" ? "" : resolveProviderApiKey(config.apiKey ?? "", provider);
     const responseText = await callLlm({
       provider,
       model: config.model,
@@ -10454,9 +10630,19 @@ function extractHybridPatch(obj) {
   }
   return null;
 }
-function buildSeedCacheContext(twinName, intent, context) {
+function hashText(text) {
+  return createHash4("sha256").update(text).digest("hex").slice(0, 16);
+}
+function buildSeedCacheContext(twinName, config, intent, context) {
   return {
     twinName,
+    generator: {
+      model: config.model,
+      providerMode: config.providerMode ?? "direct",
+      baseUrl: config.baseUrl ?? null,
+      systemPromptHash: hashText(SYSTEM_PROMPT2),
+      promptTemplateVersion: 2
+    },
     intent: intent ?? null,
     scenario: context ?? null
   };
@@ -10811,10 +10997,13 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
     finalSeed = autoFillMissingFKs(finalSeed, twinName);
     const relValidation = validateSeedRelationships(finalSeed, twinName);
     if (!relValidation.valid) {
-      warn("Blueprint seed failed relationship validation", {
-        errors: relValidation.errors.slice(0, 5).join("; ")
-      });
-      return null;
+      finalSeed = autoFillMissingFKs(finalSeed, twinName);
+      const secondValidation = validateSeedRelationships(finalSeed, twinName);
+      if (!secondValidation.valid) {
+        warn("Blueprint seed has unresolved FK references (continuing anyway)", {
+          errors: secondValidation.errors.slice(0, 5).join("; ")
+        });
+      }
     }
     if (intent) {
       const coverage = validateSeedCoverage(intent, finalSeed);
@@ -10829,9 +11018,16 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
     flatForVerify[twinName] = finalSeed;
     const countMismatches = verifySeedCounts(setupDescription, flatForVerify);
     if (countMismatches.length > 0) {
-      debug("Blueprint seed has count mismatches (acceptable)", {
-        mismatches: countMismatches.map((m) => `${m.subject}: ${m.expected} vs ${m.actual}`).join("; ")
-      });
+      const trimmed = trimSeedToExpectedCounts(finalSeed, countMismatches);
+      if (trimmed > 0) {
+        debug(`Blueprint seed: trimmed ${trimmed} excess entities to match setup counts`);
+      }
+      const remaining = countMismatches.filter((m) => m.actual > m.expected && !m.collectionKey);
+      if (remaining.length > 0) {
+        debug("Blueprint seed has unresolvable count mismatches", {
+          mismatches: remaining.map((m) => `${m.subject}: ${m.expected} vs ${m.actual}`).join("; ")
+        });
+      }
     }
     const syntheticPatch = {
       add: {}
@@ -10861,7 +11057,7 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
 async function generateDynamicSeed(twinName, baseSeedName, baseSeedData, setupDescription, config, intent, context) {
   const cacheScope = {
     baseSeedData,
-    cacheContext: buildSeedCacheContext(twinName, intent, context)
+    cacheContext: buildSeedCacheContext(twinName, config, intent, context)
   };
   if (!config.noCache) {
     const cached = getCachedSeed(twinName, baseSeedName, setupDescription, cacheScope);
@@ -10892,7 +11088,7 @@ async function generateDynamicSeed(twinName, baseSeedName, baseSeedData, setupDe
   if (blueprintResult) {
     info("Dynamic seed generated via blueprint", { twin: twinName });
     if (!config.noCache) {
-      const cacheContext = buildSeedCacheContext(twinName, intent, context);
+      const cacheContext = buildSeedCacheContext(twinName, config, intent, context);
       cacheSeed(twinName, baseSeedName, setupDescription, blueprintResult.seed, blueprintResult.patch, {
         baseSeedData,
         cacheContext
@@ -11023,14 +11219,19 @@ Fix these issues:
       const relationshipValidation = validateSeedRelationships(mergedSeed, twinName);
       if (!relationshipValidation.valid) {
         const topErrors = relationshipValidation.errors.slice(0, 10);
-        warn(`Dynamic seed relationship validation failed (attempt ${attempt + 1})`, {
+        if (validationAttempts < MAX_ATTEMPTS - 1) {
+          warn(`Dynamic seed relationship validation failed (attempt ${attempt + 1})`, {
+            errors: topErrors.join("; ")
+          });
+          lastErrors = topErrors;
+          patch = null;
+          mergedSeed = null;
+          validationAttempts++;
+          continue;
+        }
+        warn(`Dynamic seed has unresolved FK references (accepting on final attempt)`, {
           errors: topErrors.join("; ")
         });
-        lastErrors = topErrors;
-        patch = null;
-        mergedSeed = null;
-        validationAttempts++;
-        continue;
       }
       if (intent) {
         debug("Seed intent coverage summary", {
@@ -11089,6 +11290,15 @@ Fix these issues:
   }
   mergedSeed = autoFillMissingFKs(mergedSeed, twinName);
   mergedSeed = ensureSlackScenarioChannelAccess(mergedSeed, intent);
+  if (setupDescription) {
+    const flatForTrim = {};
+    flatForTrim[twinName] = mergedSeed;
+    const finalMismatches = verifySeedCounts(setupDescription, flatForTrim);
+    const trimmed = trimSeedToExpectedCounts(mergedSeed, finalMismatches);
+    if (trimmed > 0) {
+      debug(`Trimmed ${trimmed} excess seed entities to match setup counts`);
+    }
+  }
   if (!config.noCache) {
     cacheSeed(twinName, baseSeedName, setupDescription, mergedSeed, patch, cacheScope);
   }
@@ -11236,10 +11446,23 @@ function githubIntent(setup) {
     entities.push({ kind: "repo", key: "fullName", value: fullName });
   }
   if (!primaryRepoSet) {
-    const orgMatch = setup.match(/\bgithub\s+(?:organization|org)\s+"([a-z][a-z0-9._-]*)"/i);
+    const orgMatch = setup.match(
+      /\b(?:github\s+)?(?:organization|org)\s+(?:named\s+)?["']?([a-z][a-z0-9._-]*)["']?/i
+    );
     if (orgMatch?.[1]) {
-      extractedSlots["repo.owner"] = orgMatch[1];
-      entities.push({ kind: "repo", key: "owner", value: orgMatch[1] });
+      extractedSlots["repo.owner"] = orgMatch[1].toLowerCase();
+      entities.push({ kind: "repo", key: "owner", value: orgMatch[1].toLowerCase() });
+      const repoName = setup.match(/\b(?:repository|repo)\s+(?:named\s+)?["']?([a-z][a-z0-9._-]{1,99})["']?/i)?.[1];
+      if (repoName) {
+        const normalizedName = repoName.toLowerCase();
+        extractedSlots["repo.name"] = normalizedName;
+        entities.push({ kind: "repo", key: "name", value: normalizedName });
+        entities.push({
+          kind: "repo",
+          key: "fullName",
+          value: `${String(extractedSlots["repo.owner"])}/${normalizedName}`
+        });
+      }
     } else {
       missingSlots.push({
         slot: "repo.owner/repo.name",
@@ -11430,6 +11653,18 @@ function stripeIntent(setup) {
       });
     }
   }
+  const idRegex = /\b((?:acct|cus|prod|price|pi|ch|re|in|sub|dp|pm|payout|tr|tok|evt)_[a-zA-Z0-9]+)\b/g;
+  const seenIds = /* @__PURE__ */ new Set();
+  let idMatch;
+  while ((idMatch = idRegex.exec(setup)) !== null) {
+    const id = idMatch[1];
+    if (seenIds.has(id)) continue;
+    seenIds.add(id);
+    entities.push({ kind: "stripe_entity", key: "id", value: id });
+    if (!extractedSlots["stripe.primary_id"]) {
+      extractedSlots["stripe.primary_id"] = id;
+    }
+  }
   if (missingSlots.length > 0) {
     return { intent: null, missingSlots };
   }
@@ -11523,6 +11758,30 @@ function jiraIntent(setup) {
     }
     entities.push({ kind: "ticket", key: "key", value: key });
   }
+  const seenProjects = /* @__PURE__ */ new Set();
+  const addProject = (projectKey) => {
+    const normalized = projectKey.toUpperCase();
+    if (!/^[A-Z][A-Z0-9]{1,9}$/.test(normalized)) return;
+    if (seenProjects.has(normalized)) return;
+    seenProjects.add(normalized);
+    entities.push({ kind: "project", key: "key", value: normalized });
+    if (!extractedSlots["project.key"]) {
+      extractedSlots["project.key"] = normalized;
+    }
+  };
+  for (const key of seenKeys) {
+    addProject(key.split("-", 1)[0] ?? "");
+  }
+  const projectRegexes = [
+    /\b(?:jira\s+)?project\s+(?:key\s*)?[:=]?\s*["']?([A-Z][A-Z0-9]{1,9})["']?/gi,
+    /\bproject\s+["'][^"'\n]+["']\s*\(\s*([A-Z][A-Z0-9]{1,9})\s*\)/gi
+  ];
+  for (const regex of projectRegexes) {
+    let projectMatch;
+    while ((projectMatch = regex.exec(setup)) !== null) {
+      addProject(projectMatch[1] ?? "");
+    }
+  }
   return {
     intent: {
       twinName: "jira",
@@ -11537,6 +11796,7 @@ function jiraIntent(setup) {
 }
 function supabaseIntent(setup) {
   const extractedSlots = {};
+  const entities = [];
   const missingSlots = [];
   const requiredSlots = ["database.target"];
   const seenTables = /* @__PURE__ */ new Set();
@@ -11569,6 +11829,9 @@ function supabaseIntent(setup) {
   const hasEnvVarTokens = /\b[A-Z][A-Z0-9_]{2,}\b/.test(setup);
   if (seenTables.size > 0 || mentionsProject || mentionsLogsOrService || mentionsEnvVars && hasEnvVarTokens) {
     extractedSlots["database.target"] = true;
+    for (const table2 of seenTables) {
+      entities.push({ kind: "table", key: "name", value: table2 });
+    }
   } else {
     missingSlots.push({
       slot: "database.target",
@@ -11585,10 +11848,7 @@ function supabaseIntent(setup) {
       setupSummary: setupSummary(setup),
       requiredSlots,
       extractedSlots,
-      // Supabase table names in setup can describe conceptual data sources
-      // that are not materialized in the base SQL schema. Keep intent broad
-      // to avoid false-hard failures in seed generation.
-      entities: [],
+      entities,
       quotedStrings: []
     },
     missingSlots: []
@@ -12112,12 +12372,24 @@ function loadBaseSeedFromDisk(twinName, seedName) {
 }
 function categorizeRunError(message) {
   if (/Failed to spawn|ENOENT/.test(message)) {
-    return `Agent not found: ${message}. Check that your agent command is installed and in PATH.`;
+    return {
+      message: `Agent not found: ${message}. Check that your agent command is installed and in PATH.`,
+      outcome: "failed_agent"
+    };
+  }
+  if (/Dynamic seed generation failed|Missing dynamic seed state|seed generation|seed setup/i.test(message)) {
+    return {
+      message: `Seed generation error: ${message}`,
+      outcome: "inconclusive_seed"
+    };
   }
   if (/HTTP [45]\d\d|ECONNREFUSED|ENOTFOUND|ETIMEDOUT|ECONNRESET|cloud session|fetch failed|AbortError|TimeoutError|operation was aborted|timed?\s*out/i.test(message)) {
-    return `Infrastructure error: ${message}. Check your network or try again.`;
+    return {
+      message: `Infrastructure error: ${message}. Check your network or try again.`,
+      outcome: "inconclusive_infrastructure"
+    };
   }
-  return message;
+  return { message, outcome: "failed_agent" };
 }
 async function executeSingleRun(runIndex, scenario, agentConfig, seedSelections, evaluatorConfig, timeoutSeconds, apiEngine, localEngine, remoteTwinUrlOverrides, apiRouting, cloudTwinUrls, hostedSessionId, apiBearerToken, adminAuth) {
   const startTime = Date.now();
@@ -12255,7 +12527,8 @@ ${baseTaskMessage}` : baseTaskMessage;
         stateDiff: diff,
         agentLog: agentResult.stderr || void 0,
         agentTrace: agentResult.agentTrace,
-        tokenUsage
+        tokenUsage,
+        outcome: "failed_agent"
       };
     }
     if (agentResult.exitCode !== 0 && agentResult.exitCode !== null) {
@@ -12294,7 +12567,8 @@ ${baseTaskMessage}` : baseTaskMessage;
         stateDiff: diff,
         agentLog: agentResult.stderr || void 0,
         agentTrace: agentResult.agentTrace,
-        tokenUsage
+        tokenUsage,
+        outcome: "failed_agent"
       };
     }
     if (trace.length === 0) {
@@ -12326,12 +12600,13 @@ ${baseTaskMessage}` : baseTaskMessage;
       stateDiff: diff,
       agentLog: agentResult.stderr || void 0,
       agentTrace: agentResult.agentTrace,
-      tokenUsage
+      tokenUsage,
+      outcome: "completed"
     };
   } catch (err) {
     const message = err instanceof Error ? err.message : String(err);
     const categorized = categorizeRunError(message);
-    error(`Run ${runIndex + 1} failed: ${categorized}`);
+    error(`Run ${runIndex + 1} failed: ${categorized.message}`);
     const durationMs = Date.now() - startTime;
     return {
       runIndex,
@@ -12339,12 +12614,13 @@ ${baseTaskMessage}` : baseTaskMessage;
         criterionId: c.id,
         status: "fail",
         confidence: 1,
-        explanation: `Run failed: ${categorized}`
+        explanation: `Run failed: ${categorized.message}`
       })),
       overallScore: 0,
       trace: [],
       durationMs,
-      error: categorized,
+      error: categorized.message,
+      outcome: categorized.outcome,
       stateBefore: beforeState,
       stateAfter: beforeState,
       stateDiff: { added: {}, modified: {}, removed: {} }
@@ -12421,9 +12697,20 @@ function preflightCheck(scenario, apiKey, model, baseUrl, evaluatorProvider, see
     }
   }
   if (seedModel) {
+    const mode = seedProviderMode ?? "auto";
+    const provider = detectProvider(seedModel);
+    const resolvedKey = resolveProviderApiKey(apiKey, provider);
     const creds = getCredentials();
     const hasArchalAuth = Boolean(creds?.token);
-    if (!hasArchalAuth) {
+    if (provider === "openai-compatible" && !baseUrl && mode === "direct") {
+      errors.push({
+        check: "seed.baseUrl",
+        message: `Seed model "${seedModel}" requires a base URL for the OpenAI-compatible endpoint`,
+        detail: "Set via: export ARCHAL_EVALUATOR_BASE_URL=<url> or archal config set evaluator.baseUrl <url>",
+        warning: true
+      });
+    }
+    if (mode === "archal" && !hasArchalAuth) {
       errors.push({
         check: "archal-auth-seed",
         message: "Dynamic seed generation requires Archal authentication",
@@ -12431,6 +12718,32 @@ function preflightCheck(scenario, apiKey, model, baseUrl, evaluatorProvider, see
         warning: true
       });
     }
+    if (mode === "direct" && !resolvedKey) {
+      errors.push({
+        check: getProviderEnvVar(provider),
+        message: `Seed provider is "direct" but no API key is configured for ${provider}`,
+        detail: `Set via: export ${getProviderEnvVar(provider)}=<your-key> or archal config set evaluator.apiKey <key>`,
+        warning: true
+      });
+    }
+    if (mode === "auto" && !resolvedKey && !hasArchalAuth) {
+      errors.push({
+        check: getProviderEnvVar(provider),
+        message: 'Dynamic seed generation has no available provider in "auto" mode',
+        detail: `Set ${getProviderEnvVar(provider)} (or evaluator.apiKey) for direct mode, or run archal login for Archal backend mode`,
+        warning: true
+      });
+    }
+    if (resolvedKey && (mode === "direct" || mode === "auto")) {
+      const mismatch = validateKeyForProvider(resolvedKey, provider);
+      if (mismatch) {
+        errors.push({
+          check: "seed-key-provider-mismatch",
+          message: mismatch,
+          warning: true
+        });
+      }
+    }
   }
   return errors;
 }
@@ -12479,6 +12792,35 @@ async function runScenario(options) {
       'cloudTwinUrls is required. Local twin execution has been removed; use "archal run" to provision a hosted session.'
     );
   }
+  const criterionDescriptions = {};
+  const criterionTypes = {};
+  for (const c of scenario.successCriteria) {
+    criterionDescriptions[c.id] = c.description;
+    criterionTypes[c.id] = c.type;
+  }
+  const buildInconclusiveSeedReport = (message) => ({
+    scenarioTitle: scenario.title,
+    satisfactionScore: 0,
+    criterionDescriptions,
+    criterionTypes,
+    twinNames: scenario.config.twins,
+    runs: [{
+      runIndex: 0,
+      evaluations: scenario.successCriteria.map((criterion) => ({
+        criterionId: criterion.id,
+        status: "fail",
+        confidence: 1,
+        explanation: `Run not scored due to seed setup failure: ${message}`
+      })),
+      overallScore: 0,
+      trace: [],
+      durationMs: 0,
+      error: message,
+      outcome: "inconclusive_seed"
+    }],
+    summary: `Inconclusive (seed setup): ${message}`,
+    timestamp: (/* @__PURE__ */ new Date()).toISOString()
+  });
   const preflightErrors = preflightCheck(
     scenario,
     config.apiKey,
@@ -12569,7 +12911,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
             cacheContext: seedPromptContext
           });
         }
-        throw new Error(message);
+        return buildInconclusiveSeedReport(message);
       }
       warn(message);
       generationTargets.push(sel);
@@ -12578,12 +12920,11 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
   if (generationTargets.length > 0) {
     progress("Generating dynamic seeds from setup description...");
     const dynamicConfig = {
-      apiKey: "",
-      // Seed gen always routes through Archal backend
+      apiKey: config.apiKey,
       model: config.seedModel,
       baseUrl: config.baseUrl,
       noCache: options.noSeedCache,
-      providerMode: "archal"
+      providerMode: config.seedProvider
     };
     let cloudSeedSnapshotByTwin = null;
     const adminAuth = options.apiAdminToken ? { token: options.apiAdminToken, userId: options.apiAdminUserId } : void 0;
@@ -12601,20 +12942,28 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
         baseSeedData = normalizeSeedState(cloudSeedSnapshotByTwin[sel.twinName]);
       }
       if (!baseSeedData || Object.keys(baseSeedData).length === 0) {
-        throw new Error(
+        return buildInconclusiveSeedReport(
           `Could not load base seed "${sel.seedName}" for twin "${sel.twinName}" from disk. Ensure the seed file exists at twins/${sel.twinName}/seeds/${sel.seedName}.json or .sql, or that the hosted twin /state endpoint is reachable.`
         );
       }
       progress(`Generating dynamic seed for ${sel.twinName}...`);
-      const result = await generateDynamicSeed(
-        sel.twinName,
-        sel.seedName,
-        baseSeedData,
-        scenario.setup,
-        dynamicConfig,
-        extractedIntentByTwin.get(sel.twinName),
-        seedPromptContext
-      );
+      let result;
+      try {
+        result = await generateDynamicSeed(
+          sel.twinName,
+          sel.seedName,
+          baseSeedData,
+          scenario.setup,
+          dynamicConfig,
+          extractedIntentByTwin.get(sel.twinName),
+          seedPromptContext
+        );
+      } catch (error2) {
+        const detail = error2 instanceof Error ? error2.message : String(error2);
+        return buildInconclusiveSeedReport(
+          `Dynamic seed generation failed for twin "${sel.twinName}": ${detail}`
+        );
+      }
       sel.seedData = result.seed;
       if (result.fromCache) {
         cachedSeedTwins.push(sel.twinName);
@@ -12630,7 +12979,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
   }
   const missingDynamicSeeds = seedSelections.filter((sel) => !sel.seedData);
   if (missingDynamicSeeds.length > 0) {
-    throw new Error(
+    return buildInconclusiveSeedReport(
       `Missing dynamic seed state for twin(s): ${missingDynamicSeeds.map((sel) => sel.twinName).join(", ")}`
     );
   }
@@ -12825,8 +13174,8 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
     return {
       scenarioTitle: scenario.title,
       satisfactionScore: 100,
-      criterionDescriptions: {},
-      criterionTypes: {},
+      criterionDescriptions,
+      criterionTypes,
       twinNames: scenario.config.twins,
       runs: [],
       summary: "Preflight checks passed",
@@ -12865,8 +13214,8 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
       adminAuth
     );
     runs.push(result);
-    printRunProgress(i, numRuns, result.overallScore, result.error);
-    if (result.error) {
+    printRunProgress(i, numRuns, result.overallScore, result.error, result.outcome);
+    if (result.outcome === "inconclusive_infrastructure" || result.outcome === "inconclusive_seed") {
       consecutiveInfraErrors++;
       if (consecutiveInfraErrors >= EARLY_ABORT_THRESHOLD && i < numRuns - 1) {
         warn(`${consecutiveInfraErrors} consecutive run errors \u2014 aborting remaining ${numRuns - i - 1} run(s) to avoid wasting quota.`);
@@ -12876,19 +13225,17 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
       consecutiveInfraErrors = 0;
     }
   }
-  const runScores = runs.map((r) => r.overallScore);
+  const scoredRuns = runs.filter(
+    (run) => run.outcome !== "inconclusive_infrastructure" && run.outcome !== "inconclusive_seed"
+  );
+  const runScores = scoredRuns.map((r) => r.overallScore);
   const satisfactionScore = aggregateSatisfaction(runScores);
-  const allEvaluations = runs.map((r) => r.evaluations);
-  const summary = generateSummary(allEvaluations, satisfactionScore);
-  const criterionDescriptions = {};
-  const criterionTypes = {};
-  for (const c of scenario.successCriteria) {
-    criterionDescriptions[c.id] = c.description;
-    criterionTypes[c.id] = c.type;
-  }
+  const allEvaluations = scoredRuns.map((r) => r.evaluations);
+  const inconclusiveRuns = runs.length - scoredRuns.length;
+  const summary = scoredRuns.length > 0 ? generateSummary(allEvaluations, satisfactionScore) : `Inconclusive: no scored runs (${inconclusiveRuns} infrastructure/seed setup run failure${inconclusiveRuns === 1 ? "" : "s"}).`;
   let failureAnalysis;
-  if (satisfactionScore < 100 && runs.length > 0 && !options.noFailureAnalysis) {
-    const representativeRun = runs.reduce(
+  if (satisfactionScore < 100 && scoredRuns.length > 0 && !options.noFailureAnalysis) {
+    const representativeRun = scoredRuns.reduce(
       (worst, r) => r.overallScore < worst.overallScore ? r : worst
     );
     const failedCriteria = representativeRun.evaluations.filter((e) => e.status !== "pass").map((e) => ({
@@ -12911,7 +13258,9 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
           stateDiff: representativeRun.stateDiff ?? { added: {}, modified: {}, removed: {} },
           stateBefore: representativeRun.stateBefore ?? {},
           stateAfter: representativeRun.stateAfter ?? {},
-          satisfactionScore
+          satisfactionScore,
+          agentLog: representativeRun.agentLog,
+          agentError: representativeRun.error
         },
         evaluatorConfig
       );
@@ -13690,7 +14039,21 @@ function createRunCommand() {
         }
       }
       if (!process.env["ARCHAL_ENGINE_API_KEY"] && userConfig.engineApiKey) {
-        process.env["ARCHAL_ENGINE_API_KEY"] = userConfig.engineApiKey;
+        const configKey = userConfig.engineApiKey;
+        const requestedModel = firstNonEmpty(
+          opts.engineModel,
+          process.env["ARCHAL_ENGINE_MODEL"],
+          opts.model
+          // -m also defaults the engine model for local harnesses
+        );
+        if (requestedModel) {
+          const modelProvider = detectProvider(requestedModel);
+          if (!validateKeyForProvider(configKey, modelProvider)) {
+            process.env["ARCHAL_ENGINE_API_KEY"] = configKey;
+          }
+        } else {
+          process.env["ARCHAL_ENGINE_API_KEY"] = configKey;
+        }
       }
     }
     inferEngineModelFromEvaluatorModel(opts);
@@ -13741,8 +14104,17 @@ function createRunCommand() {
       }
     }
     if (engine.mode === "local" && !process.env["ARCHAL_ENGINE_API_KEY"]) {
+      const requestedModel = firstNonEmpty(
+        opts.engineModel,
+        process.env["ARCHAL_ENGINE_MODEL"]
+      );
+      const provider = requestedModel ? detectProvider(requestedModel) : null;
+      const providerHint = provider ? `
+  Hint: You requested model "${requestedModel}" (${provider}) but no ${provider} API key is available.
+  Set ${getProviderEnvVar(provider)} or pass --engine-key <${provider}-key>
+` : "";
       process.stderr.write(
-        "Error: No API key found. The agent harness needs an API key to call the model.\nSet one of:\n  GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY env var\n  archal config set engine.apiKey <key>\n  ARCHAL_ENGINE_API_KEY env var\n"
+        "Error: No API key found. The agent harness needs an API key to call the model.\nSet one of:\n  GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY env var\n  archal config set engine.apiKey <key>\n  ARCHAL_ENGINE_API_KEY env var\n" + providerHint
       );
       process.exit(2);
     }
@@ -13812,12 +14184,14 @@ function createRunCommand() {
           })();
           const SESSION_READY_TIMEOUT_MS = Math.max(12e4, configuredReadyTimeoutMs);
           const SESSION_POLL_INTERVAL_MS = 2e3;
-          const STATUS_READY_GRACE_MS = 5e3;
           const readyDeadline = Date.now() + SESSION_READY_TIMEOUT_MS;
           let sessionReady = false;
           let lastPollIssue;
-          let statusReadySinceMs = null;
           const isRetryablePollFailure = (result) => result.offline || typeof result.status === "number" && result.status >= 500;
+          const workersAllReady = (workers) => {
+            if (!workers || Object.keys(workers).length === 0) return true;
+            return Object.values(workers).every((value) => value === "ready");
+          };
           const sleepForPollInterval = async () => new Promise((resolve12) => setTimeout(resolve12, SESSION_POLL_INTERVAL_MS));
           if (!opts.quiet) process.stderr.write("Starting cloud session...\n");
           let pollCount = 0;
@@ -13872,26 +14246,19 @@ function createRunCommand() {
             }
             const healthAlive = healthResult.ok && healthResult.data.alive;
             const statusAlive = statusResult.data.alive || status === "ready";
-            if (statusAlive && healthAlive) {
+            const statusWorkersReady = workersAllReady(
+              statusResult.data.twins ?? statusResult.data.workers
+            );
+            const healthWorkersReady = workersAllReady(healthResult.data.twins);
+            if (statusAlive && healthAlive && statusWorkersReady && healthWorkersReady) {
               sessionReady = true;
               break;
             }
-            if (statusAlive && !healthAlive) {
-              if (statusReadySinceMs === null) {
-                statusReadySinceMs = Date.now();
-              }
-              const readyForMs = Date.now() - statusReadySinceMs;
-              if (readyForMs >= STATUS_READY_GRACE_MS) {
-                debug(
-                  `Session ${backendSessionId} proceeded after health endpoint warmup (${readyForMs}ms).`
-                );
-                sessionReady = true;
-                break;
-              }
-            } else {
-              statusReadySinceMs = null;
-            }
-            lastPollIssue = `session still starting (status=${status}, health=${healthAlive ? "alive" : "starting"})`;
+            const statusTwinStates = Object.entries(
+              statusResult.data.twins ?? statusResult.data.workers ?? {}
+            ).map(([twin, twinStatus]) => `${twin}:${twinStatus}`).join(", ");
+            const healthTwinStates = Object.entries(healthResult.data.twins ?? {}).map(([twin, twinStatus]) => `${twin}:${twinStatus}`).join(", ");
+            lastPollIssue = `session still starting (status=${status}, health=${healthAlive ? "alive" : "starting"}, statusTwins=[${statusTwinStates || "n/a"}], healthTwins=[${healthTwinStates || "n/a"}])`;
             await sleepForPollInterval();
           }
           if (sessionReady) {
@@ -14292,6 +14659,7 @@ function buildEvidenceArtifacts(report) {
     overallScore: run.overallScore,
     durationMs: run.durationMs,
     error: run.error ?? null,
+    outcome: run.outcome ?? null,
     evaluations: (run.evaluations ?? []).map((ev) => ({
       criterionId: ev.criterionId,
       status: ev.status,
@@ -14611,7 +14979,7 @@ import { createInterface as createInterface2 } from "readline";
 import { Command as Command5 } from "commander";
 // src/telemetry/anonymizer.ts
-import { createHash as createHash4 } from "crypto";
+import { createHash as createHash5 } from "crypto";
 var API_KEY_PATTERNS = [
   /(?:api[_-]?key|token|secret|password|authorization|bearer|credential)\s*[:=]\s*["']?([a-zA-Z0-9_\-/.+=]{16,})["']?/gi,
   /sk-[a-zA-Z0-9]{20,}/g,
@@ -14661,7 +15029,7 @@ var USERNAME_FIELDS = /* @__PURE__ */ new Set([
   "maintainer"
 ]);
 function hashValue2(value, salt = "archal") {
-  return `anon_${createHash4("sha256").update(`${salt}:${value}`).digest("hex").slice(0, 12)}`;
+  return `anon_${createHash5("sha256").update(`${salt}:${value}`).digest("hex").slice(0, 12)}`;
 }
 function anonymizeForEnterprise(entries) {
   debug("Enterprise anonymization", { entryCount: String(entries.length) });
@@ -15637,7 +16005,7 @@ function createDoctorCommand() {
 // src/commands/login.ts
 import { Command as Command8 } from "commander";
 import { exec } from "child_process";
-import { createHash as createHash5, randomBytes as randomBytes2 } from "crypto";
+import { createHash as createHash6, randomBytes as randomBytes2 } from "crypto";
 import { createServer } from "http";
 var START_PORT = 51423;
 var LOGIN_TIMEOUT_MS = 5 * 60 * 1e3;
@@ -15658,7 +16026,7 @@ function openBrowser(url) {
 }
 function createPkcePair() {
   const codeVerifier = randomBytes2(32).toString("base64url");
-  const codeChallenge = createHash5("sha256").update(codeVerifier).digest("base64url");
+  const codeChallenge = createHash6("sha256").update(codeVerifier).digest("base64url");
   return { codeVerifier, codeChallenge };
 }
 function isPlan2(value) {
@@ -16388,11 +16756,25 @@ function detectProviderName(model) {
   if (normalized.startsWith("gpt-") || normalized.startsWith("o1-") || normalized.startsWith("o3-") || normalized.startsWith("o4-")) return "OpenAI";
   return "OpenAI-compatible";
 }
-function resolveEngineApiKey(explicitKey) {
+function resolveEngineApiKey(explicitKey, model) {
   if (explicitKey?.trim()) return explicitKey.trim();
   if (process.env["ARCHAL_ENGINE_API_KEY"]?.trim()) return process.env["ARCHAL_ENGINE_API_KEY"].trim();
+  const modelProvider = model ? detectProvider(model) : null;
   const config = loadConfig();
-  if (config.engineApiKey) return config.engineApiKey;
+  if (config.engineApiKey) {
+    if (!modelProvider || !validateKeyForProvider(config.engineApiKey, modelProvider)) {
+      return config.engineApiKey;
+    }
+  }
+  const providerEnvVars = {
+    gemini: "GEMINI_API_KEY",
+    openai: "OPENAI_API_KEY",
+    anthropic: "ANTHROPIC_API_KEY"
+  };
+  if (modelProvider && providerEnvVars[modelProvider]) {
+    const val = process.env[providerEnvVars[modelProvider]]?.trim();
+    if (val) return val;
+  }
   for (const envVar of ["GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]) {
     const val = process.env[envVar]?.trim();
     if (val) return val;
@@ -16441,7 +16823,7 @@ function createDemoCommand() {
       process.exit(1);
     }
     const providerName = detectProviderName(opts.model);
-    const engineApiKey = resolveEngineApiKey(opts.apiKey);
+    const engineApiKey = resolveEngineApiKey(opts.apiKey, opts.model);
     if (!engineApiKey) {
       process.stderr.write(
         `Error: No API key found for model "${opts.model}" (${providerName}).

package/harnesses/_lib/providers.mjs CHANGED Viewed

@@ -5,7 +5,7 @@
  * Env var overrides:
  *   ARCHAL_MAX_TOKENS         — Max completion tokens (default from model-configs)
  *   ARCHAL_TEMPERATURE        — Sampling temperature
- *   ARCHAL_LLM_TIMEOUT        — Per-call timeout in seconds (default 120)
+ *   ARCHAL_LLM_TIMEOUT        — Per-call timeout in seconds (default 180)
  *   ARCHAL_OPENAI_BASE_URL    — Override OpenAI base URL (for proxies, Azure, etc.)
  *   ARCHAL_ANTHROPIC_BASE_URL — Override Anthropic base URL
  *   ARCHAL_GEMINI_BASE_URL    — Override Gemini base URL
@@ -48,19 +48,41 @@ const PROVIDER_ENV_VARS = {
   openai: 'OPENAI_API_KEY',
 };
+function inferKeyProvider(key) {
+  if (!key) return null;
+  if (key.startsWith('AIzaSy')) return 'gemini';
+  if (key.startsWith('sk-ant-')) return 'anthropic';
+  if (key.startsWith('sk-')) return 'openai';
+  return null;
+}
 /**
  * Resolve the API key for the detected provider.
  * Priority: ARCHAL_ENGINE_API_KEY > provider-specific env var.
+ * If ARCHAL_ENGINE_API_KEY clearly belongs to a different provider, fall back
+ * to provider-specific key when available, otherwise fail with a clear error.
  * @param {string} provider
  * @returns {string}
  */
 export function resolveApiKey(provider) {
-  const engineKey = process.env['ARCHAL_ENGINE_API_KEY']?.trim();
-  if (engineKey) return engineKey;
   const envVar = PROVIDER_ENV_VARS[provider] ?? 'OPENAI_API_KEY';
-  const key = process.env[envVar]?.trim();
-  if (key) return key;
+  const providerKey = process.env[envVar]?.trim();
+  const engineKey = process.env['ARCHAL_ENGINE_API_KEY']?.trim();
+  if (engineKey) {
+    const inferred = inferKeyProvider(engineKey);
+    if (!inferred || inferred === provider) return engineKey;
+    if (providerKey) {
+      process.stderr.write(
+        `[harness] Warning: ARCHAL_ENGINE_API_KEY appears to be for ${inferred}; using ${envVar} for ${provider} model.\n`,
+      );
+      return providerKey;
+    }
+    throw new Error(
+      `ARCHAL_ENGINE_API_KEY appears to be for ${inferred}, but provider "${provider}" requires ${envVar}. ` +
+      `Set ${envVar} or use a ${inferred} model.`
+    );
+  }
+  if (providerKey) return providerKey;
   throw new Error(
     `No API key found for provider "${provider}". ` +
@@ -111,7 +133,7 @@ function getLlmTimeoutMs() {
       return parsed * 1000;
     }
   }
-  return 120_000; // 120 seconds default
+  return 180_000; // 180 seconds default
 }
 // ── Thinking configuration ──────────────────────────────────────────

package/harnesses/hardened/agent.mjs CHANGED Viewed

@@ -107,10 +107,19 @@ try {
     // Call the LLM with retry on transient errors
     log.llmCall(step + 1);
-    const response = await withRetry(
-      () => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
-      2,
-    );
+    let response;
+    try {
+      response = await withRetry(
+        () => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
+        2,
+      );
+    } catch (err) {
+      const msg = err?.message ?? String(err);
+      log.error('llm_call_failed', { step: step + 1, error: msg });
+      process.stderr.write(`[hardened] LLM API error: ${msg.slice(0, 500)}\n`);
+      exitReason = 'llm_error';
+      break;
+    }
     const iterDurationMs = Date.now() - iterStart;
     totalInputTokens += response.usage.inputTokens;
@@ -218,4 +227,8 @@ try {
     `(${totalToolErrors} errors), ${totalInputTokens} input tokens, ` +
     `${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
   );
+  if (exitReason === 'llm_error') {
+    process.exit(1);
+  }
 }

package/harnesses/naive/agent.mjs CHANGED Viewed

@@ -84,7 +84,16 @@ try {
     const iterStart = Date.now();
     log.llmCall(step + 1);
-    const response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
+    let response;
+    try {
+      response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
+    } catch (err) {
+      const msg = err?.message ?? String(err);
+      log.error('llm_call_failed', { step: step + 1, error: msg });
+      process.stderr.write(`[naive] LLM API error: ${msg.slice(0, 500)}\n`);
+      exitReason = 'llm_error';
+      break;
+    }
     const iterDurationMs = Date.now() - iterStart;
     totalInputTokens += response.usage.inputTokens;
@@ -150,4 +159,7 @@ try {
     `${(totalTimeMs / 1000).toFixed(1)}s total\n`
   );
+  if (exitReason === 'llm_error') {
+    process.exit(1);
+  }
 }

package/harnesses/react/agent.mjs CHANGED Viewed

@@ -6,7 +6,7 @@
  * - Structured system prompt encouraging step-by-step reasoning
  * - Error recovery with retries on transient failures
  * - Context-aware done detection
- * - Max 50 steps safety limit
+ * - Configurable step limit (default 80, cap 200 via ARCHAL_MAX_STEPS)
  * - Token usage and timing instrumentation
  *
  * Env vars (set by archal orchestrator):
@@ -34,7 +34,21 @@ import { createLogger } from '../_lib/logging.mjs';
 import { writeMetrics } from '../_lib/metrics.mjs';
 import { createAgentTrace } from '../_lib/agent-trace.mjs';
-const MAX_STEPS = 50;
+const DEFAULT_MAX_STEPS = 80;
+const MAX_STEPS = (() => {
+  const raw = process.env['ARCHAL_MAX_STEPS']?.trim();
+  if (!raw) return DEFAULT_MAX_STEPS;
+  const parsed = parseInt(raw, 10);
+  if (Number.isNaN(parsed) || parsed <= 0) return DEFAULT_MAX_STEPS;
+  return Math.min(parsed, 200);
+})();
+const MAX_CONSECUTIVE_ERRORS = (() => {
+  const raw = process.env['ARCHAL_MAX_CONSECUTIVE_ERRORS']?.trim();
+  if (!raw) return 8;
+  const parsed = parseInt(raw, 10);
+  if (Number.isNaN(parsed) || parsed <= 0) return 8;
+  return Math.min(parsed, 20);
+})();
 const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
 const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
@@ -95,10 +109,19 @@ try {
     // Call the LLM with retry on transient errors
     log.llmCall(step + 1);
-    const response = await withRetry(
-      () => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
-      2,
-    );
+    let response;
+    try {
+      response = await withRetry(
+        () => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
+        2,
+      );
+    } catch (err) {
+      const msg = err?.message ?? String(err);
+      log.error('llm_call_failed', { step: step + 1, error: msg });
+      process.stderr.write(`[react] LLM API error: ${msg.slice(0, 500)}\n`);
+      exitReason = 'llm_error';
+      break;
+    }
     const iterDurationMs = Date.now() - iterStart;
     totalInputTokens += response.usage.inputTokens;
@@ -154,7 +177,7 @@ try {
         process.stderr.write(`[react] Tool error (${consecutiveErrors}): ${err.message}\n`);
         // Bail if too many consecutive errors
-        if (consecutiveErrors >= 5) {
+        if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
           process.stderr.write('[react] Too many consecutive tool errors — stopping.\n');
           exitReason = 'consecutive_errors';
           break;
@@ -171,7 +194,7 @@ try {
       durationMs: iterDurationMs,
     });
-    if (consecutiveErrors >= 5) break;
+    if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) break;
     // Append tool results to conversation
     messages = appendToolResults(provider, messages, toolCalls, results);
@@ -209,4 +232,7 @@ try {
     `${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
   );
+  if (exitReason === 'llm_error') {
+    process.exit(1);
+  }
 }

package/harnesses/zero-shot/agent.mjs CHANGED Viewed

@@ -77,7 +77,16 @@ try {
     const iterStart = Date.now();
     log.llmCall(step + 1);
-    const response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
+    let response;
+    try {
+      response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
+    } catch (err) {
+      const msg = err?.message ?? String(err);
+      log.error('llm_call_failed', { step: step + 1, error: msg });
+      process.stderr.write(`[zero-shot] LLM API error: ${msg.slice(0, 500)}\n`);
+      exitReason = 'llm_error';
+      break;
+    }
     const iterDurationMs = Date.now() - iterStart;
     totalInputTokens += response.usage.inputTokens;
@@ -169,4 +178,7 @@ try {
     `${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
   );
+  if (exitReason === 'llm_error') {
+    process.exit(1);
+  }
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@archal/cli",
-  "version": "0.7.10",
+  "version": "0.7.11",
   "description": "Pre-deployment testing for AI agents",
   "type": "module",
   "main": "dist/index.js",