npm - @archal/cli - Versions diffs - 0.7.9 → 0.7.11 - Mend

@archal/cli 0.7.9 → 0.7.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.js +697 -146
package/harnesses/_lib/providers.mjs +29 -7
package/harnesses/hardened/agent.mjs +42 -109
package/harnesses/naive/agent.mjs +15 -3
package/harnesses/react/agent.mjs +36 -10
package/harnesses/zero-shot/agent.mjs +15 -3
package/package.json +1 -1

package/dist/index.js CHANGED

@@ -233,6 +233,7 @@ function parseCriterionLine(line, index) {
   } else {
     type = inferCriterionType(description);
   }
+  if (!description) return null;
   return {
     id: `criterion-${index + 1}`,
     description,
@@ -333,7 +334,11 @@ ${expectedBehavior}`.toLowerCase();
     github: ["github", "repository", "pull request", "create_issue", "create_pull_request", "merge_pull_request"],
     slack: ["slack", "slack channel", "send_message", "slack message", "direct message"],
     linear: ["linear", "linear ticket", "linear project", "linear cycle"],
-    jira: ["jira", "jira sprint", "jira epic", "jira board"]
+    jira: ["jira", "jira sprint", "jira epic", "jira board"],
+    stripe: ["stripe", "payment", "refund", "subscription", "invoice", "charge"],
+    supabase: ["supabase", "database", "sql query", "database table"],
+    "google-workspace": ["google workspace", "gmail", "google calendar", "google drive", "google docs"],
+    browser: ["browser", "web page", "navigate to", "click on", "web content"]
   };
   for (const [twin, keywords] of Object.entries(twinKeywords)) {
     if (keywords.some((kw) => combined.includes(kw))) {
@@ -425,7 +430,9 @@ function validateScenario(scenario) {
     }
   }
   if (scenario.config.twins.length === 0) {
-    errors.push("Scenario does not reference any known twins (specify in Config section or mention services in Setup/Expected Behavior)");
+    errors.push(
+      'Scenario does not reference any known twins. Add a "## Config" section with "twins: github" (or slack, linear, jira, stripe, supabase, google-workspace, browser). Alternatively, mention the service name in ## Setup or ## Expected Behavior.'
+    );
   }
   if (scenario.config.timeout <= 0) {
     errors.push("Timeout must be a positive number");
@@ -1107,6 +1114,8 @@ var HTTP_RETRYABLE_STATUS_CODES = /* @__PURE__ */ new Set([408, 425, 429, 500, 5
 var HTTP_PUSH_TIMEOUT_MS = 2e4;
 var HTTP_PUSH_MAX_RETRIES = 6;
 var HTTP_PUSH_BACKOFF_MS = [1e3, 2e3, 3e3, 5e3, 5e3, 5e3];
+var HTTP_PUSH_WARMUP_RETRIES = 6;
+var HTTP_PUSH_WARMUP_BACKOFF_MS = [1500, 2500, 3500, 5e3, 6e3, 7e3];
 function resolveRetryDelay(backoffMs, attempt, fallbackMs) {
   const indexed = backoffMs[attempt];
   if (typeof indexed === "number" && Number.isFinite(indexed) && indexed >= 0) {
@@ -1157,6 +1166,10 @@ async function fetchWithRetry(url, options, retryOptions) {
 function twinBasePath(url) {
   return url.replace(/\/(mcp|api)\/?$/, "");
 }
+function isTwinWorkerWarmupResponse(status, body) {
+  if (status !== 503) return false;
+  return /twin worker endpoint not available|session is busy|retry shortly/i.test(body);
+}
 async function collectStateFromHttp(twinUrls, bearerToken, adminAuth) {
   const state = {};
   const failures = [];
@@ -1201,25 +1214,44 @@ async function pushStateToCloud(twinUrls, seedSelections, bearerToken, adminAuth
     }
     const url = `${twinBasePath(baseUrl)}/state`;
     debug(`Pushing dynamic seed to ${sel.twinName}`, { url });
-    const response = await fetchWithRetry(
-      url,
-      {
-        method: "PUT",
-        headers,
-        body: JSON.stringify(sel.seedData)
-      },
-      {
-        retries: HTTP_PUSH_MAX_RETRIES,
-        timeoutMs: HTTP_PUSH_TIMEOUT_MS,
-        backoffMs: HTTP_PUSH_BACKOFF_MS
+    const payload = JSON.stringify(sel.seedData);
+    let pushed = false;
+    for (let warmupAttempt = 0; warmupAttempt <= HTTP_PUSH_WARMUP_RETRIES; warmupAttempt++) {
+      const response = await fetchWithRetry(
+        url,
+        {
+          method: "PUT",
+          headers,
+          body: payload
+        },
+        {
+          retries: HTTP_PUSH_MAX_RETRIES,
+          timeoutMs: HTTP_PUSH_TIMEOUT_MS,
+          backoffMs: HTTP_PUSH_BACKOFF_MS
+        }
+      );
+      if (response.ok) {
+        pushed = true;
+        break;
       }
-    );
-    if (!response.ok) {
       const text = await response.text().catch(() => "");
+      const isWarmup = isTwinWorkerWarmupResponse(response.status, text);
+      if (isWarmup && warmupAttempt < HTTP_PUSH_WARMUP_RETRIES) {
+        const delay = resolveRetryDelay(HTTP_PUSH_WARMUP_BACKOFF_MS, warmupAttempt, 5e3);
+        warn(
+          `Twin "${sel.twinName}" not ready for state push (HTTP 503), retrying in ${delay}ms`,
+          { attempt: `${warmupAttempt + 1}/${HTTP_PUSH_WARMUP_RETRIES + 1}` }
+        );
+        await new Promise((resolve12) => setTimeout(resolve12, delay));
+        continue;
+      }
       throw new Error(
         `Failed to push dynamic seed to twin "${sel.twinName}": HTTP ${response.status}${text ? ` (${text})` : ""}`
       );
     }
+    if (!pushed) {
+      throw new Error(`Failed to push dynamic seed to twin "${sel.twinName}": worker warmup did not complete in time`);
+    }
     debug(`Pushed dynamic seed to ${sel.twinName} successfully`);
   }
 }
@@ -3072,7 +3104,7 @@ async function callLlmViaArchal(options) {
   debug("Archal backend response", { model: actualModel, remaining: String(result.data.remaining ?? "unknown") });
   const isSeedGen = options.intent === "seed-generate";
   if (!modelMismatchWarned && !isSeedGen && options.model && actualModel && !actualModel.includes(options.model) && !options.model.includes(actualModel)) {
-    warn(`Requested model "${options.model}" but Archal backend used "${actualModel}". To use a specific model, set provider to "direct" with your own API key.`);
+    debug(`Archal backend used "${actualModel}" (requested "${options.model}"). To use a specific model, set provider to "direct" with your own API key.`);
     modelMismatchWarned = true;
   }
   return result.data.text;
@@ -3195,6 +3227,47 @@ async function callAnthropic(options) {
   if (!textBlock?.text) throw new Error("Anthropic returned no text content");
   return textBlock.text;
 }
+function extractOpenAiTextContent(data) {
+  const message = data.choices?.[0]?.message;
+  if (!message) return null;
+  if (typeof message.content === "string") {
+    const trimmed = message.content.trim();
+    return trimmed.length > 0 ? trimmed : null;
+  }
+  if (Array.isArray(message.content)) {
+    const textSegments = [];
+    for (const part of message.content) {
+      if (typeof part === "string") {
+        const trimmed = part.trim();
+        if (trimmed.length > 0) textSegments.push(trimmed);
+        continue;
+      }
+      if (!part || typeof part !== "object") continue;
+      const partText = part.text;
+      if (typeof partText === "string") {
+        const trimmed = partText.trim();
+        if (trimmed.length > 0) textSegments.push(trimmed);
+        continue;
+      }
+      if (partText && typeof partText === "object" && typeof partText.value === "string") {
+        const trimmed = partText.value.trim();
+        if (trimmed.length > 0) textSegments.push(trimmed);
+        continue;
+      }
+      if (typeof part.value === "string") {
+        const trimmed = part.value.trim();
+        if (trimmed.length > 0) textSegments.push(trimmed);
+      }
+    }
+    if (textSegments.length > 0) {
+      return textSegments.join("\n");
+    }
+  }
+  if (typeof message.refusal === "string" && message.refusal.trim().length > 0) {
+    return message.refusal.trim();
+  }
+  return null;
+}
 function usesMaxCompletionTokens(model) {
   return model.startsWith("gpt-5") || model.startsWith("o1-") || model.startsWith("o2-") || model.startsWith("o3-") || model.startsWith("o4-");
 }
@@ -3222,7 +3295,7 @@ async function callOpenAi(options) {
     throw new LlmApiError("OpenAI", response.status, errorText.slice(0, 200));
   }
   const data = await response.json();
-  const content = data.choices?.[0]?.message?.content;
+  const content = extractOpenAiTextContent(data);
   if (!content) throw new Error("OpenAI returned no content");
   return content;
 }
@@ -3256,7 +3329,7 @@ async function callOpenAiCompatible(options) {
     throw new LlmApiError(`OpenAI-compatible (${options.baseUrl})`, response.status, errorText.slice(0, 200));
   }
   const data = await response.json();
-  const content = data.choices?.[0]?.message?.content;
+  const content = extractOpenAiTextContent(data);
   if (!content) throw new Error("OpenAI-compatible API returned no content");
   return content;
 }
@@ -3281,13 +3354,15 @@ ${CYAN}${BOLD}archal${RESET} ${DIM}|${RESET} ${scenarioTitle}
 `);
   }
 }
-function printRunProgress(runIndex, totalRuns, score, error2) {
+function printRunProgress(runIndex, totalRuns, score, error2, outcome) {
   const { quiet } = getLoggerOptions();
   if (quiet || activeOutputFormat !== "terminal") return;
   const dots = ".".repeat(Math.max(1, 20 - String(runIndex + 1).length - String(totalRuns).length));
   if (error2) {
     const shortError = error2.length > MAX_ERROR_PREVIEW_CHARS ? error2.slice(0, MAX_ERROR_PREVIEW_CHARS - 1) + "\u2026" : error2;
-    process.stderr.write(`  run ${runIndex + 1}/${totalRuns} ${DIM}${dots}${RESET} ${RED}ERROR${RESET} ${DIM}(${shortError})${RESET}
+    const inconclusive = outcome === "inconclusive_infrastructure" || outcome === "inconclusive_seed";
+    const label = inconclusive ? `${YELLOW}INCONCLUSIVE${RESET}` : `${RED}ERROR${RESET}`;
+    process.stderr.write(`  run ${runIndex + 1}/${totalRuns} ${DIM}${dots}${RESET} ${label} ${DIM}(${shortError})${RESET}
 `);
     return;
   }
@@ -4193,8 +4268,35 @@ function filterByPredicate(items, predicate) {
   if (knownMatches.length > 0) {
     return { items: knownMatches, recognized: true };
   }
+  const ACTION_VERBS = /* @__PURE__ */ new Set([
+    "listed",
+    "fetched",
+    "retrieved",
+    "found",
+    "searched",
+    "queried",
+    "posted",
+    "sent",
+    "received",
+    "notified",
+    "alerted",
+    "reviewed",
+    "analyzed",
+    "inspected",
+    "checked",
+    "verified",
+    "triaged",
+    "escalated",
+    "assigned",
+    "tagged",
+    "labeled",
+    "updated",
+    "edited",
+    "patched",
+    "migrated"
+  ]);
   const isSingleWord = !lowerPredicate.includes(" ");
-  if (isSingleWord) {
+  if (isSingleWord && !ACTION_VERBS.has(lowerPredicate)) {
     const hasKnownField = items.some((item) => {
       if (typeof item !== "object" || item === null) return false;
       const obj = item;
@@ -5466,24 +5568,46 @@ ${JSON.stringify(context.stateDiff, null, 2)}
 ## Agent Trace Evidence
 ${traceEvidence}`;
 }
+function estimateTokens(value) {
+  const json = JSON.stringify(value);
+  return Math.ceil(json.length / 4);
+}
+var MAX_STATE_TOKENS = 4e4;
 function summarizeState(state) {
   const flat = flattenTwinState(state);
   const summary = {};
   for (const [key, value] of Object.entries(flat)) {
     if (Array.isArray(value)) {
-      if (value.length <= 100) {
+      if (value.length <= 50) {
         summary[key] = value;
       } else {
         summary[key] = {
           _count: value.length,
-          _first20: value.slice(0, 20),
-          _last20: value.slice(-20)
+          _first10: value.slice(0, 10),
+          _last10: value.slice(-10)
         };
       }
     } else {
       summary[key] = value;
     }
   }
+  let totalTokens = estimateTokens(summary);
+  if (totalTokens > MAX_STATE_TOKENS) {
+    const collectionSizes = Object.entries(summary).map(([key, value]) => ({ key, tokens: estimateTokens(value) })).sort((a, b) => b.tokens - a.tokens);
+    for (const { key } of collectionSizes) {
+      if (totalTokens <= MAX_STATE_TOKENS) break;
+      const value = summary[key];
+      if (!Array.isArray(value)) continue;
+      const before = estimateTokens(value);
+      summary[key] = {
+        _count: value.length,
+        _first5: value.slice(0, 5),
+        _last5: value.slice(-5),
+        _truncated: "Collection too large for evaluation \u2014 showing subset"
+      };
+      totalTokens -= before - estimateTokens(summary[key]);
+    }
+  }
   return summary;
 }
 function parseJudgeResponse(text) {
@@ -5583,6 +5707,15 @@ async function evaluateWithLlm(criterion, expectedBehavior, stateBefore, stateAf
       };
     }
     const message = err instanceof Error ? err.message : String(err);
+    if (err instanceof LlmApiError && err.status === 400 && message.includes("too long")) {
+      warn(`LLM judge prompt too large for criterion "${criterion.id}" \u2014 twin state may be too large for evaluation`);
+      return {
+        criterionId: criterion.id,
+        status: "fail",
+        confidence: 0,
+        explanation: "LLM evaluation skipped: prompt exceeded model context window. The scenario state is too large for probabilistic evaluation. Consider using deterministic [D] criteria for this scenario."
+      };
+    }
     error(`LLM judge call failed: ${message}`);
     return {
       criterionId: criterion.id,
@@ -5809,6 +5942,17 @@ function buildFailureAnalysisPrompt(input) {
   );
   sections.push(`## Passed Criteria (${input.passedCriteria.length})`);
   sections.push(input.passedCriteria.map((c) => `- ${sanitizeForPrompt(c.description, 300)}`).join("\n"));
+  if (input.agentError || input.agentLog) {
+    sections.push(`## Agent Execution Context`);
+    if (input.agentError) {
+      sections.push(`Error: ${sanitizeForPrompt(input.agentError, 300)}`);
+    }
+    if (input.agentLog) {
+      const logTail = input.agentLog.length > 800 ? input.agentLog.slice(-800) : input.agentLog;
+      sections.push(`Agent log (tail):
+${sanitizeForPrompt(logTail, 800)}`);
+    }
+  }
   sections.push(`## Agent Trace (${input.trace.length} tool calls)`);
   sections.push(
     input.trace.length === 0 ? "(Agent made no tool calls - likely crashed or timed out)" : JSON.stringify(traceFormatted, null, 2)
@@ -6552,7 +6696,7 @@ function resolveTelemetryEndpointFromEnv() {
   if (!fallbackBaseUrl) {
     return null;
   }
-  return `${fallbackBaseUrl}/api/traces`;
+  return `${fallbackBaseUrl}/v1/traces`;
 }
 function resolveIngestToken() {
   return process.env["ARCHAL_TELEMETRY_TOKEN"]?.trim();
@@ -6701,8 +6845,26 @@ function isTelemetryEnabled() {
   if (consent !== "pending") return consent === "granted";
   return loadConfig().telemetry;
 }
-function buildStructuredRunError(runIndex, error2) {
+function buildStructuredRunError(runIndex, error2, outcome) {
   const message = error2.trim();
+  if (outcome === "inconclusive_seed") {
+    return {
+      runIndex,
+      message,
+      category: "seed_setup",
+      code: "SEED_SETUP_ERROR",
+      retryable: true
+    };
+  }
+  if (outcome === "inconclusive_infrastructure") {
+    return {
+      runIndex,
+      message,
+      category: "infrastructure",
+      code: "INFRASTRUCTURE_ERROR",
+      retryable: true
+    };
+  }
   if (message.startsWith("Agent not found:")) {
     return {
       runIndex,
@@ -6944,7 +7106,7 @@ function buildMetadata(report, totalEntries) {
     },
     agentInternals: {
       runDurationsMs: report.runs.map((run) => run.durationMs),
-      runErrors: report.runs.filter((run) => typeof run.error === "string" && run.error.length > 0).map((run) => buildStructuredRunError(run.runIndex, run.error)),
+      runErrors: report.runs.filter((run) => typeof run.error === "string" && run.error.length > 0).map((run) => buildStructuredRunError(run.runIndex, run.error, run.outcome)),
       evaluationCounts: { pass: passCount, partial: partialCount, fail: failCount },
       runSummaries: report.runs.map((run) => ({
         runIndex: run.runIndex,
@@ -7119,6 +7281,7 @@ async function uploadIfEnabled(traceId, report) {
 }
 // src/runner/dynamic-seed-generator.ts
+import { createHash as createHash4 } from "crypto";
 import { z as z4 } from "zod";
 // src/runner/seed-schemas/seed-schema-inference.ts
@@ -8240,7 +8403,8 @@ var RELATIONSHIP_RULES = {
     { sourceCollection: "disputes", sourceField: "paymentIntentId", targetCollection: "paymentIntents", targetField: "paymentIntentId", optional: true }
   ],
   jira: [
-    { sourceCollection: "issues", sourceField: "projectId", targetCollection: "projects", targetField: "id" }
+    { sourceCollection: "issues", sourceField: "projectId", targetCollection: "projects", targetField: "id" },
+    { sourceCollection: "projects", sourceField: "leadAccountId", targetCollection: "users", targetField: "accountId" }
   ],
   linear: [
     { sourceCollection: "issues", sourceField: "teamId", targetCollection: "teams", targetField: "id" },
@@ -8484,15 +8648,20 @@ function autoFillMissingFKs(seed, twinName) {
     const targetEntities = result[rule.targetCollection];
     if (!sourceEntities || !targetEntities || targetEntities.length === 0) continue;
     const targetValues = targetEntities.map((e) => e[rule.targetField]).filter((v) => v !== void 0 && v !== null);
-    if (targetValues.length !== 1) continue;
-    const singleTarget = targetValues[0];
+    if (targetValues.length === 0) continue;
+    const validTargetSet = new Set(targetValues.map(String));
+    let fillIndex = 0;
     for (const entity of sourceEntities) {
       const e = entity;
-      if (e[rule.sourceField] === void 0 || e[rule.sourceField] === null) {
-        warn(
-          `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(singleTarget)} (only one ${rule.targetCollection} exists)`
+      const currentValue = e[rule.sourceField];
+      const needsFill = currentValue === void 0 || currentValue === null || !validTargetSet.has(String(currentValue));
+      if (needsFill) {
+        const fillValue = targetValues[fillIndex % targetValues.length];
+        fillIndex++;
+        debug(
+          `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})` + (currentValue != null ? ` (was ${String(currentValue)} \u2014 not in targets)` : "")
         );
-        e[rule.sourceField] = singleTarget;
+        e[rule.sourceField] = fillValue;
       }
     }
   }
@@ -8526,12 +8695,36 @@ function normalizeSeedData(seed, twinName) {
           }
         }
       }
+      const collectionSchema = schema[collection];
+      if (collectionSchema) {
+        for (const [field, fieldDef] of Object.entries(collectionSchema)) {
+          if (!(field in e) || e[field] === null || e[field] === void 0) continue;
+          const expectedType = fieldDef.type.split("|")[0].trim();
+          if (expectedType === "string" && typeof e[field] === "object" && e[field] !== null && !Array.isArray(e[field])) {
+            const obj = e[field];
+            const extracted = obj["login"] ?? obj["name"] ?? obj["value"] ?? obj["key"] ?? obj["id"] ?? obj["displayName"];
+            if (typeof extracted === "string") {
+              debug(`Seed normalization: coerced ${collection}.${field} from object to string "${extracted}"`);
+              e[field] = extracted;
+            } else {
+              const firstStr = Object.values(obj).find((v) => typeof v === "string");
+              if (firstStr) {
+                debug(`Seed normalization: coerced ${collection}.${field} from object to string "${firstStr}" (fallback)`);
+                e[field] = firstStr;
+              } else {
+                debug(`Seed normalization: could not coerce ${collection}.${field} from object to string, removing`);
+                delete e[field];
+              }
+            }
+          }
+        }
+      }
       if (collectionDefaults) {
         for (const [field, defaultValue] of Object.entries(collectionDefaults)) {
           if (!(field in e)) {
             e[field] = structuredClone(defaultValue);
           } else if (e[field] === null && defaultValue !== null) {
-            const fieldDef = schema[collection]?.[field];
+            const fieldDef = collectionSchema?.[field];
             if (fieldDef && !fieldDef.type.includes("null")) {
               e[field] = structuredClone(defaultValue);
             }
@@ -8540,6 +8733,15 @@ function normalizeSeedData(seed, twinName) {
       }
     }
   }
+  if (twinName === "github" && result["repos"]) {
+    for (const entity of result["repos"]) {
+      const e = entity;
+      if ((!e["fullName"] || typeof e["fullName"] !== "string") && typeof e["owner"] === "string" && typeof e["name"] === "string") {
+        e["fullName"] = `${e["owner"]}/${e["name"]}`;
+        debug(`Seed normalization: derived repos.fullName = "${e["fullName"]}"`);
+      }
+    }
+  }
   return result;
 }
@@ -8551,6 +8753,7 @@ var KIND_COLLECTION_HINTS = {
   channel: ["channels"],
   user: ["users"],
   ticket: ["issues"],
+  project: ["projects"],
   table: ["tables"],
   site: ["sites", "domains"],
   file: ["files"],
@@ -8560,6 +8763,9 @@ var KIND_COLLECTION_HINTS = {
 var ENTITY_KEY_ALIASES = {
   "repo.owner": ["ownerLogin", "owner_login", "login", "owner.login", "owner.name"],
   "issue.key": ["identifier"],
+  "project.key": ["key", "projectKey"],
+  "ticket.key": ["identifier", "key"],
+  "stripe_entity.id": ["id", "charge", "chargeId", "paymentIntentId", "invoiceId", "customerId", "disputeId"],
   "email.address": ["email", "from", "to", "cc", "bcc"],
   "file.name": ["title", "fileName", "filename", "subject", "summary"]
 };
@@ -8715,10 +8921,28 @@ function validateSeedCoverage(intent, mergedSeed) {
   const entityIssues = [];
   const quoteErrors = [];
   const quoteWarnings = [];
-  const CORE_ENTITY_KEYS = /* @__PURE__ */ new Set(["owner", "name", "fullName", "channel_name", "key", "identifier", "number"]);
+  const CORE_ENTITY_KEYS = /* @__PURE__ */ new Set(["owner", "name", "fullName", "channel_name", "key", "identifier", "number", "id"]);
+  const CONTRACT_REQUIRED_KINDS = /* @__PURE__ */ new Set([
+    "repo",
+    "pullRequest",
+    "issue",
+    "channel",
+    "user",
+    "ticket",
+    "project",
+    "table"
+  ]);
   const entityWarnings = [];
   for (const entity of intent.entities) {
     if (typeof entity.value === "boolean") continue;
+    const candidateCollections = toCollectionCandidates(mergedSeed, entity.kind, entity.value);
+    if (CONTRACT_REQUIRED_KINDS.has(entity.kind) && candidateCollections.length === 0) {
+      entityIssues.push({
+        type: "missing_entity",
+        message: `Scenario entity contract mismatch: no collections match ${entity.kind}.${entity.key}=${String(entity.value)}`
+      });
+      continue;
+    }
     if (!valueExistsInCollections(mergedSeed, entity.kind, entity.key, entity.value)) {
       const issue = {
         type: "missing_entity",
@@ -8816,7 +9040,25 @@ var NON_SUBJECT_STARTS = /* @__PURE__ */ new Set([
   "could",
   "would",
   "may",
-  "might"
+  "might",
+  "for",
+  "with",
+  "in",
+  "at",
+  "to",
+  "from",
+  "by",
+  "on",
+  "per",
+  "via",
+  "into",
+  "onto",
+  "over",
+  "under",
+  "after",
+  "before",
+  "during",
+  "as"
 ]);
 function isReasonableCountSubject(subject, expected) {
   if (expected > MAX_REASONABLE_COUNT) return false;
@@ -8827,38 +9069,96 @@ function isReasonableCountSubject(subject, expected) {
   if (/\b(?:have|has|had|were|was|are|is|been|being|do|does|did|can|could|should|will|would|may|might)\b/.test(subject.toLowerCase())) return false;
   return true;
 }
+function appearsToBeClockSuffix(text, numberStart) {
+  const prefix = text.slice(Math.max(0, numberStart - 3), numberStart);
+  return /^\d{1,2}:$/.test(prefix);
+}
+function isDecimalFragment(text, matchIndex) {
+  if (matchIndex <= 0) return false;
+  const charBefore = text[matchIndex - 1];
+  if (charBefore === ".") {
+    return matchIndex >= 2 && /\d/.test(text[matchIndex - 2]);
+  }
+  return false;
+}
+function resolveSubjectWithKey(subject, flat) {
+  const candidates = buildSubjectCandidates2(subject);
+  for (const candidate of candidates) {
+    const normalized = candidate.replace(/\s+/g, "").toLowerCase();
+    for (const [key, value] of Object.entries(flat)) {
+      const normalizedKey = key.replace(/\s+/g, "").toLowerCase();
+      if ((normalizedKey === normalized || normalizedKey === normalized + "s") && Array.isArray(value)) {
+        return { items: value, key };
+      }
+    }
+  }
+  const items = resolveSubjectInState(subject, flat);
+  return items ? { items, key: "" } : null;
+}
+function buildSubjectCandidates2(subject) {
+  const candidates = [subject];
+  if (subject.endsWith("s") && subject.length > 3) {
+    candidates.push(subject.slice(0, -1));
+  } else {
+    candidates.push(subject + "s");
+  }
+  const words = subject.split(/\s+/);
+  if (words.length > 1) {
+    candidates.push(words[0]);
+    candidates.push(words[words.length - 1]);
+  }
+  return candidates;
+}
 function verifySeedCounts(setupText, seedState) {
   const mismatches = [];
   const flat = flattenTwinState(seedState);
   const countPattern = /\b(\d+)\s+([\w\s]+?)(?:\s+(?:that|which|are|with|in|labeled|assigned)\b)/gi;
   for (const match of setupText.matchAll(countPattern)) {
+    if (isDecimalFragment(setupText, match.index)) continue;
     const expected = parseInt(match[1], 10);
     const subject = match[2].trim();
+    if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
     if (!subject || expected <= 0) continue;
     if (!isReasonableCountSubject(subject, expected)) continue;
-    const resolved = resolveSubjectInState(subject, flat);
-    if (resolved && resolved.length !== expected) {
-      mismatches.push({ subject, expected, actual: resolved.length });
+    const resolved = resolveSubjectWithKey(subject, flat);
+    if (resolved && resolved.items.length !== expected) {
+      mismatches.push({ subject, expected, actual: resolved.items.length, collectionKey: resolved.key || void 0 });
     }
   }
   const simplePattern = /\b(\d+)\s+([\w\s]+?)(?:[.,;:)]|$)/gm;
   const seenSubjects = new Set(mismatches.map((m) => m.subject.toLowerCase()));
   for (const match of setupText.matchAll(simplePattern)) {
+    if (isDecimalFragment(setupText, match.index)) continue;
     const expected = parseInt(match[1], 10);
     const subject = match[2].trim();
+    if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
     if (!subject || expected <= 0 || seenSubjects.has(subject.toLowerCase())) continue;
     if (!isReasonableCountSubject(subject, expected)) continue;
-    const resolved = resolveSubjectInState(subject, flat);
-    if (resolved && resolved.length !== expected) {
-      mismatches.push({ subject, expected, actual: resolved.length });
+    const resolved = resolveSubjectWithKey(subject, flat);
+    if (resolved && resolved.items.length !== expected) {
+      mismatches.push({ subject, expected, actual: resolved.items.length, collectionKey: resolved.key || void 0 });
       seenSubjects.add(subject.toLowerCase());
     }
   }
   return mismatches;
 }
+function trimSeedToExpectedCounts(seed, mismatches) {
+  let totalTrimmed = 0;
+  for (const m of mismatches) {
+    if (m.actual <= m.expected) continue;
+    if (!m.collectionKey || !seed[m.collectionKey]) continue;
+    const collection = seed[m.collectionKey];
+    if (collection.length > m.expected) {
+      const trimmed = collection.length - m.expected;
+      seed[m.collectionKey] = collection.slice(0, m.expected);
+      totalTrimmed += trimmed;
+    }
+  }
+  return totalTrimmed;
+}
 // src/runner/seed-cache.ts
-var CACHE_VERSION = 3;
+var CACHE_VERSION = 4;
 var NEGATIVE_CACHE_VERSION = 2;
 var NEGATIVE_PREFIX = "neg-";
 var CACHE_DIR = join7(homedir2(), ".archal", "seed-cache");
@@ -9110,7 +9410,7 @@ ${setupText}
 Extract the seed blueprint as JSON.`;
   try {
     const provider = detectProvider(config.model);
-    const apiKey = resolveProviderApiKey(config.apiKey, provider);
+    const apiKey = config.providerMode === "archal" ? "" : resolveProviderApiKey(config.apiKey ?? "", provider);
     const responseText = await callLlm({
       provider,
       model: config.model,
@@ -9129,12 +9429,26 @@ Extract the seed blueprint as JSON.`;
     }
     const parsed = parseBlueprint(responseText, twinName);
     if (!parsed) return null;
+    const validCollections = new Set(availableCollections);
+    parsed.collections = parsed.collections.filter((col) => {
+      if (validCollections.has(col.name)) return true;
+      warn(`Blueprint references unknown collection "${col.name}" for ${twinName} \u2014 dropping`);
+      return false;
+    });
     for (const col of parsed.collections) {
       const groupSum = col.groups.reduce((sum, g) => sum + g.count, 0);
       if (groupSum !== col.totalCount) {
         debug(`Blueprint group count mismatch for ${col.name}: groups sum to ${groupSum}, totalCount is ${col.totalCount}. Adjusting.`);
         col.totalCount = groupSum;
       }
+      if (col.totalCount === 0) {
+        debug(`Blueprint collection ${col.name} has 0 entities \u2014 dropping`);
+      }
+    }
+    parsed.collections = parsed.collections.filter((col) => col.totalCount > 0);
+    if (parsed.collections.length === 0 && parsed.identities.length === 0) {
+      warn("Blueprint extracted no valid collections or identities");
+      return null;
     }
     return parsed;
   } catch (err) {
@@ -9356,7 +9670,13 @@ function buildSeedFromBlueprint(blueprint, baseSeed) {
   for (const identity of blueprint.identities) {
     processIdentity(identity, seed, warnings);
   }
+  const baseCollections = new Set(Object.keys(baseSeed));
   for (const spec of blueprint.collections) {
+    if (!baseCollections.has(spec.name) && !seed[spec.name]) {
+      warnings.push(`Blueprint references unknown collection "${spec.name}" \u2014 skipping`);
+      warn(`Blueprint references unknown collection "${spec.name}" for ${blueprint.twin} twin \u2014 skipping`);
+      continue;
+    }
     processCollection(spec, seed, blueprint.twin, existingLabels, warnings, now);
   }
   return { seed, warnings };
@@ -9612,9 +9932,16 @@ function buildSlackEntity(collection, id, props, seed, index, temporal, contentH
     }
     case "messages": {
       const channels = seed["channels"] ?? [];
-      const channelId = channels.length > 0 ? String(channels[index % channels.length]["channel_id"] ?? "C0001AAAA") : "C0001AAAA";
+      const targetChannel = channels.length > 0 ? channels[index % channels.length] : null;
+      const channelId = targetChannel ? String(targetChannel["channel_id"] ?? "C0001AAAA") : "C0001AAAA";
+      const channelMembers = targetChannel ? targetChannel["members"] ?? [] : [];
       const users = seed["users"] ?? [];
-      const userId = users.length > 0 ? String(users[index % users.length]["user_id"] ?? "U0001AAAA") : "U0001AAAA";
+      let userId;
+      if (channelMembers.length > 0) {
+        userId = channelMembers[index % channelMembers.length];
+      } else {
+        userId = users.length > 0 ? String(users[index % users.length]["user_id"] ?? "U0001AAAA") : "U0001AAAA";
+      }
       const baseTs = Math.floor(new Date(temporal.createdAt).getTime() / 1e3);
       const ts = generateSlackTs(baseTs, index);
       return {
@@ -10303,9 +10630,19 @@ function extractHybridPatch(obj) {
   }
   return null;
 }
-function buildSeedCacheContext(twinName, intent, context) {
+function hashText(text) {
+  return createHash4("sha256").update(text).digest("hex").slice(0, 16);
+}
+function buildSeedCacheContext(twinName, config, intent, context) {
   return {
     twinName,
+    generator: {
+      model: config.model,
+      providerMode: config.providerMode ?? "direct",
+      baseUrl: config.baseUrl ?? null,
+      systemPromptHash: hashText(SYSTEM_PROMPT2),
+      promptTemplateVersion: 2
+    },
     intent: intent ?? null,
     scenario: context ?? null
   };
@@ -10660,10 +10997,13 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
     finalSeed = autoFillMissingFKs(finalSeed, twinName);
     const relValidation = validateSeedRelationships(finalSeed, twinName);
     if (!relValidation.valid) {
-      warn("Blueprint seed failed relationship validation", {
-        errors: relValidation.errors.slice(0, 5).join("; ")
-      });
-      return null;
+      finalSeed = autoFillMissingFKs(finalSeed, twinName);
+      const secondValidation = validateSeedRelationships(finalSeed, twinName);
+      if (!secondValidation.valid) {
+        warn("Blueprint seed has unresolved FK references (continuing anyway)", {
+          errors: secondValidation.errors.slice(0, 5).join("; ")
+        });
+      }
     }
     if (intent) {
       const coverage = validateSeedCoverage(intent, finalSeed);
@@ -10678,9 +11018,16 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
     flatForVerify[twinName] = finalSeed;
     const countMismatches = verifySeedCounts(setupDescription, flatForVerify);
     if (countMismatches.length > 0) {
-      debug("Blueprint seed has count mismatches (acceptable)", {
-        mismatches: countMismatches.map((m) => `${m.subject}: ${m.expected} vs ${m.actual}`).join("; ")
-      });
+      const trimmed = trimSeedToExpectedCounts(finalSeed, countMismatches);
+      if (trimmed > 0) {
+        debug(`Blueprint seed: trimmed ${trimmed} excess entities to match setup counts`);
+      }
+      const remaining = countMismatches.filter((m) => m.actual > m.expected && !m.collectionKey);
+      if (remaining.length > 0) {
+        debug("Blueprint seed has unresolvable count mismatches", {
+          mismatches: remaining.map((m) => `${m.subject}: ${m.expected} vs ${m.actual}`).join("; ")
+        });
+      }
     }
     const syntheticPatch = {
       add: {}
@@ -10710,7 +11057,7 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
 async function generateDynamicSeed(twinName, baseSeedName, baseSeedData, setupDescription, config, intent, context) {
   const cacheScope = {
     baseSeedData,
-    cacheContext: buildSeedCacheContext(twinName, intent, context)
+    cacheContext: buildSeedCacheContext(twinName, config, intent, context)
   };
   if (!config.noCache) {
     const cached = getCachedSeed(twinName, baseSeedName, setupDescription, cacheScope);
@@ -10741,7 +11088,7 @@ async function generateDynamicSeed(twinName, baseSeedName, baseSeedData, setupDe
   if (blueprintResult) {
     info("Dynamic seed generated via blueprint", { twin: twinName });
     if (!config.noCache) {
-      const cacheContext = buildSeedCacheContext(twinName, intent, context);
+      const cacheContext = buildSeedCacheContext(twinName, config, intent, context);
       cacheSeed(twinName, baseSeedName, setupDescription, blueprintResult.seed, blueprintResult.patch, {
         baseSeedData,
         cacheContext
@@ -10787,7 +11134,7 @@ Fix these issues:
         validationAttempt: String(validationAttempts + 1)
       });
       const provider = detectProvider(config.model);
-      const apiKey = resolveProviderApiKey(config.apiKey, provider);
+      const apiKey = effectiveMode === "archal" ? "" : resolveProviderApiKey(config.apiKey, provider);
       const responseText = await callLlm({
         provider,
         model: config.model,
@@ -10796,7 +11143,7 @@ Fix these issues:
         userPrompt: promptWithFeedback,
         maxTokens: 16384,
         baseUrl: config.baseUrl,
-        providerMode: config.providerMode,
+        providerMode: effectiveMode,
         intent: "seed-generate",
         responseFormat: "json"
       });
@@ -10872,14 +11219,19 @@ Fix these issues:
       const relationshipValidation = validateSeedRelationships(mergedSeed, twinName);
       if (!relationshipValidation.valid) {
         const topErrors = relationshipValidation.errors.slice(0, 10);
-        warn(`Dynamic seed relationship validation failed (attempt ${attempt + 1})`, {
+        if (validationAttempts < MAX_ATTEMPTS - 1) {
+          warn(`Dynamic seed relationship validation failed (attempt ${attempt + 1})`, {
+            errors: topErrors.join("; ")
+          });
+          lastErrors = topErrors;
+          patch = null;
+          mergedSeed = null;
+          validationAttempts++;
+          continue;
+        }
+        warn(`Dynamic seed has unresolved FK references (accepting on final attempt)`, {
           errors: topErrors.join("; ")
         });
-        lastErrors = topErrors;
-        patch = null;
-        mergedSeed = null;
-        validationAttempts++;
-        continue;
       }
       if (intent) {
         debug("Seed intent coverage summary", {
@@ -10938,6 +11290,15 @@ Fix these issues:
   }
   mergedSeed = autoFillMissingFKs(mergedSeed, twinName);
   mergedSeed = ensureSlackScenarioChannelAccess(mergedSeed, intent);
+  if (setupDescription) {
+    const flatForTrim = {};
+    flatForTrim[twinName] = mergedSeed;
+    const finalMismatches = verifySeedCounts(setupDescription, flatForTrim);
+    const trimmed = trimSeedToExpectedCounts(mergedSeed, finalMismatches);
+    if (trimmed > 0) {
+      debug(`Trimmed ${trimmed} excess seed entities to match setup counts`);
+    }
+  }
   if (!config.noCache) {
     cacheSeed(twinName, baseSeedName, setupDescription, mergedSeed, patch, cacheScope);
   }
@@ -11085,10 +11446,23 @@ function githubIntent(setup) {
     entities.push({ kind: "repo", key: "fullName", value: fullName });
   }
   if (!primaryRepoSet) {
-    const orgMatch = setup.match(/\bgithub\s+(?:organization|org)\s+"([a-z][a-z0-9._-]*)"/i);
+    const orgMatch = setup.match(
+      /\b(?:github\s+)?(?:organization|org)\s+(?:named\s+)?["']?([a-z][a-z0-9._-]*)["']?/i
+    );
     if (orgMatch?.[1]) {
-      extractedSlots["repo.owner"] = orgMatch[1];
-      entities.push({ kind: "repo", key: "owner", value: orgMatch[1] });
+      extractedSlots["repo.owner"] = orgMatch[1].toLowerCase();
+      entities.push({ kind: "repo", key: "owner", value: orgMatch[1].toLowerCase() });
+      const repoName = setup.match(/\b(?:repository|repo)\s+(?:named\s+)?["']?([a-z][a-z0-9._-]{1,99})["']?/i)?.[1];
+      if (repoName) {
+        const normalizedName = repoName.toLowerCase();
+        extractedSlots["repo.name"] = normalizedName;
+        entities.push({ kind: "repo", key: "name", value: normalizedName });
+        entities.push({
+          kind: "repo",
+          key: "fullName",
+          value: `${String(extractedSlots["repo.owner"])}/${normalizedName}`
+        });
+      }
     } else {
       missingSlots.push({
         slot: "repo.owner/repo.name",
@@ -11279,6 +11653,18 @@ function stripeIntent(setup) {
       });
     }
   }
+  const idRegex = /\b((?:acct|cus|prod|price|pi|ch|re|in|sub|dp|pm|payout|tr|tok|evt)_[a-zA-Z0-9]+)\b/g;
+  const seenIds = /* @__PURE__ */ new Set();
+  let idMatch;
+  while ((idMatch = idRegex.exec(setup)) !== null) {
+    const id = idMatch[1];
+    if (seenIds.has(id)) continue;
+    seenIds.add(id);
+    entities.push({ kind: "stripe_entity", key: "id", value: id });
+    if (!extractedSlots["stripe.primary_id"]) {
+      extractedSlots["stripe.primary_id"] = id;
+    }
+  }
   if (missingSlots.length > 0) {
     return { intent: null, missingSlots };
   }
@@ -11372,6 +11758,30 @@ function jiraIntent(setup) {
     }
     entities.push({ kind: "ticket", key: "key", value: key });
   }
+  const seenProjects = /* @__PURE__ */ new Set();
+  const addProject = (projectKey) => {
+    const normalized = projectKey.toUpperCase();
+    if (!/^[A-Z][A-Z0-9]{1,9}$/.test(normalized)) return;
+    if (seenProjects.has(normalized)) return;
+    seenProjects.add(normalized);
+    entities.push({ kind: "project", key: "key", value: normalized });
+    if (!extractedSlots["project.key"]) {
+      extractedSlots["project.key"] = normalized;
+    }
+  };
+  for (const key of seenKeys) {
+    addProject(key.split("-", 1)[0] ?? "");
+  }
+  const projectRegexes = [
+    /\b(?:jira\s+)?project\s+(?:key\s*)?[:=]?\s*["']?([A-Z][A-Z0-9]{1,9})["']?/gi,
+    /\bproject\s+["'][^"'\n]+["']\s*\(\s*([A-Z][A-Z0-9]{1,9})\s*\)/gi
+  ];
+  for (const regex of projectRegexes) {
+    let projectMatch;
+    while ((projectMatch = regex.exec(setup)) !== null) {
+      addProject(projectMatch[1] ?? "");
+    }
+  }
   return {
     intent: {
       twinName: "jira",
@@ -11386,6 +11796,7 @@ function jiraIntent(setup) {
 }
 function supabaseIntent(setup) {
   const extractedSlots = {};
+  const entities = [];
   const missingSlots = [];
   const requiredSlots = ["database.target"];
   const seenTables = /* @__PURE__ */ new Set();
@@ -11418,6 +11829,9 @@ function supabaseIntent(setup) {
   const hasEnvVarTokens = /\b[A-Z][A-Z0-9_]{2,}\b/.test(setup);
   if (seenTables.size > 0 || mentionsProject || mentionsLogsOrService || mentionsEnvVars && hasEnvVarTokens) {
     extractedSlots["database.target"] = true;
+    for (const table2 of seenTables) {
+      entities.push({ kind: "table", key: "name", value: table2 });
+    }
   } else {
     missingSlots.push({
       slot: "database.target",
@@ -11434,10 +11848,7 @@ function supabaseIntent(setup) {
       setupSummary: setupSummary(setup),
       requiredSlots,
       extractedSlots,
-      // Supabase table names in setup can describe conceptual data sources
-      // that are not materialized in the base SQL schema. Keep intent broad
-      // to avoid false-hard failures in seed generation.
-      entities: [],
+      entities,
       quotedStrings: []
     },
     missingSlots: []
@@ -11897,11 +12308,21 @@ function parseSqlSeed(sql) {
 function loadSeedStateFromPath(seedRoot, seedName) {
   const jsonPath = resolve4(seedRoot, `${seedName}.json`);
   if (existsSync10(jsonPath)) {
-    return JSON.parse(readFileSync12(jsonPath, "utf-8"));
+    try {
+      return JSON.parse(readFileSync12(jsonPath, "utf-8"));
+    } catch (err) {
+      const detail = err instanceof Error ? err.message : String(err);
+      throw new Error(`Failed to parse seed file ${jsonPath}: ${detail}`);
+    }
   }
   const sqlPath = resolve4(seedRoot, `${seedName}.sql`);
   if (existsSync10(sqlPath)) {
-    return parseSqlSeed(readFileSync12(sqlPath, "utf-8"));
+    try {
+      return parseSqlSeed(readFileSync12(sqlPath, "utf-8"));
+    } catch (err) {
+      const detail = err instanceof Error ? err.message : String(err);
+      throw new Error(`Failed to parse seed file ${sqlPath}: ${detail}`);
+    }
   }
   return null;
 }
@@ -11951,12 +12372,24 @@ function loadBaseSeedFromDisk(twinName, seedName) {
 }
 function categorizeRunError(message) {
   if (/Failed to spawn|ENOENT/.test(message)) {
-    return `Agent not found: ${message}. Check that your agent command is installed and in PATH.`;
+    return {
+      message: `Agent not found: ${message}. Check that your agent command is installed and in PATH.`,
+      outcome: "failed_agent"
+    };
+  }
+  if (/Dynamic seed generation failed|Missing dynamic seed state|seed generation|seed setup/i.test(message)) {
+    return {
+      message: `Seed generation error: ${message}`,
+      outcome: "inconclusive_seed"
+    };
   }
   if (/HTTP [45]\d\d|ECONNREFUSED|ENOTFOUND|ETIMEDOUT|ECONNRESET|cloud session|fetch failed|AbortError|TimeoutError|operation was aborted|timed?\s*out/i.test(message)) {
-    return `Infrastructure error: ${message}. Check your network or try again.`;
+    return {
+      message: `Infrastructure error: ${message}. Check your network or try again.`,
+      outcome: "inconclusive_infrastructure"
+    };
   }
-  return message;
+  return { message, outcome: "failed_agent" };
 }
 async function executeSingleRun(runIndex, scenario, agentConfig, seedSelections, evaluatorConfig, timeoutSeconds, apiEngine, localEngine, remoteTwinUrlOverrides, apiRouting, cloudTwinUrls, hostedSessionId, apiBearerToken, adminAuth) {
   const startTime = Date.now();
@@ -12094,7 +12527,8 @@ ${baseTaskMessage}` : baseTaskMessage;
         stateDiff: diff,
         agentLog: agentResult.stderr || void 0,
         agentTrace: agentResult.agentTrace,
-        tokenUsage
+        tokenUsage,
+        outcome: "failed_agent"
       };
     }
     if (agentResult.exitCode !== 0 && agentResult.exitCode !== null) {
@@ -12133,11 +12567,14 @@ ${baseTaskMessage}` : baseTaskMessage;
         stateDiff: diff,
         agentLog: agentResult.stderr || void 0,
         agentTrace: agentResult.agentTrace,
-        tokenUsage
+        tokenUsage,
+        outcome: "failed_agent"
       };
     }
     if (trace.length === 0) {
-      warn(`Agent made no tool calls on run ${runIndex + 1}. The agent may have failed to act \u2014 check agent logs and task prompt.`);
+      warn(
+        `Agent made no tool calls on run ${runIndex + 1}. This usually means the model is too weak for this scenario. Try a more capable model (e.g. --engine-model claude-sonnet-4-6 or --engine-model gemini-2.5-pro). If using a custom agent, check that it correctly processes tool schemas and calls tools.`
+      );
     }
     progress(`Evaluating run ${runIndex + 1}...`);
     const evaluationResult = await evaluateRun(
@@ -12163,12 +12600,13 @@ ${baseTaskMessage}` : baseTaskMessage;
       stateDiff: diff,
       agentLog: agentResult.stderr || void 0,
       agentTrace: agentResult.agentTrace,
-      tokenUsage
+      tokenUsage,
+      outcome: "completed"
     };
   } catch (err) {
     const message = err instanceof Error ? err.message : String(err);
     const categorized = categorizeRunError(message);
-    error(`Run ${runIndex + 1} failed: ${categorized}`);
+    error(`Run ${runIndex + 1} failed: ${categorized.message}`);
     const durationMs = Date.now() - startTime;
     return {
       runIndex,
@@ -12176,12 +12614,13 @@ ${baseTaskMessage}` : baseTaskMessage;
         criterionId: c.id,
         status: "fail",
         confidence: 1,
-        explanation: `Run failed: ${categorized}`
+        explanation: `Run failed: ${categorized.message}`
       })),
       overallScore: 0,
       trace: [],
       durationMs,
-      error: categorized,
+      error: categorized.message,
+      outcome: categorized.outcome,
       stateBefore: beforeState,
       stateAfter: beforeState,
       stateDiff: { added: {}, modified: {}, removed: {} }
@@ -12258,9 +12697,20 @@ function preflightCheck(scenario, apiKey, model, baseUrl, evaluatorProvider, see
     }
   }
   if (seedModel) {
+    const mode = seedProviderMode ?? "auto";
+    const provider = detectProvider(seedModel);
+    const resolvedKey = resolveProviderApiKey(apiKey, provider);
     const creds = getCredentials();
     const hasArchalAuth = Boolean(creds?.token);
-    if (!hasArchalAuth) {
+    if (provider === "openai-compatible" && !baseUrl && mode === "direct") {
+      errors.push({
+        check: "seed.baseUrl",
+        message: `Seed model "${seedModel}" requires a base URL for the OpenAI-compatible endpoint`,
+        detail: "Set via: export ARCHAL_EVALUATOR_BASE_URL=<url> or archal config set evaluator.baseUrl <url>",
+        warning: true
+      });
+    }
+    if (mode === "archal" && !hasArchalAuth) {
       errors.push({
         check: "archal-auth-seed",
         message: "Dynamic seed generation requires Archal authentication",
@@ -12268,6 +12718,32 @@ function preflightCheck(scenario, apiKey, model, baseUrl, evaluatorProvider, see
         warning: true
       });
     }
+    if (mode === "direct" && !resolvedKey) {
+      errors.push({
+        check: getProviderEnvVar(provider),
+        message: `Seed provider is "direct" but no API key is configured for ${provider}`,
+        detail: `Set via: export ${getProviderEnvVar(provider)}=<your-key> or archal config set evaluator.apiKey <key>`,
+        warning: true
+      });
+    }
+    if (mode === "auto" && !resolvedKey && !hasArchalAuth) {
+      errors.push({
+        check: getProviderEnvVar(provider),
+        message: 'Dynamic seed generation has no available provider in "auto" mode',
+        detail: `Set ${getProviderEnvVar(provider)} (or evaluator.apiKey) for direct mode, or run archal login for Archal backend mode`,
+        warning: true
+      });
+    }
+    if (resolvedKey && (mode === "direct" || mode === "auto")) {
+      const mismatch = validateKeyForProvider(resolvedKey, provider);
+      if (mismatch) {
+        errors.push({
+          check: "seed-key-provider-mismatch",
+          message: mismatch,
+          warning: true
+        });
+      }
+    }
   }
   return errors;
 }
@@ -12316,6 +12792,35 @@ async function runScenario(options) {
       'cloudTwinUrls is required. Local twin execution has been removed; use "archal run" to provision a hosted session.'
     );
   }
+  const criterionDescriptions = {};
+  const criterionTypes = {};
+  for (const c of scenario.successCriteria) {
+    criterionDescriptions[c.id] = c.description;
+    criterionTypes[c.id] = c.type;
+  }
+  const buildInconclusiveSeedReport = (message) => ({
+    scenarioTitle: scenario.title,
+    satisfactionScore: 0,
+    criterionDescriptions,
+    criterionTypes,
+    twinNames: scenario.config.twins,
+    runs: [{
+      runIndex: 0,
+      evaluations: scenario.successCriteria.map((criterion) => ({
+        criterionId: criterion.id,
+        status: "fail",
+        confidence: 1,
+        explanation: `Run not scored due to seed setup failure: ${message}`
+      })),
+      overallScore: 0,
+      trace: [],
+      durationMs: 0,
+      error: message,
+      outcome: "inconclusive_seed"
+    }],
+    summary: `Inconclusive (seed setup): ${message}`,
+    timestamp: (/* @__PURE__ */ new Date()).toISOString()
+  });
   const preflightErrors = preflightCheck(
     scenario,
     config.apiKey,
@@ -12406,7 +12911,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
             cacheContext: seedPromptContext
           });
         }
-        throw new Error(message);
+        return buildInconclusiveSeedReport(message);
       }
       warn(message);
       generationTargets.push(sel);
@@ -12415,12 +12920,11 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
   if (generationTargets.length > 0) {
     progress("Generating dynamic seeds from setup description...");
     const dynamicConfig = {
-      apiKey: "",
-      // Seed gen always routes through Archal backend
+      apiKey: config.apiKey,
       model: config.seedModel,
       baseUrl: config.baseUrl,
       noCache: options.noSeedCache,
-      providerMode: "archal"
+      providerMode: config.seedProvider
     };
     let cloudSeedSnapshotByTwin = null;
     const adminAuth = options.apiAdminToken ? { token: options.apiAdminToken, userId: options.apiAdminUserId } : void 0;
@@ -12438,20 +12942,28 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
         baseSeedData = normalizeSeedState(cloudSeedSnapshotByTwin[sel.twinName]);
       }
       if (!baseSeedData || Object.keys(baseSeedData).length === 0) {
-        throw new Error(
+        return buildInconclusiveSeedReport(
           `Could not load base seed "${sel.seedName}" for twin "${sel.twinName}" from disk. Ensure the seed file exists at twins/${sel.twinName}/seeds/${sel.seedName}.json or .sql, or that the hosted twin /state endpoint is reachable.`
         );
       }
       progress(`Generating dynamic seed for ${sel.twinName}...`);
-      const result = await generateDynamicSeed(
-        sel.twinName,
-        sel.seedName,
-        baseSeedData,
-        scenario.setup,
-        dynamicConfig,
-        extractedIntentByTwin.get(sel.twinName),
-        seedPromptContext
-      );
+      let result;
+      try {
+        result = await generateDynamicSeed(
+          sel.twinName,
+          sel.seedName,
+          baseSeedData,
+          scenario.setup,
+          dynamicConfig,
+          extractedIntentByTwin.get(sel.twinName),
+          seedPromptContext
+        );
+      } catch (error2) {
+        const detail = error2 instanceof Error ? error2.message : String(error2);
+        return buildInconclusiveSeedReport(
+          `Dynamic seed generation failed for twin "${sel.twinName}": ${detail}`
+        );
+      }
       sel.seedData = result.seed;
       if (result.fromCache) {
         cachedSeedTwins.push(sel.twinName);
@@ -12467,15 +12979,21 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
   }
   const missingDynamicSeeds = seedSelections.filter((sel) => !sel.seedData);
   if (missingDynamicSeeds.length > 0) {
-    throw new Error(
+    return buildInconclusiveSeedReport(
       `Missing dynamic seed state for twin(s): ${missingDynamicSeeds.map((sel) => sel.twinName).join(", ")}`
     );
   }
   for (const sel of seedSelections) {
     const mismatches = verifySeedCounts(scenario.setup, sel.seedData);
     if (mismatches.length === 0) continue;
+    const significantMismatches = mismatches.filter((m) => {
+      const delta = Math.abs(m.expected - m.actual);
+      const ratio = m.expected > 0 ? delta / m.expected : delta;
+      return delta > 5 || ratio > 0.5;
+    });
+    if (significantMismatches.length === 0) continue;
     warn(
-      `Seed count mismatch for ${sel.twinName}: ${mismatches.map((m) => `${m.subject}: expected ${m.expected}, got ${m.actual}`).join("; ")}`
+      `Seed count mismatch for ${sel.twinName}: ${significantMismatches.map((m) => `${m.subject}: expected ${m.expected}, got ${m.actual}`).join("; ")}`
     );
   }
   const scenarioDir = dirname2(resolve4(options.scenarioPath));
@@ -12656,8 +13174,8 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
     return {
       scenarioTitle: scenario.title,
       satisfactionScore: 100,
-      criterionDescriptions: {},
-      criterionTypes: {},
+      criterionDescriptions,
+      criterionTypes,
       twinNames: scenario.config.twins,
       runs: [],
       summary: "Preflight checks passed",
@@ -12667,7 +13185,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
   printHeader(scenario.title, seedSelections);
   const evaluatorProvider = detectProvider(model);
   const configProvider = detectProvider(config.model);
-  const evaluatorApiKey = options.model && evaluatorProvider !== configProvider ? resolveProviderApiKey("", evaluatorProvider) : resolveProviderApiKey(config.apiKey, evaluatorProvider);
+  const evaluatorApiKey = config.evaluatorProvider === "archal" ? "" : options.model && evaluatorProvider !== configProvider ? resolveProviderApiKey("", evaluatorProvider) : resolveProviderApiKey(config.apiKey, evaluatorProvider);
   const evaluatorConfig = {
     apiKey: evaluatorApiKey,
     model,
@@ -12696,8 +13214,8 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
       adminAuth
     );
     runs.push(result);
-    printRunProgress(i, numRuns, result.overallScore, result.error);
-    if (result.error) {
+    printRunProgress(i, numRuns, result.overallScore, result.error, result.outcome);
+    if (result.outcome === "inconclusive_infrastructure" || result.outcome === "inconclusive_seed") {
       consecutiveInfraErrors++;
       if (consecutiveInfraErrors >= EARLY_ABORT_THRESHOLD && i < numRuns - 1) {
         warn(`${consecutiveInfraErrors} consecutive run errors \u2014 aborting remaining ${numRuns - i - 1} run(s) to avoid wasting quota.`);
@@ -12707,19 +13225,17 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
       consecutiveInfraErrors = 0;
     }
   }
-  const runScores = runs.map((r) => r.overallScore);
+  const scoredRuns = runs.filter(
+    (run) => run.outcome !== "inconclusive_infrastructure" && run.outcome !== "inconclusive_seed"
+  );
+  const runScores = scoredRuns.map((r) => r.overallScore);
   const satisfactionScore = aggregateSatisfaction(runScores);
-  const allEvaluations = runs.map((r) => r.evaluations);
-  const summary = generateSummary(allEvaluations, satisfactionScore);
-  const criterionDescriptions = {};
-  const criterionTypes = {};
-  for (const c of scenario.successCriteria) {
-    criterionDescriptions[c.id] = c.description;
-    criterionTypes[c.id] = c.type;
-  }
+  const allEvaluations = scoredRuns.map((r) => r.evaluations);
+  const inconclusiveRuns = runs.length - scoredRuns.length;
+  const summary = scoredRuns.length > 0 ? generateSummary(allEvaluations, satisfactionScore) : `Inconclusive: no scored runs (${inconclusiveRuns} infrastructure/seed setup run failure${inconclusiveRuns === 1 ? "" : "s"}).`;
   let failureAnalysis;
-  if (satisfactionScore < 100 && runs.length > 0 && !options.noFailureAnalysis) {
-    const representativeRun = runs.reduce(
+  if (satisfactionScore < 100 && scoredRuns.length > 0 && !options.noFailureAnalysis) {
+    const representativeRun = scoredRuns.reduce(
       (worst, r) => r.overallScore < worst.overallScore ? r : worst
     );
     const failedCriteria = representativeRun.evaluations.filter((e) => e.status !== "pass").map((e) => ({
@@ -12742,7 +13258,9 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
           stateDiff: representativeRun.stateDiff ?? { added: {}, modified: {}, removed: {} },
           stateBefore: representativeRun.stateBefore ?? {},
           stateAfter: representativeRun.stateAfter ?? {},
-          satisfactionScore
+          satisfactionScore,
+          agentLog: representativeRun.agentLog,
+          agentError: representativeRun.error
         },
         evaluatorConfig
       );
@@ -13521,7 +14039,21 @@ function createRunCommand() {
         }
       }
       if (!process.env["ARCHAL_ENGINE_API_KEY"] && userConfig.engineApiKey) {
-        process.env["ARCHAL_ENGINE_API_KEY"] = userConfig.engineApiKey;
+        const configKey = userConfig.engineApiKey;
+        const requestedModel = firstNonEmpty(
+          opts.engineModel,
+          process.env["ARCHAL_ENGINE_MODEL"],
+          opts.model
+          // -m also defaults the engine model for local harnesses
+        );
+        if (requestedModel) {
+          const modelProvider = detectProvider(requestedModel);
+          if (!validateKeyForProvider(configKey, modelProvider)) {
+            process.env["ARCHAL_ENGINE_API_KEY"] = configKey;
+          }
+        } else {
+          process.env["ARCHAL_ENGINE_API_KEY"] = configKey;
+        }
       }
     }
     inferEngineModelFromEvaluatorModel(opts);
@@ -13572,8 +14104,17 @@ function createRunCommand() {
       }
     }
     if (engine.mode === "local" && !process.env["ARCHAL_ENGINE_API_KEY"]) {
+      const requestedModel = firstNonEmpty(
+        opts.engineModel,
+        process.env["ARCHAL_ENGINE_MODEL"]
+      );
+      const provider = requestedModel ? detectProvider(requestedModel) : null;
+      const providerHint = provider ? `
+  Hint: You requested model "${requestedModel}" (${provider}) but no ${provider} API key is available.
+  Set ${getProviderEnvVar(provider)} or pass --engine-key <${provider}-key>
+` : "";
       process.stderr.write(
-        "Error: No API key found. The agent harness needs an API key to call the model.\nSet one of:\n  GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY env var\n  archal config set engine.apiKey <key>\n  ARCHAL_ENGINE_API_KEY env var\n"
+        "Error: No API key found. The agent harness needs an API key to call the model.\nSet one of:\n  GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY env var\n  archal config set engine.apiKey <key>\n  ARCHAL_ENGINE_API_KEY env var\n" + providerHint
       );
       process.exit(2);
     }
@@ -13643,12 +14184,14 @@ function createRunCommand() {
           })();
           const SESSION_READY_TIMEOUT_MS = Math.max(12e4, configuredReadyTimeoutMs);
           const SESSION_POLL_INTERVAL_MS = 2e3;
-          const STATUS_READY_GRACE_MS = 5e3;
           const readyDeadline = Date.now() + SESSION_READY_TIMEOUT_MS;
           let sessionReady = false;
           let lastPollIssue;
-          let statusReadySinceMs = null;
           const isRetryablePollFailure = (result) => result.offline || typeof result.status === "number" && result.status >= 500;
+          const workersAllReady = (workers) => {
+            if (!workers || Object.keys(workers).length === 0) return true;
+            return Object.values(workers).every((value) => value === "ready");
+          };
           const sleepForPollInterval = async () => new Promise((resolve12) => setTimeout(resolve12, SESSION_POLL_INTERVAL_MS));
           if (!opts.quiet) process.stderr.write("Starting cloud session...\n");
           let pollCount = 0;
@@ -13703,26 +14246,19 @@ function createRunCommand() {
             }
             const healthAlive = healthResult.ok && healthResult.data.alive;
             const statusAlive = statusResult.data.alive || status === "ready";
-            if (statusAlive && healthAlive) {
+            const statusWorkersReady = workersAllReady(
+              statusResult.data.twins ?? statusResult.data.workers
+            );
+            const healthWorkersReady = workersAllReady(healthResult.data.twins);
+            if (statusAlive && healthAlive && statusWorkersReady && healthWorkersReady) {
               sessionReady = true;
               break;
             }
-            if (statusAlive && !healthAlive) {
-              if (statusReadySinceMs === null) {
-                statusReadySinceMs = Date.now();
-              }
-              const readyForMs = Date.now() - statusReadySinceMs;
-              if (readyForMs >= STATUS_READY_GRACE_MS) {
-                debug(
-                  `Session ${backendSessionId} proceeded after health endpoint warmup (${readyForMs}ms).`
-                );
-                sessionReady = true;
-                break;
-              }
-            } else {
-              statusReadySinceMs = null;
-            }
-            lastPollIssue = `session still starting (status=${status}, health=${healthAlive ? "alive" : "starting"})`;
+            const statusTwinStates = Object.entries(
+              statusResult.data.twins ?? statusResult.data.workers ?? {}
+            ).map(([twin, twinStatus]) => `${twin}:${twinStatus}`).join(", ");
+            const healthTwinStates = Object.entries(healthResult.data.twins ?? {}).map(([twin, twinStatus]) => `${twin}:${twinStatus}`).join(", ");
+            lastPollIssue = `session still starting (status=${status}, health=${healthAlive ? "alive" : "starting"}, statusTwins=[${statusTwinStates || "n/a"}], healthTwins=[${healthTwinStates || "n/a"}])`;
             await sleepForPollInterval();
           }
           if (sessionReady) {
@@ -14123,6 +14659,7 @@ function buildEvidenceArtifacts(report) {
     overallScore: run.overallScore,
     durationMs: run.durationMs,
     error: run.error ?? null,
+    outcome: run.outcome ?? null,
     evaluations: (run.evaluations ?? []).map((ev) => ({
       criterionId: ev.criterionId,
       status: ev.status,
@@ -14442,7 +14979,7 @@ import { createInterface as createInterface2 } from "readline";
 import { Command as Command5 } from "commander";
 // src/telemetry/anonymizer.ts
-import { createHash as createHash4 } from "crypto";
+import { createHash as createHash5 } from "crypto";
 var API_KEY_PATTERNS = [
   /(?:api[_-]?key|token|secret|password|authorization|bearer|credential)\s*[:=]\s*["']?([a-zA-Z0-9_\-/.+=]{16,})["']?/gi,
   /sk-[a-zA-Z0-9]{20,}/g,
@@ -14492,7 +15029,7 @@ var USERNAME_FIELDS = /* @__PURE__ */ new Set([
   "maintainer"
 ]);
 function hashValue2(value, salt = "archal") {
-  return `anon_${createHash4("sha256").update(`${salt}:${value}`).digest("hex").slice(0, 12)}`;
+  return `anon_${createHash5("sha256").update(`${salt}:${value}`).digest("hex").slice(0, 12)}`;
 }
 function anonymizeForEnterprise(entries) {
   debug("Enterprise anonymization", { entryCount: String(entries.length) });
@@ -15468,7 +16005,7 @@ function createDoctorCommand() {
 // src/commands/login.ts
 import { Command as Command8 } from "commander";
 import { exec } from "child_process";
-import { createHash as createHash5, randomBytes as randomBytes2 } from "crypto";
+import { createHash as createHash6, randomBytes as randomBytes2 } from "crypto";
 import { createServer } from "http";
 var START_PORT = 51423;
 var LOGIN_TIMEOUT_MS = 5 * 60 * 1e3;
@@ -15489,7 +16026,7 @@ function openBrowser(url) {
 }
 function createPkcePair() {
   const codeVerifier = randomBytes2(32).toString("base64url");
-  const codeChallenge = createHash5("sha256").update(codeVerifier).digest("base64url");
+  const codeChallenge = createHash6("sha256").update(codeVerifier).digest("base64url");
   return { codeVerifier, codeChallenge };
 }
 function isPlan2(value) {
@@ -16219,11 +16756,25 @@ function detectProviderName(model) {
   if (normalized.startsWith("gpt-") || normalized.startsWith("o1-") || normalized.startsWith("o3-") || normalized.startsWith("o4-")) return "OpenAI";
   return "OpenAI-compatible";
 }
-function resolveEngineApiKey(explicitKey) {
+function resolveEngineApiKey(explicitKey, model) {
   if (explicitKey?.trim()) return explicitKey.trim();
   if (process.env["ARCHAL_ENGINE_API_KEY"]?.trim()) return process.env["ARCHAL_ENGINE_API_KEY"].trim();
+  const modelProvider = model ? detectProvider(model) : null;
   const config = loadConfig();
-  if (config.engineApiKey) return config.engineApiKey;
+  if (config.engineApiKey) {
+    if (!modelProvider || !validateKeyForProvider(config.engineApiKey, modelProvider)) {
+      return config.engineApiKey;
+    }
+  }
+  const providerEnvVars = {
+    gemini: "GEMINI_API_KEY",
+    openai: "OPENAI_API_KEY",
+    anthropic: "ANTHROPIC_API_KEY"
+  };
+  if (modelProvider && providerEnvVars[modelProvider]) {
+    const val = process.env[providerEnvVars[modelProvider]]?.trim();
+    if (val) return val;
+  }
   for (const envVar of ["GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]) {
     const val = process.env[envVar]?.trim();
     if (val) return val;
@@ -16272,7 +16823,7 @@ function createDemoCommand() {
       process.exit(1);
     }
     const providerName = detectProviderName(opts.model);
-    const engineApiKey = resolveEngineApiKey(opts.apiKey);
+    const engineApiKey = resolveEngineApiKey(opts.apiKey, opts.model);
     if (!engineApiKey) {
       process.stderr.write(
         `Error: No API key found for model "${opts.model}" (${providerName}).