@archal/cli 0.7.10 → 0.7.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1114,6 +1114,8 @@ var HTTP_RETRYABLE_STATUS_CODES = /* @__PURE__ */ new Set([408, 425, 429, 500, 5
1114
1114
  var HTTP_PUSH_TIMEOUT_MS = 2e4;
1115
1115
  var HTTP_PUSH_MAX_RETRIES = 6;
1116
1116
  var HTTP_PUSH_BACKOFF_MS = [1e3, 2e3, 3e3, 5e3, 5e3, 5e3];
1117
+ var HTTP_PUSH_WARMUP_RETRIES = 6;
1118
+ var HTTP_PUSH_WARMUP_BACKOFF_MS = [1500, 2500, 3500, 5e3, 6e3, 7e3];
1117
1119
  function resolveRetryDelay(backoffMs, attempt, fallbackMs) {
1118
1120
  const indexed = backoffMs[attempt];
1119
1121
  if (typeof indexed === "number" && Number.isFinite(indexed) && indexed >= 0) {
@@ -1164,6 +1166,10 @@ async function fetchWithRetry(url, options, retryOptions) {
1164
1166
  function twinBasePath(url) {
1165
1167
  return url.replace(/\/(mcp|api)\/?$/, "");
1166
1168
  }
1169
/**
 * Decide whether a failed response matches the "twin worker not ready yet"
 * signature.
 *
 * Only HTTP 503 qualifies, and only when the body contains one of the phrases
 * in the pattern below (matched case-insensitively).
 *
 * @param {number} status - HTTP status code of the response.
 * @param {string} body - Response body text.
 * @returns {boolean} True when both the status and the body match.
 */
function isTwinWorkerWarmupResponse(status, body) {
  if (status !== 503) return false;
  return /twin worker endpoint not available|session is busy|retry shortly/i.test(body);
}
1167
1173
  async function collectStateFromHttp(twinUrls, bearerToken, adminAuth) {
1168
1174
  const state = {};
1169
1175
  const failures = [];
@@ -1208,25 +1214,44 @@ async function pushStateToCloud(twinUrls, seedSelections, bearerToken, adminAuth
1208
1214
  }
1209
1215
  const url = `${twinBasePath(baseUrl)}/state`;
1210
1216
  debug(`Pushing dynamic seed to ${sel.twinName}`, { url });
1211
- const response = await fetchWithRetry(
1212
- url,
1213
- {
1214
- method: "PUT",
1215
- headers,
1216
- body: JSON.stringify(sel.seedData)
1217
- },
1218
- {
1219
- retries: HTTP_PUSH_MAX_RETRIES,
1220
- timeoutMs: HTTP_PUSH_TIMEOUT_MS,
1221
- backoffMs: HTTP_PUSH_BACKOFF_MS
1217
+ const payload = JSON.stringify(sel.seedData);
1218
+ let pushed = false;
1219
+ for (let warmupAttempt = 0; warmupAttempt <= HTTP_PUSH_WARMUP_RETRIES; warmupAttempt++) {
1220
+ const response = await fetchWithRetry(
1221
+ url,
1222
+ {
1223
+ method: "PUT",
1224
+ headers,
1225
+ body: payload
1226
+ },
1227
+ {
1228
+ retries: HTTP_PUSH_MAX_RETRIES,
1229
+ timeoutMs: HTTP_PUSH_TIMEOUT_MS,
1230
+ backoffMs: HTTP_PUSH_BACKOFF_MS
1231
+ }
1232
+ );
1233
+ if (response.ok) {
1234
+ pushed = true;
1235
+ break;
1222
1236
  }
1223
- );
1224
- if (!response.ok) {
1225
1237
  const text = await response.text().catch(() => "");
1238
+ const isWarmup = isTwinWorkerWarmupResponse(response.status, text);
1239
+ if (isWarmup && warmupAttempt < HTTP_PUSH_WARMUP_RETRIES) {
1240
+ const delay = resolveRetryDelay(HTTP_PUSH_WARMUP_BACKOFF_MS, warmupAttempt, 5e3);
1241
+ warn(
1242
+ `Twin "${sel.twinName}" not ready for state push (HTTP 503), retrying in ${delay}ms`,
1243
+ { attempt: `${warmupAttempt + 1}/${HTTP_PUSH_WARMUP_RETRIES + 1}` }
1244
+ );
1245
+ await new Promise((resolve12) => setTimeout(resolve12, delay));
1246
+ continue;
1247
+ }
1226
1248
  throw new Error(
1227
1249
  `Failed to push dynamic seed to twin "${sel.twinName}": HTTP ${response.status}${text ? ` (${text})` : ""}`
1228
1250
  );
1229
1251
  }
1252
+ if (!pushed) {
1253
+ throw new Error(`Failed to push dynamic seed to twin "${sel.twinName}": worker warmup did not complete in time`);
1254
+ }
1230
1255
  debug(`Pushed dynamic seed to ${sel.twinName} successfully`);
1231
1256
  }
1232
1257
  }
@@ -3202,6 +3227,47 @@ async function callAnthropic(options) {
3202
3227
  if (!textBlock?.text) throw new Error("Anthropic returned no text content");
3203
3228
  return textBlock.text;
3204
3229
  }
3230
/**
 * Pull the text of the first choice out of an OpenAI-style chat completion
 * payload.
 *
 * Resolution order:
 *   1. `message.content` as a string: returned trimmed, or null when blank
 *      (no fallback to `refusal` in this case).
 *   2. `message.content` as an array: each part contributes the first string
 *      found among the part itself, `part.text`, `part.text.value`,
 *      `part.value`; non-blank trimmed segments are joined with newlines.
 *   3. `message.refusal` when it is a non-blank string.
 *
 * @param {object|null|undefined} data - Parsed JSON response body.
 * @returns {string|null} Extracted text, or null when nothing usable exists.
 */
function extractOpenAiTextContent(data) {
  // `data` may be null (e.g. a literal `null` JSON body); treat it as "no content"
  // instead of throwing a TypeError.
  const message = data?.choices?.[0]?.message;
  if (!message) return null;

  // Trim a candidate and report null for anything that is not a non-blank string.
  const toNonBlank = (candidate) => {
    if (typeof candidate !== "string") return null;
    const trimmed = candidate.trim();
    return trimmed.length > 0 ? trimmed : null;
  };

  // Pick the first string-valued source of a content part. A blank string still
  // "wins" here, so later sources on the same part are never consulted.
  const rawPartText = (part) => {
    if (typeof part === "string") return part;
    if (!part || typeof part !== "object") return null;
    const partText = part.text;
    if (typeof partText === "string") return partText;
    if (partText && typeof partText === "object" && typeof partText.value === "string") {
      return partText.value;
    }
    return typeof part.value === "string" ? part.value : null;
  };

  const { content } = message;
  if (typeof content === "string") {
    return toNonBlank(content);
  }
  if (Array.isArray(content)) {
    const textSegments = [];
    for (const part of content) {
      const segment = toNonBlank(rawPartText(part));
      if (segment !== null) textSegments.push(segment);
    }
    if (textSegments.length > 0) {
      return textSegments.join("\n");
    }
  }
  return toNonBlank(message.refusal);
}
3205
3271
  function usesMaxCompletionTokens(model) {
3206
3272
  return model.startsWith("gpt-5") || model.startsWith("o1-") || model.startsWith("o2-") || model.startsWith("o3-") || model.startsWith("o4-");
3207
3273
  }
@@ -3229,7 +3295,7 @@ async function callOpenAi(options) {
3229
3295
  throw new LlmApiError("OpenAI", response.status, errorText.slice(0, 200));
3230
3296
  }
3231
3297
  const data = await response.json();
3232
- const content = data.choices?.[0]?.message?.content;
3298
+ const content = extractOpenAiTextContent(data);
3233
3299
  if (!content) throw new Error("OpenAI returned no content");
3234
3300
  return content;
3235
3301
  }
@@ -3263,7 +3329,7 @@ async function callOpenAiCompatible(options) {
3263
3329
  throw new LlmApiError(`OpenAI-compatible (${options.baseUrl})`, response.status, errorText.slice(0, 200));
3264
3330
  }
3265
3331
  const data = await response.json();
3266
- const content = data.choices?.[0]?.message?.content;
3332
+ const content = extractOpenAiTextContent(data);
3267
3333
  if (!content) throw new Error("OpenAI-compatible API returned no content");
3268
3334
  return content;
3269
3335
  }
@@ -3288,13 +3354,15 @@ ${CYAN}${BOLD}archal${RESET} ${DIM}|${RESET} ${scenarioTitle}
3288
3354
  `);
3289
3355
  }
3290
3356
  }
3291
- function printRunProgress(runIndex, totalRuns, score, error2) {
3357
+ function printRunProgress(runIndex, totalRuns, score, error2, outcome) {
3292
3358
  const { quiet } = getLoggerOptions();
3293
3359
  if (quiet || activeOutputFormat !== "terminal") return;
3294
3360
  const dots = ".".repeat(Math.max(1, 20 - String(runIndex + 1).length - String(totalRuns).length));
3295
3361
  if (error2) {
3296
3362
  const shortError = error2.length > MAX_ERROR_PREVIEW_CHARS ? error2.slice(0, MAX_ERROR_PREVIEW_CHARS - 1) + "\u2026" : error2;
3297
- process.stderr.write(` run ${runIndex + 1}/${totalRuns} ${DIM}${dots}${RESET} ${RED}ERROR${RESET} ${DIM}(${shortError})${RESET}
3363
+ const inconclusive = outcome === "inconclusive_infrastructure" || outcome === "inconclusive_seed";
3364
+ const label = inconclusive ? `${YELLOW}INCONCLUSIVE${RESET}` : `${RED}ERROR${RESET}`;
3365
+ process.stderr.write(` run ${runIndex + 1}/${totalRuns} ${DIM}${dots}${RESET} ${label} ${DIM}(${shortError})${RESET}
3298
3366
  `);
3299
3367
  return;
3300
3368
  }
@@ -5874,6 +5942,17 @@ function buildFailureAnalysisPrompt(input) {
5874
5942
  );
5875
5943
  sections.push(`## Passed Criteria (${input.passedCriteria.length})`);
5876
5944
  sections.push(input.passedCriteria.map((c) => `- ${sanitizeForPrompt(c.description, 300)}`).join("\n"));
5945
+ if (input.agentError || input.agentLog) {
5946
+ sections.push(`## Agent Execution Context`);
5947
+ if (input.agentError) {
5948
+ sections.push(`Error: ${sanitizeForPrompt(input.agentError, 300)}`);
5949
+ }
5950
+ if (input.agentLog) {
5951
+ const logTail = input.agentLog.length > 800 ? input.agentLog.slice(-800) : input.agentLog;
5952
+ sections.push(`Agent log (tail):
5953
+ ${sanitizeForPrompt(logTail, 800)}`);
5954
+ }
5955
+ }
5877
5956
  sections.push(`## Agent Trace (${input.trace.length} tool calls)`);
5878
5957
  sections.push(
5879
5958
  input.trace.length === 0 ? "(Agent made no tool calls - likely crashed or timed out)" : JSON.stringify(traceFormatted, null, 2)
@@ -6617,7 +6696,7 @@ function resolveTelemetryEndpointFromEnv() {
6617
6696
  if (!fallbackBaseUrl) {
6618
6697
  return null;
6619
6698
  }
6620
- return `${fallbackBaseUrl}/api/traces`;
6699
+ return `${fallbackBaseUrl}/v1/traces`;
6621
6700
  }
6622
6701
  function resolveIngestToken() {
6623
6702
  return process.env["ARCHAL_TELEMETRY_TOKEN"]?.trim();
@@ -6766,8 +6845,26 @@ function isTelemetryEnabled() {
6766
6845
  if (consent !== "pending") return consent === "granted";
6767
6846
  return loadConfig().telemetry;
6768
6847
  }
6769
- function buildStructuredRunError(runIndex, error2) {
6848
+ function buildStructuredRunError(runIndex, error2, outcome) {
6770
6849
  const message = error2.trim();
6850
+ if (outcome === "inconclusive_seed") {
6851
+ return {
6852
+ runIndex,
6853
+ message,
6854
+ category: "seed_setup",
6855
+ code: "SEED_SETUP_ERROR",
6856
+ retryable: true
6857
+ };
6858
+ }
6859
+ if (outcome === "inconclusive_infrastructure") {
6860
+ return {
6861
+ runIndex,
6862
+ message,
6863
+ category: "infrastructure",
6864
+ code: "INFRASTRUCTURE_ERROR",
6865
+ retryable: true
6866
+ };
6867
+ }
6771
6868
  if (message.startsWith("Agent not found:")) {
6772
6869
  return {
6773
6870
  runIndex,
@@ -7009,7 +7106,7 @@ function buildMetadata(report, totalEntries) {
7009
7106
  },
7010
7107
  agentInternals: {
7011
7108
  runDurationsMs: report.runs.map((run) => run.durationMs),
7012
- runErrors: report.runs.filter((run) => typeof run.error === "string" && run.error.length > 0).map((run) => buildStructuredRunError(run.runIndex, run.error)),
7109
+ runErrors: report.runs.filter((run) => typeof run.error === "string" && run.error.length > 0).map((run) => buildStructuredRunError(run.runIndex, run.error, run.outcome)),
7013
7110
  evaluationCounts: { pass: passCount, partial: partialCount, fail: failCount },
7014
7111
  runSummaries: report.runs.map((run) => ({
7015
7112
  runIndex: run.runIndex,
@@ -7184,6 +7281,7 @@ async function uploadIfEnabled(traceId, report) {
7184
7281
  }
7185
7282
 
7186
7283
  // src/runner/dynamic-seed-generator.ts
7284
+ import { createHash as createHash4 } from "crypto";
7187
7285
  import { z as z4 } from "zod";
7188
7286
 
7189
7287
  // src/runner/seed-schemas/seed-schema-inference.ts
@@ -8551,14 +8649,17 @@ function autoFillMissingFKs(seed, twinName) {
8551
8649
  if (!sourceEntities || !targetEntities || targetEntities.length === 0) continue;
8552
8650
  const targetValues = targetEntities.map((e) => e[rule.targetField]).filter((v) => v !== void 0 && v !== null);
8553
8651
  if (targetValues.length === 0) continue;
8652
+ const validTargetSet = new Set(targetValues.map(String));
8554
8653
  let fillIndex = 0;
8555
8654
  for (const entity of sourceEntities) {
8556
8655
  const e = entity;
8557
- if (e[rule.sourceField] === void 0 || e[rule.sourceField] === null) {
8656
+ const currentValue = e[rule.sourceField];
8657
+ const needsFill = currentValue === void 0 || currentValue === null || !validTargetSet.has(String(currentValue));
8658
+ if (needsFill) {
8558
8659
  const fillValue = targetValues[fillIndex % targetValues.length];
8559
8660
  fillIndex++;
8560
8661
  debug(
8561
- `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})`
8662
+ `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})` + (currentValue != null ? ` (was ${String(currentValue)} \u2014 not in targets)` : "")
8562
8663
  );
8563
8664
  e[rule.sourceField] = fillValue;
8564
8665
  }
@@ -8652,6 +8753,7 @@ var KIND_COLLECTION_HINTS = {
8652
8753
  channel: ["channels"],
8653
8754
  user: ["users"],
8654
8755
  ticket: ["issues"],
8756
+ project: ["projects"],
8655
8757
  table: ["tables"],
8656
8758
  site: ["sites", "domains"],
8657
8759
  file: ["files"],
@@ -8661,6 +8763,9 @@ var KIND_COLLECTION_HINTS = {
8661
8763
  var ENTITY_KEY_ALIASES = {
8662
8764
  "repo.owner": ["ownerLogin", "owner_login", "login", "owner.login", "owner.name"],
8663
8765
  "issue.key": ["identifier"],
8766
+ "project.key": ["key", "projectKey"],
8767
+ "ticket.key": ["identifier", "key"],
8768
+ "stripe_entity.id": ["id", "charge", "chargeId", "paymentIntentId", "invoiceId", "customerId", "disputeId"],
8664
8769
  "email.address": ["email", "from", "to", "cc", "bcc"],
8665
8770
  "file.name": ["title", "fileName", "filename", "subject", "summary"]
8666
8771
  };
@@ -8816,10 +8921,28 @@ function validateSeedCoverage(intent, mergedSeed) {
8816
8921
  const entityIssues = [];
8817
8922
  const quoteErrors = [];
8818
8923
  const quoteWarnings = [];
8819
- const CORE_ENTITY_KEYS = /* @__PURE__ */ new Set(["owner", "name", "fullName", "channel_name", "key", "identifier", "number"]);
8924
+ const CORE_ENTITY_KEYS = /* @__PURE__ */ new Set(["owner", "name", "fullName", "channel_name", "key", "identifier", "number", "id"]);
8925
+ const CONTRACT_REQUIRED_KINDS = /* @__PURE__ */ new Set([
8926
+ "repo",
8927
+ "pullRequest",
8928
+ "issue",
8929
+ "channel",
8930
+ "user",
8931
+ "ticket",
8932
+ "project",
8933
+ "table"
8934
+ ]);
8820
8935
  const entityWarnings = [];
8821
8936
  for (const entity of intent.entities) {
8822
8937
  if (typeof entity.value === "boolean") continue;
8938
+ const candidateCollections = toCollectionCandidates(mergedSeed, entity.kind, entity.value);
8939
+ if (CONTRACT_REQUIRED_KINDS.has(entity.kind) && candidateCollections.length === 0) {
8940
+ entityIssues.push({
8941
+ type: "missing_entity",
8942
+ message: `Scenario entity contract mismatch: no collections match ${entity.kind}.${entity.key}=${String(entity.value)}`
8943
+ });
8944
+ continue;
8945
+ }
8823
8946
  if (!valueExistsInCollections(mergedSeed, entity.kind, entity.key, entity.value)) {
8824
8947
  const issue = {
8825
8948
  type: "missing_entity",
@@ -8934,7 +9057,8 @@ var NON_SUBJECT_STARTS = /* @__PURE__ */ new Set([
8934
9057
  "under",
8935
9058
  "after",
8936
9059
  "before",
8937
- "during"
9060
+ "during",
9061
+ "as"
8938
9062
  ]);
8939
9063
  function isReasonableCountSubject(subject, expected) {
8940
9064
  if (expected > MAX_REASONABLE_COUNT) return false;
@@ -8949,40 +9073,92 @@ function appearsToBeClockSuffix(text, numberStart) {
8949
9073
  const prefix = text.slice(Math.max(0, numberStart - 3), numberStart);
8950
9074
  return /^\d{1,2}:$/.test(prefix);
8951
9075
  }
9076
/**
 * Tell whether the text at `matchIndex` is the fractional part of a decimal
 * number, i.e. it is immediately preceded by `<digit>.`.
 *
 * For example, in "2.5 issues" the index of "5" is a decimal fragment.
 *
 * @param {string} text - Full text being scanned.
 * @param {number|undefined} matchIndex - Index where the candidate number starts.
 * @returns {boolean} True only when the two preceding characters are a digit
 *   followed by a dot.
 */
function isDecimalFragment(text, matchIndex) {
  // Two preceding characters are required; this also rejects undefined/NaN indexes.
  if (!(matchIndex >= 2)) return false;
  return text[matchIndex - 1] === "." && /\d/.test(text[matchIndex - 2]);
}
9084
/**
 * Resolve a textual subject (e.g. "open issues") to an array in the flattened
 * state and, when possible, the key it was found under.
 *
 * Candidates from `buildSubjectCandidates2` are tried in order. A candidate
 * matches an array-valued key when, after stripping whitespace and
 * lower-casing both, the key equals the candidate or the candidate plus "s".
 * If no key matches, `resolveSubjectInState` is used as a fallback and the
 * returned key is the empty string.
 *
 * @param {string} subject - Subject phrase.
 * @param {object} flat - Flattened state: key -> value.
 * @returns {{items: Array, key: string}|null} The resolved items and their key
 *   ("" when resolved via the fallback), or null when nothing resolves.
 */
function resolveSubjectWithKey(subject, flat) {
  const squash = (text) => text.replace(/\s+/g, "").toLowerCase();

  // Normalise each array-valued key once instead of once per candidate.
  const arrayEntries = [];
  for (const [key, value] of Object.entries(flat)) {
    if (Array.isArray(value)) {
      arrayEntries.push({ key, items: value, normalizedKey: squash(key) });
    }
  }

  for (const candidate of buildSubjectCandidates2(subject)) {
    const normalized = squash(candidate);
    const hit = arrayEntries.find(
      (entry) => entry.normalizedKey === normalized || entry.normalizedKey === normalized + "s"
    );
    if (hit) {
      return { items: hit.items, key: hit.key };
    }
  }

  const items = resolveSubjectInState(subject, flat);
  return items ? { items, key: "" } : null;
}
9098
/**
 * Build the list of lookup candidates for a subject phrase.
 *
 * The list always starts with the subject itself, followed by a
 * singular/plural variant: the trailing "s" is dropped when the subject ends
 * in "s" and is longer than three characters, otherwise an "s" is appended.
 * For multi-word subjects the first and last words are appended as well.
 * Duplicates are not removed.
 *
 * @param {string} subject - Subject phrase.
 * @returns {string[]} Candidates in the order they should be tried.
 */
function buildSubjectCandidates2(subject) {
  const looksPlural = subject.endsWith("s") && subject.length > 3;
  const candidates = [subject, looksPlural ? subject.slice(0, -1) : subject + "s"];
  const words = subject.split(/\s+/);
  if (words.length > 1) {
    candidates.push(words[0], words[words.length - 1]);
  }
  return candidates;
}
8952
9112
  function verifySeedCounts(setupText, seedState) {
8953
9113
  const mismatches = [];
8954
9114
  const flat = flattenTwinState(seedState);
8955
9115
  const countPattern = /\b(\d+)\s+([\w\s]+?)(?:\s+(?:that|which|are|with|in|labeled|assigned)\b)/gi;
8956
9116
  for (const match of setupText.matchAll(countPattern)) {
9117
+ if (isDecimalFragment(setupText, match.index)) continue;
8957
9118
  const expected = parseInt(match[1], 10);
8958
9119
  const subject = match[2].trim();
8959
9120
  if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
8960
9121
  if (!subject || expected <= 0) continue;
8961
9122
  if (!isReasonableCountSubject(subject, expected)) continue;
8962
- const resolved = resolveSubjectInState(subject, flat);
8963
- if (resolved && resolved.length !== expected) {
8964
- mismatches.push({ subject, expected, actual: resolved.length });
9123
+ const resolved = resolveSubjectWithKey(subject, flat);
9124
+ if (resolved && resolved.items.length !== expected) {
9125
+ mismatches.push({ subject, expected, actual: resolved.items.length, collectionKey: resolved.key || void 0 });
8965
9126
  }
8966
9127
  }
8967
9128
  const simplePattern = /\b(\d+)\s+([\w\s]+?)(?:[.,;:)]|$)/gm;
8968
9129
  const seenSubjects = new Set(mismatches.map((m) => m.subject.toLowerCase()));
8969
9130
  for (const match of setupText.matchAll(simplePattern)) {
9131
+ if (isDecimalFragment(setupText, match.index)) continue;
8970
9132
  const expected = parseInt(match[1], 10);
8971
9133
  const subject = match[2].trim();
8972
9134
  if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
8973
9135
  if (!subject || expected <= 0 || seenSubjects.has(subject.toLowerCase())) continue;
8974
9136
  if (!isReasonableCountSubject(subject, expected)) continue;
8975
- const resolved = resolveSubjectInState(subject, flat);
8976
- if (resolved && resolved.length !== expected) {
8977
- mismatches.push({ subject, expected, actual: resolved.length });
9137
+ const resolved = resolveSubjectWithKey(subject, flat);
9138
+ if (resolved && resolved.items.length !== expected) {
9139
+ mismatches.push({ subject, expected, actual: resolved.items.length, collectionKey: resolved.key || void 0 });
8978
9140
  seenSubjects.add(subject.toLowerCase());
8979
9141
  }
8980
9142
  }
8981
9143
  return mismatches;
8982
9144
  }
9145
/**
 * Trim over-populated seed collections down to the counts named in the setup.
 *
 * Mutates `seed` in place: for each mismatch that has more items than expected
 * and names a `collectionKey`, the collection stored under that key is replaced
 * by its first `expected` items. Mismatches without a key, with too few items,
 * or whose key does not hold an array are left alone.
 *
 * @param {object} seed - Seed object: collection key -> array of entities.
 * @param {Array<{expected: number, actual: number, collectionKey?: string}>} mismatches
 *   Count mismatches to act on.
 * @returns {number} Total number of entities removed across all collections.
 */
function trimSeedToExpectedCounts(seed, mismatches) {
  let totalTrimmed = 0;
  for (const { expected, actual, collectionKey } of mismatches) {
    if (actual <= expected || !collectionKey) continue;
    const collection = seed[collectionKey];
    // Only real arrays are trimmed; strings, functions or other values that
    // happen to expose `length`/`slice` must not be truncated or counted.
    if (!Array.isArray(collection) || collection.length <= expected) continue;
    totalTrimmed += collection.length - expected;
    seed[collectionKey] = collection.slice(0, expected);
  }
  return totalTrimmed;
}
8983
9159
 
8984
9160
  // src/runner/seed-cache.ts
8985
- var CACHE_VERSION = 3;
9161
+ var CACHE_VERSION = 4;
8986
9162
  var NEGATIVE_CACHE_VERSION = 2;
8987
9163
  var NEGATIVE_PREFIX = "neg-";
8988
9164
  var CACHE_DIR = join7(homedir2(), ".archal", "seed-cache");
@@ -9234,7 +9410,7 @@ ${setupText}
9234
9410
  Extract the seed blueprint as JSON.`;
9235
9411
  try {
9236
9412
  const provider = detectProvider(config.model);
9237
- const apiKey = resolveProviderApiKey(config.apiKey, provider);
9413
+ const apiKey = config.providerMode === "archal" ? "" : resolveProviderApiKey(config.apiKey ?? "", provider);
9238
9414
  const responseText = await callLlm({
9239
9415
  provider,
9240
9416
  model: config.model,
@@ -10454,9 +10630,19 @@ function extractHybridPatch(obj) {
10454
10630
  }
10455
10631
  return null;
10456
10632
  }
10457
- function buildSeedCacheContext(twinName, intent, context) {
10633
/**
 * Produce a short fingerprint of a string: the first 16 hex characters of its
 * SHA-256 digest.
 *
 * @param {string} text - Text to fingerprint.
 * @returns {string} 16-character lowercase hex string.
 */
function hashText(text) {
  const digestHex = createHash4("sha256").update(text).digest("hex");
  return digestHex.slice(0, 16);
}
10636
+ function buildSeedCacheContext(twinName, config, intent, context) {
10458
10637
  return {
10459
10638
  twinName,
10639
+ generator: {
10640
+ model: config.model,
10641
+ providerMode: config.providerMode ?? "direct",
10642
+ baseUrl: config.baseUrl ?? null,
10643
+ systemPromptHash: hashText(SYSTEM_PROMPT2),
10644
+ promptTemplateVersion: 2
10645
+ },
10460
10646
  intent: intent ?? null,
10461
10647
  scenario: context ?? null
10462
10648
  };
@@ -10811,10 +10997,13 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
10811
10997
  finalSeed = autoFillMissingFKs(finalSeed, twinName);
10812
10998
  const relValidation = validateSeedRelationships(finalSeed, twinName);
10813
10999
  if (!relValidation.valid) {
10814
- warn("Blueprint seed failed relationship validation", {
10815
- errors: relValidation.errors.slice(0, 5).join("; ")
10816
- });
10817
- return null;
11000
+ finalSeed = autoFillMissingFKs(finalSeed, twinName);
11001
+ const secondValidation = validateSeedRelationships(finalSeed, twinName);
11002
+ if (!secondValidation.valid) {
11003
+ warn("Blueprint seed has unresolved FK references (continuing anyway)", {
11004
+ errors: secondValidation.errors.slice(0, 5).join("; ")
11005
+ });
11006
+ }
10818
11007
  }
10819
11008
  if (intent) {
10820
11009
  const coverage = validateSeedCoverage(intent, finalSeed);
@@ -10829,9 +11018,16 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
10829
11018
  flatForVerify[twinName] = finalSeed;
10830
11019
  const countMismatches = verifySeedCounts(setupDescription, flatForVerify);
10831
11020
  if (countMismatches.length > 0) {
10832
- debug("Blueprint seed has count mismatches (acceptable)", {
10833
- mismatches: countMismatches.map((m) => `${m.subject}: ${m.expected} vs ${m.actual}`).join("; ")
10834
- });
11021
+ const trimmed = trimSeedToExpectedCounts(finalSeed, countMismatches);
11022
+ if (trimmed > 0) {
11023
+ debug(`Blueprint seed: trimmed ${trimmed} excess entities to match setup counts`);
11024
+ }
11025
+ const remaining = countMismatches.filter((m) => m.actual > m.expected && !m.collectionKey);
11026
+ if (remaining.length > 0) {
11027
+ debug("Blueprint seed has unresolvable count mismatches", {
11028
+ mismatches: remaining.map((m) => `${m.subject}: ${m.expected} vs ${m.actual}`).join("; ")
11029
+ });
11030
+ }
10835
11031
  }
10836
11032
  const syntheticPatch = {
10837
11033
  add: {}
@@ -10861,7 +11057,7 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
10861
11057
  async function generateDynamicSeed(twinName, baseSeedName, baseSeedData, setupDescription, config, intent, context) {
10862
11058
  const cacheScope = {
10863
11059
  baseSeedData,
10864
- cacheContext: buildSeedCacheContext(twinName, intent, context)
11060
+ cacheContext: buildSeedCacheContext(twinName, config, intent, context)
10865
11061
  };
10866
11062
  if (!config.noCache) {
10867
11063
  const cached = getCachedSeed(twinName, baseSeedName, setupDescription, cacheScope);
@@ -10892,7 +11088,7 @@ async function generateDynamicSeed(twinName, baseSeedName, baseSeedData, setupDe
10892
11088
  if (blueprintResult) {
10893
11089
  info("Dynamic seed generated via blueprint", { twin: twinName });
10894
11090
  if (!config.noCache) {
10895
- const cacheContext = buildSeedCacheContext(twinName, intent, context);
11091
+ const cacheContext = buildSeedCacheContext(twinName, config, intent, context);
10896
11092
  cacheSeed(twinName, baseSeedName, setupDescription, blueprintResult.seed, blueprintResult.patch, {
10897
11093
  baseSeedData,
10898
11094
  cacheContext
@@ -11023,14 +11219,19 @@ Fix these issues:
11023
11219
  const relationshipValidation = validateSeedRelationships(mergedSeed, twinName);
11024
11220
  if (!relationshipValidation.valid) {
11025
11221
  const topErrors = relationshipValidation.errors.slice(0, 10);
11026
- warn(`Dynamic seed relationship validation failed (attempt ${attempt + 1})`, {
11222
+ if (validationAttempts < MAX_ATTEMPTS - 1) {
11223
+ warn(`Dynamic seed relationship validation failed (attempt ${attempt + 1})`, {
11224
+ errors: topErrors.join("; ")
11225
+ });
11226
+ lastErrors = topErrors;
11227
+ patch = null;
11228
+ mergedSeed = null;
11229
+ validationAttempts++;
11230
+ continue;
11231
+ }
11232
+ warn(`Dynamic seed has unresolved FK references (accepting on final attempt)`, {
11027
11233
  errors: topErrors.join("; ")
11028
11234
  });
11029
- lastErrors = topErrors;
11030
- patch = null;
11031
- mergedSeed = null;
11032
- validationAttempts++;
11033
- continue;
11034
11235
  }
11035
11236
  if (intent) {
11036
11237
  debug("Seed intent coverage summary", {
@@ -11089,6 +11290,15 @@ Fix these issues:
11089
11290
  }
11090
11291
  mergedSeed = autoFillMissingFKs(mergedSeed, twinName);
11091
11292
  mergedSeed = ensureSlackScenarioChannelAccess(mergedSeed, intent);
11293
+ if (setupDescription) {
11294
+ const flatForTrim = {};
11295
+ flatForTrim[twinName] = mergedSeed;
11296
+ const finalMismatches = verifySeedCounts(setupDescription, flatForTrim);
11297
+ const trimmed = trimSeedToExpectedCounts(mergedSeed, finalMismatches);
11298
+ if (trimmed > 0) {
11299
+ debug(`Trimmed ${trimmed} excess seed entities to match setup counts`);
11300
+ }
11301
+ }
11092
11302
  if (!config.noCache) {
11093
11303
  cacheSeed(twinName, baseSeedName, setupDescription, mergedSeed, patch, cacheScope);
11094
11304
  }
@@ -11236,10 +11446,23 @@ function githubIntent(setup) {
11236
11446
  entities.push({ kind: "repo", key: "fullName", value: fullName });
11237
11447
  }
11238
11448
  if (!primaryRepoSet) {
11239
- const orgMatch = setup.match(/\bgithub\s+(?:organization|org)\s+"([a-z][a-z0-9._-]*)"/i);
11449
+ const orgMatch = setup.match(
11450
+ /\b(?:github\s+)?(?:organization|org)\s+(?:named\s+)?["']?([a-z][a-z0-9._-]*)["']?/i
11451
+ );
11240
11452
  if (orgMatch?.[1]) {
11241
- extractedSlots["repo.owner"] = orgMatch[1];
11242
- entities.push({ kind: "repo", key: "owner", value: orgMatch[1] });
11453
+ extractedSlots["repo.owner"] = orgMatch[1].toLowerCase();
11454
+ entities.push({ kind: "repo", key: "owner", value: orgMatch[1].toLowerCase() });
11455
+ const repoName = setup.match(/\b(?:repository|repo)\s+(?:named\s+)?["']?([a-z][a-z0-9._-]{1,99})["']?/i)?.[1];
11456
+ if (repoName) {
11457
+ const normalizedName = repoName.toLowerCase();
11458
+ extractedSlots["repo.name"] = normalizedName;
11459
+ entities.push({ kind: "repo", key: "name", value: normalizedName });
11460
+ entities.push({
11461
+ kind: "repo",
11462
+ key: "fullName",
11463
+ value: `${String(extractedSlots["repo.owner"])}/${normalizedName}`
11464
+ });
11465
+ }
11243
11466
  } else {
11244
11467
  missingSlots.push({
11245
11468
  slot: "repo.owner/repo.name",
@@ -11430,6 +11653,18 @@ function stripeIntent(setup) {
11430
11653
  });
11431
11654
  }
11432
11655
  }
11656
+ const idRegex = /\b((?:acct|cus|prod|price|pi|ch|re|in|sub|dp|pm|payout|tr|tok|evt)_[a-zA-Z0-9]+)\b/g;
11657
+ const seenIds = /* @__PURE__ */ new Set();
11658
+ let idMatch;
11659
+ while ((idMatch = idRegex.exec(setup)) !== null) {
11660
+ const id = idMatch[1];
11661
+ if (seenIds.has(id)) continue;
11662
+ seenIds.add(id);
11663
+ entities.push({ kind: "stripe_entity", key: "id", value: id });
11664
+ if (!extractedSlots["stripe.primary_id"]) {
11665
+ extractedSlots["stripe.primary_id"] = id;
11666
+ }
11667
+ }
11433
11668
  if (missingSlots.length > 0) {
11434
11669
  return { intent: null, missingSlots };
11435
11670
  }
@@ -11523,6 +11758,30 @@ function jiraIntent(setup) {
11523
11758
  }
11524
11759
  entities.push({ kind: "ticket", key: "key", value: key });
11525
11760
  }
11761
+ const seenProjects = /* @__PURE__ */ new Set();
11762
+ const addProject = (projectKey) => {
11763
+ const normalized = projectKey.toUpperCase();
11764
+ if (!/^[A-Z][A-Z0-9]{1,9}$/.test(normalized)) return;
11765
+ if (seenProjects.has(normalized)) return;
11766
+ seenProjects.add(normalized);
11767
+ entities.push({ kind: "project", key: "key", value: normalized });
11768
+ if (!extractedSlots["project.key"]) {
11769
+ extractedSlots["project.key"] = normalized;
11770
+ }
11771
+ };
11772
+ for (const key of seenKeys) {
11773
+ addProject(key.split("-", 1)[0] ?? "");
11774
+ }
11775
+ const projectRegexes = [
11776
+ /\b(?:jira\s+)?project\s+(?:key\s*)?[:=]?\s*["']?([A-Z][A-Z0-9]{1,9})["']?/gi,
11777
+ /\bproject\s+["'][^"'\n]+["']\s*\(\s*([A-Z][A-Z0-9]{1,9})\s*\)/gi
11778
+ ];
11779
+ for (const regex of projectRegexes) {
11780
+ let projectMatch;
11781
+ while ((projectMatch = regex.exec(setup)) !== null) {
11782
+ addProject(projectMatch[1] ?? "");
11783
+ }
11784
+ }
11526
11785
  return {
11527
11786
  intent: {
11528
11787
  twinName: "jira",
@@ -11537,6 +11796,7 @@ function jiraIntent(setup) {
11537
11796
  }
11538
11797
  function supabaseIntent(setup) {
11539
11798
  const extractedSlots = {};
11799
+ const entities = [];
11540
11800
  const missingSlots = [];
11541
11801
  const requiredSlots = ["database.target"];
11542
11802
  const seenTables = /* @__PURE__ */ new Set();
@@ -11569,6 +11829,9 @@ function supabaseIntent(setup) {
11569
11829
  const hasEnvVarTokens = /\b[A-Z][A-Z0-9_]{2,}\b/.test(setup);
11570
11830
  if (seenTables.size > 0 || mentionsProject || mentionsLogsOrService || mentionsEnvVars && hasEnvVarTokens) {
11571
11831
  extractedSlots["database.target"] = true;
11832
+ for (const table2 of seenTables) {
11833
+ entities.push({ kind: "table", key: "name", value: table2 });
11834
+ }
11572
11835
  } else {
11573
11836
  missingSlots.push({
11574
11837
  slot: "database.target",
@@ -11585,10 +11848,7 @@ function supabaseIntent(setup) {
11585
11848
  setupSummary: setupSummary(setup),
11586
11849
  requiredSlots,
11587
11850
  extractedSlots,
11588
- // Supabase table names in setup can describe conceptual data sources
11589
- // that are not materialized in the base SQL schema. Keep intent broad
11590
- // to avoid false-hard failures in seed generation.
11591
- entities: [],
11851
+ entities,
11592
11852
  quotedStrings: []
11593
11853
  },
11594
11854
  missingSlots: []
@@ -12112,12 +12372,24 @@ function loadBaseSeedFromDisk(twinName, seedName) {
12112
12372
  }
12113
12373
  function categorizeRunError(message) {
12114
12374
  if (/Failed to spawn|ENOENT/.test(message)) {
12115
- return `Agent not found: ${message}. Check that your agent command is installed and in PATH.`;
12375
+ return {
12376
+ message: `Agent not found: ${message}. Check that your agent command is installed and in PATH.`,
12377
+ outcome: "failed_agent"
12378
+ };
12379
+ }
12380
+ if (/Dynamic seed generation failed|Missing dynamic seed state|seed generation|seed setup/i.test(message)) {
12381
+ return {
12382
+ message: `Seed generation error: ${message}`,
12383
+ outcome: "inconclusive_seed"
12384
+ };
12116
12385
  }
12117
12386
  if (/HTTP [45]\d\d|ECONNREFUSED|ENOTFOUND|ETIMEDOUT|ECONNRESET|cloud session|fetch failed|AbortError|TimeoutError|operation was aborted|timed?\s*out/i.test(message)) {
12118
- return `Infrastructure error: ${message}. Check your network or try again.`;
12387
+ return {
12388
+ message: `Infrastructure error: ${message}. Check your network or try again.`,
12389
+ outcome: "inconclusive_infrastructure"
12390
+ };
12119
12391
  }
12120
- return message;
12392
+ return { message, outcome: "failed_agent" };
12121
12393
  }
12122
12394
  async function executeSingleRun(runIndex, scenario, agentConfig, seedSelections, evaluatorConfig, timeoutSeconds, apiEngine, localEngine, remoteTwinUrlOverrides, apiRouting, cloudTwinUrls, hostedSessionId, apiBearerToken, adminAuth) {
12123
12395
  const startTime = Date.now();
@@ -12255,7 +12527,8 @@ ${baseTaskMessage}` : baseTaskMessage;
12255
12527
  stateDiff: diff,
12256
12528
  agentLog: agentResult.stderr || void 0,
12257
12529
  agentTrace: agentResult.agentTrace,
12258
- tokenUsage
12530
+ tokenUsage,
12531
+ outcome: "failed_agent"
12259
12532
  };
12260
12533
  }
12261
12534
  if (agentResult.exitCode !== 0 && agentResult.exitCode !== null) {
@@ -12294,7 +12567,8 @@ ${baseTaskMessage}` : baseTaskMessage;
12294
12567
  stateDiff: diff,
12295
12568
  agentLog: agentResult.stderr || void 0,
12296
12569
  agentTrace: agentResult.agentTrace,
12297
- tokenUsage
12570
+ tokenUsage,
12571
+ outcome: "failed_agent"
12298
12572
  };
12299
12573
  }
12300
12574
  if (trace.length === 0) {
@@ -12326,12 +12600,13 @@ ${baseTaskMessage}` : baseTaskMessage;
12326
12600
  stateDiff: diff,
12327
12601
  agentLog: agentResult.stderr || void 0,
12328
12602
  agentTrace: agentResult.agentTrace,
12329
- tokenUsage
12603
+ tokenUsage,
12604
+ outcome: "completed"
12330
12605
  };
12331
12606
  } catch (err) {
12332
12607
  const message = err instanceof Error ? err.message : String(err);
12333
12608
  const categorized = categorizeRunError(message);
12334
- error(`Run ${runIndex + 1} failed: ${categorized}`);
12609
+ error(`Run ${runIndex + 1} failed: ${categorized.message}`);
12335
12610
  const durationMs = Date.now() - startTime;
12336
12611
  return {
12337
12612
  runIndex,
@@ -12339,12 +12614,13 @@ ${baseTaskMessage}` : baseTaskMessage;
12339
12614
  criterionId: c.id,
12340
12615
  status: "fail",
12341
12616
  confidence: 1,
12342
- explanation: `Run failed: ${categorized}`
12617
+ explanation: `Run failed: ${categorized.message}`
12343
12618
  })),
12344
12619
  overallScore: 0,
12345
12620
  trace: [],
12346
12621
  durationMs,
12347
- error: categorized,
12622
+ error: categorized.message,
12623
+ outcome: categorized.outcome,
12348
12624
  stateBefore: beforeState,
12349
12625
  stateAfter: beforeState,
12350
12626
  stateDiff: { added: {}, modified: {}, removed: {} }
@@ -12421,9 +12697,20 @@ function preflightCheck(scenario, apiKey, model, baseUrl, evaluatorProvider, see
12421
12697
  }
12422
12698
  }
12423
12699
  if (seedModel) {
12700
+ const mode = seedProviderMode ?? "auto";
12701
+ const provider = detectProvider(seedModel);
12702
+ const resolvedKey = resolveProviderApiKey(apiKey, provider);
12424
12703
  const creds = getCredentials();
12425
12704
  const hasArchalAuth = Boolean(creds?.token);
12426
- if (!hasArchalAuth) {
12705
+ if (provider === "openai-compatible" && !baseUrl && mode === "direct") {
12706
+ errors.push({
12707
+ check: "seed.baseUrl",
12708
+ message: `Seed model "${seedModel}" requires a base URL for the OpenAI-compatible endpoint`,
12709
+ detail: "Set via: export ARCHAL_EVALUATOR_BASE_URL=<url> or archal config set evaluator.baseUrl <url>",
12710
+ warning: true
12711
+ });
12712
+ }
12713
+ if (mode === "archal" && !hasArchalAuth) {
12427
12714
  errors.push({
12428
12715
  check: "archal-auth-seed",
12429
12716
  message: "Dynamic seed generation requires Archal authentication",
@@ -12431,6 +12718,32 @@ function preflightCheck(scenario, apiKey, model, baseUrl, evaluatorProvider, see
12431
12718
  warning: true
12432
12719
  });
12433
12720
  }
12721
+ if (mode === "direct" && !resolvedKey) {
12722
+ errors.push({
12723
+ check: getProviderEnvVar(provider),
12724
+ message: `Seed provider is "direct" but no API key is configured for ${provider}`,
12725
+ detail: `Set via: export ${getProviderEnvVar(provider)}=<your-key> or archal config set evaluator.apiKey <key>`,
12726
+ warning: true
12727
+ });
12728
+ }
12729
+ if (mode === "auto" && !resolvedKey && !hasArchalAuth) {
12730
+ errors.push({
12731
+ check: getProviderEnvVar(provider),
12732
+ message: 'Dynamic seed generation has no available provider in "auto" mode',
12733
+ detail: `Set ${getProviderEnvVar(provider)} (or evaluator.apiKey) for direct mode, or run archal login for Archal backend mode`,
12734
+ warning: true
12735
+ });
12736
+ }
12737
+ if (resolvedKey && (mode === "direct" || mode === "auto")) {
12738
+ const mismatch = validateKeyForProvider(resolvedKey, provider);
12739
+ if (mismatch) {
12740
+ errors.push({
12741
+ check: "seed-key-provider-mismatch",
12742
+ message: mismatch,
12743
+ warning: true
12744
+ });
12745
+ }
12746
+ }
12434
12747
  }
12435
12748
  return errors;
12436
12749
  }
@@ -12479,6 +12792,35 @@ async function runScenario(options) {
12479
12792
  'cloudTwinUrls is required. Local twin execution has been removed; use "archal run" to provision a hosted session.'
12480
12793
  );
12481
12794
  }
12795
+ const criterionDescriptions = {};
12796
+ const criterionTypes = {};
12797
+ for (const c of scenario.successCriteria) {
12798
+ criterionDescriptions[c.id] = c.description;
12799
+ criterionTypes[c.id] = c.type;
12800
+ }
12801
+ const buildInconclusiveSeedReport = (message) => ({
12802
+ scenarioTitle: scenario.title,
12803
+ satisfactionScore: 0,
12804
+ criterionDescriptions,
12805
+ criterionTypes,
12806
+ twinNames: scenario.config.twins,
12807
+ runs: [{
12808
+ runIndex: 0,
12809
+ evaluations: scenario.successCriteria.map((criterion) => ({
12810
+ criterionId: criterion.id,
12811
+ status: "fail",
12812
+ confidence: 1,
12813
+ explanation: `Run not scored due to seed setup failure: ${message}`
12814
+ })),
12815
+ overallScore: 0,
12816
+ trace: [],
12817
+ durationMs: 0,
12818
+ error: message,
12819
+ outcome: "inconclusive_seed"
12820
+ }],
12821
+ summary: `Inconclusive (seed setup): ${message}`,
12822
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
12823
+ });
12482
12824
  const preflightErrors = preflightCheck(
12483
12825
  scenario,
12484
12826
  config.apiKey,
@@ -12569,7 +12911,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12569
12911
  cacheContext: seedPromptContext
12570
12912
  });
12571
12913
  }
12572
- throw new Error(message);
12914
+ return buildInconclusiveSeedReport(message);
12573
12915
  }
12574
12916
  warn(message);
12575
12917
  generationTargets.push(sel);
@@ -12578,12 +12920,11 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12578
12920
  if (generationTargets.length > 0) {
12579
12921
  progress("Generating dynamic seeds from setup description...");
12580
12922
  const dynamicConfig = {
12581
- apiKey: "",
12582
- // Seed gen always routes through Archal backend
12923
+ apiKey: config.apiKey,
12583
12924
  model: config.seedModel,
12584
12925
  baseUrl: config.baseUrl,
12585
12926
  noCache: options.noSeedCache,
12586
- providerMode: "archal"
12927
+ providerMode: config.seedProvider
12587
12928
  };
12588
12929
  let cloudSeedSnapshotByTwin = null;
12589
12930
  const adminAuth = options.apiAdminToken ? { token: options.apiAdminToken, userId: options.apiAdminUserId } : void 0;
@@ -12601,20 +12942,28 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12601
12942
  baseSeedData = normalizeSeedState(cloudSeedSnapshotByTwin[sel.twinName]);
12602
12943
  }
12603
12944
  if (!baseSeedData || Object.keys(baseSeedData).length === 0) {
12604
- throw new Error(
12945
+ return buildInconclusiveSeedReport(
12605
12946
  `Could not load base seed "${sel.seedName}" for twin "${sel.twinName}" from disk. Ensure the seed file exists at twins/${sel.twinName}/seeds/${sel.seedName}.json or .sql, or that the hosted twin /state endpoint is reachable.`
12606
12947
  );
12607
12948
  }
12608
12949
  progress(`Generating dynamic seed for ${sel.twinName}...`);
12609
- const result = await generateDynamicSeed(
12610
- sel.twinName,
12611
- sel.seedName,
12612
- baseSeedData,
12613
- scenario.setup,
12614
- dynamicConfig,
12615
- extractedIntentByTwin.get(sel.twinName),
12616
- seedPromptContext
12617
- );
12950
+ let result;
12951
+ try {
12952
+ result = await generateDynamicSeed(
12953
+ sel.twinName,
12954
+ sel.seedName,
12955
+ baseSeedData,
12956
+ scenario.setup,
12957
+ dynamicConfig,
12958
+ extractedIntentByTwin.get(sel.twinName),
12959
+ seedPromptContext
12960
+ );
12961
+ } catch (error2) {
12962
+ const detail = error2 instanceof Error ? error2.message : String(error2);
12963
+ return buildInconclusiveSeedReport(
12964
+ `Dynamic seed generation failed for twin "${sel.twinName}": ${detail}`
12965
+ );
12966
+ }
12618
12967
  sel.seedData = result.seed;
12619
12968
  if (result.fromCache) {
12620
12969
  cachedSeedTwins.push(sel.twinName);
@@ -12630,7 +12979,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12630
12979
  }
12631
12980
  const missingDynamicSeeds = seedSelections.filter((sel) => !sel.seedData);
12632
12981
  if (missingDynamicSeeds.length > 0) {
12633
- throw new Error(
12982
+ return buildInconclusiveSeedReport(
12634
12983
  `Missing dynamic seed state for twin(s): ${missingDynamicSeeds.map((sel) => sel.twinName).join(", ")}`
12635
12984
  );
12636
12985
  }
@@ -12825,8 +13174,8 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12825
13174
  return {
12826
13175
  scenarioTitle: scenario.title,
12827
13176
  satisfactionScore: 100,
12828
- criterionDescriptions: {},
12829
- criterionTypes: {},
13177
+ criterionDescriptions,
13178
+ criterionTypes,
12830
13179
  twinNames: scenario.config.twins,
12831
13180
  runs: [],
12832
13181
  summary: "Preflight checks passed",
@@ -12865,8 +13214,8 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12865
13214
  adminAuth
12866
13215
  );
12867
13216
  runs.push(result);
12868
- printRunProgress(i, numRuns, result.overallScore, result.error);
12869
- if (result.error) {
13217
+ printRunProgress(i, numRuns, result.overallScore, result.error, result.outcome);
13218
+ if (result.outcome === "inconclusive_infrastructure" || result.outcome === "inconclusive_seed") {
12870
13219
  consecutiveInfraErrors++;
12871
13220
  if (consecutiveInfraErrors >= EARLY_ABORT_THRESHOLD && i < numRuns - 1) {
12872
13221
  warn(`${consecutiveInfraErrors} consecutive run errors \u2014 aborting remaining ${numRuns - i - 1} run(s) to avoid wasting quota.`);
@@ -12876,19 +13225,17 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12876
13225
  consecutiveInfraErrors = 0;
12877
13226
  }
12878
13227
  }
12879
- const runScores = runs.map((r) => r.overallScore);
13228
+ const scoredRuns = runs.filter(
13229
+ (run) => run.outcome !== "inconclusive_infrastructure" && run.outcome !== "inconclusive_seed"
13230
+ );
13231
+ const runScores = scoredRuns.map((r) => r.overallScore);
12880
13232
  const satisfactionScore = aggregateSatisfaction(runScores);
12881
- const allEvaluations = runs.map((r) => r.evaluations);
12882
- const summary = generateSummary(allEvaluations, satisfactionScore);
12883
- const criterionDescriptions = {};
12884
- const criterionTypes = {};
12885
- for (const c of scenario.successCriteria) {
12886
- criterionDescriptions[c.id] = c.description;
12887
- criterionTypes[c.id] = c.type;
12888
- }
13233
+ const allEvaluations = scoredRuns.map((r) => r.evaluations);
13234
+ const inconclusiveRuns = runs.length - scoredRuns.length;
13235
+ const summary = scoredRuns.length > 0 ? generateSummary(allEvaluations, satisfactionScore) : `Inconclusive: no scored runs (${inconclusiveRuns} infrastructure/seed setup run failure${inconclusiveRuns === 1 ? "" : "s"}).`;
12889
13236
  let failureAnalysis;
12890
- if (satisfactionScore < 100 && runs.length > 0 && !options.noFailureAnalysis) {
12891
- const representativeRun = runs.reduce(
13237
+ if (satisfactionScore < 100 && scoredRuns.length > 0 && !options.noFailureAnalysis) {
13238
+ const representativeRun = scoredRuns.reduce(
12892
13239
  (worst, r) => r.overallScore < worst.overallScore ? r : worst
12893
13240
  );
12894
13241
  const failedCriteria = representativeRun.evaluations.filter((e) => e.status !== "pass").map((e) => ({
@@ -12911,7 +13258,9 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12911
13258
  stateDiff: representativeRun.stateDiff ?? { added: {}, modified: {}, removed: {} },
12912
13259
  stateBefore: representativeRun.stateBefore ?? {},
12913
13260
  stateAfter: representativeRun.stateAfter ?? {},
12914
- satisfactionScore
13261
+ satisfactionScore,
13262
+ agentLog: representativeRun.agentLog,
13263
+ agentError: representativeRun.error
12915
13264
  },
12916
13265
  evaluatorConfig
12917
13266
  );
@@ -13690,7 +14039,21 @@ function createRunCommand() {
13690
14039
  }
13691
14040
  }
13692
14041
  if (!process.env["ARCHAL_ENGINE_API_KEY"] && userConfig.engineApiKey) {
13693
- process.env["ARCHAL_ENGINE_API_KEY"] = userConfig.engineApiKey;
14042
+ const configKey = userConfig.engineApiKey;
14043
+ const requestedModel = firstNonEmpty(
14044
+ opts.engineModel,
14045
+ process.env["ARCHAL_ENGINE_MODEL"],
14046
+ opts.model
14047
+ // -m also defaults the engine model for local harnesses
14048
+ );
14049
+ if (requestedModel) {
14050
+ const modelProvider = detectProvider(requestedModel);
14051
+ if (!validateKeyForProvider(configKey, modelProvider)) {
14052
+ process.env["ARCHAL_ENGINE_API_KEY"] = configKey;
14053
+ }
14054
+ } else {
14055
+ process.env["ARCHAL_ENGINE_API_KEY"] = configKey;
14056
+ }
13694
14057
  }
13695
14058
  }
13696
14059
  inferEngineModelFromEvaluatorModel(opts);
@@ -13741,8 +14104,17 @@ function createRunCommand() {
13741
14104
  }
13742
14105
  }
13743
14106
  if (engine.mode === "local" && !process.env["ARCHAL_ENGINE_API_KEY"]) {
14107
+ const requestedModel = firstNonEmpty(
14108
+ opts.engineModel,
14109
+ process.env["ARCHAL_ENGINE_MODEL"]
14110
+ );
14111
+ const provider = requestedModel ? detectProvider(requestedModel) : null;
14112
+ const providerHint = provider ? `
14113
+ Hint: You requested model "${requestedModel}" (${provider}) but no ${provider} API key is available.
14114
+ Set ${getProviderEnvVar(provider)} or pass --engine-key <${provider}-key>
14115
+ ` : "";
13744
14116
  process.stderr.write(
13745
- "Error: No API key found. The agent harness needs an API key to call the model.\nSet one of:\n GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY env var\n archal config set engine.apiKey <key>\n ARCHAL_ENGINE_API_KEY env var\n"
14117
+ "Error: No API key found. The agent harness needs an API key to call the model.\nSet one of:\n GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY env var\n archal config set engine.apiKey <key>\n ARCHAL_ENGINE_API_KEY env var\n" + providerHint
13746
14118
  );
13747
14119
  process.exit(2);
13748
14120
  }
@@ -13812,12 +14184,14 @@ function createRunCommand() {
13812
14184
  })();
13813
14185
  const SESSION_READY_TIMEOUT_MS = Math.max(12e4, configuredReadyTimeoutMs);
13814
14186
  const SESSION_POLL_INTERVAL_MS = 2e3;
13815
- const STATUS_READY_GRACE_MS = 5e3;
13816
14187
  const readyDeadline = Date.now() + SESSION_READY_TIMEOUT_MS;
13817
14188
  let sessionReady = false;
13818
14189
  let lastPollIssue;
13819
- let statusReadySinceMs = null;
13820
14190
  const isRetryablePollFailure = (result) => result.offline || typeof result.status === "number" && result.status >= 500;
14191
+ const workersAllReady = (workers) => {
14192
+ if (!workers || Object.keys(workers).length === 0) return true;
14193
+ return Object.values(workers).every((value) => value === "ready");
14194
+ };
13821
14195
  const sleepForPollInterval = async () => new Promise((resolve12) => setTimeout(resolve12, SESSION_POLL_INTERVAL_MS));
13822
14196
  if (!opts.quiet) process.stderr.write("Starting cloud session...\n");
13823
14197
  let pollCount = 0;
@@ -13872,26 +14246,19 @@ function createRunCommand() {
13872
14246
  }
13873
14247
  const healthAlive = healthResult.ok && healthResult.data.alive;
13874
14248
  const statusAlive = statusResult.data.alive || status === "ready";
13875
- if (statusAlive && healthAlive) {
14249
+ const statusWorkersReady = workersAllReady(
14250
+ statusResult.data.twins ?? statusResult.data.workers
14251
+ );
14252
+ const healthWorkersReady = workersAllReady(healthResult.data.twins);
14253
+ if (statusAlive && healthAlive && statusWorkersReady && healthWorkersReady) {
13876
14254
  sessionReady = true;
13877
14255
  break;
13878
14256
  }
13879
- if (statusAlive && !healthAlive) {
13880
- if (statusReadySinceMs === null) {
13881
- statusReadySinceMs = Date.now();
13882
- }
13883
- const readyForMs = Date.now() - statusReadySinceMs;
13884
- if (readyForMs >= STATUS_READY_GRACE_MS) {
13885
- debug(
13886
- `Session ${backendSessionId} proceeded after health endpoint warmup (${readyForMs}ms).`
13887
- );
13888
- sessionReady = true;
13889
- break;
13890
- }
13891
- } else {
13892
- statusReadySinceMs = null;
13893
- }
13894
- lastPollIssue = `session still starting (status=${status}, health=${healthAlive ? "alive" : "starting"})`;
14257
+ const statusTwinStates = Object.entries(
14258
+ statusResult.data.twins ?? statusResult.data.workers ?? {}
14259
+ ).map(([twin, twinStatus]) => `${twin}:${twinStatus}`).join(", ");
14260
+ const healthTwinStates = Object.entries(healthResult.data.twins ?? {}).map(([twin, twinStatus]) => `${twin}:${twinStatus}`).join(", ");
14261
+ lastPollIssue = `session still starting (status=${status}, health=${healthAlive ? "alive" : "starting"}, statusTwins=[${statusTwinStates || "n/a"}], healthTwins=[${healthTwinStates || "n/a"}])`;
13895
14262
  await sleepForPollInterval();
13896
14263
  }
13897
14264
  if (sessionReady) {
@@ -14292,6 +14659,7 @@ function buildEvidenceArtifacts(report) {
14292
14659
  overallScore: run.overallScore,
14293
14660
  durationMs: run.durationMs,
14294
14661
  error: run.error ?? null,
14662
+ outcome: run.outcome ?? null,
14295
14663
  evaluations: (run.evaluations ?? []).map((ev) => ({
14296
14664
  criterionId: ev.criterionId,
14297
14665
  status: ev.status,
@@ -14611,7 +14979,7 @@ import { createInterface as createInterface2 } from "readline";
14611
14979
  import { Command as Command5 } from "commander";
14612
14980
 
14613
14981
  // src/telemetry/anonymizer.ts
14614
- import { createHash as createHash4 } from "crypto";
14982
+ import { createHash as createHash5 } from "crypto";
14615
14983
  var API_KEY_PATTERNS = [
14616
14984
  /(?:api[_-]?key|token|secret|password|authorization|bearer|credential)\s*[:=]\s*["']?([a-zA-Z0-9_\-/.+=]{16,})["']?/gi,
14617
14985
  /sk-[a-zA-Z0-9]{20,}/g,
@@ -14661,7 +15029,7 @@ var USERNAME_FIELDS = /* @__PURE__ */ new Set([
14661
15029
  "maintainer"
14662
15030
  ]);
14663
15031
  function hashValue2(value, salt = "archal") {
14664
- return `anon_${createHash4("sha256").update(`${salt}:${value}`).digest("hex").slice(0, 12)}`;
15032
+ return `anon_${createHash5("sha256").update(`${salt}:${value}`).digest("hex").slice(0, 12)}`;
14665
15033
  }
14666
15034
  function anonymizeForEnterprise(entries) {
14667
15035
  debug("Enterprise anonymization", { entryCount: String(entries.length) });
@@ -15637,7 +16005,7 @@ function createDoctorCommand() {
15637
16005
  // src/commands/login.ts
15638
16006
  import { Command as Command8 } from "commander";
15639
16007
  import { exec } from "child_process";
15640
- import { createHash as createHash5, randomBytes as randomBytes2 } from "crypto";
16008
+ import { createHash as createHash6, randomBytes as randomBytes2 } from "crypto";
15641
16009
  import { createServer } from "http";
15642
16010
  var START_PORT = 51423;
15643
16011
  var LOGIN_TIMEOUT_MS = 5 * 60 * 1e3;
@@ -15658,7 +16026,7 @@ function openBrowser(url) {
15658
16026
  }
15659
16027
  function createPkcePair() {
15660
16028
  const codeVerifier = randomBytes2(32).toString("base64url");
15661
- const codeChallenge = createHash5("sha256").update(codeVerifier).digest("base64url");
16029
+ const codeChallenge = createHash6("sha256").update(codeVerifier).digest("base64url");
15662
16030
  return { codeVerifier, codeChallenge };
15663
16031
  }
15664
16032
  function isPlan2(value) {
@@ -16388,11 +16756,25 @@ function detectProviderName(model) {
16388
16756
  if (normalized.startsWith("gpt-") || normalized.startsWith("o1-") || normalized.startsWith("o3-") || normalized.startsWith("o4-")) return "OpenAI";
16389
16757
  return "OpenAI-compatible";
16390
16758
  }
16391
- function resolveEngineApiKey(explicitKey) {
16759
+ function resolveEngineApiKey(explicitKey, model) {
16392
16760
  if (explicitKey?.trim()) return explicitKey.trim();
16393
16761
  if (process.env["ARCHAL_ENGINE_API_KEY"]?.trim()) return process.env["ARCHAL_ENGINE_API_KEY"].trim();
16762
+ const modelProvider = model ? detectProvider(model) : null;
16394
16763
  const config = loadConfig();
16395
- if (config.engineApiKey) return config.engineApiKey;
16764
+ if (config.engineApiKey) {
16765
+ if (!modelProvider || !validateKeyForProvider(config.engineApiKey, modelProvider)) {
16766
+ return config.engineApiKey;
16767
+ }
16768
+ }
16769
+ const providerEnvVars = {
16770
+ gemini: "GEMINI_API_KEY",
16771
+ openai: "OPENAI_API_KEY",
16772
+ anthropic: "ANTHROPIC_API_KEY"
16773
+ };
16774
+ if (modelProvider && providerEnvVars[modelProvider]) {
16775
+ const val = process.env[providerEnvVars[modelProvider]]?.trim();
16776
+ if (val) return val;
16777
+ }
16396
16778
  for (const envVar of ["GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]) {
16397
16779
  const val = process.env[envVar]?.trim();
16398
16780
  if (val) return val;
@@ -16441,7 +16823,7 @@ function createDemoCommand() {
16441
16823
  process.exit(1);
16442
16824
  }
16443
16825
  const providerName = detectProviderName(opts.model);
16444
- const engineApiKey = resolveEngineApiKey(opts.apiKey);
16826
+ const engineApiKey = resolveEngineApiKey(opts.apiKey, opts.model);
16445
16827
  if (!engineApiKey) {
16446
16828
  process.stderr.write(
16447
16829
  `Error: No API key found for model "${opts.model}" (${providerName}).
@@ -5,7 +5,7 @@
5
5
  * Env var overrides:
6
6
  * ARCHAL_MAX_TOKENS — Max completion tokens (default from model-configs)
7
7
  * ARCHAL_TEMPERATURE — Sampling temperature
8
- * ARCHAL_LLM_TIMEOUT — Per-call timeout in seconds (default 120)
8
+ * ARCHAL_LLM_TIMEOUT — Per-call timeout in seconds (default 180)
9
9
  * ARCHAL_OPENAI_BASE_URL — Override OpenAI base URL (for proxies, Azure, etc.)
10
10
  * ARCHAL_ANTHROPIC_BASE_URL — Override Anthropic base URL
11
11
  * ARCHAL_GEMINI_BASE_URL — Override Gemini base URL
@@ -48,19 +48,41 @@ const PROVIDER_ENV_VARS = {
48
48
  openai: 'OPENAI_API_KEY',
49
49
  };
50
50
 
51
+ function inferKeyProvider(key) {
52
+ if (!key) return null;
53
+ if (key.startsWith('AIzaSy')) return 'gemini';
54
+ if (key.startsWith('sk-ant-')) return 'anthropic';
55
+ if (key.startsWith('sk-')) return 'openai';
56
+ return null;
57
+ }
58
+
51
59
  /**
52
60
  * Resolve the API key for the detected provider.
53
61
  * Priority: ARCHAL_ENGINE_API_KEY > provider-specific env var.
62
+ * If ARCHAL_ENGINE_API_KEY clearly belongs to a different provider, fall back
63
+ * to provider-specific key when available, otherwise fail with a clear error.
54
64
  * @param {string} provider
55
65
  * @returns {string}
56
66
  */
57
67
  export function resolveApiKey(provider) {
58
- const engineKey = process.env['ARCHAL_ENGINE_API_KEY']?.trim();
59
- if (engineKey) return engineKey;
60
-
61
68
  const envVar = PROVIDER_ENV_VARS[provider] ?? 'OPENAI_API_KEY';
62
- const key = process.env[envVar]?.trim();
63
- if (key) return key;
69
+ const providerKey = process.env[envVar]?.trim();
70
+ const engineKey = process.env['ARCHAL_ENGINE_API_KEY']?.trim();
71
+ if (engineKey) {
72
+ const inferred = inferKeyProvider(engineKey);
73
+ if (!inferred || inferred === provider) return engineKey;
74
+ if (providerKey) {
75
+ process.stderr.write(
76
+ `[harness] Warning: ARCHAL_ENGINE_API_KEY appears to be for ${inferred}; using ${envVar} for ${provider} model.\n`,
77
+ );
78
+ return providerKey;
79
+ }
80
+ throw new Error(
81
+ `ARCHAL_ENGINE_API_KEY appears to be for ${inferred}, but provider "${provider}" requires ${envVar}. ` +
82
+ `Set ${envVar} or use a ${inferred} model.`
83
+ );
84
+ }
85
+ if (providerKey) return providerKey;
64
86
 
65
87
  throw new Error(
66
88
  `No API key found for provider "${provider}". ` +
@@ -111,7 +133,7 @@ function getLlmTimeoutMs() {
111
133
  return parsed * 1000;
112
134
  }
113
135
  }
114
- return 120_000; // 120 seconds default
136
+ return 180_000; // 180 seconds default
115
137
  }
116
138
 
117
139
  // ── Thinking configuration ──────────────────────────────────────────
@@ -107,10 +107,19 @@ try {
107
107
 
108
108
  // Call the LLM with retry on transient errors
109
109
  log.llmCall(step + 1);
110
- const response = await withRetry(
111
- () => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
112
- 2,
113
- );
110
+ let response;
111
+ try {
112
+ response = await withRetry(
113
+ () => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
114
+ 2,
115
+ );
116
+ } catch (err) {
117
+ const msg = err?.message ?? String(err);
118
+ log.error('llm_call_failed', { step: step + 1, error: msg });
119
+ process.stderr.write(`[hardened] LLM API error: ${msg.slice(0, 500)}\n`);
120
+ exitReason = 'llm_error';
121
+ break;
122
+ }
114
123
 
115
124
  const iterDurationMs = Date.now() - iterStart;
116
125
  totalInputTokens += response.usage.inputTokens;
@@ -218,4 +227,8 @@ try {
218
227
  `(${totalToolErrors} errors), ${totalInputTokens} input tokens, ` +
219
228
  `${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
220
229
  );
230
+
231
+ if (exitReason === 'llm_error') {
232
+ process.exit(1);
233
+ }
221
234
  }
@@ -84,7 +84,16 @@ try {
84
84
  const iterStart = Date.now();
85
85
 
86
86
  log.llmCall(step + 1);
87
- const response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
87
+ let response;
88
+ try {
89
+ response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
90
+ } catch (err) {
91
+ const msg = err?.message ?? String(err);
92
+ log.error('llm_call_failed', { step: step + 1, error: msg });
93
+ process.stderr.write(`[naive] LLM API error: ${msg.slice(0, 500)}\n`);
94
+ exitReason = 'llm_error';
95
+ break;
96
+ }
88
97
 
89
98
  const iterDurationMs = Date.now() - iterStart;
90
99
  totalInputTokens += response.usage.inputTokens;
@@ -150,4 +159,7 @@ try {
150
159
  `${(totalTimeMs / 1000).toFixed(1)}s total\n`
151
160
  );
152
161
 
162
+ if (exitReason === 'llm_error') {
163
+ process.exit(1);
164
+ }
153
165
  }
@@ -6,7 +6,7 @@
6
6
  * - Structured system prompt encouraging step-by-step reasoning
7
7
  * - Error recovery with retries on transient failures
8
8
  * - Context-aware done detection
9
- * - Max 50 steps safety limit
9
+ * - Configurable step limit (default 80, cap 200 via ARCHAL_MAX_STEPS)
10
10
  * - Token usage and timing instrumentation
11
11
  *
12
12
  * Env vars (set by archal orchestrator):
@@ -34,7 +34,21 @@ import { createLogger } from '../_lib/logging.mjs';
34
34
  import { writeMetrics } from '../_lib/metrics.mjs';
35
35
  import { createAgentTrace } from '../_lib/agent-trace.mjs';
36
36
 
37
- const MAX_STEPS = 50;
37
+ const DEFAULT_MAX_STEPS = 80;
38
+ const MAX_STEPS = (() => {
39
+ const raw = process.env['ARCHAL_MAX_STEPS']?.trim();
40
+ if (!raw) return DEFAULT_MAX_STEPS;
41
+ const parsed = parseInt(raw, 10);
42
+ if (Number.isNaN(parsed) || parsed <= 0) return DEFAULT_MAX_STEPS;
43
+ return Math.min(parsed, 200);
44
+ })();
45
+ const MAX_CONSECUTIVE_ERRORS = (() => {
46
+ const raw = process.env['ARCHAL_MAX_CONSECUTIVE_ERRORS']?.trim();
47
+ if (!raw) return 8;
48
+ const parsed = parseInt(raw, 10);
49
+ if (Number.isNaN(parsed) || parsed <= 0) return 8;
50
+ return Math.min(parsed, 20);
51
+ })();
38
52
  const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
39
53
  const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
40
54
 
@@ -95,10 +109,19 @@ try {
95
109
 
96
110
  // Call the LLM with retry on transient errors
97
111
  log.llmCall(step + 1);
98
- const response = await withRetry(
99
- () => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
100
- 2,
101
- );
112
+ let response;
113
+ try {
114
+ response = await withRetry(
115
+ () => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
116
+ 2,
117
+ );
118
+ } catch (err) {
119
+ const msg = err?.message ?? String(err);
120
+ log.error('llm_call_failed', { step: step + 1, error: msg });
121
+ process.stderr.write(`[react] LLM API error: ${msg.slice(0, 500)}\n`);
122
+ exitReason = 'llm_error';
123
+ break;
124
+ }
102
125
 
103
126
  const iterDurationMs = Date.now() - iterStart;
104
127
  totalInputTokens += response.usage.inputTokens;
@@ -154,7 +177,7 @@ try {
154
177
  process.stderr.write(`[react] Tool error (${consecutiveErrors}): ${err.message}\n`);
155
178
 
156
179
  // Bail if too many consecutive errors
157
- if (consecutiveErrors >= 5) {
180
+ if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
158
181
  process.stderr.write('[react] Too many consecutive tool errors — stopping.\n');
159
182
  exitReason = 'consecutive_errors';
160
183
  break;
@@ -171,7 +194,7 @@ try {
171
194
  durationMs: iterDurationMs,
172
195
  });
173
196
 
174
- if (consecutiveErrors >= 5) break;
197
+ if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) break;
175
198
 
176
199
  // Append tool results to conversation
177
200
  messages = appendToolResults(provider, messages, toolCalls, results);
@@ -209,4 +232,7 @@ try {
209
232
  `${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
210
233
  );
211
234
 
235
+ if (exitReason === 'llm_error') {
236
+ process.exit(1);
237
+ }
212
238
  }
@@ -77,7 +77,16 @@ try {
77
77
  const iterStart = Date.now();
78
78
 
79
79
  log.llmCall(step + 1);
80
- const response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
80
+ let response;
81
+ try {
82
+ response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
83
+ } catch (err) {
84
+ const msg = err?.message ?? String(err);
85
+ log.error('llm_call_failed', { step: step + 1, error: msg });
86
+ process.stderr.write(`[zero-shot] LLM API error: ${msg.slice(0, 500)}\n`);
87
+ exitReason = 'llm_error';
88
+ break;
89
+ }
81
90
 
82
91
  const iterDurationMs = Date.now() - iterStart;
83
92
  totalInputTokens += response.usage.inputTokens;
@@ -169,4 +178,7 @@ try {
169
178
  `${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
170
179
  );
171
180
 
181
+ if (exitReason === 'llm_error') {
182
+ process.exit(1);
183
+ }
172
184
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@archal/cli",
3
- "version": "0.7.10",
3
+ "version": "0.7.11",
4
4
  "description": "Pre-deployment testing for AI agents",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",