npm - @archal/cli - Versions diffs - 0.7.9 → 0.7.10 - Mend

@archal/cli 0.7.9 → 0.7.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/dist/index.js +193 -24
package/harnesses/hardened/agent.mjs +25 -105
package/harnesses/naive/agent.mjs +2 -2
package/harnesses/react/agent.mjs +2 -2
package/harnesses/zero-shot/agent.mjs +2 -2
package/package.json +1 -1

package/dist/index.js (changed)

@@ -233,6 +233,7 @@ function parseCriterionLine(line, index) {
   } else {
     type = inferCriterionType(description);
   }
+  if (!description) return null;
   return {
     id: `criterion-${index + 1}`,
     description,
@@ -333,7 +334,11 @@ ${expectedBehavior}`.toLowerCase();
     github: ["github", "repository", "pull request", "create_issue", "create_pull_request", "merge_pull_request"],
     slack: ["slack", "slack channel", "send_message", "slack message", "direct message"],
     linear: ["linear", "linear ticket", "linear project", "linear cycle"],
-    jira: ["jira", "jira sprint", "jira epic", "jira board"]
+    jira: ["jira", "jira sprint", "jira epic", "jira board"],
+    stripe: ["stripe", "payment", "refund", "subscription", "invoice", "charge"],
+    supabase: ["supabase", "database", "sql query", "database table"],
+    "google-workspace": ["google workspace", "gmail", "google calendar", "google drive", "google docs"],
+    browser: ["browser", "web page", "navigate to", "click on", "web content"]
   };
   for (const [twin, keywords] of Object.entries(twinKeywords)) {
     if (keywords.some((kw) => combined.includes(kw))) {
@@ -425,7 +430,9 @@ function validateScenario(scenario) {
     }
   }
   if (scenario.config.twins.length === 0) {
-    errors.push("Scenario does not reference any known twins (specify in Config section or mention services in Setup/Expected Behavior)");
+    errors.push(
+      'Scenario does not reference any known twins. Add a "## Config" section with "twins: github" (or slack, linear, jira, stripe, supabase, google-workspace, browser). Alternatively, mention the service name in ## Setup or ## Expected Behavior.'
+    );
   }
   if (scenario.config.timeout <= 0) {
     errors.push("Timeout must be a positive number");
@@ -3072,7 +3079,7 @@ async function callLlmViaArchal(options) {
   debug("Archal backend response", { model: actualModel, remaining: String(result.data.remaining ?? "unknown") });
   const isSeedGen = options.intent === "seed-generate";
   if (!modelMismatchWarned && !isSeedGen && options.model && actualModel && !actualModel.includes(options.model) && !options.model.includes(actualModel)) {
-    warn(`Requested model "${options.model}" but Archal backend used "${actualModel}". To use a specific model, set provider to "direct" with your own API key.`);
+    debug(`Archal backend used "${actualModel}" (requested "${options.model}"). To use a specific model, set provider to "direct" with your own API key.`);
     modelMismatchWarned = true;
   }
   return result.data.text;
@@ -4193,8 +4200,35 @@ function filterByPredicate(items, predicate) {
   if (knownMatches.length > 0) {
     return { items: knownMatches, recognized: true };
   }
+  const ACTION_VERBS = /* @__PURE__ */ new Set([
+    "listed",
+    "fetched",
+    "retrieved",
+    "found",
+    "searched",
+    "queried",
+    "posted",
+    "sent",
+    "received",
+    "notified",
+    "alerted",
+    "reviewed",
+    "analyzed",
+    "inspected",
+    "checked",
+    "verified",
+    "triaged",
+    "escalated",
+    "assigned",
+    "tagged",
+    "labeled",
+    "updated",
+    "edited",
+    "patched",
+    "migrated"
+  ]);
   const isSingleWord = !lowerPredicate.includes(" ");
-  if (isSingleWord) {
+  if (isSingleWord && !ACTION_VERBS.has(lowerPredicate)) {
     const hasKnownField = items.some((item) => {
       if (typeof item !== "object" || item === null) return false;
       const obj = item;
@@ -5466,24 +5500,46 @@ ${JSON.stringify(context.stateDiff, null, 2)}
 ## Agent Trace Evidence
 ${traceEvidence}`;
 }
+function estimateTokens(value) {
+  const json = JSON.stringify(value);
+  return Math.ceil(json.length / 4);
+}
+var MAX_STATE_TOKENS = 4e4;
 function summarizeState(state) {
   const flat = flattenTwinState(state);
   const summary = {};
   for (const [key, value] of Object.entries(flat)) {
     if (Array.isArray(value)) {
-      if (value.length <= 100) {
+      if (value.length <= 50) {
         summary[key] = value;
       } else {
         summary[key] = {
           _count: value.length,
-          _first20: value.slice(0, 20),
-          _last20: value.slice(-20)
+          _first10: value.slice(0, 10),
+          _last10: value.slice(-10)
         };
       }
     } else {
       summary[key] = value;
     }
   }
+  let totalTokens = estimateTokens(summary);
+  if (totalTokens > MAX_STATE_TOKENS) {
+    const collectionSizes = Object.entries(summary).map(([key, value]) => ({ key, tokens: estimateTokens(value) })).sort((a, b) => b.tokens - a.tokens);
+    for (const { key } of collectionSizes) {
+      if (totalTokens <= MAX_STATE_TOKENS) break;
+      const value = summary[key];
+      if (!Array.isArray(value)) continue;
+      const before = estimateTokens(value);
+      summary[key] = {
+        _count: value.length,
+        _first5: value.slice(0, 5),
+        _last5: value.slice(-5),
+        _truncated: "Collection too large for evaluation \u2014 showing subset"
+      };
+      totalTokens -= before - estimateTokens(summary[key]);
+    }
+  }
   return summary;
 }
 function parseJudgeResponse(text) {
@@ -5583,6 +5639,15 @@ async function evaluateWithLlm(criterion, expectedBehavior, stateBefore, stateAf
       };
     }
     const message = err instanceof Error ? err.message : String(err);
+    if (err instanceof LlmApiError && err.status === 400 && message.includes("too long")) {
+      warn(`LLM judge prompt too large for criterion "${criterion.id}" \u2014 twin state may be too large for evaluation`);
+      return {
+        criterionId: criterion.id,
+        status: "fail",
+        confidence: 0,
+        explanation: "LLM evaluation skipped: prompt exceeded model context window. The scenario state is too large for probabilistic evaluation. Consider using deterministic [D] criteria for this scenario."
+      };
+    }
     error(`LLM judge call failed: ${message}`);
     return {
       criterionId: criterion.id,
@@ -8240,7 +8305,8 @@ var RELATIONSHIP_RULES = {
     { sourceCollection: "disputes", sourceField: "paymentIntentId", targetCollection: "paymentIntents", targetField: "paymentIntentId", optional: true }
   ],
   jira: [
-    { sourceCollection: "issues", sourceField: "projectId", targetCollection: "projects", targetField: "id" }
+    { sourceCollection: "issues", sourceField: "projectId", targetCollection: "projects", targetField: "id" },
+    { sourceCollection: "projects", sourceField: "leadAccountId", targetCollection: "users", targetField: "accountId" }
   ],
   linear: [
     { sourceCollection: "issues", sourceField: "teamId", targetCollection: "teams", targetField: "id" },
@@ -8484,15 +8550,17 @@ function autoFillMissingFKs(seed, twinName) {
     const targetEntities = result[rule.targetCollection];
     if (!sourceEntities || !targetEntities || targetEntities.length === 0) continue;
     const targetValues = targetEntities.map((e) => e[rule.targetField]).filter((v) => v !== void 0 && v !== null);
-    if (targetValues.length !== 1) continue;
-    const singleTarget = targetValues[0];
+    if (targetValues.length === 0) continue;
+    let fillIndex = 0;
     for (const entity of sourceEntities) {
       const e = entity;
       if (e[rule.sourceField] === void 0 || e[rule.sourceField] === null) {
-        warn(
-          `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(singleTarget)} (only one ${rule.targetCollection} exists)`
+        const fillValue = targetValues[fillIndex % targetValues.length];
+        fillIndex++;
+        debug(
+          `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})`
         );
-        e[rule.sourceField] = singleTarget;
+        e[rule.sourceField] = fillValue;
       }
     }
   }
@@ -8526,12 +8594,36 @@ function normalizeSeedData(seed, twinName) {
           }
         }
       }
+      const collectionSchema = schema[collection];
+      if (collectionSchema) {
+        for (const [field, fieldDef] of Object.entries(collectionSchema)) {
+          if (!(field in e) || e[field] === null || e[field] === void 0) continue;
+          const expectedType = fieldDef.type.split("|")[0].trim();
+          if (expectedType === "string" && typeof e[field] === "object" && e[field] !== null && !Array.isArray(e[field])) {
+            const obj = e[field];
+            const extracted = obj["login"] ?? obj["name"] ?? obj["value"] ?? obj["key"] ?? obj["id"] ?? obj["displayName"];
+            if (typeof extracted === "string") {
+              debug(`Seed normalization: coerced ${collection}.${field} from object to string "${extracted}"`);
+              e[field] = extracted;
+            } else {
+              const firstStr = Object.values(obj).find((v) => typeof v === "string");
+              if (firstStr) {
+                debug(`Seed normalization: coerced ${collection}.${field} from object to string "${firstStr}" (fallback)`);
+                e[field] = firstStr;
+              } else {
+                debug(`Seed normalization: could not coerce ${collection}.${field} from object to string, removing`);
+                delete e[field];
+              }
+            }
+          }
+        }
+      }
       if (collectionDefaults) {
         for (const [field, defaultValue] of Object.entries(collectionDefaults)) {
           if (!(field in e)) {
             e[field] = structuredClone(defaultValue);
           } else if (e[field] === null && defaultValue !== null) {
-            const fieldDef = schema[collection]?.[field];
+            const fieldDef = collectionSchema?.[field];
             if (fieldDef && !fieldDef.type.includes("null")) {
               e[field] = structuredClone(defaultValue);
             }
@@ -8540,6 +8632,15 @@ function normalizeSeedData(seed, twinName) {
       }
     }
   }
+  if (twinName === "github" && result["repos"]) {
+    for (const entity of result["repos"]) {
+      const e = entity;
+      if ((!e["fullName"] || typeof e["fullName"] !== "string") && typeof e["owner"] === "string" && typeof e["name"] === "string") {
+        e["fullName"] = `${e["owner"]}/${e["name"]}`;
+        debug(`Seed normalization: derived repos.fullName = "${e["fullName"]}"`);
+      }
+    }
+  }
   return result;
 }
@@ -8816,7 +8917,24 @@ var NON_SUBJECT_STARTS = /* @__PURE__ */ new Set([
   "could",
   "would",
   "may",
-  "might"
+  "might",
+  "for",
+  "with",
+  "in",
+  "at",
+  "to",
+  "from",
+  "by",
+  "on",
+  "per",
+  "via",
+  "into",
+  "onto",
+  "over",
+  "under",
+  "after",
+  "before",
+  "during"
 ]);
 function isReasonableCountSubject(subject, expected) {
   if (expected > MAX_REASONABLE_COUNT) return false;
@@ -8827,6 +8945,10 @@ function isReasonableCountSubject(subject, expected) {
   if (/\b(?:have|has|had|were|was|are|is|been|being|do|does|did|can|could|should|will|would|may|might)\b/.test(subject.toLowerCase())) return false;
   return true;
 }
+function appearsToBeClockSuffix(text, numberStart) {
+  const prefix = text.slice(Math.max(0, numberStart - 3), numberStart);
+  return /^\d{1,2}:$/.test(prefix);
+}
 function verifySeedCounts(setupText, seedState) {
   const mismatches = [];
   const flat = flattenTwinState(seedState);
@@ -8834,6 +8956,7 @@ function verifySeedCounts(setupText, seedState) {
   for (const match of setupText.matchAll(countPattern)) {
     const expected = parseInt(match[1], 10);
     const subject = match[2].trim();
+    if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
     if (!subject || expected <= 0) continue;
     if (!isReasonableCountSubject(subject, expected)) continue;
     const resolved = resolveSubjectInState(subject, flat);
@@ -8846,6 +8969,7 @@ function verifySeedCounts(setupText, seedState) {
   for (const match of setupText.matchAll(simplePattern)) {
     const expected = parseInt(match[1], 10);
     const subject = match[2].trim();
+    if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
     if (!subject || expected <= 0 || seenSubjects.has(subject.toLowerCase())) continue;
     if (!isReasonableCountSubject(subject, expected)) continue;
     const resolved = resolveSubjectInState(subject, flat);
@@ -9129,12 +9253,26 @@ Extract the seed blueprint as JSON.`;
     }
     const parsed = parseBlueprint(responseText, twinName);
     if (!parsed) return null;
+    const validCollections = new Set(availableCollections);
+    parsed.collections = parsed.collections.filter((col) => {
+      if (validCollections.has(col.name)) return true;
+      warn(`Blueprint references unknown collection "${col.name}" for ${twinName} \u2014 dropping`);
+      return false;
+    });
     for (const col of parsed.collections) {
       const groupSum = col.groups.reduce((sum, g) => sum + g.count, 0);
       if (groupSum !== col.totalCount) {
         debug(`Blueprint group count mismatch for ${col.name}: groups sum to ${groupSum}, totalCount is ${col.totalCount}. Adjusting.`);
         col.totalCount = groupSum;
       }
+      if (col.totalCount === 0) {
+        debug(`Blueprint collection ${col.name} has 0 entities \u2014 dropping`);
+      }
+    }
+    parsed.collections = parsed.collections.filter((col) => col.totalCount > 0);
+    if (parsed.collections.length === 0 && parsed.identities.length === 0) {
+      warn("Blueprint extracted no valid collections or identities");
+      return null;
     }
     return parsed;
   } catch (err) {
@@ -9356,7 +9494,13 @@ function buildSeedFromBlueprint(blueprint, baseSeed) {
   for (const identity of blueprint.identities) {
     processIdentity(identity, seed, warnings);
   }
+  const baseCollections = new Set(Object.keys(baseSeed));
   for (const spec of blueprint.collections) {
+    if (!baseCollections.has(spec.name) && !seed[spec.name]) {
+      warnings.push(`Blueprint references unknown collection "${spec.name}" \u2014 skipping`);
+      warn(`Blueprint references unknown collection "${spec.name}" for ${blueprint.twin} twin \u2014 skipping`);
+      continue;
+    }
     processCollection(spec, seed, blueprint.twin, existingLabels, warnings, now);
   }
   return { seed, warnings };
@@ -9612,9 +9756,16 @@ function buildSlackEntity(collection, id, props, seed, index, temporal, contentH
     }
     case "messages": {
       const channels = seed["channels"] ?? [];
-      const channelId = channels.length > 0 ? String(channels[index % channels.length]["channel_id"] ?? "C0001AAAA") : "C0001AAAA";
+      const targetChannel = channels.length > 0 ? channels[index % channels.length] : null;
+      const channelId = targetChannel ? String(targetChannel["channel_id"] ?? "C0001AAAA") : "C0001AAAA";
+      const channelMembers = targetChannel ? targetChannel["members"] ?? [] : [];
       const users = seed["users"] ?? [];
-      const userId = users.length > 0 ? String(users[index % users.length]["user_id"] ?? "U0001AAAA") : "U0001AAAA";
+      let userId;
+      if (channelMembers.length > 0) {
+        userId = channelMembers[index % channelMembers.length];
+      } else {
+        userId = users.length > 0 ? String(users[index % users.length]["user_id"] ?? "U0001AAAA") : "U0001AAAA";
+      }
       const baseTs = Math.floor(new Date(temporal.createdAt).getTime() / 1e3);
       const ts = generateSlackTs(baseTs, index);
       return {
@@ -10787,7 +10938,7 @@ Fix these issues:
         validationAttempt: String(validationAttempts + 1)
       });
       const provider = detectProvider(config.model);
-      const apiKey = resolveProviderApiKey(config.apiKey, provider);
+      const apiKey = effectiveMode === "archal" ? "" : resolveProviderApiKey(config.apiKey, provider);
       const responseText = await callLlm({
         provider,
         model: config.model,
@@ -10796,7 +10947,7 @@ Fix these issues:
         userPrompt: promptWithFeedback,
         maxTokens: 16384,
         baseUrl: config.baseUrl,
-        providerMode: config.providerMode,
+        providerMode: effectiveMode,
         intent: "seed-generate",
         responseFormat: "json"
       });
@@ -11897,11 +12048,21 @@ function parseSqlSeed(sql) {
 function loadSeedStateFromPath(seedRoot, seedName) {
   const jsonPath = resolve4(seedRoot, `${seedName}.json`);
   if (existsSync10(jsonPath)) {
-    return JSON.parse(readFileSync12(jsonPath, "utf-8"));
+    try {
+      return JSON.parse(readFileSync12(jsonPath, "utf-8"));
+    } catch (err) {
+      const detail = err instanceof Error ? err.message : String(err);
+      throw new Error(`Failed to parse seed file ${jsonPath}: ${detail}`);
+    }
   }
   const sqlPath = resolve4(seedRoot, `${seedName}.sql`);
   if (existsSync10(sqlPath)) {
-    return parseSqlSeed(readFileSync12(sqlPath, "utf-8"));
+    try {
+      return parseSqlSeed(readFileSync12(sqlPath, "utf-8"));
+    } catch (err) {
+      const detail = err instanceof Error ? err.message : String(err);
+      throw new Error(`Failed to parse seed file ${sqlPath}: ${detail}`);
+    }
   }
   return null;
 }
@@ -12137,7 +12298,9 @@ ${baseTaskMessage}` : baseTaskMessage;
       };
     }
     if (trace.length === 0) {
-      warn(`Agent made no tool calls on run ${runIndex + 1}. The agent may have failed to act \u2014 check agent logs and task prompt.`);
+      warn(
+        `Agent made no tool calls on run ${runIndex + 1}. This usually means the model is too weak for this scenario. Try a more capable model (e.g. --engine-model claude-sonnet-4-6 or --engine-model gemini-2.5-pro). If using a custom agent, check that it correctly processes tool schemas and calls tools.`
+      );
     }
     progress(`Evaluating run ${runIndex + 1}...`);
     const evaluationResult = await evaluateRun(
@@ -12474,8 +12637,14 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
   for (const sel of seedSelections) {
     const mismatches = verifySeedCounts(scenario.setup, sel.seedData);
     if (mismatches.length === 0) continue;
+    const significantMismatches = mismatches.filter((m) => {
+      const delta = Math.abs(m.expected - m.actual);
+      const ratio = m.expected > 0 ? delta / m.expected : delta;
+      return delta > 5 || ratio > 0.5;
+    });
+    if (significantMismatches.length === 0) continue;
     warn(
-      `Seed count mismatch for ${sel.twinName}: ${mismatches.map((m) => `${m.subject}: expected ${m.expected}, got ${m.actual}`).join("; ")}`
+      `Seed count mismatch for ${sel.twinName}: ${significantMismatches.map((m) => `${m.subject}: expected ${m.expected}, got ${m.actual}`).join("; ")}`
     );
   }
   const scenarioDir = dirname2(resolve4(options.scenarioPath));
@@ -12667,7 +12836,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
   printHeader(scenario.title, seedSelections);
   const evaluatorProvider = detectProvider(model);
   const configProvider = detectProvider(config.model);
-  const evaluatorApiKey = options.model && evaluatorProvider !== configProvider ? resolveProviderApiKey("", evaluatorProvider) : resolveProviderApiKey(config.apiKey, evaluatorProvider);
+  const evaluatorApiKey = config.evaluatorProvider === "archal" ? "" : options.model && evaluatorProvider !== configProvider ? resolveProviderApiKey("", evaluatorProvider) : resolveProviderApiKey(config.apiKey, evaluatorProvider);
   const evaluatorConfig = {
     apiKey: evaluatorApiKey,
     model,

package/harnesses/hardened/agent.mjs (changed)

@@ -9,11 +9,9 @@
  *
  * Key features:
  * - Security-focused system prompt emphasizing investigation and refusal
- * - SAFETY.md prompt file injected via loadPromptContext (prepended to task)
  * - Multi-provider support (Gemini, OpenAI, Anthropic) via _lib/providers.mjs
  * - Error recovery with retries on transient failures
  * - Consecutive-error bailout at 5
- * - Temperature 0 for conservative, deterministic behavior
  * - 50 steps max for thorough investigation before acting
  *
  * Env vars (set by archal orchestrator):
@@ -36,13 +34,13 @@ import {
   getStopReason,
   withRetry,
 } from '../_lib/providers.mjs';
-import { collectTwinUrls } from '../_lib/rest-client.mjs';
+import { collectTwinUrls, discoverAllTools, callToolRest } from '../_lib/rest-client.mjs';
 import { createLogger } from '../_lib/logging.mjs';
 import { writeMetrics } from '../_lib/metrics.mjs';
 import { createAgentTrace } from '../_lib/agent-trace.mjs';
 const MAX_STEPS = 50;
-const TASK = process.env['ARCHAL_ENGINE_TASK'];
+const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
 const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
 if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
@@ -54,54 +52,16 @@ const log = createLogger({ harness: 'hardened', model: MODEL, provider });
 // ── Twin REST transport ─────────────────────────────────────────────
-const authHeaders = {};
-if (process.env['ARCHAL_TOKEN']) {
-  authHeaders['Authorization'] = `Bearer ${process.env['ARCHAL_TOKEN']}`;
-}
-const runtimeUserId = process.env['ARCHAL_RUNTIME_USER_ID'] || process.env['archal_runtime_user_id'];
-if (runtimeUserId) {
-  authHeaders['x-archal-user-id'] = runtimeUserId;
-}
-/** Collect twin URLs from ARCHAL_<TWIN>_URL env vars */
 const twinUrls = collectTwinUrls();
 if (Object.keys(twinUrls).length === 0) {
-  process.stderr.write('[hardened] FATAL: No twin URLs found in ARCHAL_*_URL env vars. Cannot proceed.\n');
+  console.error('[hardened] No twin URLs found. Check ARCHAL_TWIN_NAMES and ARCHAL_<TWIN>_URL env vars.');
   process.exit(1);
 }
-process.stderr.write(`[hardened] twin URLs: ${JSON.stringify(twinUrls)}\n`);
-/** Fetch available tools from a twin's REST endpoint */
-async function fetchTools(baseUrl) {
-  try {
-    const res = await fetch(`${baseUrl}/tools`, { headers: authHeaders });
-    if (!res.ok) return [];
-    const data = await res.json();
-    if (!Array.isArray(data)) {
-      process.stderr.write(`[hardened] Expected array from ${baseUrl}/tools, got ${typeof data}\n`);
-      return [];
-    }
-    return data;
-  } catch (err) {
-    process.stderr.write(`[hardened] Failed to fetch tools from ${baseUrl}: ${err.message}\n`);
-    return [];
-  }
-}
-/** Call a tool on a twin via REST */
-async function callToolRest(baseUrl, toolName, args) {
-  const res = await fetch(`${baseUrl}/tools/call`, {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json', ...authHeaders },
-    body: JSON.stringify({ name: toolName, arguments: args }),
-  });
-  const body = await res.text();
-  if (!res.ok) {
-    throw new Error(`Tool call ${toolName} failed (HTTP ${res.status}): ${body}`);
-  }
-  // Return as text string for provider message formatting
-  return body;
+const { tools: allTools, toolToTwin } = await discoverAllTools(twinUrls);
+if (allTools.length === 0) {
+  console.error('[hardened] No tools discovered from twins. Twin endpoints may be unreachable.');
+  process.exit(1);
 }
 // ── System prompt ───────────────────────────────────────────────────
@@ -123,37 +83,9 @@ APPROACH:
 4. If the request is safe and properly authorized, proceed methodically.
 5. Summarize what you did and why.`;
-// ── Discover tools from all twins ───────────────────────────────────
-const allTools = [];
-const toolToTwin = {};
-for (const [twinName, baseUrl] of Object.entries(twinUrls)) {
-  const tools = await fetchTools(baseUrl);
-  process.stderr.write(`[hardened] ${twinName}: ${tools.length} tools\n`);
-  for (const tool of tools) {
-    // Namespace tool names to match MCP convention used by evaluator
-    const namespacedName = `mcp__${twinName}__${tool.name}`;
-    allTools.push({
-      name: namespacedName,
-      description: tool.description || '',
-      inputSchema: tool.inputSchema || { type: 'object', properties: {} },
-    });
-    toolToTwin[namespacedName] = { twinName, baseUrl, originalName: tool.name };
-  }
-}
-process.stderr.write(`[hardened] Total tools: ${allTools.length}\n`);
-if (allTools.length === 0) {
-  process.stderr.write('[hardened] FATAL: No tools discovered from twins. Twin endpoints may be unreachable.\n');
-  process.exit(1);
-}
-const providerTools = formatToolsForProvider(provider, allTools);
 // ── Main loop ───────────────────────────────────────────────────────
+const providerTools = formatToolsForProvider(provider, allTools);
 let messages = buildInitialMessages(provider, SYSTEM_PROMPT, TASK, MODEL);
 let consecutiveErrors = 0;
@@ -211,45 +143,33 @@ try {
       break;
     }
-    // Execute each tool call via REST
+    // Execute each tool call via shared REST client
     const results = [];
     for (const tc of toolCalls) {
       const toolStart = Date.now();
       process.stderr.write(`[hardened] Step ${step + 1}: ${tc.name}(${JSON.stringify(tc.arguments).slice(0, 100)})\n`);
-      const mapping = toolToTwin[tc.name];
-      if (!mapping) {
-        const errorMsg = `Error: Unknown tool "${tc.name}"`;
+      try {
+        const result = await callToolRest(toolToTwin, tc.name, tc.arguments);
+        results.push(result);
+        consecutiveErrors = 0;
+        totalToolCalls++;
+        log.toolCall(step + 1, tc.name, tc.arguments, Date.now() - toolStart);
+      } catch (err) {
+        const errorMsg = `Error: ${err.message}`;
         results.push(errorMsg);
         consecutiveErrors++;
         totalToolCalls++;
         totalToolErrors++;
-        log.toolError(step + 1, tc.name, `Unknown tool`);
-        process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): Unknown tool ${tc.name}\n`);
-      } else {
-        try {
-          const result = await callToolRest(mapping.baseUrl, mapping.originalName, tc.arguments);
-          results.push(result);
-          consecutiveErrors = 0;
-          totalToolCalls++;
-          log.toolCall(step + 1, tc.name, tc.arguments, Date.now() - toolStart);
-        } catch (err) {
-          const errorMsg = `Error: ${err.message}`;
-          results.push(errorMsg);
-          consecutiveErrors++;
-          totalToolCalls++;
-          totalToolErrors++;
-          log.toolError(step + 1, tc.name, err.message);
-          process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): ${err.message}\n`);
+        log.toolError(step + 1, tc.name, err.message);
+        process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): ${err.message}\n`);
+        // Bail if too many consecutive errors
+        if (consecutiveErrors >= 5) {
+          process.stderr.write('[hardened] Too many consecutive tool errors — stopping.\n');
+          exitReason = 'consecutive_errors';
+          break;
         }
       }
-      // Bail if too many consecutive errors
-      if (consecutiveErrors >= 5) {
-        process.stderr.write('[hardened] Too many consecutive tool errors — stopping.\n');
-        exitReason = 'consecutive_errors';
-        break;
-      }
     }
     // Record thinking trace for this step (before bailout check so the final step is captured)

package/harnesses/naive/agent.mjs (changed)

@@ -33,10 +33,10 @@ import { createLogger } from '../_lib/logging.mjs';
 import { writeMetrics } from '../_lib/metrics.mjs';
 const MAX_STEPS = 20;
-const TASK = process.env['ARCHAL_ENGINE_TASK'];
+const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
 const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
-if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
+if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
 if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
 // Warn when used outside demo context

package/harnesses/react/agent.mjs (changed)

@@ -35,10 +35,10 @@ import { writeMetrics } from '../_lib/metrics.mjs';
 import { createAgentTrace } from '../_lib/agent-trace.mjs';
 const MAX_STEPS = 50;
-const TASK = process.env['ARCHAL_ENGINE_TASK'];
+const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
 const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
-if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
+if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
 if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
 const provider = detectProvider(MODEL);

package/harnesses/zero-shot/agent.mjs (changed)

@@ -32,10 +32,10 @@ import { writeMetrics } from '../_lib/metrics.mjs';
 import { createAgentTrace } from '../_lib/agent-trace.mjs';
 const MAX_STEPS = 40;
-const TASK = process.env['ARCHAL_ENGINE_TASK'];
+const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
 const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
-if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
+if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
 if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
 const provider = detectProvider(MODEL);

package/package.json (changed)

@@ -1,6 +1,6 @@
 {
   "name": "@archal/cli",
-  "version": "0.7.9",
+  "version": "0.7.10",
   "description": "Pre-deployment testing for AI agents",
   "type": "module",
   "main": "dist/index.js",