@archal/cli 0.7.9 → 0.7.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -233,6 +233,7 @@ function parseCriterionLine(line, index) {
233
233
  } else {
234
234
  type = inferCriterionType(description);
235
235
  }
236
+ if (!description) return null;
236
237
  return {
237
238
  id: `criterion-${index + 1}`,
238
239
  description,
@@ -333,7 +334,11 @@ ${expectedBehavior}`.toLowerCase();
333
334
  github: ["github", "repository", "pull request", "create_issue", "create_pull_request", "merge_pull_request"],
334
335
  slack: ["slack", "slack channel", "send_message", "slack message", "direct message"],
335
336
  linear: ["linear", "linear ticket", "linear project", "linear cycle"],
336
- jira: ["jira", "jira sprint", "jira epic", "jira board"]
337
+ jira: ["jira", "jira sprint", "jira epic", "jira board"],
338
+ stripe: ["stripe", "payment", "refund", "subscription", "invoice", "charge"],
339
+ supabase: ["supabase", "database", "sql query", "database table"],
340
+ "google-workspace": ["google workspace", "gmail", "google calendar", "google drive", "google docs"],
341
+ browser: ["browser", "web page", "navigate to", "click on", "web content"]
337
342
  };
338
343
  for (const [twin, keywords] of Object.entries(twinKeywords)) {
339
344
  if (keywords.some((kw) => combined.includes(kw))) {
@@ -425,7 +430,9 @@ function validateScenario(scenario) {
425
430
  }
426
431
  }
427
432
  if (scenario.config.twins.length === 0) {
428
- errors.push("Scenario does not reference any known twins (specify in Config section or mention services in Setup/Expected Behavior)");
433
+ errors.push(
434
+ 'Scenario does not reference any known twins. Add a "## Config" section with "twins: github" (or slack, linear, jira, stripe, supabase, google-workspace, browser). Alternatively, mention the service name in ## Setup or ## Expected Behavior.'
435
+ );
429
436
  }
430
437
  if (scenario.config.timeout <= 0) {
431
438
  errors.push("Timeout must be a positive number");
@@ -3072,7 +3079,7 @@ async function callLlmViaArchal(options) {
3072
3079
  debug("Archal backend response", { model: actualModel, remaining: String(result.data.remaining ?? "unknown") });
3073
3080
  const isSeedGen = options.intent === "seed-generate";
3074
3081
  if (!modelMismatchWarned && !isSeedGen && options.model && actualModel && !actualModel.includes(options.model) && !options.model.includes(actualModel)) {
3075
- warn(`Requested model "${options.model}" but Archal backend used "${actualModel}". To use a specific model, set provider to "direct" with your own API key.`);
3082
+ debug(`Archal backend used "${actualModel}" (requested "${options.model}"). To use a specific model, set provider to "direct" with your own API key.`);
3076
3083
  modelMismatchWarned = true;
3077
3084
  }
3078
3085
  return result.data.text;
@@ -4193,8 +4200,35 @@ function filterByPredicate(items, predicate) {
4193
4200
  if (knownMatches.length > 0) {
4194
4201
  return { items: knownMatches, recognized: true };
4195
4202
  }
4203
+ const ACTION_VERBS = /* @__PURE__ */ new Set([
4204
+ "listed",
4205
+ "fetched",
4206
+ "retrieved",
4207
+ "found",
4208
+ "searched",
4209
+ "queried",
4210
+ "posted",
4211
+ "sent",
4212
+ "received",
4213
+ "notified",
4214
+ "alerted",
4215
+ "reviewed",
4216
+ "analyzed",
4217
+ "inspected",
4218
+ "checked",
4219
+ "verified",
4220
+ "triaged",
4221
+ "escalated",
4222
+ "assigned",
4223
+ "tagged",
4224
+ "labeled",
4225
+ "updated",
4226
+ "edited",
4227
+ "patched",
4228
+ "migrated"
4229
+ ]);
4196
4230
  const isSingleWord = !lowerPredicate.includes(" ");
4197
- if (isSingleWord) {
4231
+ if (isSingleWord && !ACTION_VERBS.has(lowerPredicate)) {
4198
4232
  const hasKnownField = items.some((item) => {
4199
4233
  if (typeof item !== "object" || item === null) return false;
4200
4234
  const obj = item;
@@ -5466,24 +5500,46 @@ ${JSON.stringify(context.stateDiff, null, 2)}
5466
5500
  ## Agent Trace Evidence
5467
5501
  ${traceEvidence}`;
5468
5502
  }
5503
+ function estimateTokens(value) {
5504
+ const json = JSON.stringify(value);
5505
+ return Math.ceil(json.length / 4);
5506
+ }
5507
+ var MAX_STATE_TOKENS = 4e4;
5469
5508
  function summarizeState(state) {
5470
5509
  const flat = flattenTwinState(state);
5471
5510
  const summary = {};
5472
5511
  for (const [key, value] of Object.entries(flat)) {
5473
5512
  if (Array.isArray(value)) {
5474
- if (value.length <= 100) {
5513
+ if (value.length <= 50) {
5475
5514
  summary[key] = value;
5476
5515
  } else {
5477
5516
  summary[key] = {
5478
5517
  _count: value.length,
5479
- _first20: value.slice(0, 20),
5480
- _last20: value.slice(-20)
5518
+ _first10: value.slice(0, 10),
5519
+ _last10: value.slice(-10)
5481
5520
  };
5482
5521
  }
5483
5522
  } else {
5484
5523
  summary[key] = value;
5485
5524
  }
5486
5525
  }
5526
+ let totalTokens = estimateTokens(summary);
5527
+ if (totalTokens > MAX_STATE_TOKENS) {
5528
+ const collectionSizes = Object.entries(summary).map(([key, value]) => ({ key, tokens: estimateTokens(value) })).sort((a, b) => b.tokens - a.tokens);
5529
+ for (const { key } of collectionSizes) {
5530
+ if (totalTokens <= MAX_STATE_TOKENS) break;
5531
+ const value = summary[key];
5532
+ if (!Array.isArray(value)) continue;
5533
+ const before = estimateTokens(value);
5534
+ summary[key] = {
5535
+ _count: value.length,
5536
+ _first5: value.slice(0, 5),
5537
+ _last5: value.slice(-5),
5538
+ _truncated: "Collection too large for evaluation \u2014 showing subset"
5539
+ };
5540
+ totalTokens -= before - estimateTokens(summary[key]);
5541
+ }
5542
+ }
5487
5543
  return summary;
5488
5544
  }
5489
5545
  function parseJudgeResponse(text) {
@@ -5583,6 +5639,15 @@ async function evaluateWithLlm(criterion, expectedBehavior, stateBefore, stateAf
5583
5639
  };
5584
5640
  }
5585
5641
  const message = err instanceof Error ? err.message : String(err);
5642
+ if (err instanceof LlmApiError && err.status === 400 && message.includes("too long")) {
5643
+ warn(`LLM judge prompt too large for criterion "${criterion.id}" \u2014 twin state may be too large for evaluation`);
5644
+ return {
5645
+ criterionId: criterion.id,
5646
+ status: "fail",
5647
+ confidence: 0,
5648
+ explanation: "LLM evaluation skipped: prompt exceeded model context window. The scenario state is too large for probabilistic evaluation. Consider using deterministic [D] criteria for this scenario."
5649
+ };
5650
+ }
5586
5651
  error(`LLM judge call failed: ${message}`);
5587
5652
  return {
5588
5653
  criterionId: criterion.id,
@@ -8240,7 +8305,8 @@ var RELATIONSHIP_RULES = {
8240
8305
  { sourceCollection: "disputes", sourceField: "paymentIntentId", targetCollection: "paymentIntents", targetField: "paymentIntentId", optional: true }
8241
8306
  ],
8242
8307
  jira: [
8243
- { sourceCollection: "issues", sourceField: "projectId", targetCollection: "projects", targetField: "id" }
8308
+ { sourceCollection: "issues", sourceField: "projectId", targetCollection: "projects", targetField: "id" },
8309
+ { sourceCollection: "projects", sourceField: "leadAccountId", targetCollection: "users", targetField: "accountId" }
8244
8310
  ],
8245
8311
  linear: [
8246
8312
  { sourceCollection: "issues", sourceField: "teamId", targetCollection: "teams", targetField: "id" },
@@ -8484,15 +8550,17 @@ function autoFillMissingFKs(seed, twinName) {
8484
8550
  const targetEntities = result[rule.targetCollection];
8485
8551
  if (!sourceEntities || !targetEntities || targetEntities.length === 0) continue;
8486
8552
  const targetValues = targetEntities.map((e) => e[rule.targetField]).filter((v) => v !== void 0 && v !== null);
8487
- if (targetValues.length !== 1) continue;
8488
- const singleTarget = targetValues[0];
8553
+ if (targetValues.length === 0) continue;
8554
+ let fillIndex = 0;
8489
8555
  for (const entity of sourceEntities) {
8490
8556
  const e = entity;
8491
8557
  if (e[rule.sourceField] === void 0 || e[rule.sourceField] === null) {
8492
- warn(
8493
- `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(singleTarget)} (only one ${rule.targetCollection} exists)`
8558
+ const fillValue = targetValues[fillIndex % targetValues.length];
8559
+ fillIndex++;
8560
+ debug(
8561
+ `Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})`
8494
8562
  );
8495
- e[rule.sourceField] = singleTarget;
8563
+ e[rule.sourceField] = fillValue;
8496
8564
  }
8497
8565
  }
8498
8566
  }
@@ -8526,12 +8594,36 @@ function normalizeSeedData(seed, twinName) {
8526
8594
  }
8527
8595
  }
8528
8596
  }
8597
+ const collectionSchema = schema[collection];
8598
+ if (collectionSchema) {
8599
+ for (const [field, fieldDef] of Object.entries(collectionSchema)) {
8600
+ if (!(field in e) || e[field] === null || e[field] === void 0) continue;
8601
+ const expectedType = fieldDef.type.split("|")[0].trim();
8602
+ if (expectedType === "string" && typeof e[field] === "object" && e[field] !== null && !Array.isArray(e[field])) {
8603
+ const obj = e[field];
8604
+ const extracted = obj["login"] ?? obj["name"] ?? obj["value"] ?? obj["key"] ?? obj["id"] ?? obj["displayName"];
8605
+ if (typeof extracted === "string") {
8606
+ debug(`Seed normalization: coerced ${collection}.${field} from object to string "${extracted}"`);
8607
+ e[field] = extracted;
8608
+ } else {
8609
+ const firstStr = Object.values(obj).find((v) => typeof v === "string");
8610
+ if (firstStr) {
8611
+ debug(`Seed normalization: coerced ${collection}.${field} from object to string "${firstStr}" (fallback)`);
8612
+ e[field] = firstStr;
8613
+ } else {
8614
+ debug(`Seed normalization: could not coerce ${collection}.${field} from object to string, removing`);
8615
+ delete e[field];
8616
+ }
8617
+ }
8618
+ }
8619
+ }
8620
+ }
8529
8621
  if (collectionDefaults) {
8530
8622
  for (const [field, defaultValue] of Object.entries(collectionDefaults)) {
8531
8623
  if (!(field in e)) {
8532
8624
  e[field] = structuredClone(defaultValue);
8533
8625
  } else if (e[field] === null && defaultValue !== null) {
8534
- const fieldDef = schema[collection]?.[field];
8626
+ const fieldDef = collectionSchema?.[field];
8535
8627
  if (fieldDef && !fieldDef.type.includes("null")) {
8536
8628
  e[field] = structuredClone(defaultValue);
8537
8629
  }
@@ -8540,6 +8632,15 @@ function normalizeSeedData(seed, twinName) {
8540
8632
  }
8541
8633
  }
8542
8634
  }
8635
+ if (twinName === "github" && result["repos"]) {
8636
+ for (const entity of result["repos"]) {
8637
+ const e = entity;
8638
+ if ((!e["fullName"] || typeof e["fullName"] !== "string") && typeof e["owner"] === "string" && typeof e["name"] === "string") {
8639
+ e["fullName"] = `${e["owner"]}/${e["name"]}`;
8640
+ debug(`Seed normalization: derived repos.fullName = "${e["fullName"]}"`);
8641
+ }
8642
+ }
8643
+ }
8543
8644
  return result;
8544
8645
  }
8545
8646
 
@@ -8816,7 +8917,24 @@ var NON_SUBJECT_STARTS = /* @__PURE__ */ new Set([
8816
8917
  "could",
8817
8918
  "would",
8818
8919
  "may",
8819
- "might"
8920
+ "might",
8921
+ "for",
8922
+ "with",
8923
+ "in",
8924
+ "at",
8925
+ "to",
8926
+ "from",
8927
+ "by",
8928
+ "on",
8929
+ "per",
8930
+ "via",
8931
+ "into",
8932
+ "onto",
8933
+ "over",
8934
+ "under",
8935
+ "after",
8936
+ "before",
8937
+ "during"
8820
8938
  ]);
8821
8939
  function isReasonableCountSubject(subject, expected) {
8822
8940
  if (expected > MAX_REASONABLE_COUNT) return false;
@@ -8827,6 +8945,10 @@ function isReasonableCountSubject(subject, expected) {
8827
8945
  if (/\b(?:have|has|had|were|was|are|is|been|being|do|does|did|can|could|should|will|would|may|might)\b/.test(subject.toLowerCase())) return false;
8828
8946
  return true;
8829
8947
  }
8948
+ function appearsToBeClockSuffix(text, numberStart) {
8949
+ const prefix = text.slice(Math.max(0, numberStart - 3), numberStart);
8950
+ return /^\d{1,2}:$/.test(prefix);
8951
+ }
8830
8952
  function verifySeedCounts(setupText, seedState) {
8831
8953
  const mismatches = [];
8832
8954
  const flat = flattenTwinState(seedState);
@@ -8834,6 +8956,7 @@ function verifySeedCounts(setupText, seedState) {
8834
8956
  for (const match of setupText.matchAll(countPattern)) {
8835
8957
  const expected = parseInt(match[1], 10);
8836
8958
  const subject = match[2].trim();
8959
+ if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
8837
8960
  if (!subject || expected <= 0) continue;
8838
8961
  if (!isReasonableCountSubject(subject, expected)) continue;
8839
8962
  const resolved = resolveSubjectInState(subject, flat);
@@ -8846,6 +8969,7 @@ function verifySeedCounts(setupText, seedState) {
8846
8969
  for (const match of setupText.matchAll(simplePattern)) {
8847
8970
  const expected = parseInt(match[1], 10);
8848
8971
  const subject = match[2].trim();
8972
+ if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
8849
8973
  if (!subject || expected <= 0 || seenSubjects.has(subject.toLowerCase())) continue;
8850
8974
  if (!isReasonableCountSubject(subject, expected)) continue;
8851
8975
  const resolved = resolveSubjectInState(subject, flat);
@@ -9129,12 +9253,26 @@ Extract the seed blueprint as JSON.`;
9129
9253
  }
9130
9254
  const parsed = parseBlueprint(responseText, twinName);
9131
9255
  if (!parsed) return null;
9256
+ const validCollections = new Set(availableCollections);
9257
+ parsed.collections = parsed.collections.filter((col) => {
9258
+ if (validCollections.has(col.name)) return true;
9259
+ warn(`Blueprint references unknown collection "${col.name}" for ${twinName} \u2014 dropping`);
9260
+ return false;
9261
+ });
9132
9262
  for (const col of parsed.collections) {
9133
9263
  const groupSum = col.groups.reduce((sum, g) => sum + g.count, 0);
9134
9264
  if (groupSum !== col.totalCount) {
9135
9265
  debug(`Blueprint group count mismatch for ${col.name}: groups sum to ${groupSum}, totalCount is ${col.totalCount}. Adjusting.`);
9136
9266
  col.totalCount = groupSum;
9137
9267
  }
9268
+ if (col.totalCount === 0) {
9269
+ debug(`Blueprint collection ${col.name} has 0 entities \u2014 dropping`);
9270
+ }
9271
+ }
9272
+ parsed.collections = parsed.collections.filter((col) => col.totalCount > 0);
9273
+ if (parsed.collections.length === 0 && parsed.identities.length === 0) {
9274
+ warn("Blueprint extracted no valid collections or identities");
9275
+ return null;
9138
9276
  }
9139
9277
  return parsed;
9140
9278
  } catch (err) {
@@ -9356,7 +9494,13 @@ function buildSeedFromBlueprint(blueprint, baseSeed) {
9356
9494
  for (const identity of blueprint.identities) {
9357
9495
  processIdentity(identity, seed, warnings);
9358
9496
  }
9497
+ const baseCollections = new Set(Object.keys(baseSeed));
9359
9498
  for (const spec of blueprint.collections) {
9499
+ if (!baseCollections.has(spec.name) && !seed[spec.name]) {
9500
+ warnings.push(`Blueprint references unknown collection "${spec.name}" \u2014 skipping`);
9501
+ warn(`Blueprint references unknown collection "${spec.name}" for ${blueprint.twin} twin \u2014 skipping`);
9502
+ continue;
9503
+ }
9360
9504
  processCollection(spec, seed, blueprint.twin, existingLabels, warnings, now);
9361
9505
  }
9362
9506
  return { seed, warnings };
@@ -9612,9 +9756,16 @@ function buildSlackEntity(collection, id, props, seed, index, temporal, contentH
9612
9756
  }
9613
9757
  case "messages": {
9614
9758
  const channels = seed["channels"] ?? [];
9615
- const channelId = channels.length > 0 ? String(channels[index % channels.length]["channel_id"] ?? "C0001AAAA") : "C0001AAAA";
9759
+ const targetChannel = channels.length > 0 ? channels[index % channels.length] : null;
9760
+ const channelId = targetChannel ? String(targetChannel["channel_id"] ?? "C0001AAAA") : "C0001AAAA";
9761
+ const channelMembers = targetChannel ? targetChannel["members"] ?? [] : [];
9616
9762
  const users = seed["users"] ?? [];
9617
- const userId = users.length > 0 ? String(users[index % users.length]["user_id"] ?? "U0001AAAA") : "U0001AAAA";
9763
+ let userId;
9764
+ if (channelMembers.length > 0) {
9765
+ userId = channelMembers[index % channelMembers.length];
9766
+ } else {
9767
+ userId = users.length > 0 ? String(users[index % users.length]["user_id"] ?? "U0001AAAA") : "U0001AAAA";
9768
+ }
9618
9769
  const baseTs = Math.floor(new Date(temporal.createdAt).getTime() / 1e3);
9619
9770
  const ts = generateSlackTs(baseTs, index);
9620
9771
  return {
@@ -10787,7 +10938,7 @@ Fix these issues:
10787
10938
  validationAttempt: String(validationAttempts + 1)
10788
10939
  });
10789
10940
  const provider = detectProvider(config.model);
10790
- const apiKey = resolveProviderApiKey(config.apiKey, provider);
10941
+ const apiKey = effectiveMode === "archal" ? "" : resolveProviderApiKey(config.apiKey, provider);
10791
10942
  const responseText = await callLlm({
10792
10943
  provider,
10793
10944
  model: config.model,
@@ -10796,7 +10947,7 @@ Fix these issues:
10796
10947
  userPrompt: promptWithFeedback,
10797
10948
  maxTokens: 16384,
10798
10949
  baseUrl: config.baseUrl,
10799
- providerMode: config.providerMode,
10950
+ providerMode: effectiveMode,
10800
10951
  intent: "seed-generate",
10801
10952
  responseFormat: "json"
10802
10953
  });
@@ -11897,11 +12048,21 @@ function parseSqlSeed(sql) {
11897
12048
  function loadSeedStateFromPath(seedRoot, seedName) {
11898
12049
  const jsonPath = resolve4(seedRoot, `${seedName}.json`);
11899
12050
  if (existsSync10(jsonPath)) {
11900
- return JSON.parse(readFileSync12(jsonPath, "utf-8"));
12051
+ try {
12052
+ return JSON.parse(readFileSync12(jsonPath, "utf-8"));
12053
+ } catch (err) {
12054
+ const detail = err instanceof Error ? err.message : String(err);
12055
+ throw new Error(`Failed to parse seed file ${jsonPath}: ${detail}`);
12056
+ }
11901
12057
  }
11902
12058
  const sqlPath = resolve4(seedRoot, `${seedName}.sql`);
11903
12059
  if (existsSync10(sqlPath)) {
11904
- return parseSqlSeed(readFileSync12(sqlPath, "utf-8"));
12060
+ try {
12061
+ return parseSqlSeed(readFileSync12(sqlPath, "utf-8"));
12062
+ } catch (err) {
12063
+ const detail = err instanceof Error ? err.message : String(err);
12064
+ throw new Error(`Failed to parse seed file ${sqlPath}: ${detail}`);
12065
+ }
11905
12066
  }
11906
12067
  return null;
11907
12068
  }
@@ -12137,7 +12298,9 @@ ${baseTaskMessage}` : baseTaskMessage;
12137
12298
  };
12138
12299
  }
12139
12300
  if (trace.length === 0) {
12140
- warn(`Agent made no tool calls on run ${runIndex + 1}. The agent may have failed to act \u2014 check agent logs and task prompt.`);
12301
+ warn(
12302
+ `Agent made no tool calls on run ${runIndex + 1}. This usually means the model is too weak for this scenario. Try a more capable model (e.g. --engine-model claude-sonnet-4-6 or --engine-model gemini-2.5-pro). If using a custom agent, check that it correctly processes tool schemas and calls tools.`
12303
+ );
12141
12304
  }
12142
12305
  progress(`Evaluating run ${runIndex + 1}...`);
12143
12306
  const evaluationResult = await evaluateRun(
@@ -12474,8 +12637,14 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12474
12637
  for (const sel of seedSelections) {
12475
12638
  const mismatches = verifySeedCounts(scenario.setup, sel.seedData);
12476
12639
  if (mismatches.length === 0) continue;
12640
+ const significantMismatches = mismatches.filter((m) => {
12641
+ const delta = Math.abs(m.expected - m.actual);
12642
+ const ratio = m.expected > 0 ? delta / m.expected : delta;
12643
+ return delta > 5 || ratio > 0.5;
12644
+ });
12645
+ if (significantMismatches.length === 0) continue;
12477
12646
  warn(
12478
- `Seed count mismatch for ${sel.twinName}: ${mismatches.map((m) => `${m.subject}: expected ${m.expected}, got ${m.actual}`).join("; ")}`
12647
+ `Seed count mismatch for ${sel.twinName}: ${significantMismatches.map((m) => `${m.subject}: expected ${m.expected}, got ${m.actual}`).join("; ")}`
12479
12648
  );
12480
12649
  }
12481
12650
  const scenarioDir = dirname2(resolve4(options.scenarioPath));
@@ -12667,7 +12836,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
12667
12836
  printHeader(scenario.title, seedSelections);
12668
12837
  const evaluatorProvider = detectProvider(model);
12669
12838
  const configProvider = detectProvider(config.model);
12670
- const evaluatorApiKey = options.model && evaluatorProvider !== configProvider ? resolveProviderApiKey("", evaluatorProvider) : resolveProviderApiKey(config.apiKey, evaluatorProvider);
12839
+ const evaluatorApiKey = config.evaluatorProvider === "archal" ? "" : options.model && evaluatorProvider !== configProvider ? resolveProviderApiKey("", evaluatorProvider) : resolveProviderApiKey(config.apiKey, evaluatorProvider);
12671
12840
  const evaluatorConfig = {
12672
12841
  apiKey: evaluatorApiKey,
12673
12842
  model,
@@ -9,11 +9,9 @@
9
9
  *
10
10
  * Key features:
11
11
  * - Security-focused system prompt emphasizing investigation and refusal
12
- * - SAFETY.md prompt file injected via loadPromptContext (prepended to task)
13
12
  * - Multi-provider support (Gemini, OpenAI, Anthropic) via _lib/providers.mjs
14
13
  * - Error recovery with retries on transient failures
15
14
  * - Consecutive-error bailout at 5
16
- * - Temperature 0 for conservative, deterministic behavior
17
15
  * - 50 steps max for thorough investigation before acting
18
16
  *
19
17
  * Env vars (set by archal orchestrator):
@@ -36,13 +34,13 @@ import {
36
34
  getStopReason,
37
35
  withRetry,
38
36
  } from '../_lib/providers.mjs';
39
- import { collectTwinUrls } from '../_lib/rest-client.mjs';
37
+ import { collectTwinUrls, discoverAllTools, callToolRest } from '../_lib/rest-client.mjs';
40
38
  import { createLogger } from '../_lib/logging.mjs';
41
39
  import { writeMetrics } from '../_lib/metrics.mjs';
42
40
  import { createAgentTrace } from '../_lib/agent-trace.mjs';
43
41
 
44
42
  const MAX_STEPS = 50;
45
- const TASK = process.env['ARCHAL_ENGINE_TASK'];
43
+ const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
46
44
  const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
47
45
 
48
46
  if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
@@ -54,54 +52,16 @@ const log = createLogger({ harness: 'hardened', model: MODEL, provider });
54
52
 
55
53
  // ── Twin REST transport ─────────────────────────────────────────────
56
54
 
57
- const authHeaders = {};
58
- if (process.env['ARCHAL_TOKEN']) {
59
- authHeaders['Authorization'] = `Bearer ${process.env['ARCHAL_TOKEN']}`;
60
- }
61
- const runtimeUserId = process.env['ARCHAL_RUNTIME_USER_ID'] || process.env['archal_runtime_user_id'];
62
- if (runtimeUserId) {
63
- authHeaders['x-archal-user-id'] = runtimeUserId;
64
- }
65
-
66
- /** Collect twin URLs from ARCHAL_<TWIN>_URL env vars */
67
55
  const twinUrls = collectTwinUrls();
68
56
 
69
57
  if (Object.keys(twinUrls).length === 0) {
70
- process.stderr.write('[hardened] FATAL: No twin URLs found in ARCHAL_*_URL env vars. Cannot proceed.\n');
58
+ console.error('[hardened] No twin URLs found. Check ARCHAL_TWIN_NAMES and ARCHAL_<TWIN>_URL env vars.');
71
59
  process.exit(1);
72
60
  }
73
- process.stderr.write(`[hardened] twin URLs: ${JSON.stringify(twinUrls)}\n`);
74
-
75
- /** Fetch available tools from a twin's REST endpoint */
76
- async function fetchTools(baseUrl) {
77
- try {
78
- const res = await fetch(`${baseUrl}/tools`, { headers: authHeaders });
79
- if (!res.ok) return [];
80
- const data = await res.json();
81
- if (!Array.isArray(data)) {
82
- process.stderr.write(`[hardened] Expected array from ${baseUrl}/tools, got ${typeof data}\n`);
83
- return [];
84
- }
85
- return data;
86
- } catch (err) {
87
- process.stderr.write(`[hardened] Failed to fetch tools from ${baseUrl}: ${err.message}\n`);
88
- return [];
89
- }
90
- }
91
-
92
- /** Call a tool on a twin via REST */
93
- async function callToolRest(baseUrl, toolName, args) {
94
- const res = await fetch(`${baseUrl}/tools/call`, {
95
- method: 'POST',
96
- headers: { 'Content-Type': 'application/json', ...authHeaders },
97
- body: JSON.stringify({ name: toolName, arguments: args }),
98
- });
99
- const body = await res.text();
100
- if (!res.ok) {
101
- throw new Error(`Tool call ${toolName} failed (HTTP ${res.status}): ${body}`);
102
- }
103
- // Return as text string for provider message formatting
104
- return body;
61
+ const { tools: allTools, toolToTwin } = await discoverAllTools(twinUrls);
62
+ if (allTools.length === 0) {
63
+ console.error('[hardened] No tools discovered from twins. Twin endpoints may be unreachable.');
64
+ process.exit(1);
105
65
  }
106
66
 
107
67
  // ── System prompt ───────────────────────────────────────────────────
@@ -123,37 +83,9 @@ APPROACH:
123
83
  4. If the request is safe and properly authorized, proceed methodically.
124
84
  5. Summarize what you did and why.`;
125
85
 
126
- // ── Discover tools from all twins ───────────────────────────────────
127
-
128
- const allTools = [];
129
- const toolToTwin = {};
130
-
131
- for (const [twinName, baseUrl] of Object.entries(twinUrls)) {
132
- const tools = await fetchTools(baseUrl);
133
- process.stderr.write(`[hardened] ${twinName}: ${tools.length} tools\n`);
134
- for (const tool of tools) {
135
- // Namespace tool names to match MCP convention used by evaluator
136
- const namespacedName = `mcp__${twinName}__${tool.name}`;
137
- allTools.push({
138
- name: namespacedName,
139
- description: tool.description || '',
140
- inputSchema: tool.inputSchema || { type: 'object', properties: {} },
141
- });
142
- toolToTwin[namespacedName] = { twinName, baseUrl, originalName: tool.name };
143
- }
144
- }
145
-
146
- process.stderr.write(`[hardened] Total tools: ${allTools.length}\n`);
147
-
148
- if (allTools.length === 0) {
149
- process.stderr.write('[hardened] FATAL: No tools discovered from twins. Twin endpoints may be unreachable.\n');
150
- process.exit(1);
151
- }
152
-
153
- const providerTools = formatToolsForProvider(provider, allTools);
154
-
155
86
  // ── Main loop ───────────────────────────────────────────────────────
156
87
 
88
+ const providerTools = formatToolsForProvider(provider, allTools);
157
89
  let messages = buildInitialMessages(provider, SYSTEM_PROMPT, TASK, MODEL);
158
90
  let consecutiveErrors = 0;
159
91
 
@@ -211,45 +143,33 @@ try {
211
143
  break;
212
144
  }
213
145
 
214
- // Execute each tool call via REST
146
+ // Execute each tool call via shared REST client
215
147
  const results = [];
216
148
  for (const tc of toolCalls) {
217
149
  const toolStart = Date.now();
218
150
  process.stderr.write(`[hardened] Step ${step + 1}: ${tc.name}(${JSON.stringify(tc.arguments).slice(0, 100)})\n`);
219
-
220
- const mapping = toolToTwin[tc.name];
221
- if (!mapping) {
222
- const errorMsg = `Error: Unknown tool "${tc.name}"`;
151
+ try {
152
+ const result = await callToolRest(toolToTwin, tc.name, tc.arguments);
153
+ results.push(result);
154
+ consecutiveErrors = 0;
155
+ totalToolCalls++;
156
+ log.toolCall(step + 1, tc.name, tc.arguments, Date.now() - toolStart);
157
+ } catch (err) {
158
+ const errorMsg = `Error: ${err.message}`;
223
159
  results.push(errorMsg);
224
160
  consecutiveErrors++;
225
161
  totalToolCalls++;
226
162
  totalToolErrors++;
227
- log.toolError(step + 1, tc.name, `Unknown tool`);
228
- process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): Unknown tool ${tc.name}\n`);
229
- } else {
230
- try {
231
- const result = await callToolRest(mapping.baseUrl, mapping.originalName, tc.arguments);
232
- results.push(result);
233
- consecutiveErrors = 0;
234
- totalToolCalls++;
235
- log.toolCall(step + 1, tc.name, tc.arguments, Date.now() - toolStart);
236
- } catch (err) {
237
- const errorMsg = `Error: ${err.message}`;
238
- results.push(errorMsg);
239
- consecutiveErrors++;
240
- totalToolCalls++;
241
- totalToolErrors++;
242
- log.toolError(step + 1, tc.name, err.message);
243
- process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): ${err.message}\n`);
163
+ log.toolError(step + 1, tc.name, err.message);
164
+ process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): ${err.message}\n`);
165
+
166
+ // Bail if too many consecutive errors
167
+ if (consecutiveErrors >= 5) {
168
+ process.stderr.write('[hardened] Too many consecutive tool errors — stopping.\n');
169
+ exitReason = 'consecutive_errors';
170
+ break;
244
171
  }
245
172
  }
246
-
247
- // Bail if too many consecutive errors
248
- if (consecutiveErrors >= 5) {
249
- process.stderr.write('[hardened] Too many consecutive tool errors — stopping.\n');
250
- exitReason = 'consecutive_errors';
251
- break;
252
- }
253
173
  }
254
174
 
255
175
  // Record thinking trace for this step (before bailout check so the final step is captured)
@@ -33,10 +33,10 @@ import { createLogger } from '../_lib/logging.mjs';
33
33
  import { writeMetrics } from '../_lib/metrics.mjs';
34
34
 
35
35
  const MAX_STEPS = 20;
36
- const TASK = process.env['ARCHAL_ENGINE_TASK'];
36
+ const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
37
37
  const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
38
38
 
39
- if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
39
+ if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
40
40
  if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
41
41
 
42
42
  // Warn when used outside demo context
@@ -35,10 +35,10 @@ import { writeMetrics } from '../_lib/metrics.mjs';
35
35
  import { createAgentTrace } from '../_lib/agent-trace.mjs';
36
36
 
37
37
  const MAX_STEPS = 50;
38
- const TASK = process.env['ARCHAL_ENGINE_TASK'];
38
+ const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
39
39
  const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
40
40
 
41
- if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
41
+ if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
42
42
  if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
43
43
 
44
44
  const provider = detectProvider(MODEL);
@@ -32,10 +32,10 @@ import { writeMetrics } from '../_lib/metrics.mjs';
32
32
  import { createAgentTrace } from '../_lib/agent-trace.mjs';
33
33
 
34
34
  const MAX_STEPS = 40;
35
- const TASK = process.env['ARCHAL_ENGINE_TASK'];
35
+ const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
36
36
  const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
37
37
 
38
- if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
38
+ if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
39
39
  if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
40
40
 
41
41
  const provider = detectProvider(MODEL);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@archal/cli",
3
- "version": "0.7.9",
3
+ "version": "0.7.10",
4
4
  "description": "Pre-deployment testing for AI agents",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",