@archal/cli 0.7.10 → 0.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +506 -124
- package/harnesses/_lib/providers.mjs +29 -7
- package/harnesses/hardened/agent.mjs +17 -4
- package/harnesses/naive/agent.mjs +13 -1
- package/harnesses/react/agent.mjs +34 -8
- package/harnesses/zero-shot/agent.mjs +13 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1114,6 +1114,8 @@ var HTTP_RETRYABLE_STATUS_CODES = /* @__PURE__ */ new Set([408, 425, 429, 500, 5
|
|
|
1114
1114
|
var HTTP_PUSH_TIMEOUT_MS = 2e4;
|
|
1115
1115
|
var HTTP_PUSH_MAX_RETRIES = 6;
|
|
1116
1116
|
var HTTP_PUSH_BACKOFF_MS = [1e3, 2e3, 3e3, 5e3, 5e3, 5e3];
|
|
1117
|
+
var HTTP_PUSH_WARMUP_RETRIES = 6;
|
|
1118
|
+
var HTTP_PUSH_WARMUP_BACKOFF_MS = [1500, 2500, 3500, 5e3, 6e3, 7e3];
|
|
1117
1119
|
function resolveRetryDelay(backoffMs, attempt, fallbackMs) {
|
|
1118
1120
|
const indexed = backoffMs[attempt];
|
|
1119
1121
|
if (typeof indexed === "number" && Number.isFinite(indexed) && indexed >= 0) {
|
|
@@ -1164,6 +1166,10 @@ async function fetchWithRetry(url, options, retryOptions) {
|
|
|
1164
1166
|
/**
 * Removes a trailing `/mcp` or `/api` path segment (optionally followed by a
 * single `/`) from a twin URL. URLs without such a suffix are returned as-is.
 *
 * @param {string} url - Twin endpoint URL.
 * @returns {string} The URL without the trailing endpoint segment.
 */
function twinBasePath(url) {
  const trailingEndpoint = /\/(mcp|api)\/?$/;
  return url.replace(trailingEndpoint, "");
}
|
|
1169
|
+
/**
 * Tells whether an HTTP response looks like a "twin worker still warming up"
 * reply: status 503 whose body contains one of the known warm-up phrases
 * (matched case-insensitively).
 *
 * @param {number} status - HTTP status code of the response.
 * @param {string} body - Response body text.
 * @returns {boolean} True only for a 503 carrying a warm-up phrase.
 */
function isTwinWorkerWarmupResponse(status, body) {
  const warmupPhrases = /twin worker endpoint not available|session is busy|retry shortly/i;
  return status === 503 && warmupPhrases.test(body);
}
|
|
1167
1173
|
async function collectStateFromHttp(twinUrls, bearerToken, adminAuth) {
|
|
1168
1174
|
const state = {};
|
|
1169
1175
|
const failures = [];
|
|
@@ -1208,25 +1214,44 @@ async function pushStateToCloud(twinUrls, seedSelections, bearerToken, adminAuth
|
|
|
1208
1214
|
}
|
|
1209
1215
|
const url = `${twinBasePath(baseUrl)}/state`;
|
|
1210
1216
|
debug(`Pushing dynamic seed to ${sel.twinName}`, { url });
|
|
1211
|
-
const
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1217
|
+
const payload = JSON.stringify(sel.seedData);
|
|
1218
|
+
let pushed = false;
|
|
1219
|
+
for (let warmupAttempt = 0; warmupAttempt <= HTTP_PUSH_WARMUP_RETRIES; warmupAttempt++) {
|
|
1220
|
+
const response = await fetchWithRetry(
|
|
1221
|
+
url,
|
|
1222
|
+
{
|
|
1223
|
+
method: "PUT",
|
|
1224
|
+
headers,
|
|
1225
|
+
body: payload
|
|
1226
|
+
},
|
|
1227
|
+
{
|
|
1228
|
+
retries: HTTP_PUSH_MAX_RETRIES,
|
|
1229
|
+
timeoutMs: HTTP_PUSH_TIMEOUT_MS,
|
|
1230
|
+
backoffMs: HTTP_PUSH_BACKOFF_MS
|
|
1231
|
+
}
|
|
1232
|
+
);
|
|
1233
|
+
if (response.ok) {
|
|
1234
|
+
pushed = true;
|
|
1235
|
+
break;
|
|
1222
1236
|
}
|
|
1223
|
-
);
|
|
1224
|
-
if (!response.ok) {
|
|
1225
1237
|
const text = await response.text().catch(() => "");
|
|
1238
|
+
const isWarmup = isTwinWorkerWarmupResponse(response.status, text);
|
|
1239
|
+
if (isWarmup && warmupAttempt < HTTP_PUSH_WARMUP_RETRIES) {
|
|
1240
|
+
const delay = resolveRetryDelay(HTTP_PUSH_WARMUP_BACKOFF_MS, warmupAttempt, 5e3);
|
|
1241
|
+
warn(
|
|
1242
|
+
`Twin "${sel.twinName}" not ready for state push (HTTP 503), retrying in ${delay}ms`,
|
|
1243
|
+
{ attempt: `${warmupAttempt + 1}/${HTTP_PUSH_WARMUP_RETRIES + 1}` }
|
|
1244
|
+
);
|
|
1245
|
+
await new Promise((resolve12) => setTimeout(resolve12, delay));
|
|
1246
|
+
continue;
|
|
1247
|
+
}
|
|
1226
1248
|
throw new Error(
|
|
1227
1249
|
`Failed to push dynamic seed to twin "${sel.twinName}": HTTP ${response.status}${text ? ` (${text})` : ""}`
|
|
1228
1250
|
);
|
|
1229
1251
|
}
|
|
1252
|
+
if (!pushed) {
|
|
1253
|
+
throw new Error(`Failed to push dynamic seed to twin "${sel.twinName}": worker warmup did not complete in time`);
|
|
1254
|
+
}
|
|
1230
1255
|
debug(`Pushed dynamic seed to ${sel.twinName} successfully`);
|
|
1231
1256
|
}
|
|
1232
1257
|
}
|
|
@@ -3202,6 +3227,47 @@ async function callAnthropic(options) {
|
|
|
3202
3227
|
if (!textBlock?.text) throw new Error("Anthropic returned no text content");
|
|
3203
3228
|
return textBlock.text;
|
|
3204
3229
|
}
|
|
3230
|
+
/**
 * Returns the raw (untrimmed) text carried by one element of a
 * `message.content` array, or null when the element carries no text.
 * Lookup order: the element itself when it is a string, then `part.text`
 * (string), then `part.text.value` (string), then `part.value` (string).
 */
function extractOpenAiContentPartText(part) {
  if (typeof part === "string") return part;
  if (!part || typeof part !== "object") return null;
  const partText = part.text;
  if (typeof partText === "string") return partText;
  if (partText && typeof partText === "object" && typeof partText.value === "string") {
    return partText.value;
  }
  return typeof part.value === "string" ? part.value : null;
}

/**
 * Extracts the assistant text from a chat-completions style response.
 *
 * Reads `data.choices[0].message` and returns, in order of preference:
 *   1. the trimmed string `content`, when non-empty;
 *   2. the non-empty trimmed text segments of an array `content`, joined by "\n";
 *   3. the trimmed `refusal` string, when non-empty.
 *
 * An empty or whitespace-only string `content` falls through to the `refusal`
 * fallback, exactly as an array `content` without usable text does.
 *
 * @param {object | null | undefined} data - Parsed JSON response body.
 * @returns {string | null} The extracted text, or null when none is present.
 */
function extractOpenAiTextContent(data) {
  // Optional chaining on `data` itself: a null/undefined body yields null
  // instead of a TypeError.
  const message = data?.choices?.[0]?.message;
  if (!message) return null;

  const content = message.content;
  if (typeof content === "string") {
    const trimmed = content.trim();
    if (trimmed.length > 0) return trimmed;
  } else if (Array.isArray(content)) {
    const textSegments = [];
    for (const part of content) {
      const raw = extractOpenAiContentPartText(part);
      if (raw === null) continue;
      const trimmed = raw.trim();
      if (trimmed.length > 0) textSegments.push(trimmed);
    }
    if (textSegments.length > 0) return textSegments.join("\n");
  }

  if (typeof message.refusal === "string") {
    const refusal = message.refusal.trim();
    if (refusal.length > 0) return refusal;
  }
  return null;
}
|
|
3205
3271
|
/**
 * Reports whether a model name belongs to one of the families recognized
 * here: anything starting with `gpt-5`, or an o-series name `o1`..`o4`,
 * either bare (e.g. `o3`) or followed by a dash suffix (e.g. `o3-mini`).
 *
 * Bare o-series IDs are matched as well as the dashed variants, since both
 * name the same model family.
 *
 * NOTE(review): presumably callers use this to send `max_completion_tokens`
 * instead of `max_tokens` — confirm against the request builders.
 *
 * @param {string} model - Model identifier.
 * @returns {boolean} True when the model matches one of the families above.
 */
function usesMaxCompletionTokens(model) {
  return /^(?:gpt-5|o[1-4](?:-|$))/.test(model);
}
|
|
@@ -3229,7 +3295,7 @@ async function callOpenAi(options) {
|
|
|
3229
3295
|
throw new LlmApiError("OpenAI", response.status, errorText.slice(0, 200));
|
|
3230
3296
|
}
|
|
3231
3297
|
const data = await response.json();
|
|
3232
|
-
const content = data
|
|
3298
|
+
const content = extractOpenAiTextContent(data);
|
|
3233
3299
|
if (!content) throw new Error("OpenAI returned no content");
|
|
3234
3300
|
return content;
|
|
3235
3301
|
}
|
|
@@ -3263,7 +3329,7 @@ async function callOpenAiCompatible(options) {
|
|
|
3263
3329
|
throw new LlmApiError(`OpenAI-compatible (${options.baseUrl})`, response.status, errorText.slice(0, 200));
|
|
3264
3330
|
}
|
|
3265
3331
|
const data = await response.json();
|
|
3266
|
-
const content = data
|
|
3332
|
+
const content = extractOpenAiTextContent(data);
|
|
3267
3333
|
if (!content) throw new Error("OpenAI-compatible API returned no content");
|
|
3268
3334
|
return content;
|
|
3269
3335
|
}
|
|
@@ -3288,13 +3354,15 @@ ${CYAN}${BOLD}archal${RESET} ${DIM}|${RESET} ${scenarioTitle}
|
|
|
3288
3354
|
`);
|
|
3289
3355
|
}
|
|
3290
3356
|
}
|
|
3291
|
-
function printRunProgress(runIndex, totalRuns, score, error2) {
|
|
3357
|
+
function printRunProgress(runIndex, totalRuns, score, error2, outcome) {
|
|
3292
3358
|
const { quiet } = getLoggerOptions();
|
|
3293
3359
|
if (quiet || activeOutputFormat !== "terminal") return;
|
|
3294
3360
|
const dots = ".".repeat(Math.max(1, 20 - String(runIndex + 1).length - String(totalRuns).length));
|
|
3295
3361
|
if (error2) {
|
|
3296
3362
|
const shortError = error2.length > MAX_ERROR_PREVIEW_CHARS ? error2.slice(0, MAX_ERROR_PREVIEW_CHARS - 1) + "\u2026" : error2;
|
|
3297
|
-
|
|
3363
|
+
const inconclusive = outcome === "inconclusive_infrastructure" || outcome === "inconclusive_seed";
|
|
3364
|
+
const label = inconclusive ? `${YELLOW}INCONCLUSIVE${RESET}` : `${RED}ERROR${RESET}`;
|
|
3365
|
+
process.stderr.write(` run ${runIndex + 1}/${totalRuns} ${DIM}${dots}${RESET} ${label} ${DIM}(${shortError})${RESET}
|
|
3298
3366
|
`);
|
|
3299
3367
|
return;
|
|
3300
3368
|
}
|
|
@@ -5874,6 +5942,17 @@ function buildFailureAnalysisPrompt(input) {
|
|
|
5874
5942
|
);
|
|
5875
5943
|
sections.push(`## Passed Criteria (${input.passedCriteria.length})`);
|
|
5876
5944
|
sections.push(input.passedCriteria.map((c) => `- ${sanitizeForPrompt(c.description, 300)}`).join("\n"));
|
|
5945
|
+
if (input.agentError || input.agentLog) {
|
|
5946
|
+
sections.push(`## Agent Execution Context`);
|
|
5947
|
+
if (input.agentError) {
|
|
5948
|
+
sections.push(`Error: ${sanitizeForPrompt(input.agentError, 300)}`);
|
|
5949
|
+
}
|
|
5950
|
+
if (input.agentLog) {
|
|
5951
|
+
const logTail = input.agentLog.length > 800 ? input.agentLog.slice(-800) : input.agentLog;
|
|
5952
|
+
sections.push(`Agent log (tail):
|
|
5953
|
+
${sanitizeForPrompt(logTail, 800)}`);
|
|
5954
|
+
}
|
|
5955
|
+
}
|
|
5877
5956
|
sections.push(`## Agent Trace (${input.trace.length} tool calls)`);
|
|
5878
5957
|
sections.push(
|
|
5879
5958
|
input.trace.length === 0 ? "(Agent made no tool calls - likely crashed or timed out)" : JSON.stringify(traceFormatted, null, 2)
|
|
@@ -6617,7 +6696,7 @@ function resolveTelemetryEndpointFromEnv() {
|
|
|
6617
6696
|
if (!fallbackBaseUrl) {
|
|
6618
6697
|
return null;
|
|
6619
6698
|
}
|
|
6620
|
-
return `${fallbackBaseUrl}/
|
|
6699
|
+
return `${fallbackBaseUrl}/v1/traces`;
|
|
6621
6700
|
}
|
|
6622
6701
|
function resolveIngestToken() {
|
|
6623
6702
|
return process.env["ARCHAL_TELEMETRY_TOKEN"]?.trim();
|
|
@@ -6766,8 +6845,26 @@ function isTelemetryEnabled() {
|
|
|
6766
6845
|
if (consent !== "pending") return consent === "granted";
|
|
6767
6846
|
return loadConfig().telemetry;
|
|
6768
6847
|
}
|
|
6769
|
-
function buildStructuredRunError(runIndex, error2) {
|
|
6848
|
+
function buildStructuredRunError(runIndex, error2, outcome) {
|
|
6770
6849
|
const message = error2.trim();
|
|
6850
|
+
if (outcome === "inconclusive_seed") {
|
|
6851
|
+
return {
|
|
6852
|
+
runIndex,
|
|
6853
|
+
message,
|
|
6854
|
+
category: "seed_setup",
|
|
6855
|
+
code: "SEED_SETUP_ERROR",
|
|
6856
|
+
retryable: true
|
|
6857
|
+
};
|
|
6858
|
+
}
|
|
6859
|
+
if (outcome === "inconclusive_infrastructure") {
|
|
6860
|
+
return {
|
|
6861
|
+
runIndex,
|
|
6862
|
+
message,
|
|
6863
|
+
category: "infrastructure",
|
|
6864
|
+
code: "INFRASTRUCTURE_ERROR",
|
|
6865
|
+
retryable: true
|
|
6866
|
+
};
|
|
6867
|
+
}
|
|
6771
6868
|
if (message.startsWith("Agent not found:")) {
|
|
6772
6869
|
return {
|
|
6773
6870
|
runIndex,
|
|
@@ -7009,7 +7106,7 @@ function buildMetadata(report, totalEntries) {
|
|
|
7009
7106
|
},
|
|
7010
7107
|
agentInternals: {
|
|
7011
7108
|
runDurationsMs: report.runs.map((run) => run.durationMs),
|
|
7012
|
-
runErrors: report.runs.filter((run) => typeof run.error === "string" && run.error.length > 0).map((run) => buildStructuredRunError(run.runIndex, run.error)),
|
|
7109
|
+
runErrors: report.runs.filter((run) => typeof run.error === "string" && run.error.length > 0).map((run) => buildStructuredRunError(run.runIndex, run.error, run.outcome)),
|
|
7013
7110
|
evaluationCounts: { pass: passCount, partial: partialCount, fail: failCount },
|
|
7014
7111
|
runSummaries: report.runs.map((run) => ({
|
|
7015
7112
|
runIndex: run.runIndex,
|
|
@@ -7184,6 +7281,7 @@ async function uploadIfEnabled(traceId, report) {
|
|
|
7184
7281
|
}
|
|
7185
7282
|
|
|
7186
7283
|
// src/runner/dynamic-seed-generator.ts
|
|
7284
|
+
import { createHash as createHash4 } from "crypto";
|
|
7187
7285
|
import { z as z4 } from "zod";
|
|
7188
7286
|
|
|
7189
7287
|
// src/runner/seed-schemas/seed-schema-inference.ts
|
|
@@ -8551,14 +8649,17 @@ function autoFillMissingFKs(seed, twinName) {
|
|
|
8551
8649
|
if (!sourceEntities || !targetEntities || targetEntities.length === 0) continue;
|
|
8552
8650
|
const targetValues = targetEntities.map((e) => e[rule.targetField]).filter((v) => v !== void 0 && v !== null);
|
|
8553
8651
|
if (targetValues.length === 0) continue;
|
|
8652
|
+
const validTargetSet = new Set(targetValues.map(String));
|
|
8554
8653
|
let fillIndex = 0;
|
|
8555
8654
|
for (const entity of sourceEntities) {
|
|
8556
8655
|
const e = entity;
|
|
8557
|
-
|
|
8656
|
+
const currentValue = e[rule.sourceField];
|
|
8657
|
+
const needsFill = currentValue === void 0 || currentValue === null || !validTargetSet.has(String(currentValue));
|
|
8658
|
+
if (needsFill) {
|
|
8558
8659
|
const fillValue = targetValues[fillIndex % targetValues.length];
|
|
8559
8660
|
fillIndex++;
|
|
8560
8661
|
debug(
|
|
8561
|
-
`Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})`
|
|
8662
|
+
`Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})` + (currentValue != null ? ` (was ${String(currentValue)} \u2014 not in targets)` : "")
|
|
8562
8663
|
);
|
|
8563
8664
|
e[rule.sourceField] = fillValue;
|
|
8564
8665
|
}
|
|
@@ -8652,6 +8753,7 @@ var KIND_COLLECTION_HINTS = {
|
|
|
8652
8753
|
channel: ["channels"],
|
|
8653
8754
|
user: ["users"],
|
|
8654
8755
|
ticket: ["issues"],
|
|
8756
|
+
project: ["projects"],
|
|
8655
8757
|
table: ["tables"],
|
|
8656
8758
|
site: ["sites", "domains"],
|
|
8657
8759
|
file: ["files"],
|
|
@@ -8661,6 +8763,9 @@ var KIND_COLLECTION_HINTS = {
|
|
|
8661
8763
|
var ENTITY_KEY_ALIASES = {
|
|
8662
8764
|
"repo.owner": ["ownerLogin", "owner_login", "login", "owner.login", "owner.name"],
|
|
8663
8765
|
"issue.key": ["identifier"],
|
|
8766
|
+
"project.key": ["key", "projectKey"],
|
|
8767
|
+
"ticket.key": ["identifier", "key"],
|
|
8768
|
+
"stripe_entity.id": ["id", "charge", "chargeId", "paymentIntentId", "invoiceId", "customerId", "disputeId"],
|
|
8664
8769
|
"email.address": ["email", "from", "to", "cc", "bcc"],
|
|
8665
8770
|
"file.name": ["title", "fileName", "filename", "subject", "summary"]
|
|
8666
8771
|
};
|
|
@@ -8816,10 +8921,28 @@ function validateSeedCoverage(intent, mergedSeed) {
|
|
|
8816
8921
|
const entityIssues = [];
|
|
8817
8922
|
const quoteErrors = [];
|
|
8818
8923
|
const quoteWarnings = [];
|
|
8819
|
-
const CORE_ENTITY_KEYS = /* @__PURE__ */ new Set(["owner", "name", "fullName", "channel_name", "key", "identifier", "number"]);
|
|
8924
|
+
const CORE_ENTITY_KEYS = /* @__PURE__ */ new Set(["owner", "name", "fullName", "channel_name", "key", "identifier", "number", "id"]);
|
|
8925
|
+
const CONTRACT_REQUIRED_KINDS = /* @__PURE__ */ new Set([
|
|
8926
|
+
"repo",
|
|
8927
|
+
"pullRequest",
|
|
8928
|
+
"issue",
|
|
8929
|
+
"channel",
|
|
8930
|
+
"user",
|
|
8931
|
+
"ticket",
|
|
8932
|
+
"project",
|
|
8933
|
+
"table"
|
|
8934
|
+
]);
|
|
8820
8935
|
const entityWarnings = [];
|
|
8821
8936
|
for (const entity of intent.entities) {
|
|
8822
8937
|
if (typeof entity.value === "boolean") continue;
|
|
8938
|
+
const candidateCollections = toCollectionCandidates(mergedSeed, entity.kind, entity.value);
|
|
8939
|
+
if (CONTRACT_REQUIRED_KINDS.has(entity.kind) && candidateCollections.length === 0) {
|
|
8940
|
+
entityIssues.push({
|
|
8941
|
+
type: "missing_entity",
|
|
8942
|
+
message: `Scenario entity contract mismatch: no collections match ${entity.kind}.${entity.key}=${String(entity.value)}`
|
|
8943
|
+
});
|
|
8944
|
+
continue;
|
|
8945
|
+
}
|
|
8823
8946
|
if (!valueExistsInCollections(mergedSeed, entity.kind, entity.key, entity.value)) {
|
|
8824
8947
|
const issue = {
|
|
8825
8948
|
type: "missing_entity",
|
|
@@ -8934,7 +9057,8 @@ var NON_SUBJECT_STARTS = /* @__PURE__ */ new Set([
|
|
|
8934
9057
|
"under",
|
|
8935
9058
|
"after",
|
|
8936
9059
|
"before",
|
|
8937
|
-
"during"
|
|
9060
|
+
"during",
|
|
9061
|
+
"as"
|
|
8938
9062
|
]);
|
|
8939
9063
|
function isReasonableCountSubject(subject, expected) {
|
|
8940
9064
|
if (expected > MAX_REASONABLE_COUNT) return false;
|
|
@@ -8949,40 +9073,92 @@ function appearsToBeClockSuffix(text, numberStart) {
|
|
|
8949
9073
|
const prefix = text.slice(Math.max(0, numberStart - 3), numberStart);
|
|
8950
9074
|
return /^\d{1,2}:$/.test(prefix);
|
|
8951
9075
|
}
|
|
9076
|
+
/**
 * Detects whether the number starting at `matchIndex` is the fractional part
 * of a decimal, i.e. it is immediately preceded by "." and that dot is itself
 * preceded by a digit (as in the "5" of "2.5").
 *
 * @param {string} text - Full text being scanned.
 * @param {number} matchIndex - Index where the matched number starts.
 * @returns {boolean} True when the match is the tail of a decimal number.
 */
function isDecimalFragment(text, matchIndex) {
  const precededByDot = matchIndex > 0 && text[matchIndex - 1] === ".";
  if (!precededByDot) return false;
  // Need at least one more character before the dot, and it must be a digit.
  return matchIndex >= 2 && /\d/.test(text[matchIndex - 2]);
}
|
|
9084
|
+
/**
 * Resolves a textual subject (e.g. "open issues") to an array-valued entry of
 * the flattened state, reporting which key matched.
 *
 * Each candidate from `buildSubjectCandidates2` is compared, with whitespace
 * removed and lowercased, against every array-valued key of `flat`; a key
 * matches when it equals the candidate or the candidate plus "s". Candidates
 * are tried in order, keys in `Object.entries` order, and the first hit wins.
 * When nothing matches, falls back to `resolveSubjectInState`, in which case
 * the returned key is the empty string.
 *
 * @param {string} subject - Subject phrase extracted from the setup text.
 * @param {object} flat - Flattened state; only array values are considered.
 * @returns {{ items: Array, key: string } | null} Match, or null if unresolved.
 */
function resolveSubjectWithKey(subject, flat) {
  const squash = (value) => value.replace(/\s+/g, "").toLowerCase();
  const candidates = buildSubjectCandidates2(subject);
  // Normalize each array-valued key once instead of once per candidate.
  const arrayEntries = Object.entries(flat)
    .filter(([, value]) => Array.isArray(value))
    .map(([key, value]) => ({ key, items: value, normalizedKey: squash(key) }));
  for (const candidate of candidates) {
    const wanted = squash(candidate);
    const hit = arrayEntries.find(
      (entry) => entry.normalizedKey === wanted || entry.normalizedKey === wanted + "s"
    );
    if (hit) {
      return { items: hit.items, key: hit.key };
    }
  }
  const fallbackItems = resolveSubjectInState(subject, flat);
  if (!fallbackItems) return null;
  return { items: fallbackItems, key: "" };
}
|
|
9098
|
+
/**
 * Builds the list of name variants to try when looking a subject up:
 * the subject itself, its singular form (when it ends in "s" and is longer
 * than three characters) or otherwise its "s"-suffixed form, and, for
 * multi-word subjects, the first and last words.
 *
 * @param {string} subject - Subject phrase.
 * @returns {string[]} Candidate names, in lookup order.
 */
function buildSubjectCandidates2(subject) {
  const looksPlural = subject.endsWith("s") && subject.length > 3;
  const alternateForm = looksPlural ? subject.slice(0, -1) : subject + "s";
  const tokens = subject.split(/\s+/);
  const edgeWords = tokens.length > 1 ? [tokens[0], tokens[tokens.length - 1]] : [];
  return [subject, alternateForm, ...edgeWords];
}
|
|
8952
9112
|
/**
 * Compares counts stated in the setup text ("3 open issues that ...") with
 * the sizes of the matching collections in the seed state.
 *
 * Two passes are made over `setupText`: first with a pattern requiring a
 * qualifier word after the subject (that/which/are/with/in/labeled/assigned),
 * then with a simpler pattern ending at punctuation or end of line. The
 * second pass skips subjects already reported as mismatches. Matches that are
 * decimal fragments or clock suffixes, have a non-positive count, or fail
 * `isReasonableCountSubject` are ignored.
 *
 * @param {string} setupText - Scenario setup description.
 * @param {object} seedState - Seed state, flattened via `flattenTwinState`.
 * @returns {Array<{subject: string, expected: number, actual: number, collectionKey: (string|undefined)}>}
 *   One entry per subject whose resolved collection size differs from the text.
 */
function verifySeedCounts(setupText, seedState) {
  const mismatches = [];
  const flat = flattenTwinState(seedState);

  // Shared scan for both passes. `alreadyReported` is null in the first pass
  // (no de-duplication) and a Set of lowercased subjects in the second.
  const scan = (pattern, alreadyReported) => {
    for (const match of setupText.matchAll(pattern)) {
      const start = match.index;
      if (isDecimalFragment(setupText, start)) continue;
      if (start !== void 0 && appearsToBeClockSuffix(setupText, start)) continue;
      const expected = parseInt(match[1], 10);
      const subject = match[2].trim();
      if (!subject || expected <= 0) continue;
      if (alreadyReported !== null && alreadyReported.has(subject.toLowerCase())) continue;
      if (!isReasonableCountSubject(subject, expected)) continue;
      const resolved = resolveSubjectWithKey(subject, flat);
      if (!resolved || resolved.items.length === expected) continue;
      mismatches.push({
        subject,
        expected,
        actual: resolved.items.length,
        collectionKey: resolved.key || void 0
      });
      if (alreadyReported !== null) alreadyReported.add(subject.toLowerCase());
    }
  };

  const countPattern = /\b(\d+)\s+([\w\s]+?)(?:\s+(?:that|which|are|with|in|labeled|assigned)\b)/gi;
  scan(countPattern, null);

  const simplePattern = /\b(\d+)\s+([\w\s]+?)(?:[.,;:)]|$)/gm;
  const seenSubjects = new Set(mismatches.map((m) => m.subject.toLowerCase()));
  scan(simplePattern, seenSubjects);

  return mismatches;
}
|
|
9145
|
+
/**
 * Shrinks over-populated seed collections down to the count stated in the
 * setup text. Mutates `seed` in place by replacing each affected collection
 * with its first `expected` elements.
 *
 * A mismatch is acted on only when it reports more items than expected, names
 * a `collectionKey`, and `seed[collectionKey]` is an array that is currently
 * longer than `expected`. Non-array values are left untouched so that a
 * string (or other value with a `length`/`slice`) is never truncated and
 * miscounted as trimmed entities.
 *
 * @param {object} seed - Seed object keyed by collection name; mutated.
 * @param {Array<{expected: number, actual: number, collectionKey?: string}>} mismatches
 *   Count mismatches, e.g. as produced by `verifySeedCounts`.
 * @returns {number} Total number of entities removed across all collections.
 */
function trimSeedToExpectedCounts(seed, mismatches) {
  let totalTrimmed = 0;
  for (const m of mismatches) {
    if (m.actual <= m.expected) continue;
    if (!m.collectionKey) continue;
    const collection = seed[m.collectionKey];
    if (!Array.isArray(collection)) continue;
    // Re-check against the live length: an earlier mismatch may already have
    // trimmed this same collection.
    if (collection.length <= m.expected) continue;
    totalTrimmed += collection.length - m.expected;
    seed[m.collectionKey] = collection.slice(0, m.expected);
  }
  return totalTrimmed;
}
|
|
8983
9159
|
|
|
8984
9160
|
// src/runner/seed-cache.ts
|
|
8985
|
-
var CACHE_VERSION =
|
|
9161
|
+
var CACHE_VERSION = 4;
|
|
8986
9162
|
var NEGATIVE_CACHE_VERSION = 2;
|
|
8987
9163
|
var NEGATIVE_PREFIX = "neg-";
|
|
8988
9164
|
var CACHE_DIR = join7(homedir2(), ".archal", "seed-cache");
|
|
@@ -9234,7 +9410,7 @@ ${setupText}
|
|
|
9234
9410
|
Extract the seed blueprint as JSON.`;
|
|
9235
9411
|
try {
|
|
9236
9412
|
const provider = detectProvider(config.model);
|
|
9237
|
-
const apiKey = resolveProviderApiKey(config.apiKey, provider);
|
|
9413
|
+
const apiKey = config.providerMode === "archal" ? "" : resolveProviderApiKey(config.apiKey ?? "", provider);
|
|
9238
9414
|
const responseText = await callLlm({
|
|
9239
9415
|
provider,
|
|
9240
9416
|
model: config.model,
|
|
@@ -10454,9 +10630,19 @@ function extractHybridPatch(obj) {
|
|
|
10454
10630
|
}
|
|
10455
10631
|
return null;
|
|
10456
10632
|
}
|
|
10457
|
-
function
|
|
10633
|
+
/**
 * Produces a short fingerprint of a string: the first 16 hex characters of
 * its SHA-256 digest.
 *
 * @param {string} text - Text to fingerprint.
 * @returns {string} 16-character lowercase hex prefix of the SHA-256 digest.
 */
function hashText(text) {
  const hasher = createHash4("sha256");
  hasher.update(text);
  const hexDigest = hasher.digest("hex");
  return hexDigest.slice(0, 16);
}
|
|
10636
|
+
function buildSeedCacheContext(twinName, config, intent, context) {
|
|
10458
10637
|
return {
|
|
10459
10638
|
twinName,
|
|
10639
|
+
generator: {
|
|
10640
|
+
model: config.model,
|
|
10641
|
+
providerMode: config.providerMode ?? "direct",
|
|
10642
|
+
baseUrl: config.baseUrl ?? null,
|
|
10643
|
+
systemPromptHash: hashText(SYSTEM_PROMPT2),
|
|
10644
|
+
promptTemplateVersion: 2
|
|
10645
|
+
},
|
|
10460
10646
|
intent: intent ?? null,
|
|
10461
10647
|
scenario: context ?? null
|
|
10462
10648
|
};
|
|
@@ -10811,10 +10997,13 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
|
|
|
10811
10997
|
finalSeed = autoFillMissingFKs(finalSeed, twinName);
|
|
10812
10998
|
const relValidation = validateSeedRelationships(finalSeed, twinName);
|
|
10813
10999
|
if (!relValidation.valid) {
|
|
10814
|
-
|
|
10815
|
-
|
|
10816
|
-
|
|
10817
|
-
|
|
11000
|
+
finalSeed = autoFillMissingFKs(finalSeed, twinName);
|
|
11001
|
+
const secondValidation = validateSeedRelationships(finalSeed, twinName);
|
|
11002
|
+
if (!secondValidation.valid) {
|
|
11003
|
+
warn("Blueprint seed has unresolved FK references (continuing anyway)", {
|
|
11004
|
+
errors: secondValidation.errors.slice(0, 5).join("; ")
|
|
11005
|
+
});
|
|
11006
|
+
}
|
|
10818
11007
|
}
|
|
10819
11008
|
if (intent) {
|
|
10820
11009
|
const coverage = validateSeedCoverage(intent, finalSeed);
|
|
@@ -10829,9 +11018,16 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
|
|
|
10829
11018
|
flatForVerify[twinName] = finalSeed;
|
|
10830
11019
|
const countMismatches = verifySeedCounts(setupDescription, flatForVerify);
|
|
10831
11020
|
if (countMismatches.length > 0) {
|
|
10832
|
-
|
|
10833
|
-
|
|
10834
|
-
|
|
11021
|
+
const trimmed = trimSeedToExpectedCounts(finalSeed, countMismatches);
|
|
11022
|
+
if (trimmed > 0) {
|
|
11023
|
+
debug(`Blueprint seed: trimmed ${trimmed} excess entities to match setup counts`);
|
|
11024
|
+
}
|
|
11025
|
+
const remaining = countMismatches.filter((m) => m.actual > m.expected && !m.collectionKey);
|
|
11026
|
+
if (remaining.length > 0) {
|
|
11027
|
+
debug("Blueprint seed has unresolvable count mismatches", {
|
|
11028
|
+
mismatches: remaining.map((m) => `${m.subject}: ${m.expected} vs ${m.actual}`).join("; ")
|
|
11029
|
+
});
|
|
11030
|
+
}
|
|
10835
11031
|
}
|
|
10836
11032
|
const syntheticPatch = {
|
|
10837
11033
|
add: {}
|
|
@@ -10861,7 +11057,7 @@ async function tryBlueprintPath(twinName, baseSeedData, setupDescription, availa
|
|
|
10861
11057
|
async function generateDynamicSeed(twinName, baseSeedName, baseSeedData, setupDescription, config, intent, context) {
|
|
10862
11058
|
const cacheScope = {
|
|
10863
11059
|
baseSeedData,
|
|
10864
|
-
cacheContext: buildSeedCacheContext(twinName, intent, context)
|
|
11060
|
+
cacheContext: buildSeedCacheContext(twinName, config, intent, context)
|
|
10865
11061
|
};
|
|
10866
11062
|
if (!config.noCache) {
|
|
10867
11063
|
const cached = getCachedSeed(twinName, baseSeedName, setupDescription, cacheScope);
|
|
@@ -10892,7 +11088,7 @@ async function generateDynamicSeed(twinName, baseSeedName, baseSeedData, setupDe
|
|
|
10892
11088
|
if (blueprintResult) {
|
|
10893
11089
|
info("Dynamic seed generated via blueprint", { twin: twinName });
|
|
10894
11090
|
if (!config.noCache) {
|
|
10895
|
-
const cacheContext = buildSeedCacheContext(twinName, intent, context);
|
|
11091
|
+
const cacheContext = buildSeedCacheContext(twinName, config, intent, context);
|
|
10896
11092
|
cacheSeed(twinName, baseSeedName, setupDescription, blueprintResult.seed, blueprintResult.patch, {
|
|
10897
11093
|
baseSeedData,
|
|
10898
11094
|
cacheContext
|
|
@@ -11023,14 +11219,19 @@ Fix these issues:
|
|
|
11023
11219
|
const relationshipValidation = validateSeedRelationships(mergedSeed, twinName);
|
|
11024
11220
|
if (!relationshipValidation.valid) {
|
|
11025
11221
|
const topErrors = relationshipValidation.errors.slice(0, 10);
|
|
11026
|
-
|
|
11222
|
+
if (validationAttempts < MAX_ATTEMPTS - 1) {
|
|
11223
|
+
warn(`Dynamic seed relationship validation failed (attempt ${attempt + 1})`, {
|
|
11224
|
+
errors: topErrors.join("; ")
|
|
11225
|
+
});
|
|
11226
|
+
lastErrors = topErrors;
|
|
11227
|
+
patch = null;
|
|
11228
|
+
mergedSeed = null;
|
|
11229
|
+
validationAttempts++;
|
|
11230
|
+
continue;
|
|
11231
|
+
}
|
|
11232
|
+
warn(`Dynamic seed has unresolved FK references (accepting on final attempt)`, {
|
|
11027
11233
|
errors: topErrors.join("; ")
|
|
11028
11234
|
});
|
|
11029
|
-
lastErrors = topErrors;
|
|
11030
|
-
patch = null;
|
|
11031
|
-
mergedSeed = null;
|
|
11032
|
-
validationAttempts++;
|
|
11033
|
-
continue;
|
|
11034
11235
|
}
|
|
11035
11236
|
if (intent) {
|
|
11036
11237
|
debug("Seed intent coverage summary", {
|
|
@@ -11089,6 +11290,15 @@ Fix these issues:
|
|
|
11089
11290
|
}
|
|
11090
11291
|
mergedSeed = autoFillMissingFKs(mergedSeed, twinName);
|
|
11091
11292
|
mergedSeed = ensureSlackScenarioChannelAccess(mergedSeed, intent);
|
|
11293
|
+
if (setupDescription) {
|
|
11294
|
+
const flatForTrim = {};
|
|
11295
|
+
flatForTrim[twinName] = mergedSeed;
|
|
11296
|
+
const finalMismatches = verifySeedCounts(setupDescription, flatForTrim);
|
|
11297
|
+
const trimmed = trimSeedToExpectedCounts(mergedSeed, finalMismatches);
|
|
11298
|
+
if (trimmed > 0) {
|
|
11299
|
+
debug(`Trimmed ${trimmed} excess seed entities to match setup counts`);
|
|
11300
|
+
}
|
|
11301
|
+
}
|
|
11092
11302
|
if (!config.noCache) {
|
|
11093
11303
|
cacheSeed(twinName, baseSeedName, setupDescription, mergedSeed, patch, cacheScope);
|
|
11094
11304
|
}
|
|
@@ -11236,10 +11446,23 @@ function githubIntent(setup) {
|
|
|
11236
11446
|
entities.push({ kind: "repo", key: "fullName", value: fullName });
|
|
11237
11447
|
}
|
|
11238
11448
|
if (!primaryRepoSet) {
|
|
11239
|
-
const orgMatch = setup.match(
|
|
11449
|
+
const orgMatch = setup.match(
|
|
11450
|
+
/\b(?:github\s+)?(?:organization|org)\s+(?:named\s+)?["']?([a-z][a-z0-9._-]*)["']?/i
|
|
11451
|
+
);
|
|
11240
11452
|
if (orgMatch?.[1]) {
|
|
11241
|
-
extractedSlots["repo.owner"] = orgMatch[1];
|
|
11242
|
-
entities.push({ kind: "repo", key: "owner", value: orgMatch[1] });
|
|
11453
|
+
extractedSlots["repo.owner"] = orgMatch[1].toLowerCase();
|
|
11454
|
+
entities.push({ kind: "repo", key: "owner", value: orgMatch[1].toLowerCase() });
|
|
11455
|
+
const repoName = setup.match(/\b(?:repository|repo)\s+(?:named\s+)?["']?([a-z][a-z0-9._-]{1,99})["']?/i)?.[1];
|
|
11456
|
+
if (repoName) {
|
|
11457
|
+
const normalizedName = repoName.toLowerCase();
|
|
11458
|
+
extractedSlots["repo.name"] = normalizedName;
|
|
11459
|
+
entities.push({ kind: "repo", key: "name", value: normalizedName });
|
|
11460
|
+
entities.push({
|
|
11461
|
+
kind: "repo",
|
|
11462
|
+
key: "fullName",
|
|
11463
|
+
value: `${String(extractedSlots["repo.owner"])}/${normalizedName}`
|
|
11464
|
+
});
|
|
11465
|
+
}
|
|
11243
11466
|
} else {
|
|
11244
11467
|
missingSlots.push({
|
|
11245
11468
|
slot: "repo.owner/repo.name",
|
|
@@ -11430,6 +11653,18 @@ function stripeIntent(setup) {
|
|
|
11430
11653
|
});
|
|
11431
11654
|
}
|
|
11432
11655
|
}
|
|
11656
|
+
const idRegex = /\b((?:acct|cus|prod|price|pi|ch|re|in|sub|dp|pm|payout|tr|tok|evt)_[a-zA-Z0-9]+)\b/g;
|
|
11657
|
+
const seenIds = /* @__PURE__ */ new Set();
|
|
11658
|
+
let idMatch;
|
|
11659
|
+
while ((idMatch = idRegex.exec(setup)) !== null) {
|
|
11660
|
+
const id = idMatch[1];
|
|
11661
|
+
if (seenIds.has(id)) continue;
|
|
11662
|
+
seenIds.add(id);
|
|
11663
|
+
entities.push({ kind: "stripe_entity", key: "id", value: id });
|
|
11664
|
+
if (!extractedSlots["stripe.primary_id"]) {
|
|
11665
|
+
extractedSlots["stripe.primary_id"] = id;
|
|
11666
|
+
}
|
|
11667
|
+
}
|
|
11433
11668
|
if (missingSlots.length > 0) {
|
|
11434
11669
|
return { intent: null, missingSlots };
|
|
11435
11670
|
}
|
|
@@ -11523,6 +11758,30 @@ function jiraIntent(setup) {
|
|
|
11523
11758
|
}
|
|
11524
11759
|
entities.push({ kind: "ticket", key: "key", value: key });
|
|
11525
11760
|
}
|
|
11761
|
+
const seenProjects = /* @__PURE__ */ new Set();
|
|
11762
|
+
const addProject = (projectKey) => {
|
|
11763
|
+
const normalized = projectKey.toUpperCase();
|
|
11764
|
+
if (!/^[A-Z][A-Z0-9]{1,9}$/.test(normalized)) return;
|
|
11765
|
+
if (seenProjects.has(normalized)) return;
|
|
11766
|
+
seenProjects.add(normalized);
|
|
11767
|
+
entities.push({ kind: "project", key: "key", value: normalized });
|
|
11768
|
+
if (!extractedSlots["project.key"]) {
|
|
11769
|
+
extractedSlots["project.key"] = normalized;
|
|
11770
|
+
}
|
|
11771
|
+
};
|
|
11772
|
+
for (const key of seenKeys) {
|
|
11773
|
+
addProject(key.split("-", 1)[0] ?? "");
|
|
11774
|
+
}
|
|
11775
|
+
const projectRegexes = [
|
|
11776
|
+
/\b(?:jira\s+)?project\s+(?:key\s*)?[:=]?\s*["']?([A-Z][A-Z0-9]{1,9})["']?/gi,
|
|
11777
|
+
/\bproject\s+["'][^"'\n]+["']\s*\(\s*([A-Z][A-Z0-9]{1,9})\s*\)/gi
|
|
11778
|
+
];
|
|
11779
|
+
for (const regex of projectRegexes) {
|
|
11780
|
+
let projectMatch;
|
|
11781
|
+
while ((projectMatch = regex.exec(setup)) !== null) {
|
|
11782
|
+
addProject(projectMatch[1] ?? "");
|
|
11783
|
+
}
|
|
11784
|
+
}
|
|
11526
11785
|
return {
|
|
11527
11786
|
intent: {
|
|
11528
11787
|
twinName: "jira",
|
|
@@ -11537,6 +11796,7 @@ function jiraIntent(setup) {
|
|
|
11537
11796
|
}
|
|
11538
11797
|
function supabaseIntent(setup) {
|
|
11539
11798
|
const extractedSlots = {};
|
|
11799
|
+
const entities = [];
|
|
11540
11800
|
const missingSlots = [];
|
|
11541
11801
|
const requiredSlots = ["database.target"];
|
|
11542
11802
|
const seenTables = /* @__PURE__ */ new Set();
|
|
@@ -11569,6 +11829,9 @@ function supabaseIntent(setup) {
|
|
|
11569
11829
|
const hasEnvVarTokens = /\b[A-Z][A-Z0-9_]{2,}\b/.test(setup);
|
|
11570
11830
|
if (seenTables.size > 0 || mentionsProject || mentionsLogsOrService || mentionsEnvVars && hasEnvVarTokens) {
|
|
11571
11831
|
extractedSlots["database.target"] = true;
|
|
11832
|
+
for (const table2 of seenTables) {
|
|
11833
|
+
entities.push({ kind: "table", key: "name", value: table2 });
|
|
11834
|
+
}
|
|
11572
11835
|
} else {
|
|
11573
11836
|
missingSlots.push({
|
|
11574
11837
|
slot: "database.target",
|
|
@@ -11585,10 +11848,7 @@ function supabaseIntent(setup) {
|
|
|
11585
11848
|
setupSummary: setupSummary(setup),
|
|
11586
11849
|
requiredSlots,
|
|
11587
11850
|
extractedSlots,
|
|
11588
|
-
|
|
11589
|
-
// that are not materialized in the base SQL schema. Keep intent broad
|
|
11590
|
-
// to avoid false-hard failures in seed generation.
|
|
11591
|
-
entities: [],
|
|
11851
|
+
entities,
|
|
11592
11852
|
quotedStrings: []
|
|
11593
11853
|
},
|
|
11594
11854
|
missingSlots: []
|
|
@@ -12112,12 +12372,24 @@ function loadBaseSeedFromDisk(twinName, seedName) {
|
|
|
12112
12372
|
}
|
|
12113
12373
|
function categorizeRunError(message) {
|
|
12114
12374
|
if (/Failed to spawn|ENOENT/.test(message)) {
|
|
12115
|
-
return
|
|
12375
|
+
return {
|
|
12376
|
+
message: `Agent not found: ${message}. Check that your agent command is installed and in PATH.`,
|
|
12377
|
+
outcome: "failed_agent"
|
|
12378
|
+
};
|
|
12379
|
+
}
|
|
12380
|
+
if (/Dynamic seed generation failed|Missing dynamic seed state|seed generation|seed setup/i.test(message)) {
|
|
12381
|
+
return {
|
|
12382
|
+
message: `Seed generation error: ${message}`,
|
|
12383
|
+
outcome: "inconclusive_seed"
|
|
12384
|
+
};
|
|
12116
12385
|
}
|
|
12117
12386
|
if (/HTTP [45]\d\d|ECONNREFUSED|ENOTFOUND|ETIMEDOUT|ECONNRESET|cloud session|fetch failed|AbortError|TimeoutError|operation was aborted|timed?\s*out/i.test(message)) {
|
|
12118
|
-
return
|
|
12387
|
+
return {
|
|
12388
|
+
message: `Infrastructure error: ${message}. Check your network or try again.`,
|
|
12389
|
+
outcome: "inconclusive_infrastructure"
|
|
12390
|
+
};
|
|
12119
12391
|
}
|
|
12120
|
-
return message;
|
|
12392
|
+
return { message, outcome: "failed_agent" };
|
|
12121
12393
|
}
|
|
12122
12394
|
async function executeSingleRun(runIndex, scenario, agentConfig, seedSelections, evaluatorConfig, timeoutSeconds, apiEngine, localEngine, remoteTwinUrlOverrides, apiRouting, cloudTwinUrls, hostedSessionId, apiBearerToken, adminAuth) {
|
|
12123
12395
|
const startTime = Date.now();
|
|
@@ -12255,7 +12527,8 @@ ${baseTaskMessage}` : baseTaskMessage;
|
|
|
12255
12527
|
stateDiff: diff,
|
|
12256
12528
|
agentLog: agentResult.stderr || void 0,
|
|
12257
12529
|
agentTrace: agentResult.agentTrace,
|
|
12258
|
-
tokenUsage
|
|
12530
|
+
tokenUsage,
|
|
12531
|
+
outcome: "failed_agent"
|
|
12259
12532
|
};
|
|
12260
12533
|
}
|
|
12261
12534
|
if (agentResult.exitCode !== 0 && agentResult.exitCode !== null) {
|
|
@@ -12294,7 +12567,8 @@ ${baseTaskMessage}` : baseTaskMessage;
|
|
|
12294
12567
|
stateDiff: diff,
|
|
12295
12568
|
agentLog: agentResult.stderr || void 0,
|
|
12296
12569
|
agentTrace: agentResult.agentTrace,
|
|
12297
|
-
tokenUsage
|
|
12570
|
+
tokenUsage,
|
|
12571
|
+
outcome: "failed_agent"
|
|
12298
12572
|
};
|
|
12299
12573
|
}
|
|
12300
12574
|
if (trace.length === 0) {
|
|
@@ -12326,12 +12600,13 @@ ${baseTaskMessage}` : baseTaskMessage;
|
|
|
12326
12600
|
stateDiff: diff,
|
|
12327
12601
|
agentLog: agentResult.stderr || void 0,
|
|
12328
12602
|
agentTrace: agentResult.agentTrace,
|
|
12329
|
-
tokenUsage
|
|
12603
|
+
tokenUsage,
|
|
12604
|
+
outcome: "completed"
|
|
12330
12605
|
};
|
|
12331
12606
|
} catch (err) {
|
|
12332
12607
|
const message = err instanceof Error ? err.message : String(err);
|
|
12333
12608
|
const categorized = categorizeRunError(message);
|
|
12334
|
-
error(`Run ${runIndex + 1} failed: ${categorized}`);
|
|
12609
|
+
error(`Run ${runIndex + 1} failed: ${categorized.message}`);
|
|
12335
12610
|
const durationMs = Date.now() - startTime;
|
|
12336
12611
|
return {
|
|
12337
12612
|
runIndex,
|
|
@@ -12339,12 +12614,13 @@ ${baseTaskMessage}` : baseTaskMessage;
|
|
|
12339
12614
|
criterionId: c.id,
|
|
12340
12615
|
status: "fail",
|
|
12341
12616
|
confidence: 1,
|
|
12342
|
-
explanation: `Run failed: ${categorized}`
|
|
12617
|
+
explanation: `Run failed: ${categorized.message}`
|
|
12343
12618
|
})),
|
|
12344
12619
|
overallScore: 0,
|
|
12345
12620
|
trace: [],
|
|
12346
12621
|
durationMs,
|
|
12347
|
-
error: categorized,
|
|
12622
|
+
error: categorized.message,
|
|
12623
|
+
outcome: categorized.outcome,
|
|
12348
12624
|
stateBefore: beforeState,
|
|
12349
12625
|
stateAfter: beforeState,
|
|
12350
12626
|
stateDiff: { added: {}, modified: {}, removed: {} }
|
|
@@ -12421,9 +12697,20 @@ function preflightCheck(scenario, apiKey, model, baseUrl, evaluatorProvider, see
|
|
|
12421
12697
|
}
|
|
12422
12698
|
}
|
|
12423
12699
|
if (seedModel) {
|
|
12700
|
+
const mode = seedProviderMode ?? "auto";
|
|
12701
|
+
const provider = detectProvider(seedModel);
|
|
12702
|
+
const resolvedKey = resolveProviderApiKey(apiKey, provider);
|
|
12424
12703
|
const creds = getCredentials();
|
|
12425
12704
|
const hasArchalAuth = Boolean(creds?.token);
|
|
12426
|
-
if (!
|
|
12705
|
+
if (provider === "openai-compatible" && !baseUrl && mode === "direct") {
|
|
12706
|
+
errors.push({
|
|
12707
|
+
check: "seed.baseUrl",
|
|
12708
|
+
message: `Seed model "${seedModel}" requires a base URL for the OpenAI-compatible endpoint`,
|
|
12709
|
+
detail: "Set via: export ARCHAL_EVALUATOR_BASE_URL=<url> or archal config set evaluator.baseUrl <url>",
|
|
12710
|
+
warning: true
|
|
12711
|
+
});
|
|
12712
|
+
}
|
|
12713
|
+
if (mode === "archal" && !hasArchalAuth) {
|
|
12427
12714
|
errors.push({
|
|
12428
12715
|
check: "archal-auth-seed",
|
|
12429
12716
|
message: "Dynamic seed generation requires Archal authentication",
|
|
@@ -12431,6 +12718,32 @@ function preflightCheck(scenario, apiKey, model, baseUrl, evaluatorProvider, see
|
|
|
12431
12718
|
warning: true
|
|
12432
12719
|
});
|
|
12433
12720
|
}
|
|
12721
|
+
if (mode === "direct" && !resolvedKey) {
|
|
12722
|
+
errors.push({
|
|
12723
|
+
check: getProviderEnvVar(provider),
|
|
12724
|
+
message: `Seed provider is "direct" but no API key is configured for ${provider}`,
|
|
12725
|
+
detail: `Set via: export ${getProviderEnvVar(provider)}=<your-key> or archal config set evaluator.apiKey <key>`,
|
|
12726
|
+
warning: true
|
|
12727
|
+
});
|
|
12728
|
+
}
|
|
12729
|
+
if (mode === "auto" && !resolvedKey && !hasArchalAuth) {
|
|
12730
|
+
errors.push({
|
|
12731
|
+
check: getProviderEnvVar(provider),
|
|
12732
|
+
message: 'Dynamic seed generation has no available provider in "auto" mode',
|
|
12733
|
+
detail: `Set ${getProviderEnvVar(provider)} (or evaluator.apiKey) for direct mode, or run archal login for Archal backend mode`,
|
|
12734
|
+
warning: true
|
|
12735
|
+
});
|
|
12736
|
+
}
|
|
12737
|
+
if (resolvedKey && (mode === "direct" || mode === "auto")) {
|
|
12738
|
+
const mismatch = validateKeyForProvider(resolvedKey, provider);
|
|
12739
|
+
if (mismatch) {
|
|
12740
|
+
errors.push({
|
|
12741
|
+
check: "seed-key-provider-mismatch",
|
|
12742
|
+
message: mismatch,
|
|
12743
|
+
warning: true
|
|
12744
|
+
});
|
|
12745
|
+
}
|
|
12746
|
+
}
|
|
12434
12747
|
}
|
|
12435
12748
|
return errors;
|
|
12436
12749
|
}
|
|
@@ -12479,6 +12792,35 @@ async function runScenario(options) {
|
|
|
12479
12792
|
'cloudTwinUrls is required. Local twin execution has been removed; use "archal run" to provision a hosted session.'
|
|
12480
12793
|
);
|
|
12481
12794
|
}
|
|
12795
|
+
const criterionDescriptions = {};
|
|
12796
|
+
const criterionTypes = {};
|
|
12797
|
+
for (const c of scenario.successCriteria) {
|
|
12798
|
+
criterionDescriptions[c.id] = c.description;
|
|
12799
|
+
criterionTypes[c.id] = c.type;
|
|
12800
|
+
}
|
|
12801
|
+
const buildInconclusiveSeedReport = (message) => ({
|
|
12802
|
+
scenarioTitle: scenario.title,
|
|
12803
|
+
satisfactionScore: 0,
|
|
12804
|
+
criterionDescriptions,
|
|
12805
|
+
criterionTypes,
|
|
12806
|
+
twinNames: scenario.config.twins,
|
|
12807
|
+
runs: [{
|
|
12808
|
+
runIndex: 0,
|
|
12809
|
+
evaluations: scenario.successCriteria.map((criterion) => ({
|
|
12810
|
+
criterionId: criterion.id,
|
|
12811
|
+
status: "fail",
|
|
12812
|
+
confidence: 1,
|
|
12813
|
+
explanation: `Run not scored due to seed setup failure: ${message}`
|
|
12814
|
+
})),
|
|
12815
|
+
overallScore: 0,
|
|
12816
|
+
trace: [],
|
|
12817
|
+
durationMs: 0,
|
|
12818
|
+
error: message,
|
|
12819
|
+
outcome: "inconclusive_seed"
|
|
12820
|
+
}],
|
|
12821
|
+
summary: `Inconclusive (seed setup): ${message}`,
|
|
12822
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
12823
|
+
});
|
|
12482
12824
|
const preflightErrors = preflightCheck(
|
|
12483
12825
|
scenario,
|
|
12484
12826
|
config.apiKey,
|
|
@@ -12569,7 +12911,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12569
12911
|
cacheContext: seedPromptContext
|
|
12570
12912
|
});
|
|
12571
12913
|
}
|
|
12572
|
-
|
|
12914
|
+
return buildInconclusiveSeedReport(message);
|
|
12573
12915
|
}
|
|
12574
12916
|
warn(message);
|
|
12575
12917
|
generationTargets.push(sel);
|
|
@@ -12578,12 +12920,11 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12578
12920
|
if (generationTargets.length > 0) {
|
|
12579
12921
|
progress("Generating dynamic seeds from setup description...");
|
|
12580
12922
|
const dynamicConfig = {
|
|
12581
|
-
apiKey:
|
|
12582
|
-
// Seed gen always routes through Archal backend
|
|
12923
|
+
apiKey: config.apiKey,
|
|
12583
12924
|
model: config.seedModel,
|
|
12584
12925
|
baseUrl: config.baseUrl,
|
|
12585
12926
|
noCache: options.noSeedCache,
|
|
12586
|
-
providerMode:
|
|
12927
|
+
providerMode: config.seedProvider
|
|
12587
12928
|
};
|
|
12588
12929
|
let cloudSeedSnapshotByTwin = null;
|
|
12589
12930
|
const adminAuth = options.apiAdminToken ? { token: options.apiAdminToken, userId: options.apiAdminUserId } : void 0;
|
|
@@ -12601,20 +12942,28 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12601
12942
|
baseSeedData = normalizeSeedState(cloudSeedSnapshotByTwin[sel.twinName]);
|
|
12602
12943
|
}
|
|
12603
12944
|
if (!baseSeedData || Object.keys(baseSeedData).length === 0) {
|
|
12604
|
-
|
|
12945
|
+
return buildInconclusiveSeedReport(
|
|
12605
12946
|
`Could not load base seed "${sel.seedName}" for twin "${sel.twinName}" from disk. Ensure the seed file exists at twins/${sel.twinName}/seeds/${sel.seedName}.json or .sql, or that the hosted twin /state endpoint is reachable.`
|
|
12606
12947
|
);
|
|
12607
12948
|
}
|
|
12608
12949
|
progress(`Generating dynamic seed for ${sel.twinName}...`);
|
|
12609
|
-
|
|
12610
|
-
|
|
12611
|
-
|
|
12612
|
-
|
|
12613
|
-
|
|
12614
|
-
|
|
12615
|
-
|
|
12616
|
-
|
|
12617
|
-
|
|
12950
|
+
let result;
|
|
12951
|
+
try {
|
|
12952
|
+
result = await generateDynamicSeed(
|
|
12953
|
+
sel.twinName,
|
|
12954
|
+
sel.seedName,
|
|
12955
|
+
baseSeedData,
|
|
12956
|
+
scenario.setup,
|
|
12957
|
+
dynamicConfig,
|
|
12958
|
+
extractedIntentByTwin.get(sel.twinName),
|
|
12959
|
+
seedPromptContext
|
|
12960
|
+
);
|
|
12961
|
+
} catch (error2) {
|
|
12962
|
+
const detail = error2 instanceof Error ? error2.message : String(error2);
|
|
12963
|
+
return buildInconclusiveSeedReport(
|
|
12964
|
+
`Dynamic seed generation failed for twin "${sel.twinName}": ${detail}`
|
|
12965
|
+
);
|
|
12966
|
+
}
|
|
12618
12967
|
sel.seedData = result.seed;
|
|
12619
12968
|
if (result.fromCache) {
|
|
12620
12969
|
cachedSeedTwins.push(sel.twinName);
|
|
@@ -12630,7 +12979,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12630
12979
|
}
|
|
12631
12980
|
const missingDynamicSeeds = seedSelections.filter((sel) => !sel.seedData);
|
|
12632
12981
|
if (missingDynamicSeeds.length > 0) {
|
|
12633
|
-
|
|
12982
|
+
return buildInconclusiveSeedReport(
|
|
12634
12983
|
`Missing dynamic seed state for twin(s): ${missingDynamicSeeds.map((sel) => sel.twinName).join(", ")}`
|
|
12635
12984
|
);
|
|
12636
12985
|
}
|
|
@@ -12825,8 +13174,8 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12825
13174
|
return {
|
|
12826
13175
|
scenarioTitle: scenario.title,
|
|
12827
13176
|
satisfactionScore: 100,
|
|
12828
|
-
criterionDescriptions
|
|
12829
|
-
criterionTypes
|
|
13177
|
+
criterionDescriptions,
|
|
13178
|
+
criterionTypes,
|
|
12830
13179
|
twinNames: scenario.config.twins,
|
|
12831
13180
|
runs: [],
|
|
12832
13181
|
summary: "Preflight checks passed",
|
|
@@ -12865,8 +13214,8 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12865
13214
|
adminAuth
|
|
12866
13215
|
);
|
|
12867
13216
|
runs.push(result);
|
|
12868
|
-
printRunProgress(i, numRuns, result.overallScore, result.error);
|
|
12869
|
-
if (result.
|
|
13217
|
+
printRunProgress(i, numRuns, result.overallScore, result.error, result.outcome);
|
|
13218
|
+
if (result.outcome === "inconclusive_infrastructure" || result.outcome === "inconclusive_seed") {
|
|
12870
13219
|
consecutiveInfraErrors++;
|
|
12871
13220
|
if (consecutiveInfraErrors >= EARLY_ABORT_THRESHOLD && i < numRuns - 1) {
|
|
12872
13221
|
warn(`${consecutiveInfraErrors} consecutive run errors \u2014 aborting remaining ${numRuns - i - 1} run(s) to avoid wasting quota.`);
|
|
@@ -12876,19 +13225,17 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12876
13225
|
consecutiveInfraErrors = 0;
|
|
12877
13226
|
}
|
|
12878
13227
|
}
|
|
12879
|
-
const
|
|
13228
|
+
const scoredRuns = runs.filter(
|
|
13229
|
+
(run) => run.outcome !== "inconclusive_infrastructure" && run.outcome !== "inconclusive_seed"
|
|
13230
|
+
);
|
|
13231
|
+
const runScores = scoredRuns.map((r) => r.overallScore);
|
|
12880
13232
|
const satisfactionScore = aggregateSatisfaction(runScores);
|
|
12881
|
-
const allEvaluations =
|
|
12882
|
-
const
|
|
12883
|
-
const
|
|
12884
|
-
const criterionTypes = {};
|
|
12885
|
-
for (const c of scenario.successCriteria) {
|
|
12886
|
-
criterionDescriptions[c.id] = c.description;
|
|
12887
|
-
criterionTypes[c.id] = c.type;
|
|
12888
|
-
}
|
|
13233
|
+
const allEvaluations = scoredRuns.map((r) => r.evaluations);
|
|
13234
|
+
const inconclusiveRuns = runs.length - scoredRuns.length;
|
|
13235
|
+
const summary = scoredRuns.length > 0 ? generateSummary(allEvaluations, satisfactionScore) : `Inconclusive: no scored runs (${inconclusiveRuns} infrastructure/seed setup run failure${inconclusiveRuns === 1 ? "" : "s"}).`;
|
|
12889
13236
|
let failureAnalysis;
|
|
12890
|
-
if (satisfactionScore < 100 &&
|
|
12891
|
-
const representativeRun =
|
|
13237
|
+
if (satisfactionScore < 100 && scoredRuns.length > 0 && !options.noFailureAnalysis) {
|
|
13238
|
+
const representativeRun = scoredRuns.reduce(
|
|
12892
13239
|
(worst, r) => r.overallScore < worst.overallScore ? r : worst
|
|
12893
13240
|
);
|
|
12894
13241
|
const failedCriteria = representativeRun.evaluations.filter((e) => e.status !== "pass").map((e) => ({
|
|
@@ -12911,7 +13258,9 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12911
13258
|
stateDiff: representativeRun.stateDiff ?? { added: {}, modified: {}, removed: {} },
|
|
12912
13259
|
stateBefore: representativeRun.stateBefore ?? {},
|
|
12913
13260
|
stateAfter: representativeRun.stateAfter ?? {},
|
|
12914
|
-
satisfactionScore
|
|
13261
|
+
satisfactionScore,
|
|
13262
|
+
agentLog: representativeRun.agentLog,
|
|
13263
|
+
agentError: representativeRun.error
|
|
12915
13264
|
},
|
|
12916
13265
|
evaluatorConfig
|
|
12917
13266
|
);
|
|
@@ -13690,7 +14039,21 @@ function createRunCommand() {
|
|
|
13690
14039
|
}
|
|
13691
14040
|
}
|
|
13692
14041
|
if (!process.env["ARCHAL_ENGINE_API_KEY"] && userConfig.engineApiKey) {
|
|
13693
|
-
|
|
14042
|
+
const configKey = userConfig.engineApiKey;
|
|
14043
|
+
const requestedModel = firstNonEmpty(
|
|
14044
|
+
opts.engineModel,
|
|
14045
|
+
process.env["ARCHAL_ENGINE_MODEL"],
|
|
14046
|
+
opts.model
|
|
14047
|
+
// -m also defaults the engine model for local harnesses
|
|
14048
|
+
);
|
|
14049
|
+
if (requestedModel) {
|
|
14050
|
+
const modelProvider = detectProvider(requestedModel);
|
|
14051
|
+
if (!validateKeyForProvider(configKey, modelProvider)) {
|
|
14052
|
+
process.env["ARCHAL_ENGINE_API_KEY"] = configKey;
|
|
14053
|
+
}
|
|
14054
|
+
} else {
|
|
14055
|
+
process.env["ARCHAL_ENGINE_API_KEY"] = configKey;
|
|
14056
|
+
}
|
|
13694
14057
|
}
|
|
13695
14058
|
}
|
|
13696
14059
|
inferEngineModelFromEvaluatorModel(opts);
|
|
@@ -13741,8 +14104,17 @@ function createRunCommand() {
|
|
|
13741
14104
|
}
|
|
13742
14105
|
}
|
|
13743
14106
|
if (engine.mode === "local" && !process.env["ARCHAL_ENGINE_API_KEY"]) {
|
|
14107
|
+
const requestedModel = firstNonEmpty(
|
|
14108
|
+
opts.engineModel,
|
|
14109
|
+
process.env["ARCHAL_ENGINE_MODEL"]
|
|
14110
|
+
);
|
|
14111
|
+
const provider = requestedModel ? detectProvider(requestedModel) : null;
|
|
14112
|
+
const providerHint = provider ? `
|
|
14113
|
+
Hint: You requested model "${requestedModel}" (${provider}) but no ${provider} API key is available.
|
|
14114
|
+
Set ${getProviderEnvVar(provider)} or pass --engine-key <${provider}-key>
|
|
14115
|
+
` : "";
|
|
13744
14116
|
process.stderr.write(
|
|
13745
|
-
"Error: No API key found. The agent harness needs an API key to call the model.\nSet one of:\n GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY env var\n archal config set engine.apiKey <key>\n ARCHAL_ENGINE_API_KEY env var\n"
|
|
14117
|
+
"Error: No API key found. The agent harness needs an API key to call the model.\nSet one of:\n GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY env var\n archal config set engine.apiKey <key>\n ARCHAL_ENGINE_API_KEY env var\n" + providerHint
|
|
13746
14118
|
);
|
|
13747
14119
|
process.exit(2);
|
|
13748
14120
|
}
|
|
@@ -13812,12 +14184,14 @@ function createRunCommand() {
|
|
|
13812
14184
|
})();
|
|
13813
14185
|
const SESSION_READY_TIMEOUT_MS = Math.max(12e4, configuredReadyTimeoutMs);
|
|
13814
14186
|
const SESSION_POLL_INTERVAL_MS = 2e3;
|
|
13815
|
-
const STATUS_READY_GRACE_MS = 5e3;
|
|
13816
14187
|
const readyDeadline = Date.now() + SESSION_READY_TIMEOUT_MS;
|
|
13817
14188
|
let sessionReady = false;
|
|
13818
14189
|
let lastPollIssue;
|
|
13819
|
-
let statusReadySinceMs = null;
|
|
13820
14190
|
const isRetryablePollFailure = (result) => result.offline || typeof result.status === "number" && result.status >= 500;
|
|
14191
|
+
const workersAllReady = (workers) => {
|
|
14192
|
+
if (!workers || Object.keys(workers).length === 0) return true;
|
|
14193
|
+
return Object.values(workers).every((value) => value === "ready");
|
|
14194
|
+
};
|
|
13821
14195
|
const sleepForPollInterval = async () => new Promise((resolve12) => setTimeout(resolve12, SESSION_POLL_INTERVAL_MS));
|
|
13822
14196
|
if (!opts.quiet) process.stderr.write("Starting cloud session...\n");
|
|
13823
14197
|
let pollCount = 0;
|
|
@@ -13872,26 +14246,19 @@ function createRunCommand() {
|
|
|
13872
14246
|
}
|
|
13873
14247
|
const healthAlive = healthResult.ok && healthResult.data.alive;
|
|
13874
14248
|
const statusAlive = statusResult.data.alive || status === "ready";
|
|
13875
|
-
|
|
14249
|
+
const statusWorkersReady = workersAllReady(
|
|
14250
|
+
statusResult.data.twins ?? statusResult.data.workers
|
|
14251
|
+
);
|
|
14252
|
+
const healthWorkersReady = workersAllReady(healthResult.data.twins);
|
|
14253
|
+
if (statusAlive && healthAlive && statusWorkersReady && healthWorkersReady) {
|
|
13876
14254
|
sessionReady = true;
|
|
13877
14255
|
break;
|
|
13878
14256
|
}
|
|
13879
|
-
|
|
13880
|
-
|
|
13881
|
-
|
|
13882
|
-
|
|
13883
|
-
|
|
13884
|
-
if (readyForMs >= STATUS_READY_GRACE_MS) {
|
|
13885
|
-
debug(
|
|
13886
|
-
`Session ${backendSessionId} proceeded after health endpoint warmup (${readyForMs}ms).`
|
|
13887
|
-
);
|
|
13888
|
-
sessionReady = true;
|
|
13889
|
-
break;
|
|
13890
|
-
}
|
|
13891
|
-
} else {
|
|
13892
|
-
statusReadySinceMs = null;
|
|
13893
|
-
}
|
|
13894
|
-
lastPollIssue = `session still starting (status=${status}, health=${healthAlive ? "alive" : "starting"})`;
|
|
14257
|
+
const statusTwinStates = Object.entries(
|
|
14258
|
+
statusResult.data.twins ?? statusResult.data.workers ?? {}
|
|
14259
|
+
).map(([twin, twinStatus]) => `${twin}:${twinStatus}`).join(", ");
|
|
14260
|
+
const healthTwinStates = Object.entries(healthResult.data.twins ?? {}).map(([twin, twinStatus]) => `${twin}:${twinStatus}`).join(", ");
|
|
14261
|
+
lastPollIssue = `session still starting (status=${status}, health=${healthAlive ? "alive" : "starting"}, statusTwins=[${statusTwinStates || "n/a"}], healthTwins=[${healthTwinStates || "n/a"}])`;
|
|
13895
14262
|
await sleepForPollInterval();
|
|
13896
14263
|
}
|
|
13897
14264
|
if (sessionReady) {
|
|
@@ -14292,6 +14659,7 @@ function buildEvidenceArtifacts(report) {
|
|
|
14292
14659
|
overallScore: run.overallScore,
|
|
14293
14660
|
durationMs: run.durationMs,
|
|
14294
14661
|
error: run.error ?? null,
|
|
14662
|
+
outcome: run.outcome ?? null,
|
|
14295
14663
|
evaluations: (run.evaluations ?? []).map((ev) => ({
|
|
14296
14664
|
criterionId: ev.criterionId,
|
|
14297
14665
|
status: ev.status,
|
|
@@ -14611,7 +14979,7 @@ import { createInterface as createInterface2 } from "readline";
|
|
|
14611
14979
|
import { Command as Command5 } from "commander";
|
|
14612
14980
|
|
|
14613
14981
|
// src/telemetry/anonymizer.ts
|
|
14614
|
-
import { createHash as
|
|
14982
|
+
import { createHash as createHash5 } from "crypto";
|
|
14615
14983
|
var API_KEY_PATTERNS = [
|
|
14616
14984
|
/(?:api[_-]?key|token|secret|password|authorization|bearer|credential)\s*[:=]\s*["']?([a-zA-Z0-9_\-/.+=]{16,})["']?/gi,
|
|
14617
14985
|
/sk-[a-zA-Z0-9]{20,}/g,
|
|
@@ -14661,7 +15029,7 @@ var USERNAME_FIELDS = /* @__PURE__ */ new Set([
|
|
|
14661
15029
|
"maintainer"
|
|
14662
15030
|
]);
|
|
14663
15031
|
function hashValue2(value, salt = "archal") {
|
|
14664
|
-
return `anon_${
|
|
15032
|
+
return `anon_${createHash5("sha256").update(`${salt}:${value}`).digest("hex").slice(0, 12)}`;
|
|
14665
15033
|
}
|
|
14666
15034
|
function anonymizeForEnterprise(entries) {
|
|
14667
15035
|
debug("Enterprise anonymization", { entryCount: String(entries.length) });
|
|
@@ -15637,7 +16005,7 @@ function createDoctorCommand() {
|
|
|
15637
16005
|
// src/commands/login.ts
|
|
15638
16006
|
import { Command as Command8 } from "commander";
|
|
15639
16007
|
import { exec } from "child_process";
|
|
15640
|
-
import { createHash as
|
|
16008
|
+
import { createHash as createHash6, randomBytes as randomBytes2 } from "crypto";
|
|
15641
16009
|
import { createServer } from "http";
|
|
15642
16010
|
var START_PORT = 51423;
|
|
15643
16011
|
var LOGIN_TIMEOUT_MS = 5 * 60 * 1e3;
|
|
@@ -15658,7 +16026,7 @@ function openBrowser(url) {
|
|
|
15658
16026
|
}
|
|
15659
16027
|
function createPkcePair() {
|
|
15660
16028
|
const codeVerifier = randomBytes2(32).toString("base64url");
|
|
15661
|
-
const codeChallenge =
|
|
16029
|
+
const codeChallenge = createHash6("sha256").update(codeVerifier).digest("base64url");
|
|
15662
16030
|
return { codeVerifier, codeChallenge };
|
|
15663
16031
|
}
|
|
15664
16032
|
function isPlan2(value) {
|
|
@@ -16388,11 +16756,25 @@ function detectProviderName(model) {
|
|
|
16388
16756
|
if (normalized.startsWith("gpt-") || normalized.startsWith("o1-") || normalized.startsWith("o3-") || normalized.startsWith("o4-")) return "OpenAI";
|
|
16389
16757
|
return "OpenAI-compatible";
|
|
16390
16758
|
}
|
|
16391
|
-
function resolveEngineApiKey(explicitKey) {
|
|
16759
|
+
function resolveEngineApiKey(explicitKey, model) {
|
|
16392
16760
|
if (explicitKey?.trim()) return explicitKey.trim();
|
|
16393
16761
|
if (process.env["ARCHAL_ENGINE_API_KEY"]?.trim()) return process.env["ARCHAL_ENGINE_API_KEY"].trim();
|
|
16762
|
+
const modelProvider = model ? detectProvider(model) : null;
|
|
16394
16763
|
const config = loadConfig();
|
|
16395
|
-
if (config.engineApiKey)
|
|
16764
|
+
if (config.engineApiKey) {
|
|
16765
|
+
if (!modelProvider || !validateKeyForProvider(config.engineApiKey, modelProvider)) {
|
|
16766
|
+
return config.engineApiKey;
|
|
16767
|
+
}
|
|
16768
|
+
}
|
|
16769
|
+
const providerEnvVars = {
|
|
16770
|
+
gemini: "GEMINI_API_KEY",
|
|
16771
|
+
openai: "OPENAI_API_KEY",
|
|
16772
|
+
anthropic: "ANTHROPIC_API_KEY"
|
|
16773
|
+
};
|
|
16774
|
+
if (modelProvider && providerEnvVars[modelProvider]) {
|
|
16775
|
+
const val = process.env[providerEnvVars[modelProvider]]?.trim();
|
|
16776
|
+
if (val) return val;
|
|
16777
|
+
}
|
|
16396
16778
|
for (const envVar of ["GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]) {
|
|
16397
16779
|
const val = process.env[envVar]?.trim();
|
|
16398
16780
|
if (val) return val;
|
|
@@ -16441,7 +16823,7 @@ function createDemoCommand() {
|
|
|
16441
16823
|
process.exit(1);
|
|
16442
16824
|
}
|
|
16443
16825
|
const providerName = detectProviderName(opts.model);
|
|
16444
|
-
const engineApiKey = resolveEngineApiKey(opts.apiKey);
|
|
16826
|
+
const engineApiKey = resolveEngineApiKey(opts.apiKey, opts.model);
|
|
16445
16827
|
if (!engineApiKey) {
|
|
16446
16828
|
process.stderr.write(
|
|
16447
16829
|
`Error: No API key found for model "${opts.model}" (${providerName}).
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* Env var overrides:
|
|
6
6
|
* ARCHAL_MAX_TOKENS — Max completion tokens (default from model-configs)
|
|
7
7
|
* ARCHAL_TEMPERATURE — Sampling temperature
|
|
8
|
-
* ARCHAL_LLM_TIMEOUT — Per-call timeout in seconds (default
|
|
8
|
+
* ARCHAL_LLM_TIMEOUT — Per-call timeout in seconds (default 180)
|
|
9
9
|
* ARCHAL_OPENAI_BASE_URL — Override OpenAI base URL (for proxies, Azure, etc.)
|
|
10
10
|
* ARCHAL_ANTHROPIC_BASE_URL — Override Anthropic base URL
|
|
11
11
|
* ARCHAL_GEMINI_BASE_URL — Override Gemini base URL
|
|
@@ -48,19 +48,41 @@ const PROVIDER_ENV_VARS = {
|
|
|
48
48
|
openai: 'OPENAI_API_KEY',
|
|
49
49
|
};
|
|
50
50
|
|
|
51
|
+
function inferKeyProvider(key) {
|
|
52
|
+
if (!key) return null;
|
|
53
|
+
if (key.startsWith('AIzaSy')) return 'gemini';
|
|
54
|
+
if (key.startsWith('sk-ant-')) return 'anthropic';
|
|
55
|
+
if (key.startsWith('sk-')) return 'openai';
|
|
56
|
+
return null;
|
|
57
|
+
}
|
|
58
|
+
|
|
51
59
|
/**
|
|
52
60
|
* Resolve the API key for the detected provider.
|
|
53
61
|
* Priority: ARCHAL_ENGINE_API_KEY > provider-specific env var.
|
|
62
|
+
* If ARCHAL_ENGINE_API_KEY clearly belongs to a different provider, fall back
|
|
63
|
+
* to provider-specific key when available, otherwise fail with a clear error.
|
|
54
64
|
* @param {string} provider
|
|
55
65
|
* @returns {string}
|
|
56
66
|
*/
|
|
57
67
|
export function resolveApiKey(provider) {
|
|
58
|
-
const engineKey = process.env['ARCHAL_ENGINE_API_KEY']?.trim();
|
|
59
|
-
if (engineKey) return engineKey;
|
|
60
|
-
|
|
61
68
|
const envVar = PROVIDER_ENV_VARS[provider] ?? 'OPENAI_API_KEY';
|
|
62
|
-
const
|
|
63
|
-
|
|
69
|
+
const providerKey = process.env[envVar]?.trim();
|
|
70
|
+
const engineKey = process.env['ARCHAL_ENGINE_API_KEY']?.trim();
|
|
71
|
+
if (engineKey) {
|
|
72
|
+
const inferred = inferKeyProvider(engineKey);
|
|
73
|
+
if (!inferred || inferred === provider) return engineKey;
|
|
74
|
+
if (providerKey) {
|
|
75
|
+
process.stderr.write(
|
|
76
|
+
`[harness] Warning: ARCHAL_ENGINE_API_KEY appears to be for ${inferred}; using ${envVar} for ${provider} model.\n`,
|
|
77
|
+
);
|
|
78
|
+
return providerKey;
|
|
79
|
+
}
|
|
80
|
+
throw new Error(
|
|
81
|
+
`ARCHAL_ENGINE_API_KEY appears to be for ${inferred}, but provider "${provider}" requires ${envVar}. ` +
|
|
82
|
+
`Set ${envVar} or use a ${inferred} model.`
|
|
83
|
+
);
|
|
84
|
+
}
|
|
85
|
+
if (providerKey) return providerKey;
|
|
64
86
|
|
|
65
87
|
throw new Error(
|
|
66
88
|
`No API key found for provider "${provider}". ` +
|
|
@@ -111,7 +133,7 @@ function getLlmTimeoutMs() {
|
|
|
111
133
|
return parsed * 1000;
|
|
112
134
|
}
|
|
113
135
|
}
|
|
114
|
-
return
|
|
136
|
+
return 180_000; // 180 seconds default
|
|
115
137
|
}
|
|
116
138
|
|
|
117
139
|
// ── Thinking configuration ──────────────────────────────────────────
|
|
@@ -107,10 +107,19 @@ try {
|
|
|
107
107
|
|
|
108
108
|
// Call the LLM with retry on transient errors
|
|
109
109
|
log.llmCall(step + 1);
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
110
|
+
let response;
|
|
111
|
+
try {
|
|
112
|
+
response = await withRetry(
|
|
113
|
+
() => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
|
|
114
|
+
2,
|
|
115
|
+
);
|
|
116
|
+
} catch (err) {
|
|
117
|
+
const msg = err?.message ?? String(err);
|
|
118
|
+
log.error('llm_call_failed', { step: step + 1, error: msg });
|
|
119
|
+
process.stderr.write(`[hardened] LLM API error: ${msg.slice(0, 500)}\n`);
|
|
120
|
+
exitReason = 'llm_error';
|
|
121
|
+
break;
|
|
122
|
+
}
|
|
114
123
|
|
|
115
124
|
const iterDurationMs = Date.now() - iterStart;
|
|
116
125
|
totalInputTokens += response.usage.inputTokens;
|
|
@@ -218,4 +227,8 @@ try {
|
|
|
218
227
|
`(${totalToolErrors} errors), ${totalInputTokens} input tokens, ` +
|
|
219
228
|
`${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
|
|
220
229
|
);
|
|
230
|
+
|
|
231
|
+
if (exitReason === 'llm_error') {
|
|
232
|
+
process.exit(1);
|
|
233
|
+
}
|
|
221
234
|
}
|
|
@@ -84,7 +84,16 @@ try {
|
|
|
84
84
|
const iterStart = Date.now();
|
|
85
85
|
|
|
86
86
|
log.llmCall(step + 1);
|
|
87
|
-
|
|
87
|
+
let response;
|
|
88
|
+
try {
|
|
89
|
+
response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
|
|
90
|
+
} catch (err) {
|
|
91
|
+
const msg = err?.message ?? String(err);
|
|
92
|
+
log.error('llm_call_failed', { step: step + 1, error: msg });
|
|
93
|
+
process.stderr.write(`[naive] LLM API error: ${msg.slice(0, 500)}\n`);
|
|
94
|
+
exitReason = 'llm_error';
|
|
95
|
+
break;
|
|
96
|
+
}
|
|
88
97
|
|
|
89
98
|
const iterDurationMs = Date.now() - iterStart;
|
|
90
99
|
totalInputTokens += response.usage.inputTokens;
|
|
@@ -150,4 +159,7 @@ try {
|
|
|
150
159
|
`${(totalTimeMs / 1000).toFixed(1)}s total\n`
|
|
151
160
|
);
|
|
152
161
|
|
|
162
|
+
if (exitReason === 'llm_error') {
|
|
163
|
+
process.exit(1);
|
|
164
|
+
}
|
|
153
165
|
}
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* - Structured system prompt encouraging step-by-step reasoning
|
|
7
7
|
* - Error recovery with retries on transient failures
|
|
8
8
|
* - Context-aware done detection
|
|
9
|
-
* -
|
|
9
|
+
* - Configurable step limit (default 80, cap 200 via ARCHAL_MAX_STEPS)
|
|
10
10
|
* - Token usage and timing instrumentation
|
|
11
11
|
*
|
|
12
12
|
* Env vars (set by archal orchestrator):
|
|
@@ -34,7 +34,21 @@ import { createLogger } from '../_lib/logging.mjs';
|
|
|
34
34
|
import { writeMetrics } from '../_lib/metrics.mjs';
|
|
35
35
|
import { createAgentTrace } from '../_lib/agent-trace.mjs';
|
|
36
36
|
|
|
37
|
-
const
|
|
37
|
+
const DEFAULT_MAX_STEPS = 80;
|
|
38
|
+
const MAX_STEPS = (() => {
|
|
39
|
+
const raw = process.env['ARCHAL_MAX_STEPS']?.trim();
|
|
40
|
+
if (!raw) return DEFAULT_MAX_STEPS;
|
|
41
|
+
const parsed = parseInt(raw, 10);
|
|
42
|
+
if (Number.isNaN(parsed) || parsed <= 0) return DEFAULT_MAX_STEPS;
|
|
43
|
+
return Math.min(parsed, 200);
|
|
44
|
+
})();
|
|
45
|
+
const MAX_CONSECUTIVE_ERRORS = (() => {
|
|
46
|
+
const raw = process.env['ARCHAL_MAX_CONSECUTIVE_ERRORS']?.trim();
|
|
47
|
+
if (!raw) return 8;
|
|
48
|
+
const parsed = parseInt(raw, 10);
|
|
49
|
+
if (Number.isNaN(parsed) || parsed <= 0) return 8;
|
|
50
|
+
return Math.min(parsed, 20);
|
|
51
|
+
})();
|
|
38
52
|
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
39
53
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
40
54
|
|
|
@@ -95,10 +109,19 @@ try {
|
|
|
95
109
|
|
|
96
110
|
// Call the LLM with retry on transient errors
|
|
97
111
|
log.llmCall(step + 1);
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
112
|
+
let response;
|
|
113
|
+
try {
|
|
114
|
+
response = await withRetry(
|
|
115
|
+
() => callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools),
|
|
116
|
+
2,
|
|
117
|
+
);
|
|
118
|
+
} catch (err) {
|
|
119
|
+
const msg = err?.message ?? String(err);
|
|
120
|
+
log.error('llm_call_failed', { step: step + 1, error: msg });
|
|
121
|
+
process.stderr.write(`[react] LLM API error: ${msg.slice(0, 500)}\n`);
|
|
122
|
+
exitReason = 'llm_error';
|
|
123
|
+
break;
|
|
124
|
+
}
|
|
102
125
|
|
|
103
126
|
const iterDurationMs = Date.now() - iterStart;
|
|
104
127
|
totalInputTokens += response.usage.inputTokens;
|
|
@@ -154,7 +177,7 @@ try {
|
|
|
154
177
|
process.stderr.write(`[react] Tool error (${consecutiveErrors}): ${err.message}\n`);
|
|
155
178
|
|
|
156
179
|
// Bail if too many consecutive errors
|
|
157
|
-
if (consecutiveErrors >=
|
|
180
|
+
if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
|
|
158
181
|
process.stderr.write('[react] Too many consecutive tool errors — stopping.\n');
|
|
159
182
|
exitReason = 'consecutive_errors';
|
|
160
183
|
break;
|
|
@@ -171,7 +194,7 @@ try {
|
|
|
171
194
|
durationMs: iterDurationMs,
|
|
172
195
|
});
|
|
173
196
|
|
|
174
|
-
if (consecutiveErrors >=
|
|
197
|
+
if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) break;
|
|
175
198
|
|
|
176
199
|
// Append tool results to conversation
|
|
177
200
|
messages = appendToolResults(provider, messages, toolCalls, results);
|
|
@@ -209,4 +232,7 @@ try {
|
|
|
209
232
|
`${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
|
|
210
233
|
);
|
|
211
234
|
|
|
235
|
+
if (exitReason === 'llm_error') {
|
|
236
|
+
process.exit(1);
|
|
237
|
+
}
|
|
212
238
|
}
|
|
@@ -77,7 +77,16 @@ try {
|
|
|
77
77
|
const iterStart = Date.now();
|
|
78
78
|
|
|
79
79
|
log.llmCall(step + 1);
|
|
80
|
-
|
|
80
|
+
let response;
|
|
81
|
+
try {
|
|
82
|
+
response = await callLlmWithMessages(provider, MODEL, apiKey, messages, providerTools);
|
|
83
|
+
} catch (err) {
|
|
84
|
+
const msg = err?.message ?? String(err);
|
|
85
|
+
log.error('llm_call_failed', { step: step + 1, error: msg });
|
|
86
|
+
process.stderr.write(`[zero-shot] LLM API error: ${msg.slice(0, 500)}\n`);
|
|
87
|
+
exitReason = 'llm_error';
|
|
88
|
+
break;
|
|
89
|
+
}
|
|
81
90
|
|
|
82
91
|
const iterDurationMs = Date.now() - iterStart;
|
|
83
92
|
totalInputTokens += response.usage.inputTokens;
|
|
@@ -169,4 +178,7 @@ try {
|
|
|
169
178
|
`${totalOutputTokens} output tokens, ${(totalTimeMs / 1000).toFixed(1)}s total\n`
|
|
170
179
|
);
|
|
171
180
|
|
|
181
|
+
if (exitReason === 'llm_error') {
|
|
182
|
+
process.exit(1);
|
|
183
|
+
}
|
|
172
184
|
}
|