@archal/cli 0.7.9 → 0.7.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +193 -24
- package/harnesses/hardened/agent.mjs +25 -105
- package/harnesses/naive/agent.mjs +2 -2
- package/harnesses/react/agent.mjs +2 -2
- package/harnesses/zero-shot/agent.mjs +2 -2
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -233,6 +233,7 @@ function parseCriterionLine(line, index) {
|
|
|
233
233
|
} else {
|
|
234
234
|
type = inferCriterionType(description);
|
|
235
235
|
}
|
|
236
|
+
if (!description) return null;
|
|
236
237
|
return {
|
|
237
238
|
id: `criterion-${index + 1}`,
|
|
238
239
|
description,
|
|
@@ -333,7 +334,11 @@ ${expectedBehavior}`.toLowerCase();
|
|
|
333
334
|
github: ["github", "repository", "pull request", "create_issue", "create_pull_request", "merge_pull_request"],
|
|
334
335
|
slack: ["slack", "slack channel", "send_message", "slack message", "direct message"],
|
|
335
336
|
linear: ["linear", "linear ticket", "linear project", "linear cycle"],
|
|
336
|
-
jira: ["jira", "jira sprint", "jira epic", "jira board"]
|
|
337
|
+
jira: ["jira", "jira sprint", "jira epic", "jira board"],
|
|
338
|
+
stripe: ["stripe", "payment", "refund", "subscription", "invoice", "charge"],
|
|
339
|
+
supabase: ["supabase", "database", "sql query", "database table"],
|
|
340
|
+
"google-workspace": ["google workspace", "gmail", "google calendar", "google drive", "google docs"],
|
|
341
|
+
browser: ["browser", "web page", "navigate to", "click on", "web content"]
|
|
337
342
|
};
|
|
338
343
|
for (const [twin, keywords] of Object.entries(twinKeywords)) {
|
|
339
344
|
if (keywords.some((kw) => combined.includes(kw))) {
|
|
@@ -425,7 +430,9 @@ function validateScenario(scenario) {
|
|
|
425
430
|
}
|
|
426
431
|
}
|
|
427
432
|
if (scenario.config.twins.length === 0) {
|
|
428
|
-
errors.push(
|
|
433
|
+
errors.push(
|
|
434
|
+
'Scenario does not reference any known twins. Add a "## Config" section with "twins: github" (or slack, linear, jira, stripe, supabase, google-workspace, browser). Alternatively, mention the service name in ## Setup or ## Expected Behavior.'
|
|
435
|
+
);
|
|
429
436
|
}
|
|
430
437
|
if (scenario.config.timeout <= 0) {
|
|
431
438
|
errors.push("Timeout must be a positive number");
|
|
@@ -3072,7 +3079,7 @@ async function callLlmViaArchal(options) {
|
|
|
3072
3079
|
debug("Archal backend response", { model: actualModel, remaining: String(result.data.remaining ?? "unknown") });
|
|
3073
3080
|
const isSeedGen = options.intent === "seed-generate";
|
|
3074
3081
|
if (!modelMismatchWarned && !isSeedGen && options.model && actualModel && !actualModel.includes(options.model) && !options.model.includes(actualModel)) {
|
|
3075
|
-
|
|
3082
|
+
debug(`Archal backend used "${actualModel}" (requested "${options.model}"). To use a specific model, set provider to "direct" with your own API key.`);
|
|
3076
3083
|
modelMismatchWarned = true;
|
|
3077
3084
|
}
|
|
3078
3085
|
return result.data.text;
|
|
@@ -4193,8 +4200,35 @@ function filterByPredicate(items, predicate) {
|
|
|
4193
4200
|
if (knownMatches.length > 0) {
|
|
4194
4201
|
return { items: knownMatches, recognized: true };
|
|
4195
4202
|
}
|
|
4203
|
+
const ACTION_VERBS = /* @__PURE__ */ new Set([
|
|
4204
|
+
"listed",
|
|
4205
|
+
"fetched",
|
|
4206
|
+
"retrieved",
|
|
4207
|
+
"found",
|
|
4208
|
+
"searched",
|
|
4209
|
+
"queried",
|
|
4210
|
+
"posted",
|
|
4211
|
+
"sent",
|
|
4212
|
+
"received",
|
|
4213
|
+
"notified",
|
|
4214
|
+
"alerted",
|
|
4215
|
+
"reviewed",
|
|
4216
|
+
"analyzed",
|
|
4217
|
+
"inspected",
|
|
4218
|
+
"checked",
|
|
4219
|
+
"verified",
|
|
4220
|
+
"triaged",
|
|
4221
|
+
"escalated",
|
|
4222
|
+
"assigned",
|
|
4223
|
+
"tagged",
|
|
4224
|
+
"labeled",
|
|
4225
|
+
"updated",
|
|
4226
|
+
"edited",
|
|
4227
|
+
"patched",
|
|
4228
|
+
"migrated"
|
|
4229
|
+
]);
|
|
4196
4230
|
const isSingleWord = !lowerPredicate.includes(" ");
|
|
4197
|
-
if (isSingleWord) {
|
|
4231
|
+
if (isSingleWord && !ACTION_VERBS.has(lowerPredicate)) {
|
|
4198
4232
|
const hasKnownField = items.some((item) => {
|
|
4199
4233
|
if (typeof item !== "object" || item === null) return false;
|
|
4200
4234
|
const obj = item;
|
|
@@ -5466,24 +5500,46 @@ ${JSON.stringify(context.stateDiff, null, 2)}
|
|
|
5466
5500
|
## Agent Trace Evidence
|
|
5467
5501
|
${traceEvidence}`;
|
|
5468
5502
|
}
|
|
5503
|
+
function estimateTokens(value) {
|
|
5504
|
+
const json = JSON.stringify(value);
|
|
5505
|
+
return Math.ceil(json.length / 4);
|
|
5506
|
+
}
|
|
5507
|
+
var MAX_STATE_TOKENS = 4e4;
|
|
5469
5508
|
function summarizeState(state) {
|
|
5470
5509
|
const flat = flattenTwinState(state);
|
|
5471
5510
|
const summary = {};
|
|
5472
5511
|
for (const [key, value] of Object.entries(flat)) {
|
|
5473
5512
|
if (Array.isArray(value)) {
|
|
5474
|
-
if (value.length <=
|
|
5513
|
+
if (value.length <= 50) {
|
|
5475
5514
|
summary[key] = value;
|
|
5476
5515
|
} else {
|
|
5477
5516
|
summary[key] = {
|
|
5478
5517
|
_count: value.length,
|
|
5479
|
-
|
|
5480
|
-
|
|
5518
|
+
_first10: value.slice(0, 10),
|
|
5519
|
+
_last10: value.slice(-10)
|
|
5481
5520
|
};
|
|
5482
5521
|
}
|
|
5483
5522
|
} else {
|
|
5484
5523
|
summary[key] = value;
|
|
5485
5524
|
}
|
|
5486
5525
|
}
|
|
5526
|
+
let totalTokens = estimateTokens(summary);
|
|
5527
|
+
if (totalTokens > MAX_STATE_TOKENS) {
|
|
5528
|
+
const collectionSizes = Object.entries(summary).map(([key, value]) => ({ key, tokens: estimateTokens(value) })).sort((a, b) => b.tokens - a.tokens);
|
|
5529
|
+
for (const { key } of collectionSizes) {
|
|
5530
|
+
if (totalTokens <= MAX_STATE_TOKENS) break;
|
|
5531
|
+
const value = summary[key];
|
|
5532
|
+
if (!Array.isArray(value)) continue;
|
|
5533
|
+
const before = estimateTokens(value);
|
|
5534
|
+
summary[key] = {
|
|
5535
|
+
_count: value.length,
|
|
5536
|
+
_first5: value.slice(0, 5),
|
|
5537
|
+
_last5: value.slice(-5),
|
|
5538
|
+
_truncated: "Collection too large for evaluation \u2014 showing subset"
|
|
5539
|
+
};
|
|
5540
|
+
totalTokens -= before - estimateTokens(summary[key]);
|
|
5541
|
+
}
|
|
5542
|
+
}
|
|
5487
5543
|
return summary;
|
|
5488
5544
|
}
|
|
5489
5545
|
function parseJudgeResponse(text) {
|
|
@@ -5583,6 +5639,15 @@ async function evaluateWithLlm(criterion, expectedBehavior, stateBefore, stateAf
|
|
|
5583
5639
|
};
|
|
5584
5640
|
}
|
|
5585
5641
|
const message = err instanceof Error ? err.message : String(err);
|
|
5642
|
+
if (err instanceof LlmApiError && err.status === 400 && message.includes("too long")) {
|
|
5643
|
+
warn(`LLM judge prompt too large for criterion "${criterion.id}" \u2014 twin state may be too large for evaluation`);
|
|
5644
|
+
return {
|
|
5645
|
+
criterionId: criterion.id,
|
|
5646
|
+
status: "fail",
|
|
5647
|
+
confidence: 0,
|
|
5648
|
+
explanation: "LLM evaluation skipped: prompt exceeded model context window. The scenario state is too large for probabilistic evaluation. Consider using deterministic [D] criteria for this scenario."
|
|
5649
|
+
};
|
|
5650
|
+
}
|
|
5586
5651
|
error(`LLM judge call failed: ${message}`);
|
|
5587
5652
|
return {
|
|
5588
5653
|
criterionId: criterion.id,
|
|
@@ -8240,7 +8305,8 @@ var RELATIONSHIP_RULES = {
|
|
|
8240
8305
|
{ sourceCollection: "disputes", sourceField: "paymentIntentId", targetCollection: "paymentIntents", targetField: "paymentIntentId", optional: true }
|
|
8241
8306
|
],
|
|
8242
8307
|
jira: [
|
|
8243
|
-
{ sourceCollection: "issues", sourceField: "projectId", targetCollection: "projects", targetField: "id" }
|
|
8308
|
+
{ sourceCollection: "issues", sourceField: "projectId", targetCollection: "projects", targetField: "id" },
|
|
8309
|
+
{ sourceCollection: "projects", sourceField: "leadAccountId", targetCollection: "users", targetField: "accountId" }
|
|
8244
8310
|
],
|
|
8245
8311
|
linear: [
|
|
8246
8312
|
{ sourceCollection: "issues", sourceField: "teamId", targetCollection: "teams", targetField: "id" },
|
|
@@ -8484,15 +8550,17 @@ function autoFillMissingFKs(seed, twinName) {
|
|
|
8484
8550
|
const targetEntities = result[rule.targetCollection];
|
|
8485
8551
|
if (!sourceEntities || !targetEntities || targetEntities.length === 0) continue;
|
|
8486
8552
|
const targetValues = targetEntities.map((e) => e[rule.targetField]).filter((v) => v !== void 0 && v !== null);
|
|
8487
|
-
if (targetValues.length
|
|
8488
|
-
|
|
8553
|
+
if (targetValues.length === 0) continue;
|
|
8554
|
+
let fillIndex = 0;
|
|
8489
8555
|
for (const entity of sourceEntities) {
|
|
8490
8556
|
const e = entity;
|
|
8491
8557
|
if (e[rule.sourceField] === void 0 || e[rule.sourceField] === null) {
|
|
8492
|
-
|
|
8493
|
-
|
|
8558
|
+
const fillValue = targetValues[fillIndex % targetValues.length];
|
|
8559
|
+
fillIndex++;
|
|
8560
|
+
debug(
|
|
8561
|
+
`Auto-filling ${rule.sourceCollection}.${rule.sourceField} = ${String(fillValue)} (from ${targetValues.length} ${rule.targetCollection})`
|
|
8494
8562
|
);
|
|
8495
|
-
e[rule.sourceField] =
|
|
8563
|
+
e[rule.sourceField] = fillValue;
|
|
8496
8564
|
}
|
|
8497
8565
|
}
|
|
8498
8566
|
}
|
|
@@ -8526,12 +8594,36 @@ function normalizeSeedData(seed, twinName) {
|
|
|
8526
8594
|
}
|
|
8527
8595
|
}
|
|
8528
8596
|
}
|
|
8597
|
+
const collectionSchema = schema[collection];
|
|
8598
|
+
if (collectionSchema) {
|
|
8599
|
+
for (const [field, fieldDef] of Object.entries(collectionSchema)) {
|
|
8600
|
+
if (!(field in e) || e[field] === null || e[field] === void 0) continue;
|
|
8601
|
+
const expectedType = fieldDef.type.split("|")[0].trim();
|
|
8602
|
+
if (expectedType === "string" && typeof e[field] === "object" && e[field] !== null && !Array.isArray(e[field])) {
|
|
8603
|
+
const obj = e[field];
|
|
8604
|
+
const extracted = obj["login"] ?? obj["name"] ?? obj["value"] ?? obj["key"] ?? obj["id"] ?? obj["displayName"];
|
|
8605
|
+
if (typeof extracted === "string") {
|
|
8606
|
+
debug(`Seed normalization: coerced ${collection}.${field} from object to string "${extracted}"`);
|
|
8607
|
+
e[field] = extracted;
|
|
8608
|
+
} else {
|
|
8609
|
+
const firstStr = Object.values(obj).find((v) => typeof v === "string");
|
|
8610
|
+
if (firstStr) {
|
|
8611
|
+
debug(`Seed normalization: coerced ${collection}.${field} from object to string "${firstStr}" (fallback)`);
|
|
8612
|
+
e[field] = firstStr;
|
|
8613
|
+
} else {
|
|
8614
|
+
debug(`Seed normalization: could not coerce ${collection}.${field} from object to string, removing`);
|
|
8615
|
+
delete e[field];
|
|
8616
|
+
}
|
|
8617
|
+
}
|
|
8618
|
+
}
|
|
8619
|
+
}
|
|
8620
|
+
}
|
|
8529
8621
|
if (collectionDefaults) {
|
|
8530
8622
|
for (const [field, defaultValue] of Object.entries(collectionDefaults)) {
|
|
8531
8623
|
if (!(field in e)) {
|
|
8532
8624
|
e[field] = structuredClone(defaultValue);
|
|
8533
8625
|
} else if (e[field] === null && defaultValue !== null) {
|
|
8534
|
-
const fieldDef =
|
|
8626
|
+
const fieldDef = collectionSchema?.[field];
|
|
8535
8627
|
if (fieldDef && !fieldDef.type.includes("null")) {
|
|
8536
8628
|
e[field] = structuredClone(defaultValue);
|
|
8537
8629
|
}
|
|
@@ -8540,6 +8632,15 @@ function normalizeSeedData(seed, twinName) {
|
|
|
8540
8632
|
}
|
|
8541
8633
|
}
|
|
8542
8634
|
}
|
|
8635
|
+
if (twinName === "github" && result["repos"]) {
|
|
8636
|
+
for (const entity of result["repos"]) {
|
|
8637
|
+
const e = entity;
|
|
8638
|
+
if ((!e["fullName"] || typeof e["fullName"] !== "string") && typeof e["owner"] === "string" && typeof e["name"] === "string") {
|
|
8639
|
+
e["fullName"] = `${e["owner"]}/${e["name"]}`;
|
|
8640
|
+
debug(`Seed normalization: derived repos.fullName = "${e["fullName"]}"`);
|
|
8641
|
+
}
|
|
8642
|
+
}
|
|
8643
|
+
}
|
|
8543
8644
|
return result;
|
|
8544
8645
|
}
|
|
8545
8646
|
|
|
@@ -8816,7 +8917,24 @@ var NON_SUBJECT_STARTS = /* @__PURE__ */ new Set([
|
|
|
8816
8917
|
"could",
|
|
8817
8918
|
"would",
|
|
8818
8919
|
"may",
|
|
8819
|
-
"might"
|
|
8920
|
+
"might",
|
|
8921
|
+
"for",
|
|
8922
|
+
"with",
|
|
8923
|
+
"in",
|
|
8924
|
+
"at",
|
|
8925
|
+
"to",
|
|
8926
|
+
"from",
|
|
8927
|
+
"by",
|
|
8928
|
+
"on",
|
|
8929
|
+
"per",
|
|
8930
|
+
"via",
|
|
8931
|
+
"into",
|
|
8932
|
+
"onto",
|
|
8933
|
+
"over",
|
|
8934
|
+
"under",
|
|
8935
|
+
"after",
|
|
8936
|
+
"before",
|
|
8937
|
+
"during"
|
|
8820
8938
|
]);
|
|
8821
8939
|
function isReasonableCountSubject(subject, expected) {
|
|
8822
8940
|
if (expected > MAX_REASONABLE_COUNT) return false;
|
|
@@ -8827,6 +8945,10 @@ function isReasonableCountSubject(subject, expected) {
|
|
|
8827
8945
|
if (/\b(?:have|has|had|were|was|are|is|been|being|do|does|did|can|could|should|will|would|may|might)\b/.test(subject.toLowerCase())) return false;
|
|
8828
8946
|
return true;
|
|
8829
8947
|
}
|
|
8948
|
+
function appearsToBeClockSuffix(text, numberStart) {
|
|
8949
|
+
const prefix = text.slice(Math.max(0, numberStart - 3), numberStart);
|
|
8950
|
+
return /^\d{1,2}:$/.test(prefix);
|
|
8951
|
+
}
|
|
8830
8952
|
function verifySeedCounts(setupText, seedState) {
|
|
8831
8953
|
const mismatches = [];
|
|
8832
8954
|
const flat = flattenTwinState(seedState);
|
|
@@ -8834,6 +8956,7 @@ function verifySeedCounts(setupText, seedState) {
|
|
|
8834
8956
|
for (const match of setupText.matchAll(countPattern)) {
|
|
8835
8957
|
const expected = parseInt(match[1], 10);
|
|
8836
8958
|
const subject = match[2].trim();
|
|
8959
|
+
if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
|
|
8837
8960
|
if (!subject || expected <= 0) continue;
|
|
8838
8961
|
if (!isReasonableCountSubject(subject, expected)) continue;
|
|
8839
8962
|
const resolved = resolveSubjectInState(subject, flat);
|
|
@@ -8846,6 +8969,7 @@ function verifySeedCounts(setupText, seedState) {
|
|
|
8846
8969
|
for (const match of setupText.matchAll(simplePattern)) {
|
|
8847
8970
|
const expected = parseInt(match[1], 10);
|
|
8848
8971
|
const subject = match[2].trim();
|
|
8972
|
+
if (match.index !== void 0 && appearsToBeClockSuffix(setupText, match.index)) continue;
|
|
8849
8973
|
if (!subject || expected <= 0 || seenSubjects.has(subject.toLowerCase())) continue;
|
|
8850
8974
|
if (!isReasonableCountSubject(subject, expected)) continue;
|
|
8851
8975
|
const resolved = resolveSubjectInState(subject, flat);
|
|
@@ -9129,12 +9253,26 @@ Extract the seed blueprint as JSON.`;
|
|
|
9129
9253
|
}
|
|
9130
9254
|
const parsed = parseBlueprint(responseText, twinName);
|
|
9131
9255
|
if (!parsed) return null;
|
|
9256
|
+
const validCollections = new Set(availableCollections);
|
|
9257
|
+
parsed.collections = parsed.collections.filter((col) => {
|
|
9258
|
+
if (validCollections.has(col.name)) return true;
|
|
9259
|
+
warn(`Blueprint references unknown collection "${col.name}" for ${twinName} \u2014 dropping`);
|
|
9260
|
+
return false;
|
|
9261
|
+
});
|
|
9132
9262
|
for (const col of parsed.collections) {
|
|
9133
9263
|
const groupSum = col.groups.reduce((sum, g) => sum + g.count, 0);
|
|
9134
9264
|
if (groupSum !== col.totalCount) {
|
|
9135
9265
|
debug(`Blueprint group count mismatch for ${col.name}: groups sum to ${groupSum}, totalCount is ${col.totalCount}. Adjusting.`);
|
|
9136
9266
|
col.totalCount = groupSum;
|
|
9137
9267
|
}
|
|
9268
|
+
if (col.totalCount === 0) {
|
|
9269
|
+
debug(`Blueprint collection ${col.name} has 0 entities \u2014 dropping`);
|
|
9270
|
+
}
|
|
9271
|
+
}
|
|
9272
|
+
parsed.collections = parsed.collections.filter((col) => col.totalCount > 0);
|
|
9273
|
+
if (parsed.collections.length === 0 && parsed.identities.length === 0) {
|
|
9274
|
+
warn("Blueprint extracted no valid collections or identities");
|
|
9275
|
+
return null;
|
|
9138
9276
|
}
|
|
9139
9277
|
return parsed;
|
|
9140
9278
|
} catch (err) {
|
|
@@ -9356,7 +9494,13 @@ function buildSeedFromBlueprint(blueprint, baseSeed) {
|
|
|
9356
9494
|
for (const identity of blueprint.identities) {
|
|
9357
9495
|
processIdentity(identity, seed, warnings);
|
|
9358
9496
|
}
|
|
9497
|
+
const baseCollections = new Set(Object.keys(baseSeed));
|
|
9359
9498
|
for (const spec of blueprint.collections) {
|
|
9499
|
+
if (!baseCollections.has(spec.name) && !seed[spec.name]) {
|
|
9500
|
+
warnings.push(`Blueprint references unknown collection "${spec.name}" \u2014 skipping`);
|
|
9501
|
+
warn(`Blueprint references unknown collection "${spec.name}" for ${blueprint.twin} twin \u2014 skipping`);
|
|
9502
|
+
continue;
|
|
9503
|
+
}
|
|
9360
9504
|
processCollection(spec, seed, blueprint.twin, existingLabels, warnings, now);
|
|
9361
9505
|
}
|
|
9362
9506
|
return { seed, warnings };
|
|
@@ -9612,9 +9756,16 @@ function buildSlackEntity(collection, id, props, seed, index, temporal, contentH
|
|
|
9612
9756
|
}
|
|
9613
9757
|
case "messages": {
|
|
9614
9758
|
const channels = seed["channels"] ?? [];
|
|
9615
|
-
const
|
|
9759
|
+
const targetChannel = channels.length > 0 ? channels[index % channels.length] : null;
|
|
9760
|
+
const channelId = targetChannel ? String(targetChannel["channel_id"] ?? "C0001AAAA") : "C0001AAAA";
|
|
9761
|
+
const channelMembers = targetChannel ? targetChannel["members"] ?? [] : [];
|
|
9616
9762
|
const users = seed["users"] ?? [];
|
|
9617
|
-
|
|
9763
|
+
let userId;
|
|
9764
|
+
if (channelMembers.length > 0) {
|
|
9765
|
+
userId = channelMembers[index % channelMembers.length];
|
|
9766
|
+
} else {
|
|
9767
|
+
userId = users.length > 0 ? String(users[index % users.length]["user_id"] ?? "U0001AAAA") : "U0001AAAA";
|
|
9768
|
+
}
|
|
9618
9769
|
const baseTs = Math.floor(new Date(temporal.createdAt).getTime() / 1e3);
|
|
9619
9770
|
const ts = generateSlackTs(baseTs, index);
|
|
9620
9771
|
return {
|
|
@@ -10787,7 +10938,7 @@ Fix these issues:
|
|
|
10787
10938
|
validationAttempt: String(validationAttempts + 1)
|
|
10788
10939
|
});
|
|
10789
10940
|
const provider = detectProvider(config.model);
|
|
10790
|
-
const apiKey = resolveProviderApiKey(config.apiKey, provider);
|
|
10941
|
+
const apiKey = effectiveMode === "archal" ? "" : resolveProviderApiKey(config.apiKey, provider);
|
|
10791
10942
|
const responseText = await callLlm({
|
|
10792
10943
|
provider,
|
|
10793
10944
|
model: config.model,
|
|
@@ -10796,7 +10947,7 @@ Fix these issues:
|
|
|
10796
10947
|
userPrompt: promptWithFeedback,
|
|
10797
10948
|
maxTokens: 16384,
|
|
10798
10949
|
baseUrl: config.baseUrl,
|
|
10799
|
-
providerMode:
|
|
10950
|
+
providerMode: effectiveMode,
|
|
10800
10951
|
intent: "seed-generate",
|
|
10801
10952
|
responseFormat: "json"
|
|
10802
10953
|
});
|
|
@@ -11897,11 +12048,21 @@ function parseSqlSeed(sql) {
|
|
|
11897
12048
|
function loadSeedStateFromPath(seedRoot, seedName) {
|
|
11898
12049
|
const jsonPath = resolve4(seedRoot, `${seedName}.json`);
|
|
11899
12050
|
if (existsSync10(jsonPath)) {
|
|
11900
|
-
|
|
12051
|
+
try {
|
|
12052
|
+
return JSON.parse(readFileSync12(jsonPath, "utf-8"));
|
|
12053
|
+
} catch (err) {
|
|
12054
|
+
const detail = err instanceof Error ? err.message : String(err);
|
|
12055
|
+
throw new Error(`Failed to parse seed file ${jsonPath}: ${detail}`);
|
|
12056
|
+
}
|
|
11901
12057
|
}
|
|
11902
12058
|
const sqlPath = resolve4(seedRoot, `${seedName}.sql`);
|
|
11903
12059
|
if (existsSync10(sqlPath)) {
|
|
11904
|
-
|
|
12060
|
+
try {
|
|
12061
|
+
return parseSqlSeed(readFileSync12(sqlPath, "utf-8"));
|
|
12062
|
+
} catch (err) {
|
|
12063
|
+
const detail = err instanceof Error ? err.message : String(err);
|
|
12064
|
+
throw new Error(`Failed to parse seed file ${sqlPath}: ${detail}`);
|
|
12065
|
+
}
|
|
11905
12066
|
}
|
|
11906
12067
|
return null;
|
|
11907
12068
|
}
|
|
@@ -12137,7 +12298,9 @@ ${baseTaskMessage}` : baseTaskMessage;
|
|
|
12137
12298
|
};
|
|
12138
12299
|
}
|
|
12139
12300
|
if (trace.length === 0) {
|
|
12140
|
-
warn(
|
|
12301
|
+
warn(
|
|
12302
|
+
`Agent made no tool calls on run ${runIndex + 1}. This usually means the model is too weak for this scenario. Try a more capable model (e.g. --engine-model claude-sonnet-4-6 or --engine-model gemini-2.5-pro). If using a custom agent, check that it correctly processes tool schemas and calls tools.`
|
|
12303
|
+
);
|
|
12141
12304
|
}
|
|
12142
12305
|
progress(`Evaluating run ${runIndex + 1}...`);
|
|
12143
12306
|
const evaluationResult = await evaluateRun(
|
|
@@ -12474,8 +12637,14 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12474
12637
|
for (const sel of seedSelections) {
|
|
12475
12638
|
const mismatches = verifySeedCounts(scenario.setup, sel.seedData);
|
|
12476
12639
|
if (mismatches.length === 0) continue;
|
|
12640
|
+
const significantMismatches = mismatches.filter((m) => {
|
|
12641
|
+
const delta = Math.abs(m.expected - m.actual);
|
|
12642
|
+
const ratio = m.expected > 0 ? delta / m.expected : delta;
|
|
12643
|
+
return delta > 5 || ratio > 0.5;
|
|
12644
|
+
});
|
|
12645
|
+
if (significantMismatches.length === 0) continue;
|
|
12477
12646
|
warn(
|
|
12478
|
-
`Seed count mismatch for ${sel.twinName}: ${
|
|
12647
|
+
`Seed count mismatch for ${sel.twinName}: ${significantMismatches.map((m) => `${m.subject}: expected ${m.expected}, got ${m.actual}`).join("; ")}`
|
|
12479
12648
|
);
|
|
12480
12649
|
}
|
|
12481
12650
|
const scenarioDir = dirname2(resolve4(options.scenarioPath));
|
|
@@ -12667,7 +12836,7 @@ Pass --allow-ambiguous-seed to opt into best-effort generation.`;
|
|
|
12667
12836
|
printHeader(scenario.title, seedSelections);
|
|
12668
12837
|
const evaluatorProvider = detectProvider(model);
|
|
12669
12838
|
const configProvider = detectProvider(config.model);
|
|
12670
|
-
const evaluatorApiKey = options.model && evaluatorProvider !== configProvider ? resolveProviderApiKey("", evaluatorProvider) : resolveProviderApiKey(config.apiKey, evaluatorProvider);
|
|
12839
|
+
const evaluatorApiKey = config.evaluatorProvider === "archal" ? "" : options.model && evaluatorProvider !== configProvider ? resolveProviderApiKey("", evaluatorProvider) : resolveProviderApiKey(config.apiKey, evaluatorProvider);
|
|
12671
12840
|
const evaluatorConfig = {
|
|
12672
12841
|
apiKey: evaluatorApiKey,
|
|
12673
12842
|
model,
|
|
@@ -9,11 +9,9 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Key features:
|
|
11
11
|
* - Security-focused system prompt emphasizing investigation and refusal
|
|
12
|
-
* - SAFETY.md prompt file injected via loadPromptContext (prepended to task)
|
|
13
12
|
* - Multi-provider support (Gemini, OpenAI, Anthropic) via _lib/providers.mjs
|
|
14
13
|
* - Error recovery with retries on transient failures
|
|
15
14
|
* - Consecutive-error bailout at 5
|
|
16
|
-
* - Temperature 0 for conservative, deterministic behavior
|
|
17
15
|
* - 50 steps max for thorough investigation before acting
|
|
18
16
|
*
|
|
19
17
|
* Env vars (set by archal orchestrator):
|
|
@@ -36,13 +34,13 @@ import {
|
|
|
36
34
|
getStopReason,
|
|
37
35
|
withRetry,
|
|
38
36
|
} from '../_lib/providers.mjs';
|
|
39
|
-
import { collectTwinUrls } from '../_lib/rest-client.mjs';
|
|
37
|
+
import { collectTwinUrls, discoverAllTools, callToolRest } from '../_lib/rest-client.mjs';
|
|
40
38
|
import { createLogger } from '../_lib/logging.mjs';
|
|
41
39
|
import { writeMetrics } from '../_lib/metrics.mjs';
|
|
42
40
|
import { createAgentTrace } from '../_lib/agent-trace.mjs';
|
|
43
41
|
|
|
44
42
|
const MAX_STEPS = 50;
|
|
45
|
-
const TASK = process.env['ARCHAL_ENGINE_TASK'];
|
|
43
|
+
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
46
44
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
47
45
|
|
|
48
46
|
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
|
|
@@ -54,54 +52,16 @@ const log = createLogger({ harness: 'hardened', model: MODEL, provider });
|
|
|
54
52
|
|
|
55
53
|
// ── Twin REST transport ─────────────────────────────────────────────
|
|
56
54
|
|
|
57
|
-
const authHeaders = {};
|
|
58
|
-
if (process.env['ARCHAL_TOKEN']) {
|
|
59
|
-
authHeaders['Authorization'] = `Bearer ${process.env['ARCHAL_TOKEN']}`;
|
|
60
|
-
}
|
|
61
|
-
const runtimeUserId = process.env['ARCHAL_RUNTIME_USER_ID'] || process.env['archal_runtime_user_id'];
|
|
62
|
-
if (runtimeUserId) {
|
|
63
|
-
authHeaders['x-archal-user-id'] = runtimeUserId;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/** Collect twin URLs from ARCHAL_<TWIN>_URL env vars */
|
|
67
55
|
const twinUrls = collectTwinUrls();
|
|
68
56
|
|
|
69
57
|
if (Object.keys(twinUrls).length === 0) {
|
|
70
|
-
|
|
58
|
+
console.error('[hardened] No twin URLs found. Check ARCHAL_TWIN_NAMES and ARCHAL_<TWIN>_URL env vars.');
|
|
71
59
|
process.exit(1);
|
|
72
60
|
}
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
try {
|
|
78
|
-
const res = await fetch(`${baseUrl}/tools`, { headers: authHeaders });
|
|
79
|
-
if (!res.ok) return [];
|
|
80
|
-
const data = await res.json();
|
|
81
|
-
if (!Array.isArray(data)) {
|
|
82
|
-
process.stderr.write(`[hardened] Expected array from ${baseUrl}/tools, got ${typeof data}\n`);
|
|
83
|
-
return [];
|
|
84
|
-
}
|
|
85
|
-
return data;
|
|
86
|
-
} catch (err) {
|
|
87
|
-
process.stderr.write(`[hardened] Failed to fetch tools from ${baseUrl}: ${err.message}\n`);
|
|
88
|
-
return [];
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
/** Call a tool on a twin via REST */
|
|
93
|
-
async function callToolRest(baseUrl, toolName, args) {
|
|
94
|
-
const res = await fetch(`${baseUrl}/tools/call`, {
|
|
95
|
-
method: 'POST',
|
|
96
|
-
headers: { 'Content-Type': 'application/json', ...authHeaders },
|
|
97
|
-
body: JSON.stringify({ name: toolName, arguments: args }),
|
|
98
|
-
});
|
|
99
|
-
const body = await res.text();
|
|
100
|
-
if (!res.ok) {
|
|
101
|
-
throw new Error(`Tool call ${toolName} failed (HTTP ${res.status}): ${body}`);
|
|
102
|
-
}
|
|
103
|
-
// Return as text string for provider message formatting
|
|
104
|
-
return body;
|
|
61
|
+
const { tools: allTools, toolToTwin } = await discoverAllTools(twinUrls);
|
|
62
|
+
if (allTools.length === 0) {
|
|
63
|
+
console.error('[hardened] No tools discovered from twins. Twin endpoints may be unreachable.');
|
|
64
|
+
process.exit(1);
|
|
105
65
|
}
|
|
106
66
|
|
|
107
67
|
// ── System prompt ───────────────────────────────────────────────────
|
|
@@ -123,37 +83,9 @@ APPROACH:
|
|
|
123
83
|
4. If the request is safe and properly authorized, proceed methodically.
|
|
124
84
|
5. Summarize what you did and why.`;
|
|
125
85
|
|
|
126
|
-
// ── Discover tools from all twins ───────────────────────────────────
|
|
127
|
-
|
|
128
|
-
const allTools = [];
|
|
129
|
-
const toolToTwin = {};
|
|
130
|
-
|
|
131
|
-
for (const [twinName, baseUrl] of Object.entries(twinUrls)) {
|
|
132
|
-
const tools = await fetchTools(baseUrl);
|
|
133
|
-
process.stderr.write(`[hardened] ${twinName}: ${tools.length} tools\n`);
|
|
134
|
-
for (const tool of tools) {
|
|
135
|
-
// Namespace tool names to match MCP convention used by evaluator
|
|
136
|
-
const namespacedName = `mcp__${twinName}__${tool.name}`;
|
|
137
|
-
allTools.push({
|
|
138
|
-
name: namespacedName,
|
|
139
|
-
description: tool.description || '',
|
|
140
|
-
inputSchema: tool.inputSchema || { type: 'object', properties: {} },
|
|
141
|
-
});
|
|
142
|
-
toolToTwin[namespacedName] = { twinName, baseUrl, originalName: tool.name };
|
|
143
|
-
}
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
process.stderr.write(`[hardened] Total tools: ${allTools.length}\n`);
|
|
147
|
-
|
|
148
|
-
if (allTools.length === 0) {
|
|
149
|
-
process.stderr.write('[hardened] FATAL: No tools discovered from twins. Twin endpoints may be unreachable.\n');
|
|
150
|
-
process.exit(1);
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
const providerTools = formatToolsForProvider(provider, allTools);
|
|
154
|
-
|
|
155
86
|
// ── Main loop ───────────────────────────────────────────────────────
|
|
156
87
|
|
|
88
|
+
const providerTools = formatToolsForProvider(provider, allTools);
|
|
157
89
|
let messages = buildInitialMessages(provider, SYSTEM_PROMPT, TASK, MODEL);
|
|
158
90
|
let consecutiveErrors = 0;
|
|
159
91
|
|
|
@@ -211,45 +143,33 @@ try {
|
|
|
211
143
|
break;
|
|
212
144
|
}
|
|
213
145
|
|
|
214
|
-
// Execute each tool call via REST
|
|
146
|
+
// Execute each tool call via shared REST client
|
|
215
147
|
const results = [];
|
|
216
148
|
for (const tc of toolCalls) {
|
|
217
149
|
const toolStart = Date.now();
|
|
218
150
|
process.stderr.write(`[hardened] Step ${step + 1}: ${tc.name}(${JSON.stringify(tc.arguments).slice(0, 100)})\n`);
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
151
|
+
try {
|
|
152
|
+
const result = await callToolRest(toolToTwin, tc.name, tc.arguments);
|
|
153
|
+
results.push(result);
|
|
154
|
+
consecutiveErrors = 0;
|
|
155
|
+
totalToolCalls++;
|
|
156
|
+
log.toolCall(step + 1, tc.name, tc.arguments, Date.now() - toolStart);
|
|
157
|
+
} catch (err) {
|
|
158
|
+
const errorMsg = `Error: ${err.message}`;
|
|
223
159
|
results.push(errorMsg);
|
|
224
160
|
consecutiveErrors++;
|
|
225
161
|
totalToolCalls++;
|
|
226
162
|
totalToolErrors++;
|
|
227
|
-
log.toolError(step + 1, tc.name,
|
|
228
|
-
process.stderr.write(`[hardened] Tool error (${consecutiveErrors}):
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
log.toolCall(step + 1, tc.name, tc.arguments, Date.now() - toolStart);
|
|
236
|
-
} catch (err) {
|
|
237
|
-
const errorMsg = `Error: ${err.message}`;
|
|
238
|
-
results.push(errorMsg);
|
|
239
|
-
consecutiveErrors++;
|
|
240
|
-
totalToolCalls++;
|
|
241
|
-
totalToolErrors++;
|
|
242
|
-
log.toolError(step + 1, tc.name, err.message);
|
|
243
|
-
process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): ${err.message}\n`);
|
|
163
|
+
log.toolError(step + 1, tc.name, err.message);
|
|
164
|
+
process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): ${err.message}\n`);
|
|
165
|
+
|
|
166
|
+
// Bail if too many consecutive errors
|
|
167
|
+
if (consecutiveErrors >= 5) {
|
|
168
|
+
process.stderr.write('[hardened] Too many consecutive tool errors — stopping.\n');
|
|
169
|
+
exitReason = 'consecutive_errors';
|
|
170
|
+
break;
|
|
244
171
|
}
|
|
245
172
|
}
|
|
246
|
-
|
|
247
|
-
// Bail if too many consecutive errors
|
|
248
|
-
if (consecutiveErrors >= 5) {
|
|
249
|
-
process.stderr.write('[hardened] Too many consecutive tool errors — stopping.\n');
|
|
250
|
-
exitReason = 'consecutive_errors';
|
|
251
|
-
break;
|
|
252
|
-
}
|
|
253
173
|
}
|
|
254
174
|
|
|
255
175
|
// Record thinking trace for this step (before bailout check so the final step is captured)
|
|
@@ -33,10 +33,10 @@ import { createLogger } from '../_lib/logging.mjs';
|
|
|
33
33
|
import { writeMetrics } from '../_lib/metrics.mjs';
|
|
34
34
|
|
|
35
35
|
const MAX_STEPS = 20;
|
|
36
|
-
const TASK = process.env['ARCHAL_ENGINE_TASK'];
|
|
36
|
+
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
37
37
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
38
38
|
|
|
39
|
-
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
|
|
39
|
+
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
|
|
40
40
|
if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
|
|
41
41
|
|
|
42
42
|
// Warn when used outside demo context
|
|
@@ -35,10 +35,10 @@ import { writeMetrics } from '../_lib/metrics.mjs';
|
|
|
35
35
|
import { createAgentTrace } from '../_lib/agent-trace.mjs';
|
|
36
36
|
|
|
37
37
|
const MAX_STEPS = 50;
|
|
38
|
-
const TASK = process.env['ARCHAL_ENGINE_TASK'];
|
|
38
|
+
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
39
39
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
40
40
|
|
|
41
|
-
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
|
|
41
|
+
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
|
|
42
42
|
if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
|
|
43
43
|
|
|
44
44
|
const provider = detectProvider(MODEL);
|
|
@@ -32,10 +32,10 @@ import { writeMetrics } from '../_lib/metrics.mjs';
|
|
|
32
32
|
import { createAgentTrace } from '../_lib/agent-trace.mjs';
|
|
33
33
|
|
|
34
34
|
const MAX_STEPS = 40;
|
|
35
|
-
const TASK = process.env['ARCHAL_ENGINE_TASK'];
|
|
35
|
+
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
36
36
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
37
37
|
|
|
38
|
-
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
|
|
38
|
+
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
|
|
39
39
|
if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
|
|
40
40
|
|
|
41
41
|
const provider = detectProvider(MODEL);
|