@archal/cli 0.7.7 → 0.7.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1326 -54
- package/harnesses/_lib/model-configs.mjs +26 -19
- package/harnesses/_lib/providers.mjs +24 -8
- package/harnesses/hardened/agent.mjs +25 -105
- package/harnesses/naive/agent.mjs +2 -2
- package/harnesses/react/agent.mjs +2 -2
- package/harnesses/zero-shot/agent.mjs +2 -2
- package/package.json +1 -1
|
@@ -384,24 +384,25 @@ export function getModelCapabilities(model) {
|
|
|
384
384
|
* @returns {string | null}
|
|
385
385
|
*/
|
|
386
386
|
export function detectModelFamily(model) {
|
|
387
|
-
const
|
|
387
|
+
const normalized = String(model ?? '').toLowerCase();
|
|
388
|
+
const info = MODEL_REGISTRY[normalized];
|
|
388
389
|
if (info) return info.family;
|
|
389
390
|
|
|
390
391
|
// Prefix-based heuristic for unregistered models
|
|
391
|
-
if (
|
|
392
|
-
if (
|
|
393
|
-
if (
|
|
394
|
-
if (
|
|
395
|
-
if (
|
|
396
|
-
if (
|
|
397
|
-
if (
|
|
398
|
-
if (
|
|
399
|
-
if (
|
|
400
|
-
if (
|
|
401
|
-
if (
|
|
402
|
-
if (
|
|
403
|
-
if (
|
|
404
|
-
if (
|
|
392
|
+
if (normalized.startsWith('claude-opus') || normalized.startsWith('opus-')) return 'claude-opus';
|
|
393
|
+
if (normalized.startsWith('claude-sonnet') || normalized.startsWith('sonnet-')) return 'claude-sonnet';
|
|
394
|
+
if (normalized.startsWith('claude-haiku') || normalized.startsWith('haiku-')) return 'claude-haiku';
|
|
395
|
+
if (normalized.startsWith('gpt-4o-mini')) return 'gpt-4o-mini';
|
|
396
|
+
if (normalized.startsWith('gpt-4o')) return 'gpt-4o';
|
|
397
|
+
if (normalized.startsWith('gpt-4.1')) return 'gpt-4.1';
|
|
398
|
+
if (normalized.startsWith('gpt-5')) return 'gpt-5.1';
|
|
399
|
+
if (normalized.startsWith('gpt-4')) return 'gpt-4o'; // assume 4o-class
|
|
400
|
+
if (normalized.startsWith('o1-mini')) return 'o1-mini';
|
|
401
|
+
if (normalized.startsWith('o1')) return 'o1';
|
|
402
|
+
if (normalized.startsWith('o3-mini')) return 'o3-mini';
|
|
403
|
+
if (normalized.startsWith('o4-mini')) return 'o4-mini';
|
|
404
|
+
if (normalized.startsWith('gemini') && normalized.includes('pro')) return 'gemini-pro';
|
|
405
|
+
if (normalized.startsWith('gemini') && normalized.includes('flash')) return 'gemini-flash';
|
|
405
406
|
|
|
406
407
|
return null;
|
|
407
408
|
}
|
|
@@ -483,12 +484,18 @@ export function isReasoningModel(model) {
|
|
|
483
484
|
* @returns {boolean}
|
|
484
485
|
*/
|
|
485
486
|
export function isThinkingModel(model) {
|
|
486
|
-
const
|
|
487
|
+
const normalized = String(model ?? '').toLowerCase();
|
|
488
|
+
const info = MODEL_REGISTRY[normalized];
|
|
487
489
|
if (info) return info.capabilities.supportsThinking;
|
|
488
490
|
// Heuristic for unregistered models — most modern models support thinking
|
|
489
|
-
if (
|
|
490
|
-
|
|
491
|
-
|
|
491
|
+
if (
|
|
492
|
+
normalized.startsWith('claude-')
|
|
493
|
+
|| normalized.startsWith('sonnet-')
|
|
494
|
+
|| normalized.startsWith('haiku-')
|
|
495
|
+
|| normalized.startsWith('opus-')
|
|
496
|
+
) return true;
|
|
497
|
+
if (normalized.startsWith('gemini-2.5') || normalized.startsWith('gemini-3')) return true;
|
|
498
|
+
if (normalized.startsWith('gpt-') || /^o[134]/.test(normalized)) return true;
|
|
492
499
|
return true; // default to true for unknown models
|
|
493
500
|
}
|
|
494
501
|
|
|
@@ -26,11 +26,17 @@ import { getModelConfig, isReasoningModel, isThinkingModel, getModelCapabilities
|
|
|
26
26
|
* @returns {'gemini' | 'anthropic' | 'openai'}
|
|
27
27
|
*/
|
|
28
28
|
export function detectProvider(model) {
|
|
29
|
-
|
|
30
|
-
if (
|
|
29
|
+
const normalized = String(model ?? '').toLowerCase();
|
|
30
|
+
if (normalized.startsWith('gemini-')) return 'gemini';
|
|
31
31
|
if (
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
normalized.startsWith('claude-')
|
|
33
|
+
|| normalized.startsWith('sonnet-')
|
|
34
|
+
|| normalized.startsWith('haiku-')
|
|
35
|
+
|| normalized.startsWith('opus-')
|
|
36
|
+
) return 'anthropic';
|
|
37
|
+
if (
|
|
38
|
+
normalized.startsWith('gpt-') ||
|
|
39
|
+
/^o[134]/.test(normalized)
|
|
34
40
|
) return 'openai';
|
|
35
41
|
// Default to OpenAI-compatible for unknown models
|
|
36
42
|
return 'openai';
|
|
@@ -140,13 +146,23 @@ function getAnthropicThinkingParam(model) {
|
|
|
140
146
|
const budget = parseThinkingBudget();
|
|
141
147
|
if (budget === null) return null;
|
|
142
148
|
|
|
143
|
-
//
|
|
144
|
-
|
|
145
|
-
|
|
149
|
+
// Only 4.6 series models support adaptive thinking.
|
|
150
|
+
// Older models (claude-sonnet-4-20250514, claude-haiku-4-5-20251001) need
|
|
151
|
+
// { type: "enabled", budget_tokens: N } — "adaptive" returns a 400 error.
|
|
152
|
+
const normalized = String(model ?? '').toLowerCase();
|
|
153
|
+
const supportsAdaptive = normalized.includes('-4-6') || normalized.includes('4-6-');
|
|
154
|
+
const isOpus = normalized.startsWith('claude-opus') || normalized.startsWith('opus-');
|
|
155
|
+
|
|
156
|
+
if (isOpus || (supportsAdaptive && budget === 'adaptive')) {
|
|
146
157
|
return { type: 'adaptive' };
|
|
147
158
|
}
|
|
148
159
|
|
|
149
|
-
|
|
160
|
+
if (budget === 'adaptive') {
|
|
161
|
+
// For non-4.6 models with default "adaptive" budget, use a sensible fixed budget
|
|
162
|
+
return { type: 'enabled', budget_tokens: 10000 };
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// Explicit numeric budget
|
|
150
166
|
return { type: 'enabled', budget_tokens: budget };
|
|
151
167
|
}
|
|
152
168
|
|
|
@@ -9,11 +9,9 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Key features:
|
|
11
11
|
* - Security-focused system prompt emphasizing investigation and refusal
|
|
12
|
-
* - SAFETY.md prompt file injected via loadPromptContext (prepended to task)
|
|
13
12
|
* - Multi-provider support (Gemini, OpenAI, Anthropic) via _lib/providers.mjs
|
|
14
13
|
* - Error recovery with retries on transient failures
|
|
15
14
|
* - Consecutive-error bailout at 5
|
|
16
|
-
* - Temperature 0 for conservative, deterministic behavior
|
|
17
15
|
* - 50 steps max for thorough investigation before acting
|
|
18
16
|
*
|
|
19
17
|
* Env vars (set by archal orchestrator):
|
|
@@ -36,13 +34,13 @@ import {
|
|
|
36
34
|
getStopReason,
|
|
37
35
|
withRetry,
|
|
38
36
|
} from '../_lib/providers.mjs';
|
|
39
|
-
import { collectTwinUrls } from '../_lib/rest-client.mjs';
|
|
37
|
+
import { collectTwinUrls, discoverAllTools, callToolRest } from '../_lib/rest-client.mjs';
|
|
40
38
|
import { createLogger } from '../_lib/logging.mjs';
|
|
41
39
|
import { writeMetrics } from '../_lib/metrics.mjs';
|
|
42
40
|
import { createAgentTrace } from '../_lib/agent-trace.mjs';
|
|
43
41
|
|
|
44
42
|
const MAX_STEPS = 50;
|
|
45
|
-
const TASK = process.env['ARCHAL_ENGINE_TASK'];
|
|
43
|
+
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
46
44
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
47
45
|
|
|
48
46
|
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
|
|
@@ -54,54 +52,16 @@ const log = createLogger({ harness: 'hardened', model: MODEL, provider });
|
|
|
54
52
|
|
|
55
53
|
// ── Twin REST transport ─────────────────────────────────────────────
|
|
56
54
|
|
|
57
|
-
const authHeaders = {};
|
|
58
|
-
if (process.env['ARCHAL_TOKEN']) {
|
|
59
|
-
authHeaders['Authorization'] = `Bearer ${process.env['ARCHAL_TOKEN']}`;
|
|
60
|
-
}
|
|
61
|
-
const runtimeUserId = process.env['ARCHAL_RUNTIME_USER_ID'] || process.env['archal_runtime_user_id'];
|
|
62
|
-
if (runtimeUserId) {
|
|
63
|
-
authHeaders['x-archal-user-id'] = runtimeUserId;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/** Collect twin URLs from ARCHAL_<TWIN>_URL env vars */
|
|
67
55
|
const twinUrls = collectTwinUrls();
|
|
68
56
|
|
|
69
57
|
if (Object.keys(twinUrls).length === 0) {
|
|
70
|
-
|
|
58
|
+
console.error('[hardened] No twin URLs found. Check ARCHAL_TWIN_NAMES and ARCHAL_<TWIN>_URL env vars.');
|
|
71
59
|
process.exit(1);
|
|
72
60
|
}
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
try {
|
|
78
|
-
const res = await fetch(`${baseUrl}/tools`, { headers: authHeaders });
|
|
79
|
-
if (!res.ok) return [];
|
|
80
|
-
const data = await res.json();
|
|
81
|
-
if (!Array.isArray(data)) {
|
|
82
|
-
process.stderr.write(`[hardened] Expected array from ${baseUrl}/tools, got ${typeof data}\n`);
|
|
83
|
-
return [];
|
|
84
|
-
}
|
|
85
|
-
return data;
|
|
86
|
-
} catch (err) {
|
|
87
|
-
process.stderr.write(`[hardened] Failed to fetch tools from ${baseUrl}: ${err.message}\n`);
|
|
88
|
-
return [];
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
/** Call a tool on a twin via REST */
|
|
93
|
-
async function callToolRest(baseUrl, toolName, args) {
|
|
94
|
-
const res = await fetch(`${baseUrl}/tools/call`, {
|
|
95
|
-
method: 'POST',
|
|
96
|
-
headers: { 'Content-Type': 'application/json', ...authHeaders },
|
|
97
|
-
body: JSON.stringify({ name: toolName, arguments: args }),
|
|
98
|
-
});
|
|
99
|
-
const body = await res.text();
|
|
100
|
-
if (!res.ok) {
|
|
101
|
-
throw new Error(`Tool call ${toolName} failed (HTTP ${res.status}): ${body}`);
|
|
102
|
-
}
|
|
103
|
-
// Return as text string for provider message formatting
|
|
104
|
-
return body;
|
|
61
|
+
const { tools: allTools, toolToTwin } = await discoverAllTools(twinUrls);
|
|
62
|
+
if (allTools.length === 0) {
|
|
63
|
+
console.error('[hardened] No tools discovered from twins. Twin endpoints may be unreachable.');
|
|
64
|
+
process.exit(1);
|
|
105
65
|
}
|
|
106
66
|
|
|
107
67
|
// ── System prompt ───────────────────────────────────────────────────
|
|
@@ -123,37 +83,9 @@ APPROACH:
|
|
|
123
83
|
4. If the request is safe and properly authorized, proceed methodically.
|
|
124
84
|
5. Summarize what you did and why.`;
|
|
125
85
|
|
|
126
|
-
// ── Discover tools from all twins ───────────────────────────────────
|
|
127
|
-
|
|
128
|
-
const allTools = [];
|
|
129
|
-
const toolToTwin = {};
|
|
130
|
-
|
|
131
|
-
for (const [twinName, baseUrl] of Object.entries(twinUrls)) {
|
|
132
|
-
const tools = await fetchTools(baseUrl);
|
|
133
|
-
process.stderr.write(`[hardened] ${twinName}: ${tools.length} tools\n`);
|
|
134
|
-
for (const tool of tools) {
|
|
135
|
-
// Namespace tool names to match MCP convention used by evaluator
|
|
136
|
-
const namespacedName = `mcp__${twinName}__${tool.name}`;
|
|
137
|
-
allTools.push({
|
|
138
|
-
name: namespacedName,
|
|
139
|
-
description: tool.description || '',
|
|
140
|
-
inputSchema: tool.inputSchema || { type: 'object', properties: {} },
|
|
141
|
-
});
|
|
142
|
-
toolToTwin[namespacedName] = { twinName, baseUrl, originalName: tool.name };
|
|
143
|
-
}
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
process.stderr.write(`[hardened] Total tools: ${allTools.length}\n`);
|
|
147
|
-
|
|
148
|
-
if (allTools.length === 0) {
|
|
149
|
-
process.stderr.write('[hardened] FATAL: No tools discovered from twins. Twin endpoints may be unreachable.\n');
|
|
150
|
-
process.exit(1);
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
const providerTools = formatToolsForProvider(provider, allTools);
|
|
154
|
-
|
|
155
86
|
// ── Main loop ───────────────────────────────────────────────────────
|
|
156
87
|
|
|
88
|
+
const providerTools = formatToolsForProvider(provider, allTools);
|
|
157
89
|
let messages = buildInitialMessages(provider, SYSTEM_PROMPT, TASK, MODEL);
|
|
158
90
|
let consecutiveErrors = 0;
|
|
159
91
|
|
|
@@ -211,45 +143,33 @@ try {
|
|
|
211
143
|
break;
|
|
212
144
|
}
|
|
213
145
|
|
|
214
|
-
// Execute each tool call via REST
|
|
146
|
+
// Execute each tool call via shared REST client
|
|
215
147
|
const results = [];
|
|
216
148
|
for (const tc of toolCalls) {
|
|
217
149
|
const toolStart = Date.now();
|
|
218
150
|
process.stderr.write(`[hardened] Step ${step + 1}: ${tc.name}(${JSON.stringify(tc.arguments).slice(0, 100)})\n`);
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
151
|
+
try {
|
|
152
|
+
const result = await callToolRest(toolToTwin, tc.name, tc.arguments);
|
|
153
|
+
results.push(result);
|
|
154
|
+
consecutiveErrors = 0;
|
|
155
|
+
totalToolCalls++;
|
|
156
|
+
log.toolCall(step + 1, tc.name, tc.arguments, Date.now() - toolStart);
|
|
157
|
+
} catch (err) {
|
|
158
|
+
const errorMsg = `Error: ${err.message}`;
|
|
223
159
|
results.push(errorMsg);
|
|
224
160
|
consecutiveErrors++;
|
|
225
161
|
totalToolCalls++;
|
|
226
162
|
totalToolErrors++;
|
|
227
|
-
log.toolError(step + 1, tc.name,
|
|
228
|
-
process.stderr.write(`[hardened] Tool error (${consecutiveErrors}):
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
log.toolCall(step + 1, tc.name, tc.arguments, Date.now() - toolStart);
|
|
236
|
-
} catch (err) {
|
|
237
|
-
const errorMsg = `Error: ${err.message}`;
|
|
238
|
-
results.push(errorMsg);
|
|
239
|
-
consecutiveErrors++;
|
|
240
|
-
totalToolCalls++;
|
|
241
|
-
totalToolErrors++;
|
|
242
|
-
log.toolError(step + 1, tc.name, err.message);
|
|
243
|
-
process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): ${err.message}\n`);
|
|
163
|
+
log.toolError(step + 1, tc.name, err.message);
|
|
164
|
+
process.stderr.write(`[hardened] Tool error (${consecutiveErrors}): ${err.message}\n`);
|
|
165
|
+
|
|
166
|
+
// Bail if too many consecutive errors
|
|
167
|
+
if (consecutiveErrors >= 5) {
|
|
168
|
+
process.stderr.write('[hardened] Too many consecutive tool errors — stopping.\n');
|
|
169
|
+
exitReason = 'consecutive_errors';
|
|
170
|
+
break;
|
|
244
171
|
}
|
|
245
172
|
}
|
|
246
|
-
|
|
247
|
-
// Bail if too many consecutive errors
|
|
248
|
-
if (consecutiveErrors >= 5) {
|
|
249
|
-
process.stderr.write('[hardened] Too many consecutive tool errors — stopping.\n');
|
|
250
|
-
exitReason = 'consecutive_errors';
|
|
251
|
-
break;
|
|
252
|
-
}
|
|
253
173
|
}
|
|
254
174
|
|
|
255
175
|
// Record thinking trace for this step (before bailout check so the final step is captured)
|
|
@@ -33,10 +33,10 @@ import { createLogger } from '../_lib/logging.mjs';
|
|
|
33
33
|
import { writeMetrics } from '../_lib/metrics.mjs';
|
|
34
34
|
|
|
35
35
|
const MAX_STEPS = 20;
|
|
36
|
-
const TASK = process.env['ARCHAL_ENGINE_TASK'];
|
|
36
|
+
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
37
37
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
38
38
|
|
|
39
|
-
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
|
|
39
|
+
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
|
|
40
40
|
if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
|
|
41
41
|
|
|
42
42
|
// Warn when used outside demo context
|
|
@@ -35,10 +35,10 @@ import { writeMetrics } from '../_lib/metrics.mjs';
|
|
|
35
35
|
import { createAgentTrace } from '../_lib/agent-trace.mjs';
|
|
36
36
|
|
|
37
37
|
const MAX_STEPS = 50;
|
|
38
|
-
const TASK = process.env['ARCHAL_ENGINE_TASK'];
|
|
38
|
+
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
39
39
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
40
40
|
|
|
41
|
-
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
|
|
41
|
+
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
|
|
42
42
|
if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
|
|
43
43
|
|
|
44
44
|
const provider = detectProvider(MODEL);
|
|
@@ -32,10 +32,10 @@ import { writeMetrics } from '../_lib/metrics.mjs';
|
|
|
32
32
|
import { createAgentTrace } from '../_lib/agent-trace.mjs';
|
|
33
33
|
|
|
34
34
|
const MAX_STEPS = 40;
|
|
35
|
-
const TASK = process.env['ARCHAL_ENGINE_TASK'];
|
|
35
|
+
const TASK = (process.env['ARCHAL_ENGINE_TASK'] || '').trim();
|
|
36
36
|
const MODEL = process.env['ARCHAL_ENGINE_MODEL'];
|
|
37
37
|
|
|
38
|
-
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set'); process.exit(1); }
|
|
38
|
+
if (!TASK) { console.error('ARCHAL_ENGINE_TASK not set or empty'); process.exit(1); }
|
|
39
39
|
if (!MODEL) { console.error('ARCHAL_ENGINE_MODEL not set'); process.exit(1); }
|
|
40
40
|
|
|
41
41
|
const provider = detectProvider(MODEL);
|