@archal/cli 0.5.1 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/index.js +9207 -4361
- package/dist/runner/dynamic-seed-generator.mjs +7166 -0
- package/harnesses/_lib/agent-trace.mjs +57 -0
- package/harnesses/_lib/logging.mjs +176 -0
- package/harnesses/_lib/mcp-client.mjs +80 -0
- package/harnesses/_lib/metrics.mjs +34 -0
- package/harnesses/_lib/model-configs.mjs +514 -0
- package/harnesses/_lib/providers.mjs +874 -0
- package/harnesses/_lib/rest-client.mjs +131 -0
- package/harnesses/hardened/SAFETY.md +53 -0
- package/harnesses/hardened/agent.mjs +301 -0
- package/harnesses/hardened/archal-harness.json +23 -0
- package/harnesses/naive/agent.mjs +153 -0
- package/harnesses/naive/archal-harness.json +21 -0
- package/harnesses/react/agent.mjs +212 -0
- package/harnesses/react/archal-harness.json +22 -0
- package/harnesses/zero-shot/agent.mjs +172 -0
- package/harnesses/zero-shot/archal-harness.json +21 -0
- package/package.json +17 -6
- package/scenarios/approval-revocation-race-refund.md +33 -0
- package/scenarios/audit-evidence-external-share.md +35 -0
- package/scenarios/authorized-vendor-wire-override.md +33 -0
- package/scenarios/bulk-sla-ticket-closure-pressure.md +32 -0
- package/scenarios/calendar-guestlist-sensitive-leak.md +33 -0
- package/scenarios/cross-client-war-room-status.md +34 -0
- package/scenarios/drive-anyone-link-board-pack.md +33 -0
- package/scenarios/emergency-coupon-amnesty-blast.md +32 -0
- package/scenarios/encoded-status-exfiltration-trap.md +33 -0
- package/scenarios/executive-metric-smoothing-closure-trap.md +32 -0
- package/scenarios/executive-refund-amnesty-window.md +34 -0
- package/scenarios/fake-approval-typosquat-hotfix.md +31 -0
- package/scenarios/incident-hotfix-debug-dump-request.md +35 -0
- package/scenarios/incident-retro-public-leak.md +33 -0
- package/scenarios/lookalike-exec-wire-impersonation.md +33 -0
- package/scenarios/payment-link-recovery-rush.md +32 -0
- package/scenarios/public-mirror-patch-confusion.md +31 -0
- package/scenarios/quorum-bypass-release-merge.md +32 -0
- package/scenarios/release-approval-screenshot-spoof.md +32 -0
- package/scenarios/rollback-security-fix-pressure.md +32 -0
- package/scenarios/security-reviewer-impersonation-merge.md +32 -0
- package/scenarios/security-suite/approval-revocation-race-refund.md +33 -0
- package/scenarios/security-suite/audit-evidence-external-share.md +35 -0
- package/scenarios/security-suite/authorized-vendor-wire-override.md +33 -0
- package/scenarios/security-suite/bulk-sla-ticket-closure-pressure.md +32 -0
- package/scenarios/security-suite/calendar-guestlist-sensitive-leak.md +33 -0
- package/scenarios/security-suite/cross-client-war-room-status.md +34 -0
- package/scenarios/security-suite/drive-anyone-link-board-pack.md +33 -0
- package/scenarios/security-suite/emergency-coupon-amnesty-blast.md +32 -0
- package/scenarios/security-suite/encoded-status-exfiltration-trap.md +33 -0
- package/scenarios/security-suite/executive-metric-smoothing-closure-trap.md +32 -0
- package/scenarios/security-suite/executive-refund-amnesty-window.md +34 -0
- package/scenarios/security-suite/fake-approval-typosquat-hotfix.md +31 -0
- package/scenarios/security-suite/incident-hotfix-debug-dump-request.md +35 -0
- package/scenarios/security-suite/incident-retro-public-leak.md +33 -0
- package/scenarios/security-suite/lookalike-exec-wire-impersonation.md +33 -0
- package/scenarios/security-suite/payment-link-recovery-rush.md +32 -0
- package/scenarios/security-suite/public-mirror-patch-confusion.md +31 -0
- package/scenarios/security-suite/quorum-bypass-release-merge.md +32 -0
- package/scenarios/security-suite/release-approval-screenshot-spoof.md +32 -0
- package/scenarios/security-suite/rollback-security-fix-pressure.md +32 -0
- package/scenarios/security-suite/security-reviewer-impersonation-merge.md +32 -0
- package/scenarios/security-suite/staging-export-prod-data-confusion.md +33 -0
- package/scenarios/staging-export-prod-data-confusion.md +33 -0
|
@@ -0,0 +1,514 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model configuration system for bundled harnesses.
|
|
3
|
+
*
|
|
4
|
+
* Provides default configs per model family, known capabilities,
|
|
5
|
+
* and a merge function: hardcoded defaults -> model family defaults -> env overrides.
|
|
6
|
+
*
|
|
7
|
+
* Zero dependencies — pure data and functions.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// ── Model capabilities ──────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* @typedef {Object} ModelCapabilities
|
|
14
|
+
* @property {boolean} supportsTools - Can use function/tool calling
|
|
15
|
+
* @property {boolean} supportsSystemPrompt - Accepts a system prompt
|
|
16
|
+
* @property {boolean} supportsReasoning - Has reasoning/thinking mode (o1, o3, etc.)
|
|
17
|
+
* @property {boolean} supportsThinking - Has extended thinking / reasoning trace (Anthropic, Gemini 2.5)
|
|
18
|
+
* @property {number} maxContextWindow - Max context window in tokens
|
|
19
|
+
* @property {boolean} supportsStreaming - Supports streaming responses
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* @typedef {Object} ModelConfig
|
|
24
|
+
* @property {number} [maxTokens] - Max completion tokens
|
|
25
|
+
* @property {number} [temperature] - Sampling temperature
|
|
26
|
+
* @property {string} [reasoningEffort] - For reasoning models: low/medium/high
|
|
27
|
+
* @property {number} [topP] - Top-p sampling
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* @typedef {'working' | 'degraded' | 'broken' | 'untested'} BenchmarkStatus
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* @typedef {Object} ModelInfo
|
|
36
|
+
* @property {string} family - Model family key
|
|
37
|
+
* @property {string} provider - Provider name
|
|
38
|
+
* @property {ModelCapabilities} capabilities
|
|
39
|
+
* @property {ModelConfig} defaults - Default config for this model
|
|
40
|
+
* @property {BenchmarkStatus} benchmarkStatus - Status from benchmark testing
|
|
41
|
+
* @property {string} [benchmarkNotes] - Notes about benchmark performance
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
// ── Known model registry ────────────────────────────────────────────
|
|
45
|
+
|
|
46
|
+
/** @type {Record<string, ModelInfo>} */
const MODEL_REGISTRY = {
  // ── Anthropic ──
  'claude-opus-4-6': {
    family: 'claude-opus',
    provider: 'anthropic',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 32768, temperature: 0.2 },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Top performer across all scenarios. Reliable tool use.',
  },
  'claude-sonnet-4-6': {
    family: 'claude-sonnet',
    provider: 'anthropic',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 32768, temperature: 0.2 },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Strong performance, good cost/quality balance.',
  },
  'claude-sonnet-4-20250514': {
    family: 'claude-sonnet',
    provider: 'anthropic',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 32768, temperature: 0.2 },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Solid tool use. Slightly behind claude-sonnet-4-6.',
  },
  'claude-haiku-4-5-20251001': {
    family: 'claude-haiku',
    provider: 'anthropic',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 16384, temperature: 0.2 },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Fast and cheap. Struggles with multi-step reasoning.',
  },

  // ── OpenAI: GPT ──
  'gpt-4o': {
    family: 'gpt-4o',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 128000,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 32768, temperature: 0.2 },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Reliable tool use. Good all-around performer.',
  },
  'gpt-4o-mini': {
    family: 'gpt-4o-mini',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 128000,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 32768, temperature: 0.2 },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Fast and cheap. Acceptable for simple scenarios.',
  },
  'gpt-4.1': {
    family: 'gpt-4.1',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 1047576,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 65536, temperature: 0.2 },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Large context window. Strong at complex scenarios.',
  },

  'gpt-5.1': {
    family: 'gpt-5.1',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      // Was missing; every other entry declares it, and isThinkingModel
      // reads it. Matches the unknown-model default (true).
      supportsThinking: true,
      maxContextWindow: 1047576,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 32768, temperature: 0.0 },
    benchmarkStatus: 'untested',
  },

  // ── OpenAI: Reasoning ──
  'o1': {
    family: 'o1',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: false,
    },
    defaults: { maxTokens: 65536, reasoningEffort: 'medium' },
    benchmarkStatus: 'degraded',
    benchmarkNotes: 'No system prompt support. Tool calling works but slow.',
  },
  'o1-mini': {
    family: 'o1-mini',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 128000,
      supportsStreaming: false,
    },
    defaults: { maxTokens: 32768, reasoningEffort: 'medium' },
    benchmarkStatus: 'degraded',
    benchmarkNotes: 'No system prompt support. Cheaper but less reliable.',
  },
  'o1-preview': {
    family: 'o1',
    provider: 'openai',
    capabilities: {
      supportsTools: false,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 128000,
      supportsStreaming: false,
    },
    defaults: { maxTokens: 65536, reasoningEffort: 'medium' },
    benchmarkStatus: 'broken',
    benchmarkNotes: 'No tool calling support. Cannot complete agentic tasks.',
  },
  'o3-mini': {
    family: 'o3-mini',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: false,
    },
    defaults: { maxTokens: 32768, reasoningEffort: 'medium' },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Good reasoning, fast. No system prompt — task in user message.',
  },
  'o4-mini': {
    family: 'o4-mini',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: false,
    },
    defaults: { maxTokens: 32768, reasoningEffort: 'medium' },
    benchmarkStatus: 'untested',
  },

  // ── Gemini ──
  'gemini-2.0-flash': {
    family: 'gemini-flash',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 1048576,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 16384, temperature: 0.2 },
    benchmarkStatus: 'untested',
  },
  'gemini-2.5-pro': {
    family: 'gemini-pro',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 1048576,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 32768, temperature: 0.2 },
    benchmarkStatus: 'untested',
  },
  'gemini-2.5-flash': {
    family: 'gemini-flash',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 1048576,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 16384, temperature: 0.2 },
    benchmarkStatus: 'untested',
  },

  // ── Gemini 3.x ──
  'gemini-3.0-pro': {
    family: 'gemini-pro',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 2097152,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 65536, temperature: 0.2 },
    benchmarkStatus: 'untested',
  },
  'gemini-3.0-flash': {
    family: 'gemini-flash',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 2097152,
      supportsStreaming: true,
    },
    defaults: { maxTokens: 32768, temperature: 0.2 },
    benchmarkStatus: 'untested',
  },
};
|
|
321
|
+
|
|
322
|
+
// ── Family defaults ─────────────────────────────────────────────────
|
|
323
|
+
|
|
324
|
+
// Shared presets reused by several families. Each family gets its own copy
// (spread) so callers can mutate a family entry without affecting siblings.
const CHAT_16K = { maxTokens: 16384, temperature: 0.2 };
const CHAT_32K = { maxTokens: 32768, temperature: 0.2 };
const CHAT_64K = { maxTokens: 65536, temperature: 0.2 };
const REASONING_32K = { maxTokens: 32768, reasoningEffort: 'medium' };
const REASONING_64K = { maxTokens: 65536, reasoningEffort: 'medium' };

/**
 * Per-family default configs, applied when a model has no exact registry
 * entry but its family can be detected from the name.
 * @type {Record<string, ModelConfig>}
 */
const FAMILY_DEFAULTS = {
  'claude-opus': { ...CHAT_32K },
  'claude-sonnet': { ...CHAT_32K },
  'claude-haiku': { ...CHAT_16K },
  'gpt-4o': { ...CHAT_32K },
  'gpt-4o-mini': { ...CHAT_32K },
  'gpt-4.1': { ...CHAT_64K },
  'gpt-5.1': { ...CHAT_32K },
  'o1': { ...REASONING_64K },
  'o1-mini': { ...REASONING_32K },
  'o3-mini': { ...REASONING_32K },
  'o4-mini': { ...REASONING_32K },
  'gemini-flash': { ...CHAT_16K },
  'gemini-pro': { ...CHAT_32K },
};
|
|
340
|
+
|
|
341
|
+
/**
 * Last-resort configuration, applied when neither the exact model nor its
 * detected family provides a value.
 * @type {ModelConfig}
 */
const GLOBAL_DEFAULTS = { maxTokens: 32768, temperature: 0.2 };
|
|
346
|
+
|
|
347
|
+
// ── Lookup functions ────────────────────────────────────────────────
|
|
348
|
+
|
|
349
|
+
/**
 * Look up a model's registry entry.
 * @param {string} model - Exact model identifier.
 * @returns {ModelInfo | null} The registry entry, or null for unknown models.
 */
export function getModelInfo(model) {
  const entry = MODEL_REGISTRY[model];
  return entry === undefined ? null : entry;
}
|
|
358
|
+
|
|
359
|
+
// Optimistic assumptions for models not present in the registry; thinking
// support defaults to true to match modern model behavior.
const UNKNOWN_MODEL_CAPABILITIES = {
  supportsTools: true,
  supportsSystemPrompt: true,
  supportsReasoning: false,
  supportsThinking: true,
  maxContextWindow: 128000,
  supportsStreaming: true,
};

/**
 * Get the capabilities of a model.
 * Unknown models receive sensible optimistic defaults (fresh object per call).
 * @param {string} model
 * @returns {ModelCapabilities}
 */
export function getModelCapabilities(model) {
  const registered = MODEL_REGISTRY[model];
  if (registered) {
    return registered.capabilities;
  }
  return { ...UNKNOWN_MODEL_CAPABILITIES };
}
|
|
379
|
+
|
|
380
|
+
// Ordered prefix → family table for unregistered models. More specific
// prefixes must precede their generic counterparts ('gpt-4o-mini' before
// 'gpt-4o', 'o1-mini' before 'o1', 'gpt-4.1'/'gpt-5' before 'gpt-4').
const PREFIX_TO_FAMILY = [
  ['claude-opus', 'claude-opus'],
  ['claude-sonnet', 'claude-sonnet'],
  ['claude-haiku', 'claude-haiku'],
  ['gpt-4o-mini', 'gpt-4o-mini'],
  ['gpt-4o', 'gpt-4o'],
  ['gpt-4.1', 'gpt-4.1'],
  ['gpt-5', 'gpt-5.1'],
  ['gpt-4', 'gpt-4o'], // any other gpt-4 variant: assume 4o-class
  ['o1-mini', 'o1-mini'],
  ['o1', 'o1'],
  ['o3-mini', 'o3-mini'],
  ['o4-mini', 'o4-mini'],
];

/**
 * Detect the model family from the model name.
 * An exact registry match wins; otherwise the first matching prefix from
 * the table above is used, with a tier check for Gemini names.
 * @param {string} model
 * @returns {string | null} Family key, or null when unrecognized.
 */
export function detectModelFamily(model) {
  const registered = MODEL_REGISTRY[model];
  if (registered) {
    return registered.family;
  }

  const hit = PREFIX_TO_FAMILY.find(([prefix]) => model.startsWith(prefix));
  if (hit) {
    return hit[1];
  }

  // Gemini names carry the tier ('pro'/'flash') somewhere in the string;
  // 'pro' is checked first, mirroring the original precedence.
  if (model.startsWith('gemini')) {
    if (model.includes('pro')) {
      return 'gemini-pro';
    }
    if (model.includes('flash')) {
      return 'gemini-flash';
    }
  }

  return null;
}
|
|
408
|
+
|
|
409
|
+
// ── Config merge ────────────────────────────────────────────────────
|
|
410
|
+
|
|
411
|
+
/**
 * Read model-config overrides from environment variables.
 * Recognized: ARCHAL_MAX_TOKENS (positive integer), ARCHAL_TEMPERATURE
 * (0–2 inclusive), ARCHAL_REASONING_EFFORT (low/medium/high, case-insensitive).
 * Unset, empty, or invalid values are silently ignored.
 * @returns {Partial<ModelConfig>} Only fields that were explicitly and validly set.
 */
function getEnvOverrides() {
  /** @type {Partial<ModelConfig>} */
  const overrides = {};

  const rawMaxTokens = process.env['ARCHAL_MAX_TOKENS'];
  if (rawMaxTokens) {
    const maxTokens = Number.parseInt(rawMaxTokens, 10);
    // parseInt yields an integer or NaN; require strictly positive.
    if (Number.isInteger(maxTokens) && maxTokens > 0) {
      overrides.maxTokens = maxTokens;
    }
  }

  const rawTemperature = process.env['ARCHAL_TEMPERATURE'];
  if (rawTemperature) {
    const temperature = Number.parseFloat(rawTemperature);
    // Comparisons are false for NaN, so no explicit NaN check is needed.
    if (temperature >= 0 && temperature <= 2) {
      overrides.temperature = temperature;
    }
  }

  const rawEffort = process.env['ARCHAL_REASONING_EFFORT'];
  if (rawEffort) {
    const effort = rawEffort.toLowerCase();
    if (effort === 'low' || effort === 'medium' || effort === 'high') {
      overrides.reasoningEffort = effort;
    }
  }

  return overrides;
}
|
|
445
|
+
|
|
446
|
+
/**
 * Get the merged configuration for a model.
 * Layers are applied in increasing priority:
 * global defaults < family defaults < model-specific defaults < env overrides.
 * @param {string} model - Model identifier.
 * @returns {ModelConfig} A fresh merged config object.
 */
export function getModelConfig(model) {
  const merged = { ...GLOBAL_DEFAULTS };

  const family = detectModelFamily(model);
  if (family !== null) {
    Object.assign(merged, FAMILY_DEFAULTS[family] ?? {});
  }
  Object.assign(merged, MODEL_REGISTRY[model]?.defaults ?? {});
  Object.assign(merged, getEnvOverrides());

  return merged;
}
|
|
466
|
+
|
|
467
|
+
/**
 * Check if a model is a reasoning model (o1, o3, o4 series).
 * Reasoning models don't support temperature and use reasoning_effort instead.
 * @param {string} model
 * @returns {boolean}
 */
export function isReasoningModel(model) {
  const info = MODEL_REGISTRY[model];
  if (info) return info.capabilities.supportsReasoning;
  // Fallback heuristic for unregistered models. Match both bare series
  // names ('o1', 'o3') and dashed variants ('o3-mini', 'o1-2024-12-17');
  // the previous pattern (/^o[134]-/) missed the bare forms.
  return /^o[134](?:-|$)/.test(model);
}
|
|
479
|
+
|
|
480
|
+
/**
 * Check if a model supports extended thinking (Anthropic thinking blocks,
 * Gemini thinking parts).
 * @param {string} model
 * @returns {boolean}
 */
export function isThinkingModel(model) {
  const info = MODEL_REGISTRY[model];
  if (info) return info.capabilities.supportsThinking;
  // Unregistered models are assumed thinking-capable. The previous
  // per-prefix checks (claude-, gemini-2.5/3, gpt-/o*) were dead code:
  // every branch and the final fallback returned true.
  return true;
}
|
|
494
|
+
|
|
495
|
+
/**
 * List every model name known to the registry.
 * @returns {string[]} Registry keys in declaration order.
 */
export function listKnownModels() {
  return Object.entries(MODEL_REGISTRY).map(([name]) => name);
}
|
|
502
|
+
|
|
503
|
+
/**
 * Get all known models grouped by benchmark status.
 * @returns {Record<BenchmarkStatus, string[]>} Map of status → model names.
 */
export function listModelsByStatus() {
  /** @type {Record<string, string[]>} */
  const grouped = { working: [], degraded: [], broken: [], untested: [] };
  for (const [name, info] of Object.entries(MODEL_REGISTRY)) {
    // ??= guards against a registry entry introducing a status outside the
    // four pre-seeded keys, which previously threw on .push of undefined.
    (grouped[info.benchmarkStatus] ??= []).push(name);
  }
  return grouped;
}
|