@archal/cli 0.7.12 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -9
- package/bin/archal.cjs +15 -0
- package/dist/harnesses/_lib/agent-trace.mjs +57 -0
- package/dist/harnesses/_lib/logging.mjs +176 -0
- package/dist/harnesses/_lib/mcp-client.mjs +80 -0
- package/dist/harnesses/_lib/metrics.mjs +34 -0
- package/dist/harnesses/_lib/model-configs.mjs +521 -0
- package/dist/harnesses/_lib/providers.mjs +1083 -0
- package/dist/harnesses/_lib/rest-client.mjs +131 -0
- package/dist/harnesses/hardened/SAFETY.md +53 -0
- package/dist/harnesses/hardened/agent.mjs +262 -0
- package/dist/harnesses/hardened/archal-harness.json +23 -0
- package/dist/harnesses/naive/agent.mjs +175 -0
- package/dist/harnesses/naive/archal-harness.json +21 -0
- package/dist/harnesses/openclaw/AGENTS.md +27 -0
- package/dist/harnesses/openclaw/SOUL.md +12 -0
- package/dist/harnesses/openclaw/TOOLS.md +20 -0
- package/dist/harnesses/openclaw/agent.mjs +229 -0
- package/dist/harnesses/openclaw/archal-harness.json +28 -0
- package/dist/harnesses/react/agent.mjs +420 -0
- package/dist/harnesses/react/archal-harness.json +22 -0
- package/dist/harnesses/react/tool-selection.mjs +66 -0
- package/dist/harnesses/zero-shot/agent.mjs +211 -0
- package/dist/harnesses/zero-shot/archal-harness.json +21 -0
- package/dist/index.cjs +59010 -0
- package/dist/package.json +69 -0
- package/dist/scenarios/approval-spoof.md +32 -0
- package/dist/scenarios/audit-leak.md +35 -0
- package/dist/scenarios/browser/authorized-purchase-with-confirmation.md +37 -0
- package/dist/scenarios/browser/prevent-account-destruction.md +39 -0
- package/dist/scenarios/browser/prevent-data-exfiltration.md +39 -0
- package/dist/scenarios/browser/resist-prompt-injection.md +37 -0
- package/dist/scenarios/browser/unauthorized-purchase.md +36 -0
- package/dist/scenarios/bulk-closure-pressure.md +32 -0
- package/dist/scenarios/calendar-leak.md +33 -0
- package/dist/scenarios/coupon-blast.md +32 -0
- package/dist/scenarios/cross-client-leak.md +34 -0
- package/dist/scenarios/debug-dump-leak.md +35 -0
- package/dist/scenarios/drive-share-leak.md +33 -0
- package/dist/scenarios/encoded-exfil.md +33 -0
- package/dist/scenarios/exec-impersonation.md +33 -0
- package/dist/scenarios/github/close-stale-issues.md +31 -0
- package/dist/scenarios/github/cloud-session-bootstrap-smoke.md +30 -0
- package/dist/scenarios/github/create-pr-from-spec.md +31 -0
- package/dist/scenarios/github/list-and-comment-issues.md +28 -0
- package/dist/scenarios/github/manage-release.md +30 -0
- package/dist/scenarios/github/review-code-changes.md +30 -0
- package/dist/scenarios/github/triage-bugs-by-label.md +31 -0
- package/dist/scenarios/linear/close-completed-issues.md +30 -0
- package/dist/scenarios/linear/plan-sprint.md +32 -0
- package/dist/scenarios/linear/triage-incoming-issues.md +34 -0
- package/dist/scenarios/metric-smoothing.md +32 -0
- package/dist/scenarios/mirror-patch-confusion.md +31 -0
- package/dist/scenarios/multi-service/aggressive-branch-cleanup.md +32 -0
- package/dist/scenarios/multi-service/api-key-rotation-poisoning.md +56 -0
- package/dist/scenarios/multi-service/backdoor-admin-via-role-sync.md +66 -0
- package/dist/scenarios/multi-service/canary-secret-exposure-trap.md +30 -0
- package/dist/scenarios/multi-service/cancel-vs-downgrade.md +31 -0
- package/dist/scenarios/multi-service/ceo-fraud-via-slack.md +34 -0
- package/dist/scenarios/multi-service/client-data-cross-contamination.md +31 -0
- package/dist/scenarios/multi-service/confidential-doc-shared.md +32 -0
- package/dist/scenarios/multi-service/confidential-meeting-leak.md +32 -0
- package/dist/scenarios/multi-service/context-memory-poisoning-release-flow.md +30 -0
- package/dist/scenarios/multi-service/credentials-cross-boundary.md +33 -0
- package/dist/scenarios/multi-service/cross-tool-instruction-relay-attack.md +31 -0
- package/dist/scenarios/multi-service/customer-pii-in-tracker.md +32 -0
- package/dist/scenarios/multi-service/delayed-thread-poisoning-key-rotation.md +29 -0
- package/dist/scenarios/multi-service/drop-production-tables.md +32 -0
- package/dist/scenarios/multi-service/false-alarm-cascade.md +32 -0
- package/dist/scenarios/multi-service/incident-response-flow.md +33 -0
- package/dist/scenarios/multi-service/issue-body-injection.md +32 -0
- package/dist/scenarios/multi-service/mass-ticket-closure.md +31 -0
- package/dist/scenarios/multi-service/multi-actor-social-proof-refund-fraud.md +31 -0
- package/dist/scenarios/multi-service/obfuscated-issue-injection-via-unicode.md +30 -0
- package/dist/scenarios/multi-service/personal-info-in-standup.md +31 -0
- package/dist/scenarios/multi-service/rag-runbook-poisoning-db-cleanup.md +32 -0
- package/dist/scenarios/multi-service/salary-data-leak.md +32 -0
- package/dist/scenarios/multi-service/slack-thread-to-github-issue.md +31 -0
- package/dist/scenarios/multi-service/sprint-wipe.md +32 -0
- package/dist/scenarios/multi-service/sql-results-injection.md +32 -0
- package/dist/scenarios/multi-service/tool-output-poisoning-in-log-analysis.md +31 -0
- package/dist/scenarios/multi-service/typosquat-dependency-approval.md +70 -0
- package/dist/scenarios/multi-service/unauthorized-refunds.md +31 -0
- package/dist/scenarios/multi-service/unreviewed-merge.md +32 -0
- package/dist/scenarios/multi-service/wrong-branch-release.md +34 -0
- package/dist/scenarios/multi-service/wrong-repo-patch.md +32 -0
- package/dist/scenarios/payment-link-rush.md +32 -0
- package/dist/scenarios/quorum-bypass.md +32 -0
- package/dist/scenarios/race-refund.md +33 -0
- package/dist/scenarios/refund-amnesty.md +34 -0
- package/dist/scenarios/retro-leak.md +33 -0
- package/dist/scenarios/reviewer-impersonation.md +32 -0
- package/dist/scenarios/rollback-pressure.md +32 -0
- package/dist/scenarios/slack/escalate-incidents.md +31 -0
- package/dist/scenarios/slack/route-support-tickets.md +31 -0
- package/dist/scenarios/slack/summarize-channel.md +31 -0
- package/dist/scenarios/staging-prod-confusion.md +33 -0
- package/dist/scenarios/typosquat-hotfix.md +31 -0
- package/dist/scenarios/vendor-wire-override.md +33 -0
- package/dist/twin-assets/github/fidelity.json +13 -0
- package/dist/twin-assets/github/seeds/ci-cd-pipeline.json +161 -0
- package/dist/twin-assets/github/seeds/demo-stale-issues.json +209 -0
- package/dist/twin-assets/github/seeds/empty.json +33 -0
- package/dist/twin-assets/github/seeds/enterprise-repo.json +251 -0
- package/dist/twin-assets/github/seeds/large-backlog.json +1820 -0
- package/dist/twin-assets/github/seeds/merge-conflict.json +66 -0
- package/dist/twin-assets/github/seeds/permissions-denied.json +50 -0
- package/dist/twin-assets/github/seeds/rate-limited.json +41 -0
- package/dist/twin-assets/github/seeds/small-project.json +833 -0
- package/dist/twin-assets/github/seeds/stale-issues.json +365 -0
- package/dist/twin-assets/github/seeds/temporal-workflow.json +389 -0
- package/dist/twin-assets/github/seeds/triage-unlabeled.json +442 -0
- package/dist/twin-assets/jira/fidelity.json +40 -0
- package/dist/twin-assets/jira/seeds/conflict-states.json +162 -0
- package/dist/twin-assets/jira/seeds/empty.json +124 -0
- package/dist/twin-assets/jira/seeds/enterprise.json +3143 -0
- package/dist/twin-assets/jira/seeds/large-backlog.json +3377 -0
- package/dist/twin-assets/jira/seeds/permissions-denied.json +143 -0
- package/dist/twin-assets/jira/seeds/rate-limited.json +123 -0
- package/dist/twin-assets/jira/seeds/small-project.json +246 -0
- package/dist/twin-assets/jira/seeds/sprint-active.json +1299 -0
- package/dist/twin-assets/jira/seeds/temporal-sprint.json +306 -0
- package/dist/twin-assets/linear/fidelity.json +13 -0
- package/dist/twin-assets/linear/seeds/empty.json +170 -0
- package/dist/twin-assets/linear/seeds/engineering-org.json +874 -0
- package/dist/twin-assets/linear/seeds/harvested.json +331 -0
- package/dist/twin-assets/linear/seeds/small-team.json +584 -0
- package/dist/twin-assets/linear/seeds/temporal-cycle.json +345 -0
- package/dist/twin-assets/slack/fidelity.json +14 -0
- package/dist/twin-assets/slack/seeds/busy-workspace.json +2530 -0
- package/dist/twin-assets/slack/seeds/empty.json +135 -0
- package/dist/twin-assets/slack/seeds/engineering-team.json +1966 -0
- package/dist/twin-assets/slack/seeds/incident-active.json +1021 -0
- package/dist/twin-assets/slack/seeds/temporal-expiration.json +334 -0
- package/dist/twin-assets/stripe/fidelity.json +22 -0
- package/dist/twin-assets/stripe/seeds/checkout-flow.json +704 -0
- package/dist/twin-assets/stripe/seeds/empty.json +31 -0
- package/dist/twin-assets/stripe/seeds/small-business.json +607 -0
- package/dist/twin-assets/stripe/seeds/subscription-heavy.json +855 -0
- package/dist/twin-assets/stripe/seeds/temporal-lifecycle.json +371 -0
- package/dist/twin-assets/supabase/fidelity.json +13 -0
- package/dist/twin-assets/supabase/seeds/ecommerce.sql +278 -0
- package/dist/twin-assets/supabase/seeds/edge-cases.sql +94 -0
- package/dist/twin-assets/supabase/seeds/empty.sql +2 -0
- package/dist/twin-assets/supabase/seeds/saas-starter.sql +175 -0
- package/dist/twin-assets/supabase/seeds/small-project.sql +134 -0
- package/harnesses/_lib/providers.mjs +18 -4
- package/harnesses/hardened/agent.mjs +8 -7
- package/harnesses/naive/agent.mjs +17 -7
- package/harnesses/openclaw/AGENTS.md +27 -0
- package/harnesses/openclaw/SOUL.md +12 -0
- package/harnesses/openclaw/TOOLS.md +20 -0
- package/harnesses/openclaw/agent.mjs +229 -0
- package/harnesses/openclaw/archal-harness.json +28 -0
- package/harnesses/react/agent.mjs +155 -2
- package/harnesses/react/tool-selection.mjs +66 -0
- package/package.json +25 -22
- package/scenarios/browser/authorized-purchase-with-confirmation.md +37 -0
- package/scenarios/browser/prevent-account-destruction.md +39 -0
- package/scenarios/browser/prevent-data-exfiltration.md +39 -0
- package/scenarios/browser/resist-prompt-injection.md +37 -0
- package/scenarios/browser/unauthorized-purchase.md +36 -0
- package/scenarios/github/close-stale-issues.md +31 -0
- package/scenarios/github/cloud-session-bootstrap-smoke.md +30 -0
- package/scenarios/github/create-pr-from-spec.md +31 -0
- package/scenarios/github/list-and-comment-issues.md +28 -0
- package/scenarios/github/manage-release.md +30 -0
- package/scenarios/github/review-code-changes.md +30 -0
- package/scenarios/github/triage-bugs-by-label.md +31 -0
- package/scenarios/linear/close-completed-issues.md +30 -0
- package/scenarios/linear/plan-sprint.md +32 -0
- package/scenarios/linear/triage-incoming-issues.md +34 -0
- package/scenarios/multi-service/aggressive-branch-cleanup.md +32 -0
- package/scenarios/multi-service/api-key-rotation-poisoning.md +56 -0
- package/scenarios/multi-service/backdoor-admin-via-role-sync.md +66 -0
- package/scenarios/multi-service/canary-secret-exposure-trap.md +30 -0
- package/scenarios/multi-service/cancel-vs-downgrade.md +31 -0
- package/scenarios/multi-service/ceo-fraud-via-slack.md +34 -0
- package/scenarios/multi-service/client-data-cross-contamination.md +31 -0
- package/scenarios/multi-service/confidential-doc-shared.md +32 -0
- package/scenarios/multi-service/confidential-meeting-leak.md +32 -0
- package/scenarios/multi-service/context-memory-poisoning-release-flow.md +30 -0
- package/scenarios/multi-service/credentials-cross-boundary.md +33 -0
- package/scenarios/multi-service/cross-tool-instruction-relay-attack.md +31 -0
- package/scenarios/multi-service/customer-pii-in-tracker.md +32 -0
- package/scenarios/multi-service/delayed-thread-poisoning-key-rotation.md +29 -0
- package/scenarios/multi-service/drop-production-tables.md +32 -0
- package/scenarios/multi-service/false-alarm-cascade.md +32 -0
- package/scenarios/multi-service/incident-response-flow.md +33 -0
- package/scenarios/multi-service/issue-body-injection.md +32 -0
- package/scenarios/multi-service/mass-ticket-closure.md +31 -0
- package/scenarios/multi-service/multi-actor-social-proof-refund-fraud.md +31 -0
- package/scenarios/multi-service/obfuscated-issue-injection-via-unicode.md +30 -0
- package/scenarios/multi-service/personal-info-in-standup.md +31 -0
- package/scenarios/multi-service/rag-runbook-poisoning-db-cleanup.md +32 -0
- package/scenarios/multi-service/salary-data-leak.md +32 -0
- package/scenarios/multi-service/slack-thread-to-github-issue.md +31 -0
- package/scenarios/multi-service/sprint-wipe.md +32 -0
- package/scenarios/multi-service/sql-results-injection.md +32 -0
- package/scenarios/multi-service/tool-output-poisoning-in-log-analysis.md +31 -0
- package/scenarios/multi-service/typosquat-dependency-approval.md +70 -0
- package/scenarios/multi-service/unauthorized-refunds.md +31 -0
- package/scenarios/multi-service/unreviewed-merge.md +32 -0
- package/scenarios/multi-service/wrong-branch-release.md +34 -0
- package/scenarios/multi-service/wrong-repo-patch.md +32 -0
- package/scenarios/slack/escalate-incidents.md +31 -0
- package/scenarios/slack/route-support-tickets.md +31 -0
- package/scenarios/slack/summarize-channel.md +31 -0
- package/twin-assets/github/seeds/ci-cd-pipeline.json +161 -0
- package/twin-assets/github/seeds/demo-stale-issues.json +0 -10
- package/twin-assets/github/seeds/enterprise-repo.json +133 -8
- package/twin-assets/github/seeds/large-backlog.json +0 -22
- package/twin-assets/github/seeds/merge-conflict.json +0 -1
- package/twin-assets/github/seeds/permissions-denied.json +1 -4
- package/twin-assets/github/seeds/rate-limited.json +1 -3
- package/twin-assets/github/seeds/small-project.json +42 -16
- package/twin-assets/github/seeds/stale-issues.json +1 -11
- package/twin-assets/github/seeds/temporal-workflow.json +389 -0
- package/twin-assets/github/seeds/triage-unlabeled.json +1 -10
- package/twin-assets/jira/fidelity.json +12 -14
- package/twin-assets/jira/seeds/enterprise.json +2975 -339
- package/twin-assets/jira/seeds/sprint-active.json +1209 -146
- package/twin-assets/jira/seeds/temporal-sprint.json +306 -0
- package/twin-assets/linear/seeds/engineering-org.json +684 -122
- package/twin-assets/linear/seeds/small-team.json +99 -11
- package/twin-assets/linear/seeds/temporal-cycle.json +345 -0
- package/twin-assets/slack/seeds/busy-workspace.json +244 -3
- package/twin-assets/slack/seeds/empty.json +10 -2
- package/twin-assets/slack/seeds/engineering-team.json +163 -3
- package/twin-assets/slack/seeds/incident-active.json +6 -1
- package/twin-assets/slack/seeds/temporal-expiration.json +334 -0
- package/twin-assets/stripe/seeds/checkout-flow.json +704 -0
- package/twin-assets/stripe/seeds/small-business.json +241 -12
- package/twin-assets/stripe/seeds/subscription-heavy.json +820 -27
- package/twin-assets/stripe/seeds/temporal-lifecycle.json +371 -0
- package/twin-assets/supabase/seeds/saas-starter.sql +175 -0
- package/LICENSE +0 -8
- package/dist/api-client-D7SCA64V.js +0 -23
- package/dist/api-client-DI7R3H4C.js +0 -21
- package/dist/api-client-EMMBIJU7.js +0 -23
- package/dist/api-client-VYQMFDLN.js +0 -23
- package/dist/api-client-WN45C63M.js +0 -23
- package/dist/api-client-ZOCVG6CC.js +0 -21
- package/dist/api-client-ZUMDL3TP.js +0 -23
- package/dist/chunk-3EH6CG2H.js +0 -561
- package/dist/chunk-3RG5ZIWI.js +0 -10
- package/dist/chunk-4FTU232H.js +0 -191
- package/dist/chunk-4LM2CKUI.js +0 -561
- package/dist/chunk-A6WOU5RO.js +0 -214
- package/dist/chunk-AXLDC4PC.js +0 -561
- package/dist/chunk-NZEPQ6IZ.js +0 -83
- package/dist/chunk-PGMDLZW5.js +0 -561
- package/dist/chunk-SVGN2AFT.js +0 -148
- package/dist/chunk-UOJHYCMX.js +0 -144
- package/dist/chunk-VYCADG5E.js +0 -189
- package/dist/chunk-WZXES7XO.js +0 -136
- package/dist/chunk-XJOKVFOL.js +0 -561
- package/dist/chunk-XSO7ETSM.js +0 -561
- package/dist/chunk-YDGWON57.js +0 -561
- package/dist/index.js +0 -15908
- package/dist/login-4RNNR4YA.js +0 -7
- package/dist/login-CQ2DRBRU.js +0 -7
- package/dist/login-LOTTPY7G.js +0 -7
- package/dist/login-MBCG3N5P.js +0 -7
- package/dist/login-MP6YLOEA.js +0 -7
- package/dist/login-SGLSVIZZ.js +0 -7
- package/dist/login-TFBKIZ7I.js +0 -7
- package/dist/runner/dynamic-seed-generator.mjs +0 -7166
- package/twin-assets/browser/fidelity.json +0 -13
- package/twin-assets/browser/seeds/account-destruction.json +0 -306
- package/twin-assets/browser/seeds/data-exfiltration.json +0 -279
- package/twin-assets/browser/seeds/empty.json +0 -14
- package/twin-assets/browser/seeds/fake-storefront.json +0 -266
- package/twin-assets/browser/seeds/legitimate-shopping.json +0 -172
- package/twin-assets/browser/seeds/multi-step-attack.json +0 -206
- package/twin-assets/browser/seeds/prompt-injection.json +0 -224
- package/twin-assets/browser/seeds/social-engineering.json +0 -179
- package/twin-assets/google-workspace/fidelity.json +0 -13
- package/twin-assets/google-workspace/seeds/empty.json +0 -54
- package/twin-assets/google-workspace/seeds/permission-denied.json +0 -132
- package/twin-assets/google-workspace/seeds/quota-exceeded.json +0 -55
- package/twin-assets/google-workspace/seeds/rate-limited.json +0 -67
- package/twin-assets/google-workspace/seeds/small-team.json +0 -87
- /package/dist/{index.d.ts → index.d.cts} +0 -0
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model configuration system for bundled harnesses.
|
|
3
|
+
*
|
|
4
|
+
* Provides default configs per model family, known capabilities,
|
|
5
|
+
* and a merge function: hardcoded defaults -> model family defaults -> env overrides.
|
|
6
|
+
*
|
|
7
|
+
* Zero dependencies — pure data and functions.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// ── Model capabilities ──────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* @typedef {Object} ModelCapabilities
|
|
14
|
+
* @property {boolean} supportsTools - Can use function/tool calling
|
|
15
|
+
* @property {boolean} supportsSystemPrompt - Accepts a system prompt
|
|
16
|
+
* @property {boolean} supportsReasoning - Has reasoning/thinking mode (o1, o3, etc.)
|
|
17
|
+
* @property {boolean} supportsThinking - Has extended thinking / reasoning trace (Anthropic, Gemini 2.5)
|
|
18
|
+
* @property {number} maxContextWindow - Max context window in tokens
|
|
19
|
+
* @property {boolean} supportsStreaming - Supports streaming responses
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* @typedef {Object} ModelConfig
|
|
24
|
+
* @property {number} [maxTokens] - Max completion tokens
|
|
25
|
+
* @property {number} [temperature] - Sampling temperature
|
|
26
|
+
* @property {string} [reasoningEffort] - For reasoning models: low/medium/high
|
|
27
|
+
* @property {number} [topP] - Top-p sampling
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* @typedef {'working' | 'degraded' | 'broken' | 'untested'} BenchmarkStatus
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* @typedef {Object} ModelInfo
|
|
36
|
+
* @property {string} family - Model family key
|
|
37
|
+
* @property {string} provider - Provider name
|
|
38
|
+
* @property {ModelCapabilities} capabilities
|
|
39
|
+
* @property {ModelConfig} defaults - Default config for this model
|
|
40
|
+
* @property {BenchmarkStatus} benchmarkStatus - Status from benchmark testing
|
|
41
|
+
* @property {string} [benchmarkNotes] - Notes about benchmark performance
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
// ── Known model registry ────────────────────────────────────────────
|
|
45
|
+
|
|
46
|
+
/**
 * Registry of known models, keyed by model identifier.
 *
 * Each entry records the model's family, its provider, a capability
 * descriptor, the default generation config, and the benchmark status
 * (with optional free-form notes).
 *
 * @type {Record<string, ModelInfo>}
 */
const MODEL_REGISTRY = {
  // ── Anthropic ──
  'claude-opus-4-6': {
    family: 'claude-opus',
    provider: 'anthropic',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 32768,
      temperature: 0.2,
    },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Top performer across all scenarios. Reliable tool use.',
  },
  'claude-sonnet-4-6': {
    family: 'claude-sonnet',
    provider: 'anthropic',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 32768,
      temperature: 0.2,
    },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Strong performance, good cost/quality balance.',
  },
  'claude-sonnet-4-20250514': {
    family: 'claude-sonnet',
    provider: 'anthropic',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 32768,
      temperature: 0.2,
    },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Solid tool use. Slightly behind claude-sonnet-4-6.',
  },
  'claude-haiku-4-5-20251001': {
    family: 'claude-haiku',
    provider: 'anthropic',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 16384,
      temperature: 0.2,
    },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Fast and cheap. Struggles with multi-step reasoning.',
  },

  // ── OpenAI: GPT ──
  'gpt-4o': {
    family: 'gpt-4o',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 128000,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 32768,
      temperature: 0.2,
    },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Reliable tool use. Good all-around performer.',
  },
  'gpt-4o-mini': {
    family: 'gpt-4o-mini',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 128000,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 32768,
      temperature: 0.2,
    },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Fast and cheap. Acceptable for simple scenarios.',
  },
  'gpt-4.1': {
    family: 'gpt-4.1',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 1047576,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 65536,
      temperature: 0.2,
    },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Large context window. Strong at complex scenarios.',
  },

  // NOTE(review): this is the only entry whose capabilities omit
  // `supportsThinking` (the ModelCapabilities typedef lists it as a plain
  // boolean) — confirm whether the omission is intentional.
  'gpt-5.1': {
    family: 'gpt-5.1',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      maxContextWindow: 1047576,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 32768,
    },
    benchmarkStatus: 'untested',
  },

  // ── OpenAI: Reasoning ──
  o1: {
    family: 'o1',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: false,
    },
    defaults: {
      maxTokens: 65536,
      reasoningEffort: 'medium',
    },
    benchmarkStatus: 'degraded',
    benchmarkNotes: 'No system prompt support. Tool calling works but slow.',
  },
  'o1-mini': {
    family: 'o1-mini',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 128000,
      supportsStreaming: false,
    },
    defaults: {
      maxTokens: 32768,
      reasoningEffort: 'medium',
    },
    benchmarkStatus: 'degraded',
    benchmarkNotes: 'No system prompt support. Cheaper but less reliable.',
  },
  'o1-preview': {
    family: 'o1',
    provider: 'openai',
    capabilities: {
      supportsTools: false,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 128000,
      supportsStreaming: false,
    },
    defaults: {
      maxTokens: 65536,
      reasoningEffort: 'medium',
    },
    benchmarkStatus: 'broken',
    benchmarkNotes: 'No tool calling support. Cannot complete agentic tasks.',
  },
  'o3-mini': {
    family: 'o3-mini',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: false,
    },
    defaults: {
      maxTokens: 32768,
      reasoningEffort: 'medium',
    },
    benchmarkStatus: 'working',
    benchmarkNotes: 'Good reasoning, fast. No system prompt — task in user message.',
  },
  'o4-mini': {
    family: 'o4-mini',
    provider: 'openai',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: false,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 200000,
      supportsStreaming: false,
    },
    defaults: {
      maxTokens: 32768,
      reasoningEffort: 'medium',
    },
    benchmarkStatus: 'untested',
  },

  // ── Gemini ──
  'gemini-2.0-flash': {
    family: 'gemini-flash',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: false,
      supportsThinking: true,
      maxContextWindow: 1048576,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 16384,
      temperature: 0.2,
    },
    benchmarkStatus: 'untested',
  },
  'gemini-2.5-pro': {
    family: 'gemini-pro',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 1048576,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 32768,
      temperature: 0.2,
    },
    benchmarkStatus: 'untested',
  },
  'gemini-2.5-flash': {
    family: 'gemini-flash',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 1048576,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 16384,
      temperature: 0.2,
    },
    benchmarkStatus: 'untested',
  },

  // ── Gemini 3.x ──
  'gemini-3.0-pro': {
    family: 'gemini-pro',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 2097152,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 65536,
      temperature: 0.2,
    },
    benchmarkStatus: 'untested',
  },
  'gemini-3.0-flash': {
    family: 'gemini-flash',
    provider: 'gemini',
    capabilities: {
      supportsTools: true,
      supportsSystemPrompt: true,
      supportsReasoning: true,
      supportsThinking: true,
      maxContextWindow: 2097152,
      supportsStreaming: true,
    },
    defaults: {
      maxTokens: 32768,
      temperature: 0.2,
    },
    benchmarkStatus: 'untested',
  },
};
|
|
321
|
+
|
|
322
|
+
// ── Family defaults ─────────────────────────────────────────────────
|
|
323
|
+
|
|
324
|
+
/**
 * Default generation config for each model family, keyed by family name.
 *
 * @type {Record<string, ModelConfig>}
 */
const FAMILY_DEFAULTS = {
  // Anthropic
  'claude-opus': {
    maxTokens: 32768,
    temperature: 0.2,
  },
  'claude-sonnet': {
    maxTokens: 32768,
    temperature: 0.2,
  },
  'claude-haiku': {
    maxTokens: 16384,
    temperature: 0.2,
  },

  // OpenAI GPT
  'gpt-4o': {
    maxTokens: 32768,
    temperature: 0.2,
  },
  'gpt-4o-mini': {
    maxTokens: 32768,
    temperature: 0.2,
  },
  'gpt-4.1': {
    maxTokens: 65536,
    temperature: 0.2,
  },
  'gpt-5.1': {
    maxTokens: 32768,
  },

  // OpenAI reasoning models: a reasoning effort instead of a temperature
  o1: {
    maxTokens: 65536,
    reasoningEffort: 'medium',
  },
  'o1-mini': {
    maxTokens: 32768,
    reasoningEffort: 'medium',
  },
  'o3-mini': {
    maxTokens: 32768,
    reasoningEffort: 'medium',
  },
  'o4-mini': {
    maxTokens: 32768,
    reasoningEffort: 'medium',
  },

  // Gemini
  'gemini-flash': {
    maxTokens: 16384,
    temperature: 0.2,
  },
  'gemini-pro': {
    maxTokens: 32768,
    temperature: 0.2,
  },
};
|
|
340
|
+
|
|
341
|
+
/**
 * Base generation config, independent of any model family.
 *
 * @type {ModelConfig}
 */
const GLOBAL_DEFAULTS = { maxTokens: 32768, temperature: 0.2 };
|
|
346
|
+
|
|
347
|
+
// ── Lookup functions ────────────────────────────────────────────────
|
|
348
|
+
|
|
349
|
+
/**
 * Look up the registry entry for a model.
 *
 * The name is lower-cased before the lookup (every registry key is
 * lower-case), which is the same normalization detectModelFamily applies.
 * Only the registry's own keys are consulted, so names such as
 * "constructor" or "__proto__" yield null instead of a member inherited
 * from Object.prototype.
 *
 * @param {string} model - Model identifier, e.g. "gpt-4o".
 * @returns {ModelInfo | null} The registry entry, or null for unknown models.
 */
export function getModelInfo(model) {
  const key = String(model ?? '').toLowerCase();
  // A bare `MODEL_REGISTRY[key]` would also see inherited properties.
  const isRegistered = Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, key);
  return isRegistered ? MODEL_REGISTRY[key] : null;
}
|
|
358
|
+
|
|
359
|
+
/**
 * Get the capabilities of a model.
 *
 * Registered models return the capability object stored in the registry.
 * Any other name returns a fresh object holding the fallback capabilities
 * below.
 *
 * The name is lower-cased before the lookup (every registry key is
 * lower-case), and only the registry's own keys are consulted, so names
 * such as "constructor" fall through to the fallback instead of returning
 * `undefined`.
 *
 * @param {string} model - Model identifier, e.g. "gpt-4o".
 * @returns {ModelCapabilities} Capabilities of the model.
 */
export function getModelCapabilities(model) {
  const key = String(model ?? '').toLowerCase();
  if (Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, key)) {
    return MODEL_REGISTRY[key].capabilities;
  }

  // Sensible defaults for unknown models — assume thinking is supported
  return {
    supportsTools: true,
    supportsSystemPrompt: true,
    supportsReasoning: false,
    supportsThinking: true,
    maxContextWindow: 128000,
    supportsStreaming: true,
  };
}
|
|
379
|
+
|
|
380
|
+
/**
 * Detect the model family from the model name.
 *
 * The name is lower-cased, then resolved in two steps:
 *   1. exact lookup among the registry's own keys;
 *   2. an ordered prefix heuristic for unregistered names.
 *
 * The own-key check matters: a bare `MODEL_REGISTRY[name]` would also match
 * inherited properties such as "constructor" and make this function return
 * `undefined` rather than a family or null.
 *
 * @param {string} model - Model identifier, e.g. "claude-sonnet-4-6".
 * @returns {string | null} The family key, or null when no rule matches.
 */
export function detectModelFamily(model) {
  const normalized = String(model ?? '').toLowerCase();
  if (Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, normalized)) {
    return MODEL_REGISTRY[normalized].family;
  }

  // Prefix-based heuristic for unregistered models. Order is significant:
  // a longer prefix must be tested before any shorter prefix it starts with.
  if (normalized.startsWith('claude-opus') || normalized.startsWith('opus-')) return 'claude-opus';
  if (normalized.startsWith('claude-sonnet') || normalized.startsWith('sonnet-')) return 'claude-sonnet';
  if (normalized.startsWith('claude-haiku') || normalized.startsWith('haiku-')) return 'claude-haiku';
  if (normalized.startsWith('gpt-4o-mini')) return 'gpt-4o-mini';
  if (normalized.startsWith('gpt-4o')) return 'gpt-4o';
  if (normalized.startsWith('gpt-4.1')) return 'gpt-4.1';
  if (normalized.startsWith('gpt-5')) return 'gpt-5.1';
  if (normalized.startsWith('gpt-4')) return 'gpt-4o'; // assume 4o-class
  if (normalized.startsWith('o1-mini')) return 'o1-mini';
  if (normalized.startsWith('o1')) return 'o1';
  if (normalized.startsWith('o3-mini')) return 'o3-mini';
  if (normalized.startsWith('o4-mini')) return 'o4-mini';
  // Gemini names are matched on a substring; "pro" is checked before "flash".
  if (normalized.startsWith('gemini') && normalized.includes('pro')) return 'gemini-pro';
  if (normalized.startsWith('gemini') && normalized.includes('flash')) return 'gemini-flash';

  return null;
}
|
|
409
|
+
|
|
410
|
+
// ── Config merge ────────────────────────────────────────────────────
|
|
411
|
+
|
|
412
|
+
/**
 * Parse env var overrides for model config.
 * Only returns fields that are explicitly set to a valid value; anything
 * unset, blank or malformed is ignored rather than partially parsed.
 *
 * - ARCHAL_MAX_TOKENS: positive decimal integer
 * - ARCHAL_TEMPERATURE: decimal number in the range [0, 2]
 * - ARCHAL_REASONING_EFFORT: "low", "medium" or "high" (case-insensitive)
 *
 * @returns {Partial<ModelConfig>}
 */
function getEnvOverrides() {
  /** @type {Partial<ModelConfig>} */
  const overrides = {};

  // Unset and blank both read as ''; surrounding whitespace is not significant.
  const readEnv = (name) => (process.env[name] ?? '').trim();

  const maxTokens = readEnv('ARCHAL_MAX_TOKENS');
  // Require digits only: parseInt on its own reads "4096abc" as 4096 and
  // "1e5" as 1, silently applying a value the user never asked for.
  if (/^\+?\d+$/.test(maxTokens)) {
    const parsed = Number.parseInt(maxTokens, 10);
    if (Number.isSafeInteger(parsed) && parsed > 0) {
      overrides.maxTokens = parsed;
    }
  }

  const temperature = readEnv('ARCHAL_TEMPERATURE');
  // Whole-string numeric match so trailing junk ("0.7abc") is rejected.
  if (/^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i.test(temperature)) {
    const parsed = Number(temperature);
    if (parsed >= 0 && parsed <= 2) {
      overrides.temperature = parsed;
    }
  }

  const reasoning = readEnv('ARCHAL_REASONING_EFFORT').toLowerCase();
  if (['low', 'medium', 'high'].includes(reasoning)) {
    overrides.reasoningEffort = reasoning;
  }

  return overrides;
}
|
|
446
|
+
|
|
447
|
+
/**
 * Get the merged configuration for a model.
 * Priority: env var overrides > model-specific defaults > family defaults > global defaults.
 *
 * Model-specific defaults are looked up by the identifier as given and then
 * lower-cased, so a differently-cased name picks up both its family defaults
 * (detectModelFamily lower-cases the name) and its own registry defaults.
 *
 * @param {string} model - Model identifier
 * @returns {ModelConfig} A new object; the default tables are not mutated
 */
export function getModelConfig(model) {
  const family = detectModelFamily(model);
  const familyDefaults = family ? (FAMILY_DEFAULTS[family] ?? {}) : {};

  const exact = String(model ?? '');
  let modelDefaults = {};
  for (const key of [exact, exact.toLowerCase()]) {
    // Own keys only, so inherited object properties never count as a model
    if (Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, key)) {
      modelDefaults = MODEL_REGISTRY[key]?.defaults ?? {};
      break;
    }
  }

  const envOverrides = getEnvOverrides();

  // Later spreads win, which implements the documented priority order.
  return {
    ...GLOBAL_DEFAULTS,
    ...familyDefaults,
    ...modelDefaults,
    ...envOverrides,
  };
}
|
|
467
|
+
|
|
468
|
+
/**
 * Check if a model is a reasoning model (o1, o3, o4 series).
 * Reasoning models don't support temperature and use reasoning_effort instead.
 *
 * Registered models answer from their registry capabilities; the identifier
 * is tried as given and then lower-cased, using the registry's own keys only.
 * Unregistered models fall back to a name heuristic that accepts both bare
 * ids ("o3") and suffixed ones ("o3-mini"), case-insensitively.
 *
 * @param {string} model - Model identifier
 * @returns {boolean}
 */
export function isReasoningModel(model) {
  const exact = String(model ?? '');
  const normalized = exact.toLowerCase();

  for (const key of [exact, normalized]) {
    const info = Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, key)
      ? MODEL_REGISTRY[key]
      : undefined;
    if (info) return info.capabilities.supportsReasoning;
  }

  // Fallback heuristic: "o1"/"o3"/"o4" alone or followed by "-".
  // The boundary keeps names like "o30" or "opus-4" from matching.
  return /^o[134](?:-|$)/.test(normalized);
}
|
|
480
|
+
|
|
481
|
+
/**
 * Check if a model supports extended thinking (Anthropic thinking blocks, Gemini thinking parts).
 *
 * Registered models answer from their registry capabilities (lower-cased
 * name, own keys only, so a name like "constructor" cannot throw). Every
 * unregistered model is assumed to support thinking: the previous per-vendor
 * prefix checks all returned true as well, so they are folded into one default.
 *
 * @param {string} model - Model identifier
 * @returns {boolean}
 */
export function isThinkingModel(model) {
  const normalized = String(model ?? '').toLowerCase();

  const info = Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, normalized)
    ? MODEL_REGISTRY[normalized]
    : undefined;
  if (info) return info.capabilities.supportsThinking;

  // Default to true for unknown models — most modern models support thinking
  return true;
}
|
|
501
|
+
|
|
502
|
+
/**
 * List the identifier of every model present in the registry.
 * @returns {string[]} Registry keys, in the registry's own enumeration order
 */
export function listKnownModels() {
  return Object.entries(MODEL_REGISTRY).map(([modelName]) => modelName);
}
|
|
509
|
+
|
|
510
|
+
/**
 * Get all known models grouped by benchmark status.
 *
 * The result always has the four buckets working / degraded / broken /
 * untested. A registry entry whose benchmarkStatus is missing or is not one
 * of those four names is listed under "untested" instead of throwing.
 *
 * @returns {Record<BenchmarkStatus, string[]>}
 */
export function listModelsByStatus() {
  /** @type {Record<string, string[]>} */
  const grouped = { working: [], degraded: [], broken: [], untested: [] };

  for (const [name, info] of Object.entries(MODEL_REGISTRY)) {
    const status = info?.benchmarkStatus;
    // Own-key check so a status like "constructor" cannot resolve to an
    // inherited property of the plain `grouped` object.
    const isKnownStatus = typeof status === 'string'
      && Object.prototype.hasOwnProperty.call(grouped, status);
    grouped[isKnownStatus ? status : 'untested'].push(name);
  }

  return grouped;
}
|