dual-brain 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +97 -0
- package/CLAUDE.md +147 -0
- package/LICENSE +21 -0
- package/README.md +197 -0
- package/agents/implementer.md +22 -0
- package/agents/researcher.md +25 -0
- package/agents/verifier.md +30 -0
- package/bin/dual-brain.mjs +2868 -0
- package/hooks/auto-update-wrapper.mjs +102 -0
- package/hooks/auto-update.sh +67 -0
- package/hooks/budget-balancer.mjs +679 -0
- package/hooks/control-panel.mjs +1195 -0
- package/hooks/cost-logger.mjs +286 -0
- package/hooks/cost-report.mjs +351 -0
- package/hooks/decision-ledger.mjs +299 -0
- package/hooks/dual-brain-review.mjs +404 -0
- package/hooks/dual-brain-think.mjs +393 -0
- package/hooks/enforce-tier.mjs +469 -0
- package/hooks/failure-detector.mjs +138 -0
- package/hooks/gpt-work-dispatcher.mjs +512 -0
- package/hooks/head-guard.mjs +105 -0
- package/hooks/health-check.mjs +444 -0
- package/hooks/install-git-hooks.mjs +106 -0
- package/hooks/model-registry.mjs +859 -0
- package/hooks/plan-generator.mjs +544 -0
- package/hooks/profiles.mjs +254 -0
- package/hooks/quality-gate.mjs +355 -0
- package/hooks/risk-classifier.mjs +41 -0
- package/hooks/session-report.mjs +514 -0
- package/hooks/setup-wizard.mjs +130 -0
- package/hooks/summary-checkpoint.mjs +432 -0
- package/hooks/task-classifier.mjs +328 -0
- package/hooks/test-orchestrator.mjs +1077 -0
- package/hooks/vibe-memory.mjs +463 -0
- package/hooks/vibe-router.mjs +387 -0
- package/hooks/wave-orchestrator.mjs +1397 -0
- package/install.mjs +1541 -0
- package/mcp-server/README.md +81 -0
- package/mcp-server/index.mjs +388 -0
- package/orchestrator.json +215 -0
- package/package.json +108 -0
- package/playbooks/debug.json +49 -0
- package/playbooks/refactor.json +57 -0
- package/playbooks/security-audit.json +57 -0
- package/playbooks/security.json +38 -0
- package/playbooks/test-gen.json +48 -0
- package/plugin.json +22 -0
- package/review-rules.md +17 -0
- package/shell-hook.sh +26 -0
- package/skills/go.md +22 -0
- package/skills/review.md +19 -0
- package/skills/status.md +13 -0
- package/skills/think.md +22 -0
- package/src/brief.mjs +266 -0
- package/src/decide.mjs +635 -0
- package/src/decompose.mjs +331 -0
- package/src/detect.mjs +345 -0
- package/src/dispatch.mjs +942 -0
- package/src/health.mjs +253 -0
- package/src/index.mjs +44 -0
- package/src/install-hooks.mjs +100 -0
- package/src/playbook.mjs +257 -0
- package/src/profile.mjs +990 -0
- package/src/redact.mjs +192 -0
- package/src/repo.mjs +292 -0
- package/src/session.mjs +1036 -0
- package/src/tui.mjs +197 -0
- package/src/update-check.mjs +35 -0
|
@@ -0,0 +1,859 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* model-registry.mjs — Comprehensive model capability registry with outcome tracking.
|
|
4
|
+
* Knows every model's strengths, weaknesses, quirks, modes, and dispatch config.
|
|
5
|
+
* Tracks outcomes to learn which models work best for which tasks.
|
|
6
|
+
*
|
|
7
|
+
* Exports: getModelInfo, getAllModels, recordOutcome, getSuccessRate, getBestModelFor,
|
|
8
|
+
* refreshRegistry, getDispatchConfig, getCapabilities, recommendEffort, shouldUseExtendedContext, shouldUseFastMode, MODEL_CAPABILITIES
|
|
9
|
+
* CLI: node hooks/model-registry.mjs [--best-for <intent>] [--success-rates] [--refresh] [--caps <model>]
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import fs from 'fs';
|
|
13
|
+
import path from 'path';
|
|
14
|
+
import { fileURLToPath } from 'url';
|
|
15
|
+
import { execSync } from 'child_process';
|
|
16
|
+
|
|
17
|
+
// Directory containing this module, derived from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
18
|
+
// `.dualbrain` directory located one level above this module's directory.
const OUTCOMES_DIR = path.join(__dirname, '..', '.dualbrain');
|
|
19
|
+
// Outcome log inside OUTCOMES_DIR: one JSON-encoded record per line (JSONL).
const OUTCOMES_FILE = path.join(OUTCOMES_DIR, 'model-outcomes.jsonl');
|
|
20
|
+
|
|
21
|
+
// ─── Model Capabilities & Modes ──────────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
export const MODEL_CAPABILITIES = {
|
|
24
|
+
// ── Claude Models ──────────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
haiku: {
|
|
27
|
+
provider: 'claude',
|
|
28
|
+
fullName: 'Claude Haiku 4.5',
|
|
29
|
+
tier: 'search',
|
|
30
|
+
cost: 'minimal',
|
|
31
|
+
contextWindow: 200_000,
|
|
32
|
+
maxOutput: 64_000,
|
|
33
|
+
|
|
34
|
+
strengths: ['search', 'format', 'lookup', 'classification', 'simple-qa', 'grep-analysis'],
|
|
35
|
+
weaknesses: ['complex-edits', 'architecture', 'security', 'multi-file-refactor', 'ambiguous-requirements'],
|
|
36
|
+
bestFor: ['file-search', 'grep', 'formatting', 'lint-fixes', 'simple-classification', 'read-only-exploration'],
|
|
37
|
+
avoidFor: ['multi-file-edits', 'architecture', 'security-review', 'complex-debug', 'code-generation'],
|
|
38
|
+
|
|
39
|
+
reasoning: {
|
|
40
|
+
extendedThinking: false,
|
|
41
|
+
adaptiveThinking: false,
|
|
42
|
+
effortLevels: null,
|
|
43
|
+
defaultEffort: null,
|
|
44
|
+
},
|
|
45
|
+
modes: {
|
|
46
|
+
fastMode: false,
|
|
47
|
+
extendedContext: false,
|
|
48
|
+
webSearch: false,
|
|
49
|
+
worktreeIsolation: true,
|
|
50
|
+
},
|
|
51
|
+
dispatch: {
|
|
52
|
+
method: 'claude-agent',
|
|
53
|
+
flag: "model: 'haiku'",
|
|
54
|
+
example: "Agent({ model: 'haiku', prompt: '...' })",
|
|
55
|
+
},
|
|
56
|
+
latency: 'fastest',
|
|
57
|
+
quirks: [
|
|
58
|
+
'No extended thinking — pure fast inference',
|
|
59
|
+
'Will confidently hallucinate on complex multi-file edits',
|
|
60
|
+
'Excellent at pattern matching and classification tasks',
|
|
61
|
+
'Cost is ~10x less than Sonnet per token',
|
|
62
|
+
'200K context only — no 1M extended context available',
|
|
63
|
+
],
|
|
64
|
+
},
|
|
65
|
+
|
|
66
|
+
sonnet: {
|
|
67
|
+
provider: 'claude',
|
|
68
|
+
fullName: 'Claude Sonnet 4.6',
|
|
69
|
+
tier: 'execute',
|
|
70
|
+
cost: 'moderate',
|
|
71
|
+
contextWindow: 200_000,
|
|
72
|
+
extendedContextWindow: 1_000_000,
|
|
73
|
+
maxOutput: 64_000,
|
|
74
|
+
|
|
75
|
+
strengths: ['edit', 'refactor', 'test', 'debug', 'code-generation', 'tool-use', 'multi-file-edits'],
|
|
76
|
+
weaknesses: ['deep-architecture', 'ambiguous-requirements', 'frontier-reasoning', 'novel-algorithm-design'],
|
|
77
|
+
bestFor: ['file-edits', 'test-writing', 'bug-fixes', 'refactoring', 'code-generation', 'moderate-debug'],
|
|
78
|
+
avoidFor: ['architecture-decisions', 'security-audit', 'complex-system-design', 'ambiguous-specs'],
|
|
79
|
+
|
|
80
|
+
reasoning: {
|
|
81
|
+
extendedThinking: true,
|
|
82
|
+
adaptiveThinking: true,
|
|
83
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh', 'max'],
|
|
84
|
+
defaultEffort: 'high',
|
|
85
|
+
disableAdaptiveEnv: 'CLAUDE_CODE_DISABLE_ADAPTIVE_THINKING=1',
|
|
86
|
+
fixedBudgetEnv: 'MAX_THINKING_TOKENS',
|
|
87
|
+
},
|
|
88
|
+
modes: {
|
|
89
|
+
fastMode: false,
|
|
90
|
+
extendedContext: true,
|
|
91
|
+
extendedContextSuffix: '[1m]',
|
|
92
|
+
webSearch: false,
|
|
93
|
+
worktreeIsolation: true,
|
|
94
|
+
},
|
|
95
|
+
dispatch: {
|
|
96
|
+
method: 'claude-agent',
|
|
97
|
+
flag: "model: 'sonnet'",
|
|
98
|
+
example: "Agent({ model: 'sonnet', prompt: '...' })",
|
|
99
|
+
},
|
|
100
|
+
latency: 'medium',
|
|
101
|
+
quirks: [
|
|
102
|
+
'Extended thinking available but costs extra thinking tokens',
|
|
103
|
+
'1M context requires "extra usage" on all plans',
|
|
104
|
+
'Sweet spot for execution: good enough reasoning at moderate cost',
|
|
105
|
+
'Adaptive thinking adjusts depth per-turn automatically',
|
|
106
|
+
'Default effort "high" gives good quality without excessive token burn',
|
|
107
|
+
'Can be spawned as subagent with Agent(model: "sonnet")',
|
|
108
|
+
],
|
|
109
|
+
},
|
|
110
|
+
|
|
111
|
+
opus: {
|
|
112
|
+
provider: 'claude',
|
|
113
|
+
fullName: 'Claude Opus 4.6 / 4.7',
|
|
114
|
+
tier: 'think',
|
|
115
|
+
cost: 'expensive',
|
|
116
|
+
contextWindow: 200_000,
|
|
117
|
+
extendedContextWindow: 1_000_000,
|
|
118
|
+
maxOutput: 128_000,
|
|
119
|
+
|
|
120
|
+
strengths: ['architecture', 'security', 'complex-debug', 'review', 'planning',
|
|
121
|
+
'ambiguous-requirements', 'novel-algorithm-design', 'multi-system-reasoning',
|
|
122
|
+
'threat-modeling', 'code-review', 'design-decisions'],
|
|
123
|
+
weaknesses: ['cost', 'overkill-for-simple-tasks', 'latency-for-trivial-work'],
|
|
124
|
+
bestFor: ['architecture-decisions', 'security-review', 'complex-debug', 'code-review',
|
|
125
|
+
'planning', 'dual-brain-think', 'ambiguous-specs', 'system-design'],
|
|
126
|
+
avoidFor: ['simple-edits', 'formatting', 'grep', 'file-search', 'lint-fixes'],
|
|
127
|
+
|
|
128
|
+
reasoning: {
|
|
129
|
+
extendedThinking: true,
|
|
130
|
+
adaptiveThinking: true,
|
|
131
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh', 'max'],
|
|
132
|
+
defaultEffort: 'xhigh',
|
|
133
|
+
disableAdaptiveEnv: 'CLAUDE_CODE_DISABLE_ADAPTIVE_THINKING=1',
|
|
134
|
+
fixedBudgetEnv: 'MAX_THINKING_TOKENS',
|
|
135
|
+
ultrathinkKeyword: true,
|
|
136
|
+
},
|
|
137
|
+
modes: {
|
|
138
|
+
fastMode: true,
|
|
139
|
+
fastModeSpeedup: '2.5x',
|
|
140
|
+
fastModeCostMultiplier: '~2x per token',
|
|
141
|
+
extendedContext: true,
|
|
142
|
+
extendedContextSuffix: '[1m]',
|
|
143
|
+
webSearch: false,
|
|
144
|
+
worktreeIsolation: true,
|
|
145
|
+
agentTeams: true,
|
|
146
|
+
agentTeamsEnv: 'CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1',
|
|
147
|
+
},
|
|
148
|
+
dispatch: {
|
|
149
|
+
method: 'main-session',
|
|
150
|
+
flag: "model: 'opus'",
|
|
151
|
+
example: "Agent({ model: 'opus', prompt: '...' }) or main session",
|
|
152
|
+
note: 'Prefer main session for think-tier; spawn as agent only for parallel think tasks',
|
|
153
|
+
},
|
|
154
|
+
latency: 'slow',
|
|
155
|
+
quirks: [
|
|
156
|
+
'Opus 4.7 always uses adaptive reasoning (cannot disable)',
|
|
157
|
+
'Opus 4.6 can disable adaptive thinking for fixed budget control',
|
|
158
|
+
'Fast mode (/fast): 2.5x speed but ~2x cost per token — use for iteration',
|
|
159
|
+
'"ultrathink" keyword in prompt triggers deeper reasoning for one turn',
|
|
160
|
+
'1M context auto-upgrades on Max/Team/Enterprise plans',
|
|
161
|
+
'xhigh effort recommended for Opus 4.7 — default minimum for think-tier',
|
|
162
|
+
'"max" effort removes token cap entirely — use for session-critical decisions only',
|
|
163
|
+
'Thinking tokens are billable even when collapsed/hidden',
|
|
164
|
+
'opusplan alias: Opus in plan mode, auto-switches to Sonnet for execution',
|
|
165
|
+
],
|
|
166
|
+
},
|
|
167
|
+
|
|
168
|
+
// ── OpenAI Models (via Codex CLI) ──────────────────────────────────────────
|
|
169
|
+
|
|
170
|
+
'gpt-4.1-mini': {
|
|
171
|
+
provider: 'openai',
|
|
172
|
+
fullName: 'GPT-4.1 Mini',
|
|
173
|
+
tier: 'search',
|
|
174
|
+
cost: 'minimal',
|
|
175
|
+
contextWindow: 1_047_576,
|
|
176
|
+
maxOutput: 32_768,
|
|
177
|
+
|
|
178
|
+
strengths: ['search', 'format', 'simple-edits', 'classification', 'fast-lookups'],
|
|
179
|
+
weaknesses: ['complex-refactors', 'architecture', 'multi-file-edits', 'reasoning'],
|
|
180
|
+
bestFor: ['grep-analysis', 'simple-formatting', 'classification', 'file-search', 'quick-lookups'],
|
|
181
|
+
avoidFor: ['refactoring', 'architecture', 'security', 'complex-debug', 'code-generation'],
|
|
182
|
+
|
|
183
|
+
reasoning: {
|
|
184
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh'],
|
|
185
|
+
defaultEffort: 'medium',
|
|
186
|
+
codexFlag: '-c reasoning.effort="<level>"',
|
|
187
|
+
},
|
|
188
|
+
modes: {
|
|
189
|
+
webSearch: true,
|
|
190
|
+
webSearchFlag: '--search',
|
|
191
|
+
sandbox: { search: 'read-only', execute: 'danger-full-access' },
|
|
192
|
+
},
|
|
193
|
+
dispatch: {
|
|
194
|
+
method: 'codex-exec',
|
|
195
|
+
example: 'codex exec -m gpt-4.1-mini -s read-only "..."',
|
|
196
|
+
},
|
|
197
|
+
latency: 'fastest',
|
|
198
|
+
quirks: [
|
|
199
|
+
'Cheapest OpenAI model — great for high-volume search tasks',
|
|
200
|
+
'Large 1M context window but weak reasoning',
|
|
201
|
+
'Max output only 32K (vs 128K for newer models)',
|
|
202
|
+
'Legacy model — may be deprecated; prefer gpt-5.3-codex-spark for speed',
|
|
203
|
+
],
|
|
204
|
+
},
|
|
205
|
+
|
|
206
|
+
'gpt-4.1': {
|
|
207
|
+
provider: 'openai',
|
|
208
|
+
fullName: 'GPT-4.1',
|
|
209
|
+
tier: 'execute',
|
|
210
|
+
cost: 'low',
|
|
211
|
+
contextWindow: 1_047_576,
|
|
212
|
+
maxOutput: 32_768,
|
|
213
|
+
|
|
214
|
+
strengths: ['edits', 'test-fixes', 'straightforward-tasks', 'instruction-following'],
|
|
215
|
+
weaknesses: ['complex-architecture', 'ambiguous-debug', 'frontier-reasoning'],
|
|
216
|
+
bestFor: ['simple-edits', 'test-fixes', 'boilerplate', 'straightforward-refactors'],
|
|
217
|
+
avoidFor: ['architecture', 'security-audit', 'complex-debug', 'novel-design'],
|
|
218
|
+
|
|
219
|
+
reasoning: {
|
|
220
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh'],
|
|
221
|
+
defaultEffort: 'medium',
|
|
222
|
+
codexFlag: '-c reasoning.effort="<level>"',
|
|
223
|
+
},
|
|
224
|
+
modes: {
|
|
225
|
+
webSearch: true,
|
|
226
|
+
webSearchFlag: '--search',
|
|
227
|
+
sandbox: { search: 'read-only', execute: 'danger-full-access' },
|
|
228
|
+
},
|
|
229
|
+
dispatch: {
|
|
230
|
+
method: 'codex-exec',
|
|
231
|
+
example: 'codex exec -m gpt-4.1 -s danger-full-access "..."',
|
|
232
|
+
},
|
|
233
|
+
latency: 'fast',
|
|
234
|
+
quirks: [
|
|
235
|
+
'Legacy model — solid but outclassed by gpt-5.x series',
|
|
236
|
+
'Good instruction following for well-specified tasks',
|
|
237
|
+
'Max output 32K limits usefulness for large code generation',
|
|
238
|
+
'Best used when budget is tight and task is simple',
|
|
239
|
+
],
|
|
240
|
+
},
|
|
241
|
+
|
|
242
|
+
'gpt-5.2': {
|
|
243
|
+
provider: 'openai',
|
|
244
|
+
fullName: 'GPT-5.2',
|
|
245
|
+
tier: 'execute',
|
|
246
|
+
cost: 'low',
|
|
247
|
+
contextWindow: 400_000,
|
|
248
|
+
maxOutput: 128_000,
|
|
249
|
+
|
|
250
|
+
strengths: ['edits', 'test-fixes', 'simple-refactors', 'explanations', 'budget-execution'],
|
|
251
|
+
weaknesses: ['complex-architecture', 'frontier-reasoning', 'novel-design'],
|
|
252
|
+
bestFor: ['budget-edits', 'test-fixes', 'explanations', 'simple-refactors', 'documentation'],
|
|
253
|
+
avoidFor: ['architecture', 'security', 'complex-multi-file-refactor'],
|
|
254
|
+
|
|
255
|
+
reasoning: {
|
|
256
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh'],
|
|
257
|
+
defaultEffort: 'medium',
|
|
258
|
+
codexFlag: '-c reasoning.effort="<level>"',
|
|
259
|
+
},
|
|
260
|
+
modes: {
|
|
261
|
+
webSearch: true,
|
|
262
|
+
webSearchFlag: '--search',
|
|
263
|
+
sandbox: { search: 'read-only', execute: 'danger-full-access' },
|
|
264
|
+
},
|
|
265
|
+
dispatch: {
|
|
266
|
+
method: 'codex-exec',
|
|
267
|
+
example: 'codex exec -m gpt-5.2 -s danger-full-access "..."',
|
|
268
|
+
},
|
|
269
|
+
latency: 'fast',
|
|
270
|
+
quirks: [
|
|
271
|
+
'Good value: 128K output at low cost',
|
|
272
|
+
'400K context — smaller than gpt-4.1 but sufficient for most tasks',
|
|
273
|
+
'Previous generation — solid reasoning but not frontier',
|
|
274
|
+
'Best budget option for execution tasks that need 128K output',
|
|
275
|
+
],
|
|
276
|
+
},
|
|
277
|
+
|
|
278
|
+
'gpt-5.3-codex': {
|
|
279
|
+
provider: 'openai',
|
|
280
|
+
fullName: 'GPT-5.3 Codex',
|
|
281
|
+
tier: 'execute',
|
|
282
|
+
cost: 'moderate',
|
|
283
|
+
contextWindow: 400_000,
|
|
284
|
+
maxOutput: 128_000,
|
|
285
|
+
|
|
286
|
+
strengths: ['code-generation', 'edit', 'refactor', 'test', 'debug', 'bulk-edits',
|
|
287
|
+
'agentic-coding', 'tool-use', 'multi-step-execution'],
|
|
288
|
+
weaknesses: ['deep-architecture', 'non-code-reasoning'],
|
|
289
|
+
bestFor: ['code-generation', 'bulk-file-edits', 'test-writing', 'refactoring',
|
|
290
|
+
'agentic-coding-tasks', 'multi-step-execution'],
|
|
291
|
+
avoidFor: ['architecture-decisions', 'security-audit', 'non-code-tasks'],
|
|
292
|
+
|
|
293
|
+
reasoning: {
|
|
294
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh'],
|
|
295
|
+
defaultEffort: 'medium',
|
|
296
|
+
codexFlag: '-c reasoning.effort="<level>"',
|
|
297
|
+
},
|
|
298
|
+
modes: {
|
|
299
|
+
webSearch: true,
|
|
300
|
+
webSearchFlag: '--search',
|
|
301
|
+
sandbox: { search: 'read-only', execute: 'danger-full-access' },
|
|
302
|
+
},
|
|
303
|
+
dispatch: {
|
|
304
|
+
method: 'codex-exec',
|
|
305
|
+
example: 'codex exec -m gpt-5.3-codex -s danger-full-access "..."',
|
|
306
|
+
},
|
|
307
|
+
latency: 'medium',
|
|
308
|
+
quirks: [
|
|
309
|
+
'Purpose-built for agentic coding — optimized for Codex CLI',
|
|
310
|
+
'Excellent at multi-step tool use chains',
|
|
311
|
+
'400K context (not 1M) — plan file batches accordingly',
|
|
312
|
+
'API pricing: ~$1.75/$14 per 1M tokens (input/output)',
|
|
313
|
+
'Sweet spot for execution-tier Codex dispatch',
|
|
314
|
+
],
|
|
315
|
+
},
|
|
316
|
+
|
|
317
|
+
'gpt-5.3-codex-spark': {
|
|
318
|
+
provider: 'openai',
|
|
319
|
+
fullName: 'GPT-5.3 Codex Spark',
|
|
320
|
+
tier: 'execute',
|
|
321
|
+
cost: 'moderate',
|
|
322
|
+
contextWindow: 128_000,
|
|
323
|
+
maxOutput: 128_000,
|
|
324
|
+
|
|
325
|
+
strengths: ['code-generation', 'fast-edits', 'refactor', 'test', 'debug', 'speed'],
|
|
326
|
+
weaknesses: ['deep-architecture', 'ambiguous-requirements', 'vision', 'small-context'],
|
|
327
|
+
bestFor: ['fast-iteration', 'quick-edits', 'test-fixes', 'speed-critical-execution'],
|
|
328
|
+
avoidFor: ['large-codebase-refactors', 'architecture', 'image-analysis', 'long-context-tasks'],
|
|
329
|
+
|
|
330
|
+
reasoning: {
|
|
331
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh'],
|
|
332
|
+
defaultEffort: 'high',
|
|
333
|
+
codexFlag: '-c reasoning.effort="<level>"',
|
|
334
|
+
},
|
|
335
|
+
modes: {
|
|
336
|
+
webSearch: false,
|
|
337
|
+
sandbox: { search: 'read-only', execute: 'danger-full-access' },
|
|
338
|
+
},
|
|
339
|
+
dispatch: {
|
|
340
|
+
method: 'codex-exec',
|
|
341
|
+
example: 'codex exec -m gpt-5.3-codex-spark -s danger-full-access "..."',
|
|
342
|
+
},
|
|
343
|
+
latency: 'fastest',
|
|
344
|
+
quirks: [
|
|
345
|
+
'1000+ tokens/sec — fastest code model available',
|
|
346
|
+
'TEXT ONLY — no vision/image support',
|
|
347
|
+
'Only 128K context — smallest window of any current model',
|
|
348
|
+
'Default effort "high" because speed compensates for token cost',
|
|
349
|
+
'Research preview — may change behavior between versions',
|
|
350
|
+
'Best for rapid iteration loops where latency matters more than depth',
|
|
351
|
+
],
|
|
352
|
+
},
|
|
353
|
+
|
|
354
|
+
'gpt-5.4-mini': {
|
|
355
|
+
provider: 'openai',
|
|
356
|
+
fullName: 'GPT-5.4 Mini',
|
|
357
|
+
tier: 'execute',
|
|
358
|
+
cost: 'moderate',
|
|
359
|
+
contextWindow: 400_000,
|
|
360
|
+
maxOutput: 128_000,
|
|
361
|
+
|
|
362
|
+
strengths: ['edits', 'refactors', 'moderate-debug', 'balanced-cost-quality'],
|
|
363
|
+
weaknesses: ['frontier-reasoning', 'complex-architecture'],
|
|
364
|
+
bestFor: ['moderate-edits', 'balanced-budget-tasks', 'refactoring', 'test-writing'],
|
|
365
|
+
avoidFor: ['architecture', 'security', 'frontier-reasoning-tasks'],
|
|
366
|
+
|
|
367
|
+
reasoning: {
|
|
368
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh'],
|
|
369
|
+
defaultEffort: 'medium',
|
|
370
|
+
codexFlag: '-c reasoning.effort="<level>"',
|
|
371
|
+
},
|
|
372
|
+
modes: {
|
|
373
|
+
webSearch: true,
|
|
374
|
+
webSearchFlag: '--search',
|
|
375
|
+
sandbox: { search: 'read-only', execute: 'danger-full-access' },
|
|
376
|
+
},
|
|
377
|
+
dispatch: {
|
|
378
|
+
method: 'codex-exec',
|
|
379
|
+
example: 'codex exec -m gpt-5.4-mini -s danger-full-access "..."',
|
|
380
|
+
},
|
|
381
|
+
latency: 'medium',
|
|
382
|
+
quirks: [
|
|
383
|
+
'Balanced cost/quality — smaller version of gpt-5.4',
|
|
384
|
+
'400K context is sufficient for most single-module work',
|
|
385
|
+
'Good fallback when gpt-5.4 is too expensive but task needs reasoning',
|
|
386
|
+
],
|
|
387
|
+
},
|
|
388
|
+
|
|
389
|
+
'gpt-5.4': {
|
|
390
|
+
provider: 'openai',
|
|
391
|
+
fullName: 'GPT-5.4',
|
|
392
|
+
tier: 'execute',
|
|
393
|
+
cost: 'moderate',
|
|
394
|
+
contextWindow: 1_050_000,
|
|
395
|
+
maxOutput: 128_000,
|
|
396
|
+
|
|
397
|
+
strengths: ['edit', 'refactor', 'test', 'debug', 'code-generation', 'bulk-edits',
|
|
398
|
+
'tool-use', 'computer-use', 'agentic-coding', 'multi-file-refactor'],
|
|
399
|
+
weaknesses: ['deep-architecture', 'cost-for-simple-tasks'],
|
|
400
|
+
bestFor: ['bulk-file-edits', 'complex-refactoring', 'test-suites', 'debug',
|
|
401
|
+
'agentic-execution', 'multi-step-tool-chains'],
|
|
402
|
+
avoidFor: ['simple-formatting', 'grep', 'architecture-decisions-alone'],
|
|
403
|
+
|
|
404
|
+
reasoning: {
|
|
405
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh'],
|
|
406
|
+
defaultEffort: 'medium',
|
|
407
|
+
codexFlag: '-c reasoning.effort="<level>"',
|
|
408
|
+
},
|
|
409
|
+
modes: {
|
|
410
|
+
webSearch: true,
|
|
411
|
+
webSearchFlag: '--search',
|
|
412
|
+
computerUse: true,
|
|
413
|
+
toolSearch: true,
|
|
414
|
+
sandbox: { search: 'read-only', execute: 'danger-full-access' },
|
|
415
|
+
},
|
|
416
|
+
dispatch: {
|
|
417
|
+
method: 'codex-exec',
|
|
418
|
+
example: 'codex exec -m gpt-5.4 -s danger-full-access -c reasoning.effort="high" "..."',
|
|
419
|
+
},
|
|
420
|
+
latency: 'medium',
|
|
421
|
+
quirks: [
|
|
422
|
+
'Incorporates gpt-5.3-codex capabilities — can replace it for most tasks',
|
|
423
|
+
'1M+ context window — largest among OpenAI models',
|
|
424
|
+
'Default context in Codex may be 272K; full 1M via API config',
|
|
425
|
+
'Supports computer use and tool search modes',
|
|
426
|
+
'Workhorse model: best general-purpose execute-tier choice',
|
|
427
|
+
'API pricing: ~$2.50/premium per 1M tokens',
|
|
428
|
+
'Priority service tier available for ~2x cost, ~40% faster',
|
|
429
|
+
],
|
|
430
|
+
},
|
|
431
|
+
|
|
432
|
+
'gpt-5.5': {
|
|
433
|
+
provider: 'openai',
|
|
434
|
+
fullName: 'GPT-5.5',
|
|
435
|
+
tier: 'think',
|
|
436
|
+
cost: 'expensive',
|
|
437
|
+
contextWindow: 1_000_000,
|
|
438
|
+
maxOutput: 128_000,
|
|
439
|
+
|
|
440
|
+
strengths: ['architecture', 'security', 'complex-debug', 'review', 'planning',
|
|
441
|
+
'frontier-reasoning', 'multi-system-reasoning', 'threat-modeling',
|
|
442
|
+
'novel-algorithm-design', 'agentic-research'],
|
|
443
|
+
weaknesses: ['cost', 'latency', 'overkill-for-simple-tasks'],
|
|
444
|
+
bestFor: ['architecture-decisions', 'security-audit', 'complex-debug',
|
|
445
|
+
'code-review', 'planning', 'dual-brain-think', 'frontier-reasoning'],
|
|
446
|
+
avoidFor: ['simple-edits', 'formatting', 'grep', 'file-search', 'bulk-boilerplate'],
|
|
447
|
+
|
|
448
|
+
reasoning: {
|
|
449
|
+
effortLevels: ['low', 'medium', 'high', 'xhigh'],
|
|
450
|
+
defaultEffort: 'medium',
|
|
451
|
+
codexFlag: '-c reasoning.effort="<level>"',
|
|
452
|
+
},
|
|
453
|
+
modes: {
|
|
454
|
+
webSearch: true,
|
|
455
|
+
webSearchFlag: '--search',
|
|
456
|
+
sandbox: { search: 'read-only', execute: 'danger-full-access', think: 'read-only' },
|
|
457
|
+
},
|
|
458
|
+
dispatch: {
|
|
459
|
+
method: 'codex-exec',
|
|
460
|
+
example: 'codex exec -m gpt-5.5 -s read-only -c reasoning.effort="high" "..."',
|
|
461
|
+
},
|
|
462
|
+
latency: 'slow',
|
|
463
|
+
quirks: [
|
|
464
|
+
'Strongest OpenAI model — reserve for think-tier and dual-brain',
|
|
465
|
+
'API pricing: $5/$30 per 1M tokens — most expensive OpenAI model',
|
|
466
|
+
'Codex context may be capped at 400K vs 1M via API',
|
|
467
|
+
'Excellent for independent analysis in dual-brain think/review flows',
|
|
468
|
+
'medium effort is usually sufficient — high/xhigh for truly complex reasoning',
|
|
469
|
+
'Priority tier available: ~40% faster at ~2x cost',
|
|
470
|
+
],
|
|
471
|
+
},
|
|
472
|
+
};
|
|
473
|
+
|
|
474
|
+
// ─── Derived Constants ───────────────────────────────────────────────────────
|
|
475
|
+
|
|
476
|
+
// Compact per-model summary derived from MODEL_CAPABILITIES. Only the fields
// used for filtering and scoring are copied; `efforts` / `defaultEffort` are
// present only for models that declare reasoning effort levels. The object
// itself stays mutable so entries can be added to it later.
const MODELS = Object.fromEntries(
  Object.entries(MODEL_CAPABILITIES).map(([name, cap]) => {
    const summary = {
      provider: cap.provider,
      tier: cap.tier,
      cost: cap.cost,
      strengths: cap.strengths,
      weaknesses: cap.weaknesses,
      contextWindow: cap.contextWindow,
      maxOutput: cap.maxOutput,
    };
    const levels = cap.reasoning?.effortLevels;
    if (levels) {
      summary.efforts = levels;
      summary.defaultEffort = cap.reasoning.defaultEffort;
    }
    return [name, summary];
  }),
);
|
|
492
|
+
|
|
493
|
+
// Ordinal rank of each cost label (lower = cheaper); used for maxCost filtering and the cost penalty in scoring.
const COST_RANK = { minimal: 0, low: 1, moderate: 2, expensive: 3 };
|
|
494
|
+
// Ordinal rank of each latency label (lower = faster); used for maxLatency filtering.
const LATENCY_RANK = { fastest: 0, fast: 1, medium: 2, slow: 3 };
|
|
495
|
+
|
|
496
|
+
// ─── Capability Queries ──────────────────────────────────────────────────────
|
|
497
|
+
|
|
498
|
+
/**
 * Look up the full capability record for a model.
 *
 * @param {string} modelName - Key into MODEL_CAPABILITIES (e.g. 'sonnet').
 * @returns {object|null} The capability record, or null when the model is not registered.
 */
export function getCapabilities(modelName) {
  const capabilities = MODEL_CAPABILITIES[modelName];
  if (capabilities == null) {
    return null;
  }
  return capabilities;
}
|
|
501
|
+
|
|
502
|
+
/**
 * Build a flat dispatch summary for a model from its capability record.
 *
 * Reasoning and mode fields fall back to null / false when the record does not
 * define them; `method` and `example` are read from the record's `dispatch`
 * section.
 *
 * @param {string} modelName - Key into MODEL_CAPABILITIES.
 * @returns {object|null} Dispatch summary, or null when the model is not registered.
 */
export function getDispatchConfig(modelName) {
  const cap = MODEL_CAPABILITIES[modelName];
  if (!cap) {
    return null;
  }

  const { dispatch, reasoning, modes } = cap;
  return {
    method: dispatch.method,
    model: modelName,
    effort: reasoning?.defaultEffort ?? null,
    effortLevels: reasoning?.effortLevels ?? null,
    extendedThinking: reasoning?.extendedThinking ?? false,
    fastMode: modes?.fastMode ?? false,
    extendedContext: modes?.extendedContext ?? false,
    webSearch: modes?.webSearch ?? false,
    sandbox: modes?.sandbox ?? null,
    example: dispatch.example,
  };
}
|
|
518
|
+
|
|
519
|
+
/**
 * Pick a reasoning effort level for a model given task complexity and risk.
 *
 * Rules, checked in order:
 *  - risk 'critical'                       → 'xhigh' if offered, else the last listed level
 *  - complexity 'complex' or risk 'high'   → one step above the model's default (clamped to the last level)
 *  - complexity 'trivial'                  → one step below the model's default (clamped to the first level)
 *  - anything else                         → the model's default effort
 *
 * @param {string} modelName - Key into MODEL_CAPABILITIES.
 * @param {string} taskComplexity - Complexity label; 'complex' and 'trivial' are the values acted on.
 * @param {string} risk - Risk label; 'critical' and 'high' are the values acted on.
 * @returns {string|null} Recommended effort, or null when the model is unknown or declares no effort levels.
 */
export function recommendEffort(modelName, taskComplexity, risk) {
  const reasoning = MODEL_CAPABILITIES[modelName]?.reasoning;
  const ladder = reasoning?.effortLevels;
  if (!ladder) {
    return null;
  }

  const fallback = reasoning.defaultEffort;
  const lastIndex = ladder.length - 1;

  if (risk === 'critical') {
    return ladder.includes('xhigh') ? 'xhigh' : ladder[lastIndex];
  }

  const position = ladder.indexOf(fallback);
  const needsMore = taskComplexity === 'complex' || risk === 'high';
  if (needsMore) {
    return ladder[Math.min(position + 1, lastIndex)];
  }
  if (taskComplexity === 'trivial') {
    return ladder[Math.max(position - 1, 0)];
  }
  return fallback;
}
|
|
539
|
+
|
|
540
|
+
/**
 * Decide whether a request should switch to the model's extended context mode.
 *
 * @param {string} modelName - Key into MODEL_CAPABILITIES.
 * @param {number} estimatedTokens - Estimated token count of the request.
 * @returns {boolean} True only when the model supports extended context and the
 *   estimate exceeds 70% of its standard context window.
 */
export function shouldUseExtendedContext(modelName, estimatedTokens) {
  const cap = MODEL_CAPABILITIES[modelName];
  const supportsExtended = cap?.modes?.extendedContext;
  if (!supportsExtended) {
    return false;
  }
  const threshold = cap.contextWindow * 0.7;
  return estimatedTokens > threshold;
}
|
|
545
|
+
|
|
546
|
+
/**
 * Decide whether fast mode should be used for a model.
 *
 * @param {string} modelName - Key into MODEL_CAPABILITIES.
 * @param {*} isIterating - Caller's flag indicating an iteration loop; returned as-is
 *   when the model supports fast mode.
 * @returns {*} false when the model is unknown or lacks fast mode, otherwise `isIterating`.
 */
export function shouldUseFastMode(modelName, isIterating) {
  const supportsFast = MODEL_CAPABILITIES[modelName]?.modes?.fastMode;
  return supportsFast ? isIterating : false;
}
|
|
551
|
+
|
|
552
|
+
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
553
|
+
|
|
554
|
+
/**
 * Create OUTCOMES_DIR (and any missing parents). Safe to call when the
 * directory already exists because the `recursive` option is set.
 */
function ensureDir() {
  const options = { recursive: true };
  fs.mkdirSync(OUTCOMES_DIR, options);
}
|
|
557
|
+
|
|
558
|
+
/**
 * Load every recorded outcome from the JSONL outcomes file.
 *
 * Lines that are blank, are not valid JSON, or parse to something other than a
 * plain object (e.g. `null`, a number, an array) are skipped, so callers can
 * safely read properties such as `model` on every returned element.
 *
 * @returns {object[]} Parsed outcome records in file order; empty when the file does not exist.
 * @throws {Error} Propagates read errors other than "file not found".
 */
function readOutcomes() {
  let text;
  try {
    text = fs.readFileSync(OUTCOMES_FILE, 'utf8');
  } catch (err) {
    // A missing log simply means nothing has been recorded yet. Reading
    // directly (instead of existsSync + read) avoids a race with deletion.
    if (err?.code === 'ENOENT') return [];
    throw err;
  }

  const records = [];
  for (const line of text.split('\n')) {
    if (line.trim() === '') continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue; // best-effort log: skip malformed lines
    }
    // Valid JSON that is not a record would crash property access downstream.
    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      records.push(parsed);
    }
  }
  return records;
}
|
|
567
|
+
|
|
568
|
+
/**
 * Append one record to the JSONL outcomes file as a single JSON line.
 *
 * The line is appended in place rather than by reading the whole file and
 * rewriting it through a temp file: the read-modify-rename approach drops
 * records when two processes append at the same time (each rewrites from its
 * own stale snapshot) and costs O(file size) per call.
 *
 * @param {object} entry - JSON-serializable record to persist.
 * @throws {Error} Propagates file-system errors from creating the directory or appending.
 */
function atomicAppend(entry) {
  ensureDir();
  const line = `${JSON.stringify(entry)}\n`;
  fs.appendFileSync(OUTCOMES_FILE, line, 'utf8');
}
|
|
579
|
+
|
|
580
|
+
// ─── Core Exports ────────────────────────────────────────────────────────────
|
|
581
|
+
|
|
582
|
+
/**
 * Look up the compact summary record for a model.
 *
 * @param {string} modelName - Key into the MODELS table.
 * @returns {object|null} The summary record, or null when the model is not registered.
 */
export function getModelInfo(modelName) {
  const info = MODELS[modelName];
  if (info == null) {
    return null;
  }
  return info;
}
|
|
585
|
+
|
|
586
|
+
/**
 * List registered models as `{ name, ...summary }` objects, optionally filtered.
 *
 * @param {object} [filter]
 * @param {string} [filter.provider] - Keep only models from this provider.
 * @param {string} [filter.tier] - Keep only models in this tier.
 * @param {string} [filter.maxCost] - Drop models whose cost rank exceeds this label's rank.
 *   Models or labels without a known rank are never dropped by this check.
 * @returns {object[]} Matching models in registry order.
 */
export function getAllModels(filter = {}) {
  const selected = [];
  for (const [name, info] of Object.entries(MODELS)) {
    const model = { name, ...info };

    if (filter.provider && model.provider !== filter.provider) continue;
    if (filter.tier && model.tier !== filter.tier) continue;

    const overBudget =
      filter.maxCost !== undefined && COST_RANK[model.cost] > COST_RANK[filter.maxCost];
    if (overBudget) continue;

    selected.push(model);
  }
  return selected;
}
|
|
596
|
+
|
|
597
|
+
/**
 * Persist one outcome record to the outcomes log.
 *
 * The record starts from a fixed set of defaults (current ISO timestamp, null
 * for unknown fields, false for the `escalated` / `userCorrected` flags) and
 * any property supplied in `entry` overrides or extends them.
 *
 * @param {object} [entry] - Fields describing the outcome (model, intent, success, ...).
 * @returns {void}
 */
export function recordOutcome(entry) {
  const defaults = {
    timestamp: new Date().toISOString(),
    model: null,
    intent: null,
    risk: null,
    complexity: null,
    effort: null,
    success: null,
    testsPassed: null,
    durationMs: null,
    filesChanged: null,
    escalated: false,
    userCorrected: false,
  };
  atomicAppend(Object.assign(defaults, entry));
}
|
|
615
|
+
|
|
616
|
+
/**
 * Compute the empirical success rate of a model from the recorded outcomes.
 *
 * `rate` is successes divided by ALL matching outcomes, so records whose
 * `success` is neither true nor false still count toward `total`.
 *
 * @param {string} model - Model name to match against each record's `model`.
 * @param {string|null} [intent] - When set, only records with this `intent` are counted.
 * @param {object} [options]
 * @param {string|number|Date|null} [options.since] - When set, records with an earlier timestamp are ignored.
 * @param {number} [options.minSamples=3] - Minimum number of matching records required.
 * @returns {{rate: number, total: number, successes: number, failures: number}|null}
 *   Statistics, or null when there are fewer than `minSamples` matching records
 *   or none at all.
 */
export function getSuccessRate(model, intent = null, options = {}) {
  const { since = null, minSamples = 3 } = options;
  const sinceMs = since ? new Date(since).getTime() : 0;

  const outcomes = readOutcomes().filter(o => {
    if (o.model !== model) return false;
    if (intent && o.intent !== intent) return false;
    if (since && new Date(o.timestamp).getTime() < sinceMs) return false;
    return true;
  });

  // The explicit zero check prevents a 0 / 0 = NaN rate when minSamples <= 0;
  // a NaN rate would silently lose every score comparison made by callers.
  if (outcomes.length === 0 || outcomes.length < minSamples) return null;

  const successes = outcomes.filter(o => o.success === true).length;
  const failures = outcomes.filter(o => o.success === false).length;
  return {
    rate: successes / outcomes.length,
    total: outcomes.length,
    successes,
    failures,
  };
}
|
|
638
|
+
|
|
639
|
+
/**
 * Choose the highest-scoring registered model for an intent.
 *
 * Candidates come from getAllModels (filtered by provider / maxCost) and are
 * further narrowed by maxLatency. Each candidate gets a static score
 * (+1 if the intent is one of its strengths, +0.2 if listed in bestFor,
 * -0.5 if listed in avoidFor). When enough outcome samples exist, the final
 * score blends 40% static score with 60% empirical success rate; otherwise the
 * static score is reduced by 0.05 per cost rank. On ties the earlier candidate wins.
 *
 * @param {string} intent - Task intent to match (e.g. 'edit').
 * @param {string|null} [provider] - Restrict candidates to one provider.
 * @param {object} [options]
 * @param {number} [options.minSamples=3] - Samples required before empirical data is used.
 * @param {string|null} [options.maxCost] - Highest acceptable cost label.
 * @param {string|null} [options.maxLatency] - Highest acceptable latency label.
 * @returns {{model: string, provider: string, successRate: number|null, sampleSize: number,
 *   reason: string, dispatch: object|null}|null} Best match, or null when no candidate remains.
 */
export function getBestModelFor(intent, provider = null, options = {}) {
  const { minSamples = 3, maxCost = null, maxLatency = null } = options;

  const query = {};
  if (provider) query.provider = provider;
  if (maxCost) query.maxCost = maxCost;

  let pool = getAllModels(query);
  if (maxLatency) {
    pool = pool.filter(({ name }) => {
      const caps = MODEL_CAPABILITIES[name];
      const tooSlow = caps && LATENCY_RANK[caps.latency] > LATENCY_RANK[maxLatency];
      return !tooSlow;
    });
  }

  if (pool.length === 0) return null;

  let winner = null;
  let topScore = -Infinity;

  for (const candidate of pool) {
    const caps = MODEL_CAPABILITIES[candidate.name];
    const strengthPoints = candidate.strengths.includes(intent) ? 1 : 0;
    const bestForPoints = caps?.bestFor?.includes(intent) ? 0.2 : 0;
    const avoidPoints = caps?.avoidFor?.includes(intent) ? -0.5 : 0;
    const stats = getSuccessRate(candidate.name, intent, { minSamples });

    const staticScore = strengthPoints + bestForPoints + avoidPoints;
    let score;
    let reason;

    if (stats) {
      score = staticScore * 0.4 + stats.rate * 0.6;
      reason = `empirical rate ${(stats.rate * 100).toFixed(0)}% over ${stats.total} samples`;
    } else {
      score = staticScore - COST_RANK[candidate.cost] * 0.05;
      reason = strengthPoints
        ? `static strength match for "${intent}"`
        : 'no direct strength match, lowest-cost option';
    }

    if (score <= topScore || Number.isNaN(score)) continue;

    topScore = score;
    winner = {
      model: candidate.name,
      provider: candidate.provider,
      successRate: stats ? stats.rate : null,
      sampleSize: stats ? stats.total : 0,
      reason,
      dispatch: caps?.dispatch ?? null,
    };
  }

  return winner;
}
|
|
692
|
+
|
|
693
|
+
/**
 * Reconcile the in-memory `MODELS` table with the models that should exist:
 * the three Claude aliases plus whatever `codex debug models` reports.
 *
 * Unknown names are added to `MODELS` as uncalibrated placeholder entries.
 * Nothing is ever deleted, so `removed` is always an empty array. If the
 * `codex` command fails, times out, or prints non-JSON output, the OpenAI
 * part is skipped silently (best effort).
 *
 * @returns {{discovered: string[], removed: string[], unchanged: string[]}}
 *   Names added during this call, names removed (always empty), and names
 *   that were already registered. Each name is reported at most once.
 */
export function refreshRegistry() {
  const discovered = [];
  const removed = [];
  const unchanged = [];
  // Guards against reporting a name twice when a source lists it repeatedly.
  const seen = new Set();

  // Record `name` as unchanged, or register a complete placeholder entry so
  // consumers can rely on tier/cost/strengths/weaknesses always being present.
  const register = (name, provider) => {
    if (seen.has(name)) return;
    seen.add(name);
    if (MODELS[name]) {
      unchanged.push(name);
      return;
    }
    MODELS[name] = {
      provider,
      tier: 'execute',
      cost: 'moderate',
      strengths: [],
      weaknesses: [],
      uncalibrated: true,
    };
    discovered.push(name);
  };

  for (const alias of ['haiku', 'sonnet', 'opus']) {
    register(alias, 'claude');
  }

  try {
    // stderr is discarded through `stdio` rather than a shell redirect so the
    // command line stays valid on shells without /dev/null.
    const raw = execSync('codex debug models', {
      timeout: 8000,
      encoding: 'utf8',
      stdio: ['ignore', 'pipe', 'ignore'],
    });

    let parsed;
    try { parsed = JSON.parse(raw); } catch { parsed = null; }

    // Accept either a bare array or an object with a `models` array.
    let list = [];
    if (Array.isArray(parsed)) list = parsed;
    else if (Array.isArray(parsed?.models)) list = parsed.models;

    for (const entry of list) {
      const name = typeof entry === 'string' ? entry : entry?.id ?? entry?.name;
      if (!name) continue;
      register(name, 'openai');
    }
  } catch {
    // codex not available
  }

  return { discovered, removed, unchanged };
}
|
|
738
|
+
|
|
739
|
+
// ─── CLI ─────────────────────────────────────────────────────────────────────
|
|
740
|
+
|
|
741
|
+
/**
 * Format a success-rate record for CLI output.
 *
 * @param {{rate: number, total: number}|null|undefined} r - Record with a
 *   0..1 `rate` and a sample count, or a nullish value when there is no data.
 * @returns {string} e.g. "75% (8 samples)", or "no data" for null/undefined.
 */
function fmtRate(r) {
  // Nullish check: undefined must not fall through to the property reads.
  if (r == null) return 'no data';
  const percent = (r.rate * 100).toFixed(0);
  return `${percent}% (${r.total} samples)`;
}
|
|
745
|
+
|
|
746
|
+
/**
 * Print `rows` to stdout as a plain-text table.
 *
 * Output is a header line, a dashed divider, then one line per row. Every
 * column is padded to the width of its longest cell (or its label) and
 * columns are separated by a single space. Null/undefined cells print as
 * empty strings.
 *
 * @param {object[]} rows - Records to print.
 * @param {{key: string, label: string}[]} cols - Column definitions, in order.
 */
function printTable(rows, cols) {
  const cellText = (row, col) => String(row[col.key] ?? '');

  const widths = cols.map((col) => {
    let width = col.label.length;
    for (const row of rows) {
      width = Math.max(width, cellText(row, col).length);
    }
    return width;
  });

  const renderLine = (texts) => texts.map((text, i) => text.padEnd(widths[i])).join(' ');

  console.log(renderLine(cols.map((col) => col.label)));
  console.log(widths.map((w) => '-'.repeat(w)).join(' '));
  rows.forEach((row) => {
    console.log(renderLine(cols.map((col) => cellText(row, col))));
  });
}
|
|
756
|
+
|
|
757
|
+
// CLI entry point: runs only when this module is the script Node was started
// with. Modes, checked in this order: --caps <model>, --dispatch <model>,
// --refresh, --success-rates, --best-for <intent> [--provider ..] [--max-cost ..];
// with no recognised flag the capability table is printed.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  const args = process.argv.slice(2);

  // Value following `flag`, or null when the flag is absent, is the last
  // token, or is directly followed by another `--option` (which must not be
  // mistaken for the value).
  const optionValue = (flag) => {
    const idx = args.indexOf(flag);
    if (idx === -1) return null;
    const value = args[idx + 1];
    return value === undefined || value.startsWith('--') ? null : value;
  };

  // Report an error on stderr and terminate with exit code 1.
  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };

  if (args.includes('--caps')) {
    const modelName = optionValue('--caps');
    if (!modelName) fail('Usage: --caps <model>');
    const cap = getCapabilities(modelName);
    if (!cap) fail(`Unknown model: ${modelName}`);
    console.log(JSON.stringify(cap, null, 2));

  } else if (args.includes('--dispatch')) {
    const modelName = optionValue('--dispatch');
    if (!modelName) fail('Usage: --dispatch <model>');
    const dc = getDispatchConfig(modelName);
    if (!dc) fail(`Unknown model: ${modelName}`);
    console.log(JSON.stringify(dc, null, 2));

  } else if (args.includes('--refresh')) {
    const result = refreshRegistry();
    console.log('Registry refresh:');
    console.log(` Discovered: ${result.discovered.join(', ') || 'none'}`);
    console.log(` Unchanged: ${result.unchanged.join(', ') || 'none'}`);
    console.log(` Removed: ${result.removed.join(', ') || 'none'}`);

  } else if (args.includes('--success-rates')) {
    const outcomes = readOutcomes();
    const modelNames = [...new Set(outcomes.map(o => o.model))];
    if (modelNames.length === 0) {
      console.log('No outcome data recorded yet.');
    } else {
      // One row per model seen in the outcome log, most-sampled first.
      const rows = modelNames.map(m => {
        const r = getSuccessRate(m, null, { minSamples: 1 });
        return { model: m, rate: r ? fmtRate(r) : 'no data', total: r?.total ?? 0 };
      }).sort((a, b) => b.total - a.total);
      printTable(rows, [
        { key: 'model', label: 'Model' },
        { key: 'rate', label: 'Success Rate' },
        { key: 'total', label: 'Samples' },
      ]);
    }

  } else if (args.includes('--best-for')) {
    const intent = optionValue('--best-for');
    const provider = optionValue('--provider');
    const maxCost = optionValue('--max-cost');

    if (!intent) {
      fail('Usage: --best-for <intent> [--provider claude|openai] [--max-cost minimal|low|moderate|expensive]');
    }

    const result = getBestModelFor(intent, provider, { ...(maxCost ? { maxCost } : {}) });
    if (!result) {
      console.log(`No matching model found for intent "${intent}"`);
    } else {
      console.log(`Best model for "${intent}"${provider ? ` (${provider})` : ''}:`);
      console.log(` Model: ${result.model}`);
      console.log(` Provider: ${result.provider}`);
      console.log(` Success rate: ${result.successRate !== null ? (result.successRate * 100).toFixed(0) + '%' : 'n/a'}`);
      console.log(` Samples: ${result.sampleSize}`);
      console.log(` Reason: ${result.reason}`);
      console.log(` Dispatch: ${result.dispatch?.example || 'n/a'}`);
    }

  } else {
    // Default mode: one summary row per entry in MODEL_CAPABILITIES.
    const rows = Object.entries(MODEL_CAPABILITIES).map(([name, cap]) => ({
      name,
      provider: cap.provider,
      tier: cap.tier,
      cost: cap.cost,
      context: cap.contextWindow >= 1_000_000 ? `${(cap.contextWindow / 1_000_000).toFixed(1)}M` : `${(cap.contextWindow / 1000).toFixed(0)}K`,
      output: `${(cap.maxOutput / 1000).toFixed(0)}K`,
      effort: cap.reasoning?.defaultEffort ?? '-',
      latency: cap.latency,
      strengths: cap.strengths.slice(0, 4).join(', '),
    }));
    printTable(rows, [
      { key: 'name', label: 'Model' },
      { key: 'provider', label: 'Provider' },
      { key: 'tier', label: 'Tier' },
      { key: 'cost', label: 'Cost' },
      { key: 'context', label: 'Context' },
      { key: 'output', label: 'Output' },
      { key: 'effort', label: 'Effort' },
      { key: 'latency', label: 'Latency' },
      { key: 'strengths', label: 'Top Strengths' },
    ]);
  }
}
|