mohdel 0.90.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +377 -0
  3. package/config/benchmarks.json +39 -0
  4. package/js/client/call.js +75 -0
  5. package/js/client/call_image.js +82 -0
  6. package/js/client/gate-binary.js +72 -0
  7. package/js/client/index.js +16 -0
  8. package/js/client/ndjson.js +29 -0
  9. package/js/client/transport.js +48 -0
  10. package/js/core/envelope.js +141 -0
  11. package/js/core/errors.js +75 -0
  12. package/js/core/events.js +96 -0
  13. package/js/core/image.js +58 -0
  14. package/js/core/index.js +10 -0
  15. package/js/core/status.js +48 -0
  16. package/js/factory/bridge.js +372 -0
  17. package/js/session/_cooldown.js +114 -0
  18. package/js/session/_logger.js +138 -0
  19. package/js/session/_rate_limiter.js +77 -0
  20. package/js/session/_tracing.js +58 -0
  21. package/js/session/adapters/_cancelled.js +44 -0
  22. package/js/session/adapters/_catalog.js +58 -0
  23. package/js/session/adapters/_chat_completions.js +439 -0
  24. package/js/session/adapters/_errors.js +85 -0
  25. package/js/session/adapters/_images.js +60 -0
  26. package/js/session/adapters/_lazy_json_cache.js +76 -0
  27. package/js/session/adapters/_pricing.js +67 -0
  28. package/js/session/adapters/_providers.js +60 -0
  29. package/js/session/adapters/_tools.js +185 -0
  30. package/js/session/adapters/_videos.js +283 -0
  31. package/js/session/adapters/anthropic.js +397 -0
  32. package/js/session/adapters/cerebras.js +28 -0
  33. package/js/session/adapters/deepseek.js +32 -0
  34. package/js/session/adapters/echo.js +51 -0
  35. package/js/session/adapters/fake.js +262 -0
  36. package/js/session/adapters/fireworks.js +46 -0
  37. package/js/session/adapters/gemini.js +381 -0
  38. package/js/session/adapters/groq.js +23 -0
  39. package/js/session/adapters/image/fake.js +55 -0
  40. package/js/session/adapters/image/index.js +40 -0
  41. package/js/session/adapters/image/novita.js +135 -0
  42. package/js/session/adapters/image/openai.js +50 -0
  43. package/js/session/adapters/index.js +53 -0
  44. package/js/session/adapters/mistral.js +31 -0
  45. package/js/session/adapters/novita.js +29 -0
  46. package/js/session/adapters/openai.js +381 -0
  47. package/js/session/adapters/openrouter.js +66 -0
  48. package/js/session/adapters/xai.js +27 -0
  49. package/js/session/bin.js +54 -0
  50. package/js/session/driver.js +160 -0
  51. package/js/session/index.js +18 -0
  52. package/js/session/run.js +393 -0
  53. package/js/session/run_image.js +61 -0
  54. package/package.json +107 -0
  55. package/src/cli/ask.js +160 -0
  56. package/src/cli/backup.js +107 -0
  57. package/src/cli/bench.js +262 -0
  58. package/src/cli/check.js +123 -0
  59. package/src/cli/colored-logger.js +67 -0
  60. package/src/cli/colors.js +13 -0
  61. package/src/cli/default.js +39 -0
  62. package/src/cli/index.js +150 -0
  63. package/src/cli/json-output.js +60 -0
  64. package/src/cli/model.js +571 -0
  65. package/src/cli/onboard.js +232 -0
  66. package/src/cli/rank.js +176 -0
  67. package/src/cli/ratelimit.js +160 -0
  68. package/src/cli/tag.js +105 -0
  69. package/src/lib/assets/alibaba.svg +1 -0
  70. package/src/lib/assets/anthropic.svg +5 -0
  71. package/src/lib/assets/deepseek.svg +1 -0
  72. package/src/lib/assets/gemini.svg +1 -0
  73. package/src/lib/assets/google.svg +2 -0
  74. package/src/lib/assets/kwaipilot.svg +1 -0
  75. package/src/lib/assets/meta.svg +1 -0
  76. package/src/lib/assets/minimax.svg +9 -0
  77. package/src/lib/assets/moonshotai.svg +4 -0
  78. package/src/lib/assets/openai.svg +5 -0
  79. package/src/lib/assets/xai.svg +1 -0
  80. package/src/lib/assets/xiaomi.svg +2 -0
  81. package/src/lib/assets/zai.svg +219 -0
  82. package/src/lib/benchmark-score.js +215 -0
  83. package/src/lib/benchmark-truth.js +68 -0
  84. package/src/lib/cache.js +76 -0
  85. package/src/lib/common.js +208 -0
  86. package/src/lib/cooldown.js +63 -0
  87. package/src/lib/creators.js +71 -0
  88. package/src/lib/curated-cache.js +146 -0
  89. package/src/lib/errors.js +126 -0
  90. package/src/lib/index.js +726 -0
  91. package/src/lib/logger.js +29 -0
  92. package/src/lib/providers.js +87 -0
  93. package/src/lib/rank.js +390 -0
  94. package/src/lib/rate-limiter.js +50 -0
  95. package/src/lib/schema.js +150 -0
  96. package/src/lib/select.js +474 -0
  97. package/src/lib/tracing.js +62 -0
  98. package/src/lib/utils.js +85 -0
package/package.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "name": "mohdel",
3
+ "version": "0.90.0",
4
+ "license": "MIT",
5
+ "author": {
6
+ "name": "Christophe Le Bars",
7
+ "email": "clb@toort.net"
8
+ },
9
+ "description": "Self-hosted LLM gateway with an embeddable SDK. Process-isolated, OpenTelemetry-native inference across 11 providers — streaming, tools, thinking control — without orchestration. Use the Node factory in-process, or run thin-gate for fault isolation and any-language HTTP callers.",
10
+ "type": "module",
11
+ "repository": {
12
+ "type": "git",
13
+ "url": "git+https://github.com/clbrge/mohdel.git"
14
+ },
15
+ "homepage": "https://github.com/clbrge/mohdel#readme",
16
+ "bugs": {
17
+ "url": "https://github.com/clbrge/mohdel/issues"
18
+ },
19
+ "engines": {
20
+ "node": ">=22"
21
+ },
22
+ "main": "src/lib/index.js",
23
+ "exports": {
24
+ ".": "./src/lib/index.js",
25
+ "./providers": "./src/lib/providers.js",
26
+ "./creators": "./src/lib/creators.js",
27
+ "./utils": "./src/lib/utils.js",
28
+ "./errors": "./src/lib/errors.js",
29
+ "./client": "./js/client/index.js",
30
+ "./session": "./js/session/index.js",
31
+ "./session/bin": "./js/session/bin.js"
32
+ },
33
+ "imports": {
34
+ "#core": "./js/core/index.js",
35
+ "#core/*": "./js/core/*"
36
+ },
37
+ "bin": {
38
+ "mo": "./src/cli/index.js"
39
+ },
40
+ "files": [
41
+ "js",
42
+ "src/lib",
43
+ "src/cli",
44
+ "config",
45
+ "README.md",
46
+ "LICENSE"
47
+ ],
48
+ "publishConfig": {
49
+ "registry": "https://registry.npmjs.org",
50
+ "access": "public",
51
+ "provenance": true
52
+ },
53
+ "scripts": {
54
+ "lint": "standard",
55
+ "test": "vitest run test/unit",
56
+ "prerelease": "npm run lint && npm run test",
57
+ "release": "release-it",
58
+ "test:provider": "vitest run test/integration/provider.test.js",
59
+ "test:multiturn": "vitest run test/integration/multiturn.test.js",
60
+ "test:vision": "vitest run test/integration/vision.test.js",
61
+ "test:live": "vitest run test/live"
62
+ },
63
+ "release-it": {
64
+ "hooks": {
65
+ "after:bump": "node scripts/sync-version.js"
66
+ },
67
+ "git": {
68
+ "commitMessage": "release: v${version}",
69
+ "requireUpstream": false,
70
+ "tagName": "v${version}",
71
+ "push": true
72
+ },
73
+ "npm": {
74
+ "publish": false
75
+ },
76
+ "github": {
77
+ "release": true,
78
+ "releaseName": "mohdel v${version}",
79
+ "releaseNotes": "awk '/^## \\[${version}\\]/{flag=1;next}/^## \\[/{flag=0}flag' CHANGELOG.md"
80
+ }
81
+ },
82
+ "optionalDependencies": {
83
+ "@clack/prompts": "^1.2.0",
84
+ "@opentelemetry/exporter-trace-otlp-grpc": "^0.215.0",
85
+ "@opentelemetry/sdk-node": "^0.215.0",
86
+ "chalk": "^5.4.0",
87
+ "mohdel-thin-gate-linux-x64-gnu": "0.90.0"
88
+ },
89
+ "dependencies": {
90
+ "@anthropic-ai/sdk": "^0.90.0",
91
+ "@cerebras/cerebras_cloud_sdk": "^1.61.1",
92
+ "@google/genai": "^1.50.1",
93
+ "@opentelemetry/api": "^1.9.1",
94
+ "env-paths": "^4.0.0",
95
+ "groq-sdk": "^1.1.2",
96
+ "openai": "^6.34.0"
97
+ },
98
+ "lint-staged": {
99
+ "*.{js,cjs}": "standard"
100
+ },
101
+ "devDependencies": {
102
+ "lint-staged": "^16.4.0",
103
+ "release-it": "^20.0.0",
104
+ "standard": "^17.1.2",
105
+ "vitest": "^4.1.5"
106
+ }
107
+ }
package/src/cli/ask.js ADDED
@@ -0,0 +1,160 @@
1
+ import mohdel, { silent } from '../lib/index.js'
2
+ import { loadDefaultEnv } from '../lib/common.js'
3
+
4
const noop = () => {}

/**
 * `mo ask` — run one inference against a model and print the answer.
 *
 * The prompt is built from the positional arguments after the model id and,
 * when stdin is not a TTY, from the piped stdin content (joined by a blank line).
 * The answer goes to stdout (raw text, or a JSON document with `--json`);
 * the model id and a token/cost/timing summary go to stderr.
 *
 * Exits the process with code 0 after printing help, and with code 1 on
 * invalid arguments, a missing prompt, an unknown model, or an inference error.
 *
 * @param {string[]} args - CLI arguments after `ask`. Not modified.
 * @returns {Promise<void>}
 */
export async function runAsk (args) {
  if (args.includes('-h') || args.includes('--help')) {
    console.log(`mohdel ask — one-shot inference, pipeable

Usage:
mo ask <model> [prompt] Prompt from args
echo "prompt" | mo ask <model> Prompt from stdin
mo ask <model> "question" < file Combined: args + stdin

Options:
--effort <level> Thinking effort: high, medium, low, none
--budget <tokens> Output token budget
--json Output full result as JSON
--stream Stream output to stdout in real time
-v, --verbose Show debug info on stderr (cooldown, rate limit, SDK calls)

Output:
stdout: model output text (raw, no formatting — or JSON with --json)
stderr: model name + token usage summary

Examples:
mo ask gemini/gemini-3-flash-preview "why is the sky blue"
cat article.txt | mo ask anthropic/claude-sonnet-4-6 "summarize this"
mo ask openai/gpt-5.4 --effort high "explain monads" --json | jq .cost`)
    process.exit(0)
  }

  // Parse flags on a copy so the caller's array is left untouched.
  const rest = [...args]

  // Boolean flag: removes the first occurrence and reports whether it was present.
  const flag = (name) => {
    const idx = rest.indexOf(name)
    if (idx === -1) return false
    rest.splice(idx, 1)
    return true
  }
  // Valued flag: removes `name` and the argument after it, returning that argument.
  const flagVal = (name) => {
    const idx = rest.indexOf(name)
    if (idx === -1) return undefined
    const val = rest[idx + 1]
    if (val === undefined) {
      // A trailing `--effort` / `--budget` used to be dropped silently.
      console.error(`Option ${name} requires a value`)
      process.exit(1)
    }
    rest.splice(idx, 2)
    return val
  }

  const json = flag('--json')
  const stream = flag('--stream')
  // Consume BOTH spellings: with `||` directly on the calls, a `-v` given
  // alongside `--verbose` stayed in the list and was taken as model or prompt.
  const verboseLong = flag('--verbose')
  const verboseShort = flag('-v')
  const verbose = verboseLong || verboseShort
  const effort = flagVal('--effort')
  const budget = flagVal('--budget')

  // Reject a non-numeric budget up front instead of sending NaN to the model.
  let outputBudget
  if (budget) {
    outputBudget = Number.parseInt(budget, 10)
    if (!Number.isFinite(outputBudget) || outputBudget < 0) {
      console.error(`Invalid --budget value: ${budget}`)
      process.exit(1)
    }
  }

  // First remaining arg is model
  const modelId = rest[0]
  if (!modelId) {
    console.error('Usage: mo ask <model> [prompt]')
    process.exit(1)
  }

  // Arguments are valid: only now touch the environment.
  loadDefaultEnv()

  // Remaining args form the prompt
  const promptArgs = rest.slice(1).join(' ').trim()

  // Read stdin if piped
  let stdinContent = ''
  if (!process.stdin.isTTY) {
    const chunks = []
    for await (const chunk of process.stdin) chunks.push(chunk)
    stdinContent = Buffer.concat(chunks).toString('utf8').trim()
  }

  // Build prompt: args + stdin
  const prompt = [promptArgs, stdinContent].filter(Boolean).join('\n\n')

  if (!prompt) {
    console.error('No prompt provided. Pass as argument or pipe via stdin.')
    process.exit(1)
  }

  const log = verbose
    ? (...parts) => process.stderr.write(`${parts.map(p => typeof p === 'string' ? p : JSON.stringify(p)).join(' ')}\n`)
    : noop
  // Verbose mode routes debug+info+warn+error+fatal to stderr; any other level
  // keeps whatever `silent` provides.
  // Non-verbose: `log` is a no-op, so every level listed here is silent and
  // failures are reported only through the catch blocks below.
  const askLogger = {
    ...silent,
    debug: log,
    info: log,
    warn: log,
    error: log,
    fatal: log
  }
  const mo = await mohdel({ logger: askLogger })
  let model
  try {
    model = mo.use(modelId)
  } catch (err) {
    console.error(err.message)
    process.exit(1)
  }

  const options = {}
  if (effort) options.outputEffort = effort
  if (outputBudget !== undefined) options.outputBudget = outputBudget
  if (stream && !json) {
    // Forward each delta immediately: smallest buffer, no flush delay.
    options.realtimeHandler = (delta) => process.stdout.write(delta)
    options.bufferOpts = { maxChars: 1, maxMs: 0 }
  }

  process.stderr.write(`${model.id}\n`)

  try {
    const result = await model.answer(prompt, options)
    const output = typeof result === 'string' ? result : result?.output || ''
    // `typeof null` is 'object', so guard null explicitly before reading fields.
    const tokens = result !== null && typeof result === 'object' ? result : {}

    if (json) {
      console.log(JSON.stringify({
        model: model.id,
        output,
        inputTokens: tokens.inputTokens || 0,
        outputTokens: tokens.outputTokens || 0,
        thinkingTokens: tokens.thinkingTokens || 0,
        cost: tokens.cost ?? null,
        status: tokens.status || 'completed'
      }, null, 2))
    } else if (!stream) {
      process.stdout.write(output)
      if (output && !output.endsWith('\n')) process.stdout.write('\n')
    } else {
      // Stream already wrote to stdout; ensure trailing newline
      if (output && !output.endsWith('\n')) process.stdout.write('\n')
    }

    // Token + timing summary to stderr
    const summary = []
    if (tokens.inputTokens) summary.push(`${tokens.inputTokens} in`)
    if (tokens.outputTokens) summary.push(`${tokens.outputTokens} out`)
    if (tokens.thinkingTokens) summary.push(`${tokens.thinkingTokens} think`)
    if (tokens.cost != null) summary.push(`$${tokens.cost.toFixed(4)}`)
    const ts = tokens.timestamps
    if (ts) {
      // Difference b - a divided by 1e6 and shown as ms
      // (presumably nanosecond timestamps — confirm against the library).
      const toMs = (a, b) => {
        if (!a || !b) return null
        try {
          return Number(BigInt(b) - BigInt(a)) / 1e6
        } catch {
          // The summary is best-effort: a timestamp that BigInt() cannot
          // convert must not turn a successful answer into exit code 1.
          return null
        }
      }
      const ttft = toMs(ts.start, ts.first)
      const total = toMs(ts.start, ts.end)
      if (ttft != null) summary.push(`${Math.round(ttft)}ms ttft`)
      if (total != null) summary.push(`${Math.round(total)}ms total`)
    }
    if (summary.length) process.stderr.write(`${summary.join(', ')}\n`)
  } catch (err) {
    console.error(`Error: ${err.detail || err.message}`)
    process.exit(1)
  }
}
package/src/cli/backup.js ADDED
@@ -0,0 +1,107 @@
1
import { existsSync } from 'fs'
import { readFile, writeFile, copyFile, stat } from 'fs/promises'
import { CURATED_PATH, BACKUP_SLOTS } from '../lib/common.js'
import { id, meta, ok, err, warn } from './colors.js'
5
+
6
/**
 * `model backup` — inspect and restore backups of the curated catalog.
 *
 * Actions:
 *   list            print the current catalog and every slot in BACKUP_SLOTS
 *   restore <slot>  replace the current catalog with the slot's content,
 *                   saving the previous current catalog to the `.prev` file
 *   diff <slot>     print keys added / removed / changed versus the slot
 *
 * Backup files live next to the catalog as `CURATED_PATH + '.' + slot`.
 * Exits the process with code 0 after help and code 1 on usage errors,
 * missing files, or a catalog/backup that is not a JSON object.
 *
 * @param {string[]} args - `[action, slot]`
 * @returns {Promise<void>}
 */
export async function runBackup (args) {
  const [action, slot] = args

  if (!action || action === '-h' || action === '--help') {
    console.log(`mohdel model backup — manage catalog backups

Usage:
model backup list Show backup slots with timestamps
model backup restore <slot> Restore from a backup slot
model backup diff <slot> Show changes between current and slot

Slots: prev (last save), daily (first save of the day), weekly (first save of the week)`)
    process.exit(0)
  }

  // Parse catalog text; report and exit when it is not a JSON object.
  const parseCatalog = (text, what) => {
    let data
    try {
      data = JSON.parse(text)
    } catch (e) {
      console.error(err(`${what} is not valid JSON: ${e.message}`))
      process.exit(1)
    }
    if (data === null || typeof data !== 'object') {
      console.error(err(`${what} does not contain a catalog object`))
      process.exit(1)
    }
    return data
  }

  // Model-count label for `list`; one unreadable file must not abort the listing.
  const countLabel = async (path) => {
    try {
      const entries = JSON.parse(await readFile(path, 'utf8'))
      return meta(`${Object.keys(entries).length} models`)
    } catch {
      return meta('(unreadable)')
    }
  }

  if (action === 'list') {
    const current = existsSync(CURATED_PATH) ? await stat(CURATED_PATH) : null
    if (current) {
      console.log(` ${ok('●')} current ${meta(fmtDate(current.mtimeMs))} ${await countLabel(CURATED_PATH)}`)
    } else {
      console.log(` ${meta('○')} current ${meta('(no catalog)')}`)
    }
    for (const s of BACKUP_SLOTS) {
      const path = CURATED_PATH + '.' + s
      if (existsSync(path)) {
        const st = await stat(path)
        console.log(` ${ok('●')} ${id(s.padEnd(7))} ${meta(fmtDate(st.mtimeMs))} ${await countLabel(path)}`)
      } else {
        console.log(` ${meta('○')} ${meta(s.padEnd(7))} ${meta('(empty)')}`)
      }
    }
    return
  }

  if (action === 'restore') {
    if (!slot || !BACKUP_SLOTS.includes(slot)) {
      console.error(`Usage: model backup restore <${BACKUP_SLOTS.join('|')}>`)
      process.exit(1)
    }
    const backupPath = CURATED_PATH + '.' + slot
    if (!existsSync(backupPath)) {
      console.error(err(`No backup in slot "${slot}"`))
      process.exit(1)
    }
    // Load and validate the backup BEFORE rotating. Rotation writes to the
    // `.prev` file, so when <slot> is `prev` (a slot named in the help text)
    // copying first would overwrite the very backup being restored and then
    // "restore" the current catalog onto itself. Reading first also keeps an
    // invalid backup from ever replacing the live catalog.
    const backupBytes = await readFile(backupPath)
    const entries = parseCatalog(backupBytes.toString('utf8'), `Backup in slot "${slot}"`)

    // Rotate current to .prev before restoring
    if (existsSync(CURATED_PATH)) {
      await copyFile(CURATED_PATH, CURATED_PATH + '.prev')
    }
    // Write the original bytes so the restored file matches the backup exactly.
    await writeFile(CURATED_PATH, backupBytes)
    console.log(`${ok('✓')} Restored from ${id(slot)} (${Object.keys(entries).length} models). Previous state saved to ${meta('prev')}.`)
    return
  }

  if (action === 'diff') {
    if (!slot || !BACKUP_SLOTS.includes(slot)) {
      console.error(`Usage: model backup diff <${BACKUP_SLOTS.join('|')}>`)
      process.exit(1)
    }
    const backupPath = CURATED_PATH + '.' + slot
    if (!existsSync(backupPath)) {
      console.error(err(`No backup in slot "${slot}"`))
      process.exit(1)
    }
    if (!existsSync(CURATED_PATH)) {
      console.error(err('No current catalog'))
      process.exit(1)
    }

    const current = parseCatalog(await readFile(CURATED_PATH, 'utf8'), 'Current catalog')
    const backup = parseCatalog(await readFile(backupPath, 'utf8'), `Backup in slot "${slot}"`)
    const currentKeys = new Set(Object.keys(current))
    const backupKeys = new Set(Object.keys(backup))

    const added = [...currentKeys].filter(k => !backupKeys.has(k))
    const removed = [...backupKeys].filter(k => !currentKeys.has(k))
    // Entries are compared by their serialized form (property order matters).
    const changed = [...currentKeys].filter(k => backupKeys.has(k) && JSON.stringify(current[k]) !== JSON.stringify(backup[k]))

    if (!added.length && !removed.length && !changed.length) {
      console.log(meta('No differences'))
      return
    }

    for (const k of added) console.log(`${ok('+')} ${id(k)}`)
    for (const k of removed) console.log(`${err('-')} ${id(k)}`)
    for (const k of changed) console.log(`${warn('~')} ${id(k)}`)
    console.log(meta(`\n${added.length} added, ${removed.length} removed, ${changed.length} changed`))
    return
  }

  console.error(`Unknown action: ${action}. Run "model backup --help".`)
  process.exit(1)
}
103
+
104
/**
 * Format an epoch-milliseconds timestamp as "<locale date> <locale time>".
 *
 * @param {number} ms - milliseconds since the Unix epoch
 * @returns {string}
 */
function fmtDate (ms) {
  const when = new Date(ms)
  const datePart = when.toLocaleDateString()
  const timePart = when.toLocaleTimeString()
  return `${datePart} ${timePart}`
}
package/src/cli/bench.js ADDED
@@ -0,0 +1,262 @@
1
+ import { id, label, meta, price, err } from './colors.js'
2
+ import fs from 'node:fs/promises'
3
+ import path from 'node:path'
4
+ import mohdel from '../lib/index.js'
5
+ import { loadDefaultEnv } from '../lib/common.js'
6
+ import {
7
+ loadPrompt, parseJson, scoreCorrectness, computeCost,
8
+ computeTiming, formatNumber
9
+ } from '../lib/benchmark-score.js'
10
+
11
/**
 * `model bench` — benchmark one model, or every model carrying given tags.
 *
 * With one or more `--tag` options the suite runner is used; otherwise the
 * first positional argument is the model id for a single-model run.
 * Exits the process with code 0 after help and code 1 on invalid arguments.
 *
 * @param {string[]} args - CLI arguments after `bench`. Not modified.
 * @returns {Promise<void>}
 */
export async function runBench (args) {
  if (args.includes('-h') || args.includes('--help')) {
    console.log(`mohdel model bench — benchmark models with live inference

Usage:
model bench <model> [options] Benchmark a single model
model bench --tag <tag> [options] Benchmark all models with a tag

Options:
--effort <level> Thinking effort: high, medium, low, none
--budget <tokens> Output token budget (default: 12000)
--prompt <path> Prompt file (default: test/benchmark.md)
--save <path> Save results to JSON file
--json Output as JSON (single model only)

Examples:
mo bench anthropic/claude-sonnet-4-6
mo bench --tag fast --effort low
mo bench openai/gpt-5 --budget 8000 --save results.json`)
    process.exit(0)
  }

  // Parse flags on a copy so the caller's array is left untouched.
  const rest = [...args]

  // Boolean flag: removes the first occurrence and reports whether it was present.
  const flag = (name) => {
    const idx = rest.indexOf(name)
    if (idx === -1) return false
    rest.splice(idx, 1)
    return true
  }
  // Valued flag: removes `name` and the argument after it, returning that argument.
  const flagVal = (name) => {
    const idx = rest.indexOf(name)
    if (idx === -1) return undefined
    const val = rest[idx + 1]
    if (val === undefined) {
      // A trailing valued option used to be dropped silently.
      console.error(`Option ${name} requires a value`)
      process.exit(1)
    }
    rest.splice(idx, 2)
    return val
  }

  const json = flag('--json')
  const effort = flagVal('--effort')
  const budgetArg = flagVal('--budget')
  const budget = Number.parseInt(budgetArg || '12000', 10)
  if (!Number.isFinite(budget) || budget < 0) {
    // parseInt yields NaN for non-numeric input; never forward that as a budget.
    console.error(`Invalid --budget value: ${budgetArg}`)
    process.exit(1)
  }
  const promptPath = flagVal('--prompt') || 'test/benchmark.md'
  const savePath = flagVal('--save')
  // `--tag` may be repeated; collect every occurrence.
  const tags = []
  let t
  while ((t = flagVal('--tag'))) tags.push(t)

  const modelId = rest[0]
  if (!tags.length && !modelId) {
    console.error('Provide a model ID or --tag. Run "mo model bench --help".')
    process.exit(1)
  }

  // Arguments are valid: only now touch the environment and the library.
  loadDefaultEnv()
  const mo = await mohdel()

  if (tags.length) {
    await runSuite(mo, { tags, effort, budget, promptPath, savePath })
  } else {
    await runSingle(mo, modelId, { effort, budget, promptPath, savePath, json })
  }
}
72
+
73
+ // --- Single model ---
74
+
75
/**
 * Benchmark one model and report the result.
 *
 * Loads the prompt, resolves the model and its pricing, runs the benchmark,
 * optionally writes the result to `savePath`, then prints it either as JSON
 * or as the human-readable summary.
 *
 * @param {object} mo - mohdel instance (provides `use`)
 * @param {string} modelId - model to benchmark
 * @param {object} opts - `{ effort, budget, promptPath, savePath, json }`
 * @returns {Promise<void>}
 */
async function runSingle (mo, modelId, { effort, budget, promptPath, savePath, json }) {
  const prompt = await loadPrompt(promptPath)
  const model = mo.use(modelId)
  const pricing = resolvePricing(model.info())

  const result = await benchmarkModel(model, prompt, { effort, budget, pricing })
  // Serialized lazily: only the save and --json paths need the JSON text.
  const toJson = () => JSON.stringify(result, null, 2)

  if (savePath) {
    const target = path.resolve(savePath)
    await fs.writeFile(target, toJson())
  }

  if (!json) {
    printSingleResult(result)
    return
  }
  console.log(toJson())
}
93
+
94
+ // --- Suite (multi-model by tag) ---
95
+
96
/**
 * Benchmark every model listed under the given tags and print a ranked table.
 *
 * Models are de-duplicated by `value` in first-seen order and benchmarked one
 * after another; a failing model is recorded with its error instead of
 * aborting the suite. Results are sorted by correctness-per-dollar, highest
 * first, with entries lacking that figure last. Exits with code 1 when no
 * model matches the tags.
 *
 * @param {object} mo - mohdel instance (provides `list` and `use`)
 * @param {object} opts - `{ tags, effort, budget, promptPath, savePath }`
 * @returns {Promise<void>}
 */
async function runSuite (mo, { tags, effort, budget, promptPath, savePath }) {
  const prompt = await loadPrompt(promptPath)

  // A Map keeps insertion order, so the first tag that lists a model wins.
  const unique = new Map()
  for (const tag of tags) {
    for (const entry of mo.list(tag)) {
      if (!unique.has(entry.value)) unique.set(entry.value, entry)
    }
  }
  const models = [...unique.values()]

  if (models.length === 0) {
    console.error(err(`No models found with tags: ${tags.join(', ')}`))
    process.exit(1)
  }

  const results = []
  for (const [index, { value, label: modelLabel }] of models.entries()) {
    process.stderr.write(`[${index + 1}/${models.length}] ${value}...`)

    try {
      const model = mo.use(value)
      const pricing = resolvePricing(model.info())
      const outcome = await benchmarkModel(model, prompt, { effort, budget, pricing })
      results.push(outcome)
      process.stderr.write(` ${outcome.correctness.toFixed(3)}\n`)
    } catch (e) {
      process.stderr.write(` ${err('FAILED')}: ${e.message}\n`)
      results.push({
        model: value,
        label: modelLabel,
        correctness: null,
        cost: null,
        correctnessPerDollar: null,
        error: e.message
      })
    }
  }

  results.sort((a, b) => {
    const left = a.correctnessPerDollar
    const right = b.correctnessPerDollar
    // Null scores sink to the bottom; two nulls compare equal.
    if (left === null || right === null) return Number(left === null) - Number(right === null)
    return right - left
  })

  printSuiteTable(results)

  if (savePath) {
    const target = path.resolve(savePath)
    await fs.writeFile(target, JSON.stringify(results, null, 2))
    process.stderr.write(`\nResults saved to ${path.resolve(savePath)}\n`)
  }
}
145
+
146
+ // --- Shared benchmark runner ---
147
+
148
/**
 * Run the benchmark prompt against one model and assemble the result record.
 *
 * Two requests are made in sequence, both prefixed with a unique run tag:
 * a tiny "say ack" request (budget capped at 32 tokens) and the real prompt.
 * The real response is parsed and scored; cost, timing, and throughput are
 * derived from the reported token counts and timestamps.
 *
 * @param {object} model - model handle exposing `answer`, `id`, `label`
 * @param {string} prompt - benchmark prompt text
 * @param {object} opts - `{ effort, budget, pricing }`; `pricing` may be null
 * @returns {Promise<object>} result record (model, label, correctness, cost,
 *   correctnessPerDollar, breakdown, tokens, pricing, parse, timing,
 *   throughput, latencyMs, requested, details, raw)
 */
async function benchmarkModel (model, prompt, { effort, budget, pricing }) {
  // Unique prefix per run (timestamp + random suffix) so the two requests
  // of different runs never share identical text.
  const runTag = `[run:${Date.now()}-${Math.random().toString(36).slice(2, 8)}]`
  // At most 32 tokens for the minimal request; a zero/absent budget also yields 32.
  const minimalBudget = Math.min(budget || 0, 32) || 32

  const minimalResponse = await model.answer(`${runTag} say ack`, { outputBudget: minimalBudget, outputEffort: effort })
  const response = await model.answer(`${runTag}\n${prompt}`, { outputBudget: budget, outputEffort: effort })

  const rawOutput = typeof response === 'string' ? response : response?.output || ''
  const parsed = parseJson(rawOutput)
  const scoring = scoreCorrectness(parsed)

  // `typeof null` is 'object', so the previous `typeof x === 'object'` test
  // threw on a null response; optional chaining covers null, strings, and
  // objects without timestamps, all of which fall back to `{}`.
  const minimalTiming = computeTiming(minimalResponse?.timestamps ?? {})
  const standardTiming = computeTiming(response?.timestamps ?? {})

  const generationSeconds = standardTiming.generationMs !== null ? standardTiming.generationMs / 1000 : null
  const outputTokens = Number.isFinite(response?.outputTokens) ? response.outputTokens : null

  const tokens = {
    input: response?.inputTokens ?? null,
    output: response?.outputTokens ?? null,
    thinking: response?.thinkingTokens ?? null
  }

  const costDollars = computeCost(tokens, pricing)

  return {
    model: model.id,
    label: model.label,
    correctness: formatNumber(scoring.correctness),
    cost: costDollars !== null ? formatNumber(costDollars) : null,
    // `null > 0` is false, so an unknown or zero cost yields null here.
    correctnessPerDollar: costDollars > 0 ? formatNumber(scoring.correctness / costDollars) : null,
    breakdown: Object.fromEntries(
      Object.entries(scoring.breakdown).map(([k, v]) => [k, formatNumber(v)])
    ),
    tokens,
    pricing: pricing ? { inputPerMillion: pricing.input, outputPerMillion: pricing.output, thinkingPerMillion: pricing.thinking } : null,
    parse: { ok: parsed.ok, error: parsed.ok ? null : parsed.error, extraneous: parsed.extraneous },
    timing: { minimal: minimalTiming, standard: standardTiming },
    throughput: {
      outputTokensPerSecond: outputTokens !== null && generationSeconds > 0 ? formatNumber(outputTokens / generationSeconds) : null,
      charactersPerSecond: generationSeconds > 0 ? formatNumber(rawOutput.length / generationSeconds) : null
    },
    latencyMs: { minimal: minimalTiming.latencyMs, standard: standardTiming.latencyMs },
    requested: { outputBudget: budget, outputEffort: effort || null },
    details: scoring.details,
    raw: rawOutput
  }
}
196
+
197
+ // --- Helpers ---
198
+
199
/**
 * Reduce a model's price fields to plain numbers.
 *
 * Each of `inputPrice`, `outputPrice`, `thinkingPrice` is used as-is when it
 * is a number; otherwise its `default` property is used, falling back to 0.
 *
 * @param {object|null|undefined} info - model info record
 * @returns {{input: number, output: number, thinking: number}|null}
 *   null when `info` is falsy
 */
function resolvePricing (info) {
  if (!info) return null

  const toNumber = (entry) => {
    if (typeof entry === 'number') return entry
    return entry?.default ?? 0
  }

  const [input, output, thinking] = [info.inputPrice, info.outputPrice, info.thinkingPrice].map(toNumber)
  return { input, output, thinking }
}
204
+
205
+ // --- Output ---
206
+
207
/**
 * Print the human-readable summary of one benchmark result to stdout:
 * a title line, one "name: value" line per metric, the optional per-category
 * breakdown, and a trailing blank line. Missing values are shown as "—".
 *
 * @param {object} r - result record produced by the benchmark runner
 */
function printSingleResult (r) {
  const DASH = '—'

  console.log(`\n${label(r.label)} ${meta(`(${r.model})`)}`)

  const correctness = r.correctness?.toFixed(3) || DASH
  console.log(`${meta('correctness:')} ${id(correctness)}`)

  const cost = r.cost != null ? price('$' + r.cost.toFixed(4)) : DASH
  console.log(`${meta('cost:')} ${cost}`)

  const perDollar = r.correctnessPerDollar?.toFixed(1) || DASH
  console.log(`${meta('corr/$:')} ${perDollar}`)

  const latency = r.latencyMs.standard != null ? r.latencyMs.standard.toFixed(0) + 'ms' : DASH
  console.log(`${meta('latency:')} ${latency}`)

  const tokensPerSecond = r.throughput.outputTokensPerSecond || DASH
  console.log(`${meta('throughput:')} ${tokensPerSecond} tok/s`)

  const { input, output, thinking } = r.tokens
  console.log(`${meta('tokens:')} ${input || 0} in, ${output || 0} out, ${thinking || 0} thinking`)

  if (r.breakdown) {
    const parts = []
    for (const [key, score] of Object.entries(r.breakdown)) {
      parts.push(`${key}=${score?.toFixed(3) || DASH}`)
    }
    console.log(`${meta('breakdown:')} ${parts.join(' ')}`)
  }
  console.log()
}
222
+
223
/**
 * Right-pad the string form of `str` with spaces up to `len` characters.
 * Values already at least `len` long are returned unchanged (never truncated).
 *
 * @param {*} str - value to render; converted with String()
 * @param {number} len - minimum width
 * @returns {string}
 */
const pad = (str, len) => String(str).padEnd(len)
227
+
228
/**
 * Render a table cell: the number with three decimals, or "-" when the value
 * is null/undefined, right-padded to `width`.
 *
 * @param {number|null|undefined} v - value to format
 * @param {number} width - column width
 * @returns {string}
 */
const fmtNum = (v, width) => {
  // `== null` matches exactly null and undefined.
  const text = v == null ? '-' : v.toFixed(3)
  return pad(text, width)
}
232
+
233
/**
 * Print the suite results as a fixed-width table on stdout: a header, a rule,
 * then one row per result. Failed entries show "FAILED: <error>" in place of
 * the numeric columns.
 *
 * @param {object[]} results - result records, already in display order
 */
function printSuiteTable (results) {
  const colModel = 35
  const colNum = 9

  const numericTitles = ['Correct', 'Cost($)', 'Corr/$', 'Entities', 'Metrics', 'Contrad.']
  const header = pad('Model', colModel) + numericTitles.map((title) => pad(title, colNum)).join('')
  console.log('\n' + meta(header))
  console.log(meta('─'.repeat(header.length)))

  for (const r of results) {
    const nameCell = pad(r.model, colModel)
    if (r.error) {
      console.log(nameCell + err('FAILED: ' + r.error))
      continue
    }
    // Same column order as the header titles above.
    const values = [
      r.correctness,
      r.cost,
      r.correctnessPerDollar,
      r.breakdown?.entities,
      r.breakdown?.metrics,
      r.breakdown?.contradictions
    ]
    console.log(nameCell + values.map((value) => fmtNum(value, colNum)).join(''))
  }
  console.log()
}