mohdel 0.90.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +377 -0
- package/config/benchmarks.json +39 -0
- package/js/client/call.js +75 -0
- package/js/client/call_image.js +82 -0
- package/js/client/gate-binary.js +72 -0
- package/js/client/index.js +16 -0
- package/js/client/ndjson.js +29 -0
- package/js/client/transport.js +48 -0
- package/js/core/envelope.js +141 -0
- package/js/core/errors.js +75 -0
- package/js/core/events.js +96 -0
- package/js/core/image.js +58 -0
- package/js/core/index.js +10 -0
- package/js/core/status.js +48 -0
- package/js/factory/bridge.js +372 -0
- package/js/session/_cooldown.js +114 -0
- package/js/session/_logger.js +138 -0
- package/js/session/_rate_limiter.js +77 -0
- package/js/session/_tracing.js +58 -0
- package/js/session/adapters/_cancelled.js +44 -0
- package/js/session/adapters/_catalog.js +58 -0
- package/js/session/adapters/_chat_completions.js +439 -0
- package/js/session/adapters/_errors.js +85 -0
- package/js/session/adapters/_images.js +60 -0
- package/js/session/adapters/_lazy_json_cache.js +76 -0
- package/js/session/adapters/_pricing.js +67 -0
- package/js/session/adapters/_providers.js +60 -0
- package/js/session/adapters/_tools.js +185 -0
- package/js/session/adapters/_videos.js +283 -0
- package/js/session/adapters/anthropic.js +397 -0
- package/js/session/adapters/cerebras.js +28 -0
- package/js/session/adapters/deepseek.js +32 -0
- package/js/session/adapters/echo.js +51 -0
- package/js/session/adapters/fake.js +262 -0
- package/js/session/adapters/fireworks.js +46 -0
- package/js/session/adapters/gemini.js +381 -0
- package/js/session/adapters/groq.js +23 -0
- package/js/session/adapters/image/fake.js +55 -0
- package/js/session/adapters/image/index.js +40 -0
- package/js/session/adapters/image/novita.js +135 -0
- package/js/session/adapters/image/openai.js +50 -0
- package/js/session/adapters/index.js +53 -0
- package/js/session/adapters/mistral.js +31 -0
- package/js/session/adapters/novita.js +29 -0
- package/js/session/adapters/openai.js +381 -0
- package/js/session/adapters/openrouter.js +66 -0
- package/js/session/adapters/xai.js +27 -0
- package/js/session/bin.js +54 -0
- package/js/session/driver.js +160 -0
- package/js/session/index.js +18 -0
- package/js/session/run.js +393 -0
- package/js/session/run_image.js +61 -0
- package/package.json +107 -0
- package/src/cli/ask.js +160 -0
- package/src/cli/backup.js +107 -0
- package/src/cli/bench.js +262 -0
- package/src/cli/check.js +123 -0
- package/src/cli/colored-logger.js +67 -0
- package/src/cli/colors.js +13 -0
- package/src/cli/default.js +39 -0
- package/src/cli/index.js +150 -0
- package/src/cli/json-output.js +60 -0
- package/src/cli/model.js +571 -0
- package/src/cli/onboard.js +232 -0
- package/src/cli/rank.js +176 -0
- package/src/cli/ratelimit.js +160 -0
- package/src/cli/tag.js +105 -0
- package/src/lib/assets/alibaba.svg +1 -0
- package/src/lib/assets/anthropic.svg +5 -0
- package/src/lib/assets/deepseek.svg +1 -0
- package/src/lib/assets/gemini.svg +1 -0
- package/src/lib/assets/google.svg +2 -0
- package/src/lib/assets/kwaipilot.svg +1 -0
- package/src/lib/assets/meta.svg +1 -0
- package/src/lib/assets/minimax.svg +9 -0
- package/src/lib/assets/moonshotai.svg +4 -0
- package/src/lib/assets/openai.svg +5 -0
- package/src/lib/assets/xai.svg +1 -0
- package/src/lib/assets/xiaomi.svg +2 -0
- package/src/lib/assets/zai.svg +219 -0
- package/src/lib/benchmark-score.js +215 -0
- package/src/lib/benchmark-truth.js +68 -0
- package/src/lib/cache.js +76 -0
- package/src/lib/common.js +208 -0
- package/src/lib/cooldown.js +63 -0
- package/src/lib/creators.js +71 -0
- package/src/lib/curated-cache.js +146 -0
- package/src/lib/errors.js +126 -0
- package/src/lib/index.js +726 -0
- package/src/lib/logger.js +29 -0
- package/src/lib/providers.js +87 -0
- package/src/lib/rank.js +390 -0
- package/src/lib/rate-limiter.js +50 -0
- package/src/lib/schema.js +150 -0
- package/src/lib/select.js +474 -0
- package/src/lib/tracing.js +62 -0
- package/src/lib/utils.js +85 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
 * Mohdel logger interface.
 *
 * Mohdel does not own a log sink — it accepts handler functions from the consumer
 * and routes structured events through them. Modules that need a default use
 * `silent` (no-op for all levels).
 *
 * Consumers pass their own logger (pino-compatible) to the mohdel factory.
 * CLI code that wants a colored-stderr helper imports `cliLogger` from
 * `src/cli/colored-logger.js`; keeping that shim out of `src/lib/` lets
 * library consumers avoid loading chalk.
 *
 * All loggers match the interface contract
 * `{ trace, debug, info, warn, error, fatal, child }`.
 */

// Shared no-op handler — every level of the silent logger points at it.
const discard = () => {}

// A child of a no-op logger is itself a no-op, so `child` hands back the
// same singleton.
export const silent = {
  trace: discard,
  debug: discard,
  info: discard,
  warn: discard,
  error: discard,
  fatal: discard,
  child: () => silent
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
// Static provider registry, keyed by provider slug. Each entry carries:
//   sdk                 — adapter name used to talk to this provider
//   api                 — (optional) forces the chat-completions surface of the
//                         OpenAI SDK for OpenAI-compatible vendors
//   imageHandler        — (optional) image-generation adapter name
//   apiKeyEnv           — environment variable the API key is read from
//                         (project-wide `*_API_SK` naming convention)
//   createConfiguration — maps an API key to the SDK client constructor options
//   creators            — model-creator slugs associated with this provider
//                         (consumed by the adapter/catalog layers)
const providers = {
  anthropic: {
    sdk: 'anthropic',
    apiKeyEnv: 'ANTHROPIC_API_SK',
    createConfiguration: apiKey => ({ apiKey }),
    creators: ['anthropic']
  },
  cerebras: {
    sdk: 'cerebras',
    apiKeyEnv: 'CEREBRAS_API_SK',
    createConfiguration: apiKey => ({ apiKey }),
    creators: ['openai', 'zai']
  },
  // OpenAI-compatible endpoint; custom baseURL.
  deepseek: {
    sdk: 'openai',
    api: 'chatCompletions',
    apiKeyEnv: 'DEEPSEEK_API_SK',
    createConfiguration: apiKey => ({ baseURL: 'https://api.deepseek.com', apiKey }),
    creators: ['deepseek']
  },
  gemini: {
    sdk: 'gemini',
    apiKeyEnv: 'GEMINI_API_SK',
    createConfiguration: apiKey => ({ apiKey }),
    creators: ['google']
  },
  groq: {
    sdk: 'groq',
    apiKeyEnv: 'GROQ_API_SK',
    createConfiguration: apiKey => ({ apiKey }),
    creators: ['meta']
  },
  mistral: {
    sdk: 'openai',
    api: 'chatCompletions',
    apiKeyEnv: 'MISTRAL_API_SK',
    createConfiguration: apiKey => ({ baseURL: 'https://api.mistral.ai/v1', apiKey }),
    creators: ['mistral']
  },
  fireworks: {
    sdk: 'fireworks',
    apiKeyEnv: 'FIREWORKS_API_SK',
    createConfiguration: apiKey => ({ apiKey, baseURL: 'https://api.fireworks.ai/inference/v1' }),
    creators: ['meta', 'alibaba']
  },
  // Novita exposes an OpenAI-compatible chat API plus its own image API.
  novita: {
    sdk: 'openai',
    api: 'chatCompletions',
    imageHandler: 'novita',
    apiKeyEnv: 'NOVITA_API_SK',
    createConfiguration: apiKey => ({ apiKey, baseURL: 'https://api.novita.ai/openai' }),
    creators: ['deepseek', 'openai', 'bfl']
  },
  openai: {
    sdk: 'openai',
    apiKeyEnv: 'OPENAI_API_SK',
    createConfiguration: apiKey => ({ apiKey }),
    creators: ['openai']
  },
  openrouter: {
    sdk: 'openrouter',
    apiKeyEnv: 'OPENROUTER_API_SK',
    // Env vars are read at call time, not module load time, so they can be
    // set after import.
    createConfiguration: apiKey => {
      // Optional OpenRouter attribution headers — only sent when the
      // embedder opts in via env. No defaults.
      const defaultHeaders = {}
      if (process.env.OPENROUTER_REFERER) defaultHeaders['HTTP-Referer'] = process.env.OPENROUTER_REFERER
      if (process.env.OPENROUTER_TITLE) defaultHeaders['X-Title'] = process.env.OPENROUTER_TITLE
      return {
        baseURL: 'https://openrouter.ai/api/v1',
        apiKey,
        defaultHeaders
      }
    },
    // OpenRouter is an aggregator, not tied to specific creators.
    creators: []
  },
  xai: {
    sdk: 'openai',
    apiKeyEnv: 'XAI_API_SK',
    createConfiguration: apiKey => ({ baseURL: 'https://api.x.ai/v1', apiKey }),
    creators: ['xai']
  }
}

// Shallow freeze: the registry map itself is immutable; nested entry objects
// are not frozen.
Object.freeze(providers)

export default providers
|
package/src/lib/rank.js
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
// Model ranking engine — fetches benchmarks, merges sources, computes scores.
|
|
2
|
+
// No CLI, no output formatting — pure data.
|
|
3
|
+
|
|
4
|
+
import { readFile, writeFile, mkdir } from 'fs/promises'
|
|
5
|
+
import { join, dirname } from 'path'
|
|
6
|
+
import { fileURLToPath } from 'url'
|
|
7
|
+
import { existsSync } from 'fs'
|
|
8
|
+
import { CACHE_DIR } from './cache.js'
|
|
9
|
+
|
|
10
|
+
// Resolve the packaged benchmarks config file relative to this module.
const __dirname = dirname(fileURLToPath(import.meta.url))
const CONFIG_PATH = join(__dirname, '..', '..', 'config', 'benchmarks.json')

// --- Config ---

/**
 * Read and parse config/benchmarks.json.
 * @returns {Promise<object>} the parsed benchmark configuration
 * @throws on missing/unreadable file or invalid JSON
 */
export const loadConfig = async () =>
  JSON.parse(await readFile(CONFIG_PATH, 'utf8'))
|
|
19
|
+
|
|
20
|
+
// --- Cache ---
|
|
21
|
+
|
|
22
|
+
// Cache file location for one named source, e.g. rank-zeroeval.json.
const cachePath = (name) => join(CACHE_DIR, `rank-${name}.json`)

/**
 * Load a cached payload if present and younger than ttlMs.
 * Best-effort: any failure (missing file, corrupt JSON, stale entry) yields
 * null so callers simply re-fetch.
 */
const loadCache = async (name, ttlMs) => {
  try {
    const entry = JSON.parse(await readFile(cachePath(name), 'utf8'))
    if (Date.now() - entry.timestamp < ttlMs) return entry.data
  } catch {}
  return null
}

/**
 * Persist a payload wrapped with a write timestamp.
 * Best-effort: failures are swallowed so caching never breaks a ranking run.
 */
const saveCache = async (name, data) => {
  try {
    if (!existsSync(CACHE_DIR)) await mkdir(CACHE_DIR, { recursive: true })
    const payload = JSON.stringify({ timestamp: Date.now(), data })
    await writeFile(cachePath(name), payload)
  } catch {}
}
|
|
39
|
+
|
|
40
|
+
// --- Name matching ---
|
|
41
|
+
|
|
42
|
+
// Lowercase and drop every non-alphanumeric character, so "GPT-4o" and
// "gpt 4o" collapse to the same key.
const normalizeCompact = (s) => s.toLowerCase().replace(/[^a-z0-9]/g, '')

/**
 * Build a lookup Map from normalized aliases to canonical model_id.
 * Each model contributes its display name, its id, and (when present) its id
 * with a trailing -YYYYMMDD date suffix removed. Later models win collisions.
 */
const buildNameIndex = (models) => {
  const index = new Map()
  for (const { name, model_id: id } of models) {
    index.set(normalizeCompact(name), id)
    index.set(normalizeCompact(id), id)
    const undated = id.replace(/-\d{8}$/, '')
    if (undated !== id) index.set(normalizeCompact(undated), id)
  }
  return index
}

/**
 * Resolve a free-form benchmark model name to a canonical model_id, or null.
 * Tries, in order: the name as-is, the name with parenthesised qualifiers
 * removed, then the name with a trailing -version/-date suffix removed.
 */
const matchModel = (name, nameIndex) => {
  const compact = normalizeCompact(name)
  if (nameIndex.has(compact)) return nameIndex.get(compact)
  const fallbacks = [
    normalizeCompact(name.replace(/\s*\(.*?\)\s*/g, '')),
    normalizeCompact(name.replace(/-\d{4,8}$/, ''))
  ]
  for (const candidate of fallbacks) {
    if (candidate !== compact && nameIndex.has(candidate)) return nameIndex.get(candidate)
  }
  return null
}
|
|
64
|
+
|
|
65
|
+
// --- CSV parser ---
|
|
66
|
+
|
|
67
|
+
/**
 * Minimal RFC-4180-style CSV parser.
 * Supports quoted fields, escaped quotes (""), and LF/CR/CRLF line endings.
 * Rows with fewer than two fields (e.g. blank lines) are dropped, except a
 * final unterminated row, which is always kept.
 */
const parseCSV = (text) => {
  const rows = []
  let row = []
  let field = ''
  let quoted = false

  const endField = () => { row.push(field); field = '' }
  const endRow = () => {
    endField()
    if (row.length > 1) rows.push(row)   // skip blank/degenerate lines
    row = []
  }

  for (let i = 0; i < text.length; i++) {
    const ch = text[i]
    if (quoted) {
      if (ch !== '"') { field += ch; continue }
      // "" inside a quoted field is an escaped quote; a single " closes it.
      if (text[i + 1] === '"') { field += '"'; i++ } else { quoted = false }
      continue
    }
    if (ch === '"') { quoted = true; continue }
    if (ch === ',') { endField(); continue }
    if (ch === '\n' || ch === '\r') {
      if (ch === '\r' && text[i + 1] === '\n') i++   // swallow the LF of CRLF
      endRow()
      continue
    }
    field += ch
  }
  // Flush a trailing row with no final newline — kept even if single-field.
  if (row.length || field) { row.push(field); rows.push(row) }
  return rows
}
|
|
88
|
+
|
|
89
|
+
// --- Sources ---
|
|
90
|
+
|
|
91
|
+
// Remote benchmark source endpoints.
const URLS = {
  zeroeval: 'https://api.zeroeval.com/leaderboard/models/full?justCanonicals=true',
  epoch: 'https://epoch.ai/data/benchmarks.csv',
  tau2Manifest: 'https://raw.githubusercontent.com/sierra-research/tau2-bench/main/web/leaderboard/public/submissions/manifest.json',
  tau2Base: 'https://raw.githubusercontent.com/sierra-research/tau2-bench/main/web/leaderboard/public/submissions'
}

/**
 * Fetch the ZeroEval leaderboard — the skeleton model list every other
 * source is overlaid onto.
 * Serves from the on-disk cache unless `fresh` is set or the cache is stale;
 * a successful network fetch refreshes the cache.
 * @returns {Promise<{data: object[], fromCache: boolean}>}
 * @throws when the API responds non-2xx
 */
const fetchZeroEval = async (fresh, ttlMs) => {
  const cached = fresh ? null : await loadCache('zeroeval', ttlMs)
  if (cached) return { data: cached, fromCache: true }
  const res = await fetch(URLS.zeroeval)
  if (!res.ok) throw new Error(`ZeroEval API ${res.status}`)
  const data = await res.json()
  await saveCache('zeroeval', data)
  return { data, fromCache: false }
}
|
|
109
|
+
|
|
110
|
+
/**
 * Fetch Epoch AI's benchmark CSV and reduce it to per-model best scores for
 * the two tasks we track (GPQA diamond, SWE-Bench verified).
 * @returns {Promise<Record<string, {gpqa_score?: number, swe_bench_verified_score?: number}>>}
 * Serves from cache unless `fresh`; caches the reduced map on success.
 * @throws when the CSV endpoint responds non-2xx
 */
const fetchEpoch = async (fresh, ttlMs) => {
  if (!fresh) {
    const cached = await loadCache('epoch', ttlMs)
    if (cached) return cached
  }
  const res = await fetch(URLS.epoch)
  if (!res.ok) throw new Error(`Epoch AI ${res.status}`)
  const rows = parseCSV(await res.text())
  if (!rows.length) return {}

  const header = rows[0]
  const col = {
    task: header.indexOf('task'),
    model: header.indexOf('Model'),
    score: header.indexOf('best_score')
  }
  // Bail out quietly if the upstream CSV schema changed.
  if (col.task < 0 || col.model < 0 || col.score < 0) return {}

  // Map from Epoch task label to our score field. A Map avoids accidental
  // prototype-key hits from arbitrary CSV content.
  const fieldByTask = new Map([
    ['GPQA diamond', 'gpqa_score'],
    ['SWE-Bench verified', 'swe_bench_verified_score']
  ])

  const results = {}
  for (const row of rows.slice(1)) {
    const model = (row[col.model] || '').trim()
    const score = parseFloat(row[col.score])
    if (!model || !Number.isFinite(score)) continue
    const field = fieldByTask.get((row[col.task] || '').trim())
    if (!field) continue
    if (!results[model]) results[model] = {}
    // Keep the best score seen for each (model, field) pair.
    if (results[model][field] == null || score > results[model][field]) {
      results[model][field] = score
    }
  }
  await saveCache('epoch', results)
  return results
}
|
|
145
|
+
|
|
146
|
+
/**
 * Fetch Tau2-bench leaderboard submissions and extract each model's best
 * retail pass^1 score, normalized from percent to 0–1.
 * Submissions are fetched in batches of 8; individual submission failures are
 * silently skipped (best-effort), only a manifest failure throws.
 * @returns {Promise<Record<string, {tau_bench_retail_score: number}>>}
 */
const fetchTau2 = async (fresh, ttlMs) => {
  if (!fresh) {
    const cached = await loadCache('tau2', ttlMs)
    if (cached) return cached
  }
  const manifestRes = await fetch(URLS.tau2Manifest)
  if (!manifestRes.ok) throw new Error(`Tau2 manifest ${manifestRes.status}`)
  const manifest = await manifestRes.json()
  const names = [...(manifest.submissions || []), ...(manifest.legacy_submissions || [])]

  const results = {}
  const collect = async (name) => {
    try {
      const res = await fetch(`${URLS.tau2Base}/${name}/submission.json`)
      if (!res.ok) return
      const sub = await res.json()
      const retail = sub.results?.retail?.pass_1
      if (!sub.model_name || retail == null) return
      const score = retail / 100   // leaderboard reports percent; store 0–1
      const best = results[sub.model_name]?.tau_bench_retail_score
      if (best == null || score > best) {
        results[sub.model_name] = { tau_bench_retail_score: score }
      }
    } catch {}   // best-effort: one bad submission never fails the fetch
  }

  const BATCH = 8
  for (let i = 0; i < names.length; i += BATCH) {
    await Promise.all(names.slice(i, i + BATCH).map(collect))
  }
  await saveCache('tau2', results)
  return results
}
|
|
179
|
+
|
|
180
|
+
// --- Merge ---
|
|
181
|
+
|
|
182
|
+
/**
 * Overlay Epoch and Tau2 scores onto the ZeroEval model list, mutating the
 * model objects in place.
 * @param {object[]} models - ZeroEval models (keyed by model_id)
 * @param {object|null} epoch - model name → epoch score fields
 * @param {object|null} tau2 - model name → tau2 score fields
 * @param {Map} nameIndex - normalized-name → model_id lookup
 * @returns {{overlaid: number, unmatched: Array<{source: string, model: string}>}}
 */
const mergeSources = (models, epoch, tau2, nameIndex) => {
  const byId = new Map(models.map(m => [m.model_id, m]))

  const overlays = [
    { name: 'epoch', data: epoch, fields: ['gpqa_score', 'swe_bench_verified_score'] },
    { name: 'tau2', data: tau2, fields: ['tau_bench_retail_score'] }
  ]

  const stats = { overlaid: 0, unmatched: [] }
  for (const overlay of overlays) {
    if (!overlay.data || !Object.keys(overlay.data).length) continue
    for (const [sourceName, scores] of Object.entries(overlay.data)) {
      const modelId = matchModel(sourceName, nameIndex)
      const target = modelId ? byId.get(modelId) : undefined
      if (!target) {
        stats.unmatched.push({ source: overlay.name, model: sourceName })
        continue
      }
      for (const field of overlay.fields) {
        if (scores[field] == null) continue
        target[field] = scores[field]
        stats.overlaid++
      }
    }
  }
  return stats
}
|
|
208
|
+
|
|
209
|
+
// --- Scoring ---
|
|
210
|
+
|
|
211
|
+
// Bring a raw benchmark value onto a 0–100 scale: values stored 0–1 are
// multiplied up; anything else is assumed to already be 0–100.
const normalizeScore = (value, scale) => {
  if (value == null) return null
  return scale === '0-1' ? value * 100 : value
}

/**
 * Compute a model's weighted overall score across the benchmarks it actually
 * has results for; weights are renormalized over the available set.
 * @returns {{overall: number, available: Record<string, number>, coverage: number}|null}
 *          null when no scored benchmark contributes any weight
 */
const computeScores = (model, benchmarks, weights) => {
  const available = {}
  let totalWeight = 0
  for (const [field, cfg] of Object.entries(benchmarks)) {
    const value = normalizeScore(model[field], cfg.scale)
    if (value == null) continue
    available[field] = value
    totalWeight += weights[field]
  }
  if (!totalWeight) return null

  let overall = 0
  for (const [field, value] of Object.entries(available)) {
    overall += value * (weights[field] / totalWeight)
  }
  return { overall, available, coverage: Object.keys(available).length }
}

/**
 * Average the normalized scores within each benchmark group
 * (e.g. analysis / tool_loop / cowork).
 * @returns {Record<string, number>} group name → mean of member scores
 */
const computeGroupScores = (available, benchmarks) => {
  const buckets = {}
  for (const [field, value] of Object.entries(available)) {
    const group = benchmarks[field].group
    const bucket = buckets[group] ?? (buckets[group] = { sum: 0, count: 0 })
    bucket.sum += value
    bucket.count++
  }
  const means = {}
  for (const [group, { sum, count }] of Object.entries(buckets)) {
    means[group] = count > 0 ? sum / count : null
  }
  return means
}
|
|
252
|
+
|
|
253
|
+
// --- Curated matching ---
|
|
254
|
+
|
|
255
|
+
/**
 * Index curated catalog entries for matching against ZeroEval model ids.
 * Each non-deprecated entry is reachable via the model part of its curated
 * key (the portion after `provider/`) and via its upstream `model` id.
 * @returns {Map<string, string>} alias → full curated key
 */
const buildCuratedIndex = (curated) => {
  const index = new Map()
  for (const [key, entry] of Object.entries(curated)) {
    if (entry.deprecated) continue
    const [, ...modelSegments] = key.split('/')
    index.set(modelSegments.join('/'), key)
    if (entry.model) index.set(entry.model, key)
  }
  return index
}

/**
 * Find the curated key for a ZeroEval id — exact match first, then with a
 * trailing -YYYYMMDD date suffix stripped. Null when absent.
 */
const matchCurated = (zeroEvalId, curatedIndex) => {
  const exact = curatedIndex.get(zeroEvalId)
  if (exact !== undefined) return exact
  const undated = curatedIndex.get(zeroEvalId.replace(/-\d{8}$/, ''))
  return undated !== undefined ? undated : null
}
|
|
272
|
+
|
|
273
|
+
// --- Public API ---
|
|
274
|
+
|
|
275
|
+
/**
 * Resolve a use-case name to a benchmark-field → weight map.
 * 'balanced' derives weights from the benchmark config itself; any other use
 * case looks up a preset (the CLI alias 'tool-loop' maps to preset
 * 'tool_loop'). Returns null for unknown use cases.
 */
export const resolveWeights = (useCase, benchmarks, presets) => {
  const key = useCase === 'tool-loop' ? 'tool_loop' : useCase
  if (key !== 'balanced') return presets[key] || null
  const weights = {}
  for (const [field, cfg] of Object.entries(benchmarks)) {
    weights[field] = cfg.weight
  }
  return weights
}
|
|
284
|
+
|
|
285
|
+
/**
 * Fetch all benchmark sources in parallel and overlay the secondary sources
 * onto the ZeroEval model list. Epoch/Tau2 failures degrade gracefully (the
 * source is simply omitted); a ZeroEval failure is fatal since it supplies
 * the skeleton model data.
 * @param {object} [opts]
 * @param {boolean} [opts.fresh=false] bypass the on-disk caches
 * @param {(msg: string) => void} [opts.onStatus] progress/status callback
 * @returns {Promise<{models: object[], sources: string[], mergeStats: object, config: object}>}
 * @throws when ZeroEval cannot be fetched
 */
export const fetchBenchmarks = async ({ fresh = false, onStatus } = {}) => {
  const config = await loadConfig()
  const ttlMs = config.cacheTtlHours * 60 * 60 * 1000
  const log = onStatus || (() => {})

  // Wrap each fetch so one source's failure yields null instead of rejecting
  // the whole Promise.all.
  const attempt = async (label, fetcher) => {
    try {
      return await fetcher(fresh, ttlMs)
    } catch (err) {
      log(`${label}: failed (${err.message})`)
      return null
    }
  }

  log('Fetching benchmark sources...')
  const [zeroEvalResult, epoch, tau2] = await Promise.all([
    attempt('ZeroEval', fetchZeroEval),
    attempt('Epoch AI', fetchEpoch),
    attempt('Tau2', fetchTau2)
  ])

  if (!zeroEvalResult) throw new Error('ZeroEval fetch failed — cannot rank without skeleton data')
  const models = zeroEvalResult.data
  const nameIndex = buildNameIndex(models)

  const sources = [`ZeroEval (${models.length})`]
  const epochCount = epoch ? Object.keys(epoch).length : 0
  if (epochCount) sources.push(`Epoch AI (${epochCount})`)
  const tau2Count = tau2 ? Object.keys(tau2).length : 0
  if (tau2Count) sources.push(`Tau2 (${tau2Count})`)

  const mergeStats = mergeSources(models, epoch, tau2, nameIndex)
  log(`Sources: ${sources.join(', ')}`)
  if (mergeStats.overlaid) log(`${mergeStats.overlaid} scores overlaid`)

  return { models, sources, mergeStats, config }
}
|
|
324
|
+
|
|
325
|
+
// Combined fetch + rank
/**
 * Fetch benchmark data, then filter, score, sort, and rank models.
 *
 * @param {object} [opts]
 * @param {object} [opts.curated] curated catalog (key → entry); when given and
 *        `all` is false, only models matching a curated entry are ranked
 * @param {string} [opts.useCase='balanced'] weighting preset name
 * @param {number} [opts.top=20] max number of ranked entries returned
 * @param {boolean} [opts.all=false] skip curated filtering
 * @param {string} [opts.since] 'YYYY-MM' — drop models released before this month
 * @param {number} [opts.minContext] drop models with a smaller context window
 * @param {boolean} [opts.fresh=false] bypass benchmark caches
 * @param {(msg: string) => void} [opts.onStatus] progress callback
 * @returns {Promise<{rankings: object[], meta: object}>}
 * @throws on unknown use case or when the ZeroEval fetch fails
 */
export const rank = async ({ curated, useCase = 'balanced', top = 20, all = false, since, minContext, fresh = false, onStatus } = {}) => {
  const { models, sources, config } = await fetchBenchmarks({ fresh, onStatus })
  const { benchmarks, minCoverage, useCasePresets } = config

  const weights = resolveWeights(useCase, benchmarks, useCasePresets)
  if (!weights) throw new Error(`Unknown use-case: ${useCase}. Available: balanced, analysis, tool-loop, cowork`)

  const curatedIndex = !all && curated ? buildCuratedIndex(curated) : null
  const benchmarkCount = Object.keys(benchmarks).length

  // Filter
  let filtered = models
  if (curatedIndex) filtered = filtered.filter(m => matchCurated(m.model_id, curatedIndex))
  if (since) {
    // `since` is 'YYYY-MM'; appending '-01' makes an ISO date for comparison.
    const sinceDate = new Date(since + '-01')
    filtered = filtered.filter(m => m.release_date && new Date(m.release_date) >= sinceDate)
  }
  if (minContext) {
    filtered = filtered.filter(m => m.context && m.context >= minContext)
  }

  // Score
  const scored = []
  for (const model of filtered) {
    const result = computeScores(model, benchmarks, weights)
    // Skip models with no scores or too few benchmarks to rank fairly.
    if (!result || result.coverage < minCoverage) continue
    const groupScores = computeGroupScores(result.available, benchmarks)
    const outputPrice = model.output_price
    // Score-per-price; only meaningful for a strictly positive price.
    const value = (outputPrice != null && outputPrice > 0) ? result.overall / outputPrice : null

    scored.push({
      model: model.name,
      organization: model.organization,
      overall: result.overall,
      analysis: groupScores.analysis ?? null,
      tool_loop: groupScores.tool_loop ?? null,
      cowork: groupScores.cowork ?? null,
      output_price: outputPrice ?? null,
      value,
      coverage: `${result.coverage}/${benchmarkCount}`,
      // Raw (un-normalized) per-benchmark scores, with the `_score` suffix
      // dropped from the field names.
      scores: Object.fromEntries(
        Object.keys(benchmarks).map(field => [
          field.replace(/_score$/, ''),
          model[field] ?? null
        ])
      )
    })
  }

  // Sort best-first and assign 1-based ranks to the top N.
  scored.sort((a, b) => b.overall - a.overall)
  const rankings = scored.slice(0, top).map((r, i) => ({ rank: i + 1, ...r }))

  return {
    rankings,
    meta: {
      date: new Date().toISOString().split('T')[0],
      sources,
      benchmarkCount,
      useCase,
      totalModels: models.length,
      matchedModels: filtered.length,
      rankedModels: rankings.length
    }
  }
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
// Lightweight per-minute rate limiter.
// Tracks RPM and TPM with minute-bucket granularity.
// Throttles (delays) rather than rejects — returns ms to wait.

/**
 * Create an in-memory, per-key rate limiter with one-minute buckets.
 * `check` returns how many ms to wait before sending (0 = clear); callers
 * record usage via `recordRequest` (before sending, for RPM) and
 * `recordTokens` (after the response, for TPM).
 */
const createRateLimiter = () => {
  /** key → { count, tokens, minute } */
  const buckets = new Map()

  const minuteNow = () => Math.floor(Date.now() / 60000)

  // Return the live bucket for `key`, starting a fresh one whenever the
  // wall-clock minute has rolled over.
  const bucketFor = (key) => {
    const minute = minuteNow()
    const existing = buckets.get(key)
    if (existing && existing.minute === minute) return existing
    const next = { count: 0, tokens: 0, minute }
    buckets.set(key, next)
    return next
  }

  // Ms remaining until the minute after `minute` begins (never negative).
  const waitForRollover = (minute) => Math.max(0, (minute + 1) * 60000 - Date.now())

  // Returns ms to wait before sending (0 = go ahead).
  const check = (key, { rpmLimit, tpmLimit } = {}) => {
    if (!rpmLimit && !tpmLimit) return 0
    const bucket = bucketFor(key)
    const rpmExceeded = rpmLimit && bucket.count >= rpmLimit
    const tpmExceeded = tpmLimit && bucket.tokens >= tpmLimit
    return (rpmExceeded || tpmExceeded) ? waitForRollover(bucket.minute) : 0
  }

  // Record a request count (call before sending — RPM tracking).
  const recordRequest = (key) => {
    bucketFor(key).count += 1
  }

  // Record token usage (call after response — TPM tracking).
  const recordTokens = (key, tokens) => {
    bucketFor(key).tokens += tokens
  }

  return { check, recordRequest, recordTokens }
}

export default createRateLimiter
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
// Field schema for curated model-catalog entries. Each definition may carry:
//   type       — expected type ('string' | 'number' | 'boolean' | 'array' | 'object')
//   altType    — an alternative type that is also accepted
//   required   — must be present on non-deprecated entries
//   default    — value filled in by applyDefaults when the field is absent
//   nullable   — null passes the type check
//   itemType   — element type for array fields
//   deprecated — message emitted as a warning when the field is used
//   validate   — extra check; returns an error message string or null
const fieldDefs = {
  model: { type: 'string', required: true },
  provider: { type: 'string' },
  sdk: { type: 'string' },
  type: { type: 'string', default: 'model' },
  creator: { type: 'string', required: true },
  label: { type: 'string' },
  displayName: { type: 'string', deprecated: 'use label instead' },
  description: { type: 'string' },
  // Prices accept either a flat number or an object (e.g. tiered pricing —
  // structure defined by consumers).
  inputPrice: { type: 'number', altType: 'object' },
  outputPrice: { type: 'number', altType: 'object' },
  thinkingPrice: { type: 'number', altType: 'object' },
  contextTokenLimit: { type: 'number' },
  outputTokenLimit: { type: 'number' },
  thinkingTokenLimit: { type: 'number' },
  thinkingEffortLevels: { type: 'object', nullable: true, default: null },
  defaultThinkingEffort: { type: 'string' },
  tags: { type: 'array', itemType: 'string', default: [] },
  aliases: { type: 'array', itemType: 'string', default: [] },
  replaces: { type: 'array', itemType: 'string', default: [] },
  leaderboard: { type: 'array', itemType: 'number', validate: (v) => Array.isArray(v) && v.length === 3 ? null : 'must be [intelligence, speed, latency]' },
  leaderboardNote: { type: 'string' },
  inputFormat: { type: 'array', itemType: 'string', required: true, default: ['text'] },
  version: { type: 'string' },
  createdAt: { type: 'string' },
  created: { type: 'number' },
  imagePrice: { type: 'number' },
  imageEndpoint: { type: 'string' },
  imageDefaultSize: { type: 'string' },
  // Presence of `deprecated`/`suspended` strings marks stub entries.
  deprecated: { type: 'string' },
  suspended: { type: 'string' },
  rpmLimit: { type: 'number' },
  tpmLimit: { type: 'number' },
  rateLimitScope: { type: 'string', validate: (v) => ['model', 'provider'].includes(v) ? null : 'must be "model" or "provider"' },
  supportsTools: { type: 'boolean' }
}

// All declared field names — anything else is "unknown" under strict validation.
const knownFields = new Set(Object.keys(fieldDefs))

// Fields derived at load time; stripped before persisting entries back out.
const COMPUTED_FIELDS = new Set(['upstreamIds'])
|
|
41
|
+
|
|
42
|
+
// Type predicates used by checkType. `object` means a plain non-null,
// non-array object.
const TYPE_CHECKERS = {
  string: (v) => typeof v === 'string',
  number: (v) => typeof v === 'number',
  // Required by boolean fields such as `supportsTools`; without this entry
  // every boolean value failed the type check ("expected boolean, got boolean").
  boolean: (v) => typeof v === 'boolean',
  array: (v) => Array.isArray(v),
  object: (v) => typeof v === 'object' && v !== null && !Array.isArray(v)
}

/**
 * Check a value against a field definition's declared type (or altType).
 * `null` passes when the definition is marked nullable.
 * @param {*} value - the field value to check
 * @param {object} def - field definition ({ type, altType?, nullable? })
 * @returns {boolean} true when the value matches
 */
const checkType = (value, def) => {
  if (value === null && def.nullable) return true
  const checker = TYPE_CHECKERS[def.type]
  if (checker && checker(value)) return true
  if (def.altType) {
    const altChecker = TYPE_CHECKERS[def.altType]
    if (altChecker && altChecker(value)) return true
  }
  return false
}
|
|
59
|
+
|
|
60
|
+
/**
 * Validate a catalog entry against fieldDefs.
 * @param {object} entry - the catalog entry to check
 * @param {string} curatedKey - the entry's catalog key (reserved for callers)
 * @param {object} [opts]
 * @param {boolean} [opts.strict=false] also flag fields not in the schema
 * @returns {Array<{field: string, message: string, severity: 'error'|'warn'}>}
 *
 * Deprecated stubs (entries carrying a `deprecated` string) are exempt from
 * required-field checks.
 */
const validate = (entry, curatedKey, { strict = false } = {}) => {
  const issues = []
  const report = (field, message, severity) => issues.push({ field, message, severity })
  const deprecatedStub = Boolean(entry.deprecated)

  for (const [field, def] of Object.entries(fieldDefs)) {
    const value = entry[field]

    const missing = value === undefined || value === null || value === ''
    if (def.required && !deprecatedStub && missing) {
      report(field, 'required field missing', 'error')
      continue
    }

    // Optional fields that are simply absent need no further checks.
    if (value === undefined) continue

    if (!checkType(value, def)) {
      report(field, `expected ${def.type}, got ${typeof value}`, 'error')
      continue
    }

    if (def.deprecated) {
      report(field, `deprecated: ${def.deprecated}`, 'warn')
    }

    const customMessage = def.validate?.(value)
    if (customMessage) {
      report(field, customMessage, 'warn')
    }

    // Only the first mistyped element of an array is reported.
    if (def.type === 'array' && def.itemType && Array.isArray(value)) {
      const itemOk = TYPE_CHECKERS[def.itemType]
      if (itemOk) {
        const badIndex = value.findIndex((item) => !itemOk(item))
        if (badIndex !== -1) {
          report(field, `item ${badIndex} expected ${def.itemType}, got ${typeof value[badIndex]}`, 'warn')
        }
      }
    }
  }

  if (strict) {
    for (const key of Object.keys(entry)) {
      if (!knownFields.has(key) && !COMPUTED_FIELDS.has(key)) {
        report(key, 'unknown field', 'warn')
      }
    }
  }

  return issues
}
|
|
113
|
+
|
|
114
|
+
/**
 * Return a copy of `entry` with schema defaults filled in for absent fields.
 * Array defaults are copied so entries never share a mutable default value.
 */
const applyDefaults = (entry) => {
  const filled = { ...entry }
  for (const [field, def] of Object.entries(fieldDefs)) {
    if (filled[field] !== undefined) continue
    if (def.default === undefined) continue
    filled[field] = Array.isArray(def.default) ? [...def.default] : def.default
  }
  return filled
}
|
|
123
|
+
|
|
124
|
+
/**
 * Return a copy of `entry` without computed (derived-at-load) fields.
 */
const stripComputed = (entry) =>
  Object.fromEntries(
    Object.entries(entry).filter(([key]) => !COMPUTED_FIELDS.has(key))
  )
|
|
133
|
+
|
|
134
|
+
// Strip only computed fields. Custom fields (not in knownFields) are preserved —
// consumers own their own namespace (e.g. `<yourapp>:label`,
// `<yourapp>:billingKey`).
const stripUnknown = (entry) => {
  const kept = {}
  for (const key of Object.keys(entry)) {
    if (COMPUTED_FIELDS.has(key)) continue
    kept[key] = entry[key]
  }
  return kept
}
|
|
146
|
+
|
|
147
|
+
// Tags start with a letter, followed by up to 31 letters/digits/`.`/`_`/`-`
// (32 characters max overall).
const TAG_RE = /^[a-zA-Z][a-zA-Z0-9._-]{0,31}$/

// Non-strings never match — guard before handing to the regex.
const isValidTag = (tag) => (typeof tag === 'string' ? TAG_RE.test(tag) : false)
|
|
149
|
+
|
|
150
|
+
export { fieldDefs, knownFields, validate, applyDefaults, stripComputed, stripUnknown, isValidTag }
|