@aggc/or-info 0.2.12 → 0.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -9
- package/bin/or-info.mjs +4 -2
- package/lib/scorer.mjs +85 -24
- package/mcp/server.mjs +8 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -154,17 +154,38 @@ or-info compare openai/gpt-4o deepseek/deepseek-chat-v3-0324 --json
|
|
|
154
154
|
### Top models for a task
|
|
155
155
|
|
|
156
156
|
```bash
|
|
157
|
-
or-info top --task coding
|
|
158
|
-
or-info top --task reasoning
|
|
159
|
-
or-info top --task general
|
|
160
|
-
or-info top --task vision
|
|
161
|
-
or-info top --task cheap
|
|
162
|
-
or-info top --task
|
|
157
|
+
or-info top --task coding # Best coding models
|
|
158
|
+
or-info top --task reasoning # Best reasoning models
|
|
159
|
+
or-info top --task general # Best all-rounders
|
|
160
|
+
or-info top --task vision # Best vision models (requires image input)
|
|
161
|
+
or-info top --task cheap # Best value for money
|
|
162
|
+
or-info top --task premium # Highest quality, ignoring price
|
|
163
|
+
or-info top --task coding --pricing premium # Best coder regardless of price
|
|
164
|
+
or-info top --task coding --budget 2 # Best coders under $2/M output
|
|
163
165
|
or-info top --task general --limit 10
|
|
164
166
|
```
|
|
165
167
|
|
|
166
|
-
Ranking combines LMArena ELO with price
|
|
167
|
-
|
|
168
|
+
Ranking combines LMArena ELO with price and context window size.
|
|
169
|
+
|
|
170
|
+
`--task` controls which ELO category and capability filter to apply.
|
|
171
|
+
`--pricing` overrides the price-penalty strategy independently:
|
|
172
|
+
|
|
173
|
+
| `--pricing` | Effect |
|
|
174
|
+
|-------------|--------|
|
|
175
|
+
| `standard` (default) | Moderate penalty for expensive models |
|
|
176
|
+
| `cheap` | Steep penalty; strongly favours free/low-cost models |
|
|
177
|
+
| `premium` | No penalty; ranks by quality alone |
|
|
178
|
+
|
|
179
|
+
Task defaults (when `--pricing` is not set; the `cheap` and `premium` tasks always use their own pricing mode, even when `--pricing` is given):
|
|
180
|
+
|
|
181
|
+
| Task | Default pricing | Capability filter |
|
|
182
|
+
|------|----------------|-------------------|
|
|
183
|
+
| `general` | standard | none |
|
|
184
|
+
| `coding` | standard | soft penalty (−15%) if no tool support |
|
|
185
|
+
| `reasoning` | standard | none |
|
|
186
|
+
| `vision` | standard | hard filter: image input required |
|
|
187
|
+
| `cheap` | cheap | none |
|
|
188
|
+
| `premium` | premium | none |
|
|
168
189
|
|
|
169
190
|
### Cache management
|
|
170
191
|
|
|
@@ -184,7 +205,7 @@ or-info refresh # Force-refresh OpenRouter catalog + LMArena ELO
|
|
|
184
205
|
| `models.get` | Pricing, context, architecture, features and LMArena ELO for a model |
|
|
185
206
|
| `models.list` | List models with optional filter, sort and limit |
|
|
186
207
|
| `models.compare` | Side-by-side comparison of two models |
|
|
187
|
-
| `models.top` | Ranked top models for coding/reasoning/general/vision/cheap |
|
|
208
|
+
| `models.top` | Ranked top models for coding/reasoning/general/vision/cheap/premium; accepts optional `pricing` override |
|
|
188
209
|
| `benchmarks.get` | LMArena ELO score, global rank, vote count and confidence interval for a model |
|
|
189
210
|
| `cache.refresh` | Force-refresh OpenRouter catalog + LMArena ELO |
|
|
190
211
|
|
package/bin/or-info.mjs
CHANGED
|
@@ -22,7 +22,7 @@ function die(msg) {
|
|
|
22
22
|
process.exit(1);
|
|
23
23
|
}
|
|
24
24
|
|
|
25
|
-
const TOP_TASKS = new Set(['coding', 'reasoning', 'general', 'vision', 'cheap']);
|
|
25
|
+
const TOP_TASKS = new Set(['coding', 'reasoning', 'general', 'vision', 'cheap', 'premium']);
|
|
26
26
|
|
|
27
27
|
function parsePositiveInteger(value) {
|
|
28
28
|
const n = Number.parseInt(value, 10);
|
|
@@ -173,7 +173,8 @@ program
|
|
|
173
173
|
program
|
|
174
174
|
.command('top')
|
|
175
175
|
.description('Best models for a task')
|
|
176
|
-
.option('--task <task>', 'Task: coding, reasoning, general, vision, cheap', 'general')
|
|
176
|
+
.option('--task <task>', 'Task: coding, reasoning, general, vision, cheap, premium', 'general')
|
|
177
|
+
.option('--pricing <mode>', 'Price scoring override: standard, cheap, premium', v => { const s = new Set(['standard', 'cheap', 'premium']); if (!s.has(v)) throw new InvalidArgumentError('must be standard, cheap, or premium'); return v; })
|
|
177
178
|
.option('--budget <usd>', 'Max price per 1M output tokens (e.g. 1.00)', parseFloat)
|
|
178
179
|
.option('--limit <n>', 'Number of results', parsePositiveInteger, 5)
|
|
179
180
|
.option('--json', 'Output raw JSON')
|
|
@@ -190,6 +191,7 @@ program
|
|
|
190
191
|
|
|
191
192
|
const ranked = rankModels(models, allElo, {
|
|
192
193
|
task: opts.task,
|
|
194
|
+
pricing: opts.pricing,
|
|
193
195
|
maxPricePerMOutput: opts.budget,
|
|
194
196
|
limit: opts.limit,
|
|
195
197
|
});
|
package/lib/scorer.mjs
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { pricePerMillion, supportsFeature } from './openrouter.mjs';
|
|
1
|
+
import { pricePerMillion, supportsFeature, contextLength } from './openrouter.mjs';
|
|
2
2
|
|
|
3
|
-
// ELO range observed on LMArena (2026): ~
|
|
4
|
-
const ELO_MIN =
|
|
5
|
-
const ELO_MAX =
|
|
3
|
+
// ELO range observed on LMArena (2026): ~1000 (weak) to ~1540+ (best)
|
|
4
|
+
const ELO_MIN = 1000;
|
|
5
|
+
const ELO_MAX = 1600;
|
|
6
6
|
|
|
7
7
|
function normaliseElo(elo) {
|
|
8
8
|
return Math.max(0, Math.min(100, ((elo - ELO_MIN) / (ELO_MAX - ELO_MIN)) * 100));
|
|
@@ -18,40 +18,83 @@ function pricePenalty(outputPerM) {
|
|
|
18
18
|
return Math.max(0.1, 1 - Math.log10(outputPerM + 1) * 0.15);
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
return
|
|
21
|
+
// Steeper penalty for cheap task — more spread between free and expensive.
|
|
22
|
+
// Free → 1.0, $1/M → 0.92, $5/M → 0.81, $20/M → 0.67
|
|
23
|
+
function cheapPenalty(outputPerM) {
|
|
24
|
+
if (outputPerM === null || outputPerM === 0) return 1.0;
|
|
25
|
+
return Math.max(0.1, 1 - Math.log10(outputPerM + 1) * 0.25);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Context window bonus [0.9, 1.0]. Larger context is more useful,
|
|
29
|
+
// especially for coding where long files are common (the task argument is accepted but not currently used: the tiers are the same for every task).
|
|
30
|
+
function contextBonus(ctx, task) {
|
|
31
|
+
if (!ctx) return 0.9;
|
|
32
|
+
if (ctx >= 128_000) return 1.0;
|
|
33
|
+
if (ctx >= 64_000) return 0.97;
|
|
34
|
+
if (ctx >= 32_000) return 0.93;
|
|
35
|
+
return 0.9;
|
|
25
36
|
}
|
|
26
37
|
|
|
27
|
-
// Score a model for a task.
|
|
38
|
+
// Score a model for a task with optional pricing override.
|
|
28
39
|
// Returns { score, qualityScore } or null if not eligible.
|
|
29
|
-
export function scoreForTask(model, eloEntry, task = 'general') {
|
|
30
|
-
const
|
|
31
|
-
|
|
40
|
+
export function scoreForTask(model, eloEntry, task = 'general', pricing) {
|
|
41
|
+
const { pricingMode, capability } = parseTaskSpec(task, pricing);
|
|
42
|
+
|
|
43
|
+
// Hard filter: vision requires vision capability
|
|
44
|
+
if (capability === 'vision' && !supportsFeature(model, 'vision')) return null;
|
|
32
45
|
if (!eloEntry?.elo) return null;
|
|
33
46
|
|
|
34
47
|
const quality = normaliseElo(eloEntry.elo);
|
|
35
48
|
const price = pricePerMillion(model);
|
|
36
|
-
const
|
|
37
|
-
|
|
38
|
-
|
|
49
|
+
const ctx = contextLength(model);
|
|
50
|
+
|
|
51
|
+
// Price penalty: premium ignores price, cheap uses steep curve, others standard
|
|
52
|
+
let penalty;
|
|
53
|
+
if (pricingMode === 'premium') {
|
|
54
|
+
penalty = 1.0;
|
|
55
|
+
} else if (pricingMode === 'cheap') {
|
|
56
|
+
penalty = cheapPenalty(price.output);
|
|
57
|
+
} else {
|
|
58
|
+
penalty = pricePenalty(price.output);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
const ctxB = contextBonus(ctx, task);
|
|
62
|
+
|
|
63
|
+
// Soft penalty for coding without tools (still eligible, just less ideal)
|
|
64
|
+
const capPenalty = (task === 'coding' && !supportsFeature(model, 'tools')) ? 0.85 : 1.0;
|
|
65
|
+
|
|
66
|
+
const rawScore = quality * penalty * ctxB * capPenalty;
|
|
39
67
|
|
|
40
68
|
return {
|
|
41
|
-
score: Math.round(
|
|
69
|
+
score: Math.round(Math.min(100, rawScore) * 10) / 10,
|
|
42
70
|
qualityScore: Math.round(quality * 10) / 10,
|
|
43
71
|
};
|
|
44
72
|
}
|
|
45
73
|
|
|
46
|
-
|
|
74
|
+
// Parse task into { eloCategory, pricingMode, capability }.
|
|
75
|
+
// This decouples the ELO category from the price penalty strategy.
|
|
76
|
+
// task='coding' → coding ELO, standard pricing
|
|
77
|
+
// task='coding', pricing='premium' → coding ELO, no price penalty
|
|
78
|
+
const TASK_ELO = { coding: 'coding', reasoning: 'math', vision: null, general: null, cheap: null, premium: null };
|
|
79
|
+
const TASK_CAP = { vision: 'vision', coding: 'tools' };
|
|
80
|
+
const PRICING_MODES = new Set(['standard', 'cheap', 'premium']);
|
|
81
|
+
|
|
82
|
+
function parseTaskSpec(task, pricing) {
|
|
83
|
+
const eloCategory = TASK_ELO[task] ?? 'overall';
|
|
84
|
+
const capability = TASK_CAP[task] ?? null;
|
|
85
|
+
let pricingMode = pricing ?? 'standard';
|
|
86
|
+
// Legacy: 'cheap' and 'premium' as task names force the pricing mode, taking precedence over an explicit pricing argument
|
|
87
|
+
if (task === 'cheap') pricingMode = 'cheap';
|
|
88
|
+
else if (task === 'premium') pricingMode = 'premium';
|
|
89
|
+
if (!PRICING_MODES.has(pricingMode)) pricingMode = 'standard';
|
|
90
|
+
return { eloCategory, pricingMode, capability };
|
|
91
|
+
}
|
|
47
92
|
|
|
48
|
-
export function rankModels(models, allElo, { task = 'general', maxPricePerMOutput, limit = 5 } = {}) {
|
|
49
|
-
|
|
50
|
-
// or a plain array (legacy). Select the right category for this task.
|
|
51
|
-
const category = CATEGORY_FOR_TASK[task] ?? 'overall';
|
|
93
|
+
export function rankModels(models, allElo, { task = 'general', pricing, maxPricePerMOutput, limit = 5 } = {}) {
|
|
94
|
+
const { eloCategory } = parseTaskSpec(task, pricing);
|
|
52
95
|
const entries = Array.isArray(allElo)
|
|
53
96
|
? allElo
|
|
54
|
-
: (allElo[
|
|
97
|
+
: (allElo[eloCategory] ?? allElo.overall ?? []);
|
|
55
98
|
|
|
56
99
|
const scored = [];
|
|
57
100
|
|
|
@@ -60,7 +103,7 @@ export function rankModels(models, allElo, { task = 'general', maxPricePerMOutpu
|
|
|
60
103
|
? entries.find((e) => _matchName(e.lmarenaName, model.id))
|
|
61
104
|
: null;
|
|
62
105
|
|
|
63
|
-
const result = scoreForTask(model, eloEntry, task);
|
|
106
|
+
const result = scoreForTask(model, eloEntry, task, pricing);
|
|
64
107
|
if (!result) continue;
|
|
65
108
|
|
|
66
109
|
const price = pricePerMillion(model);
|
|
@@ -69,7 +112,25 @@ export function rankModels(models, allElo, { task = 'general', maxPricePerMOutpu
|
|
|
69
112
|
scored.push({ model, score: result.score, qualityScore: result.qualityScore, eloEntry });
|
|
70
113
|
}
|
|
71
114
|
|
|
72
|
-
|
|
115
|
+
// Dedup 1: :free variants — keep highest-scoring variant per base model
|
|
116
|
+
const byBase = new Map();
|
|
117
|
+
for (const entry of scored) {
|
|
118
|
+
const baseId = entry.model.id.replace(/:free$/, '');
|
|
119
|
+
const prev = byBase.get(baseId);
|
|
120
|
+
if (!prev || entry.score > prev.score) byBase.set(baseId, entry);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
// Dedup 2: same ELO entry — multiple OR models can match one LMArena name
|
|
124
|
+
// (e.g. gpt-5.4-nano and gpt-5.4 both match "gpt-5.4-high").
|
|
125
|
+
// Keep only the best-scoring OR model per ELO entry.
|
|
126
|
+
const byElo = new Map();
|
|
127
|
+
for (const entry of byBase.values()) {
|
|
128
|
+
const eloKey = entry.eloEntry?.lmarenaName ?? entry.model.id;
|
|
129
|
+
const prev = byElo.get(eloKey);
|
|
130
|
+
if (!prev || entry.score > prev.score) byElo.set(eloKey, entry);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
return [...byElo.values()].sort((a, b) => b.score - a.score).slice(0, limit);
|
|
73
134
|
}
|
|
74
135
|
|
|
75
136
|
// Inline minimal name matching (mirrors lmarena.mjs logic without importing it)
|
package/mcp/server.mjs
CHANGED
|
@@ -147,9 +147,14 @@ const CANONICAL_TOOLS = [
|
|
|
147
147
|
properties: {
|
|
148
148
|
task: {
|
|
149
149
|
type: 'string',
|
|
150
|
-
enum: ['coding', 'reasoning', 'general', 'vision', 'cheap'],
|
|
150
|
+
enum: ['coding', 'reasoning', 'general', 'vision', 'cheap', 'premium'],
|
|
151
151
|
description: 'Task type to optimise for',
|
|
152
152
|
},
|
|
153
|
+
pricing: {
|
|
154
|
+
type: 'string',
|
|
155
|
+
enum: ['standard', 'cheap', 'premium'],
|
|
156
|
+
description: 'Price scoring override. Set to "premium" with task="coding" for best coding model regardless of price',
|
|
157
|
+
},
|
|
153
158
|
max_price_per_m_output: {
|
|
154
159
|
type: 'number',
|
|
155
160
|
description: 'Maximum price per 1M output tokens in USD (e.g. 1.0)',
|
|
@@ -307,11 +312,12 @@ async function handleTool(name, args) {
|
|
|
307
312
|
|
|
308
313
|
if (name === 'models.top') {
|
|
309
314
|
const task = args.task ?? 'general';
|
|
315
|
+
const pricing = args.pricing ?? undefined;
|
|
310
316
|
const limit = Math.min(20, Math.max(1, args.limit ?? 5));
|
|
311
317
|
const maxPrice = args.max_price_per_m_output ?? undefined;
|
|
312
318
|
|
|
313
319
|
const [models, allElo] = await Promise.all([fetchModels({ apiKey: key }), getAllElo()]);
|
|
314
|
-
const ranked = rankModels(models, allElo, { task, maxPricePerMOutput: maxPrice, limit });
|
|
320
|
+
const ranked = rankModels(models, allElo, { task, pricing, maxPricePerMOutput: maxPrice, limit });
|
|
315
321
|
return result({ task, results: ranked.map((r) => ({ ...safeModelSummary(r.model), score: r.score, lmarena_elo: r.eloEntry })) });
|
|
316
322
|
}
|
|
317
323
|
|