pi-model-auto 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +164 -0
- package/package.json +42 -0
- package/src/canonical-models.ts +141 -0
- package/src/index.ts +611 -0
- package/src/quota.ts +306 -0
- package/src/router-core.ts +916 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Zack Wong
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# pi-model-auto
|
|
2
|
+
|
|
3
|
+
`pi-model-auto` adds one Pi model: **Pi Router (Auto)**.
|
|
4
|
+
|
|
5
|
+
Choose it once with `/model`. After that, Pi keeps using the router, and the router chooses one of your authenticated models for each turn.
|
|
6
|
+
|
|
7
|
+
## Start
|
|
8
|
+
|
|
9
|
+
Install from npm:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pi install npm:pi-model-auto
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Or install from git (pin a release with `@vX.Y.Z`, or omit it to track the default branch):
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pi install git:github.com/maynewong/pi-model-auto@v0.1.0
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Update later with `pi update --extensions`.
|
|
22
|
+
|
|
23
|
+
To try it once without installing, point `-e` at a local checkout:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pi -e /path/to/pi-model-auto
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Then:
|
|
30
|
+
|
|
31
|
+
1. Run `/model`.
|
|
32
|
+
2. Choose **Pi Router (Auto)**.
|
|
33
|
+
3. Use Pi normally.
|
|
34
|
+
|
|
35
|
+
No config is required at first. If the router says no authenticated models are available, run `/login` for the providers you want to use, then reload Pi.
|
|
36
|
+
|
|
37
|
+
## Use
|
|
38
|
+
|
|
39
|
+
Check what the router sees:
|
|
40
|
+
|
|
41
|
+
```text
|
|
42
|
+
/auto
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Force the model choice for a single turn when you know what you want:
|
|
46
|
+
|
|
47
|
+
```text
|
|
48
|
+
@cheap update the README
|
|
49
|
+
@strong debug this race condition
|
|
50
|
+
@model:anthropic/claude-3-5-sonnet-20241022 use Sonnet here
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
- `@cheap` asks for the cheap end of the current pool.
|
|
54
|
+
- `@strong` asks for the strong end.
|
|
55
|
+
- `@model:provider/model-id` uses that exact model.
|
|
56
|
+
|
|
57
|
+
The prefix is removed before the model sees your prompt.
|
|
58
|
+
|
|
59
|
+
## Configure When Needed
|
|
60
|
+
|
|
61
|
+
Most people only need config for two reasons:
|
|
62
|
+
|
|
63
|
+
1. Limit the pool to providers they trust.
|
|
64
|
+
2. Tell the router what each model really costs them.
|
|
65
|
+
|
|
66
|
+
Config can live in either of these files:
|
|
67
|
+
|
|
68
|
+
- `~/.pi/agent/model-router.json`
|
|
69
|
+
- `.pi/model-router.json` in trusted projects
|
|
70
|
+
|
|
71
|
+
Project config overrides user config.
|
|
72
|
+
|
|
73
|
+
```jsonc
|
|
74
|
+
{
|
|
75
|
+
"router": {
|
|
76
|
+
"modelFilter": { "include": ["anthropic", "z-ai"], "exclude": [] },
|
|
77
|
+
"modelOverrides": {
|
|
78
|
+
"anthropic/claude-3-5-sonnet-20241022": { "costCoef": 0.05 },
|
|
79
|
+
"z-ai/glm-5.2": {
|
|
80
|
+
"costCoef": 0.4,
|
|
81
|
+
"costCoefHours": [{ "hours": [14, 18], "factor": 3 }]
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Use provider/model ids exactly as they appear in your Pi registry. The names above are public examples.
|
|
89
|
+
|
|
90
|
+
### `costCoef`
|
|
91
|
+
|
|
92
|
+
`costCoef` multiplies the benchmark cost:
|
|
93
|
+
|
|
94
|
+
- `< 1`: cheaper for you than the benchmark, such as a subscription or discount.
|
|
95
|
+
- `= 1`: roughly benchmark cost.
|
|
96
|
+
- `> 1`: more expensive for you.
|
|
97
|
+
|
|
98
|
+
Avoid setting a limited subscription near zero unless you want it to win almost every turn.
|
|
99
|
+
|
|
100
|
+
### `costCoefHours`
|
|
101
|
+
|
|
102
|
+
Use this when a model is more expensive during certain hours of the day (local time):
|
|
103
|
+
|
|
104
|
+
```jsonc
|
|
105
|
+
"z-ai/glm-5.2": {
|
|
106
|
+
"costCoef": 0.4,
|
|
107
|
+
"costCoefHours": [{ "hours": [14, 18], "factor": 3 }]
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
This means the effective coefficient is `0.4` normally and `1.2` (`0.4 × 3`) from 14:00 through 17:59. Windows are half-open `[start, end)`. `[22, 2]` wraps across midnight.
|
|
112
|
+
|
|
113
|
+
## How It Chooses
|
|
114
|
+
|
|
115
|
+
The router compares quality against your effective cost.
|
|
116
|
+
|
|
117
|
+
Quality comes from one benchmark table. Cost starts from the same table, then applies your `costCoef` and any active time window. The router keeps to the efficient frontier: a model is only worth considering if no other available model is both better and cheaper.
|
|
118
|
+
|
|
119
|
+
`capabilitySource` chooses the benchmark:
|
|
120
|
+
|
|
121
|
+
- `"ramp"` (default): this package's [Ramp SWE-Bench](https://labs.ramp.com/swebench#score-vs-spend) table, using coding-agent resolve rate and measured cost per task. It is a narrow coding-agent slice, not a general model score. The task family follows [SWE-bench](https://arxiv.org/abs/2310.06770).
|
|
122
|
+
- `"aa"`: [Artificial Analysis](https://artificialanalysis.ai/models) model data, using its Intelligence Index and blended price metrics.
|
|
123
|
+
|
|
124
|
+
The numeric tables live in [`src/canonical-models.ts`](src/canonical-models.ts). The two sources are not mixed.
|
|
125
|
+
|
|
126
|
+
One user turn keeps one model, including tool-call continuations. Automatic routing also avoids quota-cooled plans and avoids switching away from a useful warm cache when the switch is not worth it.
|
|
127
|
+
|
|
128
|
+
## Settings
|
|
129
|
+
|
|
130
|
+
| setting | use |
|
|
131
|
+
| --- | --- |
|
|
132
|
+
| `capabilitySource` | Choose `"ramp"` or `"aa"`. |
|
|
133
|
+
| `modelFilter` | Include or exclude providers/models by substring. |
|
|
134
|
+
| `models` | Pin the configured `cheap` or `strong` endpoint. |
|
|
135
|
+
| `modelOverrides` | Adjust cost or metadata for known/private/local models. |
|
|
136
|
+
| `willingness` | Control how far each difficulty climbs toward stronger models. |
|
|
137
|
+
| `cacheAware` | Keep warm prompt caches when switching is not worth it. Enabled by default. |
|
|
138
|
+
| `quota` | Skip cooled-down plans after rate-limit headers or `429`. Enabled by default. |
|
|
139
|
+
| `forceStrongOnHighReasoning` | Send `high` or `xhigh` reasoning to the top of the frontier. |
|
|
140
|
+
| `weights` | Difficulty-scoring weights. Advanced. |
|
|
141
|
+
| `log` | Append routing decisions to `.pi/router.log`. |
|
|
142
|
+
|
|
143
|
+
Useful override fields:
|
|
144
|
+
|
|
145
|
+
| field | meaning |
|
|
146
|
+
| --- | --- |
|
|
147
|
+
| `costCoef` | Cost multiplier. |
|
|
148
|
+
| `costCoefHours` | Local-hour multipliers. |
|
|
149
|
+
| `canonical` | Name shown in `/auto`. |
|
|
150
|
+
| `costTier` | `cheap`, `standard`, `premium`, or `unknown`. |
|
|
151
|
+
| `profiles` | `deep`, `fast`, `coder`, `balanced`, `vision`, `frontier`. |
|
|
152
|
+
| `frontier` | Whether the model can appear in the strong frontier. |
|
|
153
|
+
| `priceBlended`, `intelligence`, `scores`, `tps` | Raw metrics for models without benchmark data. |
|
|
154
|
+
|
|
155
|
+
Quota state is stored at `~/.pi/agent/quota-state.json`. Providers without remaining-quota headers only cool down after a real `429`.
|
|
156
|
+
|
|
157
|
+
## Develop
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
npm run typecheck
|
|
161
|
+
npm test
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Maintainers: see [RELEASING.md](RELEASING.md) for the publish flow.
|
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pi-model-auto",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Pi extension package that routes each turn to cheap or strong authenticated models.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"keywords": ["pi-package", "pi", "extension", "model-router"],
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"homepage": "https://github.com/maynewong/pi-model-auto#readme",
|
|
9
|
+
"repository": {
|
|
10
|
+
"type": "git",
|
|
11
|
+
"url": "git+https://github.com/maynewong/pi-model-auto.git"
|
|
12
|
+
},
|
|
13
|
+
"bugs": {
|
|
14
|
+
"url": "https://github.com/maynewong/pi-model-auto/issues"
|
|
15
|
+
},
|
|
16
|
+
"files": [
|
|
17
|
+
"src/index.ts",
|
|
18
|
+
"src/router-core.ts",
|
|
19
|
+
"src/quota.ts",
|
|
20
|
+
"src/canonical-models.ts",
|
|
21
|
+
"README.md",
|
|
22
|
+
"LICENSE"
|
|
23
|
+
],
|
|
24
|
+
"pi": {
|
|
25
|
+
"extensions": ["./src/index.ts"]
|
|
26
|
+
},
|
|
27
|
+
"peerDependencies": {
|
|
28
|
+
"@earendil-works/pi-ai": "*",
|
|
29
|
+
"@earendil-works/pi-coding-agent": "*"
|
|
30
|
+
},
|
|
31
|
+
"devDependencies": {
|
|
32
|
+
"@types/node": "24.12.4",
|
|
33
|
+
"typescript": "5.9.3",
|
|
34
|
+
"vitest": "4.1.9"
|
|
35
|
+
},
|
|
36
|
+
"scripts": {
|
|
37
|
+
"typecheck": "tsc --noEmit",
|
|
38
|
+
"test": "vitest --run",
|
|
39
|
+
"prepublishOnly": "npm run typecheck && npm test",
|
|
40
|
+
"release": "npm version patch && git push --follow-tags && npm publish"
|
|
41
|
+
}
|
|
42
|
+
}
|
package/src/canonical-models.ts
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
 * Coarse price bucket recorded on each canonical model entry (`costTier`).
 * Per the `CANONICAL_MODELS` table notes in this file it is a display hint and
 * pool-bucketing input, not one of the numeric routing axes.
 */
export type CostTier =
  | "cheap"
  | "standard"
  | "premium"
  | "unknown";
|
|
2
|
+
/**
 * Task-profile tags a canonical model entry can carry in its `profiles` list.
 * Several of them name a capability axis documented elsewhere in this file
 * (`coder`, `deep`, `balanced`, `fast`).
 */
export type ModelProfile =
  | "deep"
  | "fast"
  | "coder"
  | "balanced"
  | "vision"
  | "frontier";
|
|
3
|
+
|
|
4
|
+
/**
 * Optional per-profile capability sub-scores for one canonical model.
 *
 * They select the Pareto axis that fits the task: coding tasks compare on `coding`,
 * agentic/deep tasks on `agentic`, and everything else falls back to the synthetic
 * `intelligence` index. Values are facts (benchmark scores) transcribed offline; only the
 * derived constants for the models we route to are shipped, never a third-party dataset.
 * See docs/routing-redesign.md §6.
 */
export interface CanonicalScores {
  /** coding_index (~0–80). Axis for the `coder` profile. */
  coding?: number;
  /** terminalbench v2 (0–1). Axis for the `deep`/agentic profile; scaled ×100 at use. */
  agentic?: number;
  /** Instruction-following score (ifbench, 0–1). Informational. */
  ifbench?: number;
}
|
|
19
|
+
|
|
20
|
+
/**
 * One row of the curated model table: the numeric capability/price facts for a model plus
 * the coarse labels (`costTier`, `profiles`, `frontier`) attached to it.
 */
export interface CanonicalMeta {
  /** Canonical model key for this row. */
  key: string;
  /** Synthetic intelligence index (~3–60). Axis for the `balanced` profile and the floor scale. */
  intelligence: number;
  /** List price, $/1M tokens, blended 3:1 input:output. NOT marginal/subscription cost. */
  priceBlended: number;
  /** Optional per-profile sub-scores; see `CanonicalScores`. */
  scores?: CanonicalScores;
  /** Output tokens/sec. Axis for the `fast` profile. */
  tps?: number;
  /** Coarse price bucket for this model. */
  costTier: CostTier;
  /** Task-profile tags this model is listed under. */
  profiles: ModelProfile[];
  /** Frontier flag for this model. */
  frontier: boolean;
  /** Provenance of the numeric fields. */
  source?: string;
}
|
|
35
|
+
|
|
36
|
+
/** Provenance label stamped into the `source` field of rows in the canonical model table. */
const AA = "Artificial Analysis, 2026-06";
|
|
37
|
+
|
|
38
|
+
/**
 * Curated frontier + recent-two-generation models across major labs. The numeric fields
 * (intelligence/priceBlended/scores/tps) drive the data-driven Pareto routing in router-core;
 * costTier/profiles/frontier remain as display hints and pool-bucketing only.
 *
 * `key` matches the provider model id (substring, longest-match wins), so dotted vendor naming
 * is intentional. Add models you actually have access to; unmatched keys are simply inert.
 */
export const CANONICAL_MODELS: CanonicalMeta[] = [
  // OpenAI
  { key: "gpt-5.5", intelligence: 53.1, priceBlended: 11.25, scores: { coding: 71.6, agentic: 0.794, ifbench: 0.716 }, tps: 83, costTier: "premium", profiles: ["coder", "balanced", "frontier"], frontier: true, source: AA },
  { key: "gpt-5.4", intelligence: 51.4, priceBlended: 5.625, scores: { coding: 71.1, agentic: 0.783, ifbench: 0.739 }, tps: 168, costTier: "premium", profiles: ["coder", "balanced", "frontier"], frontier: true, source: AA },
  { key: "gpt-5.4-mini", intelligence: 40.0, priceBlended: 1.688, scores: { coding: 56.1, agentic: 0.592, ifbench: 0.733 }, tps: 202, costTier: "standard", profiles: ["coder", "balanced"], frontier: false, source: AA },
  { key: "gpt-5.4-nano", intelligence: 38.2, priceBlended: 0.463, scores: { coding: 56.1, agentic: 0.607, ifbench: 0.759 }, tps: 168, costTier: "cheap", profiles: ["coder", "fast"], frontier: false, source: AA },
  { key: "gpt-5.3-codex-spark", intelligence: 44.3, priceBlended: 4.813, scores: { coding: 62.0, agentic: 0.62, ifbench: 0.754 }, tps: 130, costTier: "standard", profiles: ["coder", "fast"], frontier: false, source: `${AA} (coding/agentic estimated)` },
  { key: "gpt-oss-120b", intelligence: 23.8, priceBlended: 0.262, scores: { coding: 30.4, agentic: 0.262, ifbench: 0.690 }, tps: 347, costTier: "cheap", profiles: ["balanced"], frontier: false, source: AA },
  // Anthropic
  { key: "claude-fable-5", intelligence: 59.9, priceBlended: 20, scores: { coding: 76.5, agentic: 0.846, ifbench: 0.635 }, tps: 70, costTier: "premium", profiles: ["deep", "coder", "balanced", "frontier"], frontier: true, source: AA },
  { key: "claude-opus-4-8", intelligence: 55.7, priceBlended: 10, scores: { coding: 74.3, agentic: 0.846, ifbench: 0.622 }, tps: 70, costTier: "premium", profiles: ["deep", "coder", "balanced", "frontier"], frontier: true, source: AA },
  { key: "claude-opus-4-7", intelligence: 53.5, priceBlended: 10, scores: { coding: 73.6, agentic: 0.831, ifbench: 0.586 }, tps: 58, costTier: "premium", profiles: ["deep", "coder", "balanced"], frontier: false, source: AA },
  { key: "claude-sonnet-4-6", intelligence: 47.2, priceBlended: 6, scores: { coding: 63.0, agentic: 0.712, ifbench: 0.566 }, tps: 69, costTier: "premium", profiles: ["coder", "balanced"], frontier: false, source: AA },
  { key: "claude-4-5-haiku", intelligence: 29.6, priceBlended: 2, scores: { coding: 43.9, agentic: 0.442, ifbench: 0.543 }, tps: 173, costTier: "standard", profiles: ["fast", "balanced"], frontier: false, source: AA },
  // Google
  { key: "gemini-3.5-flash", intelligence: 50.2, priceBlended: 3.375, scores: { coding: 70.1, agentic: 0.787, ifbench: 0.763 }, tps: 247, costTier: "premium", profiles: ["coder", "balanced", "frontier"], frontier: true, source: AA },
  { key: "gemini-3.1-pro", intelligence: 46.5, priceBlended: 4.5, scores: { coding: 68.8, agentic: 0.738, ifbench: 0.771 }, tps: 152, costTier: "premium", profiles: ["deep", "balanced", "frontier"], frontier: true, source: AA },
  { key: "gemini-3.1-flash-lite", intelligence: 25.0, priceBlended: 0.563, scores: { coding: 34.7, agentic: 0.311, ifbench: 0.772 }, tps: 355, costTier: "cheap", profiles: ["fast", "balanced"], frontier: false, source: AA },
  // DeepSeek
  { key: "deepseek-v4-pro", intelligence: 44.3, priceBlended: 0.544, scores: { coding: 59.4, agentic: 0.640, ifbench: 0.765 }, tps: 104, costTier: "standard", profiles: ["deep", "balanced"], frontier: false, source: AA },
  { key: "deepseek-v4-flash", intelligence: 40.3, priceBlended: 0.175, scores: { coding: 56.2, agentic: 0.618, ifbench: 0.792 }, tps: 113, costTier: "cheap", profiles: ["fast", "balanced"], frontier: false, source: AA },
  // xAI
  { key: "grok-4.3", intelligence: 37.6, priceBlended: 1.563, scores: { coding: 42.2, agentic: 0.397, ifbench: 0.813 }, tps: 185, costTier: "standard", profiles: ["balanced"], frontier: false, source: AA },
  // Alibaba Qwen
  { key: "qwen3.7-max", intelligence: 46.0, priceBlended: 3.75, scores: { coding: 66.0, agentic: 0.745, ifbench: 0.805 }, tps: 203, costTier: "premium", profiles: ["coder", "balanced", "frontier"], frontier: true, source: AA },
  { key: "qwen3.7-plus", intelligence: 39.0, priceBlended: 0.59, scores: { coding: 55.9, agentic: 0.610, ifbench: 0.780 }, tps: 51, costTier: "standard", profiles: ["coder", "balanced"], frontier: false, source: AA },
  { key: "qwen3.6-plus", intelligence: 39.6, priceBlended: 1.125, scores: { coding: 54.5, agentic: 0.614, ifbench: 0.752 }, tps: 52, costTier: "standard", profiles: ["coder", "balanced"], frontier: false, source: AA },
  // Z AI GLM
  // NOTE(review): glm-5.2 is the only `frontier: true` row whose `profiles` lacks "frontier" — confirm intended.
  { key: "glm-5.2", intelligence: 51.1, priceBlended: 2.15, scores: { coding: 68.8, agentic: 0.779, ifbench: 0.733 }, tps: 104, costTier: "premium", profiles: ["deep", "balanced"], frontier: true, source: AA },
  { key: "glm-5.1", intelligence: 40.2, priceBlended: 2.15, scores: { coding: 55.8, agentic: 0.618, ifbench: 0.763 }, tps: 105, costTier: "standard", profiles: ["deep", "balanced"], frontier: false, source: AA },
  // Kimi (Moonshot)
  { key: "kimi-k2.7-code-highspeed", intelligence: 41.9, priceBlended: 1.712, scores: { coding: 60.8, agentic: 0.674, ifbench: 0.631 }, tps: 180, costTier: "premium", profiles: ["fast", "coder"], frontier: false, source: `${AA} (highspeed serving of kimi-k2.7-code)` },
  { key: "kimi-k2.7-code", intelligence: 41.9, priceBlended: 1.712, scores: { coding: 60.8, agentic: 0.674, ifbench: 0.631 }, tps: 53, costTier: "standard", profiles: ["coder", "balanced"], frontier: false, source: AA },
  { key: "kimi-k2.6", intelligence: 42.8, priceBlended: 1.712, scores: { coding: 56.0, agentic: 0.573, ifbench: 0.760 }, tps: 61, costTier: "standard", profiles: ["coder", "balanced"], frontier: false, source: AA },
  // MiniMax
  { key: "minimax-m3", intelligence: 44.4, priceBlended: 0.525, scores: { coding: 58.6, agentic: 0.652, ifbench: 0.829 }, tps: 56, costTier: "standard", profiles: ["balanced", "coder"], frontier: false, source: AA },
  { key: "minimax-m2.7", intelligence: 38.1, priceBlended: 0.525, scores: { coding: 52.6, agentic: 0.554, ifbench: 0.757 }, tps: 48, costTier: "standard", profiles: ["coder", "balanced"], frontier: false, source: AA },
  // Xiaomi MiMo
  { key: "mimo-v2.5-pro", intelligence: 42.2, priceBlended: 0.544, scores: { coding: 60.2, agentic: 0.652, ifbench: 0.799 }, tps: 51, costTier: "standard", profiles: ["coder", "balanced"], frontier: false, source: AA },
  // Meta
  // NOTE(review): muse-spark has priceBlended 0 — confirm consumers of the cost axis tolerate a zero price.
  { key: "muse-spark", intelligence: 43.1, priceBlended: 0, scores: { coding: 58.6, agentic: 0.622, ifbench: 0.759 }, tps: 60, costTier: "standard", profiles: ["deep", "coder", "balanced"], frontier: false, source: AA },
  { key: "llama-4-maverick", intelligence: 14.3, priceBlended: 0.475, scores: { coding: 16.3, agentic: 0.0787, ifbench: 0.430 }, tps: 122, costTier: "cheap", profiles: ["fast", "balanced"], frontier: false, source: AA },
  // NVIDIA
  { key: "nemotron-3-ultra", intelligence: 37.8, priceBlended: 1.175, scores: { coding: 49.3, agentic: 0.539, ifbench: 0.814 }, tps: 148, costTier: "standard", profiles: ["balanced"], frontier: false, source: AA },
];
|
|
91
|
+
|
|
92
|
+
/**
 * Per-model results from the Ramp SWE-Bench run (mini-swe-agent harness, 80 tasks): real agentic
 * resolve-rate and measured per-task cost (API list pricing, prompt-cache included). This is a
 * SEPARATE source from the Artificial Analysis numbers — the router consumes one or the other
 * (`capabilitySource`), never a merge: mixing real outcomes and synthetic scores on one scale is
 * meaningless. Keyed by canonical model key. A model without a Ramp result is therefore not
 * auto-routed when `capabilitySource` is `ramp` (reach it via `modelOverrides` or a forced route).
 *
 * Caveat: a single harness/run, reasoning fixed at high/xhigh, billed at API list (no subscription),
 * no vision and no throughput metric. It is a real-task coding slice, not a universal capability score.
 */
export interface RampMeta {
  /** Canonical model key; matches a `CanonicalMeta.key`. */
  key: string;
  /** SWE-bench resolve rate, 0–100. The capability axis for every profile under the `ramp` source. */
  resolveRate: number;
  /** Mean measured cost per task, USD (API list pricing). The Pareto cost axis under `ramp`. */
  costPerTask: number;
  /**
   * Mean tool-call turns to complete. Informational (shown in `/router`).
   * NOTE(review): the README documents the status command as `/auto` — confirm which name is current.
   */
  turns: number;
  /** Provenance of the numbers in this row. */
  source: string;
}
|
|
114
|
+
|
|
115
|
+
/** Provenance label stamped into the `source` field of every Ramp SWE-Bench row. */
const RAMP = "Ramp SWE-Bench (mini-swe-agent), 2026-06";
|
|
116
|
+
|
|
117
|
+
/**
 * Ramp SWE-Bench results, one row per canonical model key: resolve rate (0–100), mean measured
 * cost per task in USD, and mean tool-call turns. Rows are listed in descending resolve rate.
 */
export const RAMP_MODELS: RampMeta[] = [
  { key: "claude-fable-5", resolveRate: 87.5, costPerTask: 2.66, turns: 48, source: RAMP },
  { key: "claude-opus-4-7", resolveRate: 83.8, costPerTask: 2.24, turns: 71, source: RAMP },
  { key: "gpt-5.5", resolveRate: 83.8, costPerTask: 1.81, turns: 52, source: RAMP },
  { key: "glm-5.2", resolveRate: 80.0, costPerTask: 1.88, turns: 98, source: RAMP },
  { key: "kimi-k2.7-code", resolveRate: 78.8, costPerTask: 0.89, turns: 77, source: RAMP },
  { key: "claude-opus-4-8", resolveRate: 77.5, costPerTask: 1.09, turns: 39, source: RAMP },
  { key: "gemini-3.1-pro", resolveRate: 73.8, costPerTask: 1.03, turns: 55, source: RAMP },
  { key: "claude-sonnet-4-6", resolveRate: 72.5, costPerTask: 0.79, turns: 49, source: RAMP },
  { key: "gpt-5.4", resolveRate: 72.5, costPerTask: 0.66, turns: 28, source: RAMP },
  { key: "kimi-k2.6", resolveRate: 72.5, costPerTask: 0.69, turns: 81, source: RAMP },
  { key: "glm-5.1", resolveRate: 71.2, costPerTask: 1.08, turns: 78, source: RAMP },
  { key: "deepseek-v4-pro", resolveRate: 65.0, costPerTask: 0.80, turns: 55, source: RAMP },
  { key: "qwen3.6-plus", resolveRate: 65.0, costPerTask: 0.29, turns: 107, source: RAMP },
  { key: "qwen3.7-plus", resolveRate: 61.3, costPerTask: 0.16, turns: 54, source: RAMP },
  { key: "gpt-5.4-mini", resolveRate: 58.8, costPerTask: 0.23, turns: 29, source: RAMP },
  { key: "claude-4-5-haiku", resolveRate: 50.0, costPerTask: 0.51, turns: 72, source: RAMP },
  { key: "gpt-5.4-nano", resolveRate: 48.8, costPerTask: 0.09, turns: 54, source: RAMP },
];
|
|
136
|
+
|
|
137
|
+
/** Index of `RAMP_MODELS` by `key`; if a key were listed twice, the later row would win. */
const RAMP_BY_KEY = new Map<string, RampMeta>();
for (const row of RAMP_MODELS) {
  RAMP_BY_KEY.set(row.key, row);
}
|
|
138
|
+
|
|
139
|
+
/**
 * Looks up the Ramp SWE-Bench row for a canonical model key.
 *
 * @param canonicalKey Canonical model key; may be `null`, `undefined`, or empty.
 * @returns The matching `RampMeta`, or `undefined` when the key is falsy or has no Ramp row.
 */
export function findRampModel(canonicalKey: string | null | undefined): RampMeta | undefined {
  // Falsy keys (null, undefined, "") never match a row, so skip the lookup.
  if (!canonicalKey) {
    return undefined;
  }
  return RAMP_BY_KEY.get(canonicalKey);
}
|