pi-free 2.0.13 → 2.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/README.md +9 -5
- package/config.ts +15 -0
- package/constants.ts +3 -0
- package/index.ts +135 -0
- package/lib/built-in-toggle.ts +4 -4
- package/lib/probe-cache.ts +86 -0
- package/lib/provider-compat.ts +33 -0
- package/lib/registry.ts +25 -3
- package/lib/telemetry.ts +328 -0
- package/lib/util.ts +10 -1
- package/package.json +1 -1
- package/provider-failover/benchmark-lookup.ts +94 -8
- package/provider-failover/benchmarks-chunk-0.ts +599 -890
- package/provider-failover/benchmarks-chunk-1.ts +655 -924
- package/provider-failover/benchmarks-chunk-2.ts +675 -966
- package/provider-failover/benchmarks-chunk-3.ts +676 -967
- package/provider-failover/benchmarks-chunk-4.ts +704 -954
- package/provider-failover/benchmarks-chunk-5.ts +1301 -0
- package/provider-failover/hardcoded-benchmarks.ts +9 -3
- package/providers/cline/cline-models.ts +200 -68
- package/providers/cline/cline.ts +3 -3
- package/providers/dynamic-built-in/index.ts +1 -1
- package/providers/kilo/kilo.ts +2 -2
- package/providers/model-fetcher.ts +3 -1
- package/providers/nvidia/nvidia.ts +54 -16
- package/providers/ollama/ollama.ts +103 -46
- package/providers/opencode-session.ts +398 -371
- package/providers/qwen/qwen.ts +2 -2
- package/providers/routeway/routeway.ts +391 -0
|
@@ -1,23 +1,20 @@
|
|
|
1
1
|
// Auto-generated benchmark data chunk 3
|
|
2
|
-
// Models:
|
|
2
|
+
// Models: llama-3.1-instruct-70b .. magistral-medium-1 (90 entries)
|
|
3
|
+
// Last updated: 2026-06-01
|
|
3
4
|
// DO NOT EDIT MANUALLY — generated by scripts/update-benchmarks.ts
|
|
4
5
|
|
|
5
6
|
import type { HardcodedBenchmark } from "./hardcoded-benchmarks.ts";
|
|
6
7
|
|
|
7
8
|
export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
8
|
-
"
|
|
9
|
-
// AA Intelligence Index (composite score)
|
|
10
|
-
intelligenceIndex: 29.5,
|
|
11
|
-
normalizedScore: 42,
|
|
12
|
-
|
|
9
|
+
"llama-3.1-instruct-70b": {
|
|
13
10
|
// AA specific benchmarks
|
|
14
|
-
codingIndex:
|
|
15
|
-
mathIndex:
|
|
11
|
+
codingIndex: 10.9,
|
|
12
|
+
mathIndex: 4,
|
|
16
13
|
|
|
17
14
|
// Academic benchmarks
|
|
18
|
-
mmluPro: 0.
|
|
19
|
-
gpqa: 0.
|
|
20
|
-
hle: 0.
|
|
15
|
+
mmluPro: 0.676,
|
|
16
|
+
gpqa: 0.409,
|
|
17
|
+
hle: 0.046,
|
|
21
18
|
|
|
22
19
|
// Capabilities
|
|
23
20
|
contextWindow: 8192,
|
|
@@ -25,21 +22,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
25
22
|
supportsVision: false,
|
|
26
23
|
|
|
27
24
|
// Metadata
|
|
28
|
-
lastUpdated: "2026-
|
|
25
|
+
lastUpdated: "2026-06-01",
|
|
26
|
+
originalModel: "Llama 3.1 Instruct 70B",
|
|
29
27
|
},
|
|
30
|
-
"
|
|
31
|
-
// AA Intelligence Index (composite score)
|
|
32
|
-
intelligenceIndex: 17.8,
|
|
33
|
-
normalizedScore: 25,
|
|
34
|
-
|
|
28
|
+
"llama-3.1-instruct-8b": {
|
|
35
29
|
// AA specific benchmarks
|
|
36
|
-
codingIndex:
|
|
37
|
-
mathIndex:
|
|
30
|
+
codingIndex: 4.9,
|
|
31
|
+
mathIndex: 4.3,
|
|
38
32
|
|
|
39
33
|
// Academic benchmarks
|
|
40
|
-
mmluPro: 0.
|
|
41
|
-
gpqa: 0.
|
|
42
|
-
hle: 0.
|
|
34
|
+
mmluPro: 0.476,
|
|
35
|
+
gpqa: 0.259,
|
|
36
|
+
hle: 0.051,
|
|
43
37
|
|
|
44
38
|
// Capabilities
|
|
45
39
|
contextWindow: 8192,
|
|
@@ -47,21 +41,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
47
41
|
supportsVision: false,
|
|
48
42
|
|
|
49
43
|
// Metadata
|
|
50
|
-
lastUpdated: "2026-
|
|
44
|
+
lastUpdated: "2026-06-01",
|
|
45
|
+
originalModel: "Llama 3.1 Instruct 8B",
|
|
51
46
|
},
|
|
52
|
-
"
|
|
53
|
-
// AA Intelligence Index (composite score)
|
|
54
|
-
intelligenceIndex: 12,
|
|
55
|
-
normalizedScore: 17,
|
|
56
|
-
|
|
47
|
+
"llama-3.2-instruct-3b": {
|
|
57
48
|
// AA specific benchmarks
|
|
58
|
-
codingIndex:
|
|
59
|
-
mathIndex:
|
|
49
|
+
codingIndex: undefined,
|
|
50
|
+
mathIndex: 3.3,
|
|
60
51
|
|
|
61
52
|
// Academic benchmarks
|
|
62
|
-
mmluPro: 0.
|
|
63
|
-
gpqa: 0.
|
|
64
|
-
hle: 0.
|
|
53
|
+
mmluPro: 0.347,
|
|
54
|
+
gpqa: 0.255,
|
|
55
|
+
hle: 0.052,
|
|
65
56
|
|
|
66
57
|
// Capabilities
|
|
67
58
|
contextWindow: 8192,
|
|
@@ -69,21 +60,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
69
60
|
supportsVision: false,
|
|
70
61
|
|
|
71
62
|
// Metadata
|
|
72
|
-
lastUpdated: "2026-
|
|
63
|
+
lastUpdated: "2026-06-01",
|
|
64
|
+
originalModel: "Llama 3.2 Instruct 3B",
|
|
73
65
|
},
|
|
74
|
-
"
|
|
75
|
-
// AA Intelligence Index (composite score)
|
|
76
|
-
intelligenceIndex: 20.6,
|
|
77
|
-
normalizedScore: 29,
|
|
78
|
-
|
|
66
|
+
"llama-3-instruct-70b": {
|
|
79
67
|
// AA specific benchmarks
|
|
80
|
-
codingIndex:
|
|
81
|
-
mathIndex:
|
|
68
|
+
codingIndex: 6.8,
|
|
69
|
+
mathIndex: undefined,
|
|
82
70
|
|
|
83
71
|
// Academic benchmarks
|
|
84
|
-
mmluPro: 0.
|
|
85
|
-
gpqa: 0.
|
|
86
|
-
hle: 0.
|
|
72
|
+
mmluPro: 0.574,
|
|
73
|
+
gpqa: 0.379,
|
|
74
|
+
hle: 0.044,
|
|
87
75
|
|
|
88
76
|
// Capabilities
|
|
89
77
|
contextWindow: 8192,
|
|
@@ -91,21 +79,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
91
79
|
supportsVision: false,
|
|
92
80
|
|
|
93
81
|
// Metadata
|
|
94
|
-
lastUpdated: "2026-
|
|
82
|
+
lastUpdated: "2026-06-01",
|
|
83
|
+
originalModel: "Llama 3 Instruct 70B",
|
|
95
84
|
},
|
|
96
|
-
"
|
|
97
|
-
// AA Intelligence Index (composite score)
|
|
98
|
-
intelligenceIndex: 27,
|
|
99
|
-
normalizedScore: 39,
|
|
100
|
-
|
|
85
|
+
"llama-3-instruct-8b": {
|
|
101
86
|
// AA specific benchmarks
|
|
102
|
-
codingIndex:
|
|
103
|
-
mathIndex:
|
|
87
|
+
codingIndex: 4,
|
|
88
|
+
mathIndex: undefined,
|
|
104
89
|
|
|
105
90
|
// Academic benchmarks
|
|
106
|
-
mmluPro: 0.
|
|
107
|
-
gpqa: 0.
|
|
108
|
-
hle: 0.
|
|
91
|
+
mmluPro: 0.405,
|
|
92
|
+
gpqa: 0.296,
|
|
93
|
+
hle: 0.051,
|
|
109
94
|
|
|
110
95
|
// Capabilities
|
|
111
96
|
contextWindow: 8192,
|
|
@@ -113,21 +98,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
113
98
|
supportsVision: false,
|
|
114
99
|
|
|
115
100
|
// Metadata
|
|
116
|
-
lastUpdated: "2026-
|
|
101
|
+
lastUpdated: "2026-06-01",
|
|
102
|
+
originalModel: "Llama 3 Instruct 8B",
|
|
117
103
|
},
|
|
118
|
-
"
|
|
119
|
-
// AA Intelligence Index (composite score)
|
|
120
|
-
intelligenceIndex: 25.7,
|
|
121
|
-
normalizedScore: 37,
|
|
122
|
-
|
|
104
|
+
"llama-3.2-instruct-1b": {
|
|
123
105
|
// AA specific benchmarks
|
|
124
|
-
codingIndex:
|
|
125
|
-
mathIndex:
|
|
106
|
+
codingIndex: 0.6,
|
|
107
|
+
mathIndex: 0,
|
|
126
108
|
|
|
127
109
|
// Academic benchmarks
|
|
128
|
-
mmluPro: 0.
|
|
129
|
-
gpqa: 0.
|
|
130
|
-
hle: 0.
|
|
110
|
+
mmluPro: 0.2,
|
|
111
|
+
gpqa: 0.196,
|
|
112
|
+
hle: 0.053,
|
|
131
113
|
|
|
132
114
|
// Capabilities
|
|
133
115
|
contextWindow: 8192,
|
|
@@ -135,21 +117,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
135
117
|
supportsVision: false,
|
|
136
118
|
|
|
137
119
|
// Metadata
|
|
138
|
-
lastUpdated: "2026-
|
|
120
|
+
lastUpdated: "2026-06-01",
|
|
121
|
+
originalModel: "Llama 3.2 Instruct 1B",
|
|
139
122
|
},
|
|
140
|
-
"
|
|
141
|
-
// AA Intelligence Index (composite score)
|
|
142
|
-
intelligenceIndex: 10.1,
|
|
143
|
-
normalizedScore: 14,
|
|
144
|
-
|
|
123
|
+
"llama-2-chat-70b": {
|
|
145
124
|
// AA specific benchmarks
|
|
146
125
|
codingIndex: undefined,
|
|
147
126
|
mathIndex: undefined,
|
|
148
127
|
|
|
149
128
|
// Academic benchmarks
|
|
150
|
-
mmluPro: 0.
|
|
151
|
-
gpqa: 0.
|
|
152
|
-
hle: 0.
|
|
129
|
+
mmluPro: 0.406,
|
|
130
|
+
gpqa: 0.327,
|
|
131
|
+
hle: 0.05,
|
|
153
132
|
|
|
154
133
|
// Capabilities
|
|
155
134
|
contextWindow: 8192,
|
|
@@ -157,21 +136,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
157
136
|
supportsVision: false,
|
|
158
137
|
|
|
159
138
|
// Metadata
|
|
160
|
-
lastUpdated: "2026-
|
|
139
|
+
lastUpdated: "2026-06-01",
|
|
140
|
+
originalModel: "Llama 2 Chat 70B",
|
|
161
141
|
},
|
|
162
|
-
"
|
|
163
|
-
// AA Intelligence Index (composite score)
|
|
164
|
-
intelligenceIndex: 10.5,
|
|
165
|
-
normalizedScore: 15,
|
|
166
|
-
|
|
142
|
+
"llama-2-chat-7b": {
|
|
167
143
|
// AA specific benchmarks
|
|
168
144
|
codingIndex: undefined,
|
|
169
145
|
mathIndex: undefined,
|
|
170
146
|
|
|
171
147
|
// Academic benchmarks
|
|
172
|
-
mmluPro: 0.
|
|
173
|
-
gpqa: 0.
|
|
174
|
-
hle: 0.
|
|
148
|
+
mmluPro: 0.164,
|
|
149
|
+
gpqa: 0.227,
|
|
150
|
+
hle: 0.058,
|
|
175
151
|
|
|
176
152
|
// Capabilities
|
|
177
153
|
contextWindow: 8192,
|
|
@@ -179,21 +155,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
179
155
|
supportsVision: false,
|
|
180
156
|
|
|
181
157
|
// Metadata
|
|
182
|
-
lastUpdated: "2026-
|
|
158
|
+
lastUpdated: "2026-06-01",
|
|
159
|
+
originalModel: "Llama 2 Chat 7B",
|
|
183
160
|
},
|
|
184
|
-
"
|
|
185
|
-
// AA Intelligence Index (composite score)
|
|
186
|
-
intelligenceIndex: 17.6,
|
|
187
|
-
normalizedScore: 25,
|
|
188
|
-
|
|
161
|
+
"llama-2-chat-13b": {
|
|
189
162
|
// AA specific benchmarks
|
|
190
|
-
codingIndex:
|
|
191
|
-
mathIndex:
|
|
163
|
+
codingIndex: undefined,
|
|
164
|
+
mathIndex: undefined,
|
|
192
165
|
|
|
193
166
|
// Academic benchmarks
|
|
194
|
-
mmluPro: 0.
|
|
195
|
-
gpqa: 0.
|
|
196
|
-
hle: 0.
|
|
167
|
+
mmluPro: 0.406,
|
|
168
|
+
gpqa: 0.321,
|
|
169
|
+
hle: 0.047,
|
|
197
170
|
|
|
198
171
|
// Capabilities
|
|
199
172
|
contextWindow: 8192,
|
|
@@ -201,21 +174,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
201
174
|
supportsVision: false,
|
|
202
175
|
|
|
203
176
|
// Metadata
|
|
204
|
-
lastUpdated: "2026-
|
|
177
|
+
lastUpdated: "2026-06-01",
|
|
178
|
+
originalModel: "Llama 2 Chat 13B",
|
|
205
179
|
},
|
|
206
|
-
"gemini-2-
|
|
207
|
-
// AA Intelligence Index (composite score)
|
|
208
|
-
intelligenceIndex: 14.7,
|
|
209
|
-
normalizedScore: 21,
|
|
210
|
-
|
|
180
|
+
"gemini-2.0-pro-experimental-feb-25": {
|
|
211
181
|
// AA specific benchmarks
|
|
212
|
-
codingIndex:
|
|
182
|
+
codingIndex: 25.5,
|
|
213
183
|
mathIndex: undefined,
|
|
214
184
|
|
|
215
185
|
// Academic benchmarks
|
|
216
|
-
mmluPro: 0.
|
|
217
|
-
gpqa: 0.
|
|
218
|
-
hle: 0.
|
|
186
|
+
mmluPro: 0.805,
|
|
187
|
+
gpqa: 0.622,
|
|
188
|
+
hle: 0.068,
|
|
219
189
|
|
|
220
190
|
// Capabilities
|
|
221
191
|
contextWindow: 8192,
|
|
@@ -223,21 +193,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
223
193
|
supportsVision: false,
|
|
224
194
|
|
|
225
195
|
// Metadata
|
|
226
|
-
lastUpdated: "2026-
|
|
196
|
+
lastUpdated: "2026-06-01",
|
|
197
|
+
originalModel: "Gemini 2.0 Pro Experimental (Feb '25)",
|
|
227
198
|
},
|
|
228
|
-
"gemini-2.
|
|
229
|
-
// AA Intelligence Index (composite score)
|
|
230
|
-
intelligenceIndex: 24.3,
|
|
231
|
-
normalizedScore: 35,
|
|
232
|
-
|
|
199
|
+
"gemini-2.0-flash-experimental": {
|
|
233
200
|
// AA specific benchmarks
|
|
234
201
|
codingIndex: undefined,
|
|
235
202
|
mathIndex: undefined,
|
|
236
203
|
|
|
237
204
|
// Academic benchmarks
|
|
238
|
-
mmluPro: 0.
|
|
239
|
-
gpqa: 0.
|
|
240
|
-
hle: 0.
|
|
205
|
+
mmluPro: 0.782,
|
|
206
|
+
gpqa: 0.636,
|
|
207
|
+
hle: 0.047,
|
|
241
208
|
|
|
242
209
|
// Capabilities
|
|
243
210
|
contextWindow: 8192,
|
|
@@ -245,21 +212,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
245
212
|
supportsVision: false,
|
|
246
213
|
|
|
247
214
|
// Metadata
|
|
248
|
-
lastUpdated: "2026-
|
|
215
|
+
lastUpdated: "2026-06-01",
|
|
216
|
+
originalModel: "Gemini 2.0 Flash (experimental)",
|
|
249
217
|
},
|
|
250
|
-
"gemini-
|
|
251
|
-
// AA Intelligence Index (composite score)
|
|
252
|
-
intelligenceIndex: 30.3,
|
|
253
|
-
normalizedScore: 43,
|
|
254
|
-
|
|
218
|
+
"gemini-1.5-pro-sep-24": {
|
|
255
219
|
// AA specific benchmarks
|
|
256
|
-
codingIndex:
|
|
220
|
+
codingIndex: 23.6,
|
|
257
221
|
mathIndex: undefined,
|
|
258
222
|
|
|
259
223
|
// Academic benchmarks
|
|
260
|
-
mmluPro: 0.
|
|
261
|
-
gpqa: 0.
|
|
262
|
-
hle: 0.
|
|
224
|
+
mmluPro: 0.75,
|
|
225
|
+
gpqa: 0.589,
|
|
226
|
+
hle: 0.049,
|
|
263
227
|
|
|
264
228
|
// Capabilities
|
|
265
229
|
contextWindow: 8192,
|
|
@@ -267,21 +231,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
267
231
|
supportsVision: false,
|
|
268
232
|
|
|
269
233
|
// Metadata
|
|
270
|
-
lastUpdated: "2026-
|
|
234
|
+
lastUpdated: "2026-06-01",
|
|
235
|
+
originalModel: "Gemini 1.5 Pro (Sep '24)",
|
|
271
236
|
},
|
|
272
|
-
"gemini-
|
|
273
|
-
// AA Intelligence Index (composite score)
|
|
274
|
-
intelligenceIndex: 10.1,
|
|
275
|
-
normalizedScore: 14,
|
|
276
|
-
|
|
237
|
+
"gemini-2.0-flash-lite-preview": {
|
|
277
238
|
// AA specific benchmarks
|
|
278
|
-
codingIndex:
|
|
239
|
+
codingIndex: undefined,
|
|
279
240
|
mathIndex: undefined,
|
|
280
241
|
|
|
281
242
|
// Academic benchmarks
|
|
282
243
|
mmluPro: undefined,
|
|
283
|
-
gpqa:
|
|
284
|
-
hle:
|
|
244
|
+
gpqa: 0.542,
|
|
245
|
+
hle: 0.044,
|
|
285
246
|
|
|
286
247
|
// Capabilities
|
|
287
248
|
contextWindow: 8192,
|
|
@@ -289,21 +250,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
289
250
|
supportsVision: false,
|
|
290
251
|
|
|
291
252
|
// Metadata
|
|
292
|
-
lastUpdated: "2026-
|
|
253
|
+
lastUpdated: "2026-06-01",
|
|
254
|
+
originalModel: "Gemini 2.0 Flash-Lite (Preview)",
|
|
293
255
|
},
|
|
294
|
-
"gemini-2.
|
|
295
|
-
// AA Intelligence Index (composite score)
|
|
296
|
-
intelligenceIndex: 31.1,
|
|
297
|
-
normalizedScore: 44,
|
|
298
|
-
|
|
256
|
+
"gemini-2.0-flash-feb-25": {
|
|
299
257
|
// AA specific benchmarks
|
|
300
|
-
codingIndex:
|
|
301
|
-
mathIndex:
|
|
258
|
+
codingIndex: 13.6,
|
|
259
|
+
mathIndex: 21.7,
|
|
302
260
|
|
|
303
261
|
// Academic benchmarks
|
|
304
|
-
mmluPro: 0.
|
|
305
|
-
gpqa: 0.
|
|
306
|
-
hle: 0.
|
|
262
|
+
mmluPro: 0.779,
|
|
263
|
+
gpqa: 0.623,
|
|
264
|
+
hle: 0.053,
|
|
307
265
|
|
|
308
266
|
// Capabilities
|
|
309
267
|
contextWindow: 8192,
|
|
@@ -311,21 +269,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
311
269
|
supportsVision: false,
|
|
312
270
|
|
|
313
271
|
// Metadata
|
|
314
|
-
lastUpdated: "2026-
|
|
272
|
+
lastUpdated: "2026-06-01",
|
|
273
|
+
originalModel: "Gemini 2.0 Flash (Feb '25)",
|
|
315
274
|
},
|
|
316
|
-
"
|
|
317
|
-
// AA Intelligence Index (composite score)
|
|
318
|
-
intelligenceIndex: 15.9,
|
|
319
|
-
normalizedScore: 23,
|
|
320
|
-
|
|
275
|
+
"gemini-1.5-flash-sep-24": {
|
|
321
276
|
// AA specific benchmarks
|
|
322
|
-
codingIndex:
|
|
277
|
+
codingIndex: undefined,
|
|
323
278
|
mathIndex: undefined,
|
|
324
279
|
|
|
325
280
|
// Academic benchmarks
|
|
326
|
-
mmluPro: 0.
|
|
327
|
-
gpqa: 0.
|
|
328
|
-
hle: 0.
|
|
281
|
+
mmluPro: 0.68,
|
|
282
|
+
gpqa: 0.463,
|
|
283
|
+
hle: 0.035,
|
|
329
284
|
|
|
330
285
|
// Capabilities
|
|
331
286
|
contextWindow: 8192,
|
|
@@ -333,21 +288,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
333
288
|
supportsVision: false,
|
|
334
289
|
|
|
335
290
|
// Metadata
|
|
336
|
-
lastUpdated: "2026-
|
|
291
|
+
lastUpdated: "2026-06-01",
|
|
292
|
+
originalModel: "Gemini 1.5 Flash (Sep '24)",
|
|
337
293
|
},
|
|
338
|
-
"
|
|
339
|
-
// AA Intelligence Index (composite score)
|
|
340
|
-
intelligenceIndex: 14.2,
|
|
341
|
-
normalizedScore: 20,
|
|
342
|
-
|
|
294
|
+
"gemini-1.5-flash-8b": {
|
|
343
295
|
// AA specific benchmarks
|
|
344
|
-
codingIndex:
|
|
296
|
+
codingIndex: undefined,
|
|
345
297
|
mathIndex: undefined,
|
|
346
298
|
|
|
347
299
|
// Academic benchmarks
|
|
348
|
-
mmluPro: 0.
|
|
349
|
-
gpqa: 0.
|
|
350
|
-
hle: 0.
|
|
300
|
+
mmluPro: 0.569,
|
|
301
|
+
gpqa: 0.359,
|
|
302
|
+
hle: 0.045,
|
|
351
303
|
|
|
352
304
|
// Capabilities
|
|
353
305
|
contextWindow: 8192,
|
|
@@ -355,21 +307,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
355
307
|
supportsVision: false,
|
|
356
308
|
|
|
357
309
|
// Metadata
|
|
358
|
-
lastUpdated: "2026-
|
|
310
|
+
lastUpdated: "2026-06-01",
|
|
311
|
+
originalModel: "Gemini 1.5 Flash-8B",
|
|
359
312
|
},
|
|
360
|
-
"
|
|
361
|
-
// AA Intelligence Index (composite score)
|
|
362
|
-
intelligenceIndex: 18,
|
|
363
|
-
normalizedScore: 26,
|
|
364
|
-
|
|
313
|
+
"gemma-3-1b-instruct": {
|
|
365
314
|
// AA specific benchmarks
|
|
366
|
-
codingIndex:
|
|
367
|
-
mathIndex:
|
|
315
|
+
codingIndex: 0.2,
|
|
316
|
+
mathIndex: 3.3,
|
|
368
317
|
|
|
369
318
|
// Academic benchmarks
|
|
370
|
-
mmluPro: 0.
|
|
371
|
-
gpqa: 0.
|
|
372
|
-
hle: 0.
|
|
319
|
+
mmluPro: 0.135,
|
|
320
|
+
gpqa: 0.237,
|
|
321
|
+
hle: 0.052,
|
|
373
322
|
|
|
374
323
|
// Capabilities
|
|
375
324
|
contextWindow: 8192,
|
|
@@ -377,21 +326,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
377
326
|
supportsVision: false,
|
|
378
327
|
|
|
379
328
|
// Metadata
|
|
380
|
-
lastUpdated: "2026-
|
|
329
|
+
lastUpdated: "2026-06-01",
|
|
330
|
+
originalModel: "Gemma 3 1B Instruct",
|
|
381
331
|
},
|
|
382
|
-
"
|
|
383
|
-
// AA Intelligence Index (composite score)
|
|
384
|
-
intelligenceIndex: 18.7,
|
|
385
|
-
normalizedScore: 27,
|
|
386
|
-
|
|
332
|
+
"gemini-2.5-flash-lite-non-reasoning": {
|
|
387
333
|
// AA specific benchmarks
|
|
388
|
-
codingIndex:
|
|
389
|
-
mathIndex:
|
|
334
|
+
codingIndex: 7.4,
|
|
335
|
+
mathIndex: 35.3,
|
|
390
336
|
|
|
391
337
|
// Academic benchmarks
|
|
392
|
-
mmluPro: 0.
|
|
393
|
-
gpqa: 0.
|
|
394
|
-
hle: 0.
|
|
338
|
+
mmluPro: 0.724,
|
|
339
|
+
gpqa: 0.474,
|
|
340
|
+
hle: 0.037,
|
|
395
341
|
|
|
396
342
|
// Capabilities
|
|
397
343
|
contextWindow: 8192,
|
|
@@ -399,21 +345,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
399
345
|
supportsVision: false,
|
|
400
346
|
|
|
401
347
|
// Metadata
|
|
402
|
-
lastUpdated: "2026-
|
|
348
|
+
lastUpdated: "2026-06-01",
|
|
349
|
+
originalModel: "Gemini 2.5 Flash-Lite (Non-reasoning)",
|
|
403
350
|
},
|
|
404
|
-
"
|
|
405
|
-
// AA Intelligence Index (composite score)
|
|
406
|
-
intelligenceIndex: 10.3,
|
|
407
|
-
normalizedScore: 15,
|
|
408
|
-
|
|
351
|
+
"gemini-3-pro-preview-high": {
|
|
409
352
|
// AA specific benchmarks
|
|
410
|
-
codingIndex:
|
|
411
|
-
mathIndex:
|
|
353
|
+
codingIndex: 46.5,
|
|
354
|
+
mathIndex: 95.7,
|
|
412
355
|
|
|
413
356
|
// Academic benchmarks
|
|
414
|
-
mmluPro: 0.
|
|
415
|
-
gpqa: 0.
|
|
416
|
-
hle: 0.
|
|
357
|
+
mmluPro: 0.898,
|
|
358
|
+
gpqa: 0.908,
|
|
359
|
+
hle: 0.372,
|
|
417
360
|
|
|
418
361
|
// Capabilities
|
|
419
362
|
contextWindow: 8192,
|
|
@@ -421,21 +364,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
421
364
|
supportsVision: false,
|
|
422
365
|
|
|
423
366
|
// Metadata
|
|
424
|
-
lastUpdated: "2026-
|
|
367
|
+
lastUpdated: "2026-06-01",
|
|
368
|
+
originalModel: "Gemini 3 Pro Preview (high)",
|
|
425
369
|
},
|
|
426
|
-
"
|
|
427
|
-
// AA Intelligence Index (composite score)
|
|
428
|
-
intelligenceIndex: 12.3,
|
|
429
|
-
normalizedScore: 18,
|
|
430
|
-
|
|
370
|
+
"gemma-3n-e2b-instruct": {
|
|
431
371
|
// AA specific benchmarks
|
|
432
|
-
codingIndex:
|
|
433
|
-
mathIndex:
|
|
372
|
+
codingIndex: 2.2,
|
|
373
|
+
mathIndex: 10.3,
|
|
434
374
|
|
|
435
375
|
// Academic benchmarks
|
|
436
|
-
mmluPro:
|
|
437
|
-
gpqa: 0.
|
|
438
|
-
hle: 0.
|
|
376
|
+
mmluPro: 0.378,
|
|
377
|
+
gpqa: 0.229,
|
|
378
|
+
hle: 0.04,
|
|
439
379
|
|
|
440
380
|
// Capabilities
|
|
441
381
|
contextWindow: 8192,
|
|
@@ -443,21 +383,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
443
383
|
supportsVision: false,
|
|
444
384
|
|
|
445
385
|
// Metadata
|
|
446
|
-
lastUpdated: "2026-
|
|
386
|
+
lastUpdated: "2026-06-01",
|
|
387
|
+
originalModel: "Gemma 3n E2B Instruct",
|
|
447
388
|
},
|
|
448
|
-
"
|
|
449
|
-
// AA Intelligence Index (composite score)
|
|
450
|
-
intelligenceIndex: 7.4,
|
|
451
|
-
normalizedScore: 11,
|
|
452
|
-
|
|
389
|
+
"gemma-3-12b-instruct": {
|
|
453
390
|
// AA specific benchmarks
|
|
454
|
-
codingIndex:
|
|
455
|
-
mathIndex:
|
|
391
|
+
codingIndex: 6.3,
|
|
392
|
+
mathIndex: 18.3,
|
|
456
393
|
|
|
457
394
|
// Academic benchmarks
|
|
458
|
-
mmluPro: 0.
|
|
459
|
-
gpqa: 0.
|
|
460
|
-
hle: 0.
|
|
395
|
+
mmluPro: 0.595,
|
|
396
|
+
gpqa: 0.349,
|
|
397
|
+
hle: 0.048,
|
|
461
398
|
|
|
462
399
|
// Capabilities
|
|
463
400
|
contextWindow: 8192,
|
|
@@ -465,21 +402,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
465
402
|
supportsVision: false,
|
|
466
403
|
|
|
467
404
|
// Metadata
|
|
468
|
-
lastUpdated: "2026-
|
|
405
|
+
lastUpdated: "2026-06-01",
|
|
406
|
+
originalModel: "Gemma 3 12B Instruct",
|
|
469
407
|
},
|
|
470
|
-
"
|
|
471
|
-
// AA Intelligence Index (composite score)
|
|
472
|
-
intelligenceIndex: 30.8,
|
|
473
|
-
normalizedScore: 44,
|
|
474
|
-
|
|
408
|
+
"gemma-3-27b-instruct": {
|
|
475
409
|
// AA specific benchmarks
|
|
476
|
-
codingIndex:
|
|
477
|
-
mathIndex:
|
|
410
|
+
codingIndex: 9.6,
|
|
411
|
+
mathIndex: 20.7,
|
|
478
412
|
|
|
479
413
|
// Academic benchmarks
|
|
480
|
-
mmluPro: 0.
|
|
481
|
-
gpqa: 0.
|
|
482
|
-
hle: 0.
|
|
414
|
+
mmluPro: 0.669,
|
|
415
|
+
gpqa: 0.428,
|
|
416
|
+
hle: 0.047,
|
|
483
417
|
|
|
484
418
|
// Capabilities
|
|
485
419
|
contextWindow: 8192,
|
|
@@ -487,21 +421,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
487
421
|
supportsVision: false,
|
|
488
422
|
|
|
489
423
|
// Metadata
|
|
490
|
-
lastUpdated: "2026-
|
|
424
|
+
lastUpdated: "2026-06-01",
|
|
425
|
+
originalModel: "Gemma 3 27B Instruct",
|
|
491
426
|
},
|
|
492
|
-
"
|
|
493
|
-
// AA Intelligence Index (composite score)
|
|
494
|
-
intelligenceIndex: 9.3,
|
|
495
|
-
normalizedScore: 13,
|
|
496
|
-
|
|
427
|
+
"gemini-2.5-flash-preview-sep-25-reasoning": {
|
|
497
428
|
// AA specific benchmarks
|
|
498
|
-
codingIndex:
|
|
499
|
-
mathIndex:
|
|
429
|
+
codingIndex: 24.6,
|
|
430
|
+
mathIndex: 78.3,
|
|
500
431
|
|
|
501
432
|
// Academic benchmarks
|
|
502
|
-
mmluPro: 0.
|
|
503
|
-
gpqa: 0.
|
|
504
|
-
hle: 0.
|
|
433
|
+
mmluPro: 0.842,
|
|
434
|
+
gpqa: 0.793,
|
|
435
|
+
hle: 0.127,
|
|
505
436
|
|
|
506
437
|
// Capabilities
|
|
507
438
|
contextWindow: 8192,
|
|
@@ -509,21 +440,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
509
440
|
supportsVision: false,
|
|
510
441
|
|
|
511
442
|
// Metadata
|
|
512
|
-
lastUpdated: "2026-
|
|
443
|
+
lastUpdated: "2026-06-01",
|
|
444
|
+
originalModel: "Gemini 2.5 Flash Preview (Sep '25) (Reasoning)",
|
|
513
445
|
},
|
|
514
|
-
"
|
|
515
|
-
// AA Intelligence Index (composite score)
|
|
516
|
-
intelligenceIndex: 34.7,
|
|
517
|
-
normalizedScore: 50,
|
|
518
|
-
|
|
446
|
+
"gemini-1.5-pro-may-24": {
|
|
519
447
|
// AA specific benchmarks
|
|
520
|
-
codingIndex:
|
|
521
|
-
mathIndex:
|
|
448
|
+
codingIndex: 19.8,
|
|
449
|
+
mathIndex: undefined,
|
|
522
450
|
|
|
523
451
|
// Academic benchmarks
|
|
524
|
-
mmluPro: 0.
|
|
525
|
-
gpqa: 0.
|
|
526
|
-
hle: 0.
|
|
452
|
+
mmluPro: 0.657,
|
|
453
|
+
gpqa: 0.371,
|
|
454
|
+
hle: 0.039,
|
|
527
455
|
|
|
528
456
|
// Capabilities
|
|
529
457
|
contextWindow: 8192,
|
|
@@ -531,21 +459,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
531
459
|
supportsVision: false,
|
|
532
460
|
|
|
533
461
|
// Metadata
|
|
534
|
-
lastUpdated: "2026-
|
|
462
|
+
lastUpdated: "2026-06-01",
|
|
463
|
+
originalModel: "Gemini 1.5 Pro (May '24)",
|
|
535
464
|
},
|
|
536
|
-
"
|
|
537
|
-
// AA Intelligence Index (composite score)
|
|
538
|
-
intelligenceIndex: 36,
|
|
539
|
-
normalizedScore: 51,
|
|
540
|
-
|
|
465
|
+
"gemma-3-4b-instruct": {
|
|
541
466
|
// AA specific benchmarks
|
|
542
|
-
codingIndex:
|
|
543
|
-
mathIndex:
|
|
467
|
+
codingIndex: 2.9,
|
|
468
|
+
mathIndex: 12.7,
|
|
544
469
|
|
|
545
470
|
// Academic benchmarks
|
|
546
|
-
mmluPro:
|
|
547
|
-
gpqa:
|
|
548
|
-
hle:
|
|
471
|
+
mmluPro: 0.417,
|
|
472
|
+
gpqa: 0.291,
|
|
473
|
+
hle: 0.052,
|
|
549
474
|
|
|
550
475
|
// Capabilities
|
|
551
476
|
contextWindow: 8192,
|
|
@@ -553,21 +478,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
553
478
|
supportsVision: false,
|
|
554
479
|
|
|
555
480
|
// Metadata
|
|
556
|
-
lastUpdated: "2026-
|
|
481
|
+
lastUpdated: "2026-06-01",
|
|
482
|
+
originalModel: "Gemma 3 4B Instruct",
|
|
557
483
|
},
|
|
558
|
-
"
|
|
559
|
-
// AA Intelligence Index (composite score)
|
|
560
|
-
intelligenceIndex: 42,
|
|
561
|
-
normalizedScore: 60,
|
|
562
|
-
|
|
484
|
+
"gemini-2.5-pro-preview-may-25": {
|
|
563
485
|
// AA specific benchmarks
|
|
564
|
-
codingIndex:
|
|
565
|
-
mathIndex:
|
|
486
|
+
codingIndex: undefined,
|
|
487
|
+
mathIndex: undefined,
|
|
566
488
|
|
|
567
489
|
// Academic benchmarks
|
|
568
|
-
mmluPro: 0.
|
|
569
|
-
gpqa: 0.
|
|
570
|
-
hle: 0.
|
|
490
|
+
mmluPro: 0.837,
|
|
491
|
+
gpqa: 0.822,
|
|
492
|
+
hle: 0.154,
|
|
571
493
|
|
|
572
494
|
// Capabilities
|
|
573
495
|
contextWindow: 8192,
|
|
@@ -575,21 +497,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
575
497
|
supportsVision: false,
|
|
576
498
|
|
|
577
499
|
// Metadata
|
|
578
|
-
lastUpdated: "2026-
|
|
500
|
+
lastUpdated: "2026-06-01",
|
|
501
|
+
originalModel: "Gemini 2.5 Pro Preview (May' 25)",
|
|
579
502
|
},
|
|
580
|
-
"
|
|
581
|
-
// AA Intelligence Index (composite score)
|
|
582
|
-
intelligenceIndex: 33,
|
|
583
|
-
normalizedScore: 47,
|
|
584
|
-
|
|
503
|
+
"gemini-2.0-flash-thinking-experimental-jan-25": {
|
|
585
504
|
// AA specific benchmarks
|
|
586
|
-
codingIndex:
|
|
587
|
-
mathIndex:
|
|
505
|
+
codingIndex: 24.1,
|
|
506
|
+
mathIndex: undefined,
|
|
588
507
|
|
|
589
508
|
// Academic benchmarks
|
|
590
|
-
mmluPro: 0.
|
|
591
|
-
gpqa: 0.
|
|
592
|
-
hle: 0.
|
|
509
|
+
mmluPro: 0.798,
|
|
510
|
+
gpqa: 0.701,
|
|
511
|
+
hle: 0.071,
|
|
593
512
|
|
|
594
513
|
// Capabilities
|
|
595
514
|
contextWindow: 8192,
|
|
@@ -597,21 +516,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
597
516
|
supportsVision: false,
|
|
598
517
|
|
|
599
518
|
// Metadata
|
|
600
|
-
lastUpdated: "2026-
|
|
519
|
+
lastUpdated: "2026-06-01",
|
|
520
|
+
originalModel: "Gemini 2.0 Flash Thinking Experimental (Jan '25)",
|
|
601
521
|
},
|
|
602
|
-
"
|
|
603
|
-
// AA Intelligence Index (composite score)
|
|
604
|
-
intelligenceIndex: 33,
|
|
605
|
-
normalizedScore: 47,
|
|
606
|
-
|
|
522
|
+
"gemma-3n-e4b-instruct-preview-may-25": {
|
|
607
523
|
// AA specific benchmarks
|
|
608
524
|
codingIndex: undefined,
|
|
609
|
-
mathIndex:
|
|
525
|
+
mathIndex: undefined,
|
|
610
526
|
|
|
611
527
|
// Academic benchmarks
|
|
612
|
-
mmluPro: 0.
|
|
613
|
-
gpqa: 0.
|
|
614
|
-
hle: 0.
|
|
528
|
+
mmluPro: 0.483,
|
|
529
|
+
gpqa: 0.278,
|
|
530
|
+
hle: 0.049,
|
|
615
531
|
|
|
616
532
|
// Capabilities
|
|
617
533
|
contextWindow: 8192,
|
|
@@ -619,21 +535,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
619
535
|
supportsVision: false,
|
|
620
536
|
|
|
621
537
|
// Metadata
|
|
622
|
-
lastUpdated: "2026-
|
|
538
|
+
lastUpdated: "2026-06-01",
|
|
539
|
+
originalModel: "Gemma 3n E4B Instruct Preview (May '25)",
|
|
623
540
|
},
|
|
624
|
-
"
|
|
625
|
-
// AA Intelligence Index (composite score)
|
|
626
|
-
intelligenceIndex: 38.7,
|
|
627
|
-
normalizedScore: 55,
|
|
628
|
-
|
|
541
|
+
"gemini-1.0-ultra": {
|
|
629
542
|
// AA specific benchmarks
|
|
630
|
-
codingIndex:
|
|
631
|
-
mathIndex:
|
|
543
|
+
codingIndex: 17.6,
|
|
544
|
+
mathIndex: undefined,
|
|
632
545
|
|
|
633
546
|
// Academic benchmarks
|
|
634
|
-
mmluPro:
|
|
635
|
-
gpqa:
|
|
636
|
-
hle:
|
|
547
|
+
mmluPro: undefined,
|
|
548
|
+
gpqa: undefined,
|
|
549
|
+
hle: undefined,
|
|
637
550
|
|
|
638
551
|
// Capabilities
|
|
639
552
|
contextWindow: 8192,
|
|
@@ -641,43 +554,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
641
554
|
supportsVision: false,
|
|
642
555
|
|
|
643
556
|
// Metadata
|
|
644
|
-
lastUpdated: "2026-
|
|
557
|
+
lastUpdated: "2026-06-01",
|
|
558
|
+
originalModel: "Gemini 1.0 Ultra",
|
|
645
559
|
},
|
|
646
|
-
"
|
|
647
|
-
// AA Intelligence Index (composite score)
|
|
648
|
-
intelligenceIndex: 43.1,
|
|
649
|
-
normalizedScore: 62,
|
|
650
|
-
|
|
560
|
+
"gemma-3n-e4b-instruct": {
|
|
651
561
|
// AA specific benchmarks
|
|
652
|
-
codingIndex:
|
|
653
|
-
mathIndex:
|
|
654
|
-
|
|
655
|
-
// Academic benchmarks
|
|
656
|
-
mmluPro: 0.889,
|
|
657
|
-
gpqa: 0.81,
|
|
658
|
-
hle: 0.129,
|
|
659
|
-
|
|
660
|
-
// Capabilities
|
|
661
|
-
contextWindow: 8192,
|
|
662
|
-
supportsReasoning: false,
|
|
663
|
-
supportsVision: false,
|
|
664
|
-
|
|
665
|
-
// Metadata
|
|
666
|
-
lastUpdated: "2026-04-06",
|
|
667
|
-
},
|
|
668
|
-
"claude-opus-4.5-reasoning": {
|
|
669
|
-
// AA Intelligence Index (composite score)
|
|
670
|
-
intelligenceIndex: 49.7,
|
|
671
|
-
normalizedScore: 71,
|
|
672
|
-
|
|
673
|
-
// AA specific benchmarks
|
|
674
|
-
codingIndex: 47.8,
|
|
675
|
-
mathIndex: 91.3,
|
|
562
|
+
codingIndex: 4.2,
|
|
563
|
+
mathIndex: 14.3,
|
|
676
564
|
|
|
677
565
|
// Academic benchmarks
|
|
678
|
-
mmluPro: 0.
|
|
679
|
-
gpqa: 0.
|
|
680
|
-
hle: 0.
|
|
566
|
+
mmluPro: 0.488,
|
|
567
|
+
gpqa: 0.296,
|
|
568
|
+
hle: 0.044,
|
|
681
569
|
|
|
682
570
|
// Capabilities
|
|
683
571
|
contextWindow: 8192,
|
|
@@ -685,21 +573,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
685
573
|
supportsVision: false,
|
|
686
574
|
|
|
687
575
|
// Metadata
|
|
688
|
-
lastUpdated: "2026-
|
|
576
|
+
lastUpdated: "2026-06-01",
|
|
577
|
+
originalModel: "Gemma 3n E4B Instruct",
|
|
689
578
|
},
|
|
690
|
-
"
|
|
691
|
-
// AA Intelligence Index (composite score)
|
|
692
|
-
intelligenceIndex: 39,
|
|
693
|
-
normalizedScore: 56,
|
|
694
|
-
|
|
579
|
+
"gemini-2.5-flash-lite-reasoning": {
|
|
695
580
|
// AA specific benchmarks
|
|
696
|
-
codingIndex:
|
|
697
|
-
mathIndex:
|
|
581
|
+
codingIndex: 9.5,
|
|
582
|
+
mathIndex: 53.3,
|
|
698
583
|
|
|
699
584
|
// Academic benchmarks
|
|
700
|
-
mmluPro: 0.
|
|
701
|
-
gpqa: 0.
|
|
702
|
-
hle: 0.
|
|
585
|
+
mmluPro: 0.759,
|
|
586
|
+
gpqa: 0.625,
|
|
587
|
+
hle: 0.064,
|
|
703
588
|
|
|
704
589
|
// Capabilities
|
|
705
590
|
contextWindow: 8192,
|
|
@@ -707,21 +592,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
707
592
|
supportsVision: false,
|
|
708
593
|
|
|
709
594
|
// Metadata
|
|
710
|
-
lastUpdated: "2026-
|
|
595
|
+
lastUpdated: "2026-06-01",
|
|
596
|
+
originalModel: "Gemini 2.5 Flash-Lite (Reasoning)",
|
|
711
597
|
},
|
|
712
|
-
"
|
|
713
|
-
// AA Intelligence Index (composite score)
|
|
714
|
-
intelligenceIndex: 37.1,
|
|
715
|
-
normalizedScore: 53,
|
|
716
|
-
|
|
598
|
+
"gemini-2.0-flash-thinking-experimental-dec-24": {
|
|
717
599
|
// AA specific benchmarks
|
|
718
|
-
codingIndex:
|
|
719
|
-
mathIndex:
|
|
600
|
+
codingIndex: undefined,
|
|
601
|
+
mathIndex: undefined,
|
|
720
602
|
|
|
721
603
|
// Academic benchmarks
|
|
722
|
-
mmluPro:
|
|
723
|
-
gpqa:
|
|
724
|
-
hle:
|
|
604
|
+
mmluPro: undefined,
|
|
605
|
+
gpqa: undefined,
|
|
606
|
+
hle: undefined,
|
|
725
607
|
|
|
726
608
|
// Capabilities
|
|
727
609
|
contextWindow: 8192,
|
|
@@ -729,21 +611,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
729
611
|
supportsVision: false,
|
|
730
612
|
|
|
731
613
|
// Metadata
|
|
732
|
-
lastUpdated: "2026-
|
|
614
|
+
lastUpdated: "2026-06-01",
|
|
615
|
+
originalModel: "Gemini 2.0 Flash Thinking Experimental (Dec '24)",
|
|
733
616
|
},
|
|
734
|
-
"
|
|
735
|
-
// AA Intelligence Index (composite score)
|
|
736
|
-
intelligenceIndex: 43,
|
|
737
|
-
normalizedScore: 61,
|
|
738
|
-
|
|
617
|
+
"gemini-2.5-flash-reasoning": {
|
|
739
618
|
// AA specific benchmarks
|
|
740
|
-
codingIndex:
|
|
741
|
-
mathIndex:
|
|
619
|
+
codingIndex: 22.2,
|
|
620
|
+
mathIndex: 73.3,
|
|
742
621
|
|
|
743
622
|
// Academic benchmarks
|
|
744
|
-
mmluPro: 0.
|
|
745
|
-
gpqa: 0.
|
|
746
|
-
hle: 0.
|
|
623
|
+
mmluPro: 0.832,
|
|
624
|
+
gpqa: 0.79,
|
|
625
|
+
hle: 0.111,
|
|
747
626
|
|
|
748
627
|
// Capabilities
|
|
749
628
|
contextWindow: 8192,
|
|
@@ -751,21 +630,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
751
630
|
supportsVision: false,
|
|
752
631
|
|
|
753
632
|
// Metadata
|
|
754
|
-
lastUpdated: "2026-
|
|
633
|
+
lastUpdated: "2026-06-01",
|
|
634
|
+
originalModel: "Gemini 2.5 Flash (Reasoning)",
|
|
755
635
|
},
|
|
756
|
-
"
|
|
757
|
-
// AA Intelligence Index (composite score)
|
|
758
|
-
intelligenceIndex: 9.1,
|
|
759
|
-
normalizedScore: 13,
|
|
760
|
-
|
|
636
|
+
"gemini-1.5-flash-may-24": {
|
|
761
637
|
// AA specific benchmarks
|
|
762
|
-
codingIndex:
|
|
638
|
+
codingIndex: undefined,
|
|
763
639
|
mathIndex: undefined,
|
|
764
640
|
|
|
765
641
|
// Academic benchmarks
|
|
766
|
-
mmluPro: 0.
|
|
767
|
-
gpqa: 0.
|
|
768
|
-
hle:
|
|
642
|
+
mmluPro: 0.574,
|
|
643
|
+
gpqa: 0.324,
|
|
644
|
+
hle: 0.042,
|
|
769
645
|
|
|
770
646
|
// Capabilities
|
|
771
647
|
contextWindow: 8192,
|
|
@@ -773,21 +649,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
773
649
|
supportsVision: false,
|
|
774
650
|
|
|
775
651
|
// Metadata
|
|
776
|
-
lastUpdated: "2026-
|
|
652
|
+
lastUpdated: "2026-06-01",
|
|
653
|
+
originalModel: "Gemini 1.5 Flash (May '24)",
|
|
777
654
|
},
|
|
778
|
-
"
|
|
779
|
-
// AA Intelligence Index (composite score)
|
|
780
|
-
intelligenceIndex: 15.1,
|
|
781
|
-
normalizedScore: 22,
|
|
782
|
-
|
|
655
|
+
"gemini-2.5-flash-preview-reasoning": {
|
|
783
656
|
// AA specific benchmarks
|
|
784
|
-
codingIndex:
|
|
785
|
-
mathIndex:
|
|
657
|
+
codingIndex: undefined,
|
|
658
|
+
mathIndex: undefined,
|
|
786
659
|
|
|
787
660
|
// Academic benchmarks
|
|
788
|
-
mmluPro: 0.
|
|
789
|
-
gpqa: 0.
|
|
790
|
-
hle: 0.
|
|
661
|
+
mmluPro: 0.8,
|
|
662
|
+
gpqa: 0.698,
|
|
663
|
+
hle: 0.116,
|
|
791
664
|
|
|
792
665
|
// Capabilities
|
|
793
666
|
contextWindow: 8192,
|
|
@@ -795,21 +668,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
795
668
|
supportsVision: false,
|
|
796
669
|
|
|
797
670
|
// Metadata
|
|
798
|
-
lastUpdated: "2026-
|
|
671
|
+
lastUpdated: "2026-06-01",
|
|
672
|
+
originalModel: "Gemini 2.5 Flash Preview (Reasoning)",
|
|
799
673
|
},
|
|
800
|
-
"
|
|
801
|
-
// AA Intelligence Index (composite score)
|
|
802
|
-
intelligenceIndex: 13,
|
|
803
|
-
normalizedScore: 19,
|
|
804
|
-
|
|
674
|
+
"gemini-2.5-flash-lite-preview-sep-25-reasoning": {
|
|
805
675
|
// AA specific benchmarks
|
|
806
|
-
codingIndex:
|
|
807
|
-
mathIndex:
|
|
676
|
+
codingIndex: 18.2,
|
|
677
|
+
mathIndex: 68.7,
|
|
808
678
|
|
|
809
679
|
// Academic benchmarks
|
|
810
|
-
mmluPro: 0.
|
|
811
|
-
gpqa: 0.
|
|
812
|
-
hle: 0.
|
|
680
|
+
mmluPro: 0.808,
|
|
681
|
+
gpqa: 0.709,
|
|
682
|
+
hle: 0.066,
|
|
813
683
|
|
|
814
684
|
// Capabilities
|
|
815
685
|
contextWindow: 8192,
|
|
@@ -817,21 +687,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
817
687
|
supportsVision: false,
|
|
818
688
|
|
|
819
689
|
// Metadata
|
|
820
|
-
lastUpdated: "2026-
|
|
690
|
+
lastUpdated: "2026-06-01",
|
|
691
|
+
originalModel: "Gemini 2.5 Flash-Lite Preview (Sep '25) (Reasoning)",
|
|
821
692
|
},
|
|
822
|
-
"
|
|
823
|
-
// AA Intelligence Index (composite score)
|
|
824
|
-
intelligenceIndex: 14,
|
|
825
|
-
normalizedScore: 20,
|
|
826
|
-
|
|
693
|
+
"gemini-2.5-flash-preview-sep-25-non-reasoning": {
|
|
827
694
|
// AA specific benchmarks
|
|
828
|
-
codingIndex:
|
|
829
|
-
mathIndex:
|
|
695
|
+
codingIndex: 22.1,
|
|
696
|
+
mathIndex: 56.7,
|
|
830
697
|
|
|
831
698
|
// Academic benchmarks
|
|
832
|
-
mmluPro: 0.
|
|
833
|
-
gpqa: 0.
|
|
834
|
-
hle: 0.
|
|
699
|
+
mmluPro: 0.836,
|
|
700
|
+
gpqa: 0.766,
|
|
701
|
+
hle: 0.078,
|
|
835
702
|
|
|
836
703
|
// Capabilities
|
|
837
704
|
contextWindow: 8192,
|
|
@@ -839,21 +706,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
839
706
|
supportsVision: false,
|
|
840
707
|
|
|
841
708
|
// Metadata
|
|
842
|
-
lastUpdated: "2026-
|
|
709
|
+
lastUpdated: "2026-06-01",
|
|
710
|
+
originalModel: "Gemini 2.5 Flash Preview (Sep '25) (Non-reasoning)",
|
|
843
711
|
},
|
|
844
|
-
"
|
|
845
|
-
// AA Intelligence Index (composite score)
|
|
846
|
-
intelligenceIndex: 12.7,
|
|
847
|
-
normalizedScore: 18,
|
|
848
|
-
|
|
712
|
+
"gemini-2.5-flash-preview-non-reasoning": {
|
|
849
713
|
// AA specific benchmarks
|
|
850
714
|
codingIndex: undefined,
|
|
851
|
-
mathIndex:
|
|
715
|
+
mathIndex: undefined,
|
|
852
716
|
|
|
853
717
|
// Academic benchmarks
|
|
854
|
-
mmluPro: 0.
|
|
855
|
-
gpqa: 0.
|
|
856
|
-
hle: 0.
|
|
718
|
+
mmluPro: 0.783,
|
|
719
|
+
gpqa: 0.594,
|
|
720
|
+
hle: 0.05,
|
|
857
721
|
|
|
858
722
|
// Capabilities
|
|
859
723
|
contextWindow: 8192,
|
|
@@ -861,21 +725,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
861
725
|
supportsVision: false,
|
|
862
726
|
|
|
863
727
|
// Metadata
|
|
864
|
-
lastUpdated: "2026-
|
|
728
|
+
lastUpdated: "2026-06-01",
|
|
729
|
+
originalModel: "Gemini 2.5 Flash Preview (Non-reasoning)",
|
|
865
730
|
},
|
|
866
|
-
"
|
|
867
|
-
// AA Intelligence Index (composite score)
|
|
868
|
-
intelligenceIndex: 10.2,
|
|
869
|
-
normalizedScore: 15,
|
|
870
|
-
|
|
731
|
+
"gemini-2.5-pro-preview-mar-25": {
|
|
871
732
|
// AA specific benchmarks
|
|
872
|
-
codingIndex:
|
|
733
|
+
codingIndex: 46.7,
|
|
873
734
|
mathIndex: undefined,
|
|
874
735
|
|
|
875
736
|
// Academic benchmarks
|
|
876
|
-
mmluPro: 0.
|
|
877
|
-
gpqa: 0.
|
|
878
|
-
hle: 0.
|
|
737
|
+
mmluPro: 0.858,
|
|
738
|
+
gpqa: 0.836,
|
|
739
|
+
hle: 0.171,
|
|
879
740
|
|
|
880
741
|
// Capabilities
|
|
881
742
|
contextWindow: 8192,
|
|
@@ -883,21 +744,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
883
744
|
supportsVision: false,
|
|
884
745
|
|
|
885
746
|
// Metadata
|
|
886
|
-
lastUpdated: "2026-
|
|
747
|
+
lastUpdated: "2026-06-01",
|
|
748
|
+
originalModel: "Gemini 2.5 Pro Preview (Mar' 25)",
|
|
887
749
|
},
|
|
888
|
-
"
|
|
889
|
-
// AA Intelligence Index (composite score)
|
|
890
|
-
intelligenceIndex: 9.8,
|
|
891
|
-
normalizedScore: 14,
|
|
892
|
-
|
|
750
|
+
"gemini-1.0-pro": {
|
|
893
751
|
// AA specific benchmarks
|
|
894
752
|
codingIndex: undefined,
|
|
895
753
|
mathIndex: undefined,
|
|
896
754
|
|
|
897
755
|
// Academic benchmarks
|
|
898
|
-
mmluPro: 0.
|
|
899
|
-
gpqa: 0.
|
|
900
|
-
hle: 0.
|
|
756
|
+
mmluPro: 0.431,
|
|
757
|
+
gpqa: 0.277,
|
|
758
|
+
hle: 0.046,
|
|
901
759
|
|
|
902
760
|
// Capabilities
|
|
903
761
|
contextWindow: 8192,
|
|
@@ -905,21 +763,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
905
763
|
supportsVision: false,
|
|
906
764
|
|
|
907
765
|
// Metadata
|
|
908
|
-
lastUpdated: "2026-
|
|
766
|
+
lastUpdated: "2026-06-01",
|
|
767
|
+
originalModel: "Gemini 1.0 Pro",
|
|
909
768
|
},
|
|
910
|
-
"
|
|
911
|
-
// AA Intelligence Index (composite score)
|
|
912
|
-
intelligenceIndex: 9,
|
|
913
|
-
normalizedScore: 13,
|
|
914
|
-
|
|
769
|
+
"gemini-2.5-flash-non-reasoning": {
|
|
915
770
|
// AA specific benchmarks
|
|
916
|
-
codingIndex:
|
|
917
|
-
mathIndex:
|
|
771
|
+
codingIndex: 17.8,
|
|
772
|
+
mathIndex: 60.3,
|
|
918
773
|
|
|
919
774
|
// Academic benchmarks
|
|
920
|
-
mmluPro: 0.
|
|
921
|
-
gpqa: 0.
|
|
922
|
-
hle: 0.
|
|
775
|
+
mmluPro: 0.809,
|
|
776
|
+
gpqa: 0.683,
|
|
777
|
+
hle: 0.051,
|
|
923
778
|
|
|
924
779
|
// Capabilities
|
|
925
780
|
contextWindow: 8192,
|
|
@@ -927,21 +782,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
927
782
|
supportsVision: false,
|
|
928
783
|
|
|
929
784
|
// Metadata
|
|
930
|
-
lastUpdated: "2026-
|
|
785
|
+
lastUpdated: "2026-06-01",
|
|
786
|
+
originalModel: "Gemini 2.5 Flash (Non-reasoning)",
|
|
931
787
|
},
|
|
932
|
-
"
|
|
933
|
-
// AA Intelligence Index (composite score)
|
|
934
|
-
intelligenceIndex: 9.9,
|
|
935
|
-
normalizedScore: 14,
|
|
936
|
-
|
|
788
|
+
"gemini-2.5-flash-lite-preview-sep-25-non-reasoning": {
|
|
937
789
|
// AA specific benchmarks
|
|
938
|
-
codingIndex:
|
|
939
|
-
mathIndex:
|
|
790
|
+
codingIndex: 14.5,
|
|
791
|
+
mathIndex: 46.7,
|
|
940
792
|
|
|
941
793
|
// Academic benchmarks
|
|
942
|
-
mmluPro: 0.
|
|
943
|
-
gpqa: 0.
|
|
944
|
-
hle: 0.
|
|
794
|
+
mmluPro: 0.796,
|
|
795
|
+
gpqa: 0.651,
|
|
796
|
+
hle: 0.046,
|
|
945
797
|
|
|
946
798
|
// Capabilities
|
|
947
799
|
contextWindow: 8192,
|
|
@@ -949,21 +801,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
949
801
|
supportsVision: false,
|
|
950
802
|
|
|
951
803
|
// Metadata
|
|
952
|
-
lastUpdated: "2026-
|
|
804
|
+
lastUpdated: "2026-06-01",
|
|
805
|
+
originalModel: "Gemini 2.5 Flash-Lite Preview (Sep '25) (Non-reasoning)",
|
|
953
806
|
},
|
|
954
|
-
"
|
|
955
|
-
// AA Intelligence Index (composite score)
|
|
956
|
-
intelligenceIndex: 7.7,
|
|
957
|
-
normalizedScore: 11,
|
|
958
|
-
|
|
807
|
+
"gemini-3-flash-preview-non-reasoning": {
|
|
959
808
|
// AA specific benchmarks
|
|
960
|
-
codingIndex:
|
|
961
|
-
mathIndex:
|
|
809
|
+
codingIndex: 37.8,
|
|
810
|
+
mathIndex: 55.7,
|
|
962
811
|
|
|
963
812
|
// Academic benchmarks
|
|
964
|
-
mmluPro: 0.
|
|
965
|
-
gpqa: 0.
|
|
966
|
-
hle: 0.
|
|
813
|
+
mmluPro: 0.882,
|
|
814
|
+
gpqa: 0.812,
|
|
815
|
+
hle: 0.141,
|
|
967
816
|
|
|
968
817
|
// Capabilities
|
|
969
818
|
contextWindow: 8192,
|
|
@@ -971,21 +820,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
971
820
|
supportsVision: false,
|
|
972
821
|
|
|
973
822
|
// Metadata
|
|
974
|
-
lastUpdated: "2026-
|
|
823
|
+
lastUpdated: "2026-06-01",
|
|
824
|
+
originalModel: "Gemini 3 Flash Preview (Non-reasoning)",
|
|
975
825
|
},
|
|
976
|
-
"
|
|
977
|
-
// AA Intelligence Index (composite score)
|
|
978
|
-
intelligenceIndex: 7.4,
|
|
979
|
-
normalizedScore: 11,
|
|
980
|
-
|
|
826
|
+
"gemini-2.0-flash-lite-feb-25": {
|
|
981
827
|
// AA specific benchmarks
|
|
982
828
|
codingIndex: undefined,
|
|
983
829
|
mathIndex: undefined,
|
|
984
830
|
|
|
985
831
|
// Academic benchmarks
|
|
986
|
-
mmluPro: 0.
|
|
987
|
-
gpqa: 0.
|
|
988
|
-
hle: 0.
|
|
832
|
+
mmluPro: 0.724,
|
|
833
|
+
gpqa: 0.535,
|
|
834
|
+
hle: 0.036,
|
|
989
835
|
|
|
990
836
|
// Capabilities
|
|
991
837
|
contextWindow: 8192,
|
|
@@ -993,21 +839,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
993
839
|
supportsVision: false,
|
|
994
840
|
|
|
995
841
|
// Metadata
|
|
996
|
-
lastUpdated: "2026-
|
|
842
|
+
lastUpdated: "2026-06-01",
|
|
843
|
+
originalModel: "Gemini 2.0 Flash-Lite (Feb '25)",
|
|
997
844
|
},
|
|
998
|
-
"
|
|
999
|
-
// AA Intelligence Index (composite score)
|
|
1000
|
-
intelligenceIndex: 14.5,
|
|
1001
|
-
normalizedScore: 21,
|
|
1002
|
-
|
|
845
|
+
"gemini-3-flash-preview-reasoning": {
|
|
1003
846
|
// AA specific benchmarks
|
|
1004
|
-
codingIndex:
|
|
1005
|
-
mathIndex:
|
|
847
|
+
codingIndex: 42.6,
|
|
848
|
+
mathIndex: 97,
|
|
1006
849
|
|
|
1007
850
|
// Academic benchmarks
|
|
1008
|
-
mmluPro: 0.
|
|
1009
|
-
gpqa: 0.
|
|
1010
|
-
hle: 0.
|
|
851
|
+
mmluPro: 0.89,
|
|
852
|
+
gpqa: 0.898,
|
|
853
|
+
hle: 0.347,
|
|
1011
854
|
|
|
1012
855
|
// Capabilities
|
|
1013
856
|
contextWindow: 8192,
|
|
@@ -1015,21 +858,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1015
858
|
supportsVision: false,
|
|
1016
859
|
|
|
1017
860
|
// Metadata
|
|
1018
|
-
lastUpdated: "2026-
|
|
861
|
+
lastUpdated: "2026-06-01",
|
|
862
|
+
originalModel: "Gemini 3 Flash Preview (Reasoning)",
|
|
1019
863
|
},
|
|
1020
|
-
"
|
|
1021
|
-
// AA Intelligence Index (composite score)
|
|
1022
|
-
intelligenceIndex: 18.8,
|
|
1023
|
-
normalizedScore: 27,
|
|
1024
|
-
|
|
864
|
+
"gemini-3-pro-preview-low": {
|
|
1025
865
|
// AA specific benchmarks
|
|
1026
|
-
codingIndex:
|
|
1027
|
-
mathIndex:
|
|
866
|
+
codingIndex: 39.4,
|
|
867
|
+
mathIndex: 86.7,
|
|
1028
868
|
|
|
1029
869
|
// Academic benchmarks
|
|
1030
|
-
mmluPro: 0.
|
|
1031
|
-
gpqa: 0.
|
|
1032
|
-
hle: 0.
|
|
870
|
+
mmluPro: 0.895,
|
|
871
|
+
gpqa: 0.887,
|
|
872
|
+
hle: 0.276,
|
|
1033
873
|
|
|
1034
874
|
// Capabilities
|
|
1035
875
|
contextWindow: 8192,
|
|
@@ -1037,21 +877,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1037
877
|
supportsVision: false,
|
|
1038
878
|
|
|
1039
879
|
// Metadata
|
|
1040
|
-
lastUpdated: "2026-
|
|
880
|
+
lastUpdated: "2026-06-01",
|
|
881
|
+
originalModel: "Gemini 3 Pro Preview (low)",
|
|
1041
882
|
},
|
|
1042
|
-
"
|
|
1043
|
-
// AA Intelligence Index (composite score)
|
|
1044
|
-
intelligenceIndex: 12.1,
|
|
1045
|
-
normalizedScore: 17,
|
|
1046
|
-
|
|
883
|
+
"palm-2": {
|
|
1047
884
|
// AA specific benchmarks
|
|
1048
|
-
codingIndex:
|
|
885
|
+
codingIndex: 4.6,
|
|
1049
886
|
mathIndex: undefined,
|
|
1050
887
|
|
|
1051
888
|
// Academic benchmarks
|
|
1052
|
-
mmluPro:
|
|
1053
|
-
gpqa:
|
|
1054
|
-
hle:
|
|
889
|
+
mmluPro: undefined,
|
|
890
|
+
gpqa: undefined,
|
|
891
|
+
hle: undefined,
|
|
1055
892
|
|
|
1056
893
|
// Capabilities
|
|
1057
894
|
contextWindow: 8192,
|
|
@@ -1059,21 +896,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1059
896
|
supportsVision: false,
|
|
1060
897
|
|
|
1061
898
|
// Metadata
|
|
1062
|
-
lastUpdated: "2026-
|
|
899
|
+
lastUpdated: "2026-06-01",
|
|
900
|
+
originalModel: "PALM-2",
|
|
1063
901
|
},
|
|
1064
|
-
"
|
|
1065
|
-
// AA Intelligence Index (composite score)
|
|
1066
|
-
intelligenceIndex: 15.1,
|
|
1067
|
-
normalizedScore: 22,
|
|
1068
|
-
|
|
902
|
+
"claude-3.5-sonnet-oct-24": {
|
|
1069
903
|
// AA specific benchmarks
|
|
1070
|
-
codingIndex:
|
|
1071
|
-
mathIndex:
|
|
904
|
+
codingIndex: 30.2,
|
|
905
|
+
mathIndex: undefined,
|
|
1072
906
|
|
|
1073
907
|
// Academic benchmarks
|
|
1074
|
-
mmluPro: 0.
|
|
1075
|
-
gpqa: 0.
|
|
1076
|
-
hle: 0.
|
|
908
|
+
mmluPro: 0.772,
|
|
909
|
+
gpqa: 0.599,
|
|
910
|
+
hle: 0.039,
|
|
1077
911
|
|
|
1078
912
|
// Capabilities
|
|
1079
913
|
contextWindow: 8192,
|
|
@@ -1081,21 +915,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1081
915
|
supportsVision: false,
|
|
1082
916
|
|
|
1083
917
|
// Metadata
|
|
1084
|
-
lastUpdated: "2026-
|
|
918
|
+
lastUpdated: "2026-06-01",
|
|
919
|
+
originalModel: "Claude 3.5 Sonnet (Oct '24)",
|
|
1085
920
|
},
|
|
1086
|
-
"
|
|
1087
|
-
// AA Intelligence Index (composite score)
|
|
1088
|
-
intelligenceIndex: 18.8,
|
|
1089
|
-
normalizedScore: 27,
|
|
1090
|
-
|
|
921
|
+
"claude-3.5-sonnet-june-24": {
|
|
1091
922
|
// AA specific benchmarks
|
|
1092
|
-
codingIndex:
|
|
1093
|
-
mathIndex:
|
|
923
|
+
codingIndex: 26,
|
|
924
|
+
mathIndex: undefined,
|
|
1094
925
|
|
|
1095
926
|
// Academic benchmarks
|
|
1096
|
-
mmluPro: 0.
|
|
1097
|
-
gpqa: 0.
|
|
1098
|
-
hle: 0.
|
|
927
|
+
mmluPro: 0.751,
|
|
928
|
+
gpqa: 0.56,
|
|
929
|
+
hle: 0.037,
|
|
1099
930
|
|
|
1100
931
|
// Capabilities
|
|
1101
932
|
contextWindow: 8192,
|
|
@@ -1103,21 +934,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1103
934
|
supportsVision: false,
|
|
1104
935
|
|
|
1105
936
|
// Metadata
|
|
1106
|
-
lastUpdated: "2026-
|
|
937
|
+
lastUpdated: "2026-06-01",
|
|
938
|
+
originalModel: "Claude 3.5 Sonnet (June '24)",
|
|
1107
939
|
},
|
|
1108
|
-
"
|
|
1109
|
-
// AA Intelligence Index (composite score)
|
|
1110
|
-
intelligenceIndex: 18.7,
|
|
1111
|
-
normalizedScore: 27,
|
|
1112
|
-
|
|
940
|
+
"claude-3-opus": {
|
|
1113
941
|
// AA specific benchmarks
|
|
1114
|
-
codingIndex:
|
|
1115
|
-
mathIndex:
|
|
942
|
+
codingIndex: 19.5,
|
|
943
|
+
mathIndex: undefined,
|
|
1116
944
|
|
|
1117
945
|
// Academic benchmarks
|
|
1118
|
-
mmluPro: 0.
|
|
1119
|
-
gpqa: 0.
|
|
1120
|
-
hle: 0.
|
|
946
|
+
mmluPro: 0.696,
|
|
947
|
+
gpqa: 0.489,
|
|
948
|
+
hle: 0.031,
|
|
1121
949
|
|
|
1122
950
|
// Capabilities
|
|
1123
951
|
contextWindow: 8192,
|
|
@@ -1125,21 +953,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1125
953
|
supportsVision: false,
|
|
1126
954
|
|
|
1127
955
|
// Metadata
|
|
1128
|
-
lastUpdated: "2026-
|
|
956
|
+
lastUpdated: "2026-06-01",
|
|
957
|
+
originalModel: "Claude 3 Opus",
|
|
1129
958
|
},
|
|
1130
|
-
"
|
|
1131
|
-
// AA Intelligence Index (composite score)
|
|
1132
|
-
intelligenceIndex: 16.8,
|
|
1133
|
-
normalizedScore: 24,
|
|
1134
|
-
|
|
959
|
+
"claude-3.5-haiku": {
|
|
1135
960
|
// AA specific benchmarks
|
|
1136
|
-
codingIndex:
|
|
1137
|
-
mathIndex:
|
|
961
|
+
codingIndex: 10.7,
|
|
962
|
+
mathIndex: undefined,
|
|
1138
963
|
|
|
1139
964
|
// Academic benchmarks
|
|
1140
|
-
mmluPro: 0.
|
|
1141
|
-
gpqa: 0.
|
|
1142
|
-
hle: 0.
|
|
965
|
+
mmluPro: 0.634,
|
|
966
|
+
gpqa: 0.408,
|
|
967
|
+
hle: 0.035,
|
|
1143
968
|
|
|
1144
969
|
// Capabilities
|
|
1145
970
|
contextWindow: 8192,
|
|
@@ -1147,21 +972,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1147
972
|
supportsVision: false,
|
|
1148
973
|
|
|
1149
974
|
// Metadata
|
|
1150
|
-
lastUpdated: "2026-
|
|
975
|
+
lastUpdated: "2026-06-01",
|
|
976
|
+
originalModel: "Claude 3.5 Haiku",
|
|
1151
977
|
},
|
|
1152
|
-
"
|
|
1153
|
-
// AA Intelligence Index (composite score)
|
|
1154
|
-
intelligenceIndex: 9,
|
|
1155
|
-
normalizedScore: 13,
|
|
1156
|
-
|
|
978
|
+
"claude-3-sonnet": {
|
|
1157
979
|
// AA specific benchmarks
|
|
1158
980
|
codingIndex: undefined,
|
|
1159
981
|
mathIndex: undefined,
|
|
1160
982
|
|
|
1161
983
|
// Academic benchmarks
|
|
1162
|
-
mmluPro: 0.
|
|
1163
|
-
gpqa: 0.
|
|
1164
|
-
hle: 0.
|
|
984
|
+
mmluPro: 0.579,
|
|
985
|
+
gpqa: 0.4,
|
|
986
|
+
hle: 0.038,
|
|
1165
987
|
|
|
1166
988
|
// Capabilities
|
|
1167
989
|
contextWindow: 8192,
|
|
@@ -1169,21 +991,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1169
991
|
supportsVision: false,
|
|
1170
992
|
|
|
1171
993
|
// Metadata
|
|
1172
|
-
lastUpdated: "2026-
|
|
994
|
+
lastUpdated: "2026-06-01",
|
|
995
|
+
originalModel: "Claude 3 Sonnet",
|
|
1173
996
|
},
|
|
1174
|
-
"
|
|
1175
|
-
// AA Intelligence Index (composite score)
|
|
1176
|
-
intelligenceIndex: 15.2,
|
|
1177
|
-
normalizedScore: 22,
|
|
1178
|
-
|
|
997
|
+
"claude-3-haiku": {
|
|
1179
998
|
// AA specific benchmarks
|
|
1180
|
-
codingIndex:
|
|
1181
|
-
mathIndex:
|
|
999
|
+
codingIndex: 6.7,
|
|
1000
|
+
mathIndex: undefined,
|
|
1182
1001
|
|
|
1183
1002
|
// Academic benchmarks
|
|
1184
|
-
mmluPro:
|
|
1185
|
-
gpqa: 0.
|
|
1186
|
-
hle: 0.
|
|
1003
|
+
mmluPro: undefined,
|
|
1004
|
+
gpqa: 0.374,
|
|
1005
|
+
hle: 0.039,
|
|
1187
1006
|
|
|
1188
1007
|
// Capabilities
|
|
1189
1008
|
contextWindow: 8192,
|
|
@@ -1191,21 +1010,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1191
1010
|
supportsVision: false,
|
|
1192
1011
|
|
|
1193
1012
|
// Metadata
|
|
1194
|
-
lastUpdated: "2026-
|
|
1013
|
+
lastUpdated: "2026-06-01",
|
|
1014
|
+
originalModel: "Claude 3 Haiku",
|
|
1195
1015
|
},
|
|
1196
|
-
"
|
|
1197
|
-
// AA Intelligence Index (composite score)
|
|
1198
|
-
intelligenceIndex: 18,
|
|
1199
|
-
normalizedScore: 26,
|
|
1200
|
-
|
|
1016
|
+
"claude-instant": {
|
|
1201
1017
|
// AA specific benchmarks
|
|
1202
|
-
codingIndex:
|
|
1018
|
+
codingIndex: 7.8,
|
|
1203
1019
|
mathIndex: undefined,
|
|
1204
1020
|
|
|
1205
1021
|
// Academic benchmarks
|
|
1206
|
-
mmluPro: 0.
|
|
1207
|
-
gpqa: 0.
|
|
1208
|
-
hle: 0.
|
|
1022
|
+
mmluPro: 0.434,
|
|
1023
|
+
gpqa: 0.33,
|
|
1024
|
+
hle: 0.038,
|
|
1209
1025
|
|
|
1210
1026
|
// Capabilities
|
|
1211
1027
|
contextWindow: 8192,
|
|
@@ -1213,21 +1029,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1213
1029
|
supportsVision: false,
|
|
1214
1030
|
|
|
1215
1031
|
// Metadata
|
|
1216
|
-
lastUpdated: "2026-
|
|
1032
|
+
lastUpdated: "2026-06-01",
|
|
1033
|
+
originalModel: "Claude Instant",
|
|
1217
1034
|
},
|
|
1218
|
-
"
|
|
1219
|
-
// AA Intelligence Index (composite score)
|
|
1220
|
-
intelligenceIndex: 17.2,
|
|
1221
|
-
normalizedScore: 25,
|
|
1222
|
-
|
|
1035
|
+
"claude-4.5-sonnet-reasoning": {
|
|
1223
1036
|
// AA specific benchmarks
|
|
1224
|
-
codingIndex:
|
|
1225
|
-
mathIndex:
|
|
1037
|
+
codingIndex: 38.6,
|
|
1038
|
+
mathIndex: 88,
|
|
1226
1039
|
|
|
1227
1040
|
// Academic benchmarks
|
|
1228
|
-
mmluPro: 0.
|
|
1229
|
-
gpqa: 0.
|
|
1230
|
-
hle: 0.
|
|
1041
|
+
mmluPro: 0.875,
|
|
1042
|
+
gpqa: 0.834,
|
|
1043
|
+
hle: 0.173,
|
|
1231
1044
|
|
|
1232
1045
|
// Capabilities
|
|
1233
1046
|
contextWindow: 8192,
|
|
@@ -1235,21 +1048,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1235
1048
|
supportsVision: false,
|
|
1236
1049
|
|
|
1237
1050
|
// Metadata
|
|
1238
|
-
lastUpdated: "2026-
|
|
1051
|
+
lastUpdated: "2026-06-01",
|
|
1052
|
+
originalModel: "Claude 4.5 Sonnet (Reasoning)",
|
|
1239
1053
|
},
|
|
1240
|
-
"
|
|
1241
|
-
// AA Intelligence Index (composite score)
|
|
1242
|
-
intelligenceIndex: 16.5,
|
|
1243
|
-
normalizedScore: 24,
|
|
1244
|
-
|
|
1054
|
+
"claude-4-sonnet-non-reasoning": {
|
|
1245
1055
|
// AA specific benchmarks
|
|
1246
|
-
codingIndex:
|
|
1247
|
-
mathIndex:
|
|
1056
|
+
codingIndex: 30.6,
|
|
1057
|
+
mathIndex: 38,
|
|
1248
1058
|
|
|
1249
1059
|
// Academic benchmarks
|
|
1250
|
-
mmluPro: 0.
|
|
1251
|
-
gpqa: 0.
|
|
1252
|
-
hle: 0.
|
|
1060
|
+
mmluPro: 0.837,
|
|
1061
|
+
gpqa: 0.683,
|
|
1062
|
+
hle: 0.04,
|
|
1253
1063
|
|
|
1254
1064
|
// Capabilities
|
|
1255
1065
|
contextWindow: 8192,
|
|
@@ -1257,21 +1067,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1257
1067
|
supportsVision: false,
|
|
1258
1068
|
|
|
1259
1069
|
// Metadata
|
|
1260
|
-
lastUpdated: "2026-
|
|
1070
|
+
lastUpdated: "2026-06-01",
|
|
1071
|
+
originalModel: "Claude 4 Sonnet (Non-reasoning)",
|
|
1261
1072
|
},
|
|
1262
|
-
"
|
|
1263
|
-
// AA Intelligence Index (composite score)
|
|
1264
|
-
intelligenceIndex: 15.8,
|
|
1265
|
-
normalizedScore: 23,
|
|
1266
|
-
|
|
1073
|
+
"claude-3.7-sonnet-non-reasoning": {
|
|
1267
1074
|
// AA specific benchmarks
|
|
1268
|
-
codingIndex:
|
|
1269
|
-
mathIndex:
|
|
1075
|
+
codingIndex: 26.7,
|
|
1076
|
+
mathIndex: 21,
|
|
1270
1077
|
|
|
1271
1078
|
// Academic benchmarks
|
|
1272
|
-
mmluPro: 0.
|
|
1273
|
-
gpqa: 0.
|
|
1274
|
-
hle: 0.
|
|
1079
|
+
mmluPro: 0.803,
|
|
1080
|
+
gpqa: 0.656,
|
|
1081
|
+
hle: 0.048,
|
|
1275
1082
|
|
|
1276
1083
|
// Capabilities
|
|
1277
1084
|
contextWindow: 8192,
|
|
@@ -1279,20 +1086,17 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1279
1086
|
supportsVision: false,
|
|
1280
1087
|
|
|
1281
1088
|
// Metadata
|
|
1282
|
-
lastUpdated: "2026-
|
|
1089
|
+
lastUpdated: "2026-06-01",
|
|
1090
|
+
originalModel: "Claude 3.7 Sonnet (Non-reasoning)",
|
|
1283
1091
|
},
|
|
1284
|
-
"
|
|
1285
|
-
// AA Intelligence Index (composite score)
|
|
1286
|
-
intelligenceIndex: 12.5,
|
|
1287
|
-
normalizedScore: 18,
|
|
1288
|
-
|
|
1092
|
+
"claude-2.0": {
|
|
1289
1093
|
// AA specific benchmarks
|
|
1290
|
-
codingIndex:
|
|
1094
|
+
codingIndex: 12.9,
|
|
1291
1095
|
mathIndex: undefined,
|
|
1292
1096
|
|
|
1293
1097
|
// Academic benchmarks
|
|
1294
|
-
mmluPro:
|
|
1295
|
-
gpqa:
|
|
1098
|
+
mmluPro: 0.486,
|
|
1099
|
+
gpqa: 0.344,
|
|
1296
1100
|
hle: undefined,
|
|
1297
1101
|
|
|
1298
1102
|
// Capabilities
|
|
@@ -1301,13 +1105,10 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1301
1105
|
supportsVision: false,
|
|
1302
1106
|
|
|
1303
1107
|
// Metadata
|
|
1304
|
-
lastUpdated: "2026-
|
|
1108
|
+
lastUpdated: "2026-06-01",
|
|
1109
|
+
originalModel: "Claude 2.0",
|
|
1305
1110
|
},
|
|
1306
|
-
"
|
|
1307
|
-
// AA Intelligence Index (composite score)
|
|
1308
|
-
intelligenceIndex: 10.6,
|
|
1309
|
-
normalizedScore: 15,
|
|
1310
|
-
|
|
1111
|
+
"claude-4.1-opus-non-reasoning": {
|
|
1311
1112
|
// AA specific benchmarks
|
|
1312
1113
|
codingIndex: undefined,
|
|
1313
1114
|
mathIndex: undefined,
|
|
@@ -1323,21 +1124,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1323
1124
|
supportsVision: false,
|
|
1324
1125
|
|
|
1325
1126
|
// Metadata
|
|
1326
|
-
lastUpdated: "2026-
|
|
1127
|
+
lastUpdated: "2026-06-01",
|
|
1128
|
+
originalModel: "Claude 4.1 Opus (Non-reasoning)",
|
|
1327
1129
|
},
|
|
1328
|
-
"
|
|
1329
|
-
// AA Intelligence Index (composite score)
|
|
1330
|
-
intelligenceIndex: 12.1,
|
|
1331
|
-
normalizedScore: 17,
|
|
1332
|
-
|
|
1130
|
+
"claude-4.1-opus-reasoning": {
|
|
1333
1131
|
// AA specific benchmarks
|
|
1334
|
-
codingIndex:
|
|
1335
|
-
mathIndex:
|
|
1132
|
+
codingIndex: 36.5,
|
|
1133
|
+
mathIndex: 80.3,
|
|
1336
1134
|
|
|
1337
1135
|
// Academic benchmarks
|
|
1338
|
-
mmluPro: 0.
|
|
1339
|
-
gpqa: 0.
|
|
1340
|
-
hle: 0.
|
|
1136
|
+
mmluPro: 0.88,
|
|
1137
|
+
gpqa: 0.809,
|
|
1138
|
+
hle: 0.119,
|
|
1341
1139
|
|
|
1342
1140
|
// Capabilities
|
|
1343
1141
|
contextWindow: 8192,
|
|
@@ -1345,21 +1143,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1345
1143
|
supportsVision: false,
|
|
1346
1144
|
|
|
1347
1145
|
// Metadata
|
|
1348
|
-
lastUpdated: "2026-
|
|
1146
|
+
lastUpdated: "2026-06-01",
|
|
1147
|
+
originalModel: "Claude 4.1 Opus (Reasoning)",
|
|
1349
1148
|
},
|
|
1350
|
-
"
|
|
1351
|
-
// AA Intelligence Index (composite score)
|
|
1352
|
-
intelligenceIndex: 8.4,
|
|
1353
|
-
normalizedScore: 12,
|
|
1354
|
-
|
|
1149
|
+
"claude-4.5-sonnet-non-reasoning": {
|
|
1355
1150
|
// AA specific benchmarks
|
|
1356
|
-
codingIndex:
|
|
1357
|
-
mathIndex:
|
|
1151
|
+
codingIndex: 33.5,
|
|
1152
|
+
mathIndex: 37,
|
|
1358
1153
|
|
|
1359
1154
|
// Academic benchmarks
|
|
1360
|
-
mmluPro:
|
|
1361
|
-
gpqa:
|
|
1362
|
-
hle:
|
|
1155
|
+
mmluPro: 0.86,
|
|
1156
|
+
gpqa: 0.727,
|
|
1157
|
+
hle: 0.071,
|
|
1363
1158
|
|
|
1364
1159
|
// Capabilities
|
|
1365
1160
|
contextWindow: 8192,
|
|
@@ -1367,21 +1162,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1367
1162
|
supportsVision: false,
|
|
1368
1163
|
|
|
1369
1164
|
// Metadata
|
|
1370
|
-
lastUpdated: "2026-
|
|
1165
|
+
lastUpdated: "2026-06-01",
|
|
1166
|
+
originalModel: "Claude 4.5 Sonnet (Non-reasoning)",
|
|
1371
1167
|
},
|
|
1372
|
-
"
|
|
1373
|
-
// AA Intelligence Index (composite score)
|
|
1374
|
-
intelligenceIndex: 9.1,
|
|
1375
|
-
normalizedScore: 13,
|
|
1376
|
-
|
|
1168
|
+
"claude-opus-4.6-adaptive-reasoning-max-effort": {
|
|
1377
1169
|
// AA specific benchmarks
|
|
1378
|
-
codingIndex:
|
|
1379
|
-
mathIndex:
|
|
1170
|
+
codingIndex: 48.1,
|
|
1171
|
+
mathIndex: undefined,
|
|
1380
1172
|
|
|
1381
1173
|
// Academic benchmarks
|
|
1382
|
-
mmluPro:
|
|
1383
|
-
gpqa: 0.
|
|
1384
|
-
hle: 0.
|
|
1174
|
+
mmluPro: undefined,
|
|
1175
|
+
gpqa: 0.896,
|
|
1176
|
+
hle: 0.367,
|
|
1385
1177
|
|
|
1386
1178
|
// Capabilities
|
|
1387
1179
|
contextWindow: 8192,
|
|
@@ -1389,21 +1181,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1389
1181
|
supportsVision: false,
|
|
1390
1182
|
|
|
1391
1183
|
// Metadata
|
|
1392
|
-
lastUpdated: "2026-
|
|
1184
|
+
lastUpdated: "2026-06-01",
|
|
1185
|
+
originalModel: "Claude Opus 4.6 (Adaptive Reasoning, Max Effort)",
|
|
1393
1186
|
},
|
|
1394
|
-
"
|
|
1395
|
-
// AA Intelligence Index (composite score)
|
|
1396
|
-
intelligenceIndex: 28.5,
|
|
1397
|
-
normalizedScore: 41,
|
|
1398
|
-
|
|
1187
|
+
"claude-opus-4.5-reasoning": {
|
|
1399
1188
|
// AA specific benchmarks
|
|
1400
|
-
codingIndex:
|
|
1401
|
-
mathIndex:
|
|
1189
|
+
codingIndex: 47.8,
|
|
1190
|
+
mathIndex: 91.3,
|
|
1402
1191
|
|
|
1403
1192
|
// Academic benchmarks
|
|
1404
|
-
mmluPro: 0.
|
|
1405
|
-
gpqa: 0.
|
|
1406
|
-
hle: 0.
|
|
1193
|
+
mmluPro: 0.895,
|
|
1194
|
+
gpqa: 0.866,
|
|
1195
|
+
hle: 0.284,
|
|
1407
1196
|
|
|
1408
1197
|
// Capabilities
|
|
1409
1198
|
contextWindow: 8192,
|
|
@@ -1411,21 +1200,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1411
1200
|
supportsVision: false,
|
|
1412
1201
|
|
|
1413
1202
|
// Metadata
|
|
1414
|
-
lastUpdated: "2026-
|
|
1203
|
+
lastUpdated: "2026-06-01",
|
|
1204
|
+
originalModel: "Claude Opus 4.5 (Reasoning)",
|
|
1415
1205
|
},
|
|
1416
|
-
"
|
|
1417
|
-
// AA Intelligence Index (composite score)
|
|
1418
|
-
intelligenceIndex: 32.9,
|
|
1419
|
-
normalizedScore: 47,
|
|
1420
|
-
|
|
1206
|
+
"claude-3.7-sonnet-reasoning": {
|
|
1421
1207
|
// AA specific benchmarks
|
|
1422
|
-
codingIndex:
|
|
1423
|
-
mathIndex:
|
|
1208
|
+
codingIndex: 27.6,
|
|
1209
|
+
mathIndex: 56.3,
|
|
1424
1210
|
|
|
1425
1211
|
// Academic benchmarks
|
|
1426
|
-
mmluPro: 0.
|
|
1427
|
-
gpqa: 0.
|
|
1428
|
-
hle: 0.
|
|
1212
|
+
mmluPro: 0.837,
|
|
1213
|
+
gpqa: 0.772,
|
|
1214
|
+
hle: 0.103,
|
|
1429
1215
|
|
|
1430
1216
|
// Capabilities
|
|
1431
1217
|
contextWindow: 8192,
|
|
@@ -1433,21 +1219,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1433
1219
|
supportsVision: false,
|
|
1434
1220
|
|
|
1435
1221
|
// Metadata
|
|
1436
|
-
lastUpdated: "2026-
|
|
1222
|
+
lastUpdated: "2026-06-01",
|
|
1223
|
+
originalModel: "Claude 3.7 Sonnet (Reasoning)",
|
|
1437
1224
|
},
|
|
1438
|
-
"
|
|
1439
|
-
// AA Intelligence Index (composite score)
|
|
1440
|
-
intelligenceIndex: 27.7,
|
|
1441
|
-
normalizedScore: 40,
|
|
1442
|
-
|
|
1225
|
+
"claude-opus-4.6-non-reasoning-high-effort": {
|
|
1443
1226
|
// AA specific benchmarks
|
|
1444
|
-
codingIndex:
|
|
1445
|
-
mathIndex:
|
|
1227
|
+
codingIndex: 47.6,
|
|
1228
|
+
mathIndex: undefined,
|
|
1446
1229
|
|
|
1447
1230
|
// Academic benchmarks
|
|
1448
|
-
mmluPro:
|
|
1449
|
-
gpqa: 0.
|
|
1450
|
-
hle: 0.
|
|
1231
|
+
mmluPro: undefined,
|
|
1232
|
+
gpqa: 0.84,
|
|
1233
|
+
hle: 0.186,
|
|
1451
1234
|
|
|
1452
1235
|
// Capabilities
|
|
1453
1236
|
contextWindow: 8192,
|
|
@@ -1455,21 +1238,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1455
1238
|
supportsVision: false,
|
|
1456
1239
|
|
|
1457
1240
|
// Metadata
|
|
1458
|
-
lastUpdated: "2026-
|
|
1241
|
+
lastUpdated: "2026-06-01",
|
|
1242
|
+
originalModel: "Claude Opus 4.6 (Non-reasoning, High Effort)",
|
|
1459
1243
|
},
|
|
1460
|
-
"
|
|
1461
|
-
// AA Intelligence Index (composite score)
|
|
1462
|
-
intelligenceIndex: 28.4,
|
|
1463
|
-
normalizedScore: 41,
|
|
1464
|
-
|
|
1244
|
+
"claude-4-opus-non-reasoning": {
|
|
1465
1245
|
// AA specific benchmarks
|
|
1466
|
-
codingIndex:
|
|
1467
|
-
mathIndex:
|
|
1246
|
+
codingIndex: undefined,
|
|
1247
|
+
mathIndex: 36.3,
|
|
1468
1248
|
|
|
1469
1249
|
// Academic benchmarks
|
|
1470
|
-
mmluPro: 0.
|
|
1471
|
-
gpqa: 0.
|
|
1472
|
-
hle: 0.
|
|
1250
|
+
mmluPro: 0.86,
|
|
1251
|
+
gpqa: 0.701,
|
|
1252
|
+
hle: 0.059,
|
|
1473
1253
|
|
|
1474
1254
|
// Capabilities
|
|
1475
1255
|
contextWindow: 8192,
|
|
@@ -1477,21 +1257,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1477
1257
|
supportsVision: false,
|
|
1478
1258
|
|
|
1479
1259
|
// Metadata
|
|
1480
|
-
lastUpdated: "2026-
|
|
1260
|
+
lastUpdated: "2026-06-01",
|
|
1261
|
+
originalModel: "Claude 4 Opus (Non-reasoning)",
|
|
1481
1262
|
},
|
|
1482
|
-
"
|
|
1483
|
-
// AA Intelligence Index (composite score)
|
|
1484
|
-
intelligenceIndex: 33.9,
|
|
1485
|
-
normalizedScore: 48,
|
|
1486
|
-
|
|
1263
|
+
"claude-opus-4.5-non-reasoning": {
|
|
1487
1264
|
// AA specific benchmarks
|
|
1488
|
-
codingIndex:
|
|
1489
|
-
mathIndex:
|
|
1265
|
+
codingIndex: 42.9,
|
|
1266
|
+
mathIndex: 62.7,
|
|
1490
1267
|
|
|
1491
1268
|
// Academic benchmarks
|
|
1492
|
-
mmluPro: 0.
|
|
1493
|
-
gpqa: 0.
|
|
1494
|
-
hle: 0.
|
|
1269
|
+
mmluPro: 0.889,
|
|
1270
|
+
gpqa: 0.81,
|
|
1271
|
+
hle: 0.129,
|
|
1495
1272
|
|
|
1496
1273
|
// Capabilities
|
|
1497
1274
|
contextWindow: 8192,
|
|
@@ -1499,21 +1276,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1499
1276
|
supportsVision: false,
|
|
1500
1277
|
|
|
1501
1278
|
// Metadata
|
|
1502
|
-
lastUpdated: "2026-
|
|
1279
|
+
lastUpdated: "2026-06-01",
|
|
1280
|
+
originalModel: "Claude Opus 4.5 (Non-reasoning)",
|
|
1503
1281
|
},
|
|
1504
|
-
"
|
|
1505
|
-
// AA Intelligence Index (composite score)
|
|
1506
|
-
intelligenceIndex: 22.3,
|
|
1507
|
-
normalizedScore: 32,
|
|
1508
|
-
|
|
1282
|
+
"claude-4-sonnet-reasoning": {
|
|
1509
1283
|
// AA specific benchmarks
|
|
1510
|
-
codingIndex:
|
|
1511
|
-
mathIndex:
|
|
1284
|
+
codingIndex: 34.1,
|
|
1285
|
+
mathIndex: 74.3,
|
|
1512
1286
|
|
|
1513
1287
|
// Academic benchmarks
|
|
1514
|
-
mmluPro: 0.
|
|
1515
|
-
gpqa: 0.
|
|
1516
|
-
hle: 0.
|
|
1288
|
+
mmluPro: 0.842,
|
|
1289
|
+
gpqa: 0.777,
|
|
1290
|
+
hle: 0.096,
|
|
1517
1291
|
|
|
1518
1292
|
// Capabilities
|
|
1519
1293
|
contextWindow: 8192,
|
|
@@ -1521,21 +1295,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1521
1295
|
supportsVision: false,
|
|
1522
1296
|
|
|
1523
1297
|
// Metadata
|
|
1524
|
-
lastUpdated: "2026-
|
|
1298
|
+
lastUpdated: "2026-06-01",
|
|
1299
|
+
originalModel: "Claude 4 Sonnet (Reasoning)",
|
|
1525
1300
|
},
|
|
1526
|
-
"
|
|
1527
|
-
// AA Intelligence Index (composite score)
|
|
1528
|
-
intelligenceIndex: 18.8,
|
|
1529
|
-
normalizedScore: 27,
|
|
1530
|
-
|
|
1301
|
+
"claude-4-opus-reasoning": {
|
|
1531
1302
|
// AA specific benchmarks
|
|
1532
|
-
codingIndex:
|
|
1533
|
-
mathIndex:
|
|
1303
|
+
codingIndex: 34,
|
|
1304
|
+
mathIndex: 73.3,
|
|
1534
1305
|
|
|
1535
1306
|
// Academic benchmarks
|
|
1536
|
-
mmluPro: 0.
|
|
1537
|
-
gpqa: 0.
|
|
1538
|
-
hle: 0.
|
|
1307
|
+
mmluPro: 0.873,
|
|
1308
|
+
gpqa: 0.796,
|
|
1309
|
+
hle: 0.117,
|
|
1539
1310
|
|
|
1540
1311
|
// Capabilities
|
|
1541
1312
|
contextWindow: 8192,
|
|
@@ -1543,21 +1314,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1543
1314
|
supportsVision: false,
|
|
1544
1315
|
|
|
1545
1316
|
// Metadata
|
|
1546
|
-
lastUpdated: "2026-
|
|
1317
|
+
lastUpdated: "2026-06-01",
|
|
1318
|
+
originalModel: "Claude 4 Opus (Reasoning)",
|
|
1547
1319
|
},
|
|
1548
|
-
"
|
|
1549
|
-
// AA Intelligence Index (composite score)
|
|
1550
|
-
intelligenceIndex: 28.1,
|
|
1551
|
-
normalizedScore: 40,
|
|
1552
|
-
|
|
1320
|
+
"claude-2.1": {
|
|
1553
1321
|
// AA specific benchmarks
|
|
1554
|
-
codingIndex:
|
|
1555
|
-
mathIndex:
|
|
1322
|
+
codingIndex: 14,
|
|
1323
|
+
mathIndex: undefined,
|
|
1556
1324
|
|
|
1557
1325
|
// Academic benchmarks
|
|
1558
|
-
mmluPro: 0.
|
|
1559
|
-
gpqa: 0.
|
|
1560
|
-
hle: 0.
|
|
1326
|
+
mmluPro: 0.495,
|
|
1327
|
+
gpqa: 0.319,
|
|
1328
|
+
hle: 0.042,
|
|
1561
1329
|
|
|
1562
1330
|
// Capabilities
|
|
1563
1331
|
contextWindow: 8192,
|
|
@@ -1565,21 +1333,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1565
1333
|
supportsVision: false,
|
|
1566
1334
|
|
|
1567
1335
|
// Metadata
|
|
1568
|
-
lastUpdated: "2026-
|
|
1336
|
+
lastUpdated: "2026-06-01",
|
|
1337
|
+
originalModel: "Claude 2.1",
|
|
1569
1338
|
},
|
|
1570
|
-
"
|
|
1571
|
-
// AA Intelligence Index (composite score)
|
|
1572
|
-
intelligenceIndex: 12.3,
|
|
1573
|
-
normalizedScore: 18,
|
|
1574
|
-
|
|
1339
|
+
"mistral-large-2-nov-24": {
|
|
1575
1340
|
// AA specific benchmarks
|
|
1576
|
-
codingIndex:
|
|
1577
|
-
mathIndex:
|
|
1341
|
+
codingIndex: 13.8,
|
|
1342
|
+
mathIndex: 14,
|
|
1578
1343
|
|
|
1579
1344
|
// Academic benchmarks
|
|
1580
|
-
mmluPro:
|
|
1581
|
-
gpqa:
|
|
1582
|
-
hle:
|
|
1345
|
+
mmluPro: 0.697,
|
|
1346
|
+
gpqa: 0.486,
|
|
1347
|
+
hle: 0.04,
|
|
1583
1348
|
|
|
1584
1349
|
// Capabilities
|
|
1585
1350
|
contextWindow: 8192,
|
|
@@ -1587,21 +1352,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1587
1352
|
supportsVision: false,
|
|
1588
1353
|
|
|
1589
1354
|
// Metadata
|
|
1590
|
-
lastUpdated: "2026-
|
|
1355
|
+
lastUpdated: "2026-06-01",
|
|
1356
|
+
originalModel: "Mistral Large 2 (Nov '24)",
|
|
1591
1357
|
},
|
|
1592
|
-
"
|
|
1593
|
-
// AA Intelligence Index (composite score)
|
|
1594
|
-
intelligenceIndex: 9.1,
|
|
1595
|
-
normalizedScore: 13,
|
|
1596
|
-
|
|
1358
|
+
"mistral-large-2-jul-24": {
|
|
1597
1359
|
// AA specific benchmarks
|
|
1598
1360
|
codingIndex: undefined,
|
|
1599
|
-
mathIndex:
|
|
1361
|
+
mathIndex: 0,
|
|
1600
1362
|
|
|
1601
1363
|
// Academic benchmarks
|
|
1602
|
-
mmluPro:
|
|
1603
|
-
gpqa:
|
|
1604
|
-
hle:
|
|
1364
|
+
mmluPro: 0.683,
|
|
1365
|
+
gpqa: 0.472,
|
|
1366
|
+
hle: 0.032,
|
|
1605
1367
|
|
|
1606
1368
|
// Capabilities
|
|
1607
1369
|
contextWindow: 8192,
|
|
@@ -1609,21 +1371,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1609
1371
|
supportsVision: false,
|
|
1610
1372
|
|
|
1611
1373
|
// Metadata
|
|
1612
|
-
lastUpdated: "2026-
|
|
1374
|
+
lastUpdated: "2026-06-01",
|
|
1375
|
+
originalModel: "Mistral Large 2 (Jul '24)",
|
|
1613
1376
|
},
|
|
1614
|
-
"
|
|
1615
|
-
// AA Intelligence Index (composite score)
|
|
1616
|
-
intelligenceIndex: 8.5,
|
|
1617
|
-
normalizedScore: 12,
|
|
1618
|
-
|
|
1377
|
+
"pixtral-large": {
|
|
1619
1378
|
// AA specific benchmarks
|
|
1620
1379
|
codingIndex: undefined,
|
|
1621
|
-
mathIndex:
|
|
1380
|
+
mathIndex: 2.3,
|
|
1622
1381
|
|
|
1623
1382
|
// Academic benchmarks
|
|
1624
|
-
mmluPro: 0.
|
|
1625
|
-
gpqa: 0.
|
|
1626
|
-
hle: 0.
|
|
1383
|
+
mmluPro: 0.701,
|
|
1384
|
+
gpqa: 0.505,
|
|
1385
|
+
hle: 0.036,
|
|
1627
1386
|
|
|
1628
1387
|
// Capabilities
|
|
1629
1388
|
contextWindow: 8192,
|
|
@@ -1631,21 +1390,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1631
1390
|
supportsVision: false,
|
|
1632
1391
|
|
|
1633
1392
|
// Metadata
|
|
1634
|
-
lastUpdated: "2026-
|
|
1393
|
+
lastUpdated: "2026-06-01",
|
|
1394
|
+
originalModel: "Pixtral Large",
|
|
1635
1395
|
},
|
|
1636
|
-
|
|
1637
|
-
// AA Intelligence Index (composite score)
|
|
1638
|
-
intelligenceIndex: 15.5,
|
|
1639
|
-
normalizedScore: 22,
|
|
1640
|
-
|
|
1396
|
+
"mistral-small-3": {
|
|
1641
1397
|
// AA specific benchmarks
|
|
1642
1398
|
codingIndex: undefined,
|
|
1643
|
-
mathIndex:
|
|
1399
|
+
mathIndex: 4.3,
|
|
1644
1400
|
|
|
1645
1401
|
// Academic benchmarks
|
|
1646
|
-
mmluPro: 0.
|
|
1647
|
-
gpqa: 0.
|
|
1648
|
-
hle: 0.
|
|
1402
|
+
mmluPro: 0.652,
|
|
1403
|
+
gpqa: 0.462,
|
|
1404
|
+
hle: 0.041,
|
|
1649
1405
|
|
|
1650
1406
|
// Capabilities
|
|
1651
1407
|
contextWindow: 8192,
|
|
@@ -1653,21 +1409,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1653
1409
|
supportsVision: false,
|
|
1654
1410
|
|
|
1655
1411
|
// Metadata
|
|
1656
|
-
lastUpdated: "2026-
|
|
1412
|
+
lastUpdated: "2026-06-01",
|
|
1413
|
+
originalModel: "Mistral Small 3",
|
|
1657
1414
|
},
|
|
1658
|
-
"
|
|
1659
|
-
// AA Intelligence Index (composite score)
|
|
1660
|
-
intelligenceIndex: 24.6,
|
|
1661
|
-
normalizedScore: 35,
|
|
1662
|
-
|
|
1415
|
+
"mistral-small-sep-24": {
|
|
1663
1416
|
// AA specific benchmarks
|
|
1664
1417
|
codingIndex: undefined,
|
|
1665
1418
|
mathIndex: undefined,
|
|
1666
1419
|
|
|
1667
1420
|
// Academic benchmarks
|
|
1668
|
-
mmluPro:
|
|
1669
|
-
gpqa:
|
|
1670
|
-
hle:
|
|
1421
|
+
mmluPro: 0.529,
|
|
1422
|
+
gpqa: 0.381,
|
|
1423
|
+
hle: 0.043,
|
|
1671
1424
|
|
|
1672
1425
|
// Capabilities
|
|
1673
1426
|
contextWindow: 8192,
|
|
@@ -1675,21 +1428,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1675
1428
|
supportsVision: false,
|
|
1676
1429
|
|
|
1677
1430
|
// Metadata
|
|
1678
|
-
lastUpdated: "2026-
|
|
1431
|
+
lastUpdated: "2026-06-01",
|
|
1432
|
+
originalModel: "Mistral Small (Sep '24)",
|
|
1679
1433
|
},
|
|
1680
|
-
"
|
|
1681
|
-
// AA Intelligence Index (composite score)
|
|
1682
|
-
intelligenceIndex: 15.2,
|
|
1683
|
-
normalizedScore: 22,
|
|
1684
|
-
|
|
1434
|
+
"mixtral-8x22b-instruct": {
|
|
1685
1435
|
// AA specific benchmarks
|
|
1686
1436
|
codingIndex: undefined,
|
|
1687
1437
|
mathIndex: undefined,
|
|
1688
1438
|
|
|
1689
1439
|
// Academic benchmarks
|
|
1690
|
-
mmluPro: 0.
|
|
1691
|
-
gpqa: 0.
|
|
1692
|
-
hle: 0.
|
|
1440
|
+
mmluPro: 0.537,
|
|
1441
|
+
gpqa: 0.332,
|
|
1442
|
+
hle: 0.041,
|
|
1693
1443
|
|
|
1694
1444
|
// Capabilities
|
|
1695
1445
|
contextWindow: 8192,
|
|
@@ -1697,21 +1447,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1697
1447
|
supportsVision: false,
|
|
1698
1448
|
|
|
1699
1449
|
// Metadata
|
|
1700
|
-
lastUpdated: "2026-
|
|
1450
|
+
lastUpdated: "2026-06-01",
|
|
1451
|
+
originalModel: "Mixtral 8x22B Instruct",
|
|
1701
1452
|
},
|
|
1702
|
-
"
|
|
1703
|
-
// AA Intelligence Index (composite score)
|
|
1704
|
-
intelligenceIndex: 17.9,
|
|
1705
|
-
normalizedScore: 26,
|
|
1706
|
-
|
|
1453
|
+
"mistral-small-feb-24": {
|
|
1707
1454
|
// AA specific benchmarks
|
|
1708
1455
|
codingIndex: undefined,
|
|
1709
1456
|
mathIndex: undefined,
|
|
1710
1457
|
|
|
1711
1458
|
// Academic benchmarks
|
|
1712
|
-
mmluPro:
|
|
1713
|
-
gpqa: 0.
|
|
1714
|
-
hle:
|
|
1459
|
+
mmluPro: 0.419,
|
|
1460
|
+
gpqa: 0.302,
|
|
1461
|
+
hle: 0.044,
|
|
1715
1462
|
|
|
1716
1463
|
// Capabilities
|
|
1717
1464
|
contextWindow: 8192,
|
|
@@ -1719,21 +1466,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1719
1466
|
supportsVision: false,
|
|
1720
1467
|
|
|
1721
1468
|
// Metadata
|
|
1722
|
-
lastUpdated: "2026-
|
|
1469
|
+
lastUpdated: "2026-06-01",
|
|
1470
|
+
originalModel: "Mistral Small (Feb '24)",
|
|
1723
1471
|
},
|
|
1724
|
-
"
|
|
1725
|
-
// AA Intelligence Index (composite score)
|
|
1726
|
-
intelligenceIndex: 13.3,
|
|
1727
|
-
normalizedScore: 19,
|
|
1728
|
-
|
|
1472
|
+
"mistral-large-feb-24": {
|
|
1729
1473
|
// AA specific benchmarks
|
|
1730
1474
|
codingIndex: undefined,
|
|
1731
1475
|
mathIndex: undefined,
|
|
1732
1476
|
|
|
1733
1477
|
// Academic benchmarks
|
|
1734
|
-
mmluPro: 0.
|
|
1735
|
-
gpqa: 0.
|
|
1736
|
-
hle: 0.
|
|
1478
|
+
mmluPro: 0.515,
|
|
1479
|
+
gpqa: 0.351,
|
|
1480
|
+
hle: 0.034,
|
|
1737
1481
|
|
|
1738
1482
|
// Capabilities
|
|
1739
1483
|
contextWindow: 8192,
|
|
@@ -1741,21 +1485,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1741
1485
|
supportsVision: false,
|
|
1742
1486
|
|
|
1743
1487
|
// Metadata
|
|
1744
|
-
lastUpdated: "2026-
|
|
1488
|
+
lastUpdated: "2026-06-01",
|
|
1489
|
+
originalModel: "Mistral Large (Feb '24)",
|
|
1745
1490
|
},
|
|
1746
|
-
"
|
|
1747
|
-
// AA Intelligence Index (composite score)
|
|
1748
|
-
intelligenceIndex: 35.1,
|
|
1749
|
-
normalizedScore: 50,
|
|
1750
|
-
|
|
1491
|
+
"mixtral-8x7b-instruct": {
|
|
1751
1492
|
// AA specific benchmarks
|
|
1752
|
-
codingIndex:
|
|
1753
|
-
mathIndex:
|
|
1493
|
+
codingIndex: undefined,
|
|
1494
|
+
mathIndex: undefined,
|
|
1754
1495
|
|
|
1755
1496
|
// Academic benchmarks
|
|
1756
|
-
mmluPro: 0.
|
|
1757
|
-
gpqa: 0.
|
|
1758
|
-
hle: 0.
|
|
1497
|
+
mmluPro: 0.387,
|
|
1498
|
+
gpqa: 0.292,
|
|
1499
|
+
hle: 0.045,
|
|
1759
1500
|
|
|
1760
1501
|
// Capabilities
|
|
1761
1502
|
contextWindow: 8192,
|
|
@@ -1763,21 +1504,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1763
1504
|
supportsVision: false,
|
|
1764
1505
|
|
|
1765
1506
|
// Metadata
|
|
1766
|
-
lastUpdated: "2026-
|
|
1507
|
+
lastUpdated: "2026-06-01",
|
|
1508
|
+
originalModel: "Mixtral 8x7B Instruct",
|
|
1767
1509
|
},
|
|
1768
|
-
"
|
|
1769
|
-
// AA Intelligence Index (composite score)
|
|
1770
|
-
intelligenceIndex: 21.6,
|
|
1771
|
-
normalizedScore: 31,
|
|
1772
|
-
|
|
1510
|
+
"mistral-7b-instruct": {
|
|
1773
1511
|
// AA specific benchmarks
|
|
1774
1512
|
codingIndex: undefined,
|
|
1775
1513
|
mathIndex: undefined,
|
|
1776
1514
|
|
|
1777
1515
|
// Academic benchmarks
|
|
1778
|
-
mmluPro:
|
|
1779
|
-
gpqa:
|
|
1780
|
-
hle:
|
|
1516
|
+
mmluPro: 0.245,
|
|
1517
|
+
gpqa: 0.177,
|
|
1518
|
+
hle: 0.043,
|
|
1781
1519
|
|
|
1782
1520
|
// Capabilities
|
|
1783
1521
|
contextWindow: 8192,
|
|
@@ -1785,21 +1523,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1785
1523
|
supportsVision: false,
|
|
1786
1524
|
|
|
1787
1525
|
// Metadata
|
|
1788
|
-
lastUpdated: "2026-
|
|
1526
|
+
lastUpdated: "2026-06-01",
|
|
1527
|
+
originalModel: "Mistral 7B Instruct",
|
|
1789
1528
|
},
|
|
1790
|
-
"
|
|
1791
|
-
// AA Intelligence Index (composite score)
|
|
1792
|
-
intelligenceIndex: 25.2,
|
|
1793
|
-
normalizedScore: 36,
|
|
1794
|
-
|
|
1529
|
+
"mistral-saba": {
|
|
1795
1530
|
// AA specific benchmarks
|
|
1796
|
-
codingIndex:
|
|
1797
|
-
mathIndex:
|
|
1531
|
+
codingIndex: undefined,
|
|
1532
|
+
mathIndex: undefined,
|
|
1798
1533
|
|
|
1799
1534
|
// Academic benchmarks
|
|
1800
|
-
mmluPro: 0.
|
|
1801
|
-
gpqa: 0.
|
|
1802
|
-
hle: 0.
|
|
1535
|
+
mmluPro: 0.611,
|
|
1536
|
+
gpqa: 0.424,
|
|
1537
|
+
hle: 0.041,
|
|
1803
1538
|
|
|
1804
1539
|
// Capabilities
|
|
1805
1540
|
contextWindow: 8192,
|
|
@@ -1807,21 +1542,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1807
1542
|
supportsVision: false,
|
|
1808
1543
|
|
|
1809
1544
|
// Metadata
|
|
1810
|
-
lastUpdated: "2026-
|
|
1545
|
+
lastUpdated: "2026-06-01",
|
|
1546
|
+
originalModel: "Mistral Saba",
|
|
1811
1547
|
},
|
|
1812
|
-
"
|
|
1813
|
-
// AA Intelligence Index (composite score)
|
|
1814
|
-
intelligenceIndex: 41.5,
|
|
1815
|
-
normalizedScore: 59,
|
|
1816
|
-
|
|
1548
|
+
"mistral-small-3.2": {
|
|
1817
1549
|
// AA specific benchmarks
|
|
1818
|
-
codingIndex:
|
|
1819
|
-
mathIndex:
|
|
1550
|
+
codingIndex: 13.3,
|
|
1551
|
+
mathIndex: 27,
|
|
1820
1552
|
|
|
1821
1553
|
// Academic benchmarks
|
|
1822
|
-
mmluPro: 0.
|
|
1823
|
-
gpqa: 0.
|
|
1824
|
-
hle: 0.
|
|
1554
|
+
mmluPro: 0.681,
|
|
1555
|
+
gpqa: 0.505,
|
|
1556
|
+
hle: 0.043,
|
|
1825
1557
|
|
|
1826
1558
|
// Capabilities
|
|
1827
1559
|
contextWindow: 8192,
|
|
@@ -1829,21 +1561,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1829
1561
|
supportsVision: false,
|
|
1830
1562
|
|
|
1831
1563
|
// Metadata
|
|
1832
|
-
lastUpdated: "2026-
|
|
1564
|
+
lastUpdated: "2026-06-01",
|
|
1565
|
+
originalModel: "Mistral Small 3.2",
|
|
1833
1566
|
},
|
|
1834
|
-
"
|
|
1835
|
-
// AA Intelligence Index (composite score)
|
|
1836
|
-
intelligenceIndex: 23.6,
|
|
1837
|
-
normalizedScore: 34,
|
|
1838
|
-
|
|
1567
|
+
"mistral-small-3.1": {
|
|
1839
1568
|
// AA specific benchmarks
|
|
1840
|
-
codingIndex:
|
|
1841
|
-
mathIndex:
|
|
1569
|
+
codingIndex: 13.9,
|
|
1570
|
+
mathIndex: 3.7,
|
|
1842
1571
|
|
|
1843
1572
|
// Academic benchmarks
|
|
1844
|
-
mmluPro: 0.
|
|
1845
|
-
gpqa: 0.
|
|
1846
|
-
hle: 0.
|
|
1573
|
+
mmluPro: 0.659,
|
|
1574
|
+
gpqa: 0.454,
|
|
1575
|
+
hle: 0.048,
|
|
1847
1576
|
|
|
1848
1577
|
// Capabilities
|
|
1849
1578
|
contextWindow: 8192,
|
|
@@ -1851,21 +1580,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1851
1580
|
supportsVision: false,
|
|
1852
1581
|
|
|
1853
1582
|
// Metadata
|
|
1854
|
-
lastUpdated: "2026-
|
|
1583
|
+
lastUpdated: "2026-06-01",
|
|
1584
|
+
originalModel: "Mistral Small 3.1",
|
|
1855
1585
|
},
|
|
1856
|
-
"
|
|
1857
|
-
// AA Intelligence Index (composite score)
|
|
1858
|
-
intelligenceIndex: 38.6,
|
|
1859
|
-
normalizedScore: 55,
|
|
1860
|
-
|
|
1586
|
+
"mistral-medium-3": {
|
|
1861
1587
|
// AA specific benchmarks
|
|
1862
|
-
codingIndex:
|
|
1863
|
-
mathIndex:
|
|
1588
|
+
codingIndex: 13.6,
|
|
1589
|
+
mathIndex: 30.3,
|
|
1864
1590
|
|
|
1865
1591
|
// Academic benchmarks
|
|
1866
|
-
mmluPro: 0.
|
|
1867
|
-
gpqa: 0.
|
|
1868
|
-
hle: 0.
|
|
1592
|
+
mmluPro: 0.76,
|
|
1593
|
+
gpqa: 0.578,
|
|
1594
|
+
hle: 0.043,
|
|
1869
1595
|
|
|
1870
1596
|
// Capabilities
|
|
1871
1597
|
contextWindow: 8192,
|
|
@@ -1873,21 +1599,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1873
1599
|
supportsVision: false,
|
|
1874
1600
|
|
|
1875
1601
|
// Metadata
|
|
1876
|
-
lastUpdated: "2026-
|
|
1602
|
+
lastUpdated: "2026-06-01",
|
|
1603
|
+
originalModel: "Mistral Medium 3",
|
|
1877
1604
|
},
|
|
1878
|
-
"
|
|
1879
|
-
// AA Intelligence Index (composite score)
|
|
1880
|
-
intelligenceIndex: 13.9,
|
|
1881
|
-
normalizedScore: 20,
|
|
1882
|
-
|
|
1605
|
+
"magistral-small-1": {
|
|
1883
1606
|
// AA specific benchmarks
|
|
1884
|
-
codingIndex:
|
|
1885
|
-
mathIndex:
|
|
1607
|
+
codingIndex: 11.1,
|
|
1608
|
+
mathIndex: 41.3,
|
|
1886
1609
|
|
|
1887
1610
|
// Academic benchmarks
|
|
1888
|
-
mmluPro: 0.
|
|
1889
|
-
gpqa: 0.
|
|
1890
|
-
hle: 0.
|
|
1611
|
+
mmluPro: 0.746,
|
|
1612
|
+
gpqa: 0.641,
|
|
1613
|
+
hle: 0.072,
|
|
1891
1614
|
|
|
1892
1615
|
// Capabilities
|
|
1893
1616
|
contextWindow: 8192,
|
|
@@ -1895,21 +1618,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1895
1618
|
supportsVision: false,
|
|
1896
1619
|
|
|
1897
1620
|
// Metadata
|
|
1898
|
-
lastUpdated: "2026-
|
|
1621
|
+
lastUpdated: "2026-06-01",
|
|
1622
|
+
originalModel: "Magistral Small 1",
|
|
1899
1623
|
},
|
|
1900
|
-
"
|
|
1901
|
-
// AA Intelligence Index (composite score)
|
|
1902
|
-
intelligenceIndex: 23.1,
|
|
1903
|
-
normalizedScore: 33,
|
|
1904
|
-
|
|
1624
|
+
"devstral-small-may-25": {
|
|
1905
1625
|
// AA specific benchmarks
|
|
1906
|
-
codingIndex:
|
|
1907
|
-
mathIndex:
|
|
1626
|
+
codingIndex: 12.2,
|
|
1627
|
+
mathIndex: undefined,
|
|
1908
1628
|
|
|
1909
1629
|
// Academic benchmarks
|
|
1910
|
-
mmluPro: 0.
|
|
1911
|
-
gpqa: 0.
|
|
1912
|
-
hle: 0.
|
|
1630
|
+
mmluPro: 0.632,
|
|
1631
|
+
gpqa: 0.434,
|
|
1632
|
+
hle: 0.04,
|
|
1913
1633
|
|
|
1914
1634
|
// Capabilities
|
|
1915
1635
|
contextWindow: 8192,
|
|
@@ -1917,21 +1637,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1917
1637
|
supportsVision: false,
|
|
1918
1638
|
|
|
1919
1639
|
// Metadata
|
|
1920
|
-
lastUpdated: "2026-
|
|
1640
|
+
lastUpdated: "2026-06-01",
|
|
1641
|
+
originalModel: "Devstral Small (May '25)",
|
|
1921
1642
|
},
|
|
1922
|
-
"
|
|
1923
|
-
// AA Intelligence Index (composite score)
|
|
1924
|
-
intelligenceIndex: 8.3,
|
|
1925
|
-
normalizedScore: 12,
|
|
1926
|
-
|
|
1643
|
+
"mistral-medium": {
|
|
1927
1644
|
// AA specific benchmarks
|
|
1928
1645
|
codingIndex: undefined,
|
|
1929
1646
|
mathIndex: undefined,
|
|
1930
1647
|
|
|
1931
1648
|
// Academic benchmarks
|
|
1932
|
-
mmluPro: 0.
|
|
1933
|
-
gpqa: 0.
|
|
1934
|
-
hle: 0.
|
|
1649
|
+
mmluPro: 0.491,
|
|
1650
|
+
gpqa: 0.349,
|
|
1651
|
+
hle: 0.034,
|
|
1935
1652
|
|
|
1936
1653
|
// Capabilities
|
|
1937
1654
|
contextWindow: 8192,
|
|
@@ -1939,21 +1656,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1939
1656
|
supportsVision: false,
|
|
1940
1657
|
|
|
1941
1658
|
// Metadata
|
|
1942
|
-
lastUpdated: "2026-
|
|
1659
|
+
lastUpdated: "2026-06-01",
|
|
1660
|
+
originalModel: "Mistral Medium",
|
|
1943
1661
|
},
|
|
1944
|
-
"
|
|
1945
|
-
// AA Intelligence Index (composite score)
|
|
1946
|
-
intelligenceIndex: 13.5,
|
|
1947
|
-
normalizedScore: 19,
|
|
1948
|
-
|
|
1662
|
+
"devstral-small-jul-25": {
|
|
1949
1663
|
// AA specific benchmarks
|
|
1950
|
-
codingIndex:
|
|
1951
|
-
mathIndex:
|
|
1664
|
+
codingIndex: 12.1,
|
|
1665
|
+
mathIndex: 29.3,
|
|
1952
1666
|
|
|
1953
1667
|
// Academic benchmarks
|
|
1954
|
-
mmluPro: 0.
|
|
1955
|
-
gpqa: 0.
|
|
1956
|
-
hle: 0.
|
|
1668
|
+
mmluPro: 0.622,
|
|
1669
|
+
gpqa: 0.414,
|
|
1670
|
+
hle: 0.037,
|
|
1957
1671
|
|
|
1958
1672
|
// Capabilities
|
|
1959
1673
|
contextWindow: 8192,
|
|
@@ -1961,21 +1675,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1961
1675
|
supportsVision: false,
|
|
1962
1676
|
|
|
1963
1677
|
// Metadata
|
|
1964
|
-
lastUpdated: "2026-
|
|
1678
|
+
lastUpdated: "2026-06-01",
|
|
1679
|
+
originalModel: "Devstral Small (Jul '25)",
|
|
1965
1680
|
},
|
|
1966
|
-
"
|
|
1967
|
-
// AA Intelligence Index (composite score)
|
|
1968
|
-
intelligenceIndex: 12.7,
|
|
1969
|
-
normalizedScore: 18,
|
|
1970
|
-
|
|
1681
|
+
"devstral-medium": {
|
|
1971
1682
|
// AA specific benchmarks
|
|
1972
|
-
codingIndex:
|
|
1973
|
-
mathIndex: 7,
|
|
1683
|
+
codingIndex: 15.9,
|
|
1684
|
+
mathIndex: 4.7,
|
|
1974
1685
|
|
|
1975
1686
|
// Academic benchmarks
|
|
1976
|
-
mmluPro: 0.
|
|
1977
|
-
gpqa: 0.
|
|
1978
|
-
hle: 0.
|
|
1687
|
+
mmluPro: 0.708,
|
|
1688
|
+
gpqa: 0.492,
|
|
1689
|
+
hle: 0.038,
|
|
1979
1690
|
|
|
1980
1691
|
// Capabilities
|
|
1981
1692
|
contextWindow: 8192,
|
|
@@ -1983,21 +1694,18 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
1983
1694
|
supportsVision: false,
|
|
1984
1695
|
|
|
1985
1696
|
// Metadata
|
|
1986
|
-
lastUpdated: "2026-
|
|
1697
|
+
lastUpdated: "2026-06-01",
|
|
1698
|
+
originalModel: "Devstral Medium",
|
|
1987
1699
|
},
|
|
1988
|
-
"
|
|
1989
|
-
// AA Intelligence Index (composite score)
|
|
1990
|
-
intelligenceIndex: 10.1,
|
|
1991
|
-
normalizedScore: 14,
|
|
1992
|
-
|
|
1700
|
+
"magistral-medium-1": {
|
|
1993
1701
|
// AA specific benchmarks
|
|
1994
|
-
codingIndex:
|
|
1995
|
-
mathIndex:
|
|
1702
|
+
codingIndex: 16,
|
|
1703
|
+
mathIndex: 40.3,
|
|
1996
1704
|
|
|
1997
1705
|
// Academic benchmarks
|
|
1998
|
-
mmluPro: 0.
|
|
1999
|
-
gpqa: 0.
|
|
2000
|
-
hle: 0.
|
|
1706
|
+
mmluPro: 0.753,
|
|
1707
|
+
gpqa: 0.679,
|
|
1708
|
+
hle: 0.095,
|
|
2001
1709
|
|
|
2002
1710
|
// Capabilities
|
|
2003
1711
|
contextWindow: 8192,
|
|
@@ -2005,6 +1713,7 @@ export const BENCHMARKS_CHUNK_3: Record<string, HardcodedBenchmark> = {
|
|
|
2005
1713
|
supportsVision: false,
|
|
2006
1714
|
|
|
2007
1715
|
// Metadata
|
|
2008
|
-
lastUpdated: "2026-
|
|
1716
|
+
lastUpdated: "2026-06-01",
|
|
1717
|
+
originalModel: "Magistral Medium 1",
|
|
2009
1718
|
},
|
|
2010
1719
|
};
|