free-coding-models 0.1.39 → 0.1.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -199,23 +199,23 @@ free-coding-models
199
199
 
200
200
  ## 🤖 Coding Models
201
201
 
202
- **44 coding models** across 8 tiers, ranked by [Aider Polyglot benchmark](https://aider.chat/docs/leaderboards) (225 coding exercises across C++/Go/Java/JS/Python/Rust). Models without a confirmed Aider score are estimated from model family, size, and published release benchmarks.
203
-
204
- | Tier | Score | Count | Models |
205
- |------|-------|-------|--------|
206
- | **S+** | 75%+ | 7 | DeepSeek V3.1/Terminus, DeepSeek V3.2, Kimi K2.5, Devstral 2, Nemotron Ultra 253B, Mistral Large 675B |
207
- | **S** | 62–74% | 7 | Qwen2.5 Coder 32B, GLM 5, Qwen3.5 400B VLM, Qwen3 Coder 480B, Qwen3 80B Thinking, Llama 3.1 405B, MiniMax M2.1 |
208
- | **A+** | 54–62% | 6 | Kimi K2 Thinking/Instruct, Qwen3 235B, Llama 3.3 70B, GLM 4.7, Qwen3 80B Instruct |
209
- | **A** | 44–54% | 5 | MiniMax M2, Mistral Medium 3, Magistral Small, Nemotron Nano 30B, R1 Distill 32B |
210
- | **A-** | 36–44% | 5 | GPT OSS 120B, Nemotron Super 49B, Llama 4 Scout, R1 Distill 14B, Colosseum 355B |
211
- | **B+** | 25–36% | 5 | QwQ 32B, GPT OSS 20B, Stockmark 100B, Seed OSS 36B, Step 3.5 Flash |
212
- | **B** | 14–25% | 5 | Llama 4 Maverick, Mixtral 8x22B, Ministral 14B, Granite 34B Code, R1 Distill 8B |
213
- | **C** | <14% | 4 | R1 Distill 7B, Gemma 2 9B, Phi 3.5 Mini, Phi 4 Mini |
202
+ **44 coding models** across 8 tiers, ranked by [SWE-bench Verified](https://www.swebench.com), the industry-standard benchmark measuring real GitHub issue resolution. Scores are self-reported by providers unless noted.
203
+
204
+ | Tier | SWE-bench | Models |
205
+ |------|-----------|--------|
206
+ | **S+** ≥70% | GLM 5 (77.8%), Kimi K2.5 (76.8%), Step 3.5 Flash (74.4%), MiniMax M2.1 (74.0%), GLM 4.7 (73.8%), DeepSeek V3.2 (73.1%), Devstral 2 (72.2%), Kimi K2 Thinking (71.3%), Qwen3 Coder 480B (70.6%), Qwen3 235B (70.0%) |
207
+ | **S** 60–70% | MiniMax M2 (69.4%), DeepSeek V3.1 Terminus (68.4%), Qwen3 80B Thinking (68.0%), Qwen3.5 400B (68.0%), Kimi K2 Instruct (65.8%), Qwen3 80B Instruct (65.0%), DeepSeek V3.1 (62.0%), Llama 4 Maverick (62.0%), GPT OSS 120B (60.0%) |
208
+ | **A+** 50–60% | Mistral Large 675B (58.0%), Nemotron Ultra 253B (56.0%), Colosseum 355B (52.0%), QwQ 32B (50.0%) |
209
+ | **A** 40–50% | Nemotron Super 49B (49.0%), Mistral Medium 3 (48.0%), Qwen2.5 Coder 32B (46.0%), Magistral Small (45.0%), Llama 4 Scout (44.0%), Llama 3.1 405B (44.0%), Nemotron Nano 30B (43.0%), R1 Distill 32B (43.9%), GPT OSS 20B (42.0%) |
210
+ | **A-** 35–40% | Llama 3.3 70B (39.5%), Seed OSS 36B (38.0%), R1 Distill 14B (37.7%), Stockmark 100B (36.0%) |
211
+ | **B+** 30–35% | Ministral 14B (34.0%), Mixtral 8x22B (32.0%), Granite 34B Code (30.0%) |
212
+ | **B** 20–30% | R1 Distill 8B (28.2%), R1 Distill 7B (22.6%) |
213
+ | **C** <20% | Gemma 2 9B (18.0%), Phi 4 Mini (14.0%), Phi 3.5 Mini (12.0%) |
214
214
 
215
215
  ### Tier scale
216
216
 
217
- - **S+/S** — Frontier coders, top Aider polyglot scores, best for complex refactors
218
- - **A+/A** — Excellent alternatives, strong at most coding tasks
217
+ - **S+/S** — Elite frontier coders (≥60% SWE-bench), best for complex real-world tasks and refactors
218
+ - **A+/A** — Great alternatives, strong at most coding tasks
219
219
  - **A-/B+** — Solid performers, good for targeted programming tasks
220
220
  - **B/C** — Lightweight or older models, good for code completion on constrained infra
221
221
 
@@ -421,7 +421,7 @@ function renderTable(results, pendingPings, frame, cursor = null, sortColumn = '
421
421
  // 📖 Column widths (generous spacing with margins)
422
422
  const W_RANK = 6
423
423
  const W_TIER = 6
424
- const W_CTW = 6
424
+ const W_CTX = 6
425
425
  const W_SOURCE = 14
426
426
  const W_MODEL = 26
427
427
  const W_SWE = 9
@@ -454,7 +454,7 @@ function renderTable(results, pendingPings, frame, cursor = null, sortColumn = '
454
454
  const originH = 'Origin'
455
455
  const modelH = 'Model'
456
456
  const sweH = sortColumn === 'swe' ? dir + ' SWE%' : 'SWE%'
457
- const ctwH = sortColumn === 'ctw' ? dir + ' CTW' : 'CTW'
457
+ const ctxH = sortColumn === 'ctx' ? dir + ' CTX' : 'CTX'
458
458
  const pingH = sortColumn === 'ping' ? dir + ' Latest Ping' : 'Latest Ping'
459
459
  const avgH = sortColumn === 'avg' ? dir + ' Avg Ping' : 'Avg Ping'
460
460
  const healthH = sortColumn === 'condition' ? dir + ' Health' : 'Health'
@@ -477,15 +477,15 @@ function renderTable(results, pendingPings, frame, cursor = null, sortColumn = '
477
477
  const originH_c = sortColumn === 'origin' ? chalk.bold.cyan(originH.padEnd(W_SOURCE)) : colorFirst(originH, W_SOURCE)
478
478
  const modelH_c = colorFirst(modelH, W_MODEL)
479
479
  const sweH_c = sortColumn === 'swe' ? chalk.bold.cyan(sweH.padEnd(W_SWE)) : colorFirst(sweH, W_SWE)
480
- const ctwH_c = sortColumn === 'ctw' ? chalk.bold.cyan(ctwH.padEnd(W_CTW)) : colorFirst(ctwH, W_CTW)
480
+ const ctxH_c = sortColumn === 'ctx' ? chalk.bold.cyan(ctxH.padEnd(W_CTX)) : colorFirst(ctxH, W_CTX)
481
481
  const pingH_c = sortColumn === 'ping' ? chalk.bold.cyan(pingH.padEnd(W_PING)) : colorFirst('Latest Ping', W_PING)
482
482
  const avgH_c = sortColumn === 'avg' ? chalk.bold.cyan(avgH.padEnd(W_AVG)) : colorFirst('Avg Ping', W_AVG)
483
483
  const healthH_c = sortColumn === 'condition' ? chalk.bold.cyan(healthH.padEnd(W_STATUS)) : colorFirst('Health', W_STATUS)
484
484
  const verdictH_c = sortColumn === 'verdict' ? chalk.bold.cyan(verdictH.padEnd(W_VERDICT)) : colorFirst(verdictH, W_VERDICT)
485
485
  const uptimeH_c = sortColumn === 'uptime' ? chalk.bold.cyan(uptimeH.padStart(W_UPTIME)) : colorFirst(uptimeH, W_UPTIME, chalk.green)
486
486
 
487
- // 📖 Header with proper spacing (column order: Rank, Tier, SWE%, CTW, Model, Origin, Latest Ping, Avg Ping, Health, Verdict, Up%)
488
- lines.push(' ' + rankH_c + ' ' + tierH_c + ' ' + sweH_c + ' ' + ctwH_c + ' ' + modelH_c + ' ' + originH_c + ' ' + pingH_c + ' ' + avgH_c + ' ' + healthH_c + ' ' + verdictH_c + ' ' + uptimeH_c)
487
+ // 📖 Header with proper spacing (column order: Rank, Tier, SWE%, CTX, Model, Origin, Latest Ping, Avg Ping, Health, Verdict, Up%)
488
+ lines.push(' ' + rankH_c + ' ' + tierH_c + ' ' + sweH_c + ' ' + ctxH_c + ' ' + modelH_c + ' ' + originH_c + ' ' + pingH_c + ' ' + avgH_c + ' ' + healthH_c + ' ' + verdictH_c + ' ' + uptimeH_c)
489
489
 
490
490
  // 📖 Separator line
491
491
  lines.push(
@@ -493,7 +493,7 @@ function renderTable(results, pendingPings, frame, cursor = null, sortColumn = '
493
493
  chalk.dim('─'.repeat(W_RANK)) + ' ' +
494
494
  chalk.dim('─'.repeat(W_TIER)) + ' ' +
495
495
  chalk.dim('─'.repeat(W_SWE)) + ' ' +
496
- chalk.dim('─'.repeat(W_CTW)) + ' ' +
496
+ chalk.dim('─'.repeat(W_CTX)) + ' ' +
497
497
  '─'.repeat(W_MODEL) + ' ' +
498
498
  '─'.repeat(W_SOURCE) + ' ' +
499
499
  chalk.dim('─'.repeat(W_PING)) + ' ' +
@@ -529,12 +529,12 @@ function renderTable(results, pendingPings, frame, cursor = null, sortColumn = '
529
529
  : chalk.dim(sweScore.padEnd(W_SWE))
530
530
 
531
531
  // 📖 Context window column - colorized by size (larger = better)
532
- const ctwRaw = r.ctw ?? '—'
533
- const ctwCell = ctwRaw !== '—' && (ctwRaw.includes('128k') || ctwRaw.includes('200k') || ctwRaw.includes('1m'))
534
- ? chalk.greenBright(ctwRaw.padEnd(W_CTW))
535
- : ctwRaw !== '—' && (ctwRaw.includes('32k') || ctwRaw.includes('64k'))
536
- ? chalk.cyan(ctwRaw.padEnd(W_CTW))
537
- : chalk.dim(ctwRaw.padEnd(W_CTW))
532
+ const ctxRaw = r.ctx ?? '—'
533
+ const ctxCell = ctxRaw !== '—' && (ctxRaw.includes('128k') || ctxRaw.includes('200k') || ctxRaw.includes('1m'))
534
+ ? chalk.greenBright(ctxRaw.padEnd(W_CTX))
535
+ : ctxRaw !== '—' && (ctxRaw.includes('32k') || ctxRaw.includes('64k'))
536
+ ? chalk.cyan(ctxRaw.padEnd(W_CTX))
537
+ : chalk.dim(ctxRaw.padEnd(W_CTX))
538
538
 
539
539
  // 📖 Latest ping - pings are objects: { ms, code }
540
540
  // 📖 Only show response time for successful pings, "—" for errors (error code is in Status column)
@@ -640,8 +640,8 @@ function renderTable(results, pendingPings, frame, cursor = null, sortColumn = '
640
640
  uptimeCell = chalk.red(uptimeStr.padStart(W_UPTIME))
641
641
  }
642
642
 
643
- // 📖 Build row with double space between columns (order: Rank, Tier, SWE%, CTW, Model, Origin, Latest Ping, Avg Ping, Health, Verdict, Up%)
644
- const row = ' ' + num + ' ' + tier + ' ' + sweCell + ' ' + ctwCell + ' ' + name + ' ' + source + ' ' + pingCell + ' ' + avgCell + ' ' + status + ' ' + speedCell + ' ' + uptimeCell
643
+ // 📖 Build row with double space between columns (order: Rank, Tier, SWE%, CTX, Model, Origin, Latest Ping, Avg Ping, Health, Verdict, Up%)
644
+ const row = ' ' + num + ' ' + tier + ' ' + sweCell + ' ' + ctxCell + ' ' + name + ' ' + source + ' ' + pingCell + ' ' + avgCell + ' ' + status + ' ' + speedCell + ' ' + uptimeCell
645
645
 
646
646
  if (isCursor) {
647
647
  lines.push(chalk.bgRgb(139, 0, 139)(row))
@@ -1105,8 +1105,8 @@ async function runFiableMode(apiKey) {
1105
1105
  console.log(chalk.cyan(' ⚡ Analyzing models for reliability (10 seconds)...'))
1106
1106
  console.log()
1107
1107
 
1108
- let results = MODELS.map(([modelId, label, tier, sweScore, ctw], i) => ({
1109
- idx: i + 1, modelId, label, tier, sweScore, ctw,
1108
+ let results = MODELS.map(([modelId, label, tier, sweScore, ctx], i) => ({
1109
+ idx: i + 1, modelId, label, tier, sweScore, ctx,
1110
1110
  status: 'pending',
1111
1111
  pings: [],
1112
1112
  httpCode: null,
@@ -1183,21 +1183,47 @@ async function main() {
1183
1183
  }
1184
1184
  }
1185
1185
 
1186
- // 📖 Skip update check during development to avoid blocking menus
1187
- // 📖 In production, this will work correctly when versions are published
1188
- const latestVersion = null // Skip update check for now
1186
+ // 📖 Check for updates in the background
1187
+ let latestVersion = null
1188
+ try {
1189
+ latestVersion = await checkForUpdate()
1190
+ } catch {
1191
+ // Silently fail - don't block the app if npm registry is unreachable
1192
+ }
1189
1193
 
1190
1194
  // 📖 Default mode: OpenCode CLI
1191
1195
  let mode = 'opencode'
1192
1196
 
1193
- // 📖 AUTO-UPDATE: Disabled during development
1194
- // 📖 Will be re-enabled when versions are properly published
1195
-
1196
- // 📖 This section is now handled by the update notification menu above
1197
+ // 📖 Show update notification menu if a new version is available
1198
+ if (latestVersion) {
1199
+ const action = await promptUpdateNotification(latestVersion)
1200
+ if (action === 'update') {
1201
+ runUpdate(latestVersion)
1202
+ return // runUpdate will restart the process
1203
+ } else if (action === 'changelogs') {
1204
+ console.log()
1205
+ console.log(chalk.cyan(' Opening changelog in browser...'))
1206
+ console.log()
1207
+ const { execSync } = require('child_process')
1208
+ const changelogUrl = 'https://github.com/vava-nessa/free-coding-models/releases'
1209
+ try {
1210
+ if (isMac) {
1211
+ execSync(`open "${changelogUrl}"`, { stdio: 'ignore' })
1212
+ } else if (isWindows) {
1213
+ execSync(`start "" "${changelogUrl}"`, { stdio: 'ignore' })
1214
+ } else {
1215
+ execSync(`xdg-open "${changelogUrl}"`, { stdio: 'ignore' })
1216
+ }
1217
+ } catch {
1218
+ console.log(chalk.dim(` Could not open browser. Visit: ${changelogUrl}`))
1219
+ }
1220
+ }
1221
+ // If action is null (Continue without update) or changelogs, proceed to main app
1222
+ }
1197
1223
 
1198
1224
  // 📖 Create results array with all models initially visible
1199
- let results = MODELS.map(([modelId, label, tier, sweScore, ctw], i) => ({
1200
- idx: i + 1, modelId, label, tier, sweScore, ctw,
1225
+ let results = MODELS.map(([modelId, label, tier, sweScore, ctx], i) => ({
1226
+ idx: i + 1, modelId, label, tier, sweScore, ctx,
1201
1227
  status: 'pending',
1202
1228
  pings: [], // 📖 All ping results (ms or 'TIMEOUT')
1203
1229
  httpCode: null,
@@ -1306,10 +1332,10 @@ async function main() {
1306
1332
  const onKeyPress = async (str, key) => {
1307
1333
  if (!key) return
1308
1334
 
1309
- // 📖 Sorting keys: R=rank, T=tier, O=origin, M=model, L=latest ping, A=avg ping, S=SWE-bench, C=context window, H=health, V=verdict, U=uptime
1335
+ // 📖 Sorting keys: R=rank, T=tier, O=origin, M=model, L=latest ping, A=avg ping, S=SWE-bench, N=context, H=health, V=verdict, U=uptime
1310
1336
  const sortKeys = {
1311
1337
  'r': 'rank', 't': 'tier', 'o': 'origin', 'm': 'model',
1312
- 'l': 'ping', 'a': 'avg', 's': 'swe', 'c': 'ctw', 'h': 'condition', 'v': 'verdict', 'u': 'uptime'
1338
+ 'l': 'ping', 'a': 'avg', 's': 'swe', 'n': 'ctx', 'h': 'condition', 'v': 'verdict', 'u': 'uptime'
1313
1339
  }
1314
1340
 
1315
1341
  if (sortKeys[key.name]) {
package/lib/utils.js CHANGED
@@ -136,7 +136,7 @@ export const getUptime = (r) => {
136
136
  // - 'ping' (L key) — last ping latency (only successful ones count)
137
137
  // - 'avg' (A key) — average latency across all successful pings
138
138
  // - 'swe' (S key) — SWE-bench score (higher is better)
139
- // - 'ctw' (C key) — context window size (larger is better)
139
+ // - 'ctx' (N key) — context window size (larger is better)
140
140
  // - 'condition' (H key) — health status (alphabetical)
141
141
  // - 'verdict' (V key) — verdict order (Perfect → Pending)
142
142
  // - 'uptime' (U key) — uptime percentage
@@ -185,12 +185,12 @@ export const sortResults = (results, sortColumn, sortDirection) => {
185
185
  cmp = parseSwe(a.sweScore) - parseSwe(b.sweScore)
186
186
  break
187
187
  }
188
- case 'ctw': {
188
+ case 'ctx': {
189
189
  // 📖 Sort by context window size — larger is better
190
190
  // 📖 Parse strings like "128k", "32k", "1m" into numeric tokens
191
- const parseCtw = (ctw) => {
192
- if (!ctw || ctw === '—') return 0
193
- const str = ctw.toLowerCase()
191
+ const parseCtx = (ctx) => {
192
+ if (!ctx || ctx === '—') return 0
193
+ const str = ctx.toLowerCase()
194
194
  // 📖 Handle millions (1m = 1000k)
195
195
  if (str.includes('m')) {
196
196
  const num = parseFloat(str.replace('m', ''))
@@ -203,7 +203,7 @@ export const sortResults = (results, sortColumn, sortDirection) => {
203
203
  }
204
204
  return 0
205
205
  }
206
- cmp = parseCtw(a.ctw) - parseCtw(b.ctw)
206
+ cmp = parseCtx(a.ctx) - parseCtx(b.ctx)
207
207
  break
208
208
  }
209
209
  case 'condition':
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "free-coding-models",
3
- "version": "0.1.39",
3
+ "version": "0.1.41",
4
4
  "description": "Find the fastest coding LLM models in seconds — ping free models from multiple providers, pick the best one for OpenCode, Cursor, or any AI coding assistant.",
5
5
  "keywords": [
6
6
  "nvidia",
package/sources.js CHANGED
@@ -4,13 +4,13 @@
4
4
  *
5
5
  * @details
6
6
  * This file contains all model definitions organized by provider/source.
7
- * Each source has its own models array with [model_id, display_label, tier, swe_score, ctw].
7
+ * Each source has its own models array with [model_id, display_label, tier, swe_score, ctx].
8
8
  * - model_id: The model identifier for API calls
9
9
  * - display_label: Human-friendly name for display
10
10
  * - tier: Performance tier (S+, S, A+, A, A-, B+, B, C)
11
- * - swe_score: SWE-bench Verified score percentage
12
- * - ctw: Context window size in tokens (e.g., "128k", "32k")
13
- *
11
+ * - swe_score: SWE-bench Verified score percentage (self-reported by model provider)
12
+ * - ctx: Context window size in tokens (e.g., "128k", "32k")
13
+ *
14
14
  * Add new sources here to support additional providers beyond NIM.
15
15
  *
16
16
  * 🎯 Tier scale (based on SWE-bench Verified):
@@ -18,12 +18,14 @@
18
18
  * - S: 60-70% (excellent)
19
19
  * - A+: 50-60% (great)
20
20
  * - A: 40-50% (good)
21
- * - A-: 35-45% (decent)
22
- * - B+: 30-40% (average)
21
+ * - A-: 35-40% (decent)
22
+ * - B+: 30-35% (average)
23
23
  * - B: 20-30% (below average)
24
24
  * - C: <20% (lightweight/edge)
25
25
  *
26
- * 📖 Source: https://www.swebench.com
26
+ * 📖 Source: https://www.swebench.com — scores are self-reported unless noted
27
+ * 📖 Secondary: https://swe-rebench.com (independent evals, scores are lower)
28
+ * 📖 Leaderboard tracker: https://www.marc0.dev/en/leaderboard
27
29
  *
28
30
  * @exports Object containing all sources and their models
29
31
  */
@@ -31,57 +33,57 @@
31
33
  // 📖 NIM source - https://build.nvidia.com
32
34
  export const nvidiaNim = [
33
35
  // ── S+ tier — SWE-bench Verified ≥70% ──
34
- ['deepseek-ai/deepseek-v3.1', 'DeepSeek V3.1', 'S+', '49.2%', '128k'],
35
- ['deepseek-ai/deepseek-v3.1-terminus', 'DeepSeek V3.1 Term', 'S+', '49.2%', '128k'],
36
36
  ['deepseek-ai/deepseek-v3.2', 'DeepSeek V3.2', 'S+', '73.1%', '128k'],
37
37
  ['moonshotai/kimi-k2.5', 'Kimi K2.5', 'S+', '76.8%', '128k'],
38
- ['mistralai/devstral-2-123b-instruct-2512', 'Devstral 2 123B', 'S+', '62.0%', '128k'],
39
- ['nvidia/llama-3.1-nemotron-ultra-253b-v1', 'Nemotron Ultra 253B', 'S+', '56.0%', '128k'],
40
- ['mistralai/mistral-large-3-675b-instruct-2512', 'Mistral Large 675B', 'S+', '58.0%', '128k'],
41
- // ── S tier — SWE-bench Verified 50–70% ──
42
- ['qwen/qwen2.5-coder-32b-instruct', 'Qwen2.5 Coder 32B', 'S', '46.0%', '32k'],
43
- ['z-ai/glm5', 'GLM 5', 'S', '77.8%', '128k'],
44
- ['qwen/qwen3.5-397b-a17b', 'Qwen3.5 400B VLM', 'S', '68.0%', '128k'],
45
- ['qwen/qwen3-coder-480b-a35b-instruct', 'Qwen3 Coder 480B', 'S', '72.0%', '128k'],
46
- ['qwen/qwen3-next-80b-a3b-thinking', 'Qwen3 80B Thinking', 'S', '68.0%', '128k'],
47
- ['meta/llama-3.1-405b-instruct', 'Llama 3.1 405B', 'S', '44.0%', '128k'],
48
- ['minimaxai/minimax-m2.1', 'MiniMax M2.1', 'S', '70.0%', '128k'],
49
- // ── A+ tier — SWE-bench Verified 60–70% ──
50
- ['moonshotai/kimi-k2-thinking', 'Kimi K2 Thinking', 'A+', '67.0%', '128k'],
51
- ['moonshotai/kimi-k2-instruct', 'Kimi K2 Instruct', 'A+', '65.8%', '128k'],
52
- ['qwen/qwen3-235b-a22b', 'Qwen3 235B', 'A+', '70.0%', '128k'],
53
- ['meta/llama-3.3-70b-instruct', 'Llama 3.3 70B', 'A+', '39.5%', '128k'],
54
- ['z-ai/glm4.7', 'GLM 4.7', 'A+', '73.8%', '128k'],
55
- ['qwen/qwen3-next-80b-a3b-instruct', 'Qwen3 80B Instruct', 'A+', '65.0%', '128k'],
56
- // ── A tier — SWE-bench Verified 45–60% ──
57
- ['minimaxai/minimax-m2', 'MiniMax M2', 'A', '56.5%', '128k'],
58
- ['mistralai/mistral-medium-3-instruct', 'Mistral Medium 3', 'A', '48.0%', '128k'],
59
- ['mistralai/magistral-small-2506', 'Magistral Small', 'A', '45.0%', '32k'],
60
- ['nvidia/nemotron-3-nano-30b-a3b', 'Nemotron Nano 30B', 'A', '43.0%', '128k'],
61
- ['deepseek-ai/deepseek-r1-distill-qwen-32b', 'R1 Distill 32B', 'A', '43.9%', '128k'],
62
- // ── A- tier — SWE-bench Verified 35–45% ──
63
- ['openai/gpt-oss-120b', 'GPT OSS 120B', 'A-', '60.0%', '128k'],
64
- ['nvidia/llama-3.3-nemotron-super-49b-v1.5', 'Nemotron Super 49B', 'A-', '49.0%', '128k'],
65
- ['meta/llama-4-scout-17b-16e-instruct', 'Llama 4 Scout', 'A-', '44.0%', '128k'],
38
+ ['z-ai/glm5', 'GLM 5', 'S+', '77.8%', '128k'],
39
+ ['z-ai/glm4.7', 'GLM 4.7', 'S+', '73.8%', '200k'],
40
+ ['moonshotai/kimi-k2-thinking', 'Kimi K2 Thinking', 'S+', '71.3%', '256k'],
41
+ ['minimaxai/minimax-m2.1', 'MiniMax M2.1', 'S+', '74.0%', '200k'],
42
+ ['stepfun-ai/step-3.5-flash', 'Step 3.5 Flash', 'S+', '74.4%', '256k'],
43
+ ['qwen/qwen3-coder-480b-a35b-instruct', 'Qwen3 Coder 480B', 'S+', '70.6%', '256k'],
44
+ ['qwen/qwen3-235b-a22b', 'Qwen3 235B', 'S+', '70.0%', '128k'],
45
+ ['mistralai/devstral-2-123b-instruct-2512', 'Devstral 2 123B', 'S+', '72.2%', '256k'],
46
+ // ── S tier — SWE-bench Verified 60–70% ──
47
+ ['deepseek-ai/deepseek-v3.1-terminus', 'DeepSeek V3.1 Term', 'S', '68.4%', '128k'],
48
+ ['moonshotai/kimi-k2-instruct', 'Kimi K2 Instruct', 'S', '65.8%', '128k'],
49
+ ['minimaxai/minimax-m2', 'MiniMax M2', 'S', '69.4%', '128k'],
50
+ ['qwen/qwen3-next-80b-a3b-thinking', 'Qwen3 80B Thinking', 'S', '68.0%', '128k'],
51
+ ['qwen/qwen3-next-80b-a3b-instruct', 'Qwen3 80B Instruct', 'S', '65.0%', '128k'],
52
+ ['qwen/qwen3.5-397b-a17b', 'Qwen3.5 400B VLM', 'S', '68.0%', '128k'],
53
+ ['openai/gpt-oss-120b', 'GPT OSS 120B', 'S', '60.0%', '128k'],
54
+ ['meta/llama-4-maverick-17b-128e-instruct', 'Llama 4 Maverick', 'S', '62.0%', '1M'],
55
+ ['deepseek-ai/deepseek-v3.1', 'DeepSeek V3.1', 'S', '62.0%', '128k'],
56
+ // ── A+ tier — SWE-bench Verified 50–60% ──
57
+ ['nvidia/llama-3.1-nemotron-ultra-253b-v1', 'Nemotron Ultra 253B', 'A+', '56.0%', '128k'],
58
+ ['mistralai/mistral-large-3-675b-instruct-2512', 'Mistral Large 675B', 'A+', '58.0%', '256k'],
59
+ ['qwen/qwq-32b', 'QwQ 32B', 'A+', '50.0%', '131k'],
60
+ ['igenius/colosseum_355b_instruct_16k', 'Colosseum 355B', 'A+', '52.0%', '16k'],
61
+ // ── A tier — SWE-bench Verified 40–50% ──
62
+ ['mistralai/mistral-medium-3-instruct', 'Mistral Medium 3', 'A', '48.0%', '128k'],
63
+ ['mistralai/magistral-small-2506', 'Magistral Small', 'A', '45.0%', '32k'],
64
+ ['nvidia/llama-3.3-nemotron-super-49b-v1.5', 'Nemotron Super 49B', 'A', '49.0%', '128k'],
65
+ ['meta/llama-4-scout-17b-16e-instruct', 'Llama 4 Scout', 'A', '44.0%', '10M'],
66
+ ['nvidia/nemotron-3-nano-30b-a3b', 'Nemotron Nano 30B', 'A', '43.0%', '128k'],
67
+ ['deepseek-ai/deepseek-r1-distill-qwen-32b', 'R1 Distill 32B', 'A', '43.9%', '128k'],
68
+ ['openai/gpt-oss-20b', 'GPT OSS 20B', 'A', '42.0%', '128k'],
69
+ ['qwen/qwen2.5-coder-32b-instruct', 'Qwen2.5 Coder 32B', 'A', '46.0%', '32k'],
70
+ ['meta/llama-3.1-405b-instruct', 'Llama 3.1 405B', 'A', '44.0%', '128k'],
71
+ // ── A- tier — SWE-bench Verified 35–40% ──
72
+ ['meta/llama-3.3-70b-instruct', 'Llama 3.3 70B', 'A-', '39.5%', '128k'],
66
73
  ['deepseek-ai/deepseek-r1-distill-qwen-14b', 'R1 Distill 14B', 'A-', '37.7%', '64k'],
67
- ['igenius/colosseum_355b_instruct_16k', 'Colosseum 355B', 'A-', '52.0%', '16k'],
68
- // ── B+ tier — SWE-bench Verified 30–40% ──
69
- ['qwen/qwq-32b', 'QwQ 32B', 'B+', '50.0%', '32k'],
70
- ['openai/gpt-oss-20b', 'GPT OSS 20B', 'B+', '42.0%', '32k'],
71
- ['stockmark/stockmark-2-100b-instruct', 'Stockmark 100B', 'B+', '36.0%', '32k'],
72
- ['bytedance/seed-oss-36b-instruct', 'Seed OSS 36B', 'B+', '38.0%', '32k'],
73
- ['stepfun-ai/step-3.5-flash', 'Step 3.5 Flash', 'B+', '74.4%', '32k'],
74
- // ── B tier — SWE-bench Verified 20–35% ──
75
- ['meta/llama-4-maverick-17b-128e-instruct', 'Llama 4 Maverick', 'B', '62.0%', '128k'],
76
- ['mistralai/mixtral-8x22b-instruct-v0.1', 'Mixtral 8x22B', 'B', '32.0%', '64k'],
77
- ['mistralai/ministral-14b-instruct-2512', 'Ministral 14B', 'B', '34.0%', '32k'],
78
- ['ibm/granite-34b-code-instruct', 'Granite 34B Code', 'B', '30.0%', '32k'],
79
- ['deepseek-ai/deepseek-r1-distill-llama-8b', 'R1 Distill 8B', 'B', '28.2%', '32k'],
80
- // ── C tier — SWE-bench Verified <25% or lightweight edge models ──
81
- ['deepseek-ai/deepseek-r1-distill-qwen-7b', 'R1 Distill 7B', 'C', '22.6%', '32k'],
82
- ['google/gemma-2-9b-it', 'Gemma 2 9B', 'C', '18.0%', '8k'],
83
- ['microsoft/phi-3.5-mini-instruct', 'Phi 3.5 Mini', 'C', '12.0%', '128k'],
84
- ['microsoft/phi-4-mini-instruct', 'Phi 4 Mini', 'C', '14.0%', '128k'],
74
+ ['bytedance/seed-oss-36b-instruct', 'Seed OSS 36B', 'A-', '38.0%', '32k'],
75
+ ['stockmark/stockmark-2-100b-instruct', 'Stockmark 100B', 'A-', '36.0%', '32k'],
76
+ // ── B+ tier — SWE-bench Verified 30–35% ──
77
+ ['mistralai/mixtral-8x22b-instruct-v0.1', 'Mixtral 8x22B', 'B+', '32.0%', '64k'],
78
+ ['mistralai/ministral-14b-instruct-2512', 'Ministral 14B', 'B+', '34.0%', '32k'],
79
+ ['ibm/granite-34b-code-instruct', 'Granite 34B Code', 'B+', '30.0%', '32k'],
80
+ // ── B tier — SWE-bench Verified 20–30% ──
81
+ ['deepseek-ai/deepseek-r1-distill-llama-8b', 'R1 Distill 8B', 'B', '28.2%', '32k'],
82
+ ['deepseek-ai/deepseek-r1-distill-qwen-7b', 'R1 Distill 7B', 'B', '22.6%', '32k'],
83
+ // ── C tier — SWE-bench Verified <20% or lightweight edge models ──
84
+ ['google/gemma-2-9b-it', 'Gemma 2 9B', 'C', '18.0%', '8k'],
85
+ ['microsoft/phi-3.5-mini-instruct', 'Phi 3.5 Mini', 'C', '12.0%', '128k'],
86
+ ['microsoft/phi-4-mini-instruct', 'Phi 4 Mini', 'C', '14.0%', '128k'],
85
87
  ]
86
88
 
87
89
  // 📖 All sources combined - used by the main script
@@ -95,7 +97,7 @@ export const sources = {
95
97
  // 📖 Flatten all models from all sources for backward compatibility
96
98
  export const MODELS = []
97
99
  for (const [sourceKey, sourceData] of Object.entries(sources)) {
98
- for (const [modelId, label, tier, sweScore, ctw] of sourceData.models) {
99
- MODELS.push([modelId, label, tier, sweScore, ctw])
100
+ for (const [modelId, label, tier, sweScore, ctx] of sourceData.models) {
101
+ MODELS.push([modelId, label, tier, sweScore, ctx])
100
102
  }
101
103
  }