winter-super-cli 2026.5.27 → 2026.5.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/winter.js +2 -1
- package/package.json +1 -1
- package/src/ai/benchmark.js +352 -0
- package/src/ai/model-capabilities.js +185 -0
- package/src/ai/prompts/system-prompt.js +106 -29
- package/src/ai/prompts/task-classifier.js +5 -1
- package/src/ai/providers.js +145 -8
- package/src/ai/reasoning.js +190 -0
- package/src/cli/commands.js +62 -0
- package/src/cli/context-loader.js +64 -1
- package/src/cli/conversation-format.js +90 -12
- package/src/cli/prompt-builder.js +43 -17
- package/src/cli/repl-commands.js +15 -3
- package/src/cli/repl.js +327 -209
- package/src/context/resource-loader.js +136 -0
- package/src/context/router.js +77 -20
- package/src/tools/executor.js +78 -9
package/bin/winter.js
CHANGED
|
@@ -17,7 +17,7 @@ const pkg = JSON.parse(readFileSync(new URL('../package.json', import.meta.url),
|
|
|
17
17
|
const version = pkg.version;
|
|
18
18
|
|
|
19
19
|
const COMMANDS = new Set([
|
|
20
|
-
'chat', 'call', 'session', 'skill', 'plugin', 'design', 'config', 'init',
|
|
20
|
+
'chat', 'call', 'benchmark', 'session', 'skill', 'plugin', 'design', 'config', 'init',
|
|
21
21
|
'help', 'project', 'code', 'review', 'mcp', 'permissions',
|
|
22
22
|
'provider', 'providers', 'model', 'models',
|
|
23
23
|
]);
|
|
@@ -53,6 +53,7 @@ Usage:
|
|
|
53
53
|
Commands:
|
|
54
54
|
|
|
55
55
|
winter call <prompt> Call all configured providers
|
|
56
|
+
winter benchmark [providers] Benchmark model intelligence
|
|
56
57
|
winter session <action> Session management
|
|
57
58
|
winter skill <action> Skill management
|
|
58
59
|
winter plugin <action> Plugin management
|
package/package.json
CHANGED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark Engine — Đo độ thông minh của models trong Winter CLI
|
|
3
|
+
*
|
|
4
|
+
* Cố định câu hỏi test (logic, coding, math, reasoning, language)
|
|
5
|
+
* + Coding task thật → chạy qua providers → chấm điểm → so sánh
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { colors } from '../cli/snowflake-logo.js';
|
|
9
|
+
|
|
10
|
+
// ── Question Bank ────────────────────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
/**
 * Fixed question bank for the benchmark.
 *
 * Each row below is `[id, category, weight, question, keywords]` and is
 * expanded into an object `{ id, category, question, keywords, weight }`.
 * `keywords` are the substrings the scorer looks for in an answer; `weight`
 * scales the item's contribution to the overall score.
 */
const BENCHMARK_QUESTIONS = [
  [
    'q01', 'logic', 1,
    'If all cats are mammals and some mammals are dogs, are all cats dogs? Explain your reasoning step by step.',
    ['not', 'no', 'incorrect', 'cannot conclude', 'not necessarily', 'invalid'],
  ],
  [
    'q02', 'coding', 1.5,
    'Write a JavaScript function called isPalindrome that checks if a string is a palindrome (reads the same forwards and backwards). Include example usage.',
    ['function', 'palindrome', 'reverse', 'split', 'return'],
  ],
  [
    'q03', 'math', 0.5,
    'What is 15% of 200? Show your calculation.',
    ['30', '15', '200', '0.15'],
  ],
  [
    'q04', 'reasoning', 1.5,
    'A bat and a ball cost $1.10 in total. The bat costs $1.00 more than the ball. How much does the ball cost? Think carefully.',
    ['0.05', '5 cents', '5 cent', '0.05$', '5¢', 'five cents'],
  ],
  [
    'q05', 'coding', 1,
    "What's wrong with this code and how would you fix it?\n\nfunction add(a, b) {\n return a + b;\n}\nconsole.log(add(5, '3'));",
    ['string', 'type', 'concatenation', 'number', 'parse', 'typeof', 'coercion'],
  ],
  [
    'q06', 'language', 0.5,
    'Translate this sentence to Vietnamese: "Good morning, how are you today?"',
    ['chào', 'sáng', 'khỏe', 'hôm nay', 'bạn'],
  ],
  [
    'q07', 'logic', 1.5,
    'You have a 3-gallon jug and a 5-gallon jug. How can you measure exactly 4 gallons of water? Explain step by step.',
    ['fill', 'pour', '3', '5', '4', 'empty'],
  ],
  [
    'q08', 'coding', 1.5,
    'Write a recursive function to calculate the nth Fibonacci number. Explain how memoization can optimize it.',
    ['function', 'fibonacci', 'recursive', 'memoization', 'cache'],
  ],
].map(([id, category, weight, question, keywords]) => ({ id, category, question, keywords, weight }));
|
|
70
|
+
|
|
71
|
+
/**
 * Coding tasks for the benchmark.
 *
 * Each row below is `[id, weight, title, description, evaluationCriteria]`
 * and is expanded into an object
 * `{ id, category, title, description, evaluationCriteria, weight }`.
 * Every task shares the category `'coding-task'`.
 */
const BENCHMARK_TASKS = [
  [
    't01', 2,
    'API Fetch with Error Handling',
    'Write a JavaScript function that fetches JSON data from a URL, handles network errors, HTTP errors, and invalid JSON responses gracefully.',
    ['error handling', 'try/catch', 'async/await', 'fetch', 'response.ok'],
  ],
  [
    't02', 2,
    'Event Emitter Class',
    'Create a simple EventEmitter class in JavaScript with on(), off(), and emit() methods. It should support multiple listeners for the same event and removing listeners.',
    ['class', 'on', 'off', 'emit', 'listeners', 'events'],
  ],
  [
    't03', 2.5,
    'Fix This Bug',
    "What's wrong with this code? Identify ALL bugs and provide a fixed version:\n\nconst users = [\n { name: 'Alice', age: 30 },\n { name: 'Bob', age: 25 },\n { name: 'Charlie', age: 35 },\n];\n\nconst adultUsers = users.filter(u => u.age >= 18);\nadultUsers.forEach(u => {\n console.log(u.Name);\n});\n\nadultUsers.sort((a, b) => a.age - b.age);\nconst totalAge = adultUsers.reduce((acc, u) => acc + u.age);\nconsole.log('Average age:', totalAge / adultUsers.length);",
    ['Name', 'name', 'undefined', 'reduce', 'initial', 'initialize', 'capital N'],
  ],
].map(([id, weight, title, description, evaluationCriteria]) => ({
  id,
  category: 'coding-task',
  title,
  description,
  evaluationCriteria,
  weight,
}));
|
|
97
|
+
|
|
98
|
+
// ── Scoring ──────────────────────────────────────────────────────────────────
|
|
99
|
+
|
|
100
|
+
/**
 * Score a free-text answer against a question's expected keywords.
 *
 * The base score is the fraction of `question.keywords` that occur in the
 * answer as case-insensitive substrings. A small bonus is added for longer
 * answers (more than 20 words: +0.05, more than 50 words: +0.1) and the
 * result is capped at 1.
 *
 * @param {{ keywords?: string[] }} question - Question whose keywords are checked.
 * @param {string} answer - Model answer; empty or non-string answers score 0.
 * @returns {number} Score in the range [0, 1].
 */
function scoreAnswer(question, answer) {
  if (!answer || typeof answer !== 'string') return 0;

  // A question without a keyword list contributes no keyword score instead of throwing.
  const keywords = Array.isArray(question?.keywords) ? question.keywords : [];
  const haystack = answer.toLowerCase();
  const hits = keywords.filter((kw) => haystack.includes(String(kw).toLowerCase())).length;
  const ratio = keywords.length > 0 ? hits / keywords.length : 0;

  // Trim before splitting: leading/trailing whitespace would otherwise produce
  // empty tokens that inflate the word count and can grant an unearned bonus.
  const trimmed = answer.trim();
  const words = trimmed === '' ? 0 : trimmed.split(/\s+/).length;
  const lengthBonus = words > 50 ? 0.1 : words > 20 ? 0.05 : 0;

  return Math.min(1, ratio + lengthBonus);
}
|
|
120
|
+
|
|
121
|
+
/**
 * Score a coding-task answer against the task's evaluation criteria.
 *
 * The base score is the fraction of `task.evaluationCriteria` that occur in
 * the answer as case-insensitive substrings. A small bonus is added for
 * longer answers (more than 50 words: +0.05, more than 100 words: +0.1) and
 * the result is capped at 1.
 *
 * @param {{ evaluationCriteria?: string[] }} task - Task whose criteria are checked.
 * @param {string} answer - Model answer; empty or non-string answers score 0.
 * @returns {number} Score in the range [0, 1].
 */
function scoreTask(task, answer) {
  if (!answer || typeof answer !== 'string') return 0;

  // A task without a criteria list contributes no criteria score instead of throwing.
  const criteria = Array.isArray(task?.evaluationCriteria) ? task.evaluationCriteria : [];
  const haystack = answer.toLowerCase();
  const hits = criteria.filter((criterion) => haystack.includes(String(criterion).toLowerCase())).length;
  const ratio = criteria.length > 0 ? hits / criteria.length : 0;

  // Trim before splitting: leading/trailing whitespace would otherwise produce
  // empty tokens that inflate the word count and can grant an unearned bonus.
  const trimmed = answer.trim();
  const words = trimmed === '' ? 0 : trimmed.split(/\s+/).length;
  const lengthBonus = words > 100 ? 0.1 : words > 50 ? 0.05 : 0;

  return Math.min(1, ratio + lengthBonus);
}
|
|
139
|
+
|
|
140
|
+
// ── Benchmark Runner ─────────────────────────────────────────────────────────
|
|
141
|
+
|
|
142
|
+
/**
 * Runs the fixed question bank and the coding tasks against AI providers,
 * scores the answers, and renders the results as text.
 */
export class BenchmarkRunner {
  /**
   * @param {object} aiManager - Object used to reach providers. This class calls
   *   `aiManager.init()`, reads `aiManager.providers[name]`, and calls
   *   `aiManager.sendRequestToProvider(provider, messages, options)`.
   */
  constructor(aiManager) {
    this.ai = aiManager;
  }

  /**
   * Run all benchmark questions across specified providers.
   *
   * Providers are benchmarked one after another. A request that fails is
   * recorded with its `[ERROR: …]` text and a score of 0.
   *
   * @param {string[]} providerNames - List of provider names (e.g., ['claude', 'openai', 'ollama']).
   *   Duplicate names are benchmarked once; a non-array value is treated as empty.
   * @param {object} options
   * @param {boolean} options.tasks - Whether to include coding tasks (default: true)
   * @param {boolean} options.questions - Whether to include fixed questions (default: true)
   * @returns {Promise<object>} Either `{ error }` when no ready provider was found, or
   *   `{ timestamp, totalElapsed, providers, ranking }` where each provider entry
   *   carries its own `elapsed` (time spent on that provider only, in ms).
   */
  async run(providerNames, options = {}) {
    const { questions = true, tasks = true } = options;

    await this.ai.init();

    // De-duplicate so a repeated name is not benchmarked (and billed) twice
    // only to overwrite its own entry in `results`.
    const requested = Array.isArray(providerNames) ? [...new Set(providerNames)] : [];

    // Filter to only ready providers
    const providers = requested
      .map((name) => ({ name, provider: this.ai.providers[name] }))
      .filter(({ provider }) => provider && provider.ready);

    if (providers.length === 0) {
      return { error: 'No ready providers found. Configure providers in winter.json first.' };
    }

    const results = {};
    const startTime = Date.now();

    for (const { name, provider } of providers) {
      console.log(`${colors.dim}Benchmarking ${colors.bright}${name}${colors.reset}${colors.dim}...${colors.reset}`);

      // Per-provider clock: measuring from `startTime` would charge each
      // provider with the time spent on every provider that ran before it.
      const providerStart = Date.now();
      const providerResults = [];

      if (questions) {
        for (const q of BENCHMARK_QUESTIONS) {
          providerResults.push(await this._runItem(provider, q, 'question'));
        }
      }

      if (tasks) {
        for (const t of BENCHMARK_TASKS) {
          providerResults.push(await this._runItem(provider, t, 'task'));
        }
      }

      const totalScore = providerResults.reduce((sum, r) => sum + r.weightedScore, 0);
      const maxScore = providerResults.reduce((sum, r) => sum + r.maxWeightedScore, 0);
      const overall = maxScore > 0 ? Math.round((totalScore / maxScore) * 100) : 0;

      results[name] = {
        provider: name,
        model: provider.model,
        results: providerResults,
        totalScore,
        maxScore,
        overall,
        elapsed: Date.now() - providerStart,
      };
    }

    return {
      timestamp: new Date().toISOString(),
      totalElapsed: Date.now() - startTime,
      providers: results,
      // Sort providers by overall score descending
      ranking: Object.values(results)
        .sort((a, b) => b.overall - a.overall)
        .map((r) => ({ name: r.provider, model: r.model, score: r.overall, elapsed: r.elapsed })),
    };
  }

  /**
   * Ask one provider one benchmark item and build its result record.
   *
   * @param {object} provider - Provider object taken from `this.ai.providers`.
   * @param {object} item - Entry of BENCHMARK_QUESTIONS or BENCHMARK_TASKS.
   * @param {'question'|'task'} type - Which bank the item comes from.
   * @returns {Promise<object>} Result record with score, weighted score and elapsed ms.
   */
  async _runItem(provider, item, type) {
    const isTask = type === 'task';
    const prompt = isTask ? item.description : item.question;

    const startedAt = Date.now();
    const { answer, failed } = await this._query(provider, prompt);
    const elapsed = Date.now() - startedAt;

    // Never keyword-score an error message: text such as "model not found"
    // would otherwise match keywords like "no"/"not" and earn points.
    let score = 0;
    if (!failed) {
      score = isTask ? scoreTask(item, answer) : scoreAnswer(item, answer);
    }

    return {
      type,
      id: item.id,
      category: item.category,
      ...(isTask ? { title: item.title } : {}),
      question: prompt,
      answer: answer.slice(0, 500), // truncate for display
      score,
      weightedScore: score * item.weight,
      maxWeightedScore: item.weight,
      elapsed,
    };
  }

  /**
   * Send a prompt to a provider and report whether the request failed.
   *
   * @param {object} provider - Provider object; `provider.model` is passed as the model option.
   * @param {string} prompt - User prompt.
   * @returns {Promise<{ answer: string, failed: boolean }>} `answer` is always a string;
   *   on failure it is `[ERROR: <message>]`.
   */
  async _query(provider, prompt) {
    try {
      const messages = [
        { role: 'system', content: 'You are a helpful AI assistant. Answer concisely and accurately.' },
        { role: 'user', content: prompt },
      ];
      const data = await this.ai.sendRequestToProvider(provider, messages, {
        enableTools: false,
        model: provider.model,
      });
      const content = data.choices?.[0]?.message?.content;
      // Only string content is usable downstream (slice/replace in the formatter).
      return { answer: typeof content === 'string' ? content : '', failed: false };
    } catch (err) {
      return { answer: `[ERROR: ${err?.message ?? String(err)}]`, failed: true };
    }
  }

  /**
   * Send a prompt to a provider and return the answer text.
   *
   * @param {object} provider - Provider object.
   * @param {string} prompt - User prompt.
   * @returns {Promise<string>} The answer, `''` when the response has no string
   *   content, or `[ERROR: <message>]` when the request throws.
   */
  async askProvider(provider, prompt) {
    const { answer } = await this._query(provider, prompt);
    return answer;
  }

  // ── Format Results ────────────────────────────────────────────────────────

  /**
   * Render a benchmark result (as returned by `run`) as colored text.
   *
   * @param {object} benchmarkResult - Result of `run`, or an `{ error }` object.
   * @returns {string} Multi-line report: header, ranking, then per-provider
   *   category summary and per-item breakdown.
   */
  formatResults(benchmarkResult) {
    if (benchmarkResult.error) {
      return `\n${colors.red}${benchmarkResult.error}${colors.reset}\n`;
    }

    const lines = [];
    lines.push(`\n${colors.cyan}${'═'.repeat(60)}${colors.reset}`);
    lines.push(`${colors.bright}${colors.cyan} 🧠 WINTER MODEL BENCHMARK${colors.reset}`);
    lines.push(`${colors.cyan}${'═'.repeat(60)}${colors.reset}`);
    lines.push(` ${colors.dim}${benchmarkResult.timestamp}${colors.reset}`);
    lines.push(` ${colors.dim}Total time: ${(benchmarkResult.totalElapsed / 1000).toFixed(1)}s${colors.reset}`);
    lines.push('');

    // Ranking
    lines.push(`${colors.bright}🏆 RANKING${colors.reset}`);
    lines.push(`${'─'.repeat(40)}`);
    benchmarkResult.ranking.forEach((r, i) => {
      const medal = i === 0 ? '🥇' : i === 1 ? '🥈' : i === 2 ? '🥉' : ` ${i + 1}.`;
      const bar = this._scoreBar(r.score, 20);
      lines.push(` ${medal} ${colors.bright}${r.name}${colors.reset} ${bar} ${r.score}%`);
      lines.push(` ${colors.dim}Model: ${r.model} | Time: ${(r.elapsed / 1000).toFixed(1)}s${colors.reset}`);
    });
    lines.push('');

    // Detail per provider
    for (const [name, data] of Object.entries(benchmarkResult.providers)) {
      lines.push(`${colors.bright}${'─'.repeat(50)}${colors.reset}`);
      lines.push(`${colors.bright}📊 ${name}${colors.reset} ${colors.dim}(${data.model})${colors.reset}`);
      lines.push(`${'─'.repeat(50)}`);

      // Unweighted average score per category, in first-seen order.
      const categories = new Map();
      for (const r of data.results) {
        const cat = r.category || 'other';
        if (!categories.has(cat)) categories.set(cat, { count: 0, totalScore: 0, maxScore: 0 });
        const stats = categories.get(cat);
        stats.count++;
        stats.totalScore += r.score;
        stats.maxScore += 1;
      }

      for (const [cat, stats] of categories) {
        const catPct = Math.round((stats.totalScore / stats.maxScore) * 100);
        const bar = this._scoreBar(catPct, 10);
        lines.push(` ${bar} ${colors.dim}${cat}:${colors.reset} ${catPct}% (${stats.count} items)`);
      }
      lines.push('');

      // Per-item breakdown
      for (const r of data.results) {
        const icon = r.score >= 0.8 ? '✅' : r.score >= 0.5 ? '🟡' : r.score >= 0.2 ? '🟠' : '❌';
        const label = r.type === 'question' ? r.id : r.title;
        lines.push(` ${icon} ${colors.dim}${label}:${colors.reset} ${Math.round(r.score * 100)}% (${(r.elapsed / 1000).toFixed(1)}s)`);
        // Show preview of answer
        const preview = r.answer.replace(/\n/g, ' ').slice(0, 120);
        lines.push(` ${colors.dim}${preview}${r.answer.length > 120 ? '...' : ''}${colors.reset}`);
      }
      lines.push('');
    }

    lines.push(`${colors.cyan}${'═'.repeat(60)}${colors.reset}\n`);

    return lines.join('\n');
  }

  /**
   * Render a percentage as a fixed-width bar of filled and empty cells.
   *
   * @param {number} score - Percentage; values outside 0–100 (or non-finite)
   *   are clamped so `String.prototype.repeat` never receives a negative count.
   * @param {number} [width=20] - Total number of cells.
   * @returns {string} Colored bar string.
   */
  _scoreBar(score, width = 20) {
    const pct = Number.isFinite(score) ? Math.min(100, Math.max(0, score)) : 0;
    const filled = Math.round((pct / 100) * width);
    const empty = width - filled;
    const filledChar = '█';
    const emptyChar = '░';
    return colors.green + filledChar.repeat(filled) + colors.dim + emptyChar.repeat(empty) + colors.reset;
  }

  // ── History ───────────────────────────────────────────────────────────────

  /**
   * One plain-text line per ranked provider, suitable for storing in history.
   *
   * @param {object} benchmarkResult - Successful result of `run` (must have `ranking`).
   * @returns {string} Lines joined with `\n`.
   */
  formatHistorySummary(benchmarkResult) {
    return benchmarkResult.ranking
      .map((r) => `[${r.name}] Score: ${r.score}% | Model: ${r.model} | Time: ${(r.elapsed / 1000).toFixed(1)}s`)
      .join('\n');
  }
}
|
|
350
|
+
|
|
351
|
+
// Export question/task banks for testing
|
|
352
|
+
export { BENCHMARK_QUESTIONS, BENCHMARK_TASKS, scoreAnswer, scoreTask };
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ❄️ MODEL CAPABILITIES ❄️
|
|
3
|
+
* Detect AI model capability tier from model name.
|
|
4
|
+
* Small models need aggressive prompting to compete with large ones.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Capability tiers a model can be classified into.
 * Frozen so that importers cannot alter the shared enum at runtime.
 */
export const MODEL_TIERS = Object.freeze({
  TINY: 'tiny', // <3B params — barely functional for code
  SMALL: 'small', // 3B-15B params — basic code ability
  MEDIUM: 'medium', // 15B-40B params — decent code ability
  LARGE: 'large', // 40B-120B params — strong code ability, could be flagship
  FLAGSHIP: 'flagship', // 120B+ or proprietary frontier models
});

/**
 * Ordered tiers from weakest to strongest (for comparison).
 * Frozen because tier comparisons depend on this exact order.
 */
const TIER_ORDER = Object.freeze([
  MODEL_TIERS.TINY,
  MODEL_TIERS.SMALL,
  MODEL_TIERS.MEDIUM,
  MODEL_TIERS.LARGE,
  MODEL_TIERS.FLAGSHIP,
]);
|
|
19
|
+
|
|
20
|
+
// Pattern tables are built once at module load instead of on every call.

// Frontier models. `o1`/`o3` are anchored to non-alphanumeric boundaries so
// they do not match inside unrelated names (e.g. "nemo12b").
const FLAGSHIP_PATTERNS = [
  /claude-3-5-sonnet/i, /claude-opus/i, /claude-4/i, /claude-sonnet-4/i,
  /gpt-4o/i, /gpt-4-turbo/i, /(?:^|[^a-z0-9])o[13](?:$|[^a-z0-9])/i,
  /gemini-2\.5-pro/i, /gemini-2\.0-ultra/i,
  /deepseek-v3/i, /deepseek-r1/i,
  /llama-4/i, /llama-3-70b/i, /llama3-70b/i, /llama3\.1-70b/i, /llama3\.2-90b/i, /llama3\.3/i,
  /qwen2\.5-?72b/i, /qwen2\.5-?70b/i, /qwen-?2\.5-?72b/i,
  /mistral-large/i, /mixtral-8x22b/i,
  /command-r-plus/i, /command-a/i,
  /yi-?34b/i,
  /dbrx-instruct/i,
];

const LARGE_PATTERNS = [
  /claude-sonnet/i, /claude-3/i, /claude-2/i,
  /gpt-4/i, /gpt-4-32k/i,
  /llama-3\.1-?70b/i, /llama-3\.2-?70b/i, /llama3-?70b/i,
  /llama-2-?70b/i,
  /qwen-?2\.5-?32b/i, /qwen-?2-?72b/i,
  /codellama-?70b/i,
  /mixtral/i,
  /deepseek-?v2/i,
  /gemini-1\.5-pro/i, /gemini-2\.0-flash/i,
  /command-r/i,
  /yi-?34b/i,
  /mistral-medium/i,
];

// Size-specific small patterns that the broad medium patterns (`qwen-?2`,
// `phi-3`) would otherwise shadow; they must be tested before MEDIUM_PATTERNS.
const SIZED_SMALL_PATTERNS = [
  /qwen-?2\.5-?3b/i, /qwen-?2\.5-?1\.5b/i, /qwen-?2\.5-?0\.5b/i,
  /phi-?3-?mini/i,
];

const MEDIUM_PATTERNS = [
  /qwen-?2\.5-?14b/i, /qwen-?2\.5-?7b/i, /qwen-?2/i,
  /llama-3-?8b/i, /llama-3\.1-?8b/i, /llama-3\.2-?11b/i,
  /llama-2-?13b/i, /llama-2-?7b/i,
  /deepseek-coder-?6\.7b/i, /deepseek-coder-?33b/i,
  /codellama-?34b/i, /codellama-?13b/i, /codellama-?7b/i,
  /mistral/i, /mistral-7b/i,
  /gemma-2-?9b/i, /gemma-?7b/i,
  /phi-3/i, /phi-3-medium/i,
  /nemotron/i,
  /solar/i,
  /dbrx/i,
  /starcoder2/i,
  /deepseek-llm/i,
  /yi-?6b/i, /yi-?9b/i,
];

const SMALL_PATTERNS = [
  /llama-3\.2-?3b/i, /llama-3\.2-?1b/i, /tinyllama/i,
  /phi-?2/i, /phi-?1/i,
  /gemma-?2-?2b/i,
  /stablelm/i,
  /orca/i,
  /falcon/i,
  /red-pajama/i,
  /pythia/i,
  /opt/i,
  /bloom/i,
  /mpnet/i,
];

const TINY_HINT = /tiny|mini|small|nano/i;

/**
 * Classify a model name into a capability tier.
 *
 * Matching order: provider-specific rules (openai/anthropic/claude, groq),
 * then flagship, large, shadow-prone small, medium, remaining small, the
 * tiny/mini/small/nano hint, and finally a provider-based fallback.
 *
 * The name is lower-cased and runs of `:`, `_` and whitespace are replaced
 * with `-`, so tags such as "qwen2.5:72b" match the size patterns.
 *
 * @param {string} modelName - e.g. "llama3", "gpt-4", "qwen2.5:7b"
 * @param {string} [provider] - e.g. "ollama", "openai" (optional, helps disambiguate)
 * @returns {string} One of MODEL_TIERS
 */
export function classifyModelTier(modelName, provider = '') {
  const name = String(modelName || '')
    .toLowerCase()
    .trim()
    .replace(/[:_\s]+/g, '-');
  const prov = String(provider || '').toLowerCase().trim();

  const matchesAny = (patterns) => patterns.some((pattern) => pattern.test(name));
  const isFlagship = matchesAny(FLAGSHIP_PATTERNS);

  // If using a cloud provider like OpenAI/Anthropic, their default models are typically large+
  if (prov === 'openai' || prov === 'anthropic' || prov === 'claude') {
    if (name.includes('gpt-3.5') || name.includes('gpt-3')) return MODEL_TIERS.MEDIUM;
    if (name.includes('claude-3-haiku') || name.includes('claude-3-5-haiku')) return MODEL_TIERS.MEDIUM;
    // Known frontier models are flagship even through their native provider;
    // cut-down "mini"/"nano" variants keep the LARGE default.
    if (isFlagship && !/mini|nano/.test(name)) return MODEL_TIERS.FLAGSHIP;
    return MODEL_TIERS.LARGE; // Default for OpenAI/Anthropic is >= gpt-4 level
  }

  if (prov === 'groq') {
    // Groq runs open models, most are large but some are not
    if (/llama.*8b|llama3.*8b|llama3\.2.*3b/i.test(name)) return MODEL_TIERS.SMALL;
    if (/gemma2.*9b/i.test(name)) return MODEL_TIERS.SMALL;
    if (/mixtral-8x7|llama.*70b|llama3.*70b|llama3\.1.*70b|qwen/i.test(name)) return MODEL_TIERS.LARGE;
    return MODEL_TIERS.MEDIUM; // Default for Groq
  }

  // Check patterns for any provider, strongest tier first.
  if (isFlagship) return MODEL_TIERS.FLAGSHIP;
  if (matchesAny(LARGE_PATTERNS)) return MODEL_TIERS.LARGE;
  if (matchesAny(SIZED_SMALL_PATTERNS)) return MODEL_TIERS.SMALL;
  if (matchesAny(MEDIUM_PATTERNS)) return MODEL_TIERS.MEDIUM;
  if (matchesAny(SMALL_PATTERNS)) return MODEL_TIERS.SMALL;

  if (TINY_HINT.test(name)) return MODEL_TIERS.TINY;

  // Fallback: if Ollama, likely small
  if (prov === 'ollama' || prov === 'local') return MODEL_TIERS.SMALL;

  // Default: assume medium
  return MODEL_TIERS.MEDIUM;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Tell whether a tier belongs to the low-capability group (TINY or SMALL).
 *
 * @param {string} tier - One of MODEL_TIERS.
 * @returns {boolean} True for MODEL_TIERS.TINY and MODEL_TIERS.SMALL, false otherwise.
 */
export function isSmallModel(tier) {
  const lowTiers = [MODEL_TIERS.TINY, MODEL_TIERS.SMALL];
  return lowTiers.includes(tier);
}
|
|
143
|
+
|
|
144
|
+
/**
 * Position of a tier within TIER_ORDER (0 = weakest).
 * A tier that is not listed is treated as index 2.
 *
 * @private
 * @param {string} tier - One of MODEL_TIERS.
 * @returns {number} Zero-based position, or 2 when the tier is not found.
 */
function tierIndex(tier) {
  const position = TIER_ORDER.findIndex((candidate) => candidate === tier);
  if (position === -1) {
    return 2; // Default to medium index
  }
  return position;
}
|
|
152
|
+
|
|
153
|
+
/**
 * Order two tiers by strength using their tierIndex positions.
 *
 * @private
 * @param {string} a - First tier.
 * @param {string} b - Second tier.
 * @returns {number} Negative if a is weaker than b, positive if stronger, 0 if equal.
 */
function compareTiers(a, b) {
  const rankOfA = tierIndex(a);
  const rankOfB = tierIndex(b);
  return rankOfA - rankOfB;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Number of reasoning levels to add for a given model tier.
 *
 * @param {string} tier - One of MODEL_TIERS.
 * @returns {number} 2 for TINY, 1 for SMALL, 0 for every other value.
 */
export function getReasoningBump(tier) {
  if (tier === MODEL_TIERS.TINY) return 2; // bump 2 levels
  if (tier === MODEL_TIERS.SMALL) return 1; // bump 1 level
  return 0;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Short human-readable description of a model tier, intended for insertion
 * into a system prompt.
 *
 * @param {string} tier - One of MODEL_TIERS.
 * @returns {string} The label for the tier, or '' when the tier is not recognized.
 */
export function getModelCapabilityLabel(tier) {
  // Built per call so the lookup only depends on MODEL_TIERS at call time.
  const labelByTier = new Map([
    [MODEL_TIERS.TINY, 'tiny local model — needs maximum guidance'],
    [MODEL_TIERS.SMALL, 'small local model — needs extra structure'],
    [MODEL_TIERS.MEDIUM, 'medium-capability model'],
    [MODEL_TIERS.LARGE, 'high-capability model'],
    [MODEL_TIERS.FLAGSHIP, 'frontier model — full capability expected'],
  ]);
  return labelByTier.has(tier) ? labelByTier.get(tier) : '';
}
|