@aws/ml-container-creator 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/bin/cli.js +5 -2
  2. package/config/bootstrap-stack.json +40 -9
  3. package/infra/ci-harness/buildspec.yml +60 -0
  4. package/infra/ci-harness/package-lock.json +5 -1
  5. package/package.json +1 -1
  6. package/servers/README.md +41 -1
  7. package/servers/instance-sizer/index.js +10 -4
  8. package/servers/instance-sizer/lib/model-resolver.js +1 -1
  9. package/servers/lib/catalogs/model-sizes.json +135 -90
  10. package/servers/lib/catalogs/models.json +483 -411
  11. package/src/app.js +33 -2
  12. package/src/lib/bootstrap-command-handler.js +6 -0
  13. package/src/lib/cli-handler.js +1 -1
  14. package/src/lib/config-manager.js +41 -2
  15. package/src/lib/deployment-entry-schema.js +16 -0
  16. package/src/lib/mcp-client.js +3 -3
  17. package/src/lib/prompt-runner.js +179 -8
  18. package/src/lib/prompts.js +253 -7
  19. package/src/lib/registry-command-handler.js +12 -0
  20. package/templates/Dockerfile +12 -0
  21. package/templates/code/serving.properties +14 -0
  22. package/templates/do/adapter +1230 -0
  23. package/templates/do/adapters/.gitkeep +2 -0
  24. package/templates/do/add-ic +130 -0
  25. package/templates/do/benchmark +81 -9
  26. package/templates/do/clean +507 -17
  27. package/templates/do/config +28 -5
  28. package/templates/do/deploy +513 -367
  29. package/templates/do/ic/default.conf +32 -0
  30. package/templates/do/lib/endpoint-config.sh +216 -0
  31. package/templates/do/lib/inference-component.sh +167 -0
  32. package/templates/do/lib/secrets.sh +44 -0
  33. package/templates/do/lib/wait.sh +131 -0
  34. package/templates/do/logs +107 -27
  35. package/templates/do/optimize +528 -0
  36. package/templates/do/register +111 -1
  37. package/templates/do/status +337 -0
  38. package/templates/do/test +80 -28
@@ -1,372 +1,555 @@
1
1
  {
2
- "openai/gpt-oss-20b": {
3
- "family": "gpt-oss",
4
- "parameterCount": 20000000000,
2
+ "meta-llama/Llama-3.2-1B-Instruct": {
3
+ "family": "llama-3",
4
+ "parameterCount": 1235814400,
5
5
  "defaultDtype": "bfloat16",
6
- "maxPositionEmbeddings": 8192,
7
- "gated": false,
6
+ "maxPositionEmbeddings": 131072,
7
+ "gated": true,
8
8
  "tags": [
9
9
  "text-generation",
10
- "openai",
10
+ "llama-3",
11
11
  "conversational"
12
12
  ],
13
- "architecture": "GPT2LMHeadModel",
14
- "notes": "Open-source 20B parameter model. Requires significant GPU memory for inference",
13
+ "architecture": "LlamaForCausalLM",
14
+ "notes": "Llama 3.2 1B Instruct. Lightweight model suitable for single-GPU deployment",
15
15
  "chatTemplate": "",
16
16
  "frameworkCompatibility": {
17
- "vllm": ">=0.3.0",
18
- "tensorrt-llm": ">=0.8.0",
19
- "sglang": ">=0.2.0"
17
+ "vllm": ">=0.5.0",
18
+ "tensorrt-llm": ">=0.9.0",
19
+ "sglang": ">=0.3.0"
20
20
  },
21
- "validationLevel": "community-validated",
21
+ "validationLevel": "tested",
22
22
  "modelType": "transformer",
23
23
  "tasks": [
24
24
  "text-generation"
25
25
  ]
26
26
  },
27
- "meta-llama/Llama-2-7b-chat-hf": {
28
- "family": "llama-2",
27
+ "meta-llama/Llama-3.2-3B-Instruct": {
28
+ "family": "llama-3",
29
+ "parameterCount": 3212749824,
30
+ "defaultDtype": "bfloat16",
31
+ "maxPositionEmbeddings": 131072,
29
32
  "gated": true,
30
33
  "tags": [
31
34
  "text-generation",
32
- "llama-2",
35
+ "llama-3",
33
36
  "conversational"
34
37
  ],
35
38
  "architecture": "LlamaForCausalLM",
36
- "profiles": {
37
- "7b": {
38
- "displayName": "Llama-2 7B",
39
- "envVars": {
40
- "MAX_MODEL_LEN": "4096",
41
- "GPU_MEMORY_UTILIZATION": "0.9"
42
- }
43
- }
44
- },
45
- "notes": "Llama-2 7B chat model with official chat template. Requires HuggingFace authentication for download",
46
- "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
39
+ "notes": "Llama 3.2 3B Instruct. Compact model with strong performance for its size",
40
+ "chatTemplate": "",
47
41
  "frameworkCompatibility": {
48
- "vllm": ">=0.3.0",
49
- "tensorrt-llm": ">=0.8.0",
50
- "sglang": ">=0.2.0"
42
+ "vllm": ">=0.5.0",
43
+ "tensorrt-llm": ">=0.9.0",
44
+ "sglang": ">=0.3.0"
51
45
  },
52
46
  "validationLevel": "tested",
53
47
  "modelType": "transformer",
54
- "parameterCount": 6738415616,
55
- "defaultDtype": "float16",
56
- "maxPositionEmbeddings": 4096,
57
- "recommendedQuantizations": [
58
- "awq",
59
- "gptq"
60
- ],
61
48
  "tasks": [
62
49
  "text-generation"
63
50
  ]
64
51
  },
65
- "meta-llama/Llama-2-13b-chat-hf": {
66
- "family": "llama-2",
52
+ "meta-llama/Llama-3.1-8B-Instruct": {
53
+ "family": "llama-3",
54
+ "parameterCount": 8030261248,
55
+ "defaultDtype": "bfloat16",
56
+ "maxPositionEmbeddings": 131072,
67
57
  "gated": true,
68
58
  "tags": [
69
59
  "text-generation",
70
- "llama-2",
60
+ "llama-3",
71
61
  "conversational"
72
62
  ],
73
63
  "architecture": "LlamaForCausalLM",
74
- "profiles": {
75
- "13b": {
76
- "displayName": "Llama-2 13B",
77
- "envVars": {
78
- "MAX_MODEL_LEN": "4096",
79
- "GPU_MEMORY_UTILIZATION": "0.9"
80
- }
81
- }
82
- },
83
- "notes": "Llama-2 13B chat model. Requires more GPU memory than 7B variant",
84
- "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
64
+ "notes": "Llama 3.1 8B Instruct with 128K context window",
65
+ "chatTemplate": "",
85
66
  "frameworkCompatibility": {
86
- "vllm": ">=0.3.0",
87
- "tensorrt-llm": ">=0.8.0",
88
- "sglang": ">=0.2.0"
67
+ "vllm": ">=0.5.0",
68
+ "tensorrt-llm": ">=0.9.0",
69
+ "sglang": ">=0.3.0"
89
70
  },
90
71
  "validationLevel": "tested",
91
72
  "modelType": "transformer",
92
- "parameterCount": 13015864320,
93
- "defaultDtype": "float16",
94
- "maxPositionEmbeddings": 4096,
95
- "recommendedQuantizations": [
96
- "awq",
97
- "gptq"
98
- ],
99
73
  "tasks": [
100
74
  "text-generation"
101
75
  ]
102
76
  },
103
- "meta-llama/Llama-2-70b-chat-hf": {
104
- "family": "llama-2",
105
- "parameterCount": 70000000000,
106
- "defaultDtype": "float16",
107
- "maxPositionEmbeddings": 4096,
77
+ "meta-llama/Llama-3.3-70B-Instruct": {
78
+ "family": "llama-3",
79
+ "parameterCount": 70553706496,
80
+ "defaultDtype": "bfloat16",
81
+ "maxPositionEmbeddings": 131072,
108
82
  "gated": true,
109
83
  "tags": [
110
84
  "text-generation",
111
- "llama-2",
85
+ "llama-3",
112
86
  "conversational"
113
87
  ],
114
88
  "architecture": "LlamaForCausalLM",
115
- "profiles": {
116
- "70b-tp2": {
117
- "displayName": "Llama-2 70B (2-GPU)",
118
- "envVars": {
119
- "TENSOR_PARALLEL_SIZE": "2",
120
- "MAX_MODEL_LEN": "4096",
121
- "GPU_MEMORY_UTILIZATION": "0.95"
122
- }
123
- },
124
- "70b-tp4": {
125
- "displayName": "Llama-2 70B (4-GPU)",
126
- "envVars": {
127
- "TENSOR_PARALLEL_SIZE": "4",
128
- "MAX_MODEL_LEN": "4096",
129
- "GPU_MEMORY_UTILIZATION": "0.9"
130
- }
131
- }
89
+ "notes": "Llama 3.3 70B Instruct. Requires multi-GPU tensor parallelism",
90
+ "chatTemplate": "",
91
+ "frameworkCompatibility": {
92
+ "vllm": ">=0.5.0",
93
+ "tensorrt-llm": ">=0.9.0",
94
+ "sglang": ">=0.3.0"
132
95
  },
133
- "notes": "Llama-2 70B requires tensor parallelism across multiple GPUs",
134
- "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
96
+ "validationLevel": "tested",
97
+ "modelType": "transformer",
98
+ "tasks": [
99
+ "text-generation"
100
+ ]
101
+ },
102
+ "Qwen/Qwen3-0.6B": {
103
+ "family": "qwen3",
104
+ "parameterCount": 600000000,
105
+ "defaultDtype": "bfloat16",
106
+ "maxPositionEmbeddings": 32768,
107
+ "gated": false,
108
+ "tags": [
109
+ "text-generation",
110
+ "qwen",
111
+ "conversational"
112
+ ],
113
+ "architecture": "Qwen3ForCausalLM",
114
+ "notes": "Qwen3 0.6B. Ultra-lightweight model for edge and low-resource deployments",
115
+ "chatTemplate": "",
135
116
  "frameworkCompatibility": {
136
- "vllm": ">=0.3.0",
137
- "tensorrt-llm": ">=0.8.0",
138
- "sglang": ">=0.2.0"
117
+ "vllm": ">=0.5.0",
118
+ "tensorrt-llm": ">=0.9.0",
119
+ "sglang": ">=0.3.0"
139
120
  },
140
- "validationLevel": "community-validated",
121
+ "validationLevel": "tested",
141
122
  "modelType": "transformer",
142
- "parameterCount": 68976648192,
143
- "defaultDtype": "float16",
144
- "maxPositionEmbeddings": 4096,
145
- "recommendedQuantizations": [
146
- "awq",
147
- "gptq"
123
+ "tasks": [
124
+ "text-generation"
125
+ ]
126
+ },
127
+ "Qwen/Qwen3-1.7B": {
128
+ "family": "qwen3",
129
+ "parameterCount": 1700000000,
130
+ "defaultDtype": "bfloat16",
131
+ "maxPositionEmbeddings": 32768,
132
+ "gated": false,
133
+ "tags": [
134
+ "text-generation",
135
+ "qwen",
136
+ "conversational"
148
137
  ],
138
+ "architecture": "Qwen3ForCausalLM",
139
+ "notes": "Qwen3 1.7B. Lightweight model with strong reasoning capabilities",
140
+ "chatTemplate": "",
141
+ "frameworkCompatibility": {
142
+ "vllm": ">=0.5.0",
143
+ "tensorrt-llm": ">=0.9.0",
144
+ "sglang": ">=0.3.0"
145
+ },
146
+ "validationLevel": "tested",
147
+ "modelType": "transformer",
149
148
  "tasks": [
150
149
  "text-generation"
151
150
  ]
152
151
  },
153
- "mistralai/Mistral-7B-Instruct-v0.1": {
154
- "family": "mistral",
152
+ "Qwen/Qwen3-4B": {
153
+ "family": "qwen3",
154
+ "parameterCount": 4000000000,
155
+ "defaultDtype": "bfloat16",
156
+ "maxPositionEmbeddings": 32768,
155
157
  "gated": false,
156
158
  "tags": [
157
159
  "text-generation",
158
- "mistral",
160
+ "qwen",
159
161
  "conversational"
160
162
  ],
161
- "architecture": "MistralForCausalLM",
162
- "profiles": {
163
- "7b": {
164
- "displayName": "Mistral 7B Instruct",
165
- "envVars": {
166
- "MAX_MODEL_LEN": "8192",
167
- "GPU_MEMORY_UTILIZATION": "0.9"
168
- }
169
- }
163
+ "architecture": "Qwen3ForCausalLM",
164
+ "notes": "Qwen3 4B. Balanced model for single-GPU inference",
165
+ "chatTemplate": "",
166
+ "frameworkCompatibility": {
167
+ "vllm": ">=0.5.0",
168
+ "tensorrt-llm": ">=0.9.0",
169
+ "sglang": ">=0.3.0"
170
170
  },
171
- "notes": "Mistral 7B v0.1 with 8K context window",
172
- "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
171
+ "validationLevel": "tested",
172
+ "modelType": "transformer",
173
+ "tasks": [
174
+ "text-generation"
175
+ ]
176
+ },
177
+ "Qwen/Qwen3-8B": {
178
+ "family": "qwen3",
179
+ "parameterCount": 8000000000,
180
+ "defaultDtype": "bfloat16",
181
+ "maxPositionEmbeddings": 32768,
182
+ "gated": false,
183
+ "tags": [
184
+ "text-generation",
185
+ "qwen",
186
+ "conversational"
187
+ ],
188
+ "architecture": "Qwen3ForCausalLM",
189
+ "notes": "Qwen3 8B. Strong general-purpose model for single-GPU deployment",
190
+ "chatTemplate": "",
173
191
  "frameworkCompatibility": {
174
- "vllm": ">=0.3.0",
175
- "tensorrt-llm": ">=0.8.0",
176
- "sglang": ">=0.2.0"
192
+ "vllm": ">=0.5.0",
193
+ "tensorrt-llm": ">=0.9.0",
194
+ "sglang": ">=0.3.0"
177
195
  },
178
196
  "validationLevel": "tested",
179
197
  "modelType": "transformer",
180
- "parameterCount": 7241732096,
198
+ "tasks": [
199
+ "text-generation"
200
+ ]
201
+ },
202
+ "Qwen/Qwen3-14B": {
203
+ "family": "qwen3",
204
+ "parameterCount": 14000000000,
181
205
  "defaultDtype": "bfloat16",
182
206
  "maxPositionEmbeddings": 32768,
183
- "recommendedQuantizations": [
184
- "awq",
185
- "gptq"
207
+ "gated": false,
208
+ "tags": [
209
+ "text-generation",
210
+ "qwen",
211
+ "conversational"
186
212
  ],
213
+ "architecture": "Qwen3ForCausalLM",
214
+ "notes": "Qwen3 14B. High-quality model requiring larger GPU memory",
215
+ "chatTemplate": "",
216
+ "frameworkCompatibility": {
217
+ "vllm": ">=0.5.0",
218
+ "tensorrt-llm": ">=0.9.0",
219
+ "sglang": ">=0.3.0"
220
+ },
221
+ "validationLevel": "tested",
222
+ "modelType": "transformer",
187
223
  "tasks": [
188
224
  "text-generation"
189
225
  ]
190
226
  },
191
- "mistralai/Mistral-7B-Instruct-v0.2": {
192
- "family": "mistral",
227
+ "Qwen/Qwen3-32B": {
228
+ "family": "qwen3",
229
+ "parameterCount": 32000000000,
230
+ "defaultDtype": "bfloat16",
231
+ "maxPositionEmbeddings": 32768,
193
232
  "gated": false,
194
233
  "tags": [
195
234
  "text-generation",
196
- "mistral",
235
+ "qwen",
197
236
  "conversational"
198
237
  ],
199
- "architecture": "MistralForCausalLM",
200
- "profiles": {
201
- "7b": {
202
- "displayName": "Mistral 7B Instruct v0.2",
203
- "envVars": {
204
- "MAX_MODEL_LEN": "32768",
205
- "GPU_MEMORY_UTILIZATION": "0.9"
206
- }
207
- }
238
+ "architecture": "Qwen3ForCausalLM",
239
+ "notes": "Qwen3 32B. Large model requiring multi-GPU or quantization",
240
+ "chatTemplate": "",
241
+ "frameworkCompatibility": {
242
+ "vllm": ">=0.5.0",
243
+ "tensorrt-llm": ">=0.9.0",
244
+ "sglang": ">=0.3.0"
208
245
  },
209
- "notes": "Mistral 7B v0.2 with extended 32K context window. Requires more memory for long contexts",
210
- "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
246
+ "validationLevel": "tested",
247
+ "modelType": "transformer",
248
+ "tasks": [
249
+ "text-generation"
250
+ ]
251
+ },
252
+ "Qwen/Qwen2.5-7B-Instruct": {
253
+ "family": "qwen2.5",
254
+ "parameterCount": 7721324544,
255
+ "defaultDtype": "bfloat16",
256
+ "maxPositionEmbeddings": 131072,
257
+ "gated": false,
258
+ "tags": [
259
+ "text-generation",
260
+ "qwen",
261
+ "conversational"
262
+ ],
263
+ "architecture": "Qwen2ForCausalLM",
264
+ "notes": "Qwen2.5 7B Instruct with 128K context window",
265
+ "chatTemplate": "",
211
266
  "frameworkCompatibility": {
212
- "vllm": ">=0.3.0",
213
- "tensorrt-llm": ">=0.8.0",
214
- "sglang": ">=0.2.0"
267
+ "vllm": ">=0.5.0",
268
+ "tensorrt-llm": ">=0.9.0",
269
+ "sglang": ">=0.3.0"
215
270
  },
216
271
  "validationLevel": "tested",
217
272
  "modelType": "transformer",
218
- "parameterCount": 7241732096,
273
+ "tasks": [
274
+ "text-generation"
275
+ ]
276
+ },
277
+ "Qwen/Qwen2.5-14B-Instruct": {
278
+ "family": "qwen2.5",
279
+ "parameterCount": 14167134208,
219
280
  "defaultDtype": "bfloat16",
220
- "maxPositionEmbeddings": 32768,
221
- "recommendedQuantizations": [
222
- "awq",
223
- "gptq"
281
+ "maxPositionEmbeddings": 131072,
282
+ "gated": false,
283
+ "tags": [
284
+ "text-generation",
285
+ "qwen",
286
+ "conversational"
224
287
  ],
288
+ "architecture": "Qwen2ForCausalLM",
289
+ "notes": "Qwen2.5 14B Instruct with 128K context window",
290
+ "chatTemplate": "",
291
+ "frameworkCompatibility": {
292
+ "vllm": ">=0.5.0",
293
+ "tensorrt-llm": ">=0.9.0",
294
+ "sglang": ">=0.3.0"
295
+ },
296
+ "validationLevel": "tested",
297
+ "modelType": "transformer",
225
298
  "tasks": [
226
299
  "text-generation"
227
300
  ]
228
301
  },
229
- "mistralai/Mixtral-8x7B-Instruct-v0.1": {
230
- "family": "mistral",
302
+ "Qwen/Qwen2.5-32B-Instruct": {
303
+ "family": "qwen2.5",
304
+ "parameterCount": 32000000000,
305
+ "defaultDtype": "bfloat16",
306
+ "maxPositionEmbeddings": 131072,
231
307
  "gated": false,
232
308
  "tags": [
233
309
  "text-generation",
234
- "mistral",
235
- "mixture-of-experts"
310
+ "qwen",
311
+ "conversational"
236
312
  ],
237
- "architecture": "MixtralForCausalLM",
238
- "profiles": {
239
- "8x7b-tp2": {
240
- "displayName": "Mixtral 8x7B (2-GPU)",
241
- "envVars": {
242
- "TENSOR_PARALLEL_SIZE": "2",
243
- "MAX_MODEL_LEN": "32768",
244
- "GPU_MEMORY_UTILIZATION": "0.95"
245
- }
246
- }
313
+ "architecture": "Qwen2ForCausalLM",
314
+ "notes": "Qwen2.5 32B Instruct with 128K context window. Requires multi-GPU or quantization",
315
+ "chatTemplate": "",
316
+ "frameworkCompatibility": {
317
+ "vllm": ">=0.5.0",
318
+ "tensorrt-llm": ">=0.9.0",
319
+ "sglang": ">=0.3.0"
247
320
  },
248
- "notes": "Mixtral 8x7B MoE model. Requires tensor parallelism for efficient inference",
249
- "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
321
+ "validationLevel": "tested",
322
+ "modelType": "transformer",
323
+ "tasks": [
324
+ "text-generation"
325
+ ]
326
+ },
327
+ "Qwen/Qwen2.5-72B-Instruct": {
328
+ "family": "qwen2.5",
329
+ "parameterCount": 72710410240,
330
+ "defaultDtype": "bfloat16",
331
+ "maxPositionEmbeddings": 131072,
332
+ "gated": false,
333
+ "tags": [
334
+ "text-generation",
335
+ "qwen",
336
+ "conversational"
337
+ ],
338
+ "architecture": "Qwen2ForCausalLM",
339
+ "notes": "Qwen2.5 72B Instruct with 128K context window. Requires multi-GPU tensor parallelism",
340
+ "chatTemplate": "",
250
341
  "frameworkCompatibility": {
251
- "vllm": ">=0.3.0",
252
- "tensorrt-llm": ">=0.8.0",
253
- "sglang": ">=0.2.0"
342
+ "vllm": ">=0.5.0",
343
+ "tensorrt-llm": ">=0.9.0",
344
+ "sglang": ">=0.3.0"
254
345
  },
255
- "validationLevel": "community-validated",
346
+ "validationLevel": "tested",
256
347
  "modelType": "transformer",
257
- "parameterCount": 46702792704,
348
+ "tasks": [
349
+ "text-generation"
350
+ ]
351
+ },
352
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": {
353
+ "family": "deepseek-r1",
354
+ "parameterCount": 1500000000,
258
355
  "defaultDtype": "bfloat16",
259
- "maxPositionEmbeddings": 32768,
260
- "recommendedQuantizations": [
261
- "awq",
262
- "gptq"
356
+ "maxPositionEmbeddings": 131072,
357
+ "gated": false,
358
+ "tags": [
359
+ "text-generation",
360
+ "deepseek",
361
+ "reasoning",
362
+ "conversational"
263
363
  ],
364
+ "architecture": "Qwen2ForCausalLM",
365
+ "notes": "DeepSeek R1 Distill Qwen 1.5B. Reasoning-focused distilled model",
366
+ "chatTemplate": "",
367
+ "frameworkCompatibility": {
368
+ "vllm": ">=0.5.0",
369
+ "tensorrt-llm": ">=0.9.0",
370
+ "sglang": ">=0.3.0"
371
+ },
372
+ "validationLevel": "tested",
373
+ "modelType": "transformer",
264
374
  "tasks": [
265
375
  "text-generation"
266
376
  ]
267
377
  },
268
- "meta-llama/Llama-2-70b-hf": {
269
- "family": "llama-2",
270
- "parameterCount": 70000000000,
271
- "defaultDtype": "float16",
272
- "maxPositionEmbeddings": 4096,
273
- "gated": true,
378
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": {
379
+ "family": "deepseek-r1",
380
+ "parameterCount": 7000000000,
381
+ "defaultDtype": "bfloat16",
382
+ "maxPositionEmbeddings": 131072,
383
+ "gated": false,
274
384
  "tags": [
275
385
  "text-generation",
276
- "llama-2"
386
+ "deepseek",
387
+ "reasoning",
388
+ "conversational"
277
389
  ],
278
- "architecture": "LlamaForCausalLM",
279
- "notes": "Llama-2 70B base model (non-chat). Requires multi-GPU for inference.",
390
+ "architecture": "Qwen2ForCausalLM",
391
+ "notes": "DeepSeek R1 Distill Qwen 7B. Reasoning-focused distilled model",
280
392
  "chatTemplate": "",
281
393
  "frameworkCompatibility": {
282
- "vllm": ">=0.3.0",
283
- "tensorrt-llm": ">=0.8.0",
284
- "sglang": ">=0.2.0"
394
+ "vllm": ">=0.5.0",
395
+ "tensorrt-llm": ">=0.9.0",
396
+ "sglang": ">=0.3.0"
285
397
  },
286
- "validationLevel": "community-validated",
398
+ "validationLevel": "tested",
287
399
  "modelType": "transformer",
288
400
  "tasks": [
289
401
  "text-generation"
290
402
  ]
291
403
  },
292
- "meta-llama/Llama-2-*": {
293
- "family": "llama-2",
294
- "gated": true,
404
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": {
405
+ "family": "deepseek-r1",
406
+ "parameterCount": 14000000000,
407
+ "defaultDtype": "bfloat16",
408
+ "maxPositionEmbeddings": 131072,
409
+ "gated": false,
295
410
  "tags": [
296
411
  "text-generation",
297
- "llama-2"
412
+ "deepseek",
413
+ "reasoning",
414
+ "conversational"
298
415
  ],
299
- "architecture": null,
300
- "notes": "Fallback configuration for Llama-2 models not explicitly listed. Uses standard Llama-2 chat template",
301
- "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
416
+ "architecture": "Qwen2ForCausalLM",
417
+ "notes": "DeepSeek R1 Distill Qwen 14B. Reasoning-focused distilled model",
418
+ "chatTemplate": "",
302
419
  "frameworkCompatibility": {
303
- "vllm": ">=0.3.0",
304
- "tensorrt-llm": ">=0.8.0",
305
- "sglang": ">=0.2.0"
420
+ "vllm": ">=0.5.0",
421
+ "tensorrt-llm": ">=0.9.0",
422
+ "sglang": ">=0.3.0"
306
423
  },
307
- "validationLevel": "experimental",
424
+ "validationLevel": "tested",
308
425
  "modelType": "transformer",
309
426
  "tasks": [
310
427
  "text-generation"
311
428
  ]
312
429
  },
313
- "mistralai/Mistral-*": {
314
- "family": "mistral",
430
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": {
431
+ "family": "deepseek-r1",
432
+ "parameterCount": 32000000000,
433
+ "defaultDtype": "bfloat16",
434
+ "maxPositionEmbeddings": 131072,
315
435
  "gated": false,
316
436
  "tags": [
317
437
  "text-generation",
318
- "mistral"
438
+ "deepseek",
439
+ "reasoning",
440
+ "conversational"
319
441
  ],
320
- "architecture": null,
321
- "notes": "Fallback configuration for Mistral models not explicitly listed. Uses standard Mistral chat template",
322
- "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
442
+ "architecture": "Qwen2ForCausalLM",
443
+ "notes": "DeepSeek R1 Distill Qwen 32B. Reasoning-focused distilled model. Requires multi-GPU or quantization",
444
+ "chatTemplate": "",
323
445
  "frameworkCompatibility": {
324
- "vllm": ">=0.3.0",
325
- "tensorrt-llm": ">=0.8.0",
326
- "sglang": ">=0.2.0"
446
+ "vllm": ">=0.5.0",
447
+ "tensorrt-llm": ">=0.9.0",
448
+ "sglang": ">=0.3.0"
327
449
  },
328
- "validationLevel": "experimental",
450
+ "validationLevel": "tested",
329
451
  "modelType": "transformer",
330
452
  "tasks": [
331
453
  "text-generation"
332
454
  ]
333
455
  },
334
- "codellama/*": {
335
- "family": "codellama",
456
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": {
457
+ "family": "deepseek-r1",
458
+ "parameterCount": 8000000000,
459
+ "defaultDtype": "bfloat16",
460
+ "maxPositionEmbeddings": 131072,
336
461
  "gated": false,
337
462
  "tags": [
338
463
  "text-generation",
339
- "code",
340
- "codellama"
464
+ "deepseek",
465
+ "reasoning",
466
+ "conversational"
341
467
  ],
342
- "architecture": null,
343
- "notes": "CodeLlama models use Llama-2 chat template. Optimized for code generation",
344
- "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
468
+ "architecture": "LlamaForCausalLM",
469
+ "notes": "DeepSeek R1 Distill Llama 8B. Reasoning-focused distilled model based on Llama architecture",
470
+ "chatTemplate": "",
345
471
  "frameworkCompatibility": {
346
- "vllm": ">=0.3.0",
347
- "tensorrt-llm": ">=0.8.0"
472
+ "vllm": ">=0.5.0",
473
+ "tensorrt-llm": ">=0.9.0",
474
+ "sglang": ">=0.3.0"
348
475
  },
349
- "validationLevel": "experimental",
476
+ "validationLevel": "tested",
350
477
  "modelType": "transformer",
351
478
  "tasks": [
352
479
  "text-generation"
353
480
  ]
354
481
  },
355
- "tiiuae/falcon-*": {
356
- "family": "falcon",
482
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {
483
+ "family": "deepseek-r1",
484
+ "parameterCount": 70000000000,
485
+ "defaultDtype": "bfloat16",
486
+ "maxPositionEmbeddings": 131072,
357
487
  "gated": false,
358
488
  "tags": [
359
489
  "text-generation",
360
- "falcon"
490
+ "deepseek",
491
+ "reasoning",
492
+ "conversational"
361
493
  ],
362
- "architecture": null,
363
- "notes": "Falcon models typically don't require chat templates for instruction following",
364
- "chatTemplate": null,
494
+ "architecture": "LlamaForCausalLM",
495
+ "notes": "DeepSeek R1 Distill Llama 70B. Reasoning-focused distilled model. Requires multi-GPU tensor parallelism",
496
+ "chatTemplate": "",
365
497
  "frameworkCompatibility": {
366
- "vllm": ">=0.3.0",
367
- "tensorrt-llm": ">=0.8.0"
498
+ "vllm": ">=0.5.0",
499
+ "tensorrt-llm": ">=0.9.0",
500
+ "sglang": ">=0.3.0"
368
501
  },
369
- "validationLevel": "experimental",
502
+ "validationLevel": "tested",
503
+ "modelType": "transformer",
504
+ "tasks": [
505
+ "text-generation"
506
+ ]
507
+ },
508
+ "openai/gpt-oss-20b": {
509
+ "family": "gpt-oss",
510
+ "parameterCount": 20000000000,
511
+ "defaultDtype": "bfloat16",
512
+ "maxPositionEmbeddings": 8192,
513
+ "gated": false,
514
+ "tags": [
515
+ "text-generation",
516
+ "openai",
517
+ "conversational"
518
+ ],
519
+ "architecture": "GPT2LMHeadModel",
520
+ "notes": "GPT-OSS 20B. Open-source 20B parameter model from OpenAI",
521
+ "chatTemplate": "",
522
+ "frameworkCompatibility": {
523
+ "vllm": ">=0.5.0",
524
+ "tensorrt-llm": ">=0.9.0",
525
+ "sglang": ">=0.3.0"
526
+ },
527
+ "validationLevel": "tested",
528
+ "modelType": "transformer",
529
+ "tasks": [
530
+ "text-generation"
531
+ ]
532
+ },
533
+ "openai/gpt-oss-120b": {
534
+ "family": "gpt-oss",
535
+ "parameterCount": 120000000000,
536
+ "defaultDtype": "bfloat16",
537
+ "maxPositionEmbeddings": 8192,
538
+ "gated": false,
539
+ "tags": [
540
+ "text-generation",
541
+ "openai",
542
+ "conversational"
543
+ ],
544
+ "architecture": "GPT2LMHeadModel",
545
+ "notes": "GPT-OSS 120B. Large open-source model from OpenAI. Requires multi-GPU tensor parallelism",
546
+ "chatTemplate": "",
547
+ "frameworkCompatibility": {
548
+ "vllm": ">=0.5.0",
549
+ "tensorrt-llm": ">=0.9.0",
550
+ "sglang": ">=0.3.0"
551
+ },
552
+ "validationLevel": "tested",
370
553
  "modelType": "transformer",
371
554
  "tasks": [
372
555
  "text-generation"
@@ -464,240 +647,129 @@
464
647
  "text-to-video"
465
648
  ]
466
649
  },
467
- "stabilityai/stable-diffusion-*": {
468
- "family": "stable-diffusion",
469
- "gated": false,
650
+ "meta-llama/Llama-3*": {
651
+ "family": "llama-3",
652
+ "gated": true,
470
653
  "tags": [
471
- "image-generation",
472
- "diffusion",
473
- "stable-diffusion"
654
+ "text-generation",
655
+ "llama-3"
474
656
  ],
475
657
  "architecture": null,
476
- "notes": "Fallback for Stable Diffusion variants not explicitly listed",
477
- "chatTemplate": null,
658
+ "notes": "Fallback configuration for Llama 3.x models not explicitly listed",
659
+ "chatTemplate": "",
478
660
  "frameworkCompatibility": {
479
- "vllm-omni": ">=0.14.0"
661
+ "vllm": ">=0.5.0",
662
+ "tensorrt-llm": ">=0.9.0",
663
+ "sglang": ">=0.3.0"
480
664
  },
481
665
  "validationLevel": "experimental",
482
- "modelType": "diffusor",
666
+ "modelType": "transformer",
483
667
  "tasks": [
484
- "text-to-image"
668
+ "text-generation"
485
669
  ]
486
670
  },
487
- "black-forest-labs/FLUX*": {
488
- "family": "flux",
671
+ "Qwen/Qwen*": {
672
+ "family": "qwen",
489
673
  "gated": false,
490
674
  "tags": [
491
- "image-generation",
492
- "diffusion",
493
- "flux"
675
+ "text-generation",
676
+ "qwen"
494
677
  ],
495
678
  "architecture": null,
496
- "notes": "Fallback for FLUX model variants not explicitly listed",
497
- "chatTemplate": null,
679
+ "notes": "Fallback configuration for Qwen models not explicitly listed",
680
+ "chatTemplate": "",
498
681
  "frameworkCompatibility": {
499
- "vllm-omni": ">=0.14.0"
682
+ "vllm": ">=0.5.0",
683
+ "tensorrt-llm": ">=0.9.0",
684
+ "sglang": ">=0.3.0"
500
685
  },
501
686
  "validationLevel": "experimental",
502
- "modelType": "diffusor",
503
- "tasks": [
504
- "text-to-image"
505
- ]
506
- },
507
- "meta-llama/Meta-Llama-3-8B*": {
508
- "parameterCount": 8030261248,
509
- "defaultDtype": "bfloat16",
510
- "architecture": "LlamaForCausalLM",
511
- "maxPositionEmbeddings": 8192,
512
- "recommendedQuantizations": [
513
- "awq",
514
- "gptq"
515
- ],
516
- "modelType": "transformer",
517
- "tasks": [
518
- "text-generation"
519
- ]
520
- },
521
- "meta-llama/Meta-Llama-3-70B*": {
522
- "parameterCount": 70553706496,
523
- "defaultDtype": "bfloat16",
524
- "architecture": "LlamaForCausalLM",
525
- "maxPositionEmbeddings": 8192,
526
- "recommendedQuantizations": [
527
- "awq",
528
- "gptq"
529
- ],
530
- "modelType": "transformer",
531
- "tasks": [
532
- "text-generation"
533
- ]
534
- },
535
- "meta-llama/Llama-3.1-8B*": {
536
- "parameterCount": 8030261248,
537
- "defaultDtype": "bfloat16",
538
- "architecture": "LlamaForCausalLM",
539
- "maxPositionEmbeddings": 131072,
540
- "recommendedQuantizations": [
541
- "awq",
542
- "gptq"
543
- ],
544
- "modelType": "transformer",
545
- "tasks": [
546
- "text-generation"
547
- ]
548
- },
549
- "meta-llama/Llama-3.1-70B*": {
550
- "parameterCount": 70553706496,
551
- "defaultDtype": "bfloat16",
552
- "architecture": "LlamaForCausalLM",
553
- "maxPositionEmbeddings": 131072,
554
- "recommendedQuantizations": [
555
- "awq",
556
- "gptq"
557
- ],
558
- "modelType": "transformer",
559
- "tasks": [
560
- "text-generation"
561
- ]
562
- },
563
- "meta-llama/Llama-3.1-405B*": {
564
- "parameterCount": 405000000000,
565
- "defaultDtype": "bfloat16",
566
- "architecture": "LlamaForCausalLM",
567
- "maxPositionEmbeddings": 131072,
568
- "recommendedQuantizations": [
569
- "awq",
570
- "gptq",
571
- "fp8"
572
- ],
573
687
  "modelType": "transformer",
574
688
  "tasks": [
575
689
  "text-generation"
576
690
  ]
577
691
  },
578
- "meta-llama/Llama-3.2-1B*": {
579
- "parameterCount": 1235814400,
580
- "defaultDtype": "bfloat16",
581
- "architecture": "LlamaForCausalLM",
582
- "maxPositionEmbeddings": 131072,
583
- "recommendedQuantizations": [
584
- "awq",
585
- "gptq"
586
- ],
587
- "modelType": "transformer",
588
- "tasks": [
589
- "text-generation"
590
- ]
591
- },
592
- "meta-llama/Llama-3.2-3B*": {
593
- "parameterCount": 3212749824,
594
- "defaultDtype": "bfloat16",
595
- "architecture": "LlamaForCausalLM",
596
- "maxPositionEmbeddings": 131072,
597
- "recommendedQuantizations": [
598
- "awq",
599
- "gptq"
600
- ],
601
- "modelType": "transformer",
602
- "tasks": [
603
- "text-generation"
604
- ]
605
- },
606
- "Qwen/Qwen-7B*": {
607
- "parameterCount": 7721324544,
608
- "defaultDtype": "bfloat16",
609
- "architecture": "QWenLMHeadModel",
610
- "maxPositionEmbeddings": 8192,
611
- "recommendedQuantizations": [
612
- "awq",
613
- "gptq"
614
- ],
615
- "modelType": "transformer",
616
- "tasks": [
617
- "text-generation"
618
- ]
619
- },
620
- "Qwen/Qwen2-7B*": {
621
- "parameterCount": 7721324544,
622
- "defaultDtype": "bfloat16",
623
- "architecture": "Qwen2ForCausalLM",
624
- "maxPositionEmbeddings": 32768,
625
- "recommendedQuantizations": [
626
- "awq",
627
- "gptq"
628
- ],
629
- "modelType": "transformer",
630
- "tasks": [
631
- "text-generation"
632
- ]
633
- },
634
- "Qwen/Qwen-14B*": {
635
- "parameterCount": 14167134208,
636
- "defaultDtype": "bfloat16",
637
- "architecture": "QWenLMHeadModel",
638
- "maxPositionEmbeddings": 8192,
639
- "recommendedQuantizations": [
640
- "awq",
641
- "gptq"
642
- ],
643
- "modelType": "transformer",
644
- "tasks": [
645
- "text-generation"
646
- ]
647
- },
648
- "Qwen/Qwen2-14B*": {
649
- "parameterCount": 14167134208,
650
- "defaultDtype": "bfloat16",
651
- "architecture": "Qwen2ForCausalLM",
652
- "maxPositionEmbeddings": 32768,
653
- "recommendedQuantizations": [
654
- "awq",
655
- "gptq"
692
+ "deepseek-ai/DeepSeek*": {
693
+ "family": "deepseek",
694
+ "gated": false,
695
+ "tags": [
696
+ "text-generation",
697
+ "deepseek",
698
+ "reasoning"
656
699
  ],
700
+ "architecture": null,
701
+ "notes": "Fallback configuration for DeepSeek models not explicitly listed",
702
+ "chatTemplate": "",
703
+ "frameworkCompatibility": {
704
+ "vllm": ">=0.5.0",
705
+ "tensorrt-llm": ">=0.9.0",
706
+ "sglang": ">=0.3.0"
707
+ },
708
+ "validationLevel": "experimental",
657
709
  "modelType": "transformer",
658
710
  "tasks": [
659
711
  "text-generation"
660
712
  ]
661
713
  },
662
- "Qwen/Qwen-72B*": {
663
- "parameterCount": 72710410240,
664
- "defaultDtype": "bfloat16",
665
- "architecture": "QWenLMHeadModel",
666
- "maxPositionEmbeddings": 32768,
667
- "recommendedQuantizations": [
668
- "awq",
669
- "gptq"
714
+ "openai/gpt-oss*": {
715
+ "family": "gpt-oss",
716
+ "gated": false,
717
+ "tags": [
718
+ "text-generation",
719
+ "openai"
670
720
  ],
721
+ "architecture": null,
722
+ "notes": "Fallback configuration for OpenAI GPT-OSS models not explicitly listed",
723
+ "chatTemplate": "",
724
+ "frameworkCompatibility": {
725
+ "vllm": ">=0.5.0",
726
+ "tensorrt-llm": ">=0.9.0",
727
+ "sglang": ">=0.3.0"
728
+ },
729
+ "validationLevel": "experimental",
671
730
  "modelType": "transformer",
672
731
  "tasks": [
673
732
  "text-generation"
674
733
  ]
675
734
  },
676
- "Qwen/Qwen2-72B*": {
677
- "parameterCount": 72710410240,
678
- "defaultDtype": "bfloat16",
679
- "architecture": "Qwen2ForCausalLM",
680
- "maxPositionEmbeddings": 32768,
681
- "recommendedQuantizations": [
682
- "awq",
683
- "gptq"
735
+ "stabilityai/stable-diffusion-*": {
736
+ "family": "stable-diffusion",
737
+ "gated": false,
738
+ "tags": [
739
+ "image-generation",
740
+ "diffusion",
741
+ "stable-diffusion"
684
742
  ],
685
- "modelType": "transformer",
743
+ "architecture": null,
744
+ "notes": "Fallback for Stable Diffusion variants not explicitly listed",
745
+ "chatTemplate": null,
746
+ "frameworkCompatibility": {
747
+ "vllm-omni": ">=0.14.0"
748
+ },
749
+ "validationLevel": "experimental",
750
+ "modelType": "diffusor",
686
751
  "tasks": [
687
- "text-generation"
752
+ "text-to-image"
688
753
  ]
689
754
  },
690
- "EleutherAI/gpt-neox-20b*": {
691
- "parameterCount": 20554568704,
692
- "defaultDtype": "float16",
693
- "architecture": "GPTNeoXForCausalLM",
694
- "maxPositionEmbeddings": 2048,
695
- "recommendedQuantizations": [
696
- "gptq"
755
+ "black-forest-labs/FLUX*": {
756
+ "family": "flux",
757
+ "gated": false,
758
+ "tags": [
759
+ "image-generation",
760
+ "diffusion",
761
+ "flux"
697
762
  ],
698
- "modelType": "transformer",
763
+ "architecture": null,
764
+ "notes": "Fallback for FLUX model variants not explicitly listed",
765
+ "chatTemplate": null,
766
+ "frameworkCompatibility": {
767
+ "vllm-omni": ">=0.14.0"
768
+ },
769
+ "validationLevel": "experimental",
770
+ "modelType": "diffusor",
699
771
  "tasks": [
700
- "text-generation"
772
+ "text-to-image"
701
773
  ]
702
774
  }
703
775
  }