@aws/ml-container-creator 0.2.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/bin/cli.js +45 -4
  2. package/config/bootstrap-stack.json +14 -0
  3. package/infra/ci-harness/package-lock.json +22 -9
  4. package/package.json +7 -8
  5. package/servers/base-image-picker/index.js +3 -3
  6. package/servers/base-image-picker/manifest.json +4 -2
  7. package/servers/instance-sizer/index.js +564 -0
  8. package/servers/instance-sizer/lib/instance-ranker.js +270 -0
  9. package/servers/instance-sizer/lib/model-resolver.js +269 -0
  10. package/servers/instance-sizer/lib/vram-estimator.js +177 -0
  11. package/servers/instance-sizer/manifest.json +17 -0
  12. package/servers/instance-sizer/package.json +15 -0
  13. package/servers/{instance-recommender → lib}/catalogs/instances.json +136 -34
  14. package/servers/{base-image-picker → lib}/catalogs/model-servers.json +302 -254
  15. package/servers/lib/catalogs/model-sizes.json +131 -0
  16. package/servers/lib/catalogs/models.json +632 -0
  17. package/servers/{model-picker → lib}/catalogs/popular-diffusors.json +32 -10
  18. package/servers/{model-picker → lib}/catalogs/popular-transformers.json +59 -26
  19. package/servers/{base-image-picker → lib}/catalogs/python-slim.json +12 -12
  20. package/servers/lib/schemas/image-catalog.schema.json +6 -12
  21. package/servers/lib/schemas/instances.schema.json +29 -0
  22. package/servers/lib/schemas/model-catalog.schema.json +12 -10
  23. package/servers/lib/schemas/unified-model-catalog.schema.json +129 -0
  24. package/servers/model-picker/index.js +4 -4
  25. package/servers/model-picker/manifest.json +2 -3
  26. package/servers/region-picker/index.js +1 -1
  27. package/servers/region-picker/manifest.json +1 -1
  28. package/src/app.js +36 -0
  29. package/src/lib/architecture-sync.js +171 -0
  30. package/src/lib/arn-detection.js +22 -0
  31. package/src/lib/bootstrap-command-handler.js +120 -0
  32. package/src/lib/cli-handler.js +3 -3
  33. package/src/lib/config-manager.js +47 -1
  34. package/src/lib/configuration-manager.js +2 -2
  35. package/src/lib/cross-cutting-checker.js +460 -0
  36. package/src/lib/deployment-entry-schema.js +1 -2
  37. package/src/lib/dry-run-validator.js +78 -0
  38. package/src/lib/generation-validator.js +102 -0
  39. package/src/lib/mcp-validator-config.js +89 -0
  40. package/src/lib/payload-builder.js +153 -0
  41. package/src/lib/prompt-runner.js +866 -149
  42. package/src/lib/prompts.js +2 -2
  43. package/src/lib/registry-command-handler.js +236 -0
  44. package/src/lib/registry-loader.js +5 -5
  45. package/src/lib/schema-sync.js +203 -0
  46. package/src/lib/schema-validation-engine.js +195 -0
  47. package/src/lib/secret-classification.js +56 -0
  48. package/src/lib/secrets-command-handler.js +550 -0
  49. package/src/lib/service-model-parser.js +102 -0
  50. package/src/lib/validate-runner.js +216 -0
  51. package/src/lib/validation-report.js +140 -0
  52. package/src/lib/validators/base-validator.js +36 -0
  53. package/src/lib/validators/catalog-validator.js +177 -0
  54. package/src/lib/validators/enum-validator.js +120 -0
  55. package/src/lib/validators/required-field-validator.js +150 -0
  56. package/src/lib/validators/type-validator.js +313 -0
  57. package/src/prompt-adapter.js +3 -2
  58. package/templates/Dockerfile +1 -1
  59. package/templates/do/build +37 -5
  60. package/templates/do/config +15 -3
  61. package/templates/do/deploy +60 -5
  62. package/templates/do/logs +18 -3
  63. package/templates/do/run +15 -1
  64. package/templates/do/validate +61 -0
  65. package/servers/instance-recommender/LICENSE +0 -202
  66. package/servers/instance-recommender/index.js +0 -284
  67. package/servers/instance-recommender/manifest.json +0 -16
  68. package/servers/instance-recommender/package.json +0 -15
  69. /package/servers/{model-picker → lib}/catalogs/jumpstart-public.json +0 -0
  70. /package/servers/{region-picker → lib}/catalogs/regions.json +0 -0
  71. /package/servers/{base-image-picker → lib}/catalogs/triton-backends.json +0 -0
  72. /package/servers/{base-image-picker → lib}/catalogs/triton.json +0 -0
@@ -0,0 +1,632 @@
1
+ {
2
+ "openai/gpt-oss-20b": {
3
+ "family": "gpt-oss",
4
+ "parameterCount": 20000000000,
5
+ "defaultDtype": "bfloat16",
6
+ "maxPositionEmbeddings": 8192,
7
+ "gated": false,
8
+ "tags": [
9
+ "text-generation",
10
+ "openai",
11
+ "conversational"
12
+ ],
13
+ "architecture": "GptOssForCausalLM",
14
+ "notes": "Open-source 20B parameter model. Requires significant GPU memory for inference",
15
+ "chatTemplate": "",
16
+ "frameworkCompatibility": {
17
+ "vllm": ">=0.3.0",
18
+ "tensorrt-llm": ">=0.8.0",
19
+ "sglang": ">=0.2.0"
20
+ },
21
+ "validationLevel": "community-validated",
22
+ "modelType": "transformer",
23
+ "tasks": [
24
+ "text-generation"
25
+ ]
26
+ },
27
+ "meta-llama/Llama-2-7b-chat-hf": {
28
+ "family": "llama-2",
29
+ "gated": true,
30
+ "tags": [
31
+ "text-generation",
32
+ "llama-2",
33
+ "conversational"
34
+ ],
35
+ "architecture": "LlamaForCausalLM",
36
+ "profiles": {
37
+ "7b": {
38
+ "displayName": "Llama-2 7B",
39
+ "envVars": {
40
+ "MAX_MODEL_LEN": "4096",
41
+ "GPU_MEMORY_UTILIZATION": "0.9"
42
+ }
43
+ }
44
+ },
45
+ "notes": "Llama-2 7B chat model with official chat template. Requires HuggingFace authentication for download",
46
+ "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
47
+ "frameworkCompatibility": {
48
+ "vllm": ">=0.3.0",
49
+ "tensorrt-llm": ">=0.8.0",
50
+ "sglang": ">=0.2.0"
51
+ },
52
+ "validationLevel": "tested",
53
+ "modelType": "transformer",
54
+ "parameterCount": 6738415616,
55
+ "defaultDtype": "float16",
56
+ "maxPositionEmbeddings": 4096,
57
+ "recommendedQuantizations": [
58
+ "awq",
59
+ "gptq"
60
+ ],
61
+ "tasks": [
62
+ "text-generation"
63
+ ]
64
+ },
65
+ "meta-llama/Llama-2-13b-chat-hf": {
66
+ "family": "llama-2",
67
+ "gated": true,
68
+ "tags": [
69
+ "text-generation",
70
+ "llama-2",
71
+ "conversational"
72
+ ],
73
+ "architecture": "LlamaForCausalLM",
74
+ "profiles": {
75
+ "13b": {
76
+ "displayName": "Llama-2 13B",
77
+ "envVars": {
78
+ "MAX_MODEL_LEN": "4096",
79
+ "GPU_MEMORY_UTILIZATION": "0.9"
80
+ }
81
+ }
82
+ },
83
+ "notes": "Llama-2 13B chat model. Requires more GPU memory than 7B variant",
84
+ "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
85
+ "frameworkCompatibility": {
86
+ "vllm": ">=0.3.0",
87
+ "tensorrt-llm": ">=0.8.0",
88
+ "sglang": ">=0.2.0"
89
+ },
90
+ "validationLevel": "tested",
91
+ "modelType": "transformer",
92
+ "parameterCount": 13015864320,
93
+ "defaultDtype": "float16",
94
+ "maxPositionEmbeddings": 4096,
95
+ "recommendedQuantizations": [
96
+ "awq",
97
+ "gptq"
98
+ ],
99
+ "tasks": [
100
+ "text-generation"
101
+ ]
102
+ },
103
+ "meta-llama/Llama-2-70b-chat-hf": {
104
+ "family": "llama-2",
108
+ "gated": true,
109
+ "tags": [
110
+ "text-generation",
111
+ "llama-2",
112
+ "conversational"
113
+ ],
114
+ "architecture": "LlamaForCausalLM",
115
+ "profiles": {
116
+ "70b-tp2": {
117
+ "displayName": "Llama-2 70B (2-GPU)",
118
+ "envVars": {
119
+ "TENSOR_PARALLEL_SIZE": "2",
120
+ "MAX_MODEL_LEN": "4096",
121
+ "GPU_MEMORY_UTILIZATION": "0.95"
122
+ }
123
+ },
124
+ "70b-tp4": {
125
+ "displayName": "Llama-2 70B (4-GPU)",
126
+ "envVars": {
127
+ "TENSOR_PARALLEL_SIZE": "4",
128
+ "MAX_MODEL_LEN": "4096",
129
+ "GPU_MEMORY_UTILIZATION": "0.9"
130
+ }
131
+ }
132
+ },
133
+ "notes": "Llama-2 70B requires tensor parallelism across multiple GPUs",
134
+ "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
135
+ "frameworkCompatibility": {
136
+ "vllm": ">=0.3.0",
137
+ "tensorrt-llm": ">=0.8.0",
138
+ "sglang": ">=0.2.0"
139
+ },
140
+ "validationLevel": "community-validated",
141
+ "modelType": "transformer",
142
+ "parameterCount": 68976648192,
143
+ "defaultDtype": "float16",
144
+ "maxPositionEmbeddings": 4096,
145
+ "recommendedQuantizations": [
146
+ "awq",
147
+ "gptq"
148
+ ],
149
+ "tasks": [
150
+ "text-generation"
151
+ ]
152
+ },
153
+ "mistralai/Mistral-7B-Instruct-v0.1": {
154
+ "family": "mistral",
155
+ "gated": false,
156
+ "tags": [
157
+ "text-generation",
158
+ "mistral",
159
+ "conversational"
160
+ ],
161
+ "architecture": "MistralForCausalLM",
162
+ "profiles": {
163
+ "7b": {
164
+ "displayName": "Mistral 7B Instruct",
165
+ "envVars": {
166
+ "MAX_MODEL_LEN": "8192",
167
+ "GPU_MEMORY_UTILIZATION": "0.9"
168
+ }
169
+ }
170
+ },
171
+ "notes": "Mistral 7B v0.1 with 8K context window",
172
+ "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
173
+ "frameworkCompatibility": {
174
+ "vllm": ">=0.3.0",
175
+ "tensorrt-llm": ">=0.8.0",
176
+ "sglang": ">=0.2.0"
177
+ },
178
+ "validationLevel": "tested",
179
+ "modelType": "transformer",
180
+ "parameterCount": 7241732096,
181
+ "defaultDtype": "bfloat16",
182
+ "maxPositionEmbeddings": 32768,
183
+ "recommendedQuantizations": [
184
+ "awq",
185
+ "gptq"
186
+ ],
187
+ "tasks": [
188
+ "text-generation"
189
+ ]
190
+ },
191
+ "mistralai/Mistral-7B-Instruct-v0.2": {
192
+ "family": "mistral",
193
+ "gated": false,
194
+ "tags": [
195
+ "text-generation",
196
+ "mistral",
197
+ "conversational"
198
+ ],
199
+ "architecture": "MistralForCausalLM",
200
+ "profiles": {
201
+ "7b": {
202
+ "displayName": "Mistral 7B Instruct v0.2",
203
+ "envVars": {
204
+ "MAX_MODEL_LEN": "32768",
205
+ "GPU_MEMORY_UTILIZATION": "0.9"
206
+ }
207
+ }
208
+ },
209
+ "notes": "Mistral 7B v0.2 with extended 32K context window. Requires more memory for long contexts",
210
+ "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
211
+ "frameworkCompatibility": {
212
+ "vllm": ">=0.3.0",
213
+ "tensorrt-llm": ">=0.8.0",
214
+ "sglang": ">=0.2.0"
215
+ },
216
+ "validationLevel": "tested",
217
+ "modelType": "transformer",
218
+ "parameterCount": 7241732096,
219
+ "defaultDtype": "bfloat16",
220
+ "maxPositionEmbeddings": 32768,
221
+ "recommendedQuantizations": [
222
+ "awq",
223
+ "gptq"
224
+ ],
225
+ "tasks": [
226
+ "text-generation"
227
+ ]
228
+ },
229
+ "mistralai/Mixtral-8x7B-Instruct-v0.1": {
230
+ "family": "mistral",
231
+ "gated": false,
232
+ "tags": [
233
+ "text-generation",
234
+ "mistral",
235
+ "mixture-of-experts"
236
+ ],
237
+ "architecture": "MixtralForCausalLM",
238
+ "profiles": {
239
+ "8x7b-tp2": {
240
+ "displayName": "Mixtral 8x7B (2-GPU)",
241
+ "envVars": {
242
+ "TENSOR_PARALLEL_SIZE": "2",
243
+ "MAX_MODEL_LEN": "32768",
244
+ "GPU_MEMORY_UTILIZATION": "0.95"
245
+ }
246
+ }
247
+ },
248
+ "notes": "Mixtral 8x7B MoE model. Requires tensor parallelism for efficient inference",
249
+ "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
250
+ "frameworkCompatibility": {
251
+ "vllm": ">=0.3.0",
252
+ "tensorrt-llm": ">=0.8.0",
253
+ "sglang": ">=0.2.0"
254
+ },
255
+ "validationLevel": "community-validated",
256
+ "modelType": "transformer",
257
+ "parameterCount": 46702792704,
258
+ "defaultDtype": "bfloat16",
259
+ "maxPositionEmbeddings": 32768,
260
+ "recommendedQuantizations": [
261
+ "awq",
262
+ "gptq"
263
+ ],
264
+ "tasks": [
265
+ "text-generation"
266
+ ]
267
+ },
268
+ "meta-llama/Llama-2-70b-hf": {
269
+ "family": "llama-2",
270
+ "parameterCount": 70000000000,
271
+ "defaultDtype": "float16",
272
+ "maxPositionEmbeddings": 4096,
273
+ "gated": true,
274
+ "tags": [
275
+ "text-generation",
276
+ "llama-2"
277
+ ],
278
+ "architecture": "LlamaForCausalLM",
279
+ "notes": "Llama-2 70B base model (non-chat). Requires multi-GPU for inference.",
280
+ "chatTemplate": "",
281
+ "frameworkCompatibility": {
282
+ "vllm": ">=0.3.0",
283
+ "tensorrt-llm": ">=0.8.0",
284
+ "sglang": ">=0.2.0"
285
+ },
286
+ "validationLevel": "community-validated",
287
+ "modelType": "transformer",
288
+ "tasks": [
289
+ "text-generation"
290
+ ]
291
+ },
292
+ "meta-llama/Llama-2-*": {
293
+ "family": "llama-2",
294
+ "gated": true,
295
+ "tags": [
296
+ "text-generation",
297
+ "llama-2"
298
+ ],
299
+ "architecture": null,
300
+ "notes": "Fallback configuration for Llama-2 models not explicitly listed. Uses standard Llama-2 chat template",
301
+ "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
302
+ "frameworkCompatibility": {
303
+ "vllm": ">=0.3.0",
304
+ "tensorrt-llm": ">=0.8.0",
305
+ "sglang": ">=0.2.0"
306
+ },
307
+ "validationLevel": "experimental",
308
+ "modelType": "transformer",
309
+ "tasks": [
310
+ "text-generation"
311
+ ]
312
+ },
313
+ "mistralai/Mistral-*": {
314
+ "family": "mistral",
315
+ "gated": false,
316
+ "tags": [
317
+ "text-generation",
318
+ "mistral"
319
+ ],
320
+ "architecture": null,
321
+ "notes": "Fallback configuration for Mistral models not explicitly listed. Uses standard Mistral chat template",
322
+ "chatTemplate": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
323
+ "frameworkCompatibility": {
324
+ "vllm": ">=0.3.0",
325
+ "tensorrt-llm": ">=0.8.0",
326
+ "sglang": ">=0.2.0"
327
+ },
328
+ "validationLevel": "experimental",
329
+ "modelType": "transformer",
330
+ "tasks": [
331
+ "text-generation"
332
+ ]
333
+ },
334
+ "codellama/*": {
335
+ "family": "codellama",
336
+ "gated": false,
337
+ "tags": [
338
+ "text-generation",
339
+ "code",
340
+ "codellama"
341
+ ],
342
+ "architecture": null,
343
+ "notes": "CodeLlama models use Llama-2 chat template. Optimized for code generation",
344
+ "chatTemplate": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '[INST] <<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' }}{% endif %}{% endfor %}",
345
+ "frameworkCompatibility": {
346
+ "vllm": ">=0.3.0",
347
+ "tensorrt-llm": ">=0.8.0"
348
+ },
349
+ "validationLevel": "experimental",
350
+ "modelType": "transformer",
351
+ "tasks": [
352
+ "text-generation"
353
+ ]
354
+ },
355
+ "tiiuae/falcon-*": {
356
+ "family": "falcon",
357
+ "gated": false,
358
+ "tags": [
359
+ "text-generation",
360
+ "falcon"
361
+ ],
362
+ "architecture": null,
363
+ "notes": "Falcon models typically don't require chat templates for instruction following",
364
+ "chatTemplate": null,
365
+ "frameworkCompatibility": {
366
+ "vllm": ">=0.3.0",
367
+ "tensorrt-llm": ">=0.8.0"
368
+ },
369
+ "validationLevel": "experimental",
370
+ "modelType": "transformer",
371
+ "tasks": [
372
+ "text-generation"
373
+ ]
374
+ },
375
+ "stabilityai/stable-diffusion-3.5-medium": {
376
+ "family": "stable-diffusion-3",
377
+ "gated": false,
378
+ "tags": [
379
+ "image-generation",
380
+ "diffusion",
381
+ "stable-diffusion"
382
+ ],
383
+ "architecture": "StableDiffusion3Pipeline",
384
+ "profiles": {
385
+ "default": {
386
+ "displayName": "SD3.5 Medium",
387
+ "envVars": {}
388
+ }
389
+ },
390
+ "notes": "Stable Diffusion 3.5 medium model. Supported natively by vLLM-Omni StableDiffusion3Pipeline.",
391
+ "chatTemplate": null,
392
+ "frameworkCompatibility": {
393
+ "vllm-omni": ">=0.14.0"
394
+ },
395
+ "validationLevel": "experimental",
396
+ "modelType": "diffusor",
397
+ "tasks": [
398
+ "text-to-image"
399
+ ]
400
+ },
401
+ "black-forest-labs/FLUX.1-dev": {
402
+ "family": "flux",
403
+ "gated": true,
404
+ "tags": [
405
+ "image-generation",
406
+ "diffusion",
407
+ "flux"
408
+ ],
409
+ "architecture": "FluxPipeline",
410
+ "profiles": {
411
+ "default": {
412
+ "displayName": "FLUX.1 Dev",
413
+ "envVars": {}
414
+ }
415
+ },
416
+ "notes": "FLUX.1-dev high-quality generation model. Uses dual text encoders (CLIP + T5) and FlowMatchEuler scheduler. Requires significant VRAM.",
417
+ "chatTemplate": null,
418
+ "frameworkCompatibility": {
419
+ "vllm-omni": ">=0.14.0"
420
+ },
421
+ "validationLevel": "experimental",
422
+ "modelType": "diffusor",
423
+ "tasks": [
424
+ "text-to-image"
425
+ ]
426
+ },
427
+ "black-forest-labs/FLUX.1-schnell": {
428
+ "family": "flux",
429
+ "gated": false,
430
+ "tags": [
431
+ "image-generation",
432
+ "diffusion",
433
+ "flux"
434
+ ],
435
+ "architecture": "FluxPipeline",
436
+ "notes": "FLUX.1-schnell fast generation model. Fewer denoising steps for faster inference at slightly lower quality",
437
+ "chatTemplate": null,
438
+ "frameworkCompatibility": {
439
+ "vllm-omni": ">=0.14.0"
440
+ },
441
+ "validationLevel": "experimental",
442
+ "modelType": "diffusor",
443
+ "tasks": [
444
+ "text-to-image"
445
+ ]
446
+ },
447
+ "Wan-AI/Wan2.1-T2V-14B-Diffusers": {
448
+ "family": "wan",
449
+ "gated": false,
450
+ "tags": [
451
+ "video-generation",
452
+ "diffusion",
453
+ "wan"
454
+ ],
455
+ "architecture": "WanPipeline",
456
+ "notes": "Wan2.1 text-to-video 14B model (diffusers format). Requires multi-GPU instance (ml.g5.12xlarge or larger). Must use the -Diffusers variant — the base Wan2.1-T2V-14B repo lacks model_index.json required by vLLM-Omni",
457
+ "chatTemplate": null,
458
+ "frameworkCompatibility": {
459
+ "vllm-omni": ">=0.16.0"
460
+ },
461
+ "validationLevel": "experimental",
462
+ "modelType": "diffusor",
463
+ "tasks": [
464
+ "text-to-video"
465
+ ]
466
+ },
467
+ "stabilityai/stable-diffusion-*": {
468
+ "family": "stable-diffusion",
469
+ "gated": false,
470
+ "tags": [
471
+ "image-generation",
472
+ "diffusion",
473
+ "stable-diffusion"
474
+ ],
475
+ "architecture": null,
476
+ "notes": "Fallback for Stable Diffusion variants not explicitly listed",
477
+ "chatTemplate": null,
478
+ "frameworkCompatibility": {
479
+ "vllm-omni": ">=0.14.0"
480
+ },
481
+ "validationLevel": "experimental",
482
+ "modelType": "diffusor",
483
+ "tasks": [
484
+ "text-to-image"
485
+ ]
486
+ },
487
+ "black-forest-labs/FLUX*": {
488
+ "family": "flux",
489
+ "gated": false,
490
+ "tags": [
491
+ "image-generation",
492
+ "diffusion",
493
+ "flux"
494
+ ],
495
+ "architecture": null,
496
+ "notes": "Fallback for FLUX model variants not explicitly listed",
497
+ "chatTemplate": null,
498
+ "frameworkCompatibility": {
499
+ "vllm-omni": ">=0.14.0"
500
+ },
501
+ "validationLevel": "experimental",
502
+ "modelType": "diffusor",
503
+ "tasks": [
504
+ "text-to-image"
505
+ ]
506
+ },
507
+ "meta-llama/Meta-Llama-3-8B*": {
508
+ "parameterCount": 8030261248,
509
+ "defaultDtype": "bfloat16",
510
+ "architecture": "LlamaForCausalLM",
511
+ "maxPositionEmbeddings": 8192,
512
+ "recommendedQuantizations": [
513
+ "awq",
514
+ "gptq"
515
+ ],
516
+ "modelType": "transformer",
517
+ "tasks": [
518
+ "text-generation"
519
+ ]
520
+ },
521
+ "meta-llama/Meta-Llama-3-70B*": {
522
+ "parameterCount": 70553706496,
523
+ "defaultDtype": "bfloat16",
524
+ "architecture": "LlamaForCausalLM",
525
+ "maxPositionEmbeddings": 8192,
526
+ "recommendedQuantizations": [
527
+ "awq",
528
+ "gptq"
529
+ ],
530
+ "modelType": "transformer",
531
+ "tasks": [
532
+ "text-generation"
533
+ ]
534
+ },
535
+ "Qwen/Qwen-7B*": {
536
+ "parameterCount": 7721324544,
537
+ "defaultDtype": "bfloat16",
538
+ "architecture": "QWenLMHeadModel",
539
+ "maxPositionEmbeddings": 8192,
540
+ "recommendedQuantizations": [
541
+ "awq",
542
+ "gptq"
543
+ ],
544
+ "modelType": "transformer",
545
+ "tasks": [
546
+ "text-generation"
547
+ ]
548
+ },
549
+ "Qwen/Qwen2-7B*": {
550
+ "parameterCount": 7721324544,
551
+ "defaultDtype": "bfloat16",
552
+ "architecture": "Qwen2ForCausalLM",
553
+ "maxPositionEmbeddings": 32768,
554
+ "recommendedQuantizations": [
555
+ "awq",
556
+ "gptq"
557
+ ],
558
+ "modelType": "transformer",
559
+ "tasks": [
560
+ "text-generation"
561
+ ]
562
+ },
563
+ "Qwen/Qwen-14B*": {
564
+ "parameterCount": 14167134208,
565
+ "defaultDtype": "bfloat16",
566
+ "architecture": "QWenLMHeadModel",
567
+ "maxPositionEmbeddings": 8192,
568
+ "recommendedQuantizations": [
569
+ "awq",
570
+ "gptq"
571
+ ],
572
+ "modelType": "transformer",
573
+ "tasks": [
574
+ "text-generation"
575
+ ]
576
+ },
577
+ "Qwen/Qwen2-14B*": {
578
+ "parameterCount": 14167134208,
579
+ "defaultDtype": "bfloat16",
580
+ "architecture": "Qwen2ForCausalLM",
581
+ "maxPositionEmbeddings": 32768,
582
+ "recommendedQuantizations": [
583
+ "awq",
584
+ "gptq"
585
+ ],
586
+ "modelType": "transformer",
587
+ "tasks": [
588
+ "text-generation"
589
+ ]
590
+ },
591
+ "Qwen/Qwen-72B*": {
592
+ "parameterCount": 72710410240,
593
+ "defaultDtype": "bfloat16",
594
+ "architecture": "QWenLMHeadModel",
595
+ "maxPositionEmbeddings": 32768,
596
+ "recommendedQuantizations": [
597
+ "awq",
598
+ "gptq"
599
+ ],
600
+ "modelType": "transformer",
601
+ "tasks": [
602
+ "text-generation"
603
+ ]
604
+ },
605
+ "Qwen/Qwen2-72B*": {
606
+ "parameterCount": 72710410240,
607
+ "defaultDtype": "bfloat16",
608
+ "architecture": "Qwen2ForCausalLM",
609
+ "maxPositionEmbeddings": 32768,
610
+ "recommendedQuantizations": [
611
+ "awq",
612
+ "gptq"
613
+ ],
614
+ "modelType": "transformer",
615
+ "tasks": [
616
+ "text-generation"
617
+ ]
618
+ },
619
+ "EleutherAI/gpt-neox-20b*": {
620
+ "parameterCount": 20554568704,
621
+ "defaultDtype": "float16",
622
+ "architecture": "GPTNeoXForCausalLM",
623
+ "maxPositionEmbeddings": 2048,
624
+ "recommendedQuantizations": [
625
+ "gptq"
626
+ ],
627
+ "modelType": "transformer",
628
+ "tasks": [
629
+ "text-generation"
630
+ ]
631
+ }
632
+ }