@aws/ml-container-creator 0.2.6 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +38 -2
- package/config/bootstrap-stack.json +94 -1
- package/config/defaults.json +1 -1
- package/infra/ci-harness/package-lock.json +22 -9
- package/package.json +3 -1
- package/servers/instance-sizer/index.js +45 -8
- package/servers/instance-sizer/lib/instance-ranker.js +140 -11
- package/servers/instance-sizer/lib/model-resolver.js +10 -6
- package/servers/instance-sizer/lib/quota-resolver.js +368 -0
- package/servers/instance-sizer/package.json +2 -0
- package/servers/lib/catalogs/instances.json +527 -12
- package/servers/lib/catalogs/model-servers.json +298 -20
- package/servers/lib/catalogs/model-sizes.json +27 -0
- package/servers/lib/catalogs/models.json +101 -0
- package/servers/lib/schemas/image-catalog.schema.json +15 -1
- package/servers/model-picker/index.js +2 -1
- package/src/app.js +96 -2
- package/src/lib/architecture-sync.js +171 -0
- package/src/lib/arn-detection.js +22 -0
- package/src/lib/bootstrap-command-handler.js +178 -3
- package/src/lib/cli-handler.js +2 -2
- package/src/lib/config-manager.js +121 -1
- package/src/lib/cross-cutting-checker.js +119 -0
- package/src/lib/deployment-entry-schema.js +1 -2
- package/src/lib/prompt-runner.js +514 -20
- package/src/lib/prompts.js +67 -5
- package/src/lib/registry-command-handler.js +236 -0
- package/src/lib/schema-sync.js +31 -0
- package/src/lib/secret-classification.js +56 -0
- package/src/lib/secrets-command-handler.js +550 -0
- package/src/lib/template-manager.js +49 -1
- package/src/lib/validate-runner.js +174 -2
- package/src/lib/validation-report.js +8 -1
- package/src/prompt-adapter.js +3 -2
- package/templates/Dockerfile +10 -2
- package/templates/code/cuda_compat.sh +22 -0
- package/templates/code/serve +3 -0
- package/templates/code/start_server.sh +3 -0
- package/templates/diffusors/Dockerfile +2 -1
- package/templates/diffusors/serve +3 -0
- package/templates/do/README.md +33 -0
- package/templates/do/benchmark +646 -0
- package/templates/do/build +22 -0
- package/templates/do/clean +86 -0
- package/templates/do/config +41 -6
- package/templates/do/deploy +66 -6
- package/templates/do/logs +18 -3
- package/templates/do/register +8 -1
- package/templates/do/run +10 -0
- package/templates/triton/Dockerfile +5 -0
|
@@ -64,7 +64,157 @@
|
|
|
64
64
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
65
65
|
}
|
|
66
66
|
},
|
|
67
|
-
"notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+"
|
|
67
|
+
"notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+",
|
|
68
|
+
"supportedModelTypes": [
|
|
69
|
+
"arcee",
|
|
70
|
+
"arctic",
|
|
71
|
+
"aria",
|
|
72
|
+
"aya_vision",
|
|
73
|
+
"baichuan",
|
|
74
|
+
"bailing_moe",
|
|
75
|
+
"bamba",
|
|
76
|
+
"bart",
|
|
77
|
+
"bert",
|
|
78
|
+
"bert_with_rope",
|
|
79
|
+
"blip2",
|
|
80
|
+
"bloom",
|
|
81
|
+
"chameleon",
|
|
82
|
+
"chatglm",
|
|
83
|
+
"cohere2_vision",
|
|
84
|
+
"commandr",
|
|
85
|
+
"dbrx",
|
|
86
|
+
"deepseek",
|
|
87
|
+
"deepseek_mtp",
|
|
88
|
+
"deepseek_v2",
|
|
89
|
+
"deepseek_vl2",
|
|
90
|
+
"dots1",
|
|
91
|
+
"ernie45",
|
|
92
|
+
"ernie45_moe",
|
|
93
|
+
"exaone",
|
|
94
|
+
"exaone4",
|
|
95
|
+
"fairseq2_llama",
|
|
96
|
+
"falcon",
|
|
97
|
+
"falcon_h1",
|
|
98
|
+
"florence2",
|
|
99
|
+
"fuyu",
|
|
100
|
+
"gemma",
|
|
101
|
+
"gemma2",
|
|
102
|
+
"gemma3",
|
|
103
|
+
"gemma3_mm",
|
|
104
|
+
"gemma3n",
|
|
105
|
+
"gemma3n_mm",
|
|
106
|
+
"glm",
|
|
107
|
+
"glm4",
|
|
108
|
+
"glm4_1v",
|
|
109
|
+
"glm4_moe",
|
|
110
|
+
"glm4_moe_mtp",
|
|
111
|
+
"glm4v",
|
|
112
|
+
"gpt2",
|
|
113
|
+
"gpt_bigcode",
|
|
114
|
+
"gpt_j",
|
|
115
|
+
"gpt_neox",
|
|
116
|
+
"gpt_oss",
|
|
117
|
+
"granite",
|
|
118
|
+
"granite_speech",
|
|
119
|
+
"granitemoe",
|
|
120
|
+
"granitemoehybrid",
|
|
121
|
+
"granitemoeshared",
|
|
122
|
+
"gritlm",
|
|
123
|
+
"grok1",
|
|
124
|
+
"h2ovl",
|
|
125
|
+
"hunyuan_v1",
|
|
126
|
+
"hyperclovax_vision",
|
|
127
|
+
"idefics3",
|
|
128
|
+
"internlm2",
|
|
129
|
+
"internlm2_ve",
|
|
130
|
+
"interns1",
|
|
131
|
+
"internvl",
|
|
132
|
+
"jais",
|
|
133
|
+
"jamba",
|
|
134
|
+
"jina_vl",
|
|
135
|
+
"keye",
|
|
136
|
+
"kimi_vl",
|
|
137
|
+
"llama",
|
|
138
|
+
"llama4",
|
|
139
|
+
"llama4_eagle",
|
|
140
|
+
"llama_eagle",
|
|
141
|
+
"llama_eagle3",
|
|
142
|
+
"llava",
|
|
143
|
+
"llava_next",
|
|
144
|
+
"llava_next_video",
|
|
145
|
+
"llava_onevision",
|
|
146
|
+
"mamba",
|
|
147
|
+
"mamba2",
|
|
148
|
+
"medusa",
|
|
149
|
+
"mimo",
|
|
150
|
+
"mimo_mtp",
|
|
151
|
+
"minicpm",
|
|
152
|
+
"minicpm3",
|
|
153
|
+
"minicpm_eagle",
|
|
154
|
+
"minicpmo",
|
|
155
|
+
"minicpmv",
|
|
156
|
+
"minimax_text_01",
|
|
157
|
+
"minimax_vl_01",
|
|
158
|
+
"mistral3",
|
|
159
|
+
"mixtral",
|
|
160
|
+
"mixtral_quant",
|
|
161
|
+
"mllama",
|
|
162
|
+
"mllama4",
|
|
163
|
+
"mlp_speculator",
|
|
164
|
+
"modernbert",
|
|
165
|
+
"molmo",
|
|
166
|
+
"mpt",
|
|
167
|
+
"nemotron",
|
|
168
|
+
"nemotron_h",
|
|
169
|
+
"nemotron_nas",
|
|
170
|
+
"nemotron_vl",
|
|
171
|
+
"nvlm_d",
|
|
172
|
+
"olmo",
|
|
173
|
+
"olmo2",
|
|
174
|
+
"olmoe",
|
|
175
|
+
"opt",
|
|
176
|
+
"orion",
|
|
177
|
+
"ovis",
|
|
178
|
+
"paligemma",
|
|
179
|
+
"persimmon",
|
|
180
|
+
"phi",
|
|
181
|
+
"phi3",
|
|
182
|
+
"phi3v",
|
|
183
|
+
"phi4_multimodal",
|
|
184
|
+
"phi4flash",
|
|
185
|
+
"phi4mm",
|
|
186
|
+
"phimoe",
|
|
187
|
+
"pixtral",
|
|
188
|
+
"plamo2",
|
|
189
|
+
"prithvi_geospatial_mae",
|
|
190
|
+
"qwen",
|
|
191
|
+
"qwen2",
|
|
192
|
+
"qwen2_5_omni_thinker",
|
|
193
|
+
"qwen2_5_vl",
|
|
194
|
+
"qwen2_audio",
|
|
195
|
+
"qwen2_moe",
|
|
196
|
+
"qwen2_rm",
|
|
197
|
+
"qwen2_vl",
|
|
198
|
+
"qwen3",
|
|
199
|
+
"qwen3_moe",
|
|
200
|
+
"qwen_vl",
|
|
201
|
+
"roberta",
|
|
202
|
+
"skyworkr1v",
|
|
203
|
+
"smolvlm",
|
|
204
|
+
"solar",
|
|
205
|
+
"stablelm",
|
|
206
|
+
"starcoder2",
|
|
207
|
+
"step3_text",
|
|
208
|
+
"step3_vl",
|
|
209
|
+
"tarsier",
|
|
210
|
+
"telechat2",
|
|
211
|
+
"teleflm",
|
|
212
|
+
"transformers",
|
|
213
|
+
"ultravox",
|
|
214
|
+
"voxtral",
|
|
215
|
+
"whisper",
|
|
216
|
+
"zamba2"
|
|
217
|
+
]
|
|
68
218
|
},
|
|
69
219
|
{
|
|
70
220
|
"image": "vllm/vllm-openai:v0.9.1",
|
|
@@ -130,7 +280,133 @@
|
|
|
130
280
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
131
281
|
}
|
|
132
282
|
},
|
|
133
|
-
"notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+"
|
|
283
|
+
"notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+",
|
|
284
|
+
"supportedModelTypes": [
|
|
285
|
+
"arctic",
|
|
286
|
+
"aria",
|
|
287
|
+
"aya_vision",
|
|
288
|
+
"baichuan",
|
|
289
|
+
"bamba",
|
|
290
|
+
"bart",
|
|
291
|
+
"bert",
|
|
292
|
+
"bert_with_rope",
|
|
293
|
+
"blip2",
|
|
294
|
+
"bloom",
|
|
295
|
+
"chameleon",
|
|
296
|
+
"chatglm",
|
|
297
|
+
"commandr",
|
|
298
|
+
"dbrx",
|
|
299
|
+
"deepseek",
|
|
300
|
+
"deepseek_mtp",
|
|
301
|
+
"deepseek_v2",
|
|
302
|
+
"deepseek_vl2",
|
|
303
|
+
"eagle",
|
|
304
|
+
"exaone",
|
|
305
|
+
"fairseq2_llama",
|
|
306
|
+
"falcon",
|
|
307
|
+
"falcon_h1",
|
|
308
|
+
"florence2",
|
|
309
|
+
"fuyu",
|
|
310
|
+
"gemma",
|
|
311
|
+
"gemma2",
|
|
312
|
+
"gemma3",
|
|
313
|
+
"gemma3_mm",
|
|
314
|
+
"glm",
|
|
315
|
+
"glm4",
|
|
316
|
+
"glm4v",
|
|
317
|
+
"gpt2",
|
|
318
|
+
"gpt_bigcode",
|
|
319
|
+
"gpt_j",
|
|
320
|
+
"gpt_neox",
|
|
321
|
+
"granite",
|
|
322
|
+
"granite_speech",
|
|
323
|
+
"granitemoe",
|
|
324
|
+
"granitemoehybrid",
|
|
325
|
+
"granitemoeshared",
|
|
326
|
+
"gritlm",
|
|
327
|
+
"grok1",
|
|
328
|
+
"h2ovl",
|
|
329
|
+
"idefics3",
|
|
330
|
+
"internlm2",
|
|
331
|
+
"internlm2_ve",
|
|
332
|
+
"internvl",
|
|
333
|
+
"jais",
|
|
334
|
+
"jamba",
|
|
335
|
+
"kimi_vl",
|
|
336
|
+
"llama",
|
|
337
|
+
"llama_eagle",
|
|
338
|
+
"llama_eagle3",
|
|
339
|
+
"llava",
|
|
340
|
+
"llava_next",
|
|
341
|
+
"llava_next_video",
|
|
342
|
+
"llava_onevision",
|
|
343
|
+
"mamba",
|
|
344
|
+
"mamba2",
|
|
345
|
+
"medusa",
|
|
346
|
+
"mimo",
|
|
347
|
+
"mimo_mtp",
|
|
348
|
+
"minicpm",
|
|
349
|
+
"minicpm3",
|
|
350
|
+
"minicpm_eagle",
|
|
351
|
+
"minicpmo",
|
|
352
|
+
"minicpmv",
|
|
353
|
+
"minimax_text_01",
|
|
354
|
+
"minimax_vl_01",
|
|
355
|
+
"mistral3",
|
|
356
|
+
"mixtral",
|
|
357
|
+
"mixtral_quant",
|
|
358
|
+
"mllama",
|
|
359
|
+
"mllama4",
|
|
360
|
+
"mlp_speculator",
|
|
361
|
+
"modernbert",
|
|
362
|
+
"molmo",
|
|
363
|
+
"mpt",
|
|
364
|
+
"nemotron",
|
|
365
|
+
"nemotron_h",
|
|
366
|
+
"nemotron_nas",
|
|
367
|
+
"nvlm_d",
|
|
368
|
+
"olmo",
|
|
369
|
+
"olmo2",
|
|
370
|
+
"olmoe",
|
|
371
|
+
"opt",
|
|
372
|
+
"orion",
|
|
373
|
+
"ovis",
|
|
374
|
+
"paligemma",
|
|
375
|
+
"persimmon",
|
|
376
|
+
"phi",
|
|
377
|
+
"phi3",
|
|
378
|
+
"phi3_small",
|
|
379
|
+
"phi3v",
|
|
380
|
+
"phi4mm",
|
|
381
|
+
"phimoe",
|
|
382
|
+
"pixtral",
|
|
383
|
+
"plamo2",
|
|
384
|
+
"prithvi_geospatial_mae",
|
|
385
|
+
"qwen",
|
|
386
|
+
"qwen2",
|
|
387
|
+
"qwen2_5_omni_thinker",
|
|
388
|
+
"qwen2_5_vl",
|
|
389
|
+
"qwen2_audio",
|
|
390
|
+
"qwen2_moe",
|
|
391
|
+
"qwen2_rm",
|
|
392
|
+
"qwen2_vl",
|
|
393
|
+
"qwen3",
|
|
394
|
+
"qwen3_moe",
|
|
395
|
+
"qwen_vl",
|
|
396
|
+
"roberta",
|
|
397
|
+
"skyworkr1v",
|
|
398
|
+
"smolvlm",
|
|
399
|
+
"solar",
|
|
400
|
+
"stablelm",
|
|
401
|
+
"starcoder2",
|
|
402
|
+
"tarsier",
|
|
403
|
+
"telechat2",
|
|
404
|
+
"teleflm",
|
|
405
|
+
"transformers",
|
|
406
|
+
"ultravox",
|
|
407
|
+
"whisper",
|
|
408
|
+
"zamba2"
|
|
409
|
+
]
|
|
134
410
|
}
|
|
135
411
|
],
|
|
136
412
|
"sglang": [
|
|
@@ -266,7 +542,7 @@
|
|
|
266
542
|
"TRTLLM_ENABLE_CHUNKED_CONTEXT": "true",
|
|
267
543
|
"UCX_MEMTYPE_CACHE": "n"
|
|
268
544
|
},
|
|
269
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
545
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
270
546
|
},
|
|
271
547
|
"accelerator": {
|
|
272
548
|
"type": "cuda",
|
|
@@ -311,7 +587,8 @@
|
|
|
311
587
|
"notes": "Enables running larger models on smaller instances with acceptable accuracy"
|
|
312
588
|
}
|
|
313
589
|
},
|
|
314
|
-
"notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+"
|
|
590
|
+
"notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+",
|
|
591
|
+
"supportedModelTypes": []
|
|
315
592
|
},
|
|
316
593
|
{
|
|
317
594
|
"image": "nvcr.io/nvidia/tensorrt-llm/release:1.1.0",
|
|
@@ -335,7 +612,7 @@
|
|
|
335
612
|
"TRTLLM_ENABLE_CHUNKED_CONTEXT": "true",
|
|
336
613
|
"UCX_MEMTYPE_CACHE": "n"
|
|
337
614
|
},
|
|
338
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
615
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
339
616
|
},
|
|
340
617
|
"accelerator": {
|
|
341
618
|
"type": "cuda",
|
|
@@ -380,7 +657,8 @@
|
|
|
380
657
|
"notes": "Enables running larger models on smaller instances with acceptable accuracy"
|
|
381
658
|
}
|
|
382
659
|
},
|
|
383
|
-
"notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+"
|
|
660
|
+
"notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+",
|
|
661
|
+
"supportedModelTypes": []
|
|
384
662
|
}
|
|
385
663
|
],
|
|
386
664
|
"lmi": [
|
|
@@ -403,7 +681,7 @@
|
|
|
403
681
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "32",
|
|
404
682
|
"OPTION_DTYPE": "fp16"
|
|
405
683
|
},
|
|
406
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
684
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
407
685
|
},
|
|
408
686
|
"accelerator": {
|
|
409
687
|
"type": "cuda",
|
|
@@ -476,7 +754,7 @@
|
|
|
476
754
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "32",
|
|
477
755
|
"OPTION_DTYPE": "fp16"
|
|
478
756
|
},
|
|
479
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
757
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
480
758
|
},
|
|
481
759
|
"accelerator": {
|
|
482
760
|
"type": "cuda",
|
|
@@ -550,7 +828,7 @@
|
|
|
550
828
|
"OPTION_TENSOR_PARALLEL_DEGREE": "1",
|
|
551
829
|
"OPTION_DEVICE_MAP": "auto"
|
|
552
830
|
},
|
|
553
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
831
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
554
832
|
},
|
|
555
833
|
"accelerator": {
|
|
556
834
|
"type": "cuda",
|
|
@@ -603,7 +881,7 @@
|
|
|
603
881
|
"OPTION_TENSOR_PARALLEL_DEGREE": "1",
|
|
604
882
|
"OPTION_DEVICE_MAP": "auto"
|
|
605
883
|
},
|
|
606
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
884
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
607
885
|
},
|
|
608
886
|
"accelerator": {
|
|
609
887
|
"type": "cuda",
|
|
@@ -657,7 +935,7 @@
|
|
|
657
935
|
"HF_TOKEN": "${hfToken}",
|
|
658
936
|
"VLLM_WORKER_MULTIPROC_METHOD": "spawn"
|
|
659
937
|
},
|
|
660
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
938
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
661
939
|
},
|
|
662
940
|
"accelerator": {
|
|
663
941
|
"type": "cuda",
|
|
@@ -707,7 +985,7 @@
|
|
|
707
985
|
"HF_TOKEN": "${hfToken}",
|
|
708
986
|
"VLLM_WORKER_MULTIPROC_METHOD": "spawn"
|
|
709
987
|
},
|
|
710
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
988
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
711
989
|
},
|
|
712
990
|
"accelerator": {
|
|
713
991
|
"type": "cuda",
|
|
@@ -758,7 +1036,7 @@
|
|
|
758
1036
|
"envVars": {
|
|
759
1037
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
760
1038
|
},
|
|
761
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
1039
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
762
1040
|
},
|
|
763
1041
|
"accelerator": {
|
|
764
1042
|
"type": "cuda",
|
|
@@ -789,7 +1067,7 @@
|
|
|
789
1067
|
"envVars": {
|
|
790
1068
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
791
1069
|
},
|
|
792
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
1070
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
793
1071
|
},
|
|
794
1072
|
"accelerator": {
|
|
795
1073
|
"type": "cuda",
|
|
@@ -820,7 +1098,7 @@
|
|
|
820
1098
|
"envVars": {
|
|
821
1099
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
822
1100
|
},
|
|
823
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
1101
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
824
1102
|
},
|
|
825
1103
|
"accelerator": {
|
|
826
1104
|
"type": "cuda",
|
|
@@ -851,7 +1129,7 @@
|
|
|
851
1129
|
"envVars": {
|
|
852
1130
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
853
1131
|
},
|
|
854
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
1132
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
855
1133
|
},
|
|
856
1134
|
"accelerator": {
|
|
857
1135
|
"type": "cuda",
|
|
@@ -882,7 +1160,7 @@
|
|
|
882
1160
|
"envVars": {
|
|
883
1161
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
884
1162
|
},
|
|
885
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
1163
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
886
1164
|
},
|
|
887
1165
|
"accelerator": {
|
|
888
1166
|
"type": "cuda",
|
|
@@ -913,7 +1191,7 @@
|
|
|
913
1191
|
"envVars": {
|
|
914
1192
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
915
1193
|
},
|
|
916
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
1194
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
917
1195
|
},
|
|
918
1196
|
"accelerator": {
|
|
919
1197
|
"type": "cuda",
|
|
@@ -944,7 +1222,7 @@
|
|
|
944
1222
|
"envVars": {
|
|
945
1223
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
946
1224
|
},
|
|
947
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-
|
|
1225
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
948
1226
|
},
|
|
949
1227
|
"accelerator": {
|
|
950
1228
|
"type": "cuda",
|
|
@@ -958,4 +1236,4 @@
|
|
|
958
1236
|
"notes": "Triton Python backend for custom model serving with TritonPythonModel interface. GPU optional"
|
|
959
1237
|
}
|
|
960
1238
|
]
|
|
961
|
-
}
|
|
1239
|
+
}
|
|
@@ -46,6 +46,33 @@
|
|
|
46
46
|
"minVramGb": 184,
|
|
47
47
|
"recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
|
|
48
48
|
},
|
|
49
|
+
"meta-llama/Llama-3.1-8B*": {
|
|
50
|
+
"parameterCount": 8030261248,
|
|
51
|
+
"defaultDtype": "bfloat16",
|
|
52
|
+
"architecture": "LlamaForCausalLM",
|
|
53
|
+
"maxPositionEmbeddings": 131072,
|
|
54
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
55
|
+
"minVramGb": 20,
|
|
56
|
+
"recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
|
|
57
|
+
},
|
|
58
|
+
"meta-llama/Llama-3.2-1B*": {
|
|
59
|
+
"parameterCount": 1235814400,
|
|
60
|
+
"defaultDtype": "bfloat16",
|
|
61
|
+
"architecture": "LlamaForCausalLM",
|
|
62
|
+
"maxPositionEmbeddings": 131072,
|
|
63
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
64
|
+
"minVramGb": 5,
|
|
65
|
+
"recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
|
|
66
|
+
},
|
|
67
|
+
"meta-llama/Llama-3.2-3B*": {
|
|
68
|
+
"parameterCount": 3212749824,
|
|
69
|
+
"defaultDtype": "bfloat16",
|
|
70
|
+
"architecture": "LlamaForCausalLM",
|
|
71
|
+
"maxPositionEmbeddings": 131072,
|
|
72
|
+
"recommendedQuantizations": ["awq", "gptq"],
|
|
73
|
+
"minVramGb": 9,
|
|
74
|
+
"recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
|
|
75
|
+
},
|
|
49
76
|
"mistralai/Mistral-7B*": {
|
|
50
77
|
"parameterCount": 7241732096,
|
|
51
78
|
"defaultDtype": "bfloat16",
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"openai/gpt-oss-20b": {
|
|
3
3
|
"family": "gpt-oss",
|
|
4
|
+
"parameterCount": 20000000000,
|
|
5
|
+
"defaultDtype": "bfloat16",
|
|
6
|
+
"maxPositionEmbeddings": 8192,
|
|
4
7
|
"gated": false,
|
|
5
8
|
"tags": [
|
|
6
9
|
"text-generation",
|
|
@@ -99,6 +102,9 @@
|
|
|
99
102
|
},
|
|
100
103
|
"meta-llama/Llama-2-70b-chat-hf": {
|
|
101
104
|
"family": "llama-2",
|
|
105
|
+
"parameterCount": 70000000000,
|
|
106
|
+
"defaultDtype": "float16",
|
|
107
|
+
"maxPositionEmbeddings": 4096,
|
|
102
108
|
"gated": true,
|
|
103
109
|
"tags": [
|
|
104
110
|
"text-generation",
|
|
@@ -259,6 +265,30 @@
|
|
|
259
265
|
"text-generation"
|
|
260
266
|
]
|
|
261
267
|
},
|
|
268
|
+
"meta-llama/Llama-2-70b-hf": {
|
|
269
|
+
"family": "llama-2",
|
|
270
|
+
"parameterCount": 70000000000,
|
|
271
|
+
"defaultDtype": "float16",
|
|
272
|
+
"maxPositionEmbeddings": 4096,
|
|
273
|
+
"gated": true,
|
|
274
|
+
"tags": [
|
|
275
|
+
"text-generation",
|
|
276
|
+
"llama-2"
|
|
277
|
+
],
|
|
278
|
+
"architecture": "LlamaForCausalLM",
|
|
279
|
+
"notes": "Llama-2 70B base model (non-chat). Requires multi-GPU for inference.",
|
|
280
|
+
"chatTemplate": "",
|
|
281
|
+
"frameworkCompatibility": {
|
|
282
|
+
"vllm": ">=0.3.0",
|
|
283
|
+
"tensorrt-llm": ">=0.8.0",
|
|
284
|
+
"sglang": ">=0.2.0"
|
|
285
|
+
},
|
|
286
|
+
"validationLevel": "community-validated",
|
|
287
|
+
"modelType": "transformer",
|
|
288
|
+
"tasks": [
|
|
289
|
+
"text-generation"
|
|
290
|
+
]
|
|
291
|
+
},
|
|
262
292
|
"meta-llama/Llama-2-*": {
|
|
263
293
|
"family": "llama-2",
|
|
264
294
|
"gated": true,
|
|
@@ -502,6 +532,77 @@
|
|
|
502
532
|
"text-generation"
|
|
503
533
|
]
|
|
504
534
|
},
|
|
535
|
+
"meta-llama/Llama-3.1-8B*": {
|
|
536
|
+
"parameterCount": 8030261248,
|
|
537
|
+
"defaultDtype": "bfloat16",
|
|
538
|
+
"architecture": "LlamaForCausalLM",
|
|
539
|
+
"maxPositionEmbeddings": 131072,
|
|
540
|
+
"recommendedQuantizations": [
|
|
541
|
+
"awq",
|
|
542
|
+
"gptq"
|
|
543
|
+
],
|
|
544
|
+
"modelType": "transformer",
|
|
545
|
+
"tasks": [
|
|
546
|
+
"text-generation"
|
|
547
|
+
]
|
|
548
|
+
},
|
|
549
|
+
"meta-llama/Llama-3.1-70B*": {
|
|
550
|
+
"parameterCount": 70553706496,
|
|
551
|
+
"defaultDtype": "bfloat16",
|
|
552
|
+
"architecture": "LlamaForCausalLM",
|
|
553
|
+
"maxPositionEmbeddings": 131072,
|
|
554
|
+
"recommendedQuantizations": [
|
|
555
|
+
"awq",
|
|
556
|
+
"gptq"
|
|
557
|
+
],
|
|
558
|
+
"modelType": "transformer",
|
|
559
|
+
"tasks": [
|
|
560
|
+
"text-generation"
|
|
561
|
+
]
|
|
562
|
+
},
|
|
563
|
+
"meta-llama/Llama-3.1-405B*": {
|
|
564
|
+
"parameterCount": 405000000000,
|
|
565
|
+
"defaultDtype": "bfloat16",
|
|
566
|
+
"architecture": "LlamaForCausalLM",
|
|
567
|
+
"maxPositionEmbeddings": 131072,
|
|
568
|
+
"recommendedQuantizations": [
|
|
569
|
+
"awq",
|
|
570
|
+
"gptq",
|
|
571
|
+
"fp8"
|
|
572
|
+
],
|
|
573
|
+
"modelType": "transformer",
|
|
574
|
+
"tasks": [
|
|
575
|
+
"text-generation"
|
|
576
|
+
]
|
|
577
|
+
},
|
|
578
|
+
"meta-llama/Llama-3.2-1B*": {
|
|
579
|
+
"parameterCount": 1235814400,
|
|
580
|
+
"defaultDtype": "bfloat16",
|
|
581
|
+
"architecture": "LlamaForCausalLM",
|
|
582
|
+
"maxPositionEmbeddings": 131072,
|
|
583
|
+
"recommendedQuantizations": [
|
|
584
|
+
"awq",
|
|
585
|
+
"gptq"
|
|
586
|
+
],
|
|
587
|
+
"modelType": "transformer",
|
|
588
|
+
"tasks": [
|
|
589
|
+
"text-generation"
|
|
590
|
+
]
|
|
591
|
+
},
|
|
592
|
+
"meta-llama/Llama-3.2-3B*": {
|
|
593
|
+
"parameterCount": 3212749824,
|
|
594
|
+
"defaultDtype": "bfloat16",
|
|
595
|
+
"architecture": "LlamaForCausalLM",
|
|
596
|
+
"maxPositionEmbeddings": 131072,
|
|
597
|
+
"recommendedQuantizations": [
|
|
598
|
+
"awq",
|
|
599
|
+
"gptq"
|
|
600
|
+
],
|
|
601
|
+
"modelType": "transformer",
|
|
602
|
+
"tasks": [
|
|
603
|
+
"text-generation"
|
|
604
|
+
]
|
|
605
|
+
},
|
|
505
606
|
"Qwen/Qwen-7B*": {
|
|
506
607
|
"parameterCount": 7721324544,
|
|
507
608
|
"defaultDtype": "bfloat16",
|
|
@@ -62,7 +62,15 @@
|
|
|
62
62
|
}
|
|
63
63
|
},
|
|
64
64
|
"inferenceAmiVersion": {
|
|
65
|
-
"type": "string"
|
|
65
|
+
"type": "string",
|
|
66
|
+
"enum": [
|
|
67
|
+
"al2023-ami-sagemaker-inference-cpu-0",
|
|
68
|
+
"al2-ami-sagemaker-inference-gpu-2",
|
|
69
|
+
"al2-ami-sagemaker-inference-gpu-2-1",
|
|
70
|
+
"al2-ami-sagemaker-inference-neuron-2",
|
|
71
|
+
"al2-ami-sagemaker-inference-gpu-3-1",
|
|
72
|
+
"al2023-ami-sagemaker-inference-gpu-4-1"
|
|
73
|
+
]
|
|
66
74
|
}
|
|
67
75
|
},
|
|
68
76
|
"additionalProperties": false
|
|
@@ -145,6 +153,12 @@
|
|
|
145
153
|
},
|
|
146
154
|
"notes": {
|
|
147
155
|
"type": "string"
|
|
156
|
+
},
|
|
157
|
+
"supportedModelTypes": {
|
|
158
|
+
"type": "array",
|
|
159
|
+
"items": {
|
|
160
|
+
"type": "string"
|
|
161
|
+
}
|
|
148
162
|
}
|
|
149
163
|
},
|
|
150
164
|
"additionalProperties": false
|
|
@@ -195,11 +195,12 @@ class HuggingFaceResolver extends ModelResolver {
|
|
|
195
195
|
}
|
|
196
196
|
|
|
197
197
|
// Fetch model config (conditional)
|
|
198
|
-
if (!fields || fields.includes('architecture')) {
|
|
198
|
+
if (!fields || fields.includes('architecture') || fields.includes('model_type')) {
|
|
199
199
|
const modelConfig = await this._fetchJson(
|
|
200
200
|
`${this.baseUrl}/${modelId}/resolve/main/config.json`
|
|
201
201
|
)
|
|
202
202
|
metadata.architecture = modelConfig?.architectures?.[0] || null
|
|
203
|
+
metadata.model_type = modelConfig?.model_type || null
|
|
203
204
|
}
|
|
204
205
|
|
|
205
206
|
return Object.keys(metadata).length > 0 ? metadata : null
|