@aws/ml-container-creator 0.2.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/bin/cli.js +45 -4
  2. package/config/bootstrap-stack.json +14 -0
  3. package/infra/ci-harness/package-lock.json +22 -9
  4. package/package.json +7 -8
  5. package/servers/base-image-picker/index.js +3 -3
  6. package/servers/base-image-picker/manifest.json +4 -2
  7. package/servers/instance-sizer/index.js +564 -0
  8. package/servers/instance-sizer/lib/instance-ranker.js +270 -0
  9. package/servers/instance-sizer/lib/model-resolver.js +269 -0
  10. package/servers/instance-sizer/lib/vram-estimator.js +177 -0
  11. package/servers/instance-sizer/manifest.json +17 -0
  12. package/servers/instance-sizer/package.json +15 -0
  13. package/servers/{instance-recommender → lib}/catalogs/instances.json +136 -34
  14. package/servers/{base-image-picker → lib}/catalogs/model-servers.json +302 -254
  15. package/servers/lib/catalogs/model-sizes.json +131 -0
  16. package/servers/lib/catalogs/models.json +632 -0
  17. package/servers/{model-picker → lib}/catalogs/popular-diffusors.json +32 -10
  18. package/servers/{model-picker → lib}/catalogs/popular-transformers.json +59 -26
  19. package/servers/{base-image-picker → lib}/catalogs/python-slim.json +12 -12
  20. package/servers/lib/schemas/image-catalog.schema.json +6 -12
  21. package/servers/lib/schemas/instances.schema.json +29 -0
  22. package/servers/lib/schemas/model-catalog.schema.json +12 -10
  23. package/servers/lib/schemas/unified-model-catalog.schema.json +129 -0
  24. package/servers/model-picker/index.js +4 -4
  25. package/servers/model-picker/manifest.json +2 -3
  26. package/servers/region-picker/index.js +1 -1
  27. package/servers/region-picker/manifest.json +1 -1
  28. package/src/app.js +36 -0
  29. package/src/lib/architecture-sync.js +171 -0
  30. package/src/lib/arn-detection.js +22 -0
  31. package/src/lib/bootstrap-command-handler.js +120 -0
  32. package/src/lib/cli-handler.js +3 -3
  33. package/src/lib/config-manager.js +47 -1
  34. package/src/lib/configuration-manager.js +2 -2
  35. package/src/lib/cross-cutting-checker.js +460 -0
  36. package/src/lib/deployment-entry-schema.js +1 -2
  37. package/src/lib/dry-run-validator.js +78 -0
  38. package/src/lib/generation-validator.js +102 -0
  39. package/src/lib/mcp-validator-config.js +89 -0
  40. package/src/lib/payload-builder.js +153 -0
  41. package/src/lib/prompt-runner.js +866 -149
  42. package/src/lib/prompts.js +2 -2
  43. package/src/lib/registry-command-handler.js +236 -0
  44. package/src/lib/registry-loader.js +5 -5
  45. package/src/lib/schema-sync.js +203 -0
  46. package/src/lib/schema-validation-engine.js +195 -0
  47. package/src/lib/secret-classification.js +56 -0
  48. package/src/lib/secrets-command-handler.js +550 -0
  49. package/src/lib/service-model-parser.js +102 -0
  50. package/src/lib/validate-runner.js +216 -0
  51. package/src/lib/validation-report.js +140 -0
  52. package/src/lib/validators/base-validator.js +36 -0
  53. package/src/lib/validators/catalog-validator.js +177 -0
  54. package/src/lib/validators/enum-validator.js +120 -0
  55. package/src/lib/validators/required-field-validator.js +150 -0
  56. package/src/lib/validators/type-validator.js +313 -0
  57. package/src/prompt-adapter.js +3 -2
  58. package/templates/Dockerfile +1 -1
  59. package/templates/do/build +37 -5
  60. package/templates/do/config +15 -3
  61. package/templates/do/deploy +60 -5
  62. package/templates/do/logs +18 -3
  63. package/templates/do/run +15 -1
  64. package/templates/do/validate +61 -0
  65. package/servers/instance-recommender/LICENSE +0 -202
  66. package/servers/instance-recommender/index.js +0 -284
  67. package/servers/instance-recommender/manifest.json +0 -16
  68. package/servers/instance-recommender/package.json +0 -15
  69. package/servers/{model-picker → lib}/catalogs/jumpstart-public.json +0 -0
  70. package/servers/{region-picker → lib}/catalogs/regions.json +0 -0
  71. package/servers/{base-image-picker → lib}/catalogs/triton-backends.json +0 -0
  72. package/servers/{base-image-picker → lib}/catalogs/triton.json +0 -0
@@ -20,13 +20,7 @@
  "VLLM_MAX_MODEL_LEN": "4096",
  "VLLM_ENABLE_PREFIX_CACHING": "true"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
  },
  "accelerator": {
  "type": "cuda",
@@ -46,9 +40,6 @@
  "VLLM_GPU_MEMORY_UTILIZATION": "0.85",
  "VLLM_ENABLE_PREFIX_CACHING": "true"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge"
- ],
  "notes": "Prefix caching improves latency for repeated prompts"
  },
  "high-throughput": {
@@ -60,10 +51,6 @@
  "VLLM_MAX_MODEL_LEN": "2048",
  "VLLM_ENABLE_PREFIX_CACHING": "false"
  },
- "recommendedInstanceTypes": [
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ],
  "notes": "Continuous batching maximizes GPU utilization"
  },
  "multi-gpu": {
@@ -74,14 +61,160 @@
  "VLLM_GPU_MEMORY_UTILIZATION": "0.9",
  "VLLM_MAX_NUM_SEQS": "256"
  },
- "recommendedInstanceTypes": [
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ],
  "notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
  }
  },
- "notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+"
+ "notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+",
+ "supportedModelTypes": [
+ "arcee",
+ "arctic",
+ "aria",
+ "aya_vision",
+ "baichuan",
+ "bailing_moe",
+ "bamba",
+ "bart",
+ "bert",
+ "bert_with_rope",
+ "blip2",
+ "bloom",
+ "chameleon",
+ "chatglm",
+ "cohere2_vision",
+ "commandr",
+ "dbrx",
+ "deepseek",
+ "deepseek_mtp",
+ "deepseek_v2",
+ "deepseek_vl2",
+ "dots1",
+ "ernie45",
+ "ernie45_moe",
+ "exaone",
+ "exaone4",
+ "fairseq2_llama",
+ "falcon",
+ "falcon_h1",
+ "florence2",
+ "fuyu",
+ "gemma",
+ "gemma2",
+ "gemma3",
+ "gemma3_mm",
+ "gemma3n",
+ "gemma3n_mm",
+ "glm",
+ "glm4",
+ "glm4_1v",
+ "glm4_moe",
+ "glm4_moe_mtp",
+ "glm4v",
+ "gpt2",
+ "gpt_bigcode",
+ "gpt_j",
+ "gpt_neox",
+ "gpt_oss",
+ "granite",
+ "granite_speech",
+ "granitemoe",
+ "granitemoehybrid",
+ "granitemoeshared",
+ "gritlm",
+ "grok1",
+ "h2ovl",
+ "hunyuan_v1",
+ "hyperclovax_vision",
+ "idefics3",
+ "internlm2",
+ "internlm2_ve",
+ "interns1",
+ "internvl",
+ "jais",
+ "jamba",
+ "jina_vl",
+ "keye",
+ "kimi_vl",
+ "llama",
+ "llama4",
+ "llama4_eagle",
+ "llama_eagle",
+ "llama_eagle3",
+ "llava",
+ "llava_next",
+ "llava_next_video",
+ "llava_onevision",
+ "mamba",
+ "mamba2",
+ "medusa",
+ "mimo",
+ "mimo_mtp",
+ "minicpm",
+ "minicpm3",
+ "minicpm_eagle",
+ "minicpmo",
+ "minicpmv",
+ "minimax_text_01",
+ "minimax_vl_01",
+ "mistral3",
+ "mixtral",
+ "mixtral_quant",
+ "mllama",
+ "mllama4",
+ "mlp_speculator",
+ "modernbert",
+ "molmo",
+ "mpt",
+ "nemotron",
+ "nemotron_h",
+ "nemotron_nas",
+ "nemotron_vl",
+ "nvlm_d",
+ "olmo",
+ "olmo2",
+ "olmoe",
+ "opt",
+ "orion",
+ "ovis",
+ "paligemma",
+ "persimmon",
+ "phi",
+ "phi3",
+ "phi3v",
+ "phi4_multimodal",
+ "phi4flash",
+ "phi4mm",
+ "phimoe",
+ "pixtral",
+ "plamo2",
+ "prithvi_geospatial_mae",
+ "qwen",
+ "qwen2",
+ "qwen2_5_omni_thinker",
+ "qwen2_5_vl",
+ "qwen2_audio",
+ "qwen2_moe",
+ "qwen2_rm",
+ "qwen2_vl",
+ "qwen3",
+ "qwen3_moe",
+ "qwen_vl",
+ "roberta",
+ "skyworkr1v",
+ "smolvlm",
+ "solar",
+ "stablelm",
+ "starcoder2",
+ "step3_text",
+ "step3_vl",
+ "tarsier",
+ "telechat2",
+ "teleflm",
+ "transformers",
+ "ultravox",
+ "voxtral",
+ "whisper",
+ "zamba2"
+ ]
  },
  {
  "image": "vllm/vllm-openai:v0.9.1",
@@ -103,13 +236,7 @@
  "VLLM_MAX_MODEL_LEN": "4096",
  "VLLM_ENABLE_PREFIX_CACHING": "true"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
  },
  "accelerator": {
  "type": "cuda",
@@ -129,9 +256,6 @@
  "VLLM_GPU_MEMORY_UTILIZATION": "0.85",
  "VLLM_ENABLE_PREFIX_CACHING": "true"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge"
- ],
  "notes": "Prefix caching improves latency for repeated prompts"
  },
  "high-throughput": {
@@ -143,10 +267,6 @@
  "VLLM_MAX_MODEL_LEN": "2048",
  "VLLM_ENABLE_PREFIX_CACHING": "false"
  },
- "recommendedInstanceTypes": [
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ],
  "notes": "Continuous batching maximizes GPU utilization"
  },
  "multi-gpu": {
@@ -157,14 +277,136 @@
  "VLLM_GPU_MEMORY_UTILIZATION": "0.9",
  "VLLM_MAX_NUM_SEQS": "256"
  },
- "recommendedInstanceTypes": [
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ],
  "notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
  }
  },
- "notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+"
+ "notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+",
+ "supportedModelTypes": [
+ "arctic",
+ "aria",
+ "aya_vision",
+ "baichuan",
+ "bamba",
+ "bart",
+ "bert",
+ "bert_with_rope",
+ "blip2",
+ "bloom",
+ "chameleon",
+ "chatglm",
+ "commandr",
+ "dbrx",
+ "deepseek",
+ "deepseek_mtp",
+ "deepseek_v2",
+ "deepseek_vl2",
+ "eagle",
+ "exaone",
+ "fairseq2_llama",
+ "falcon",
+ "falcon_h1",
+ "florence2",
+ "fuyu",
+ "gemma",
+ "gemma2",
+ "gemma3",
+ "gemma3_mm",
+ "glm",
+ "glm4",
+ "glm4v",
+ "gpt2",
+ "gpt_bigcode",
+ "gpt_j",
+ "gpt_neox",
+ "granite",
+ "granite_speech",
+ "granitemoe",
+ "granitemoehybrid",
+ "granitemoeshared",
+ "gritlm",
+ "grok1",
+ "h2ovl",
+ "idefics3",
+ "internlm2",
+ "internlm2_ve",
+ "internvl",
+ "jais",
+ "jamba",
+ "kimi_vl",
+ "llama",
+ "llama_eagle",
+ "llama_eagle3",
+ "llava",
+ "llava_next",
+ "llava_next_video",
+ "llava_onevision",
+ "mamba",
+ "mamba2",
+ "medusa",
+ "mimo",
+ "mimo_mtp",
+ "minicpm",
+ "minicpm3",
+ "minicpm_eagle",
+ "minicpmo",
+ "minicpmv",
+ "minimax_text_01",
+ "minimax_vl_01",
+ "mistral3",
+ "mixtral",
+ "mixtral_quant",
+ "mllama",
+ "mllama4",
+ "mlp_speculator",
+ "modernbert",
+ "molmo",
+ "mpt",
+ "nemotron",
+ "nemotron_h",
+ "nemotron_nas",
+ "nvlm_d",
+ "olmo",
+ "olmo2",
+ "olmoe",
+ "opt",
+ "orion",
+ "ovis",
+ "paligemma",
+ "persimmon",
+ "phi",
+ "phi3",
+ "phi3_small",
+ "phi3v",
+ "phi4mm",
+ "phimoe",
+ "pixtral",
+ "plamo2",
+ "prithvi_geospatial_mae",
+ "qwen",
+ "qwen2",
+ "qwen2_5_omni_thinker",
+ "qwen2_5_vl",
+ "qwen2_audio",
+ "qwen2_moe",
+ "qwen2_rm",
+ "qwen2_vl",
+ "qwen3",
+ "qwen3_moe",
+ "qwen_vl",
+ "roberta",
+ "skyworkr1v",
+ "smolvlm",
+ "solar",
+ "stablelm",
+ "starcoder2",
+ "tarsier",
+ "telechat2",
+ "teleflm",
+ "transformers",
+ "ultravox",
+ "whisper",
+ "zamba2"
+ ]
  }
  ],
  "sglang": [
@@ -187,12 +429,7 @@
  "SGLANG_MAX_RUNNING_REQUESTS": "256",
  "SGLANG_CONTEXT_LENGTH": "4096"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
  },
  "accelerator": {
  "type": "cuda",
@@ -211,10 +448,6 @@
  "SGLANG_MAX_RUNNING_REQUESTS": "256",
  "SGLANG_MEM_FRACTION": "0.9"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ],
  "notes": "Good starting point for most workloads"
  },
  "high-throughput": {
@@ -226,10 +459,6 @@
  "SGLANG_CONTEXT_LENGTH": "2048",
  "SGLANG_ENABLE_RADIX_CACHE": "true"
  },
- "recommendedInstanceTypes": [
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ],
  "notes": "RadixAttention provides automatic KV cache reuse for improved throughput"
  }
  },
@@ -254,12 +483,7 @@
  "SGLANG_MAX_RUNNING_REQUESTS": "256",
  "SGLANG_CONTEXT_LENGTH": "4096"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
  },
  "accelerator": {
  "type": "cuda",
@@ -278,10 +502,6 @@
  "SGLANG_MAX_RUNNING_REQUESTS": "256",
  "SGLANG_MEM_FRACTION": "0.9"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ],
  "notes": "Good starting point for most workloads"
  },
  "high-throughput": {
@@ -293,10 +513,6 @@
  "SGLANG_CONTEXT_LENGTH": "2048",
  "SGLANG_ENABLE_RADIX_CACHE": "true"
  },
- "recommendedInstanceTypes": [
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ],
  "notes": "RadixAttention provides automatic KV cache reuse for improved throughput"
  }
  },
@@ -326,13 +542,7 @@
  "TRTLLM_ENABLE_CHUNKED_CONTEXT": "true",
  "UCX_MEMTYPE_CACHE": "n"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -352,10 +562,6 @@
  "TRTLLM_MAX_BATCH_SIZE": "16",
  "TRTLLM_ENABLE_CHUNKED_CONTEXT": "true"
  },
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ],
  "notes": "Chunked context allows processing longer sequences"
  },
  "int8": {
@@ -367,10 +573,6 @@
  "TRTLLM_USE_WEIGHT_ONLY": "true",
  "TRTLLM_WEIGHT_ONLY_PRECISION": "int8"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ],
  "notes": "Weight-only quantization provides best speed/accuracy tradeoff"
  },
  "int4": {
@@ -382,13 +584,11 @@
  "TRTLLM_USE_WEIGHT_ONLY": "true",
  "TRTLLM_WEIGHT_ONLY_PRECISION": "int4"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge"
- ],
  "notes": "Enables running larger models on smaller instances with acceptable accuracy"
  }
  },
- "notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+"
+ "notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+",
+ "supportedModelTypes": []
  },
  {
  "image": "nvcr.io/nvidia/tensorrt-llm/release:1.1.0",
@@ -412,13 +612,7 @@
  "TRTLLM_ENABLE_CHUNKED_CONTEXT": "true",
  "UCX_MEMTYPE_CACHE": "n"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -438,10 +632,6 @@
  "TRTLLM_MAX_BATCH_SIZE": "16",
  "TRTLLM_ENABLE_CHUNKED_CONTEXT": "true"
  },
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ],
  "notes": "Chunked context allows processing longer sequences"
  },
  "int8": {
@@ -453,10 +643,6 @@
  "TRTLLM_USE_WEIGHT_ONLY": "true",
  "TRTLLM_WEIGHT_ONLY_PRECISION": "int8"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ],
  "notes": "Weight-only quantization provides best speed/accuracy tradeoff"
  },
  "int4": {
@@ -468,13 +654,11 @@
  "TRTLLM_USE_WEIGHT_ONLY": "true",
  "TRTLLM_WEIGHT_ONLY_PRECISION": "int4"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge"
- ],
  "notes": "Enables running larger models on smaller instances with acceptable accuracy"
  }
  },
- "notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+"
+ "notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+",
+ "supportedModelTypes": []
  }
  ],
  "lmi": [
@@ -497,13 +681,7 @@
  "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
  "OPTION_DTYPE": "fp16"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -523,10 +701,6 @@
  "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
  "OPTION_DTYPE": "fp16"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ],
  "notes": "vLLM backend provides excellent performance for most models"
  },
  "tensorrt-backend": {
@@ -537,10 +711,6 @@
  "OPTION_MAX_ROLLING_BATCH_SIZE": "16",
  "OPTION_DTYPE": "fp16"
  },
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ],
  "notes": "TensorRT-LLM provides best performance but requires model compilation"
  },
  "lmi-dist": {
@@ -551,10 +721,6 @@
  "OPTION_TENSOR_PARALLEL_DEGREE": "4",
  "OPTION_MAX_ROLLING_BATCH_SIZE": "64"
  },
- "recommendedInstanceTypes": [
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ],
  "notes": "Best for very large models requiring multi-GPU tensor parallelism"
  },
  "auto": {
@@ -564,11 +730,6 @@
  "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
  "OPTION_DTYPE": "fp16"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ],
  "notes": "LMI will analyze your model and select the optimal backend automatically"
  }
  },
@@ -593,13 +754,7 @@
  "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
  "OPTION_DTYPE": "fp16"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -619,10 +774,6 @@
  "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
  "OPTION_DTYPE": "fp16"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ],
  "notes": "vLLM backend provides excellent performance for most models"
  },
  "tensorrt-backend": {
@@ -633,10 +784,6 @@
  "OPTION_MAX_ROLLING_BATCH_SIZE": "16",
  "OPTION_DTYPE": "fp16"
  },
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ],
  "notes": "TensorRT-LLM provides best performance but requires model compilation"
  },
  "lmi-dist": {
@@ -647,10 +794,6 @@
  "OPTION_TENSOR_PARALLEL_DEGREE": "4",
  "OPTION_MAX_ROLLING_BATCH_SIZE": "64"
  },
- "recommendedInstanceTypes": [
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ],
  "notes": "Best for very large models requiring multi-GPU tensor parallelism"
  },
  "auto": {
@@ -660,11 +803,6 @@
  "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
  "OPTION_DTYPE": "fp16"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ],
  "notes": "LMI will analyze your model and select the optimal backend automatically"
  }
  },
@@ -690,12 +828,7 @@
  "OPTION_TENSOR_PARALLEL_DEGREE": "1",
  "OPTION_DEVICE_MAP": "auto"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -715,10 +848,6 @@
  "OPTION_DEVICE_MAP": "auto",
  "BATCH_SIZE": "1"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ],
  "notes": "PyTorch engine provides good compatibility with HuggingFace models"
  },
  "multi-gpu": {
@@ -729,10 +858,6 @@
  "OPTION_TENSOR_PARALLEL_DEGREE": "4",
  "OPTION_DEVICE_MAP": "auto"
  },
- "recommendedInstanceTypes": [
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ],
  "notes": "Distribute model across multiple GPUs for large models"
  }
  },
@@ -756,12 +881,7 @@
  "OPTION_TENSOR_PARALLEL_DEGREE": "1",
  "OPTION_DEVICE_MAP": "auto"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -781,10 +901,6 @@
  "OPTION_DEVICE_MAP": "auto",
  "BATCH_SIZE": "1"
  },
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ],
  "notes": "PyTorch engine provides good compatibility with HuggingFace models"
  },
  "multi-gpu": {
@@ -795,10 +911,6 @@
  "OPTION_TENSOR_PARALLEL_DEGREE": "4",
  "OPTION_DEVICE_MAP": "auto"
  },
- "recommendedInstanceTypes": [
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ],
  "notes": "Distribute model across multiple GPUs for large models"
  }
  },
@@ -823,12 +935,7 @@
  "HF_TOKEN": "${hfToken}",
  "VLLM_WORKER_MULTIPROC_METHOD": "spawn"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -844,30 +951,18 @@
  "displayName": "Quality",
  "description": "Higher step count for better image quality",
  "envVars": {},
- "recommendedInstanceTypes": [
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ],
  "notes": "Best image quality, no cache acceleration, VAE tiling for memory efficiency"
  },
  "speed": {
  "displayName": "Speed",
  "description": "Cache acceleration for faster generation",
  "envVars": {},
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ],
  "notes": "TeaCache acceleration reduces redundant computation between denoising steps"
  },
  "multi-gpu": {
  "displayName": "Multi-GPU",
  "description": "Sequence parallelism for large diffusion models",
  "envVars": {},
- "recommendedInstanceTypes": [
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ],
  "notes": "Ulysses sequence parallelism for large models like FLUX on multi-GPU instances"
  }
  },
@@ -890,12 +985,7 @@
  "HF_TOKEN": "${hfToken}",
  "VLLM_WORKER_MULTIPROC_METHOD": "spawn"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -911,30 +1001,18 @@
  "displayName": "Quality",
  "description": "Higher step count for better image quality",
  "envVars": {},
- "recommendedInstanceTypes": [
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ],
  "notes": "Best image quality, no cache acceleration, VAE tiling for memory efficiency"
  },
  "speed": {
  "displayName": "Speed",
  "description": "Cache acceleration for faster generation",
  "envVars": {},
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ],
  "notes": "TeaCache acceleration reduces redundant computation between denoising steps"
  },
  "multi-gpu": {
  "displayName": "Multi-GPU",
  "description": "Sequence parallelism for large diffusion models",
  "envVars": {},
- "recommendedInstanceTypes": [
- "ml.g5.12xlarge",
- "ml.g5.48xlarge"
- ],
  "notes": "Ulysses sequence parallelism for large models like FLUX on multi-GPU instances"
  }
  },
@@ -958,11 +1036,7 @@
  "envVars": {
  "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -993,11 +1067,7 @@
  "envVars": {
  "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -1028,11 +1098,7 @@
  "envVars": {
  "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -1063,11 +1129,7 @@
  "envVars": {
  "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -1098,12 +1160,7 @@
  "envVars": {
  "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge",
- "ml.g5.4xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -1134,12 +1191,7 @@
  "envVars": {
  "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.2xlarge",
- "ml.g5.4xlarge",
- "ml.g5.12xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -1170,11 +1222,7 @@
  "envVars": {
  "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
  },
- "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2",
- "recommendedInstanceTypes": [
- "ml.g5.xlarge",
- "ml.g5.2xlarge"
- ]
+ "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
  },
  "accelerator": {
  "type": "cuda",
@@ -1188,4 +1236,4 @@
  "notes": "Triton Python backend for custom model serving with TritonPythonModel interface. GPU optional"
  }
  ]
- }
+ }