@aws/ml-container-creator 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -298
- package/bin/cli.js +7 -2
- package/package.json +7 -8
- package/servers/base-image-picker/index.js +3 -3
- package/servers/base-image-picker/manifest.json +4 -2
- package/servers/instance-sizer/index.js +561 -0
- package/servers/instance-sizer/lib/instance-ranker.js +245 -0
- package/servers/instance-sizer/lib/model-resolver.js +265 -0
- package/servers/instance-sizer/lib/vram-estimator.js +177 -0
- package/servers/instance-sizer/manifest.json +17 -0
- package/servers/instance-sizer/package.json +15 -0
- package/servers/{instance-recommender → lib}/catalogs/instances.json +136 -34
- package/servers/{base-image-picker → lib}/catalogs/model-servers.json +19 -249
- package/servers/lib/catalogs/model-sizes.json +131 -0
- package/servers/lib/catalogs/models.json +602 -0
- package/servers/{model-picker → lib}/catalogs/popular-diffusors.json +32 -10
- package/servers/{model-picker → lib}/catalogs/popular-transformers.json +59 -26
- package/servers/{base-image-picker → lib}/catalogs/python-slim.json +12 -12
- package/servers/lib/schemas/image-catalog.schema.json +0 -12
- package/servers/lib/schemas/instances.schema.json +29 -0
- package/servers/lib/schemas/model-catalog.schema.json +12 -10
- package/servers/lib/schemas/unified-model-catalog.schema.json +129 -0
- package/servers/model-picker/index.js +2 -3
- package/servers/model-picker/manifest.json +2 -3
- package/servers/region-picker/index.js +1 -1
- package/servers/region-picker/manifest.json +1 -1
- package/src/app.js +17 -0
- package/src/lib/bootstrap-command-handler.js +38 -0
- package/src/lib/cli-handler.js +3 -3
- package/src/lib/config-manager.js +4 -1
- package/src/lib/configuration-manager.js +2 -2
- package/src/lib/cross-cutting-checker.js +341 -0
- package/src/lib/dry-run-validator.js +78 -0
- package/src/lib/generation-validator.js +102 -0
- package/src/lib/mcp-validator-config.js +89 -0
- package/src/lib/payload-builder.js +153 -0
- package/src/lib/prompt-runner.js +445 -135
- package/src/lib/prompts.js +1 -1
- package/src/lib/registry-loader.js +5 -5
- package/src/lib/schema-sync.js +203 -0
- package/src/lib/schema-validation-engine.js +195 -0
- package/src/lib/service-model-parser.js +102 -0
- package/src/lib/validate-runner.js +167 -0
- package/src/lib/validation-report.js +133 -0
- package/src/lib/validators/base-validator.js +36 -0
- package/src/lib/validators/catalog-validator.js +177 -0
- package/src/lib/validators/enum-validator.js +120 -0
- package/src/lib/validators/required-field-validator.js +150 -0
- package/src/lib/validators/type-validator.js +313 -0
- package/templates/Dockerfile +1 -1
- package/templates/do/build +15 -5
- package/templates/do/run +5 -1
- package/templates/do/validate +61 -0
- package/servers/instance-recommender/LICENSE +0 -202
- package/servers/instance-recommender/index.js +0 -284
- package/servers/instance-recommender/manifest.json +0 -16
- package/servers/instance-recommender/package.json +0 -15
- /package/servers/{model-picker → lib}/catalogs/jumpstart-public.json +0 -0
- /package/servers/{region-picker → lib}/catalogs/regions.json +0 -0
- /package/servers/{base-image-picker → lib}/catalogs/triton-backends.json +0 -0
- /package/servers/{base-image-picker → lib}/catalogs/triton.json +0 -0
|
@@ -20,13 +20,7 @@
|
|
|
20
20
|
"VLLM_MAX_MODEL_LEN": "4096",
|
|
21
21
|
"VLLM_ENABLE_PREFIX_CACHING": "true"
|
|
22
22
|
},
|
|
23
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
24
|
-
"recommendedInstanceTypes": [
|
|
25
|
-
"ml.g5.xlarge",
|
|
26
|
-
"ml.g5.2xlarge",
|
|
27
|
-
"ml.g5.4xlarge",
|
|
28
|
-
"ml.g5.12xlarge"
|
|
29
|
-
]
|
|
23
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
30
24
|
},
|
|
31
25
|
"accelerator": {
|
|
32
26
|
"type": "cuda",
|
|
@@ -46,9 +40,6 @@
|
|
|
46
40
|
"VLLM_GPU_MEMORY_UTILIZATION": "0.85",
|
|
47
41
|
"VLLM_ENABLE_PREFIX_CACHING": "true"
|
|
48
42
|
},
|
|
49
|
-
"recommendedInstanceTypes": [
|
|
50
|
-
"ml.g5.xlarge"
|
|
51
|
-
],
|
|
52
43
|
"notes": "Prefix caching improves latency for repeated prompts"
|
|
53
44
|
},
|
|
54
45
|
"high-throughput": {
|
|
@@ -60,10 +51,6 @@
|
|
|
60
51
|
"VLLM_MAX_MODEL_LEN": "2048",
|
|
61
52
|
"VLLM_ENABLE_PREFIX_CACHING": "false"
|
|
62
53
|
},
|
|
63
|
-
"recommendedInstanceTypes": [
|
|
64
|
-
"ml.g5.4xlarge",
|
|
65
|
-
"ml.g5.12xlarge"
|
|
66
|
-
],
|
|
67
54
|
"notes": "Continuous batching maximizes GPU utilization"
|
|
68
55
|
},
|
|
69
56
|
"multi-gpu": {
|
|
@@ -74,10 +61,6 @@
|
|
|
74
61
|
"VLLM_GPU_MEMORY_UTILIZATION": "0.9",
|
|
75
62
|
"VLLM_MAX_NUM_SEQS": "256"
|
|
76
63
|
},
|
|
77
|
-
"recommendedInstanceTypes": [
|
|
78
|
-
"ml.g5.12xlarge",
|
|
79
|
-
"ml.g5.48xlarge"
|
|
80
|
-
],
|
|
81
64
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
82
65
|
}
|
|
83
66
|
},
|
|
@@ -103,13 +86,7 @@
|
|
|
103
86
|
"VLLM_MAX_MODEL_LEN": "4096",
|
|
104
87
|
"VLLM_ENABLE_PREFIX_CACHING": "true"
|
|
105
88
|
},
|
|
106
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
107
|
-
"recommendedInstanceTypes": [
|
|
108
|
-
"ml.g5.xlarge",
|
|
109
|
-
"ml.g5.2xlarge",
|
|
110
|
-
"ml.g5.4xlarge",
|
|
111
|
-
"ml.g5.12xlarge"
|
|
112
|
-
]
|
|
89
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
113
90
|
},
|
|
114
91
|
"accelerator": {
|
|
115
92
|
"type": "cuda",
|
|
@@ -129,9 +106,6 @@
|
|
|
129
106
|
"VLLM_GPU_MEMORY_UTILIZATION": "0.85",
|
|
130
107
|
"VLLM_ENABLE_PREFIX_CACHING": "true"
|
|
131
108
|
},
|
|
132
|
-
"recommendedInstanceTypes": [
|
|
133
|
-
"ml.g5.xlarge"
|
|
134
|
-
],
|
|
135
109
|
"notes": "Prefix caching improves latency for repeated prompts"
|
|
136
110
|
},
|
|
137
111
|
"high-throughput": {
|
|
@@ -143,10 +117,6 @@
|
|
|
143
117
|
"VLLM_MAX_MODEL_LEN": "2048",
|
|
144
118
|
"VLLM_ENABLE_PREFIX_CACHING": "false"
|
|
145
119
|
},
|
|
146
|
-
"recommendedInstanceTypes": [
|
|
147
|
-
"ml.g5.4xlarge",
|
|
148
|
-
"ml.g5.12xlarge"
|
|
149
|
-
],
|
|
150
120
|
"notes": "Continuous batching maximizes GPU utilization"
|
|
151
121
|
},
|
|
152
122
|
"multi-gpu": {
|
|
@@ -157,10 +127,6 @@
|
|
|
157
127
|
"VLLM_GPU_MEMORY_UTILIZATION": "0.9",
|
|
158
128
|
"VLLM_MAX_NUM_SEQS": "256"
|
|
159
129
|
},
|
|
160
|
-
"recommendedInstanceTypes": [
|
|
161
|
-
"ml.g5.12xlarge",
|
|
162
|
-
"ml.g5.48xlarge"
|
|
163
|
-
],
|
|
164
130
|
"notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
|
|
165
131
|
}
|
|
166
132
|
},
|
|
@@ -187,12 +153,7 @@
|
|
|
187
153
|
"SGLANG_MAX_RUNNING_REQUESTS": "256",
|
|
188
154
|
"SGLANG_CONTEXT_LENGTH": "4096"
|
|
189
155
|
},
|
|
190
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
191
|
-
"recommendedInstanceTypes": [
|
|
192
|
-
"ml.g5.xlarge",
|
|
193
|
-
"ml.g5.2xlarge",
|
|
194
|
-
"ml.g5.4xlarge"
|
|
195
|
-
]
|
|
156
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
196
157
|
},
|
|
197
158
|
"accelerator": {
|
|
198
159
|
"type": "cuda",
|
|
@@ -211,10 +172,6 @@
|
|
|
211
172
|
"SGLANG_MAX_RUNNING_REQUESTS": "256",
|
|
212
173
|
"SGLANG_MEM_FRACTION": "0.9"
|
|
213
174
|
},
|
|
214
|
-
"recommendedInstanceTypes": [
|
|
215
|
-
"ml.g5.xlarge",
|
|
216
|
-
"ml.g5.2xlarge"
|
|
217
|
-
],
|
|
218
175
|
"notes": "Good starting point for most workloads"
|
|
219
176
|
},
|
|
220
177
|
"high-throughput": {
|
|
@@ -226,10 +183,6 @@
|
|
|
226
183
|
"SGLANG_CONTEXT_LENGTH": "2048",
|
|
227
184
|
"SGLANG_ENABLE_RADIX_CACHE": "true"
|
|
228
185
|
},
|
|
229
|
-
"recommendedInstanceTypes": [
|
|
230
|
-
"ml.g5.4xlarge",
|
|
231
|
-
"ml.g5.12xlarge"
|
|
232
|
-
],
|
|
233
186
|
"notes": "RadixAttention provides automatic KV cache reuse for improved throughput"
|
|
234
187
|
}
|
|
235
188
|
},
|
|
@@ -254,12 +207,7 @@
|
|
|
254
207
|
"SGLANG_MAX_RUNNING_REQUESTS": "256",
|
|
255
208
|
"SGLANG_CONTEXT_LENGTH": "4096"
|
|
256
209
|
},
|
|
257
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
258
|
-
"recommendedInstanceTypes": [
|
|
259
|
-
"ml.g5.xlarge",
|
|
260
|
-
"ml.g5.2xlarge",
|
|
261
|
-
"ml.g5.4xlarge"
|
|
262
|
-
]
|
|
210
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
|
|
263
211
|
},
|
|
264
212
|
"accelerator": {
|
|
265
213
|
"type": "cuda",
|
|
@@ -278,10 +226,6 @@
|
|
|
278
226
|
"SGLANG_MAX_RUNNING_REQUESTS": "256",
|
|
279
227
|
"SGLANG_MEM_FRACTION": "0.9"
|
|
280
228
|
},
|
|
281
|
-
"recommendedInstanceTypes": [
|
|
282
|
-
"ml.g5.xlarge",
|
|
283
|
-
"ml.g5.2xlarge"
|
|
284
|
-
],
|
|
285
229
|
"notes": "Good starting point for most workloads"
|
|
286
230
|
},
|
|
287
231
|
"high-throughput": {
|
|
@@ -293,10 +237,6 @@
|
|
|
293
237
|
"SGLANG_CONTEXT_LENGTH": "2048",
|
|
294
238
|
"SGLANG_ENABLE_RADIX_CACHE": "true"
|
|
295
239
|
},
|
|
296
|
-
"recommendedInstanceTypes": [
|
|
297
|
-
"ml.g5.4xlarge",
|
|
298
|
-
"ml.g5.12xlarge"
|
|
299
|
-
],
|
|
300
240
|
"notes": "RadixAttention provides automatic KV cache reuse for improved throughput"
|
|
301
241
|
}
|
|
302
242
|
},
|
|
@@ -326,13 +266,7 @@
|
|
|
326
266
|
"TRTLLM_ENABLE_CHUNKED_CONTEXT": "true",
|
|
327
267
|
"UCX_MEMTYPE_CACHE": "n"
|
|
328
268
|
},
|
|
329
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
330
|
-
"recommendedInstanceTypes": [
|
|
331
|
-
"ml.g5.2xlarge",
|
|
332
|
-
"ml.g5.4xlarge",
|
|
333
|
-
"ml.g5.12xlarge",
|
|
334
|
-
"ml.g5.48xlarge"
|
|
335
|
-
]
|
|
269
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
336
270
|
},
|
|
337
271
|
"accelerator": {
|
|
338
272
|
"type": "cuda",
|
|
@@ -352,10 +286,6 @@
|
|
|
352
286
|
"TRTLLM_MAX_BATCH_SIZE": "16",
|
|
353
287
|
"TRTLLM_ENABLE_CHUNKED_CONTEXT": "true"
|
|
354
288
|
},
|
|
355
|
-
"recommendedInstanceTypes": [
|
|
356
|
-
"ml.g5.2xlarge",
|
|
357
|
-
"ml.g5.4xlarge"
|
|
358
|
-
],
|
|
359
289
|
"notes": "Chunked context allows processing longer sequences"
|
|
360
290
|
},
|
|
361
291
|
"int8": {
|
|
@@ -367,10 +297,6 @@
|
|
|
367
297
|
"TRTLLM_USE_WEIGHT_ONLY": "true",
|
|
368
298
|
"TRTLLM_WEIGHT_ONLY_PRECISION": "int8"
|
|
369
299
|
},
|
|
370
|
-
"recommendedInstanceTypes": [
|
|
371
|
-
"ml.g5.xlarge",
|
|
372
|
-
"ml.g5.2xlarge"
|
|
373
|
-
],
|
|
374
300
|
"notes": "Weight-only quantization provides best speed/accuracy tradeoff"
|
|
375
301
|
},
|
|
376
302
|
"int4": {
|
|
@@ -382,9 +308,6 @@
|
|
|
382
308
|
"TRTLLM_USE_WEIGHT_ONLY": "true",
|
|
383
309
|
"TRTLLM_WEIGHT_ONLY_PRECISION": "int4"
|
|
384
310
|
},
|
|
385
|
-
"recommendedInstanceTypes": [
|
|
386
|
-
"ml.g5.xlarge"
|
|
387
|
-
],
|
|
388
311
|
"notes": "Enables running larger models on smaller instances with acceptable accuracy"
|
|
389
312
|
}
|
|
390
313
|
},
|
|
@@ -412,13 +335,7 @@
|
|
|
412
335
|
"TRTLLM_ENABLE_CHUNKED_CONTEXT": "true",
|
|
413
336
|
"UCX_MEMTYPE_CACHE": "n"
|
|
414
337
|
},
|
|
415
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
416
|
-
"recommendedInstanceTypes": [
|
|
417
|
-
"ml.g5.2xlarge",
|
|
418
|
-
"ml.g5.4xlarge",
|
|
419
|
-
"ml.g5.12xlarge",
|
|
420
|
-
"ml.g5.48xlarge"
|
|
421
|
-
]
|
|
338
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
422
339
|
},
|
|
423
340
|
"accelerator": {
|
|
424
341
|
"type": "cuda",
|
|
@@ -438,10 +355,6 @@
|
|
|
438
355
|
"TRTLLM_MAX_BATCH_SIZE": "16",
|
|
439
356
|
"TRTLLM_ENABLE_CHUNKED_CONTEXT": "true"
|
|
440
357
|
},
|
|
441
|
-
"recommendedInstanceTypes": [
|
|
442
|
-
"ml.g5.2xlarge",
|
|
443
|
-
"ml.g5.4xlarge"
|
|
444
|
-
],
|
|
445
358
|
"notes": "Chunked context allows processing longer sequences"
|
|
446
359
|
},
|
|
447
360
|
"int8": {
|
|
@@ -453,10 +366,6 @@
|
|
|
453
366
|
"TRTLLM_USE_WEIGHT_ONLY": "true",
|
|
454
367
|
"TRTLLM_WEIGHT_ONLY_PRECISION": "int8"
|
|
455
368
|
},
|
|
456
|
-
"recommendedInstanceTypes": [
|
|
457
|
-
"ml.g5.xlarge",
|
|
458
|
-
"ml.g5.2xlarge"
|
|
459
|
-
],
|
|
460
369
|
"notes": "Weight-only quantization provides best speed/accuracy tradeoff"
|
|
461
370
|
},
|
|
462
371
|
"int4": {
|
|
@@ -468,9 +377,6 @@
|
|
|
468
377
|
"TRTLLM_USE_WEIGHT_ONLY": "true",
|
|
469
378
|
"TRTLLM_WEIGHT_ONLY_PRECISION": "int4"
|
|
470
379
|
},
|
|
471
|
-
"recommendedInstanceTypes": [
|
|
472
|
-
"ml.g5.xlarge"
|
|
473
|
-
],
|
|
474
380
|
"notes": "Enables running larger models on smaller instances with acceptable accuracy"
|
|
475
381
|
}
|
|
476
382
|
},
|
|
@@ -497,13 +403,7 @@
|
|
|
497
403
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "32",
|
|
498
404
|
"OPTION_DTYPE": "fp16"
|
|
499
405
|
},
|
|
500
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
501
|
-
"recommendedInstanceTypes": [
|
|
502
|
-
"ml.g5.xlarge",
|
|
503
|
-
"ml.g5.2xlarge",
|
|
504
|
-
"ml.g5.4xlarge",
|
|
505
|
-
"ml.g5.12xlarge"
|
|
506
|
-
]
|
|
406
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
507
407
|
},
|
|
508
408
|
"accelerator": {
|
|
509
409
|
"type": "cuda",
|
|
@@ -523,10 +423,6 @@
|
|
|
523
423
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "32",
|
|
524
424
|
"OPTION_DTYPE": "fp16"
|
|
525
425
|
},
|
|
526
|
-
"recommendedInstanceTypes": [
|
|
527
|
-
"ml.g5.xlarge",
|
|
528
|
-
"ml.g5.2xlarge"
|
|
529
|
-
],
|
|
530
426
|
"notes": "vLLM backend provides excellent performance for most models"
|
|
531
427
|
},
|
|
532
428
|
"tensorrt-backend": {
|
|
@@ -537,10 +433,6 @@
|
|
|
537
433
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "16",
|
|
538
434
|
"OPTION_DTYPE": "fp16"
|
|
539
435
|
},
|
|
540
|
-
"recommendedInstanceTypes": [
|
|
541
|
-
"ml.g5.2xlarge",
|
|
542
|
-
"ml.g5.4xlarge"
|
|
543
|
-
],
|
|
544
436
|
"notes": "TensorRT-LLM provides best performance but requires model compilation"
|
|
545
437
|
},
|
|
546
438
|
"lmi-dist": {
|
|
@@ -551,10 +443,6 @@
|
|
|
551
443
|
"OPTION_TENSOR_PARALLEL_DEGREE": "4",
|
|
552
444
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "64"
|
|
553
445
|
},
|
|
554
|
-
"recommendedInstanceTypes": [
|
|
555
|
-
"ml.g5.12xlarge",
|
|
556
|
-
"ml.g5.48xlarge"
|
|
557
|
-
],
|
|
558
446
|
"notes": "Best for very large models requiring multi-GPU tensor parallelism"
|
|
559
447
|
},
|
|
560
448
|
"auto": {
|
|
@@ -564,11 +452,6 @@
|
|
|
564
452
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "32",
|
|
565
453
|
"OPTION_DTYPE": "fp16"
|
|
566
454
|
},
|
|
567
|
-
"recommendedInstanceTypes": [
|
|
568
|
-
"ml.g5.xlarge",
|
|
569
|
-
"ml.g5.2xlarge",
|
|
570
|
-
"ml.g5.4xlarge"
|
|
571
|
-
],
|
|
572
455
|
"notes": "LMI will analyze your model and select the optimal backend automatically"
|
|
573
456
|
}
|
|
574
457
|
},
|
|
@@ -593,13 +476,7 @@
|
|
|
593
476
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "32",
|
|
594
477
|
"OPTION_DTYPE": "fp16"
|
|
595
478
|
},
|
|
596
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
597
|
-
"recommendedInstanceTypes": [
|
|
598
|
-
"ml.g5.xlarge",
|
|
599
|
-
"ml.g5.2xlarge",
|
|
600
|
-
"ml.g5.4xlarge",
|
|
601
|
-
"ml.g5.12xlarge"
|
|
602
|
-
]
|
|
479
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
603
480
|
},
|
|
604
481
|
"accelerator": {
|
|
605
482
|
"type": "cuda",
|
|
@@ -619,10 +496,6 @@
|
|
|
619
496
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "32",
|
|
620
497
|
"OPTION_DTYPE": "fp16"
|
|
621
498
|
},
|
|
622
|
-
"recommendedInstanceTypes": [
|
|
623
|
-
"ml.g5.xlarge",
|
|
624
|
-
"ml.g5.2xlarge"
|
|
625
|
-
],
|
|
626
499
|
"notes": "vLLM backend provides excellent performance for most models"
|
|
627
500
|
},
|
|
628
501
|
"tensorrt-backend": {
|
|
@@ -633,10 +506,6 @@
|
|
|
633
506
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "16",
|
|
634
507
|
"OPTION_DTYPE": "fp16"
|
|
635
508
|
},
|
|
636
|
-
"recommendedInstanceTypes": [
|
|
637
|
-
"ml.g5.2xlarge",
|
|
638
|
-
"ml.g5.4xlarge"
|
|
639
|
-
],
|
|
640
509
|
"notes": "TensorRT-LLM provides best performance but requires model compilation"
|
|
641
510
|
},
|
|
642
511
|
"lmi-dist": {
|
|
@@ -647,10 +516,6 @@
|
|
|
647
516
|
"OPTION_TENSOR_PARALLEL_DEGREE": "4",
|
|
648
517
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "64"
|
|
649
518
|
},
|
|
650
|
-
"recommendedInstanceTypes": [
|
|
651
|
-
"ml.g5.12xlarge",
|
|
652
|
-
"ml.g5.48xlarge"
|
|
653
|
-
],
|
|
654
519
|
"notes": "Best for very large models requiring multi-GPU tensor parallelism"
|
|
655
520
|
},
|
|
656
521
|
"auto": {
|
|
@@ -660,11 +525,6 @@
|
|
|
660
525
|
"OPTION_MAX_ROLLING_BATCH_SIZE": "32",
|
|
661
526
|
"OPTION_DTYPE": "fp16"
|
|
662
527
|
},
|
|
663
|
-
"recommendedInstanceTypes": [
|
|
664
|
-
"ml.g5.xlarge",
|
|
665
|
-
"ml.g5.2xlarge",
|
|
666
|
-
"ml.g5.4xlarge"
|
|
667
|
-
],
|
|
668
528
|
"notes": "LMI will analyze your model and select the optimal backend automatically"
|
|
669
529
|
}
|
|
670
530
|
},
|
|
@@ -690,12 +550,7 @@
|
|
|
690
550
|
"OPTION_TENSOR_PARALLEL_DEGREE": "1",
|
|
691
551
|
"OPTION_DEVICE_MAP": "auto"
|
|
692
552
|
},
|
|
693
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
694
|
-
"recommendedInstanceTypes": [
|
|
695
|
-
"ml.g5.xlarge",
|
|
696
|
-
"ml.g5.2xlarge",
|
|
697
|
-
"ml.g5.4xlarge"
|
|
698
|
-
]
|
|
553
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
699
554
|
},
|
|
700
555
|
"accelerator": {
|
|
701
556
|
"type": "cuda",
|
|
@@ -715,10 +570,6 @@
|
|
|
715
570
|
"OPTION_DEVICE_MAP": "auto",
|
|
716
571
|
"BATCH_SIZE": "1"
|
|
717
572
|
},
|
|
718
|
-
"recommendedInstanceTypes": [
|
|
719
|
-
"ml.g5.xlarge",
|
|
720
|
-
"ml.g5.2xlarge"
|
|
721
|
-
],
|
|
722
573
|
"notes": "PyTorch engine provides good compatibility with HuggingFace models"
|
|
723
574
|
},
|
|
724
575
|
"multi-gpu": {
|
|
@@ -729,10 +580,6 @@
|
|
|
729
580
|
"OPTION_TENSOR_PARALLEL_DEGREE": "4",
|
|
730
581
|
"OPTION_DEVICE_MAP": "auto"
|
|
731
582
|
},
|
|
732
|
-
"recommendedInstanceTypes": [
|
|
733
|
-
"ml.g5.12xlarge",
|
|
734
|
-
"ml.g5.48xlarge"
|
|
735
|
-
],
|
|
736
583
|
"notes": "Distribute model across multiple GPUs for large models"
|
|
737
584
|
}
|
|
738
585
|
},
|
|
@@ -756,12 +603,7 @@
|
|
|
756
603
|
"OPTION_TENSOR_PARALLEL_DEGREE": "1",
|
|
757
604
|
"OPTION_DEVICE_MAP": "auto"
|
|
758
605
|
},
|
|
759
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
760
|
-
"recommendedInstanceTypes": [
|
|
761
|
-
"ml.g5.xlarge",
|
|
762
|
-
"ml.g5.2xlarge",
|
|
763
|
-
"ml.g5.4xlarge"
|
|
764
|
-
]
|
|
606
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
765
607
|
},
|
|
766
608
|
"accelerator": {
|
|
767
609
|
"type": "cuda",
|
|
@@ -781,10 +623,6 @@
|
|
|
781
623
|
"OPTION_DEVICE_MAP": "auto",
|
|
782
624
|
"BATCH_SIZE": "1"
|
|
783
625
|
},
|
|
784
|
-
"recommendedInstanceTypes": [
|
|
785
|
-
"ml.g5.xlarge",
|
|
786
|
-
"ml.g5.2xlarge"
|
|
787
|
-
],
|
|
788
626
|
"notes": "PyTorch engine provides good compatibility with HuggingFace models"
|
|
789
627
|
},
|
|
790
628
|
"multi-gpu": {
|
|
@@ -795,10 +633,6 @@
|
|
|
795
633
|
"OPTION_TENSOR_PARALLEL_DEGREE": "4",
|
|
796
634
|
"OPTION_DEVICE_MAP": "auto"
|
|
797
635
|
},
|
|
798
|
-
"recommendedInstanceTypes": [
|
|
799
|
-
"ml.g5.12xlarge",
|
|
800
|
-
"ml.g5.48xlarge"
|
|
801
|
-
],
|
|
802
636
|
"notes": "Distribute model across multiple GPUs for large models"
|
|
803
637
|
}
|
|
804
638
|
},
|
|
@@ -823,12 +657,7 @@
|
|
|
823
657
|
"HF_TOKEN": "${hfToken}",
|
|
824
658
|
"VLLM_WORKER_MULTIPROC_METHOD": "spawn"
|
|
825
659
|
},
|
|
826
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
827
|
-
"recommendedInstanceTypes": [
|
|
828
|
-
"ml.g5.2xlarge",
|
|
829
|
-
"ml.g5.4xlarge",
|
|
830
|
-
"ml.g5.12xlarge"
|
|
831
|
-
]
|
|
660
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
832
661
|
},
|
|
833
662
|
"accelerator": {
|
|
834
663
|
"type": "cuda",
|
|
@@ -844,30 +673,18 @@
|
|
|
844
673
|
"displayName": "Quality",
|
|
845
674
|
"description": "Higher step count for better image quality",
|
|
846
675
|
"envVars": {},
|
|
847
|
-
"recommendedInstanceTypes": [
|
|
848
|
-
"ml.g5.4xlarge",
|
|
849
|
-
"ml.g5.12xlarge"
|
|
850
|
-
],
|
|
851
676
|
"notes": "Best image quality, no cache acceleration, VAE tiling for memory efficiency"
|
|
852
677
|
},
|
|
853
678
|
"speed": {
|
|
854
679
|
"displayName": "Speed",
|
|
855
680
|
"description": "Cache acceleration for faster generation",
|
|
856
681
|
"envVars": {},
|
|
857
|
-
"recommendedInstanceTypes": [
|
|
858
|
-
"ml.g5.2xlarge",
|
|
859
|
-
"ml.g5.4xlarge"
|
|
860
|
-
],
|
|
861
682
|
"notes": "TeaCache acceleration reduces redundant computation between denoising steps"
|
|
862
683
|
},
|
|
863
684
|
"multi-gpu": {
|
|
864
685
|
"displayName": "Multi-GPU",
|
|
865
686
|
"description": "Sequence parallelism for large diffusion models",
|
|
866
687
|
"envVars": {},
|
|
867
|
-
"recommendedInstanceTypes": [
|
|
868
|
-
"ml.g5.12xlarge",
|
|
869
|
-
"ml.g5.48xlarge"
|
|
870
|
-
],
|
|
871
688
|
"notes": "Ulysses sequence parallelism for large models like FLUX on multi-GPU instances"
|
|
872
689
|
}
|
|
873
690
|
},
|
|
@@ -890,12 +707,7 @@
|
|
|
890
707
|
"HF_TOKEN": "${hfToken}",
|
|
891
708
|
"VLLM_WORKER_MULTIPROC_METHOD": "spawn"
|
|
892
709
|
},
|
|
893
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
894
|
-
"recommendedInstanceTypes": [
|
|
895
|
-
"ml.g5.2xlarge",
|
|
896
|
-
"ml.g5.4xlarge",
|
|
897
|
-
"ml.g5.12xlarge"
|
|
898
|
-
]
|
|
710
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
899
711
|
},
|
|
900
712
|
"accelerator": {
|
|
901
713
|
"type": "cuda",
|
|
@@ -911,30 +723,18 @@
|
|
|
911
723
|
"displayName": "Quality",
|
|
912
724
|
"description": "Higher step count for better image quality",
|
|
913
725
|
"envVars": {},
|
|
914
|
-
"recommendedInstanceTypes": [
|
|
915
|
-
"ml.g5.4xlarge",
|
|
916
|
-
"ml.g5.12xlarge"
|
|
917
|
-
],
|
|
918
726
|
"notes": "Best image quality, no cache acceleration, VAE tiling for memory efficiency"
|
|
919
727
|
},
|
|
920
728
|
"speed": {
|
|
921
729
|
"displayName": "Speed",
|
|
922
730
|
"description": "Cache acceleration for faster generation",
|
|
923
731
|
"envVars": {},
|
|
924
|
-
"recommendedInstanceTypes": [
|
|
925
|
-
"ml.g5.2xlarge",
|
|
926
|
-
"ml.g5.4xlarge"
|
|
927
|
-
],
|
|
928
732
|
"notes": "TeaCache acceleration reduces redundant computation between denoising steps"
|
|
929
733
|
},
|
|
930
734
|
"multi-gpu": {
|
|
931
735
|
"displayName": "Multi-GPU",
|
|
932
736
|
"description": "Sequence parallelism for large diffusion models",
|
|
933
737
|
"envVars": {},
|
|
934
|
-
"recommendedInstanceTypes": [
|
|
935
|
-
"ml.g5.12xlarge",
|
|
936
|
-
"ml.g5.48xlarge"
|
|
937
|
-
],
|
|
938
738
|
"notes": "Ulysses sequence parallelism for large models like FLUX on multi-GPU instances"
|
|
939
739
|
}
|
|
940
740
|
},
|
|
@@ -958,11 +758,7 @@
|
|
|
958
758
|
"envVars": {
|
|
959
759
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
960
760
|
},
|
|
961
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
962
|
-
"recommendedInstanceTypes": [
|
|
963
|
-
"ml.g5.xlarge",
|
|
964
|
-
"ml.g5.2xlarge"
|
|
965
|
-
]
|
|
761
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
966
762
|
},
|
|
967
763
|
"accelerator": {
|
|
968
764
|
"type": "cuda",
|
|
@@ -993,11 +789,7 @@
|
|
|
993
789
|
"envVars": {
|
|
994
790
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
995
791
|
},
|
|
996
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
997
|
-
"recommendedInstanceTypes": [
|
|
998
|
-
"ml.g5.xlarge",
|
|
999
|
-
"ml.g5.2xlarge"
|
|
1000
|
-
]
|
|
792
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1001
793
|
},
|
|
1002
794
|
"accelerator": {
|
|
1003
795
|
"type": "cuda",
|
|
@@ -1028,11 +820,7 @@
|
|
|
1028
820
|
"envVars": {
|
|
1029
821
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
1030
822
|
},
|
|
1031
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1032
|
-
"recommendedInstanceTypes": [
|
|
1033
|
-
"ml.g5.xlarge",
|
|
1034
|
-
"ml.g5.2xlarge"
|
|
1035
|
-
]
|
|
823
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1036
824
|
},
|
|
1037
825
|
"accelerator": {
|
|
1038
826
|
"type": "cuda",
|
|
@@ -1063,11 +851,7 @@
|
|
|
1063
851
|
"envVars": {
|
|
1064
852
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
1065
853
|
},
|
|
1066
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1067
|
-
"recommendedInstanceTypes": [
|
|
1068
|
-
"ml.g5.xlarge",
|
|
1069
|
-
"ml.g5.2xlarge"
|
|
1070
|
-
]
|
|
854
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1071
855
|
},
|
|
1072
856
|
"accelerator": {
|
|
1073
857
|
"type": "cuda",
|
|
@@ -1098,12 +882,7 @@
|
|
|
1098
882
|
"envVars": {
|
|
1099
883
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
1100
884
|
},
|
|
1101
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1102
|
-
"recommendedInstanceTypes": [
|
|
1103
|
-
"ml.g5.xlarge",
|
|
1104
|
-
"ml.g5.2xlarge",
|
|
1105
|
-
"ml.g5.4xlarge"
|
|
1106
|
-
]
|
|
885
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1107
886
|
},
|
|
1108
887
|
"accelerator": {
|
|
1109
888
|
"type": "cuda",
|
|
@@ -1134,12 +913,7 @@
|
|
|
1134
913
|
"envVars": {
|
|
1135
914
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
1136
915
|
},
|
|
1137
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1138
|
-
"recommendedInstanceTypes": [
|
|
1139
|
-
"ml.g5.2xlarge",
|
|
1140
|
-
"ml.g5.4xlarge",
|
|
1141
|
-
"ml.g5.12xlarge"
|
|
1142
|
-
]
|
|
916
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1143
917
|
},
|
|
1144
918
|
"accelerator": {
|
|
1145
919
|
"type": "cuda",
|
|
@@ -1170,11 +944,7 @@
|
|
|
1170
944
|
"envVars": {
|
|
1171
945
|
"TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
|
|
1172
946
|
},
|
|
1173
|
-
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1174
|
-
"recommendedInstanceTypes": [
|
|
1175
|
-
"ml.g5.xlarge",
|
|
1176
|
-
"ml.g5.2xlarge"
|
|
1177
|
-
]
|
|
947
|
+
"inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
|
|
1178
948
|
},
|
|
1179
949
|
"accelerator": {
|
|
1180
950
|
"type": "cuda",
|