npm - @aws/ml-container-creator - Versions diffs - 0.2.6 → 0.4.0 - Mend

@aws/ml-container-creator 0.2.6 → 0.4.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

package/bin/cli.js +38 -2
package/config/bootstrap-stack.json +94 -1
package/config/defaults.json +1 -1
package/infra/ci-harness/package-lock.json +22 -9
package/package.json +3 -1
package/servers/instance-sizer/index.js +45 -8
package/servers/instance-sizer/lib/instance-ranker.js +140 -11
package/servers/instance-sizer/lib/model-resolver.js +10 -6
package/servers/instance-sizer/lib/quota-resolver.js +368 -0
package/servers/instance-sizer/package.json +2 -0
package/servers/lib/catalogs/instances.json +527 -12
package/servers/lib/catalogs/model-servers.json +298 -20
package/servers/lib/catalogs/model-sizes.json +27 -0
package/servers/lib/catalogs/models.json +101 -0
package/servers/lib/schemas/image-catalog.schema.json +15 -1
package/servers/model-picker/index.js +2 -1
package/src/app.js +96 -2
package/src/lib/architecture-sync.js +171 -0
package/src/lib/arn-detection.js +22 -0
package/src/lib/bootstrap-command-handler.js +178 -3
package/src/lib/cli-handler.js +2 -2
package/src/lib/config-manager.js +121 -1
package/src/lib/cross-cutting-checker.js +119 -0
package/src/lib/deployment-entry-schema.js +1 -2
package/src/lib/prompt-runner.js +514 -20
package/src/lib/prompts.js +67 -5
package/src/lib/registry-command-handler.js +236 -0
package/src/lib/schema-sync.js +31 -0
package/src/lib/secret-classification.js +56 -0
package/src/lib/secrets-command-handler.js +550 -0
package/src/lib/template-manager.js +49 -1
package/src/lib/validate-runner.js +174 -2
package/src/lib/validation-report.js +8 -1
package/src/prompt-adapter.js +3 -2
package/templates/Dockerfile +10 -2
package/templates/code/cuda_compat.sh +22 -0
package/templates/code/serve +3 -0
package/templates/code/start_server.sh +3 -0
package/templates/diffusors/Dockerfile +2 -1
package/templates/diffusors/serve +3 -0
package/templates/do/README.md +33 -0
package/templates/do/benchmark +646 -0
package/templates/do/build +22 -0
package/templates/do/clean +86 -0
package/templates/do/config +41 -6
package/templates/do/deploy +66 -6
package/templates/do/logs +18 -3
package/templates/do/register +8 -1
package/templates/do/run +10 -0
package/templates/triton/Dockerfile +5 -0

package/servers/lib/catalogs/model-servers.json CHANGED Viewed

@@ -64,7 +64,157 @@
                     "notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
                 }
             },
-            "notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+"
+            "notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+",
+            "supportedModelTypes": [
+                "arcee",
+                "arctic",
+                "aria",
+                "aya_vision",
+                "baichuan",
+                "bailing_moe",
+                "bamba",
+                "bart",
+                "bert",
+                "bert_with_rope",
+                "blip2",
+                "bloom",
+                "chameleon",
+                "chatglm",
+                "cohere2_vision",
+                "commandr",
+                "dbrx",
+                "deepseek",
+                "deepseek_mtp",
+                "deepseek_v2",
+                "deepseek_vl2",
+                "dots1",
+                "ernie45",
+                "ernie45_moe",
+                "exaone",
+                "exaone4",
+                "fairseq2_llama",
+                "falcon",
+                "falcon_h1",
+                "florence2",
+                "fuyu",
+                "gemma",
+                "gemma2",
+                "gemma3",
+                "gemma3_mm",
+                "gemma3n",
+                "gemma3n_mm",
+                "glm",
+                "glm4",
+                "glm4_1v",
+                "glm4_moe",
+                "glm4_moe_mtp",
+                "glm4v",
+                "gpt2",
+                "gpt_bigcode",
+                "gpt_j",
+                "gpt_neox",
+                "gpt_oss",
+                "granite",
+                "granite_speech",
+                "granitemoe",
+                "granitemoehybrid",
+                "granitemoeshared",
+                "gritlm",
+                "grok1",
+                "h2ovl",
+                "hunyuan_v1",
+                "hyperclovax_vision",
+                "idefics3",
+                "internlm2",
+                "internlm2_ve",
+                "interns1",
+                "internvl",
+                "jais",
+                "jamba",
+                "jina_vl",
+                "keye",
+                "kimi_vl",
+                "llama",
+                "llama4",
+                "llama4_eagle",
+                "llama_eagle",
+                "llama_eagle3",
+                "llava",
+                "llava_next",
+                "llava_next_video",
+                "llava_onevision",
+                "mamba",
+                "mamba2",
+                "medusa",
+                "mimo",
+                "mimo_mtp",
+                "minicpm",
+                "minicpm3",
+                "minicpm_eagle",
+                "minicpmo",
+                "minicpmv",
+                "minimax_text_01",
+                "minimax_vl_01",
+                "mistral3",
+                "mixtral",
+                "mixtral_quant",
+                "mllama",
+                "mllama4",
+                "mlp_speculator",
+                "modernbert",
+                "molmo",
+                "mpt",
+                "nemotron",
+                "nemotron_h",
+                "nemotron_nas",
+                "nemotron_vl",
+                "nvlm_d",
+                "olmo",
+                "olmo2",
+                "olmoe",
+                "opt",
+                "orion",
+                "ovis",
+                "paligemma",
+                "persimmon",
+                "phi",
+                "phi3",
+                "phi3v",
+                "phi4_multimodal",
+                "phi4flash",
+                "phi4mm",
+                "phimoe",
+                "pixtral",
+                "plamo2",
+                "prithvi_geospatial_mae",
+                "qwen",
+                "qwen2",
+                "qwen2_5_omni_thinker",
+                "qwen2_5_vl",
+                "qwen2_audio",
+                "qwen2_moe",
+                "qwen2_rm",
+                "qwen2_vl",
+                "qwen3",
+                "qwen3_moe",
+                "qwen_vl",
+                "roberta",
+                "skyworkr1v",
+                "smolvlm",
+                "solar",
+                "stablelm",
+                "starcoder2",
+                "step3_text",
+                "step3_vl",
+                "tarsier",
+                "telechat2",
+                "teleflm",
+                "transformers",
+                "ultravox",
+                "voxtral",
+                "whisper",
+                "zamba2"
+            ]
         },
         {
             "image": "vllm/vllm-openai:v0.9.1",
@@ -130,7 +280,133 @@
                     "notes": "Requires instance with 4+ GPUs. Set TENSOR_PARALLEL_SIZE to match GPU count"
                 }
             },
-            "notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+"
+            "notes": "vLLM 0.4.0 adds prefix caching and improved performance. Requires CUDA 12.0+",
+            "supportedModelTypes": [
+                "arctic",
+                "aria",
+                "aya_vision",
+                "baichuan",
+                "bamba",
+                "bart",
+                "bert",
+                "bert_with_rope",
+                "blip2",
+                "bloom",
+                "chameleon",
+                "chatglm",
+                "commandr",
+                "dbrx",
+                "deepseek",
+                "deepseek_mtp",
+                "deepseek_v2",
+                "deepseek_vl2",
+                "eagle",
+                "exaone",
+                "fairseq2_llama",
+                "falcon",
+                "falcon_h1",
+                "florence2",
+                "fuyu",
+                "gemma",
+                "gemma2",
+                "gemma3",
+                "gemma3_mm",
+                "glm",
+                "glm4",
+                "glm4v",
+                "gpt2",
+                "gpt_bigcode",
+                "gpt_j",
+                "gpt_neox",
+                "granite",
+                "granite_speech",
+                "granitemoe",
+                "granitemoehybrid",
+                "granitemoeshared",
+                "gritlm",
+                "grok1",
+                "h2ovl",
+                "idefics3",
+                "internlm2",
+                "internlm2_ve",
+                "internvl",
+                "jais",
+                "jamba",
+                "kimi_vl",
+                "llama",
+                "llama_eagle",
+                "llama_eagle3",
+                "llava",
+                "llava_next",
+                "llava_next_video",
+                "llava_onevision",
+                "mamba",
+                "mamba2",
+                "medusa",
+                "mimo",
+                "mimo_mtp",
+                "minicpm",
+                "minicpm3",
+                "minicpm_eagle",
+                "minicpmo",
+                "minicpmv",
+                "minimax_text_01",
+                "minimax_vl_01",
+                "mistral3",
+                "mixtral",
+                "mixtral_quant",
+                "mllama",
+                "mllama4",
+                "mlp_speculator",
+                "modernbert",
+                "molmo",
+                "mpt",
+                "nemotron",
+                "nemotron_h",
+                "nemotron_nas",
+                "nvlm_d",
+                "olmo",
+                "olmo2",
+                "olmoe",
+                "opt",
+                "orion",
+                "ovis",
+                "paligemma",
+                "persimmon",
+                "phi",
+                "phi3",
+                "phi3_small",
+                "phi3v",
+                "phi4mm",
+                "phimoe",
+                "pixtral",
+                "plamo2",
+                "prithvi_geospatial_mae",
+                "qwen",
+                "qwen2",
+                "qwen2_5_omni_thinker",
+                "qwen2_5_vl",
+                "qwen2_audio",
+                "qwen2_moe",
+                "qwen2_rm",
+                "qwen2_vl",
+                "qwen3",
+                "qwen3_moe",
+                "qwen_vl",
+                "roberta",
+                "skyworkr1v",
+                "smolvlm",
+                "solar",
+                "stablelm",
+                "starcoder2",
+                "tarsier",
+                "telechat2",
+                "teleflm",
+                "transformers",
+                "ultravox",
+                "whisper",
+                "zamba2"
+            ]
         }
     ],
     "sglang": [
@@ -266,7 +542,7 @@
                     "TRTLLM_ENABLE_CHUNKED_CONTEXT": "true",
                     "UCX_MEMTYPE_CACHE": "n"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -311,7 +587,8 @@
                     "notes": "Enables running larger models on smaller instances with acceptable accuracy"
                 }
             },
-            "notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+"
+            "notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+",
+            "supportedModelTypes": []
         },
         {
             "image": "nvcr.io/nvidia/tensorrt-llm/release:1.1.0",
@@ -335,7 +612,7 @@
                     "TRTLLM_ENABLE_CHUNKED_CONTEXT": "true",
                     "UCX_MEMTYPE_CACHE": "n"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -380,7 +657,8 @@
                     "notes": "Enables running larger models on smaller instances with acceptable accuracy"
                 }
             },
-            "notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+"
+            "notes": "TensorRT-LLM 1.0.0 adds chunked context and INT4 support. Requires CUDA 12.1+",
+            "supportedModelTypes": []
         }
     ],
     "lmi": [
@@ -403,7 +681,7 @@
                     "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
                     "OPTION_DTYPE": "fp16"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -476,7 +754,7 @@
                     "OPTION_MAX_ROLLING_BATCH_SIZE": "32",
                     "OPTION_DTYPE": "fp16"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -550,7 +828,7 @@
                     "OPTION_TENSOR_PARALLEL_DEGREE": "1",
                     "OPTION_DEVICE_MAP": "auto"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -603,7 +881,7 @@
                     "OPTION_TENSOR_PARALLEL_DEGREE": "1",
                     "OPTION_DEVICE_MAP": "auto"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -657,7 +935,7 @@
                     "HF_TOKEN": "${hfToken}",
                     "VLLM_WORKER_MULTIPROC_METHOD": "spawn"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -707,7 +985,7 @@
                     "HF_TOKEN": "${hfToken}",
                     "VLLM_WORKER_MULTIPROC_METHOD": "spawn"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -758,7 +1036,7 @@
                 "envVars": {
                     "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -789,7 +1067,7 @@
                 "envVars": {
                     "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -820,7 +1098,7 @@
                 "envVars": {
                     "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -851,7 +1129,7 @@
                 "envVars": {
                     "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -882,7 +1160,7 @@
                 "envVars": {
                     "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -913,7 +1191,7 @@
                 "envVars": {
                     "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -944,7 +1222,7 @@
                 "envVars": {
                     "TRITON_MODEL_REPOSITORY": "/opt/ml/model/model_repository"
                 },
-                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-2"
+                "inferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-3-1"
             },
             "accelerator": {
                 "type": "cuda",
@@ -958,4 +1236,4 @@
             "notes": "Triton Python backend for custom model serving with TritonPythonModel interface. GPU optional"
         }
     ]
-}
+}

package/servers/lib/catalogs/model-sizes.json CHANGED Viewed

@@ -46,6 +46,33 @@
             "minVramGb": 184,
             "recommendedInstances": ["ml.g5.48xlarge", "ml.p4d.24xlarge"]
         },
+        "meta-llama/Llama-3.1-8B*": {
+            "parameterCount": 8030261248,
+            "defaultDtype": "bfloat16",
+            "architecture": "LlamaForCausalLM",
+            "maxPositionEmbeddings": 131072,
+            "recommendedQuantizations": ["awq", "gptq"],
+            "minVramGb": 20,
+            "recommendedInstances": ["ml.g5.2xlarge", "ml.g6.2xlarge"]
+        },
+        "meta-llama/Llama-3.2-1B*": {
+            "parameterCount": 1235814400,
+            "defaultDtype": "bfloat16",
+            "architecture": "LlamaForCausalLM",
+            "maxPositionEmbeddings": 131072,
+            "recommendedQuantizations": ["awq", "gptq"],
+            "minVramGb": 5,
+            "recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
+        },
+        "meta-llama/Llama-3.2-3B*": {
+            "parameterCount": 3212749824,
+            "defaultDtype": "bfloat16",
+            "architecture": "LlamaForCausalLM",
+            "maxPositionEmbeddings": 131072,
+            "recommendedQuantizations": ["awq", "gptq"],
+            "minVramGb": 9,
+            "recommendedInstances": ["ml.g5.xlarge", "ml.g6.xlarge"]
+        },
         "mistralai/Mistral-7B*": {
             "parameterCount": 7241732096,
             "defaultDtype": "bfloat16",

package/servers/lib/catalogs/models.json CHANGED Viewed

@@ -1,6 +1,9 @@
 {
     "openai/gpt-oss-20b": {
         "family": "gpt-oss",
+        "parameterCount": 20000000000,
+        "defaultDtype": "bfloat16",
+        "maxPositionEmbeddings": 8192,
         "gated": false,
         "tags": [
             "text-generation",
@@ -99,6 +102,9 @@
     },
     "meta-llama/Llama-2-70b-chat-hf": {
         "family": "llama-2",
+        "parameterCount": 70000000000,
+        "defaultDtype": "float16",
+        "maxPositionEmbeddings": 4096,
         "gated": true,
         "tags": [
             "text-generation",
@@ -259,6 +265,30 @@
             "text-generation"
         ]
     },
+    "meta-llama/Llama-2-70b-hf": {
+        "family": "llama-2",
+        "parameterCount": 70000000000,
+        "defaultDtype": "float16",
+        "maxPositionEmbeddings": 4096,
+        "gated": true,
+        "tags": [
+            "text-generation",
+            "llama-2"
+        ],
+        "architecture": "LlamaForCausalLM",
+        "notes": "Llama-2 70B base model (non-chat). Requires multi-GPU for inference.",
+        "chatTemplate": "",
+        "frameworkCompatibility": {
+            "vllm": ">=0.3.0",
+            "tensorrt-llm": ">=0.8.0",
+            "sglang": ">=0.2.0"
+        },
+        "validationLevel": "community-validated",
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
     "meta-llama/Llama-2-*": {
         "family": "llama-2",
         "gated": true,
@@ -502,6 +532,77 @@
             "text-generation"
         ]
     },
+    "meta-llama/Llama-3.1-8B*": {
+        "parameterCount": 8030261248,
+        "defaultDtype": "bfloat16",
+        "architecture": "LlamaForCausalLM",
+        "maxPositionEmbeddings": 131072,
+        "recommendedQuantizations": [
+            "awq",
+            "gptq"
+        ],
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "meta-llama/Llama-3.1-70B*": {
+        "parameterCount": 70553706496,
+        "defaultDtype": "bfloat16",
+        "architecture": "LlamaForCausalLM",
+        "maxPositionEmbeddings": 131072,
+        "recommendedQuantizations": [
+            "awq",
+            "gptq"
+        ],
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "meta-llama/Llama-3.1-405B*": {
+        "parameterCount": 405000000000,
+        "defaultDtype": "bfloat16",
+        "architecture": "LlamaForCausalLM",
+        "maxPositionEmbeddings": 131072,
+        "recommendedQuantizations": [
+            "awq",
+            "gptq",
+            "fp8"
+        ],
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "meta-llama/Llama-3.2-1B*": {
+        "parameterCount": 1235814400,
+        "defaultDtype": "bfloat16",
+        "architecture": "LlamaForCausalLM",
+        "maxPositionEmbeddings": 131072,
+        "recommendedQuantizations": [
+            "awq",
+            "gptq"
+        ],
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
+    "meta-llama/Llama-3.2-3B*": {
+        "parameterCount": 3212749824,
+        "defaultDtype": "bfloat16",
+        "architecture": "LlamaForCausalLM",
+        "maxPositionEmbeddings": 131072,
+        "recommendedQuantizations": [
+            "awq",
+            "gptq"
+        ],
+        "modelType": "transformer",
+        "tasks": [
+            "text-generation"
+        ]
+    },
     "Qwen/Qwen-7B*": {
         "parameterCount": 7721324544,
         "defaultDtype": "bfloat16",

package/servers/lib/schemas/image-catalog.schema.json CHANGED Viewed

@@ -62,7 +62,15 @@
                             }
                         },
                         "inferenceAmiVersion": {
-                            "type": "string"
+                            "type": "string",
+                            "enum": [
+                                "al2023-ami-sagemaker-inference-cpu-0",
+                                "al2-ami-sagemaker-inference-gpu-2",
+                                "al2-ami-sagemaker-inference-gpu-2-1",
+                                "al2-ami-sagemaker-inference-neuron-2",
+                                "al2-ami-sagemaker-inference-gpu-3-1",
+                                "al2023-ami-sagemaker-inference-gpu-4-1"
+                            ]
                         }
                     },
                     "additionalProperties": false
@@ -145,6 +153,12 @@
                 },
                 "notes": {
                     "type": "string"
+                },
+                "supportedModelTypes": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
                 }
             },
             "additionalProperties": false

package/servers/model-picker/index.js CHANGED Viewed

@@ -195,11 +195,12 @@ class HuggingFaceResolver extends ModelResolver {
         }
         // Fetch model config (conditional)
-        if (!fields || fields.includes('architecture')) {
+        if (!fields || fields.includes('architecture') || fields.includes('model_type')) {
             const modelConfig = await this._fetchJson(
                 `${this.baseUrl}/${modelId}/resolve/main/config.json`
             )
             metadata.architecture = modelConfig?.architectures?.[0] || null
+            metadata.model_type = modelConfig?.model_type || null
         }
         return Object.keys(metadata).length > 0 ? metadata : null