npm - @aws/ml-container-creator - Versions diffs - 0.3.0 → 0.5.0 - Mend

@aws/ml-container-creator 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

package/bin/cli.js +5 -2
package/config/bootstrap-stack.json +86 -7
package/config/defaults.json +1 -1
package/infra/ci-harness/buildspec.yml +60 -0
package/package.json +3 -1
package/servers/README.md +41 -1
package/servers/instance-sizer/index.js +42 -2
package/servers/instance-sizer/lib/instance-ranker.js +114 -10
package/servers/instance-sizer/lib/quota-resolver.js +368 -0
package/servers/instance-sizer/package.json +2 -0
package/servers/lib/catalogs/instances.json +527 -12
package/servers/lib/catalogs/model-servers.json +15 -15
package/servers/lib/catalogs/model-sizes.json +27 -0
package/servers/lib/catalogs/models.json +71 -0
package/servers/lib/schemas/image-catalog.schema.json +9 -1
package/src/app.js +109 -3
package/src/lib/bootstrap-command-handler.js +96 -3
package/src/lib/cli-handler.js +2 -2
package/src/lib/config-manager.js +117 -1
package/src/lib/deployment-entry-schema.js +16 -0
package/src/lib/prompt-runner.js +270 -12
package/src/lib/prompts.js +288 -6
package/src/lib/registry-command-handler.js +12 -0
package/src/lib/schema-sync.js +31 -0
package/src/lib/template-manager.js +49 -1
package/src/lib/validate-runner.js +125 -2
package/templates/Dockerfile +22 -2
package/templates/code/cuda_compat.sh +22 -0
package/templates/code/serve +3 -0
package/templates/code/serving.properties +14 -0
package/templates/code/start_server.sh +3 -0
package/templates/diffusors/Dockerfile +2 -1
package/templates/diffusors/serve +3 -0
package/templates/do/README.md +33 -0
package/templates/do/adapter +1214 -0
package/templates/do/adapters/.gitkeep +2 -0
package/templates/do/add-ic +130 -0
package/templates/do/benchmark +718 -0
package/templates/do/clean +593 -17
package/templates/do/config +49 -4
package/templates/do/deploy +513 -362
package/templates/do/ic/default.conf +32 -0
package/templates/do/lib/endpoint-config.sh +216 -0
package/templates/do/lib/inference-component.sh +167 -0
package/templates/do/lib/secrets.sh +44 -0
package/templates/do/lib/wait.sh +131 -0
package/templates/do/logs +107 -27
package/templates/do/optimize +528 -0
package/templates/do/register +119 -2
package/templates/do/status +337 -0
package/templates/do/test +80 -28
package/templates/triton/Dockerfile +5 -0

package/templates/Dockerfile CHANGED Viewed

@@ -12,6 +12,9 @@
 <% if (framework !== 'transformers') { %>
 FROM <%= baseImage || 'public.ecr.aws/docker/library/python:3.12-slim' %>
+# Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
+ENV PYTHONUNBUFFERED=1
 # Set a docker label to name this project, postpended with the build time
 LABEL project.name="<%= projectName %>-<%= buildTimestamp %>" \
       project.base-name="<%= projectName %>" \
@@ -143,6 +146,9 @@ ARG BASE_IMAGE=<%= baseImage || 'deepjavalibrary/djl-serving:0.36.0-pytorch-gpu'
 FROM ${BASE_IMAGE}
+# Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
+ENV PYTHONUNBUFFERED=1
 <% if (comments && comments.chatTemplate) { %>
 <%= comments.chatTemplate %>
 <% } %>
@@ -232,6 +238,18 @@ ENV <%= key %>=<%= value %>
 <% }); %>
 <% } %>
+<% if (enableLora && modelServer === 'vllm') { %>
+# LoRA adapter serving configuration
+ENV VLLM_ENABLE_LORA=true
+ENV VLLM_MAX_LORAS=<%= maxLoras %>
+ENV VLLM_MAX_LORA_RANK=<%= maxLoraRank %>
+<% } %>
+<% if (enableLora && modelServer === 'sglang') { %>
+# LoRA adapter serving configuration
+ENV SGLANG_ENABLE_LORA=true
+ENV SGLANG_MAX_LORAS=<%= maxLoras %>
+<% } %>
 <% if (typeof modelSource !== 'undefined' && modelSource && modelSource !== 'huggingface' && modelServer !== 'lmi' && modelServer !== 'djl') { %>
 # Install AWS CLI for S3 model downloads
 RUN pip install --no-cache-dir awscli
@@ -271,8 +289,9 @@ COPY code/serve /usr/bin/serve_trtllm
 RUN chmod +x /usr/bin/serve_trtllm
 # Copy startup script
+COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
 COPY code/start_server.sh /usr/bin/start_server.sh
-RUN chmod +x /usr/bin/start_server.sh
+RUN chmod +x /usr/bin/start_server.sh /usr/bin/cuda_compat.sh
 ENTRYPOINT [ "/usr/bin/start_server.sh" ]
 <% } else if (modelServer === 'lmi' || modelServer === 'djl') { %>
@@ -287,8 +306,9 @@ COPY code/serving.properties /opt/ml/model/serving.properties
 # LMI/DJL containers use their own entrypoint
 # The container will automatically start DJL Serving with the configuration
 <% } else { %>
+COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
 COPY code/serve /usr/bin/serve
-RUN chmod 777 /usr/bin/serve
+RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
 <% if (comments && comments.troubleshooting) { %>
 <%= comments.troubleshooting %>

package/templates/code/cuda_compat.sh ADDED Viewed

@@ -0,0 +1,22 @@
+#!/bin/bash
+# CUDA Compatibility Setup
+# Required for SageMaker inference AMIs using NVIDIA Container Toolkit 1.17.4+
+# (al2-ami-sagemaker-inference-gpu-2-1, al2-ami-sagemaker-inference-gpu-3-1,
+#  al2023-ami-sagemaker-inference-gpu-4-1)
+#
+# These AMIs no longer auto-mount CUDA compat libraries. This script detects
+# whether the host NVIDIA driver is older than what the container's CUDA toolkit
+# requires, and adds the compat libraries to LD_LIBRARY_PATH if needed.
+_verlt() {
+    [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
+}
+if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
+    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-)
+    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
+    if [ -n "$NVIDIA_DRIVER_VERSION" ] && _verlt "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
+        echo "CUDA compat: driver ${NVIDIA_DRIVER_VERSION} < ${CUDA_COMPAT_MAX_DRIVER_VERSION}, adding compat libs"
+        export LD_LIBRARY_PATH=/usr/local/cuda/compat:${LD_LIBRARY_PATH:-}
+    fi
+fi

package/templates/code/serve CHANGED Viewed

@@ -2,6 +2,9 @@
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
+# CUDA compatibility setup (required for newer SageMaker inference AMIs)
+source /usr/bin/cuda_compat.sh 2>/dev/null || true
 <% if (modelServer === 'vllm') { %>
 echo "Starting vLLM server"
 <% } else if (modelServer === 'sglang') { %>

package/templates/code/serving.properties CHANGED Viewed

@@ -53,6 +53,13 @@ option.chat_template=<%= chatTemplate %>
 # option.gpu_memory_utilization=0.9
 # option.enable_chunked_prefill=true
+<% if (enableLora) { %>
+# LoRA adapter serving configuration
+option.enable_lora=true
+option.max_loras=<%= maxLoras %>
+option.max_cpu_loras=70
+<% } %>
 <% } else if (modelServer === 'djl') { %>
 # DJL Serving Configuration
 # DJL provides flexible model serving with multiple framework support
@@ -94,6 +101,13 @@ option.chat_template=<%= chatTemplate %>
 # option.tensor_parallel_degree=1
 # option.device_map=auto
+<% if (enableLora) { %>
+# LoRA adapter serving configuration
+option.enable_lora=true
+option.max_loras=<%= maxLoras %>
+option.max_cpu_loras=70
+<% } %>
 <% } %>
 # Additional Environment-Specific Configuration

package/templates/code/start_server.sh CHANGED Viewed

@@ -2,6 +2,9 @@
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
+# CUDA compatibility setup (required for newer SageMaker inference AMIs)
+source /usr/bin/cuda_compat.sh 2>/dev/null || true
 set -e
 echo "Starting TensorRT-LLM server on port 8081..."

package/templates/diffusors/Dockerfile CHANGED Viewed

@@ -59,8 +59,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends nginx \
 COPY nginx-diffusors.conf /etc/nginx/nginx.conf
 # Copy serve entrypoint and startup scripts
+COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
 COPY code/serve /usr/bin/serve
-RUN chmod 777 /usr/bin/serve
+RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
 COPY code/start_server.sh /usr/bin/start_server.sh
 RUN chmod +x /usr/bin/start_server.sh

package/templates/diffusors/serve CHANGED Viewed

@@ -2,6 +2,9 @@
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
+# CUDA compatibility setup (required for newer SageMaker inference AMIs)
+source /usr/bin/cuda_compat.sh 2>/dev/null || true
 echo "Starting vLLM-Omni server (diffusion model serving)"
 # Resolve model URI prefixes that engines cannot handle natively.

package/templates/do/README.md CHANGED Viewed

@@ -262,6 +262,39 @@ Clean everything:
 ---
+<% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
+### `./do/benchmark`
+Run SageMaker AI Benchmark against deployed endpoint.
+**What it does:**
+- Verifies endpoint is InService
+- Ensures S3 output bucket exists
+- Creates AI workload configuration
+- Creates and monitors AI benchmark job
+- Displays performance results (throughput, latency P50/P90/P99, TTFT, ITL)
+**Prerequisites:**
+- Endpoint deployed and InService (`./do/deploy`)
+- AWS credentials configured
+**Usage:**
+```bash
+./do/benchmark
+```
+**Clean up benchmark resources:**
+```bash
+./do/benchmark --clean
+```
+**Output:**
+- Benchmark results summary table
+- Detailed results in S3
+---
+<% } %>
 <% if (buildTarget === 'codebuild') { %>
 ### `./do/submit`