@aws/ml-container-creator 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/bin/cli.js +5 -2
  2. package/config/bootstrap-stack.json +86 -7
  3. package/config/defaults.json +1 -1
  4. package/infra/ci-harness/buildspec.yml +60 -0
  5. package/package.json +3 -1
  6. package/servers/README.md +41 -1
  7. package/servers/instance-sizer/index.js +42 -2
  8. package/servers/instance-sizer/lib/instance-ranker.js +114 -10
  9. package/servers/instance-sizer/lib/quota-resolver.js +368 -0
  10. package/servers/instance-sizer/package.json +2 -0
  11. package/servers/lib/catalogs/instances.json +527 -12
  12. package/servers/lib/catalogs/model-servers.json +15 -15
  13. package/servers/lib/catalogs/model-sizes.json +27 -0
  14. package/servers/lib/catalogs/models.json +71 -0
  15. package/servers/lib/schemas/image-catalog.schema.json +9 -1
  16. package/src/app.js +109 -3
  17. package/src/lib/bootstrap-command-handler.js +96 -3
  18. package/src/lib/cli-handler.js +2 -2
  19. package/src/lib/config-manager.js +117 -1
  20. package/src/lib/deployment-entry-schema.js +16 -0
  21. package/src/lib/prompt-runner.js +270 -12
  22. package/src/lib/prompts.js +288 -6
  23. package/src/lib/registry-command-handler.js +12 -0
  24. package/src/lib/schema-sync.js +31 -0
  25. package/src/lib/template-manager.js +49 -1
  26. package/src/lib/validate-runner.js +125 -2
  27. package/templates/Dockerfile +22 -2
  28. package/templates/code/cuda_compat.sh +22 -0
  29. package/templates/code/serve +3 -0
  30. package/templates/code/serving.properties +14 -0
  31. package/templates/code/start_server.sh +3 -0
  32. package/templates/diffusors/Dockerfile +2 -1
  33. package/templates/diffusors/serve +3 -0
  34. package/templates/do/README.md +33 -0
  35. package/templates/do/adapter +1214 -0
  36. package/templates/do/adapters/.gitkeep +2 -0
  37. package/templates/do/add-ic +130 -0
  38. package/templates/do/benchmark +718 -0
  39. package/templates/do/clean +593 -17
  40. package/templates/do/config +49 -4
  41. package/templates/do/deploy +513 -362
  42. package/templates/do/ic/default.conf +32 -0
  43. package/templates/do/lib/endpoint-config.sh +216 -0
  44. package/templates/do/lib/inference-component.sh +167 -0
  45. package/templates/do/lib/secrets.sh +44 -0
  46. package/templates/do/lib/wait.sh +131 -0
  47. package/templates/do/logs +107 -27
  48. package/templates/do/optimize +528 -0
  49. package/templates/do/register +119 -2
  50. package/templates/do/status +337 -0
  51. package/templates/do/test +80 -28
  52. package/templates/triton/Dockerfile +5 -0
@@ -12,6 +12,9 @@
12
12
  <% if (framework !== 'transformers') { %>
13
13
  FROM <%= baseImage || 'public.ecr.aws/docker/library/python:3.12-slim' %>
14
14
 
15
+ # Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
16
+ ENV PYTHONUNBUFFERED=1
17
+
15
18
  # Set a docker label to name this project, suffixed with the build time
16
19
  LABEL project.name="<%= projectName %>-<%= buildTimestamp %>" \
17
20
  project.base-name="<%= projectName %>" \
@@ -143,6 +146,9 @@ ARG BASE_IMAGE=<%= baseImage || 'deepjavalibrary/djl-serving:0.36.0-pytorch-gpu'
143
146
 
144
147
  FROM ${BASE_IMAGE}
145
148
 
149
+ # Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
150
+ ENV PYTHONUNBUFFERED=1
151
+
146
152
  <% if (comments && comments.chatTemplate) { %>
147
153
  <%= comments.chatTemplate %>
148
154
  <% } %>
@@ -232,6 +238,18 @@ ENV <%= key %>=<%= value %>
232
238
  <% }); %>
233
239
  <% } %>
234
240
 
241
+ <% if (enableLora && modelServer === 'vllm') { %>
242
+ # LoRA adapter serving configuration
243
+ ENV VLLM_ENABLE_LORA=true
244
+ ENV VLLM_MAX_LORAS=<%= maxLoras %>
245
+ ENV VLLM_MAX_LORA_RANK=<%= maxLoraRank %>
246
+ <% } %>
247
+ <% if (enableLora && modelServer === 'sglang') { %>
248
+ # LoRA adapter serving configuration
249
+ ENV SGLANG_ENABLE_LORA=true
250
+ ENV SGLANG_MAX_LORAS=<%= maxLoras %>
251
+ <% } %>
252
+
235
253
  <% if (typeof modelSource !== 'undefined' && modelSource && modelSource !== 'huggingface' && modelServer !== 'lmi' && modelServer !== 'djl') { %>
236
254
  # Install AWS CLI for S3 model downloads
237
255
  RUN pip install --no-cache-dir awscli
@@ -271,8 +289,9 @@ COPY code/serve /usr/bin/serve_trtllm
271
289
  RUN chmod +x /usr/bin/serve_trtllm
272
290
 
273
291
  # Copy startup script
292
+ COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
274
293
  COPY code/start_server.sh /usr/bin/start_server.sh
275
- RUN chmod +x /usr/bin/start_server.sh
294
+ RUN chmod +x /usr/bin/start_server.sh /usr/bin/cuda_compat.sh
276
295
 
277
296
  ENTRYPOINT [ "/usr/bin/start_server.sh" ]
278
297
  <% } else if (modelServer === 'lmi' || modelServer === 'djl') { %>
@@ -287,8 +306,9 @@ COPY code/serving.properties /opt/ml/model/serving.properties
287
306
  # LMI/DJL containers use their own entrypoint
288
307
  # The container will automatically start DJL Serving with the configuration
289
308
  <% } else { %>
309
+ COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
290
310
  COPY code/serve /usr/bin/serve
291
- RUN chmod 777 /usr/bin/serve
311
+ RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
292
312
 
293
313
  <% if (comments && comments.troubleshooting) { %>
294
314
  <%= comments.troubleshooting %>
@@ -0,0 +1,22 @@
1
+ #!/bin/bash
2
+ # CUDA Compatibility Setup
3
+ # Required for SageMaker inference AMIs using NVIDIA Container Toolkit 1.17.4+
4
+ # (al2-ami-sagemaker-inference-gpu-2-1, al2-ami-sagemaker-inference-gpu-3-1,
5
+ # al2023-ami-sagemaker-inference-gpu-4-1)
6
+ #
7
+ # These AMIs no longer auto-mount CUDA compat libraries. This script detects
8
+ # whether the host NVIDIA driver is older than what the container's CUDA toolkit
9
+ # requires, and adds the compat libraries to LD_LIBRARY_PATH if needed.
10
+
11
+ _verlt() {
12
+ [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
13
+ }
14
+
15
+ if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
16
+ CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-)
17
+ NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
18
+ if [ -n "$NVIDIA_DRIVER_VERSION" ] && _verlt "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
19
+ echo "CUDA compat: driver ${NVIDIA_DRIVER_VERSION} < ${CUDA_COMPAT_MAX_DRIVER_VERSION}, adding compat libs"
20
+ export LD_LIBRARY_PATH=/usr/local/cuda/compat:${LD_LIBRARY_PATH:-}
21
+ fi
22
+ fi
@@ -2,6 +2,9 @@
2
2
  # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ # CUDA compatibility setup (required for newer SageMaker inference AMIs)
6
+ source /usr/bin/cuda_compat.sh 2>/dev/null || true
7
+
5
8
  <% if (modelServer === 'vllm') { %>
6
9
  echo "Starting vLLM server"
7
10
  <% } else if (modelServer === 'sglang') { %>
@@ -53,6 +53,13 @@ option.chat_template=<%= chatTemplate %>
53
53
  # option.gpu_memory_utilization=0.9
54
54
  # option.enable_chunked_prefill=true
55
55
 
56
+ <% if (enableLora) { %>
57
+ # LoRA adapter serving configuration
58
+ option.enable_lora=true
59
+ option.max_loras=<%= maxLoras %>
60
+ option.max_cpu_loras=70
61
+ <% } %>
62
+
56
63
  <% } else if (modelServer === 'djl') { %>
57
64
  # DJL Serving Configuration
58
65
  # DJL provides flexible model serving with multiple framework support
@@ -94,6 +101,13 @@ option.chat_template=<%= chatTemplate %>
94
101
  # option.tensor_parallel_degree=1
95
102
  # option.device_map=auto
96
103
 
104
+ <% if (enableLora) { %>
105
+ # LoRA adapter serving configuration
106
+ option.enable_lora=true
107
+ option.max_loras=<%= maxLoras %>
108
+ option.max_cpu_loras=70
109
+ <% } %>
110
+
97
111
  <% } %>
98
112
 
99
113
  # Additional Environment-Specific Configuration
@@ -2,6 +2,9 @@
2
2
  # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ # CUDA compatibility setup (required for newer SageMaker inference AMIs)
6
+ source /usr/bin/cuda_compat.sh 2>/dev/null || true
7
+
5
8
  set -e
6
9
 
7
10
  echo "Starting TensorRT-LLM server on port 8081..."
@@ -59,8 +59,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends nginx \
59
59
  COPY nginx-diffusors.conf /etc/nginx/nginx.conf
60
60
 
61
61
  # Copy serve entrypoint and startup scripts
62
+ COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
62
63
  COPY code/serve /usr/bin/serve
63
- RUN chmod 777 /usr/bin/serve
64
+ RUN chmod 777 /usr/bin/serve /usr/bin/cuda_compat.sh
64
65
 
65
66
  COPY code/start_server.sh /usr/bin/start_server.sh
66
67
  RUN chmod +x /usr/bin/start_server.sh
@@ -2,6 +2,9 @@
2
2
  # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ # CUDA compatibility setup (required for newer SageMaker inference AMIs)
6
+ source /usr/bin/cuda_compat.sh 2>/dev/null || true
7
+
5
8
  echo "Starting vLLM-Omni server (diffusion model serving)"
6
9
 
7
10
  # Resolve model URI prefixes that engines cannot handle natively.
@@ -262,6 +262,39 @@ Clean everything:
262
262
 
263
263
  ---
264
264
 
265
+ <% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
266
+ ### `./do/benchmark`
267
+
268
+ Run a SageMaker AI benchmark against the deployed endpoint.
269
+
270
+ **What it does:**
271
+ - Verifies endpoint is InService
272
+ - Ensures S3 output bucket exists
273
+ - Creates AI workload configuration
274
+ - Creates and monitors AI benchmark job
275
+ - Displays performance results (throughput, latency P50/P90/P99, TTFT, ITL)
276
+
277
+ **Prerequisites:**
278
+ - Endpoint deployed and InService (`./do/deploy`)
279
+ - AWS credentials configured
280
+
281
+ **Usage:**
282
+ ```bash
283
+ ./do/benchmark
284
+ ```
285
+
286
+ **Clean up benchmark resources:**
287
+ ```bash
288
+ ./do/benchmark --clean
289
+ ```
290
+
291
+ **Output:**
292
+ - Benchmark results summary table
293
+ - Detailed results in S3
294
+
295
+ ---
296
+
297
+ <% } %>
265
298
  <% if (buildTarget === 'codebuild') { %>
266
299
  ### `./do/submit`
267
300