@aws/ml-container-creator 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +5 -2
- package/config/bootstrap-stack.json +86 -7
- package/config/defaults.json +1 -1
- package/infra/ci-harness/buildspec.yml +60 -0
- package/package.json +3 -1
- package/servers/README.md +41 -1
- package/servers/instance-sizer/index.js +42 -2
- package/servers/instance-sizer/lib/instance-ranker.js +114 -10
- package/servers/instance-sizer/lib/quota-resolver.js +368 -0
- package/servers/instance-sizer/package.json +2 -0
- package/servers/lib/catalogs/instances.json +527 -12
- package/servers/lib/catalogs/model-servers.json +15 -15
- package/servers/lib/catalogs/model-sizes.json +27 -0
- package/servers/lib/catalogs/models.json +71 -0
- package/servers/lib/schemas/image-catalog.schema.json +9 -1
- package/src/app.js +109 -3
- package/src/lib/bootstrap-command-handler.js +96 -3
- package/src/lib/cli-handler.js +2 -2
- package/src/lib/config-manager.js +117 -1
- package/src/lib/deployment-entry-schema.js +16 -0
- package/src/lib/prompt-runner.js +270 -12
- package/src/lib/prompts.js +288 -6
- package/src/lib/registry-command-handler.js +12 -0
- package/src/lib/schema-sync.js +31 -0
- package/src/lib/template-manager.js +49 -1
- package/src/lib/validate-runner.js +125 -2
- package/templates/Dockerfile +22 -2
- package/templates/code/cuda_compat.sh +22 -0
- package/templates/code/serve +3 -0
- package/templates/code/serving.properties +14 -0
- package/templates/code/start_server.sh +3 -0
- package/templates/diffusors/Dockerfile +2 -1
- package/templates/diffusors/serve +3 -0
- package/templates/do/README.md +33 -0
- package/templates/do/adapter +1214 -0
- package/templates/do/adapters/.gitkeep +2 -0
- package/templates/do/add-ic +130 -0
- package/templates/do/benchmark +718 -0
- package/templates/do/clean +593 -17
- package/templates/do/config +49 -4
- package/templates/do/deploy +513 -362
- package/templates/do/ic/default.conf +32 -0
- package/templates/do/lib/endpoint-config.sh +216 -0
- package/templates/do/lib/inference-component.sh +167 -0
- package/templates/do/lib/secrets.sh +44 -0
- package/templates/do/lib/wait.sh +131 -0
- package/templates/do/logs +107 -27
- package/templates/do/optimize +528 -0
- package/templates/do/register +119 -2
- package/templates/do/status +337 -0
- package/templates/do/test +80 -28
- package/templates/triton/Dockerfile +5 -0
package/templates/Dockerfile
CHANGED
|
@@ -12,6 +12,9 @@
|
|
|
12
12
|
<% if (framework !== 'transformers') { %>
|
|
13
13
|
FROM <%= baseImage || 'public.ecr.aws/docker/library/python:3.12-slim' %>
|
|
14
14
|
|
|
15
|
+
# Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
|
|
16
|
+
ENV PYTHONUNBUFFERED=1
|
|
17
|
+
|
|
15
18
|
# Set a docker label to name this project, suffixed with the build time
|
|
16
19
|
LABEL project.name="<%= projectName %>-<%= buildTimestamp %>" \
|
|
17
20
|
project.base-name="<%= projectName %>" \
|
|
@@ -143,6 +146,9 @@ ARG BASE_IMAGE=<%= baseImage || 'deepjavalibrary/djl-serving:0.36.0-pytorch-gpu'
|
|
|
143
146
|
|
|
144
147
|
FROM ${BASE_IMAGE}
|
|
145
148
|
|
|
149
|
+
# Ensure Python output is unbuffered so SageMaker can capture logs in CloudWatch
|
|
150
|
+
ENV PYTHONUNBUFFERED=1
|
|
151
|
+
|
|
146
152
|
<% if (comments && comments.chatTemplate) { %>
|
|
147
153
|
<%= comments.chatTemplate %>
|
|
148
154
|
<% } %>
|
|
@@ -232,6 +238,18 @@ ENV <%= key %>=<%= value %>
|
|
|
232
238
|
<% }); %>
|
|
233
239
|
<% } %>
|
|
234
240
|
|
|
241
|
+
<% if (enableLora && modelServer === 'vllm') { %>
|
|
242
|
+
# LoRA adapter serving configuration
|
|
243
|
+
ENV VLLM_ENABLE_LORA=true
|
|
244
|
+
ENV VLLM_MAX_LORAS=<%= maxLoras %>
|
|
245
|
+
ENV VLLM_MAX_LORA_RANK=<%= maxLoraRank %>
|
|
246
|
+
<% } %>
|
|
247
|
+
<% if (enableLora && modelServer === 'sglang') { %>
|
|
248
|
+
# LoRA adapter serving configuration
|
|
249
|
+
ENV SGLANG_ENABLE_LORA=true
|
|
250
|
+
ENV SGLANG_MAX_LORAS=<%= maxLoras %>
|
|
251
|
+
<% } %>
|
|
252
|
+
|
|
235
253
|
<% if (typeof modelSource !== 'undefined' && modelSource && modelSource !== 'huggingface' && modelServer !== 'lmi' && modelServer !== 'djl') { %>
|
|
236
254
|
# Install AWS CLI for S3 model downloads
|
|
237
255
|
RUN pip install --no-cache-dir awscli
|
|
@@ -271,8 +289,9 @@ COPY code/serve /usr/bin/serve_trtllm
|
|
|
271
289
|
RUN chmod +x /usr/bin/serve_trtllm
|
|
272
290
|
|
|
273
291
|
# Copy startup script
|
|
292
|
+
COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
|
|
274
293
|
COPY code/start_server.sh /usr/bin/start_server.sh
|
|
275
|
-
RUN chmod +x /usr/bin/start_server.sh
|
|
294
|
+
RUN chmod +x /usr/bin/start_server.sh /usr/bin/cuda_compat.sh
|
|
276
295
|
|
|
277
296
|
ENTRYPOINT [ "/usr/bin/start_server.sh" ]
|
|
278
297
|
<% } else if (modelServer === 'lmi' || modelServer === 'djl') { %>
|
|
@@ -287,8 +306,9 @@ COPY code/serving.properties /opt/ml/model/serving.properties
|
|
|
287
306
|
# LMI/DJL containers use their own entrypoint
|
|
288
307
|
# The container will automatically start DJL Serving with the configuration
|
|
289
308
|
<% } else { %>
|
|
309
|
+
COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
|
|
290
310
|
COPY code/serve /usr/bin/serve
|
|
291
|
-
RUN chmod 777 /usr/bin/serve
|
|
311
|
+
# 755 (owner-writable, world read/execute) is enough to run or source these scripts;
# avoid leaving world-writable files in /usr/bin.
RUN chmod 755 /usr/bin/serve /usr/bin/cuda_compat.sh
|
|
292
312
|
|
|
293
313
|
<% if (comments && comments.troubleshooting) { %>
|
|
294
314
|
<%= comments.troubleshooting %>
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# CUDA Compatibility Setup
|
|
3
|
+
# Required for SageMaker inference AMIs using NVIDIA Container Toolkit 1.17.4+
|
|
4
|
+
# (al2-ami-sagemaker-inference-gpu-2-1, al2-ami-sagemaker-inference-gpu-3-1,
|
|
5
|
+
# al2023-ami-sagemaker-inference-gpu-4-1)
|
|
6
|
+
#
|
|
7
|
+
# These AMIs no longer auto-mount CUDA compat libraries. This script detects
|
|
8
|
+
# whether the host NVIDIA driver is older than what the container's CUDA toolkit
|
|
9
|
+
# requires, and adds the compat libraries to LD_LIBRARY_PATH if needed.
|
|
10
|
+
|
|
11
|
+
# _verlt A B: succeeds (exit 0) only when version A is strictly lower than
# version B under `sort -V` ordering; equal versions return 1.
_verlt() {
|
|
12
|
+
[ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
# Only act when the image provides /usr/local/cuda/compat/libcuda.so.1.
if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
|
|
16
|
+
# Version = everything after the second '.' in the symlink target
# (presumably of the form libcuda.so.<version> — confirm against the base image).
CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-)
|
|
17
|
+
# Version number that follows "Kernel Module" on the NVRM line; left empty
# when /proc/driver/nvidia/version cannot be read (errors are suppressed).
NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
|
|
18
|
+
# Prepend the compat directory only when a driver version was detected and it
# is strictly lower than the compat library version.
if [ -n "$NVIDIA_DRIVER_VERSION" ] && _verlt "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
|
|
19
|
+
echo "CUDA compat: driver ${NVIDIA_DRIVER_VERSION} < ${CUDA_COMPAT_MAX_DRIVER_VERSION}, adding compat libs"
|
|
20
|
+
export LD_LIBRARY_PATH=/usr/local/cuda/compat:${LD_LIBRARY_PATH:-}
|
|
21
|
+
fi
|
|
22
|
+
fi
|
package/templates/code/serve
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
# CUDA compatibility setup (required for newer SageMaker inference AMIs)
|
|
6
|
+
source /usr/bin/cuda_compat.sh 2>/dev/null || true
|
|
7
|
+
|
|
5
8
|
<% if (modelServer === 'vllm') { %>
|
|
6
9
|
echo "Starting vLLM server"
|
|
7
10
|
<% } else if (modelServer === 'sglang') { %>
|
|
@@ -53,6 +53,13 @@ option.chat_template=<%= chatTemplate %>
|
|
|
53
53
|
# option.gpu_memory_utilization=0.9
|
|
54
54
|
# option.enable_chunked_prefill=true
|
|
55
55
|
|
|
56
|
+
<% if (enableLora) { %>
|
|
57
|
+
# LoRA adapter serving configuration
|
|
58
|
+
option.enable_lora=true
|
|
59
|
+
option.max_loras=<%= maxLoras %>
|
|
60
|
+
option.max_cpu_loras=70
|
|
61
|
+
<% } %>
|
|
62
|
+
|
|
56
63
|
<% } else if (modelServer === 'djl') { %>
|
|
57
64
|
# DJL Serving Configuration
|
|
58
65
|
# DJL provides flexible model serving with multiple framework support
|
|
@@ -94,6 +101,13 @@ option.chat_template=<%= chatTemplate %>
|
|
|
94
101
|
# option.tensor_parallel_degree=1
|
|
95
102
|
# option.device_map=auto
|
|
96
103
|
|
|
104
|
+
<% if (enableLora) { %>
|
|
105
|
+
# LoRA adapter serving configuration
|
|
106
|
+
option.enable_lora=true
|
|
107
|
+
option.max_loras=<%= maxLoras %>
|
|
108
|
+
option.max_cpu_loras=70
|
|
109
|
+
<% } %>
|
|
110
|
+
|
|
97
111
|
<% } %>
|
|
98
112
|
|
|
99
113
|
# Additional Environment-Specific Configuration
|
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
# CUDA compatibility setup (required for newer SageMaker inference AMIs)
|
|
6
|
+
source /usr/bin/cuda_compat.sh 2>/dev/null || true
|
|
7
|
+
|
|
5
8
|
set -e
|
|
6
9
|
|
|
7
10
|
echo "Starting TensorRT-LLM server on port 8081..."
|
|
@@ -59,8 +59,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends nginx \
|
|
|
59
59
|
COPY nginx-diffusors.conf /etc/nginx/nginx.conf
|
|
60
60
|
|
|
61
61
|
# Copy serve entrypoint and startup scripts
|
|
62
|
+
COPY code/cuda_compat.sh /usr/bin/cuda_compat.sh
|
|
62
63
|
COPY code/serve /usr/bin/serve
|
|
63
|
-
RUN chmod 777 /usr/bin/serve
|
|
64
|
+
# 755 (owner-writable, world read/execute) is enough to run or source these scripts;
# avoid leaving world-writable files in /usr/bin.
RUN chmod 755 /usr/bin/serve /usr/bin/cuda_compat.sh
|
|
64
65
|
|
|
65
66
|
COPY code/start_server.sh /usr/bin/start_server.sh
|
|
66
67
|
RUN chmod +x /usr/bin/start_server.sh
|
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
# CUDA compatibility setup (required for newer SageMaker inference AMIs)
|
|
6
|
+
source /usr/bin/cuda_compat.sh 2>/dev/null || true
|
|
7
|
+
|
|
5
8
|
echo "Starting vLLM-Omni server (diffusion model serving)"
|
|
6
9
|
|
|
7
10
|
# Resolve model URI prefixes that engines cannot handle natively.
|
package/templates/do/README.md
CHANGED
|
@@ -262,6 +262,39 @@ Clean everything:
|
|
|
262
262
|
|
|
263
263
|
---
|
|
264
264
|
|
|
265
|
+
<% if (typeof includeBenchmark !== 'undefined' && includeBenchmark) { %>
|
|
266
|
+
### `./do/benchmark`
|
|
267
|
+
|
|
268
|
+
Run SageMaker AI Benchmark against the deployed endpoint.
|
|
269
|
+
|
|
270
|
+
**What it does:**
|
|
271
|
+
- Verifies endpoint is InService
|
|
272
|
+
- Ensures S3 output bucket exists
|
|
273
|
+
- Creates AI workload configuration
|
|
274
|
+
- Creates and monitors AI benchmark job
|
|
275
|
+
- Displays performance results (throughput, latency P50/P90/P99, TTFT, ITL)
|
|
276
|
+
|
|
277
|
+
**Prerequisites:**
|
|
278
|
+
- Endpoint deployed and InService (`./do/deploy`)
|
|
279
|
+
- AWS credentials configured
|
|
280
|
+
|
|
281
|
+
**Usage:**
|
|
282
|
+
```bash
|
|
283
|
+
./do/benchmark
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
**Clean up benchmark resources:**
|
|
287
|
+
```bash
|
|
288
|
+
./do/benchmark --clean
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
**Output:**
|
|
292
|
+
- Benchmark results summary table
|
|
293
|
+
- Detailed results in S3
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
<% } %>
|
|
265
298
|
<% if (buildTarget === 'codebuild') { %>
|
|
266
299
|
### `./do/submit`
|
|
267
300
|
|