@aws/ml-container-creator 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/LICENSE-THIRD-PARTY +68620 -0
- package/NOTICE +2 -0
- package/README.md +106 -0
- package/bin/cli.js +365 -0
- package/config/defaults.json +32 -0
- package/config/presets/transformers-djl.json +26 -0
- package/config/presets/transformers-gpu.json +24 -0
- package/config/presets/transformers-lmi.json +27 -0
- package/package.json +129 -0
- package/servers/README.md +419 -0
- package/servers/base-image-picker/catalogs/model-servers.json +1191 -0
- package/servers/base-image-picker/catalogs/python-slim.json +38 -0
- package/servers/base-image-picker/catalogs/triton-backends.json +51 -0
- package/servers/base-image-picker/catalogs/triton.json +38 -0
- package/servers/base-image-picker/index.js +495 -0
- package/servers/base-image-picker/manifest.json +17 -0
- package/servers/base-image-picker/package.json +15 -0
- package/servers/hyperpod-cluster-picker/LICENSE +202 -0
- package/servers/hyperpod-cluster-picker/index.js +424 -0
- package/servers/hyperpod-cluster-picker/manifest.json +14 -0
- package/servers/hyperpod-cluster-picker/package.json +17 -0
- package/servers/instance-recommender/LICENSE +202 -0
- package/servers/instance-recommender/catalogs/instances.json +852 -0
- package/servers/instance-recommender/index.js +284 -0
- package/servers/instance-recommender/manifest.json +16 -0
- package/servers/instance-recommender/package.json +15 -0
- package/servers/lib/LICENSE +202 -0
- package/servers/lib/bedrock-client.js +160 -0
- package/servers/lib/custom-validators.js +46 -0
- package/servers/lib/dynamic-resolver.js +36 -0
- package/servers/lib/package.json +11 -0
- package/servers/lib/schemas/image-catalog.schema.json +185 -0
- package/servers/lib/schemas/instances.schema.json +124 -0
- package/servers/lib/schemas/manifest.schema.json +64 -0
- package/servers/lib/schemas/model-catalog.schema.json +91 -0
- package/servers/lib/schemas/regions.schema.json +26 -0
- package/servers/lib/schemas/triton-backends.schema.json +51 -0
- package/servers/model-picker/catalogs/jumpstart-public.json +66 -0
- package/servers/model-picker/catalogs/popular-diffusors.json +88 -0
- package/servers/model-picker/catalogs/popular-transformers.json +226 -0
- package/servers/model-picker/index.js +1693 -0
- package/servers/model-picker/manifest.json +18 -0
- package/servers/model-picker/package.json +20 -0
- package/servers/region-picker/LICENSE +202 -0
- package/servers/region-picker/catalogs/regions.json +263 -0
- package/servers/region-picker/index.js +230 -0
- package/servers/region-picker/manifest.json +16 -0
- package/servers/region-picker/package.json +15 -0
- package/src/app.js +1007 -0
- package/src/copy-tpl.js +77 -0
- package/src/lib/accelerator-validator.js +39 -0
- package/src/lib/asset-manager.js +385 -0
- package/src/lib/aws-profile-parser.js +181 -0
- package/src/lib/bootstrap-command-handler.js +1647 -0
- package/src/lib/bootstrap-config.js +238 -0
- package/src/lib/ci-register-helpers.js +124 -0
- package/src/lib/ci-report-helpers.js +158 -0
- package/src/lib/ci-stage-helpers.js +268 -0
- package/src/lib/cli-handler.js +529 -0
- package/src/lib/comment-generator.js +544 -0
- package/src/lib/community-reports-validator.js +91 -0
- package/src/lib/config-manager.js +2106 -0
- package/src/lib/configuration-exporter.js +204 -0
- package/src/lib/configuration-manager.js +695 -0
- package/src/lib/configuration-matcher.js +221 -0
- package/src/lib/cpu-validator.js +36 -0
- package/src/lib/cuda-validator.js +57 -0
- package/src/lib/deployment-config-resolver.js +103 -0
- package/src/lib/deployment-entry-schema.js +125 -0
- package/src/lib/deployment-registry.js +598 -0
- package/src/lib/docker-introspection-validator.js +51 -0
- package/src/lib/engine-prefix-resolver.js +60 -0
- package/src/lib/huggingface-client.js +172 -0
- package/src/lib/key-value-parser.js +37 -0
- package/src/lib/known-flags-validator.js +200 -0
- package/src/lib/manifest-cli.js +280 -0
- package/src/lib/mcp-client.js +303 -0
- package/src/lib/mcp-command-handler.js +532 -0
- package/src/lib/neuron-validator.js +80 -0
- package/src/lib/parameter-schema-validator.js +284 -0
- package/src/lib/prompt-runner.js +1349 -0
- package/src/lib/prompts.js +1138 -0
- package/src/lib/registry-command-handler.js +519 -0
- package/src/lib/registry-loader.js +198 -0
- package/src/lib/rocm-validator.js +80 -0
- package/src/lib/schema-validator.js +157 -0
- package/src/lib/sensitive-redactor.js +59 -0
- package/src/lib/template-engine.js +156 -0
- package/src/lib/template-manager.js +341 -0
- package/src/lib/validation-engine.js +314 -0
- package/src/prompt-adapter.js +63 -0
- package/templates/Dockerfile +300 -0
- package/templates/IAM_PERMISSIONS.md +84 -0
- package/templates/MIGRATION.md +488 -0
- package/templates/PROJECT_README.md +439 -0
- package/templates/TEMPLATE_SYSTEM.md +243 -0
- package/templates/buildspec.yml +64 -0
- package/templates/code/chat_template.jinja +1 -0
- package/templates/code/flask/gunicorn_config.py +35 -0
- package/templates/code/flask/wsgi.py +10 -0
- package/templates/code/model_handler.py +387 -0
- package/templates/code/serve +300 -0
- package/templates/code/serve.py +175 -0
- package/templates/code/serving.properties +105 -0
- package/templates/code/start_server.py +39 -0
- package/templates/code/start_server.sh +39 -0
- package/templates/diffusors/Dockerfile +72 -0
- package/templates/diffusors/patch_image_api.py +35 -0
- package/templates/diffusors/serve +115 -0
- package/templates/diffusors/start_server.sh +114 -0
- package/templates/do/.gitkeep +1 -0
- package/templates/do/README.md +541 -0
- package/templates/do/build +83 -0
- package/templates/do/ci +681 -0
- package/templates/do/clean +811 -0
- package/templates/do/config +260 -0
- package/templates/do/deploy +1560 -0
- package/templates/do/export +306 -0
- package/templates/do/logs +319 -0
- package/templates/do/manifest +12 -0
- package/templates/do/push +119 -0
- package/templates/do/register +580 -0
- package/templates/do/run +113 -0
- package/templates/do/submit +417 -0
- package/templates/do/test +1147 -0
- package/templates/hyperpod/configmap.yaml +24 -0
- package/templates/hyperpod/deployment.yaml +71 -0
- package/templates/hyperpod/pvc.yaml +42 -0
- package/templates/hyperpod/service.yaml +17 -0
- package/templates/nginx-diffusors.conf +74 -0
- package/templates/nginx-predictors.conf +47 -0
- package/templates/nginx-tensorrt.conf +74 -0
- package/templates/requirements.txt +61 -0
- package/templates/sample_model/test_inference.py +123 -0
- package/templates/sample_model/train_abalone.py +252 -0
- package/templates/test/test_endpoint.sh +79 -0
- package/templates/test/test_local_image.sh +80 -0
- package/templates/test/test_model_handler.py +180 -0
- package/templates/triton/Dockerfile +128 -0
- package/templates/triton/config.pbtxt +163 -0
- package/templates/triton/model.py +130 -0
- package/templates/triton/requirements.txt +11 -0
|
@@ -0,0 +1,1560 @@
|
|
|
1
|
+
#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

set -e
set -u
set -o pipefail

# Command-line flags. Default (no flags): resume from the last run.
FORCE_NEW=false   # --force: create a brand-new endpoint and IC
FORCE_IC=false    # --force-ic: recreate only the IC on the existing endpoint
for flag in "$@"; do
  if [ "$flag" = "--force" ]; then
    FORCE_NEW=true
  elif [ "$flag" = "--force-ic" ]; then
    FORCE_IC=true
  elif [ "$flag" = "--help" ] || [ "$flag" = "-h" ]; then
    echo "Usage: ./do/deploy [--force] [--force-ic]"
    echo ""
    echo "Options:"
    echo " --force Create a new endpoint and IC, even if one already exists."
    echo " --force-ic Recreate just the IC on the existing endpoint."
    echo ""
    echo "Without flags, deploy resumes from the last run."
    exit 0
  fi
done
|
|
28
|
+
|
|
29
|
+
# Source configuration
# Resolve this script's own directory so sibling files (config, manifest, ...)
# are found regardless of the caller's working directory, then load project
# settings (PROJECT_NAME, AWS_REGION, ECR_REPOSITORY_NAME, INSTANCE_TYPE, ...).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/config"

# Print the effective settings so the operator can sanity-check them
# before any AWS resources are created.
echo "🚀 Deploying to AWS"
echo " Project: ${PROJECT_NAME}"
echo " Deployment config: ${DEPLOYMENT_CONFIG}"
echo " Region: ${AWS_REGION}"
echo " Build target: ${BUILD_TARGET}"
echo " Deployment target: ${DEPLOYMENT_TARGET}"
# EJS template conditionals: only the branch matching the generator-time
# `deploymentTarget` value is rendered into the final script.
<% if (deploymentTarget === 'managed-inference') { %>
echo " Instance type: ${INSTANCE_TYPE}"
<% } else if (deploymentTarget === 'async-inference') { %>
echo " Instance type: ${INSTANCE_TYPE}"
echo " S3 output: ${ASYNC_S3_OUTPUT_PATH}"
echo " SNS success: ${ASYNC_SNS_SUCCESS_TOPIC}"
echo " SNS error: ${ASYNC_SNS_ERROR_TOPIC}"
<% if (asyncMaxConcurrentInvocations) { %>
echo " Max concurrent: ${ASYNC_MAX_CONCURRENT_INVOCATIONS}"
<% } %>
<% } else if (deploymentTarget === 'hyperpod-eks') { %>
echo " HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
echo " Namespace: ${HYPERPOD_NAMESPACE}"
echo " Replicas: ${HYPERPOD_REPLICAS}"
<% } else if (deploymentTarget === 'batch-transform') { %>
echo " Instance type: ${INSTANCE_TYPE}"
echo " S3 input: ${BATCH_INPUT_PATH}"
echo " S3 output: ${BATCH_OUTPUT_PATH}"
echo " Instance count: ${BATCH_INSTANCE_COUNT}"
echo " Split type: ${BATCH_SPLIT_TYPE}"
echo " Strategy: ${BATCH_STRATEGY}"
<% } %>
|
|
61
|
+
|
|
62
|
+
# Check AWS credentials
# Fail fast before creating anything; sts get-caller-identity is a cheap
# no-op call that only succeeds with working credentials.
echo "🔍 Validating AWS credentials..."
if ! aws sts get-caller-identity &> /dev/null; then
  echo "❌ AWS credentials not configured"
  echo " Run: aws configure"
  echo " Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
  exit 4
fi

# Account ID is needed to build the ECR URL and SageMaker ARNs below.
AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
echo "✅ AWS credentials validated (Account: ${AWS_ACCOUNT_ID})"

# Construct ECR repository URL
ECR_REPOSITORY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}"

# ============================================================
# Shared: Verify ECR image exists
# ============================================================
# Deploy is pointless without a pushed image; direct the user to ./do/submit.
echo "🔍 Verifying ECR image exists..."
if ! aws ecr describe-images \
  --repository-name "${ECR_REPOSITORY_NAME}" \
  --image-ids imageTag="${PROJECT_NAME}-latest" \
  --region "${AWS_REGION}" &> /dev/null; then

  echo "❌ ECR image not found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
  echo ""
  echo "Please build and push your image first:"
  echo " ./do/submit"
  echo ""
  echo "After the build completes successfully, run this deploy script again."
  exit 4
fi

echo "✅ ECR image found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
IMAGE_TAG="${PROJECT_NAME}-latest"
|
|
97
|
+
|
|
98
|
+
<% if (deploymentTarget === 'managed-inference') { %>
# ============================================================
# SageMaker Managed Inference Deployment (Inference Components)
# ============================================================

# Validate execution role ARN
# ROLE_ARN is required: SageMaker assumes this role to pull the ECR image
# and run the endpoint. Exit code 3 marks "missing configuration", distinct
# from exit 4 used for AWS-side failures elsewhere in this script.
if [ -z "${ROLE_ARN:-}" ]; then
  echo "❌ Execution role ARN not provided"
  echo ""
  echo "Usage:"
  echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
  echo " ./do/deploy"
  echo ""
  echo "Or set ROLE_ARN in do/config"
  echo ""
  echo "The execution role must have permissions for:"
  echo " • SageMaker endpoint and inference component management"
  echo " • ECR image access"
  echo " • S3 access (if using model artifacts)"
  echo " • CloudWatch Logs"
  exit 3
fi

echo " Using execution role: ${ROLE_ARN}"
|
|
122
|
+
|
|
123
|
+
# Persist an `export NAME="VALUE"` line into do/config so later runs and
# sibling scripts (test, clean, register) pick the value up. Rewrites the
# line in place when present, otherwise appends a new one.
_update_config_var() {
  local key="$1"
  local value="$2"
  local cfg="${SCRIPT_DIR}/config"
  if grep -q "^export ${key}=" "${cfg}" 2>/dev/null; then
    # .bak suffix keeps `sed -i` portable across BSD and GNU sed.
    sed -i.bak "s|^export ${key}=.*|export ${key}=\"${value}\"|" "${cfg}"
    rm -f "${cfg}.bak"
  else
    printf '\n' >> "${cfg}"
    printf 'export %s="%s"\n' "${key}" "${value}" >> "${cfg}"
  fi
}
|
|
134
|
+
|
|
135
|
+
# Return the EndpointStatus of the named SageMaker endpoint on stdout.
# Prints an empty string (instead of failing) when the endpoint does not
# exist or the describe call errors, so callers can branch on the value.
_get_endpoint_status() {
  local endpoint="$1"
  aws sagemaker describe-endpoint --endpoint-name "${endpoint}" \
    --region "${AWS_REGION}" --query EndpointStatus --output text \
    2>/dev/null || echo ""
}
|
|
143
|
+
|
|
144
|
+
# Return the InferenceComponentStatus of the named inference component on
# stdout; empty string when it does not exist or the call fails.
_get_ic_status() {
  local component="$1"
  aws sagemaker describe-inference-component \
    --inference-component-name "${component}" \
    --region "${AWS_REGION}" \
    --query InferenceComponentStatus --output text \
    2>/dev/null || echo ""
}
|
|
151
|
+
|
|
152
|
+
# Print the name of the first InService inference component attached to the
# given endpoint, or an empty string when none exists / the call fails.
# NB: with --output text the CLI prints the literal "None" for a null
# result, which callers must filter out themselves.
_find_active_ic_on_endpoint() {
  local endpoint="$1"
  aws sagemaker list-inference-components \
    --endpoint-name "${endpoint}" \
    --status-equals InService \
    --region "${AWS_REGION}" \
    --query 'InferenceComponents[0].InferenceComponentName' \
    --output text 2>/dev/null || echo ""
}
|
|
161
|
+
|
|
162
|
+
# ============================================================
# Idempotency: check for existing deployment from a previous run
# ============================================================
# SKIP_TO tells the later steps where to resume:
#   ""            - full run (create config, endpoint, and IC)
#   wait_endpoint - endpoint exists but is still Creating/Updating
#   create_ic     - endpoint is live; only the IC needs (re)creating
#   wait_ic       - IC exists and is still Creating
SKIP_TO=""

# Print the standard "nothing to do" summary for an already-live
# deployment. $1 = the InService inference component name.
_report_already_live() {
  echo ""
  echo "🎉 Deployment is already live. Nothing to do."
  echo " Endpoint: ${ENDPOINT_NAME}"
  echo " Inference Component: $1"
  echo ""
  echo "🧪 Test your endpoint:"
  echo " ./do/test"
  echo ""
  echo "🧹 Clean up when done:"
  echo " ./do/clean endpoint"
}

if [ "${FORCE_NEW}" = true ]; then
  echo "🔄 --force: ignoring previous deployment, creating new resources."
elif [ "${FORCE_IC}" = true ] && [ -n "${ENDPOINT_NAME:-}" ]; then
  # --force-ic only makes sense against a healthy endpoint.
  EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
  if [ "${EP_STATUS}" = "InService" ]; then
    echo "🔄 --force-ic: recreating inference component on existing endpoint: ${ENDPOINT_NAME}"
    SKIP_TO="create_ic"
  else
    echo "⚠️ --force-ic requires an InService endpoint, but ${ENDPOINT_NAME} is: ${EP_STATUS:-not found}"
    echo " Use --force to create a new endpoint, or wait for the current one."
    exit 4
  fi
elif [ -n "${ENDPOINT_NAME:-}" ]; then
  echo "🔍 Checking for existing deployment: ${ENDPOINT_NAME}"

  EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")

  case "${EP_STATUS}" in
    InService)
      echo "✅ Endpoint already InService: ${ENDPOINT_NAME}"

      # Check inference component
      if [ -n "${INFERENCE_COMPONENT_NAME:-}" ]; then
        IC_STATUS=$(_get_ic_status "${INFERENCE_COMPONENT_NAME}")

        case "${IC_STATUS}" in
          InService)
            # Fully deployed from a previous run: report and stop.
            echo "✅ Inference component already InService: ${INFERENCE_COMPONENT_NAME}"
            _report_already_live "${INFERENCE_COMPONENT_NAME}"
            exit 0
            ;;
          Creating)
            echo "⏳ Inference component still creating: ${INFERENCE_COMPONENT_NAME}"
            SKIP_TO="wait_ic"
            IC_NAME="${INFERENCE_COMPONENT_NAME}"
            ;;
          Failed)
            echo "⚠️ Inference component failed: ${INFERENCE_COMPONENT_NAME}"
            echo " Will create a new inference component on the existing endpoint."
            SKIP_TO="create_ic"
            ;;
          *)
            # Stored IC not found — check if a different IC is running on this endpoint
            LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
            if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
              echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
              echo " (config had stale reference: ${INFERENCE_COMPONENT_NAME})"
              _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
              _report_already_live "${LIVE_IC}"
              exit 0
            else
              echo " No existing inference component found on endpoint. Will create one."
              SKIP_TO="create_ic"
            fi
            ;;
        esac
      else
        # No IC name in config — check if one is already running on the endpoint
        LIVE_IC=$(_find_active_ic_on_endpoint "${ENDPOINT_NAME}")
        if [ -n "${LIVE_IC}" ] && [ "${LIVE_IC}" != "None" ]; then
          echo "✅ Found running inference component on endpoint: ${LIVE_IC}"
          _update_config_var "INFERENCE_COMPONENT_NAME" "${LIVE_IC}"
          _report_already_live "${LIVE_IC}"
          exit 0
        else
          SKIP_TO="create_ic"
        fi
      fi
      ;;
    Creating|Updating)
      echo "⏳ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
      SKIP_TO="wait_endpoint"
      ;;
    Failed)
      echo "⚠️ Previous endpoint failed: ${ENDPOINT_NAME}"
      echo " Creating a new deployment. Clean up the failed endpoint with:"
      echo " ./do/clean endpoint"
      echo ""
      # Fall through to create new resources
      ;;
    "")
      echo " Previous endpoint not found (may have been cleaned up). Creating new deployment."
      ;;
    *)
      echo " Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
      ;;
  esac
fi
|
|
282
|
+
|
|
283
|
+
# ============================================================
# Step 1: Create endpoint configuration (skip if resuming)
# ============================================================
if [ -z "${SKIP_TO}" ]; then
  # Timestamp-suffixed names keep retries from colliding with older
  # resources; persist them so later runs and ./do/clean can find them.
  TIMESTAMP=$(date +%s)
  ENDPOINT_CONFIG_NAME="${PROJECT_NAME}-epc-${TIMESTAMP}"
  ENDPOINT_NAME="${PROJECT_NAME}-endpoint-${TIMESTAMP}"
  IC_NAME="${PROJECT_NAME}-ic-${TIMESTAMP}"

  _update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
  _update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
  _update_config_var "INFERENCE_COMPONENT_NAME" "${IC_NAME}"

  # Build production variant JSON
  VARIANT_JSON="[{\"VariantName\":\"AllTraffic\",\"InstanceType\":\"${INSTANCE_TYPE}\",\"InitialInstanceCount\":1"

  # InferenceAmiVersion is optional; include it only when configured.
  if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
    VARIANT_JSON="${VARIANT_JSON},\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
    echo " AMI version: ${INFERENCE_AMI_VERSION}"
  fi

  VARIANT_JSON="${VARIANT_JSON}}]"

  echo "⚙️ Creating endpoint configuration: ${ENDPOINT_CONFIG_NAME}"
  if ! aws sagemaker create-endpoint-config \
    --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
    --execution-role-arn "${ROLE_ARN}" \
    --production-variants "${VARIANT_JSON}" \
    --region "${AWS_REGION}"; then

    echo "❌ Failed to create endpoint configuration"
    echo " Check that:"
    echo " • The execution role ARN is valid"
    echo " • The instance type is valid: ${INSTANCE_TYPE}"
    echo " • The instance type is available in region: ${AWS_REGION}"
    echo " • You have sufficient service quota for the instance type"
    exit 4
  fi

  echo "✅ Endpoint configuration created: ${ENDPOINT_CONFIG_NAME}"

  # Record endpoint config in manifest (non-blocking).
  # Invoke via SCRIPT_DIR (not ./do) so this works from any CWD,
  # consistent with how config is sourced above.
  ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
  "${SCRIPT_DIR}/manifest" add \
    --type sagemaker-endpoint-config \
    --id "${ENDPOINT_CONFIG_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true

  # Step 2: Create endpoint
  echo "🚀 Creating endpoint: ${ENDPOINT_NAME}"
  if ! aws sagemaker create-endpoint \
    --endpoint-name "${ENDPOINT_NAME}" \
    --endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
    --region "${AWS_REGION}"; then

    echo "❌ Failed to create endpoint"
    echo " Check that:"
    echo " • Your IAM credentials have sagemaker:CreateEndpoint permission"
    echo " • You have sufficient service quota in region: ${AWS_REGION}"
    exit 4
  fi

  echo "✅ Endpoint creation initiated: ${ENDPOINT_NAME}"

  # Record endpoint in manifest (non-blocking)
  ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
  "${SCRIPT_DIR}/manifest" add \
    --type sagemaker-endpoint \
    --id "${ENDPOINT_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
fi
|
|
358
|
+
|
|
359
|
+
# ============================================================
# Wait for endpoint (skip if already InService)
# ============================================================
if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
  echo "⏳ Waiting for endpoint to reach InService status..."
  echo " This may take a few minutes..."
  echo " If this times out, re-run ./do/deploy to resume."

  if ! aws sagemaker wait endpoint-in-service \
    --endpoint-name "${ENDPOINT_NAME}" \
    --region "${AWS_REGION}"; then

    # Check if it was a credential expiration vs actual failure:
    # if the endpoint is still Creating, the waiter itself died (expired
    # credentials or waiter timeout), not the endpoint — so the run is
    # resumable rather than failed.
    EP_CHECK=$(_get_endpoint_status "${ENDPOINT_NAME}" 2>/dev/null)
    if [ "${EP_CHECK}" = "Creating" ]; then
      echo ""
      echo "⚠️ Wait interrupted (credentials may have expired), but endpoint is still creating."
      echo " Refresh your credentials and re-run ./do/deploy to resume."
      echo ""
      echo " Or check status manually:"
      echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION} --query EndpointStatus"
      exit 4
    fi

    echo "❌ Endpoint failed to reach InService status"
    echo " Check CloudWatch Logs for details:"
    echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME}"
    exit 4
  fi

  echo "✅ Endpoint is InService: ${ENDPOINT_NAME}"
fi
|
|
391
|
+
|
|
392
|
+
# ============================================================
# Step 3: Create inference component (skip if resuming from wait_ic)
# ============================================================
if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
  # Generate new IC name if resuming after endpoint wait or failed IC
  if [ "${SKIP_TO}" = "create_ic" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
    TIMESTAMP=$(date +%s)
    IC_NAME="${PROJECT_NAME}-ic-${TIMESTAMP}"
    _update_config_var "INFERENCE_COMPONENT_NAME" "${IC_NAME}"
  fi

  echo "📦 Creating inference component: ${IC_NAME}"
  # 900s startup health-check budget: model containers can take a long time
  # to pull the image and load weights before answering health checks.
  if ! aws sagemaker create-inference-component \
    --inference-component-name "${IC_NAME}" \
    --endpoint-name "${ENDPOINT_NAME}" \
    --variant-name "AllTraffic" \
    --specification "{
      \"Container\": {
        \"Image\": \"${ECR_REPOSITORY}:${IMAGE_TAG}\"
      },
      \"StartupParameters\": {
        \"ContainerStartupHealthCheckTimeoutInSeconds\": 900
      },
      \"ComputeResourceRequirements\": {
        \"NumberOfAcceleratorDevicesRequired\": 1,
        \"MinMemoryRequiredInMb\": 1024
      }
    }" \
    --runtime-config "{\"CopyCount\": 1}" \
    --region "${AWS_REGION}"; then

    echo "❌ Failed to create inference component"
    echo " Check that:"
    echo " • The ECR image exists and is accessible"
    echo " • The endpoint is in InService status"
    echo " • The compute resource requirements fit the instance type: ${INSTANCE_TYPE}"
    exit 4
  fi

  echo "✅ Inference component creation initiated: ${IC_NAME}"

  # Record inference component in manifest (non-blocking).
  # Invoke via SCRIPT_DIR (not ./do) so this works from any CWD,
  # consistent with how config is sourced above.
  IC_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:inference-component/${IC_NAME}"
  "${SCRIPT_DIR}/manifest" add \
    --type sagemaker-inference-component \
    --id "${IC_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"inferenceComponentName\":\"${IC_NAME}\",\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
fi
|
|
442
|
+
|
|
443
|
+
# ============================================================
# Wait for inference component
# ============================================================
echo "⏳ Waiting for inference component to reach InService status..."
echo " This may take 5-10 minutes..."
echo " If this times out, re-run ./do/deploy to resume."

# Poll loop — replaces `aws sagemaker wait inference-component-in-service`
# which is only available in AWS CLI v2.15+
IC_WAIT_TIMEOUT=1800 # 30 minutes max
IC_WAIT_START=$(date +%s)

while true; do
  IC_STATUS=$(_get_ic_status "${IC_NAME}" 2>/dev/null)

  case "${IC_STATUS}" in
    InService)
      # Done — fall through to the success summary below.
      break
      ;;
    Failed)
      echo "❌ Inference component failed to reach InService status"
      echo " Check CloudWatch Logs for details:"
      echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME}"
      echo ""
      echo " Debug:"
      echo " aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}"
      exit 4
      ;;
    Creating)
      # Check timeout
      IC_ELAPSED=$(( $(date +%s) - IC_WAIT_START ))
      if [ "${IC_ELAPSED}" -ge "${IC_WAIT_TIMEOUT}" ]; then
        echo ""
        echo "⚠️ Inference component still creating after ${IC_WAIT_TIMEOUT}s."
        echo " Re-run ./do/deploy to resume waiting."
        echo ""
        echo " Or check status manually:"
        echo " aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}"
        exit 4
      fi
      echo " $(date +%H:%M:%S) Status: Creating (${IC_ELAPSED}s elapsed)..."
      sleep 30
      ;;
    "")
      # _get_ic_status prints "" when describe fails — most likely expired
      # credentials; bail out so the user can refresh and resume.
      echo "⚠️ Could not determine inference component status (credentials may have expired)."
      echo " Re-run ./do/deploy to resume."
      exit 4
      ;;
    *)
      # Any other transitional status (e.g. Updating): keep polling.
      echo " $(date +%H:%M:%S) Status: ${IC_STATUS}..."
      sleep 30
      ;;
  esac
done
|
|
497
|
+
|
|
498
|
+
# Final operator summary: everything needed to test, register, monitor,
# and eventually tear down this deployment. A single here-doc replaces
# the original run of echo statements; output is identical.
cat <<EOF
✅ Deployment complete!

📋 Deployment Details:
 Endpoint: ${ENDPOINT_NAME}
 Endpoint Config: ${ENDPOINT_CONFIG_NAME}
 Inference Component: ${IC_NAME}
 Region: ${AWS_REGION}
 Instance Type: ${INSTANCE_TYPE}
 Image: ${ECR_REPOSITORY}:${IMAGE_TAG}

🧪 Test your endpoint:
 ./do/test

📝 Register this deployment:
 ./do/register

📊 Monitor your deployment:
 aws sagemaker describe-inference-component --inference-component-name ${IC_NAME} --region ${AWS_REGION}
 aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION}

🧹 Clean up when done:
 ./do/clean endpoint
EOF
|
|
520
|
+
|
|
521
|
+
<% } else if (deploymentTarget === 'async-inference') { %>
|
|
522
|
+
# ============================================================
|
|
523
|
+
# SageMaker Managed Inference - Async Deployment (Model-Based)
|
|
524
|
+
# SageMaker async inference does NOT support Inference Components.
|
|
525
|
+
# Flow: create-model โ create-endpoint-config (with AsyncInferenceConfig) โ create-endpoint
|
|
526
|
+
# ============================================================
|
|
527
|
+
|
|
528
|
+
# Validate execution role ARN
|
|
529
|
+
if [ -z "${ROLE_ARN:-}" ]; then
|
|
530
|
+
echo "โ Execution role ARN not provided"
|
|
531
|
+
echo ""
|
|
532
|
+
echo "Usage:"
|
|
533
|
+
echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
|
|
534
|
+
echo " ./do/deploy"
|
|
535
|
+
echo ""
|
|
536
|
+
echo "Or set ROLE_ARN in do/config"
|
|
537
|
+
echo ""
|
|
538
|
+
echo "The execution role must have permissions for:"
|
|
539
|
+
echo " โข SageMaker model and endpoint management"
|
|
540
|
+
echo " โข ECR image access"
|
|
541
|
+
echo " โข S3 write access for async output path: ${ASYNC_S3_OUTPUT_PATH}"
|
|
542
|
+
echo " โข SNS publish permissions (optional, for notifications)"
|
|
543
|
+
echo " โข CloudWatch Logs"
|
|
544
|
+
exit 3
|
|
545
|
+
fi
|
|
546
|
+
|
|
547
|
+
echo " Using execution role: ${ROLE_ARN}"
|
|
548
|
+
|
|
549
|
+
# ============================================================
|
|
550
|
+
# Bootstrap async infrastructure (S3 bucket + SNS topics)
|
|
551
|
+
# ============================================================
|
|
552
|
+
|
|
553
|
+
# Extract the bucket name from the S3 output path: strip the s3:// scheme,
# then keep everything up to the first slash (pure parameter expansion,
# no echo|sed|cut pipeline needed).
ASYNC_S3_BUCKET="${ASYNC_S3_OUTPUT_PATH#s3://}"
ASYNC_S3_BUCKET="${ASYNC_S3_BUCKET%%/*}"
|
|
555
|
+
|
|
556
|
+
<% if (!asyncS3OutputPath) { %>
|
|
557
|
+
# Bootstrap default S3 bucket (check-and-create)
|
|
558
|
+
echo "๐ Checking if S3 bucket exists: ${ASYNC_S3_BUCKET}"
|
|
559
|
+
if ! aws s3api head-bucket --bucket "${ASYNC_S3_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
|
|
560
|
+
echo "๐ฆ Creating S3 bucket: ${ASYNC_S3_BUCKET}"
|
|
561
|
+
if [ "${AWS_REGION}" = "us-east-1" ]; then
|
|
562
|
+
if ! aws s3api create-bucket \
|
|
563
|
+
--bucket "${ASYNC_S3_BUCKET}" \
|
|
564
|
+
--region "${AWS_REGION}"; then
|
|
565
|
+
echo "โ Failed to create S3 bucket: ${ASYNC_S3_BUCKET}"
|
|
566
|
+
echo ""
|
|
567
|
+
echo " Check that:"
|
|
568
|
+
echo " โข Your IAM credentials have s3:CreateBucket permission"
|
|
569
|
+
echo " โข The bucket name is not already taken globally"
|
|
570
|
+
exit 4
|
|
571
|
+
fi
|
|
572
|
+
else
|
|
573
|
+
if ! aws s3api create-bucket \
|
|
574
|
+
--bucket "${ASYNC_S3_BUCKET}" \
|
|
575
|
+
--region "${AWS_REGION}" \
|
|
576
|
+
--create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
|
|
577
|
+
echo "โ Failed to create S3 bucket: ${ASYNC_S3_BUCKET}"
|
|
578
|
+
echo ""
|
|
579
|
+
echo " Check that:"
|
|
580
|
+
echo " โข Your IAM credentials have s3:CreateBucket permission"
|
|
581
|
+
echo " โข The bucket name is not already taken globally"
|
|
582
|
+
exit 4
|
|
583
|
+
fi
|
|
584
|
+
fi
|
|
585
|
+
echo "โ
S3 bucket created: ${ASYNC_S3_BUCKET}"
|
|
586
|
+
else
|
|
587
|
+
echo "โ
S3 bucket exists: ${ASYNC_S3_BUCKET}"
|
|
588
|
+
fi
|
|
589
|
+
<% } else { %>
|
|
590
|
+
# Custom S3 output path provided โ skip bucket creation
|
|
591
|
+
echo "โ
Using custom S3 output path: ${ASYNC_S3_OUTPUT_PATH}"
|
|
592
|
+
<% } %>
|
|
593
|
+
|
|
594
|
+
# Extract topic name from SNS success topic ARN
|
|
595
|
+
ASYNC_SNS_SUCCESS_TOPIC_NAME=$(echo "${ASYNC_SNS_SUCCESS_TOPIC}" | awk -F: '{print $NF}')
|
|
596
|
+
|
|
597
|
+
<% if (!asyncSnsSuccessTopic) { %>
|
|
598
|
+
# Bootstrap default SNS success topic (check-and-create)
|
|
599
|
+
echo "๐ Checking if SNS success topic exists: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
|
|
600
|
+
if ! aws sns get-topic-attributes --topic-arn "${ASYNC_SNS_SUCCESS_TOPIC}" --region "${AWS_REGION}" 2>/dev/null; then
|
|
601
|
+
echo "๐ฆ Creating SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
|
|
602
|
+
if ! aws sns create-topic \
|
|
603
|
+
--name "${ASYNC_SNS_SUCCESS_TOPIC_NAME}" \
|
|
604
|
+
--region "${AWS_REGION}" > /dev/null; then
|
|
605
|
+
echo "โ Failed to create SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
|
|
606
|
+
echo ""
|
|
607
|
+
echo " Check that:"
|
|
608
|
+
echo " โข Your IAM credentials have sns:CreateTopic permission"
|
|
609
|
+
exit 4
|
|
610
|
+
fi
|
|
611
|
+
echo "โ
SNS success topic created: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
|
|
612
|
+
else
|
|
613
|
+
echo "โ
SNS success topic exists: ${ASYNC_SNS_SUCCESS_TOPIC_NAME}"
|
|
614
|
+
fi
|
|
615
|
+
|
|
616
|
+
# Record SNS success topic in manifest (non-blocking)
|
|
617
|
+
./do/manifest add \
|
|
618
|
+
--type sns-topic \
|
|
619
|
+
--id "${ASYNC_SNS_SUCCESS_TOPIC}" \
|
|
620
|
+
--project "${PROJECT_NAME}" \
|
|
621
|
+
--meta "{\"topicName\":\"${ASYNC_SNS_SUCCESS_TOPIC_NAME}\",\"purpose\":\"async-success\",\"region\":\"${AWS_REGION}\"}" \
|
|
622
|
+
2>/dev/null || true
|
|
623
|
+
|
|
624
|
+
<% } else { %>
|
|
625
|
+
# Custom SNS success topic ARN provided โ skip topic creation
|
|
626
|
+
echo "โ
Using custom SNS success topic: ${ASYNC_SNS_SUCCESS_TOPIC}"
|
|
627
|
+
|
|
628
|
+
# Record SNS success topic in manifest (non-blocking)
|
|
629
|
+
ASYNC_SNS_SUCCESS_TOPIC_NAME=$(echo "${ASYNC_SNS_SUCCESS_TOPIC}" | awk -F: '{print $NF}')
|
|
630
|
+
./do/manifest add \
|
|
631
|
+
--type sns-topic \
|
|
632
|
+
--id "${ASYNC_SNS_SUCCESS_TOPIC}" \
|
|
633
|
+
--project "${PROJECT_NAME}" \
|
|
634
|
+
--meta "{\"topicName\":\"${ASYNC_SNS_SUCCESS_TOPIC_NAME}\",\"purpose\":\"async-success\",\"region\":\"${AWS_REGION}\"}" \
|
|
635
|
+
2>/dev/null || true
|
|
636
|
+
|
|
637
|
+
<% } %>
|
|
638
|
+
|
|
639
|
+
# Extract topic name from SNS error topic ARN
|
|
640
|
+
ASYNC_SNS_ERROR_TOPIC_NAME=$(echo "${ASYNC_SNS_ERROR_TOPIC}" | awk -F: '{print $NF}')
|
|
641
|
+
|
|
642
|
+
<% if (!asyncSnsErrorTopic) { %>
|
|
643
|
+
# Bootstrap default SNS error topic (check-and-create)
|
|
644
|
+
echo "๐ Checking if SNS error topic exists: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
|
|
645
|
+
if ! aws sns get-topic-attributes --topic-arn "${ASYNC_SNS_ERROR_TOPIC}" --region "${AWS_REGION}" 2>/dev/null; then
|
|
646
|
+
echo "๐ฆ Creating SNS error topic: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
|
|
647
|
+
if ! aws sns create-topic \
|
|
648
|
+
--name "${ASYNC_SNS_ERROR_TOPIC_NAME}" \
|
|
649
|
+
--region "${AWS_REGION}" > /dev/null; then
|
|
650
|
+
echo "โ Failed to create SNS error topic: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
|
|
651
|
+
echo ""
|
|
652
|
+
echo " Check that:"
|
|
653
|
+
echo " โข Your IAM credentials have sns:CreateTopic permission"
|
|
654
|
+
exit 4
|
|
655
|
+
fi
|
|
656
|
+
echo "โ
SNS error topic created: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
|
|
657
|
+
else
|
|
658
|
+
echo "โ
SNS error topic exists: ${ASYNC_SNS_ERROR_TOPIC_NAME}"
|
|
659
|
+
fi
|
|
660
|
+
|
|
661
|
+
# Record SNS error topic in manifest (non-blocking)
|
|
662
|
+
./do/manifest add \
|
|
663
|
+
--type sns-topic \
|
|
664
|
+
--id "${ASYNC_SNS_ERROR_TOPIC}" \
|
|
665
|
+
--project "${PROJECT_NAME}" \
|
|
666
|
+
--meta "{\"topicName\":\"${ASYNC_SNS_ERROR_TOPIC_NAME}\",\"purpose\":\"async-error\",\"region\":\"${AWS_REGION}\"}" \
|
|
667
|
+
2>/dev/null || true
|
|
668
|
+
|
|
669
|
+
<% } else { %>
|
|
670
|
+
# Custom SNS error topic ARN provided โ skip topic creation
|
|
671
|
+
echo "โ
Using custom SNS error topic: ${ASYNC_SNS_ERROR_TOPIC}"
|
|
672
|
+
|
|
673
|
+
# Record SNS error topic in manifest (non-blocking)
|
|
674
|
+
ASYNC_SNS_ERROR_TOPIC_NAME=$(echo "${ASYNC_SNS_ERROR_TOPIC}" | awk -F: '{print $NF}')
|
|
675
|
+
./do/manifest add \
|
|
676
|
+
--type sns-topic \
|
|
677
|
+
--id "${ASYNC_SNS_ERROR_TOPIC}" \
|
|
678
|
+
--project "${PROJECT_NAME}" \
|
|
679
|
+
--meta "{\"topicName\":\"${ASYNC_SNS_ERROR_TOPIC_NAME}\",\"purpose\":\"async-error\",\"region\":\"${AWS_REGION}\"}" \
|
|
680
|
+
2>/dev/null || true
|
|
681
|
+
|
|
682
|
+
<% } %>
|
|
683
|
+
|
|
684
|
+
# ============================================================
|
|
685
|
+
# Create async endpoint (classic model-based flow)
|
|
686
|
+
# SageMaker async inference does NOT support Inference Components.
|
|
687
|
+
# Flow: create-model โ create-endpoint-config (with AsyncInferenceConfig) โ create-endpoint
|
|
688
|
+
# ============================================================
|
|
689
|
+
|
|
690
|
+
# Helper: persist a variable to do/config so other scripts can use it.
# Arguments:
#   $1 - variable name (used verbatim in a grep anchor; expected to be a
#        plain identifier like ENDPOINT_NAME)
#   $2 - variable value
# Globals:
#   SCRIPT_DIR (read) - directory containing the config file
# Replaces an existing "export NAME=..." line or appends a new one. Since
# do/config is sourced, line ordering is irrelevant, so a replaced entry
# may move to the end of the file.
_update_config_var() {
  local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
  if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
    # Rewrite with grep + printf instead of sed: values containing sed
    # metacharacters ('|', '&', '\') previously broke the substitution or
    # injected unintended replacement text.
    local tmp_file="${config_file}.tmp.$$"
    grep -v "^export ${var_name}=" "${config_file}" > "${tmp_file}" || true
    printf 'export %s="%s"\n' "${var_name}" "${var_value}" >> "${tmp_file}"
    mv "${tmp_file}" "${config_file}"
  else
    echo "" >> "${config_file}"
    echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
  fi
}
|
|
701
|
+
|
|
702
|
+
# Helper: look up the status of a SageMaker endpoint by name.
# Arguments:
#   $1 - endpoint name
# Globals:
#   AWS_REGION (read)
# Outputs: the EndpointStatus value on stdout, or an empty line when the
# endpoint does not exist (describe-endpoint stderr is suppressed).
_get_endpoint_status() {
  local endpoint_name="$1"
  aws sagemaker describe-endpoint \
    --endpoint-name "${endpoint_name}" \
    --region "${AWS_REGION}" \
    --query EndpointStatus \
    --output text 2>/dev/null \
    || echo ""
}
|
|
710
|
+
|
|
711
|
+
# ============================================================
|
|
712
|
+
# Idempotency: check for existing deployment from a previous run
|
|
713
|
+
# ============================================================
|
|
714
|
+
SKIP_TO=""
|
|
715
|
+
|
|
716
|
+
if [ "${FORCE_NEW}" = true ]; then
|
|
717
|
+
echo "๐ --force: ignoring previous deployment, creating new resources."
|
|
718
|
+
elif [ -n "${ENDPOINT_NAME:-}" ]; then
|
|
719
|
+
echo "๐ Checking for existing deployment: ${ENDPOINT_NAME}"
|
|
720
|
+
|
|
721
|
+
EP_STATUS=$(_get_endpoint_status "${ENDPOINT_NAME}")
|
|
722
|
+
|
|
723
|
+
case "${EP_STATUS}" in
|
|
724
|
+
InService)
|
|
725
|
+
echo "โ
Async endpoint already InService: ${ENDPOINT_NAME}"
|
|
726
|
+
echo ""
|
|
727
|
+
echo "๐ Deployment is already live. Nothing to do."
|
|
728
|
+
echo " Endpoint: ${ENDPOINT_NAME}"
|
|
729
|
+
echo ""
|
|
730
|
+
echo "๐งช Test your async endpoint:"
|
|
731
|
+
echo " ./do/test"
|
|
732
|
+
echo ""
|
|
733
|
+
echo "๐งน Clean up when done:"
|
|
734
|
+
echo " ./do/clean endpoint"
|
|
735
|
+
exit 0
|
|
736
|
+
;;
|
|
737
|
+
Creating|Updating)
|
|
738
|
+
echo "โณ Endpoint still ${EP_STATUS}: ${ENDPOINT_NAME}"
|
|
739
|
+
SKIP_TO="wait_endpoint"
|
|
740
|
+
;;
|
|
741
|
+
Failed)
|
|
742
|
+
echo "โ ๏ธ Previous endpoint failed: ${ENDPOINT_NAME}"
|
|
743
|
+
echo " Creating a new deployment. Clean up the failed endpoint with:"
|
|
744
|
+
echo " ./do/clean endpoint"
|
|
745
|
+
echo ""
|
|
746
|
+
;;
|
|
747
|
+
"")
|
|
748
|
+
echo " Previous endpoint not found (may have been cleaned up). Creating new deployment."
|
|
749
|
+
;;
|
|
750
|
+
*)
|
|
751
|
+
echo " Endpoint in unexpected state: ${EP_STATUS}. Creating new deployment."
|
|
752
|
+
;;
|
|
753
|
+
esac
|
|
754
|
+
fi
|
|
755
|
+
|
|
756
|
+
# ============================================================
|
|
757
|
+
# Create async resources (skip if resuming from wait)
|
|
758
|
+
# ============================================================
|
|
759
|
+
if [ -z "${SKIP_TO}" ]; then
|
|
760
|
+
TIMESTAMP=$(date +%s)
|
|
761
|
+
MODEL_NAME_SM="${PROJECT_NAME}-async-model-${TIMESTAMP}"
|
|
762
|
+
ENDPOINT_CONFIG_NAME="${PROJECT_NAME}-async-epc-${TIMESTAMP}"
|
|
763
|
+
ENDPOINT_NAME="${PROJECT_NAME}-async-ep-${TIMESTAMP}"
|
|
764
|
+
|
|
765
|
+
_update_config_var "ENDPOINT_NAME" "${ENDPOINT_NAME}"
|
|
766
|
+
_update_config_var "ENDPOINT_CONFIG_NAME" "${ENDPOINT_CONFIG_NAME}"
|
|
767
|
+
_update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"
|
|
768
|
+
|
|
769
|
+
# Step 1: Create SageMaker model
|
|
770
|
+
echo "๐ฆ Creating SageMaker model: ${MODEL_NAME_SM}"
|
|
771
|
+
if ! aws sagemaker create-model \
|
|
772
|
+
--model-name "${MODEL_NAME_SM}" \
|
|
773
|
+
--primary-container "{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\"}" \
|
|
774
|
+
--execution-role-arn "${ROLE_ARN}" \
|
|
775
|
+
--region "${AWS_REGION}"; then
|
|
776
|
+
|
|
777
|
+
echo "โ Failed to create SageMaker model"
|
|
778
|
+
echo " Check that:"
|
|
779
|
+
echo " โข The execution role ARN is valid"
|
|
780
|
+
echo " โข The ECR image exists and is accessible"
|
|
781
|
+
echo " โข The IAM role has ecr:GetDownloadUrlForLayer permission"
|
|
782
|
+
exit 4
|
|
783
|
+
fi
|
|
784
|
+
|
|
785
|
+
echo "โ
SageMaker model created: ${MODEL_NAME_SM}"
|
|
786
|
+
|
|
787
|
+
# Record model in manifest (non-blocking)
|
|
788
|
+
MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
|
|
789
|
+
./do/manifest add \
|
|
790
|
+
--type sagemaker-model \
|
|
791
|
+
--id "${MODEL_ARN}" \
|
|
792
|
+
--project "${PROJECT_NAME}" \
|
|
793
|
+
--meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
|
|
794
|
+
2>/dev/null || true
|
|
795
|
+
|
|
796
|
+
# Build production variant JSON (classic: includes ModelName, no execution-role-arn on endpoint config)
|
|
797
|
+
VARIANT_JSON="[{\"VariantName\":\"AllTraffic\",\"ModelName\":\"${MODEL_NAME_SM}\",\"InstanceType\":\"${INSTANCE_TYPE}\",\"InitialInstanceCount\":1"
|
|
798
|
+
|
|
799
|
+
# Append InferenceAmiVersion if configured
|
|
800
|
+
if [ -n "${INFERENCE_AMI_VERSION:-}" ]; then
|
|
801
|
+
VARIANT_JSON="${VARIANT_JSON},\"InferenceAmiVersion\":\"${INFERENCE_AMI_VERSION}\""
|
|
802
|
+
echo " AMI version: ${INFERENCE_AMI_VERSION}"
|
|
803
|
+
fi
|
|
804
|
+
|
|
805
|
+
VARIANT_JSON="${VARIANT_JSON}}]"
|
|
806
|
+
|
|
807
|
+
# Build AsyncInferenceConfig JSON
|
|
808
|
+
ASYNC_CONFIG="{\"OutputConfig\":{\"S3OutputPath\":\"${ASYNC_S3_OUTPUT_PATH}\",\"NotificationConfig\":{\"SuccessTopic\":\"${ASYNC_SNS_SUCCESS_TOPIC}\",\"ErrorTopic\":\"${ASYNC_SNS_ERROR_TOPIC}\"}}"
|
|
809
|
+
|
|
810
|
+
if [ -n "${ASYNC_MAX_CONCURRENT_INVOCATIONS:-}" ]; then
|
|
811
|
+
ASYNC_CONFIG="${ASYNC_CONFIG},\"ClientConfig\":{\"MaxConcurrentInvocationsPerInstance\":${ASYNC_MAX_CONCURRENT_INVOCATIONS}}"
|
|
812
|
+
fi
|
|
813
|
+
|
|
814
|
+
ASYNC_CONFIG="${ASYNC_CONFIG}}"
|
|
815
|
+
|
|
816
|
+
# Step 2: Create endpoint configuration with AsyncInferenceConfig (no --execution-role-arn)
|
|
817
|
+
echo "โ๏ธ Creating async endpoint configuration: ${ENDPOINT_CONFIG_NAME}"
|
|
818
|
+
if ! aws sagemaker create-endpoint-config \
|
|
819
|
+
--endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
|
|
820
|
+
--production-variants "${VARIANT_JSON}" \
|
|
821
|
+
--async-inference-config "${ASYNC_CONFIG}" \
|
|
822
|
+
--region "${AWS_REGION}"; then
|
|
823
|
+
|
|
824
|
+
echo "โ Failed to create async endpoint configuration"
|
|
825
|
+
echo " Check that:"
|
|
826
|
+
echo " โข The S3 output path is accessible: ${ASYNC_S3_OUTPUT_PATH}"
|
|
827
|
+
echo " โข The IAM role has s3:PutObject permission on the output path"
|
|
828
|
+
echo " โข The instance type is valid: ${INSTANCE_TYPE}"
|
|
829
|
+
echo " โข The instance type is available in region: ${AWS_REGION}"
|
|
830
|
+
echo " โข You have sufficient service quota for the instance type"
|
|
831
|
+
exit 4
|
|
832
|
+
fi
|
|
833
|
+
|
|
834
|
+
echo "โ
Async endpoint configuration created: ${ENDPOINT_CONFIG_NAME}"
|
|
835
|
+
|
|
836
|
+
# Record endpoint config in manifest (non-blocking)
|
|
837
|
+
ENDPOINT_CONFIG_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint-config/${ENDPOINT_CONFIG_NAME}"
|
|
838
|
+
./do/manifest add \
|
|
839
|
+
--type sagemaker-endpoint-config \
|
|
840
|
+
--id "${ENDPOINT_CONFIG_ARN}" \
|
|
841
|
+
--project "${PROJECT_NAME}" \
|
|
842
|
+
--meta "{\"endpointConfigName\":\"${ENDPOINT_CONFIG_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
|
|
843
|
+
2>/dev/null || true
|
|
844
|
+
|
|
845
|
+
# Step 3: Create endpoint
|
|
846
|
+
echo "๐ Creating async endpoint: ${ENDPOINT_NAME}"
|
|
847
|
+
if ! aws sagemaker create-endpoint \
|
|
848
|
+
--endpoint-name "${ENDPOINT_NAME}" \
|
|
849
|
+
--endpoint-config-name "${ENDPOINT_CONFIG_NAME}" \
|
|
850
|
+
--region "${AWS_REGION}"; then
|
|
851
|
+
|
|
852
|
+
echo "โ Failed to create async endpoint"
|
|
853
|
+
echo " Check that:"
|
|
854
|
+
echo " โข Your IAM credentials have sagemaker:CreateEndpoint permission"
|
|
855
|
+
echo " โข You have sufficient service quota in region: ${AWS_REGION}"
|
|
856
|
+
exit 4
|
|
857
|
+
fi
|
|
858
|
+
|
|
859
|
+
echo "โ
Async endpoint creation initiated: ${ENDPOINT_NAME}"
|
|
860
|
+
|
|
861
|
+
# Record endpoint in manifest (non-blocking)
|
|
862
|
+
ENDPOINT_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:endpoint/${ENDPOINT_NAME}"
|
|
863
|
+
./do/manifest add \
|
|
864
|
+
--type sagemaker-endpoint \
|
|
865
|
+
--id "${ENDPOINT_ARN}" \
|
|
866
|
+
--project "${PROJECT_NAME}" \
|
|
867
|
+
--meta "{\"endpointName\":\"${ENDPOINT_NAME}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
|
|
868
|
+
2>/dev/null || true
|
|
869
|
+
fi
|
|
870
|
+
|
|
871
|
+
# ============================================================
|
|
872
|
+
# Wait for endpoint (skip if already InService)
|
|
873
|
+
# ============================================================
|
|
874
|
+
if [ -z "${SKIP_TO}" ] || [ "${SKIP_TO}" = "wait_endpoint" ]; then
|
|
875
|
+
echo "โณ Waiting for async endpoint to reach InService status..."
|
|
876
|
+
echo " This may take several minutes..."
|
|
877
|
+
echo " If this times out, re-run ./do/deploy to resume."
|
|
878
|
+
|
|
879
|
+
if ! aws sagemaker wait endpoint-in-service \
|
|
880
|
+
--endpoint-name "${ENDPOINT_NAME}" \
|
|
881
|
+
--region "${AWS_REGION}"; then
|
|
882
|
+
|
|
883
|
+
# Check if it was a credential expiration vs actual failure
|
|
884
|
+
EP_CHECK=$(_get_endpoint_status "${ENDPOINT_NAME}" 2>/dev/null)
|
|
885
|
+
if [ "${EP_CHECK}" = "Creating" ]; then
|
|
886
|
+
echo ""
|
|
887
|
+
echo "โ ๏ธ Wait interrupted (credentials may have expired), but endpoint is still creating."
|
|
888
|
+
echo " Refresh your credentials and re-run ./do/deploy to resume."
|
|
889
|
+
echo ""
|
|
890
|
+
echo " Or check status manually:"
|
|
891
|
+
echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION} --query EndpointStatus"
|
|
892
|
+
exit 4
|
|
893
|
+
fi
|
|
894
|
+
|
|
895
|
+
echo "โ Async endpoint failed to reach InService status"
|
|
896
|
+
echo " Check CloudWatch Logs for details:"
|
|
897
|
+
echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/Endpoints/${ENDPOINT_NAME}"
|
|
898
|
+
exit 4
|
|
899
|
+
fi
|
|
900
|
+
fi
|
|
901
|
+
|
|
902
|
+
echo "โ
Async deployment complete!"
|
|
903
|
+
echo ""
|
|
904
|
+
echo "๐ Deployment Details:"
|
|
905
|
+
echo " Endpoint: ${ENDPOINT_NAME}"
|
|
906
|
+
echo " Endpoint Config: ${ENDPOINT_CONFIG_NAME}"
|
|
907
|
+
echo " Model: ${MODEL_NAME_SM}"
|
|
908
|
+
echo " Region: ${AWS_REGION}"
|
|
909
|
+
echo " Instance Type: ${INSTANCE_TYPE}"
|
|
910
|
+
echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
|
|
911
|
+
echo " S3 Output: ${ASYNC_S3_OUTPUT_PATH}"
|
|
912
|
+
echo " SNS Success: ${ASYNC_SNS_SUCCESS_TOPIC}"
|
|
913
|
+
echo " SNS Error: ${ASYNC_SNS_ERROR_TOPIC}"
|
|
914
|
+
echo ""
|
|
915
|
+
echo "๐งช Test your async endpoint:"
|
|
916
|
+
echo " ./do/test"
|
|
917
|
+
echo ""
|
|
918
|
+
echo "๐ Register this deployment:"
|
|
919
|
+
echo " ./do/register"
|
|
920
|
+
echo ""
|
|
921
|
+
echo "๐ Monitor your deployment:"
|
|
922
|
+
echo " aws sagemaker describe-endpoint --endpoint-name ${ENDPOINT_NAME} --region ${AWS_REGION}"
|
|
923
|
+
echo ""
|
|
924
|
+
echo "๐งน Clean up when done:"
|
|
925
|
+
echo " ./do/clean endpoint"
|
|
926
|
+
|
|
927
|
+
<% } else if (deploymentTarget === 'hyperpod-eks') { %>
|
|
928
|
+
# ============================================================
|
|
929
|
+
# HyperPod EKS Deployment
|
|
930
|
+
# ============================================================
|
|
931
|
+
|
|
932
|
+
# Get kubeconfig for HyperPod cluster
|
|
933
|
+
echo "๐ Configuring kubectl for HyperPod cluster..."
|
|
934
|
+
KUBECONFIG_PATH="${HOME}/.kube/hyperpod-${HYPERPOD_CLUSTER_NAME}"
|
|
935
|
+
|
|
936
|
+
# Step 1: Describe the HyperPod cluster to get the underlying EKS cluster ARN
|
|
937
|
+
EKS_CLUSTER_ARN=$(aws sagemaker describe-cluster \
|
|
938
|
+
--cluster-name "${HYPERPOD_CLUSTER_NAME}" \
|
|
939
|
+
--region "${AWS_REGION}" \
|
|
940
|
+
--query "Orchestrator.Eks.ClusterArn" \
|
|
941
|
+
--output text 2>&1) || {
|
|
942
|
+
echo "โ Failed to describe HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
|
|
943
|
+
echo ""
|
|
944
|
+
echo " Error details:"
|
|
945
|
+
echo " ${EKS_CLUSTER_ARN}"
|
|
946
|
+
echo ""
|
|
947
|
+
echo " Check that:"
|
|
948
|
+
echo " โข The cluster name is correct"
|
|
949
|
+
echo " โข The cluster exists in region: ${AWS_REGION}"
|
|
950
|
+
echo " โข Your IAM user/role has permission to access the cluster"
|
|
951
|
+
echo ""
|
|
952
|
+
echo " Required IAM permissions:"
|
|
953
|
+
echo " โข sagemaker:DescribeCluster"
|
|
954
|
+
echo " โข eks:DescribeCluster"
|
|
955
|
+
exit 4
|
|
956
|
+
}
|
|
957
|
+
|
|
958
|
+
# Step 2: Extract the EKS cluster name (last '/'-separated segment of the
# ARN) using parameter expansion instead of an awk subprocess.
EKS_CLUSTER_NAME="${EKS_CLUSTER_ARN##*/}"
|
|
960
|
+
echo " HyperPod cluster: ${HYPERPOD_CLUSTER_NAME}"
|
|
961
|
+
echo " EKS cluster: ${EKS_CLUSTER_NAME}"
|
|
962
|
+
|
|
963
|
+
# Step 3: Update kubeconfig using the EKS cluster
|
|
964
|
+
if ! aws eks update-kubeconfig \
|
|
965
|
+
--name "${EKS_CLUSTER_NAME}" \
|
|
966
|
+
--region "${AWS_REGION}" \
|
|
967
|
+
--kubeconfig "${KUBECONFIG_PATH}" 2>&1; then
|
|
968
|
+
echo "โ Failed to configure kubectl for EKS cluster: ${EKS_CLUSTER_NAME}"
|
|
969
|
+
echo ""
|
|
970
|
+
echo " Required IAM permissions:"
|
|
971
|
+
echo " โข eks:DescribeCluster"
|
|
972
|
+
echo " โข eks:AccessKubernetesApi"
|
|
973
|
+
exit 4
|
|
974
|
+
fi
|
|
975
|
+
|
|
976
|
+
export KUBECONFIG="${KUBECONFIG_PATH}"
|
|
977
|
+
echo "โ
Kubeconfig saved to: ${KUBECONFIG_PATH}"
|
|
978
|
+
|
|
979
|
+
# Verify cluster connectivity
|
|
980
|
+
echo "๐ Verifying cluster connectivity..."
|
|
981
|
+
if ! kubectl cluster-info &> /dev/null; then
|
|
982
|
+
echo "โ Cannot connect to HyperPod cluster"
|
|
983
|
+
echo ""
|
|
984
|
+
echo " Check that:"
|
|
985
|
+
echo " โข The cluster is in 'InService' status"
|
|
986
|
+
echo " โข Your network can reach the cluster API server"
|
|
987
|
+
echo " โข Your IAM credentials are valid"
|
|
988
|
+
exit 4
|
|
989
|
+
fi
|
|
990
|
+
echo "โ
Connected to HyperPod cluster"
|
|
991
|
+
|
|
992
|
+
# Create namespace if it doesn't exist
|
|
993
|
+
echo "๐ Ensuring namespace exists: ${HYPERPOD_NAMESPACE}"
|
|
994
|
+
if ! kubectl create namespace "${HYPERPOD_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - 2>&1; then
|
|
995
|
+
echo "โ ๏ธ Warning: Could not create/verify namespace"
|
|
996
|
+
fi
|
|
997
|
+
|
|
998
|
+
# Apply Kubernetes manifests
|
|
999
|
+
echo "๐ Applying Kubernetes manifests from hyperpod/..."
|
|
1000
|
+
|
|
1001
|
+
# Substitute shell variables (e.g. ${AWS_ACCOUNT_ID}) in manifests before applying
export AWS_ACCOUNT_ID
export ECR_IMAGE="${ECR_REPOSITORY}:${IMAGE_TAG}"

APPLY_OUTPUT=""
APPLY_EXIT_CODE=0
for manifest in hyperpod/*.yaml; do
  # Skip files that contain no Kubernetes objects (e.g. comment-only PVC stubs)
  RENDERED=$(envsubst < "${manifest}")
  if echo "${RENDERED}" | grep -q '^kind:'; then
    FILE_OUTPUT=$(echo "${RENDERED}" | kubectl apply -n "${HYPERPOD_NAMESPACE}" -f - 2>&1) || {
      APPLY_EXIT_CODE=$?
    }
    # Separate per-file outputs with a real newline ($'\n'): the previous
    # "\n" inside double quotes was a two-character literal that the plain
    # `echo` in the error path printed verbatim.
    APPLY_OUTPUT="${APPLY_OUTPUT}${FILE_OUTPUT}"$'\n'
  fi
done
|
|
1017
|
+
|
|
1018
|
+
if [ "${APPLY_EXIT_CODE}" -ne 0 ]; then
|
|
1019
|
+
echo ""
|
|
1020
|
+
echo "โ Failed to apply Kubernetes manifests"
|
|
1021
|
+
echo ""
|
|
1022
|
+
echo " Error details:"
|
|
1023
|
+
echo " ${APPLY_OUTPUT}"
|
|
1024
|
+
echo ""
|
|
1025
|
+
echo " Common issues:"
|
|
1026
|
+
echo " โข Insufficient node capacity - check available GPU nodes"
|
|
1027
|
+
echo " โข Resource requests exceed node capacity"
|
|
1028
|
+
echo " โข RBAC permissions - ensure you have permission to create resources in namespace '${HYPERPOD_NAMESPACE}'"
|
|
1029
|
+
echo " โข Invalid manifest syntax"
|
|
1030
|
+
<% if (fsxVolumeHandle) { %>
|
|
1031
|
+
echo " โข PVC creation failure - verify the FSx CSI driver is installed on the cluster"
|
|
1032
|
+
echo " kubectl get csidriver -o name | grep fsx"
|
|
1033
|
+
<% } %>
|
|
1034
|
+
echo ""
|
|
1035
|
+
echo " Debug commands:"
|
|
1036
|
+
echo " kubectl get nodes -o wide"
|
|
1037
|
+
echo " kubectl describe nodes"
|
|
1038
|
+
echo " kubectl get events -n ${HYPERPOD_NAMESPACE}"
|
|
1039
|
+
exit ${APPLY_EXIT_CODE}
|
|
1040
|
+
fi
|
|
1041
|
+
|
|
1042
|
+
echo "โ
Kubernetes manifests applied"
|
|
1043
|
+
|
|
1044
|
+
# Record k8s deployment and service in manifest (non-blocking)
|
|
1045
|
+
./do/manifest add \
|
|
1046
|
+
--type k8s-deployment \
|
|
1047
|
+
--id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
|
|
1048
|
+
--project "${PROJECT_NAME}" \
|
|
1049
|
+
--meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"deploymentName\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
|
|
1050
|
+
2>/dev/null || true
|
|
1051
|
+
|
|
1052
|
+
./do/manifest add \
|
|
1053
|
+
--type k8s-service \
|
|
1054
|
+
--id "${HYPERPOD_NAMESPACE}/${PROJECT_NAME}" \
|
|
1055
|
+
--project "${PROJECT_NAME}" \
|
|
1056
|
+
--meta "{\"namespace\":\"${HYPERPOD_NAMESPACE}\",\"serviceName\":\"${PROJECT_NAME}\",\"clusterName\":\"${HYPERPOD_CLUSTER_NAME}\",\"region\":\"${AWS_REGION}\"}" \
|
|
1057
|
+
2>/dev/null || true
|
|
1058
|
+
|
|
1059
|
+
# Wait for deployment to be ready
|
|
1060
|
+
DEPLOY_TIMEOUT=${DEPLOY_TIMEOUT:-1200}
|
|
1061
|
+
echo "โณ Waiting for deployment to be ready (timeout: ${DEPLOY_TIMEOUT}s)..."
|
|
1062
|
+
echo " This may take several minutes for GPU workloads..."
|
|
1063
|
+
echo ""
|
|
1064
|
+
|
|
1065
|
+
# Poll pod status every 30s while rollout is in progress
|
|
1066
|
+
(
|
|
1067
|
+
while true; do
|
|
1068
|
+
sleep 30
|
|
1069
|
+
POD_STATUS=$(kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l app=${PROJECT_NAME} \
|
|
1070
|
+
--no-headers 2>/dev/null | head -5)
|
|
1071
|
+
if [ -n "${POD_STATUS}" ]; then
|
|
1072
|
+
echo " ๐ $(date +%H:%M:%S) Pod status:"
|
|
1073
|
+
echo "${POD_STATUS}" | while read -r line; do echo " ${line}"; done
|
|
1074
|
+
fi
|
|
1075
|
+
done
|
|
1076
|
+
) &
|
|
1077
|
+
STATUS_PID=$!
|
|
1078
|
+
trap "kill ${STATUS_PID} 2>/dev/null; wait ${STATUS_PID} 2>/dev/null" EXIT
|
|
1079
|
+
|
|
1080
|
+
ROLLOUT_OUTPUT=$(kubectl rollout status deployment/${PROJECT_NAME} -n "${HYPERPOD_NAMESPACE}" --timeout=${DEPLOY_TIMEOUT}s 2>&1) || {
|
|
1081
|
+
ROLLOUT_EXIT_CODE=$?
|
|
1082
|
+
kill ${STATUS_PID} 2>/dev/null
|
|
1083
|
+
echo ""
|
|
1084
|
+
echo "โ Deployment failed to become ready within timeout"
|
|
1085
|
+
echo ""
|
|
1086
|
+
echo " Error details:"
|
|
1087
|
+
echo " ${ROLLOUT_OUTPUT}"
|
|
1088
|
+
echo ""
|
|
1089
|
+
echo " Current pod state:"
|
|
1090
|
+
kubectl get pods -n "${HYPERPOD_NAMESPACE}" -l app=${PROJECT_NAME} -o wide 2>/dev/null
|
|
1091
|
+
echo ""
|
|
1092
|
+
echo " Debug commands:"
|
|
1093
|
+
echo " kubectl describe pods -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME}"
|
|
1094
|
+
echo " kubectl logs -n ${HYPERPOD_NAMESPACE} -l app=${PROJECT_NAME} --tail=100"
|
|
1095
|
+
echo ""
|
|
1096
|
+
echo " Common issues:"
|
|
1097
|
+
echo " โข Image pull errors - check ECR permissions"
|
|
1098
|
+
echo " โข Resource scheduling - insufficient GPU nodes"
|
|
1099
|
+
echo " โข Container crash - check application logs"
|
|
1100
|
+
<% if (fsxVolumeHandle) { %>
|
|
1101
|
+
echo " โข PVC binding errors - verify FSx CSI driver is installed on the cluster"
|
|
1102
|
+
echo " kubectl get pvc -n ${HYPERPOD_NAMESPACE}"
|
|
1103
|
+
echo " kubectl describe pvc -n ${HYPERPOD_NAMESPACE}"
|
|
1104
|
+
echo " kubectl get csidriver -o name | grep fsx"
|
|
1105
|
+
<% } %>
|
|
1106
|
+
exit ${ROLLOUT_EXIT_CODE}
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
kill ${STATUS_PID} 2>/dev/null
|
|
1110
|
+
wait ${STATUS_PID} 2>/dev/null
|
|
1111
|
+
|
|
1112
|
+
echo "โ
HyperPod EKS deployment complete!"
|
|
1113
|
+
echo ""
|
|
1114
|
+
echo "๐ Deployment Details:"
|
|
1115
|
+
echo " Cluster: ${HYPERPOD_CLUSTER_NAME}"
|
|
1116
|
+
echo " Namespace: ${HYPERPOD_NAMESPACE}"
|
|
1117
|
+
echo " Deployment: ${PROJECT_NAME}"
|
|
1118
|
+
echo " Replicas: ${HYPERPOD_REPLICAS}"
|
|
1119
|
+
echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
|
|
1120
|
+
echo ""
|
|
1121
|
+
echo "๐ Check deployment status:"
|
|
1122
|
+
echo " export KUBECONFIG=${KUBECONFIG_PATH}"
|
|
1123
|
+
echo " kubectl get pods -n ${HYPERPOD_NAMESPACE}"
|
|
1124
|
+
echo " kubectl get svc -n ${HYPERPOD_NAMESPACE}"
|
|
1125
|
+
echo ""
|
|
1126
|
+
echo "๐งช Test your deployment:"
|
|
1127
|
+
echo " ./do/test"
|
|
1128
|
+
echo ""
|
|
1129
|
+
echo "๐ Register this deployment:"
|
|
1130
|
+
echo " ./do/register"
|
|
1131
|
+
echo ""
|
|
1132
|
+
echo "๐ View logs:"
|
|
1133
|
+
echo " ./do/logs"
|
|
1134
|
+
echo ""
|
|
1135
|
+
echo "๐งน Clean up when done:"
|
|
1136
|
+
echo " ./do/clean hyperpod"
|
|
1137
|
+
|
|
1138
|
+
# Write kubeconfig path to config so other scripts can use it (idempotent)
# Arguments:
#   $1 - variable name (used verbatim in a grep anchor)
#   $2 - variable value
# Globals:
#   SCRIPT_DIR (read) - directory containing the config file
# Replaces an existing "export NAME=..." line or appends a new one. Since
# do/config is sourced, line ordering is irrelevant, so a replaced entry
# may move to the end of the file.
_update_config_var() {
  local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
  if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
    # Rewrite with grep + printf instead of sed: values containing sed
    # metacharacters ('|', '&', '\') previously broke the substitution or
    # injected unintended replacement text (kubeconfig paths are usually
    # safe, but the helper should not depend on that).
    local tmp_file="${config_file}.tmp.$$"
    grep -v "^export ${var_name}=" "${config_file}" > "${tmp_file}" || true
    printf 'export %s="%s"\n' "${var_name}" "${var_value}" >> "${tmp_file}"
    mv "${tmp_file}" "${config_file}"
  else
    echo "" >> "${config_file}"
    echo "export ${var_name}=\"${var_value}\"" >> "${config_file}"
  fi
}
|
|
1149
|
+
|
|
1150
|
+
_update_config_var "KUBECONFIG" "${KUBECONFIG_PATH}"
|
|
1151
|
+
|
|
1152
|
+
<% } else if (deploymentTarget === 'batch-transform') { %>
|
|
1153
|
+
# ============================================================
|
|
1154
|
+
# SageMaker Managed Inference - Batch Deployment
|
|
1155
|
+
# Flow: create-model โ create-transform-job โ poll until completion
|
|
1156
|
+
# ============================================================
|
|
1157
|
+
|
|
1158
|
+
# Validate execution role ARN
|
|
1159
|
+
if [ -z "${ROLE_ARN:-}" ]; then
|
|
1160
|
+
echo "โ Execution role ARN not provided"
|
|
1161
|
+
echo ""
|
|
1162
|
+
echo "Usage:"
|
|
1163
|
+
echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
|
|
1164
|
+
echo " ./do/deploy"
|
|
1165
|
+
echo ""
|
|
1166
|
+
echo "Or set ROLE_ARN in do/config"
|
|
1167
|
+
echo ""
|
|
1168
|
+
echo "The execution role must have permissions for:"
|
|
1169
|
+
echo " โข SageMaker model and transform job management"
|
|
1170
|
+
echo " โข ECR image access"
|
|
1171
|
+
echo " โข S3 read access for input path: ${BATCH_INPUT_PATH}"
|
|
1172
|
+
echo " โข S3 write access for output path: ${BATCH_OUTPUT_PATH}"
|
|
1173
|
+
echo " โข CloudWatch Logs"
|
|
1174
|
+
exit 3
|
|
1175
|
+
fi
|
|
1176
|
+
|
|
1177
|
+
echo " Using execution role: ${ROLE_ARN}"
|
|
1178
|
+
|
|
1179
|
+
# Validate S3 input path
|
|
1180
|
+
if [ -z "${BATCH_INPUT_PATH:-}" ]; then
|
|
1181
|
+
echo "โ S3 input path not provided"
|
|
1182
|
+
echo ""
|
|
1183
|
+
echo "Set BATCH_INPUT_PATH in do/config or provide via CLI:"
|
|
1184
|
+
echo " export BATCH_INPUT_PATH=s3://my-bucket/input/"
|
|
1185
|
+
echo " ./do/deploy"
|
|
1186
|
+
exit 3
|
|
1187
|
+
fi
|
|
1188
|
+
|
|
1189
|
+
if [[ "${BATCH_INPUT_PATH}" != s3://* ]]; then
|
|
1190
|
+
echo "โ S3 input path must start with s3://"
|
|
1191
|
+
echo " Current value: ${BATCH_INPUT_PATH}"
|
|
1192
|
+
echo " Example: s3://my-bucket/input/"
|
|
1193
|
+
exit 3
|
|
1194
|
+
fi
|
|
1195
|
+
|
|
1196
|
+
# Validate S3 output path
|
|
1197
|
+
if [ -z "${BATCH_OUTPUT_PATH:-}" ]; then
|
|
1198
|
+
echo "โ S3 output path not provided"
|
|
1199
|
+
echo ""
|
|
1200
|
+
echo "Set BATCH_OUTPUT_PATH in do/config or provide via CLI:"
|
|
1201
|
+
echo " export BATCH_OUTPUT_PATH=s3://my-bucket/output/"
|
|
1202
|
+
echo " ./do/deploy"
|
|
1203
|
+
exit 3
|
|
1204
|
+
fi
|
|
1205
|
+
|
|
1206
|
+
if [[ "${BATCH_OUTPUT_PATH}" != s3://* ]]; then
|
|
1207
|
+
echo "โ S3 output path must start with s3://"
|
|
1208
|
+
echo " Current value: ${BATCH_OUTPUT_PATH}"
|
|
1209
|
+
echo " Example: s3://my-bucket/output/"
|
|
1210
|
+
exit 3
|
|
1211
|
+
fi
|
|
1212
|
+
|
|
1213
|
+
# ============================================================
# Bootstrap S3 buckets for batch transform
# ============================================================

# Return the bucket name (first path component) of an s3:// URI.
# Uses parameter expansion instead of the previous
# `echo | sed 's|s3://||' | cut -d'/' -f1` pipeline: no subprocesses
# are spawned, and only a literal "s3://" *prefix* is stripped
# (sed removed the first occurrence anywhere in the string).
_s3_bucket_of() {
  local path="${1#s3://}"
  printf '%s\n' "${path%%/*}"
}

# Extract bucket names from S3 paths
BATCH_INPUT_BUCKET=$(_s3_bucket_of "${BATCH_INPUT_PATH}")
BATCH_OUTPUT_BUCKET=$(_s3_bucket_of "${BATCH_OUTPUT_PATH}")
<% if (!batchInputPath) { %>
# ------------------------------------------------------------
# Bootstrap default S3 input bucket (check-and-create).
# This branch is rendered only when no custom batchInputPath was
# supplied at generation time, so the script owns the bucket.
# ------------------------------------------------------------
echo "๐ Checking if S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
# head-bucket exits non-zero when the bucket is missing or not accessible.
if ! aws s3api head-bucket --bucket "${BATCH_INPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
  echo "๐ฆ Creating S3 input bucket: ${BATCH_INPUT_BUCKET}"
  # us-east-1 must NOT receive a LocationConstraint; every other region
  # requires one (S3 CreateBucket API behavior).
  if [ "${AWS_REGION}" = "us-east-1" ]; then
    if ! aws s3api create-bucket \
      --bucket "${BATCH_INPUT_BUCKET}" \
      --region "${AWS_REGION}"; then
      echo "โ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
      echo ""
      echo " Check that:"
      echo " โข Your IAM credentials have s3:CreateBucket permission"
      echo " โข The bucket name is not already taken globally"
      exit 4
    fi
  else
    if ! aws s3api create-bucket \
      --bucket "${BATCH_INPUT_BUCKET}" \
      --region "${AWS_REGION}" \
      --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
      echo "โ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
      echo ""
      echo " Check that:"
      echo " โข Your IAM credentials have s3:CreateBucket permission"
      echo " โข The bucket name is not already taken globally"
      exit 4
    fi
  fi
  echo "โ
S3 input bucket created: ${BATCH_INPUT_BUCKET}"
else
  echo "โ
S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
fi

# Upload sample input file if the input prefix is empty.
# Best-effort listing: '|| true' keeps an empty prefix (non-zero ls)
# from aborting the script; head -1 only needs "any object present".
EXISTING_OBJECTS=$(aws s3 ls "${BATCH_INPUT_PATH}" --region "${AWS_REGION}" 2>/dev/null | head -1 || true)
if [ -z "${EXISTING_OBJECTS}" ]; then
  echo "๐ Uploading sample input file to ${BATCH_INPUT_PATH}"
  # The sample record's shape is chosen at template-render time to match
  # the generated server stack, so the container can score it as-is.
  # NOTE(review): the key is "${BATCH_INPUT_PATH}sample.jsonl" — assumes
  # BATCH_INPUT_PATH ends with '/'; TODO confirm upstream normalization.
<% if (framework === 'transformers' && (modelServer === 'vllm' || modelServer === 'sglang')) { %>
  echo '{"model": "<%= modelName %>", "messages": [{"role": "user", "content": "What is machine learning?"}], "max_tokens": 50}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
<% } else if (framework === 'transformers') { %>
  echo '{"inputs": "What is machine learning?", "parameters": {"max_new_tokens": 50}}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
<% } else if (framework === 'diffusors') { %>
  echo '{"prompt": "A white cat", "n": 1, "size": "512x512"}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
<% } else { %>
  echo '{"instances": [[1.0, 2.0, 3.0, 4.0]]}' | aws s3 cp - "${BATCH_INPUT_PATH}sample.jsonl" --region "${AWS_REGION}"
<% } %>
  echo "โ
Sample input uploaded: ${BATCH_INPUT_PATH}sample.jsonl"
  echo " โ ๏ธ Replace this with your actual input data before running production jobs"
fi
<% } else { %>
# Custom S3 input path provided โ skip bucket creation
echo "โ
Using custom S3 input path: ${BATCH_INPUT_PATH}"
<% } %>
<% if (!batchOutputPath) { %>
# ------------------------------------------------------------
# Bootstrap default S3 output bucket (check-and-create).
# May be the same bucket as the input; in that case the input
# bootstrap above already guaranteed it exists.
# ------------------------------------------------------------
if [ "${BATCH_OUTPUT_BUCKET}" != "${BATCH_INPUT_BUCKET}" ]; then
  echo "๐ Checking if S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
  # head-bucket exits non-zero when the bucket is missing or not accessible.
  if ! aws s3api head-bucket --bucket "${BATCH_OUTPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
    echo "๐ฆ Creating S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
    # us-east-1 must NOT receive a LocationConstraint; every other region
    # requires one (S3 CreateBucket API behavior).
    if [ "${AWS_REGION}" = "us-east-1" ]; then
      if ! aws s3api create-bucket \
        --bucket "${BATCH_OUTPUT_BUCKET}" \
        --region "${AWS_REGION}"; then
        echo "โ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
        exit 4
      fi
    else
      if ! aws s3api create-bucket \
        --bucket "${BATCH_OUTPUT_BUCKET}" \
        --region "${AWS_REGION}" \
        --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
        echo "โ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
        exit 4
      fi
    fi
    echo "โ
S3 output bucket created: ${BATCH_OUTPUT_BUCKET}"
  else
    echo "โ
S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
  fi
else
  echo "โ
S3 output bucket same as input: ${BATCH_OUTPUT_BUCKET}"
fi
<% } else { %>
# Custom S3 output path provided โ skip bucket creation
echo "โ
Using custom S3 output path: ${BATCH_OUTPUT_PATH}"
<% } %>
# Helper: persist a variable to do/config so other scripts can use it
#
# _update_config_var NAME VALUE
#   Replaces an existing "export NAME=..." line in ${SCRIPT_DIR}/config,
#   or appends one (preceded by a blank line) when absent.
#
# The previous implementation interpolated VALUE into a sed replacement
# string, which corrupted the file whenever VALUE contained sed
# metacharacters such as '&', the '|' delimiter, or backslashes. This
# version rewrites the file line-by-line and treats VALUE as opaque text.
_update_config_var() {
  local var_name="$1" var_value="$2" config_file="${SCRIPT_DIR}/config"
  local new_line="export ${var_name}=\"${var_value}\""

  if grep -q "^export ${var_name}=" "${config_file}" 2>/dev/null; then
    local tmp_file
    tmp_file=$(mktemp "${config_file}.XXXXXX") || return 1
    # Copy every line, swapping in the new definition for the old one.
    while IFS= read -r line || [ -n "${line}" ]; do
      case "${line}" in
        "export ${var_name}="*) printf '%s\n' "${new_line}" ;;
        *) printf '%s\n' "${line}" ;;
      esac
    done < "${config_file}" > "${tmp_file}"
    # Write back through the original file so its permissions/inode survive.
    cat "${tmp_file}" > "${config_file}" && rm -f -- "${tmp_file}"
  else
    echo "" >> "${config_file}"
    printf '%s\n' "${new_line}" >> "${config_file}"
  fi
}
# ============================================================
# Check for previous transform job still running
# ============================================================
# Unless --force was given, look up the job recorded in do/config and
# refuse to proceed while it is still in progress.
if [ "${FORCE_NEW}" != true ] && [ -n "${TRANSFORM_JOB_NAME:-}" ]; then
  echo "๐ Checking previous transform job: ${TRANSFORM_JOB_NAME}"
  # Empty status means the job no longer exists (or the call failed).
  PREV_JOB_STATUS=$(aws sagemaker describe-transform-job \
    --transform-job-name "${TRANSFORM_JOB_NAME}" \
    --region "${AWS_REGION}" \
    --query "TransformJobStatus" \
    --output text 2>/dev/null || echo "")

  if [ "${PREV_JOB_STATUS}" = "InProgress" ]; then
    echo "โ ๏ธ Previous transform job is still running: ${TRANSFORM_JOB_NAME}"
    echo " Wait for it to complete, or stop it with:"
    echo " aws sagemaker stop-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
    echo ""
    echo " Use --force to create a new job anyway."
    exit 4
  elif [ "${PREV_JOB_STATUS}" = "Completed" ]; then
    echo "โ
Previous transform job completed: ${TRANSFORM_JOB_NAME}"
    echo " Creating a new job. Results from the previous job are in:"
    echo " ${BATCH_OUTPUT_PATH}"
    echo ""
  fi
  # Failed, Stopped, or not found โ proceed with new job
fi
# Derive unique, per-run resource names from the current epoch second,
# then persist them to do/config so sibling scripts can find them.
TIMESTAMP="$(date +%s)"
TRANSFORM_JOB_NAME="${PROJECT_NAME}-batch-job-${TIMESTAMP}"
MODEL_NAME_SM="${PROJECT_NAME}-batch-model-${TIMESTAMP}"

_update_config_var "TRANSFORM_JOB_NAME" "${TRANSFORM_JOB_NAME}"
_update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"
# Step 1: Register the container image as a SageMaker model.
# Aborts with exit 4 (AWS resource error) on failure.
echo "๐ฆ Creating SageMaker model: ${MODEL_NAME_SM}"
aws sagemaker create-model \
  --model-name "${MODEL_NAME_SM}" \
  --primary-container "{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\"}" \
  --execution-role-arn "${ROLE_ARN}" \
  --region "${AWS_REGION}" || {
  echo "โ Failed to create SageMaker model"
  echo " Check that:"
  echo " โข The execution role ARN is valid"
  echo " โข The ECR image exists and is accessible"
  echo " โข The IAM role has ecr:GetDownloadUrlForLayer permission"
  exit 4
}

echo "โ
SageMaker model created: ${MODEL_NAME_SM}"
# Best-effort bookkeeping: record the new model in the project manifest.
# Failures here never block the deployment (stderr suppressed, '|| true').
MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
./do/manifest add --type sagemaker-model --id "${MODEL_ARN}" \
  --project "${PROJECT_NAME}" \
  --meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
  2>/dev/null || true
# Step 2: Build transform job JSON for `aws sagemaker create-transform-job`.
#
# Optional fragments are computed up front. The previous version embedded
# `$([ ... ] && echo ...)` command substitutions inside the string; when the
# condition was false the substitution exited non-zero, so the assignment
# itself returned non-zero and would abort the script under `set -e`.
INPUT_JOIN_OUTPUT_FRAGMENT=""
INPUT_JOIN_PROCESSING_FRAGMENT=""
if [ "${BATCH_JOIN_SOURCE:-None}" = "Input" ]; then
  # Echo each input record alongside its prediction in the assembled output.
  INPUT_JOIN_OUTPUT_FRAGMENT=",\"Accept\": \"application/json\", \"AssembleWith\": \"${BATCH_SPLIT_TYPE}\""
  INPUT_JOIN_PROCESSING_FRAGMENT=",\"DataProcessing\": { \"JoinSource\": \"Input\" }"
fi

TRANSFORM_JOB_JSON="{
  \"TransformJobName\": \"${TRANSFORM_JOB_NAME}\",
  \"ModelName\": \"${MODEL_NAME_SM}\",
  \"TransformInput\": {
    \"DataSource\": {
      \"S3DataSource\": {
        \"S3DataType\": \"S3Prefix\",
        \"S3Uri\": \"${BATCH_INPUT_PATH}\"
      }
    },
    \"ContentType\": \"application/json\",
    \"SplitType\": \"${BATCH_SPLIT_TYPE}\"
  },
  \"TransformOutput\": {
    \"S3OutputPath\": \"${BATCH_OUTPUT_PATH}\"
    ${INPUT_JOIN_OUTPUT_FRAGMENT}
  },
  \"TransformResources\": {
    \"InstanceType\": \"${INSTANCE_TYPE}\",
    \"InstanceCount\": ${BATCH_INSTANCE_COUNT}
  },
  \"MaxConcurrentTransforms\": ${BATCH_MAX_CONCURRENT_TRANSFORMS:-1},
  \"MaxPayloadInMB\": ${BATCH_MAX_PAYLOAD_IN_MB:-6},
  \"BatchStrategy\": \"${BATCH_STRATEGY}\"
  ${INPUT_JOIN_PROCESSING_FRAGMENT}
}"
# Step 3: Submit the batch transform job.
# Aborts with exit 4 (AWS resource error) on failure.
echo "๐ Creating transform job: ${TRANSFORM_JOB_NAME}"
aws sagemaker create-transform-job \
  --cli-input-json "${TRANSFORM_JOB_JSON}" \
  --region "${AWS_REGION}" || {
  echo "โ Failed to create transform job"
  echo " Check that:"
  echo " โข The S3 input path exists and is accessible: ${BATCH_INPUT_PATH}"
  echo " โข The S3 output path is writable: ${BATCH_OUTPUT_PATH}"
  echo " โข The IAM role has s3:GetObject permission on the input path"
  echo " โข The IAM role has s3:PutObject permission on the output path"
  echo " โข The instance type is valid: ${INSTANCE_TYPE}"
  echo " โข The instance type is available in region: ${AWS_REGION}"
  echo " โข You have sufficient service quota for the instance type"
  exit 4
}

echo "โ
Transform job created: ${TRANSFORM_JOB_NAME}"
# Best-effort bookkeeping: record the transform job in the project manifest.
# Failures here never block the deployment (stderr suppressed, '|| true').
TRANSFORM_JOB_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:transform-job/${TRANSFORM_JOB_NAME}"
./do/manifest add --type sagemaker-transform-job --id "${TRANSFORM_JOB_ARN}" \
  --project "${PROJECT_NAME}" \
  --meta "{\"transformJobName\":\"${TRANSFORM_JOB_NAME}\",\"modelName\":\"${MODEL_NAME_SM}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
  2>/dev/null || true
# Step 4: Poll transform job status until completion or failure.
# NOTE(review): there is no overall timeout here โ the loop only exits on
# Completed/Failed/Stopped or when describe-transform-job itself errors.
echo "โณ Waiting for transform job to complete..."
echo " This may take several minutes depending on dataset size..."
echo " If this times out, check status with:"
echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
echo ""

while true; do
  # 2>&1 deliberately folds stderr into JOB_STATUS: on failure the error
  # text lands in the variable so the handler below can grep it.
  JOB_STATUS=$(aws sagemaker describe-transform-job \
    --transform-job-name "${TRANSFORM_JOB_NAME}" \
    --region "${AWS_REGION}" \
    --query "TransformJobStatus" \
    --output text 2>&1) || {
    # Check if it was a credential expiration โ the job keeps running
    # server-side, so tell the user how to resume watching it.
    if echo "${JOB_STATUS}" | grep -qi "expired\|token"; then
      echo ""
      echo "โ ๏ธ Credentials expired, but the transform job is still running."
      echo " Refresh your credentials and check status with:"
      echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION} --query TransformJobStatus"
      exit 4
    fi
    # Any other describe error is fatal; JOB_STATUS holds the message.
    echo "โ Failed to describe transform job: ${TRANSFORM_JOB_NAME}"
    echo " Error: ${JOB_STATUS}"
    exit 4
  }

  case "${JOB_STATUS}" in
    Completed)
      echo "โ
Transform job completed successfully!"
      break
      ;;
    Failed)
      # Fetch the failure reason with a second (best-effort) describe call.
      FAILURE_REASON=$(aws sagemaker describe-transform-job \
        --transform-job-name "${TRANSFORM_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query "FailureReason" \
        --output text 2>/dev/null || echo "Unknown")
      echo "โ Transform job failed"
      echo " Reason: ${FAILURE_REASON}"
      echo ""
      echo " Check CloudWatch Logs for details:"
      echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/TransformJobs"
      echo ""
      echo " Verify that:"
      echo " โข The S3 input path exists and contains data: ${BATCH_INPUT_PATH}"
      echo " โข The input data format matches the container's expected format"
      echo " โข The container's /ping and /invocations endpoints work correctly"
      exit 4
      ;;
    Stopped)
      echo "โ ๏ธ Transform job was stopped"
      exit 4
      ;;
    InProgress)
      echo " $(date +%H:%M:%S) Job status: InProgress..."
      sleep 30
      ;;
    *)
      # Transitional or unexpected status โ keep polling.
      echo " $(date +%H:%M:%S) Job status: ${JOB_STATUS}..."
      sleep 30
      ;;
  esac
done
# Print a summary of everything this run created/configured.
# Unquoted heredoc delimiter so ${...} references expand as before.
cat <<EOF

๐ Deployment Details:
 Transform Job: ${TRANSFORM_JOB_NAME}
 Model: ${MODEL_NAME_SM}
 Region: ${AWS_REGION}
 Instance Type: ${INSTANCE_TYPE}
 Instance Count: ${BATCH_INSTANCE_COUNT}
 Image: ${ECR_REPOSITORY}:${IMAGE_TAG}
 S3 Input: ${BATCH_INPUT_PATH}
 S3 Output: ${BATCH_OUTPUT_PATH}
 Split Type: ${BATCH_SPLIT_TYPE}
 Strategy: ${BATCH_STRATEGY}

EOF
# Download results locally into <project>/batch-output for quick inspection.
LOCAL_OUTPUT_DIR="${SCRIPT_DIR}/../batch-output"
mkdir -p "${LOCAL_OUTPUT_DIR}"
echo "๐ฅ Downloading results to ${LOCAL_OUTPUT_DIR}/"
if aws s3 sync "${BATCH_OUTPUT_PATH}" "${LOCAL_OUTPUT_DIR}/" --region "${AWS_REGION}"; then
  # Count downloaded entries; tr strips the padding some wc builds emit.
  DOWNLOADED=$(ls -1 "${LOCAL_OUTPUT_DIR}" 2>/dev/null | wc -l | tr -d ' ')
  echo "โ
Downloaded ${DOWNLOADED} file(s) to ${LOCAL_OUTPUT_DIR}/"
  echo ""

  # Display first output file preview.
  # NOTE(review): assumes the first ls entry is a regular file, not a
  # directory created by a nested output prefix โ TODO confirm.
  FIRST_FILE=$(ls -1 "${LOCAL_OUTPUT_DIR}" 2>/dev/null | head -1)
  if [ -n "${FIRST_FILE}" ]; then
    echo "๐ Sample output (${FIRST_FILE}):"
    head -5 "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}"
    LINES=$(wc -l < "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}" | tr -d ' ')
    if [ "${LINES}" -gt 5 ]; then
      echo " ... (${LINES} total lines)"
    fi
  fi
else
  # Non-fatal: the job succeeded; only the local copy failed.
  echo "โ ๏ธ Could not download output files"
fi
# Suggest the follow-up do/ scripts for this project.
cat <<EOF

๐งช Review results:
 ./do/test

๐ Register this deployment:
 ./do/register

๐ View logs:
 ./do/logs

๐งน Clean up when done:
 ./do/clean
EOF
|
1560
|
+
<% } %>
|