@aws/ml-container-creator 0.8.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +50760 -16218
- package/bin/cli.js +31 -137
- package/package.json +7 -2
- package/servers/lib/catalogs/instances.json +52 -1275
- package/servers/lib/catalogs/models.json +0 -132
- package/servers/lib/catalogs/popular-diffusors.json +1 -110
- package/src/app.js +29 -2
- package/src/lib/config-manager.js +17 -0
- package/src/lib/generated/cli-options.js +467 -0
- package/src/lib/generated/validation-rules.js +202 -0
- package/src/lib/mcp-client.js +16 -1
- package/src/lib/mcp-command-handler.js +10 -2
- package/src/lib/prompt-runner.js +16 -2
- package/src/lib/train-config-parser.js +136 -0
- package/src/lib/train-config-persistence.js +143 -0
- package/src/lib/train-config-validator.js +112 -0
- package/src/lib/train-feedback.js +46 -0
- package/src/lib/train-idempotency.js +97 -0
- package/src/lib/train-request-builder.js +120 -0
- package/templates/code/serve +5 -134
- package/templates/code/serve.d/lmi.ejs +19 -0
- package/templates/code/serve.d/sglang.ejs +47 -0
- package/templates/code/serve.d/tensorrt-llm.ejs +53 -0
- package/templates/code/serve.d/vllm.ejs +48 -0
- package/templates/do/.train_build_request.py +141 -0
- package/templates/do/.train_poll_parser.py +135 -0
- package/templates/do/.train_status_parser.py +187 -0
- package/templates/do/clean +1 -1387
- package/templates/do/clean.d/async-inference.ejs +508 -0
- package/templates/do/clean.d/batch-transform.ejs +512 -0
- package/templates/do/clean.d/hyperpod-eks.ejs +481 -0
- package/templates/do/clean.d/managed-inference.ejs +1043 -0
- package/templates/do/deploy +1 -1766
- package/templates/do/deploy.d/async-inference.ejs +501 -0
- package/templates/do/deploy.d/batch-transform.ejs +529 -0
- package/templates/do/deploy.d/hyperpod-eks.ejs +339 -0
- package/templates/do/deploy.d/managed-inference.ejs +726 -0
- package/templates/do/lib/feedback.sh +41 -0
- package/templates/do/train +786 -0
- package/templates/do/training/config.yaml +140 -0
- package/templates/do/training/train.py +463 -0
|
@@ -0,0 +1,529 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Command-line flags.
#   FORCE_NEW - set by --force
#   FORCE_IC  - set by --force-ic
#   IC_TARGET - initialised empty; not assigned by any flag in this section
FORCE_NEW=false
FORCE_IC=false
IC_TARGET=""

while (( $# > 0 )); do
    opt=$1
    shift
    case "${opt}" in
        --force)
            FORCE_NEW=true
            ;;
        --force-ic)
            FORCE_IC=true
            ;;
        -h|--help)
            # One printf argument per output line.
            printf '%s\n' \
                "Usage: ./do/deploy [--force] [--force-ic]" \
                "" \
                "Options:" \
                " --force Create a new endpoint, even if one already exists." \
                " --force-ic Recreate the inference component on the existing endpoint." \
                "" \
                "Without flags, deploy resumes from the last run."
            exit 0
            ;;
        *)
            printf '%s\n' \
                "❌ Unknown option: ${opt}" \
                " Run ./do/deploy --help for usage."
            exit 1
            ;;
    esac
done
|
|
37
|
+
|
|
38
|
+
# Source configuration
# SCRIPT_DIR is the absolute directory of this script, so the config file is
# found regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/config"

# Deployment banner. The variables below are not assigned in this script;
# presumably they come from the sourced config or the environment.
echo "🚀 Deploying to AWS"
echo " Project: ${PROJECT_NAME}"
echo " Deployment config: ${DEPLOYMENT_CONFIG}"
echo " Region: ${AWS_REGION}"
echo " Build target: ${BUILD_TARGET}"
echo " Deployment target: ${DEPLOYMENT_TARGET}"
echo " Instance type: ${INSTANCE_TYPE}"
# The two S3 paths are validated further down with a dedicated error message,
# so tolerate them being unset here instead of dying on "unbound variable"
# (set -u) before that validation can run.
echo " S3 input: ${BATCH_INPUT_PATH:-}"
echo " S3 output: ${BATCH_OUTPUT_PATH:-}"
echo " Instance count: ${BATCH_INSTANCE_COUNT}"
echo " Split type: ${BATCH_SPLIT_TYPE}"
echo " Strategy: ${BATCH_STRATEGY}"
|
|
54
|
+
|
|
55
|
+
# Check AWS credentials
# A single STS call both proves the credentials work and yields the account
# ID. Running it inside the `if` keeps a failure on the friendly error path
# instead of letting `set -e` abort with a raw CLI error.
echo "🔍 Validating AWS credentials..."
if ! AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text 2>/dev/null); then
    echo "❌ AWS credentials not configured"
    echo " Run: aws configure"
    echo " Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
    exit 4
fi
echo "✅ AWS credentials validated (Account: ${AWS_ACCOUNT_ID})"

# Construct ECR repository URL
ECR_REPOSITORY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}"
|
|
69
|
+
|
|
70
|
+
# ============================================================
# Shared: Verify ECR image exists
# ============================================================
# The image to deploy is the one tagged "<project>-latest".
IMAGE_TAG="${PROJECT_NAME}-latest"

echo "🔍 Verifying ECR image exists..."
if aws ecr describe-images \
        --repository-name "${ECR_REPOSITORY_NAME}" \
        --image-ids imageTag="${IMAGE_TAG}" \
        --region "${AWS_REGION}" &> /dev/null; then
    echo "✅ ECR image found: ${ECR_REPOSITORY}:${IMAGE_TAG}"
else
    # Image missing (or lookup failed): tell the user how to build it, then stop.
    printf '%s\n' \
        "❌ ECR image not found: ${ECR_REPOSITORY}:${IMAGE_TAG}" \
        "" \
        "Please build and push your image first:" \
        " ./do/submit" \
        "" \
        "After the build completes successfully, run this deploy script again."
    exit 4
fi
|
|
90
|
+
|
|
91
|
+
# ============================================================
# Shared: Resolve secrets for container environment
# ============================================================
# CONTAINER_ENV_JSON accumulates comma-separated "KEY":"value" pairs (without
# the surrounding braces). It stays empty when no secret is configured.
CONTAINER_ENV_JSON=""

# Escape a string for use inside a JSON double-quoted value.
#   $1 - raw string
# Outputs the string with backslashes and double quotes escaped.
_deploy_json_escape() {
    local raw=$1
    raw=${raw//\\/\\\\}   # backslashes first, so later escapes are not doubled
    raw=${raw//\"/\\\"}
    printf '%s' "${raw}"
}

# Append one "KEY":"value" pair to CONTAINER_ENV_JSON.
#   $1 - environment variable name
#   $2 - raw value (escaped here, so quotes/backslashes cannot break the JSON)
_deploy_add_container_env() {
    local key=$1
    local value
    value=$(_deploy_json_escape "$2")
    local pair="\"${key}\":\"${value}\""
    if [ -n "${CONTAINER_ENV_JSON}" ]; then
        CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},${pair}"
    else
        CONTAINER_ENV_JSON="${pair}"
    fi
}

# HuggingFace token: a Secrets Manager ARN takes precedence over a plain value.
if [ -n "${HF_TOKEN_ARN:-}" ]; then
    echo "🔐 Resolving HuggingFace token from Secrets Manager..."
    RESOLVED_HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "${HF_TOKEN_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
        echo "❌ Failed to resolve HuggingFace token from Secrets Manager"
        exit 3
    }
    _deploy_add_container_env "HF_TOKEN" "${RESOLVED_HF_TOKEN}"
elif [ -n "${HF_TOKEN:-}" ]; then
    _deploy_add_container_env "HF_TOKEN" "${HF_TOKEN}"
fi

# NGC API key: same precedence rule as above.
if [ -n "${NGC_API_KEY_ARN:-}" ]; then
    echo "🔐 Resolving NGC API key from Secrets Manager..."
    RESOLVED_NGC_KEY=$(aws secretsmanager get-secret-value --secret-id "${NGC_API_KEY_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
        echo "❌ Failed to resolve NGC API key from Secrets Manager"
        exit 3
    }
    _deploy_add_container_env "NGC_API_KEY" "${RESOLVED_NGC_KEY}"
elif [ -n "${NGC_API_KEY:-}" ]; then
    _deploy_add_container_env "NGC_API_KEY" "${NGC_API_KEY}"
fi
|
|
125
|
+
|
|
126
|
+
# ============================================================
# SageMaker Batch Transform Deployment
# Flow: create-model → create-transform-job → poll until completion
# ============================================================

# Source shared helpers
source "${SCRIPT_DIR}/lib/secrets.sh"
source "${SCRIPT_DIR}/lib/wait.sh"

# Resolve container secrets (HF_TOKEN, NGC_API_KEY)
# NOTE(review): resolve_secrets is defined outside this file (presumably in
# lib/secrets.sh); its effect on CONTAINER_ENV_JSON is not visible here.
resolve_secrets

# Validate execution role ARN
if [ -z "${ROLE_ARN:-}" ]; then
    echo "❌ Execution role ARN not provided"
    echo ""
    echo "Usage:"
    echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
    echo " ./do/deploy"
    echo ""
    echo "Or set ROLE_ARN in do/config"
    echo ""
    echo "The execution role must have permissions for:"
    echo " • SageMaker model and transform job management"
    echo " • ECR image access"
    # The S3 paths are validated only after this check, so they may still be
    # unset here; the :- default keeps set -u from aborting mid-message.
    echo " • S3 read access for input path: ${BATCH_INPUT_PATH:-}"
    echo " • S3 write access for output path: ${BATCH_OUTPUT_PATH:-}"
    echo " • CloudWatch Logs"
    exit 3
fi

echo " Using execution role: ${ROLE_ARN}"
|
|
158
|
+
|
|
159
|
+
# Validate one S3 location setting. Exits with status 3 and a usage hint when
# the value is empty or does not start with the s3:// scheme.
#   $1 - label used in messages ("input" or "output")
#   $2 - name of the configuration variable being checked
#   $3 - current value of that variable (may be empty)
_require_s3_path() {
    local label=$1
    local var_name=$2
    local value=$3

    if [ -z "${value}" ]; then
        echo "❌ S3 ${label} path not provided"
        echo ""
        echo "Set ${var_name} in do/config or provide via CLI:"
        echo " export ${var_name}=s3://my-bucket/${label}/"
        echo " ./do/deploy"
        exit 3
    fi

    case "${value}" in
        s3://*)
            ;;
        *)
            echo "❌ S3 ${label} path must start with s3://"
            echo " Current value: ${value}"
            echo " Example: s3://my-bucket/${label}/"
            exit 3
            ;;
    esac
}

# Validate S3 input path
_require_s3_path "input" "BATCH_INPUT_PATH" "${BATCH_INPUT_PATH:-}"

# Validate S3 output path
_require_s3_path "output" "BATCH_OUTPUT_PATH" "${BATCH_OUTPUT_PATH:-}"
|
|
192
|
+
|
|
193
|
+
# ============================================================
# Bootstrap S3 buckets for batch transform
# ============================================================

# Print the bucket component of an S3 URI.
#   $1 - URI of the form s3://bucket[/key...]
# Uses parameter expansion only: strip the leading scheme, then drop
# everything from the first "/" onwards.
_s3_bucket_of() {
    local remainder=${1#s3://}
    printf '%s' "${remainder%%/*}"
}

# Extract bucket names from S3 paths
BATCH_INPUT_BUCKET=$(_s3_bucket_of "${BATCH_INPUT_PATH}")
BATCH_OUTPUT_BUCKET=$(_s3_bucket_of "${BATCH_OUTPUT_PATH}")
|
|
200
|
+
|
|
201
|
+
<% if (!batchInputPath) { %>
# Bootstrap default S3 input bucket (check-and-create)
echo "🔍 Checking if S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
if ! aws s3api head-bucket --bucket "${BATCH_INPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
    echo "📦 Creating S3 input bucket: ${BATCH_INPUT_BUCKET}"
    # Build the argument list once: a LocationConstraint is passed for every
    # region except us-east-1, where it is omitted.
    INPUT_BUCKET_CREATE_ARGS=(--bucket "${BATCH_INPUT_BUCKET}" --region "${AWS_REGION}")
    if [ "${AWS_REGION}" != "us-east-1" ]; then
        INPUT_BUCKET_CREATE_ARGS+=(--create-bucket-configuration "LocationConstraint=${AWS_REGION}")
    fi
    if ! aws s3api create-bucket "${INPUT_BUCKET_CREATE_ARGS[@]}"; then
        echo "❌ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
        echo ""
        echo " Check that:"
        echo " • Your IAM credentials have s3:CreateBucket permission"
        echo " • The bucket name is not already taken globally"
        exit 4
    fi
    echo "✅ S3 input bucket created: ${BATCH_INPUT_BUCKET}"
else
    echo "✅ S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
fi

# Upload sample input file if the input prefix is empty
# Join with exactly one "/" so a path configured without a trailing slash
# (s3://bucket/input) still yields s3://bucket/input/sample.jsonl rather than
# s3://bucket/inputsample.jsonl.
SAMPLE_INPUT_URI="${BATCH_INPUT_PATH%/}/sample.jsonl"
# `|| true`: a failing listing is treated the same as an empty one.
EXISTING_OBJECTS=$(aws s3 ls "${BATCH_INPUT_PATH}" --region "${AWS_REGION}" 2>/dev/null | head -1 || true)
if [ -z "${EXISTING_OBJECTS}" ]; then
    echo "📄 Uploading sample input file to ${BATCH_INPUT_PATH}"
<% if (framework === 'transformers' && (modelServer === 'vllm' || modelServer === 'sglang')) { %>
    echo '{"model": "<%= modelName %>", "messages": [{"role": "user", "content": "What is machine learning?"}], "max_tokens": 50}' | aws s3 cp - "${SAMPLE_INPUT_URI}" --region "${AWS_REGION}"
<% } else if (framework === 'transformers') { %>
    echo '{"inputs": "What is machine learning?", "parameters": {"max_new_tokens": 50}}' | aws s3 cp - "${SAMPLE_INPUT_URI}" --region "${AWS_REGION}"
<% } else if (framework === 'diffusors') { %>
    echo '{"prompt": "A white cat", "n": 1, "size": "512x512"}' | aws s3 cp - "${SAMPLE_INPUT_URI}" --region "${AWS_REGION}"
<% } else { %>
    echo '{"instances": [[1.0, 2.0, 3.0, 4.0]]}' | aws s3 cp - "${SAMPLE_INPUT_URI}" --region "${AWS_REGION}"
<% } %>
    echo "✅ Sample input uploaded: ${SAMPLE_INPUT_URI}"
    echo " ⚠️ Replace this with your actual input data before running production jobs"
fi
<% } else { %>
# Custom S3 input path provided — skip bucket creation
echo "✅ Using custom S3 input path: ${BATCH_INPUT_PATH}"
<% } %>
|
|
255
|
+
|
|
256
|
+
<% if (!batchOutputPath) { %>
# Bootstrap default S3 output bucket (check-and-create, may be same as input)
if [ "${BATCH_OUTPUT_BUCKET}" = "${BATCH_INPUT_BUCKET}" ]; then
    # Shared bucket: nothing to check or create for the output side.
    echo "✅ S3 output bucket same as input: ${BATCH_OUTPUT_BUCKET}"
else
    echo "🔍 Checking if S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
    if aws s3api head-bucket --bucket "${BATCH_OUTPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
        echo "✅ S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
    else
        echo "📦 Creating S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
        # A LocationConstraint is passed for every region except us-east-1.
        OUTPUT_BUCKET_CREATE_ARGS=(--bucket "${BATCH_OUTPUT_BUCKET}" --region "${AWS_REGION}")
        if [ "${AWS_REGION}" != "us-east-1" ]; then
            OUTPUT_BUCKET_CREATE_ARGS+=(--create-bucket-configuration LocationConstraint="${AWS_REGION}")
        fi
        if ! aws s3api create-bucket "${OUTPUT_BUCKET_CREATE_ARGS[@]}"; then
            echo "❌ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
            exit 4
        fi
        echo "✅ S3 output bucket created: ${BATCH_OUTPUT_BUCKET}"
    fi
fi
<% } else { %>
# Custom S3 output path provided — skip bucket creation
echo "✅ Using custom S3 output path: ${BATCH_OUTPUT_PATH}"
<% } %>
|
|
289
|
+
|
|
290
|
+
# ============================================================
# Check for previous transform job still running
# ============================================================
# Skipped when --force was given or no previous job name is recorded.
if [[ "${FORCE_NEW}" != true && -n "${TRANSFORM_JOB_NAME:-}" ]]; then
    echo "🔍 Checking previous transform job: ${TRANSFORM_JOB_NAME}"
    # A failed lookup yields an empty status, which falls through to
    # "proceed with a new job" below.
    PREV_JOB_STATUS=$(aws sagemaker describe-transform-job \
        --transform-job-name "${TRANSFORM_JOB_NAME}" \
        --region "${AWS_REGION}" \
        --query "TransformJobStatus" \
        --output text 2>/dev/null || echo "")

    if [ "${PREV_JOB_STATUS}" = "InProgress" ]; then
        printf '%s\n' \
            "⚠️ Previous transform job is still running: ${TRANSFORM_JOB_NAME}" \
            " Wait for it to complete, or stop it with:" \
            " aws sagemaker stop-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}" \
            "" \
            " Use --force to create a new job anyway."
        exit 4
    elif [ "${PREV_JOB_STATUS}" = "Completed" ]; then
        printf '%s\n' \
            "✅ Previous transform job completed: ${TRANSFORM_JOB_NAME}" \
            " Creating a new job. Results from the previous job are in:" \
            " ${BATCH_OUTPUT_PATH}" \
            ""
    fi
    # Failed, Stopped, or not found — proceed with new job
fi
|
|
321
|
+
|
|
322
|
+
# Generate unique names with timestamp
TIMESTAMP=$(date +%s)
MODEL_NAME_SM="${PROJECT_NAME}-batch-model-${TIMESTAMP}"
TRANSFORM_JOB_NAME="${PROJECT_NAME}-batch-job-${TIMESTAMP}"

# Record the new names before any resource is created.
# NOTE(review): _update_config_var is defined outside this file — presumably
# it persists the value to do/config; confirm against its definition.
_update_config_var "TRANSFORM_JOB_NAME" "${TRANSFORM_JOB_NAME}"
_update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"

# Step 1: Create SageMaker model
echo "📦 Creating SageMaker model: ${MODEL_NAME_SM}"

# Build primary container spec: {"Image":"<repo>:<tag>"[,"Environment":{...}]}
# The Environment object is added only when secrets were resolved earlier.
BATCH_PRIMARY_CONTAINER="{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\""
if [ -n "${CONTAINER_ENV_JSON}" ]; then
    BATCH_PRIMARY_CONTAINER+=",\"Environment\":{${CONTAINER_ENV_JSON}}"
fi
BATCH_PRIMARY_CONTAINER+="}"

if ! aws sagemaker create-model \
        --model-name "${MODEL_NAME_SM}" \
        --primary-container "${BATCH_PRIMARY_CONTAINER}" \
        --execution-role-arn "${ROLE_ARN}" \
        --region "${AWS_REGION}"; then
    echo "❌ Failed to create SageMaker model"
    echo " Check that:"
    echo " • The execution role ARN is valid"
    echo " • The ECR image exists and is accessible"
    echo " • The IAM role has ecr:GetDownloadUrlForLayer permission"
    exit 4
fi

echo "✅ SageMaker model created: ${MODEL_NAME_SM}"

# Record model in manifest (non-blocking)
# The manifest tool is addressed via SCRIPT_DIR, like config and lib/, so the
# record is not silently skipped when deploy is started from another directory.
# Errors are deliberately ignored: manifest bookkeeping must not fail a deploy.
MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
"${SCRIPT_DIR}/manifest" add \
    --type sagemaker-model \
    --id "${MODEL_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
|
|
364
|
+
|
|
365
|
+
# Step 2: Build transform job JSON

# Print the CreateTransformJob request document on stdout.
# Globals read: TRANSFORM_JOB_NAME, MODEL_NAME_SM, BATCH_INPUT_PATH,
#   BATCH_OUTPUT_PATH, BATCH_SPLIT_TYPE, INSTANCE_TYPE, BATCH_INSTANCE_COUNT,
#   BATCH_STRATEGY, and optionally BATCH_JOIN_SOURCE (default None),
#   BATCH_MAX_CONCURRENT_TRANSFORMS (default 1), BATCH_MAX_PAYLOAD_IN_MB
#   (default 6).
#
# The optional fragments are computed with a plain `if` rather than
# `$([ ... ] && echo ...)` embedded in the string: an assignment takes the
# exit status of its last command substitution, so a false test there makes
# the assignment return 1 and `set -e` terminates the script whenever
# BATCH_JOIN_SOURCE is anything other than "Input".
_build_transform_job_json() {
    local output_extras=""
    local processing_extras=""
    if [ "${BATCH_JOIN_SOURCE:-None}" = "Input" ]; then
        output_extras=",\"Accept\": \"application/json\", \"AssembleWith\": \"${BATCH_SPLIT_TYPE}\""
        processing_extras=",\"DataProcessing\": { \"JoinSource\": \"Input\" }"
    fi

    cat <<EOF
{
  "TransformJobName": "${TRANSFORM_JOB_NAME}",
  "ModelName": "${MODEL_NAME_SM}",
  "TransformInput": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "${BATCH_INPUT_PATH}"
      }
    },
    "ContentType": "application/json",
    "SplitType": "${BATCH_SPLIT_TYPE}"
  },
  "TransformOutput": {
    "S3OutputPath": "${BATCH_OUTPUT_PATH}"
    ${output_extras}
  },
  "TransformResources": {
    "InstanceType": "${INSTANCE_TYPE}",
    "InstanceCount": ${BATCH_INSTANCE_COUNT}
  },
  "MaxConcurrentTransforms": ${BATCH_MAX_CONCURRENT_TRANSFORMS:-1},
  "MaxPayloadInMB": ${BATCH_MAX_PAYLOAD_IN_MB:-6},
  "BatchStrategy": "${BATCH_STRATEGY}"
  ${processing_extras}
}
EOF
}

TRANSFORM_JOB_JSON=$(_build_transform_job_json)
|
|
392
|
+
|
|
393
|
+
# Step 3: Create transform job
echo "🚀 Creating transform job: ${TRANSFORM_JOB_NAME}"
if ! aws sagemaker create-transform-job \
        --cli-input-json "${TRANSFORM_JOB_JSON}" \
        --region "${AWS_REGION}"; then
    echo "❌ Failed to create transform job"
    echo " Check that:"
    echo " • The S3 input path exists and is accessible: ${BATCH_INPUT_PATH}"
    echo " • The S3 output path is writable: ${BATCH_OUTPUT_PATH}"
    echo " • The IAM role has s3:GetObject permission on the input path"
    echo " • The IAM role has s3:PutObject permission on the output path"
    echo " • The instance type is valid: ${INSTANCE_TYPE}"
    echo " • The instance type is available in region: ${AWS_REGION}"
    echo " • You have sufficient service quota for the instance type"
    exit 4
fi

echo "✅ Transform job created: ${TRANSFORM_JOB_NAME}"

# Record transform job in manifest (non-blocking)
# The manifest tool is addressed via SCRIPT_DIR, like config and lib/, so the
# record is not silently skipped when deploy is started from another directory.
# Errors are deliberately ignored: manifest bookkeeping must not fail a deploy.
TRANSFORM_JOB_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:transform-job/${TRANSFORM_JOB_NAME}"
"${SCRIPT_DIR}/manifest" add \
    --type sagemaker-transform-job \
    --id "${TRANSFORM_JOB_ARN}" \
    --project "${PROJECT_NAME}" \
    --meta "{\"transformJobName\":\"${TRANSFORM_JOB_NAME}\",\"modelName\":\"${MODEL_NAME_SM}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
    2>/dev/null || true
|
|
421
|
+
|
|
422
|
+
# Step 4: Poll transform job status until completion or failure
printf '%s\n' \
    "⏳ Waiting for transform job to complete..." \
    " This may take several minutes depending on dataset size..." \
    " If this times out, check status with:" \
    " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}" \
    ""

# Poll every 30 seconds. The loop ends via `break` on Completed or via `exit 4`
# on Failed, Stopped, or a describe error; there is no elapsed-time limit here.
while :; do
    # stderr is captured too, so on failure JOB_STATUS holds the CLI error text.
    if ! JOB_STATUS=$(aws sagemaker describe-transform-job \
            --transform-job-name "${TRANSFORM_JOB_NAME}" \
            --region "${AWS_REGION}" \
            --query "TransformJobStatus" \
            --output text 2>&1); then
        # Check if it was a credential expiration
        if grep -qi "expired\|token" <<<"${JOB_STATUS}"; then
            echo ""
            echo "⚠️ Credentials expired, but the transform job is still running."
            echo " Refresh your credentials and check status with:"
            echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION} --query TransformJobStatus"
            exit 4
        fi
        echo "❌ Failed to describe transform job: ${TRANSFORM_JOB_NAME}"
        echo " Error: ${JOB_STATUS}"
        exit 4
    fi

    case "${JOB_STATUS}" in
        Completed)
            echo "✅ Transform job completed successfully!"
            break
            ;;
        Stopped)
            echo "⚠️ Transform job was stopped"
            exit 4
            ;;
        Failed)
            # Best-effort lookup of the failure reason; "Unknown" if it fails.
            FAILURE_REASON=$(aws sagemaker describe-transform-job \
                --transform-job-name "${TRANSFORM_JOB_NAME}" \
                --region "${AWS_REGION}" \
                --query "FailureReason" \
                --output text 2>/dev/null || echo "Unknown")
            printf '%s\n' \
                "❌ Transform job failed" \
                " Reason: ${FAILURE_REASON}" \
                "" \
                " Check CloudWatch Logs for details:" \
                " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/TransformJobs" \
                "" \
                " Verify that:" \
                " • The S3 input path exists and contains data: ${BATCH_INPUT_PATH}" \
                " • The input data format matches the container's expected format" \
                " • The container's /ping and /invocations endpoints work correctly"
            exit 4
            ;;
        *)
            # InProgress and any other non-terminal status: report and wait.
            echo " $(date +%H:%M:%S) Job status: ${JOB_STATUS}..."
            sleep 30
            ;;
    esac
done
|
|
485
|
+
|
|
486
|
+
# Summary of the finished deployment, framed by a blank line before and after.
cat <<EOF

📋 Deployment Details:
 Transform Job: ${TRANSFORM_JOB_NAME}
 Model: ${MODEL_NAME_SM}
 Region: ${AWS_REGION}
 Instance Type: ${INSTANCE_TYPE}
 Instance Count: ${BATCH_INSTANCE_COUNT}
 Image: ${ECR_REPOSITORY}:${IMAGE_TAG}
 S3 Input: ${BATCH_INPUT_PATH}
 S3 Output: ${BATCH_OUTPUT_PATH}
 Split Type: ${BATCH_SPLIT_TYPE}
 Strategy: ${BATCH_STRATEGY}

EOF
|
|
499
|
+
|
|
500
|
+
# Download results locally
LOCAL_OUTPUT_DIR="${SCRIPT_DIR}/../batch-output"
mkdir -p "${LOCAL_OUTPUT_DIR}"
echo "📥 Downloading results to ${LOCAL_OUTPUT_DIR}/"
if aws s3 sync "${BATCH_OUTPUT_PATH}" "${LOCAL_OUTPUT_DIR}/" --region "${AWS_REGION}"; then
    # Count regular files (recursively) and remember the lexicographically
    # first one for the preview. find -print0 is used instead of parsing `ls`
    # so that sub-directories are never passed to head/wc (which would fail
    # and, under set -e, turn a successful job into a failed deploy).
    DOWNLOADED=0
    FIRST_FILE=""
    while IFS= read -r -d '' OUTPUT_FILE; do
        DOWNLOADED=$((DOWNLOADED + 1))
        if [[ -z "${FIRST_FILE}" || "${OUTPUT_FILE}" < "${FIRST_FILE}" ]]; then
            FIRST_FILE="${OUTPUT_FILE}"
        fi
    done < <(find "${LOCAL_OUTPUT_DIR}" -type f -print0)
    # Keep FIRST_FILE relative to the output directory.
    FIRST_FILE="${FIRST_FILE#"${LOCAL_OUTPUT_DIR}/"}"
    echo "✅ Downloaded ${DOWNLOADED} file(s) to ${LOCAL_OUTPUT_DIR}/"
    echo ""

    # Display first output file preview
    if [ -n "${FIRST_FILE}" ]; then
        echo "📄 Sample output (${FIRST_FILE}):"
        head -5 "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}"
        # Named PREVIEW_LINE_COUNT rather than LINES, which bash itself
        # maintains as the terminal height.
        PREVIEW_LINE_COUNT=$(wc -l < "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}" | tr -d ' ')
        if [ "${PREVIEW_LINE_COUNT}" -gt 5 ]; then
            echo " ... (${PREVIEW_LINE_COUNT} total lines)"
        fi
    fi
else
    # A failed download is reported but does not fail the deployment.
    echo "⚠️ Could not download output files"
fi
|
|
522
|
+
|
|
523
|
+
# Closing hints. The "View results" hint lists the download directory:
# `cat` on a directory only prints an "Is a directory" error.
echo ""
echo "📋 What's next?"
echo " • View results: ls batch-output/"
echo " • Review results: ./do/test"
echo " • Register this deployment: ./do/register"
echo " • View logs: ./do/logs"
echo " • Clean up when done: ./do/clean"
|