@aws/ml-container-creator 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,529 @@
1
+ #!/bin/bash
2
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ set -e
6
+ set -u
7
+ set -o pipefail
8
+
9
+ # Parse flags. FORCE_NEW (--force) skips the previous-transform-job check further down; FORCE_IC (--force-ic) and IC_TARGET are assigned here but not read again in this script itself.
10
+ FORCE_NEW=false
11
+ FORCE_IC=false
12
+ IC_TARGET=""
13
+ while [ $# -gt 0 ]; do
14
+ case "$1" in
15
+ --force) FORCE_NEW=true; shift ;;
16
+ --force-ic)
17
+ FORCE_IC=true
18
+ shift
19
+ ;;
20
+ --help|-h)
21
+ echo "Usage: ./do/deploy [--force] [--force-ic]"
22
+ echo ""
23
+ echo "Options:"
24
+ echo " --force Create a new endpoint, even if one already exists."
25
+ echo " --force-ic Recreate the inference component on the existing endpoint."
26
+ echo ""
27
+ echo "Without flags, deploy resumes from the last run."
28
+ exit 0
29
+ ;;
30
+ *)
31
+ echo "❌ Unknown option: $1"
32
+ echo " Run ./do/deploy --help for usage."
33
+ exit 1
34
+ ;;
35
+ esac
36
+ done
37
+
38
+ # Source configuration
39
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
40
+ source "${SCRIPT_DIR}/config"
41
+
42
+ echo "🚀 Deploying to AWS" # summary banner of the values loaded from do/config
43
+ echo " Project: ${PROJECT_NAME}"
44
+ echo " Deployment config: ${DEPLOYMENT_CONFIG}"
45
+ echo " Region: ${AWS_REGION}"
46
+ echo " Build target: ${BUILD_TARGET}"
47
+ echo " Deployment target: ${DEPLOYMENT_TARGET}"
48
+ echo " Instance type: ${INSTANCE_TYPE}"
49
+ echo " S3 input: ${BATCH_INPUT_PATH:-<not set>}" # may be unset here; ':-' keeps 'set -u' from aborting before the S3 path validation runs
50
+ echo " S3 output: ${BATCH_OUTPUT_PATH:-<not set>}" # same guard as the input path
51
+ echo " Instance count: ${BATCH_INSTANCE_COUNT}"
52
+ echo " Split type: ${BATCH_SPLIT_TYPE}"
53
+ echo " Strategy: ${BATCH_STRATEGY}"
54
+
55
+ # Check AWS credentials
56
+ echo "🔍 Validating AWS credentials..."
57
+ if ! aws sts get-caller-identity &> /dev/null; then
58
+ echo "❌ AWS credentials not configured"
59
+ echo " Run: aws configure"
60
+ echo " Or set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables"
61
+ exit 4
62
+ fi
63
+
64
+ AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
65
+ echo "✅ AWS credentials validated (Account: ${AWS_ACCOUNT_ID})"
66
+
67
+ # Construct ECR repository URL
68
+ ECR_REPOSITORY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY_NAME}"
69
+
70
+ # ============================================================
71
+ # Shared: Verify ECR image exists
72
+ # ============================================================
73
+ echo "🔍 Verifying ECR image exists..."
74
+ if ! aws ecr describe-images \
75
+ --repository-name "${ECR_REPOSITORY_NAME}" \
76
+ --image-ids imageTag="${PROJECT_NAME}-latest" \
77
+ --region "${AWS_REGION}" &> /dev/null; then
78
+
79
+ echo "❌ ECR image not found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
80
+ echo ""
81
+ echo "Please build and push your image first:"
82
+ echo " ./do/submit"
83
+ echo ""
84
+ echo "After the build completes successfully, run this deploy script again."
85
+ exit 4
86
+ fi
87
+
88
+ echo "✅ ECR image found: ${ECR_REPOSITORY}:${PROJECT_NAME}-latest"
89
+ IMAGE_TAG="${PROJECT_NAME}-latest"
90
+
91
+ # ============================================================
92
+ # Shared: Resolve secrets for container environment
93
+ # ============================================================
94
+ CONTAINER_ENV_JSON=""
95
+
96
+ if [ -n "${HF_TOKEN_ARN:-}" ]; then
97
+ echo "🔐 Resolving HuggingFace token from Secrets Manager..."
98
+ RESOLVED_HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "${HF_TOKEN_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
99
+ echo "❌ Failed to resolve HuggingFace token from Secrets Manager"
100
+ exit 3
101
+ }
102
+ CONTAINER_ENV_JSON="\"HF_TOKEN\":\"${RESOLVED_HF_TOKEN}\""
103
+ elif [ -n "${HF_TOKEN:-}" ]; then
104
+ CONTAINER_ENV_JSON="\"HF_TOKEN\":\"${HF_TOKEN}\""
105
+ fi
106
+
107
+ if [ -n "${NGC_API_KEY_ARN:-}" ]; then
108
+ echo "🔐 Resolving NGC API key from Secrets Manager..."
109
+ RESOLVED_NGC_KEY=$(aws secretsmanager get-secret-value --secret-id "${NGC_API_KEY_ARN}" --query SecretString --output text --region "${AWS_REGION}") || {
110
+ echo "❌ Failed to resolve NGC API key from Secrets Manager"
111
+ exit 3
112
+ }
113
+ if [ -n "${CONTAINER_ENV_JSON}" ]; then
114
+ CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},\"NGC_API_KEY\":\"${RESOLVED_NGC_KEY}\""
115
+ else
116
+ CONTAINER_ENV_JSON="\"NGC_API_KEY\":\"${RESOLVED_NGC_KEY}\""
117
+ fi
118
+ elif [ -n "${NGC_API_KEY:-}" ]; then
119
+ if [ -n "${CONTAINER_ENV_JSON}" ]; then
120
+ CONTAINER_ENV_JSON="${CONTAINER_ENV_JSON},\"NGC_API_KEY\":\"${NGC_API_KEY}\""
121
+ else
122
+ CONTAINER_ENV_JSON="\"NGC_API_KEY\":\"${NGC_API_KEY}\""
123
+ fi
124
+ fi
125
+
126
+ # ============================================================
127
+ # SageMaker Batch Transform Deployment
128
+ # Flow: create-model → create-transform-job → poll until completion
129
+ # ============================================================
130
+
131
+ # Source shared helpers
132
+ source "${SCRIPT_DIR}/lib/secrets.sh"
133
+ source "${SCRIPT_DIR}/lib/wait.sh"
134
+
135
+ # Resolve container secrets (HF_TOKEN, NGC_API_KEY)
136
+ resolve_secrets
137
+
138
+ # Validate execution role ARN: exits 3 with usage help when ROLE_ARN is unset or empty. The S3 paths are printed with ':-' defaults because they have not been validated yet and 'set -u' is active.
139
+ if [ -z "${ROLE_ARN:-}" ]; then
140
+ echo "❌ Execution role ARN not provided"
141
+ echo ""
142
+ echo "Usage:"
143
+ echo " export ROLE_ARN=arn:aws:iam::ACCOUNT_ID:role/YOUR_ROLE"
144
+ echo " ./do/deploy"
145
+ echo ""
146
+ echo "Or set ROLE_ARN in do/config"
147
+ echo ""
148
+ echo "The execution role must have permissions for:"
149
+ echo " • SageMaker model and transform job management"
150
+ echo " • ECR image access"
151
+ echo " • S3 read access for input path: ${BATCH_INPUT_PATH:-<not set>}"
152
+ echo " • S3 write access for output path: ${BATCH_OUTPUT_PATH:-<not set>}"
153
+ echo " • CloudWatch Logs"
154
+ exit 3
155
+ fi
156
+
157
+ echo " Using execution role: ${ROLE_ARN}"
158
+
159
+ # Validate S3 input path
160
+ if [ -z "${BATCH_INPUT_PATH:-}" ]; then
161
+ echo "❌ S3 input path not provided"
162
+ echo ""
163
+ echo "Set BATCH_INPUT_PATH in do/config or provide via CLI:"
164
+ echo " export BATCH_INPUT_PATH=s3://my-bucket/input/"
165
+ echo " ./do/deploy"
166
+ exit 3
167
+ fi
168
+
169
+ if [[ "${BATCH_INPUT_PATH}" != s3://* ]]; then
170
+ echo "❌ S3 input path must start with s3://"
171
+ echo " Current value: ${BATCH_INPUT_PATH}"
172
+ echo " Example: s3://my-bucket/input/"
173
+ exit 3
174
+ fi
175
+
176
+ # Validate S3 output path
177
+ if [ -z "${BATCH_OUTPUT_PATH:-}" ]; then
178
+ echo "❌ S3 output path not provided"
179
+ echo ""
180
+ echo "Set BATCH_OUTPUT_PATH in do/config or provide via CLI:"
181
+ echo " export BATCH_OUTPUT_PATH=s3://my-bucket/output/"
182
+ echo " ./do/deploy"
183
+ exit 3
184
+ fi
185
+
186
+ if [[ "${BATCH_OUTPUT_PATH}" != s3://* ]]; then
187
+ echo "❌ S3 output path must start with s3://"
188
+ echo " Current value: ${BATCH_OUTPUT_PATH}"
189
+ echo " Example: s3://my-bucket/output/"
190
+ exit 3
191
+ fi
192
+
193
+ # ============================================================
194
+ # Bootstrap S3 buckets for batch transform
195
+ # ============================================================
196
+
197
+ # Extract bucket names from S3 paths
198
+ BATCH_INPUT_BUCKET=$(echo "${BATCH_INPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
199
+ BATCH_OUTPUT_BUCKET=$(echo "${BATCH_OUTPUT_PATH}" | sed 's|s3://||' | cut -d'/' -f1)
200
+
201
+ <% if (!batchInputPath) { %>
202
+ # Bootstrap default S3 input bucket (check-and-create)
203
+ echo "🔍 Checking if S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
204
+ if ! aws s3api head-bucket --bucket "${BATCH_INPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
205
+ echo "📦 Creating S3 input bucket: ${BATCH_INPUT_BUCKET}"
206
+ if [ "${AWS_REGION}" = "us-east-1" ]; then
207
+ if ! aws s3api create-bucket \
208
+ --bucket "${BATCH_INPUT_BUCKET}" \
209
+ --region "${AWS_REGION}"; then
210
+ echo "❌ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
211
+ echo ""
212
+ echo " Check that:"
213
+ echo " • Your IAM credentials have s3:CreateBucket permission"
214
+ echo " • The bucket name is not already taken globally"
215
+ exit 4
216
+ fi
217
+ else
218
+ if ! aws s3api create-bucket \
219
+ --bucket "${BATCH_INPUT_BUCKET}" \
220
+ --region "${AWS_REGION}" \
221
+ --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
222
+ echo "❌ Failed to create S3 input bucket: ${BATCH_INPUT_BUCKET}"
223
+ echo ""
224
+ echo " Check that:"
225
+ echo " • Your IAM credentials have s3:CreateBucket permission"
226
+ echo " • The bucket name is not already taken globally"
227
+ exit 4
228
+ fi
229
+ fi
230
+ echo "✅ S3 input bucket created: ${BATCH_INPUT_BUCKET}"
231
+ else
232
+ echo "✅ S3 input bucket exists: ${BATCH_INPUT_BUCKET}"
233
+ fi
234
+
235
+ # Upload a sample input file if the input prefix is empty. The object key is built as "${BATCH_INPUT_PATH%/}/sample.jsonl" so a path without a trailing "/" (e.g. s3://bucket) still yields a valid key instead of gluing "sample.jsonl" onto the bucket or prefix name.
236
+ EXISTING_OBJECTS=$(aws s3 ls "${BATCH_INPUT_PATH}" --region "${AWS_REGION}" 2>/dev/null | head -1 || true)
237
+ if [ -z "${EXISTING_OBJECTS}" ]; then
238
+ echo "📄 Uploading sample input file to ${BATCH_INPUT_PATH}"
239
+ <% if (framework === 'transformers' && (modelServer === 'vllm' || modelServer === 'sglang')) { %>
240
+ echo '{"model": "<%= modelName %>", "messages": [{"role": "user", "content": "What is machine learning?"}], "max_tokens": 50}' | aws s3 cp - "${BATCH_INPUT_PATH%/}/sample.jsonl" --region "${AWS_REGION}"
241
+ <% } else if (framework === 'transformers') { %>
242
+ echo '{"inputs": "What is machine learning?", "parameters": {"max_new_tokens": 50}}' | aws s3 cp - "${BATCH_INPUT_PATH%/}/sample.jsonl" --region "${AWS_REGION}"
243
+ <% } else if (framework === 'diffusors') { %>
244
+ echo '{"prompt": "A white cat", "n": 1, "size": "512x512"}' | aws s3 cp - "${BATCH_INPUT_PATH%/}/sample.jsonl" --region "${AWS_REGION}"
245
+ <% } else { %>
246
+ echo '{"instances": [[1.0, 2.0, 3.0, 4.0]]}' | aws s3 cp - "${BATCH_INPUT_PATH%/}/sample.jsonl" --region "${AWS_REGION}"
247
+ <% } %>
248
+ echo "✅ Sample input uploaded: ${BATCH_INPUT_PATH%/}/sample.jsonl"
249
+ echo " ⚠️ Replace this with your actual input data before running production jobs"
250
+ fi
251
+ <% } else { %>
252
+ # Custom S3 input path provided — skip bucket creation
253
+ echo "✅ Using custom S3 input path: ${BATCH_INPUT_PATH}"
254
+ <% } %>
255
+
256
+ <% if (!batchOutputPath) { %>
257
+ # Bootstrap default S3 output bucket (check-and-create, may be same as input)
258
+ if [ "${BATCH_OUTPUT_BUCKET}" != "${BATCH_INPUT_BUCKET}" ]; then
259
+ echo "🔍 Checking if S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
260
+ if ! aws s3api head-bucket --bucket "${BATCH_OUTPUT_BUCKET}" --region "${AWS_REGION}" 2>/dev/null; then
261
+ echo "📦 Creating S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
262
+ if [ "${AWS_REGION}" = "us-east-1" ]; then
263
+ if ! aws s3api create-bucket \
264
+ --bucket "${BATCH_OUTPUT_BUCKET}" \
265
+ --region "${AWS_REGION}"; then
266
+ echo "❌ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
267
+ exit 4
268
+ fi
269
+ else
270
+ if ! aws s3api create-bucket \
271
+ --bucket "${BATCH_OUTPUT_BUCKET}" \
272
+ --region "${AWS_REGION}" \
273
+ --create-bucket-configuration LocationConstraint="${AWS_REGION}"; then
274
+ echo "❌ Failed to create S3 output bucket: ${BATCH_OUTPUT_BUCKET}"
275
+ exit 4
276
+ fi
277
+ fi
278
+ echo "✅ S3 output bucket created: ${BATCH_OUTPUT_BUCKET}"
279
+ else
280
+ echo "✅ S3 output bucket exists: ${BATCH_OUTPUT_BUCKET}"
281
+ fi
282
+ else
283
+ echo "✅ S3 output bucket same as input: ${BATCH_OUTPUT_BUCKET}"
284
+ fi
285
+ <% } else { %>
286
+ # Custom S3 output path provided — skip bucket creation
287
+ echo "✅ Using custom S3 output path: ${BATCH_OUTPUT_PATH}"
288
+ <% } %>
289
+
290
+ # ============================================================
291
+ # Check for previous transform job still running
292
+ # ============================================================
293
+ if [ "${FORCE_NEW}" != true ] && [ -n "${TRANSFORM_JOB_NAME:-}" ]; then
294
+ echo "🔍 Checking previous transform job: ${TRANSFORM_JOB_NAME}"
295
+ PREV_JOB_STATUS=$(aws sagemaker describe-transform-job \
296
+ --transform-job-name "${TRANSFORM_JOB_NAME}" \
297
+ --region "${AWS_REGION}" \
298
+ --query "TransformJobStatus" \
299
+ --output text 2>/dev/null || echo "")
300
+
301
+ case "${PREV_JOB_STATUS}" in
302
+ InProgress)
303
+ echo "⚠️ Previous transform job is still running: ${TRANSFORM_JOB_NAME}"
304
+ echo " Wait for it to complete, or stop it with:"
305
+ echo " aws sagemaker stop-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
306
+ echo ""
307
+ echo " Use --force to create a new job anyway."
308
+ exit 4
309
+ ;;
310
+ Completed)
311
+ echo "✅ Previous transform job completed: ${TRANSFORM_JOB_NAME}"
312
+ echo " Creating a new job. Results from the previous job are in:"
313
+ echo " ${BATCH_OUTPUT_PATH}"
314
+ echo ""
315
+ ;;
316
+ *)
317
+ # Failed, Stopped, or not found — proceed with new job
318
+ ;;
319
+ esac
320
+ fi
321
+
322
+ # Generate unique names with timestamp
323
+ TIMESTAMP=$(date +%s)
324
+ MODEL_NAME_SM="${PROJECT_NAME}-batch-model-${TIMESTAMP}"
325
+ TRANSFORM_JOB_NAME="${PROJECT_NAME}-batch-job-${TIMESTAMP}"
326
+
327
+ _update_config_var "TRANSFORM_JOB_NAME" "${TRANSFORM_JOB_NAME}"
328
+ _update_config_var "SAGEMAKER_MODEL_NAME" "${MODEL_NAME_SM}"
329
+
330
+ # Step 1: Create SageMaker model
331
+ echo "📦 Creating SageMaker model: ${MODEL_NAME_SM}"
332
+
333
+ # Build primary container spec
334
+ BATCH_PRIMARY_CONTAINER="{\"Image\":\"${ECR_REPOSITORY}:${IMAGE_TAG}\""
335
+ if [ -n "${CONTAINER_ENV_JSON}" ]; then
336
+ BATCH_PRIMARY_CONTAINER="${BATCH_PRIMARY_CONTAINER},\"Environment\":{${CONTAINER_ENV_JSON}}"
337
+ fi
338
+ BATCH_PRIMARY_CONTAINER="${BATCH_PRIMARY_CONTAINER}}"
339
+
340
+ if ! aws sagemaker create-model \
341
+ --model-name "${MODEL_NAME_SM}" \
342
+ --primary-container "${BATCH_PRIMARY_CONTAINER}" \
343
+ --execution-role-arn "${ROLE_ARN}" \
344
+ --region "${AWS_REGION}"; then
345
+
346
+ echo "❌ Failed to create SageMaker model"
347
+ echo " Check that:"
348
+ echo " • The execution role ARN is valid"
349
+ echo " • The ECR image exists and is accessible"
350
+ echo " • The IAM role has ecr:GetDownloadUrlForLayer permission"
351
+ exit 4
352
+ fi
353
+
354
+ echo "✅ SageMaker model created: ${MODEL_NAME_SM}"
355
+
356
+ # Record model in manifest (non-blocking: errors are discarded). The helper is addressed via SCRIPT_DIR (the do/ directory this script lives in) so recording does not depend on the caller's working directory.
357
+ MODEL_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:model/${MODEL_NAME_SM}"
358
+ "${SCRIPT_DIR}/manifest" add \
359
+ --type sagemaker-model \
360
+ --id "${MODEL_ARN}" \
361
+ --project "${PROJECT_NAME}" \
362
+ --meta "{\"modelName\":\"${MODEL_NAME_SM}\",\"region\":\"${AWS_REGION}\"}" \
363
+ 2>/dev/null || true
364
+
365
+ # Step 2: Build transform job JSON. The optional JoinSource fragments use 'if ...; fi' (exit status 0 when the condition is false) instead of '[ ... ] && echo': an assignment takes the exit status of its last command substitution, so a false test would make 'set -e' abort the script here whenever BATCH_JOIN_SOURCE is not "Input".
366
+ TRANSFORM_JOB_JSON="{
367
+ \"TransformJobName\": \"${TRANSFORM_JOB_NAME}\",
368
+ \"ModelName\": \"${MODEL_NAME_SM}\",
369
+ \"TransformInput\": {
370
+ \"DataSource\": {
371
+ \"S3DataSource\": {
372
+ \"S3DataType\": \"S3Prefix\",
373
+ \"S3Uri\": \"${BATCH_INPUT_PATH}\"
374
+ }
375
+ },
376
+ \"ContentType\": \"application/json\",
377
+ \"SplitType\": \"${BATCH_SPLIT_TYPE}\"
378
+ },
379
+ \"TransformOutput\": {
380
+ \"S3OutputPath\": \"${BATCH_OUTPUT_PATH}\"
381
+ $(if [ "${BATCH_JOIN_SOURCE:-None}" = "Input" ]; then echo ",\"Accept\": \"application/json\", \"AssembleWith\": \"${BATCH_SPLIT_TYPE}\""; fi)
382
+ },
383
+ \"TransformResources\": {
384
+ \"InstanceType\": \"${INSTANCE_TYPE}\",
385
+ \"InstanceCount\": ${BATCH_INSTANCE_COUNT}
386
+ },
387
+ \"MaxConcurrentTransforms\": ${BATCH_MAX_CONCURRENT_TRANSFORMS:-1},
388
+ \"MaxPayloadInMB\": ${BATCH_MAX_PAYLOAD_IN_MB:-6},
389
+ \"BatchStrategy\": \"${BATCH_STRATEGY}\"
390
+ $(if [ "${BATCH_JOIN_SOURCE:-None}" = "Input" ]; then echo ",\"DataProcessing\": { \"JoinSource\": \"Input\" }"; fi)
391
+ }"
392
+
393
+ # Step 3: Create transform job
394
+ echo "🚀 Creating transform job: ${TRANSFORM_JOB_NAME}"
395
+ if ! aws sagemaker create-transform-job \
396
+ --cli-input-json "${TRANSFORM_JOB_JSON}" \
397
+ --region "${AWS_REGION}"; then
398
+
399
+ echo "❌ Failed to create transform job"
400
+ echo " Check that:"
401
+ echo " • The S3 input path exists and is accessible: ${BATCH_INPUT_PATH}"
402
+ echo " • The S3 output path is writable: ${BATCH_OUTPUT_PATH}"
403
+ echo " • The IAM role has s3:GetObject permission on the input path"
404
+ echo " • The IAM role has s3:PutObject permission on the output path"
405
+ echo " • The instance type is valid: ${INSTANCE_TYPE}"
406
+ echo " • The instance type is available in region: ${AWS_REGION}"
407
+ echo " • You have sufficient service quota for the instance type"
408
+ exit 4
409
+ fi
410
+
411
+ echo "✅ Transform job created: ${TRANSFORM_JOB_NAME}"
412
+
413
+ # Record transform job in manifest (non-blocking: errors are discarded). The helper is addressed via SCRIPT_DIR (the do/ directory this script lives in) so recording does not depend on the caller's working directory.
414
+ TRANSFORM_JOB_ARN="arn:aws:sagemaker:${AWS_REGION}:${AWS_ACCOUNT_ID}:transform-job/${TRANSFORM_JOB_NAME}"
415
+ "${SCRIPT_DIR}/manifest" add \
416
+ --type sagemaker-transform-job \
417
+ --id "${TRANSFORM_JOB_ARN}" \
418
+ --project "${PROJECT_NAME}" \
419
+ --meta "{\"transformJobName\":\"${TRANSFORM_JOB_NAME}\",\"modelName\":\"${MODEL_NAME_SM}\",\"instanceType\":\"${INSTANCE_TYPE}\",\"region\":\"${AWS_REGION}\"}" \
420
+ 2>/dev/null || true
421
+
422
+ # Step 4: Poll transform job status until completion or failure
423
+ echo "⏳ Waiting for transform job to complete..."
424
+ echo " This may take several minutes depending on dataset size..."
425
+ echo " If this times out, check status with:"
426
+ echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION}"
427
+ echo ""
428
+
429
+ while true; do
430
+ JOB_STATUS=$(aws sagemaker describe-transform-job \
431
+ --transform-job-name "${TRANSFORM_JOB_NAME}" \
432
+ --region "${AWS_REGION}" \
433
+ --query "TransformJobStatus" \
434
+ --output text 2>&1) || {
435
+ # Check if it was a credential expiration
436
+ if echo "${JOB_STATUS}" | grep -qi "expired\|token"; then
437
+ echo ""
438
+ echo "⚠️ Credentials expired, but the transform job is still running."
439
+ echo " Refresh your credentials and check status with:"
440
+ echo " aws sagemaker describe-transform-job --transform-job-name ${TRANSFORM_JOB_NAME} --region ${AWS_REGION} --query TransformJobStatus"
441
+ exit 4
442
+ fi
443
+ echo "❌ Failed to describe transform job: ${TRANSFORM_JOB_NAME}"
444
+ echo " Error: ${JOB_STATUS}"
445
+ exit 4
446
+ }
447
+
448
+ case "${JOB_STATUS}" in
449
+ Completed)
450
+ echo "✅ Transform job completed successfully!"
451
+ break
452
+ ;;
453
+ Failed)
454
+ FAILURE_REASON=$(aws sagemaker describe-transform-job \
455
+ --transform-job-name "${TRANSFORM_JOB_NAME}" \
456
+ --region "${AWS_REGION}" \
457
+ --query "FailureReason" \
458
+ --output text 2>/dev/null || echo "Unknown")
459
+ echo "❌ Transform job failed"
460
+ echo " Reason: ${FAILURE_REASON}"
461
+ echo ""
462
+ echo " Check CloudWatch Logs for details:"
463
+ echo " https://console.aws.amazon.com/cloudwatch/home?region=${AWS_REGION}#logsV2:log-groups/log-group//aws/sagemaker/TransformJobs"
464
+ echo ""
465
+ echo " Verify that:"
466
+ echo " • The S3 input path exists and contains data: ${BATCH_INPUT_PATH}"
467
+ echo " • The input data format matches the container's expected format"
468
+ echo " • The container's /ping and /invocations endpoints work correctly"
469
+ exit 4
470
+ ;;
471
+ Stopped)
472
+ echo "⚠️ Transform job was stopped"
473
+ exit 4
474
+ ;;
475
+ InProgress)
476
+ echo " $(date +%H:%M:%S) Job status: InProgress..."
477
+ sleep 30
478
+ ;;
479
+ *)
480
+ echo " $(date +%H:%M:%S) Job status: ${JOB_STATUS}..."
481
+ sleep 30
482
+ ;;
483
+ esac
484
+ done
485
+
486
+ echo ""
487
+ echo "📋 Deployment Details:"
488
+ echo " Transform Job: ${TRANSFORM_JOB_NAME}"
489
+ echo " Model: ${MODEL_NAME_SM}"
490
+ echo " Region: ${AWS_REGION}"
491
+ echo " Instance Type: ${INSTANCE_TYPE}"
492
+ echo " Instance Count: ${BATCH_INSTANCE_COUNT}"
493
+ echo " Image: ${ECR_REPOSITORY}:${IMAGE_TAG}"
494
+ echo " S3 Input: ${BATCH_INPUT_PATH}"
495
+ echo " S3 Output: ${BATCH_OUTPUT_PATH}"
496
+ echo " Split Type: ${BATCH_SPLIT_TYPE}"
497
+ echo " Strategy: ${BATCH_STRATEGY}"
498
+ echo ""
499
+
500
+ # Download results locally: sync BATCH_OUTPUT_PATH into <project>/batch-output and preview the first entry. A failed sync only prints a warning; it does not abort the script.
501
+ LOCAL_OUTPUT_DIR="${SCRIPT_DIR}/../batch-output"
502
+ mkdir -p "${LOCAL_OUTPUT_DIR}"
503
+ echo "📥 Downloading results to ${LOCAL_OUTPUT_DIR}/"
504
+ if aws s3 sync "${BATCH_OUTPUT_PATH}" "${LOCAL_OUTPUT_DIR}/" --region "${AWS_REGION}"; then
505
+ DOWNLOADED=$(ls -1 "${LOCAL_OUTPUT_DIR}" 2>/dev/null | wc -l | tr -d ' ')
506
+ echo "✅ Downloaded ${DOWNLOADED} file(s) to ${LOCAL_OUTPUT_DIR}/"
507
+ echo ""
508
+
509
+ # Display first output file preview (skipped when the first entry is not a regular file, e.g. a sub-directory, so head/wc cannot fail under 'set -e')
510
+ FIRST_FILE=$(ls -1 "${LOCAL_OUTPUT_DIR}" 2>/dev/null | head -1 || true) # '|| true': with pipefail, ls may die of SIGPIPE once head exits
511
+ if [ -n "${FIRST_FILE}" ] && [ -f "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}" ]; then
512
+ echo "📄 Sample output (${FIRST_FILE}):"
513
+ head -5 "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}"
514
+ OUTPUT_LINE_COUNT=$(wc -l < "${LOCAL_OUTPUT_DIR}/${FIRST_FILE}" | tr -d ' ') # not named LINES: bash maintains LINES itself (terminal height)
515
+ if [ "${OUTPUT_LINE_COUNT}" -gt 5 ]; then
516
+ echo " ... (${OUTPUT_LINE_COUNT} total lines)"
517
+ fi
518
+ fi
519
+ else
520
+ echo "⚠️ Could not download output files"
521
+ fi
522
+
523
+ echo "" # closing hints: follow-up commands for the user
524
+ echo "📋 What's next?"
525
+ echo " • View results: ls batch-output/" # 'cat' on a directory fails, so the hint lists the downloaded files instead
526
+ echo " • Review results: ./do/test"
527
+ echo " • Register this deployment: ./do/register"
528
+ echo " • View logs: ./do/logs"
529
+ echo " • Clean up when done: ./do/clean"